source: TI01-discovery/trunk/ingestAutomation/OAIBatch/oai_ingest.py @ 3101

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/TI01-discovery/trunk/ingestAutomation/OAIBatch/oai_ingest.py@3101
Revision 3101, 15.7 KB checked in by selatham, 12 years ago (diff)

Cope with slashes in id. More DP config files

  • Property svn:executable set to *
Line 
1#!/usr/bin/env python
2""" Script oai_ingest.py takes parameter <datacentre>.
3The /usr/local/WSClients/OAIBatch directory contains:-
4 - this python script, plus some other modules for parts of the process.
5 - a DataProvider specific config file,
6 - the d2b.jar moles creator class which creates moles discovery records,
7 - the python module for extracting spatiotemporal information and adding to postgres db.
8Under this directory the following structure should be maintained:
9 ./data
10 - /DATACENTRE/
11                - discovery/:         Re-named documents.
12                - discovery_corrected/: Documents with schema namespaces corrected, ready to ingest in the discovery service.
13                - oai/difYYYYMMDD/    Documents as harvested from OAI
14 Where  /DATACENTRE  varies for the different data providers
15"""
16#History:
17# 12/05/06 SEL spelling correction
18# 30/05/06 SEL cope with many files for processing."Argument list too long" problem.
19# 31/05/06 SEL need to use passwords now. (replace xxxxxx in real version).
20# 16/10/06 SEL Changed to using python oaiClean.py module instead of java code.
21# 16/10/06 SEL exist db upgraded and deployed at different location, java upgrade.
22# 17/10/06 SEL cope with different discovery formats - not just DIF.
23# 23/10/06 SEL keywords not mandatory in config file.
24# 24/10/06 SEL fix bug where 'in' directory not being cleared initially. display more information when running.
25
26import os
27import sys
28import commands
29import string
30import SpaceTimeIngestFromMOLES
31import keywordAdder
32from SchemaNameSpace import SchemaNameSpace
33from DIF import DIF
34from MDIP import MDIP
35
def getID(filename):
    """Return the identifier held in an input metadata xml record.

    The record format is taken from the module-level ``datacentre_format``
    setting: "DIF" records yield ``DIF(xml).entryID`` and "MDIP" records
    yield ``MDIP(xml).id``.  Any other format ends the script via sys.exit.

    filename -- path of the xml record to read.
    """
    # Close the file explicitly instead of leaving it to garbage collection.
    # try/finally (rather than ``with``) keeps this valid on old Python 2.
    xml_file = open(filename)
    try:
        xml = xml_file.read()
    finally:
        xml_file.close()

    if datacentre_format == "DIF":
        return DIF(xml).entryID
    if datacentre_format == "MDIP":
        return MDIP(xml).id
    sys.exit("Only handles DIF or MDIP here.")
48
# Defaults for the run state and for the settings that are read later on from
# the datacentre specific config file.
status = 0
numfilesproc = 0
harvest_home = ""
datacentre_groups = ""
datacentre_format = ""
datacentre_namespace = ""
NDG_dataProvider = False

# The datacentre name is the one mandatory command line parameter.
if len(sys.argv) < 2:
    print("<datacentre>  parameter not supplied.")
    # Exit non-zero so a calling script can tell that nothing was ingested.
    sys.exit(1)

datacentre = sys.argv[1]
62
# Other settings and constants
# Timestamp of this run in yymmdd_HHMM form, obtained from the date command.
date_string = commands.getoutput("date +'%y%m%d_%H%M'")

# Environment settings applied with os.putenv(), one (name, value) pair each.
for env_name, env_value in (
    ('EXIST_HOME', '/usr/local/exist-client'),
    ('JAVA_HOME', '/usr/java/jdk1.5.0_03'),
    ('PATH', ':/usr/java/jdk1.5.0_03/bin:/usr/java/jdk1.5.0_03:/usr/java/jdk1.5.0_03/lib/tools.jar:/usr/local/WSClients/OAIBatch:/usr/local/exist-client/bin:/bin:/usr/bin:.'),
    ('CLASSPATH', '.:/usr/java/jdk1.5.0_03/lib/tools.jar'),
):
    os.putenv(env_name, env_value)
69
# Get the harvested records directory and groups for this datacentre from the datacentre specific config file
# The harvested records directory depends on the datacentres OAI base url, the set and format. These have to be known up-front.
# The groups denote which 'portal groups' they belong to - for limiting searches to say NERC-only datacentres records.
# Groups are added to the intermediate MOLES when it is created.
datacentre_config_filename = "/usr/local/WSClients/OAIBatch/" + datacentre + "_config.properties"
print("Datacentre config file = %s" % datacentre_config_filename)
try:
    datacentre_config_file = open(datacentre_config_filename, "r")
except IOError:
    sys.exit("Failed at opening datacentre config file %s" % datacentre_config_filename)

# Each line is a keyword followed by whitespace separated value(s).
# try/finally makes sure the file is closed even if parsing fails.
try:
    for line in datacentre_config_file:
        words = line.split()
        if not words:
            continue
        keyword = words[0]
        if keyword == 'NDG_dataProvider':
            # Presence of the keyword is enough; it takes no value.
            NDG_dataProvider = True
        elif keyword == 'groups':
            datacentre_groups = words[1:]
        elif len(words) < 2:
            # Keyword without a value: leave the default in place so that the
            # mandatory-setting checks report it, instead of an IndexError.
            continue
        elif keyword == 'host_path':
            harvest_home = words[1]
        elif keyword == 'format':
            datacentre_format = words[1]
        elif keyword == 'namespace':
            datacentre_namespace = words[1]
finally:
    datacentre_config_file.close()
94
# Check what was read from the config file.  The harvest directory, format and
# namespace must be set; groups/keywords are optional.
if harvest_home == "":
    sys.exit("Failed at getting harvested records directory stage. datacentre config file tried = %s" % datacentre_config_filename)
print("INFO: harvested records are in %s" % harvest_home)

if datacentre_groups != "":
    print("INFO: datacentre groups/keywords = %s" % datacentre_groups)
else:
    print("INFO: No groups/keywords set for datacentre %s" % datacentre)

if datacentre_format == "":
    sys.exit("Failed at stage: getting datacentre format. datacentre config file tried = %s" % datacentre_config_filename)
print("INFO: format being harvested = %s" % datacentre_format)

if datacentre_namespace == "":
    sys.exit("Failed at stage: getting datacentre namespace. datacentre config file tried = %s" % datacentre_config_filename)
print("INFO: datacentre namespace = %s" % datacentre_namespace)
114
# Any records to harvest?  An empty harvest directory ends the run here.
if not os.listdir(harvest_home):
    print("Nothing to harvest this time from %s" % datacentre)
    sys.exit()

# The directory to put things for a tape backup (should already exist)
backupdir = '/disks/glue1/oaiBackup/'
122
# Create/clear the 'in' directory pristine copy of the discovery records
originals_dir = "/usr/local/WSClients/OAIBatch/data/" + datacentre + "/oai/originals"
if not os.path.isdir(originals_dir):
    commandline = "mkdir " + originals_dir
else:
    # Files are removed one at a time through xargs to cope with there being
    # lots of them ("Argument list too long" problem).
    commandline = "ls -1 " + originals_dir + "/ | xargs -i rm " + originals_dir + "/{\\}"
print("Executing : " + commandline)
status = os.system(commandline)

if status != 0:
    sys.exit("Failed at creating copy dir stage")

# make the 'in' pristine copy. Cope with there being lots of files in the directory.
commandline = "ls -1 " + harvest_home + "/ | xargs -i cp " + harvest_home + "/{\\} " + originals_dir
print("Executing : " + commandline)
status = os.system(commandline)
if status != 0:
    sys.exit("Failed at making pristine copy stage")
143
def _create_or_clear_dir(dirname):
    """Make dirname if it is missing, otherwise remove the files inside it.

    Returns the os.system() status of the shell command that was run.
    """
    if os.path.isdir(dirname):
        # One rm per file through xargs, to cope with lots of files.
        commandline = "ls -1 " + dirname + "/ | xargs -i rm " + dirname + "/{\\}"
    else:
        commandline = "mkdir " + dirname
    print("Executing : " + commandline)
    return os.system(commandline)

# Create/clear the directory for the 'out' processed copy of the discovery records.
# The status is checked here because it is overwritten by later commands, so a
# failure at this point would otherwise go unnoticed.
status = _create_or_clear_dir("/usr/local/WSClients/OAIBatch/data/" + datacentre + "/discovery")
if status != 0:
    sys.exit("Failed at creating/clearing discovery dir stage")

# Create/clear the directory for the 'out' namespace corrected copy of the discovery records.
status = _create_or_clear_dir("/usr/local/WSClients/OAIBatch/data/" + datacentre + "/discovery_corrected")
if status != 0:
    sys.exit("Failed at creating/clearing discovery_corrected dir stage")
163
# The file config.properties contains the location of the particular datacentres harvested records.
# Overwrite it with the datacentre specific version of the config.
batch_home = "/usr/local/WSClients/OAIBatch"
commandline = "cp " + batch_home + "/" + datacentre + "_config.properties " + batch_home + "/config.properties"
print("Executing : " + commandline)
status = os.system(commandline)
if status != 0:
    sys.exit("Failed at copying config file stage")

# Change os directory to that with the code in it, so that the relative
# paths used from here on resolve against it.
os.chdir(batch_home)
174
175
# shutil is needed only by this stage, which copies files without a shell.
import shutil

#Process/rename the files (changed 08/01/07 to get id from inside file)
# Each xml record in the originals directory is copied into the discovery
# directory under a name built from the identifier found inside the record.
indir = "/usr/local/WSClients/OAIBatch/data/" + datacentre + "/oai/originals"
outdir = "/usr/local/WSClients/OAIBatch/data/" + datacentre + "/discovery"
filenames = os.listdir(indir)
for filename in filenames:
    original_filename = indir + "/" + filename
    if filename.find('.xml') == -1:
        # This message used to reference an undefined name and so raised
        # NameError on the first non-xml file.
        print('File %s is not xml format. Not processed' % original_filename)
        continue

    ident = getID(original_filename)
    print("ID extracted from the discovery record = %s" % ident)
    if NDG_dataProvider:
        new_filename = outdir + "/" + ident.replace(":", "__") + ".xml"
    else:
        # Colons and slashes in the id are replaced before it goes in a filename.
        ident = ident.replace(":", "-").replace("/", "-")
        new_filename = outdir + "/" + datacentre_namespace + "__" + datacentre_format + "__" + ident + ".xml"
    print("original file = %s, newfile = %s" % (original_filename, new_filename))

    # Copy in-process rather than through a shell "cp": the identifier comes
    # from the harvested record, so it must never be interpreted by a shell.
    try:
        shutil.copy(original_filename, new_filename)
    except EnvironmentError:
        sys.exit("ERROR: Failed at re-naming file stage")
    numfilesproc += 1
201
#replace any namespace declarations with a standard one which we know works in NDG
indir = "/usr/local/WSClients/OAIBatch/data/" + datacentre + "/discovery"
outdir = "/usr/local/WSClients/OAIBatch/data/" + datacentre + "/discovery_corrected"
filenames = os.listdir(indir)
for filename in filenames:
    if filename.find('.xml') == -1:
        continue
    in_filename = indir + "/" + filename
    corrected_filename = outdir + "/" + filename
    try:
        SchemaNameSpace(in_filename, corrected_filename, datacentre_format)
    except Exception:
        # Still best effort (a bad record does not stop the run), but the
        # reason is now reported instead of being silently discarded.
        # sys.exc_info() is used so the syntax works on old Python 2 too.
        print("ERROR: SchemaNameSpace failed on file %s" % in_filename)
        print("ERROR: reason was: %s" % sys.exc_info()[1])
214
# ingest the datacentres records into eXist db (backups of exist happen nightly).
exist_collection = "/db/discovery/original/" + datacentre_format + "/" + datacentre_namespace
commandline = "$EXIST_HOME/bin/client.sh -c " + exist_collection + " -u admin -P xxxxxx -p " + outdir
# NOTE(review): the command line itself is not printed, presumably because it
# carries the password - confirm before echoing it anywhere.
print("Executing : actual command to ingest into exist db")
status = os.system(commandline)
if status != 0:
    sys.exit("Failed at ingesting into exist db. Datacentre =  %s. Status = %s" % (datacentre, status))
221
#are there any old moles records hanging around.If so, remove.
if os.path.isdir("./DIF2MOLES"):
    commandline = "ls -1 ./DIF2MOLES | xargs -i rm ./DIF2MOLES/{\\}"
    print("Executing : " + commandline)
    status = os.system(commandline)
    if status != 0:
        sys.exit("ERROR: Failed at clearing out DIF2MOLES area.")
else:
    print("No old moles records hanging around")
    # The moles creator output is redirected by the shell into files under
    # ./DIF2MOLES, so the directory has to exist before that stage runs.
    try:
        os.mkdir("./DIF2MOLES")
    except OSError:
        sys.exit("ERROR: Failed at creating DIF2MOLES area.")
233
def _shell_quote(text):
    """Return text wrapped in single quotes so the shell sees one literal word."""
    return "'" + text.replace("'", "'\\''") + "'"

# Then run the minimum moles creator for each discovery record
# Put records in ./DIF2MOLES with same filename
filenames = os.listdir(outdir)
for filename in filenames:
    if filename.find('.xml') == -1:
        continue
    original_filename = outdir + "/" + filename
    ident = getID(original_filename)
    if NDG_dataProvider:
        newident = ident.replace(":", "__")
        print("identifier is %s" % newident)
        # The local id is whatever follows the second "__" separator.
        molesLocalID = newident.split("__", 2)[2]
    else:
        molesLocalID = ident
    print("molesLocalID is %s" % molesLocalID)

    # Identifiers and filenames originate from harvested records, so they are
    # quoted before being placed on the shell command line.
    commandline = ("java -jar D2B/d2boneoff.jar repositoryID " + datacentre_namespace +
                   " repositoryLocalID " + datacentre +
                   " format " + datacentre_format +
                   " repository xmldb:exist://glue.badc.rl.ac.uk:8080/exist/xmlrpc userpw xxxxxx targetCollection /db/discovery/original/" +
                   datacentre_format + "/" + datacentre_namespace +
                   " inputRecordID " + _shell_quote(ident) +
                   " outputLocalID " + _shell_quote(molesLocalID) +
                   " > " + _shell_quote("./DIF2MOLES/" + filename))
    print("Executing command to run d2boneoff.jar")
    status = os.system(commandline)

    # os.system() returns a wait status on Unix, not the exit code itself, so
    # decode it before comparing against the "record not found" code of 10.
    if os.WIFEXITED(status):
        exit_code = os.WEXITSTATUS(status)
    else:
        exit_code = -1
    if exit_code == 10:
        print("WARNING: couldn't find the record")
    elif exit_code != 0:
        print("ERROR: couldn't create the minimum moles records")
        # Previously sys.exit was named but never called, so the run carried on.
        sys.exit("ERROR: d2boneoff.jar failed on %s with status %s" % (filename, status))
#There should be some records now
if not os.path.isdir("./DIF2MOLES"):
    print("ERROR: couldn't create any minimum moles records for %s" % datacentre)
    # Exit non-zero: this is a failure, not a normal end of run.
    sys.exit(1)
264
#Add keywords if necessary
if datacentre_groups != "":
    keywordAdder.main('./DIF2MOLES', './FINALMOLES', datacentre_groups)
else:
    # No groups configured: the records are simply moved across to FINALMOLES.
    commandline = "ls -1 ./DIF2MOLES/ | xargs -i mv ./DIF2MOLES/{\\} ./FINALMOLES/"
    print("Executing : " + commandline)
    status = os.system(commandline)
    if status != 0:
        sys.exit("Failed at moving MOLES to FINAL directory")
274
# ingest the created discovery minimum moles records into eXist db.
moles_source_dir = "./FINALMOLES"
commandline = "$EXIST_HOME/bin/client.sh -c /db/discovery/moles -u admin -P xxxxxx -p " + moles_source_dir
print("Executing : actual command to ingest into exist db")
status = os.system(commandline)
if status != 0:
    sys.exit("Failed at ingesting into exist db. Datacentre =  %s. Status = %s" % (datacentre, status))

#Extract the spatiotemporal info from created moles and put in Postgres db
SpaceTimeIngestFromMOLES.main(moles_source_dir)
284
#Make copies of the oai/originals, discovery and FINALMOLES areas to backup area for tape backups
# Each entry is (backup directory suffix, directory to list, prefix put in
# front of each listed name when copying).
data_home = "/usr/local/WSClients/OAIBatch/data/" + datacentre
backup_jobs = (
    ("_originals", data_home + "/oai/originals/", data_home + "/oai/originals/"),
    ("_discovery", data_home + "/discovery/", data_home + "/discovery/"),
    ("_FINALMOLES", "./FINALMOLES", "./FINALMOLES/"),
)
for backup_suffix, listed_dir, copy_prefix in backup_jobs:
    # One backup directory per area, named from the datacentre and run time.
    this_backupdir = backupdir + datacentre + "_" + date_string + backup_suffix
    commandline = "mkdir " + this_backupdir
    print("Executing : " + commandline)
    status = os.system(commandline)
    if status != 0:
        sys.exit("Failed at creating backup directory %s" % this_backupdir)

    commandline = "ls -1 " + listed_dir + " | xargs -i cp " + copy_prefix + "{\\} " + this_backupdir
    print("Executing : " + commandline)
    status = os.system(commandline)
    if status != 0:
        sys.exit("Failed at copying to backup directory %s" % this_backupdir)
324
#Clear out the original harvest records area and FINALMOLES
commandline = "ls -1 ./FINALMOLES | xargs -i rm ./FINALMOLES/{\\}"
print("Executing : " + commandline)
status = os.system(commandline)
if status != 0:
    # Report the area that actually failed (this used to name harvest_home).
    sys.exit("Failed at clearing out FINALMOLES area ./FINALMOLES")

commandline = "ls -1 " + harvest_home + " | xargs -i rm " + harvest_home + "/{\\}"
print("Executing : " + commandline)
status = os.system(commandline)
if status != 0:
    sys.exit("Failed at clearing out original harvest records area %s" % harvest_home)
337
338
339
340#remove the DIF2MOLES directory
341##commandline = "rmdir ./DIF2MOLES"
342#print "Executing : " + commandline
343#status = os.system(commandline)
344#if status !=0:
345#    sys.exit("Failed at removing DIF2MOLES directory %s" %harvest_home)
346
# End of run summary: number of records renamed and the last command status.
separator = "======================================================"
print(separator)
print("No. of files pre-processed = %s" % numfilesproc)
if status != 0:
    print("Procedure oai_ingest.py FAILED with status %s" % status)
else:
    print(" Procedure oai_ingest.py ran to end")

print(separator)
Note: See TracBrowser for help on using the repository browser.