source: TI01-discovery/trunk/ingestAutomation/OAIBatch/oai_ingest.py @ 2324

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/TI01-discovery/trunk/ingestAutomation/OAIBatch/oai_ingest.py@2324
Revision 2324, 15.5 KB checked in by selatham, 13 years ago (diff)

New SchemaNameSpace corrector

  • Property svn:executable set to *
Line 
1#!/usr/bin/env python
2""" Script oai_ingest.py takes parameter <datacentre>.
3The /usr/local/WSClients/OAIBatch directory contains:-
4 - this python script,
5 - a DataProvider specific config file,
6 - the d2b.jar moles creator class which creates moles discovery records,
7 - the python module for extracting spatiotemporal information and adding to postgres db.
8Under this directory the following structure should be maintained:
9 ./data
10 - /DATACENTRE/
11                - discovery/:         Re-named documents ready to ingest in the discovery service.
12                - oai/difYYYYMMDD/    Documents as harvested from OAI
13 Where  /DATACENTRE  varies for the different data providers
14"""
15#History:
16# 12/05/06 SEL spelling correction
17# 30/05/06 SEL cope with many files for processing."Argument list too long" problem.
18# 31/05/06 SEL need to use passwords now. (replace xxxxxx in real version).
19# 16/10/06 SEL Changed to using python oaiClean.py module instead of java code.
20# 16/10/06 SEL exist db upgraded and deployed at different location, java upgrade.
21# 17/10/06 SEL cope with different discovery formats - not just DIF.
22# 23/10/06 SEL keywords not mandatory in config file.
23# 24/10/06 SEL fix bug where 'in' directory not being cleared initially. display more information when running.
24
import commands
import os
import string
import sys

import keywordAdder
# Imported as a plain module: the script calls SchemaNameSpace.main(...).
# (The previous "import X from X" form was a syntax error.)
import SchemaNameSpace
import SpaceTimeIngestFromMOLES
from DIF import DIF
from MDIP import MDIP
34
def getID(filename):
    ''' Gets the identifier out of an input metadata xml record. Copes with DIF and MDIP currently.

    filename -- path of the metadata xml record to read.

    Returns the entryID attribute of the parsed DIF record, or the id attribute
    of the parsed MDIP record, depending on the module-level datacentre_format.
    Exits the script via sys.exit if datacentre_format is neither "DIF" nor "MDIP".
    '''
    # open() + try/finally so the handle is always closed promptly; the old
    # file(filename).read() never closed it explicitly.
    record = open(filename)
    try:
        xml = record.read()
    finally:
        record.close()

    if datacentre_format == "DIF":
        return DIF(xml).entryID
    if datacentre_format == "MDIP":
        return MDIP(xml).id
    sys.exit("Only handles DIF or MDIP here.")
47
# Run state: last shell status and count of pre-processed files.
status = numfilesproc = 0

# Per-datacentre settings; filled in from the datacentre config file.
harvest_home = datacentre_groups = datacentre_format = datacentre_namespace = ""
NDG_dataProvider = False

# The datacentre name is the single mandatory command-line parameter.
if len(sys.argv) < 2:
    print("<datacentre>  parameter not supplied.")
    sys.exit()

datacentre = sys.argv[1]
61
# Timestamp (yymmdd_HHMM, as printed by the system `date` command); used
# further down to name the backup directories.
date_string = commands.getoutput("date +'%y%m%d_%H%M'")

# Environment variables for the commands launched later through os.system.
for env_name, env_value in (
    ('EXIST_HOME', '/usr/local/exist-client'),
    ('JAVA_HOME', '/usr/java/jdk1.5.0_03'),
    ('PATH', ':/usr/java/jdk1.5.0_03/bin:/usr/java/jdk1.5.0_03:/usr/java/jdk1.5.0_03/lib/tools.jar:/usr/local/WSClients/OAIBatch:/usr/local/exist-client/bin:/bin:/usr/bin:.'),
    ('CLASSPATH', '.:/usr/java/jdk1.5.0_03/lib/tools.jar'),
):
    os.putenv(env_name, env_value)
68
# Get the harvested records directory and groups for this datacentre from the datacentre specific config file.
# The harvested records directory depends on the datacentre's OAI base url, the set and format. These have to be known up-front.
# The groups denote which 'portal groups' they belong to - for limiting searches to say NERC-only datacentres records.
# Groups are added to the intermediate MOLES when it is created.
#
# Each non-blank line is "<key> <value...>", split on whitespace. Recognised keys:
#   host_path        -> harvest_home          (first value)
#   groups           -> datacentre_groups     (list of all values)
#   format           -> datacentre_format     (first value)
#   namespace        -> datacentre_namespace  (first value)
#   NDG_dataProvider -> NDG_dataProvider set to True (presence is enough; any value is ignored)
datacentre_config_filename = "/usr/local/WSClients/OAIBatch/" + datacentre + "_config.properties"
print("Datacentre config file = %s" % datacentre_config_filename)
datacentre_config_file = open(datacentre_config_filename, "r")
try:
    for line in datacentre_config_file:
        words = line.split()
        if not words:
            continue
        key = words[0]
        if key == 'NDG_dataProvider':
            NDG_dataProvider = True
        elif key == 'groups':
            datacentre_groups = words[1:]
        elif len(words) < 2:
            # A key with no value used to raise IndexError here. Skip it so
            # that the "Failed at ..." checks on the still-empty setting
            # report the problem instead.
            continue
        elif key == 'host_path':
            harvest_home = words[1]
        elif key == 'format':
            datacentre_format = words[1]
        elif key == 'namespace':
            datacentre_namespace = words[1]
finally:
    # Close the config file even if parsing raises.
    datacentre_config_file.close()
93
# Report what was read from the config file. host_path, format and namespace
# are mandatory (the script stops without them); groups are optional.
if harvest_home == "":
    sys.exit("Failed at getting harvested records directory stage. datacentre config file tried = %s" % datacentre_config_filename)
print("INFO: harvested records are in %s" % harvest_home)

if datacentre_groups != "":
    print("INFO: datacentre groups/keywords = %s" % datacentre_groups)
else:
    print("INFO: No groups/keywords set for datacentre %s" % datacentre)

if datacentre_format == "":
    sys.exit("Failed at stage: getting datacentre format. datacentre config file tried = %s" % datacentre_config_filename)
print("INFO: format being harvested = %s" % datacentre_format)

if datacentre_namespace == "":
    sys.exit("Failed at stage: getting datacentre namespace. datacentre config file tried = %s" % datacentre_config_filename)
print("INFO: datacentre namespace = %s" % datacentre_namespace)

# Nothing to do when the harvest directory is empty.
if not os.listdir(harvest_home):
    print("Nothing to harvest this time from %s" % datacentre)
    sys.exit()
118
# Staging area for the tape backup (should already exist).
backupdir = '/disks/glue1/oaiBackup/'

# The 'in' directory holds a pristine copy of the harvested discovery records.
# Empty it if it is already there, otherwise create it.
originals_dir = "/usr/local/WSClients/OAIBatch/data/" + datacentre + "/oai/originals"
if not os.path.isdir(originals_dir):
    commandline = "mkdir " + originals_dir
else:
    # ls | xargs (one file per command) copes with very many files.
    commandline = "ls -1 " + originals_dir + "/ | xargs -i rm " + originals_dir + "/{\\}"
print("Executing : " + commandline)
status = os.system(commandline)

if status != 0:
    sys.exit("Failed at creating copy dir stage")

# Take the pristine copy, again one file at a time so a large harvest
# does not overflow the argument list.
commandline = "ls -1 " + harvest_home + "/ | xargs -i cp " + harvest_home + "/{\\} " + originals_dir
print("Executing : " + commandline)
status = os.system(commandline)
if status != 0:
    sys.exit("Failed at making pristine copy stage")
142
# Create/clear the two 'out' directories:
#   discovery            - processed (renamed) copy of the discovery records
#   discovery_corrected  - namespace corrected copy of the discovery records
# Each directory is emptied if it already exists and created otherwise.
# (The second stanza used to empty 'discovery' again instead of
# 'discovery_corrected', so stale corrected records were never removed.)
for workdir in ("/usr/local/WSClients/OAIBatch/data/" + datacentre + "/discovery",
                "/usr/local/WSClients/OAIBatch/data/" + datacentre + "/discovery_corrected"):
    if os.path.isdir(workdir):
        commandline = "ls -1 " + workdir + "/ | xargs -i rm " + workdir + "/{\\}"
    else:
        commandline = "mkdir " + workdir
    print("Executing : " + commandline)
    status = os.system(commandline)
    # Check the result here, as is done for the 'originals' directory;
    # previously it was overwritten unchecked by the next command.
    if status != 0:
        sys.exit("Failed at creating/clearing directory %s" % workdir)
162
# config.properties must describe the datacentre being processed (it holds the
# location of that datacentre's harvested records), so overwrite it with the
# datacentre specific version.
commandline = "cp /usr/local/WSClients/OAIBatch/%s_config.properties /usr/local/WSClients/OAIBatch/config.properties" % datacentre
print("Executing : " + commandline)
status = os.system(commandline)
if status != 0:
    sys.exit("Failed at copying config file stage")

# Work from the directory that holds the code; later steps use relative paths.
os.chdir('/usr/local/WSClients/OAIBatch')
173
174
#Process/rename the files (changed 08/01/07 to get id from inside file).
# Every .xml file in the 'originals' directory is copied into the 'discovery'
# directory under a name built from the identifier found inside the record:
#   NDG data providers : <ident with ':' replaced by '__'>.xml
#   everyone else      : <namespace>__<format>__<ident>.xml
indir = "/usr/local/WSClients/OAIBatch/data/" + datacentre + "/oai/originals"
outdir = "/usr/local/WSClients/OAIBatch/data/" + datacentre + "/discovery"
filenames = os.listdir(indir)
for filename in filenames:
    if filename.find('.xml') == -1:
        # This message used to reference an undefined name (full_filename),
        # so any non-xml file aborted the run with a NameError.
        print('File %s is not xml format. Not processed' % filename)
        continue

    original_filename = indir + "/" + filename
    ident = getID(original_filename)
    print("ID extracted from the discovery record = %s" % ident)
    if NDG_dataProvider:
        new_filename = outdir + "/" + ident.replace(":", "__") + ".xml"
    else:
        new_filename = outdir + "/" + datacentre_namespace + "__" + datacentre_format + "__" + ident + ".xml"
    print("original file = %s, newfile = %s" % (original_filename, new_filename))

    # NOTE(review): ident comes from the harvested record and is passed to the
    # shell unquoted - identifiers containing spaces or shell metacharacters
    # will break (or alter) this command.
    commandline = "cp " + original_filename + " " + new_filename
    status = os.system(commandline)
    if status != 0:
        sys.exit("Failed at re-naming file stage")
    numfilesproc += 1
198
# Load the renamed records (everything under outdir) into the eXist db
# collection for this format/namespace (backups of exist happen nightly).
# The command itself is not echoed because it carries the password.
commandline = "$EXIST_HOME/bin/client.sh -c /db/discovery/original/%s/%s -u admin -P xxxxxx -p %s" % (
    datacentre_format, datacentre_namespace, outdir)
print("Executing : actual command to ingest into exist db")
status = os.system(commandline)
if status != 0:
    sys.exit("Failed at ingesting into exist db. Datacentre =  %s. Status = %s" % (datacentre, status))
205
#replace any namespace declarations with a standard one which we know works in NDG
# Every .xml file in 'discovery' is passed through SchemaNameSpace.main, which
# is given the path to write under 'discovery_corrected'. This is best effort
# per file: a failure is reported and the loop moves on to the next record.
indir = "/usr/local/WSClients/OAIBatch/data/" + datacentre + "/discovery"
outdir = "/usr/local/WSClients/OAIBatch/data/" + datacentre + "/discovery_corrected"
filenames = os.listdir(indir)
for filename in filenames:
    if filename.find('.xml') == -1:
        continue
    in_filename = indir + "/" + filename
    corrected_filename = outdir + "/" + filename
    try:
        SchemaNameSpace.main(in_filename, corrected_filename, datacentre_format)
    except KeyboardInterrupt:
        # Let the operator stop the run; the old bare except swallowed Ctrl-C.
        raise
    except (Exception, SystemExit):
        # SystemExit is still caught so the per-file best effort is kept even
        # if the helper bails out through sys.exit (as getID in this script does).
        print("SchemaNameSpace failed on file %s" % in_filename)
        # Say why it failed; previously the cause was discarded.
        print("    cause: %s" % (sys.exc_info()[1],))
218
#Remove any moles records left in ./DIF2MOLES by an earlier run.
if not os.path.exists("./DIF2MOLES"):
    print("No old moles records hanging around")
else:
    commandline = "ls -1 ./DIF2MOLES | xargs -i rm ./DIF2MOLES/{\\}"
    print("Executing : " + commandline)
    status = os.system(commandline)
    if status != 0:
        sys.exit("Failed at clearing out DIF2MOLES area.")
230
# Then run the minimum moles creator for each discovery record
# Put records in ./DIF2MOLES with original filename
# The records are read from outdir as left by the preceding step.
filenames = os.listdir(outdir)
for filename in filenames:
    if filename.find('.xml') == -1:
        continue
    original_filename = outdir + "/" + filename
    ident = getID(original_filename)
    if NDG_dataProvider:
        newident = ident.replace(":", "__")
        print("identifier is %s" % newident)
        # Third '__'-separated part onwards; needs at least three parts,
        # otherwise this raises IndexError.
        molesLocalID = newident.split("__", 2)[2]
    else:
        molesLocalID = ident
    print("molesLocalID is %s" % molesLocalID)
    # NOTE(review): ident and filename are passed to the shell unquoted.
    commandline = ("java -jar D2B/d2boneoff.jar repositoryID " + datacentre_namespace
                   + " repositoryLocalID " + datacentre
                   + " format " + datacentre_format
                   + " repository xmldb:exist://glue.badc.rl.ac.uk:8080/exist/xmlrpc userpw xxxxxx targetCollection /db/discovery/original/"
                   + datacentre_format + "/" + datacentre_namespace
                   + " inputRecordID " + ident
                   + " outputLocalID " + molesLocalID
                   + " > ./DIF2MOLES/" + filename)
    print("Executing command to run d2boneoff.jar")
    status = os.system(commandline)
    # On Unix os.system returns a wait()-encoded status, not the bare exit
    # code, so the old "status==10" test could never match. Decode it first.
    if os.WIFEXITED(status):
        exit_code = os.WEXITSTATUS(status)
    else:
        exit_code = -1  # terminated by a signal: treat as a failure
    if exit_code == 10:
        print("WARNING: couldn't find the record")
    elif exit_code != 0:
        print("ERROR: couldn't create the minimum moles records")
        # Was a bare "sys.exit" (never called), so the run carried on
        # regardless; stop with a non-zero exit status as intended.
        sys.exit(1)
#The ./DIF2MOLES area has to exist by now (only its existence is checked).
if not os.path.exists("./DIF2MOLES"):
    print("ERROR: couldn't create any minimum moles records for %s" % datacentre)
    sys.exit()

#Add keywords if necessary; either way the records end up in ./FINALMOLES.
if datacentre_groups != "":
    # NOTE(review): the keyword list below is hard-coded; datacentre_groups is
    # only used as an on/off switch here - confirm that is intended.
    keywordAdder.main('./DIF2MOLES', './FINALMOLES', ['MDIP', 'http://vocab.ndg.nerc.ac.uk/term/N010/0', 'NDGO0001'])
else:
    # No groups configured: move the records across unchanged.
    commandline = "ls -1 ./DIF2MOLES/ | xargs -i mv ./DIF2MOLES/{\\} ./FINALMOLES/"
    print("Executing : " + commandline)
    status = os.system(commandline)
    if status != 0:
        sys.exit("Failed at moving MOLES to FINAL directory")
271
# Load the finished minimum moles records from ./FINALMOLES into the eXist db.
# The command itself is not echoed because it carries the password.
commandline = "$EXIST_HOME/bin/client.sh -c /db/discovery/moles -u admin -P xxxxxx -p ./FINALMOLES"
print("Executing : actual command to ingest into exist db")
status = os.system(commandline)
if status != 0:
    sys.exit("Failed at ingesting into exist db. Datacentre =  %s. Status = %s" % (datacentre, status))

# Extract the spatiotemporal info from the created moles and put it in the Postgres db.
SpaceTimeIngestFromMOLES.main("./FINALMOLES")
281
#Copy the oai/originals, discovery and FINALMOLES areas to the backup area for tape backups.
# Each area gets its own directory: <backupdir><datacentre>_<date_string><suffix>
data_root = "/usr/local/WSClients/OAIBatch/data/" + datacentre
backup_jobs = (
    # (directory suffix, argument given to ls, directory the files are copied from)
    ("_originals", data_root + "/oai/originals/", data_root + "/oai/originals"),
    ("_discovery", data_root + "/discovery/", data_root + "/discovery"),
    ("_FINALMOLES", "./FINALMOLES", "./FINALMOLES"),
)
for suffix, listing_target, source_dir in backup_jobs:
    this_backupdir = backupdir + datacentre + "_" + date_string + suffix
    commandline = "mkdir " + this_backupdir
    print("Executing : " + commandline)
    status = os.system(commandline)
    if status != 0:
        sys.exit("Failed at creating backup directory %s" % this_backupdir)

    # One cp per file via xargs, so large directories are not a problem.
    commandline = "ls -1 " + listing_target + " | xargs -i cp " + source_dir + "/{\\} " + this_backupdir
    print("Executing : " + commandline)
    status = os.system(commandline)
    if status != 0:
        sys.exit("Failed at copying to backup directory %s" % this_backupdir)
321
#Clear out the original harvest records area and the FINALMOLES area
commandline = "ls -1 " + harvest_home + " | xargs -i rm " + harvest_home + "/{\\}"
print("Executing : " + commandline)
status = os.system(commandline)
if status != 0:
    sys.exit("Failed at clearing out original harvest records area %s" % harvest_home)

commandline = "ls -1 ./FINALMOLES | xargs -i rm ./FINALMOLES/{\\}"
print("Executing : " + commandline)
status = os.system(commandline)
if status != 0:
    # Report the directory that actually failed; this message used to print
    # harvest_home, which pointed at the wrong area.
    sys.exit("Failed at clearing out FINALMOLES area %s" % "./FINALMOLES")
334
# The ./DIF2MOLES directory itself is not removed (the rmdir step is disabled).

# Closing summary: number of records renamed and the status of the last command.
banner = "======================================================"
print(banner)
print("No. of files pre-processed = %s" % numfilesproc)
if status != 0:
    print("Procedure oai_ingest.py FAILED with status %s" % status)
else:
    print(" Procedure oai_ingest.py ran to end")

print(banner)
Note: See TracBrowser for help on using the repository browser.