source: TI01-discovery/trunk/ingestAutomation/OAIBatch/oai_ingest.py @ 2329

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/TI01-discovery/trunk/ingestAutomation/OAIBatch/oai_ingest.py@2329
Revision 2329, 15.6 KB checked in by selatham, 13 years ago (diff)

Using keywordAdder correctly. sorted spatiotemporal null problem.

  • Property svn:executable set to *
RevLine 
[721]1#!/usr/bin/env python
2""" Script oai_ingest.py takes parameter <datacentre>.
The /usr/local/WSClients/OAIBatch directory contains:-
 - this python script, plus some other modules for parts of the process.
 - a DataProvider specific config file,
 - the d2b.jar moles creator class which creates moles discovery records,
 - the python module for extracting spatiotemporal information and adding to postgres db.
Under this directory the following structure should be maintained:
 ./data
 - /DATACENTRE/
        - discovery/:           Re-named documents.
        - discovery_corrected/: Documents with schema namespaces corrected, ready to ingest in the discovery service.
        - oai/difYYYYMMDD/:     Documents as harvested from OAI
 Where  /DATACENTRE  varies for the different data providers
15"""
[916]16#History:
17# 12/05/06 SEL spelling correction
[1066]18# 30/05/06 SEL cope with many files for processing."Argument list too long" problem.
[1076]19# 31/05/06 SEL need to use passwords now. (replace xxxxxx in real version).
[1591]20# 16/10/06 SEL Changed to using python oaiClean.py module instead of java code.
21# 16/10/06 SEL exist db upgraded and deployed at different location, java upgrade.
[1593]22# 17/10/06 SEL cope with different discovery formats - not just DIF.
[1619]23# 23/10/06 SEL keywords not mandatory in config file.
[1620]24# 24/10/06 SEL fix bug where 'in' directory not being cleared initially. display more information when running.
[916]25
[721]26import os
27import sys
28import commands
29import string
[1898]30import SpaceTimeIngestFromMOLES
[2088]31import keywordAdder
[2329]32from SchemaNameSpace import SchemaNameSpace
[2252]33from DIF import DIF
34from MDIP import MDIP
[721]35
def getID(filename, record_format=None):
    '''Return the identifier held in a discovery metadata xml record.

    Copes with DIF and MDIP records currently.

    filename      -- path of the xml record to read.
    record_format -- "DIF" or "MDIP". Optional: when omitted, the module-level
                     datacentre_format setting is used, exactly as before.

    Returns the entryID of a DIF record or the id of an MDIP record.
    Ends the script (SystemExit) for any other format.
    '''
    if record_format is None:
        record_format = datacentre_format

    # Read through an explicit handle and close it straight away: this is
    # called once per harvested record, so handles must not be left for the
    # garbage collector. try/finally keeps this valid on old interpreters.
    record = open(filename)
    try:
        xml = record.read()
    finally:
        record.close()

    if record_format == "DIF":
        return DIF(xml).entryID
    if record_format == "MDIP":
        return MDIP(xml).id
    sys.exit("Only handles DIF or MDIP here.")
48
# Run-wide state. The settings below are filled in further down from the
# command line and from the datacentre specific config file.
status = 0
numfilesproc = 0
harvest_home = ""
datacentre_groups = ""
datacentre_format = ""
datacentre_namespace = ""
NDG_dataProvider = False

if len(sys.argv) < 2:
    # A missing argument is a usage error: hand the message to sys.exit() so
    # it goes to stderr and the exit status is non-zero, like every other
    # failure path in this script.
    sys.exit("<datacentre>  parameter not supplied.")
datacentre = sys.argv[1]

# Other settings and constants
# Timestamp of this run, formatted yymmdd_HHMM by the date command.
date_string = commands.getoutput("date +'%y%m%d_%H%M'")

# Environment for the external commands started later with os.system().
# Assigning to os.environ (rather than calling os.putenv) exports the values
# to child processes AND keeps os.environ itself in step.
os.environ['EXIST_HOME'] = '/usr/local/exist-client'
os.environ['JAVA_HOME'] = '/usr/java/jdk1.5.0_03'
os.environ['PATH'] = ':/usr/java/jdk1.5.0_03/bin:/usr/java/jdk1.5.0_03:/usr/java/jdk1.5.0_03/lib/tools.jar:/usr/local/WSClients/OAIBatch:/usr/local/exist-client/bin:/bin:/usr/bin:.'
os.environ['CLASSPATH'] = '.:/usr/java/jdk1.5.0_03/lib/tools.jar'
[721]69
# Get the harvested records directory and groups for this datacentre from the datacentre specific config file
# The harvested records directory depends on the datacentres OAI base url, the set and  format. These have to be known up-front.
# The groups denote which 'portal groups' they belong to - for limiting searches to say NERC-only datacentres records.
# Groups are added to the intermediate MOLES when it is created.
datacentre_config_filename = "/usr/local/WSClients/OAIBatch/" + datacentre + "_config.properties"
print("Datacentre config file = %s" % datacentre_config_filename)
datacentre_config_file = open(datacentre_config_filename, "r")

# Each config line is "<key> <value> [<value> ...]", split on whitespace.
# try/finally makes sure the file is closed even if reading fails part way.
try:
    for line in datacentre_config_file:
        words = line.split()
        if not words:
            continue
        key = words[0]
        values = words[1:]
        if key == 'NDG_dataProvider':
            # The mere presence of this key switches the flag on.
            NDG_dataProvider = True
        elif key == 'groups':
            datacentre_groups = values
        elif not values:
            # A key without a value used to raise IndexError here. Skip it
            # instead: the setting stays empty and the checks below report
            # which mandatory item is missing.
            continue
        elif key == 'host_path':
            harvest_home = values[0]
        elif key == 'format':
            datacentre_format = values[0]
        elif key == 'namespace':
            datacentre_namespace = values[0]
finally:
    datacentre_config_file.close()

# host_path, format and namespace are mandatory; groups are optional.
if harvest_home == "":
    sys.exit("Failed at getting harvested records directory stage. datacentre config file tried = %s" % datacentre_config_filename)
else:
    print("INFO: harvested records are in %s" % harvest_home)

if datacentre_groups == "":
    print("INFO: No groups/keywords set for datacentre %s" % datacentre)
else:
    print("INFO: datacentre groups/keywords = %s" % datacentre_groups)

if datacentre_format == "":
    sys.exit("Failed at stage: getting datacentre format. datacentre config file tried = %s" % datacentre_config_filename)
else:
    print("INFO: format being harvested = %s" % datacentre_format)

if datacentre_namespace == "":
    sys.exit("Failed at stage: getting datacentre namespace. datacentre config file tried = %s" % datacentre_config_filename)
else:
    print("INFO: datacentre namespace = %s" % datacentre_namespace)

#any records to harvest?
if not os.listdir(harvest_home):
    # Nothing to do is not an error, so this exits with status 0.
    print("Nothing to harvest this time from %s" % datacentre)
    sys.exit()
[721]119
# The directory to put things for a tape backup (should already exist)
backupdir = '/disks/glue1/oaiBackup/'


def _create_or_clear_dir(dirpath):
    '''Make sure dirpath exists and holds no files from a previous run.

    An existing directory is emptied file by file (ls piped through xargs, to
    cope with there being lots of files); a missing one is created with mkdir.
    Returns the os.system() status of the command that was run.
    '''
    if os.path.isdir(dirpath):
        commandline = "ls -1 " + dirpath + "/ | xargs -i rm " + dirpath + "/{\}"
    else:
        commandline = "mkdir " + dirpath
    print("Executing : " + commandline)
    return os.system(commandline)


_datacentre_dir = "/usr/local/WSClients/OAIBatch/data/" + datacentre

# Create/clear the 'in' directory pristine copy of the discovery records
status = _create_or_clear_dir(_datacentre_dir + "/oai/originals")
if status != 0:
    sys.exit("Failed at creating copy dir stage")

# make the 'in' pristine copy. Cope with there being lots of files in the directory.
commandline = "ls -1 " + harvest_home + "/ | xargs -i cp " + harvest_home + "/{\} " + _datacentre_dir + "/oai/originals"
print("Executing : " + commandline)
status = os.system(commandline)
if status != 0:
    sys.exit("Failed at making pristine copy stage")

# Create/clear the directory for the 'out' processed copy of the discovery records.
# The status is checked here: it used to be silently overwritten by the next step.
status = _create_or_clear_dir(_datacentre_dir + "/discovery")
if status != 0:
    sys.exit("Failed at creating/clearing discovery dir stage")

# Create/clear the directory for the 'out' namespace corrected copy of the discovery records.
# This now clears discovery_corrected itself. It used to clear discovery/ a
# second time, leaving corrected records from earlier runs in place.
status = _create_or_clear_dir(_datacentre_dir + "/discovery_corrected")
if status != 0:
    sys.exit("Failed at creating/clearing discovery_corrected dir stage")

# The file config.properties contains the location of the particular datacentres harvested records.
# Copy the datacentre specific version of config to config.properties file.
commandline = "cp /usr/local/WSClients/OAIBatch/" + datacentre + "_config.properties /usr/local/WSClients/OAIBatch/config.properties"
print("Executing : " + commandline)
status = os.system(commandline)
if status != 0:
    sys.exit("Failed at copying config file stage")
171
#Change os directory to that with the code in it.
os.chdir('/usr/local/WSClients/OAIBatch')

#Execute the script which processes/renames the files (changed 08/01/07 to get id from inside file)
# Each xml record in oai/originals is copied into discovery/ under a name
# built from the identifier found inside the record.
indir = "/usr/local/WSClients/OAIBatch/data/" + datacentre + "/oai/originals"
outdir = "/usr/local/WSClients/OAIBatch/data/" + datacentre + "/discovery"
filenames = os.listdir(indir)
for filename in filenames:
    if filename.find('.xml') == -1:
        # This message used to reference an undefined name (full_filename)
        # and so raised NameError on the first non-xml file.
        print('File %s is not xml format. Not processed' % filename)
        continue

    original_filename = indir + "/" + filename
    ident = getID(original_filename)
    print("ID extracted from the discovery record = %s" % ident)
    if NDG_dataProvider:
        # NDG identifiers contain ':' separators; use '__' in file names.
        new_filename = outdir + "/" + ident.replace(":", "__") + ".xml"
    else:
        new_filename = outdir + "/" + datacentre_namespace + "__" + datacentre_format + "__" + ident + ".xml"
    print("original file = %s, newfile = %s" % (original_filename, new_filename))

    # NOTE(review): ident comes from the harvested record and is pasted
    # unquoted into a shell command; spaces or shell metacharacters in an
    # identifier would break (or alter) this copy.
    commandline = "cp " + original_filename + " " + new_filename
    status = os.system(commandline)
    if status != 0:
        sys.exit("Failed at re-naming file stage")
    numfilesproc += 1
199
#replace any namespace declarations with a standard one which we know works in NDG
# Reads each renamed record from discovery/ and hands it to SchemaNameSpace
# together with the target path in discovery_corrected/.
indir = "/usr/local/WSClients/OAIBatch/data/" + datacentre + "/discovery"
outdir = "/usr/local/WSClients/OAIBatch/data/" + datacentre + "/discovery_corrected"
filenames = os.listdir(indir)
for filename in filenames:
    if filename.find('.xml') == -1:
        continue
    in_filename = indir + "/" + filename
    corrected_filename = outdir + "/" + filename
    try:
        SchemaNameSpace(in_filename, corrected_filename, datacentre_format)
    except Exception:
        # Best effort: one bad record must not stop the batch. Catching
        # Exception (not a bare except) lets Ctrl-C and sys.exit() through,
        # and the reason is now reported instead of being discarded.
        print("SchemaNameSpace failed on file %s" % in_filename)
        print("SchemaNameSpace error was: %s" % sys.exc_info()[1])
212
# Load this datacentre's records from outdir into the eXist db collection for
# its format and namespace (backups of exist happen nightly).
exist_collection = "/db/discovery/original/" + datacentre_format + "/" + datacentre_namespace
commandline = "$EXIST_HOME/bin/client.sh -c " + exist_collection + " -u admin -P xxxxxx -p " + outdir
# Only a fixed message is echoed here, not the command line itself.
print("Executing : actual command to ingest into exist db")
status = os.system(commandline)
if status != 0:
    sys.exit("Failed at ingesting into exist db. Datacentre =  %s. Status = %s" % (datacentre, status))

# Any moles records left behind in ./DIF2MOLES by an earlier run are removed.
if not os.path.exists("./DIF2MOLES"):
    print("No old moles records hanging around")
else:
    commandline = "ls -1 ./DIF2MOLES | xargs -i rm ./DIF2MOLES/{\}"
    print("Executing : " + commandline)
    status = os.system(commandline)
    if status != 0:
        sys.exit("Failed at clearing out DIF2MOLES area.")
231
# Then run the minimum moles creator for each discovery record
# Put records in ./DIF2MOLES with original filename
filenames = os.listdir(outdir)
for filename in filenames:
    if filename.find('.xml') == -1:
        continue
    original_filename = outdir + "/" + filename
    ident = getID(original_filename)
    if NDG_dataProvider:
        newident = ident.replace(":", "__")
        print("identifier is %s" % newident)
        # The local id is whatever follows the second '__' separator.
        molesLocalID = newident.split("__", 2)[2]
    else:
        molesLocalID = ident
    print("molesLocalID is %s" % molesLocalID)
    commandline = ("java -jar D2B/d2boneoff.jar repositoryID " + datacentre_namespace
                   + " repositoryLocalID " + datacentre
                   + " format " + datacentre_format
                   + " repository xmldb:exist://glue.badc.rl.ac.uk:8080/exist/xmlrpc userpw xxxxxx targetCollection /db/discovery/original/"
                   + datacentre_format + "/" + datacentre_namespace
                   + " inputRecordID " + ident
                   + " outputLocalID " + molesLocalID
                   + " > ./DIF2MOLES/" + filename)
    print("Executing command to run d2boneoff.jar")
    status = os.system(commandline)
    # os.system() returns a wait status, not the exit code, so the code has
    # to be decoded before comparing it with 10 ("record not found").
    # Comparing the raw status with 10 could never match.
    if os.WIFEXITED(status) and os.WEXITSTATUS(status) == 10:
        print("WARNING: couldn't find the record")
    elif status != 0:
        # This used to name sys.exit without calling it, so the failure was
        # printed and then ignored.
        sys.exit("ERROR: couldn't create the minimum moles records")

#There should be some records now
if not os.path.exists("./DIF2MOLES"):
    # Exit through sys.exit(message) so this failure gives a non-zero status.
    sys.exit("ERROR: couldn't create any minimum moles records for %s" % datacentre)
[1768]262
# Fill ./FINALMOLES from ./DIF2MOLES: through keywordAdder when the datacentre
# has groups/keywords configured, otherwise by simply moving the files across.
if datacentre_groups != "":
    keywordAdder.main('./DIF2MOLES', './FINALMOLES', datacentre_groups)
else:
    commandline = "ls -1 ./DIF2MOLES/ | xargs -i mv ./DIF2MOLES/{\} ./FINALMOLES/"
    print("Executing : " + commandline)
    status = os.system(commandline)
    if status != 0:
        sys.exit("Failed at moving MOLES to FINAL directory")

# Load the finished minimum moles records into the eXist db.
commandline = "$EXIST_HOME/bin/client.sh -c /db/discovery/moles -u admin -P xxxxxx -p ./FINALMOLES"
print("Executing : actual command to ingest into exist db")
status = os.system(commandline)
if status != 0:
    sys.exit("Failed at ingesting into exist db. Datacentre =  %s. Status = %s" % (datacentre, status))

# Pull the spatiotemporal info out of the created moles and put it in the Postgres db.
SpaceTimeIngestFromMOLES.main("./FINALMOLES")
[1768]282
# Copy the oai/originals, discovery and FINALMOLES areas into their own dated
# directories under backupdir, ready for the tape backups.
# Each entry is: (backup directory suffix, path handed to ls, directory copied from).
_datacentre_data = "/usr/local/WSClients/OAIBatch/data/" + datacentre
_backup_jobs = (
    ("_originals", _datacentre_data + "/oai/originals/", _datacentre_data + "/oai/originals"),
    ("_discovery", _datacentre_data + "/discovery/", _datacentre_data + "/discovery"),
    ("_FINALMOLES", "./FINALMOLES", "./FINALMOLES"),
)
for _suffix, _listing_path, _source_dir in _backup_jobs:
    this_backupdir = backupdir + datacentre + "_" + date_string + _suffix

    commandline = "mkdir " + this_backupdir
    print("Executing : " + commandline)
    status = os.system(commandline)
    if status != 0:
        sys.exit("Failed at creating backup directory %s" % this_backupdir)

    # ls piped through xargs copies one file at a time, so a very large
    # directory cannot overflow the command line.
    commandline = "ls -1 " + _listing_path + " | xargs -i cp " + _source_dir + "/{\} " + this_backupdir
    print("Executing : " + commandline)
    status = os.system(commandline)
    if status != 0:
        sys.exit("Failed at copying to backup directory %s" % this_backupdir)
322
#Clear out the original harvest records area and FINALMOLES
commandline = "ls -1 " + harvest_home + " | xargs -i rm " + harvest_home + "/{\}"
print("Executing : " + commandline)
status = os.system(commandline)
if status != 0:
    sys.exit("Failed at clearing out original harvest records area %s" % harvest_home)

commandline = "ls -1 ./FINALMOLES | xargs -i rm ./FINALMOLES/{\}"
print("Executing : " + commandline)
status = os.system(commandline)
if status != 0:
    # Report the directory that actually failed; the message used to print
    # the harvest directory here.
    sys.exit("Failed at clearing out FINALMOLES area %s" % "./FINALMOLES")

# The ./DIF2MOLES directory itself is deliberately left in place.

print("======================================================")
print("No. of files pre-processed = %s" % numfilesproc)
if status == 0:
    print(" Procedure oai_ingest.py ran to end")
else:
    print("Procedure oai_ingest.py FAILED with status %s" % status)

print("======================================================")
Note: See TracBrowser for help on using the repository browser.