source: TI01-discovery/trunk/ingestAutomation/OAIBatch/oai_ingest.py @ 2088

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/TI01-discovery/trunk/ingestAutomation/OAIBatch/oai_ingest.py@2088
Revision 2088, 14.2 KB checked in by selatham, 14 years ago (diff)

implementing keywordAdder.

  • Property svn:executable set to *
Line 
1#!/usr/bin/env python
2""" Script oai_ingest.py takes parameter <datacentre>.
3The /usr/local/WSClients/OAIBatch directory contains:-
4 - this python script,
5 - a DataProvider specific config file,
6 - the d2b.jar moles creator class which creates moles discovery records,
7 - the python module for extracting spatiotemporal information and adding to postgres db.
8Under this directory the following structure should be maintained:
9 ./data
10 - /DATACENTRE/
11                - discovery/:         Re-named documents ready to ingest in the discovery service.
12                - oai/difYYYYMMDD/    Documents as harvested from OAI
13 Where  /DATACENTRE  varies for the different data providers
14"""
15#History:
16# 12/05/06 SEL spelling correction
17# 30/05/06 SEL cope with many files for processing."Argument list too long" problem.
18# 31/05/06 SEL need to use passwords now. (replace xxxxxx in real version).
19# 16/10/06 SEL Changed to using python oaiClean.py module instead of java code.
20# 16/10/06 SEL exist db upgraded and deployed at different location, java upgrade.
21# 17/10/06 SEL cope with different discovery formats - not just DIF.
22# 23/10/06 SEL keywords not mandatory in config file.
23# 24/10/06 SEL fix bug where 'in' directory not being cleared initially. display more information when running.
24
25import os
26import sys
27import commands
28import string
29import SpaceTimeIngestFromMOLES
30import keywordAdder
31#import oaiClean
32
# Module-level state shared by the stages of this script.
status = 0                 # status returned by the most recent os.system call
numfilesproc = 0           # number of discovery records renamed ready for ingest
harvest_home = ""          # harvested records directory (config key 'host_path')
datacentre_groups = ""     # list of words from config key 'groups'; "" when absent
datacentre_format = ""     # config key 'format'
datacentre_namespace = ""  # config key 'namespace'
NDG_dataProvider = False   # True when the config has an 'NDG_dataProvider' line

# The datacentre name is the single required command-line argument.
if len(sys.argv) < 2:
    print("<datacentre>  parameter not supplied.")
    # Exit non-zero so a calling shell or cron job can see the run did not happen
    # (a bare sys.exit() reports success).
    sys.exit(1)

datacentre = sys.argv[1]
46
# Timestamp (yymmdd_HHMM) taken once at start-up; used later to name the backup directories.
date_string = commands.getoutput ("date +'%y%m%d_%H%M'")

# Environment handed to the child processes launched through os.system.
for env_name, env_value in (
    ('EXIST_HOME', '/usr/local/exist-client'),
    ('JAVA_HOME', '/usr/java/jdk1.5.0_03'),
    ('PATH', ':/usr/java/jdk1.5.0_03/bin:/usr/java/jdk1.5.0_03:/usr/java/jdk1.5.0_03/lib/tools.jar:/usr/local/WSClients/OAIBatch:/usr/local/exist-client/bin:/bin:/usr/bin:.'),
    ('CLASSPATH', '.:/usr/java/jdk1.5.0_03/lib/tools.jar'),
):
    os.putenv(env_name, env_value)
53
# Get the harvested records directory and groups for this datacentre from the datacentre specific config file.
# The harvested records directory depends on the datacentre's OAI base url, the set and format. These have to be known up-front.
# The groups denote which 'portal groups' they belong to - for limiting searches to say NERC-only datacentres records.
# Groups are added to the intermediate MOLES when it is created.
datacentre_config_filename = "/usr/local/WSClients/OAIBatch/" + datacentre + "_config.properties"
print("Datacentre config file = %s" % datacentre_config_filename)
datacentre_config_file = open(datacentre_config_filename, "r")

# Each line is "<key> <value...>", split on whitespace. try/finally guarantees the
# file is closed even if parsing raises.
try:
    for line in datacentre_config_file:
        words = line.split()
        if not words:
            continue                      # blank line
        key = words[0]
        if key == 'NDG_dataProvider':
            NDG_dataProvider = True       # presence of the key is enough; any value is ignored
        elif key == 'groups':
            datacentre_groups = words[1:] # every remaining word is a group/keyword
        elif key in ('host_path', 'format', 'namespace'):
            if len(words) < 2:
                # Key without a value: keep the "" default so the mandatory-setting
                # checks later in the script report it, instead of an IndexError here.
                continue
            if key == 'host_path':
                harvest_home = words[1]
            elif key == 'format':
                datacentre_format = words[1]
            else:
                datacentre_namespace = words[1]
finally:
    datacentre_config_file.close()
78
# Report the settings read from the config file; host_path, format and namespace
# are mandatory, groups/keywords are optional.
if harvest_home == "":
    sys.exit("Failed at getting harvested records directory stage. datacentre config file tried = %s" % datacentre_config_filename)
print("INFO: harvested records are in %s" % harvest_home)

if datacentre_groups != "":
    print("INFO: datacentre groups/keywords = %s" % datacentre_groups)
else:
    print("INFO: No groups/keywords set for datacentre %s" % datacentre)

if datacentre_format == "":
    sys.exit("Failed at stage: getting datacentre format. datacentre config file tried = %s" % datacentre_config_filename)
print("INFO: format being harvested = %s" % datacentre_format)

if datacentre_namespace == "":
    sys.exit("Failed at stage: getting datacentre namespace. datacentre config file tried = %s" % datacentre_config_filename)
print("INFO: datacentre namespace = %s" % datacentre_namespace)

# Stop quietly when the harvest directory holds nothing to process.
if not os.listdir(harvest_home):
    print("Nothing to harvest this time from %s" % datacentre)
    sys.exit()
103
# Root of the tape-backup area (expected to exist already).
backupdir = '/disks/glue1/oaiBackup/'

# 'in' directory: pristine copy of the discovery records. Empty it when it
# exists, otherwise create it.
originals_dir = "/usr/local/WSClients/OAIBatch/data/" + datacentre +"/oai/originals"
if os.path.isdir(originals_dir):
    # ls | xargs handles one file per invocation, avoiding "Argument list too long".
    commandline = "ls -1 " + originals_dir + "/ | xargs -i rm " + originals_dir + "/{\}"
else:
    commandline = "mkdir " + originals_dir
print("Executing : " + commandline)
status = os.system(commandline)

if status != 0:
    sys.exit("Failed at creating copy dir stage")

# Copy every harvested file into the 'in' directory, again one file at a time.
commandline = "ls -1 " + harvest_home + "/ | xargs -i cp " + harvest_home + "/{\} " + originals_dir
print("Executing : " + commandline)
status = os.system(commandline)
if status != 0:
    sys.exit("Failed at making pristine copy stage")
127
# Create/clear the directory for the 'out' processed copy of the discovery records.
discovery_dir = "/usr/local/WSClients/OAIBatch/data/" + datacentre +"/discovery"
if os.path.isdir(discovery_dir):
    # Remove the files one at a time so a large directory does not overflow the argument list.
    commandline = "ls -1 " + discovery_dir + "/ | xargs -i rm " + discovery_dir + "/{\}"
else:
    commandline = "mkdir " + discovery_dir
print("Executing : " + commandline)
status = os.system(commandline)
# Stop here on failure: stale or missing output files would otherwise be ingested
# or silently skipped by the later stages.
if status != 0:
    sys.exit("Failed at creating/clearing discovery dir stage")
# config.properties must describe the datacentre being processed, so overwrite it
# with this datacentre's own config file.
batch_home = "/usr/local/WSClients/OAIBatch"
commandline = "cp " + batch_home + "/" + datacentre +"_config.properties " + batch_home + "/config.properties"
print("Executing : " + commandline)
status = os.system(commandline)
if status != 0:
    sys.exit("Failed at copying config file stage")

# Work from the directory holding the code; later stages use relative paths.
os.chdir(batch_home)
148
149
# Copy each harvested record from the 'in' directory to the 'out' directory under
# a name built from the entry ID held inside the file (changed 08/01/07 to get id
# from inside file).
indir = "/usr/local/WSClients/OAIBatch/data/" + datacentre +"/oai/originals"
outdir = "/usr/local/WSClients/OAIBatch/data/" + datacentre +"/discovery"
filenames = os.listdir(indir)
for filename in filenames:
    if filename.find('.xml') == -1:
        # Report the skipped file by its own name; the undefined 'full_filename'
        # used previously raised NameError on the first non-xml file.
        print('File %s is not xml format. Not processed' % (filename))
        continue

    original_filename = indir + "/" + filename
    if datacentre_format != "DIF":
        sys.exit("Doesn't handle anything else but DIF here.")

    # Project modules, imported only once a DIF record actually needs them.
    from DIF import DIF
    from ETxmlView import loadET

    record_file = open(original_filename)
    try:
        xml = record_file.read()
    finally:
        record_file.close()

    # Result unused; presumably kept so that unparseable XML fails here - TODO confirm.
    loadET(xml)
    d = DIF(xml)

    if NDG_dataProvider:
        # The entry ID is used as the file name with ':' replaced by '__'.
        new_filename = outdir + "/" + d.entryID.replace(":", "__") + ".xml"
    else:
        new_filename = outdir + "/" + datacentre_namespace + "__" + datacentre_format + "__" + d.entryID + ".xml"

    commandline = "cp " + original_filename + " " + new_filename
    status = os.system(commandline)
    if status != 0:
        sys.exit("Failed at re-naming file stage")
    numfilesproc += 1
180
# Ingest the datacentre's renamed records into the eXist db (backups of eXist happen nightly).
commandline = "$EXIST_HOME/bin/client.sh -c /db/discovery/original/"+datacentre_format+ "/" +datacentre_namespace+ " -u admin -P xxxxxx -p "+outdir
# Only a generic message is printed, presumably because the command line carries the password.
print("Executing : actual command to ingest into exist db")
status = os.system(commandline)
if status != 0:
    sys.exit("Failed at ingesting into exist db. Datacentre =  %s. Status = %s" % (datacentre, status))

# Are there any old records hanging around? If so, remove them.
try:
    os.stat("./DIF2MOLES")
except OSError:
    # Only a failed stat means "nothing there"; anything else (e.g. Ctrl-C)
    # must not be swallowed.
    print("No old moles records hanging around")
else:
    commandline = "ls -1 ./DIF2MOLES | xargs -i rm ./DIF2MOLES/{\}"
    print("Executing : " + commandline)
    status = os.system(commandline)
    if status != 0:
        sys.exit("Failed at clearing out DIF2MOLES area.")
199
# Then run the minimum moles creator for each discovery record.
# Put records in ./DIF2MOLES with original filename.
filenames = os.listdir(outdir)
for filename in filenames:
    if filename.find('.xml') == -1:
        continue
    if datacentre_format != "DIF":
        continue

    original_filename = outdir + "/" + filename
    from DIF import DIF
    from ETxmlView import loadET

    record_file = open(original_filename)
    try:
        xml = record_file.read()
    finally:
        record_file.close()

    # Result unused; presumably kept so that unparseable XML fails here - TODO confirm.
    loadET(xml)
    d = DIF(xml)
    print("ID extracted from the DIF = %s" % d.entryID)

    if NDG_dataProvider:
        # Keep everything after the second ':' of the entry ID.
        molesLocalID = d.entryID.split(":", 2)[2]
    else:
        molesLocalID = d.entryID
    print("molesLocalID is %s" % molesLocalID)

    commandline = ("java -jar D2B/d2boneoff.jar repositoryID " + datacentre_namespace +
                   " repositoryLocalID " + datacentre + " format " + datacentre_format +
                   " repository xmldb:exist://glue.badc.rl.ac.uk:8080/exist/xmlrpc userpw xxxxxx targetCollection /db/discovery/original/" +
                   datacentre_format + "/" + datacentre_namespace +
                   " inputRecordID " + d.entryID + " outputLocalID " + molesLocalID +
                   " > ./DIF2MOLES/" + filename)
    print("Executing command to run d2boneoff.jar")
    status = os.system(commandline)

    # On Unix os.system returns a wait()-style status, not the exit code: an
    # exit code of 10 arrives as 2560, so the code must be decoded before it
    # is compared.
    if os.WIFEXITED(status):
        exit_code = os.WEXITSTATUS(status)
    else:
        exit_code = -1  # terminated by a signal

    if exit_code == 10:
        print("WARNING: couldn't find the record")
    elif status != 0:
        print("ERROR: couldn't create the minimum moles records")
        # Actually stop (the bare name 'sys.exit' did nothing) and report failure.
        sys.exit(1)
# There should be some records now: the DIF2MOLES directory must at least exist.
try:
    os.stat("./DIF2MOLES")
except OSError:
    print("ERROR: couldn't create any minimum moles records for %s" % datacentre)
    # Non-zero so the caller can tell this run failed.
    sys.exit(1)

# Add keywords if necessary
if datacentre_groups == "":
    # No groups configured: the records go to FINALMOLES unchanged.
    commandline = "ls -1 ./DIF2MOLES/ | xargs -i mv ./DIF2MOLES/{\} ./FINALMOLES/"
    print("Executing : " + commandline)
    status = os.system(commandline)
    if status != 0:
        sys.exit("Failed at moving MOLES to FINAL directory")
else:
    # NOTE(review): a hard-coded keyword list is passed here rather than the
    # datacentre_groups read from the config file - confirm this is intended.
    keywordAdder.main('./DIF2MOLES', './FINALMOLES', ['MDIP', 'http://vocab.ndg.nerc.ac.uk/term/N010/0', 'NDGO0001'])
244
# Load the finished minimum MOLES records from ./FINALMOLES into the eXist db.
moles_collection = "/db/discovery/moles"
finalmoles_dir = "./FINALMOLES"
commandline = "$EXIST_HOME/bin/client.sh -c " + moles_collection + " -u admin -P xxxxxx -p " + finalmoles_dir
print("Executing : actual command to ingest into exist db")
status = os.system(commandline)
if status != 0:
    sys.exit("Failed at ingesting into exist db. Datacentre =  %s. Status = %s" % (datacentre, status))

# Pull the spatiotemporal info out of the created MOLES and store it in the Postgres db.
SpaceTimeIngestFromMOLES.main(finalmoles_dir)
254
# Copy the oai/originals, discovery and FINALMOLES areas into per-run directories
# under the backup area, ready for tape backup.
datacentre_data = "/usr/local/WSClients/OAIBatch/data/" + datacentre
# (backup directory suffix, path given to ls, directory the files are copied from)
backup_jobs = (
    ("_originals", datacentre_data + "/oai/originals/", datacentre_data + "/oai/originals"),
    ("_discovery", datacentre_data + "/discovery/", datacentre_data + "/discovery"),
    ("_FINALMOLES", "./FINALMOLES", "./FINALMOLES"),
)
for backup_suffix, listed_path, copy_from in backup_jobs:
    this_backupdir = backupdir + datacentre + "_" + date_string + backup_suffix

    commandline = "mkdir " + this_backupdir
    print("Executing : " + commandline)
    status = os.system(commandline)
    if status != 0:
        sys.exit("Failed at creating backup directory %s" % this_backupdir)

    # One cp per file, so a large directory does not overflow the argument list.
    commandline = "ls -1 " + listed_path + " | xargs -i cp " + copy_from + "/{\} " + this_backupdir
    print("Executing : " + commandline)
    status = os.system(commandline)
    if status != 0:
        sys.exit("Failed at copying to backup directory %s" % this_backupdir)
294
# Clear out the original harvest records area and the FINALMOLES area now that
# their contents have been ingested and backed up.
commandline = "ls -1 " + harvest_home + " | xargs -i rm " + harvest_home + "/{\}"
print("Executing : " + commandline)
status = os.system(commandline)
if status != 0:
    sys.exit("Failed at clearing out original harvest records area %s" % harvest_home)

finalmoles_area = "./FINALMOLES"
commandline = "ls -1 " + finalmoles_area + " | xargs -i rm " + finalmoles_area + "/{\}"
print("Executing : " + commandline)
status = os.system(commandline)
if status != 0:
    # Name the directory that actually failed (the message used to show harvest_home).
    sys.exit("Failed at clearing out FINALMOLES area %s" % finalmoles_area)
307
308#remove the DIF2MOLES directory
309##commandline = "rmdir ./DIF2MOLES"
310#print "Executing : " + commandline
311#status = os.system(commandline)
312#if status !=0:
313#    sys.exit("Failed at removing DIF2MOLES directory %s" %harvest_home)
314
# Closing summary: how many files were pre-processed and the status of the last command run.
banner = "======================================================"
print(banner)
print("No. of files pre-processed = %s" % numfilesproc)
if status != 0:
    print("Procedure oai_ingest.py FAILED with status %s" % status)
else:
    print(" Procedure oai_ingest.py ran to end")

print(banner)
Note: See TracBrowser for help on using the repository browser.