source: TI01-discovery/trunk/ingestAutomation/OAIBatch/oai_ingest.py @ 3803

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/TI01-discovery/trunk/ingestAutomation/OAIBatch/oai_ingest.py
Revision 3803, 17.6 KB checked in by selatham, 12 years ago (diff)

Use the eXist client that comes in the .war rather than the old-hat one in exist-client.

  • Property svn:executable set to *
Line 
1#!/usr/bin/env python
2""" Script oai_ingest.py takes parameters <datacentre> <dbinfofile> <existhost> <backupdir> <javahome>.
3NOTE:might have to change the location of tomcat if it's not in the usual place (/usr/local/tomcat).
4
5The /usr/local/WSClients/OAIBatch directory contains:-
6 - this python script, plus some other modules eg ndgUtils for parts of the process.
7 - a DataProvider specific config file,
8 - the python module for extracting spatiotemporal information and adding to postgres db.
9Under this directory the following structure should be maintained:
10 ./data
11 - /DATACENTRE/
12        - discovery/:          Re-named documents.
13        - discovery_corrected  Documents with schema namespaces corrected, ready to ingest in the discovery service.
14        - oai/difYYYYMMDD/    Documents as harvested from OAI
15 Where  /DATACENTRE  varies for the different data providers
16"""
17#History:
18# 12/05/06 SEL spelling correction
19# 30/05/06 SEL cope with many files for processing."Argument list too long" problem.
20# 31/05/06 SEL need to use passwords now. (replace xxxxxx in real version).
21# 16/10/06 SEL Changed to using python oaiClean.py module instead of java code.
22# 16/10/06 SEL exist db upgraded and deployed at different location, java upgrade.
23# 17/10/06 SEL cope with different discovery formats - not just DIF.
24# 23/10/06 SEL keywords not mandatory in config file.
25# 24/10/06 SEL fix bug where 'in' directory not being cleared initially. display more information when running.
26#  December 2007 SEL rewrite to use Bryans' python XQuery stuff to create mini-moles instead of java.
27#                    Also extracted hard coded pwds into a file.
28# April 2008 SEL use exist interface that comes with the .war rather than the old-hat stuff in exist-client
29
30import os
31import sys
32import commands
33import string
34import SpaceTimeIngestFromMOLES
35import keywordAdder
36from SchemaNameSpace import SchemaNameSpace
37from DIF import DIF
38from MDIP import MDIP
39import ndgUtils
40from ndgUtils.ndgXqueries import ndgXqueries
41from ndgUtils.eXistInterface import ndg_eXist
42from ndgUtils.ndgObject import ndgObject
43import ConfigParser
44from xmlrpclib import Fault
45from ndgUtils.elementtree import ElementTree as ET
46from ndgUtils.ndgDirectory import ndgDirectory
47
def getID(filename):
    """Return the identifier held in a discovery metadata XML record.

    The record format is taken from the module-level ``datacentre_format``
    setting; only "DIF" and "MDIP" are handled.

    filename -- path of the XML record to read.

    Returns the ``entryID`` of the parsed DIF record, or the ``id`` of the
    parsed MDIP record.  For any other format the script is terminated
    through sys.exit with an explanatory message.
    """
    # Close the file explicitly instead of leaving it to garbage collection.
    xml_file = open(filename)
    try:
        xml = xml_file.read()
    finally:
        xml_file.close()

    if datacentre_format == "DIF":
        return DIF(xml).entryID
    if datacentre_format == "MDIP":
        return MDIP(xml).id
    sys.exit("Only handles DIF or MDIP here.")
60
# Run state, plus defaults for the settings that are read further down from
# the datacentre specific config file.
status = 0
numfilesproc = 0
harvest_home = ""
datacentre_groups = ""
datacentre_format = ""
datacentre_namespace = ""
NDG_dataProvider = False

# All five positional parameters are mandatory.
if len(sys.argv) < 6:
    # Passing the message to sys.exit gives the caller a non-zero exit status;
    # a bare sys.exit() would report success for what is an error.
    sys.exit("ERROR: <datacentre> or <db info file> or <existhost> or <backupdir> or <java_home> parameter not supplied.")

datacentre = sys.argv[1]
dbinfoname = sys.argv[2]
existhost = sys.argv[3]
# The directory to put things for a tape backup (should already exist),
# e.g. '/disks/glue1/oaiBackup/'
backupdir = sys.argv[4]
# NOTE: used later as the command that launches the eXist client jar, so this
# is the java executable to run.
java_home = sys.argv[5]
80
import time

# Change directory to the one holding the code: later steps use relative
# paths such as ./DIF2MOLES and ./FINALMOLES.
os.chdir('/usr/local/WSClients/OAIBatch')

# Timestamp (local time, yymmdd_HHMM) used to name this run's backup
# directories.  Formatted in-process rather than by spawning `date`.
date_string = time.strftime('%y%m%d_%H%M')

# XQuery library and the interface to the eXist database on <existhost>
xq = ndgXqueries()
xmldb = ndg_eXist(db=existhost)
94
# Get the harvested records directory and groups for this datacentre from the
# datacentre specific config file.
# The harvested records directory depends on the datacentre's OAI base url, the
# set and the format. These have to be known up-front.
# The groups denote which 'portal groups' the datacentre belongs to - for
# limiting searches to, say, NERC-only datacentre records.
# Groups are added to the intermediate MOLES when it is created.
datacentre_config_filename = "/usr/local/WSClients/OAIBatch/" + datacentre + "_config.properties"
print("INFO: Datacentre config file = %s" % datacentre_config_filename)
datacentre_config_file = open(datacentre_config_filename, "r")
try:
    for line in datacentre_config_file:
        # Each line is "<key> <value> [<value> ...]", whitespace separated.
        words = line.split()
        if not words:
            continue
        key = words[0]
        if key == 'NDG_dataProvider':
            # The presence of the key is enough; any value is ignored.
            NDG_dataProvider = True
        elif key == 'groups':
            datacentre_groups = words[1:]
        elif len(words) < 2:
            # A key without a value: keep the default so that the checks on
            # the mandatory settings report it, instead of an IndexError here.
            continue
        elif key == 'host_path':
            harvest_home = words[1]
        elif key == 'format':
            datacentre_format = words[1]
        elif key == 'namespace':
            datacentre_namespace = words[1]
finally:
    # Close the file even if reading it fails.
    datacentre_config_file.close()
119
# Report the settings that were read; a missing mandatory one ends the run.
if not harvest_home:
    sys.exit("ERROR: Failed at getting harvested records directory stage. datacentre config file tried = %s" % datacentre_config_filename)
print("INFO: harvested records are in %s" % harvest_home)

# Groups/keywords are optional.
if datacentre_groups == "":
    print("INFO: No groups/keywords set for datacentre %s" % datacentre)
else:
    print("INFO: datacentre groups/keywords = %s" % datacentre_groups)

if not datacentre_format:
    sys.exit("ERROR: Failed at stage: getting datacentre format. datacentre config file tried = %s" % datacentre_config_filename)
print("INFO: format being harvested = %s" % datacentre_format)

if not datacentre_namespace:
    sys.exit("ERROR: Failed at stage: getting datacentre namespace. datacentre config file tried = %s" % datacentre_config_filename)
print("INFO: datacentre namespace = %s" % datacentre_namespace)

# Any records to harvest? An empty harvest directory is a normal, successful end.
if not os.listdir(harvest_home):
    print("INFO: Nothing to harvest this time from %s" % datacentre)
    sys.exit()
144
# Get the eXist db access info.
# Each usable line of the db info file holds three whitespace separated words;
# the first two form the lookup key (looked up below as (<existhost>, 'admin'))
# and the third is the value that is later passed to the eXist client as the
# admin password.
dbaccess = {}
dbinfo_file = open(dbinfoname, "r")
try:
    for line in dbinfo_file:
        words = line.split()
        # All three words are needed; skip blank or incomplete lines rather
        # than failing with an IndexError on words[2].
        if len(words) < 3:
            continue
        dbaccess[(words[0], words[1])] = [words[2]]
finally:
    dbinfo_file.close()

try:
    db_admin = dbaccess[(existhost, 'admin')][0]
except KeyError:
    sys.exit("ERROR: No 'admin' entry for eXist host %s in db info file %s" % (existhost, dbinfoname))
158
# Create/clear the 'in' directory which holds the pristine copy of the
# discovery records.
originals_dir = "/usr/local/WSClients/OAIBatch/data/" + datacentre + "/oai/originals"
if not os.path.isdir(originals_dir):
    commandline = "mkdir " + originals_dir
else:
    # Files are removed one at a time through xargs so that a very large
    # directory does not hit the "Argument list too long" problem.
    # "{\\}" is the three characters {\} : the shell drops the backslash and
    # xargs sees its {} placeholder.
    commandline = "ls -1 %s/ | xargs -i rm %s/{\\}" % (originals_dir, originals_dir)
print("INFO: Executing : " + commandline)
status = os.system(commandline)

if status != 0:
    sys.exit("ERROR: Failed at creating copy dir stage")

# Make the 'in' pristine copy. Cope with there being lots of files in the
# directory (again one file per cp invocation via xargs).
commandline = "ls -1 %s/ | xargs -i cp %s/{\\} %s" % (harvest_home, harvest_home, originals_dir)
print("INFO: Executing : " + commandline)
status = os.system(commandline)
if status != 0:
    sys.exit("ERROR: Failed at making pristine copy stage")
179
# Create/clear the 'out' directories:
#   discovery            - processed (re-named) copy of the discovery records
#   discovery_corrected  - namespace corrected copy of the discovery records
for out_subdir in ("discovery", "discovery_corrected"):
    out_path = "/usr/local/WSClients/OAIBatch/data/" + datacentre + "/" + out_subdir
    if os.path.isdir(out_path):
        # Remove files one at a time via xargs to cope with very many files.
        # "{\\}" is the three characters {\} : the shell drops the backslash
        # and xargs sees its {} placeholder.
        commandline = "ls -1 " + out_path + "/ | xargs -i rm " + out_path + "/{\\}"
    else:
        commandline = "mkdir " + out_path
    print("INFO: Executing : " + commandline)
    status = os.system(commandline)
    # Stop here on failure, as is done for the originals directory, rather
    # than carrying on with a missing or stale output directory.
    if status != 0:
        sys.exit("ERROR: Failed at creating/clearing directory %s" % out_path)
199
# The file config.properties contains the location of the particular
# datacentre's harvested records, so the datacentre specific version is copied
# over config.properties.
config_source = "/usr/local/WSClients/OAIBatch/" + datacentre + "_config.properties"
commandline = "cp %s /usr/local/WSClients/OAIBatch/config.properties" % config_source
print("INFO: Executing : " + commandline)
status = os.system(commandline)
if status != 0:
    sys.exit("ERROR: Failed at copying config file stage")
207
import shutil

# Process/rename the files (changed 08/01/07 to get the id from inside the
# file): every harvested record is copied into the 'discovery' directory under
# a name derived from the identifier found in the record itself.
indir = "/usr/local/WSClients/OAIBatch/data/" + datacentre + "/oai/originals"
outdir = "/usr/local/WSClients/OAIBatch/data/" + datacentre + "/discovery"
filenames = os.listdir(indir)
for filename in filenames:
    if '.xml' not in filename:
        print('WARNING: File %s is not xml format. Not processed' % (filename))
        continue

    original_filename = indir + "/" + filename
    ident = getID(original_filename)
    print("INFO: ID extracted from the discovery record = %s" % ident)
    if NDG_dataProvider:
        new_filename = outdir + "/" + ident.replace(":", "__") + ".xml"
    else:
        # ':' and '/' in the identifier are replaced by '-' in the file name.
        ident = ident.replace(":", "-").replace("/", "-")
        new_filename = outdir + "/" + datacentre_namespace + "__" + datacentre_format + "__" + ident + ".xml"
    print("INFO: original file = %s, newfile = %s" % (original_filename, new_filename))

    # The identifier comes from the harvested record, so it must not be pasted
    # into a shell command line: copy the file from Python instead.
    try:
        shutil.copy(original_filename, new_filename)
    except (IOError, OSError, shutil.Error):
        sys.exit("ERROR: Failed at re-naming file stage")
    numfilesproc += 1
233
# Replace any namespace declarations with a standard one which we know works
# in NDG. Reads from 'discovery' and writes to 'discovery_corrected'.
indir = "/usr/local/WSClients/OAIBatch/data/" + datacentre + "/discovery"
outdir = "/usr/local/WSClients/OAIBatch/data/" + datacentre + "/discovery_corrected"
filenames = os.listdir(indir)
for filename in filenames:
    if '.xml' not in filename:
        continue
    in_filename = indir + "/" + filename
    corrected_filename = outdir + "/" + filename
    try:
        SchemaNameSpace(in_filename, corrected_filename, datacentre_format)
    except Exception:
        # Best effort: one bad record must not stop the run, but report why it
        # failed. (A bare except would also swallow Ctrl-C and sys.exit.)
        print("ERROR: SchemaNameSpace failed on file %s (%s)" % (in_filename, sys.exc_info()[1]))
246
# Ingest the datacentre's records into the eXist db (backups of eXist happen
# nightly). The records are loaded from the directory currently held in
# 'outdir' into the collection /db/discovery/original/<format>/<namespace>.
commandline = (
    "%s -jar -Dexist.home=/usr/local/tomcat/webapps/exist/WEB-INF"
    " /usr/local/tomcat/webapps/exist/WEB-INF/lib/start.jar client"
    " -c /db/discovery/original/%s/%s"
    " -u admin -P %s -p %s"
    " -ouri=xmldb:exist://%s:8080/exist/xmlrpc"
) % (java_home, datacentre_format, datacentre_namespace, db_admin, outdir, existhost)
# The command itself is not echoed because it contains the admin password.
print("INFO: Executing : actual command to ingest originals into exist db.")
status = os.system(commandline)
if status != 0:
    sys.exit("ERROR: Failed at ingesting into exist db. Datacentre =  %s. Status = %s" % (datacentre, status))
254
# Are there any old moles records hanging around? If so, remove them.
if not os.path.exists("./DIF2MOLES"):
    print("INFO: No old moles records hanging around")
else:
    # "{\\}" is the three characters {\} : the shell drops the backslash and
    # xargs sees its {} placeholder.
    commandline = "ls -1 ./DIF2MOLES | xargs -i rm ./DIF2MOLES/{\\}"
    print("INFO: Executing : " + commandline)
    status = os.system(commandline)
    if status != 0:
        sys.exit("ERROR: Failed at clearing out DIF2MOLES area.")
266
# Then run the minimum moles creator for each discovery record.
# Put the records in ./DIF2MOLES with the same filename.

# First get the list of discovery records from the db collection.
targetCollection = "/db/discovery/original/" + datacentre_format + "/" + datacentre_namespace
if datacentre_format != 'DIF':
    # Only DIF collections can be listed here; stop with a clear message
    # instead of failing later on an undefined directory listing.
    sys.exit('ERROR: mini-moles creation does not handle %s yet! Only DIF is supported.' % datacentre_format)
ndgDir = ndgDirectory(targetCollection, existhost, docType='DIF')

# Create the mini-moles for each discovery record in the collection.
outdir = './DIF2MOLES'
for member in ndgDir.members:
    filename = member['fileName']
    disc_id = member['EntryID']
    print("INFO: internal id = %s" % disc_id)
    print("INFO: discovery filename = %s" % filename)

    # Build the xquery; first sort out the output ID stuff ...
    if NDG_dataProvider:
        discObj = ndgObject(disc_id)
        xquery = xq.actual('dif2moles', targetCollection, discObj.repository, discObj.localID)
    else:
        xquery = xq.actual('dif2moles', targetCollection, datacentre_namespace, disc_id)
    # ... and then the input ID stuff, by filling in the query placeholders.
    xquery = xquery.replace('Input_Entry_ID', disc_id)
    xquery = xquery.replace('repository_localid', datacentre_namespace)

    # Run the query and fetch the first (index 0) result.
    molesid, _summary = xmldb.executeQuery(xquery)
    moles_from_dif = xmldb.retrieve(molesid, 0)

    # Write the xml out. The directory is only created once there is something
    # to put in it, because its existence is what the following step tests.
    if not os.path.isdir(outdir):
        os.mkdir(outdir)
    f = open(outdir + "/" + filename, 'w')
    try:
        f.write(moles_from_dif)
    finally:
        f.close()
305
# There should be some records now: ./DIF2MOLES has to exist at this point.
if not os.path.exists("./DIF2MOLES"):
    # Exit with the message so that the failure gives a non-zero exit status.
    sys.exit("ERROR: couldn't create any minimum moles records for %s" % datacentre)
312
# Add keywords if necessary: with groups configured keywordAdder is given
# ./DIF2MOLES and ./FINALMOLES; otherwise the records are simply moved across.
if datacentre_groups != "":
    keywordAdder.main('./DIF2MOLES', './FINALMOLES', datacentre_groups)
else:
    # "{\\}" is the three characters {\} : the shell drops the backslash and
    # xargs sees its {} placeholder.
    commandline = "ls -1 ./DIF2MOLES/ | xargs -i mv ./DIF2MOLES/{\\} ./FINALMOLES/"
    print("INFO: Executing : " + commandline)
    status = os.system(commandline)
    if status != 0:
        sys.exit("ERROR: Failed at moving MOLES to FINAL directory")
322
# Ingest the created discovery minimum moles records (./FINALMOLES) into the
# /db/discovery/moles collection of the eXist db.
commandline = (
    "%s -jar -Dexist.home=/usr/local/tomcat/webapps/exist/WEB-INF"
    " /usr/local/tomcat/webapps/exist/WEB-INF/lib/start.jar client"
    " -c /db/discovery/moles"
    " -u admin -P %s -p ./FINALMOLES"
    " -ouri=xmldb:exist://%s:8080/exist/xmlrpc"
) % (java_home, db_admin, existhost)
# The command itself is not echoed because it contains the admin password.
print("INFO: Executing : actual command to ingest mini-moles into exist db.")
status = os.system(commandline)
if status != 0:
    sys.exit("ERROR: Failed at ingesting into exist db. Datacentre =  %s. Status = %s" % (datacentre, status))

# Extract the spatiotemporal info from the created moles and put it in the
# Postgres db.
SpaceTimeIngestFromMOLES.main("./FINALMOLES")
333
# Make copies of the oai/originals, discovery and FINALMOLES areas in the
# backup area for tape backups. Each area gets its own directory named
# <datacentre>_<date_string>_<area> under <backupdir>.
backup_sources = (
    ("/usr/local/WSClients/OAIBatch/data/" + datacentre + "/oai/originals", "_originals"),
    ("/usr/local/WSClients/OAIBatch/data/" + datacentre + "/discovery", "_discovery"),
    ("./FINALMOLES", "_FINALMOLES"),
)
for source_dir, suffix in backup_sources:
    # os.path.join works whether or not <backupdir> was given with a trailing
    # slash; plain string concatenation silently needed one.
    this_backupdir = os.path.join(backupdir, datacentre + "_" + date_string + suffix)

    commandline = "mkdir " + this_backupdir
    print("INFO: Executing : " + commandline)
    status = os.system(commandline)
    if status != 0:
        sys.exit("ERROR: Failed at creating backup directory %s" % this_backupdir)

    # Copy one file at a time via xargs to cope with very many files.
    # "{\\}" is the three characters {\} : the shell drops the backslash and
    # xargs sees its {} placeholder.
    commandline = "ls -1 " + source_dir + "/ | xargs -i cp " + source_dir + "/{\\} " + this_backupdir
    print("INFO: Executing : " + commandline)
    status = os.system(commandline)
    if status != 0:
        sys.exit("ERROR: Failed at copying to backup directory %s" % this_backupdir)
373
# Clear out FINALMOLES and the original harvest records area, now that
# everything has been ingested and backed up.
# "{\\}" is the three characters {\} : the shell drops the backslash and xargs
# sees its {} placeholder.
commandline = "ls -1 ./FINALMOLES | xargs -i rm ./FINALMOLES/{\\}"
print("INFO: Executing : " + commandline)
status = os.system(commandline)
if status != 0:
    # Name the area that actually failed (the message used to show the
    # harvest directory here).
    sys.exit("ERROR: Failed at clearing out FINALMOLES area ./FINALMOLES")

commandline = "ls -1 " + harvest_home + " | xargs -i rm " + harvest_home + "/{\\}"
print("INFO: Executing : " + commandline)
status = os.system(commandline)
if status != 0:
    sys.exit("ERROR: Failed at clearing out original harvest records area %s" % harvest_home)
386
# Final summary of the run, framed by a separator line.
separator = "======================================================"
if status == 0:
    outcome = " Procedure oai_ingest.py ran to end"
else:
    outcome = "ERROR: Procedure oai_ingest.py FAILED with status %s" % status

print(separator)
print("No. of files pre-processed = %s" % numfilesproc)
print(outcome)

print(separator)
Note: See TracBrowser for help on using the repository browser.