source: TI01-discovery/trunk/ingestAutomation/OAIBatch/oai_ingest.py @ 2306

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/TI01-discovery/trunk/ingestAutomation/OAIBatch/oai_ingest.py@2306
Revision 2306, 14.2 KB checked in by selatham, 13 years ago (diff)

colon to underscores translation.

  • Property svn:executable set to *
Line 
1#!/usr/bin/env python
2""" Script oai_ingest.py takes parameter <datacentre>.
3The /usr/local/WSClients/OAIBatch directory contains:-
4 - this python script,
5 - a DataProvider specific config file,
6 - the d2b.jar moles creator class which creates moles discovery records,
7 - the python module for extracting spatiotemporal information and adding to postgres db.
8Under this directory the following structure should be maintained:
9 ./data
10 - /DATACENTRE/
11                - discovery/:         Re-named documents ready to ingest in the discovery service.
12                - oai/difYYYYMMDD/    Documents as harvested from OAI
13 Where  /DATACENTRE  varies for the different data providers
14"""
15#History:
16# 12/05/06 SEL spelling correction
17# 30/05/06 SEL cope with many files for processing."Argument list too long" problem.
18# 31/05/06 SEL need to use passwords now. (replace xxxxxx in real version).
19# 16/10/06 SEL Changed to using python oaiClean.py module instead of java code.
20# 16/10/06 SEL exist db upgraded and deployed at different location, java upgrade.
21# 17/10/06 SEL cope with different discovery formats - not just DIF.
22# 23/10/06 SEL keywords not mandatory in config file.
23# 24/10/06 SEL fix bug where 'in' directory not being cleared initially. display more information when running.
24
25import os
26import sys
27import commands
28import string
29import SpaceTimeIngestFromMOLES
30import keywordAdder
31from DIF import DIF
32from MDIP import MDIP
33
def getID(filename, metadata_format=None):
    '''Return the record identifier held in a harvested metadata XML file.

    filename        -- path of the XML record to read.
    metadata_format -- "DIF" or "MDIP". When omitted, the module-level
                       datacentre_format (read from the datacentre config
                       file) is used, exactly as before.

    The identifier is DIF(xml).entryID for DIF records and MDIP(xml).id for
    MDIP records. Any other format ends the script through sys.exit().
    '''
    if metadata_format is None:
        metadata_format = datacentre_format

    # Close the handle explicitly: this is called once per harvested record
    # and the previous file(filename).read() never closed what it opened.
    record = open(filename)
    try:
        xml = record.read()
    finally:
        record.close()

    if metadata_format == "DIF":
        return DIF(xml).entryID
    if metadata_format == "MDIP":
        return MDIP(xml).id
    sys.exit("Only handles DIF or MDIP here.")
46
47status = 0
48numfilesproc = 0
49harvest_home = ""
50datacentre_groups = ""
51datacentre_format = ""
52datacentre_namespace = ""
53NDG_dataProvider = False
54
55if (len(sys.argv) < 2):
56    print "<datacentre>  parameter not supplied."
57    sys.exit()
58else:
59    datacentre = sys.argv[1]
60
# Other settings and constants
# Timestamp (yymmdd_HHMM) used later to name this run's backup directories.
date_string = commands.getoutput ("date +'%y%m%d_%H%M'")

# Environment for the commands launched later through os.system
# (the eXist client is started as $EXIST_HOME/bin/client.sh).
for _env_name, _env_value in (
    ('EXIST_HOME', '/usr/local/exist-client'),
    ('JAVA_HOME', '/usr/java/jdk1.5.0_03'),
    ('PATH', ':/usr/java/jdk1.5.0_03/bin:/usr/java/jdk1.5.0_03:/usr/java/jdk1.5.0_03/lib/tools.jar:/usr/local/WSClients/OAIBatch:/usr/local/exist-client/bin:/bin:/usr/bin:.'),
    ('CLASSPATH', '.:/usr/java/jdk1.5.0_03/lib/tools.jar'),
):
    os.putenv (_env_name, _env_value)
67
68# Get the harvested records directory and groups for this datacentre from the datacentre specific config file
69# The harvested records directory depends on the datacentres OAI base url, the set and  format. These have to be know up-front.
70# The groups denote which 'portal groups' they belong to - for limiting searches to say NERC-only datacentres records.
71# Groups are added to the intermediate MOLES when it is created.
72datacentre_config_filename = "/usr/local/WSClients/OAIBatch/" + datacentre + "_config.properties"
73print "Datacentre config file = %s" %datacentre_config_filename
74datacentre_config_file = open(datacentre_config_filename, "r")
75
76for line in datacentre_config_file.readlines():
77    words  = string.split(line)
78    if len(words) == 0:
79        continue
80    if words[0] == 'host_path':
81        harvest_home = string.rstrip(words[1])
82    if words[0] == 'groups':
83        datacentre_groups = words[1:]
84    if words[0] == 'format':
85        datacentre_format = words[1]
86    if words[0] == 'namespace':
87        datacentre_namespace = words[1]
88    if words[0] == 'NDG_dataProvider':
89        NDG_dataProvider = True
90
91datacentre_config_file.close()
92
93if harvest_home == "":
94    sys.exit("Failed at getting harvested records directory stage. datacentre config file tried = %s" %datacentre_config_filename)
95else:
96    print "INFO: harvested records are in %s" %harvest_home
97
98if datacentre_groups == "":
99    print "INFO: No groups/keywords set for datacentre %s" %datacentre
100else:
101    print "INFO: datacentre groups/keywords = %s" %datacentre_groups
102
103if datacentre_format == "":
104    sys.exit("Failed at stage: getting datacentre format. datacentre config file tried = %s" %datacentre_config_filename)
105else:
106    print "INFO: format being harvested = %s" %datacentre_format
107
108if datacentre_namespace == "":
109    sys.exit("Failed at stage: getting datacentre namespace. datacentre config file tried = %s" %datacentre_config_filename)
110else:
111    print "INFO: datacentre namespace = %s" %datacentre_namespace
112
113#any records to harvest?
114if len( os.listdir(harvest_home)) == 0:
115    print "Nothing to harvest this time from %s" %datacentre
116    sys.exit()
117
118# The directory to put things for a tape backup (should already exist)
119backupdir = '/disks/glue1/oaiBackup/'
120
121# Create/clear the 'in' directory pristine copy of the discovery records
122if os.path.isdir("/usr/local/WSClients/OAIBatch/data/" + datacentre +"/oai/originals"):
123    commandline = "ls -1 /usr/local/WSClients/OAIBatch/data/" + datacentre +"/oai/originals/ | xargs -i rm /usr/local/WSClients/OAIBatch/data/" + datacentre +"/oai/originals/{\}"
124    print "Executing : " + commandline
125    status = os.system(commandline)
126else:
127    commandline = "mkdir /usr/local/WSClients/OAIBatch/data/" + datacentre +"/oai/originals"
128    print "Executing : " + commandline
129    status= os.system(commandline)
130
131if status != 0:
132    sys.exit("Failed at creating copy dir stage")
133
134# make the 'in' pristine copy. Cope with there being lots of files in the directory.
135
136commandline = "ls -1 " + harvest_home + "/ | xargs -i cp " + harvest_home + "/{\} /usr/local/WSClients/OAIBatch/data/" + datacentre + "/oai/originals"
137print "Executing : " + commandline
138status = os.system(commandline)
139if status !=0:
140    sys.exit("Failed at making pristine copy stage")
141
142# Create/clear the directory for the 'out' processed copy of the discovery records.
143if os.path.isdir("/usr/local/WSClients/OAIBatch/data/" + datacentre +"/discovery"):
144    commandline = "ls -1 /usr/local/WSClients/OAIBatch/data/" + datacentre +"/discovery/ | xargs -i rm /usr/local/WSClients/OAIBatch/data/" + datacentre +"/discovery/{\}"
145    print "Executing : " + commandline
146    status = os.system(commandline)
147else:
148    commandline = "mkdir /usr/local/WSClients/OAIBatch/data/" + datacentre +"/discovery"
149    print "Executing : " + commandline
150    status= os.system(commandline)
151
152# The file config.properties contains the location of the particular datacentres harvested records.
153# Copy the datacentre specific version of config to config.properties file.
154commandline = "cp /usr/local/WSClients/OAIBatch/" + datacentre +"_config.properties /usr/local/WSClients/OAIBatch/config.properties"
155print "Executing : " + commandline
156status = os.system(commandline)
157if status !=0:
158    sys.exit("Failed at copying config file stage")
159
160#Change os directory to that with the code in it.
161os.chdir('/usr/local/WSClients/OAIBatch')
162
163
164#Execute the script which processes/renames the files (changed 08/01/07 to get id from inside file)
165indir="/usr/local/WSClients/OAIBatch/data/" + datacentre +"/oai/originals"
166outdir="/usr/local/WSClients/OAIBatch/data/" + datacentre +"/discovery"
167#wrapFlag=False
168filenames = os.listdir(indir)
169for filename in filenames:
170        if filename.find('.xml') != -1:
171                original_filename = indir + "/" + filename
172                ident=getID(original_filename)
173                print "ID extracted from the DIF = %s" %ident
174                if NDG_dataProvider:
175                    new_filename = outdir + "/"+ ident.replace(":","__")+".xml"
176                else:
177                    new_filename = outdir + "/" +datacentre_namespace+ "__"+datacentre_format+ "__"+ ident +".xml"
178                print "original file = %s, newfile = %s" %(original_filename, new_filename)
179                commandline = "cp "+original_filename+ " " +new_filename
180                #print "Executing : " + commandline
181                status = os.system(commandline)
182                if status !=0:
183                    sys.exit("Failed at re-naming file stage")
184                numfilesproc += 1
185        else:
186                print 'File %s is not xml format. Not processed'  %(full_filename)
187
188# ingest the datacentres records into eXist db (backups of exist happen nightly).
189commandline = "$EXIST_HOME/bin/client.sh -c /db/discovery/original/"+datacentre_format+ "/" +datacentre_namespace+ " -u admin -P xxxxxx -p "+outdir
190print "Executing : actual command to ingest into exist db"
191status = os.system(commandline)
192if status !=0:
193    sys.exit("Failed at ingesting into exist db. Datacentre =  %s. Status = %s" %(datacentre,status))
194
195#are there any old records hanging around.If so, remove.
196try:
197    os.stat("./DIF2MOLES")
198except:
199    print "No old moles records hanging around"
200else:
201    commandline = "ls -1 ./DIF2MOLES | xargs -i rm ./DIF2MOLES/{\}"
202    print "Executing : " + commandline
203    status = os.system(commandline)
204    if status !=0:
205        sys.exit("Failed at clearing out DIF2MOLES area.")
206
207# Then run the minimum moles creator for each discovery record
208# Put records in ./DIF2MOLES with original filename
209filenames = os.listdir(outdir)
210for filename in filenames:
211        if filename.find('.xml') != -1:
212                    original_filename = outdir + "/" + filename
213                    ident=getID(original_filename)
214                    if NDG_dataProvider:
215                        newident=ident.replace(":","__")
216                        print "identifier is %s" %newident
217                        molesLocalID = newident.split("__",2)[2]
218                    else:
219                        molesLocalID = ident
220                    print "molesLocalID is %s" %molesLocalID
221                    commandline = "java -jar D2B/d2boneoff.jar repositoryID " +datacentre_namespace+" repositoryLocalID "+datacentre+" format "+ \
222                    datacentre_format+" repository xmldb:exist://glue.badc.rl.ac.uk:8080/exist/xmlrpc userpw xxxxxx targetCollection /db/discovery/original/"+ \
223                    datacentre_format+"/"+datacentre_namespace +" inputRecordID "+ ident+ " outputLocalID "+molesLocalID+ " > ./DIF2MOLES/"+filename
224                    print "Executing command to run d2boneoff.jar"
225                    status= os.system(commandline)
226                    if status==10:
227                        print "WARNING: couldn't find the record"
228                    elif status!=0:
229                        print "ERROR: couldn't create the minimum moles records"
230                        sys.exit
231#There should be some records now
232try:
233    os.stat("./DIF2MOLES")
234except:
235    print "ERROR: couldn't create any minimum moles records for %s" %datacentre
236    sys.exit()
237
238#Add keywords if necessary
239if datacentre_groups == "":
240    commandline = "ls -1 ./DIF2MOLES/ | xargs -i mv ./DIF2MOLES/{\} ./FINALMOLES/"
241    print "Executing : " + commandline
242    status = os.system(commandline)
243    if status !=0:
244        sys.exit("Failed at moving MOLES to FINAL directory")
245else:
246    keywordAdder.main('./DIF2MOLES', './FINALMOLES', ['MDIP', 'http://vocab.ndg.nerc.ac.uk/term/N010/0', 'NDGO0001'])
247
248# ingest the created discovery minimum molesrecords into eXist db.
249commandline = "$EXIST_HOME/bin/client.sh -c /db/discovery/moles -u admin -P xxxxxx -p ./FINALMOLES"
250print "Executing : actual command to ingest into exist db"
251status = os.system(commandline)
252if status !=0:
253    sys.exit("Failed at ingesting into exist db. Datacentre =  %s. Status = %s" %(datacentre,status))
254
255#Extract the spatiotemporal info from created moles and put in Postgres db
256SpaceTimeIngestFromMOLES.main("./FINALMOLES")
257
258#Make copies of discovery and oai/originals and DIF2MOLES areas to backup area for tape backups
259this_backupdir = backupdir + datacentre + "_" + date_string + "_originals"
260commandline = "mkdir " + this_backupdir
261print "Executing : " + commandline
262status = os.system(commandline)
263if status !=0:
264    sys.exit("Failed at creating backup directory %s" %this_backupdir)
265
266commandline = "ls -1 /usr/local/WSClients/OAIBatch/data/" + datacentre +"/oai/originals/ | xargs -i cp /usr/local/WSClients/OAIBatch/data/" + datacentre +"/oai/originals/{\} " + this_backupdir
267print "Executing : " + commandline
268status = os.system(commandline)
269if status !=0:
270    sys.exit("Failed at copying to backup directory %s" %this_backupdir)
271
272this_backupdir = backupdir + datacentre + "_" + date_string + "_discovery"
273commandline = "mkdir " + this_backupdir
274print "Executing : " + commandline
275status = os.system(commandline)
276if status !=0:
277    sys.exit("Failed at creating backup directory %s" %this_backupdir)
278
279commandline = "ls -1 /usr/local/WSClients/OAIBatch/data/" + datacentre +"/discovery/ | xargs -i cp /usr/local/WSClients/OAIBatch/data/" + datacentre +"/discovery/{\} " + this_backupdir
280print "Executing : " + commandline
281status = os.system(commandline)
282if status !=0:
283    sys.exit("Failed at copying to backup directory %s" %this_backupdir)
284
285this_backupdir = backupdir + datacentre + "_" + date_string + "_FINALMOLES"
286commandline = "mkdir " + this_backupdir
287print "Executing : " + commandline
288status = os.system(commandline)
289if status !=0:
290    sys.exit("Failed at creating backup directory %s" %this_backupdir)
291
292commandline = "ls -1 ./FINALMOLES | xargs -i cp ./FINALMOLES/{\} " + this_backupdir
293print "Executing : " + commandline
294status = os.system(commandline)
295if status !=0:
296    sys.exit("Failed at copying to backup directory %s" %this_backupdir)
297
298#Clear out the original harvest records area and DIF2MOLES
299commandline = "ls -1 " + harvest_home + " | xargs -i rm " + harvest_home + "/{\}"
300print "Executing : " + commandline
301status = os.system(commandline)
302if status !=0:
303    sys.exit("Failed at clearing out original harvest records area %s" %harvest_home)
304
305commandline = "ls -1 ./FINALMOLES | xargs -i rm ./FINALMOLES/{\}"
306print "Executing : " + commandline
307status = os.system(commandline)
308if status !=0:
309    sys.exit("Failed at clearing out FINALMOLES area %s" %harvest_home)
310
311#remove the DIF2MOLES directory
312##commandline = "rmdir ./DIF2MOLES"
313#print "Executing : " + commandline
314#status = os.system(commandline)
315#if status !=0:
316#    sys.exit("Failed at removing DIF2MOLES directory %s" %harvest_home)
317
318print "======================================================"
319print "No. of files pre-processed = %s" %numfilesproc
320if status == 0:
321    print " Procedure oai_ingest.py ran to end"
322else:
323    print "Procedure oai_ingest.py FAILED with status %s" %status
324
325print "======================================================"
Note: See TracBrowser for help on using the repository browser.