Changeset 3168 for TI01-discovery
- Timestamp: 19/12/07 18:16:28 (13 years ago)
- Files: 1 edited
Legend:
- Unmodified
- Added
- Removed
TI01-discovery/trunk/ingestAutomation/OAIBatch/oai_ingest.py
r3101 r3168 23 23 # 23/10/06 SEL keywords not mandatory in config file. 24 24 # 24/10/06 SEL fix bug where 'in' directory not being cleared initially. display more information when running. 25 # December 2007 SEL rewrite to use Bryans' python XQuery stuff to create mini-moles instead of java. 26 # Also extracted hard coded pwds into a file. 25 27 26 28 import os … … 33 35 from DIF import DIF 34 36 from MDIP import MDIP 37 import ndgUtils 38 from ndgUtils.ndgXqueries import ndgXqueries 39 from ndgUtils.eXistInterface import ndg_eXist 40 from ndgUtils.ndgObject import ndgObject 41 import ConfigParser 42 from xmlrpclib import Fault 43 from ndgUtils.elementtree import ElementTree as ET 44 from ndgUtils.ndgDirectory import ndgDirectory 35 45 36 46 def getID(filename): … … 55 65 NDG_dataProvider = False 56 66 57 if (len(sys.argv) < 2):58 print " <datacentre>parameter not supplied."67 if (len(sys.argv) < 3): 68 print "ERROR: <datacentre> or <db info file> parameter not supplied." 59 69 sys.exit() 60 70 else: 61 71 datacentre = sys.argv[1] 72 dbinfoname = sys.argv[2] 62 73 63 74 # Other settings and constants 64 75 date_string = commands.getoutput ("date +'%y%m%d_%H%M'") 65 76 os.putenv ('EXIST_HOME', '/usr/local/exist-client') 66 os.putenv ('JAVA_HOME', '/usr/java/jdk1.5.0_03')77 #os.putenv ('JAVA_HOME', '/usr/java/jdk1.5.0_03') 67 78 os.putenv ('PATH', ':/usr/java/jdk1.5.0_03/bin:/usr/java/jdk1.5.0_03:/usr/java/jdk1.5.0_03/lib/tools.jar:/usr/local/WSClients/OAIBatch:/usr/local/exist-client/bin:/bin:/usr/bin:.') 68 os.putenv ('CLASSPATH','.:/usr/java/jdk1.5.0_03/lib/tools.jar') 79 #os.putenv ('CLASSPATH','.:/usr/java/jdk1.5.0_03/lib/tools.jar') 80 81 #Xquery settings 82 #f='glue.config' 83 #c=ConfigParser.ConfigParser() 84 #c.read(f) 85 xq=ndgXqueries() 86 xmldb=ndg_eXist(db='glue.badc.rl.ac.uk') 69 87 70 88 # Get the harvested records directory and groups for this datacentre from the datacentre specific config file … … 73 91 # Groups are added to the intermediate MOLES 
when it is created. 74 92 datacentre_config_filename = "/usr/local/WSClients/OAIBatch/" + datacentre + "_config.properties" 75 print " Datacentre config file = %s" %datacentre_config_filename93 print "INFO: Datacentre config file = %s" %datacentre_config_filename 76 94 datacentre_config_file = open(datacentre_config_filename, "r") 77 95 … … 115 133 #any records to harvest? 116 134 if len( os.listdir(harvest_home)) == 0: 117 print " Nothing to harvest this time from %s" %datacentre135 print "INFO: Nothing to harvest this time from %s" %datacentre 118 136 sys.exit() 137 138 # get the db access info 139 host ='glue.badc.rl.ac.uk' 140 dbaccess={} 141 dbinfo_file=open(dbinfoname,"r") 142 for line in dbinfo_file.readlines(): 143 words = string.split(line) 144 if len(words) < 2: 145 continue 146 dbaccess[(words[0],words[1])] = [words[2]] 147 datacentre_config_file.close() 148 #print dbaccess 149 db_admin = dbaccess[(host,'admin')][0] 150 #print db_admin 119 151 120 152 # The directory to put things for a tape backup (should already exist) … … 124 156 if os.path.isdir("/usr/local/WSClients/OAIBatch/data/" + datacentre +"/oai/originals"): 125 157 commandline = "ls -1 /usr/local/WSClients/OAIBatch/data/" + datacentre +"/oai/originals/ | xargs -i rm /usr/local/WSClients/OAIBatch/data/" + datacentre +"/oai/originals/{\}" 126 print " Executing : " + commandline158 print "INFO: Executing : " + commandline 127 159 status = os.system(commandline) 128 160 else: 129 161 commandline = "mkdir /usr/local/WSClients/OAIBatch/data/" + datacentre +"/oai/originals" 130 print " Executing : " + commandline162 print "INFO: Executing : " + commandline 131 163 status= os.system(commandline) 132 164 … … 137 169 138 170 commandline = "ls -1 " + harvest_home + "/ | xargs -i cp " + harvest_home + "/{\} /usr/local/WSClients/OAIBatch/data/" + datacentre + "/oai/originals" 139 print " Executing : " + commandline171 print "INFO: Executing : " + commandline 140 172 status = os.system(commandline) 141 173 
if status !=0: … … 145 177 if os.path.isdir("/usr/local/WSClients/OAIBatch/data/" + datacentre +"/discovery"): 146 178 commandline = "ls -1 /usr/local/WSClients/OAIBatch/data/" + datacentre +"/discovery/ | xargs -i rm /usr/local/WSClients/OAIBatch/data/" + datacentre +"/discovery/{\}" 147 print " Executing : " + commandline179 print "INFO: Executing : " + commandline 148 180 status = os.system(commandline) 149 181 else: 150 182 commandline = "mkdir /usr/local/WSClients/OAIBatch/data/" + datacentre +"/discovery" 151 print " Executing : " + commandline183 print "INFO: Executing : " + commandline 152 184 status= os.system(commandline) 153 185 … … 155 187 if os.path.isdir("/usr/local/WSClients/OAIBatch/data/" + datacentre +"/discovery_corrected"): 156 188 commandline = "ls -1 /usr/local/WSClients/OAIBatch/data/" + datacentre +"/discovery_corrected/ | xargs -i rm /usr/local/WSClients/OAIBatch/data/" + datacentre +"/discovery_corrected/{\}" 157 print " Executing : " + commandline189 print "INFO: Executing : " + commandline 158 190 status = os.system(commandline) 159 191 else: 160 192 commandline = "mkdir /usr/local/WSClients/OAIBatch/data/" + datacentre +"/discovery_corrected" 161 print " Executing : " + commandline193 print "INFO: Executing : " + commandline 162 194 status= os.system(commandline) 163 195 … … 165 197 # Copy the datacentre specific version of config to config.properties file. 
166 198 commandline = "cp /usr/local/WSClients/OAIBatch/" + datacentre +"_config.properties /usr/local/WSClients/OAIBatch/config.properties" 167 print " Executing : " + commandline199 print "INFO: Executing : " + commandline 168 200 status = os.system(commandline) 169 201 if status !=0: … … 183 215 original_filename = indir + "/" + filename 184 216 ident=getID(original_filename) 185 print "I D extracted from the discovery record = %s" %ident217 print "INFO: ID extracted from the discovery record = %s" %ident 186 218 if NDG_dataProvider: 187 219 new_filename = outdir + "/"+ ident.replace(":","__")+".xml" … … 190 222 ident = ident.replace("/","-") 191 223 new_filename = outdir + "/" +datacentre_namespace+ "__"+datacentre_format+ "__"+ ident +".xml" 192 print " original file = %s, newfile = %s" %(original_filename, new_filename)224 print "INFO: original file = %s, newfile = %s" %(original_filename, new_filename) 193 225 commandline = "cp "+original_filename+ " " +new_filename 194 226 #print "Executing : " + commandline … … 198 230 numfilesproc += 1 199 231 else: 200 print ' File %s is not xml format. Not processed' %(full_filename)232 print 'WARNING: File %s is not xml format. Not processed' %(full_filename) 201 233 202 234 #replace any namespace declarations with a standard one which we know works in NDG … … 214 246 215 247 # ingest the datacentres records into eXist db (backups of exist happen nightly). 
216 commandline = "$EXIST_HOME/bin/client.sh -c /db/discovery/original/"+datacentre_format+ "/" +datacentre_namespace+ " -u admin -P xxxxxx-p "+outdir217 print " Executing : actual command to ingest into exist db"248 commandline = "$EXIST_HOME/bin/client.sh -c /db/discovery/original/"+datacentre_format+ "/" +datacentre_namespace+ " -u admin -P "+db_admin+" -p "+outdir 249 print "INFO: Executing : actual command to ingest into exist db" 218 250 status = os.system(commandline) 219 251 if status !=0: … … 222 254 #are there any old moles records hanging around.If so, remove. 223 255 try: 224 256 os.stat("./DIF2MOLES") 225 257 except: 226 print "No old moles records hanging around"227 else: 228 229 print "Executing : " + commandline230 231 232 258 print "INFO: No old moles records hanging around" 259 else: 260 commandline = "ls -1 ./DIF2MOLES | xargs -i rm ./DIF2MOLES/{\}" 261 print "INFO: Executing : " + commandline 262 status = os.system(commandline) 263 if status !=0: 264 sys.exit("ERROR: Failed at clearing out DIF2MOLES area.") 233 265 234 266 # Then run the minimum moles creator for each discovery record 235 267 # Put records in ./DIF2MOLES with same filename 236 filenames = os.listdir(outdir)237 for filename in filenames:238 if filename.find('.xml') != -1:239 original_filename = outdir + "/" + filename240 ident=getID(original_filename)241 if NDG_dataProvider:242 newident=ident.replace(":","__")243 print "identifier is %s" %newident244 molesLocalID = newident.split("__",2)[2]245 else:246 molesLocalID = ident247 print "molesLocalID is %s" %molesLocalID248 commandline = "java -jar D2B/d2boneoff.jar repositoryID " +datacentre_namespace+" repositoryLocalID "+datacentre+" format "+ \249 datacentre_format+" repository xmldb:exist://glue.badc.rl.ac.uk:8080/exist/xmlrpc userpw xxxxxx targetCollection /db/discovery/original/"+ \250 datacentre_format+"/"+datacentre_namespace +" inputRecordID "+ ident+ " outputLocalID "+molesLocalID+ " > ./DIF2MOLES/"+filename251 print 
"Executing command to run d2boneoff.jar"252 status= os.system(commandline)253 if status==10:254 print "WARNING: couldn't find the record"255 elif status!=0:256 print "ERROR: couldn't create the minimum moles records"257 sys.exit268 #filenames = os.listdir(outdir) 269 #for filename in filenames: 270 # if filename.find('.xml') != -1: 271 # original_filename = outdir + "/" + filename 272 # ident=getID(original_filename) 273 # if NDG_dataProvider: 274 # newident=ident.replace(":","__") 275 # print "identifier is %s" %newident 276 # molesLocalID = newident.split("__",2)[2] 277 # else: 278 # molesLocalID = ident 279 # print "molesLocalID is %s" %molesLocalID 280 # commandline = "java -jar D2B/d2boneoff.jar repositoryID " +datacentre_namespace+" repositoryLocalID "+datacentre+" format "+ \ 281 # datacentre_format+" repository xmldb:exist://glue.badc.rl.ac.uk:8080/exist/xmlrpc userpw xxxxxx targetCollection /db/discovery/original/"+ \ 282 # datacentre_format+"/"+datacentre_namespace +" inputRecordID "+ ident+ " outputLocalID "+molesLocalID+ " > ./DIF2MOLES/"+filename 283 # print "Executing command to run d2boneoff.jar" 284 # status= os.system(commandline) 285 # if status==10: 286 # print "WARNING: couldn't find the record" 287 # elif status!=0: 288 # print "ERROR: couldn't create the minimum moles records" 289 # sys.exit 258 290 #There should be some records now 259 291 try: … … 263 295 sys.exit() 264 296 297 # Then run the minimum moles creator for each discovery record 298 # Put records in ./DIF2MOLES with same filename 299 300 # First get the list of discovery record ids from the db collection 301 targetCollection = "/db/discovery/original/"+datacentre_format+ "/" +datacentre_namespace 302 if datacentre_format == 'DIF': 303 ndgDir=ndgDirectory(targetCollection,host,docType='DIF') 304 #print ndgDir.members 305 else: 306 print 'ERROR: mini-moles creation does not handle MDIP yet! 
So this WILL FAIL (probably)' 307 308 309 #create the mini-moles for each Discovery record in the collection 310 for member in ndgDir.members: 311 #print member 312 filename= member['fileName'] 313 disc_id = member['EntryID'] 314 print "INFO: internal id = %s" %disc_id 315 print "INFO: discovery filename = %s" %filename 316 # now create the xquery 317 # sort out the output ID stuff ... 318 if NDG_dataProvider: 319 discObj=ndgObject(disc_id) 320 xquery=xq.actual('dif2moles',targetCollection,discObj.repository,discObj.localID) 321 else: 322 xquery=xq.actual('dif2moles',targetCollection,datacentre_namespace,disc_id) 323 # and then sort out the input ID stuff 324 xquery=xquery.replace('Input_Entry_ID',disc_id) 325 xquery=xquery.replace('repository_localid', datacentre_namespace ) 326 #print xq.help('dif2moles') 327 molesid,s=xmldb.executeQuery(xquery) 328 moles_from_dif=xmldb.retrieve(molesid,0) 329 #print moles_from_dif 330 # now write out xml to file 331 outdir= './DIF2MOLES' 332 f=open(outdir+"/"+filename,'w') 333 f.write(moles_from_dif) 334 f.close() 335 265 336 #Add keywords if necessary 266 337 if datacentre_groups == "": 267 338 commandline = "ls -1 ./DIF2MOLES/ | xargs -i mv ./DIF2MOLES/{\} ./FINALMOLES/" 268 print " Executing : " + commandline339 print "INFO: Executing : " + commandline 269 340 status = os.system(commandline) 270 341 if status !=0: … … 274 345 275 346 # ingest the created discovery minimum molesrecords into eXist db. 276 commandline = "$EXIST_HOME/bin/client.sh -c /db/discovery/moles -u admin -P xxxxxx-p ./FINALMOLES"277 print " Executing : actual command to ingest into exist db"347 commandline = "$EXIST_HOME/bin/client.sh -c /db/discovery/moles -u admin -P "+db_admin+" -p ./FINALMOLES" 348 print "INFO: Executing : actual command to ingest into exist db." 
278 349 status = os.system(commandline) 279 350 if status !=0: … … 286 357 this_backupdir = backupdir + datacentre + "_" + date_string + "_originals" 287 358 commandline = "mkdir " + this_backupdir 288 print " Executing : " + commandline359 print "INFO: Executing : " + commandline 289 360 status = os.system(commandline) 290 361 if status !=0: … … 292 363 293 364 commandline = "ls -1 /usr/local/WSClients/OAIBatch/data/" + datacentre +"/oai/originals/ | xargs -i cp /usr/local/WSClients/OAIBatch/data/" + datacentre +"/oai/originals/{\} " + this_backupdir 294 print " Executing : " + commandline365 print "INFO: Executing : " + commandline 295 366 status = os.system(commandline) 296 367 if status !=0: … … 299 370 this_backupdir = backupdir + datacentre + "_" + date_string + "_discovery" 300 371 commandline = "mkdir " + this_backupdir 301 print " Executing : " + commandline372 print "INFO: Executing : " + commandline 302 373 status = os.system(commandline) 303 374 if status !=0: … … 305 376 306 377 commandline = "ls -1 /usr/local/WSClients/OAIBatch/data/" + datacentre +"/discovery/ | xargs -i cp /usr/local/WSClients/OAIBatch/data/" + datacentre +"/discovery/{\} " + this_backupdir 307 print " Executing : " + commandline378 print "INFO: Executing : " + commandline 308 379 status = os.system(commandline) 309 380 if status !=0: … … 312 383 this_backupdir = backupdir + datacentre + "_" + date_string + "_FINALMOLES" 313 384 commandline = "mkdir " + this_backupdir 314 print " Executing : " + commandline385 print "INFO: Executing : " + commandline 315 386 status = os.system(commandline) 316 387 if status !=0: … … 318 389 319 390 commandline = "ls -1 ./FINALMOLES | xargs -i cp ./FINALMOLES/{\} " + this_backupdir 320 print " Executing : " + commandline391 print "INFO: Executing : " + commandline 321 392 status = os.system(commandline) 322 393 if status !=0: … … 325 396 #Clear out the original harvest records area and FINALMOLES 326 397 commandline = "ls -1 ./FINALMOLES | xargs -i 
rm ./FINALMOLES/{\}" 327 print " Executing : " + commandline398 print "INFO: Executing : " + commandline 328 399 status = os.system(commandline) 329 400 if status !=0: … … 331 402 332 403 commandline = "ls -1 " + harvest_home + " | xargs -i rm " + harvest_home + "/{\}" 333 print " Executing : " + commandline404 print "INFO: Executing : " + commandline 334 405 status = os.system(commandline) 335 406 if status !=0: 336 407 sys.exit("Failed at clearing out original harvest records area %s" %harvest_home) 337 338 339 340 #remove the DIF2MOLES directory341 ##commandline = "rmdir ./DIF2MOLES"342 #print "Executing : " + commandline343 #status = os.system(commandline)344 #if status !=0:345 # sys.exit("Failed at removing DIF2MOLES directory %s" %harvest_home)346 408 347 409 print "======================================================" … … 350 412 print " Procedure oai_ingest.py ran to end" 351 413 else: 352 print " Procedure oai_ingest.py FAILED with status %s" %status414 print "ERROR: Procedure oai_ingest.py FAILED with status %s" %status 353 415 354 416 print "======================================================"
Note: See TracChangeset for help on using the changeset viewer.