Changeset 3168 for TI01-discovery/trunk


Ignore:
Timestamp:
19/12/07 18:16:28 (12 years ago)
Author:
selatham
Message:

Re-write to use the Pythonic dif2moles XQuery rather than Java. Extract passwords too.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • TI01-discovery/trunk/ingestAutomation/OAIBatch/oai_ingest.py

    r3101 r3168  
    2323# 23/10/06 SEL keywords not mandatory in config file. 
    2424# 24/10/06 SEL fix bug where 'in' directory not being cleared initially. display more information when running. 
     25#  December 2007 SEL rewrite to use Bryan's Python XQuery stuff to create mini-moles instead of Java. 
     26#                    Also extracted hard-coded passwords into a file. 
    2527 
    2628import os 
     
    3335from DIF import DIF 
    3436from MDIP import MDIP 
     37import ndgUtils 
     38from ndgUtils.ndgXqueries import ndgXqueries 
     39from ndgUtils.eXistInterface import ndg_eXist 
     40from ndgUtils.ndgObject import ndgObject 
     41import ConfigParser 
     42from xmlrpclib import Fault 
     43from ndgUtils.elementtree import ElementTree as ET 
     44from ndgUtils.ndgDirectory import ndgDirectory 
    3545 
    3646def getID(filename): 
     
    5565NDG_dataProvider = False 
    5666 
    57 if (len(sys.argv) < 2): 
    58     print "<datacentre> parameter not supplied." 
     67if (len(sys.argv) < 3): 
     68    print "ERROR: <datacentre> or <db info file> parameter not supplied." 
    5969    sys.exit() 
    6070else: 
    6171    datacentre = sys.argv[1] 
     72    dbinfoname = sys.argv[2] 
    6273 
    6374# Other settings and constants 
    6475date_string = commands.getoutput ("date +'%y%m%d_%H%M'") 
    6576os.putenv ('EXIST_HOME', '/usr/local/exist-client') 
    66 os.putenv ('JAVA_HOME', '/usr/java/jdk1.5.0_03') 
     77#os.putenv ('JAVA_HOME', '/usr/java/jdk1.5.0_03') 
    6778os.putenv ('PATH', ':/usr/java/jdk1.5.0_03/bin:/usr/java/jdk1.5.0_03:/usr/java/jdk1.5.0_03/lib/tools.jar:/usr/local/WSClients/OAIBatch:/usr/local/exist-client/bin:/bin:/usr/bin:.') 
    68 os.putenv ('CLASSPATH','.:/usr/java/jdk1.5.0_03/lib/tools.jar') 
     79#os.putenv ('CLASSPATH','.:/usr/java/jdk1.5.0_03/lib/tools.jar') 
     80 
     81#Xquery settings 
     82#f='glue.config' 
     83#c=ConfigParser.ConfigParser() 
     84#c.read(f) 
     85xq=ndgXqueries() 
     86xmldb=ndg_eXist(db='glue.badc.rl.ac.uk') 
    6987 
    7088# Get the harvested records directory and groups for this datacentre from the datacentre specific config file 
     
    7391# Groups are added to the intermediate MOLES when it is created. 
    7492datacentre_config_filename = "/usr/local/WSClients/OAIBatch/" + datacentre + "_config.properties" 
    75 print "Datacentre config file = %s" %datacentre_config_filename 
     93print "INFO: Datacentre config file = %s" %datacentre_config_filename 
    7694datacentre_config_file = open(datacentre_config_filename, "r") 
    7795 
     
    115133#any records to harvest? 
    116134if len( os.listdir(harvest_home)) == 0: 
    117     print "Nothing to harvest this time from %s" %datacentre 
     135    print "INFO: Nothing to harvest this time from %s" %datacentre 
    118136    sys.exit() 
     137 
     138# get the db access info 
     139host ='glue.badc.rl.ac.uk' 
     140dbaccess={} 
     141dbinfo_file=open(dbinfoname,"r") 
     142for line in dbinfo_file.readlines(): 
     143    words  = string.split(line) 
     144    if len(words) < 2: 
     145        continue 
     146    dbaccess[(words[0],words[1])] = [words[2]] 
     147datacentre_config_file.close() 
     148#print dbaccess 
     149db_admin = dbaccess[(host,'admin')][0] 
     150#print db_admin 
    119151 
    120152# The directory to put things for a tape backup (should already exist) 
     
    124156if os.path.isdir("/usr/local/WSClients/OAIBatch/data/" + datacentre +"/oai/originals"): 
    125157    commandline = "ls -1 /usr/local/WSClients/OAIBatch/data/" + datacentre +"/oai/originals/ | xargs -i rm /usr/local/WSClients/OAIBatch/data/" + datacentre +"/oai/originals/{\}" 
    126     print "Executing : " + commandline 
     158    print "INFO: Executing : " + commandline 
    127159    status = os.system(commandline) 
    128160else: 
    129161    commandline = "mkdir /usr/local/WSClients/OAIBatch/data/" + datacentre +"/oai/originals" 
    130     print "Executing : " + commandline 
     162    print "INFO: Executing : " + commandline 
    131163    status= os.system(commandline) 
    132164 
     
    137169 
    138170commandline = "ls -1 " + harvest_home + "/ | xargs -i cp " + harvest_home + "/{\} /usr/local/WSClients/OAIBatch/data/" + datacentre + "/oai/originals" 
    139 print "Executing : " + commandline 
     171print "INFO: Executing : " + commandline 
    140172status = os.system(commandline) 
    141173if status !=0: 
     
    145177if os.path.isdir("/usr/local/WSClients/OAIBatch/data/" + datacentre +"/discovery"): 
    146178    commandline = "ls -1 /usr/local/WSClients/OAIBatch/data/" + datacentre +"/discovery/ | xargs -i rm /usr/local/WSClients/OAIBatch/data/" + datacentre +"/discovery/{\}" 
    147     print "Executing : " + commandline 
     179    print "INFO: Executing : " + commandline 
    148180    status = os.system(commandline) 
    149181else: 
    150182    commandline = "mkdir /usr/local/WSClients/OAIBatch/data/" + datacentre +"/discovery" 
    151     print "Executing : " + commandline 
     183    print "INFO: Executing : " + commandline 
    152184    status= os.system(commandline) 
    153185 
     
    155187if os.path.isdir("/usr/local/WSClients/OAIBatch/data/" + datacentre +"/discovery_corrected"): 
    156188    commandline = "ls -1 /usr/local/WSClients/OAIBatch/data/" + datacentre +"/discovery_corrected/ | xargs -i rm /usr/local/WSClients/OAIBatch/data/" + datacentre +"/discovery_corrected/{\}" 
    157     print "Executing : " + commandline 
     189    print "INFO: Executing : " + commandline 
    158190    status = os.system(commandline) 
    159191else: 
    160192    commandline = "mkdir /usr/local/WSClients/OAIBatch/data/" + datacentre +"/discovery_corrected" 
    161     print "Executing : " + commandline 
     193    print "INFO: Executing : " + commandline 
    162194    status= os.system(commandline) 
    163195 
     
    165197# Copy the datacentre specific version of config to config.properties file. 
    166198commandline = "cp /usr/local/WSClients/OAIBatch/" + datacentre +"_config.properties /usr/local/WSClients/OAIBatch/config.properties" 
    167 print "Executing : " + commandline 
     199print "INFO: Executing : " + commandline 
    168200status = os.system(commandline) 
    169201if status !=0: 
     
    183215                original_filename = indir + "/" + filename 
    184216                ident=getID(original_filename) 
    185                 print "ID extracted from the discovery record = %s" %ident 
     217                print "INFO: ID extracted from the discovery record = %s" %ident 
    186218                if NDG_dataProvider: 
    187219                    new_filename = outdir + "/"+ ident.replace(":","__")+".xml" 
     
    190222                    ident = ident.replace("/","-") 
    191223                    new_filename = outdir + "/" +datacentre_namespace+ "__"+datacentre_format+ "__"+ ident +".xml" 
    192                 print "original file = %s, newfile = %s" %(original_filename, new_filename) 
     224                print "INFO: original file = %s, newfile = %s" %(original_filename, new_filename) 
    193225                commandline = "cp "+original_filename+ " " +new_filename 
    194226                #print "Executing : " + commandline 
     
    198230                numfilesproc += 1 
    199231        else: 
    200                 print 'File %s is not xml format. Not processed'  %(full_filename) 
     232                print 'WARNING: File %s is not xml format. Not processed'  %(full_filename) 
    201233 
    202234#replace any namespace declarations with a standard one which we know works in NDG 
     
    214246 
    215247# ingest the datacentres records into eXist db (backups of exist happen nightly). 
    216 commandline = "$EXIST_HOME/bin/client.sh -c /db/discovery/original/"+datacentre_format+ "/" +datacentre_namespace+ " -u admin -P xxxxxx -p "+outdir 
    217 print "Executing : actual command to ingest into exist db" 
     248commandline = "$EXIST_HOME/bin/client.sh -c /db/discovery/original/"+datacentre_format+ "/" +datacentre_namespace+ " -u admin -P "+db_admin+" -p "+outdir 
     249print "INFO: Executing : actual command to ingest into exist db" 
    218250status = os.system(commandline) 
    219251if status !=0: 
     
    222254#are there any old moles records hanging around? If so, remove. 
    223255try: 
    224     os.stat("./DIF2MOLES") 
     256   os.stat("./DIF2MOLES") 
    225257except: 
    226     print "No old moles records hanging around" 
    227 else: 
    228     commandline = "ls -1 ./DIF2MOLES | xargs -i rm ./DIF2MOLES/{\}" 
    229     print "Executing : " + commandline 
    230     status = os.system(commandline) 
    231     if status !=0: 
    232         sys.exit("ERROR: Failed at clearing out DIF2MOLES area.") 
     258   print "INFO: No old moles records hanging around" 
     259else: 
     260   commandline = "ls -1 ./DIF2MOLES | xargs -i rm ./DIF2MOLES/{\}" 
     261   print "INFO: Executing : " + commandline 
     262   status = os.system(commandline) 
     263   if status !=0: 
     264       sys.exit("ERROR: Failed at clearing out DIF2MOLES area.") 
    233265 
    234266# Then run the minimum moles creator for each discovery record 
    235267# Put records in ./DIF2MOLES with same filename 
    236 filenames = os.listdir(outdir) 
    237 for filename in filenames: 
    238         if filename.find('.xml') != -1: 
    239                     original_filename = outdir + "/" + filename 
    240                     ident=getID(original_filename) 
    241                     if NDG_dataProvider: 
    242                         newident=ident.replace(":","__") 
    243                         print "identifier is %s" %newident 
    244                         molesLocalID = newident.split("__",2)[2] 
    245                     else: 
    246                         molesLocalID = ident 
    247                     print "molesLocalID is %s" %molesLocalID 
    248                     commandline = "java -jar D2B/d2boneoff.jar repositoryID " +datacentre_namespace+" repositoryLocalID "+datacentre+" format "+ \ 
    249                     datacentre_format+" repository xmldb:exist://glue.badc.rl.ac.uk:8080/exist/xmlrpc userpw xxxxxx targetCollection /db/discovery/original/"+ \ 
    250                     datacentre_format+"/"+datacentre_namespace +" inputRecordID "+ ident+ " outputLocalID "+molesLocalID+ " > ./DIF2MOLES/"+filename 
    251                     print "Executing command to run d2boneoff.jar" 
    252                     status= os.system(commandline) 
    253                     if status==10: 
    254                         print "WARNING: couldn't find the record" 
    255                     elif status!=0: 
    256                         print "ERROR: couldn't create the minimum moles records" 
    257                         sys.exit 
     268#filenames = os.listdir(outdir) 
     269#for filename in filenames: 
     270#        if filename.find('.xml') != -1: 
     271#                    original_filename = outdir + "/" + filename 
     272#                    ident=getID(original_filename) 
     273#                    if NDG_dataProvider: 
     274#                        newident=ident.replace(":","__") 
     275#                        print "identifier is %s" %newident 
     276#                        molesLocalID = newident.split("__",2)[2] 
     277#                    else: 
     278#                        molesLocalID = ident 
     279#                    print "molesLocalID is %s" %molesLocalID 
     280#                    commandline = "java -jar D2B/d2boneoff.jar repositoryID " +datacentre_namespace+" repositoryLocalID "+datacentre+" format "+ \ 
     281#                    datacentre_format+" repository xmldb:exist://glue.badc.rl.ac.uk:8080/exist/xmlrpc userpw xxxxxx targetCollection /db/discovery/original/"+ \ 
     282#                    datacentre_format+"/"+datacentre_namespace +" inputRecordID "+ ident+ " outputLocalID "+molesLocalID+ " > ./DIF2MOLES/"+filename 
     283#                    print "Executing command to run d2boneoff.jar" 
     284#                    status= os.system(commandline) 
     285#                    if status==10: 
     286#                        print "WARNING: couldn't find the record" 
     287#                    elif status!=0: 
     288#                        print "ERROR: couldn't create the minimum moles records" 
     289#                        sys.exit 
    258290#There should be some records now 
    259291try: 
     
    263295    sys.exit() 
    264296 
     297# Then run the minimum moles creator for each discovery record 
     298# Put records in ./DIF2MOLES with same filename 
     299 
     300# First get the list of discovery record ids from the db collection 
     301targetCollection = "/db/discovery/original/"+datacentre_format+ "/" +datacentre_namespace 
     302if datacentre_format == 'DIF': 
     303    ndgDir=ndgDirectory(targetCollection,host,docType='DIF') 
     304    #print ndgDir.members 
     305else: 
     306    print 'ERROR: mini-moles creation does not handle MDIP yet! So this WILL FAIL (probably)' 
     307 
     308 
     309#create the mini-moles for each Discovery record in the collection 
     310for member in ndgDir.members: 
     311    #print member 
     312    filename= member['fileName'] 
     313    disc_id = member['EntryID'] 
     314    print "INFO: internal id = %s" %disc_id 
     315    print "INFO: discovery filename = %s" %filename 
     316    # now create the xquery 
     317    # sort out the output ID stuff ... 
     318    if NDG_dataProvider: 
     319        discObj=ndgObject(disc_id) 
     320        xquery=xq.actual('dif2moles',targetCollection,discObj.repository,discObj.localID) 
     321    else: 
     322        xquery=xq.actual('dif2moles',targetCollection,datacentre_namespace,disc_id) 
     323    # and then sort out the input ID stuff 
     324    xquery=xquery.replace('Input_Entry_ID',disc_id) 
     325    xquery=xquery.replace('repository_localid', datacentre_namespace ) 
     326    #print xq.help('dif2moles') 
     327    molesid,s=xmldb.executeQuery(xquery) 
     328    moles_from_dif=xmldb.retrieve(molesid,0) 
     329    #print moles_from_dif 
     330    # now write out xml to file 
     331    outdir= './DIF2MOLES' 
     332    f=open(outdir+"/"+filename,'w') 
     333    f.write(moles_from_dif) 
     334    f.close() 
     335 
    265336#Add keywords if necessary 
    266337if datacentre_groups == "": 
    267338    commandline = "ls -1 ./DIF2MOLES/ | xargs -i mv ./DIF2MOLES/{\} ./FINALMOLES/" 
    268     print "Executing : " + commandline 
     339    print "INFO: Executing : " + commandline 
    269340    status = os.system(commandline) 
    270341    if status !=0: 
     
    274345 
    275346# ingest the created discovery minimum moles records into eXist db. 
    276 commandline = "$EXIST_HOME/bin/client.sh -c /db/discovery/moles -u admin -P xxxxxx -p ./FINALMOLES" 
    277 print "Executing : actual command to ingest into exist db" 
     347commandline = "$EXIST_HOME/bin/client.sh -c /db/discovery/moles -u admin -P "+db_admin+" -p ./FINALMOLES" 
     348print "INFO: Executing : actual command to ingest into exist db." 
    278349status = os.system(commandline) 
    279350if status !=0: 
     
    286357this_backupdir = backupdir + datacentre + "_" + date_string + "_originals" 
    287358commandline = "mkdir " + this_backupdir 
    288 print "Executing : " + commandline 
     359print "INFO: Executing : " + commandline 
    289360status = os.system(commandline) 
    290361if status !=0: 
     
    292363 
    293364commandline = "ls -1 /usr/local/WSClients/OAIBatch/data/" + datacentre +"/oai/originals/ | xargs -i cp /usr/local/WSClients/OAIBatch/data/" + datacentre +"/oai/originals/{\} " + this_backupdir 
    294 print "Executing : " + commandline 
     365print "INFO: Executing : " + commandline 
    295366status = os.system(commandline) 
    296367if status !=0: 
     
    299370this_backupdir = backupdir + datacentre + "_" + date_string + "_discovery" 
    300371commandline = "mkdir " + this_backupdir 
    301 print "Executing : " + commandline 
     372print "INFO: Executing : " + commandline 
    302373status = os.system(commandline) 
    303374if status !=0: 
     
    305376 
    306377commandline = "ls -1 /usr/local/WSClients/OAIBatch/data/" + datacentre +"/discovery/ | xargs -i cp /usr/local/WSClients/OAIBatch/data/" + datacentre +"/discovery/{\} " + this_backupdir 
    307 print "Executing : " + commandline 
     378print "INFO: Executing : " + commandline 
    308379status = os.system(commandline) 
    309380if status !=0: 
     
    312383this_backupdir = backupdir + datacentre + "_" + date_string + "_FINALMOLES" 
    313384commandline = "mkdir " + this_backupdir 
    314 print "Executing : " + commandline 
     385print "INFO: Executing : " + commandline 
    315386status = os.system(commandline) 
    316387if status !=0: 
     
    318389 
    319390commandline = "ls -1 ./FINALMOLES | xargs -i cp ./FINALMOLES/{\} " + this_backupdir 
    320 print "Executing : " + commandline 
     391print "INFO: Executing : " + commandline 
    321392status = os.system(commandline) 
    322393if status !=0: 
     
    325396#Clear out the original harvest records area and FINALMOLES 
    326397commandline = "ls -1 ./FINALMOLES | xargs -i rm ./FINALMOLES/{\}" 
    327 print "Executing : " + commandline 
     398print "INFO: Executing : " + commandline 
    328399status = os.system(commandline) 
    329400if status !=0: 
     
    331402 
    332403commandline = "ls -1 " + harvest_home + " | xargs -i rm " + harvest_home + "/{\}" 
    333 print "Executing : " + commandline 
     404print "INFO: Executing : " + commandline 
    334405status = os.system(commandline) 
    335406if status !=0: 
    336407    sys.exit("Failed at clearing out original harvest records area %s" %harvest_home) 
    337  
    338  
    339  
    340 #remove the DIF2MOLES directory 
    341 ##commandline = "rmdir ./DIF2MOLES" 
    342 #print "Executing : " + commandline 
    343 #status = os.system(commandline) 
    344 #if status !=0: 
    345 #    sys.exit("Failed at removing DIF2MOLES directory %s" %harvest_home) 
    346408 
    347409print "======================================================" 
     
    350412    print " Procedure oai_ingest.py ran to end" 
    351413else: 
    352     print "Procedure oai_ingest.py FAILED with status %s" %status 
     414    print "ERROR: Procedure oai_ingest.py FAILED with status %s" %status 
    353415 
    354416print "======================================================" 
Note: See TracChangeset for help on using the changeset viewer.