Changeset 3810 for TI01-discovery
- Timestamp: 21/04/08 13:10:30 (13 years ago)
- Location: TI01-discovery/branches/ingestAutomation-upgrade/OAIBatch
- Files: 1 added, 3 edited
Legend:
- Unmodified
- Added
- Removed
-
TI01-discovery/branches/ingestAutomation-upgrade/OAIBatch/FileUtilities.py
r3800 r3810 1 import os 2 import sys 1 import os, sys 2 from Logger import Logger 3 3 4 class FileUtilities: 4 5 ''' 5 6 Various helper methods for setting up and cleaning filesystems 6 C Byrom Apr 08 7 ''' 8 9 def makepath(path): 10 """ creates missing directories for the given path and 11 returns a normalized absolute version of the path. 12 13 - if the given path already exists in the filesystem 14 the filesystem is not modified. 15 16 - otherwise makepath creates directories along the given path 17 using the dirname() of the path. You may append 18 a '/' to the path if you want it to be a directory path. 19 20 from holger@trillke.net 2002/03/18 21 """ 22 from os import makedirs 23 from os.path import normpath,dirname,exists,abspath 24 25 print "INFO: Creating dir: %s" %path 26 dpath = normpath(dirname(path)) 27 if not exists(dpath): 28 makedirs(dpath) 29 return normpath(abspath(path)) 30 31 def setUpDir(dir): 32 ''' 33 Clean out specified directory - or create this, if it doesn't already exist 34 @param dir: directory to set up/clean 35 ''' 36 print "Setting up directory, %s" %dir 37 if os.path.isdir(dir): 38 cleanDir(dir) 39 else: 40 try: 41 makepath(dir) 42 except: 43 sys.exit("Failed at setting up directory, %s" %dir) 7 C Byrom Apr 08 8 ''' 9 logger = None 10 def __init__(self, inLogger): 11 ''' 12 Constructor - to set up the utils with a default logger 13 @param logger: logger to use; if none specified, use default logger. 
NB, logger must 14 support a 'printOutput' method 15 ''' 16 if inLogger is not None: 17 self.logger = inLogger 18 else: 19 self.logger = Logger(True) 44 20 45 21 46 def cleanDir(dir): 47 ''' 48 Remove all files from the specified directory 49 @param dir: directory to clean 50 ''' 51 print "INFO: Cleaning dir %s" %dir 52 commandline = "ls -1 " + dir + " | xargs -i rm " + dir + "{\}" 53 print "INFO: Executing : " + commandline 54 status = os.system(commandline) 55 if status !=0: 56 sys.exit("Failed at cleaning out directory, %s" %dir) 22 def makepath(self, path): 23 ''' creates missing directories for the given path and 24 returns a normalized absolute version of the path. 25 26 - if the given path already exists in the filesystem 27 the filesystem is not modified. 28 29 - otherwise makepath creates directories along the given path 30 using the dirname() of the path. You may append 31 a '/' to the path if you want it to be a directory path. 32 33 from holger@trillke.net 2002/03/18 34 ''' 35 from os import makedirs 36 from os.path import normpath,dirname,exists,abspath 37 38 self.logger.printOutput("INFO: Creating dir: " + path) 39 dpath = normpath(dirname(path)) 40 if not exists(dpath): 41 makedirs(dpath) 42 return normpath(abspath(path)) 57 43 58 59 def makeBackUp(original_dir, backup_dir): 60 ''' 61 Copy contents of original dir into backup dir 62 @param original_dir: dir to backup 63 @param backup_dir: dir to backup to 64 ''' 65 print "Creating backup directory (%s --> %s)" %original_dir %backup_dir 66 commandline = "mkdir " + backup_dir 67 print "INFO: Executing : " + commandline 68 status = os.system(commandline) 69 if status !=0: 70 sys.exit("Failed at creating backup directory %s" %backup_dir) 44 45 def setUpDir(self, dir): 46 ''' 47 Clean out specified directory - or create this, if it doesn't already exist 48 @param dir: directory to set up/clean 49 ''' 50 self.logger.printOutput("Setting up directory, " + dir) 51 if os.path.isdir(dir): 52 self.cleanDir(dir) 
53 else: 54 try: 55 makepath(dir) 56 except: 57 sys.exit("Failed at setting up directory, %s" %dir) 71 58 72 commandline = "ls -1 " + original_dir + " | xargs -i cp " + original_dir + "{\} " + backup_dir 73 print "INFO: Executing : " + commandline 74 status = os.system(commandline) 75 if status !=0: 76 sys.exit("Failed at copying to backup directory %s" %backup_dir) 77 59 60 def cleanDir(self, dir): 61 ''' 62 Remove all files from the specified directory 63 @param dir: directory to clean 64 ''' 65 self.logger.printOutput("INFO: Cleaning dir " + dir) 66 commandline = "ls -1 " + dir + " | xargs -i rm " + dir + "{\}" 67 self.logger.printOutput("INFO: Executing : " + commandline) 68 69 status = os.system(commandline) 70 if status !=0: 71 sys.exit("Failed at cleaning out directory, %s" %dir) 72 73 74 def makeBackUp(self, original_dir, backup_dir): 75 ''' 76 Copy contents of original dir into backup dir 77 @param original_dir: dir to backup 78 @param backup_dir: dir to backup to 79 ''' 80 self.logger.printOutput("Creating backup directory (" + original_dir + \ 81 " --> " + backup_dir + ")") 82 commandline = "mkdir " + backup_dir 83 self.logger.printOutput("INFO: Executing : " + commandline) 84 status = os.system(commandline) 85 if status !=0: 86 sys.exit("Failed at creating backup directory %s" %backup_dir) 87 88 commandline = "ls -1 " + original_dir + " | xargs -i cp " + original_dir + "{\} " + backup_dir 89 self.logger.printOutput("INFO: Executing : " + commandline) 90 status = os.system(commandline) 91 if status !=0: 92 sys.exit("Failed at copying to backup directory %s" %backup_dir) 93 -
TI01-discovery/branches/ingestAutomation-upgrade/OAIBatch/SchemaNameSpace.py
r3797 r3810 1 1 import sys 2 from Logger import Logger 2 3 3 4 class SchemaNameSpace: … … 6 7 NB, only currently handles correction of DIF files 7 8 ''' 8 def __init__(self,infile,outfile,format ):9 def __init__(self,infile,outfile,format, logger): 9 10 ''' 10 11 Constructor - with the logic to do the namespace change … … 19 20 for self.line in self.lines: 20 21 if self.format== "DIF" and self.line.startswith('<DIF'): 21 print "INFO: changing line for %s. output to %s" %(infile,outfile) 22 message = "INFO: changing line for %s. output to %s" %(infile,outfile) 23 if (logger is None): 24 print message 25 else: 26 logger.printOutput(message) 22 27 self.line='<DIF xmlns="http://gcmd.gsfc.nasa.gov/Aboutus/xml/dif/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">\n' 23 28 self.ww.write(self.line) 24 29 self.ff.close() 25 30 self.ww.close() 26 27 if __name__=="__main__":28 import sys29 f=sys.argv[1]30 w=sys.argv[2]31 form=sys.argv[3]32 SchemaNameSpace(f,w,form) -
TI01-discovery/branches/ingestAutomation-upgrade/OAIBatch/oai_ingest_new.py
r3800 r3810 27 27 # + remove dependency on eXist DB 28 28 29 import os 30 import sys 31 import commands 32 import string 29 import os, sys, commands, string, getopt 33 30 import keywordAdder 34 31 from SchemaNameSpace import SchemaNameSpace … … 41 38 from ndgUtils.ndgDirectory import ndgDirectory 42 39 from ndgUtils.eXistInterface import ndg_eXist 43 from FileUtilities import setUpDir, cleanDir, makeBackUp40 from FileUtilities import FileUtilities 44 41 from PostgresRecord import PostgresRecord 45 import PostgresDAL 46 42 import PostgresDAO 43 import renderEntity 44 from Logger import Logger 45 46 47 47 def getID(filename): 48 48 ''' … … 52 52 @return: ID - id to use to refer to the document 53 53 ''' 54 logger.printOutput("INFO: Retrieving identifier for metadata record " + filename) 54 55 xml=file(filename).read() 55 56 if datacentre_format == "DIF": … … 61 62 else: 62 63 sys.exit("Only handles DIF or MDIP here.") 64 65 logger.printOutput("Found identifier: " + ID) 63 66 return ID 67 64 68 65 69 def addFileToPostgresDB(filename): … … 68 72 data in the process 69 73 ''' 70 print "Adding file, %s, to postgres DB" %filename74 logger.printOutput("Adding file, " + filename + ", to postgres DB") 71 75 72 76 # first of all create a PostgresRecord - this object represents all the data required 73 77 # for a DB entry 74 record = PostgresRecord(filename, NDG_dataProvider) 75 dal = PostgresDAL(record) 76 dal.createOrUpdateRecord() 78 record = PostgresRecord(filename, NDG_dataProvider, targetCollection, datacentre_namespace, 'discovery_idTEST', xq, datacentre_format) 79 dao = PostgresDAO(record) 80 dao.createOrUpdateRecord() 81 77 82 78 83 def getConfigDetails(datacentre): … … 90 95 global datacentre_config_filename, NDG_dataProvider 91 96 datacentre_config_filename = base_dir + datacentre + "_config.properties" 92 print "INFO: Retrieving data from datacentre config file = %s" %datacentre_config_filename97 logger.printOutput("INFO: Retrieving data from datacentre config 
file, " + datacentre_config_filename) 93 98 94 99 # Check this file exists; if not, assume an invalid datacentre has been specified … … 119 124 sys.exit("Failed at getting harvested records directory stage. datacentre config file tried = %s" %datacentre_config_filename) 120 125 121 print "INFO: harvested records are in %s" %harvest_home126 logger.printOutput("INFO: harvested records are in " + harvest_home) 122 127 123 128 if datacentre_groups == "": 124 print "INFO: No groups/keywords set for datacentre %s" %datacentre129 logger.printOutput("INFO: No groups/keywords set for datacentre " + datacentre) 125 130 else: 126 print "INFO: datacentre groups/keywords = %s" %datacentre_groups131 logger.printOutput("INFO: datacentre groups/keywords: " + datacentre_groups) 127 132 128 133 if datacentre_format == "": 129 134 sys.exit("Failed at stage: getting datacentre format. datacentre config file tried = %s" %datacentre_config_filename) 130 135 131 print "INFO: format being harvested = %s" %datacentre_format136 logger.printOutput("INFO: format being harvested: " + datacentre_format) 132 137 133 138 if datacentre_namespace == "": 134 139 sys.exit("Failed at stage: getting datacentre namespace. 
datacentre config file tried = %s" %datacentre_config_filename) 135 140 136 print "INFO: datacentre namespace = %s" %datacentre_namespace141 logger.printOutput("INFO: datacentre namespace: " + datacentre_namespace) 137 142 print lineSeparator 138 139 140 def usage():141 '''142 Display input params for the script143 '''144 print "Usage: python oai_ingest.py <datacentre> (<dbInfoFile>)"145 print " - where:\n <datacentre> is the data centre to ingest data from; and"146 print " <dbInfoFile> provides info on the eXist DB to use to do XQuery transforms"147 sys.exit()148 143 149 144 … … 153 148 into the other required doc types 154 149 ''' 155 print "INFO: Putting original docs in eXist..."150 logger.printOutput("INFO: Putting original docs in eXist...") 156 151 # this command creates the targetCollection in eXist from the contents of the discovery_corrected_dir 157 152 commandline = "$EXIST_HOME/bin/client.sh -c " + targetCollection + " -u admin -P " + \ 158 153 db_admin + " -p " + discovery_corrected_dir 159 print lineSeparator160 print "INFO: Executing : actual command to ingest into exist db", commandline154 logger.printOutput(lineSeparator) 155 logger.printOutput("INFO: Executing : actual command to ingest into exist db: " + commandline) 161 156 status = os.system(commandline) 162 157 if status !=0: 163 158 sys.exit("Failed at ingesting into exist db. Datacentre = %s. Status = %s" %(datacentre,status)) 164 print "INFO: Original docs now in eXist"165 print lineSeparator159 logger.printOutput("INFO: Original docs now in eXist") 160 logger.printOutput(lineSeparator) 166 161 167 162 … … 172 167 ''' 173 168 # First get the list of discovery record ids from the db collection 174 print "INFO: Creating minimoles documents from original docs in eXist" 175 if datacentre_format == 'DIF': 176 ndgDir=ndgDirectory(targetCollection,host, datacentre_format.upper()) 177 else: 178 print 'ERROR: mini-moles creation does not handle MDIP yet! 
So this WILL FAIL (probably)' 169 logger.printOutput("INFO: Creating minimoles documents from original docs in eXist") 170 # if datacentre_format == 'DIF': 171 172 ndgDir = ndgDirectory(targetCollection, host, datacentre_format.upper()) 173 # else: 174 # print 'ERROR: mini-moles creation does not handle MDIP yet! So this WILL FAIL (probably)' 179 175 180 176 #create the mini-moles for each Discovery record in the collection … … 186 182 # now create the xquery 187 183 # sort out the output ID stuff ... 184 xQueryType = 'dif2moles' 188 185 if NDG_dataProvider: 189 186 discObj=ndgObject(disc_id) 190 xquery=xq.actual('dif2moles',targetCollection,discObj.repository,discObj.localID) 187 xquery=xq.actual(xQueryType, targetCollection, discObj.repository, discObj.localID) 188 print "REPOSITORY: %s, DISC_ID: %s" %(discObj.repository, discObj.localID) 191 189 else: 192 xquery=xq.actual( 'dif2moles',targetCollection,datacentre_namespace,disc_id)190 xquery=xq.actual(xQueryType, targetCollection, datacentre_namespace, disc_id) 193 191 194 192 # and then sort out the input ID stuff 195 xquery=xquery.replace('Input_Entry_ID', disc_id)193 xquery=xquery.replace('Input_Entry_ID', disc_id) 196 194 xquery=xquery.replace('repository_localid', datacentre_namespace ) 197 195 … … 218 216 addKeywords() 219 217 220 221 commandline = "$EXIST_HOME/bin/client.sh -c /db/discovery/moles-u admin -P " + db_admin + \222 223 224 225 226 218 # ingest the created discovery minimum molesrecords into eXist db. 219 commandline = "$EXIST_HOME/bin/client.sh -c " + miniMolesCollection + " -u admin -P " + db_admin + \ 220 " -p " + finalmoles_dir 221 print "INFO: Executing : actual command to ingest into exist db." 222 status = os.system(commandline) 223 if status !=0: 224 sys.exit("Failed at ingesting into exist db. Datacentre = %s. 
Status = %s" %(datacentre,status)) 227 225 228 226 … … 234 232 " -u admin -P " + db_admin + " -m " 235 233 collections = ['/db/xqueryLib', '/db/discovery/moles'] 236 print lineSeparator234 logger.printOutput(lineSeparator) 237 235 for collection in collections: 238 236 cmd = commandline + collection + " -p" 239 print "INFO: Executing : actual command to create DB collection ", cmd237 logger.printOutput("INFO: Executing : actual command to create DB collection: " + cmd) 240 238 status = os.system(cmd) 241 239 print status … … 243 241 sys.exit("Failed to create DB collection. Status = %s" %(status)) 244 242 245 246 243 247 244 def addKeywords(): … … 249 246 commandline = "ls -1 " + minimoles_dir + " | xargs -i mv " + minimoles_dir + \ 250 247 "{\} " + finalmoles_dir 251 print "INFO: Executing : " + commandline248 logger.printOutput("INFO: Executing : " + commandline) 252 249 status = os.system(commandline) 253 250 if status !=0: … … 264 261 commandline = "$EXIST_HOME/bin/client.sh -c /db " + \ 265 262 " -u admin -P " + db_admin + " -p " + base_dir + 'xquery' 266 print lineSeparator267 print "INFO: Executing : actual command to ingest into exist db", commandline263 logger.printOutput(lineSeparator) 264 logger.printOutput("INFO: Executing : actual command to ingest into exist db: " + commandline) 268 265 status = os.system(commandline) 269 print status270 266 if status !=0: 271 267 sys.exit("Failed at ingesting into exist db. Datacentre = %s. 
Status = %s" %(datacentre,status)) 272 268 273 269 274 270 def usage(): 271 ''' 272 Display input params for the script 273 ''' 274 print "Usage: python -v oai_ingest.py <datacentre> (<dbInfoFile>)" 275 print " - where:\n <datacentre> is the data centre to ingest data from; and" 276 print " <dbInfoFile> provides info on the eXist DB to use to do XQuery transforms" 277 print " -v - verbose mode for output logging" 278 sys.exit(2) 279 280 275 281 lineSeparator = "-----------------------------" 276 282 print lineSeparator … … 278 284 print lineSeparator 279 285 280 if (len(sys.argv) < 2 or len(sys.argv) > 3): 286 verboseMode = False 287 288 # check for verbose option 289 try: 290 opts, args = getopt.getopt(sys.argv[1:], "v") 291 except getopt.GetoptError, err: 292 # print help information and exit: 293 print str(err) # will print something like "option -a not recognized" 294 usage() 295 296 for o, a in opts: 297 if o == "-v": 298 verboseMode = True 299 300 if (len(args) < 1 or len(args) > 2): 281 301 usage() 282 302 else: 283 datacentre = sys.argv[1]303 datacentre = args[0] 284 304 285 305 # set the default password file 286 306 dbinfoname = "ingest.txt" 287 if (len(sys.argv) == 3): 288 dbinfoname = sys.argv[2] 307 if (len(args) == 2): 308 dbinfoname = args[1] 309 310 # set up the logger to use 311 logger = Logger(verboseMode) 312 313 # set up the file utils to use this logger 314 fileUtils = FileUtilities(logger) 289 315 290 316 status = 0 291 317 numfilesproc = 0 292 base_dir = "/usr/local/WSClients/OAIBatch/" # this is the base dir that the script is ran from293 #base_dir = os.getcwd() + "/"# this is the base dir that the script is ran from318 #base_dir = "/usr/local/WSClients/OAIBatch/" # this is the base dir that the script is ran from 319 base_dir = os.getcwd() + "/"# this is the base dir that the script is ran from 294 320 295 321 data_dir = base_dir + "data/" + datacentre # dir relating to the specified datacentre docs … … 300 326 # Other settings and 
constants 301 327 date_string = commands.getoutput("date +'%y%m%d_%H%M'") 328 302 329 #os.putenv('EXIST_HOME', '/usr/local/exist-client') 303 330 os.putenv('EXIST_HOME', '/home/users/cbyrom/opt/eXist') … … 314 341 #any records to harvest? 315 342 if len( os.listdir(harvest_home)) == 0: 316 print "INFO: Nothing to harvest this time from %s" %datacentre343 logger.printOutput("INFO: Nothing to harvest this time from " + datacentre) 317 344 sys.exit() 318 345 … … 342 369 343 370 # Create/clear the 'in' directory pristine copy of the discovery records 344 setUpDir(originals_dir)371 fileUtils.setUpDir(originals_dir) 345 372 commandline = "ls -1 " + harvest_home + "/ | xargs -i cp " + harvest_home + "/{\} " + originals_dir 346 print "INFO: Executing : " + commandline 373 logger.printOutput("INFO: Executing : " + commandline) 347 374 status = os.system(commandline) 375 348 376 if status !=0: 349 377 sys.exit("Failed at making pristine copy stage") 350 378 351 379 # Create/clear the directory for the 'out' processed copy of the discovery records. 352 setUpDir(discovery_dir)380 fileUtils.setUpDir(discovery_dir) 353 381 354 382 # Create/clear the directory for the 'out' namespace corrected copy of the discovery records. 355 setUpDir(discovery_corrected_dir)383 fileUtils.setUpDir(discovery_corrected_dir) 356 384 357 385 # The file config.properties contains the location of the particular datacentres harvested records. 358 386 # Copy the datacentre specific version of config to config.properties file. 
359 387 commandline = "cp " + datacentre_config_filename + " " + base_dir + "config.properties" 360 print "INFO: Executing : " + commandline 388 logger.printOutput("INFO: Executing : " + commandline) 361 389 status = os.system(commandline) 362 390 if status !=0: … … 365 393 #Execute the script which processes/renames the files (changed 08/01/07 to get id from inside file) 366 394 # NB, this copies files from the original dir to the discovery dir 367 print lineSeparator 368 print "INFO: Renaming files:" 395 logger.printOutput(lineSeparator) 396 logger.printOutput("INFO: Renaming files:") 369 397 for filename in os.listdir(originals_dir): 370 398 if filename.find('.xml') != -1: 371 399 original_filename = originals_dir + filename 372 400 ident=getID(original_filename) 373 print "INFO: ID extracted from the discovery record = %s" %ident374 401 if NDG_dataProvider: 375 402 new_filename = discovery_dir + ident.replace(":","__")+".xml" … … 379 406 new_filename = discovery_dir + "/" +datacentre_namespace+ "__"+datacentre_format+ "__"+ ident +".xml" 380 407 381 print "original file = %s\nnewfile = %s" %(original_filename, new_filename) 408 logger.printOutput("original file = " + original_filename) 409 logger.printOutput("newfile = " + new_filename) 382 410 383 411 try: … … 391 419 #replace any namespace declarations with a standard one which we know works in NDG 392 420 # NB, this copies files from the discovery dir to the discovery corrected dir 393 print lineSeparator 394 print "INFO: Correcting namespaces of files (corrected files in %s)" %discovery_corrected_dir 421 logger.printOutput(lineSeparator) 422 logger.printOutput("INFO: Correcting namespaces of files (corrected files in " + \ 423 discovery_corrected_dir + ")") 395 424 for filename in os.listdir(discovery_dir): 396 425 if filename.find('.xml') != -1: … … 398 427 corrected_filename = discovery_corrected_dir + filename 399 428 try: 400 SchemaNameSpace(in_filename, corrected_filename,datacentre_format )429 
SchemaNameSpace(in_filename, corrected_filename,datacentre_format, logger) 401 430 except: 402 431 print "ERROR: SchemaNameSpace failed on file %s"%in_filename … … 408 437 # now set up the required XQueries and check xqueries libs are available from eXist 409 438 targetCollection = "/db/discovery/original/" +datacentre_format+ "/" +datacentre_namespace 439 miniMolesCollection = "/db/discovery/moles" 410 440 xq=ndgXqueries() 411 441 xmldb=ndg_eXist(db='' + host + '') … … 418 448 minimoles_dir = base_dir + 'MINIMOLES/' 419 449 finalmoles_dir = base_dir + 'FINALMOLES/' 420 setUpDir(minimoles_dir)421 setUpDir(finalmoles_dir)450 fileUtils.setUpDir(minimoles_dir) 451 fileUtils.setUpDir(finalmoles_dir) 422 452 createEXistMiniMoles() 423 453 424 454 # TODO: need to skip the eXist steps below and just populate the postGres DB 425 # 1. using the discovery_corrected_dir - (do we need the non corrected files?) - import into original column426 455 # 2. use the correct xquery to transform the original doc directly and store t 427 456 … … 432 461 addFileToPostgresDB(discovery_corrected_dir + filename) 433 462 434 #SpaceTimeIngestFromMOLES.main("./FINALMOLES")435 436 #Extract the spatiotemporal info and put into NEW postgis tables437 #SpaceTimeIngestPostgisFromMOLES.main("./FINALMOLES")438 439 463 #Make copies of discovery and oai/originals and DIF2MOLES areas to backup area for tape backups 440 464 backupdir_base = backupdir + datacentre + "_" + date_string 441 465 this_backupdir = backupdir_base + "_originals" 442 makeBackUp(originals_dir, this_backupdir)466 fileUtils.makeBackUp(originals_dir, this_backupdir) 443 467 444 468 this_backupdir = backupdir_base + "_discovery" 445 makeBackUp(discovery_dir, this_backupdir)469 fileUtils.makeBackUp(discovery_dir, this_backupdir) 446 470 447 471 this_backupdir = backupdir_base + "_FINALMOLES" 448 makeBackUp("./FINALMOLES", this_backupdir)472 fileUtils.makeBackUp("./FINALMOLES", this_backupdir) 449 473 450 474 #Clear out the original 
harvest records area and FINALMOLES 451 cleanDir("./FINALMOLES")452 cleanDir(harvest_home)475 fileUtils.cleanDir("./FINALMOLES") 476 fileUtils.cleanDir(harvest_home) 453 477 454 478 print "======================================================"
Note: See TracChangeset for help on using the changeset viewer.