Changeset 3972
- Timestamp: 03/06/08 13:47:43 (13 years ago)
- Location: TI01-discovery/branches/ingestAutomation-upgrade/OAIBatch
- Files: 4 edited
Legend:
- Unmodified
- Added
- Removed
TI01-discovery/branches/ingestAutomation-upgrade/OAIBatch/PostgresDAO.py
r3967 r3972 227 227 ''' 228 228 logging.info("Inserting new original document in Postgres DB") 229 sqlCmd = "SELECT create_document('" + self._record. filename + "', '" + \229 sqlCmd = "SELECT create_document('" + self._record.shortFilename + "', '" + \ 230 230 self._record.discovery_id + "', '" + self._record.docType + "', '" + \ 231 231 self._record.originalFormat + "', '" + self._record.getAuthorsInfo() + "', '" + \ … … 257 257 logging.info("Updating original document in Postgres DB") 258 258 sqlCmd = "SELECT update_document('" + str(self._record.db_id) + "', '" + \ 259 self._record. filename + "', '" + \259 self._record.shortFilename + "', '" + \ 260 260 self._record.discovery_id + "', '" + self._record.docType + "', '" + \ 261 self._record.originalFormat + "', '" + self._record.getAuthorsInfo + "', '" + \262 self._record.getParametersInfo + "', '" + self._record.getScopeInfo+ "', '" + str(self._record.scn) + "');"261 self._record.originalFormat + "', '" + self._record.getAuthorsInfo() + "', '" + \ 262 self._record.getParametersInfo() + "', '" + self._record.getScopeInfo() + "', '" + str(self._record.scn) + "');" 263 263 db_funcs.runSQLCommand(self._connection, sqlCmd) 264 264 … … 272 272 Insert the metadata docs into the postgres DB 273 273 ''' 274 logging.info("Inserting transformed documents for original document, %s, in Postgres DB", self._record. filename)274 logging.info("Inserting transformed documents for original document, %s, in Postgres DB", self._record.shortFilename) 275 275 if self._record.db_id is None: 276 276 logging.info("No DB ID for the original record exists; cannot add associated transformed docs") … … 293 293 Update the metadata docs into the postgres DB 294 294 ''' 295 logging.info("Updating transformed documents for original document, %s, in Postgres DB", self._record. 
filename)295 logging.info("Updating transformed documents for original document, %s, in Postgres DB", self._record.shortFilename) 296 296 if self._record.db_id is None: 297 297 logging.info("No DB ID for the original record exists; cannot update associated transformed docs") -
TI01-discovery/branches/ingestAutomation-upgrade/OAIBatch/PostgresRecord.py
r3967 r3972 65 65 tmp = filename.split('/') 66 66 self._dir = '/'.join(tmp[0:len(tmp)-1]) 67 self. _shortFilename = tmp[len(tmp)-1]67 self.shortFilename = tmp[len(tmp)-1] 68 68 69 69 # dir to store a temp copy of the moles file, when produced - for use by other transforms … … 103 103 ''' 104 104 return re.sub(r'\'', '\\\'', inputString) 105 106 107 def unescapeSpecialCharacters(self, inputString): 108 ''' 109 Adjust the input string to remove escaped characters that would interfere with string or DB 110 operations 111 @param inputString: string to correct 112 @return: corrected string 113 ''' 114 str = re.sub(r'%20', ' ', inputString) 115 return 105 116 106 117 … … 129 140 self.doMolesTransform() 130 141 131 self._fileUtils.createFile(self._molesDir + self. _shortFilename, self._molesFormat)142 self._fileUtils.createFile(self._molesDir + self.shortFilename, self._molesFormat) 132 143 logging.info("Moles file created - at %s" %self._molesDir) 133 144 134 145 # now load this moles file, for use when parsing out spatiotemporal, author and parameters data later on 135 molesFile = self._molesDir + self. _shortFilename146 molesFile = self._molesDir + self.shortFilename 136 147 logging.info('Retrieving spatiotemporal info from moles file, %s' %molesFile) 137 148 … … 381 392 authors = "" 382 393 try: 383 creators = self.dgMeta.dgMetadataRecord.dgDataEntity.dgDataRoles.dgDataCreator 394 # TODO: check this is the correct path for author data - NB, this is not obvious from example files 395 # nb, if this is correct, need to escape out the %20 and %3 characters else it doesn't work - see unescape.. 
fn 396 creators = self.dgMeta.dgMetadataRecord.dgDataEntity.dgDataRoles.dgDataCreator.dgRoleHolder.dgMetadataID.localIdentifier 384 397 logging.info("Found creator information - adding this to authors record") 385 398 … … 397 410 logging.info(detail) 398 411 logging.info("- this suggests document does not contain cited author information.") 399 412 400 413 self.authors = authors + " " + creators 401 414 return self.authors … … 413 426 params = "" 414 427 try: 415 parameters = self.dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgParameterSummary 416 logging.info("Found parameter information - adding this to record") 428 # TODO: check this is the correct path for parameters data - NB, this is not obvious from example files 429 parameters = self.dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgParameterSummary.dgStdParameterMeasured 430 parameters_list = self.listify(parameters) 431 for parameter in parameters_list: 432 if parameters.dgValidTerm: 433 logging.info("Found parameter information - adding this to record") 434 params += " " + parameters.dgValidTerm 435 417 436 418 437 except Exception, detail: … … 450 469 logging.info(detail) 451 470 logging.info("- this suggests document does not contain scope information.") 452 453 self.scope = scope 471 472 # NB, to_tsvector will remove any underscores -leading to, e.g. NERC_DDC becoming tokenised as 'NERC' and 'DDC' 473 # - to avoid this, use the following delimiter 474 self.scope = re.sub(r'_', 'UNDERSCORE', scope) 454 475 return self.scope 455 476 -
TI01-discovery/branches/ingestAutomation-upgrade/OAIBatch/README.txt
r3913 r3972 138 138 the logger. Ideally the logger would be used throughout code, but this is probably unlikely to happen. 139 139 140 8. The 'author' and 'parameter' elements from the moles files are extracted by the PostgresRecord class. The original code used 141 a simple xpath search operator (&=) to find the data below a particular parent element. Where possible, the more exact location 142 has been identified and used in the new code, however, not all the required info has been available to test - in particular, the 143 following need to be checked and adjusted if pointing to the incorrect field: 144 145 creators = self.dgMeta.dgMetadataRecord.dgDataEntity.dgDataRoles.dgDataCreator.dgRoleHolder.dgMetadataID.localIdentifier 146 authors = self.dgMeta.dgMetadataRecord.dgMetadataDescription.abstract.abstractOnlineReference.dgCitation.authors 147 parameters = self.dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgParameterSummary.dgStdParameterMeasured.dgValidTerm 148 149 150 151 -
TI01-discovery/branches/ingestAutomation-upgrade/OAIBatch/oai_document_ingester.py
r3912 r3972 327 327 328 328 this_backupdir = backupdir_base + "_originals/" 329 fileUtils.makeBackUp(originals_dir, this_backupdir)329 #fileUtils.makeBackUp(originals_dir, this_backupdir) 330 330 331 331 #Clear out the original harvest records area and discovery dir 332 332 fileUtils.cleanDir(originals_dir) 333 fileUtils.cleanDir(discovery_dir)333 #fileUtils.cleanDir(discovery_dir) 334 334 335 335 logging.info("oai_document_ingest processing complete:") 336 336 if self._no_problem_files == 0: 337 337 logging.info("All files successfully processed - cleaning harvest directory") 338 fileUtils.cleanDir(self._harvest_home)338 #fileUtils.cleanDir(self._harvest_home) 339 339 else: 340 340 logging.error("Problems experienced with %s files" %self._no_problem_files)
Note: See TracChangeset for help on using the changeset viewer.