Changeset 3972


Ignore:
Timestamp:
03/06/08 13:47:43 (11 years ago)
Author:
cbyrom
Message:

Use the short filename in the postgres DB for storing the original
document filename.
Add fix to allow proper handling of scope fields as a ts_vector.
Add TODO comments to highlight areas of concern + update docs.

Location:
TI01-discovery/branches/ingestAutomation-upgrade/OAIBatch
Files:
4 edited

Legend:

Unmodified
Added
Removed
  • TI01-discovery/branches/ingestAutomation-upgrade/OAIBatch/PostgresDAO.py

    r3967 r3972  
    227227        ''' 
    228228        logging.info("Inserting new original document in Postgres DB") 
    229         sqlCmd = "SELECT create_document('" + self._record.filename + "', '" + \ 
     229        sqlCmd = "SELECT create_document('" + self._record.shortFilename + "', '" + \ 
    230230            self._record.discovery_id + "', '" + self._record.docType + "', '" + \ 
    231231            self._record.originalFormat + "', '" + self._record.getAuthorsInfo() + "', '" + \ 
     
    257257        logging.info("Updating original document in Postgres DB") 
    258258        sqlCmd = "SELECT update_document('" + str(self._record.db_id) + "', '" + \ 
    259             self._record.filename + "', '" + \ 
     259            self._record.shortFilename + "', '" + \ 
    260260            self._record.discovery_id + "', '" + self._record.docType + "', '" + \ 
    261             self._record.originalFormat + "', '" + self._record.getAuthorsInfo + "', '" + \ 
    262             self._record.getParametersInfo + "', '" + self._record.getScopeInfo + "', '" + str(self._record.scn) + "');" 
     261            self._record.originalFormat + "', '" + self._record.getAuthorsInfo() + "', '" + \ 
     262            self._record.getParametersInfo() + "', '" + self._record.getScopeInfo() + "', '" + str(self._record.scn) + "');" 
    263263        db_funcs.runSQLCommand(self._connection, sqlCmd) 
    264264         
     
    272272        Insert the metadata docs into the postgres DB 
    273273        ''' 
    274         logging.info("Inserting transformed documents for original document, %s, in Postgres DB", self._record.filename) 
     274        logging.info("Inserting transformed documents for original document, %s, in Postgres DB", self._record.shortFilename) 
    275275        if self._record.db_id is None: 
    276276            logging.info("No DB ID for the original record exists; cannot add associated transformed docs") 
     
    293293        Update the metadata docs into the postgres DB 
    294294        ''' 
    295         logging.info("Updating transformed documents for original document, %s, in Postgres DB", self._record.filename) 
     295        logging.info("Updating transformed documents for original document, %s, in Postgres DB", self._record.shortFilename) 
    296296        if self._record.db_id is None: 
    297297            logging.info("No DB ID for the original record exists; cannot update associated transformed docs") 
  • TI01-discovery/branches/ingestAutomation-upgrade/OAIBatch/PostgresRecord.py

    r3967 r3972  
    6565        tmp = filename.split('/') 
    6666        self._dir = '/'.join(tmp[0:len(tmp)-1]) 
    67         self._shortFilename = tmp[len(tmp)-1] 
     67        self.shortFilename = tmp[len(tmp)-1] 
    6868         
    6969        # dir to store a temp copy of the moles file, when produced - for use by other transforms 
     
    103103        ''' 
    104104        return re.sub(r'\'', '\\\'', inputString) 
     105 
     106 
     107    def unescapeSpecialCharacters(self, inputString): 
     108        ''' 
     109        Adjust the input string to remove escaped characters that would interfere with string or DB 
     110        operations 
     111        @param inputString: string to correct 
     112        @return: corrected string  
     113        ''' 
     114        str = re.sub(r'%20', ' ', inputString) 
     115        return  
    105116     
    106117     
     
    129140            self.doMolesTransform() 
    130141             
    131         self._fileUtils.createFile(self._molesDir + self._shortFilename, self._molesFormat) 
     142        self._fileUtils.createFile(self._molesDir + self.shortFilename, self._molesFormat) 
    132143        logging.info("Moles file created - at %s" %self._molesDir) 
    133144         
    134145        # now load this moles file, for use when parsing out spatiotemporal, author and parameters data later on         
    135         molesFile = self._molesDir + self._shortFilename 
     146        molesFile = self._molesDir + self.shortFilename 
    136147        logging.info('Retrieving spatiotemporal info from moles file, %s' %molesFile) 
    137148         
     
    381392        authors = "" 
    382393        try: 
    383             creators = self.dgMeta.dgMetadataRecord.dgDataEntity.dgDataRoles.dgDataCreator 
     394            # TODO: check this is the correct path for author data - NB, this is not obvious from example files 
     395            # nb, if this is correct, need to escape out the %20 and %3 characters else it doesn't work - see unescape.. fn 
     396            creators = self.dgMeta.dgMetadataRecord.dgDataEntity.dgDataRoles.dgDataCreator.dgRoleHolder.dgMetadataID.localIdentifier 
    384397            logging.info("Found creator information - adding this to authors record") 
    385398             
     
    397410            logging.info(detail) 
    398411            logging.info("- this suggests document does not contain cited author information.") 
    399              
     412         
    400413        self.authors = authors + " " + creators 
    401414        return self.authors 
     
    413426        params = "" 
    414427        try: 
    415             parameters = self.dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgParameterSummary 
    416             logging.info("Found parameter information - adding this to record") 
     428            # TODO: check this is the correct path for parameters data - NB, this is not obvious from example files 
     429            parameters = self.dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgParameterSummary.dgStdParameterMeasured 
     430            parameters_list = self.listify(parameters) 
     431            for parameter in parameters_list: 
     432                if parameters.dgValidTerm: 
     433                    logging.info("Found parameter information - adding this to record") 
     434                    params += " " + parameters.dgValidTerm 
     435             
    417436             
    418437        except Exception, detail: 
     
    450469            logging.info(detail) 
    451470            logging.info("- this suggests document does not contain scope information.") 
    452          
    453         self.scope = scope 
     471 
     472        # NB, to_tsvector will remove any underscores -leading to, e.g. NERC_DDC becoming tokenised as 'NERC' and 'DDC' 
     473        # - to avoid this, use the following delimiter 
     474        self.scope = re.sub(r'_', 'UNDERSCORE', scope) 
    454475        return self.scope 
    455476             
  • TI01-discovery/branches/ingestAutomation-upgrade/OAIBatch/README.txt

    r3913 r3972  
    138138the logger.  Ideally the logger would be used throughout code, but this is probably unlikely to happen. 
    139139 
     140 8. The 'author' and 'parameter' elements from the moles files are extracted by the PostgresRecord class.  The original code used 
     141 a simple xpath search operator (&=) to find the data below a particular parent element.  Where possible, the more exact location 
     142 has been identified and used in the new code; however, not all the required info has been available to test - in particular, the 
     143 following need to be checked and adjusted if they point to the incorrect field: 
     144 
     145  creators = self.dgMeta.dgMetadataRecord.dgDataEntity.dgDataRoles.dgDataCreator.dgRoleHolder.dgMetadataID.localIdentifier 
     146  authors = self.dgMeta.dgMetadataRecord.dgMetadataDescription.abstract.abstractOnlineReference.dgCitation.authors 
     147  parameters = self.dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgParameterSummary.dgStdParameterMeasured.dgValidTerm 
     148   
     149 
     150 
     151 
  • TI01-discovery/branches/ingestAutomation-upgrade/OAIBatch/oai_document_ingester.py

    r3912 r3972  
    327327                 
    328328                this_backupdir = backupdir_base + "_originals/" 
    329                 fileUtils.makeBackUp(originals_dir, this_backupdir) 
     329                #fileUtils.makeBackUp(originals_dir, this_backupdir) 
    330330                 
    331331                #Clear out the original harvest records area and discovery dir 
    332332                fileUtils.cleanDir(originals_dir) 
    333                 fileUtils.cleanDir(discovery_dir) 
     333                #fileUtils.cleanDir(discovery_dir) 
    334334                 
    335335                logging.info("oai_document_ingest processing complete:") 
    336336                if self._no_problem_files == 0: 
    337337                        logging.info("All files successfully processed - cleaning harvest directory") 
    338                         fileUtils.cleanDir(self._harvest_home) 
     338                        #fileUtils.cleanDir(self._harvest_home) 
    339339                else: 
    340340                        logging.error("Problems experienced with %s files" %self._no_problem_files) 
Note: See TracChangeset for help on using the changeset viewer.