Ignore:
Timestamp:
02/06/08 10:43:00 (11 years ago)
Author:
cbyrom
Message:

Update code to allow the extraction of authors, parameters and scope
from moles files + adjust the data model to handle these new data.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • TI01-discovery/branches/ingestAutomation-upgrade/OAIBatch/PostgresRecord.py

    r3912 r3967  
    11#!/usr/bin/env python 
    22''' 
    3 Class representing the contents of a row in the metadata_record postgres DB table 
     3Class representing a document to be ingested into the postgres DB table 
    44C Byrom Apr 08 
    55''' 
     
    2323class PostgresRecord: 
    2424    '''  
    25     Class representing the contents of a row in the metadata_record postgres DB table 
     25    Class representing a document to be ingested into the postgres DB table 
    2626    @param filename: Name of file to use as a metadata record 
    27     @param  
     27    @param ndg_dataprovider 
     28    @param datacentre_groups 
     29    @param datacentre_namespace 
     30    @param discovery_id 
     31    @param xq 
     32    @param doctype - type of doc to process 
    2833    ''' 
    2934    # TODO MDIP transforms do not work very well for lots of files - so currently hiding these 
    3035    documentTypes = ['MOLES', 'DIF', 'DC', 'ISO19139']#, 'MDIP'] 
     36     
     37    # vocab server - used for finding scope values in the moles files 
     38    ndg_data_provider_vocab = "http://vocab.ndg.nerc.ac.uk/term/N010" 
    3139         
    3240    def __init__(self, filename, ndg_dataprovider, datacentre_groups, datacentre_namespace, discovery_id, xq, docType): 
     
    6169        # dir to store a temp copy of the moles file, when produced - for use by other transforms 
    6270        self._molesDir = None 
     71        # object to hold the moles file - this will be loaded in when it is created - in order to extract 
     72        # spatiotemporal data, etc 
     73        self.dgMeta = None 
    6374 
    6475        # firstly load contents of file 
     
    7889        # spatiotemporal data object 
    7990        self.stData = None 
     91         
     92        # fields to hold author, parameter and scope data 
     93        self.authors = None 
     94        self.parameters = None 
     95        self.scope = None 
    8096 
    8197    def escapeSpecialCharacters(self, inputString): 
     
    115131        self._fileUtils.createFile(self._molesDir + self._shortFilename, self._molesFormat) 
    116132        logging.info("Moles file created - at %s" %self._molesDir) 
     133         
     134        # now load this moles file, for use when parsing out spatiotemporal, author and parameters data later on         
     135        molesFile = self._molesDir + self._shortFilename 
     136        logging.info('Retrieving spatiotemporal info from moles file, %s' %molesFile) 
     137         
     138        # load in the moles file and put this into an object for direct access to the xml elements 
     139        self.dgMeta=MRW.dgMetadata() 
     140        try: 
     141            self.dgMeta.fromXML(cElementTree.ElementTree(file=molesFile).getroot()) 
     142        except Exception, detail: 
     143            raise SystemError, 'Cannot parse the XML moles document %s. Detail:\n%s' %(molesFile, detail) 
     144 
    117145             
    118146 
     
    313341        Extract spatio temporal data from the original document 
    314342        ''' 
     343        logging.info('Retrieving spatiotemporal info from moles file') 
    315344        # initialise the various spatiotemporal arrays used to extract data to 
    316345        self.stData = SpatioTemporalData() 
    317346         
    318         molesFile = self._molesDir + self._shortFilename 
    319         logging.info('Retrieving spatiotemporal info from moles file, %s' %molesFile) 
    320          
    321         # load in the moles file and put this into an object for direct access to the xml elements 
    322         dgMeta=MRW.dgMetadata() 
    323         try: 
    324             dgMeta.fromXML(cElementTree.ElementTree(file=molesFile).getroot()) 
    325         except Exception, detail: 
    326             raise SystemError, 'Cannot parse the XML moles document %s. Detail:\n%s' %(molesFile, detail) 
    327  
     347        if self.dgMeta is None: 
     348            self.createMolesFile() 
     349             
    328350        # do quick checks to see if the relevant data exists 
    329         if not dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary: 
     351        if not self.dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary: 
    330352            logging.info("No data summary elements found - assuming no spatiotemporal data available") 
    331353            return 
    332354         
    333         if not dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage: 
     355        if not self.dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage: 
    334356            logging.info("No data coverage elements found - assuming no spatiotemporal data available") 
    335357            return 
    336358         
    337         if not dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage.dgSpatialCoverage: 
     359        if not self.dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage.dgSpatialCoverage: 
    338360            logging.info("No spatial coverage elements found - assuming no spatial data available") 
    339361        else: 
    340             self.getCoordData(dgMeta) 
    341  
    342         if not dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage.dgTemporalCoverage: 
     362            self.getCoordData(self.dgMeta) 
     363 
     364        if not self.dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage.dgTemporalCoverage: 
    343365            logging.info("No temporal coverage elements found - assuming no temporal data available") 
    344366        else: 
    345             self.getTimeRangeData(dgMeta) 
    346  
    347  
     367            self.getTimeRangeData(self.dgMeta) 
     368 
     369     
     370    def getAuthorsInfo(self): 
     371        ''' 
     372        Extract authors info from the moles file 
     373        ''' 
     374        logging.info('Retrieving authors info from moles file') 
     375         
     376        if self.dgMeta is None: 
     377            self.createMolesFile() 
     378             
     379        logging.info("Extracting author info") 
     380        creators = "" 
     381        authors = "" 
     382        try: 
     383            creators = self.dgMeta.dgMetadataRecord.dgDataEntity.dgDataRoles.dgDataCreator 
     384            logging.info("Found creator information - adding this to authors record") 
     385             
     386        except Exception, detail: 
     387            logging.info("Exception thrown whilst trying to find creator information:") 
     388            logging.info(detail) 
     389            logging.info("- this suggests document does not contain creator information.") 
     390 
     391        try: 
     392            authors = self.dgMeta.dgMetadataRecord.dgMetadataDescription.abstract.abstractOnlineReference.dgCitation.authors 
     393            logging.info("Found cited author information - adding this to authors record") 
     394             
     395        except Exception, detail: 
     396            logging.info("Exception thrown whilst trying to find cited author information:") 
     397            logging.info(detail) 
     398            logging.info("- this suggests document does not contain cited author information.") 
     399             
     400        self.authors = authors + " " + creators 
     401        return self.authors 
     402     
     403     
     404    def getParametersInfo(self): 
     405        ''' 
     406        Extract parameters info from the moles file 
     407        ''' 
     408        logging.info('Retrieving parameters info from moles file') 
     409         
     410        if self.dgMeta is None: 
     411            self.createMolesFile() 
     412             
     413        params = "" 
     414        try: 
     415            parameters = self.dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgParameterSummary 
     416            logging.info("Found parameter information - adding this to record") 
     417             
     418        except Exception, detail: 
     419            logging.info("Exception thrown whilst trying to find parameter information:") 
     420            logging.info(detail) 
     421            logging.info("- this suggests document does not contain parameter information.") 
     422         
     423        self.parameters = params 
     424        return self.parameters 
     425     
     426     
     427    def getScopeInfo(self): 
     428        ''' 
     429        Extract scope info from the moles file 
     430        ''' 
     431        logging.info('Retrieving scope info from moles file') 
     432         
     433        if self.dgMeta is None: 
     434            self.createMolesFile() 
     435             
     436        scope = "" 
     437        try: 
     438            keywords = self.dgMeta.dgMetadataRecord.dgStructuredKeyword 
     439            logging.info("Found keyword information - parsing this for scope") 
     440 
     441            keywords_list = self.listify(keywords) 
     442            for keyword in keywords_list: 
     443                if keyword.dgValidTermID: 
     444                    if keyword.dgValidTermID.ParentListID.strip().startswith(self.ndg_data_provider_vocab): 
     445                        logging.info("Found scope value - adding this to record") 
     446                        scope += " " + keyword.dgValidTerm.strip() 
     447             
     448        except Exception, detail: 
     449            logging.info("Exception thrown whilst trying to find scope information:") 
     450            logging.info(detail) 
     451            logging.info("- this suggests document does not contain scope information.") 
     452         
     453        self.scope = scope 
     454        return self.scope 
     455             
     456             
    348457    def getTimeRangeData(self, dgMeta): 
    349458        ''' 
Note: See TracChangeset for help on using the changeset viewer.