Changeset 3967


Ignore:
Timestamp:
02/06/08 10:43:00 (11 years ago)
Author:
cbyrom
Message:

Update code to allow the extraction of authors, parameters and scope
from moles files + adjust the data model to handle these new data.

Location:
TI01-discovery/branches/ingestAutomation-upgrade
Files:
6 edited

Legend:

Unmodified
Added
Removed
  • TI01-discovery/branches/ingestAutomation-upgrade/OAIBatch/PostgresDAO.py

    r3869 r3967  
    229229        sqlCmd = "SELECT create_document('" + self._record.filename + "', '" + \ 
    230230            self._record.discovery_id + "', '" + self._record.docType + "', '" + \ 
    231             self._record.originalFormat + "');"  
     231            self._record.originalFormat + "', '" + self._record.getAuthorsInfo() + "', '" + \ 
     232            self._record.getParametersInfo() + "', '" + self._record.getScopeInfo() + "');"  
    232233 
    233234        id = db_funcs.runSQLCommand(self._connection, sqlCmd) 
     
    258259            self._record.filename + "', '" + \ 
    259260            self._record.discovery_id + "', '" + self._record.docType + "', '" + \ 
    260             self._record.originalFormat + "', '" + str(self._record.scn) + "');" 
     261            self._record.originalFormat + "', '" + self._record.getAuthorsInfo + "', '" + \ 
     262            self._record.getParametersInfo + "', '" + self._record.getScopeInfo + "', '" + str(self._record.scn) + "');" 
    261263        db_funcs.runSQLCommand(self._connection, sqlCmd) 
    262264         
  • TI01-discovery/branches/ingestAutomation-upgrade/OAIBatch/PostgresRecord.py

    r3912 r3967  
    11#!/usr/bin/env python 
    22''' 
    3 Class representing the contents of a row in the metadata_record postgres DB table 
     3Class representing a document to be ingested into the postgres DB table 
    44C Byrom Apr 08 
    55''' 
     
    2323class PostgresRecord: 
    2424    '''  
    25     Class representing the contents of a row in the metadata_record postgres DB table 
     25    Class representing a document to be ingested into the postgres DB table 
    2626    @param filename: Name of file to use as a metadata record 
    27     @param  
     27    @param ndg_dataprovider 
     28    @param datacentre_groups 
     29    @param datacentre_namespace 
     30    @param discovery_id 
     31    @param xq 
     32    @param doctype - type of doc to process 
    2833    ''' 
    2934    # TODO MDIP transforms do not work very well for lots of files - so currently hiding these 
    3035    documentTypes = ['MOLES', 'DIF', 'DC', 'ISO19139']#, 'MDIP'] 
     36     
     37    # vocab server - used for finding scope values in the moles files 
     38    ndg_data_provider_vocab = "http://vocab.ndg.nerc.ac.uk/term/N010" 
    3139         
    3240    def __init__(self, filename, ndg_dataprovider, datacentre_groups, datacentre_namespace, discovery_id, xq, docType): 
     
    6169        # dir to store a temp copy of the moles file, when produced - for use by other transforms 
    6270        self._molesDir = None 
     71        # object to hold the moles file - this will be loaded in when it is created - in order to extract 
     72        # spatiotemporal data, etc 
     73        self.dgMeta = None 
    6374 
    6475        # firstly load contents of file 
     
    7889        # spatiotemporal data object 
    7990        self.stData = None 
     91         
     92        # fields to hold author, parameter and scope data 
     93        self.authors = None 
     94        self.parameters = None 
     95        self.scope = None 
    8096 
    8197    def escapeSpecialCharacters(self, inputString): 
     
    115131        self._fileUtils.createFile(self._molesDir + self._shortFilename, self._molesFormat) 
    116132        logging.info("Moles file created - at %s" %self._molesDir) 
     133         
     134        # now load this moles file, for use when parsing out spatiotemporal, author and parameters data later on         
     135        molesFile = self._molesDir + self._shortFilename 
     136        logging.info('Retrieving spatiotemporal info from moles file, %s' %molesFile) 
     137         
     138        # load in the moles file and put this into an object for direct access to the xml elements 
     139        self.dgMeta=MRW.dgMetadata() 
     140        try: 
     141            self.dgMeta.fromXML(cElementTree.ElementTree(file=molesFile).getroot()) 
     142        except Exception, detail: 
     143            raise SystemError, 'Cannot parse the XML moles document %s. Detail:\n%s' %(molesFile, detail) 
     144 
    117145             
    118146 
     
    313341        Extract spatio temporal data from the original document 
    314342        ''' 
     343        logging.info('Retrieving spatiotemporal info from moles file') 
    315344        # initialise the various spatiotemporal arrays used to extract data to 
    316345        self.stData = SpatioTemporalData() 
    317346         
    318         molesFile = self._molesDir + self._shortFilename 
    319         logging.info('Retrieving spatiotemporal info from moles file, %s' %molesFile) 
    320          
    321         # load in the moles file and put this into an object for direct access to the xml elements 
    322         dgMeta=MRW.dgMetadata() 
    323         try: 
    324             dgMeta.fromXML(cElementTree.ElementTree(file=molesFile).getroot()) 
    325         except Exception, detail: 
    326             raise SystemError, 'Cannot parse the XML moles document %s. Detail:\n%s' %(molesFile, detail) 
    327  
     347        if self.dgMeta is None: 
     348            self.createMolesFile() 
     349             
    328350        # do quick checks to see if the relevant data exists 
    329         if not dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary: 
     351        if not self.dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary: 
    330352            logging.info("No data summary elements found - assuming no spatiotemporal data available") 
    331353            return 
    332354         
    333         if not dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage: 
     355        if not self.dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage: 
    334356            logging.info("No data coverage elements found - assuming no spatiotemporal data available") 
    335357            return 
    336358         
    337         if not dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage.dgSpatialCoverage: 
     359        if not self.dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage.dgSpatialCoverage: 
    338360            logging.info("No spatial coverage elements found - assuming no spatial data available") 
    339361        else: 
    340             self.getCoordData(dgMeta) 
    341  
    342         if not dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage.dgTemporalCoverage: 
     362            self.getCoordData(self.dgMeta) 
     363 
     364        if not self.dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage.dgTemporalCoverage: 
    343365            logging.info("No temporal coverage elements found - assuming no temporal data available") 
    344366        else: 
    345             self.getTimeRangeData(dgMeta) 
    346  
    347  
     367            self.getTimeRangeData(self.dgMeta) 
     368 
     369     
     370    def getAuthorsInfo(self): 
     371        ''' 
     372        Extract authors info from the moles file 
     373        ''' 
     374        logging.info('Retrieving authors info from moles file') 
     375         
     376        if self.dgMeta is None: 
     377            self.createMolesFile() 
     378             
     379        logging.info("Extracting author info") 
     380        creators = "" 
     381        authors = "" 
     382        try: 
     383            creators = self.dgMeta.dgMetadataRecord.dgDataEntity.dgDataRoles.dgDataCreator 
     384            logging.info("Found creator information - adding this to authors record") 
     385             
     386        except Exception, detail: 
     387            logging.info("Exception thrown whilst trying to find creator information:") 
     388            logging.info(detail) 
     389            logging.info("- this suggests document does not contain creator information.") 
     390 
     391        try: 
     392            authors = self.dgMeta.dgMetadataRecord.dgMetadataDescription.abstract.abstractOnlineReference.dgCitation.authors 
     393            logging.info("Found cited author information - adding this to authors record") 
     394             
     395        except Exception, detail: 
     396            logging.info("Exception thrown whilst trying to find cited author information:") 
     397            logging.info(detail) 
     398            logging.info("- this suggests document does not contain cited author information.") 
     399             
     400        self.authors = authors + " " + creators 
     401        return self.authors 
     402     
     403     
     404    def getParametersInfo(self): 
     405        ''' 
     406        Extract parameters info from the moles file 
     407        ''' 
     408        logging.info('Retrieving parameters info from moles file') 
     409         
     410        if self.dgMeta is None: 
     411            self.createMolesFile() 
     412             
     413        params = "" 
     414        try: 
     415            parameters = self.dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgParameterSummary 
     416            logging.info("Found parameter information - adding this to record") 
     417             
     418        except Exception, detail: 
     419            logging.info("Exception thrown whilst trying to find parameter information:") 
     420            logging.info(detail) 
     421            logging.info("- this suggests document does not contain parameter information.") 
     422         
     423        self.parameters = params 
     424        return self.parameters 
     425     
     426     
     427    def getScopeInfo(self): 
     428        ''' 
     429        Extract scope info from the moles file 
     430        ''' 
     431        logging.info('Retrieving scope info from moles file') 
     432         
     433        if self.dgMeta is None: 
     434            self.createMolesFile() 
     435             
     436        scope = "" 
     437        try: 
     438            keywords = self.dgMeta.dgMetadataRecord.dgStructuredKeyword 
     439            logging.info("Found keyword information - parsing this for scope") 
     440 
     441            keywords_list = self.listify(keywords) 
     442            for keyword in keywords_list: 
     443                if keyword.dgValidTermID: 
     444                    if keyword.dgValidTermID.ParentListID.strip().startswith(self.ndg_data_provider_vocab): 
     445                        logging.info("Found scope value - adding this to record") 
     446                        scope += " " + keyword.dgValidTerm.strip() 
     447             
     448        except Exception, detail: 
     449            logging.info("Exception thrown whilst trying to find scope information:") 
     450            logging.info(detail) 
     451            logging.info("- this suggests document does not contain scope information.") 
     452         
     453        self.scope = scope 
     454        return self.scope 
     455             
     456             
    348457    def getTimeRangeData(self, dgMeta): 
    349458        ''' 
  • TI01-discovery/branches/ingestAutomation-upgrade/database/ingest_procedures.sql

    r3863 r3967  
    2121 
    2222DROP FUNCTION create_document(filename_in varchar(255), discovery_id_in varchar(255),  
    23         doc_type_in text, original_document_in text) CASCADE; 
     23        doc_type_in text, original_document_in text, authors text, parameters text, scope text) CASCADE; 
    2424CREATE FUNCTION create_document(filename_in varchar(255), discovery_id_in varchar(255),  
    25         doc_type_in text, original_document_in text) RETURNS integer AS  
     25        doc_type_in text, original_document_in text, authors text, parameters text, scope text) RETURNS integer AS  
    2626$$ 
    2727        DECLARE 
     
    3030        -- This inserts a new document into the DB 
    3131        INSERT INTO ORIGINAL_DOCUMENT (original_document_id, original_document_filename,  
    32             discovery_id, original_format, original_document, ts_vector, create_date,  
    33                         harvest_count, scn) VALUES ( 
     32            discovery_id, original_format, original_document, document_ts_vector, authors_ts_vector,  
     33            parameters_ts_vector, scope_ts_vector, create_date, harvest_count, scn) VALUES ( 
    3434            DEFAULT, filename_in, discovery_id_in, doc_type_in, original_document_in,  
    35                         to_tsvector('english', original_document_in), current_timestamp, 1, 1); 
     35                        to_tsvector('english', original_document_in), to_tsvector('english', authors),  
     36                        to_tsvector('english', parameters), to_tsvector('english', scope), current_timestamp, 1, 1); 
    3637 
    3738                SELECT original_document_id INTO db_id FROM ORIGINAL_DOCUMENT WHERE discovery_id = discovery_id_in; 
     
    5657 
    5758DROP FUNCTION update_document(original_document_id_in int, filename_in varchar(255),  
    58         discovery_id_in varchar(255), doc_type_in text, original_document_in text, scn_in int) CASCADE; 
     59        discovery_id_in varchar(255), doc_type_in text, original_document_in text,  
     60        authors text, parameters text, scope text, scn_in int) CASCADE; 
    5961CREATE FUNCTION update_document(original_document_id_in int, filename_in varchar(255),  
    60         discovery_id_in varchar(255), doc_type_in text, original_document_in text, scn_in int)  
     62        discovery_id_in varchar(255), doc_type_in text, original_document_in text,  
     63        authors text, parameters text, scope text, scn_in int)  
    6164        RETURNS VOID AS  
    6265$$ 
     
    7982                        original_format = doc_type_in, 
    8083                        original_document = original_document_in, 
    81             ts_vector = to_tsvector('english', original_document_in), 
     84            document_ts_vector = to_tsvector('english', original_document_in), 
     85            authors_ts_vector = to_tsvector('english', authors), 
     86            parameters_ts_vector = to_tsvector('english', parameters), 
     87            scope_ts_vector = to_tsvector('english', scope), 
    8288            update_date = current_timestamp,  
    8389                        harvest_count = 1, 
  • TI01-discovery/branches/ingestAutomation-upgrade/database/original_document.sql

    r3849 r3967  
    1212                original_format text, 
    1313                original_document text, 
    14                 ts_vector       tsvector, 
     14                document_ts_vector      tsvector, 
     15                authors_ts_vector       tsvector, 
     16                parameters_ts_vector    tsvector, 
     17                scope_ts_vector tsvector, 
    1518                create_date     timestamp, 
    1619                update_date timestamp, 
     
    2932 
    3033-- Create index on searchable column to speed up searches 
    31 CREATE INDEX textsearch_idx ON original_document USING gin(ts_vector); 
     34CREATE INDEX textsearch_idx ON original_document USING gin(document_ts_vector); 
  • TI01-discovery/branches/ingestAutomation-upgrade/database/spatial_data.sql

    r3849 r3967  
    1616-- add 2D geometry column, 'geometry', to table - with SRS val of 4326 
    1717select addgeometrycolumn('spatial_data','geometry',4326,'GEOMETRY',2); 
     18 
     19-- Create index on searchable column to speed up searches 
     20CREATE INDEX spatialsearch_idx ON spatial_data USING GIST(geometry); 
  • TI01-discovery/branches/ingestAutomation-upgrade/database/test_data.sql

    r3863 r3967  
    55 *  
    66 */ 
    7 INSERT INTO original_document VALUES (DEFAULT, 'tst.xml', 'test_dummy', 'DIF', null, null, current_timestamp, null, 1, 1); 
     7INSERT INTO original_document VALUES (DEFAULT, 'tst.xml', 'test_dummy', 'DIF', null, null, null, null, to_tsvector('NERC'), current_timestamp, null, 1, 1); 
    88INSERT INTO original_document VALUES (DEFAULT, 'tst1.xml', 'test_record', 'DIF', '<?xml version="1.0" encoding="utf-8"?> 
    99<kml xmlns="http://earth.google.com/kml/2.2"> 
     
    2828  </Folder> 
    2929</kml>', 
    30 null, current_timestamp, null, 1, 1); 
     30null, null, null, to_tsvector('MDIP'), current_timestamp, null, 1, 1); 
    3131 
    32 INSERT INTO original_document VALUES (DEFAULT, 'tst2.xml', 'test_dummy1', 'DIF', 'freds freds bloo' , null, current_timestamp, null, 1, 1); 
    33 INSERT INTO original_document VALUES (DEFAULT, 'tst3.xml', 'test_dummy2', null, 'fred fred bloo fred', null, current_timestamp, null, 1, 1); 
    34 INSERT INTO original_document VALUES (DEFAULT, 'tst4.xml', 'test_dummy3', null, 'fred fred bloo, fred, fred, fred, fre, fred', null, current_timestamp, null, 1, 1); 
     32INSERT INTO original_document VALUES (DEFAULT, 'tst2.xml', 'test_dummy1', 'DIF', 'freds freds bloo' , null, null, null, to_tsvector('MDIP'), current_timestamp, null, 1, 1); 
     33INSERT INTO original_document VALUES (DEFAULT, 'tst3.xml', 'test_dummy2', null, 'fred fred bloo fred', null, null, null, to_tsvector('MDIP'), current_timestamp, null, 1, 1); 
     34INSERT INTO original_document VALUES (DEFAULT, 'tst4.xml', 'test_dummy3', null, 'fred fred bloo, fred, fred, fred, fre, fred', null, null, null, null, current_timestamp, null, 1, 1); 
    3535 
    36 UPDATE original_document SET ts_vector = to_tsvector('english', original_document); 
     36UPDATE original_document SET document_ts_vector = to_tsvector('english', original_document); 
    3737 
    38 select original_document_id, ts_vector, ts_rank(ts_vector, query) as rank from original_document, to_tsquery('english', 'FRED') query WHERE query @@ coalesce(ts_vector,'') order by rank desc; 
     38select original_document_id, document_ts_vector, ts_rank(document_ts_vector, query) as rank from original_document, to_tsquery('english', 'FRED') query WHERE query @@ coalesce(document_ts_vector,'') order by rank desc; 
    3939 
    4040 
Note: See TracChangeset for help on using the changeset viewer.