Changeset 3814 for TI01-discovery


Ignore:
Timestamp:
23/04/08 09:52:51 (11 years ago)
Author:
cbyrom
Message:

Fill out functionality for DAO - including check for uniqueness of ingested records
+ improved logging + fixes to the SQL to match current data model.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • TI01-discovery/branches/ingestAutomation-upgrade/OAIBatch/PostgresDAO.py

    r3808 r3814  
    44C Byrom Apr 08 
    55''' 
     6import sys, os, logging 
    67import molesReadWrite as MRW 
    7 import sys 
    8 import os 
    98import db_funcs 
    109 
     
    2423        self._connection = db_connect() 
    2524        self._record = record 
     25        self.id = None 
    2626         
    2727             
     
    3232        @return: id of record, if it exists, '-1' if it doesn't 
    3333        ''' 
    34         sql = "select id from spatiotemp where id = '" + self._record.id + "';" 
    35         id = db_funcs.runSQLCommand(self._connection, sql) 
    36         print "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXKKKK", id 
    37         self.id = id 
    38         return id 
     34        logging.info("Looking up record, " + self._record.discovery_id + " in DB") 
     35        if self.id is not None & self.id > 0: 
     36            logging.info("Already looked up record - ID is " + self.id) 
     37            return self.id 
     38         
     39        sql = "SELECT original_document_id FROM ORIGINAL_DOCUMENT where discovery_id = '" + self._record.discovery_id + "';" 
     40        self.id = db_funcs.runSQLCommand(self._connection, sql) 
     41        print "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXKKKK", self.id 
     42         
    3943 
    4044    def createOrUpdateRecord(self): 
     
    4650        else: 
    4751            updateRecord() 
     52 
    4853             
    4954    def createRecord(self): 
     
    5156        Create a record in the postgres DB based on the specified PostgresRecord object 
    5257        ''' 
     58        logging.info("Creating a new record in the DB for the metada document") 
    5359        # firstly store the actual documents contained by the record - i.e. the original doc + 
    5460        # the various transforms required 
     61        _insertOriginalRecord() 
    5562        _insertMetadataRecords() 
    5663         
    5764        # Now add the spatiotemporal data 
    5865        _insertSpatioTemporalData() 
     66        logging.info("New record created") 
    5967             
    6068 
     
    6371        Update a record in the postgres DB based on the specified PostgresRecord object 
    6472        ''' 
    65         # firstly update the actual documents contained by the record - i.e. the original doc + 
    66         # the various transforms required 
    67         _updateMetadataRecords(record) 
    68          
    69         # Now update the spatiotemporal data 
    70         _updateSpatioTemporalData(record) 
    71          
    72          
     73        logging.info("Record already existing in DB - performing updates") 
     74        # firstly, check the document is actually new - i.e. not just a repeat of the same data 
     75        if _checkIsUpdatedRecord(): 
     76         
     77            # firstly update the actual documents contained by the record - i.e. the original doc + 
     78            # the various transforms required 
     79            _updateMetadataRecords(record) 
     80             
     81            # Now update the spatiotemporal data 
     82            _updateSpatioTemporalData(record) 
     83         
     84         
     85    def _checkIsUpdatedRecord(self): 
     86        ''' 
     87        Looks up an existing record and checks it is not identical to the new record; if it is 
     88        incremement the harvest_count value and don't process again 
     89        @return: True if doc contains changes, False otherwise 
     90        ''' 
     91        logging.info("Checking the updated document actually contains changes") 
     92        sql = "SELECT harvest_count FROM ORIGINAL_DOCUMENT where original_document_id = " + self.id + \ 
     93            " AND original_document = " + self._record.originalFormat + "';" 
     94        count = db_funcs.runSQLCommand(self._connection, sql) 
     95         
     96        # NB, if the document is not identical, the sql command will not find anything 
     97        if count > 0: 
     98            logging.info("Ingested document is identical to document currently in DB - incrementing harvest_count") 
     99            count += 1 
     100            sql = "UPDATE ORIGINAL_DOCUMENT SET harvest_count = " + count + " WHERE original_document_id = " + self.id 
     101            count = db_funcs.runSQLCommand(self._connection, sql) 
     102            return False 
     103 
     104        logging.info("Ingested document is different to that in the current DB") 
     105        return True 
     106 
    73107     
    74108    def _insertSpatioTemporalData(): 
     
    77111        specified in the PostgresRecord object 
    78112        ''' 
     113        logging.info("Adding spatiotemporal data to DB record") 
    79114        # NB, build the query up in two parts so that we only include the correct data 
    80115        sqlStart = "" 
     
    104139            sqlEnd = ", " + sqlEnd 
    105140        sqlCmd = "INSERT INTO spatiotemp (id, original_doc_id" + sqlStart + ") " + sqlEnd + ");" 
    106         print sqlCmd 
    107141        db_funcs.runSQLCommand(self._connection, sqlCmd) 
     142        logging.info("Spatiotemporal data added to DB record") 
    108143             
    109144     
     
    114149        ''' 
    115150        # NB, build the query up in two parts so that we only include the correct data 
     151        logging.info("Updating spatiotemporal data to DB record") 
    116152        sqlStart = "" 
    117153        sqlEnd = " WHERE original_doc_id='" + self._record.id + "';"  
     
    135171        sqlCmd = "UPDATE spatiotemp SET (" + sqlStart + "update_time= now()) " + sqlEnd + ");" 
    136172 
    137         print sqlCmd 
    138173        db_funcs.runSQLCommand(self._connection, sqlCmd) 
     174        logging.info("Spatiotemporal data updated for DB record") 
    139175     
    140176             
     
    143179        Insert the original metadata doc into the postgres DB 
    144180        ''' 
    145         print "INFO: Inserting new original document in Postgres DB" 
     181        logging.info("Inserting new original document in Postgres DB") 
    146182        sqlCmd = "INSERT INTO ORIGINAL_DOCUMENT (original_document_id, original_document_name, original_format, " \ 
    147183            "original_document, ts_vector, create_date, harvest_count, scn) VALUES (" \ 
     
    152188        self.id = id 
    153189        self._record.db_id = id 
    154          
    155      
     190        logging.info("Original document inserted in Postgres DB") 
     191         
    156192     
    157193    def _updateOriginalRecord(): 
     
    159195        Update the original doc in the postgres DB 
    160196        ''' 
    161         print "INFO: Updating original document in Postgres DB" 
     197        logging.info("Updating original document in Postgres DB") 
    162198        sqlCmd = "UPDATE ORIGINAL_DOCUMENT SET (original_document_name = '" + self._record.filename + "', " \ 
    163199            "original_format = '" + self._record.originalFormat + "', " \ 
     
    168204     
    169205        db_funcs.runSQLCommand(self._connection, sqlCmd) 
     206        logging.info("Original document updated in Postgres DB") 
    170207     
    171208             
     
    174211        Insert the metadata docs into the postgres DB 
    175212        ''' 
    176         print "INFO: Inserting transformed documents for original document, %s, in Postgres DB", self._record.filename 
     213        logging.info("Inserting transformed documents for original document, %s, in Postgres DB", self._record.filename) 
    177214        if self._record.db_id is None: 
    178215            print "No DB ID for the original record exists; cannot add associated transformed docs" 
     
    185222                "DEFAULT, '" + self._record.db_id + "', '" + \ 
    186223                docType + "', '" + doc + "'), current_date, 1);" 
    187          
     224             
     225            db_funcs.runSQLCommand(self._connection, sqlCmd) 
     226         
     227        logging.info("Transformed records created in DB") 
    188228     
    189229     
     
    192232        Update the metadata docs into the postgres DB 
    193233        ''' 
    194         print "INFO: Inserting transformed documents for original document, %s, in Postgres DB", self._record.filename 
     234        logging.info("Updating transformed documents for original document, %s, in Postgres DB", self._record.filename) 
    195235        if self._record.db_id is None: 
    196236            print "No DB ID for the original record exists; cannot update associated transformed docs" 
     
    201241                "', update_date = current_date) WHERE original_record_id = " + \ 
    202242                self._record.db_id + " AND transformed_format = '" + docType + "';" 
    203      
     243 
     244            db_funcs.runSQLCommand(self._connection, sqlCmd) 
     245     
     246        logging.info("Transformed records updated in DB") 
     247 
    204248 
    205249    def setRecord(self, record): 
     
    209253        ''' 
    210254        self._record = record 
    211         self.getSpatioTemporalData() 
    212         self.doRecordTransforms() 
Note: See TracChangeset for help on using the changeset viewer.