Changeset 3846


Timestamp: 02/05/08 11:26:44 (11 years ago)
Author: cbyrom
Message:

Adjust DAO and Record classes to throw errors rather than catching them, so that processing of multiple files (wrapped by oai_ingest) can continue more cleanly.
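
With the DAO and Record classes raising exceptions instead of calling sys.exit(), the wrapping script can trap a failure on one file and carry on with the rest. A minimal sketch of that pattern, in the Python 2 style used by this code; the helper names process_files and process_file are illustrative only and not part of this changeset:

    # Hypothetical wrapper logic: catch per-file errors raised by the DAO/Record
    # classes and continue with the remaining files rather than aborting the batch.
    import logging

    def process_files(filenames, process_file):
        '''Run process_file on each file; log failures and return the files that failed.'''
        failures = []
        for filename in filenames:
            try:
                process_file(filename)
            except Exception, detail:
                logging.error("Skipping %s: %s" % (filename, detail))
                failures.append(filename)
        return failures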

Location: TI01-discovery/branches/ingestAutomation-upgrade/OAIBatch
Files: 2 edited

Legend: context lines are shown as-is; lines removed in r3846 are prefixed with "-", lines added with "+".
  • TI01-discovery/branches/ingestAutomation-upgrade/OAIBatch/PostgresDAO.py

    r3839 r3846  
 '''
 import sys, os, logging
-import molesReadWrite as MRW
 import db_funcs
     
             sys.exit("USAGE: argument 1 = PostgresRecord object to process")
         else:
-            print "INFO: Creating/updating DB entry for record, %s" %record.discovery_id
+            logging.info("INFO: Creating/updating DB entry for record, %s" %record.discovery_id)
 
         # setup a connection to the db - if none specified
     
         self._connection = connection
         self._record = record
-        self.id = None
     
         '''
         logging.info("Looking up record, " + self._record.discovery_id + " in DB")
-        if self.id is not None and self.id > 0:
-            logging.info("Already looked up record - ID is " + self.id)
-            return self.id
-
-        sql = "SELECT original_document_id FROM ORIGINAL_DOCUMENT where discovery_id = '" + self._record.discovery_id + "';"
-        self.id = db_funcs.runSQLCommand(self._connection, sql)
-        print "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXKKKK", self.id
+        if self._record.db_id is not None and self._record.db_id > 0:
+            logging.info("Already looked up record - ID is " + str(self._record.db_id))
+            return self._record.db_id
+
+        sql = "SELECT original_document_id FROM ORIGINAL_DOCUMENT where discovery_id = '" + \
+            self._record.discovery_id + "';"
+        dbId = db_funcs.runSQLCommand(self._connection, sql)
+        if dbId:
+            self._record.db_id = dbId[0][0]
     
         Looks up a record in the DB; if it finds it, update it, otherwise create it
         '''
-        if self.getRecordID() > 0:
+        self.getRecordID()
+        if self._record.db_id:
+            self.updateRecord()
+        else:
             self.createRecord()
-        else:
-            self.updateRecord()
     
         logging.info("Record already existing in DB - performing updates")
         # firstly, check the document is actually new - i.e. not just a repeat of the same data
-        if _checkIsUpdatedRecord():
+        if self._checkIsUpdatedRecord():
 
             # firstly update the actual documents contained by the record - i.e. the original doc +
             # the various transforms required
-            self._updateMetadataRecords(record)
-
+            self._updateMetadataRecords()
+
             # Now update the spatiotemporal data
-            self._updateSpatioTemporalData(record)
+            self._updateSpatioTemporalData()
+
+        logging.info("Finish processing document...")
     
         '''
         logging.info("Checking the updated document actually contains changes")
-        sql = "SELECT harvest_count FROM ORIGINAL_DOCUMENT where original_document_id = " + self.id + \
-            " AND original_document = " + self._record.originalFormat + "';"
+
+        sql = "SELECT harvest_count FROM ORIGINAL_DOCUMENT where original_document_id = " + \
+            str(self._record.db_id) + " AND original_document = '" + self._record.originalFormat + "';"
         count = db_funcs.runSQLCommand(self._connection, sql)
-
+
         # NB, if the document is not identical, the sql command will not find anything
-        if count > 0:
-            logging.info("Ingested document is identical to document currently in DB - incrementing harvest_count")
-            count += 1
-            sql = "UPDATE ORIGINAL_DOCUMENT SET harvest_count = " + count + " WHERE original_document_id = " + self.id
-            count = db_funcs.runSQLCommand(self._connection, sql)
-            return False
-
-        logging.info("Ingested document is different to that in the current DB")
-        return True
+        if not count:
+            logging.info("Ingested document is different to that in the current DB")
+            return True
+
+        count = count[0][0]
+        logging.info("Ingested document is identical to document currently in DB - " + \
+                     "incrementing harvest_count")
+        count += 1
+        sql = "UPDATE ORIGINAL_DOCUMENT SET harvest_count = " + str(count) + \
+            " WHERE original_document_id = " + str(self._record.db_id)
+        db_funcs.runSQLCommand(self._connection, sql)
+        return False
     
         sqlStart = ""
         insert_sql="INSERT INTO "+location_table+" (geom) VALUES ( setsrid('BOX3D("+west+" "+south+","+east+" "+north+")'::box3d,4326) );"
-        sqlEnd = " VALUES (DEFAULT, '" + self._record.id
+        sqlEnd = " VALUES (DEFAULT, '" + str(self._record.db_id)
         if (not self._record.hasNullCoords()):
             sqlStart += "coordinates"
     
             sqlStart = ", " + sqlStart
             sqlEnd = ", " + sqlEnd
+
         sqlCmd = "INSERT INTO spatiotemp (id, original_doc_id" + sqlStart + ") " + sqlEnd + ");"
         db_funcs.runSQLCommand(self._connection, sqlCmd)
     
         logging.info("Updating spatiotemporal data to DB record")
         sqlStart = ""
-        sqlEnd = " WHERE original_doc_id='" + self._record.id + "';"
+        sqlEnd = " WHERE original_doc_id='" + str(self._record.db_id) + "';"
         if (not self._record.hasNullCoords()):
             sqlStart += "coordinates = sbox'((" + self._record.west + "d , " + \
     
         '''
         logging.info("Inserting new original document in Postgres DB")
-        sqlCmd = "INSERT INTO ORIGINAL_DOCUMENT (original_document_id, original_document_name, original_format, " \
-            "original_document, ts_vector, create_date, harvest_count, scn) VALUES (" \
-            "DEFAULT, '" + self._record.filename + "', '" + self._record.docType + "', '" + self._record.originalFormat + \
-            "', to_tsvector('english', " + self._record.originalFormat + "), current_date, 1, 1);"
-
-        id = db_funcs.runSQLCommand(self._connection, sqlCmd)
-        self.id = id
-        self._record.db_id = id
+        sqlCmd = "INSERT INTO ORIGINAL_DOCUMENT (original_document_id, original_document_filename, " + \
+            "discovery_id, original_format, " + \
+            "original_document, ts_vector, create_date, harvest_count, scn) VALUES (" + \
+            "DEFAULT, '" + self._record.filename + "', '" + self._record.discovery_id + \
+            "', '" + self._record.docType + "', '" + self._record.originalFormat + \
+            "', to_tsvector('english', '" + self._record.originalFormat + "'), current_date, 1, 1);"
+
+        self._record.db_id = db_funcs.runSQLCommand(self._connection, sqlCmd)
         logging.info("Original document inserted in Postgres DB")
     
         '''
         logging.info("Updating original document in Postgres DB")
-        sqlCmd = "UPDATE ORIGINAL_DOCUMENT SET (original_document_name = '" + self._record.filename + "', " \
-            "original_format = '" + self._record.originalFormat + "', " \
-            "ts_vector = to_tsvector('english', " + self._record.originalFormat + "), " \
-            "update_date = current_date, " \
-            "harvest_count = 1)" \
-            " WHERE original_document_id = " + self._record.db_id + ";"
+        sqlCmd = "UPDATE ORIGINAL_DOCUMENT SET (original_document_filename = '" + self._record.filename + \
+            "', discovery_id = '" + self._record.discovery_id + "', " + \
+            "original_format = '" + self._record.originalFormat + "', " + \
+            "ts_vector = to_tsvector('english', '" + self._record.originalFormat + "'), " + \
+            "update_date = current_date, harvest_count = 1)" + \
+            " WHERE original_document_id = " + str(self._record.db_id) + ";"
 
         db_funcs.runSQLCommand(self._connection, sqlCmd)
     
         logging.info("Inserting transformed documents for original document, %s, in Postgres DB", self._record.filename)
         if self._record.db_id is None:
-            print "No DB ID for the original record exists; cannot add associated transformed docs"
+            logging.info("No DB ID for the original record exists; cannot add associated transformed docs")
             return
 
-        for docType, doc in record.getAllDocs():
+        for docType, doc in self._record.getAllDocs():
             sqlCmd = "INSERT INTO TRANSFORMED_DOCUMENT (transformed_document_id, " \
                 "original_record_id, transformed_format, " \
                 "transformed_document, create_date, scn) VALUES (" \
-                "DEFAULT, '" + self._record.db_id + "', '" + \
+                "DEFAULT, '" + str(self._record.db_id) + "', '" + \
                 docType + "', '" + doc + "'), current_date, 1);"
     
         logging.info("Updating transformed documents for original document, %s, in Postgres DB", self._record.filename)
         if self._record.db_id is None:
-            print "No DB ID for the original record exists; cannot update associated transformed docs"
+            logging.info("No DB ID for the original record exists; cannot update associated transformed docs")
             return
 
-        for docType, doc in record.getAllDocs():
+        for docType, doc in self._record.getAllDocs():
             sqlCmd = "UPDATE TRANSFORMED_DOCUMENT SET (transformed_document = '" + doc + \
                 "', update_date = current_date) WHERE original_record_id = " + \
-                self._record.db_id + " AND transformed_format = '" + docType + "';"
+                str(self._record.db_id) + " AND transformed_format = '" + docType + "';"
 
             db_funcs.runSQLCommand(self._connection, sqlCmd)
     
         itemtimelocation_sql = "INSERT INTO "+item_table+" (itemid, locationid, timeid) values ('"+Mid+"', "+locationidstr+", "+timeidstr+" );"
 
-        print "ItemTimeLocation:\t"+itemtimelocation_sql
+        #print "ItemTimeLocation:\t"+itemtimelocation_sql
         cursor = connection.cursor()
         try:
     
             itemtimelocation_sql = "INSERT INTO "+item_table+" (itemid, locationid) values ('"+Mid+"', "+locationidstr+" );"
 
-            print "ItemTimeLocation:\t"+itemtimelocation_sql
+            #print "ItemTimeLocation:\t"+itemtimelocation_sql
             cursor = connection.cursor()
             try:
     
             itemtimelocation_sql = "INSERT INTO "+item_table+" (itemid, timeid) values ('"+Mid+"', "+timeidstr+" );"
 
-            print "ItemTimeLocation:\t"+itemtimelocation_sql
+            #print "ItemTimeLocation:\t"+itemtimelocation_sql
             cursor = connection.cursor()
             try:
     
             print "Error: database error %s %s" %(sys.exc_type, sys.exc_value)
         connection.commit()
+
 
     def insert_temporal_coverage(Mid,startdate,enddate):
  • TI01-discovery/branches/ingestAutomation-upgrade/OAIBatch/PostgresRecord.py

    r3839 r3846  
 
         if status is not None:
-            sys.exit("Failed at running the XQuery")
+            raise SystemError, 'Failed at running the XQuery'
 
         # now remove the temp xquery file
         status = os.unlink(xqFile)
         if status is not None:
-            sys.exit("Failed to remove the temporary xquery file, " + xqFile)
+            raise OSError, 'Failed to remove the temporary xquery file, ' + xqFile
 
         logging.info("Transform completed successfully")
     
             return self._allDocs
 
-        for docType in documentTypes:
-            self._allDocs.append([docType, getDocumentFormat(docType)])
+        for docType in self.documentTypes:
+            self._allDocs.append([docType, self.getDocumentFormat(docType)])
         return self._allDocs
     
         # represented as ns0, ns1, ns2 etc.
         #cElementTree._namespace_map.update({'http://ndg.nerc.ac.uk/moles': 'moles', 'http://www.w3.org/1999/xlink':'xlink'})
-        no_bbox = False
-        no_dates = False
         self.east = 'null'
-        self.west = 'null'
+        self.west = []
         self.north = 'null'
         self.south = 'null'
     
         try:
             dgMeta.fromXML(cElementTree.ElementTree(file=molesFile).getroot())
-            print dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage.dgSpatialCoverage.BoundingBox
         except Exception, detail:
-            logging.warning("Cannot parse the XML moles document %s. Will not process" %molesFile)
-            logging.debug(detail)
-            return
-
-        try:
+            raise SystemError, 'Cannot parse the XML moles document %s. Detail:\n%s' %(molesFile, detail)
+
+        bbox_list = []
+        try:
+            logging.info("Extracting bounding box info")
             bbox_list=self.listify(dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage.dgSpatialCoverage.BoundingBox)
+            #parse the list of coordinates
+            for bbox in bbox_list:
+                self.west.append(self.parseCoord(bbox.LimitWest, 'W', 'E'))
+                self.east = self.parseCoord(bbox.LimitEast, 'W', 'E')
+                self.north = self.parseCoord(bbox.LimitNorth, 'S', 'N')
+                self.south = self.parseCoord(bbox.LimitSouth, 'S', 'N')
         except Exception, detail:
             logging.info("XML moles document " + molesFile + " does not contain a bounding box.")
             logging.debug(detail)
-            no_bbox=True
 
         try:
     
             self.enddate = enddate
 
-        if not no_bbox:
-            #parse the coordinates somewhat - only use the first bounding box.
-            bbox=bbox_list[0]
-            try:
-                self.west = self.parseCoord(bbox.LimitWest, 'W', 'E')
-            except:
-                logging.error("Will not process File %s. Contains incorrect East bounding box limit." %molesFile)
-                return
-
-            try:
-                self.east = self.parseCoord(bbox.LimitEast, 'W', 'E')
-            except:
-                logging.error("Will not process File %s. Contains incorrect East bounding box limit." %molesFile)
-                return
-
-            try:
-                self.north = self.parseCoord(bbox.LimitNorth, 'S', 'N')
-            except:
-                logging.error("Will not process File %s. Contains incorrect North bounding box limit." %molesFile)
-                return
-
-            try:
-                self.south = self.parseCoord(bbox.LimitSouth, 'S', 'N')
-            except:
-                logging.error("Will not process File %s. Contains incorrect South bounding box limit." %molesFile)
-                return
 
         logging.info("Spatial info: west= " + self.west + ",south " + self.south + ", east " + \
     
         @return: coord - the value of the coordinate as a string
         '''
-
-        coord = coordValue.strip()
-        if coord.endswith(maxField):
-            coord=coordValue.split(maxField)[0]
-        elif coord.endswith(minField):
-            if coord.startswith('-'):
-                coord = coordValue.split(minField)[0]
-            else:
-                coord = "-" + coordValue.split(minField)[0]
-
-        return '%s' % float(coord)
+        logging.info("Parsing document coordinates")
+        try:
+            coord = coordValue.strip()
+            if coord.endswith(maxField):
+                coord=coordValue.split(maxField)[0]
+            elif coord.endswith(minField):
+                if coord.startswith('-'):
+                    coord = coordValue.split(minField)[0]
+                else:
+                    coord = "-" + coordValue.split(minField)[0]
+
+            return '%s' % float(coord)
+        except:
+            raise SyntaxError, 'Will not process File: contains incorrect bounding box limit: ' + coordValue
 
 
     def hasNullCoords():