Changeset 3846 for TI01-discovery
- Timestamp: 02/05/08 11:26:44
- Location: TI01-discovery/branches/ingestAutomation-upgrade/OAIBatch
- Files: 2 edited
Legend:
- Unmodified (shown as context lines)
- Added (prefixed +)
- Removed (prefixed -)
TI01-discovery/branches/ingestAutomation-upgrade/OAIBatch/PostgresDAO.py
r3839 → r3846

 '''
 import sys, os, logging
-import molesReadWrite as MRW
 import db_funcs
…
             sys.exit("USAGE: argument 1 = PostgresRecord object to process")
         else:
-            print "INFO: Creating/updating DB entry for record, %s" %record.discovery_id
+            logging.info("INFO: Creating/updating DB entry for record, %s" %record.discovery_id)

         # setup a connection to the db - if none specified
…
         self._connection = connection
         self._record = record
-        self.id = None

…
         '''
         logging.info("Looking up record, " + self._record.discovery_id + " in DB")
-        if self.id is not None and self.id > 0:
-            logging.info("Already looked up record - ID is " + self.id)
-            return self.id
-
-        sql = "SELECT original_document_id FROM ORIGINAL_DOCUMENT where discovery_id = '" + self._record.discovery_id + "';"
-        self.id = db_funcs.runSQLCommand(self._connection, sql)
-        print "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXKKKK", self.id
+        if self._record.db_id is not None and self._record.db_id > 0:
+            logging.info("Already looked up record - ID is " + str(self._record.db_id))
+            return self._record.db_id
+
+        sql = "SELECT original_document_id FROM ORIGINAL_DOCUMENT where discovery_id = '" + \
+            self._record.discovery_id + "';"
+        dbId = db_funcs.runSQLCommand(self._connection, sql)
+        if dbId:
+            self._record.db_id = dbId[0][0]

…
         Looks up a record in the DB; if it finds it, update it, otherwise create it
         '''
-        if self.getRecordID() > 0:
+        self.getRecordID()
+        if self._record.db_id:
+            self.updateRecord()
+        else:
             self.createRecord()
-        else:
-            self.updateRecord()

…
         logging.info("Record already existing in DB - performing updates")
         # firstly, check the document is actually new - i.e. not just a repeat of the same data
-        if _checkIsUpdatedRecord():
+        if self._checkIsUpdatedRecord():

             # firstly update the actual documents contained by the record - i.e. the original doc +
             # the various transforms required
-            self._updateMetadataRecords( record)
+            self._updateMetadataRecords()

             # Now update the spatiotemporal data
-            self._updateSpatioTemporalData(record)
+            self._updateSpatioTemporalData()
+
+        logging.info("Finish processing document...")

…
         '''
         logging.info("Checking the updated document actually contains changes")
-        sql = "SELECT harvest_count FROM ORIGINAL_DOCUMENT where original_document_id = " + self.id + \
-            " AND original_document = " + self._record.originalFormat + "';"
+
+        sql = "SELECT harvest_count FROM ORIGINAL_DOCUMENT where original_document_id = " + \
+            str(self._record.db_id) + " AND original_document = '" + self._record.originalFormat + "';"
         count = db_funcs.runSQLCommand(self._connection, sql)

         # NB, if the document is not identical, the sql command will not find anything
-        if count > 0:
-            logging.info("Ingested document is identical to document currently in DB - incrementing harvest_count")
-            count += 1
-            sql = "UPDATE ORIGINAL_DOCUMENT SET harvest_count = " + count + " WHERE original_document_id = " + self.id
-            count = db_funcs.runSQLCommand(self._connection, sql)
-            return False
-
-        logging.info("Ingested document is different to that in the current DB")
-        return True
+        if not count:
+            logging.info("Ingested document is different to that in the current DB")
+            return True
+
+        count = count[0][0]
+        logging.info("Ingested document is identical to document currently in DB - " + \
+            "incrementing harvest_count")
+        count += 1
+        sql = "UPDATE ORIGINAL_DOCUMENT SET harvest_count = " + str(count) + \
+            " WHERE original_document_id = " + str(self._record.db_id)
+        db_funcs.runSQLCommand(self._connection, sql)
+        return False
+

…
         sqlStart = ""
         insert_sql="INSERT INTO "+location_table+" (geom) VALUES ( setsrid('BOX3D("+west+" "+south+","+east+" "+north+")'::box3d,4326) );"
-        sqlEnd = " VALUES (DEFAULT, '" + self._record.id
+        sqlEnd = " VALUES (DEFAULT, '" + str(self._record.db_id)
         if (not self._record.hasNullCoords()):
             sqlStart += "coordinates"
…
             sqlStart = ", " + sqlStart
             sqlEnd = ", " + sqlEnd
+
         sqlCmd = "INSERT INTO spatiotemp (id, original_doc_id" + sqlStart + ") " + sqlEnd + ");"
         db_funcs.runSQLCommand(self._connection, sqlCmd)
…
         logging.info("Updating spatiotemporal data to DB record")
         sqlStart = ""
-        sqlEnd = " WHERE original_doc_id='" + self._record.id + "';"
+        sqlEnd = " WHERE original_doc_id='" + str(self._record.db_id) + "';"
         if (not self._record.hasNullCoords()):
             sqlStart += "coordinates = sbox'((" + self._record.west + "d , " + \
…
         '''
         logging.info("Inserting new original document in Postgres DB")
-        sqlCmd = "INSERT INTO ORIGINAL_DOCUMENT (original_document_id, original_document_name, original_format, "\
-            "original_document, ts_vector, create_date, harvest_count, scn) VALUES ("\
-            "DEFAULT, '" + self._record.filename + "', '" + self._record.docType + "', '" + self._record.originalFormat + \
-            "', to_tsvector('english', " + self._record.originalFormat + "), current_date, 1, 1);"
-
-        id = db_funcs.runSQLCommand(self._connection, sqlCmd)
-        self.id = id
-        self._record.db_id = id
+        sqlCmd = "INSERT INTO ORIGINAL_DOCUMENT (original_document_id, original_document_filename, " + \
+            "discovery_id, original_format, " + \
+            "original_document, ts_vector, create_date, harvest_count, scn) VALUES (" + \
+            "DEFAULT, '" + self._record.filename + "', '" + self._record.discovery_id + \
+            "', '" + self._record.docType + "', '" + self._record.originalFormat + \
+            "', to_tsvector('english', '" + self._record.originalFormat + "'), current_date, 1, 1);"
+
+        self._record.db_id = db_funcs.runSQLCommand(self._connection, sqlCmd)
         logging.info("Original document inserted in Postgres DB")
…
         '''
         logging.info("Updating original document in Postgres DB")
-        sqlCmd = "UPDATE ORIGINAL_DOCUMENT SET (original_document_name = '" + self._record.filename + "', "\
-            "original_format = '" + self._record.originalFormat + "', "\
-            "ts_vector = to_tsvector('english', " + self._record.originalFormat + "), "\
-            "update_date = current_date, "\
-            "harvest_count = 1)"\
-            " WHERE original_document_id = " + self._record.db_id + ";"
+        sqlCmd = "UPDATE ORIGINAL_DOCUMENT SET (original_document_filename = '" + self._record.filename + \
+            "', discovery_id = '" + self._record.discovery_id + "', " + \
+            "original_format = '" + self._record.originalFormat + "', " + \
+            "ts_vector = to_tsvector('english', '" + self._record.originalFormat + "'), " + \
+            "update_date = current_date, harvest_count = 1)" + \
+            " WHERE original_document_id = " + str(self._record.db_id) + ";"

         db_funcs.runSQLCommand(self._connection, sqlCmd)
…
         logging.info("Inserting transformed documents for original document, %s, in Postgres DB", self._record.filename)
         if self._record.db_id is None:
-            print "No DB ID for the original record exists; cannot add associated transformed docs"
+            logging.info("No DB ID for the original record exists; cannot add associated transformed docs")
             return

-        for docType, doc in record.getAllDocs():
+        for docType, doc in self._record.getAllDocs():
             sqlCmd = "INSERT INTO TRANSFORMED_DOCUMENT (transformed_document_id, " \
                 "original_record_id, transformed_format, " \
                 "transformed_document, create_date, scn) VALUES (" \
-                "DEFAULT, '" + self._record.db_id + "', '" + \
+                "DEFAULT, '" + str(self._record.db_id) + "', '" + \
                 docType + "', '" + doc + "'), current_date, 1);"
…
         logging.info("Updating transformed documents for original document, %s, in Postgres DB", self._record.filename)
         if self._record.db_id is None:
-            print "No DB ID for the original record exists; cannot update associated transformed docs"
+            logging.info("No DB ID for the original record exists; cannot update associated transformed docs")
             return

-        for docType, doc in record.getAllDocs():
+        for docType, doc in self._record.getAllDocs():
             sqlCmd = "UPDATE TRANSFORMED_DOCUMENT SET (transformed_document = '" + doc + \
                 "', update_date = current_date) WHERE original_record_id = " + \
-                self._record.db_id + " AND transformed_format = '" + docType + "';"
+                str(self._record.db_id) + " AND transformed_format = '" + docType + "';"

         db_funcs.runSQLCommand(self._connection, sqlCmd)
…
     itemtimelocation_sql = "INSERT INTO "+item_table+" (itemid, locationid, timeid) values ('"+Mid+"', "+locationidstr+", "+timeidstr+" );"

-    print "ItemTimeLocation:\t"+itemtimelocation_sql
+    #print "ItemTimeLocation:\t"+itemtimelocation_sql
     cursor = connection.cursor()
     try:
…
     itemtimelocation_sql = "INSERT INTO "+item_table+" (itemid, locationid) values ('"+Mid+"', "+locationidstr+" );"

-    print "ItemTimeLocation:\t"+itemtimelocation_sql
+    #print "ItemTimeLocation:\t"+itemtimelocation_sql
     cursor = connection.cursor()
     try:
…
     itemtimelocation_sql = "INSERT INTO "+item_table+" (itemid, timeid) values ('"+Mid+"', "+timeidstr+" );"

-    print "ItemTimeLocation:\t"+itemtimelocation_sql
+    #print "ItemTimeLocation:\t"+itemtimelocation_sql
     cursor = connection.cursor()
     try:
…
         print "Error: database error %s %s" %(sys.exc_type, sys.exc_value)
     connection.commit()
+

 def insert_temporal_coverage(Mid,startdate,enddate):
TI01-discovery/branches/ingestAutomation-upgrade/OAIBatch/PostgresRecord.py
r3839 → r3846

         if status is not None:
-            sys.exit("Failed at running the XQuery")
+            raise SystemError, 'Failed at running the XQuery'

         # now remove the temp xquery file
         status = os.unlink(xqFile)
         if status is not None:
-            sys.exit("Failed to remove the temporary xquery file, " + xqFile)
+            raise OSError, 'Failed to remove the temporary xquery file, ' + xqFile

         logging.info("Transform completed successfully")
…
             return self._allDocs

-        for docType in documentTypes:
-            self._allDocs.append([docType, getDocumentFormat(docType)])
+        for docType in self.documentTypes:
+            self._allDocs.append([docType, self.getDocumentFormat(docType)])
         return self._allDocs

…
         # represented as ns0, ns1, ns2 etc.
         #cElementTree._namespace_map.update({'http://ndg.nerc.ac.uk/moles': 'moles', 'http://www.w3.org/1999/xlink':'xlink'})
-        no_bbox = False
-        no_dates = False
         self.east = 'null'
-        self.west = 'null'
+        self.west = []
         self.north = 'null'
         self.south = 'null'
…
         try:
             dgMeta.fromXML(cElementTree.ElementTree(file=molesFile).getroot())
-            print dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage.dgSpatialCoverage.BoundingBox
         except Exception, detail:
-            logging.warning("Cannot parse the XML moles document %s. Will not process" %molesFile)
-            logging.debug(detail)
-            return
-
-        try:
+            raise SystemError, 'Cannot parse the XML moles document %s. Detail:\n%s' %(molesFile, detail)
+
+        bbox_list = []
+        try:
+            logging.info("Extracting bounding box info")
             bbox_list=self.listify(dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage.dgSpatialCoverage.BoundingBox)
+            #parse the list of coordinates
+            for bbox in bbox_list:
+                self.west.append(self.parseCoord(bbox.LimitWest, 'W', 'E'))
+                self.east = self.parseCoord(bbox.LimitEast, 'W', 'E')
+                self.north = self.parseCoord(bbox.LimitNorth, 'S', 'N')
+                self.south = self.parseCoord(bbox.LimitSouth, 'S', 'N')
         except Exception, detail:
             logging.info("XML moles document " + molesFile + " does not contain a bounding box.")
             logging.debug(detail)
-            no_bbox=True

         try:
…
             self.enddate = enddate

-        if not no_bbox:
-            #parse the coordinates somewhat - only use the first bounding box.
-            bbox=bbox_list[0]
-            try:
-                self.west = self.parseCoord(bbox.LimitWest, 'W', 'E')
-            except:
-                logging.error("Will not process File %s. Contains incorrect East bounding box limit." %molesFile)
-                return
-
-            try:
-                self.east = self.parseCoord(bbox.LimitEast, 'W', 'E')
-            except:
-                logging.error("Will not process File %s. Contains incorrect East bounding box limit." %molesFile)
-                return
-
-            try:
-                self.north = self.parseCoord(bbox.LimitNorth, 'S', 'N')
-            except:
-                logging.error("Will not process File %s. Contains incorrect North bounding box limit." %molesFile)
-                return
-
-            try:
-                self.south = self.parseCoord(bbox.LimitSouth, 'S', 'N')
-            except:
-                logging.error("Will not process File %s. Contains incorrect South bounding box limit." %molesFile)
-                return

         logging.info("Spatial info: west= " + self.west + ",south " + self.south + ", east " + \
…
         @return: coord - the value of the coordinate as a string
         '''
-
-        coord = coordValue.strip()
-        if coord.endswith(maxField):
-            coord=coordValue.split(maxField)[0]
-        elif coord.endswith(minField):
-            if coord.startswith('-'):
-                coord = coordValue.split(minField)[0]
-            else:
-                coord = "-" + coordValue.split(minField)[0]
-
-        return '%s' % float(coord)
+        logging.info("Parsing document coordinates")
+        try:
+            coord = coordValue.strip()
+            if coord.endswith(maxField):
+                coord=coordValue.split(maxField)[0]
+            elif coord.endswith(minField):
+                if coord.startswith('-'):
+                    coord = coordValue.split(minField)[0]
+                else:
+                    coord = "-" + coordValue.split(minField)[0]
+
+            return '%s' % float(coord)
+        except:
+            raise SyntaxError, 'Will not process File: contains incorrect bounding box limit: ' + coordValue
+

     def hasNullCoords():
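The other substantive change in PostgresRecord.py is in coordinate handling: bounding boxes are now parsed as they are listified, and parseCoord raises a SyntaxError on a bad limit instead of having the caller log an error and return per coordinate. As a quick illustration of what parseCoord does with hemisphere-suffixed MOLES coordinates, here is a self-contained restatement; parse_coord is a renamed stand-in for the method above, not part of the module.

def parse_coord(coord_value, min_field, max_field):
    """Strip a trailing hemisphere marker (e.g. 'N'/'S' or 'E'/'W') and
    return the value as a signed decimal string, raising on bad input."""
    try:
        coord = coord_value.strip()
        if coord.endswith(max_field):        # e.g. '30E' -> '30'
            coord = coord.split(max_field)[0]
        elif coord.endswith(min_field):      # e.g. '30W' -> '-30'
            coord = coord.split(min_field)[0]
            if not coord.startswith('-'):
                coord = '-' + coord
        return '%s' % float(coord)
    except ValueError:
        raise SyntaxError('incorrect bounding box limit: ' + coord_value)

print(parse_coord('30E', 'W', 'E'))    # 30.0
print(parse_coord('45.5W', 'W', 'E'))  # -45.5
print(parse_coord('12.5S', 'S', 'N'))  # -12.5
print(parse_coord('60', 'S', 'N'))     # 60.0 (already unsuffixed)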