Changeset 3967 for TI01-discovery
- Timestamp: 02/06/08 10:43:00 (13 years ago)
- Location: TI01-discovery/branches/ingestAutomation-upgrade
- Files: 6 edited
Legend:
- Unmodified
- Added
- Removed
TI01-discovery/branches/ingestAutomation-upgrade/OAIBatch/PostgresDAO.py
r3869 r3967 229 229 sqlCmd = "SELECT create_document('" + self._record.filename + "', '" + \ 230 230 self._record.discovery_id + "', '" + self._record.docType + "', '" + \ 231 self._record.originalFormat + "');" 231 self._record.originalFormat + "', '" + self._record.getAuthorsInfo() + "', '" + \ 232 self._record.getParametersInfo() + "', '" + self._record.getScopeInfo() + "');" 232 233 233 234 id = db_funcs.runSQLCommand(self._connection, sqlCmd) … … 258 259 self._record.filename + "', '" + \ 259 260 self._record.discovery_id + "', '" + self._record.docType + "', '" + \ 260 self._record.originalFormat + "', '" + str(self._record.scn) + "');" 261 self._record.originalFormat + "', '" + self._record.getAuthorsInfo + "', '" + \ 262 self._record.getParametersInfo + "', '" + self._record.getScopeInfo + "', '" + str(self._record.scn) + "');" 261 263 db_funcs.runSQLCommand(self._connection, sqlCmd) 262 264 -
TI01-discovery/branches/ingestAutomation-upgrade/OAIBatch/PostgresRecord.py
r3912 r3967 1 1 #!/usr/bin/env python 2 2 ''' 3 Class representing the contents of a row in the metadata_recordpostgres DB table3 Class representing the a document to be ingested into the postgres DB table 4 4 C Byrom Apr 08 5 5 ''' … … 23 23 class PostgresRecord: 24 24 ''' 25 Class representing the contents of a row in the metadata_recordpostgres DB table25 Class representing the a document to be ingested into the postgres DB table 26 26 @param filename: Name of file to use a metadata record 27 @param 27 @param ndg_dataprovider 28 @param datacentre_groups 29 @param datacentre_namespace 30 @param discovery_id 31 @param xq 32 @param doctype - type of doc to process 28 33 ''' 29 34 # TODO MDIP transforms do not work very well for lots of files - so currently hiding these 30 35 documentTypes = ['MOLES', 'DIF', 'DC', 'ISO19139']#, 'MDIP'] 36 37 # vocab server - used for finding scope values in the moles files 38 ndg_data_provider_vocab = "http://vocab.ndg.nerc.ac.uk/term/N010" 31 39 32 40 def __init__(self, filename, ndg_dataprovider, datacentre_groups, datacentre_namespace, discovery_id, xq, docType): … … 61 69 # dir to store a temp copy of the moles file, when produced - for use by other transforms 62 70 self._molesDir = None 71 # object to hold the moles file - this will be loaded in when it is created - in order to extract 72 # spatiotemporal data, etc 73 self.dgMeta = None 63 74 64 75 # firstly load contents of file … … 78 89 # spatiotemporal data object 79 90 self.stData = None 91 92 # fields to hold author, parameter and scope data 93 self.authors = None 94 self.parameters = None 95 self.scope = None 80 96 81 97 def escapeSpecialCharacters(self, inputString): … … 115 131 self._fileUtils.createFile(self._molesDir + self._shortFilename, self._molesFormat) 116 132 logging.info("Moles file created - at %s" %self._molesDir) 133 134 # now load this moles file, for use when parsing out spatiotemporal, author and parameters data later on 135 molesFile = self._molesDir + 
self._shortFilename 136 logging.info('Retrieving spatiotemporal info from moles file, %s' %molesFile) 137 138 # load in the moles file and put this into an object for direct access to the xml elements 139 self.dgMeta=MRW.dgMetadata() 140 try: 141 self.dgMeta.fromXML(cElementTree.ElementTree(file=molesFile).getroot()) 142 except Exception, detail: 143 raise SystemError, 'Cannot parse the XML moles document %s. Detail:\n%s' %(molesFile, detail) 144 117 145 118 146 … … 313 341 Extract spatio temporal data from the original document 314 342 ''' 343 logging.info('Retrieving spatiotemporal info from moles file') 315 344 # initialise the various spatiotemporal arrays used to extract data to 316 345 self.stData = SpatioTemporalData() 317 346 318 molesFile = self._molesDir + self._shortFilename 319 logging.info('Retrieving spatiotemporal info from moles file, %s' %molesFile) 320 321 # load in the moles file and put this into an object for direct access to the xml elements 322 dgMeta=MRW.dgMetadata() 323 try: 324 dgMeta.fromXML(cElementTree.ElementTree(file=molesFile).getroot()) 325 except Exception, detail: 326 raise SystemError, 'Cannot parse the XML moles document %s. 
Detail:\n%s' %(molesFile, detail) 327 347 if self.dgMeta is None: 348 self.createMolesFile() 349 328 350 # do quick checks to see if the relevant data exists 329 if not dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary:351 if not self.dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary: 330 352 logging.info("No data summary elements found - assuming no spatiotemporal data available") 331 353 return 332 354 333 if not dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage:355 if not self.dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage: 334 356 logging.info("No data coverage elements found - assuming no spatiotemporal data available") 335 357 return 336 358 337 if not dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage.dgSpatialCoverage:359 if not self.dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage.dgSpatialCoverage: 338 360 logging.info("No spatial coverage elements found - assuming no spatial data available") 339 361 else: 340 self.getCoordData( dgMeta)341 342 if not dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage.dgTemporalCoverage:362 self.getCoordData(self.dgMeta) 363 364 if not self.dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage.dgTemporalCoverage: 343 365 logging.info("No temporal coverage elements found - assuming no temporal data available") 344 366 else: 345 self.getTimeRangeData(dgMeta) 346 347 367 self.getTimeRangeData(self.dgMeta) 368 369 370 def getAuthorsInfo(self): 371 ''' 372 Extract authors info from the moles file 373 ''' 374 logging.info('Retrieving authors info from moles file') 375 376 if self.dgMeta is None: 377 self.createMolesFile() 378 379 logging.info("Extracting author info") 380 creators = "" 381 authors = "" 382 try: 383 creators = self.dgMeta.dgMetadataRecord.dgDataEntity.dgDataRoles.dgDataCreator 384 logging.info("Found creator information - adding this to authors record") 385 386 except Exception, detail: 387 logging.info("Exception 
thrown whilst trying to find creator information:") 388 logging.info(detail) 389 logging.info("- this suggests document does not contain creator information.") 390 391 try: 392 authors = self.dgMeta.dgMetadataRecord.dgMetadataDescription.abstract.abstractOnlineReference.dgCitation.authors 393 logging.info("Found cited author information - adding this to authors record") 394 395 except Exception, detail: 396 logging.info("Exception thrown whilst trying to find cited author information:") 397 logging.info(detail) 398 logging.info("- this suggests document does not contain cited author information.") 399 400 self.authors = authors + " " + creators 401 return self.authors 402 403 404 def getParametersInfo(self): 405 ''' 406 Extract parameters info from the moles file 407 ''' 408 logging.info('Retrieving parameters info from moles file') 409 410 if self.dgMeta is None: 411 self.createMolesFile() 412 413 params = "" 414 try: 415 parameters = self.dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgParameterSummary 416 logging.info("Found parameter information - adding this to record") 417 418 except Exception, detail: 419 logging.info("Exception thrown whilst trying to find parameter information:") 420 logging.info(detail) 421 logging.info("- this suggests document does not contain parameter information.") 422 423 self.parameters = params 424 return self.parameters 425 426 427 def getScopeInfo(self): 428 ''' 429 Extract scope info from the moles file 430 ''' 431 logging.info('Retrieving scope info from moles file') 432 433 if self.dgMeta is None: 434 self.createMolesFile() 435 436 scope = "" 437 try: 438 keywords = self.dgMeta.dgMetadataRecord.dgStructuredKeyword 439 logging.info("Found keyword information - parsing this for scope") 440 441 keywords_list = self.listify(keywords) 442 for keyword in keywords_list: 443 if keyword.dgValidTermID: 444 if keyword.dgValidTermID.ParentListID.strip().startswith(self.ndg_data_provider_vocab): 445 logging.info("Found scope value - 
adding this to record") 446 scope += " " + keyword.dgValidTerm.strip() 447 448 except Exception, detail: 449 logging.info("Exception thrown whilst trying to find scope information:") 450 logging.info(detail) 451 logging.info("- this suggests document does not contain scope information.") 452 453 self.scope = scope 454 return self.scope 455 456 348 457 def getTimeRangeData(self, dgMeta): 349 458 ''' -
TI01-discovery/branches/ingestAutomation-upgrade/database/ingest_procedures.sql
r3863 r3967 21 21 22 22 DROP FUNCTION create_document(filename_in varchar(255), discovery_id_in varchar(255), 23 doc_type_in text, original_document_in text ) CASCADE;23 doc_type_in text, original_document_in text, authors text, parameters text, scope text) CASCADE; 24 24 CREATE FUNCTION create_document(filename_in varchar(255), discovery_id_in varchar(255), 25 doc_type_in text, original_document_in text ) RETURNS integer AS25 doc_type_in text, original_document_in text, authors text, parameters text, scope text) RETURNS integer AS 26 26 $$ 27 27 DECLARE … … 30 30 -- This inserts a new document into the DB 31 31 INSERT INTO ORIGINAL_DOCUMENT (original_document_id, original_document_filename, 32 discovery_id, original_format, original_document, ts_vector, create_date,33 32 discovery_id, original_format, original_document, document_ts_vector, authors_ts_vector, 33 parameters_ts_vector, scope_ts_vector, create_date, harvest_count, scn) VALUES ( 34 34 DEFAULT, filename_in, discovery_id_in, doc_type_in, original_document_in, 35 to_tsvector('english', original_document_in), current_timestamp, 1, 1); 35 to_tsvector('english', original_document_in), to_tsvector('english', authors), 36 to_tsvector('english', parameters), to_tsvector('english', scope), current_timestamp, 1, 1); 36 37 37 38 SELECT original_document_id INTO db_id FROM ORIGINAL_DOCUMENT WHERE discovery_id = discovery_id_in; … … 56 57 57 58 DROP FUNCTION update_document(original_document_id_in int, filename_in varchar(255), 58 discovery_id_in varchar(255), doc_type_in text, original_document_in text, scn_in int) CASCADE; 59 discovery_id_in varchar(255), doc_type_in text, original_document_in text, 60 authors text, parameters text, scope text, scn_in int) CASCADE; 59 61 CREATE FUNCTION update_document(original_document_id_in int, filename_in varchar(255), 60 discovery_id_in varchar(255), doc_type_in text, original_document_in text, scn_in int) 62 discovery_id_in varchar(255), doc_type_in text, 
original_document_in text, 63 authors text, parameters text, scope text, scn_in int) 61 64 RETURNS VOID AS 62 65 $$ … … 79 82 original_format = doc_type_in, 80 83 original_document = original_document_in, 81 ts_vector = to_tsvector('english', original_document_in), 84 document_ts_vector = to_tsvector('english', original_document_in), 85 authors_ts_vector = to_tsvector('english', authors), 86 parameters_ts_vector = to_tsvector('english', parameters), 87 scope_ts_vector = to_tsvector('english', scope), 82 88 update_date = current_timestamp, 83 89 harvest_count = 1, -
TI01-discovery/branches/ingestAutomation-upgrade/database/original_document.sql
r3849 r3967 12 12 original_format text, 13 13 original_document text, 14 ts_vector tsvector, 14 document_ts_vector tsvector, 15 authors_ts_vector tsvector, 16 parameters_ts_vector tsvector, 17 scope_ts_vector tsvector, 15 18 create_date timestamp, 16 19 update_date timestamp, … … 29 32 30 33 -- Create index on searchable column to speed up searches 31 CREATE INDEX textsearch_idx ON original_document USING gin( ts_vector);34 CREATE INDEX textsearch_idx ON original_document USING gin(document_ts_vector); -
TI01-discovery/branches/ingestAutomation-upgrade/database/spatial_data.sql
r3849 r3967 16 16 -- add 2D geometry column, 'geometry', to table - with SRS val of 4326 17 17 select addgeometrycolumn('spatial_data','geometry',4326,'GEOMETRY',2); 18 19 -- Create index on searchable column to speed up searches 20 CREATE INDEX spatialsearch_idx ON spatial_data USING GIST(geometry); -
TI01-discovery/branches/ingestAutomation-upgrade/database/test_data.sql
r3863 r3967 5 5 * 6 6 */ 7 INSERT INTO original_document VALUES (DEFAULT, 'tst.xml', 'test_dummy', 'DIF', null, null, current_timestamp, null, 1, 1);7 INSERT INTO original_document VALUES (DEFAULT, 'tst.xml', 'test_dummy', 'DIF', null, null, null, null, to_tsvector('NERC'), current_timestamp, null, 1, 1); 8 8 INSERT INTO original_document VALUES (DEFAULT, 'tst1.xml', 'test_record', 'DIF', '<?xml version="1.0" encoding="utf-8"?> 9 9 <kml xmlns="http://earth.google.com/kml/2.2"> … … 28 28 </Folder> 29 29 </kml>', 30 null, current_timestamp, null, 1, 1);30 null, null, null, to_tsvector('MDIP'), current_timestamp, null, 1, 1); 31 31 32 INSERT INTO original_document VALUES (DEFAULT, 'tst2.xml', 'test_dummy1', 'DIF', 'freds freds bloo' , null, current_timestamp, null, 1, 1);33 INSERT INTO original_document VALUES (DEFAULT, 'tst3.xml', 'test_dummy2', null, 'fred fred bloo fred', null, current_timestamp, null, 1, 1);34 INSERT INTO original_document VALUES (DEFAULT, 'tst4.xml', 'test_dummy3', null, 'fred fred bloo, fred, fred, fred, fre, fred', null, current_timestamp, null, 1, 1);32 INSERT INTO original_document VALUES (DEFAULT, 'tst2.xml', 'test_dummy1', 'DIF', 'freds freds bloo' , null, null, null, to_tsvector('MDIP'), current_timestamp, null, 1, 1); 33 INSERT INTO original_document VALUES (DEFAULT, 'tst3.xml', 'test_dummy2', null, 'fred fred bloo fred', null, null, null, to_tsvector('MDIP'), current_timestamp, null, 1, 1); 34 INSERT INTO original_document VALUES (DEFAULT, 'tst4.xml', 'test_dummy3', null, 'fred fred bloo, fred, fred, fred, fre, fred', null, null, null, null, current_timestamp, null, 1, 1); 35 35 36 UPDATE original_document SET ts_vector = to_tsvector('english', original_document);36 UPDATE original_document SET document_ts_vector = to_tsvector('english', original_document); 37 37 38 select original_document_id, ts_vector, ts_rank(ts_vector, query) as rank from original_document, to_tsquery('english', 'FRED') query WHERE query @@ coalesce(ts_vector,'') 
order by rank desc;38 select original_document_id, document_ts_vector, ts_rank(document_ts_vector, query) as rank from original_document, to_tsquery('english', 'FRED') query WHERE query @@ coalesce(document_ts_vector,'') order by rank desc; 39 39 40 40
Note: See TracChangeset for help on using the changeset viewer.