Ignore:
Timestamp:
01/05/08 14:15:41 (12 years ago)
Author:
cbyrom
Message:

Add script to run ingest for all available config files.
Make oai_ingest_new2 a proper object.
Adjust db_funcs - now pass in details to set up database connection

  • although defaults are available if this is not done.

Simplify coord parsing in PostgresRecord by using a reusable function.
+ various tidy ups and fixes.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • TI01-discovery/branches/ingestAutomation-upgrade/OAIBatch/PostgresRecord.py

    r3821 r3839  
    55''' 
    66try: #python 2.5 
    7     from xml.etree import ElementTree as ET 
     7    from xml.etree import cElementTree 
    88except ImportError: 
    99    try: 
    1010        # if you've installed it yourself it comes this way 
    11         import ElementTree as ET 
     11        import cElementTree 
    1212    except ImportError: 
    1313        # if you've egged it this is the way it comes 
    14         from elementtree import ElementTree as ET 
    15 #this is a fix to the  ElementTree namespace problem that namespaces are usually represented as ns0, ns1, ns2 etc. 
    16 #ET._namespace_map.update({'http://www.oceannet.org/mdip/xml': 'mdip', 'http://www.w3.org/1999/xlink':'xlink'}) 
     14        from ndgUtils.elementtree import cElementTree 
    1715 
    1816import os, sys, logging 
    19 from ETxmlView import loadET, nsdumb 
     17#from ETxmlView import loadET, nsdumb 
    2018import molesReadWrite as MRW 
    2119from ndgUtils.ndgObject import ndgObject 
     
    2826    @param  
    2927    ''' 
    30     documentTypes = ['MOLES', 'DIF', 'DC', 'MDIP', 'ISO19139'] 
     28    documentTypes = ['MOLES', 'DIF', 'DC', 'ISO19139']#, 'MDIP'] 
    3129         
    3230    def __init__(self, filename, ndg_dataprovider, datacentre_groups, datacentre_namespace, discovery_id, xq, docType): 
     
    6563        self.originalFormat = file(filename).read() 
    6664         
    67         # we use loadET to protect ourselves from scummy characters and unicode problems 
    68         # DO WE NEED TO DO THIS?? 
    69         self.correctedFormat = loadET(self.originalFormat) 
    70  
    71          
    7265        # initialise the various record fields 
    7366        self.db_id = None    # the DB ID of the record, for easy reference when it is created 
     
    146139        # Now do the transform 
    147140        os.putenv ('PATH', ':/usr/java/jdk1.5.0_03/bin:/usr/java/jdk1.5.0_03:/usr/java/jdk1.5.0_03/lib/tools.jar:/usr/local/WSClients/OAIBatch:/usr/local/exist-client/bin:/bin:/usr/bin:.') 
    148         xqCommand = "java -cp saxon9.jar net.sf.saxon.Query " + xqFile 
     141        xqCommand = "java -cp ./lib/saxon9.jar net.sf.saxon.Query " + xqFile + " !omit-xml-declaration=yes" 
    149142        logging.debug("Running saxon command: " + xqCommand) 
    150143        pipe = os.popen(xqCommand + " 2>&1") 
     
    152145        status = pipe.close() 
    153146 
    154         print output 
    155147        if status is not None: 
    156148            sys.exit("Failed at running the XQuery") 
     
    162154         
    163155        logging.info("Transform completed successfully") 
     156         
     157#        f=open(xQueryType + "_doc.xml", 'w') 
     158#        f.write(output) 
     159#        f.close() 
    164160             
    165161        return output 
     
    256252         
    257253 
    258     def listify(item): 
     254    def listify(self, item): 
    259255        '''  
    260256        listify checks if an item is a list, if it isn't it puts it  
     
    273269        Extract spatio temporal data from the original document 
    274270        ''' 
    275         ET._namespace_map.update({'http://ndg.nerc.ac.uk/moles': 'moles', 'http://www.w3.org/1999/xlink':'xlink'}) 
     271        #this is a fix to the  ElementTree namespace problem that namespaces are usually  
     272        # represented as ns0, ns1, ns2 etc. 
     273        #cElementTree._namespace_map.update({'http://ndg.nerc.ac.uk/moles': 'moles', 'http://www.w3.org/1999/xlink':'xlink'}) 
    276274        no_bbox = False 
    277275        no_dates = False 
     
    283281        self.enddate='noenddate' 
    284282         
     283        molesFile = self._molesDir + self._shortFilename 
     284         
    285285        dgMeta=MRW.dgMetadata() 
    286286        try: 
    287             dgMeta.fromXML(cElementTree.ElementTree(file=self.filename).getroot()) 
    288         except: 
    289             logging.warning("WARNING: Cannot parse the XML moles document %s. Will not process" %self.filename) 
     287            dgMeta.fromXML(cElementTree.ElementTree(file=molesFile).getroot()) 
     288            print dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage.dgSpatialCoverage.BoundingBox 
     289        except Exception, detail: 
     290            logging.warning("Cannot parse the XML moles document %s. Will not process" %molesFile) 
     291            logging.debug(detail) 
    290292            return 
     293         
    291294        try: 
    292             bbox_list=listify(dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage.dgSpatialCoverage.BoundingBox) 
    293         except: 
    294             logging.info("XML moles document " + self.filename + \ 
    295                 " does not contain a bounding box.") 
     295            bbox_list=self.listify(dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage.dgSpatialCoverage.BoundingBox) 
     296        except Exception, detail: 
     297            logging.info("XML moles document " + molesFile + " does not contain a bounding box.") 
     298            logging.debug(detail) 
    296299            no_bbox=True 
    297300 
     
    301304            print "enddate = %s" %dates.DateRangeEnd 
    302305        except: 
    303             logging.info("XML moles document " + self.filename + " does not contain temporal info.") 
     306            logging.info("XML moles document " + molesFile + " does not contain temporal info.") 
    304307            no_dates=True 
    305308 
    306309        if no_bbox and no_dates: 
    307             logging.info("XML moles document " + self.filename + " does not contain any spatiotemporal info.") 
     310            logging.info("XML moles document " + molesFile + " does not contain any spatiotemporal info.") 
    308311            return 
    309312 
     
    322325            bbox=bbox_list[0] 
    323326            try: 
    324                 west = bbox.LimitWest.strip() 
     327                self.west = self.parseCoord(bbox.LimitWest, 'W', 'E') 
    325328            except: 
    326                 print "ERROR:  Will not process File %s. Contains incorrect West bounding box limit." %self.filename 
     329                logging.error("Will not process File %s. Contains incorrect East bounding box limit." %molesFile) 
    327330                return 
    328             if west.endswith('E'): 
    329                 west=bbox.LimitWest.split('E')[0] 
    330             elif west.endswith('W'): 
    331                 if west.startswith('-'): 
    332                     west = bbox.LimitWest.split('W')[0] 
    333                 else: 
    334                     west = "-" +bbox.LimitWest.split('W')[0] 
     331             
    335332            try: 
    336                 float(west) 
     333                self.east = self.parseCoord(bbox.LimitEast, 'W', 'E') 
    337334            except: 
    338                 print "ERROR:  Will not process File %s. Contains incorrect West bounding box limit." %self.filename 
     335                logging.error("Will not process File %s. Contains incorrect East bounding box limit." %molesFile) 
    339336                return 
    340             self.west = west 
    341337             
    342338            try: 
    343                 east = bbox.LimitEast.strip() 
     339                self.north = self.parseCoord(bbox.LimitNorth, 'S', 'N') 
    344340            except: 
    345                 print "ERROR:  Will not process File %s. Contains incorrect East bounding box limit." %self.filename 
     341                logging.error("Will not process File %s. Contains incorrect North bounding box limit." %molesFile) 
    346342                return 
    347             if east.endswith('E'): 
    348                 east=bbox.LimitEast.split('E')[0] 
    349             elif east.endswith('W'): 
    350                 if east.startswith('-'): 
    351                     east = bbox.LimitEast.split('W')[0] 
    352                 else: 
    353                     east = "-" +bbox.LimitEast.split('W')[0] 
     343             
    354344            try: 
    355                 float(east) 
     345                self.south = self.parseCoord(bbox.LimitSouth, 'S', 'N') 
    356346            except: 
    357                 print "ERROR:  Will not process File %s. Contains incorrect East bounding box limit." %self.filename 
     347                logging.error("Will not process File %s. Contains incorrect South bounding box limit." %molesFile) 
    358348                return 
    359             self.east = east 
    360              
    361             try: 
    362                 north = bbox.LimitNorth.strip() 
    363             except: 
    364                 print "ERROR:  Will not process File %s. Contains incorrect North bounding box limit." %self.filename 
    365                 return 
    366             if north.endswith('N'): 
    367                 north=bbox.LimitNorth.split('N')[0] 
    368             elif north.endswith('S'): 
    369                 if north.startswith('-'): 
    370                     north = bbox.LimitNorth.split('S')[0] 
    371                 else: 
    372                     north = "-" +bbox.LimitNorth.split('S')[0] 
    373             try: 
    374                 float(north) 
    375             except: 
    376                 print "ERROR: Will not process File %s. Contains incorrect North bounding box limit." %self.filename 
    377                 return 
    378             self.north = north 
    379              
    380             try: 
    381                 south = bbox.LimitSouth.strip() 
    382             except: 
    383                 print "ERROR:  Will not process File %s. Contains incorrect South bounding box limit." %self.filename 
    384                 return 
    385             if south.endswith('N'): 
    386                 south=bbox.LimitSouth.split('N')[0] 
    387             elif south.endswith('S'): 
    388                 if south.startswith('-'): 
    389                     south = bbox.LimitSouth.split('S')[0] 
    390                 else: 
    391                     south = "-" +bbox.LimitSouth.split('S')[0] 
    392             try: 
    393                 float(south) 
    394             except: 
    395                 print "ERROR: Will not process File %s. Contains incorrect North bounding box limit." %self.filename 
    396                 return 
    397             self.south = south 
    398349 
    399350        logging.info("Spatial info: west= " + self.west + ",south " + self.south + ", east " + \ 
     
    401352        logging.info("Temporal info: startdate " + self.startdate + ", enddate " + self.enddate)  
    402353 
     354 
     355 
     356    def parseCoord(self, coordValue, minField, maxField): 
     357        ''' 
     358        Take a coordinate value extracted from a molefile bbox limit - together with  
     359        the appropriate max/min limits and extract the correct value from it 
     360        @param coordValue: the contents of the bbox limit tage 
     361        @param minField: the expected min field of the coord range - i.e. 'W' or 'S' 
     362        @param maxField: the expected max field of the coord range - i.e. 'E' or 'N' 
     363        @return: coord - the value of the coordinate as a string    
     364        ''' 
     365 
     366        coord = coordValue.strip() 
     367        if coord.endswith(maxField): 
     368            coord=coordValue.split(maxField)[0] 
     369        elif coord.endswith(minField): 
     370            if coord.startswith('-'): 
     371                coord = coordValue.split(minField)[0] 
     372            else: 
     373                coord = "-" + coordValue.split(minField)[0] 
     374 
     375        return '%s' % float(coord) 
    403376             
    404377    def hasNullCoords(): 
     378        ''' 
     379        Checks a record to determine whether it has any coordinates set to null 
     380        ''' 
    405381        if str(self.west)=='null' or \ 
    406382            str(self.south)=='null' or \ 
Note: See TracChangeset for help on using the changeset viewer.