Changeset 3816


Ignore:
Timestamp:
23/04/08 09:55:42 (11 years ago)
Author:
cbyrom
Message:

Add workflow to allow creation of moles doc from whatever starting point
when creating PostgresRecord; add logging support, extend logging and
tidy up code.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • TI01-discovery/branches/ingestAutomation-upgrade/OAIBatch/PostgresRecord.py

    r3809 r3816  
    1616#ET._namespace_map.update({'http://www.oceannet.org/mdip/xml': 'mdip', 'http://www.w3.org/1999/xlink':'xlink'}) 
    1717 
     18import os, sys, logging 
    1819from ETxmlView import loadET, nsdumb 
    1920import molesReadWrite as MRW 
    20 from ndgUtils.ndgXqueries import ndgXqueries 
    21 import os, sys 
    22 from ndgUtils.eXistInterface import ndg_eXist 
    23 from ndgUtils.ndgObject import ndgObject 
    2421 
    2522class PostgresRecord: 
     
    2724    Class representing the contents of a row in the metadata_record postgres DB table 
    2825    @param filename: Name of file to use a metadata record 
    29     @param ndgDataProvider: If True, data has come from a NDG dataprovider, otherwise False   
     26    @param  
    3027    ''' 
    31     documentTypes = ['DIF', 'MOLES', 'DC', 'MDIP', 'ISO19139'] 
    32          
    33     def __init__(self, filename, ndgDataProvider, targetCollection, datacentre_namespace, discovery_id, xq, docType): 
    34         self.filename = filename    # TODO: not sure we need to store this? 
    35         self._targetColection = targetCollection 
     28    documentTypes = ['MOLES', 'DIF', 'DC', 'MDIP', 'ISO19139'] 
     29         
     30    def __init__(self, filename, datacentre_groups, datacentre_namespace, discovery_id, xq, docType): 
     31        logging.info("Setting up Postgres record for file, " + filename) 
     32        self.filename = filename 
     33        self._datacentre_groups = datacentre_groups 
    3634        self._repository = datacentre_namespace 
    3735        self.discovery_id = discovery_id 
    3836        self._xq = xq 
    3937        self.docType = docType 
    40          
     38 
     39        self._molesFormat = None    # initialise this, so we can guarantee a value - to avoid using getattr 
    4140        self._allDocs = []  # array to store all the transformed docs - for easy retrieval by the DAO 
    42  
    43 #        if ndgDataProvider: 
    44 #            discObj=ndgObject(discovery_id) 
    45 #            self._repository = discObj.repository 
    46 #            self.discovery_id = discObj.localID 
    4741 
    4842        # firstly load contents of file 
     
    5549 
    5650        #debugging stuff 
    57         print "vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv" 
    58         print self.correctedFormat 
     51#        self.logger.printOutput("vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv") 
     52#        print self.correctedFormat 
    5953#        print self.originalFormat.keys() 
    6054#        for i in self.originalFormat: print i.tag 
    6155#        print dir(self.originalFormat) 
    62         print "vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv" 
     56#        self.logger.printOutput("vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv") 
    6357        #we use nsdumb in case the namespace causes difficulties ... 
    6458#        helper=nsdumb(self.originalFormat) 
     
    7468        self.iso19139Format = None 
    7569         
     70        # do some initial setting up of record 
     71#        self.setUpRecord() 
     72        self.doRecordTransforms() 
    7673        self.getSpatioTemporalData() 
    77         self.doRecordTransforms() 
    78  
    79  
    80     def doTransform(xQueryType): 
     74 
     75     
     76    def doRecordTransforms(self): 
     77        ''' 
     78        Run various transforms on the original doc, to populate the record with 
     79        the other types of doc used elsewhere 
     80        ''' 
     81        logging.info("Running transforms for all document types") 
     82        for docType in self.documentTypes: 
     83            self.getDocumentFormat(docType) 
     84        logging.info("Transforms complete") 
     85 
     86 
     87    def doTransform(self, xQueryType): 
    8188        ''' 
    8289        Transform the record according to the specified XQuery type 
     
    8491        @return: the metadata record in the required transformed format  
    8592        ''' 
    86         xquery = self._xq.actual(xQueryType, self._targetCollection, self._datacentre_namespace, self.discovery_id) 
    87  
    88         # and then sort out the input ID stuff 
     93        logging.info("Running XQuery transform, " + xQueryType + " to create transformed document") 
     94 
     95        # get the query and set this up to use properly 
     96        xquery = self._xq.actual(xQueryType, self.filename, self._repository, self.discovery_id) 
     97 
     98        # sort out the input ID stuff 
    8999        xquery=xquery.replace('Input_Entry_ID', self.discovery_id) 
    90         xquery=xquery.replace('repository_localid', self._datacentre_namespace) 
     100        xquery=xquery.replace('repository_localid', self._repository) 
     101 
     102        # write the query to file, to make it easier to input 
     103        # NB, running directly at the command line leads to problems with the interpretation of $ characters 
     104        xqFile = "currentQuery.xq" 
     105        f=open(xqFile,'w') 
     106        f.write(xquery) 
     107        f.close() 
    91108 
    92109        # Now do the transform 
    93         xqCommand = "java -cp /home/users/cbyrom/opt/saxonsa/saxon9sa.jar:/home/users/cbyrom/opt/saxonsa net.sf.saxon.Query -sa -t -s " + \ 
    94             self._filename + " /home/users/cbyrom/tmp/ingestAutomation/OAIBatch/xquery/dif2moles.xq" #\"{" + xquery + "}\"" 
    95         print 'HHHHHHHHH' 
    96         print xqCommand 
    97         status = os.system(xqCommand) 
    98         if status !=0: 
    99             sys.exit("Failed at running the XQuery")#, %s" %xqCommand) 
    100  
    101         print "INFO: Running XQuery transform to create minimoles document" 
    102         molesid, s = xmldb.executeQuery(xquery) 
    103         return xmldb.retrieve(molesid,0) 
    104  
    105  
    106     def getDocumentFormat(docType): 
     110#        xqCommand = "java -cp /home/users/cbyrom/opt/saxonsa/saxon9sa.jar:/home/users/cbyrom/opt/saxonsa net.sf.saxon.Query " + \ 
     111        os.putenv ('PATH', ':/usr/java/jdk1.5.0_03/bin:/usr/java/jdk1.5.0_03:/usr/java/jdk1.5.0_03/lib/tools.jar:/usr/local/WSClients/OAIBatch:/usr/local/exist-client/bin:/bin:/usr/bin:.') 
     112        xqCommand = "java -cp saxon9.jar net.sf.saxon.Query " + xqFile 
     113        logging.debug("Running saxon command: " + xqCommand) 
     114        pipe = os.popen(xqCommand + " 2>&1") 
     115        output = pipe.read() 
     116        status = pipe.close() 
     117 
     118        print output 
     119        print "ss,", status 
     120        if status is not None: 
     121            sys.exit("Failed at running the XQuery") 
     122 
     123        # now remove the temp xquery file 
     124        status = os.unlink(xqFile) 
     125        if status is not None: 
     126            sys.exit("Failed to remove the temporary xquery file, " + xqFile) 
     127         
     128        logging.info("Transform completed successfully") 
     129             
     130        return output 
     131 
     132 
     133    def doMolesTransform(self): 
     134        ''' 
     135        Set up the basic moles doc - according to the type of document first ingested 
     136        ''' 
     137        logging.info("Creating moles document - for use with other transforms") 
     138        xqName = None 
     139        if self.docType == "DIF": 
     140            xqName = "dif2moles" 
     141        elif self.docType == "MDIP": 
     142            xqName = "mdip2moles" 
     143        else: 
     144            sys.exit("ERROR: No XQuery exists to transform input document type, %s, into moles format" \ 
     145                     %self.docType) 
     146 
     147        # add keywords, if required 
     148        if self._datacentre_groups != "": 
     149            addKeywords() 
     150 
     151        # now run the appropriate transform and set the attribute 
     152        setattr(self, "_molesFormat", self.doTransform(xqName)) 
     153        logging.info("moles document created") 
     154         
     155 
     156    def addKeywords(self): 
     157        ''' 
     158        If datacentre groups have been specified, these need to be added as keywords 
     159        - NB, this is rather clumsy approach but uses old code to achieve the result 
     160        ''' 
     161        logging.info("Adding datacentre keywords to moles file") 
     162        # NB, use temporary directories to do the keyword additions 
     163        tmpDir = os.getcwd() + "/" 
     164        tmpKeywordsDir = os.getcwd() + "/kewordsAdded/" 
     165        fileUtils = FileUtilities(self.logger) 
     166        fileUtils.setUpDir(tmpDir) 
     167        fileUtils.setUpDir(tmpKeywordsDir) 
     168        tmpFile = 'tmpFile.xml' 
     169        f=open(tmpDir + "/" + tmpFile,'w') 
     170        f.write(self._molesFormat) 
     171        f.close() 
     172 
     173        keywordAdder.main(tmpDir, tmpKeywordsDir, self.datacentre_groups) 
     174 
     175        # Now load in the converted file 
     176        f=open(tmpKeywordsDir + "/" + tmpFile, 'r') 
     177        self._molesFormat = f.read() 
     178        f.close 
     179         
     180        # Finally, tidy up temp dirs 
     181        fileUtils.cleanDir(tmpDir) 
     182        fileUtils.clearDir(tmpKeywordsDir) 
     183        logging.info("Completed adding keywords") 
     184         
     185 
     186    def getDocumentFormat(self, docType): 
    107187        ''' 
    108188        Lookup document format; if it is already defined then return it, else do the required XQuery 
    109         transform 
     189        transform.  NB, transforms are ran on the molesFormat document - so ensure this is available 
    110190        @param docType: format of document to return  
    111191        ''' 
    112         print "INFO: Retrieving document type, %s" %docType 
     192        logging.info("Retrieving document type, " + docType) 
    113193        xqName = {'DIF':'moles2dif', 'MOLES':'moles', 'DC':'moles2DC', 'MDIP':'moles2mdip', 'ISO19139':'moles2iso19139'}[docType] 
    114194        attributeName = {'DIF':'_difFormat', 'MOLES':'_molesFormat', 'DC':'_dcFormat', 'MDIP':'_mdipFormat', 'ISO19139':'_iso19139Format'}[docType] 
    115195         
     196        # check we have the moles format available; if not create it 
     197        if self._molesFormat is None: 
     198            self.doMolesTransform() 
     199         
    116200        # check the document isn't already defined 
    117         doc = getattr(self, attributeName) 
    118         if doc is not None: 
    119             return doc 
     201        try: 
     202            doc = getattr(self, attributeName) 
     203            if doc is not None: 
     204                logging.info("Found existing document - returning this now") 
     205                return doc 
     206        except: 
     207            logging.info("Creating new transformed document") 
    120208 
    121209        # the doc type doesn't exist - so run the xquery 
    122         setattr(self, attributeName, doTransform(xqName)) 
     210        setattr(self, attributeName, self.doTransform(xqName)) 
    123211         
    124212     
    125     def getAllDocs(): 
     213    def getAllDocs(self): 
    126214        ''' 
    127215        Return a list of all the available doc types in the record 
     
    133221            self._allDocs.append([docType, getDocumentFormat(docType)]) 
    134222        return self._allDocs 
    135      
    136     def doRecordTransforms(self): 
    137         ''' 
    138         Run various transforms on the original doc, to populate the record with 
    139         the other types of doc used elsewhere 
    140         ''' 
    141         for docType in documentTypes: 
    142             getDocumentFormat(docType) 
    143223         
    144224 
     
    174254            dgMeta.fromXML(cElementTree.ElementTree(file=self.filename).getroot()) 
    175255        except: 
    176             print "WARNING: Cannot parse the XML moles document %s. Will not process" %self.filename 
     256            logging.warning("WARNING: Cannot parse the XML moles document %s. Will not process" %self.filename) 
    177257            return 
    178258        try: 
    179259            bbox_list=listify(dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage.dgSpatialCoverage.BoundingBox) 
    180260        except: 
    181             print "INFO: XML moles document %s does not contain a bounding box." %self.filename 
     261            logging.info("XML moles document " + self.filename + \ 
     262                " does not contain a bounding box.") 
    182263            no_bbox=True 
    183264 
     
    187268            print "enddate = %s" %dates.DateRangeEnd 
    188269        except: 
    189             print "INFO: XML moles document %s does not contain temporal info." %self.filename 
     270            logging.info("XML moles document " + self.filename + " does not contain temporal info.") 
    190271            no_dates=True 
    191272 
    192273        if no_bbox and no_dates: 
    193             print "INFO: XML moles document %s does not contain any spatiotemporal info." %self.filename 
     274            logging.info("XML moles document " + self.filename + " does not contain any spatiotemporal info.") 
    194275            return 
    195276 
     
    283364            self.south = south 
    284365 
    285         print "Spatial info: west= %s,south %s, east %s, north %s" \ 
    286             %(self.west,self.south,self.east,self.north) 
    287         print "Temporal info: startdate %s, enddate %s" %(self.startdate, self.enddate) 
     366        logging.info("Spatial info: west= " + self.west + ",south " + self.south + ", east " + \ 
     367                    self.east + ", north " + self.north + "") 
     368        logging.info("Temporal info: startdate " + self.startdate + ", enddate " + self.enddate)  
    288369 
    289370             
Note: See TracChangeset for help on using the changeset viewer.