Changeset 4414


Ignore:
Timestamp:
06/11/08 17:28:00 (11 years ago)
Author:
cbyrom
Message:

Add new code to allow bulk loading of data from eXist, to improve performance
during data ingest.

Location:
exist/trunk/python/ndgUtils/models
Files:
3 edited

Legend:

Unmodified
Added
Removed
  • exist/trunk/python/ndgUtils/models/existdbclient.py

    r4282 r4414  
    2121class eXistDBClient: 
    2222     
    23     def __init__(self, configFile = None, eXistDBHostname = None): 
     23    def __init__(self, configFile = None, eXistDBHostname = None, loadCollectionData=False): 
    2424        ''' 
    2525        Initialise a connection to the eXistDB 
     
    4848        # set up any collections required - NB, if these already exist they won't cause any files to be lost 
    4949        self._setUpEXistAtomCollections() 
     50         
     51        self.collections = None 
     52        if loadCollectionData: 
     53            self.collections = self.getAllAtomCollections() 
     54             
    5055        logging.info("eXist DB connection initialised") 
    5156 
     
    7681                self.xmldb.createCollection(col + type + ec.DE_COLLECTION_PATH) 
    7782                self.xmldb.createCollection(col + type + ec.DEPLOYMENT_COLLECTION_PATH) 
     83                self.xmldb.createCollection(col + type + ec.DEPLOYMENTS_COLLECTION_PATH) 
    7884                self.xmldb.createCollection(col + type + ec.GRANULE_COLLECTION_PATH) 
    7985        logging.info("Required collections available") 
     
    175181        @return: path to new backup file 
    176182        ''' 
     183        if not collection.endswith('/'): 
     184            collection += '/' 
     185             
    177186        docPath = collection + fileName 
    178187        logging.info("Backing up file, '%s', in eXist DB" %docPath) 
     
    256265             
    257266        self.createEXistFile(xml, collection, fileName) 
     267 
     268 
     269    def getAllAtomIDs(self): 
     270        ''' 
     271        Retrieve all the atom IDs in the atoms directory - NB, this can 
     272        be a quick way of producing a cache of data to check - e.g. to avoid 
     273        multiple calls to getAtomFileCollectionPath 
     274        @return: ids - array of all atom IDs 
     275        ''' 
     276        logging.info("Retrieving all atom ids") 
     277        xq = "declare default element namespace 'http://www.w3.org/2005/Atom'; \ 
     278            for $ID in collection(/db/atoms)/entry/id return <id>{tokenize(string($ID), '__ATOM__')[2]}</id>"#<entry>$DE/entry/id</entry>"#for $d in $DE/entry/id return data($d)"#$DE/entry/id" 
     279                 
     280        id, doc = self.xmldb.executeQuery(xq) 
     281         
     282        indices = range(doc['hits']) 
     283        ids = [] 
     284        for i in indices: 
     285            doc = self.xmldb.retrieve(id,i,{}) 
     286            docET = ET.fromstring(doc) 
     287            ids.append(docET.text) 
     288        logging.debug("Found ids, '%s'" %ids) 
     289        return ids 
     290 
     291 
     292    def getAllAtomCollections(self): 
     293        ''' 
     294        Get all atom collection paths and store in a dictionary - for easy 
     295        reference when doing lots of things at once 
     296        @return: dict with key/val of atomID/collectionPath 
     297        ''' 
     298        logging.info("Retrieving all atom collection paths") 
     299                 
     300        # NB, we get all data back in one field here since otherwise eXist complains 
     301        # that the returned dataset is too large and falls over 
     302        xq = "declare default element namespace 'http://www.w3.org/2005/Atom'; \ 
     303            for $DE in collection('/db/atoms')/entry/id let $f:=util:document-name($DE) return \ 
     304            <fileName>{util:collection-name($DE)}/{$f}</fileName>" 
     305 
     306        id, doc = self.xmldb.executeQuery(xq) 
     307        indices = range(doc['hits']) 
     308        colData = {} 
     309        for i in indices: 
     310            doc = self.xmldb.retrieve(id,i,{}) 
     311            docET = ET.fromstring(doc) 
     312            data = docET.text 
     313            key = data.split('/')[-1] 
     314            key = key.split('.')[0] 
     315            val = '/'.join(data.split('/')[0:-1]) 
     316            colData[key] = val 
     317        logging.debug("Finished looking up atom paths") 
     318        return colData 
    258319 
    259320 
     
    297358            isNew = True 
    298359            atom.setDatasetID(atom.atomTypeID + '_' + str(uuid.uuid1())) 
    299              
    300         eXistCollection = self.getAtomFileCollectionPath(atom.datasetID) 
     360 
     361        eXistCollection = None 
     362        if self.collections is not None: # cope with empty dict 
     363            eXistCollection = self.collections.get(atom.datasetID) 
     364        else: 
     365            eXistCollection = self.getAtomFileCollectionPath(atom.datasetID) 
    301366         
    302367        # if collection not found, assume we're dealing with a new atom; get its 
  • exist/trunk/python/ndgUtils/models/testexistdbclient.py

    r4229 r4414  
    9898        self.assertEquals(self.dbc.createEXistFile(data.testdata.xmlString, self.VALID_COLLECTION_PATH, self.VALID_FILE), True) 
    9999        self.dbc.getAtomFileCollectionPath(data.testdata.id) 
    100         #self.dbc.getAtomFileCollectionPath('tag:localhost:5000,2008-09-23:/view/neodc.nerc.ac.uk__ATOM__activity_11737124322917004') 
     100 
     101             
     102    def testGetAllAtomIDs(self): 
     103        self.tidyUp = True 
     104        self.assertEquals(self.dbc.createEXistFile(data.testdata.xmlString, self.VALID_COLLECTION_PATH, self.VALID_FILE), True) 
     105        ids = self.dbc.getAllAtomIDs() 
     106        self.assertNotEquals(0, len(ids)) 
     107        self.assertNotEquals(None, ids[0]) 
     108             
     109    def testGetAllAtomCollections(self): 
     110        self.tidyUp = True 
     111        self.assertEquals(self.dbc.createEXistFile(data.testdata.xmlString, self.VALID_COLLECTION_PATH, self.VALID_FILE), True) 
     112        ids = self.dbc.getAllAtomCollections() 
     113        self.assertNotEquals(0, len(ids)) 
     114        self.assertNotEquals(None, ids[0]) 
    101115 
    102116     
  • exist/trunk/python/ndgUtils/models/utilities.py

    r4209 r4414  
    1 import os, sys, logging, re, cgi, datetime 
     1import os, sys, logging, re, cgi 
    22from ndgUtils.ETxmlView import subAI 
    33#import csml.csmlscan as CsmlScan 
     
    3636            raise ValueError("Triple data has an extra '|' character in it (%s) - please fix and rerun" %tripleString) 
    3737        val = val.strip() 
    38         returnData[i] = val 
     38        returnData[i] = escapeSpecialCharacters(val) 
    3939        i += 1 
    4040     
     
    5353     
    5454    # the XMLCHARREFREPLACE does the required character replacement 
    55     #return correctedString.encode('ascii', 'xmlcharrefreplace') 
    56     return correctedString.encode('utf-8') 
     55    return correctedString.encode('ascii', 'xmlcharrefreplace') 
     56    #return correctedString.encode('utf-8') 
    5757     
    5858 
Note: See TracChangeset for help on using the changeset viewer.