source: exist/trunk/python/ndgUtils/models/existdbclient.py @ 4493

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/exist/trunk/python/ndgUtils/models/existdbclient.py@4493
Revision 4493, 18.5 KB checked in by cbyrom, 12 years ago (diff)

Add new keyword to avoid setting up the eXist DB unless explicitly requested.

Line 
1'''
2 Class supporting set up and communication with eXist DB
3 for the purposes of creating and updating atoms
4 
5 @author: C Byrom - Tessella 08
6'''
7import os, sys, logging, datetime
8from ndgUtils.eXistInterface import ndg_eXist
9from ndgUtils.eXistConnector import eXistConnector as ec
10from ndgUtils.ndgXqueries import ndgXqueries
11import uuid
12
13try:
14    from xml.etree import ElementTree as ET
15except ImportError:
16    try:
17        import ElementTree as ET
18    except ImportError:
19        import elementtree.ElementTree as ET
20
class eXistDBClient:
    '''
    Client wrapping an ndg_eXist connection; provides helpers to create,
    look up, back up, validate and delete atom documents in an eXist DB.
    '''

    def __init__(self, configFile = None, eXistDBHostname = None,
                 loadCollectionData=False, setUpDB=False):
        '''
        Initialise a connection to the eXistDB
        @keyword configFile: config file to use in setting up DB
        @keyword existDBHostname: name of eXist DB to use - if not specified, the first
        host in the config file is used
        @keyword loadCollectionData: preload info on all the eXist collections, if True (default False)
        @keyword setUpDB: if True, create the basic collection structure and ingest the
        atom schemas.  Default is False.
        @raise ValueError: if a config file is needed to determine the host but
        it is missing or does not hold at least three entries
        '''
        logging.info("Initialising connection to eXist DB")
        self.eXistDBHostname = eXistDBHostname
        logging.debug("- connecting to DB, '%s', with config file, '%s'",
                      eXistDBHostname or 'Default', configFile or 'Default')

        self.atomSchema = None
        self.collections = None

        # NB, there are two routes through here: if a config file is specified
        # without a hostname, the host will be taken to be the first entry in
        # the config file; if a hostname is specified, it will be used explicitly
        inputs = {}
        if configFile:
            if not self.eXistDBHostname:
                self.__loadDBDetails(configFile)
            inputs['passwordFile'] = configFile

        if self.eXistDBHostname:
            inputs['db'] = self.eXistDBHostname

        # Now set up the connection
        logging.debug(inputs)
        self.xmldb = ndg_eXist(**inputs)

        if setUpDB:
            # set up any collections required - NB, if these already exist they won't cause any files to be lost
            self.__setUpEXistAtomCollections()

            # add the schema required for atom validation
            self.__addAtomSchema()

        if loadCollectionData:
            self.collections = self.getAllAtomCollections()

        logging.info("eXist DB connection initialised")


    def __getSchema(self):
        '''
        Return the eXist path of the atom/MOLES schema, working it out on
        first use and caching it in self.atomSchema
        '''
        logging.debug("Getting atom schema data")
        if not self.atomSchema:
            self.atomSchema = ec.BASE_COLLECTION_PATH + \
                ndgXqueries.ATOM_MOLES_SCHEMA  + '.xsd'

        return self.atomSchema

    AtomSchema = property(fget=__getSchema, doc="Atom schema path")


    def __joinPath(self, collection, fileName):
        '''
        Join a collection path and a file name with exactly one '/' between
        them, whether or not the collection already ends with a slash
        @param collection: collection path, with or without trailing '/'
        @param fileName: name of file within the collection
        @return: full document path
        '''
        if not collection.endswith('/'):
            collection += '/'
        return collection + fileName


    def createCollections(self, collections):
        '''
        Create the specified collections in eXist
        @param collections: array of collections to create
        @return True if successful
        '''
        logging.info("Setting up eXist collections")
        for col in collections:
            logging.debug("Creating collection, '%s'", col)
            self.xmldb.createCollection(col)
        logging.info("All collections set up")
        return True


    def checkAtomSchemaCompliance(self, atomPath, atom = None):
        '''
        Validate the specified atom in eXist with the atom schemae in eXist
        @param atomPath: path to the atom in eXist
        @keyword atom: if set to an atom, this will be created temporarily in eXist
        - since it may not already exist there.  Once validation is completed, the
        file will be removed from eXist.
        @return array: containing any errors found - NB, if an empty array is returned,
        this indicates successful validation
        @raise SystemError: if the validation query returns no result or more
        than one result
        '''
        logging.info("Validating atom, '%s' against schemae in eXist", atomPath)

        if atom:
            logging.info("Creating temporary file in eXist to do validation against")
            fileName = atom.datasetID + str(datetime.datetime.today().microsecond)
            collection = atom.getDefaultCollectionPath()
            self.createEXistFile(atom.toPrettyXML(), collection, fileName)
            # createEXistFile always puts a '/' between collection and file
            # name, so make sure the path validated (and later deleted) has one too
            atomPath = self.__joinPath(collection, fileName)

        validationQuery = 'validation:validate-report("' + atomPath + \
            '", xs:anyURI("' + self.AtomSchema + '"))'
        try:
            queryID, result = self.xmldb.executeQuery(validationQuery)
        finally:
            # always tidy up the temporary file - even if the query blew up
            if atom:
                logging.info("Deleting temporary file in eXist")
                self.deleteEXistFile(atomPath)

        errorMessage = None
        if result['hits'] == 0:
            errorMessage = "Validation did not complete successfully - please retry"
        elif result['hits'] > 1:
            errorMessage = "More than one atom was validated - expecting only a single atom validation - please retry"

        if errorMessage:
            logging.error(errorMessage)
            raise SystemError(errorMessage)

        doc = self.xmldb.retrieve(queryID, 0)
        et = ET.fromstring(doc)
        status = et.findtext('status')

        # retrieve the error detail if invalid
        errors = []
        if status == 'invalid':
            logging.info("Atom is invalid - details as follows:")
            for error in et.findall('message'):
                lineNo = error.attrib.get('line')
                colNo = error.attrib.get('column')
                level = error.attrib.get('level')
                repeat = error.attrib.get('repeat')
                errorMessage = "%s at line %s, column %s: %s" %(level, lineNo, colNo, error.text)
                if repeat:
                    errorMessage += " (%s times)" %repeat
                errors.append(errorMessage)
                logging.info(errorMessage)
        else:
            logging.info("Atom is valid")

        logging.info("Validation complete")
        return errors


    def __setUpEXistAtomCollections(self):
        '''
        Set up the required eXist collections needed for running the granulator script
        '''
        logging.info("Ensuring required collections are available in eXist")
        stateCollections = [ec.OLD_COLLECTION_PATH, ec.PUBLISHED_COLLECTION_PATH,
                            ec.SMALL_P_PUBLISHED_COLLECTION_PATH, ec.WORKING_COLLECTION_PATH]
        dataCollections = [ec.DE_COLLECTION_PATH, ec.DEPLOYMENT_COLLECTION_PATH,
                           ec.DEPLOYMENTS_COLLECTION_PATH, ec.GRANULE_COLLECTION_PATH]

        for col in [ec.BASE_COLLECTION_PATH, ec.BACKUP_COLLECTION_PATH]:
            # the top level collection only needs creating once
            self.xmldb.createCollection(col)
            for stateCol in stateCollections:
                self.xmldb.createCollection(col + stateCol)
                for dataCol in dataCollections:
                    self.xmldb.createCollection(col + stateCol + dataCol)
        logging.info("Required collections available")


    def __addAtomSchema(self):
        '''
        Add the required atom schema to the atoms collection - to allow validation
        of input atoms
        '''
        logging.info("Adding atom schema to eXist")
        xq = ndgXqueries()
        for schema in [xq.ATOM_SCHEMA, xq.MOLES_SCHEMA, xq.ATOM_MOLES_SCHEMA]:
            xml = xq.getSchema(schema)
            self.createEXistFile(xml, ec.BASE_COLLECTION_PATH, schema + '.xsd')
        logging.info("- schema added")


    def __loadDBDetails(self, configFile):
        '''
        Retrieve info from the eXist db config file; the first three
        whitespace separated entries are stored as the host name, user name
        and password, respectively
        @param configFile: path to the config file
        @raise ValueError: if the file cannot be found or has fewer than three entries
        '''
        logging.info("Loading DB config data")
        # Check this file exists
        if not os.path.isfile(configFile):
            errorMessage = "Could not find the DB config file, %s; please make sure this " \
                     "is available from the running directory" %configFile
            logging.error(errorMessage)
            raise ValueError(errorMessage)

        dbinfo_file = open(configFile, "r")
        try:
            dbinfo = dbinfo_file.read().split()
        finally:
            # don't leak the file handle, whatever happens during the read
            dbinfo_file.close()

        if len(dbinfo) < 3:
            errorMessage = 'Incorrect data in DB config file'
            logging.error(errorMessage)
            raise ValueError(errorMessage)
        self.eXistDBHostname = dbinfo[0]
        self._username = dbinfo[1]
        self._pw = dbinfo[2]
        logging.info("DB config data loaded")


    def __lookupEXistFile(self, docPath):
        '''
        Look up a file in eXist using XPath
        @param docPath: path to doc to look up
        @return: id returned from query, with which to retrieve doc; if doc doesn't exist, return None
        '''
        logging.info("Retrieving info for file, '%s'", docPath)

        resultID, summary = self.xmldb.executeQuery('doc("' + docPath + '")')

        if summary['hits'] == 0:
            logging.info("File does not exist in eXist DB")
            return None
        logging.info("Found file - returning result ID")
        return resultID


    def getEXistFile(self, docPath):
        '''
        Use XQuery to retrieve the specified document from eXist
        @param docPath: the path of the doc to retrieve
        @return: contents of document if exists, None otherwise
        '''
        resultID = self.__lookupEXistFile(docPath)

        # NB, compare with None explicitly - a result ID of 0 is a valid ID
        if resultID is None:
            logging.info("No file found - nothing to retrieve")
            return None

        logging.info("Found file - now retrieving content")
        return self.xmldb.retrieve(resultID, 0)


    def isNewEXistFile(self, docPath):
        '''
        Check whether a file is new, i.e. not yet available in the eXist DB
        @param docPath: path of file in eXist to check
        @return: True if the file does not exist in eXist, False otherwise
        '''
        logging.info("Checking if file, '%s', exists in eXist DB", docPath)

        # NB, compare with None explicitly - a result ID of 0 is a valid ID
        return self.__lookupEXistFile(docPath) is None


    def __addTimeStamp(self, fileName):
        '''
        Add timestamp to input filename
        NB, this assumes there is a file type identifier at the end of the filename; if so, the datestamp
        is included before this; if not it is just added at the end
        @param fileName: name of file to add the timestamp to
        @return: file name including the timestamp
        '''
        bits = fileName.rsplit(".", 1)
        fileName = bits[0] + "_" + datetime.datetime.today().strftime("%Y-%m-%dT%H_%M_%S")

        if len(bits) > 1:
            fileName += "." + bits[1]
        return fileName


    def backupEXistFile(self, collection, fileName):
        '''
        Backup a file that exists in the eXist DB
        - NB, this really just creates a new file with the same contents in a
        backup dir
        @param collection: path of the collection the file is currently stored in
        @param fileName: name of file in eXist to back up
        @return: path to new backup file
        @raise SystemError: if the contents of the file cannot be retrieved
        '''
        if not collection.endswith('/'):
            collection += '/'

        docPath = collection + fileName
        logging.info("Backing up file, '%s', in eXist DB", docPath)

        logging.debug("Firstly, retrieve file contents from eXist")
        doc = self.getEXistFile(docPath)
        if not doc:
            errorMessage = "Could not retrieve file contents (%s) to backup - exiting." %docPath
            logging.error(errorMessage)
            raise SystemError(errorMessage)

        # Now adjust the collection to map to the backup dir
        collection = collection.replace(ec.BASE_COLLECTION_PATH, ec.BACKUP_COLLECTION_PATH)
        collection = collection.replace(ec.NDG_A_COLLECTION_PATH, ec.NDG_A_COLLECTION_PATH_BACKUP)

        # add timestamp to filename
        fileName = self.__addTimeStamp(fileName)
        docPath = collection + fileName

        logging.debug("Now creating backup file, '%s'", fileName)
        self.createEXistFile(doc, collection, fileName)

        logging.info("File backed up in eXist")
        return docPath


    def createEXistFile(self, xml, collection, fileName):
        '''
        Add the input file to the eXist DB
        @param xml: contents of xml file to create in eXist
        @param collection: path of the collection to store the file in
        @param fileName: name of file to add in eXist
        @return: True, if file created successfully
        @raise SystemError: if the store command does not report success
        '''
        logging.info("Adding file, '%s' to eXist DB collection, '%s'",
                     fileName, collection)
        logging.debug("data: %s", xml)

        # create the collection, in case it doesn't already exist - NB, this won't overwrite anything
        self.createCollections([collection])
        # NOTE(review): a '/' is always inserted here, so a collection passed
        # in with a trailing slash gives a '//' in the path - left unchanged
        # since existing callers rely on the current behaviour
        status = self.xmldb.storeXML(xml, collection + "/" + fileName, overwrite=1)
        if not status:
            errorMessage = "Command to create file in eXist did not complete successfully - exiting"
            logging.error(errorMessage)
            raise SystemError(errorMessage)

        logging.info("File added to eXist")
        return True


    def deleteEXistFile(self, docPath):
        '''
        Delete the input file from eXist DB
        @param docPath: path of document to delete
        @return: True, if file deleted successfully
        @raise SystemError: if the delete command does not report success
        '''
        logging.info("Deleting file, '%s', from eXist DB", docPath)

        status = self.xmldb.removeDoc(docPath)
        if not status:
            errorMessage = "Command to delete file in eXist did not complete successfully - exiting"
            logging.error(errorMessage)
            raise SystemError(errorMessage)

        logging.info("File deleted from eXist")
        return True


    def createOrUpdateEXistFile(self, xml, collection, fileName):
        '''
        Check if a file already exists in eXist; if it does, run an
        update (which will backup the existing file), otherwise create
        the file in eXist
        @param xml: contents of xml file to create/update in eXist
        @param collection: path of the collection to store the file in
        @param fileName: name of file to add in eXist
        '''
        logging.info("Creating or updating file in eXist...")
        # NB, the collection may or may not have a trailing slash - so join
        # carefully, otherwise an existing file is missed and never backed up
        if not self.isNewEXistFile(self.__joinPath(collection, fileName)):
            self.backupEXistFile(collection, fileName)

        self.createEXistFile(xml, collection, fileName)


    def __getAtomList(self):
        '''
        Run the 'atomList' xquery over /db/atoms and return the child elements
        of the resulting document
        @return: array of elements, one per entry; empty if the query has no hits
        '''
        xq = ndgXqueries().actual('atomList', '/db/atoms', '', '')
        queryID, summary = self.xmldb.executeQuery(xq)
        if summary['hits'] == 0:
            return []

        doc = self.xmldb.retrieve(queryID, 0)
        return list(ET.fromstring(doc))


    def getAllAtomIDs(self):
        '''
        Retrieve all the atom IDs in the atoms directory - NB, this can
        be a quick way of producing a cache of data to check - e.g. to avoid
        multiple calls to getAtomFileCollectionPath
        @return: ids - array of all atom IDs
        '''
        logging.info("Retrieving all atom ids")
        ids = [member.findtext('{http://www.w3.org/2005/Atom}repositoryID')
               for member in self.__getAtomList()]
        logging.debug("Found ids, '%s'", ids)
        return ids


    def getAllAtomCollections(self):
        '''
        Get all atom collection paths and store in a dictionary - for easy
        reference when doing lots of things at once
        @return: dict with key/val of atomID/collectionPath - NB, the key is the
        file name up to its first '.', and the path has no trailing slash; an empty
        dict is returned if no atoms are found
        '''
        logging.info("Retrieving all atom collection paths")
        colData = {}
        for member in self.__getAtomList():
            filePath = member.findtext('{http://www.w3.org/2005/Atom}fileName')
            if not filePath:
                # nothing to derive a collection from - skip rather than fall over
                logging.warning("Skipping atom list entry with no fileName")
                continue
            pathBits = filePath.split('/')
            atomID = pathBits[-1].split('.')[0]
            colData[atomID] = '/'.join(pathBits[:-1])

        logging.debug("Finished looking up atom paths")
        return colData


    def getAtomFileCollectionPath(self, atomID):
        '''
        Given an atom id, determine and return the collection path in eXist
        of the associated atom file
        @param atomID: atom id to look up
        @return: collection path (with trailing slash), if it exists, None, otherwise
        '''
        logging.info("Looking up collection path for atom ID, '%s'", atomID)
        xq = ndgXqueries()['atomFullPath']
        xq = xq.replace('TargetCollection', ec.BASE_COLLECTION_PATH)
        xq = xq.replace('LocalID', atomID)

        queryID, summary = self.xmldb.executeQuery(xq)
        if summary['hits'] == 0:
            logging.info("No document found with the specified ID")
            return None

        doc = self.xmldb.retrieve(queryID,0,{})

        docET = ET.fromstring(doc)
        collPath = docET.text + '/'
        logging.debug("Found collection path, '%s'", collPath)
        return collPath


    def createAtomInExist(self, atom):
        '''
        Create an atom in the eXist DB; if the atom is already there, the
        existing file is backed up first and the atom's updated date is reset
        @param atom: atom object to create in the DB
        @return: the atom that was created
        '''
        logging.info("Creating atom in eXist")

        # if the atom has no dataset ID, generate and add one
        # NB, this should only be the case when the atom is being created
        # via the web interface
        isNew = False
        if not atom.datasetID:
            isNew = True
            atom.setDatasetID(atom.atomTypeID + '_' + str(uuid.uuid1()))

        eXistCollection = None
        if self.collections is not None: # cope with empty dict
            eXistCollection = self.collections.get(atom.datasetID)
        else:
            eXistCollection = self.getAtomFileCollectionPath(atom.datasetID)

        # if collection not found, assume we're dealing with a new atom; get its
        # default collection
        if not eXistCollection:
            eXistCollection = atom.getDefaultCollectionPath()
        elif isNew:
            # in this situation we're trying to create an atom with the same
            # name via the web interface - this can't be allowed - so retry to
            # generate a new ID; NB, pass back the result so the caller always
            # gets the atom returned
            atom.datasetID = None
            return self.createAtomInExist(atom)
        # create backup of atom if it already exists
        else:
            self.backupEXistFile(eXistCollection, atom.atomName)

            # also change updated date to current time
            atom.updatedDate = datetime.datetime.today().strftime("%Y-%m-%dT%H:%M:%SZ")

        self.createEXistFile(atom.toPrettyXML(), eXistCollection, atom.atomName)
        logging.info("Atom created in eXist")
        return atom
Note: See TracBrowser for help on using the repository browser.