source: exist/trunk/python/ndgUtils/lib/existdbclient.py @ 4679

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/exist/trunk/python/ndgUtils/lib/existdbclient.py@4679
Revision 4679, 21.2 KB checked in by cbyrom, 11 years ago (diff)

Extend granulite to allow command line operation - with input options
to specify the logging level and a 'replace atom' mode, in which users
are asked on the command line whether they want to replace duplicated data.

Line 
1'''
2 Class supporting set up and communication with eXist DB
3 for the purposes of creating and updating atoms
4 
5 @author: C Byrom - Tessella 08
6'''
7import os, sys, logging, datetime, uuid
8from ndgUtils.models.Atom import Atom
9from ndgUtils.eXistConnector import eXistConnector as ec
10from ndgUtils.ndgXqueries import ndgXqueries
11from ndgUtils import DocumentRetrieve as DR
12
13try:
14    from xml.etree import ElementTree as ET
15except ImportError:
16    try:
17        import ElementTree as ET
18    except ImportError:
19        import elementtree.ElementTree as ET
20
21from threading import Thread
22
23
class DuplicateError(Exception):
    """
    Error signalling that a duplicated atom doc has been discovered.

    The message is written to the error log when the exception object is
    constructed - i.e. before it is actually raised.
    """
    def __init__(self, msg):
        # record the problem at construction time, however the exception
        # is subsequently handled
        logging.error(msg)
        super(DuplicateError, self).__init__(msg)
31
class backingUpThread(Thread):
    """
    Thread which writes one document into eXist by calling
    createEXistFile() on the client object it was given.
    """

    def __init__(self, existClient, doc, collection, fileName):
        '''
        @param existClient: object providing createEXistFile(doc, collection, fileName)
        @param doc: contents of the document to store
        @param collection: collection to store the document in
        @param fileName: name to store the document under
        '''
        logging.info("Setting up thread to run backup for file, '%s'" % fileName)
        Thread.__init__(self)
        # keep hold of everything run() needs
        self.fileName = fileName
        self.collection = collection
        self.doc = doc
        self.ec = existClient
        logging.info("- finished setting up thread")

    def run(self):
        '''
        Store the document via the client; invoked by Thread.start()
        '''
        logging.info("Running thread to perform backup of file, '%s'" % self.fileName)
        client = self.ec
        client.createEXistFile(self.doc, self.collection, self.fileName)
        logging.info("- finished backing up file")
47       
48
49class eXistDBClient:
50   
51    def __init__(self, configFile = None, eXistDBHostname = None, \
52                 loadCollectionData=False, setUpDB = False):
53        '''
54        Initialise a connection to the eXistDB
55        @keyword configFile: config file to use in setting up DB
56        @keyword existDBHostname: name of eXist DB to use - if not specified, the first
57        host in the config file is used
58        @keyword loadCollectionData: preload info on all the eXist collections, if True (default False)
59        @keyword setUpDB: if True, create the basic collection structure and ingest the
60        atom schemas.  Default is False.
61        '''
62        logging.info("Initialising connection to eXist DB")
63        self.eXistDBHostname = eXistDBHostname
64        logging.debug("- connecting to DB, '%s', with config file, '%s'" \
65                      %(eXistDBHostname or 'Default', configFile or 'Default'))
66        inputs = {}
67       
68        self.atomSchema = None
69        # NB, there are two routes through here: if a config file is specified
70        # without a hostname, the host will be taken to be the first entry in
71        # the config file; if a hostname is specified, it will be used explicitly
72        if configFile:
73            inputs['pwfile'] = configFile
74            if not self.eXistDBHostname:
75                self.__loadDBDetails(configFile)
76           
77           
78        # Now set up the connection
79        logging.debug(inputs)
80        self.xmldb = DR(self.eXistDBHostname, **inputs)
81       
82        if setUpDB:
83            # set up any collections required - NB, if these already exist they won't cause any files to be lost
84            self.__setUpEXistAtomCollections()
85           
86            # add the schema required for atom validation
87            self.__addAtomSchema()
88       
89        self.collections = None
90        if loadCollectionData:
91            self.collections = self.getAllAtomCollections()
92           
93        logging.info("eXist DB connection initialised")
94
95
96    def __getSchema(self):
97        logging.debug("Getting atom schema data")
98        if not self.atomSchema:
99            self.atomSchema = ec.BASE_COLLECTION_PATH + \
100                ndgXqueries.ATOM_MOLES_SCHEMA  + '.xsd'
101
102        return self.atomSchema
103
104    AtomSchema = property(fget=__getSchema, doc="Atom schema path")
105
106
107    def createCollections(self, collections):
108        '''
109        Create the specified collections in eXist
110        @param collections: array of collections to create
111        @return True if successful
112        '''
113        logging.info("Setting up eXist collections")
114        for col in collections:
115            logging.debug("Creating collection, '%s'" %col)
116            self.xmldb.createCollection(col)
117        logging.info("All collections set up")
118
119
120    def getAtom(self, id):
121        '''
122        Lookup the atom with id
123        @param id: id of the atom to retrieve
124        '''
125        logging.info("Looking up atom with id, '%s'" %(id))
126        doc = self.xmldb.get('', DR.ATOM, id, \
127                             targetCollection = ec.BASE_COLLECTION_PATH)
128        logging.info("Atom retrieved")
129        return doc
130       
131
132    def checkAtomSchemaCompliance(self, atomPath, atom = None, isDebug = False):
133        '''
134        Validate the specified atom in eXist with the atom schemae in eXist
135        @param atomPath: path to the atom in eXist
136        @keyword atom: if set to an atom, this will be created temporarily in eXist
137        - since it may not already exist there.  Once validation is completed, the
138        file will be removed from eXist.
139        @keyword isDebug: if True, return full error details, otherwise only return
140        a summary
141        @return array: containing any errors found - NB, if an empty array is returned,
142        this indicates successful validation
143        '''
144        logging.info("Validating atom, '%s' against schemae in eXist" %atomPath)
145       
146        if atom:
147            logging.info("Creating temporary file in eXist to do validation against")
148            fileName = atom.datasetID + str(datetime.datetime.today().microsecond)
149            self.createEXistFile(atom.toPrettyXML(), \
150                                 atom.getDefaultCollectionPath(), fileName)
151            atomPath = atom.getDefaultCollectionPath() + fileName
152           
153        validationQuery = 'validation:validate-report("' + atomPath + \
154            '", xs:anyURI("' + self.AtomSchema + '"))'
155        id, result = self.xmldb.executeQuery(validationQuery)
156        errorMessage = None
157        if result['hits'] == 0: 
158            errorMessage = "Validation did not complete successfully - please retry"
159        elif result['hits'] > 1:
160            errorMessage = "More than one atom was validated - expecting only a single atom validation - please retry"
161
162        if atom:
163            logging.info("Deleting temporary file in eXist")
164            self.deleteEXistFile(atomPath)
165
166        if errorMessage:
167            logging.error(errorMessage)
168            raise SystemError(errorMessage)
169       
170        doc = self.xmldb.retrieve(id, 0)
171        et = ET.fromstring(doc)
172        status = et.findtext('status')
173       
174        # retrieve the error detail if invalid
175        errors = []
176        if status == 'invalid':
177            logging.info("Atom is invalid - details as follows:")
178            for error in et.findall('message'):
179                lineNo = error.attrib.get('line')
180                colNo = error.attrib.get('column')
181                level = error.attrib.get('level')
182                repeat = error.attrib.get('repeat')
183                errorText = error.text
184                # remove the meaningless error type from message
185                if errorText.startswith('cvc-'):
186                    errorText = ':'.join(errorText.split(':')[1:])
187                errorMessage = "%s at line %s, column %s: %s" %(level, lineNo, colNo, errorText)
188                if repeat:
189                    errorMessage += " (%s times)" %repeat
190
191                if isDebug:
192                    errors.append(errorMessage)
193                else:
194                    errors.append(errorText)
195                logging.info(errorMessage)
196        else:
197            logging.info("Atom is valid")
198           
199        logging.info("Validation complete")
200        return errors
201   
202
203    def __setUpEXistAtomCollections(self):
204        '''
205        Set up the required eXist collections needed for running the granulator script
206        '''
207        logging.info("Ensuring required collections are available in eXist")
208        for col in [ec.BASE_COLLECTION_PATH, ec.BACKUP_COLLECTION_PATH]:
209            for type in [ec.OLD_COLLECTION_PATH, ec.PUBLISHED_COLLECTION_PATH, \
210                         ec.SMALL_P_PUBLISHED_COLLECTION_PATH, ec.WORKING_COLLECTION_PATH]:
211                self.xmldb.createCollection(col)
212                self.xmldb.createCollection(col + type)
213                self.xmldb.createCollection(col + type + ec.DE_COLLECTION_PATH)
214                self.xmldb.createCollection(col + type + ec.DEPLOYMENT_COLLECTION_PATH)
215                self.xmldb.createCollection(col + type + ec.DEPLOYMENTS_COLLECTION_PATH)
216                self.xmldb.createCollection(col + type + ec.GRANULE_COLLECTION_PATH)
217        logging.info("Required collections available")
218       
219
220    def __addAtomSchema(self):
221        '''
222        Add the required atom schema to the atoms collection - to allow validation
223        of input atoms
224        '''
225        logging.info("Adding atom schema to eXist")
226        xq = ndgXqueries()
227        schemae = [xq.ATOM_SCHEMA, xq.MOLES_SCHEMA, xq.ATOM_MOLES_SCHEMA]
228        for schema in schemae:
229            xml = xq.getSchema(schema)
230            self.createEXistFile(xml, ec.BASE_COLLECTION_PATH, schema + '.xsd')
231        logging.info("- schema added")
232       
233
234    def __loadDBDetails(self, configFile):
235        '''
236        Retrieve info from the eXist db config file
237        '''
238        logging.info("Loading DB config data")
239        # Check this file exists
240        if not os.path.isfile(configFile):
241            errorMessage = "Could not find the DB config file, %s; please make sure this " \
242                     "is available from the running directory" %configFile
243            logging.error(errorMessage)
244            raise ValueError(errorMessage)
245        dbinfo_file=open(configFile, "r")
246        dbinfo = dbinfo_file.read().split()
247        if len(dbinfo) < 3:
248            errorMessage = 'Incorrect data in DB config file'
249            logging.error(errorMessage)
250            raise ValueError(errorMessage)
251        self.eXistDBHostname = dbinfo[0]
252        self._username = dbinfo[1]
253        self._pw = dbinfo[2]
254        logging.info("DB config data loaded")
255
256
257    def __lookupEXistFile(self, docPath):
258        '''
259        Look up a file in eXist using XPath
260        @param docPath: path to doc to look up
261        @return: id returned from query, with which to retrieve doc; if doc doesn't exist, return None
262        '''
263        logging.info("Retrieving info for file, '%s'" %docPath)
264       
265        id, doc = self.xmldb.executeQuery('doc("' + docPath + '")')
266       
267        if doc['hits'] == 0:
268            logging.info("File does not exist in eXist DB")
269            return None
270        logging.info("Found file - returning result ID")
271        return id
272         
273
274    def getEXistFile(self, docPath):
275        '''
276        Use XQuery to retrieve the specified document from eXist
277        @param docPath: the path of the doc to retrieve
278        @return: contents of document if exists, None otherwise
279        '''
280        id = self.__lookupEXistFile(docPath)
281       
282        if not id and id != 0:
283            logging.info("No file found - nothing to retrieve")
284            return None
285       
286        logging.info("Found file - now retrieving content")
287        doc = self.xmldb.retrieve(id, 0)
288        return doc
289
290
291    def isNewEXistFile(self, docPath):
292        '''
293        Test if a file already exists in eXist
294        @param docPath: path of file in eXist to look up
295        @return: True if a new file, False if otherwise
296        '''
297        logging.info("Checking if file, '%s', exists in eXist DB" %docPath)
298       
299        id = self.__lookupEXistFile(docPath)
300
301        if id:
302            return False
303       
304        return True
305
306
307    def __addTimeStamp(self, fileName):
308        '''
309        Add timestamp to input filename
310        NB, this assumes there is a file type identifier at the end of the filename; if so, the datestamp
311        is included before this; if not it is just added at the end
312        '''
313        bits = fileName.rsplit(".", 1)
314        fileName = bits[0] + "_" + datetime.datetime.today().strftime("%Y-%m-%dT%H_%M_%S")
315       
316        if len(bits) > 1:
317            fileName += "." + bits[1]
318        return fileName
319
320
321    def backupEXistFile(self, collection, fileName):
322        '''
323        Backup a file that exists in the eXist DB
324        - NB, this really just creates a new file with the same contents in a
325        backup dir
326        - to improve efficiency, spawn this process as a new thread since we
327        don't need to worry about the outcome
328        @param collection: path of the collection to store the file in
329        @param fileName: name of file to add in eXist
330        @return: path to new backup file
331        '''
332        if not collection.endswith('/'):
333            collection += '/'
334           
335        docPath = collection + fileName
336        logging.info("Backing up file, '%s', in eXist DB" %docPath)
337
338        logging.debug("Firstly, retrieve file contents from eXist")
339        doc = self.getEXistFile(docPath)
340        if not doc:
341            errorMessage = "Could not retrieve file contents (%s) to backup - exiting." %docPath
342            logging.error(errorMessage)
343            raise SystemError(errorMessage)
344       
345        # Now adjust the collection to map to the backup dir
346        collection = collection.replace(ec.BASE_COLLECTION_PATH, ec.BACKUP_COLLECTION_PATH)
347        collection = collection.replace(ec.NDG_A_COLLECTION_PATH, ec.NDG_A_COLLECTION_PATH_BACKUP)
348       
349        # add timestamp to filename
350        fileName = self.__addTimeStamp(fileName)
351        docPath = collection + fileName
352       
353        # run the back up in a separate thread
354        thread = backingUpThread(self, doc, collection, fileName)
355        thread.start()
356
357        return docPath
358
359
360    def createEXistFile(self, xml, collection, fileName):
361        '''
362        Add the input file to the eXist DB
363        @param xml: contents of xml file to create in eXist
364        @param collection: path of the collection to store the file in
365        @param fileName: name of file to add in eXist
366        @return: True, if file created successfully
367        '''
368        logging.info("Adding file, '%s' to eXist DB collection, '%s'" \
369                     %(fileName, collection))
370        logging.debug("data: %s" %xml)
371
372        # create the collection, in case it doesn't already exist - NB, this won't overwrite anything
373        self.createCollections([collection])
374        status = self.xmldb.storeXML(xml, collection + "/" + fileName, overwrite=1)   
375        if not status:
376            errorMessage = "Command to create file in eXist did not complete successfully - exiting"
377            logging.error(errorMessage)
378            raise SystemError(errorMessage)
379       
380        logging.info("File added to eXist")
381        return True
382
383
384    def deleteEXistFile(self, docPath):
385        '''
386        Delete the input file from eXist DB
387        @param docPath: path of document to delete
388        @return: True, if file deleted successfully
389        '''
390        logging.info("Deleting file, '%s', from eXist DB" %docPath)
391
392        status = self.xmldb.removeDoc(docPath)   
393        if not status:
394            errorMessage = "Command to delete file in eXist did not complete successfully - exiting"
395            logging.error(errorMessage)
396            raise SystemError(errorMessage)
397       
398        logging.info("File deleted from eXist")
399        return True
400
401
402    def createOrUpdateEXistFile(self, xml, collection, fileName):
403        '''
404        Check if a file already exists in eXist; if it does, run an
405        update (which will backup the existing file), otherwise create
406        the file in eXist
407        @param xml: contents of xml file to create/update in eXist
408        @param collection: path of the collection to store the file in
409        @param fileName: name of file to add in eXist
410        '''
411        logging.info("Creating or updating file in eXist...")
412        if not self.isNewEXistFile(collection + fileName):
413            self.backupEXistFile(collection, fileName)
414           
415        self.createEXistFile(xml, collection, fileName)
416
417
418    def getAllAtomIDs(self):
419        '''
420        Retrieve all the atom IDs in the atoms directory - NB, this can
421        be a quick way of producing a cache of data to check - e.g. to avoid
422        multiple calls to getAtomFileCollectionPath
423        @return: ids - array of all atom IDs
424        '''
425        logging.info("Retrieving all atom ids")
426        xq = ndgXqueries().actual('atomList', '/db/atoms', '', '')
427        id, doc = self.xmldb.executeQuery(xq)
428        if doc['hits'] == 0: 
429            return []
430       
431        indices = range(doc['hits'])
432       
433        doc = self.xmldb.retrieve(id, 0)
434        et = ET.fromstring(doc)
435        ids = []
436        for member in et:
437            fn = member.findtext('{http://www.w3.org/2005/Atom}repositoryID')
438            ids.append(fn)
439        logging.debug("Found ids, '%s'" %ids)
440        return ids
441
442
443    def getAllAtomCollections(self):
444        '''
445        Get all atom collection paths and store in a dictionary - for easy
446        reference when doing lots of things at once
447        @return: dict with key/val of atomID/collectionPath
448        '''
449        logging.info("Retrieving all atom collection paths")
450        xq = ndgXqueries().actual('atomList', '/db/atoms', '', '')
451        id, doc = self.xmldb.executeQuery(xq)
452        if doc['hits'] == 0: 
453            return []
454       
455        indices = range(doc['hits'])
456       
457        doc = self.xmldb.retrieve(id, 0)
458        et = ET.fromstring(doc)
459        colData = {}
460        for member in et:
461            collection = member.findtext('{http://www.w3.org/2005/Atom}fileName')
462            fileName = collection.split('/')[-1]
463            fileName = fileName.split('.')[0]
464            dir = '/'.join(collection.split('/')[0:-1])
465            colData[fileName] = dir
466
467        logging.debug("Finished looking up atom paths")
468        return colData
469
470
471    def getAtomFileCollectionPath(self, atomID):
472        '''
473        Given an atom id, determine and return the collection path in eXist
474        of the associated atom file
475        @param atom: atom id to look up
476        @return: collection path, if it exists, None, otherwise
477        '''
478        logging.info("Looking up collection path for atom ID, '%s'" %atomID)
479        xq = ndgXqueries()['atomFullPath']
480        xq = xq.replace('TargetCollection', ec.BASE_COLLECTION_PATH)
481        xq = xq.replace('LocalID', atomID)
482
483        id, doc = self.xmldb.executeQuery(xq)
484        if doc['hits'] == 0:
485            logging.info("No document found with the specified ID")
486            return None
487
488        doc = self.xmldb.retrieve(id,0,{})
489
490        docET = ET.fromstring(doc)
491        collPath = docET.text + '/'
492        logging.debug("Found collection path, '%s'" %collPath)
493        return collPath
494
495
496    def deleteAtomInExist(self, atom):
497        '''
498        Delete the given atom from the eXist DB - using the atom
499        details to work out the required path to delete
500        '''
501        logging.info("Deleting atom from eXist")
502        atomPath = atom.getDefaultCollectionPath() + atom.atomName
503        self.deleteEXistFile(atomPath)
504        logging.info("Atom deleted")
505
506           
507    def createAtomInExist(self, atom, replaceAtom = True):
508        '''
509        Create an atom in the eXist DB - using the atom contents to work out
510        the location + data set ID
511        @param atom: atom object to create in the DB
512        @keyword replaceAtom: if False and the atom is already available in eXist
513        then raise a ValueError.
514        '''
515        logging.info("Creating atom in eXist")
516        if not atom:
517            raise ValueError("Input is not an object - cannot create in eXist")
518        if not isinstance(atom, Atom):
519            raise ValueError("Input object is not an Atom object - cannot create in eXist")
520       
521        # if the atom has no dataset ID, generate and add one
522        # NB, this should only be the case when the atom is being created
523        # via the web interface
524        isNew = False
525        if not atom.datasetID:
526            isNew = True
527            atom.setDatasetID(atom.atomTypeID + '_' + str(uuid.uuid1()))
528
529        eXistCollection = None
530        if self.collections is not None: # cope with empty dict
531            eXistCollection = self.collections.get(atom.datasetID)
532        else:
533            eXistCollection = self.getAtomFileCollectionPath(atom.datasetID)
534       
535        # if collection not found, assume we're dealing with a new atom; get its
536        # default collection
537        if not eXistCollection:
538            eXistCollection = atom.getDefaultCollectionPath()
539        elif isNew:
540            # in this situation we're trying to create an atom with the same
541            # name via the web interface - this can't be allowed - so retry to
542            # generate a new ID
543            atom.datasetID = None
544            self.createAtomInExist(atom)
545            return
546        # create backup of atom if it already exists
547        else:
548            if not replaceAtom:
549                raise DuplicateError('An atom with the specified ID (%s) already exists in eXist' \
550                                     %atom.datasetID)
551            self.backupEXistFile(eXistCollection, atom.atomName)
552           
553            # also change updated date to current time
554            atom.updatedDate = datetime.datetime.today().strftime("%Y-%m-%dT%H:%M:%SZ")
555           
556        self.createEXistFile(atom.toPrettyXML(), eXistCollection, atom.atomName)
557        logging.info("Atom created in eXist")
558        return atom
Note: See TracBrowser for help on using the repository browser.