source: exist/trunk/python/ndgUtils/lib/existdbclient.py @ 4555

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/exist/trunk/python/ndgUtils/lib/existdbclient.py@4555
Revision 4555, 18.8 KB checked in by cbyrom, 11 years ago (diff)

Move existdbclient to lib package + extend to make use of DocumentRetrieve to allow retrieval of atoms by ID + fix handling of
authors vs contributors when doing Atom to XML exports.

Line 
1'''
2 Class supporting set up and communication with eXist DB
3 for the purposes of creating and updating atoms
4 
5 @author: C Byrom - Tessella 08
6'''
7import os, sys, logging, datetime
8from ndgUtils.eXistConnector import eXistConnector as ec
9from ndgUtils.ndgXqueries import ndgXqueries
10from ndgUtils import DocumentRetrieve as DR
11import uuid
12
13try:
14    from xml.etree import ElementTree as ET
15except ImportError:
16    try:
17        import ElementTree as ET
18    except ImportError:
19        import elementtree.ElementTree as ET
20
class eXistDBClient:
    '''
    Client wrapping a DocumentRetrieve connection to an eXist DB, with helpers
    to create, look up, back up, validate and delete atom documents and the
    collections that hold them.
    '''

    def __init__(self, configFile = None, eXistDBHostname = None, \
                 loadCollectionData=False, setUpDB = False):
        '''
        Initialise a connection to the eXistDB
        @keyword configFile: config file to use in setting up DB
        @keyword existDBHostname: name of eXist DB to use - if not specified, the first
        host in the config file is used
        @keyword loadCollectionData: preload info on all the eXist collections, if True (default False)
        @keyword setUpDB: if True, create the basic collection structure and ingest the
        atom schemas.  Default is False.
        '''
        logging.info("Initialising connection to eXist DB")
        self.eXistDBHostname = eXistDBHostname
        logging.debug("- connecting to DB, '%s', with config file, '%s'",
                      eXistDBHostname or 'Default', configFile or 'Default')
        inputs = {}

        # populated lazily by the AtomSchema property
        self.atomSchema = None
        # cache of atomID -> collection path; only filled if loadCollectionData is set
        self.collections = None

        # NB, there are two routes through here: if a config file is specified
        # without a hostname, the host will be taken to be the first entry in
        # the config file; if a hostname is specified, it will be used explicitly
        if configFile:
            inputs['pwfile'] = configFile
            if not self.eXistDBHostname:
                self.__loadDBDetails(configFile)

        # Now set up the connection
        logging.debug(inputs)
        self.xmldb = DR(self.eXistDBHostname, **inputs)

        if setUpDB:
            # set up any collections required - NB, if these already exist they won't cause any files to be lost
            self.__setUpEXistAtomCollections()

            # add the schema required for atom validation
            self.__addAtomSchema()

        if loadCollectionData:
            self.collections = self.getAllAtomCollections()

        logging.info("eXist DB connection initialised")


    def __getSchema(self):
        '''
        Return the path of the atom schema in eXist, building it on first use
        '''
        logging.debug("Getting atom schema data")
        if not self.atomSchema:
            self.atomSchema = ec.BASE_COLLECTION_PATH + \
                ndgXqueries.ATOM_MOLES_SCHEMA  + '.xsd'

        return self.atomSchema

    AtomSchema = property(fget=__getSchema, doc="Atom schema path")


    def __buildDocPath(self, collection, fileName):
        '''
        Join a collection path and a file name with exactly one '/' between
        them, whether or not the collection path already ends with '/'
        @param collection: path of the collection
        @param fileName: name of the file in the collection
        @return: full path to the document
        '''
        return collection.rstrip('/') + '/' + fileName


    def createCollections(self, collections):
        '''
        Create the specified collections in eXist
        @param collections: array of collections to create
        '''
        logging.info("Setting up eXist collections")
        for col in collections:
            logging.debug("Creating collection, '%s'", col)
            self.xmldb.createCollection(col)
        logging.info("All collections set up")


    def getAtom(self, id):
        '''
        Lookup the atom with id
        @param id: id of the atom to retrieve
        @return: whatever the underlying DocumentRetrieve.get call returns
        '''
        logging.info("Looking up atom with id, '%s'", id)
        doc = self.xmldb.get('', DR.ATOM, id, targetCollection = ec.BASE_COLLECTION_PATH)
        logging.info("Atom retrieved")
        return doc


    def checkAtomSchemaCompliance(self, atomPath, atom = None):
        '''
        Validate the specified atom in eXist with the atom schemae in eXist
        @param atomPath: path to the atom in eXist
        @keyword atom: if set to an atom, this will be created temporarily in eXist
        - since it may not already exist there.  Once validation is completed, the
        file will be removed from eXist.
        @return array: containing any errors found - NB, if an empty array is returned,
        this indicates successful validation
        @raise SystemError: if the validation query does not return exactly one hit
        '''
        logging.info("Validating atom, '%s' against schemae in eXist", atomPath)

        if atom:
            logging.info("Creating temporary file in eXist to do validation against")
            fileName = atom.datasetID + str(datetime.datetime.today().microsecond)
            collection = atom.getDefaultCollectionPath()
            self.createEXistFile(atom.toPrettyXML(), collection, fileName)
            # use the same path that createEXistFile stored the doc under
            atomPath = self.__buildDocPath(collection, fileName)

        validationQuery = 'validation:validate-report("' + atomPath + \
            '", xs:anyURI("' + self.AtomSchema + '"))'
        try:
            resultID, result = self.xmldb.executeQuery(validationQuery)
        finally:
            # always tidy up the temporary doc - even if the query itself failed
            if atom:
                logging.info("Deleting temporary file in eXist")
                self.deleteEXistFile(atomPath)

        errorMessage = None
        if result['hits'] == 0:
            errorMessage = "Validation did not complete successfully - please retry"
        elif result['hits'] > 1:
            errorMessage = "More than one atom was validated - expecting only a single atom validation - please retry"

        if errorMessage:
            logging.error(errorMessage)
            raise SystemError(errorMessage)

        doc = self.xmldb.retrieve(resultID, 0)
        et = ET.fromstring(doc)
        status = et.findtext('status')

        # retrieve the error detail if invalid
        errors = []
        if status == 'invalid':
            logging.info("Atom is invalid - details as follows:")
            for error in et.findall('message'):
                lineNo = error.attrib.get('line')
                colNo = error.attrib.get('column')
                level = error.attrib.get('level')
                repeat = error.attrib.get('repeat')
                fullMessage = "%s at line %s, column %s: %s" %(level, lineNo, colNo, error.text)
                if repeat:
                    fullMessage += " (%s times)" %repeat
                # only return basic error message to users - the log file will contain the full error
                errors.append(error.text)
                logging.info(fullMessage)
        else:
            logging.info("Atom is valid")

        logging.info("Validation complete")
        return errors


    def __setUpEXistAtomCollections(self):
        '''
        Set up the required eXist collections needed for running the granulator script
        '''
        logging.info("Ensuring required collections are available in eXist")
        subCollections = [ec.OLD_COLLECTION_PATH, ec.PUBLISHED_COLLECTION_PATH, \
                          ec.SMALL_P_PUBLISHED_COLLECTION_PATH, ec.WORKING_COLLECTION_PATH]
        for col in [ec.BASE_COLLECTION_PATH, ec.BACKUP_COLLECTION_PATH]:
            # the top level collection only needs creating once per root
            self.xmldb.createCollection(col)
            for subCollection in subCollections:
                self.xmldb.createCollection(col + subCollection)
                self.xmldb.createCollection(col + subCollection + ec.DE_COLLECTION_PATH)
                self.xmldb.createCollection(col + subCollection + ec.DEPLOYMENT_COLLECTION_PATH)
                self.xmldb.createCollection(col + subCollection + ec.DEPLOYMENTS_COLLECTION_PATH)
                self.xmldb.createCollection(col + subCollection + ec.GRANULE_COLLECTION_PATH)
        logging.info("Required collections available")


    def __addAtomSchema(self):
        '''
        Add the required atom schema to the atoms collection - to allow validation
        of input atoms
        '''
        logging.info("Adding atom schema to eXist")
        xq = ndgXqueries()
        schemae = [xq.ATOM_SCHEMA, xq.MOLES_SCHEMA, xq.ATOM_MOLES_SCHEMA]
        for schema in schemae:
            xml = xq.getSchema(schema)
            self.createEXistFile(xml, ec.BASE_COLLECTION_PATH, schema + '.xsd')
        logging.info("- schema added")


    def __loadDBDetails(self, configFile):
        '''
        Retrieve info from the eXist db config file; the first three whitespace
        separated entries are taken as the hostname, username and password
        @param configFile: path to the DB config file
        @raise ValueError: if the file is missing or has fewer than three entries
        '''
        logging.info("Loading DB config data")
        # Check this file exists
        if not os.path.isfile(configFile):
            errorMessage = "Could not find the DB config file, %s; please make sure this " \
                     "is available from the running directory" %configFile
            logging.error(errorMessage)
            raise ValueError(errorMessage)

        dbinfo_file = open(configFile, "r")
        try:
            dbinfo = dbinfo_file.read().split()
        finally:
            # make sure the file handle is released, whatever happens
            dbinfo_file.close()

        if len(dbinfo) < 3:
            errorMessage = 'Incorrect data in DB config file'
            logging.error(errorMessage)
            raise ValueError(errorMessage)
        self.eXistDBHostname = dbinfo[0]
        self._username = dbinfo[1]
        self._pw = dbinfo[2]
        logging.info("DB config data loaded")


    def __lookupEXistFile(self, docPath):
        '''
        Look up a file in eXist using XPath
        @param docPath: path to doc to look up
        @return: id returned from query, with which to retrieve doc; if doc doesn't exist, return None
        NB, the id is returned as given by the query, so callers must compare
        against None rather than test truthiness - getEXistFile allows for an id of 0
        '''
        logging.info("Retrieving info for file, '%s'", docPath)

        resultID, doc = self.xmldb.executeQuery('doc("' + docPath + '")')

        if doc['hits'] == 0:
            logging.info("File does not exist in eXist DB")
            return None
        logging.info("Found file - returning result ID")
        return resultID


    def getEXistFile(self, docPath):
        '''
        Use XQuery to retrieve the specified document from eXist
        @param docPath: the path of the doc to retrieve
        @return: contents of document if exists, None otherwise
        '''
        resultID = self.__lookupEXistFile(docPath)

        # NB, an id of 0 is allowed for - only a falsy non-zero id means 'not found'
        if not resultID and resultID != 0:
            logging.info("No file found - nothing to retrieve")
            return None

        logging.info("Found file - now retrieving content")
        doc = self.xmldb.retrieve(resultID, 0)
        return doc


    def isNewEXistFile(self, docPath):
        '''
        Check whether a file is new, i.e. does not yet exist in the eXist DB
        @param docPath: path of file in eXist to check
        @return: True if no file exists at docPath, False otherwise
        '''
        logging.info("Checking if file, '%s', exists in eXist DB", docPath)

        resultID = self.__lookupEXistFile(docPath)

        # compare with None explicitly - a result id of 0 still means the file exists
        return resultID is None


    def __addTimeStamp(self, fileName):
        '''
        Add timestamp to input filename
        NB, this assumes there is a file type identifier at the end of the filename; if so, the datestamp
        is included before this; if not it is just added at the end
        @param fileName: name of file to add the timestamp to
        @return: file name including the timestamp
        '''
        bits = fileName.rsplit(".", 1)
        fileName = bits[0] + "_" + datetime.datetime.today().strftime("%Y-%m-%dT%H_%M_%S")

        if len(bits) > 1:
            fileName += "." + bits[1]
        return fileName


    def backupEXistFile(self, collection, fileName):
        '''
        Backup a file that exists in the eXist DB
        - NB, this really just creates a new file with the same contents in a
        backup dir
        @param collection: path of the collection the file currently lives in
        @param fileName: name of the file to back up
        @return: path to new backup file
        @raise SystemError: if the contents of the file cannot be retrieved
        '''
        if not collection.endswith('/'):
            collection += '/'

        docPath = collection + fileName
        logging.info("Backing up file, '%s', in eXist DB", docPath)

        logging.debug("Firstly, retrieve file contents from eXist")
        doc = self.getEXistFile(docPath)
        if not doc:
            errorMessage = "Could not retrieve file contents (%s) to backup - exiting." %docPath
            logging.error(errorMessage)
            raise SystemError(errorMessage)

        # Now adjust the collection to map to the backup dir
        collection = collection.replace(ec.BASE_COLLECTION_PATH, ec.BACKUP_COLLECTION_PATH)
        collection = collection.replace(ec.NDG_A_COLLECTION_PATH, ec.NDG_A_COLLECTION_PATH_BACKUP)

        # add timestamp to filename
        fileName = self.__addTimeStamp(fileName)
        docPath = collection + fileName

        logging.debug("Now creating backup file, '%s'", fileName)
        self.createEXistFile(doc, collection, fileName)

        logging.info("File backed up in eXist")
        return docPath


    def createEXistFile(self, xml, collection, fileName):
        '''
        Add the input file to the eXist DB
        @param xml: contents of xml file to create in eXist
        @param collection: path of the collection to store the file in - with
        or without a trailing '/'
        @param fileName: name of file to add in eXist
        @return: True, if file created successfully
        @raise SystemError: if the store command does not report success
        '''
        logging.info("Adding file, '%s' to eXist DB collection, '%s'",
                     fileName, collection)
        logging.debug("data: %s", xml)

        # create the collection, in case it doesn't already exist - NB, this won't overwrite anything
        self.createCollections([collection])
        # NB, avoid a doubled '/' when the collection already ends with one
        status = self.xmldb.storeXML(xml, self.__buildDocPath(collection, fileName), overwrite=1)
        if not status:
            errorMessage = "Command to create file in eXist did not complete successfully - exiting"
            logging.error(errorMessage)
            raise SystemError(errorMessage)

        logging.info("File added to eXist")
        return True


    def deleteEXistFile(self, docPath):
        '''
        Delete the input file from eXist DB
        @param docPath: path of document to delete
        @return: True, if file deleted successfully
        @raise SystemError: if the delete command does not report success
        '''
        logging.info("Deleting file, '%s', from eXist DB", docPath)

        status = self.xmldb.removeDoc(docPath)
        if not status:
            errorMessage = "Command to delete file in eXist did not complete successfully - exiting"
            logging.error(errorMessage)
            raise SystemError(errorMessage)

        logging.info("File deleted from eXist")
        return True


    def createOrUpdateEXistFile(self, xml, collection, fileName):
        '''
        Check if a file already exists in eXist; if it does, back up the
        existing file first; then create the file in eXist (overwriting any
        existing version)
        @param xml: contents of xml file to create/update in eXist
        @param collection: path of the collection to store the file in - with
        or without a trailing '/'
        @param fileName: name of file to add in eXist
        '''
        logging.info("Creating or updating file in eXist...")
        if not self.isNewEXistFile(self.__buildDocPath(collection, fileName)):
            self.backupEXistFile(collection, fileName)

        self.createEXistFile(xml, collection, fileName)


    def __retrieveAtomList(self):
        '''
        Run the 'atomList' query against the atoms collection
        @return: parsed ElementTree root of the first result, or None if the
        query returned no hits
        '''
        xq = ndgXqueries().actual('atomList', '/db/atoms', '', '')
        resultID, result = self.xmldb.executeQuery(xq)
        if result['hits'] == 0:
            return None

        doc = self.xmldb.retrieve(resultID, 0)
        return ET.fromstring(doc)


    def getAllAtomIDs(self):
        '''
        Retrieve all the atom IDs in the atoms directory - NB, this can
        be a quick way of producing a cache of data to check - e.g. to avoid
        multiple calls to getAtomFileCollectionPath
        @return: ids - array of all atom IDs (empty if the query has no hits)
        '''
        logging.info("Retrieving all atom ids")
        et = self.__retrieveAtomList()
        if et is None:
            return []

        ids = []
        for member in et:
            ids.append(member.findtext('{http://www.w3.org/2005/Atom}repositoryID'))
        logging.debug("Found ids, '%s'", ids)
        return ids


    def getAllAtomCollections(self):
        '''
        Get all atom collection paths and store in a dictionary - for easy
        reference when doing lots of things at once
        @return: dict with key/val of atomID/collectionPath - empty dict if
        the query has no hits
        '''
        logging.info("Retrieving all atom collection paths")
        colData = {}
        et = self.__retrieveAtomList()
        if et is None:
            # NB, must be a dict - createAtomInExist calls .get() on this
            return colData

        for member in et:
            fullPath = member.findtext('{http://www.w3.org/2005/Atom}fileName')
            if not fullPath:
                # nothing to derive a key or collection from - skip this entry
                continue
            pathBits = fullPath.split('/')
            # key is the file name up to the first '.'
            atomID = pathBits[-1].split('.')[0]
            colData[atomID] = '/'.join(pathBits[0:-1])

        logging.debug("Finished looking up atom paths")
        return colData


    def getAtomFileCollectionPath(self, atomID):
        '''
        Given an atom id, determine and return the collection path in eXist
        of the associated atom file
        @param atomID: atom id to look up
        @return: collection path (with trailing '/'), if it exists, None, otherwise
        '''
        logging.info("Looking up collection path for atom ID, '%s'", atomID)
        xq = ndgXqueries()['atomFullPath']
        xq = xq.replace('TargetCollection', ec.BASE_COLLECTION_PATH)
        xq = xq.replace('LocalID', atomID)

        resultID, result = self.xmldb.executeQuery(xq)
        if result['hits'] == 0:
            logging.info("No document found with the specified ID")
            return None

        doc = self.xmldb.retrieve(resultID,0,{})

        docET = ET.fromstring(doc)
        collPath = docET.text + '/'
        logging.debug("Found collection path, '%s'", collPath)
        return collPath


    def createAtomInExist(self, atom):
        '''
        Create an atom in the eXist DB; if the atom already exists there, the
        existing file is backed up first and the atom's updated date is reset
        @param atom: atom object to create in the DB
        @return: the atom that was created
        '''
        logging.info("Creating atom in eXist")

        # if the atom has no dataset ID, generate and add one
        # NB, this should only be the case when the atom is being created
        # via the web interface
        isNew = False
        if not atom.datasetID:
            isNew = True
            atom.setDatasetID(atom.atomTypeID + '_' + str(uuid.uuid1()))

        eXistCollection = None
        if self.collections is not None: # cope with empty dict
            eXistCollection = self.collections.get(atom.datasetID)
        else:
            eXistCollection = self.getAtomFileCollectionPath(atom.datasetID)

        # if collection not found, assume we're dealing with a new atom; get its
        # default collection
        if not eXistCollection:
            eXistCollection = atom.getDefaultCollectionPath()
        elif isNew:
            # in this situation we're trying to create an atom with the same
            # name via the web interface - this can't be allowed - so retry to
            # generate a new ID; NB, pass the result back so callers always
            # get the created atom
            atom.datasetID = None
            return self.createAtomInExist(atom)
        # create backup of atom if it already exists
        else:
            self.backupEXistFile(eXistCollection, atom.atomName)

            # also change updated date to current time
            atom.updatedDate = datetime.datetime.today().strftime("%Y-%m-%dT%H:%M:%SZ")

        self.createEXistFile(atom.toPrettyXML(), eXistCollection, atom.atomName)
        logging.info("Atom created in eXist")
        return atom
Note: See TracBrowser for help on using the repository browser.