source: exist/trunk/python/ndgUtils/models/existdbclient.py @ 4490

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/exist/trunk/python/ndgUtils/models/existdbclient.py@4490
Revision 4490, 18.0 KB checked in by cbyrom, 12 years ago (diff)

Add methods to the eXist DB client to ingest the required atom xsd docs + to allow validation of atoms against these schemae. Add ability to validate docs already in eXist and also to temporarily ingest them to allow validation.

Line 
1'''
2 Class supporting set up and communication with eXist DB
3 for the purposes of creating and updating atoms
4 
5 @author: C Byrom - Tessella 08
6'''
7import os, sys, logging, datetime
8from ndgUtils.eXistInterface import ndg_eXist
9from ndgUtils.eXistConnector import eXistConnector as ec
10from ndgUtils.ndgXqueries import ndgXqueries
11import uuid
12
13try:
14    from xml.etree import ElementTree as ET
15except ImportError:
16    try:
17        import ElementTree as ET
18    except ImportError:
19        import elementtree.ElementTree as ET
20
class eXistDBClient(object):
    '''
    Client supporting set up of, and communication with, an eXist DB for
    the purposes of creating, updating, backing up and validating atoms.

    On construction the client connects to eXist (via ndg_eXist), makes sure
    the atom collections are present and uploads the atom schemas used by
    checkAtomSchemaCompliance.
    '''

    def __init__(self, configFile = None, eXistDBHostname = None, loadCollectionData=False):
        '''
        Initialise a connection to the eXistDB
        @keyword configFile: DB config (password) file to use; if no hostname
        is given, the host is taken from the first entry in this file
        @keyword eXistDBHostname: host to connect to; used explicitly if set
        @keyword loadCollectionData: if True, look up all atom collection
        paths now and cache them in self.collections
        '''
        logging.info("Initialising connection to eXist DB")
        self.eXistDBHostname = eXistDBHostname
        logging.debug("- connecting to DB, '%s', with config file, '%s'",
                      eXistDBHostname or 'Default', configFile or 'Default')
        inputs = {}

        self.atomSchema = None
        self.collections = None

        # NB, there are two routes through here: if a config file is specified
        # without a hostname, the host will be taken to be the first entry in
        # the config file; if a hostname is specified, it will be used explicitly
        if configFile:
            if not self.eXistDBHostname:
                self.__loadDBDetails(configFile)
            inputs['passwordFile'] = configFile

        if self.eXistDBHostname:
            inputs['db'] = self.eXistDBHostname

        # Now set up the connection
        logging.debug(inputs)
        self.xmldb = ndg_eXist(**inputs)

        # set up any collections required - NB, if these already exist they
        # won't cause any files to be lost
        self.__setUpEXistAtomCollections()

        # add the schema required for atom validation
        self.__addAtomSchema()

        if loadCollectionData:
            self.collections = self.getAllAtomCollections()

        logging.info("eXist DB connection initialised")


    @staticmethod
    def _joinPath(collection, fileName):
        '''
        Join a collection path and a file name with exactly one '/' between
        them, whether or not the collection has a trailing slash
        @param collection: path of the collection
        @param fileName: name of file in the collection
        @return: full document path
        '''
        return collection.rstrip('/') + '/' + fileName.lstrip('/')


    def __getSchema(self):
        '''
        Lazily work out (and cache) the path of the atom schema in eXist
        @return: path to the atom/MOLES schema document
        '''
        logging.debug("Getting atom schema data")
        if not self.atomSchema:
            # build the path the same way createEXistFile does when the
            # schema is uploaded, so validation points at the stored doc
            self.atomSchema = self._joinPath(ec.BASE_COLLECTION_PATH,
                                             ndgXqueries.ATOM_MOLES_SCHEMA + '.xsd')

        return self.atomSchema

    AtomSchema = property(fget=__getSchema, doc="Atom schema path")


    def createCollections(self, collections):
        '''
        Create the specified collections in eXist
        @param collections: array of collections to create
        '''
        logging.info("Setting up eXist collections")
        for col in collections:
            logging.debug("Creating collection, '%s'", col)
            self.xmldb.createCollection(col)
        logging.info("All collections set up")


    def checkAtomSchemaCompliance(self, atomPath, atom = None):
        '''
        Validate the specified atom in eXist with the atom schemae in eXist
        @param atomPath: path to the atom in eXist
        @keyword atom: if set to an atom, this will be created temporarily in eXist
        - since it may not already exist there.  Once validation is completed, the
        file will be removed from eXist.
        @return array: containing any errors found - NB, if an empty array is returned,
        this indicates successful validation
        @raise SystemError: if the validation query does not return exactly one result
        '''
        logging.info("Validating atom, '%s' against schemae in eXist", atomPath)

        if atom:
            logging.info("Creating temporary file in eXist to do validation against")
            collection = atom.getDefaultCollectionPath()
            # a uuid keeps concurrent validations of the same atom apart; the
            # fallback name copes with atoms that have no dataset ID yet
            fileName = str(atom.datasetID or 'tmpAtom') + '_' + uuid.uuid4().hex
            self.createEXistFile(atom.toPrettyXML(), collection, fileName)
            atomPath = self._joinPath(collection, fileName)

        validationQuery = 'validation:validate-report("' + atomPath + \
            '", xs:anyURI("' + self.AtomSchema + '"))'
        try:
            resultID, result = self.xmldb.executeQuery(validationQuery)
        finally:
            # always tidy up the temporary doc - even if the query blew up
            if atom:
                logging.info("Deleting temporary file in eXist")
                self.deleteEXistFile(atomPath)

        errorMessage = None
        if result['hits'] == 0:
            errorMessage = "Validation did not complete successfully - please retry"
        elif result['hits'] > 1:
            errorMessage = "More than one atom was validated - expecting only a single atom validation - please retry"

        if errorMessage:
            logging.error(errorMessage)
            raise SystemError(errorMessage)

        report = ET.fromstring(self.xmldb.retrieve(resultID, 0))
        status = report.findtext('status')

        # retrieve the error detail if invalid
        errors = []
        if status == 'invalid':
            logging.info("Atom is invalid - details as follows:")
            for message in report.findall('message'):
                detail = "%s at line %s, column %s: %s" \
                    %(message.attrib.get('level'), message.attrib.get('line'),
                      message.attrib.get('column'), message.text)
                repeat = message.attrib.get('repeat')
                if repeat:
                    detail += " (%s times)" %repeat
                errors.append(detail)
                logging.info(detail)
        else:
            logging.info("Atom is valid")

        logging.info("Validation complete")
        return errors


    def __setUpEXistAtomCollections(self):
        '''
        Set up the required eXist collections needed for running the granulator script
        '''
        logging.info("Ensuring required collections are available in eXist")
        states = [ec.OLD_COLLECTION_PATH, ec.PUBLISHED_COLLECTION_PATH,
                  ec.SMALL_P_PUBLISHED_COLLECTION_PATH, ec.WORKING_COLLECTION_PATH]
        atomTypes = [ec.DE_COLLECTION_PATH, ec.DEPLOYMENT_COLLECTION_PATH,
                     ec.DEPLOYMENTS_COLLECTION_PATH, ec.GRANULE_COLLECTION_PATH]

        for root in [ec.BASE_COLLECTION_PATH, ec.BACKUP_COLLECTION_PATH]:
            # the root only needs creating once, not once per state
            self.xmldb.createCollection(root)
            for state in states:
                self.xmldb.createCollection(root + state)
                for atomType in atomTypes:
                    self.xmldb.createCollection(root + state + atomType)
        logging.info("Required collections available")


    def __addAtomSchema(self):
        '''
        Add the required atom schema to the atoms collection - to allow validation
        of input atoms
        '''
        logging.info("Adding atom schema to eXist")
        xq = ndgXqueries()
        for schema in [xq.ATOM_SCHEMA, xq.MOLES_SCHEMA, xq.ATOM_MOLES_SCHEMA]:
            self.createEXistFile(xq.getSchema(schema), ec.BASE_COLLECTION_PATH,
                                 schema + '.xsd')
        logging.info("- schema added")


    def __loadDBDetails(self, configFile):
        '''
        Retrieve info from the eXist db config file; the first three
        whitespace separated entries are taken as hostname, username, password
        @param configFile: path to the DB config file
        @raise ValueError: if the file is missing or has fewer than three entries
        '''
        logging.info("Loading DB config data")
        # Check this file exists
        if not os.path.isfile(configFile):
            errorMessage = "Could not find the DB config file, %s; please make sure this " \
                     "is available from the running directory" %configFile
            logging.error(errorMessage)
            raise ValueError(errorMessage)

        dbinfo_file = open(configFile, "r")
        try:
            dbinfo = dbinfo_file.read().split()
        finally:
            # make sure the file handle is released whatever happens
            dbinfo_file.close()

        if len(dbinfo) < 3:
            errorMessage = 'Incorrect data in DB config file'
            logging.error(errorMessage)
            raise ValueError(errorMessage)
        self.eXistDBHostname = dbinfo[0]
        self._username = dbinfo[1]
        self._pw = dbinfo[2]
        logging.info("DB config data loaded")


    def __lookupEXistFile(self, docPath):
        '''
        Look up a file in eXist using XPath
        @param docPath: path to doc to look up
        @return: id returned from query, with which to retrieve doc; if doc doesn't exist, return None
        '''
        logging.info("Retrieving info for file, '%s'", docPath)

        resultID, result = self.xmldb.executeQuery('doc("' + docPath + '")')

        if result['hits'] == 0:
            logging.info("File does not exist in eXist DB")
            return None
        logging.info("Found file - returning result ID")
        return resultID


    def getEXistFile(self, docPath):
        '''
        Use XQuery to retrieve the specified document from eXist
        @param docPath: the path of the doc to retrieve
        @return: contents of document if exists, None otherwise
        '''
        resultID = self.__lookupEXistFile(docPath)

        # NB, compare against None explicitly - a result ID of 0 is falsy
        if resultID is None:
            logging.info("No file found - nothing to retrieve")
            return None

        logging.info("Found file - now retrieving content")
        return self.xmldb.retrieve(resultID, 0)


    def isNewEXistFile(self, docPath):
        '''
        Check whether a file is new, i.e. not yet present in the eXist DB
        @param docPath: path of file in eXist to check
        @return: True if the file does not exist in eXist, False if it does
        '''
        logging.info("Checking if file, '%s', exists in eXist DB", docPath)

        # NB, compare against None explicitly - a result ID of 0 is falsy
        return self.__lookupEXistFile(docPath) is None


    def __addTimeStamp(self, fileName):
        '''
        Add timestamp to input filename
        NB, this assumes there is a file type identifier at the end of the filename; if so, the datestamp
        is included before this; if not it is just added at the end
        @param fileName: name of file to stamp
        @return: file name including the timestamp
        '''
        bits = fileName.rsplit(".", 1)
        stamped = bits[0] + "_" + datetime.datetime.today().strftime("%Y-%m-%dT%H_%M_%S")

        if len(bits) > 1:
            stamped += "." + bits[1]
        return stamped


    def backupEXistFile(self, collection, fileName):
        '''
        Backup a file that exists in the eXist DB
        - NB, this really just creates a new file with the same contents in a
        backup dir
        @param collection: path of the collection the file currently lives in
        @param fileName: name of the file in eXist to back up
        @return: path to new backup file
        @raise SystemError: if the file contents cannot be retrieved
        '''
        if not collection.endswith('/'):
            collection += '/'

        docPath = collection + fileName
        logging.info("Backing up file, '%s', in eXist DB", docPath)

        logging.debug("Firstly, retrieve file contents from eXist")
        doc = self.getEXistFile(docPath)
        if not doc:
            errorMessage = "Could not retrieve file contents (%s) to backup - exiting." %docPath
            logging.error(errorMessage)
            raise SystemError(errorMessage)

        # Now adjust the collection to map to the backup dir
        collection = collection.replace(ec.BASE_COLLECTION_PATH, ec.BACKUP_COLLECTION_PATH)
        collection = collection.replace(ec.NDG_A_COLLECTION_PATH, ec.NDG_A_COLLECTION_PATH_BACKUP)

        # add timestamp to filename
        fileName = self.__addTimeStamp(fileName)
        docPath = self._joinPath(collection, fileName)

        logging.debug("Now creating backup file, '%s'", fileName)
        self.createEXistFile(doc, collection, fileName)

        logging.info("File backed up in eXist")
        return docPath


    def createEXistFile(self, xml, collection, fileName):
        '''
        Add the input file to the eXist DB
        @param xml: contents of xml file to create in eXist
        @param collection: path of the collection to store the file in
        @param fileName: name of file to add in eXist
        @return: True, if file created successfully
        @raise SystemError: if eXist reports the store did not succeed
        '''
        logging.info("Adding file, '%s' to eXist DB collection, '%s'",
                     fileName, collection)
        logging.debug("data: %s", xml)

        # create the collection, in case it doesn't already exist - NB, this won't overwrite anything
        self.createCollections([collection])
        status = self.xmldb.storeXML(xml, self._joinPath(collection, fileName), overwrite=1)
        if not status:
            errorMessage = "Command to create file in eXist did not complete successfully - exiting"
            logging.error(errorMessage)
            raise SystemError(errorMessage)

        logging.info("File added to eXist")
        return True


    def deleteEXistFile(self, docPath):
        '''
        Delete the input file from eXist DB
        @param docPath: path of document to delete
        @return: True, if file deleted successfully
        @raise SystemError: if eXist reports the delete did not succeed
        '''
        logging.info("Deleting file, '%s', from eXist DB", docPath)

        status = self.xmldb.removeDoc(docPath)
        if not status:
            errorMessage = "Command to delete file in eXist did not complete successfully - exiting"
            logging.error(errorMessage)
            raise SystemError(errorMessage)

        logging.info("File deleted from eXist")
        return True


    def createOrUpdateEXistFile(self, xml, collection, fileName):
        '''
        Check if a file already exists in eXist; if it does, run an
        update (which will backup the existing file), otherwise create
        the file in eXist
        @param xml: contents of xml file to create/update in eXist
        @param collection: path of the collection to store the file in
        @param fileName: name of file to add in eXist
        '''
        logging.info("Creating or updating file in eXist...")
        # NB, join with a separator so the lookup matches the path the file
        # is actually stored under, with or without a trailing '/'
        if not self.isNewEXistFile(self._joinPath(collection, fileName)):
            self.backupEXistFile(collection, fileName)

        self.createEXistFile(xml, collection, fileName)


    def getAllAtomIDs(self):
        '''
        Retrieve all the atom IDs in the atoms directory - NB, this can
        be a quick way of producing a cache of data to check - e.g. to avoid
        multiple calls to getAtomFileCollectionPath
        @return: ids - array of all atom IDs
        '''
        logging.info("Retrieving all atom ids")
        xq = ndgXqueries().actual('atomList', '/db/atoms', '', '')
        resultID, result = self.xmldb.executeQuery(xq)
        if result['hits'] == 0:
            return []

        listing = ET.fromstring(self.xmldb.retrieve(resultID, 0))
        ids = [member.findtext('{http://www.w3.org/2005/Atom}repositoryID')
               for member in listing]
        logging.debug("Found ids, '%s'", ids)
        return ids


    def getAllAtomCollections(self):
        '''
        Get all atom collection paths and store in a dictionary - for easy
        reference when doing lots of things at once
        @return: dict with key/val of atomID/collectionPath; empty dict if
        no atoms are found
        '''
        logging.info("Retrieving all atom collection paths")
        xq = ndgXqueries().actual('atomList', '/db/atoms', '', '')
        resultID, result = self.xmldb.executeQuery(xq)
        if result['hits'] == 0:
            # NB, must be a dict - callers use .get() on the result
            return {}

        listing = ET.fromstring(self.xmldb.retrieve(resultID, 0))
        colData = {}
        for member in listing:
            fullPath = member.findtext('{http://www.w3.org/2005/Atom}fileName')
            if not fullPath:
                logging.warning("Atom list entry has no file name - skipping")
                continue
            pathBits = fullPath.split('/')
            # key on the file name up to its first '.'
            colData[pathBits[-1].split('.')[0]] = '/'.join(pathBits[:-1])

        logging.debug("Finished looking up atom paths")
        return colData


    def getAtomFileCollectionPath(self, atomID):
        '''
        Given an atom id, determine and return the collection path in eXist
        of the associated atom file
        @param atomID: atom id to look up
        @return: collection path (with trailing '/'), if it exists, None, otherwise
        '''
        logging.info("Looking up collection path for atom ID, '%s'", atomID)
        xq = ndgXqueries()['atomFullPath']
        xq = xq.replace('TargetCollection', ec.BASE_COLLECTION_PATH)
        xq = xq.replace('LocalID', atomID)

        resultID, result = self.xmldb.executeQuery(xq)
        if result['hits'] == 0:
            logging.info("No document found with the specified ID")
            return None

        docET = ET.fromstring(self.xmldb.retrieve(resultID, 0, {}))
        if not docET.text:
            # nothing usable came back - treat as not found rather than fail
            logging.info("No collection path returned for the specified ID")
            return None

        collPath = docET.text + '/'
        logging.debug("Found collection path, '%s'", collPath)
        return collPath


    def createAtomInExist(self, atom):
        '''
        Create an atom in the eXist DB; if the atom already exists there, it
        is backed up first and its updated date is set to the current time
        @param atom: atom object to create in the DB
        @return: the atom that was created
        '''
        logging.info("Creating atom in eXist")

        # if the atom has no dataset ID, generate and add one
        # NB, this should only be the case when the atom is being created
        # via the web interface
        isNew = False
        if not atom.datasetID:
            isNew = True
            atom.setDatasetID(atom.atomTypeID + '_' + str(uuid.uuid1()))

        if self.collections is not None: # cope with empty dict
            eXistCollection = self.collections.get(atom.datasetID)
        else:
            eXistCollection = self.getAtomFileCollectionPath(atom.datasetID)

        if not eXistCollection:
            # collection not found, so assume we're dealing with a new atom;
            # get its default collection
            eXistCollection = atom.getDefaultCollectionPath()
        elif isNew:
            # in this situation we're trying to create an atom with the same
            # name via the web interface - this can't be allowed - so retry to
            # generate a new ID, handing back the atom from the retry
            atom.datasetID = None
            return self.createAtomInExist(atom)
        else:
            # create backup of atom since it already exists
            self.backupEXistFile(eXistCollection, atom.atomName)

            # also change updated date to current time
            atom.updatedDate = datetime.datetime.today().strftime("%Y-%m-%dT%H:%M:%SZ")

        self.createEXistFile(atom.toPrettyXML(), eXistCollection, atom.atomName)
        logging.info("Atom created in eXist")
        return atom
Note: See TracBrowser for help on using the repository browser.