source: exist/trunk/python/ndgUtils/models/existdbclient.py @ 4512

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/exist/trunk/python/ndgUtils/models/existdbclient.py@4512
Revision 4512, 18.6 KB checked in by cbyrom, 11 years ago (diff)

Fix problem with retaining empty category data + improve output error logging + improve robustness of exist file retrieval.

Line 
1'''
2 Class supporting set up and communication with eXist DB
3 for the purposes of creating and updating atoms
4 
5 @author: C Byrom - Tessella 08
6'''
7import os, sys, logging, datetime
8from ndgUtils.eXistInterface import ndg_eXist
9from ndgUtils.eXistConnector import eXistConnector as ec
10from ndgUtils.ndgXqueries import ndgXqueries
11import uuid
12
13try:
14    from xml.etree import ElementTree as ET
15except ImportError:
16    try:
17        import ElementTree as ET
18    except ImportError:
19        import elementtree.ElementTree as ET
20
class eXistDBClient:
    '''
    Client wrapping an ndg_eXist connection; used to create, look up, back up,
    validate and delete atom documents held in an eXist DB
    '''

    def __init__(self, configFile=None, eXistDBHostname=None,
                 loadCollectionData=False, setUpDB=False):
        '''
        Initialise a connection to the eXistDB
        @keyword configFile: config file to use in setting up DB
        @keyword existDBHostname: name of eXist DB to use - if not specified, the first
        host in the config file is used
        @keyword loadCollectionData: preload info on all the eXist collections, if True (default False)
        @keyword setUpDB: if True, create the basic collection structure and ingest the
        atom schemas.  Default is False.
        @raise ValueError: if a config file is needed to determine the host but
        is missing or does not hold enough data
        '''
        logging.info("Initialising connection to eXist DB")
        self.eXistDBHostname = eXistDBHostname
        logging.debug("- connecting to DB, '%s', with config file, '%s'",
                      eXistDBHostname or 'Default', configFile or 'Default')
        inputs = {}

        self.atomSchema = None
        # NB, there are two routes through here: if a config file is specified
        # without a hostname, the host will be taken to be the first entry in
        # the config file; if a hostname is specified, it will be used explicitly
        if configFile:
            if not self.eXistDBHostname:
                self.__loadDBDetails(configFile)
            inputs['passwordFile'] = configFile

        if self.eXistDBHostname:
            inputs['db'] = self.eXistDBHostname

        # Now set up the connection
        logging.debug(inputs)
        self.xmldb = ndg_eXist(**inputs)

        if setUpDB:
            # set up any collections required - NB, if these already exist they
            # won't cause any files to be lost
            self.__setUpEXistAtomCollections()

            # add the schema required for atom validation
            self.__addAtomSchema()

        # cache of atomID -> collection path; None means 'no cache, look up on demand'
        self.collections = None
        if loadCollectionData:
            self.collections = self.getAllAtomCollections()

        logging.info("eXist DB connection initialised")


    def __getSchema(self):
        '''
        Lazily build, cache and return the eXist path of the atom/MOLES schema
        @return: path to the schema document in eXist
        '''
        logging.debug("Getting atom schema data")
        if not self.atomSchema:
            self.atomSchema = ec.BASE_COLLECTION_PATH + \
                ndgXqueries.ATOM_MOLES_SCHEMA + '.xsd'

        return self.atomSchema

    AtomSchema = property(fget=__getSchema, doc="Atom schema path")


    def createCollections(self, collections):
        '''
        Create the specified collections in eXist
        @param collections: array of collections to create
        @return True if successful
        '''
        logging.info("Setting up eXist collections")
        for col in collections:
            logging.debug("Creating collection, '%s'", col)
            self.xmldb.createCollection(col)
        logging.info("All collections set up")
        # return the documented value - in line with the create/delete file methods
        return True


    def checkAtomSchemaCompliance(self, atomPath, atom=None):
        '''
        Validate the specified atom in eXist with the atom schemae in eXist
        @param atomPath: path to the atom in eXist
        @keyword atom: if set to an atom, this will be created temporarily in eXist
        - since it may not already exist there.  Once validation is completed, the
        file will be removed from eXist.
        @return array: containing any errors found - NB, if an empty array is returned,
        this indicates successful validation
        @raise SystemError: if the validation query does not return exactly one result
        '''
        logging.info("Validating atom, '%s' against schemae in eXist", atomPath)

        if atom:
            logging.info("Creating temporary file in eXist to do validation against")
            fileName = atom.datasetID + str(datetime.datetime.today().microsecond)
            self.createEXistFile(atom.toPrettyXML(),
                                 atom.getDefaultCollectionPath(), fileName)
            atomPath = atom.getDefaultCollectionPath() + fileName

        # NOTE(review): the query is assembled by string concatenation, so
        # atomPath must not contain double quotes - confirm callers guarantee this
        validationQuery = 'validation:validate-report("' + atomPath + \
            '", xs:anyURI("' + self.AtomSchema + '"))'
        try:
            resultID, result = self.xmldb.executeQuery(validationQuery)
        finally:
            # always tidy up the temporary document - even if the query fails
            if atom:
                logging.info("Deleting temporary file in eXist")
                self.deleteEXistFile(atomPath)

        errorMessage = None
        if result['hits'] == 0:
            errorMessage = "Validation did not complete successfully - please retry"
        elif result['hits'] > 1:
            errorMessage = "More than one atom was validated - expecting only a single atom validation - please retry"

        if errorMessage:
            logging.error(errorMessage)
            raise SystemError(errorMessage)

        doc = self.xmldb.retrieve(resultID, 0)
        et = ET.fromstring(doc)
        status = et.findtext('status')

        # retrieve the error detail if invalid
        errors = []
        if status == 'invalid':
            logging.info("Atom is invalid - details as follows:")
            for error in et.findall('message'):
                lineNo = error.attrib.get('line')
                colNo = error.attrib.get('column')
                level = error.attrib.get('level')
                repeat = error.attrib.get('repeat')
                detail = "%s at line %s, column %s: %s" % (level, lineNo, colNo, error.text)
                if repeat:
                    detail += " (%s times)" % repeat
                # only return basic error message to users - the log file will
                # contain the full error
                errors.append(error.text)
                logging.info(detail)
        else:
            logging.info("Atom is valid")

        logging.info("Validation complete")
        return errors


    def __setUpEXistAtomCollections(self):
        '''
        Set up the required eXist collections needed for running the granulator script
        '''
        logging.info("Ensuring required collections are available in eXist")
        stateCollections = [ec.OLD_COLLECTION_PATH, ec.PUBLISHED_COLLECTION_PATH,
                            ec.SMALL_P_PUBLISHED_COLLECTION_PATH,
                            ec.WORKING_COLLECTION_PATH]
        typeCollections = [ec.DE_COLLECTION_PATH, ec.DEPLOYMENT_COLLECTION_PATH,
                           ec.DEPLOYMENTS_COLLECTION_PATH, ec.GRANULE_COLLECTION_PATH]
        for col in [ec.BASE_COLLECTION_PATH, ec.BACKUP_COLLECTION_PATH]:
            # the top level collection only needs creating once per root
            self.xmldb.createCollection(col)
            for stateCol in stateCollections:
                self.xmldb.createCollection(col + stateCol)
                for typeCol in typeCollections:
                    self.xmldb.createCollection(col + stateCol + typeCol)
        logging.info("Required collections available")


    def __addAtomSchema(self):
        '''
        Add the required atom schema to the atoms collection - to allow validation
        of input atoms
        '''
        logging.info("Adding atom schema to eXist")
        xq = ndgXqueries()
        for schema in [xq.ATOM_SCHEMA, xq.MOLES_SCHEMA, xq.ATOM_MOLES_SCHEMA]:
            xml = xq.getSchema(schema)
            self.createEXistFile(xml, ec.BASE_COLLECTION_PATH, schema + '.xsd')
        logging.info("- schema added")


    def __loadDBDetails(self, configFile):
        '''
        Retrieve info from the eXist db config file; the first three whitespace
        separated entries are taken as the hostname, username and password
        @param configFile: path to the config file to read
        @raise ValueError: if the file cannot be found or holds fewer than
        three entries
        '''
        logging.info("Loading DB config data")
        # Check this file exists
        if not os.path.isfile(configFile):
            errorMessage = "Could not find the DB config file, %s; please make sure this " \
                     "is available from the running directory" % configFile
            logging.error(errorMessage)
            raise ValueError(errorMessage)

        # make sure the file handle is released, whatever happens during the read
        dbinfo_file = open(configFile, "r")
        try:
            dbinfo = dbinfo_file.read().split()
        finally:
            dbinfo_file.close()

        if len(dbinfo) < 3:
            errorMessage = 'Incorrect data in DB config file'
            logging.error(errorMessage)
            raise ValueError(errorMessage)
        self.eXistDBHostname = dbinfo[0]
        self._username = dbinfo[1]
        self._pw = dbinfo[2]
        logging.info("DB config data loaded")


    def __lookupEXistFile(self, docPath):
        '''
        Look up a file in eXist using XPath
        @param docPath: path to doc to look up
        @return: id returned from query, with which to retrieve doc; if doc doesn't exist, return None
        '''
        logging.info("Retrieving info for file, '%s'", docPath)

        # NOTE(review): docPath is concatenated straight into the query, so it
        # must not contain double quotes - confirm callers guarantee this
        resultID, doc = self.xmldb.executeQuery('doc("' + docPath + '")')

        if doc['hits'] == 0:
            logging.info("File does not exist in eXist DB")
            return None
        logging.info("Found file - returning result ID")
        return resultID


    def getEXistFile(self, docPath):
        '''
        Use XQuery to retrieve the specified document from eXist
        @param docPath: the path of the doc to retrieve
        @return: contents of document if exists, None otherwise
        '''
        resultID = self.__lookupEXistFile(docPath)

        # NB, a result ID of 0 is valid - so cannot just test for truthiness
        if not resultID and resultID != 0:
            logging.info("No file found - nothing to retrieve")
            return None

        logging.info("Found file - now retrieving content")
        return self.xmldb.retrieve(resultID, 0)


    def isNewEXistFile(self, docPath):
        '''
        Check whether a file is absent from the eXist DB
        @param docPath: path of file in eXist to check
        @return: True if the file does not exist in eXist, False if it does
        '''
        logging.info("Checking if file, '%s', exists in eXist DB", docPath)

        resultID = self.__lookupEXistFile(docPath)

        # NB, a result ID of 0 is valid - use the same check as getEXistFile so
        # an existing file is never reported as new
        if not resultID and resultID != 0:
            return True

        return False


    def __addTimeStamp(self, fileName):
        '''
        Add timestamp to input filename
        NB, this assumes there is a file type identifier at the end of the filename; if so, the datestamp
        is included before this; if not it is just added at the end
        @param fileName: name of file to add the timestamp to
        @return: filename including the timestamp
        '''
        bits = fileName.rsplit(".", 1)
        stamped = bits[0] + "_" + datetime.datetime.today().strftime("%Y-%m-%dT%H_%M_%S")

        if len(bits) > 1:
            stamped += "." + bits[1]
        return stamped


    def backupEXistFile(self, collection, fileName):
        '''
        Backup a file that exists in the eXist DB
        - NB, this really just creates a new file with the same contents in a
        backup dir
        @param collection: path of the collection to store the file in
        @param fileName: name of file to add in eXist
        @return: path to new backup file
        @raise SystemError: if the contents of the file to back up cannot be retrieved
        '''
        if not collection.endswith('/'):
            collection += '/'

        docPath = collection + fileName
        logging.info("Backing up file, '%s', in eXist DB", docPath)

        logging.debug("Firstly, retrieve file contents from eXist")
        doc = self.getEXistFile(docPath)
        if not doc:
            errorMessage = "Could not retrieve file contents (%s) to backup - exiting." % docPath
            logging.error(errorMessage)
            raise SystemError(errorMessage)

        # Now adjust the collection to map to the backup dir
        collection = collection.replace(ec.BASE_COLLECTION_PATH, ec.BACKUP_COLLECTION_PATH)
        collection = collection.replace(ec.NDG_A_COLLECTION_PATH, ec.NDG_A_COLLECTION_PATH_BACKUP)

        # add timestamp to filename
        fileName = self.__addTimeStamp(fileName)
        docPath = collection + fileName

        logging.debug("Now creating backup file, '%s'", fileName)
        self.createEXistFile(doc, collection, fileName)

        logging.info("File backed up in eXist")
        return docPath


    def createEXistFile(self, xml, collection, fileName):
        '''
        Add the input file to the eXist DB
        @param xml: contents of xml file to create in eXist
        @param collection: path of the collection to store the file in
        @param fileName: name of file to add in eXist
        @return: True, if file created successfully
        @raise SystemError: if the store command does not report success
        '''
        logging.info("Adding file, '%s' to eXist DB collection, '%s'",
                     fileName, collection)
        logging.debug("data: %s", xml)

        # create the collection, in case it doesn't already exist - NB, this
        # won't overwrite anything
        self.createCollections([collection])
        # NOTE(review): a '/' is always inserted here, so collections passed in
        # with a trailing '/' produce a '//' in the stored path - presumably
        # eXist tolerates this; confirm before normalising
        status = self.xmldb.storeXML(xml, collection + "/" + fileName, overwrite=1)
        if not status:
            errorMessage = "Command to create file in eXist did not complete successfully - exiting"
            logging.error(errorMessage)
            raise SystemError(errorMessage)

        logging.info("File added to eXist")
        return True


    def deleteEXistFile(self, docPath):
        '''
        Delete the input file from eXist DB
        @param docPath: path of document to delete
        @return: True, if file deleted successfully
        @raise SystemError: if the delete command does not report success
        '''
        logging.info("Deleting file, '%s', from eXist DB", docPath)

        status = self.xmldb.removeDoc(docPath)
        if not status:
            errorMessage = "Command to delete file in eXist did not complete successfully - exiting"
            logging.error(errorMessage)
            raise SystemError(errorMessage)

        logging.info("File deleted from eXist")
        return True


    def createOrUpdateEXistFile(self, xml, collection, fileName):
        '''
        Check if a file already exists in eXist; if it does, run an
        update (which will backup the existing file), otherwise create
        the file in eXist
        @param xml: contents of xml file to create/update in eXist
        @param collection: path of the collection to store the file in
        @param fileName: name of file to add in eXist
        '''
        logging.info("Creating or updating file in eXist...")
        # build the lookup path in the same way as backupEXistFile - otherwise a
        # collection without a trailing '/' would never match the existing file
        docPath = collection
        if not docPath.endswith('/'):
            docPath += '/'
        docPath += fileName

        if not self.isNewEXistFile(docPath):
            self.backupEXistFile(collection, fileName)

        self.createEXistFile(xml, collection, fileName)


    def getAllAtomIDs(self):
        '''
        Retrieve all the atom IDs in the atoms directory - NB, this can
        be a quick way of producing a cache of data to check - e.g. to avoid
        multiple calls to getAtomFileCollectionPath
        @return: ids - array of all atom IDs
        '''
        logging.info("Retrieving all atom ids")
        xq = ndgXqueries().actual('atomList', '/db/atoms', '', '')
        resultID, doc = self.xmldb.executeQuery(xq)
        if doc['hits'] == 0:
            return []

        doc = self.xmldb.retrieve(resultID, 0)
        et = ET.fromstring(doc)
        ids = [member.findtext('{http://www.w3.org/2005/Atom}repositoryID')
               for member in et]
        logging.debug("Found ids, '%s'", ids)
        return ids


    def getAllAtomCollections(self):
        '''
        Get all atom collection paths and store in a dictionary - for easy
        reference when doing lots of things at once
        @return: dict with key/val of atomID/collectionPath - NB, this is empty
        if no atoms are found
        '''
        logging.info("Retrieving all atom collection paths")
        xq = ndgXqueries().actual('atomList', '/db/atoms', '', '')
        resultID, doc = self.xmldb.executeQuery(xq)
        if doc['hits'] == 0:
            # NB, must be a dict - the result is cached by the constructor and
            # then used via dict lookups
            return {}

        doc = self.xmldb.retrieve(resultID, 0)
        et = ET.fromstring(doc)
        colData = {}
        for member in et:
            filePath = member.findtext('{http://www.w3.org/2005/Atom}fileName')
            if not filePath:
                # entry without a file name - nothing to index it by
                continue
            pathBits = filePath.split('/')
            atomID = pathBits[-1].split('.')[0]
            colData[atomID] = '/'.join(pathBits[0:-1])

        logging.debug("Finished looking up atom paths")
        return colData


    def getAtomFileCollectionPath(self, atomID):
        '''
        Given an atom id, determine and return the collection path in eXist
        of the associated atom file
        @param atom: atom id to look up
        @return: collection path, if it exists, None, otherwise
        '''
        logging.info("Looking up collection path for atom ID, '%s'", atomID)
        xq = ndgXqueries()['atomFullPath']
        xq = xq.replace('TargetCollection', ec.BASE_COLLECTION_PATH)
        xq = xq.replace('LocalID', atomID)

        resultID, doc = self.xmldb.executeQuery(xq)
        if doc['hits'] == 0:
            logging.info("No document found with the specified ID")
            return None

        doc = self.xmldb.retrieve(resultID, 0, {})

        docET = ET.fromstring(doc)
        collPath = docET.text + '/'
        logging.debug("Found collection path, '%s'", collPath)
        return collPath


    def createAtomInExist(self, atom):
        '''
        Create an atom in the eXist DB; if the atom already exists there, the
        existing file is backed up first and the atom's updated date is reset
        @param atom: atom object to create in the DB
        @return: the atom that was created
        '''
        logging.info("Creating atom in eXist")

        # if the atom has no dataset ID, generate and add one
        # NB, this should only be the case when the atom is being created
        # via the web interface
        isNew = False
        if not atom.datasetID:
            isNew = True
            atom.setDatasetID(atom.atomTypeID + '_' + str(uuid.uuid1()))

        eXistCollection = None
        if self.collections is not None: # cope with empty dict
            eXistCollection = self.collections.get(atom.datasetID)
        else:
            eXistCollection = self.getAtomFileCollectionPath(atom.datasetID)

        # if collection not found, assume we're dealing with a new atom; get its
        # default collection
        if not eXistCollection:
            eXistCollection = atom.getDefaultCollectionPath()
        elif isNew:
            # in this situation we're trying to create an atom with the same
            # name via the web interface - this can't be allowed - so retry to
            # generate a new ID; NB, pass back the result so that the caller
            # still receives the created atom
            atom.datasetID = None
            return self.createAtomInExist(atom)
        # create backup of atom if it already exists
        else:
            self.backupEXistFile(eXistCollection, atom.atomName)

            # also change updated date to current time
            atom.updatedDate = datetime.datetime.today().strftime("%Y-%m-%dT%H:%M:%SZ")

        self.createEXistFile(atom.toPrettyXML(), eXistCollection, atom.atomName)
        logging.info("Atom created in eXist")
        return atom
Note: See TracBrowser for help on using the repository browser.