source: exist/trunk/python/ndgUtils/models/existdbclient.py @ 4419

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/exist/trunk/python/ndgUtils/models/existdbclient.py@4419
Revision 4419, 14.7 KB checked in by cbyrom, 11 years ago (diff)

Add method to atom to allow ingest of CSML data + add inputs check for
dbclient method.

Line 
1'''
2 Class supporting set up and communication with eXist DB
3 for the purposes of creating and updating atoms
4 
5 @author: C Byrom - Tessella 08
6'''
7import os, sys, logging, datetime
8from ndgUtils.eXistInterface import ndg_eXist
9from ndgUtils.eXistConnector import eXistConnector as ec
10from ndgUtils.ndgXqueries import ndgXqueries
11import uuid
12
13try:
14    from xml.etree import ElementTree as ET
15except ImportError:
16    try:
17        import ElementTree as ET
18    except ImportError:
19        import elementtree.ElementTree as ET
20
class eXistDBClient:
    '''
    Client for an eXist DB, used to create, look up, back up and delete
    documents (in particular atoms) via an ndg_eXist connection held in
    self.xmldb
    '''

    def __init__(self, configFile = None, eXistDBHostname = None, loadCollectionData=False):
        '''
        Initialise a connection to the eXistDB
        @keyword configFile: path of DB config file; passed on to ndg_eXist as
        its 'passwordFile' input.  If no hostname is given, the hostname is
        read from the first entry in this file
        @keyword eXistDBHostname: DB host to use; if set, it is used explicitly
        and the config file is not parsed here
        @keyword loadCollectionData: if True, look up all atom collection paths
        now and cache them in self.collections
        '''
        logging.info("Initialising connection to eXist DB")
        self.eXistDBHostname = eXistDBHostname
        # these are only populated if _loadDBDetails is run, below; define
        # them here so that they always exist on the instance
        self._username = None
        self._pw = None
        logging.debug("- connecting to DB, '%s', with config file, '%s'",
                      eXistDBHostname or 'Default', configFile or 'Default')
        inputs = {}

        # NB, there are two routes through here: if a config file is specified
        # without a hostname, the host will be taken to be the first entry in
        # the config file; if a hostname is specified, it will be used explicitly
        if configFile:
            if not self.eXistDBHostname:
                self._loadDBDetails(configFile)
            inputs['passwordFile'] = configFile

        if self.eXistDBHostname:
            inputs['db'] = self.eXistDBHostname

        # Now set up the connection
        logging.debug(inputs)
        self.xmldb = ndg_eXist(**inputs)

        # set up any collections required - NB, if these already exist they won't cause any files to be lost
        self._setUpEXistAtomCollections()

        # cache of atomID -> collection path; None means 'no cache loaded'
        self.collections = None
        if loadCollectionData:
            self.collections = self.getAllAtomCollections()

        logging.info("eXist DB connection initialised")


    def createCollections(self, collections):
        '''
        Create the specified collections in eXist
        @param collections: array of collections to create
        @return True if successful
        '''
        logging.info("Setting up eXist collections")
        for col in collections:
            logging.debug("Creating collection, '%s'", col)
            self.xmldb.createCollection(col)
        logging.info("All collections set up")
        return True


    def _setUpEXistAtomCollections(self):
        '''
        Set up the required eXist collections needed for running the granulator script
        - for both the base and the backup collection, create each of the
        state collections and, beneath each of those, the per-type collections
        '''
        logging.info("Ensuring required collections are available in eXist")
        stateCollections = [ec.OLD_COLLECTION_PATH, ec.PUBLISHED_COLLECTION_PATH,
                            ec.SMALL_P_PUBLISHED_COLLECTION_PATH, ec.WORKING_COLLECTION_PATH]
        typeCollections = [ec.DE_COLLECTION_PATH, ec.DEPLOYMENT_COLLECTION_PATH,
                           ec.DEPLOYMENTS_COLLECTION_PATH, ec.GRANULE_COLLECTION_PATH]

        for rootCol in [ec.BASE_COLLECTION_PATH, ec.BACKUP_COLLECTION_PATH]:
            # the root only needs creating once, not once per state collection
            self.xmldb.createCollection(rootCol)
            for stateCol in stateCollections:
                self.xmldb.createCollection(rootCol + stateCol)
                for typeCol in typeCollections:
                    self.xmldb.createCollection(rootCol + stateCol + typeCol)
        logging.info("Required collections available")


    def _loadDBDetails(self, configFile):
        '''
        Retrieve info from the eXist db config file
        - the file is split on whitespace and the first three entries are
        taken to be hostname, username and password, respectively
        @param configFile: path of the DB config file
        @raise ValueError: if the file cannot be found or has fewer than
        three entries
        '''
        logging.info("Loading DB config data")
        # Check this file exists
        if not os.path.isfile(configFile):
            errorMessage = "Could not find the DB config file, %s; please make sure this " \
                     "is available from the running directory" %configFile
            logging.error(errorMessage)
            raise ValueError(errorMessage)

        # NB, ensure the file handle is released even if the read fails
        dbinfo_file = open(configFile, "r")
        try:
            dbinfo = dbinfo_file.read().split()
        finally:
            dbinfo_file.close()

        if len(dbinfo) < 3:
            errorMessage = 'Incorrect data in DB config file'
            logging.error(errorMessage)
            raise ValueError(errorMessage)
        self.eXistDBHostname = dbinfo[0]
        self._username = dbinfo[1]
        self._pw = dbinfo[2]
        logging.info("DB config data loaded")


    def _lookupEXistFile(self, docPath):
        '''
        Look up a file in eXist using XPath
        @param docPath: path to doc to look up
        @return: id returned from query, with which to retrieve doc; if doc doesn't exist, return None
        '''
        logging.info("Retrieving info for file, '%s'", docPath)

        # NOTE(review): docPath is spliced into the query unescaped, so a path
        # containing a double quote will produce a malformed query - confirm
        # whether callers can ever pass such a path
        result = self.xmldb.executeQuery('doc("' + docPath + '")')

        if result[1]['hits'] == 0:
            logging.info("File does not exist in eXist DB")
            return None
        logging.info("Found file - returning result ID")
        return result[0]


    def getEXistFile(self, docPath):
        '''
        Use XQuery to retrieve the specified document from eXist
        @param docPath: the path of the doc to retrieve
        @return: contents of document if exists, None otherwise
        '''
        resultID = self._lookupEXistFile(docPath)

        # NB, compare against None explicitly - a result ID of 0 would
        # otherwise be mistaken for a missing file
        if resultID is None:
            logging.info("No file found - nothing to retrieve")
            return None

        logging.info("Found file - now retrieving content")
        return self.xmldb.retrieve(resultID, 0)


    def isNewEXistFile(self, docPath):
        '''
        Check whether a file is new, i.e. not yet present in the eXist DB
        @param docPath: path of file in eXist to check for
        @return: True if no file exists at docPath, False otherwise
        '''
        logging.info("Checking if file, '%s', exists in eXist DB", docPath)

        # NB, _lookupEXistFile returns None when nothing is found; any other
        # value (including 0) is a result ID
        return self._lookupEXistFile(docPath) is None


    def _addTimeStamp(self, fileName):
        '''
        Add timestamp to input filename
        NB, this assumes there is a file type identifier at the end of the filename; if so, the datestamp
        is included before this; if not it is just added at the end
        @param fileName: name of file to add timestamp to
        @return: file name with '_<timestamp>' added
        '''
        bits = fileName.rsplit(".", 1)
        stampedName = bits[0] + "_" + datetime.datetime.today().strftime("%Y-%m-%dT%H_%M_%S")

        # put the file extension, if there was one, back on the end
        if len(bits) > 1:
            stampedName += "." + bits[1]
        return stampedName


    def backupEXistFile(self, collection, fileName):
        '''
        Backup a file that exists in the eXist DB
        - NB, this really just creates a new file with the same contents in a
        backup dir
        @param collection: path of the collection the file is currently in
        @param fileName: name of file to back up
        @return: path to new backup file
        @raise SystemError: if the contents of the file could not be retrieved
        or the backup file could not be created
        '''
        if not collection.endswith('/'):
            collection += '/'

        docPath = collection + fileName
        logging.info("Backing up file, '%s', in eXist DB", docPath)

        logging.debug("Firstly, retrieve file contents from eXist")
        doc = self.getEXistFile(docPath)
        if not doc:
            errorMessage = "Could not retrieve file contents (%s) to backup - exiting." %docPath
            logging.error(errorMessage)
            raise SystemError(errorMessage)

        # Now adjust the collection to map to the backup dir
        collection = collection.replace(ec.BASE_COLLECTION_PATH, ec.BACKUP_COLLECTION_PATH)
        collection = collection.replace(ec.NDG_A_COLLECTION_PATH, ec.NDG_A_COLLECTION_PATH_BACKUP)

        # add timestamp to filename
        fileName = self._addTimeStamp(fileName)
        docPath = collection + fileName

        logging.debug("Now creating backup file, '%s'", fileName)
        self.createEXistFile(doc, collection, fileName)

        logging.info("File backed up in eXist")
        return docPath


    def createEXistFile(self, xml, collection, fileName):
        '''
        Add the input file to the eXist DB
        @param xml: contents of xml file to create in eXist
        @param collection: path of the collection to store the file in
        @param fileName: name of file to add in eXist
        @return: True, if file created successfully
        @raise SystemError: if the store command does not report success
        '''
        logging.info("Adding file, '%s' to eXist DB collection, '%s'",
                     fileName, collection)
        logging.debug("data: %s", xml)

        # create the collection, in case it doesn't already exist - NB, this won't overwrite anything
        self.createCollections([collection])

        # NB, collections may be passed in with or without a trailing slash
        # - strip any before joining to avoid a '//' in the doc path
        docPath = collection.rstrip('/') + "/" + fileName
        status = self.xmldb.storeXML(xml, docPath, overwrite=1)
        if not status:
            errorMessage = "Command to create file in eXist did not complete successfully - exiting"
            logging.error(errorMessage)
            raise SystemError(errorMessage)

        logging.info("File added to eXist")
        return True


    def deleteEXistFile(self, docPath):
        '''
        Delete the input file from eXist DB
        @param docPath: path of document to delete
        @return: True, if file deleted successfully
        @raise SystemError: if the delete command does not report success
        '''
        logging.info("Deleting file, '%s', from eXist DB", docPath)

        status = self.xmldb.removeDoc(docPath)
        if not status:
            errorMessage = "Command to delete file in eXist did not complete successfully - exiting"
            logging.error(errorMessage)
            raise SystemError(errorMessage)

        logging.info("File deleted from eXist")
        return True


    def createOrUpdateEXistFile(self, xml, collection, fileName):
        '''
        Check if a file already exists in eXist; if it does, run an
        update (which will backup the existing file), otherwise create
        the file in eXist
        @param xml: contents of xml file to create/update in eXist
        @param collection: path of the collection to store the file in
        @param fileName: name of file to add in eXist
        '''
        logging.info("Creating or updating file in eXist...")

        # NB, the existence check must look at the same path the file is
        # stored at, so add the path separator if the collection lacks one
        if collection.endswith('/'):
            docPath = collection + fileName
        else:
            docPath = collection + '/' + fileName

        if not self.isNewEXistFile(docPath):
            self.backupEXistFile(collection, fileName)

        self.createEXistFile(xml, collection, fileName)


    def getAllAtomIDs(self):
        '''
        Retrieve all the atom IDs in the atoms directory - NB, this can
        be a quick way of producing a cache of data to check - e.g. to avoid
        multiple calls to getAtomFileCollectionPath
        @return: ids - array of all atom IDs
        '''
        logging.info("Retrieving all atom ids")
        # NB, the collection path must be a quoted string literal; the ID
        # returned is the part of the atom id following the '__ATOM__' token
        xq = "declare default element namespace 'http://www.w3.org/2005/Atom'; " \
             "for $ID in collection('/db/atoms')/entry/id " \
             "return <id>{tokenize(string($ID), '__ATOM__')[2]}</id>"

        resultID, summary = self.xmldb.executeQuery(xq)

        ids = []
        for i in range(summary['hits']):
            hit = self.xmldb.retrieve(resultID, i, {})
            ids.append(ET.fromstring(hit).text)
        logging.debug("Found ids, '%s'", ids)
        return ids


    def getAllAtomCollections(self):
        '''
        Get all atom collection paths and store in a dictionary - for easy
        reference when doing lots of things at once
        @return: dict with key/val of atomID/collectionPath - where the atomID
        is the document name up to its first '.' and the collection path has
        no trailing slash
        '''
        logging.info("Retrieving all atom collection paths")

        # NB, we get all data back in one field here since otherwise eXist complains
        # that the returned dataset is too large and falls over
        xq = "declare default element namespace 'http://www.w3.org/2005/Atom'; " \
             "for $DE in collection('/db/atoms')/entry/id let $f:=util:document-name($DE) return " \
             "<fileName>{util:collection-name($DE)}/{$f}</fileName>"

        resultID, summary = self.xmldb.executeQuery(xq)
        colData = {}
        for i in range(summary['hits']):
            hit = self.xmldb.retrieve(resultID, i, {})
            fullPath = ET.fromstring(hit).text
            # split 'collection/path/docName.ext' at the last slash
            colPath, _sep, docName = fullPath.rpartition('/')
            colData[docName.split('.')[0]] = colPath
        logging.debug("Finished looking up atom paths")
        return colData


    def getAtomFileCollectionPath(self, atomID):
        '''
        Given an atom id, determine and return the collection path in eXist
        of the associated atom file
        @param atomID: atom id to look up
        @return: collection path (with trailing slash), if it exists, None, otherwise
        '''
        logging.info("Looking up collection path for atom ID, '%s'", atomID)
        # fill in the placeholders of the stored 'atomFullPath' query
        xq = ndgXqueries()['atomFullPath']
        xq = xq.replace('TargetCollection', ec.BASE_COLLECTION_PATH)
        xq = xq.replace('LocalID', atomID)

        resultID, summary = self.xmldb.executeQuery(xq)
        if summary['hits'] == 0:
            logging.info("No document found with the specified ID")
            return None

        hit = self.xmldb.retrieve(resultID, 0, {})

        collPath = ET.fromstring(hit).text + '/'
        logging.debug("Found collection path, '%s'", collPath)
        return collPath


    def createAtomInExist(self, atom):
        '''
        Create an atom in the eXist DB
        - if the atom already exists in the DB, the existing file is backed up
        first and the atom's updated date is set to the current time
        @param atom: atom object to create in the DB
        @return: the atom that was created
        '''
        logging.info("Creating atom in eXist")

        # if the atom has no dataset ID, generate and add one
        # NB, this should only be the case when the atom is being created
        # via the web interface
        isNew = False
        if not atom.datasetID:
            isNew = True
            atom.setDatasetID(atom.atomTypeID + '_' + str(uuid.uuid1()))

        # use the cached collection data, if loaded, to avoid a DB lookup
        eXistCollection = None
        if self.collections is not None: # cope with empty dict
            eXistCollection = self.collections.get(atom.datasetID)
        else:
            eXistCollection = self.getAtomFileCollectionPath(atom.datasetID)

        # if collection not found, assume we're dealing with a new atom; get its
        # default collection
        if not eXistCollection:
            eXistCollection = atom.getDefaultCollectionPath()
        elif isNew:
            # in this situation we're trying to create an atom with the same
            # name via the web interface - this can't be allowed - so retry to
            # generate a new ID
            # NB, pass the result of the retry back so that the caller still
            # receives the created atom
            atom.datasetID = None
            return self.createAtomInExist(atom)
        # create backup of atom if it already exists
        else:
            self.backupEXistFile(eXistCollection, atom.atomName)

            # also change updated date to current time
            # NOTE(review): today() is local time but the stamp is suffixed
            # with 'Z' - confirm whether UTC is intended here
            atom.updatedDate = datetime.datetime.today().strftime("%Y-%m-%dT%H:%M:%SZ")

        self.createEXistFile(atom.toPrettyXML(), eXistCollection, atom.atomName)
        logging.info("Atom created in eXist")
        return atom
Note: See TracBrowser for help on using the repository browser.