source: ndgCommon/trunk/ndg/common/src/clients/xmldb/eXist/atomclient.py @ 5184

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/ndgCommon/trunk/ndg/common/src/clients/xmldb/eXist/atomclient.py@5184
Revision 5184, 15.0 KB checked in by cbyrom, 11 years ago (diff)

Simplify backups structure in eXist - putting all backups under a
single toplevel structure.

Line 
1'''
2 Clients for providing functionality to support Atom  document usage with eXist DB
3 - NB, extends the CRUDClient functionality
4 
5 @author: C Byrom - Tessella, Feb 09
6'''
7import os, sys, logging, datetime, uuid
8from xml.etree import cElementTree as ET
9from ndg.common.src.models.Atom import Atom
10from ndg.common.src.models import AtomState
11from ndg.common.src.clients.xmldb.abstractxmldbatomclient import *
12from crudclient import CRUDClient
13from ndg.common.src.clients.xmldb.eXist.atomdbinitialiser import AtomDBInitialiser
14from ndg.common.src.clients.xmldb.eXist.feedclient import FeedClient as feedClient
15import dbconstants as dc
16from ndg.common.src.lib.ndgresources import ATOM_MOLES_SCHEMA
17
18
class AtomClient(CRUDClient, AbstractXMLDBAtomClient):
    '''
    Client for creating, validating and changing the publication state of
    Atom documents held in an eXist DB.  Extends the CRUDClient functionality
    and owns a feed client so that published atoms can be added to the feeds.
    '''

    def __init__(self, dbHostName = 'bora.badc.rl.ac.uk', configFileName ='passwords.txt',
                 clientType = dc.XML_RPC_CLIENT, loadCollectionData = False,
                 setUpDB = False):
        '''
        Set up an atom client for eXist DB
        @keyword dbHostName: eXist host to use - defaults to 'bora.badc.rl.ac.uk'
        @keyword configFileName: password file to use - NB, this should
        have contents of format (NB, space delimiter):
        dbName userID password
        Default is 'passwords.txt'.
        @keyword clientType: Type of client to use for eXist connection.  Currently
        only supported by 'xmlrpc' - which is the default.
        @keyword loadCollectionData: preload info on all the eXist
        collections, if True (default False)
        @keyword setUpDB: if True, create the basic collection structure and ingest the
        atom schemas.  Default is False.
        '''
        logging.debug("Initialising AtomClient")
        super(AtomClient, self).__init__(dbHostName = dbHostName,
                                         configFileName = configFileName,
                                         clientType = clientType)

        # set up feed client, too - NB, info should be added to the feed as it is added
        # to the eXist collections, so the standard dbclient and the feed client
        # are intrinsically linked
        self.feedClient = feedClient(self.client.auth,
                                     dbHostName = self.client.host,
                                     eXistPortNo = self.client.port)

        if setUpDB:
            # set up the eXist DB appropriately for use with ndgCommon
            # NB, initialiser will set up feeds structure - so need to have
            # defined a feedclient by this point
            initialiser = AtomDBInitialiser(self)
            initialiser.initialise()

        if loadCollectionData:
            self.__getAllAtomCollections()

        logging.debug("AtomClient initialised")


    def __getAllAtomCollections(self):
        '''
        Look up the paths of all atoms and store them, on self.atomCollections,
        as a dictionary - for easy reference when doing lots of things at once.
        The dictionary has key/val of filename (without extension)/collection
        path (without trailing '/').
        NB, if the lookup returns nothing, self.atomCollections is left untouched.
        @return: None - the result is stored on self.atomCollections
        '''
        logging.info("Retrieving all atom collection paths")

        docs = self.buildAndRunQuery('atomList', dc.ATOM_COLLECTION_PATH,
                                     '', '', useChunked = True)

        if not docs:
            return

        et = ET.fromstring(docs[0])
        colData = {}

        for member in et:
            fullPath = member.findtext('{http://www.w3.org/2005/Atom}fileName')
            if not fullPath:
                # no fileName element (or an empty one) - nothing to index
                logging.warning("Atom list entry has no fileName - skipping")
                continue

            # split 'collection/path/name.ext' into the collection path and
            # the file name with its final extension dropped
            collectionPath, _, docName = fullPath.rpartition('/')
            fileName = docName.rpartition('.')[0]
            colData[fileName] = collectionPath

        logging.debug("Finished looking up atom paths")
        self.atomCollections = colData


    def checkAtomSchemaCompliance(self, atomPath, atom = None, isDebug = False):
        '''
        Validate the specified atom in eXist with the atom schemas in eXist
        @param atomPath: path to the atom in eXist
        @keyword atom: if set to an atom, this will be created temporarily in eXist
        - since it may not already exist there.  Once validation is completed, the
        file will be removed from eXist.
        @keyword isDebug: if True, return full error details, otherwise only return
        a summary
        @raise SystemError: if errors encountered whilst running the schema validation
        @return array: containing any errors found - NB, if an empty array is returned,
        this indicates successful validation
        '''
        logging.info("Validating atom, '%s' against schemas in eXist" %atomPath)

        # get path to schemas, if not already set
        if not self.atomSchema:
            self.atomSchema = '%s/%s.xsd' %(dc.SCHEMAS_COLLECTION_PATH, ATOM_MOLES_SCHEMA)

        # path to temp file, if we create one
        tempAtomPath = None

        errorMessage = None
        result = None
        try:
            if atom:
                logging.info("Creating temporary file in eXist to do validation against")
                # microsecond suffix keeps the temp name distinct from the real atom
                fileName = atom.datasetID + str(datetime.datetime.today().microsecond)
                self.createDoc(atom.toPrettyXML(),
                               atom.getDefaultCollectionPath(),
                               fileName)
                tempAtomPath = atom.getDefaultCollectionPath() + fileName
                atomPath = tempAtomPath

            validationQuery = 'validation:validate-report("' + atomPath + \
                '", xs:anyURI("' + self.atomSchema + '"))'
            logging.debug("Running validation, '%s'" %validationQuery)

            result = self.client.runQuery(validationQuery)
            if not result:
                errorMessage = "Validation did not complete successfully - please retry"
            elif len(result) > 1:
                errorMessage = "More than one atom was validated - expecting " + \
                    "only a single atom validation - please retry"

        except Exception:
            # sys.exc_info() rather than 'except Exception, e' - this form is
            # valid on every python version
            e = sys.exc_info()[1]
            errorMessage = "Error encountered whilst validating atom: '%s'" \
                %(getattr(e, 'message', None) or e)
        finally:
            # always tidy up the temporary document - even if interrupted
            if tempAtomPath:
                logging.info("Deleting temporary file in eXist")
                self.deleteDoc(tempAtomPath)

        # avoid further processing if errors have been encountered here
        if errorMessage:
            logging.error(errorMessage)
            raise SystemError(errorMessage)

        # escape anything that may cause problems when ingesting into ET
        doc = result[0].replace('<xsd:schema>', '&lt;xsd:schema&gt;')
        et = ET.fromstring(doc)
        status = et.findtext('status')

        # retrieve the error detail if invalid
        errors = []
        if status == 'invalid':
            logging.info("Atom is invalid - details as follows:")
            for error in et.findall('message'):
                lineNo = error.attrib.get('line')
                colNo = error.attrib.get('column')
                level = error.attrib.get('level')
                repeat = error.attrib.get('repeat')
                # a message element may have no text at all
                errorText = error.text or ''
                # remove the meaningless error type from message - i.e.
                # everything up to and including the first ':'
                if errorText.startswith('cvc-'):
                    errorText = errorText.partition(':')[2]
                errorMessage = "%s at line %s, column %s: %s" %(level, lineNo, colNo, errorText)
                if repeat:
                    errorMessage += " (%s times)" %repeat

                if isDebug:
                    errors.append(errorMessage)
                else:
                    errors.append(errorText)
                logging.info(errorMessage)
        else:
            logging.info("Atom is valid")

        logging.info("Validation complete")
        return errors


    def getAtomPublicationState(self, atomID, providerID):
        '''
        Retrieve the publication state of the specified atom - by
        checking the collection it is in
        @param atomID: atom id to look up
        @param providerID: provider ID for the atom data
        @return: AtomState for the atom.  NB, if the ID is not found, assume
        we're dealing with a new atom and set the state as the working state
        '''
        logging.debug("Finding atom publication state")
        path = self.__getAtomFileCollectionPath(atomID, providerID)

        # NB, path is None when no atom with this ID exists - in which case
        # there is nothing to match against and we fall through to the default
        if path:
            for state in AtomState.allStates.values():
                if ('/%s' %state.collectionPath) in path:
                    logging.debug("- state found: '%s'" %state.title)
                    return state

        logging.debug("- state not found - returning WORKING state")
        return AtomState.WORKING_STATE


    def changeAtomPublicationState(self, atom, newState):
        '''
        Adjust the publication state of an atom in the XML DB
        @param atom: the Atom data model of the atom whose publication state
        needs to change
        @param newState: an AtomState object representing the new publication
        state of the atom
        @return atom: atom data model with updated state
        '''
        logging.info("Changing the publication state of atom - from '%s' to '%s'" \
                     %(atom.state.title, newState.title))
        oldState = atom.state
        # firstly, create atom in new publication state collection - so data isn't
        # lost if this fails
        atom.state = newState
        try:
            self.createDoc(atom.toPrettyXML(), atom.getDefaultCollectionPath(),
                           atom.atomName)
        finally:
            # revert to the old state whether or not the create worked: on
            # failure the model must not claim a state the DB doesn't have, and
            # on success the old state is needed to delete the original atom
            atom.state = oldState

        # now delete atom in the old publication state
        self.deleteAtom(atom)
        logging.info("- atom created in new publication state and removed from old one")
        atom.state = newState

        # update feeds + create DIFs, if needed
        if atom.isPublished():
            self.__runAsynchAtomPublish(atom)

        return atom


    def createAtom(self, atom, replaceAtom = True, runAsynch = True):
        '''
        Create an atom in the eXist DB - using the atom contents to work out
        the location + data set ID
        @param atom: atom object to create in the DB
        @keyword replaceAtom: if False and the atom is already available in eXist
        then raise a DuplicateError.  Default is True.
        @keyword runAsynch: if True, if a backup of an existing file, do this
        asynchronously in a separate thread + do the feed publishing and DIF
        creating in a separate thread, too.  Default is True.
        @raise ValueError: if the input is not an Atom object
        @raise DuplicateError: if the atom already exists and replaceAtom is False
        @return atom: the atom data model, as created in eXist
        '''
        logging.info("Creating atom in eXist")
        if not atom or not isinstance(atom, Atom):
            raise ValueError("Input object is not an Atom object - cannot create in eXist")

        # if the atom has no dataset ID, generate and add one
        # NB, this should only be the case when the atom is being created
        # via the web interface
        isNew = False
        if not atom.datasetID:
            isNew = True
            atom.setDatasetID(atom.atomTypeID + '_' + str(uuid.uuid1()))

        eXistCollection = None
        if self.atomCollections: # if we've pre-loaded the collection info, use this
            # NOTE(review): pre-loaded paths have no trailing '/', whereas the
            # looked up path below does - confirm createDoc/backupDoc accept both
            eXistCollection = self.atomCollections.get(atom.datasetID)
        else:
            eXistCollection = self.__getAtomFileCollectionPath(atom.datasetID,
                                                               atom.ME.providerID)

        # if collection not found, assume we're dealing with a new atom; get its
        # default collection
        if not eXistCollection:
            eXistCollection = atom.getDefaultCollectionPath()

            # check if we need a new provider feed set up
            providerCollection = dc.PROVIDER_FEED_PATH + atom.ME.providerID + '/'
            if self.isNewCollection(providerCollection):
                logging.info("Creating feed for new provider ID")
                self.createCollections([providerCollection])
                feedTitle = self.feedClient.PROVIDERLEVEL_ATOM_FEED_TITLE %atom.ME.providerID
                self.feedClient.createAtomFeed(providerCollection, feedTitle)

        elif isNew:
            # in this situation we're trying to create an atom with the same
            # name via the web interface - this can't be allowed - so retry to
            # generate a new ID; NB, pass on the caller's options unchanged
            atom.datasetID = None
            return self.createAtom(atom, replaceAtom = replaceAtom,
                                   runAsynch = runAsynch)

        # create backup of atom if it already exists
        else:
            if not replaceAtom:
                raise DuplicateError('An atom with the specified ID (%s) already exists in eXist' \
                                     %atom.datasetID)
            # store name of backup - to allow restore, if subsequent ops fail
            self.backupName = self.backupDoc(eXistCollection, atom.atomName,
                                             runAsynch = runAsynch)

            # also change updated date to current time
            atom.updatedDate = datetime.datetime.today().strftime("%Y-%m-%dT%H:%M:%SZ")

        self.createDoc(atom.toPrettyXML(), eXistCollection, atom.atomName)

        # keep the pre-loaded collection info, if in use, up to date
        if self.atomCollections:
            self.atomCollections[atom.datasetID] = eXistCollection

        logging.info("Atom created in eXist")

        # lastly, if we're dealing with a published atom, update DIF records +
        # feeds
        if atom.isPublished():
            if runAsynch:
                self.__runAsynchAtomPublish(atom)
            else:
                self.__publishAtom(atom)

        return atom


    def __runAsynchAtomPublish(self, atom):
        '''
        Run the atom publishing process in a new thread
        @param atom: the Atom data model to publish
        '''
        thread = publishingThread(self, atom)
        thread.start()


    def __publishAtom(self, atom):
        '''
        Add atom info to the various feeds - and if it is a data entity, use
        it to create a DIF document and add this to feeds, also
        @param atom: the Atom data model to publish
        '''
        if atom.isDE():
            self._createDIFDocumentFromAtom(atom, dc.DIF_COLLECTION_PATH)

        self.feedClient.addAtomToFeeds(atom)


    def __getAtomFileCollectionPath(self, atomID, providerID):
        '''
        Given an atom id, determine and return the collection path in eXist
        of the associated atom file
        @param atomID: atom id to look up
        @param providerID: provider ID for the atom data
        @raise ValueError: if more than one document is found with the ID
        @return: collection path (with trailing '/'), if it exists, None, otherwise
        '''
        logging.info("Looking up collection path for atom ID, '%s'" %atomID)
        doc = self.buildAndRunQuery('atomFullPath',
                                    dc.ATOM_COLLECTION_PATH,
                                    providerID,
                                    atomID)

        if not doc:
            logging.info("No document found with the specified ID")
            return None
        elif len(doc) > 1:
            raise ValueError("Multiple documents returned with same ID ('%s') - this should not occur"
                             %atomID)

        # the single result is an element whose text is the collection path
        docET = ET.fromstring(doc[0])
        collPath = docET.text + '/'
        logging.debug("Found collection path, '%s'" %collPath)
        return collPath
Note: See TracBrowser for help on using the repository browser.