source: ndgCommon/trunk/ndg/common/src/clients/xmldb/eXist/atomclient.py @ 4928

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/ndgCommon/trunk/ndg/common/src/clients/xmldb/eXist/atomclient.py@4928
Revision 4928, 15.0 KB checked in by cbyrom, 12 years ago (diff)

Create atom client, implementing the generic interface for ndgcommon,
to enable the functionality to properly cope with atom docs in eXist.

Line 
'''
 Clients for providing functionality to support Atom document usage with eXist DB
 - NB, extends the CRUDClient functionality

 @author: C Byrom - Tessella, Feb 09
'''
7import os, sys, logging, datetime, uuid
8from xml.etree import cElementTree as ET
9from ndg.common.src.models.Atom import Atom
10from ndg.common.src.models import AtomState
11from ndg.common.src.clients.xmldb.abstractxmldbatomclient import *
12from crudclient import CRUDClient
13from ndg.common.src.clients.xmldb.eXist.atomdbinitialiser import AtomDBInitialiser
14from ndg.common.src.clients.xmldb.eXist.feedclient import FeedClient as feedClient
15import dbconstants as dc
16from ndg.common.src.lib.ndgresources import ATOM_MOLES_SCHEMA
17
18
class AtomClient(CRUDClient, AbstractXMLDBAtomClient):
    '''
    Client supporting Atom document usage with an eXist DB.

    Extends the CRUDClient functionality with atom specific operations -
    schema validation, publication state handling and atom creation - and
    holds a feed client so that feeds can be updated alongside the atom
    collections.
    '''

    def __init__(self, dbHostName = 'chinook.badc.rl.ac.uk', configFileName ='passwords.txt',
                 clientType = dc.XML_RPC_CLIENT, loadCollectionData = False,
                 setUpDB = False):
        '''
        Set up an atom client for eXist DB
        @keyword dbHostName: eXist host to use - defaults to 'chinook.badc.rl.ac.uk'
        @keyword configFileName: password file to use - NB, this should
        have contents of format (NB, space delimiter):
        dbName userID password
        Default is 'passwords.txt'.
        @keyword clientType: Type of client to use for eXist connection.  Currently
        only supported by 'xmlrpc' - which is the default.
        @keyword loadCollectionData: preload info on all the eXist
        collections, if True (default False)
        @keyword setUpDB: if True, create the basic collection structure and ingest the
        atom schemas.  Default is False.
        '''
        logging.debug("Initialising AtomClient")
        super(AtomClient, self).__init__(dbHostName = dbHostName,
                                         configFileName = configFileName,
                                         clientType = clientType)

        # set up feed client, too - NB, info should be added to the feed as it is
        # added to the eXist collections, so the standard dbclient and the feed
        # client are intrinsically linked
        self.feedClient = feedClient(self.client.auth,
                                     dbHostName = self.client.host,
                                     eXistPortNo = self.client.port)

        if setUpDB:
            # set up the eXist DB appropriately for use with ndgCommon
            # NB, initialiser will set up feeds structure - so the feed client
            # must already be defined by this point
            AtomDBInitialiser(self).initialise()

        if loadCollectionData:
            self.atomCollections = self.__getAllAtomCollections()

        logging.debug("AtomClient initialised")


    def __getAllAtomCollections(self):
        '''
        Get all atom collection paths and store in a dictionary - for easy
        reference when doing lots of things at once
        @return: dict with key/val of filename (minus its extension)/collectionPath;
        an empty dict is returned if the lookup query produces no results
        '''
        logging.info("Retrieving all atom collection paths")

        docs = self.buildAndRunQuery('atomList', dc.ATOM_COLLECTION_PATH, '', '')

        colData = {}
        if not docs:
            # always hand back a dict, as documented, rather than None
            return colData

        for member in ET.fromstring(docs[0]):
            fullPath = member.findtext('{http://www.w3.org/2005/Atom}fileName')
            if not fullPath:
                # findtext returns None when the element is missing - skip the
                # entry rather than failing the whole lookup
                continue

            # split '<collection path>/<name>.<ext>' into its parts
            collectionPath, _, docName = fullPath.rpartition('/')
            colData[docName.rpartition('.')[0]] = collectionPath

        logging.debug("Finished looking up atom paths")
        return colData


    def checkAtomSchemaCompliance(self, atomPath, atom = None, isDebug = False):
        '''
        Validate the specified atom in eXist with the atom schemas in eXist
        @param atomPath: path to the atom in eXist
        @keyword atom: if set to an atom, this will be created temporarily in eXist
        - since it may not already exist there.  Once validation is completed, the
        file will be removed from eXist.  NB, when set, the temporary document is
        validated in place of atomPath.
        @keyword isDebug: if True, return full error details, otherwise only return
        a summary
        @raise SystemError: if errors encountered whilst running the schema validation
        @return array: containing any errors found - NB, if an empty array is returned,
        this indicates successful validation
        '''
        logging.info("Validating atom, '%s' against schemas in eXist" %atomPath)

        # get path to schemas, if not already set
        # NOTE(review): atomSchema is not assigned anywhere in this class - it is
        # presumably declared by a parent class; getattr avoids an AttributeError
        # if it is not
        if not getattr(self, 'atomSchema', None):
            self.atomSchema = '%s/%s.xsd' %(dc.SCHEMAS_COLLECTION_PATH, ATOM_MOLES_SCHEMA)

        # path to temp file, if we create one
        tempAtomPath = None

        errorMessage = None
        result = None
        try:
            if atom:
                logging.info("Creating temporary file in eXist to do validation against")
                fileName = atom.datasetID + str(datetime.datetime.today().microsecond)
                # create, validate and delete the temp doc using one and the same
                # collection path, so the three operations cannot disagree
                tempCollection = atom.getDefaultCollectionPath()
                self.createDoc(atom.toPrettyXML(), tempCollection, fileName)
                tempAtomPath = tempCollection + fileName
                atomPath = tempAtomPath

            validationQuery = 'validation:validate-report("' + atomPath + \
                '", xs:anyURI("' + self.atomSchema + '"))'
            logging.debug("Running validation, '%s'" %validationQuery)

            result = self.client.runQuery(validationQuery)
            if not result:
                errorMessage = "Validation did not complete successfully - please retry"
            elif len(result) > 1:
                errorMessage = "More than one atom was validated - expecting " + \
                    "only a single atom validation - please retry"

        except Exception as e:
            # format the exception itself - e.message is deprecated and is empty
            # for exceptions raised with more than one argument
            errorMessage = "Error encountered whilst validating atom: '%s'" %e

        if tempAtomPath:
            logging.info("Deleting temporary file in eXist")
            self.deleteDoc(tempAtomPath)

        # avoid further processing if errors have been encountered here
        if errorMessage:
            logging.error(errorMessage)
            raise SystemError(errorMessage)

        # escape anything that may cause problems when ingesting into ET
        doc = result[0].replace('<xsd:schema>', '&lt;xsd:schema&gt;')
        et = ET.fromstring(doc)
        status = et.findtext('status')

        # retrieve the error detail if invalid
        errors = []
        if status == 'invalid':
            logging.info("Atom is invalid - details as follows:")
            for error in et.findall('message'):
                lineNo = error.attrib.get('line')
                colNo = error.attrib.get('column')
                level = error.attrib.get('level')
                repeat = error.attrib.get('repeat')
                # an empty <message/> element has text of None
                errorText = error.text or ''
                # remove the meaningless error type from message
                if errorText.startswith('cvc-'):
                    errorText = ':'.join(errorText.split(':')[1:])
                errorMessage = "%s at line %s, column %s: %s" %(level, lineNo, colNo, errorText)
                if repeat:
                    errorMessage += " (%s times)" %repeat

                if isDebug:
                    errors.append(errorMessage)
                else:
                    errors.append(errorText)
                logging.info(errorMessage)
        else:
            logging.info("Atom is valid")

        logging.info("Validation complete")
        return errors


    def getAtomPublicationState(self, atomID, providerID):
        '''
        Retrieve the publication state of the specified atom - by
        checking the collection it is in
        @param atomID: atom id to look up
        @param providerID: provider ID for the atom data
        @return: AtomState for the atom.  NB, if the ID is not found, assume
        we're dealing with a new atom and set the state as the working state
        '''
        logging.debug("Finding atom publication state")
        path = self.__getAtomFileCollectionPath(atomID, providerID)

        # the lookup returns None for an unknown ID - only search the states
        # when there is a path to search in
        if path:
            for state in AtomState.allStates.values():
                if path.find('/%s' %state.collectionPath) > -1:
                    logging.debug("- state found: '%s'" %state.title)
                    return state

        logging.debug("- state not found - returning WORKING state")
        return AtomState.WORKING_STATE


    def changeAtomPublicationState(self, atom, newState):
        '''
        Adjust the publication state of an atom in the XML DB
        @param atom: the Atom data model of the atom whose publication state
        needs to change
        @param newState: an AtomState object representing the new publication
        state of the atom
        @return atom: atom data model with updated state
        '''
        logging.info("Changing the publication state of atom - from '%s' to '%s'" \
                     %(atom.state.title, newState.title))
        oldState = atom.state
        # firstly, create atom in new publication state collection - so data isn't
        # lost if this fails
        atom.state = newState
        self.createDoc(atom.toPrettyXML(), atom.getDefaultCollectionPath(),
                       atom.atomName)

        # now delete atom in the old publication state - the state is switched
        # back temporarily so the delete is done against the old state
        atom.state = oldState
        self.deleteAtom(atom)
        logging.info("- atom created in new publication state and removed from old one")
        atom.state = newState

        # update feeds + create DIFs, if needed
        if atom.isPublished():
            self.__runAsynchAtomPublish(atom)

        return atom


    def createAtom(self, atom, replaceAtom = True, runAsynch = True):
        '''
        Create an atom in the eXist DB - using the atom contents to work out
        the location + data set ID
        @param atom: atom object to create in the DB
        @keyword replaceAtom: if False and the atom is already available in eXist
        then raise a DuplicateError.  Default is True.
        @keyword runAsynch: if True, if a backup of an existing file is needed, do
        this asynchronously in a separate thread + do the feed publishing and DIF
        creating in a separate thread, too.  Default is True.
        @raise ValueError: if the input is not an Atom object
        @raise DuplicateError: if the atom already exists and replaceAtom is False
        @return: the atom data model that was created
        '''
        logging.info("Creating atom in eXist")
        if not atom or not isinstance(atom, Atom):
            raise ValueError("Input object is not an Atom object - cannot create in eXist")

        # if the atom has no dataset ID, generate and add one
        # NB, this should only be the case when the atom is being created
        # via the web interface
        isNew = False
        if not atom.datasetID:
            isNew = True
            atom.setDatasetID(atom.atomTypeID + '_' + str(uuid.uuid1()))

        # NOTE(review): atomCollections is only assigned by this class when
        # loadCollectionData was requested; getattr avoids an AttributeError
        # when it was not (unless a parent class already declares it - confirm)
        atomCollections = getattr(self, 'atomCollections', None)

        eXistCollection = None
        if atomCollections: # if we've pre-loaded the collection info, use this
            eXistCollection = atomCollections.get(atom.datasetID)
        else:
            eXistCollection = self.__getAtomFileCollectionPath(atom.datasetID,
                                                               atom.ME.providerID)

        # if collection not found, assume we're dealing with a new atom; get its
        # default collection
        if not eXistCollection:
            eXistCollection = atom.getDefaultCollectionPath()

            # check if we need a new provider feed set up
            providerCollection = dc.PROVIDER_FEED_PATH + atom.ME.providerID + '/'
            if self.isNewCollection(providerCollection):
                logging.info("Creating feed for new provider ID")
                self.createCollections([providerCollection])
                self.feedClient.createAtomFeed(providerCollection,
                                               self.feedClient.PROVIDERLEVEL_ATOM_FEED_TITLE %atom.ME.providerID)

        elif isNew:
            # in this situation we're trying to create an atom with the same
            # name via the web interface - this can't be allowed - so retry to
            # generate a new ID, keeping the options the caller asked for
            atom.datasetID = None
            return self.createAtom(atom, replaceAtom = replaceAtom,
                                   runAsynch = runAsynch)

        # create backup of atom if it already exists
        else:
            if not replaceAtom:
                raise DuplicateError('An atom with the specified ID (%s) already exists in eXist' \
                                     %atom.datasetID)
            # store name of backup - to allow restore, if subsequent ops fail
            self.backupName = self.backupDoc(eXistCollection, atom.atomName,
                                             runAsynch = runAsynch)

            # also change updated date to current time - NB, the trailing 'Z'
            # denotes UTC, so the timestamp must be taken in UTC, not local time
            atom.updatedDate = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ")

        self.createDoc(atom.toPrettyXML(), eXistCollection, atom.atomName)

        # keep any pre-loaded collection info in step with the DB
        if atomCollections:
            atomCollections[atom.datasetID] = eXistCollection

        logging.info("Atom created in eXist")

        # lastly, if we're dealing with a published atom, update DIF records +
        # feeds
        if atom.isPublished():
            if runAsynch:
                self.__runAsynchAtomPublish(atom)
            else:
                self.__publishAtom(atom)

        return atom


    def __runAsynchAtomPublish(self, atom):
        '''
        Run the atom publishing process in a new thread
        @param atom: the Atom data model to publish
        '''
        thread = publishingThread(self, atom)
        thread.start()


    def __publishAtom(self, atom):
        '''
        Add atom info to the various feeds - and if it is a data entity, use
        it to create a DIF document and add this to feeds, also
        @param atom: the Atom data model to publish
        '''
        if atom.isDE():
            self._createDIFDocumentFromAtom(atom, dc.DIF_COLLECTION_PATH)

        self.feedClient.addAtomToFeeds(atom)


    def __getAtomFileCollectionPath(self, atomID, providerID):
        '''
        Given an atom id, determine and return the collection path in eXist
        of the associated atom file
        @param atomID: atom id to look up
        @param providerID: provider ID for the atom data
        @raise ValueError: if more than one document is found with the ID
        @return: collection path (with a trailing '/'), if it exists, None, otherwise
        '''
        logging.info("Looking up collection path for atom ID, '%s'" %atomID)
        doc = self.buildAndRunQuery('atomFullPath',
                                    dc.ATOM_COLLECTION_PATH,
                                    providerID,
                                    atomID)

        if not doc:
            logging.info("No document found with the specified ID")
            return None
        elif len(doc) > 1:
            raise ValueError("Multiple documents returned with same ID ('%s') - this should not occur"
                             %atomID)

        docET = ET.fromstring(doc[0])
        collPath = docET.text + '/'
        logging.debug("Found collection path, '%s'" %collPath)
        return collPath
Note: See TracBrowser for help on using the repository browser.