source: exist/trunk/python/ndgUtils/models/Atom.py @ 4696

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/exist/trunk/python/ndgUtils/models/Atom.py@4696
Revision 4696, 48.5 KB checked in by cbyrom, 11 years ago (diff)

Adjust Atom and MolesEntity data models to properly use namespaces when
dealing with xpath queries - rather than having these stripped out. This
avoids problems when namespaces are given arbitrary names and is a more
exact, hence robust, approach.
Create a new test class to hold the xmlhandler2 tests separately.
Add delete function to granulite - to allow data granules, and their
connections to data entities, to be removed + add 'roll back' functionality
to cope with scenarios when granulite replace/delete fails to complete
properly. Add new methods to the existdbclient to allow the restore/delete/backup
functionality.
Extend test suite to exercise new functionality.

Line 
1'''
2 Class representing data in  atom format - allowing moles data to be stored and accessed in a web feed compatible way
3 
4 @author: C Byrom, Tessella Jun 2008
5'''
6try: #python 2.5
7    from xml.etree import cElementTree as ET
8except ImportError:
9    try:
10        # if you've installed it yourself it comes this way
11        import cElementTree as ET
12    except ImportError:
13        # if you've egged it this is the way it comes
14        from ndgUtils.elementtree import cElementTree as ET
15import sys, logging, re, datetime
16from ndgUtils import ndgObject
17from ndgUtils.eXistConnector import eXistConnector
18from ndgUtils.ETxmlView import et2text
19import ndgUtils.lib.utilities as utilities
20from ndgUtils.models.vocabtermdata import VocabTermData as VTD
21from ndgUtils.models import MolesEntity as ME
22import csml.parser as CsmlParser
23from ndgUtils.models import Deployment as Deployment
24
25
class AtomError(Exception):
    """
    Error raised by the Atom data model.

    NB, the message is written to the error log when the exception is
    constructed - so it is recorded even if the exception is later swallowed.
    """
    def __init__(self, msg):
        super(AtomError, self).__init__(msg)
        logging.error(msg)
33
34
class Person(object):
    '''
    Class representing atom author type data - with name, uri and role attributes

    A Person is an author, a contributor or a responsible party - as selected
    by the personType passed to the constructor; this determines the element
    names output by toXML().

    Two Person objects compare as equal when their uri, name, role and type
    all match - so that 'if person in personArray' checks work as expected.
    '''
    AUTHOR_TYPE = 0
    CONTRIBUTOR_TYPE = 1
    RESPONSIBLE_PARTY_TYPE = 2
    # element names - indexed by the person type constants above
    ELEMENT_NAMES = ["author", "contributor", "responsibleParty"]

    def __init__(self, personType = AUTHOR_TYPE, namespace = None):
        '''
        @keyword personType: one of the *_TYPE constants (default AUTHOR_TYPE)
        @keyword namespace: prefix to add to the element names output by toXML;
        NB, this is ignored for responsible parties, which always use 'moles'
        '''
        self.type = personType
        self.ns = namespace
        self.name = ""
        self.uri = ""
        self.role = ""

        # NB, the atom format specifies slightly different data contents
        self.uriTagName = "email"
        # NB, responsible party data is always stored in the moles section
        if self.type == self.RESPONSIBLE_PARTY_TYPE:
            self.ns = 'moles'
            self.uriTagName = "uri"

    def __str__(self):
        '''
        @return: 'name | uri | role' triple string - or an empty string if
        none of these are set
        '''
        if self.hasValue():
            return self.name + " | " + self.uri + " | " + self.role
        return ""

    def hasValue(self):
        '''
        @return: True if any of the name, uri or role are set; False otherwise
        '''
        return bool(self.name or self.uri or self.role)

    def fromString(self, personString):
        '''
        Set the name, uri and role from a triple string
        @param personString: triple string, parsed by utilities.getTripleData
        '''
        (self.name, self.uri, self.role) = utilities.getTripleData(personString)

    def fromETElement(self, personTag):
        '''
        Set the name, role and uri from the children of an elementtree element
        - NB, children are looked up in the atom namespace; missing values are
        stored as empty strings
        @param personTag: element to extract the person data from
        '''
        self.name = personTag.findtext('{%s}name' %ndgObject.ATOM_NS) or ""
        self.role = personTag.findtext('{%s}role' %ndgObject.ATOM_NS) or ""
        self.uri = personTag.findtext('{%s}%s' %(ndgObject.ATOM_NS, self.uriTagName)) or ""
        logging.debug("Added name: '%s', role: '%s', %s: '%s'" \
                      %(self.name, self.role, self.uriTagName, self.uri))

    def toXML(self):
        '''
        Create an element representing the person - with a child element for
        each of the name, uri and role which has a value
        @return: elementtree element for the person
        '''
        prefix = ""
        if self.ns:
            prefix = self.ns + ':'

        author = ET.Element(prefix + self.ELEMENT_NAMES[self.type])

        # NB, only output the elements which actually have a value
        for tagName, value in (("name", self.name),
                               (self.uriTagName, self.uri),
                               ("role", self.role)):
            if value:
                child = ET.SubElement(author, prefix + tagName)
                child.text = value

        return author

    def _comparisonKey(self):
        '''
        @return: tuple of the attributes which define equality between Persons
        '''
        return (self.uri, self.name, self.role, self.type)

    def __eq__(self, other):
        '''
        Rich comparison - required since __cmp__ is ignored by python 3, which
        would otherwise fall back to comparing by identity
        NB, no __hash__ is defined alongside this; Person objects are mutable
        so should not be relied upon as dict keys or set members
        '''
        if self is other:
            return True
        # NB, anything that isn't a Person - including None - is never equal
        if not isinstance(other, Person):
            return False
        return self._comparisonKey() == other._comparisonKey()

    def __ne__(self, other):
        # NB, python 2 does not derive this from __eq__
        return not self.__eq__(other)

    def __cmp__(self, person1):
        '''
        Override comparison to allow proper object comparison when checking
        if Person objects are in an array already - i.e. if person in personArray...
        NB, retained for python 2 callers using cmp()
        @return: 0 if equal, -1 if person1 is empty, 1 otherwise
        '''
        if not person1:
            return -1

        if self.__eq__(person1):
            return 0
        return 1
114
115
class Link(object):
    '''
    Class representing an atom link - with href, title and rel attributes

    Two Link objects compare as equal when their href, title and rel all
    match - so that 'if link in linkArray' checks work as expected.
    '''

    def __init__(self):
        self.href = ""
        self.title = ""
        self.rel = ""

    def fromString(self, linkString):
        '''
        Set the href, title and rel from a triple string
        @param linkString: triple string, parsed by utilities.getTripleData
        '''
        (self.href, self.title, self.rel) = utilities.getTripleData(linkString)

    def fromETElement(self, linkTag):
        '''
        Set the href, rel and title from the attributes of an elementtree
        element - missing attributes are stored as empty strings
        @param linkTag: <link> element to extract the data from
        '''
        self.href = linkTag.attrib.get('href') or ""
        self.rel = linkTag.attrib.get('rel') or ""
        self.title = linkTag.attrib.get('title') or ""

    def toXML(self):
        '''
        @return: elementtree <link> element with href, title and rel attributes
        '''
        link = ET.Element("link")
        link.attrib["href"] = self.href
        link.attrib["title"] = self.title
        link.attrib["rel"] = self.rel
        return link

    def hasValue(self):
        '''
        @return: True if the link has an href or a title; False otherwise
        '''
        # NB, just a rel on its own is meaningless - so ignore
        return bool(self.href or self.title)

    def __str__(self):
        '''
        @return: 'href | title | rel' triple string - or an empty string if
        none of these are set
        '''
        if self.href or self.title or self.rel:
            return self.href + " | " + self.title + " | " + self.rel
        return ""

    def isChildAtom(self):
        '''
        Determines whether the link refers to another atom - e.g. a link to
        a data granule
        @return True, if so; False otherwise
        '''
        # NB, the rel is matched against the end of the value only
        atomTerms = (VTD.GRANULE_TERM, VTD.DEPLOYMENT_TERM, VTD.ACTIVITY_TERM,
                     VTD.DPT_TERM, VTD.OBS_TERM)
        return self.rel.endswith(atomTerms)

    def __eq__(self, other):
        '''
        Rich comparison - required since __cmp__ is ignored by python 3, which
        would otherwise fall back to comparing by identity
        NB, no __hash__ is defined alongside this; Link objects are mutable
        so should not be relied upon as dict keys or set members
        '''
        if self is other:
            return True
        # NB, anything that isn't a Link - including None - is never equal
        if not isinstance(other, Link):
            return False
        return (self.href, self.title, self.rel) == \
            (other.href, other.title, other.rel)

    def __ne__(self, other):
        # NB, python 2 does not derive this from __eq__
        return not self.__eq__(other)

    def __cmp__(self, link1):
        '''
        Override comparison to allow proper object comparison when checking
        if Link objects are in an array already - i.e. if link in linkArray...
        NB, retained for python 2 callers using cmp()
        @return: 0 if equal, -1 if link1 is empty, 1 otherwise
        '''
        if not link1:
            return -1

        if self.__eq__(link1):
            return 0
        return 1
181
182
class Category(object):
    '''
    Class representing an atom category - with term, scheme and label attributes
    '''
    def __init__(self):
        self.term = self.scheme = self.label = ""

    def fromString(self, linkString, escapeSpecialCharacters=True):
        '''
        Populate the Category from a triple string of format,
        'label | scheme | term'
        @param linkString: triple string to create category with
        @keyword escapeSpecialCharacters: if set to True, special characters in
        triple string are escaped (default)
        '''
        triple = utilities.getTripleData(linkString, \
            doEscape=escapeSpecialCharacters)
        (self.label, self.scheme, self.term) = triple

    def fromETElement(self, linkTag):
        '''
        Populate the Category from the attributes of an elementtree element
        - missing attributes are stored as empty strings
        @param linkTag: <category> element to extract the data from
        '''
        attributes = linkTag.attrib
        self.term = attributes.get('term') or ""
        self.label = attributes.get('label') or ""
        self.scheme = attributes.get('scheme') or ""

    def toXML(self):
        '''
        @return: elementtree <category> element with term, scheme and label
        attributes
        '''
        element = ET.Element("category")
        for attributeName in ("term", "scheme", "label"):
            element.attrib[attributeName] = getattr(self, attributeName)
        return element

    def hasValue(self):
        '''
        @return: True if any of the scheme, label or term are set; False otherwise
        '''
        return bool(self.scheme or self.label or self.term)
218
219
220class Atom(object):
221
222    # labels for use with the atom categories
223    ATOM_TYPE = "ATOM_TYPE"
224    ATOM_SUBTYPE = "ATOM_SUBTYPE"
225
226    # labels for use with the templates to set/extract specific inputs
227    ONLINE_REF_LABEL = "online_ref"
228    PARAMETER_LABEL = "parameter"
229    ATOM_REF_LABEL = "atom_ref"
230    DELIMITER = "---"
231    REMOVE_LABEL = "remove"
232   
233    # format to use for t1-t2 date range
234    YEAR_FORMAT = '%Y-%m-%d'
235
236    def __init__(self, atomType = None, vocabTermData = None, ndgObject = None, \
237                 xmlString = None, state = eXistConnector.WORKING_COLLECTION_PATH, **inputs):
238        '''
239        Constructor - initialise the atom variables
240        '''
241        logging.info("Initialising atom")
242        if atomType:
243            logging.info(" - of type '%s'" %atomType)
244        self.atomTypeID = atomType
245
246        # some data have further subtypes specified
247        self.subtypeID = None # this should be the termID
248        self.subtype = None # and this should be the fully formed vocab URL
249       
250        self.ndgObject = ndgObject
251
252        self.atomName = None
253        self.files = []
254        self.author = Person()
255        self.contributors = []
256        self.atomAuthors = []
257        self.parameters = []
258        self.spatialData = []
259        self.temporalData = []
260        self.relatedLinks = []
261        self.summary = []
262        self.content = []
263        # NB, this deployments data duplicates other atom data - and is only used for a
264        # convenient way to collect the info (by lookupAssociatedData()) for use in templates
265        self.deployments = []
266        # ditto for the following field
267        self.dataEntities = []
268           
269        self.csmlFile = None
270        self.cdmlFile = None
271        # general variable to use for setting the atom content - NB, if a csmlFile is specified
272        # (either directly or via a cdmlFile specification), this will be the content by default
273        # for this purpose
274        self.contentFile = None     
275        self.title = None
276        self.datasetID = None        # NB, the dataset id ends up in the atomName - <path><datasetID>.atom
277        self.atomID = None
278   
279        # boundary box info - to replace spatial/temporalData?
280        self.minX = None
281        self.minY = None
282        self.maxX = None
283        self.maxY = None
284        self.t1 = None
285        self.t2 = None
286
287        self.ME = ME.MolesEntity()
288       
289        # date when the atom was first ingested
290        self.publishedDate = None
291
292        # last update date
293        self.updatedDate = None
294
295        # assume atom in working state by default - this is used to define what collection
296        # in eXist the atom is stored in
297        self.state = state
298       
299        # additional, non standard atom data can be included in the molesExtra element
300        if vocabTermData:
301            self.VTD = vocabTermData
302        else:
303            self.VTD = VTD()
304       
305        if xmlString:
306            self.fromString(xmlString)
307
308        # if inputs passed in as dict, add these now
309        if inputs:
310            logging.info("Adding info to atom from input dict")
311            logging.debug(inputs)
312            self.__dict__.update(inputs)
313           
314            # NB, this doesn't trigger the Summary Property, so do this
315            # explicitly, if need be
316            if inputs.has_key('Summary'):
317                self.Summary = inputs.get('Summary')
318            if inputs.has_key('Content'):
319                self.Content = inputs.get('Content')
320            if inputs.has_key('author'):
321                name = inputs.get('author')
322                author = Person()
323                author.fromString(name)
324                self.author = author
325           
326            # also pass any moles data up to the moles entity object
327            if inputs.has_key('providerID'):
328                self.ME.providerID = inputs.get('providerID')
329               
330            if inputs.has_key('abbreviation'):
331                self.ME.abbreviation = inputs.get('abbreviation')
332
333        if self.atomTypeID:
334            self.atomTypeName = self.VTD.TERM_DATA[self.atomTypeID].title
335
336        logging.info("Atom initialised")
337
338
339    def addOnlineReferences(self, links):
340        '''
341        Add online reference data associated with the atom
342        - NB, care needs to be taken here since this data is stored in the atom
343        link elements and these are also used for the various atom associations
344        @param links: a Link or array of Links to add to the relatedLinks attribute
345        '''
346        logging.debug("Adding online references")
347        if not links:
348            return
349       
350        if type(links) is not list:
351            links = [links]
352       
353        # firstly clear out any online refs data from the existing related links
354        newLinks = []
355        for link in self.relatedLinks:
356            if link.isChildAtom():
357                newLinks.append(link)
358       
359        newLinks.extend(links)
360        self.relatedLinks = newLinks
361        logging.debug("Online references added")
362
363
364    def addUniqueRelatedLinks(self, links):
365        '''
366        Add links to relatedLinks array - if they are not already included
367        @param links: a Link or array of Links to add to the relatedLinks attribute
368        '''
369        self.addUniqueLinks(self.relatedLinks, links)
370       
371
372    def removeRelatedLinks(self, linksToDelete):
373        '''
374        Remove any links in the input list from the atom's related links list
375        @param linksToDelete: array of Link objects to remove from atom
376        '''
377        logging.debug("Removing related links from atom")
378        if not linksToDelete:
379            return
380       
381        if type(linksToDelete) is not list:
382            linksToDelete = [linksToDelete]
383       
384        updatedLinks = []
385        for link in self.relatedLinks:
386            if type(link) is not Link:
387                logging.warning("Link is not of 'Link' object type (type='%s') - skipping" %type(link))
388                continue
389            if link in linksToDelete:
390                logging.debug("- found link to remove")
391            else:
392                updatedLinks.append(link)
393
394        self.relatedLinks = updatedLinks
395        logging.debug("Links removed")
396       
397
398    def getDefaultCollectionPath(self):
399        '''
400        Determine the correct collection to use for the atom in eXist
401        '''
402        collectionPath = eXistConnector.BASE_COLLECTION_PATH + self.state
403       
404        if self.atomTypeID == VTD.DE_TERM:
405            collectionPath += eXistConnector.DE_COLLECTION_PATH
406        elif self.atomTypeID == VTD.GRANULE_TERM:
407            collectionPath += eXistConnector.GRANULE_COLLECTION_PATH
408        elif self.atomTypeID == VTD.ACTIVITY_TERM and \
409            self.subtypeID == VTD.DEPLOYMENT_TERM:
410            collectionPath += eXistConnector.DEPLOYMENTS_COLLECTION_PATH
411        else:
412            collectionPath += eXistConnector.DEPLOYMENT_COLLECTION_PATH
413       
414        if not self.ME.providerID:
415            raise AtomError("Error: cannot determine atom collection path because " + \
416                            "the provider ID is not defined")
417           
418        collectionPath += self.ME.providerID + "/"
419        return collectionPath
420
421
422    def __addAtomTypeDataXML(self, root):
423        '''
424        Add the atom type, and subtype data, if available, to atom categories
425        - and lookup and add the appropriate vocab term data
426        '''
427        if self.atomTypeID:
428            logging.info("Adding atom type info to XML output")
429            category = Category()
430            category.label = self.atomTypeID
431            # look up the appropriate vocab term data
432            category.scheme = self.VTD.getTermCurrentVocabURL(self.atomTypeID)
433            category.term = self.ATOM_TYPE
434            root.append(category.toXML())
435
436        if self.subtypeID:
437            logging.info("Adding atom subtype info to XML output")
438            # NB subtypes not all defined, so leave this out for the moment
439            category.label = self.subtypeID
440            # look up the appropriate vocab term data
441            category.scheme = self.VTD.getTermCurrentVocabURL(self.subtypeID)
442            category.term = self.ATOM_SUBTYPE
443            root.append(category.toXML())
444
445
446    def addMolesEntityData(self, abbreviation, provider_id, object_creation_time):
447        '''
448        Add data to include in the moles entity element
449        '''
450        logging.debug('Adding moles entity information')
451        self.ME.abbreviation = abbreviation
452        self.ME.providerID = provider_id
453        self.ME.createdDate = utilities.getISO8601Date(object_creation_time)
454        logging.debug('Moles entity information added')
455
456
457    def addAuthors(self, authors):
458        '''
459        Add author data appropriately to the atom
460        NB, these will overwrite any existing authors of the same type
461        @param authors: list of Person objects with the author data
462        '''
463        logging.debug('Adding authors data to Atom')
464        isFirstAuthor = {}
465        authorArray = None
466        for author in authors:
467            # NB, we're only allowed one atom author
468            if author.type == Person.AUTHOR_TYPE:
469                self.author = author
470                   
471                if isFirstAuthor.has_key(author.type):
472                    raise AtomError("Error: an atom can only have one author specified")
473                isFirstAuthor[author.type] = 1
474                continue
475            elif author.type == Person.CONTRIBUTOR_TYPE:
476                authorArray = self.contributors
477            elif author.type == Person.RESPONSIBLE_PARTY_TYPE:
478                authorArray = self.ME.responsibleParties
479               
480            # check if this is the first addition - if so, clear out the
481            # array in advance
482            if not isFirstAuthor.has_key(author.type):
483                logging.debug("Clearing out author array")
484                # NB, need to be careful to clear the array, not create a ref
485                # to a new array
486                del authorArray[:]
487                isFirstAuthor[author.type] = 1
488
489            if author.hasValue() and author not in authorArray:
490                logging.debug("Adding author (type:'%s', name:'%s', uri:'%s', role:'%s')" \
491                              %(author.type, author.name, author.uri, author.role))
492                authorArray.append(author)
493
494        logging.debug('Finished adding authors data')
495
496
497    def _isNewParameter(self, param):
498        '''
499        Check if a parameter is already specified in the atom, return False if
500        so, otherwise return True
501        '''
502        for p in self.parameters:
503            if p.term == param.term and \
504                p.scheme == param.scheme and \
505                p.label == param.label:
506                return False
507        return True
508
509
510    def addRelatedLinks(self, linkVals):
511        '''
512        Add related links in string format - converting to Link objects
513        NB, only add the link if it is unique
514       
515        @param linkVals: string of format, 'uri | title | vocabServerURL'
516        '''
517        link = self.objectify(linkVals, 'relatedLinks')
518        if link not in self.relatedLinks:
519            self.relatedLinks.append(link)
520
521
522    def addParameters(self, params):
523        '''
524        Add a parameter to list - ensuring it is unique and has been formatted and tidied appropriately
525        @params param: parameter, as string array, to add to atom parameters collection
526        '''
527        # avoid strings being parsed character by character
528        if type(params) is str:
529            params = [params]
530           
531        for param in params:
532            # firstly tidy parameter
533            param = utilities.tidyUpParameters(param)
534            category = Category()
535            # NB, data already tidied up here, so set keyword to avoid this happening again
536            category.fromString(param, escapeSpecialCharacters=True)
537
538            # now check for uniqueness
539            if self._isNewParameter(category):
540                logging.debug("Adding new parameter: %s" %param)
541                self.parameters.append(category)
542   
543   
544    def _linksToXML(self, root):
545        '''
546        Add required links to the input element
547        @param root: element to add links to - NB, should be the root element of the atom
548        '''
549        selfLink = ET.SubElement(root, "link")
550        selfLink.attrib["href"] = self.atomBrowseURL
551        selfLink.attrib["rel"] = "self"
552       
553        for relatedLink in self.relatedLinks:
554            if relatedLink.hasValue():
555                root.append(relatedLink.toXML())
556   
557    def toXML(self):
558        '''
559        Convert the atom into XML representation and return this
560        @return: xml version of atom
561        '''
562        logging.info("Creating formatted XML version of Atom")
563        root = ET.Element("entry")
564        root.attrib["xmlns"] = ndgObject.ATOM_NS
565        root.attrib["xmlns:moles"] = ndgObject.MOLES_NS
566        root.attrib["xmlns:georss"] = ndgObject.GEOSS_NS
567        root.attrib["xmlns:gml"] = ndgObject.GML_NS
568        id = ET.SubElement(root, "id")
569        id.text = self.atomID
570        title = ET.SubElement(root, "title")
571        title.text = self.title
572        self._linksToXML(root)
573
574        if self.author and self.author.hasValue():
575            root.append(self.author.toXML())
576           
577        for contributor in self.contributors:
578            root.append(contributor.toXML())
579
580        # add parameters data
581        for param in self.parameters:
582            if param.hasValue():
583                root.append(param.toXML())
584
585        # add the type and subtype data
586        self.__addAtomTypeDataXML(root)
587                   
588        summary = ET.SubElement(root, "summary")
589        summary.text = self.Summary
590                   
591        # add link to content, if required - NB, can only have one content element in atom
592        # - and this is mandatory
593        content = ET.SubElement(root, "content")
594        contentFile = self.contentFile or self.csmlFile or self.cdmlFile
595        if contentFile:
596            content.attrib["type"] = "application/xml"
597            content.attrib["src"] = contentFile
598        else:
599            content.attrib["type"] = "xhtml"
600            div = ET.SubElement(content, 'div')
601            div.attrib["xmlns"] = ndgObject.XHTML_NS
602            div.text = self.Content
603       
604        # if there's a published date already defined, assume we're doing an update now
605        # NB, update element is mandatory
606        currentDate = datetime.datetime.today().strftime("%Y-%m-%dT%H:%M:%SZ")
607        if not self.publishedDate:
608            self.publishedDate = currentDate
609
610        updated = ET.SubElement(root, "updated")
611        if not self.updatedDate:
612            self.updatedDate = currentDate
613        updated.text = self.updatedDate
614
615        published = ET.SubElement(root, "published")
616        published.text = self.publishedDate
617
618        # add the moles entity section, if it is required
619        if self.ME:
620            root.append(self.ME.toXML())
621
622        # add temporal range data, if available
623        temporalRange = ET.SubElement(root, "moles:temporalRange")
624        if self.t1:
625            temporalRange.text = self.t1
626            if self.t2:
627                temporalRange.text += "/" + self.t2
628
629        # add spatial range data, if available
630        self._addSpatialData(root)
631
632        tree = ET.ElementTree(root)
633        logging.info("XML version of Atom created")
634        return tree
635
636
637    def __getSummary(self):
638        logging.debug("Getting summary data")
639        summaryString = ""
640        for summary_line in self.summary:
641            summaryString += summary_line + "\n"
642
643        return summaryString
644
645    def __setSummary(self, summary):
646        logging.debug("Adding summary data")
647        self.summary = []
648        for summary_line in summary.split('\n'):
649            self.summary.append(utilities.escapeSpecialCharacters(summary_line))
650           
651    Summary = property(fset=__setSummary, fget=__getSummary, doc="Atom summary")
652
653
654    def __getContent(self):
655        logging.debug("Getting content data")
656        contentString = ""
657        # NB, there must be content specified in an atom
658        if not self.content:
659            return "Metadata document"
660       
661        for content_line in self.content:
662            contentString += content_line + "\n"
663
664        return contentString
665
666    def __setContent(self, content):
667        logging.debug("Adding content data")
668        self.content = []
669        for content_line in content.split('\n'):
670            self.content.append(content_line)
671           
672    Content = property(fset=__setContent, fget=__getContent, doc="Atom content")
673
674           
675    def fromString(self, xmlString):
676        '''
677        Initialise Atom object using an xmlString
678        @param xmlString: representation of atom as an XML string
679        '''
680        logging.info("Ingesting data from XML string")
681        logging.debug("Create elementtree instance with XML string")
682        tree = ET.fromstring(xmlString)
683        title = tree.findtext('{%s}title' %ndgObject.ATOM_NS)
684        if title:
685            logging.debug("Adding title data")
686            self.title = title
687
688        summary = tree.findtext('{%s}summary' %ndgObject.ATOM_NS)
689        if summary:
690            self.Summary = summary#.decode('unicode_escape')
691
692        authorElement = tree.find('{%s}author' %ndgObject.ATOM_NS)
693        if authorElement:
694            logging.debug("Adding author data")
695            author = Person()
696            author.fromETElement(authorElement)
697            self.author = author
698
699        contributorElements = tree.findall('{%s}contributor' %ndgObject.ATOM_NS)
700        for contributorElement in contributorElements:
701            logging.debug("Adding contributor data")
702            contributor = Person(personType = Person.CONTRIBUTOR_TYPE)
703            contributor.fromETElement(contributorElement)
704            self.contributors.append(contributor)
705
706        molesElement = tree.find('{%s}entity' %ndgObject.MOLES_NS)
707        if molesElement:
708            self.ME.fromET(molesElement)
709               
710        self.atomID = tree.findtext('{%s}id' %ndgObject.ATOM_NS)
711
712        self._parseCategoryData(tree.findall('{%s}category' %ndgObject.ATOM_NS))
713
714        self._parseLinksData(tree.findall('{%s}link' %ndgObject.ATOM_NS))
715           
716        contentTag = tree.find('{%s}content' %ndgObject.ATOM_NS)
717        if contentTag != None:
718            logging.debug("Found content tag - checking for CSML/CDML file data")
719            file = contentTag.attrib.get('src')
720            if file:
721                # NB, the path will reveal more reliably whether we're dealing with CSML and CDML files
722                if file.upper().find('CSML') > -1:
723                    logging.debug("Adding CSML file data")
724                    self.csmlFile = file
725                elif file.upper().find('CDML') > -1:
726                    logging.debug("Adding CDML file data")
727                    self.cdmlFile = file
728                self.contentFile = file
729            else:
730                logging.debug("No file data - adding contents of element instead")
731                div = contentTag.find('{http://www.w3.org/1999/xhtml}div')
732                self.Content = div.text
733       
734        range = tree.findtext('{%s}temporalRange' %ndgObject.MOLES_NS)
735        if range:
736            logging.debug("Adding temporal range data")
737            timeData = range.split('/')
738            self.t1 = timeData[0]
739            if len(timeData) > 1:
740                self.t2 = timeData[1]
741       
742        where = tree.find('{%s}where' %ndgObject.GEOSS_NS)
743        if where:
744            # NB, this parser won't mind if we're dealing with Envelope or EnvelopeWithTimePeriod
745            minBBox = where.findall('.//{%s}lowerCorner' %ndgObject.GML_NS)
746            if minBBox:
747                logging.debug("Adding min spatial range data")
748                minBBox = minBBox[0]
749                spatialData = minBBox.text.split()
750                self.minX = spatialData[0]
751                if len(spatialData) > 1:
752                    self.minY = spatialData[1]
753           
754            maxBBox = where.findall('.//{%s}upperCorner' %ndgObject.GML_NS)
755            if maxBBox:
756                maxBBox = maxBBox[0]
757                logging.debug("Adding max spatial range data")
758                spatialData = maxBBox.text.split()
759                self.maxX = spatialData[0]
760                if len(spatialData) > 1:
761                    self.maxY = spatialData[1]
762               
763        publishedDate = tree.findtext('published')
764        if publishedDate:
765            logging.debug("Adding published date")
766            self.publishedDate = publishedDate
767               
768        updatedDate = tree.findtext('updated')
769        if updatedDate:
770            logging.debug("Adding updated date")
771            self.updatedDate = updatedDate
772           
773        logging.info("Completed data ingest")
774   
775   
776    def _parseCategoryData(self, categories):
777        logging.debug("Adding category/parameters data")
778        for category in categories:
779            cat = Category()
780            cat.fromETElement(category)
781           
782            if cat.term == self.ATOM_TYPE:
783                logging.debug("Found atom type data")
784                self.atomTypeID = cat.label
785                self.atomTypeName = self.VTD.TERM_DATA[cat.label].title
786                continue
787            elif cat.term == self.ATOM_SUBTYPE:
788                logging.debug("Found atom subtype data")
789                self.subtypeID = cat.label
790                self.subtype = cat.scheme
791                continue
792
793            self.parameters.append(cat)
794   
795
796    def setDatasetID(self, datasetID):
797        '''
798        Set the dataset ID for the atom - and generate an appropriate atom name using this
799        @param datasetID: ID to set for the atom
800        '''
801        self.datasetID = datasetID
802        self._generateAtomName(datasetID) 
803        self.atomID = self.createAtomID(datasetID)
804
805
806    def createAtomID(self, datasetID):
807        '''
808        Create a unique ID, conforming to atom standards, for atom
809        NB, see http://diveintomark.org/archives/2004/05/28/howto-atom-id
810        @param datasetID: ID of atom's dataset
811        @return: unique ID
812        '''
813        logging.info("Creating unique ID for atom")
814        if not self.atomBrowseURL:
815            self._generateAtomName(datasetID)
816        urlBit = self.atomBrowseURL.split('://')[1]
817        urlBit = urlBit.replace('#', '')
818        urlBits = urlBit.split('/')
819        host = urlBits[0].split(':')[0] # avoid the port colon - as this breaks the ID format
820        dateBit = datetime.datetime.today().strftime("%Y-%m-%d")
821       
822        id = "tag:" + host + "," + dateBit + ":/" + "/".join(urlBits[1:])
823        logging.info("- unique ID created for atom")
824        logging.debug(" - '%s'" %id)
825        return id
826       
827       
828    def _generateAtomName(self, datasetID):
829        '''
830        Generate a consistent name for the atom - with full eXist doc path
831        @param datasetID: ID of atom's dataset
832        '''
833        self.atomName = datasetID + ".atom"
834        if not self.ME.providerID:
835            raise ValueError("Provider ID has not been specified for atom - please add this and retry")
836        self.ndgURI = self.ME.providerID + "__ATOM__" + datasetID
837        self.atomBrowseURL = VTD.BROWSE_ROOT_URL + self.ndgURI
838
839
840    def _parseLinksData(self, links):
841        '''
842        Extract links and atom data from array of link elements in the XML representation of the atom
843        @param links: an array of <link> elements
844        '''
845        # firstly, get all data to start with, so we can properly process it afterwards
846        linkData = {}
847        logging.debug("Getting link data")
848        for linkTag in links:
849            link = Link()
850            link.fromETElement(linkTag)
851
852            if not linkData.has_key(link.rel):
853                linkData[link.rel] = []
854           
855            linkData[link.rel].append(link)
856
857        # there should be one self referencing link - which will provide info on the atom itself
858        if not linkData.has_key('self'):
859            errorMessage = "Atom does not have self referencing link - " + \
860                "cannot ascertain datasetID without this - please fix"
861            logging.error(errorMessage)
862            raise ValueError(errorMessage)
863       
864        # this is the link describing the atom itself
865        self.atomBrowseURL = linkData['self'][0].href
866       
867        self.datasetID = self.atomBrowseURL.split("__ATOM__")[-1]
868        self.atomName = self.datasetID + ".atom"
869        self.ndgURI = self.atomBrowseURL.split(VTD.BROWSE_ROOT_URL)[1]
870       
871        # now remove this value and the associated moles doc link
872        del linkData['self']
873        molesDoc = self.atomBrowseURL.replace('ATOM', 'NDG-B1')
874        if linkData.has_key('related'):
875            relatedLinks = []
876            for link in linkData['related']:
877                if link.href != molesDoc:
878                    relatedLinks.append(link)
879           
880            linkData['related'] = relatedLinks
881               
882        # now add the remaining links to the atom
883        for key in linkData:
884            for link in linkData[key]:
885                logging.debug("Adding link data")
886                self.relatedLinks.append(link)
887       
888
889    def _addSpatialData(self, element):
890        '''
891        Add spatial coverage element to an input element
892        @param element: element to add coverage data to
893        '''
894        logging.info("Adding spatial data to Atom")
895        if not self.minX:
896            logging.info("No spatial data specified")
897            return
898        bbox = ET.SubElement(element, "georss:where")
899        envelope = ET.SubElement(bbox, "gml:Envelope")
900        lc = ET.SubElement(envelope, "gml:lowerCorner")
901        lc.text = str(self.minX) + " " + str(self.minY)
902        uc = ET.SubElement(envelope, "gml:upperCorner")
903        uc.text = str(self.maxX) + " " + str(self.maxY)
904
905       
906    def setAttribute(self, attributeName, attributeValue):
907        '''
908        Set the value of an atom attribute - and do some basic tidying up of the string content
909        - to escape any XML unfriendly characters
910        @param attributeName: name of the attribute whose value to set
911        @param attributeValue: value to set the attribute to 
912        '''
913        logging.debug("Setting attribute, %s, to %s" %(attributeName, attributeValue))
914        origValue = attributeValue
915       
916        # escape any special characters if a value has been specified
917        # NB, need to cope with both single values and arrays
918        if attributeValue:
919            if type(attributeValue) is list:
920                newVals = []
921                for val in attributeValue:
922                    newVals.append(self.objectify(utilities.escapeSpecialCharacters(val), attributeName))
923                attributeValue = newVals
924                   
925            else:
926                attributeValue = self.objectify(utilities.escapeSpecialCharacters(attributeValue), attributeName)
927
928        # handle the special case of authors; only one author is allowed per atom
929        # - the others should be treated as contributors
930        if attributeName == "authors":
931            setattr(self, "author", attributeValue[0])
932            if len(attributeValue) > 1:
933                setattr(self, "contributors", attributeValue[1:])
934        elif attributeName == "atomAuthors":
935            if isinstance(attributeValue, list):
936                for val in attributeValue:
937                    self.ME.responsibleParties.append(val)
938            else:
939                self.ME.responsibleParties.append(attributeValue)
940        elif attributeName == "files":
941            self.addUniqueRelatedLinks(attributeValue)
942        else:
943            setattr(self, attributeName, attributeValue)
944
945
946    def objectify(self, objectVals, attributeName):
947        '''
948        Some inputs are specified as strings but need to be converted into
949        objects - do this here
950        @param objectVals: a '|' delimited string of values
951        @param attributeName: name of attribute the values belong to
952        '''
953        obj = None
954        if type(objectVals) != str:
955            return objectVals
956       
957        if attributeName == "relatedLinks":
958            obj = Link()
959        elif attributeName == "atomAuthors" or attributeName == "authors":
960            # NB, ensure there is only one author tag - extra authors are contributors
961            authorType = Person.AUTHOR_TYPE
962            if self.author and self.author.hasValue():
963                authorType= Person.CONTRIBUTOR_TYPE
964            obj = Person(personType = authorType)
965        elif attributeName == 'files':
966            obj = Link()
967            objectVals = '%s|%s|%s' \
968                %(self.VTD.getTermCurrentVocabURL(VTD.METADATA_SOURCE_TERM), objectVals, VTD.METADATA_SOURCE_TERM)
969
970        if obj:
971            obj.fromString(objectVals)
972            # NB, need to set it now, just in case we don't set it before coming back
973            if attributeName == "authors" and (not self.author or not self.author.hasValue()):
974                self.author = obj
975            return obj
976       
977        return objectVals
978
979
980    def toPrettyXML(self):
981        '''
982        Returns nicely formatted XML as string
983        '''
984        atomXML = self.toXML()
985
986        # create the string
987        logging.debug("Converting the elementtree object into a string")
988        prettyXML = et2text(atomXML.getroot())
989
990        # add XML version tag
991        prettyXML = "<?xml version=\"1.0\" encoding=\"utf-8\" ?>\n" + prettyXML
992        logging.info("Created formatted version of XML object")
993        return prettyXML
994
995
996    def getLinksOfType(self, termID):
997        '''
998        Returns links in the atom related links attribute which match the specified
999        term ID
1000        @param termID: the termID to look for in the related links - NB, this is
1001        matched to the end of the link.rel value
1002        @return links: array of Link objects with matching term type
1003        '''
1004        logging.debug("Getting atom links of type, '%s'" %termID)
1005        matchingLinks = []
1006        for link in self.relatedLinks:
1007            # firstly, handle special case where we only want the online ref type links
1008            # returned
1009            if termID == self.ONLINE_REF_LABEL:
1010                if not link.isChildAtom():
1011                    logging.debug("- found link with matching term type")
1012                    matchingLinks.append(link)
1013               
1014            elif link and link.rel and link.rel.lower().endswith(termID.lower()):
1015                logging.debug("- found link with matching term type")
1016                matchingLinks.append(link)
1017               
1018        logging.debug("Returning matched links")
1019        return matchingLinks
1020       
1021       
1022    def getLogos(self):
1023        '''
1024        Return related links that are logos
1025        @return: array of Links containing the logos for the atom
1026        '''
1027        logos = []
1028        for link in self.relatedLinks:
1029            if link.rel.lower().endswith(VTD.LOGO_TERM.lower()):
1030                logos.append(link)
1031               
1032        return logos
1033   
1034   
1035    def isGranule(self):
1036        if self.atomTypeID == VTD.GRANULE_TERM:
1037            return True
1038        return False
1039   
1040   
1041    def isDE(self):
1042        if self.atomTypeID == VTD.DE_TERM:
1043            return True
1044        return False
1045   
1046    def isDeployment(self):
1047        if self.subtypeID and self.subtypeID == VTD.DEPLOYMENT_TERM:
1048            return True
1049        return False
1050   
1051    def isDeployable(self):
1052        if (self.atomTypeID == VTD.ACTIVITY_TERM and self.subtypeID != VTD.DEPLOYMENT_TERM) or \
1053            self.atomTypeID == VTD.DPT_TERM or \
1054            self.atomTypeID == VTD.OBS_TERM:
1055            return True
1056        return False
1057       
1058       
1059    def addCSMLData(self, csmlName, csmlContent, aggregateCoverage=False, useCSMLID=False):
1060        '''
1061        Parse CSML data and add extracted info to the atom
1062        @param csmlName: name of the csml file
1063        @param csmlContent: content of the csml file - NB, if this is set to None and the
1064        file, csmlName, is available locally, CsmlParser.Dataset will read in the file
1065        directly
1066        @keyword aggregateCoverage: if set to True, only coverage data that extends the
1067        atom coverage data will be added
1068        @keyword useCSMLID: if True, use the CSML doc ID as the dataset ID - NB,
1069        this should only be True if creating a new atom - e.g. from a granulite
1070        @return csmlDoc: the CsmlParser.Dataset object with the csml data in
1071        '''
1072        logging.info("Creating CSML data model")
1073        self.csmlFile = csmlName
1074        self.contentFile = csmlName
1075        content = csmlContent or csmlName
1076   
1077        csmlDoc = CsmlParser.Dataset(file=content)
1078       
1079        logging.info("Extracting info from CSML file")
1080        logging.debug("Got dataset ID: %s" %csmlDoc.id)
1081        if useCSMLID:
1082            logging.debug(" - using this ID for the atom")
1083            self.setDatasetID(VTD.GRANULE_TERM + '_' + csmlDoc.id)
1084       
1085        title = csmlDoc.name.CONTENT
1086        logging.debug("Got dataset name (title): '%s'" %title)
1087        # NB, if a title is specified (and not as the default value), it automatically is used in
1088        # place of anything in the granulite file
1089        if title and title != "NAME OF DATASET GOES HERE":
1090            logging.info("Title, '%s', extracted from CSML file" %title)
1091            if self.title:
1092                logging.info("- NB, this will override the title specified in the granulite file ('%s')" \
1093                             %self.title)
1094            self.title = title
1095               
1096        bbox1 = csmlDoc.getBoundingBox()
1097        bbox2 = csmlDoc.getCSMLBoundingBox()
1098
1099        time = None
1100        if bbox2:
1101            time = bbox2.getTimeLimits()
1102   
1103        # now check for other parameters to add to granule
1104        # Firstly, extract the bounding envelope
1105        if bbox1:
1106            w, e = utilities.normaliseLongitude(bbox1[0],bbox1[2])
1107            n, s = (bbox1[3], bbox1[1])
1108   
1109            if not aggregateCoverage or (not self.maxY or float(n) > float(self.maxY)):
1110                self.maxY = n
1111               
1112            if not aggregateCoverage or (not self.minY or float(s) < float(self.minY)):
1113                self.minY = s
1114           
1115            if not aggregateCoverage or (not self.minX or float(w) < float(self.minX)):
1116                self.minX = w
1117   
1118            if not aggregateCoverage or (not self.maxX or float(e) > float(self.maxX)):
1119                self.maxX = e
1120           
1121            logging.debug("Got bounding box data from file: (%s, %s) , (%s, %s)" \
1122                          %(w, s, e, n))
1123           
1124            logging.debug("Updated atom bounding box data: (%s, %s) , (%s, %s)" \
1125                          %(self.minX, self.minY, self.maxX, self.maxY))
1126        else:
1127            logging.debug("No valid bounding box data found")
1128   
1129        if time:
1130            t1 = utilities.formatDateYYYYMMDD(time[0])
1131            if not aggregateCoverage or \
1132                (not self.t1 or datetime.datetime.strptime(t1, YEAR_FORMAT) < \
1133                    datetime.datetime.strptime(self.t1, YEAR_FORMAT)):
1134                self.t1 = t1
1135   
1136            t2 = time[1]
1137            if t2 and t2 != 'None':
1138                t2 = utilities.formatDateYYYYMMDD(t2)
1139                if not aggregateCoverage or \
1140                    (not self.t2 or datetime.datetime.strptime(t2, YEAR_FORMAT) > \
1141                        datetime.datetime.strptime(self.t2, YEAR_FORMAT)):
1142                    self.t2 = t2
1143           
1144            logging.debug("Got time range: %s -> %s" %(self.t1, self.t2))
1145        else:
1146            logging.debug("No valid time range data found")
1147   
1148        #create parameter summaries:
1149        #set up list to hold the parameters data
1150        parameters = []
1151        for feature in csmlDoc.featureCollection.featureMembers:
1152            if hasattr(feature.parameter, 'href'):
1153                paramTriple = ""
1154                if hasattr(feature, 'description'):
1155                    paramTriple = feature.description.CONTENT
1156                    paramTriple += " | " + feature.parameter.href
1157                   
1158                    term = ""
1159                    if hasattr(feature, 'name'):
1160                        term = feature.name.CONTENT
1161   
1162                    paramTriple += " | " + term
1163                   
1164                    logging.debug("Got parameter info: %s" %paramTriple)
1165                    parameters.append(paramTriple)
1166       
1167        # update the atom with the extracted parameters
1168        logging.info("Adding CSML parameters to granule atom")
1169        self.addParameters(parameters)
1170        logging.info("Finished adding CSML data")
1171        return csmlDoc
1172
1173
1174    def lookupAssociatedData(self, type, dr, lookupIndirectReferences=False):
1175        '''
1176        Check through the atom links and retrieve any associated data of the
1177        specified type
1178        @param type: type of associated data to lookup - currently VTD.DEPLOYMENT_TERM
1179        or VTD.DE_TERM
1180        @param dr: Instance of DocumentRetrieve object - NB, this requires eXist
1181        config details which are not available to the Atom object
1182        @keyword lookupIndirectReferences: if True, the atom ID is used to search
1183        defined deployments to find those which reference it, otherwise only
1184        deployments data featured in the atom related links are processed
1185        '''
1186        logging.info("Looking up %s info" %type)
1187       
1188        self.allActivities = []
1189        self.allObs = []
1190        self.allDpts = []
1191
1192        if type != VTD.DE_TERM and type != VTD.DEPLOYMENT_TERM:
1193            raise ValueError('Unrecognised associated data type: %s' %type)
1194       
1195        # avoid duplicating lookup effort
1196        if (type == VTD.DEPLOYMENT_TERM and self.deployments) or \
1197            (type == VTD.DE_TERM and self.dataEntities):
1198            logging.info("- this info has already been looked up - returning")
1199            return
1200
1201        # firstly, collect all the references to the info required
1202        if lookupIndirectReferences:
1203            logging.info("Looking up indirect references")
1204           
1205            # if we're looking up DE data for deployments data, need to have the
1206            # deployments info looked up first
1207            if type == VTD.DE_TERM and self.isDeployable() and not self.deployments:
1208                self.lookupAssociatedData(VTD.DEPLOYMENT_TERM, dr, lookupIndirectReferences)
1209           
1210            logging.info("Looking up references to this atom from other %s" %type)
1211           
1212            # NB, if we're looking up deployments info, we only look up references
1213            # to this atom - if we're looking up DEs, we need to look up references
1214            # to the deployments referenced by this atom
1215            urls = [self.atomBrowseURL]
1216           
1217            if type == VTD.DE_TERM and self.isDeployable():
1218                urls = []
1219                for dep in self.deployments:
1220                    urls.append(dep.browseURL)
1221                   
1222            links = []
1223            for url in urls:
1224                doc = dr.get(type, dr.ATOM_TYPE, url, \
1225                             targetCollection = eXistConnector.BASE_COLLECTION_PATH)
1226                # now need to turn this results set into actual atoms
1227                tree = ET.fromstring(doc)
1228                for atom in tree:
1229                    logging.debug("- found reference in %s" %type)
1230                    links.append(ET.tostring(atom))
1231                   
1232            logging.info("Finished looking up indirect references")
1233        else:
1234            links = self.getLinksOfType(self.VTD.DEPLOYMENT_TERM)
1235
1236        # now retrieve the references and extract the required data
1237        logging.info("Retrieving info from %s references" %type)
1238        if type == VTD.DEPLOYMENT_TERM:
1239            logging.info("Extracting links data to deployment entitites")
1240            self.deployments = []
1241            for link in links:
1242                if lookupIndirectReferences:
1243                    deploymentAtom = link
1244                else:
1245                    localID = link.href.split("__ATOM__")[-1]
1246                    deploymentAtom = dr.get(self.ME.providerID, 'ATOM', localID, \
1247                                            targetCollection = eXistConnector.BASE_COLLECTION_PATH)
1248   
1249                deployment = Deployment.Deployment(Atom(xmlString=str(deploymentAtom)))
1250                self.deployments.append(deployment)
1251               
1252                self.addUniqueLinks(self.allActivities, deployment.activities)
1253                self.addUniqueLinks(self.allObs, deployment.obs)
1254                self.addUniqueLinks(self.allDpts, deployment.dpts)
1255        else:
1256            # for DE data, just store the title + link in a Link object
1257            self.dataEntities = []
1258            logging.info("Extracting links data to data entitites")
1259            for data in links:
1260                atom = Atom(xmlString=str(data))
1261                link = Link()
1262                link.title = atom.title
1263                link.href = atom.atomBrowseURL
1264                link.rel = atom.datasetID
1265               
1266                # NB, different deployments may be used by the same DE - so
1267                # avoid duplication
1268                self.addUniqueLinks(self.dataEntities, link)
1269           
1270        logging.info("Finished looking up %s info" %type)
1271
1272
1273    def addUniqueLinks(self, dataArray, links):
1274        '''
1275        Add links to specified array - if they are not already included
1276        @param dataArray: a list, potentially arlready containing links
1277        @param links: a Link or array of Links to add to the dataArray
1278        '''
1279        logging.debug("Adding new links")
1280        if not links:
1281            return
1282       
1283        if type(links) is not list:
1284            links = [links]
1285       
1286        for link in links:
1287            if type(link) is not Link:
1288                logging.warning("Link is not of 'Link' object type (type='%s') - skipping" %type(link))
1289                continue
1290            if link not in dataArray:
1291                logging.debug("- adding unique link")
1292                dataArray.append(link)
1293        logging.debug("Finished adding links")
Note: See TracBrowser for help on using the repository browser.