source: exist/trunk/python/ndgUtils/models/Atom.py @ 4721

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/exist/trunk/python/ndgUtils/models/Atom.py@4721
Revision 4721, 49.1 KB checked in by cbyrom, 11 years ago (diff)

Further tidy up of Atom model, tightening up XPath expressions to
include full namespace details.

Line 
1'''
2 Class representing data in  atom format - allowing moles data to be stored and accessed in a web feed compatible way
3 
4 @author: C Byrom, Tessella Jun 2008
5'''
6try: #python 2.5
7    from xml.etree import cElementTree as ET
8except ImportError:
9    try:
10        # if you've installed it yourself it comes this way
11        import cElementTree as ET
12    except ImportError:
13        # if you've egged it this is the way it comes
14        from ndgUtils.elementtree import cElementTree as ET
15import sys, logging, re, datetime
16from ndgUtils import ndgObject
17from ndgUtils.eXistConnector import eXistConnector
18from ndgUtils.ETxmlView import et2text
19import ndgUtils.lib.utilities as utilities
20from ndgUtils.models.vocabtermdata import VocabTermData as VTD
21from ndgUtils.models import MolesEntity as ME
22import csml.parser as CsmlParser
23from ndgUtils.models import Deployment as Deployment
24
25
class AtomError(Exception):
    """
    Error raised by the Atom model when an atom cannot be built or processed.

    NB, the message is written to the error log when the exception object is
    created - i.e. before, and regardless of whether, it is actually raised.
    """
    def __init__(self, msg):
        # log at construction time so the problem is recorded even if a
        # caller later catches and discards the exception
        logging.error(msg)
        Exception.__init__(self, msg)
33
34
class Person(object):
    '''
    Class representing atom author type data - with name, uri and role attributes
    @keyword personType: Type of person to create - specified using the
    Person.*_TYPE values.  Default is AUTHOR_TYPE.
    @keyword namespace: a two value array of format, ['short_namespace_name', 'full_namespace_name']
    - e.g. ['moles', 'http://ndg.nerc.ac.uk/schema/moles2beta']
    '''
    AUTHOR_TYPE = 0
    CONTRIBUTOR_TYPE = 1
    RESPONSIBLE_PARTY_TYPE = 2
    # element names to use in the XML output - indexed by the *_TYPE values
    ELEMENT_NAMES = ["author", "contributor", "responsibleParty"]

    def __init__(self, personType = AUTHOR_TYPE, namespace = None):
        self.type = personType
        if namespace:
            self.ns_shortname = namespace[0]
            self.ns_fullname = namespace[1]
        else:
            self.ns_shortname = ""
            self.ns_fullname = ndgObject.ATOM_NS

        self.name = ""
        self.uri = ""
        self.role = ""

        # NB, the atom format specifies slightly different data contents
        self.uriTagName = "email"
        # NB, responsible party data is always stored in the moles section
        # - so this overrides any namespace passed in
        if self.type == self.RESPONSIBLE_PARTY_TYPE:
            self.ns_shortname = 'moles'
            self.ns_fullname = ndgObject.MOLES_NS
            self.uriTagName = "uri"

    def __str__(self):
        '''
        @return: triple string of format, 'name | uri | role' - or an empty
        string if none of these are set
        '''
        if self.hasValue():
            return " | ".join([self.name, self.uri, self.role])
        return ""

    def hasValue(self):
        '''
        @return: True if any of name, uri or role are set; False otherwise
        '''
        return bool(self.name or self.uri or self.role)

    def fromString(self, personString):
        '''
        Set up the person from a triple string
        @param personString: string of format, 'name | uri | role'
        '''
        (self.name, self.uri, self.role) = utilities.getTripleData(personString)

    def fromETElement(self, personTag):
        '''
        Set up the person from an elementtree element
        @param personTag: element whose name, role and email/uri child
        elements - in this person's namespace - hold the data; missing
        children are stored as empty strings
        '''
        self.name = personTag.findtext('{%s}name' %self.ns_fullname) or ""
        self.role = personTag.findtext('{%s}role' %self.ns_fullname) or ""
        self.uri = personTag.findtext('{%s}%s' %(self.ns_fullname, self.uriTagName)) or ""
        logging.debug("Added name: '%s', role: '%s', %s: '%s'" \
                      %(self.name, self.role, self.uriTagName, self.uri))

    def toXML(self):
        '''
        @return: elementtree element representing the person; child elements
        are only added for the fields which have a value
        '''
        prefix = ""
        if self.ns_shortname:
            prefix = self.ns_shortname + ':'

        author = ET.Element(prefix + self.ELEMENT_NAMES[self.type])

        for tagName, value in (("name", self.name),
                               (self.uriTagName, self.uri),
                               ("role", self.role)):
            if value:
                child = ET.SubElement(author, prefix + tagName)
                child.text = value

        return author

    def __eq__(self, other):
        '''
        Two persons are equal if their uri, name, role and type all match.
        NB, this is what makes 'if person in personArray' work by value;
        __cmp__ alone is ignored by python 3, where the check would silently
        fall back to object identity.
        '''
        if self is other:
            return True
        if not isinstance(other, Person):
            # let python try the other operand / fall back to identity rather
            # than failing with an AttributeError on a foreign object
            return NotImplemented
        return self.uri == other.uri and self.name == other.name and \
            self.role == other.role and self.type == other.type

    def __ne__(self, other):
        # python 2 does not derive != from ==, so this must be spelt out
        result = self.__eq__(other)
        if result is NotImplemented:
            return result
        return not result

    # NB, no __hash__ is defined: Person objects are mutable, so they should
    # not be relied upon as dict keys or set members

    def __cmp__(self, person1):
        '''
        Override comparison to allow proper object comparison when checking
        if Person objects are in an array already - i.e. if person in personArray...
        NB, retained for python 2 callers using cmp(); returns 0 when the
        objects are equal, -1 when person1 is empty and 1 otherwise.
        '''
        if not person1:
            return -1

        if self == person1:
            return 0
        return 1
125
126
class Link(object):
    '''
    Class representing an atom link - with href, title and rel attributes
    '''

    def __init__(self):
        self.href = ""
        self.title = ""
        self.rel = ""

    def fromString(self, linkString):
        '''
        Set up the link from a triple string
        @param linkString: string of format, 'href | title | rel'
        '''
        (self.href, self.title, self.rel) = utilities.getTripleData(linkString)

    def fromETElement(self, linkTag):
        '''
        Set up the link from the attributes of an elementtree element
        @param linkTag: link element; missing attributes are stored as
        empty strings
        '''
        self.href = linkTag.attrib.get('href') or ""
        self.rel = linkTag.attrib.get('rel') or ""
        self.title = linkTag.attrib.get('title') or ""

    def toXML(self):
        '''
        @return: 'link' elementtree element with href, title and rel attributes
        '''
        link = ET.Element("link")
        link.attrib["href"] = self.href
        link.attrib["title"] = self.title
        link.attrib["rel"] = self.rel
        return link

    def hasValue(self):
        '''
        @return: True if the link has an href or a title; False otherwise
        '''
        # NB, just a rel on its own is meaningless - so ignore
        return bool(self.href or self.title)

    def __str__(self):
        '''
        @return: triple string of format, 'href | title | rel' - or an empty
        string if none of these are set
        '''
        if self.href or self.title or self.rel:
            return " | ".join([self.href, self.title, self.rel])
        return ""

    def isChildAtom(self):
        '''
        Determines whether the link refers to another atom - e.g. a link to
        a data granule
        @return True, if so; False otherwise
        '''
        # NB, str.endswith accepts a tuple of alternatives
        return self.rel.endswith((VTD.GRANULE_TERM,
                                  VTD.DEPLOYMENT_TERM,
                                  VTD.ACTIVITY_TERM,
                                  VTD.DPT_TERM,
                                  VTD.OBS_TERM))

    def __eq__(self, other):
        '''
        Two links are equal if their href, title and rel all match.
        NB, this is what makes 'if link in linkArray' work by value;
        __cmp__ alone is ignored by python 3, where the check would silently
        fall back to object identity.
        '''
        if self is other:
            return True
        if not isinstance(other, Link):
            # let python try the other operand / fall back to identity rather
            # than failing with an AttributeError on a foreign object
            return NotImplemented
        return self.href == other.href and self.title == other.title and \
            self.rel == other.rel

    def __ne__(self, other):
        # python 2 does not derive != from ==, so this must be spelt out
        result = self.__eq__(other)
        if result is NotImplemented:
            return result
        return not result

    # NB, no __hash__ is defined: Link objects are mutable, so they should
    # not be relied upon as dict keys or set members

    def __cmp__(self, link1):
        '''
        Override comparison to allow proper object comparison when checking
        if Link objects are in an array already - i.e. if link in linkArray...
        NB, retained for python 2 callers using cmp(); returns 0 when the
        objects are equal, -1 when link1 is empty and 1 otherwise.
        '''
        if not link1:
            return -1

        if self == link1:
            return 0
        return 1
192
193
class Category(object):
    '''
    Class representing an atom category - with term, scheme and label attributes
    '''
    def __init__(self):
        self.term = self.scheme = self.label = ""

    def fromString(self, linkString, escapeSpecialCharacters=True):
        '''
        Populate the Category from a triple string of format,
        'label | scheme | term'
        @param linkString: triple string holding the category data
        @keyword escapeSpecialCharacters: passed on to the triple string
        parser as its doEscape keyword; True by default
        '''
        label, scheme, term = utilities.getTripleData(
            linkString, doEscape=escapeSpecialCharacters)
        self.label = label
        self.scheme = scheme
        self.term = term

    def fromETElement(self, linkTag):
        '''
        Populate the Category from the attributes of an elementtree element
        @param linkTag: category element; any missing attribute is stored as
        an empty string
        '''
        attributes = linkTag.attrib
        for attName in ('term', 'label', 'scheme'):
            setattr(self, attName, attributes.get(attName) or "")

    def toXML(self):
        '''
        @return: 'category' elementtree element with term, scheme and label
        attributes
        '''
        element = ET.Element("category")
        element.set("term", self.term)
        element.set("scheme", self.scheme)
        element.set("label", self.label)
        return element

    def hasValue(self):
        '''
        @return: True if any of scheme, label or term are set; False otherwise
        '''
        return bool(self.scheme or self.label or self.term)
229
230
231class Atom(object):
232
233    # labels for use with the atom categories
234    ATOM_TYPE = "ATOM_TYPE"
235    ATOM_SUBTYPE = "ATOM_SUBTYPE"
236
237    # labels for use with the templates to set/extract specific inputs
238    ONLINE_REF_LABEL = "online_ref"
239    PARAMETER_LABEL = "parameter"
240    ATOM_REF_LABEL = "atom_ref"
241    DELIMITER = "---"
242    REMOVE_LABEL = "remove"
243   
244    # format to use for t1-t2 date range
245    YEAR_FORMAT = '%Y-%m-%d'
246
247    def __init__(self, atomType = None, vocabTermData = None, ndgObject = None, \
248                 xmlString = None, state = eXistConnector.WORKING_COLLECTION_PATH, **inputs):
249        '''
250        Constructor - initialise the atom variables
251        '''
252        logging.info("Initialising atom")
253        if atomType:
254            logging.info(" - of type '%s'" %atomType)
255        self.atomTypeID = atomType
256
257        # some data have further subtypes specified
258        self.subtypeID = None # this should be the termID
259        self.subtype = None # and this should be the fully formed vocab URL
260       
261        self.ndgObject = ndgObject
262
263        self.atomName = None
264        self.files = []
265        self.author = Person()
266        self.contributors = []
267        self.atomAuthors = []
268        self.parameters = []
269        self.spatialData = []
270        self.temporalData = []
271        self.relatedLinks = []
272        self.summary = []
273        self.content = []
274        # NB, this deployments data duplicates other atom data - and is only used for a
275        # convenient way to collect the info (by lookupAssociatedData()) for use in templates
276        self.deployments = []
277        # ditto for the following field
278        self.dataEntities = []
279           
280        self.csmlFile = None
281        self.cdmlFile = None
282        # general variable to use for setting the atom content - NB, if a csmlFile is specified
283        # (either directly or via a cdmlFile specification), this will be the content by default
284        # for this purpose
285        self.contentFile = None     
286        self.title = None
287        self.datasetID = None        # NB, the dataset id ends up in the atomName - <path><datasetID>.atom
288        self.atomID = None
289   
290        # boundary box info - to replace spatial/temporalData?
291        self.minX = None
292        self.minY = None
293        self.maxX = None
294        self.maxY = None
295        self.t1 = None
296        self.t2 = None
297
298        self.ME = ME.MolesEntity()
299       
300        # date when the atom was first ingested
301        self.publishedDate = None
302
303        # last update date
304        self.updatedDate = None
305
306        # assume atom in working state by default - this is used to define what collection
307        # in eXist the atom is stored in
308        self.state = state
309       
310        # additional, non standard atom data can be included in the molesExtra element
311        if vocabTermData:
312            self.VTD = vocabTermData
313        else:
314            self.VTD = VTD()
315       
316        if xmlString:
317            self.fromString(xmlString)
318
319        # if inputs passed in as dict, add these now
320        if inputs:
321            logging.info("Adding info to atom from input dict")
322            logging.debug(inputs)
323            self.__dict__.update(inputs)
324           
325            # NB, this doesn't trigger the Summary Property, so do this
326            # explicitly, if need be
327            if inputs.has_key('Summary'):
328                self.Summary = inputs.get('Summary')
329            if inputs.has_key('Content'):
330                self.Content = inputs.get('Content')
331            if inputs.has_key('author'):
332                name = inputs.get('author')
333                author = Person()
334                author.fromString(name)
335                self.author = author
336           
337            # also pass any moles data up to the moles entity object
338            if inputs.has_key('providerID'):
339                self.ME.providerID = inputs.get('providerID')
340               
341            if inputs.has_key('abbreviation'):
342                self.ME.abbreviation = inputs.get('abbreviation')
343
344        if self.atomTypeID:
345            self.atomTypeName = self.VTD.TERM_DATA[self.atomTypeID].title
346
347        logging.info("Atom initialised")
348
349
350    def addOnlineReferences(self, links):
351        '''
352        Add online reference data associated with the atom
353        - NB, care needs to be taken here since this data is stored in the atom
354        link elements and these are also used for the various atom associations
355        @param links: a Link or array of Links to add to the relatedLinks attribute
356        '''
357        logging.debug("Adding online references")
358        if not links:
359            return
360       
361        if type(links) is not list:
362            links = [links]
363       
364        # firstly clear out any online refs data from the existing related links
365        newLinks = []
366        for link in self.relatedLinks:
367            if link.isChildAtom():
368                newLinks.append(link)
369       
370        newLinks.extend(links)
371        self.relatedLinks = newLinks
372        logging.debug("Online references added")
373
374
375    def addUniqueRelatedLinks(self, links):
376        '''
377        Add links to relatedLinks array - if they are not already included
378        @param links: a Link or array of Links to add to the relatedLinks attribute
379        '''
380        self.addUniqueLinks(self.relatedLinks, links)
381       
382
383    def removeRelatedLinks(self, linksToDelete):
384        '''
385        Remove any links in the input list from the atom's related links list
386        @param linksToDelete: array of Link objects to remove from atom
387        '''
388        logging.debug("Removing related links from atom")
389        if not linksToDelete:
390            return
391       
392        if type(linksToDelete) is not list:
393            linksToDelete = [linksToDelete]
394       
395        updatedLinks = []
396        for link in self.relatedLinks:
397            if type(link) is not Link:
398                logging.warning("Link is not of 'Link' object type (type='%s') - skipping" %type(link))
399                continue
400            if link in linksToDelete:
401                logging.debug("- found link to remove")
402            else:
403                updatedLinks.append(link)
404
405        self.relatedLinks = updatedLinks
406        logging.debug("Links removed")
407       
408
409    def getDefaultCollectionPath(self):
410        '''
411        Determine the correct collection to use for the atom in eXist
412        '''
413        collectionPath = eXistConnector.BASE_COLLECTION_PATH + self.state
414       
415        if self.atomTypeID == VTD.DE_TERM:
416            collectionPath += eXistConnector.DE_COLLECTION_PATH
417        elif self.atomTypeID == VTD.GRANULE_TERM:
418            collectionPath += eXistConnector.GRANULE_COLLECTION_PATH
419        elif self.atomTypeID == VTD.ACTIVITY_TERM and \
420            self.subtypeID == VTD.DEPLOYMENT_TERM:
421            collectionPath += eXistConnector.DEPLOYMENTS_COLLECTION_PATH
422        else:
423            collectionPath += eXistConnector.DEPLOYMENT_COLLECTION_PATH
424       
425        if not self.ME.providerID:
426            raise AtomError("Error: cannot determine atom collection path because " + \
427                            "the provider ID is not defined")
428           
429        collectionPath += self.ME.providerID + "/"
430        return collectionPath
431
432
433    def __addAtomTypeDataXML(self, root):
434        '''
435        Add the atom type, and subtype data, if available, to atom categories
436        - and lookup and add the appropriate vocab term data
437        '''
438        if self.atomTypeID:
439            logging.info("Adding atom type info to XML output")
440            category = Category()
441            category.label = self.atomTypeID
442            # look up the appropriate vocab term data
443            category.scheme = self.VTD.getTermCurrentVocabURL(self.atomTypeID)
444            category.term = self.ATOM_TYPE
445            root.append(category.toXML())
446
447        if self.subtypeID:
448            logging.info("Adding atom subtype info to XML output")
449            # NB subtypes not all defined, so leave this out for the moment
450            category.label = self.subtypeID
451            # look up the appropriate vocab term data
452            category.scheme = self.VTD.getTermCurrentVocabURL(self.subtypeID)
453            category.term = self.ATOM_SUBTYPE
454            root.append(category.toXML())
455
456
457    def addMolesEntityData(self, abbreviation, provider_id, object_creation_time):
458        '''
459        Add data to include in the moles entity element
460        '''
461        logging.debug('Adding moles entity information')
462        self.ME.abbreviation = abbreviation
463        self.ME.providerID = provider_id
464        self.ME.createdDate = utilities.getISO8601Date(object_creation_time)
465        logging.debug('Moles entity information added')
466
467
468    def addAuthors(self, authors):
469        '''
470        Add author data appropriately to the atom
471        NB, these will overwrite any existing authors of the same type
472        @param authors: list of Person objects with the author data
473        '''
474        logging.debug('Adding authors data to Atom')
475        isFirstAuthor = {}
476        authorArray = None
477        for author in authors:
478            # NB, we're only allowed one atom author
479            if author.type == Person.AUTHOR_TYPE:
480                self.author = author
481                   
482                if isFirstAuthor.has_key(author.type):
483                    raise AtomError("Error: an atom can only have one author specified")
484                isFirstAuthor[author.type] = 1
485                continue
486            elif author.type == Person.CONTRIBUTOR_TYPE:
487                authorArray = self.contributors
488            elif author.type == Person.RESPONSIBLE_PARTY_TYPE:
489                authorArray = self.ME.responsibleParties
490               
491            # check if this is the first addition - if so, clear out the
492            # array in advance
493            if not isFirstAuthor.has_key(author.type):
494                logging.debug("Clearing out author array")
495                # NB, need to be careful to clear the array, not create a ref
496                # to a new array
497                del authorArray[:]
498                isFirstAuthor[author.type] = 1
499
500            if author.hasValue() and author not in authorArray:
501                logging.debug("Adding author (type:'%s', name:'%s', uri:'%s', role:'%s')" \
502                              %(author.type, author.name, author.uri, author.role))
503                authorArray.append(author)
504
505        logging.debug('Finished adding authors data')
506
507
508    def _isNewParameter(self, param):
509        '''
510        Check if a parameter is already specified in the atom, return False if
511        so, otherwise return True
512        '''
513        for p in self.parameters:
514            if p.term == param.term and \
515                p.scheme == param.scheme and \
516                p.label == param.label:
517                return False
518        return True
519
520
521    def addRelatedLinks(self, linkVals):
522        '''
523        Add related links in string format - converting to Link objects
524        NB, only add the link if it is unique
525       
526        @param linkVals: string of format, 'uri | title | vocabServerURL'
527        '''
528        link = self.objectify(linkVals, 'relatedLinks')
529        if link not in self.relatedLinks:
530            self.relatedLinks.append(link)
531
532
533    def addParameters(self, params):
534        '''
535        Add a parameter to list - ensuring it is unique and has been formatted and tidied appropriately
536        @params param: parameter, as string array, to add to atom parameters collection
537        '''
538        # avoid strings being parsed character by character
539        if type(params) is str:
540            params = [params]
541           
542        for param in params:
543            # firstly tidy parameter
544            param = utilities.tidyUpParameters(param)
545            category = Category()
546            # NB, data already tidied up here, so set keyword to avoid this happening again
547            category.fromString(param, escapeSpecialCharacters=True)
548
549            # now check for uniqueness
550            if self._isNewParameter(category):
551                logging.debug("Adding new parameter: %s" %param)
552                self.parameters.append(category)
553   
554   
555    def _linksToXML(self, root):
556        '''
557        Add required links to the input element
558        @param root: element to add links to - NB, should be the root element of the atom
559        '''
560        selfLink = ET.SubElement(root, "link")
561        selfLink.attrib["href"] = self.atomBrowseURL
562        selfLink.attrib["rel"] = "self"
563       
564        for relatedLink in self.relatedLinks:
565            if relatedLink.hasValue():
566                root.append(relatedLink.toXML())
567   
568    def toXML(self):
569        '''
570        Convert the atom into XML representation and return this
571        @return: xml version of atom
572        '''
573        logging.info("Creating formatted XML version of Atom")
574        root = ET.Element("entry")
575        root.attrib["xmlns"] = ndgObject.ATOM_NS
576        root.attrib["xmlns:moles"] = ndgObject.MOLES_NS
577        root.attrib["xmlns:georss"] = ndgObject.GEOSS_NS
578        root.attrib["xmlns:gml"] = ndgObject.GML_NS
579        id = ET.SubElement(root, "id")
580        id.text = self.atomID
581        title = ET.SubElement(root, "title")
582        title.text = self.title
583        self._linksToXML(root)
584
585        if self.author and self.author.hasValue():
586            root.append(self.author.toXML())
587           
588        for contributor in self.contributors:
589            root.append(contributor.toXML())
590
591        # add parameters data
592        for param in self.parameters:
593            if param.hasValue():
594                root.append(param.toXML())
595
596        # add the type and subtype data
597        self.__addAtomTypeDataXML(root)
598                   
599        summary = ET.SubElement(root, "summary")
600        summary.text = self.Summary
601                   
602        # add link to content, if required - NB, can only have one content element in atom
603        # - and this is mandatory
604        content = ET.SubElement(root, "content")
605        contentFile = self.contentFile or self.csmlFile or self.cdmlFile
606        if contentFile:
607            content.attrib["type"] = "application/xml"
608            content.attrib["src"] = contentFile
609        else:
610            content.attrib["type"] = "xhtml"
611            div = ET.SubElement(content, 'div')
612            div.attrib["xmlns"] = ndgObject.XHTML_NS
613            div.text = self.Content
614       
615        # if there's a published date already defined, assume we're doing an update now
616        # NB, update element is mandatory
617        currentDate = datetime.datetime.today().strftime("%Y-%m-%dT%H:%M:%SZ")
618        if not self.publishedDate:
619            self.publishedDate = currentDate
620
621        updated = ET.SubElement(root, "updated")
622        if not self.updatedDate:
623            self.updatedDate = currentDate
624        updated.text = self.updatedDate
625
626        published = ET.SubElement(root, "published")
627        published.text = self.publishedDate
628
629        # add the moles entity section, if it is required
630        if self.ME:
631            root.append(self.ME.toXML())
632
633        # add temporal range data, if available
634        temporalRange = ET.SubElement(root, "moles:temporalRange")
635        if self.t1:
636            temporalRange.text = self.t1
637            if self.t2:
638                temporalRange.text += "/" + self.t2
639
640        # add spatial range data, if available
641        self._addSpatialData(root)
642
643        tree = ET.ElementTree(root)
644        logging.info("XML version of Atom created")
645        return tree
646
647
648    def __getSummary(self):
649        logging.debug("Getting summary data")
650        summaryString = ""
651        for summary_line in self.summary:
652            summaryString += summary_line + "\n"
653
654        return summaryString
655
656    def __setSummary(self, summary):
657        logging.debug("Adding summary data")
658        self.summary = []
659        for summary_line in summary.split('\n'):
660            self.summary.append(utilities.escapeSpecialCharacters(summary_line))
661           
662    Summary = property(fset=__setSummary, fget=__getSummary, doc="Atom summary")
663
664
665    def __getContent(self):
666        logging.debug("Getting content data")
667        contentString = ""
668        # NB, there must be content specified in an atom
669        if not self.content:
670            return "Metadata document"
671       
672        for content_line in self.content:
673            contentString += content_line + "\n"
674
675        return contentString
676
677    def __setContent(self, content):
678        logging.debug("Adding content data")
679        self.content = []
680        for content_line in content.split('\n'):
681            self.content.append(content_line)
682           
683    Content = property(fset=__setContent, fget=__getContent, doc="Atom content")
684
685           
686    def fromString(self, xmlString):
687        '''
688        Initialise Atom object using an xmlString
689        @param xmlString: representation of atom as an XML string
690        '''
691        logging.info("Ingesting data from XML string")
692        logging.debug("Create elementtree instance with XML string")
693        tree = ET.fromstring(xmlString)
694        title = tree.findtext('{%s}title' %ndgObject.ATOM_NS)
695        if title:
696            logging.debug("Adding title data")
697            self.title = title
698
699        summary = tree.findtext('{%s}summary' %ndgObject.ATOM_NS)
700        if summary:
701            self.Summary = summary#.decode('unicode_escape')
702
703        authorElement = tree.find('{%s}author' %ndgObject.ATOM_NS)
704        if authorElement:
705            logging.debug("Adding author data")
706            author = Person()
707            author.fromETElement(authorElement)
708            self.author = author
709
710        contributorElements = tree.findall('{%s}contributor' %ndgObject.ATOM_NS)
711        for contributorElement in contributorElements:
712            logging.debug("Adding contributor data")
713            contributor = Person(personType = Person.CONTRIBUTOR_TYPE)
714            contributor.fromETElement(contributorElement)
715            self.contributors.append(contributor)
716
717        molesElement = tree.find('{%s}entity' %ndgObject.MOLES_NS)
718        if molesElement:
719            self.ME.fromET(molesElement)
720               
721        self.atomID = tree.findtext('{%s}id' %ndgObject.ATOM_NS)
722
723        self._parseCategoryData(tree.findall('{%s}category' %ndgObject.ATOM_NS))
724
725        self._parseLinksData(tree.findall('{%s}link' %ndgObject.ATOM_NS))
726           
727        contentTag = tree.find('{%s}content' %ndgObject.ATOM_NS)
728        if contentTag != None:
729            logging.debug("Found content tag - checking for CSML/CDML file data")
730            file = contentTag.attrib.get('src')
731            if file:
732                # NB, the path will reveal more reliably whether we're dealing with CSML and CDML files
733                if file.upper().find('CSML') > -1:
734                    logging.debug("Adding CSML file data")
735                    self.csmlFile = file
736                elif file.upper().find('CDML') > -1:
737                    logging.debug("Adding CDML file data")
738                    self.cdmlFile = file
739                self.contentFile = file
740            else:
741                logging.debug("No file data - adding contents of element instead")
742                div = contentTag.find('{http://www.w3.org/1999/xhtml}div')
743                self.Content = div.text
744       
745        range = tree.findtext('{%s}temporalRange' %ndgObject.MOLES_NS)
746        if range:
747            logging.debug("Adding temporal range data")
748            timeData = range.split('/')
749            self.t1 = timeData[0]
750            if len(timeData) > 1:
751                self.t2 = timeData[1]
752       
753        where = tree.find('{%s}where' %ndgObject.GEOSS_NS)
754        if where:
755            # NB, this parser won't mind if we're dealing with Envelope or EnvelopeWithTimePeriod
756            minBBox = where.findall('.//{%s}lowerCorner' %ndgObject.GML_NS)
757            if minBBox:
758                logging.debug("Adding min spatial range data")
759                minBBox = minBBox[0]
760                spatialData = minBBox.text.split()
761                self.minX = spatialData[0]
762                if len(spatialData) > 1:
763                    self.minY = spatialData[1]
764           
765            maxBBox = where.findall('.//{%s}upperCorner' %ndgObject.GML_NS)
766            if maxBBox:
767                maxBBox = maxBBox[0]
768                logging.debug("Adding max spatial range data")
769                spatialData = maxBBox.text.split()
770                self.maxX = spatialData[0]
771                if len(spatialData) > 1:
772                    self.maxY = spatialData[1]
773               
774        publishedDate = tree.findtext('{%s}published' %ndgObject.ATOM_NS)
775        if publishedDate:
776            logging.debug("Adding published date")
777            self.publishedDate = publishedDate
778               
779        updatedDate = tree.findtext('{%s}updated' %ndgObject.ATOM_NS)
780        if updatedDate:
781            logging.debug("Adding updated date")
782            self.updatedDate = updatedDate
783           
784        logging.info("Completed data ingest")
785   
786   
787    def _parseCategoryData(self, categories):
788        logging.debug("Adding category/parameters data")
789        for category in categories:
790            cat = Category()
791            cat.fromETElement(category)
792           
793            if cat.term == self.ATOM_TYPE:
794                logging.debug("Found atom type data")
795                self.atomTypeID = cat.label
796                self.atomTypeName = self.VTD.TERM_DATA[cat.label].title
797                continue
798            elif cat.term == self.ATOM_SUBTYPE:
799                logging.debug("Found atom subtype data")
800                self.subtypeID = cat.label
801                self.subtype = cat.scheme
802                continue
803
804            self.parameters.append(cat)
805   
806
807    def setDatasetID(self, datasetID):
808        '''
809        Set the dataset ID for the atom - and generate an appropriate atom name using this
810        @param datasetID: ID to set for the atom
811        '''
812        self.datasetID = datasetID
813        self._generateAtomName(datasetID) 
814        self.atomID = self.createAtomID(datasetID)
815
816
817    def createAtomID(self, datasetID):
818        '''
819        Create a unique ID, conforming to atom standards, for atom
820        NB, see http://diveintomark.org/archives/2004/05/28/howto-atom-id
821        @param datasetID: ID of atom's dataset
822        @return: unique ID
823        '''
824        logging.info("Creating unique ID for atom")
825        if not self.atomBrowseURL:
826            self._generateAtomName(datasetID)
827        urlBit = self.atomBrowseURL.split('://')[1]
828        urlBit = urlBit.replace('#', '')
829        urlBits = urlBit.split('/')
830        host = urlBits[0].split(':')[0] # avoid the port colon - as this breaks the ID format
831        dateBit = datetime.datetime.today().strftime("%Y-%m-%d")
832       
833        id = "tag:" + host + "," + dateBit + ":/" + "/".join(urlBits[1:])
834        logging.info("- unique ID created for atom")
835        logging.debug(" - '%s'" %id)
836        return id
837       
838       
839    def _generateAtomName(self, datasetID):
840        '''
841        Generate a consistent name for the atom - with full eXist doc path
842        @param datasetID: ID of atom's dataset
843        '''
844        self.atomName = datasetID + ".atom"
845        if not self.ME.providerID:
846            raise ValueError("Provider ID has not been specified for atom - please add this and retry")
847        self.ndgURI = self.ME.providerID + "__ATOM__" + datasetID
848        self.atomBrowseURL = VTD.BROWSE_ROOT_URL + self.ndgURI
849
850
851    def _parseLinksData(self, links):
852        '''
853        Extract links and atom data from array of link elements in the XML representation of the atom
854        @param links: an array of <link> elements
855        '''
856        # firstly, get all data to start with, so we can properly process it afterwards
857        linkData = {}
858        logging.debug("Getting link data")
859        for linkTag in links:
860            link = Link()
861            link.fromETElement(linkTag)
862
863            if not linkData.has_key(link.rel):
864                linkData[link.rel] = []
865           
866            linkData[link.rel].append(link)
867
868        # there should be one self referencing link - which will provide info on the atom itself
869        if not linkData.has_key('self'):
870            errorMessage = "Atom does not have self referencing link - " + \
871                "cannot ascertain datasetID without this - please fix"
872            logging.error(errorMessage)
873            raise ValueError(errorMessage)
874       
875        # this is the link describing the atom itself
876        self.atomBrowseURL = linkData['self'][0].href
877       
878        self.datasetID = self.atomBrowseURL.split("__ATOM__")[-1]
879        self.atomName = self.datasetID + ".atom"
880        self.ndgURI = self.atomBrowseURL.split(VTD.BROWSE_ROOT_URL)[1]
881       
882        # now remove this value and the associated moles doc link
883        del linkData['self']
884        molesDoc = self.atomBrowseURL.replace('ATOM', 'NDG-B1')
885        if linkData.has_key('related'):
886            relatedLinks = []
887            for link in linkData['related']:
888                if link.href != molesDoc:
889                    relatedLinks.append(link)
890           
891            linkData['related'] = relatedLinks
892               
893        # now add the remaining links to the atom
894        for key in linkData:
895            for link in linkData[key]:
896                logging.debug("Adding link data")
897                self.relatedLinks.append(link)
898       
899
900    def _addSpatialData(self, element):
901        '''
902        Add spatial coverage element to an input element
903        @param element: element to add coverage data to
904        '''
905        logging.info("Adding spatial data to Atom")
906        if not self.minX:
907            logging.info("No spatial data specified")
908            return
909        bbox = ET.SubElement(element, "georss:where")
910        envelope = ET.SubElement(bbox, "gml:Envelope")
911        lc = ET.SubElement(envelope, "gml:lowerCorner")
912        lc.text = str(self.minX) + " " + str(self.minY)
913        uc = ET.SubElement(envelope, "gml:upperCorner")
914        uc.text = str(self.maxX) + " " + str(self.maxY)
915
916       
917    def setAttribute(self, attributeName, attributeValue):
918        '''
919        Set the value of an atom attribute - and do some basic tidying up of the string content
920        - to escape any XML unfriendly characters
921        @param attributeName: name of the attribute whose value to set
922        @param attributeValue: value to set the attribute to 
923        '''
924        logging.debug("Setting attribute, %s, to %s" %(attributeName, attributeValue))
925        origValue = attributeValue
926       
927        # escape any special characters if a value has been specified
928        # NB, need to cope with both single values and arrays
929        if attributeValue:
930            if type(attributeValue) is list:
931                newVals = []
932                for val in attributeValue:
933                    newVals.append(self.objectify(utilities.escapeSpecialCharacters(val), attributeName))
934                attributeValue = newVals
935                   
936            else:
937                attributeValue = self.objectify(utilities.escapeSpecialCharacters(attributeValue), attributeName)
938
939        # handle the special case of authors; only one author is allowed per atom
940        # - the others should be treated as contributors
941        if attributeName == "authors":
942            setattr(self, "author", attributeValue[0])
943            if len(attributeValue) > 1:
944                setattr(self, "contributors", attributeValue[1:])
945        elif attributeName == "atomAuthors":
946            if isinstance(attributeValue, list):
947                for val in attributeValue:
948                    self.ME.responsibleParties.append(val)
949            else:
950                self.ME.responsibleParties.append(attributeValue)
951        elif attributeName == "files":
952            self.addUniqueRelatedLinks(attributeValue)
953        else:
954            setattr(self, attributeName, attributeValue)
955
956
957    def objectify(self, objectVals, attributeName):
958        '''
959        Some inputs are specified as strings but need to be converted into
960        objects - do this here
961        @param objectVals: a '|' delimited string of values
962        @param attributeName: name of attribute the values belong to
963        '''
964        obj = None
965        if type(objectVals) != str:
966            return objectVals
967       
968        if attributeName == "relatedLinks":
969            obj = Link()
970        elif attributeName == "atomAuthors":
971            obj = Person(personType = Person.RESPONSIBLE_PARTY_TYPE)
972        elif attributeName == "authors":
973            # NB, ensure there is only one author tag - extra authors are contributors
974            authorType = Person.AUTHOR_TYPE
975            if self.author and self.author.hasValue():
976                authorType= Person.CONTRIBUTOR_TYPE
977            obj = Person(personType = authorType)
978        elif attributeName == 'files':
979            obj = Link()
980            objectVals = '%s|%s|%s' \
981                %(self.VTD.getTermCurrentVocabURL(VTD.METADATA_SOURCE_TERM), objectVals, VTD.METADATA_SOURCE_TERM)
982
983        if obj:
984            obj.fromString(objectVals)
985            # NB, need to set it now, just in case we don't set it before coming back
986            if attributeName == "authors" and (not self.author or not self.author.hasValue()):
987                self.author = obj
988            return obj
989       
990        return objectVals
991
992
993    def toPrettyXML(self):
994        '''
995        Returns nicely formatted XML as string
996        '''
997        atomXML = self.toXML()
998
999        # create the string
1000        logging.debug("Converting the elementtree object into a string")
1001        prettyXML = et2text(atomXML.getroot())
1002
1003        # add XML version tag
1004        prettyXML = "<?xml version=\"1.0\" encoding=\"utf-8\" ?>\n" + prettyXML
1005        logging.info("Created formatted version of XML object")
1006        return prettyXML
1007
1008
1009    def getLinksOfType(self, termID):
1010        '''
1011        Returns links in the atom related links attribute which match the specified
1012        term ID
1013        @param termID: the termID to look for in the related links - NB, this is
1014        matched to the end of the link.rel value
1015        @return links: array of Link objects with matching term type
1016        '''
1017        logging.debug("Getting atom links of type, '%s'" %termID)
1018        matchingLinks = []
1019        for link in self.relatedLinks:
1020            # firstly, handle special case where we only want the online ref type links
1021            # returned
1022            if termID == self.ONLINE_REF_LABEL:
1023                if not link.isChildAtom():
1024                    logging.debug("- found link with matching term type")
1025                    matchingLinks.append(link)
1026               
1027            elif link and link.rel and link.rel.lower().endswith(termID.lower()):
1028                logging.debug("- found link with matching term type")
1029                matchingLinks.append(link)
1030               
1031        logging.debug("Returning matched links")
1032        return matchingLinks
1033       
1034       
1035    def getLogos(self):
1036        '''
1037        Return related links that are logos
1038        @return: array of Links containing the logos for the atom
1039        '''
1040        logos = []
1041        for link in self.relatedLinks:
1042            if link.rel.lower().endswith(VTD.LOGO_TERM.lower()):
1043                logos.append(link)
1044               
1045        return logos
1046   
1047   
1048    def isGranule(self):
1049        if self.atomTypeID == VTD.GRANULE_TERM:
1050            return True
1051        return False
1052   
1053   
1054    def isDE(self):
1055        if self.atomTypeID == VTD.DE_TERM:
1056            return True
1057        return False
1058   
1059    def isDeployment(self):
1060        if self.subtypeID and self.subtypeID == VTD.DEPLOYMENT_TERM:
1061            return True
1062        return False
1063   
1064    def isDeployable(self):
1065        if (self.atomTypeID == VTD.ACTIVITY_TERM and self.subtypeID != VTD.DEPLOYMENT_TERM) or \
1066            self.atomTypeID == VTD.DPT_TERM or \
1067            self.atomTypeID == VTD.OBS_TERM:
1068            return True
1069        return False
1070       
1071       
1072    def addCSMLData(self, csmlName, csmlContent, aggregateCoverage=False, useCSMLID=False):
1073        '''
1074        Parse CSML data and add extracted info to the atom
1075        @param csmlName: name of the csml file
1076        @param csmlContent: content of the csml file - NB, if this is set to None and the
1077        file, csmlName, is available locally, CsmlParser.Dataset will read in the file
1078        directly
1079        @keyword aggregateCoverage: if set to True, only coverage data that extends the
1080        atom coverage data will be added
1081        @keyword useCSMLID: if True, use the CSML doc ID as the dataset ID - NB,
1082        this should only be True if creating a new atom - e.g. from a granulite
1083        @return csmlDoc: the CsmlParser.Dataset object with the csml data in
1084        '''
1085        logging.info("Creating CSML data model")
1086        self.csmlFile = csmlName
1087        self.contentFile = csmlName
1088        content = csmlContent or csmlName
1089   
1090        csmlDoc = CsmlParser.Dataset(file=content)
1091       
1092        logging.info("Extracting info from CSML file")
1093        logging.debug("Got dataset ID: %s" %csmlDoc.id)
1094        if useCSMLID:
1095            logging.debug(" - using this ID for the atom")
1096            self.setDatasetID(VTD.GRANULE_TERM + '_' + csmlDoc.id)
1097       
1098        title = csmlDoc.name.CONTENT
1099        logging.debug("Got dataset name (title): '%s'" %title)
1100        # NB, if a title is specified (and not as the default value), it automatically is used in
1101        # place of anything in the granulite file
1102        if title and title != "NAME OF DATASET GOES HERE":
1103            logging.info("Title, '%s', extracted from CSML file" %title)
1104            if self.title:
1105                logging.info("- NB, this will override the title specified in the granulite file ('%s')" \
1106                             %self.title)
1107            self.title = title
1108               
1109        bbox1 = csmlDoc.getBoundingBox()
1110        bbox2 = csmlDoc.getCSMLBoundingBox()
1111
1112        time = None
1113        if bbox2:
1114            time = bbox2.getTimeLimits()
1115   
1116        # now check for other parameters to add to granule
1117        # Firstly, extract the bounding envelope
1118        if bbox1:
1119            w, e = utilities.normaliseLongitude(bbox1[0],bbox1[2])
1120            n, s = (bbox1[3], bbox1[1])
1121   
1122            if not aggregateCoverage or (not self.maxY or float(n) > float(self.maxY)):
1123                self.maxY = n
1124               
1125            if not aggregateCoverage or (not self.minY or float(s) < float(self.minY)):
1126                self.minY = s
1127           
1128            if not aggregateCoverage or (not self.minX or float(w) < float(self.minX)):
1129                self.minX = w
1130   
1131            if not aggregateCoverage or (not self.maxX or float(e) > float(self.maxX)):
1132                self.maxX = e
1133           
1134            logging.debug("Got bounding box data from file: (%s, %s) , (%s, %s)" \
1135                          %(w, s, e, n))
1136           
1137            logging.debug("Updated atom bounding box data: (%s, %s) , (%s, %s)" \
1138                          %(self.minX, self.minY, self.maxX, self.maxY))
1139        else:
1140            logging.debug("No valid bounding box data found")
1141   
1142        if time:
1143            t1 = utilities.formatDateYYYYMMDD(time[0])
1144            if not aggregateCoverage or \
1145                (not self.t1 or datetime.datetime.strptime(t1, YEAR_FORMAT) < \
1146                    datetime.datetime.strptime(self.t1, YEAR_FORMAT)):
1147                self.t1 = t1
1148   
1149            t2 = time[1]
1150            if t2 and t2 != 'None':
1151                t2 = utilities.formatDateYYYYMMDD(t2)
1152                if not aggregateCoverage or \
1153                    (not self.t2 or datetime.datetime.strptime(t2, YEAR_FORMAT) > \
1154                        datetime.datetime.strptime(self.t2, YEAR_FORMAT)):
1155                    self.t2 = t2
1156           
1157            logging.debug("Got time range: %s -> %s" %(self.t1, self.t2))
1158        else:
1159            logging.debug("No valid time range data found")
1160   
1161        #create parameter summaries:
1162        #set up list to hold the parameters data
1163        parameters = []
1164        for feature in csmlDoc.featureCollection.featureMembers:
1165            if hasattr(feature.parameter, 'href'):
1166                paramTriple = ""
1167                if hasattr(feature, 'description'):
1168                    paramTriple = feature.description.CONTENT
1169                    paramTriple += " | " + feature.parameter.href
1170                   
1171                    term = ""
1172                    if hasattr(feature, 'name'):
1173                        term = feature.name.CONTENT
1174   
1175                    paramTriple += " | " + term
1176                   
1177                    logging.debug("Got parameter info: %s" %paramTriple)
1178                    parameters.append(paramTriple)
1179       
1180        # update the atom with the extracted parameters
1181        logging.info("Adding CSML parameters to granule atom")
1182        self.addParameters(parameters)
1183        logging.info("Finished adding CSML data")
1184        return csmlDoc
1185
1186
1187    def lookupAssociatedData(self, type, dr, lookupIndirectReferences=False):
1188        '''
1189        Check through the atom links and retrieve any associated data of the
1190        specified type
1191        @param type: type of associated data to lookup - currently VTD.DEPLOYMENT_TERM
1192        or VTD.DE_TERM
1193        @param dr: Instance of DocumentRetrieve object - NB, this requires eXist
1194        config details which are not available to the Atom object
1195        @keyword lookupIndirectReferences: if True, the atom ID is used to search
1196        defined deployments to find those which reference it, otherwise only
1197        deployments data featured in the atom related links are processed
1198        '''
1199        logging.info("Looking up %s info" %type)
1200       
1201        self.allActivities = []
1202        self.allObs = []
1203        self.allDpts = []
1204
1205        if type != VTD.DE_TERM and type != VTD.DEPLOYMENT_TERM:
1206            raise ValueError('Unrecognised associated data type: %s' %type)
1207       
1208        # avoid duplicating lookup effort
1209        if (type == VTD.DEPLOYMENT_TERM and self.deployments) or \
1210            (type == VTD.DE_TERM and self.dataEntities):
1211            logging.info("- this info has already been looked up - returning")
1212            return
1213
1214        # firstly, collect all the references to the info required
1215        if lookupIndirectReferences:
1216            logging.info("Looking up indirect references")
1217           
1218            # if we're looking up DE data for deployments data, need to have the
1219            # deployments info looked up first
1220            if type == VTD.DE_TERM and self.isDeployable() and not self.deployments:
1221                self.lookupAssociatedData(VTD.DEPLOYMENT_TERM, dr, lookupIndirectReferences)
1222           
1223            logging.info("Looking up references to this atom from other %s" %type)
1224           
1225            # NB, if we're looking up deployments info, we only look up references
1226            # to this atom - if we're looking up DEs, we need to look up references
1227            # to the deployments referenced by this atom
1228            urls = [self.atomBrowseURL]
1229           
1230            if type == VTD.DE_TERM and self.isDeployable():
1231                urls = []
1232                for dep in self.deployments:
1233                    urls.append(dep.browseURL)
1234                   
1235            links = []
1236            for url in urls:
1237                doc = dr.get(type, dr.ATOM_TYPE, url, \
1238                             targetCollection = eXistConnector.BASE_COLLECTION_PATH)
1239                # now need to turn this results set into actual atoms
1240                tree = ET.fromstring(doc)
1241                for atom in tree:
1242                    logging.debug("- found reference in %s" %type)
1243                    links.append(ET.tostring(atom))
1244                   
1245            logging.info("Finished looking up indirect references")
1246        else:
1247            links = self.getLinksOfType(self.VTD.DEPLOYMENT_TERM)
1248
1249        # now retrieve the references and extract the required data
1250        logging.info("Retrieving info from %s references" %type)
1251        if type == VTD.DEPLOYMENT_TERM:
1252            logging.info("Extracting links data to deployment entitites")
1253            self.deployments = []
1254            for link in links:
1255                if lookupIndirectReferences:
1256                    deploymentAtom = link
1257                else:
1258                    localID = link.href.split("__ATOM__")[-1]
1259                    deploymentAtom = dr.get(self.ME.providerID, 'ATOM', localID, \
1260                                            targetCollection = eXistConnector.BASE_COLLECTION_PATH)
1261   
1262                deployment = Deployment.Deployment(Atom(xmlString=str(deploymentAtom)))
1263                self.deployments.append(deployment)
1264               
1265                self.addUniqueLinks(self.allActivities, deployment.activities)
1266                self.addUniqueLinks(self.allObs, deployment.obs)
1267                self.addUniqueLinks(self.allDpts, deployment.dpts)
1268        else:
1269            # for DE data, just store the title + link in a Link object
1270            self.dataEntities = []
1271            logging.info("Extracting links data to data entitites")
1272            for data in links:
1273                atom = Atom(xmlString=str(data))
1274                link = Link()
1275                link.title = atom.title
1276                link.href = atom.atomBrowseURL
1277                link.rel = atom.datasetID
1278               
1279                # NB, different deployments may be used by the same DE - so
1280                # avoid duplication
1281                self.addUniqueLinks(self.dataEntities, link)
1282           
1283        logging.info("Finished looking up %s info" %type)
1284
1285
1286    def addUniqueLinks(self, dataArray, links):
1287        '''
1288        Add links to specified array - if they are not already included
1289        @param dataArray: a list, potentially arlready containing links
1290        @param links: a Link or array of Links to add to the dataArray
1291        '''
1292        logging.debug("Adding new links")
1293        if not links:
1294            return
1295       
1296        if type(links) is not list:
1297            links = [links]
1298       
1299        for link in links:
1300            if type(link) is not Link:
1301                logging.warning("Link is not of 'Link' object type (type='%s') - skipping" %type(link))
1302                continue
1303            if link not in dataArray:
1304                logging.debug("- adding unique link")
1305                dataArray.append(link)
1306        logging.debug("Finished adding links")
Note: See TracBrowser for help on using the repository browser.