source: ndgCommon/trunk/ndg/common/src/models/Atom.py @ 4970

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/ndgCommon/trunk/ndg/common/src/models/Atom.py@4970
Revision 4970, 51.9 KB checked in by cbyrom, 12 years ago (diff)

Various fixes, tidy ups and simplifications to ndgCommon codebase.

Line 
1'''
2 Class representing data in  atom format - allowing moles data to be stored and accessed in a web feed compatible way
3 
4 @author: C Byrom, Tessella Jun 2008
5'''
6import sys, logging, re, datetime
7from xml.etree import cElementTree as ET
8import csml.parser as CsmlParser
9import ndg.common.src.clients.xmldb.eXist.dbconstants as dc
10from ndg.common.src.lib.ETxmlView import et2text
11import ndg.common.src.lib.utilities as utilities
12from ndg.common.src.models.vocabtermdata import VocabTermData as VTD
13from ndg.common.src.models import MolesEntity as ME
14from ndg.common.src.models import Deployment as Deployment
15from ndg.common.src.models import AtomState
16from ndg.common.src.models.ndgObject import ndgObject
17
class AtomError(Exception):
    """
    Error type raised by the Atom class.

    The message is written to the error log when the exception is created.
    @param msg: description of the problem
    """
    def __init__(self, msg):
        # log at construction so the problem is recorded even if the
        # exception is subsequently caught and discarded
        logging.error(msg)
        super(AtomError, self).__init__(msg)
25
26
class Person(object):
    '''
    Class representing atom author type data - with name, uri and role attributes
    @keyword personType: Type of person to create - specified using the Person.._Type
    values.  Default is AUTHOR_TYPE.
    @keyword namespace: a two value array of format, ['short_namespace_name', 'full_namespace_name']
    - e.g. ['moles', 'http://ndg.nerc.ac.uk/schema/moles2beta']
    '''
    AUTHOR_TYPE = 0
    CONTRIBUTOR_TYPE = 1
    RESPONSIBLE_PARTY_TYPE = 2
    # element names, indexed by the person type constants above
    ELEMENT_NAMES = ["author", "contributor", "responsibleParty"]

    def __init__(self, personType = AUTHOR_TYPE, namespace = None):
        self.type = personType
        if namespace:
            self.ns_shortname = namespace[0]
            self.ns_fullname = namespace[1]
        else:
            # no namespace given - default to the (unprefixed) atom namespace
            self.ns_shortname = ""
            self.ns_fullname = ndgObject.ATOM_NS

        self.name = ""
        self.uri = ""
        self.role = ""

        # NB, the atom format specifies slightly different data contents
        self.uriTagName = "email"
        # NB, responsible party data is always stored in the moles section
        # - so this overrides any namespace passed in
        if self.type == self.RESPONSIBLE_PARTY_TYPE:
            self.ns_shortname = 'moles'
            self.ns_fullname = ndgObject.MOLES_NS
            self.uriTagName = "uri"

    def __str__(self):
        '''
        @return: triple string of format, 'name | uri | role' - or an empty
        string if none of these are set
        '''
        if self.hasValue():
            return self.name + " | " + self.uri + " | " + self.role
        return ""

    def hasValue(self):
        '''
        @return: True if any of name, uri or role is set; False otherwise
        '''
        return bool(self.name or self.uri or self.role)

    def fromString(self, personString):
        '''
        Populate name, uri and role from a triple string
        @param personString: string of format, 'name | uri | role'
        '''
        (self.name, self.uri, self.role) = utilities.getTripleData(personString)

    def fromETElement(self, personTag):
        '''
        Populate name, role and uri from the child elements of an element
        @param personTag: element holding the person data; children are looked
        up in this object's namespace
        '''
        self.name = personTag.findtext('{%s}name' %self.ns_fullname) or ""
        self.role = personTag.findtext('{%s}role' %self.ns_fullname) or ""
        self.uri = personTag.findtext('{%s}%s' %(self.ns_fullname, self.uriTagName)) or ""
        logging.debug("Added name: '%s', role: '%s', %s: '%s'" \
                      %(self.name, self.role, self.uriTagName, self.uri))

    def toXML(self):
        '''
        Create an element representing this person; only the values that are
        set are added as child elements
        @return: element named according to the person type
        '''
        prefix = ""
        if self.ns_shortname:
            prefix = self.ns_shortname + ':'

        author = ET.Element(prefix + self.ELEMENT_NAMES[self.type])

        if self.name:
            name = ET.SubElement(author, prefix + "name")
            name.text = self.name

        if self.uri:
            uri = ET.SubElement(author, prefix + self.uriTagName)
            uri.text = self.uri

        if self.role:
            role = ET.SubElement(author, prefix + "role")
            role.text = self.role

        return author

    def __eq__(self, other):
        '''
        Value equality - two Person objects are equal if their uri, name, role
        and type all match.  This is what makes 'if person in personArray...'
        work; __cmp__ alone is ignored by Python 3.
        NB, no matching __hash__ is defined here since instances are mutable
        '''
        if self is other:
            return True
        if not isinstance(other, Person):
            return False
        return self.uri == other.uri and self.name == other.name and \
            self.role == other.role and self.type == other.type

    def __ne__(self, other):
        # NB, python 2 does not derive != from ==, so define it explicitly
        return not self.__eq__(other)

    def __cmp__(self, person1):
        '''
        Override comparison to allow proper object comparison when checking
        if Person objects are in an array already - i.e. if person in personArray...
        @return: 0 if the objects are equal, -1 if person1 is empty, 1 otherwise
        '''
        if not person1:
            return -1

        if self.__eq__(person1):
            return 0
        return 1
117
118
class Link(object):
    '''
    Class representing an atom link - with href, title and rel attributes
    '''

    def __init__(self):
        self.href = ""
        self.title = ""
        self.rel = ""

    def fromString(self, linkString):
        '''
        Populate the link from a triple string
        @param linkString: string of format, 'href | title | rel'
        '''
        (self.href, self.title, self.rel) = utilities.getTripleData(linkString)

    def fromETElement(self, linkTag):
        '''
        Populate the link from the attributes of an element; missing
        attributes are set to empty strings
        @param linkTag: element with href, rel and title attributes
        '''
        self.href = linkTag.attrib.get('href') or ""
        self.rel = linkTag.attrib.get('rel') or ""
        self.title = linkTag.attrib.get('title') or ""

    def toXML(self):
        '''
        @return: 'link' element with href, title and rel attributes set
        '''
        link = ET.Element("link")
        link.attrib["href"] = self.href
        link.attrib["title"] = self.title
        link.attrib["rel"] = self.rel
        return link

    def hasValue(self):
        '''
        @return: True if the link has a href or title; False otherwise
        '''
        # NB, just a rel on its own is meaningless - so ignore
        return bool(self.href or self.title)

    def __str__(self):
        '''
        @return: triple string of format, 'href | title | rel' - or an empty
        string if none of these are set
        '''
        if self.href or self.title or self.rel:
            return self.href + " | " + self.title + " | " + self.rel
        return ""

    def isChildAtom(self):
        '''
        Determines whether the link refers to another atom - e.g. a link to
        a data granule
        @return True, if so; False otherwise
        '''
        # NB, endswith accepts a tuple of alternative suffixes
        atomTerms = (VTD.GRANULE_TERM, VTD.DEPLOYMENT_TERM, VTD.ACTIVITY_TERM, \
                     VTD.DPT_TERM, VTD.OBS_TERM)
        return self.rel.endswith(atomTerms)

    def __eq__(self, other):
        '''
        Value equality - two Link objects are equal if their href, title and
        rel all match.  This is what makes 'if link in linkArray...' work;
        __cmp__ alone is ignored by Python 3.
        NB, no matching __hash__ is defined here since instances are mutable
        '''
        if self is other:
            return True
        if not isinstance(other, Link):
            return False
        return self.href == other.href and self.title == other.title and \
            self.rel == other.rel

    def __ne__(self, other):
        # NB, python 2 does not derive != from ==, so define it explicitly
        return not self.__eq__(other)

    def __cmp__(self, link1):
        '''
        Override comparison to allow proper object comparison when checking
        if Link objects are in an array already - i.e. if link in linkArray...
        @return: 0 if the objects are equal, -1 if link1 is empty, 1 otherwise
        '''
        if not link1:
            return -1

        if self.__eq__(link1):
            return 0
        return 1
184
185
class Category(object):
    '''
    Atom category data holder - carries the term, scheme and label values
    '''
    def __init__(self):
        self.term = ""
        self.scheme = ""
        self.label = ""

    def fromString(self, linkString, escapeSpecialCharacters=True):
        '''
        Populate the category from a triple string of format, 'label | scheme | term'
        @param linkString: triple string to read the category values from
        @keyword escapeSpecialCharacters: passed to the triple parser as its
        doEscape keyword (default True)
        '''
        tripleData = utilities.getTripleData(linkString, \
            doEscape=escapeSpecialCharacters)
        (self.label, self.scheme, self.term) = tripleData

    def fromETElement(self, linkTag):
        '''
        Populate the category from the attributes of an element; any missing
        attribute is set to an empty string
        @param linkTag: element with term, label and scheme attributes
        '''
        attributes = linkTag.attrib
        self.term = attributes.get('term') or ""
        self.label = attributes.get('label') or ""
        self.scheme = attributes.get('scheme') or ""

    def toXML(self):
        '''
        @return: 'category' element with term, scheme and label attributes set
        '''
        categoryElement = ET.Element("category")
        categoryElement.set("term", self.term)
        categoryElement.set("scheme", self.scheme)
        categoryElement.set("label", self.label)
        return categoryElement

    def hasValue(self):
        '''
        @return: True if any of scheme, label or term is set; False otherwise
        '''
        return bool(self.scheme or self.label or self.term)
221
222
class Atom(object):
    '''
    Class representing data in atom format - allowing moles data to be stored
    and accessed in a web feed compatible way
    '''

    # labels for use with the atom categories
    # - these are used as the category 'term' to mark the type/subtype entries
    ATOM_TYPE = "ATOM_TYPE"
    ATOM_SUBTYPE = "ATOM_SUBTYPE"

    # labels for use with the templates to set/extract specific inputs
    ONLINE_REF_LABEL = "online_ref"
    PARAMETER_LABEL = "parameter"
    ATOM_REF_LABEL = "atom_ref"
    DELIMITER = "---"
    REMOVE_LABEL = "remove"
   
    # format to use for t1-t2 date range
    YEAR_FORMAT = '%Y-%m-%d'
238
239    def __init__(self, atomType = None, vocabTermData = None, ndgObject = None, \
240                 xmlString = None, state = AtomState.WORKING_STATE, **inputs):
241        '''
242        Constructor - initialise the atom variables
243        @keyword atomType: type of atom to set up
244        @keyword vocabTermData: instance of VocabTermData object to use with atom
245        @keywork ndgObject: instance of ndgObject to use with atom
246        @keyword xmlString: XML representation of atom - will be parsed to populate
247        the atom data
248        @keyword state:  AtomState object representing the state of the atom
249        '''
250        logging.info("Initialising atom")
251        if atomType:
252            logging.info(" - of type '%s'" %atomType)
253        self.atomTypeID = atomType
254
255        # some data have further subtypes specified
256        self.subtypeID = None # this should be the termID
257        self.subtype = None # and this should be the fully formed vocab URL
258       
259        self.ndgObject = ndgObject
260
261        self.atomName = None
262        self.files = []
263        self.author = Person()
264        self.contributors = []
265        self.atomAuthors = []
266        self.parameters = []
267        self.spatialData = []
268        self.temporalData = []
269        self.relatedLinks = []
270        self.summary = []
271        self.content = []
272        # NB, this deployments data duplicates other atom data - and is only used for a
273        # convenient way to collect the info (by lookupAssociatedData()) for use in templates
274        self.deployments = []
275        # ditto for the following field
276        self.dataEntities = []
277           
278        self.csmlFile = None
279        self.cdmlFile = None
280        # general variable to use for setting the atom content - NB, if a csmlFile is specified
281        # (either directly or via a cdmlFile specification), this will be the content by default
282        # for this purpose
283        self.contentFile = None     
284        self.title = None
285        self.datasetID = None        # NB, the dataset id ends up in the atomName - <path><datasetID>.atom
286        self.atomID = None
287   
288        # boundary box info - to replace spatial/temporalData?
289        self.minX = None
290        self.minY = None
291        self.maxX = None
292        self.maxY = None
293        self.t1 = None
294        self.t2 = None
295
296        self.ME = ME.MolesEntity()
297       
298        # date when the atom was first ingested
299        self.publishedDate = None
300
301        # last update date
302        self.updatedDate = None
303
304        # assume atom in working state by default - this is used to define what collection
305        # in eXist the atom is stored in
306        self.state = state
307       
308        # additional, non standard atom data can be included in the molesExtra element
309        if vocabTermData:
310            self.VTD = vocabTermData
311        else:
312            self.VTD = VTD()
313       
314        if xmlString:
315            self.fromString(xmlString)
316
317        # if inputs passed in as dict, add these now
318        if inputs:
319            logging.info("Adding info to atom from input dict")
320            logging.debug(inputs)
321            self.__dict__.update(inputs)
322           
323            # NB, this doesn't trigger the Summary Property, so do this
324            # explicitly, if need be
325            if inputs.has_key('Summary'):
326                self.Summary = inputs.get('Summary')
327            if inputs.has_key('Content'):
328                self.Content = inputs.get('Content')
329            if inputs.has_key('author'):
330                name = inputs.get('author')
331                author = Person()
332                author.fromString(name)
333                self.author = author
334           
335            # also pass any moles data up to the moles entity object
336            if inputs.has_key('providerID'):
337                self.ME.providerID = inputs.get('providerID')
338               
339            if inputs.has_key('abbreviation'):
340                self.ME.abbreviation = inputs.get('abbreviation')
341
342        if self.atomTypeID:
343            self.atomTypeName = self.VTD.TERM_DATA[self.atomTypeID].title
344
345        logging.info("Atom initialised")
346
347
348    def addOnlineReferences(self, links):
349        '''
350        Add online reference data associated with the atom
351        - NB, care needs to be taken here since this data is stored in the atom
352        link elements and these are also used for the various atom associations
353        @param links: a Link or array of Links to add to the relatedLinks attribute
354        '''
355        logging.debug("Adding online references")
356        if not links:
357            return
358       
359        if type(links) is not list:
360            links = [links]
361       
362        # firstly clear out any online refs data from the existing related links
363        newLinks = []
364        for link in self.relatedLinks:
365            if link.isChildAtom():
366                newLinks.append(link)
367       
368        newLinks.extend(links)
369        self.relatedLinks = newLinks
370        logging.debug("Online references added")
371
372
373    def addUniqueRelatedLinks(self, links):
374        '''
375        Add links to relatedLinks array - if they are not already included
376        @param links: a Link or array of Links to add to the relatedLinks attribute
377        '''
378        self.addUniqueLinks(self.relatedLinks, links)
379       
380
381    def removeRelatedLinks(self, linksToDelete):
382        '''
383        Remove any links in the input list from the atom's related links list
384        @param linksToDelete: array of Link objects to remove from atom
385        '''
386        logging.debug("Removing related links from atom")
387        if not linksToDelete:
388            return
389       
390        if type(linksToDelete) is not list:
391            linksToDelete = [linksToDelete]
392       
393        updatedLinks = []
394        for link in self.relatedLinks:
395            if type(link) is not Link:
396                logging.warning("Link is not of 'Link' object type (type='%s') - skipping" %type(link))
397                continue
398            if link in linksToDelete:
399                logging.debug("- found link to remove")
400            else:
401                updatedLinks.append(link)
402
403        self.relatedLinks = updatedLinks
404        logging.debug("Links removed")
405
406    def getPublicationStatePath(self):
407        '''
408        Determine the correct publication state collection for the atom
409        @return collectionPath: collection path for the publication state of the atom
410        '''
411        logging.debug("Getting collection path for atom publication state")
412        collectionPath = dc.ATOM_COLLECTION_PATH + self.state.collectionPath
413        logging.debug("Returning publication state collection, '%s'" %collectionPath)
414        return collectionPath
415       
416
417    def getDefaultEntityCollectionPath(self):
418        '''
419        Determine the correct collection for the entity type of the atom
420        @return entityPath: collection path for the data type of the atom
421        '''
422        logging.debug("Getting collection path for atom entity type")
423        collectionPath = self.getPublicationStatePath()
424       
425        if self.atomTypeID == VTD.DE_TERM:
426            collectionPath += dc.DE_COLLECTION_PATH
427        elif self.atomTypeID == VTD.GRANULE_TERM:
428            collectionPath += dc.GRANULE_COLLECTION_PATH
429        elif self.atomTypeID == VTD.ACTIVITY_TERM and \
430            self.subtypeID == VTD.DEPLOYMENT_TERM:
431            collectionPath += dc.DEPLOYMENTS_COLLECTION_PATH
432        else:
433            collectionPath += dc.DEPLOYMENT_COLLECTION_PATH
434       
435        logging.debug("Returning entity collection, '%s'" %collectionPath)
436        return collectionPath
437       
438
439    def getDefaultCollectionPath(self):
440        '''
441        Determine the correct collection to use for the atom in eXist
442        '''
443        logging.debug("Getting default collection path for atom")
444        collectionPath = self.getDefaultEntityCollectionPath()
445        if not self.ME.providerID:
446            raise AtomError("Error: cannot determine atom collection path because " + \
447                            "the provider ID is not defined")
448           
449        collectionPath += self.ME.providerID + "/"
450        logging.debug("Returning collection, '%s'" %collectionPath)
451        return collectionPath
452
453
454    def __addAtomTypeDataXML(self, root):
455        '''
456        Add the atom type, and subtype data, if available, to atom categories
457        - and lookup and add the appropriate vocab term data
458        '''
459        if self.atomTypeID:
460            logging.info("Adding atom type info to XML output")
461            category = Category()
462            category.label = self.atomTypeID
463            # look up the appropriate vocab term data
464            category.scheme = self.VTD.getTermCurrentVocabURL(self.atomTypeID)
465            category.term = self.ATOM_TYPE
466            root.append(category.toXML())
467
468        if self.subtypeID:
469            logging.info("Adding atom subtype info to XML output")
470            # NB subtypes not all defined, so leave this out for the moment
471            category.label = self.subtypeID
472            # look up the appropriate vocab term data
473            category.scheme = self.VTD.getTermCurrentVocabURL(self.subtypeID)
474            category.term = self.ATOM_SUBTYPE
475            root.append(category.toXML())
476
477
478    def addMolesEntityData(self, abbreviation, provider_id, object_creation_time):
479        '''
480        Add data to include in the moles entity element
481        '''
482        logging.debug('Adding moles entity information')
483        self.ME.abbreviation = abbreviation
484        self.ME.providerID = provider_id
485        self.ME.createdDate = utilities.getISO8601Date(object_creation_time)
486        logging.debug('Moles entity information added')
487
488
489    def addAuthors(self, authors):
490        '''
491        Add author data appropriately to the atom
492        NB, these will overwrite any existing authors of the same type
493        @param authors: list of Person objects with the author data
494        '''
495        logging.debug('Adding authors data to Atom')
496        isFirstAuthor = {}
497        authorArray = None
498        for author in authors:
499            # NB, we're only allowed one atom author
500            if author.type == Person.AUTHOR_TYPE:
501                self.author = author
502                   
503                if isFirstAuthor.has_key(author.type):
504                    raise AtomError("Error: an atom can only have one author specified")
505                isFirstAuthor[author.type] = 1
506                continue
507            elif author.type == Person.CONTRIBUTOR_TYPE:
508                authorArray = self.contributors
509            elif author.type == Person.RESPONSIBLE_PARTY_TYPE:
510                authorArray = self.ME.responsibleParties
511               
512            # check if this is the first addition - if so, clear out the
513            # array in advance
514            if not isFirstAuthor.has_key(author.type):
515                logging.debug("Clearing out author array")
516                # NB, need to be careful to clear the array, not create a ref
517                # to a new array
518                del authorArray[:]
519                isFirstAuthor[author.type] = 1
520
521            if author.hasValue() and author not in authorArray:
522                logging.debug("Adding author (type:'%s', name:'%s', uri:'%s', role:'%s')" \
523                              %(author.type, author.name, author.uri, author.role))
524                authorArray.append(author)
525
526        logging.debug('Finished adding authors data')
527
528
529    def _isNewParameter(self, param):
530        '''
531        Check if a parameter is already specified in the atom, return False if
532        so, otherwise return True
533        '''
534        for p in self.parameters:
535            if p.term == param.term and \
536                p.scheme == param.scheme and \
537                p.label == param.label:
538                return False
539        return True
540
541
542    def addRelatedLinks(self, linkVals):
543        '''
544        Add related links in string format - converting to Link objects
545        NB, only add the link if it is unique
546       
547        @param linkVals: string of format, 'uri | title | vocabServerURL'
548        '''
549        link = self.objectify(linkVals, 'relatedLinks')
550        if link not in self.relatedLinks:
551            self.relatedLinks.append(link)
552
553
554    def addParameters(self, params):
555        '''
556        Add a parameter to list - ensuring it is unique and has been formatted and tidied appropriately
557        @params param: parameter, as string array, to add to atom parameters collection
558        '''
559        # avoid strings being parsed character by character
560        if type(params) is str:
561            params = [params]
562           
563        for param in params:
564            # firstly tidy parameter
565            param = utilities.tidyUpParameters(param)
566            category = Category()
567            # NB, data already tidied up here, so set keyword to avoid this happening again
568            category.fromString(param, escapeSpecialCharacters=True)
569
570            # now check for uniqueness
571            if self._isNewParameter(category):
572                logging.debug("Adding new parameter: %s" %param)
573                self.parameters.append(category)
574   
575   
576    def _linksToXML(self, root):
577        '''
578        Add required links to the input element
579        @param root: element to add links to - NB, should be the root element of the atom
580        '''
581        selfLink = ET.SubElement(root, "link")
582        selfLink.attrib["href"] = self.atomBrowseURL
583        selfLink.attrib["rel"] = "self"
584       
585        for relatedLink in self.relatedLinks:
586            if relatedLink.hasValue():
587                root.append(relatedLink.toXML())
588   
589    def toXML(self):
590        '''
591        Convert the atom into XML representation and return this
592        @return: xml version of atom
593        '''
594        logging.info("Creating formatted XML version of Atom")
595        root = ET.Element("entry")
596        root.attrib["xmlns"] = ndgObject.ATOM_NS
597        root.attrib["xmlns:moles"] = ndgObject.MOLES_NS
598        root.attrib["xmlns:georss"] = ndgObject.GEOSS_NS
599        root.attrib["xmlns:gml"] = ndgObject.GML_NS
600        id = ET.SubElement(root, "id")
601        id.text = self.atomID
602        title = ET.SubElement(root, "title")
603        title.text = self.title
604        self._linksToXML(root)
605
606        if self.author and self.author.hasValue():
607            root.append(self.author.toXML())
608           
609        for contributor in self.contributors:
610            root.append(contributor.toXML())
611
612        # add parameters data
613        for param in self.parameters:
614            if param.hasValue():
615                root.append(param.toXML())
616
617        # add the type and subtype data
618        self.__addAtomTypeDataXML(root)
619                   
620        summary = ET.SubElement(root, "summary")
621        summary.text = self.Summary
622                   
623        # add link to content, if required - NB, can only have one content element in atom
624        # - and this is mandatory
625        content = ET.SubElement(root, "content")
626        contentFile = self.contentFile or self.csmlFile or self.cdmlFile
627        if contentFile:
628            content.attrib["type"] = "application/xml"
629            content.attrib["src"] = contentFile
630        else:
631            content.attrib["type"] = "xhtml"
632            div = ET.SubElement(content, 'div')
633            div.attrib["xmlns:xhtml"] = ndgObject.XHTML_NS
634            div.text = self.Content
635       
636        # if there's a published date already defined, assume we're doing an update now
637        # NB, update element is mandatory
638        currentDate = datetime.datetime.today().strftime("%Y-%m-%dT%H:%M:%SZ")
639        if not self.publishedDate:
640            self.publishedDate = currentDate
641
642        updated = ET.SubElement(root, "updated")
643        if not self.updatedDate:
644            self.updatedDate = currentDate
645        updated.text = self.updatedDate
646
647        published = ET.SubElement(root, "published")
648        published.text = self.publishedDate
649
650        # add the moles entity section, if it is required
651        if self.ME:
652            root.append(self.ME.toXML())
653
654        # add temporal range data, if available
655        temporalRange = ET.SubElement(root, "moles:temporalRange")
656        if self.t1:
657            temporalRange.text = self.t1
658            if self.t2:
659                temporalRange.text += "/" + self.t2
660
661        # add spatial range data, if available
662        self._addSpatialData(root)
663
664        tree = ET.ElementTree(root)
665        logging.info("XML version of Atom created")
666        return tree
667
668
669    def __getSummary(self):
670        logging.debug("Getting summary data")
671        summaryString = ""
672        for summary_line in self.summary:
673            summaryString += summary_line + "\n"
674
675        return summaryString
676
677    def __setSummary(self, summary):
678        logging.debug("Adding summary data")
679        self.summary = []
680        for summary_line in summary.split('\n'):
681            self.summary.append(utilities.escapeSpecialCharacters(summary_line))
682           
683    Summary = property(fset=__setSummary, fget=__getSummary, doc="Atom summary")
684
685
686    def __getContent(self):
687        logging.debug("Getting content data")
688        contentString = ""
689        # NB, there must be content specified in an atom
690        if not self.content:
691            return "Metadata document"
692       
693        for content_line in self.content:
694            contentString += content_line + "\n"
695
696        return contentString
697
698    def __setContent(self, content):
699        logging.debug("Adding content data")
700        self.content = []
701        for content_line in content.split('\n'):
702            self.content.append(content_line)
703           
704    Content = property(fset=__setContent, fget=__getContent, doc="Atom content")
705
706           
707    def fromString(self, xmlString):
708        '''
709        Initialise Atom object using an xmlString
710        @param xmlString: representation of atom as an XML string
711        '''
712        logging.info("Ingesting data from XML string")
713        logging.debug("Create elementtree instance with XML string")
714        tree = ET.fromstring(xmlString)
715        title = tree.findtext('{%s}title' %ndgObject.ATOM_NS)
716        if title:
717            logging.debug("Adding title data")
718            self.title = title
719
720        summary = tree.findtext('{%s}summary' %ndgObject.ATOM_NS)
721        if summary:
722            self.Summary = summary#.decode('unicode_escape')
723
724        authorElement = tree.find('{%s}author' %ndgObject.ATOM_NS)
725        if authorElement:
726            logging.debug("Adding author data")
727            author = Person()
728            author.fromETElement(authorElement)
729            self.author = author
730
731        contributorElements = tree.findall('{%s}contributor' %ndgObject.ATOM_NS)
732        for contributorElement in contributorElements:
733            logging.debug("Adding contributor data")
734            contributor = Person(personType = Person.CONTRIBUTOR_TYPE)
735            contributor.fromETElement(contributorElement)
736            self.contributors.append(contributor)
737
738        molesElement = tree.find('{%s}entity' %ndgObject.MOLES_NS)
739        if molesElement:
740            self.ME.fromET(molesElement)
741               
742        atomID = tree.findtext('{%s}id' %ndgObject.ATOM_NS)
743        self.__parseAtomID(atomID)
744       
745        self._parseCategoryData(tree.findall('{%s}category' %ndgObject.ATOM_NS))
746
747        self._parseLinksData(tree.findall('{%s}link' %ndgObject.ATOM_NS))
748           
749        contentTag = tree.find('{%s}content' %ndgObject.ATOM_NS)
750        if contentTag != None:
751            logging.debug("Found content tag - checking for CSML/CDML file data")
752            file = contentTag.attrib.get('src')
753            if file:
754                # NB, the path will reveal more reliably whether we're dealing with CSML and CDML files
755                if file.upper().find('CSML') > -1:
756                    logging.debug("Adding CSML file data")
757                    self.csmlFile = file
758                elif file.upper().find('CDML') > -1:
759                    logging.debug("Adding CDML file data")
760                    self.cdmlFile = file
761                self.contentFile = file
762            else:
763                logging.debug("No file data - adding contents of element instead")
764                div = contentTag.findtext('{%s}div'%ndgObject.ATOM_NS)
765                self.Content = div
766       
767        range = tree.findtext('{%s}temporalRange' %ndgObject.MOLES_NS)
768        if range:
769            logging.debug("Adding temporal range data")
770            timeData = range.split('/')
771            self.t1 = timeData[0]
772            if len(timeData) > 1:
773                self.t2 = timeData[1]
774       
775        where = tree.find('{%s}where' %ndgObject.GEOSS_NS)
776        if where:
777            # NB, this parser won't mind if we're dealing with Envelope or EnvelopeWithTimePeriod
778            minBBox = where.findall('.//{%s}lowerCorner' %ndgObject.GML_NS)
779            if minBBox:
780                logging.debug("Adding min spatial range data")
781                minBBox = minBBox[0]
782                spatialData = minBBox.text.split()
783                self.minX = spatialData[0]
784                if len(spatialData) > 1:
785                    self.minY = spatialData[1]
786           
787            maxBBox = where.findall('.//{%s}upperCorner' %ndgObject.GML_NS)
788            if maxBBox:
789                maxBBox = maxBBox[0]
790                logging.debug("Adding max spatial range data")
791                spatialData = maxBBox.text.split()
792                self.maxX = spatialData[0]
793                if len(spatialData) > 1:
794                    self.maxY = spatialData[1]
795               
796        publishedDate = tree.findtext('{%s}published' %ndgObject.ATOM_NS)
797        if publishedDate:
798            logging.debug("Adding published date")
799            self.publishedDate = publishedDate
800               
801        updatedDate = tree.findtext('{%s}updated' %ndgObject.ATOM_NS)
802        if updatedDate:
803            logging.debug("Adding updated date")
804            self.updatedDate = updatedDate
805           
806        logging.info("Completed data ingest")
807   
808   
809    def _parseCategoryData(self, categories):
810        logging.debug("Adding category/parameters data")
811        for category in categories:
812            cat = Category()
813            cat.fromETElement(category)
814           
815            if cat.term == self.ATOM_TYPE:
816                logging.debug("Found atom type data")
817                self.atomTypeID = cat.label
818                self.atomTypeName = self.VTD.TERM_DATA[cat.label].title
819                continue
820            elif cat.term == self.ATOM_SUBTYPE:
821                logging.debug("Found atom subtype data")
822                self.subtypeID = cat.label
823                self.subtype = cat.scheme
824                continue
825
826            self.parameters.append(cat)
827
828
829    def __parseAtomID(self, atomID):
830        '''
831        Given an atom ID, extract the useful bits of info and set these on
832        the relevant atom attributes
833        @param atomID: an atom ID in the 'tag' format
834        '''
835        logging.debug("Extracting atom info from ID, '%s'" %atomID)
836        self.atomID = atomID
837        self.datasetID = atomID.split("__ATOM__")[-1]
838        self._generateAtomName(self.datasetID)
839        logging.debug("- all info extracted")
840   
841
842    def setDatasetID(self, datasetID):
843        '''
844        Set the dataset ID for the atom - and generate an appropriate atom name using this
845        @param datasetID: ID to set for the atom
846        '''
847        self.datasetID = datasetID
848        self._generateAtomName(datasetID) 
849        self.atomID = self.createAtomID(datasetID)
850
851
852    def createAtomID(self, datasetID):
853        '''
854        Create a unique ID, conforming to atom standards, for atom
855        NB, see http://diveintomark.org/archives/2004/05/28/howto-atom-id
856        @param datasetID: ID of atom's dataset
857        @return: unique ID
858        '''
859        logging.info("Creating unique ID for atom")
860        if not self.atomBrowseURL:
861            self._generateAtomName(datasetID)
862        urlBit = self.atomBrowseURL.split('://')[1]
863        urlBit = urlBit.replace('#', '')
864        urlBits = urlBit.split('/')
865        host = urlBits[0].split(':')[0] # avoid the port colon - as this breaks the ID format
866        dateBit = datetime.datetime.today().strftime("%Y-%m-%d")
867       
868        id = "tag:" + host + "," + dateBit + ":/" + "/".join(urlBits[1:])
869        logging.info("- unique ID created for atom")
870        logging.debug(" - '%s'" %id)
871        return id
872       
873       
874    def _generateAtomName(self, datasetID):
875        '''
876        Generate a consistent name for the atom - with full eXist doc path
877        @param datasetID: ID of atom's dataset
878        '''
879        self.atomName = datasetID + ".atom"
880        if not self.ME.providerID:
881            raise ValueError("Provider ID has not been specified for atom - please add this and retry")
882        self.ndgURI = self.ME.providerID + "__ATOM__" + datasetID
883        self.atomBrowseURL = VTD.BROWSE_ROOT_URL + self.ndgURI
884
885
886    def _parseLinksData(self, links):
887        '''
888        Extract links and atom data from array of link elements in the XML representation of the atom
889        @param links: an array of <link> elements
890        '''
891        # firstly, get all data to start with, so we can properly process it afterwards
892        linkData = {}
893        logging.debug("Getting link data")
894        for linkTag in links:
895            link = Link()
896            link.fromETElement(linkTag)
897
898            if not linkData.has_key(link.rel):
899                linkData[link.rel] = []
900           
901            linkData[link.rel].append(link)
902
903        # there should be one self referencing link - which will provide info on the atom itself
904        if not linkData.has_key('self'):
905            errorMessage = "Atom does not have self referencing link - " + \
906                "cannot ascertain datasetID without this - please fix"
907            logging.error(errorMessage)
908            raise ValueError(errorMessage)
909       
910        # this is the link describing the atom itself
911        self.atomBrowseURL = linkData['self'][0].href
912       
913        self.datasetID = self.atomBrowseURL.split("__ATOM__")[-1]
914        self.atomName = self.datasetID + ".atom"
915        # NB, only split on the stem, since the browse host may not be
916        # the same as that defined in VTD
917        self.ndgURI = self.atomBrowseURL.split(VTD.BROWSE_STEM_URL)[-1]
918       
919        # now remove this value and the associated moles doc link
920        del linkData['self']
921        molesDoc = self.atomBrowseURL.replace('ATOM', 'NDG-B1')
922        if linkData.has_key('related'):
923            relatedLinks = []
924            for link in linkData['related']:
925                if link.href != molesDoc:
926                    relatedLinks.append(link)
927           
928            linkData['related'] = relatedLinks
929               
930        # now add the remaining links to the atom
931        for key in linkData:
932            for link in linkData[key]:
933                logging.debug("Adding link data")
934                self.relatedLinks.append(link)
935       
936
937    def _addSpatialData(self, element):
938        '''
939        Add spatial coverage element to an input element
940        @param element: element to add coverage data to
941        '''
942        logging.info("Adding spatial data to Atom")
943        if not self.minX:
944            logging.info("No spatial data specified")
945            return
946        bbox = ET.SubElement(element, "georss:where")
947        envelope = ET.SubElement(bbox, "gml:Envelope")
948        lc = ET.SubElement(envelope, "gml:lowerCorner")
949        lc.text = str(self.minX) + " " + str(self.minY)
950        uc = ET.SubElement(envelope, "gml:upperCorner")
951        uc.text = str(self.maxX) + " " + str(self.maxY)
952
953       
954    def setAttribute(self, attributeName, attributeValue):
955        '''
956        Set the value of an atom attribute - and do some basic tidying up of the string content
957        - to escape any XML unfriendly characters
958        @param attributeName: name of the attribute whose value to set
959        @param attributeValue: value to set the attribute to 
960        '''
961        logging.debug("Setting attribute, %s, to %s" %(attributeName, attributeValue))
962        origValue = attributeValue
963       
964        # escape any special characters if a value has been specified
965        # NB, need to cope with both single values and arrays
966        if attributeValue:
967            if type(attributeValue) is list:
968                newVals = []
969                for val in attributeValue:
970                    newVals.append(self.objectify(utilities.escapeSpecialCharacters(val), attributeName))
971                attributeValue = newVals
972                   
973            else:
974                attributeValue = self.objectify(utilities.escapeSpecialCharacters(attributeValue), attributeName)
975
976        # handle the special case of authors; only one author is allowed per atom
977        # - the others should be treated as contributors
978        if attributeName == "authors":
979            setattr(self, "author", attributeValue[0])
980            if len(attributeValue) > 1:
981                setattr(self, "contributors", attributeValue[1:])
982        elif attributeName == "atomAuthors":
983            if isinstance(attributeValue, list):
984                for val in attributeValue:
985                    self.ME.responsibleParties.append(val)
986            else:
987                self.ME.responsibleParties.append(attributeValue)
988        elif attributeName == "files":
989            self.addUniqueRelatedLinks(attributeValue)
990        else:
991            setattr(self, attributeName, attributeValue)
992
993
994    def objectify(self, objectVals, attributeName):
995        '''
996        Some inputs are specified as strings but need to be converted into
997        objects - do this here
998        @param objectVals: a '|' delimited string of values
999        @param attributeName: name of attribute the values belong to
1000        '''
1001        obj = None
1002        if type(objectVals) != str:
1003            return objectVals
1004       
1005        if attributeName == "relatedLinks":
1006            obj = Link()
1007        elif attributeName == "atomAuthors":
1008            obj = Person(personType = Person.RESPONSIBLE_PARTY_TYPE)
1009        elif attributeName == "authors":
1010            # NB, ensure there is only one author tag - extra authors are contributors
1011            authorType = Person.AUTHOR_TYPE
1012            if self.author and self.author.hasValue():
1013                authorType= Person.CONTRIBUTOR_TYPE
1014            obj = Person(personType = authorType)
1015        elif attributeName == 'files':
1016            obj = Link()
1017            objectVals = '%s|%s|%s' \
1018                %(self.VTD.getTermCurrentVocabURL(VTD.METADATA_SOURCE_TERM), objectVals, VTD.METADATA_SOURCE_TERM)
1019
1020        if obj:
1021            obj.fromString(objectVals)
1022            # NB, need to set it now, just in case we don't set it before coming back
1023            if attributeName == "authors" and (not self.author or not self.author.hasValue()):
1024                self.author = obj
1025            return obj
1026       
1027        return objectVals
1028
1029
1030    def toPrettyXML(self):
1031        '''
1032        Returns nicely formatted XML as string
1033        '''
1034        atomXML = self.toXML()
1035
1036        # create the string
1037        logging.debug("Converting the elementtree object into a string")
1038        prettyXML = et2text(atomXML.getroot())
1039
1040        # add XML version tag
1041        prettyXML = "<?xml version=\"1.0\" encoding=\"utf-8\" ?>\n" + prettyXML
1042        logging.info("Created formatted version of XML object")
1043        return prettyXML
1044
1045
1046    def getLinksOfType(self, termID):
1047        '''
1048        Returns links in the atom related links attribute which match the specified
1049        term ID
1050        @param termID: the termID to look for in the related links - NB, this is
1051        matched to the end of the link.rel value
1052        @return links: array of Link objects with matching term type
1053        '''
1054        logging.debug("Getting atom links of type, '%s'" %termID)
1055        matchingLinks = []
1056        for link in self.relatedLinks:
1057            # firstly, handle special case where we only want the online ref type links
1058            # returned
1059            if termID == self.ONLINE_REF_LABEL:
1060                if not link.isChildAtom():
1061                    logging.debug("- found link with matching term type")
1062                    matchingLinks.append(link)
1063               
1064            elif link and link.rel and link.rel.lower().endswith(termID.lower()):
1065                logging.debug("- found link with matching term type")
1066                matchingLinks.append(link)
1067               
1068        logging.debug("Returning matched links")
1069        return matchingLinks
1070       
1071       
1072    def getLogos(self):
1073        '''
1074        Return related links that are logos
1075        @return: array of Links containing the logos for the atom
1076        '''
1077        logos = []
1078        for link in self.relatedLinks:
1079            if link.rel.lower().endswith(VTD.LOGO_TERM.lower()):
1080                logos.append(link)
1081               
1082        return logos
1083   
1084   
1085    def isGranule(self):
1086        if self.atomTypeID == VTD.GRANULE_TERM:
1087            return True
1088        return False
1089   
1090   
1091    def isDE(self):
1092        if self.atomTypeID == VTD.DE_TERM:
1093            return True
1094        return False
1095   
1096    def isDeployment(self):
1097        if self.subtypeID and self.subtypeID == VTD.DEPLOYMENT_TERM:
1098            return True
1099        return False
1100   
1101    def isDeployable(self):
1102        if (self.atomTypeID == VTD.ACTIVITY_TERM and self.subtypeID != VTD.DEPLOYMENT_TERM) or \
1103            self.atomTypeID == VTD.DPT_TERM or \
1104            self.atomTypeID == VTD.OBS_TERM:
1105            return True
1106        return False
1107   
1108    def isPublished(self):
1109        '''
1110        Check state of atom doc - if published or Published return True,
1111        otherwise return False
1112        '''
1113        return self.state.isPublishedState()
1114       
1115       
1116    def addCSMLData(self, csmlName, csmlContent, aggregateCoverage=False, useCSMLID=False):
1117        '''
1118        Parse CSML data and add extracted info to the atom
1119        @param csmlName: name of the csml file
1120        @param csmlContent: content of the csml file - NB, if this is set to None and the
1121        file, csmlName, is available locally, CsmlParser.Dataset will read in the file
1122        directly
1123        @keyword aggregateCoverage: if set to True, only coverage data that extends the
1124        atom coverage data will be added
1125        @keyword useCSMLID: if True, use the CSML doc ID as the dataset ID - NB,
1126        this should only be True if creating a new atom - e.g. from a granulite
1127        @return csmlDoc: the CsmlParser.Dataset object with the csml data in
1128        '''
1129        logging.info("Creating CSML data model")
1130        self.csmlFile = csmlName
1131        self.contentFile = csmlName
1132        content = csmlContent or csmlName
1133   
1134        csmlDoc = CsmlParser.Dataset(file=content)
1135       
1136        logging.info("Extracting info from CSML file")
1137        logging.debug("Got dataset ID: %s" %csmlDoc.id)
1138        if useCSMLID:
1139            logging.debug(" - using this ID for the atom")
1140            self.setDatasetID(VTD.GRANULE_TERM + '_' + csmlDoc.id)
1141       
1142        title = csmlDoc.name.CONTENT
1143        logging.debug("Got dataset name (title): '%s'" %title)
1144        # NB, if a title is specified (and not as the default value), it automatically is used in
1145        # place of anything in the granulite file
1146        if title and title != "NAME OF DATASET GOES HERE":
1147            logging.info("Title, '%s', extracted from CSML file" %title)
1148            if self.title:
1149                logging.info("- NB, this will override the title specified in the granulite file ('%s')" \
1150                             %self.title)
1151            self.title = title
1152               
1153        bbox1 = csmlDoc.getBoundingBox()
1154        bbox2 = csmlDoc.getCSMLBoundingBox()
1155
1156        time = None
1157        if bbox2:
1158            time = bbox2.getTimeLimits()
1159   
1160        # now check for other parameters to add to granule
1161        # Firstly, extract the bounding envelope
1162        if bbox1:
1163            w, e = utilities.normaliseLongitude(bbox1[0],bbox1[2])
1164            n, s = (bbox1[3], bbox1[1])
1165   
1166            if not aggregateCoverage or (not self.maxY or float(n) > float(self.maxY)):
1167                self.maxY = n
1168               
1169            if not aggregateCoverage or (not self.minY or float(s) < float(self.minY)):
1170                self.minY = s
1171           
1172            if not aggregateCoverage or (not self.minX or float(w) < float(self.minX)):
1173                self.minX = w
1174   
1175            if not aggregateCoverage or (not self.maxX or float(e) > float(self.maxX)):
1176                self.maxX = e
1177           
1178            logging.debug("Got bounding box data from file: (%s, %s) , (%s, %s)" \
1179                          %(w, s, e, n))
1180           
1181            logging.debug("Updated atom bounding box data: (%s, %s) , (%s, %s)" \
1182                          %(self.minX, self.minY, self.maxX, self.maxY))
1183        else:
1184            logging.debug("No valid bounding box data found")
1185   
1186        if time:
1187            t1 = utilities.formatDateYYYYMMDD(time[0])
1188            if not aggregateCoverage or \
1189                (not self.t1 or datetime.datetime.strptime(t1, YEAR_FORMAT) < \
1190                    datetime.datetime.strptime(self.t1, YEAR_FORMAT)):
1191                self.t1 = t1
1192   
1193            t2 = time[1]
1194            if t2 and t2 != 'None':
1195                t2 = utilities.formatDateYYYYMMDD(t2)
1196                if not aggregateCoverage or \
1197                    (not self.t2 or datetime.datetime.strptime(t2, YEAR_FORMAT) > \
1198                        datetime.datetime.strptime(self.t2, YEAR_FORMAT)):
1199                    self.t2 = t2
1200           
1201            logging.debug("Got time range: %s -> %s" %(self.t1, self.t2))
1202        else:
1203            logging.debug("No valid time range data found")
1204   
1205        #create parameter summaries:
1206        #set up list to hold the parameters data
1207        parameters = []
1208        for feature in csmlDoc.featureCollection.featureMembers:
1209            if hasattr(feature.parameter, 'href'):
1210                paramTriple = ""
1211                if hasattr(feature, 'description'):
1212                    paramTriple = feature.description.CONTENT
1213                    paramTriple += " | " + feature.parameter.href
1214                   
1215                    term = ""
1216                    if hasattr(feature, 'name'):
1217                        term = feature.name.CONTENT
1218   
1219                    paramTriple += " | " + term
1220                   
1221                    logging.debug("Got parameter info: %s" %paramTriple)
1222                    parameters.append(paramTriple)
1223       
1224        # update the atom with the extracted parameters
1225        logging.info("Adding CSML parameters to granule atom")
1226        self.addParameters(parameters)
1227        logging.info("Finished adding CSML data")
1228        return csmlDoc
1229
1230
1231    def lookupAssociatedData(self, type, searchClient, lookupIndirectReferences=False):
1232        '''
1233        Check through the atom links and retrieve any associated data of the
1234        specified type
1235        @param type: type of associated data to lookup - currently VTD.DEPLOYMENT_TERM
1236        or VTD.DE_TERM
1237        @param searchClient: Client implementing the AbstractSearchXMLDBClient class
1238        @keyword lookupIndirectReferences: if True, the atom ID is used to search
1239        defined deployments to find those which reference it, otherwise only
1240        deployments data featured in the atom related links are processed
1241        '''
1242        logging.info("Looking up %s info" %type)
1243        self.allActivities = []
1244        self.allObs = []
1245        self.allDpts = []
1246
1247        if type != VTD.DE_TERM and type != VTD.DEPLOYMENT_TERM:
1248            raise ValueError('Unrecognised associated data type: %s' %type)
1249       
1250        # avoid duplicating lookup effort
1251        if (type == VTD.DEPLOYMENT_TERM and self.deployments) or \
1252            (type == VTD.DE_TERM and self.dataEntities):
1253            logging.info("- this info has already been looked up - returning")
1254            return
1255
1256        # firstly, collect all the references to the info required
1257        if lookupIndirectReferences:
1258            logging.info("Looking up indirect references")
1259           
1260            # if we're looking up DE data for deployments data, need to have the
1261            # deployments info looked up first
1262            if type == VTD.DE_TERM and self.isDeployable() and not self.deployments:
1263                self.lookupAssociatedData(VTD.DEPLOYMENT_TERM, searchClient, 
1264                                          lookupIndirectReferences = lookupIndirectReferences)
1265           
1266            logging.info("Looking up references to this atom from other %s" %type)
1267           
1268            # NB, if we're looking up deployments info, we only look up references
1269            # to this atom - if we're looking up DEs, we need to look up references
1270            # to the deployments referenced by this atom
1271            urls = [self.atomBrowseURL]
1272           
1273            if type == VTD.DE_TERM and self.isDeployable():
1274                urls = []
1275                for dep in self.deployments:
1276                    urls.append(dep.browseURL)
1277                   
1278            links = []
1279            for url in urls:
1280                doc = searchClient.getNDGDoc(type, ndgObject.ASSOCIATED_ATOM_DOC_TYPE, url,
1281                                             targetCollection = dc.ATOM_COLLECTION_PATH)
1282                # now need to turn this results set into actual atoms
1283                tree = ET.fromstring(doc)
1284                for atom in tree:
1285                    logging.debug("- found reference in %s" %type)
1286                    links.append(ET.tostring(atom))
1287                   
1288            logging.info("Finished looking up indirect references")
1289        else:
1290            links = self.getLinksOfType(self.VTD.DEPLOYMENT_TERM)
1291
1292        # now retrieve the references and extract the required data
1293        logging.info("Retrieving info from %s references" %type)
1294        if type == VTD.DEPLOYMENT_TERM:
1295            logging.info("Extracting links data to deployment entitites")
1296            self.deployments = []
1297            for link in links:
1298                if lookupIndirectReferences:
1299                    deploymentAtom = link
1300                else:
1301                    localID = link.href.split("__ATOM__")[-1]
1302                    deploymentAtom = searchClient.getNDGDoc('', 
1303                                                            'ATOM', localID, 
1304                                                            targetCollection = dc.ATOM_COLLECTION_PATH)
1305   
1306                deployment = Deployment.Deployment(Atom(xmlString=str(deploymentAtom)))
1307                self.deployments.append(deployment)
1308               
1309                self.addUniqueLinks(self.allActivities, deployment.activities)
1310                self.addUniqueLinks(self.allObs, deployment.obs)
1311                self.addUniqueLinks(self.allDpts, deployment.dpts)
1312        else:
1313            # for DE data, just store the title + link in a Link object
1314            self.dataEntities = []
1315            logging.info("Extracting links data to data entitites")
1316            for data in links:
1317                atom = Atom(xmlString=str(data))
1318                link = Link()
1319                link.title = atom.title
1320                link.href = atom.atomBrowseURL
1321                link.rel = atom.datasetID
1322               
1323                # NB, different deployments may be used by the same DE - so
1324                # avoid duplication
1325                self.addUniqueLinks(self.dataEntities, link)
1326           
1327        logging.info("Finished looking up %s info" %type)
1328
1329
1330    def addUniqueLinks(self, dataArray, links):
1331        '''
1332        Add links to specified array - if they are not already included
1333        @param dataArray: a list, potentially arlready containing links
1334        @param links: a Link or array of Links to add to the dataArray
1335        '''
1336        logging.debug("Adding new links")
1337        if not links:
1338            return
1339       
1340        if type(links) is not list:
1341            links = [links]
1342       
1343        for link in links:
1344            if type(link) is not Link:
1345                logging.warning("Link is not of 'Link' object type (type='%s') - skipping" %type(link))
1346                continue
1347            if link not in dataArray:
1348                logging.debug("- adding unique link")
1349                dataArray.append(link)
1350        logging.debug("Finished adding links")
1351
1352       
1353    def getFullPath(self):
1354        '''
1355        Return full path to atom in eXist, if it exists, or None, otherwise
1356        @return fullPath: string - collection + filename of atom in eXist
1357        '''
1358        # NB, name assigned when atom created in eXist - so if not defined, not
1359        # in eXist
1360        logging.debug("Getting full path to atom")
1361        if self.atomName:
1362            logging.debug("Return full path to atom in eXist")
1363            return self.getDefaultCollectionPath() + self.atomName
1364        logging.debug("Atom doesn't currently exist in eXist - return 'None'")
1365        return None
Note: See TracBrowser for help on using the repository browser.