source: ndgCommon/trunk/ndg/common/src/models/Atom.py @ 4983

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/ndgCommon/trunk/ndg/common/src/models/Atom.py@4983
Revision 4983, 51.9 KB checked in by cbyrom, 11 years ago (diff)

Add a new feed for collecting corrections info submitted by users of
the atom editor + create new method on client to do this + add new
tests and tidy up some code.

Line 
'''
 Class representing data in atom format - allowing moles data to be stored and accessed in a web feed compatible way

 @author: C Byrom, Tessella Jun 2008
'''
6import sys, logging, re, datetime
7from xml.etree import cElementTree as ET
8import csml.parser as CsmlParser
9import ndg.common.src.clients.xmldb.eXist.dbconstants as dc
10from ndg.common.src.lib.ETxmlView import et2text
11import ndg.common.src.lib.utilities as utilities
12from ndg.common.src.models.vocabtermdata import VocabTermData as VTD
13from ndg.common.src.models import MolesEntity as ME
14from ndg.common.src.models import Deployment as Deployment
15from ndg.common.src.models import AtomState
16from ndg.common.src.models.ndgObject import ndgObject
17
class AtomError(Exception):
    """
    Error raised by the atom model code.

    The message is written to the error log as soon as the exception is
    constructed, so it is recorded even if a caller swallows the exception.
    """
    def __init__(self, msg):
        # record the problem first, then initialise as a normal Exception
        logging.error(msg)
        super(AtomError, self).__init__(msg)
25
26
class Person(object):
    '''
    Class representing atom author type data - with name, uri and role attributes
    @keyword personType: Type of person to create - specified using the Person.._Type
    values.  Default is AUTHOR_TYPE.
    @keyword namespace: a two value array of format, ['short_namespace_name', 'full_namespace_name']
    - e.g. ['moles', 'http://ndg.nerc.ac.uk/schema/moles2beta']
    '''
    AUTHOR_TYPE = 0
    CONTRIBUTOR_TYPE = 1
    RESPONSIBLE_PARTY_TYPE = 2
    # element names, indexed by the person type constants above
    ELEMENT_NAMES = ["author", "contributor", "responsibleParty"]

    def __init__(self, personType = AUTHOR_TYPE, namespace = None):
        self.type = personType
        if namespace:
            self.ns_shortname = namespace[0]
            self.ns_fullname = namespace[1]
        else:
            # no prefix is used for the default (atom) namespace
            self.ns_shortname = ""
            self.ns_fullname = ndgObject.ATOM_NS

        self.name = ""
        self.uri = ""
        self.role = ""

        # NB, the atom format specifies slightly different data contents
        self.uriTagName = "email"
        # NB, responsible party data is always stored in the moles section
        # - so this overrides any namespace passed in
        if self.type == self.RESPONSIBLE_PARTY_TYPE:
            self.ns_shortname = 'moles'
            self.ns_fullname = ndgObject.MOLES_NS
            self.uriTagName = "uri"

    def __str__(self):
        '''
        @return: triple string of format, 'name | uri | role' - or an empty
        string if none of these values are set
        '''
        if self.hasValue():
            return self.name + " | " + self.uri + " | " + self.role
        return ""

    def hasValue(self):
        '''
        @return: True if any of name, uri or role is set; False otherwise
        '''
        return bool(self.name or self.uri or self.role)

    def fromString(self, personString):
        '''
        Populate the person from a triple string
        @param personString: string of format, 'name | uri | role'
        '''
        (self.name, self.uri, self.role) = utilities.getTripleData(personString)

    def fromETElement(self, personTag):
        '''
        Populate the person from an elementtree element; child elements are
        looked up in the namespace configured on this object and missing
        values are stored as empty strings
        @param personTag: element holding the name, role and uri/email children
        '''
        self.name = personTag.findtext('{%s}name' %self.ns_fullname) or ""
        self.role = personTag.findtext('{%s}role' %self.ns_fullname) or ""
        self.uri = personTag.findtext('{%s}%s' %(self.ns_fullname, self.uriTagName)) or ""
        logging.debug("Added name: '%s', role: '%s', %s: '%s'" \
                      %(self.name, self.role, self.uriTagName, self.uri))

    def toXML(self):
        '''
        Create an elementtree element for the person; only child elements
        with a value are included
        @return: element named after the person type, with any namespace prefix
        '''
        prefix = ""
        if self.ns_shortname:
            prefix = self.ns_shortname + ':'

        author = ET.Element(prefix + self.ELEMENT_NAMES[self.type])

        # NB, order matters here - name, then uri/email, then role
        for tagName, value in (("name", self.name),
                               (self.uriTagName, self.uri),
                               ("role", self.role)):
            if value:
                child = ET.SubElement(author, prefix + tagName)
                child.text = value

        return author

    def __eq__(self, other):
        '''
        Value equality - two Person objects are equal if their uri, name,
        role and type all match.  This is what makes
        'if person in personArray...' work irrespective of whether the
        runtime honours __cmp__.
        '''
        if self is other:
            return True
        if not isinstance(other, Person):
            return NotImplemented
        return self.uri == other.uri and self.name == other.name and \
            self.role == other.role and self.type == other.type

    def __ne__(self, other):
        result = self.__eq__(other)
        if result is NotImplemented:
            return result
        return not result

    # NB, instances are mutable, so keep the identity based hash the class
    # has always had rather than deriving one from the (changeable) values
    __hash__ = object.__hash__

    def __cmp__(self, person1):
        '''
        Override comparison to allow proper object comparison when checking
        if Person objects are in an array already - i.e. if person in personArray...
        NB, retained for code calling cmp() directly; only 0 (equal) is
        meaningful - the non-zero values do not define an ordering
        '''
        if not person1:
            return -1

        # call __eq__ directly to avoid any comparison fallback recursion
        if self.__eq__(person1) is True:
            return 0
        return 1
117
118
class Link(object):
    '''
    Class representing an atom link - with href, title and rel attributes
    '''

    def __init__(self):
        self.href = ""
        self.title = ""
        self.rel = ""

    def fromString(self, linkString):
        '''
        Populate the link from a triple string
        @param linkString: string of format, 'href | title | rel'
        '''
        (self.href, self.title, self.rel) = utilities.getTripleData(linkString)

    def fromETElement(self, linkTag):
        '''
        Populate the link from the attributes of an elementtree element;
        missing attributes are stored as empty strings
        @param linkTag: element with href, rel and title attributes
        '''
        self.href = linkTag.attrib.get('href') or ""
        self.rel = linkTag.attrib.get('rel') or ""
        self.title = linkTag.attrib.get('title') or ""

    def toXML(self):
        '''
        @return: 'link' element with href, title and rel attributes set
        '''
        link = ET.Element("link")
        link.attrib["href"] = self.href
        link.attrib["title"] = self.title
        link.attrib["rel"] = self.rel
        return link

    def hasValue(self):
        '''
        @return: True if the link has an href or a title; False otherwise
        '''
        # NB, just a rel on its own is meaningless - so ignore
        return bool(self.href or self.title)

    def __str__(self):
        '''
        @return: triple string of format, 'href | title | rel' - or an empty
        string if none of these values are set
        '''
        if self.href or self.title or self.rel:
            return self.href + " | " + self.title + " | " + self.rel
        return ""

    def isChildAtom(self):
        '''
        Determines whether the link refers to another atom - e.g. a link to
        a data granule
        @return True, if so; False otherwise
        '''
        # NB, the rel holds a vocab term - a match on the end of it with any
        # of these terms identifies an atom association
        atomTerms = (VTD.GRANULE_TERM, VTD.DEPLOYMENT_TERM, VTD.ACTIVITY_TERM,
                     VTD.DPT_TERM, VTD.OBS_TERM)
        return self.rel.endswith(atomTerms)

    def __eq__(self, other):
        '''
        Value equality - two Link objects are equal if their href, title and
        rel all match.  This is what makes 'if link in linkArray...' work
        irrespective of whether the runtime honours __cmp__.
        '''
        if self is other:
            return True
        if not isinstance(other, Link):
            return NotImplemented
        return self.href == other.href and self.title == other.title and \
            self.rel == other.rel

    def __ne__(self, other):
        result = self.__eq__(other)
        if result is NotImplemented:
            return result
        return not result

    # NB, instances are mutable, so keep the identity based hash the class
    # has always had rather than deriving one from the (changeable) values
    __hash__ = object.__hash__

    def __cmp__(self, link1):
        '''
        Override comparison to allow proper object comparison when checking
        if Link objects are in an array already - i.e. if link in linkArray...
        NB, retained for code calling cmp() directly; only 0 (equal) is
        meaningful - the non-zero values do not define an ordering
        '''
        if not link1:
            return -1

        # call __eq__ directly to avoid any comparison fallback recursion
        if self.__eq__(link1) is True:
            return 0
        return 1
184
185
class Category(object):
    '''
    Class representing an atom category - with term, scheme and label attributes
    '''
    def __init__(self):
        self.term = self.scheme = self.label = ""

    def fromString(self, linkString, escapeSpecialCharacters=True):
        '''
        Create Category from triple string of format, 'label | scheme | term'
        @param linkString: triple string to create category with
        @keyword escapeSpecialCharacters: if set to True, special characters in
        triple string are escaped (default)
        '''
        tripleData = utilities.getTripleData(linkString,
                                             doEscape=escapeSpecialCharacters)
        (self.label, self.scheme, self.term) = tripleData

    def fromETElement(self, linkTag):
        '''
        Populate the category from the attributes of an elementtree element;
        missing attributes are stored as empty strings
        @param linkTag: element with term, label and scheme attributes
        '''
        attributes = linkTag.attrib
        self.term = attributes.get('term') or ""
        self.label = attributes.get('label') or ""
        self.scheme = attributes.get('scheme') or ""

    def toXML(self):
        '''
        @return: 'category' element with term, scheme and label attributes set
        '''
        element = ET.Element("category")
        for attributeName in ("term", "scheme", "label"):
            element.attrib[attributeName] = getattr(self, attributeName)
        return element

    def hasValue(self):
        '''
        @return: True if any of scheme, label or term is set; False otherwise
        '''
        return bool(self.scheme or self.label or self.term)
221
222
class Atom(object):
    '''
    Class representing a single atom document - the data can be populated
    from keyword inputs or from an XML string (fromString) and written back
    out as XML (toXML)
    '''

    # labels for use with the atom categories - these are stored in the
    # 'term' attribute of the category elements holding the atom type/subtype
    ATOM_TYPE = "ATOM_TYPE"
    ATOM_SUBTYPE = "ATOM_SUBTYPE"

    # labels for use with the templates to set/extract specific inputs
    ONLINE_REF_LABEL = "online_ref"
    PARAMETER_LABEL = "parameter"
    ATOM_REF_LABEL = "atom_ref"
    DELIMITER = "---"
    REMOVE_LABEL = "remove"
   
    # format to use for t1-t2 date range - NB, despite the name, this is a
    # full year-month-day strftime format
    YEAR_FORMAT = '%Y-%m-%d'
238
239    def __init__(self, atomType = None, vocabTermData = None, ndgObject = None, \
240                 xmlString = None, state = AtomState.WORKING_STATE, **inputs):
241        '''
242        Constructor - initialise the atom variables
243        @keyword atomType: type of atom to set up
244        @keyword vocabTermData: instance of VocabTermData object to use with atom
245        @keywork ndgObject: instance of ndgObject to use with atom
246        @keyword xmlString: XML representation of atom - will be parsed to populate
247        the atom data
248        @keyword state:  AtomState object representing the state of the atom
249        '''
250        logging.info("Initialising atom")
251        if atomType:
252            logging.info(" - of type '%s'" %atomType)
253        self.atomTypeID = atomType
254
255        # some data have further subtypes specified
256        self.subtypeID = None # this should be the termID
257        self.subtype = None # and this should be the fully formed vocab URL
258       
259        self.ndgObject = ndgObject
260
261        self.atomName = None
262        self.files = []
263        self.author = Person()
264        self.contributors = []
265        self.atomAuthors = []
266        self.parameters = []
267        self.spatialData = []
268        self.temporalData = []
269        self.relatedLinks = []
270        self.summary = []
271        self.content = []
272        # NB, this deployments data duplicates other atom data - and is only used for a
273        # convenient way to collect the info (by lookupAssociatedData()) for use in templates
274        self.deployments = []
275        # ditto for the following field
276        self.dataEntities = []
277           
278        self.csmlFile = None
279        self.cdmlFile = None
280        # general variable to use for setting the atom content - NB, if a csmlFile is specified
281        # (either directly or via a cdmlFile specification), this will be the content by default
282        # for this purpose
283        self.contentFile = None     
284        self.title = None
285        self.datasetID = None        # NB, the dataset id ends up in the atomName - <path><datasetID>.atom
286        self.atomID = None
287   
288        # boundary box info - to replace spatial/temporalData?
289        self.minX = None
290        self.minY = None
291        self.maxX = None
292        self.maxY = None
293        self.t1 = None
294        self.t2 = None
295
296        self.ME = ME.MolesEntity()
297       
298        # date when the atom was first ingested
299        self.publishedDate = None
300
301        # last update date
302        self.updatedDate = None
303
304        # assume atom in working state by default - this is used to define what collection
305        # in eXist the atom is stored in
306        self.state = state
307       
308        # additional, non standard atom data can be included in the molesExtra element
309        if vocabTermData:
310            self.VTD = vocabTermData
311        else:
312            self.VTD = VTD()
313       
314        if xmlString:
315            self.fromString(xmlString)
316
317        # if inputs passed in as dict, add these now
318        if inputs:
319            logging.info("Adding info to atom from input dict")
320            logging.debug(inputs)
321            self.__dict__.update(inputs)
322           
323            # NB, this doesn't trigger the Summary Property, so do this
324            # explicitly, if need be
325            if inputs.has_key('Summary'):
326                self.Summary = inputs.get('Summary')
327            if inputs.has_key('Content'):
328                self.Content = inputs.get('Content')
329            if inputs.has_key('author'):
330                name = inputs.get('author')
331                author = Person()
332                author.fromString(name)
333                self.author = author
334           
335            # also pass any moles data up to the moles entity object
336            if inputs.has_key('providerID'):
337                self.ME.providerID = inputs.get('providerID')
338               
339            if inputs.has_key('abbreviation'):
340                self.ME.abbreviation = inputs.get('abbreviation')
341
342        if self.atomTypeID:
343            self.atomTypeName = self.VTD.TERM_DATA[self.atomTypeID].title
344
345        logging.info("Atom initialised")
346
347
348    def addOnlineReferences(self, links):
349        '''
350        Add online reference data associated with the atom
351        - NB, care needs to be taken here since this data is stored in the atom
352        link elements and these are also used for the various atom associations
353        @param links: a Link or array of Links to add to the relatedLinks attribute
354        '''
355        logging.debug("Adding online references")
356        if not links:
357            return
358       
359        if type(links) is not list:
360            links = [links]
361       
362        # firstly clear out any online refs data from the existing related links
363        newLinks = []
364        for link in self.relatedLinks:
365            if link.isChildAtom():
366                newLinks.append(link)
367       
368        newLinks.extend(links)
369        self.relatedLinks = newLinks
370        logging.debug("Online references added")
371
372
373    def addUniqueRelatedLinks(self, links):
374        '''
375        Add links to relatedLinks array - if they are not already included
376        @param links: a Link or array of Links to add to the relatedLinks attribute
377        '''
378        self.addUniqueLinks(self.relatedLinks, links)
379       
380
381    def removeRelatedLinks(self, linksToDelete):
382        '''
383        Remove any links in the input list from the atom's related links list
384        @param linksToDelete: array of Link objects to remove from atom
385        '''
386        logging.debug("Removing related links from atom")
387        if not linksToDelete:
388            return
389       
390        if type(linksToDelete) is not list:
391            linksToDelete = [linksToDelete]
392       
393        updatedLinks = []
394        for link in self.relatedLinks:
395            if type(link) is not Link:
396                logging.warning("Link is not of 'Link' object type (type='%s') - skipping" %type(link))
397                continue
398            if link in linksToDelete:
399                logging.debug("- found link to remove")
400            else:
401                updatedLinks.append(link)
402
403        self.relatedLinks = updatedLinks
404        logging.debug("Links removed")
405
406    def getPublicationStatePath(self):
407        '''
408        Determine the correct publication state collection for the atom
409        @return collectionPath: collection path for the publication state of the atom
410        '''
411        logging.debug("Getting collection path for atom publication state")
412        collectionPath = dc.ATOM_COLLECTION_PATH + self.state.collectionPath
413        logging.debug("Returning publication state collection, '%s'" %collectionPath)
414        return collectionPath
415       
416
417    def getDefaultEntityCollectionPath(self):
418        '''
419        Determine the correct collection for the entity type of the atom
420        @return entityPath: collection path for the data type of the atom
421        '''
422        logging.debug("Getting collection path for atom entity type")
423        collectionPath = self.getPublicationStatePath()
424       
425        if self.atomTypeID == VTD.DE_TERM:
426            collectionPath += dc.DE_COLLECTION_PATH
427        elif self.atomTypeID == VTD.GRANULE_TERM:
428            collectionPath += dc.GRANULE_COLLECTION_PATH
429        elif self.atomTypeID == VTD.ACTIVITY_TERM and \
430            self.subtypeID == VTD.DEPLOYMENT_TERM:
431            collectionPath += dc.DEPLOYMENTS_COLLECTION_PATH
432        else:
433            collectionPath += dc.DEPLOYMENT_COLLECTION_PATH
434       
435        logging.debug("Returning entity collection, '%s'" %collectionPath)
436        return collectionPath
437       
438
439    def getDefaultCollectionPath(self):
440        '''
441        Determine the correct collection to use for the atom in eXist
442        '''
443        logging.debug("Getting default collection path for atom")
444        collectionPath = self.getDefaultEntityCollectionPath()
445        if not self.ME.providerID:
446            raise AtomError("Error: cannot determine atom collection path because " + \
447                            "the provider ID is not defined")
448           
449        collectionPath += self.ME.providerID + "/"
450        logging.debug("Returning collection, '%s'" %collectionPath)
451        return collectionPath
452
453
454    def __addAtomTypeDataXML(self, root):
455        '''
456        Add the atom type, and subtype data, if available, to atom categories
457        - and lookup and add the appropriate vocab term data
458        '''
459        if self.atomTypeID:
460            logging.info("Adding atom type info to XML output")
461            category = Category()
462            category.label = self.atomTypeID
463            # look up the appropriate vocab term data
464            category.scheme = self.VTD.getTermCurrentVocabURL(self.atomTypeID)
465            category.term = self.ATOM_TYPE
466            root.append(category.toXML())
467
468        if self.subtypeID:
469            logging.info("Adding atom subtype info to XML output")
470            # NB subtypes not all defined, so leave this out for the moment
471            category.label = self.subtypeID
472            # look up the appropriate vocab term data
473            category.scheme = self.VTD.getTermCurrentVocabURL(self.subtypeID)
474            category.term = self.ATOM_SUBTYPE
475            root.append(category.toXML())
476
477
478    def addMolesEntityData(self, abbreviation, provider_id, object_creation_time):
479        '''
480        Add data to include in the moles entity element
481        '''
482        logging.debug('Adding moles entity information')
483        self.ME.abbreviation = abbreviation
484        self.ME.providerID = provider_id
485        self.ME.createdDate = utilities.getISO8601Date(object_creation_time)
486        logging.debug('Moles entity information added')
487
488
489    def addAuthors(self, authors):
490        '''
491        Add author data appropriately to the atom
492        NB, these will overwrite any existing authors of the same type
493        @param authors: list of Person objects with the author data
494        '''
495        logging.debug('Adding authors data to Atom')
496        isFirstAuthor = {}
497        authorArray = None
498        for author in authors:
499            # NB, we're only allowed one atom author
500            if author.type == Person.AUTHOR_TYPE:
501                self.author = author
502                   
503                if isFirstAuthor.has_key(author.type):
504                    raise AtomError("Error: an atom can only have one author specified")
505                isFirstAuthor[author.type] = 1
506                continue
507            elif author.type == Person.CONTRIBUTOR_TYPE:
508                authorArray = self.contributors
509            elif author.type == Person.RESPONSIBLE_PARTY_TYPE:
510                authorArray = self.ME.responsibleParties
511               
512            # check if this is the first addition - if so, clear out the
513            # array in advance
514            if not isFirstAuthor.has_key(author.type):
515                logging.debug("Clearing out author array")
516                # NB, need to be careful to clear the array, not create a ref
517                # to a new array
518                del authorArray[:]
519                isFirstAuthor[author.type] = 1
520
521            if author.hasValue() and author not in authorArray:
522                logging.debug("Adding author (type:'%s', name:'%s', uri:'%s', role:'%s')" \
523                              %(author.type, author.name, author.uri, author.role))
524                authorArray.append(author)
525
526        logging.debug('Finished adding authors data')
527
528
529    def _isNewParameter(self, param):
530        '''
531        Check if a parameter is already specified in the atom, return False if
532        so, otherwise return True
533        '''
534        for p in self.parameters:
535            if p.term == param.term and \
536                p.scheme == param.scheme and \
537                p.label == param.label:
538                return False
539        return True
540
541
542    def addRelatedLinks(self, linkVals):
543        '''
544        Add related links in string format - converting to Link objects
545        NB, only add the link if it is unique
546       
547        @param linkVals: string of format, 'uri | title | vocabServerURL'
548        '''
549        link = self.objectify(linkVals, 'relatedLinks')
550        if link not in self.relatedLinks:
551            self.relatedLinks.append(link)
552
553
554    def addParameters(self, params):
555        '''
556        Add a parameter to list - ensuring it is unique and has been formatted and tidied appropriately
557        @params param: parameter, as string array, to add to atom parameters collection
558        '''
559        # avoid strings being parsed character by character
560        if type(params) is str:
561            params = [params]
562           
563        for param in params:
564            # firstly tidy parameter
565            param = utilities.tidyUpParameters(param)
566            category = Category()
567            # NB, data already tidied up here, so set keyword to avoid this happening again
568            category.fromString(param, escapeSpecialCharacters=True)
569
570            # now check for uniqueness
571            if self._isNewParameter(category):
572                logging.debug("Adding new parameter: %s" %param)
573                self.parameters.append(category)
574   
575   
576    def _linksToXML(self, root):
577        '''
578        Add required links to the input element
579        @param root: element to add links to - NB, should be the root element of the atom
580        '''
581        selfLink = ET.SubElement(root, "link")
582        selfLink.attrib["href"] = self.atomBrowseURL
583        selfLink.attrib["rel"] = "self"
584       
585        for relatedLink in self.relatedLinks:
586            if relatedLink.hasValue():
587                root.append(relatedLink.toXML())
588   
589    def toXML(self):
590        '''
591        Convert the atom into XML representation and return this
592        @return: xml version of atom
593        '''
594        logging.info("Creating formatted XML version of Atom")
595        root = ET.Element("entry")
596        root.attrib["xmlns"] = ndgObject.ATOM_NS
597        root.attrib["xmlns:moles"] = ndgObject.MOLES_NS
598        root.attrib["xmlns:georss"] = ndgObject.GEOSS_NS
599        root.attrib["xmlns:gml"] = ndgObject.GML_NS
600        id = ET.SubElement(root, "id")
601        id.text = self.atomID
602        title = ET.SubElement(root, "title")
603        title.text = self.title
604        self._linksToXML(root)
605
606        if self.author and self.author.hasValue():
607            root.append(self.author.toXML())
608           
609        for contributor in self.contributors:
610            root.append(contributor.toXML())
611
612        # add parameters data
613        for param in self.parameters:
614            if param.hasValue():
615                root.append(param.toXML())
616
617        # add the type and subtype data
618        self.__addAtomTypeDataXML(root)
619                   
620        summary = ET.SubElement(root, "summary")
621        summary.text = self.Summary
622                   
623        # add link to content, if required - NB, can only have one content element in atom
624        # - and this is mandatory
625        content = ET.SubElement(root, "content")
626        contentFile = self.contentFile or self.csmlFile or self.cdmlFile
627        if contentFile:
628            content.attrib["type"] = "application/xml"
629            content.attrib["src"] = contentFile
630        else:
631            content.attrib["type"] = "xhtml"
632            div = ET.SubElement(content, 'div')
633            div.attrib["xmlns:xhtml"] = ndgObject.XHTML_NS
634            div.text = self.Content
635       
636        # if there's a published date already defined, assume we're doing an update now
637        # NB, update element is mandatory
638        currentDate = datetime.datetime.today().strftime("%Y-%m-%dT%H:%M:%SZ")
639        if not self.publishedDate:
640            self.publishedDate = currentDate
641
642        updated = ET.SubElement(root, "updated")
643        if not self.updatedDate:
644            self.updatedDate = currentDate
645        updated.text = self.updatedDate
646
647        published = ET.SubElement(root, "published")
648        published.text = self.publishedDate
649
650        # add the moles entity section, if it is required
651        if self.ME:
652            root.append(self.ME.toXML())
653
654        # add temporal range data, if available
655        temporalRange = ET.SubElement(root, "moles:temporalRange")
656        if self.t1:
657            temporalRange.text = self.t1
658            if self.t2:
659                temporalRange.text += "/" + self.t2
660
661        # add spatial range data, if available
662        self._addSpatialData(root)
663
664        tree = ET.ElementTree(root)
665        logging.info("XML version of Atom created")
666        return tree
667
668
669    def __getSummary(self):
670        logging.debug("Getting summary data")
671        summaryString = ""
672        for summary_line in self.summary:
673            summaryString += summary_line + "\n"
674
675        return summaryString
676
677    def __setSummary(self, summary):
678        logging.debug("Adding summary data")
679        self.summary = []
680        for summary_line in summary.split('\n'):
681            self.summary.append(utilities.escapeSpecialCharacters(summary_line))
682           
683    Summary = property(fset=__setSummary, fget=__getSummary, doc="Atom summary")
684
685
686    def __getContent(self):
687        logging.debug("Getting content data")
688        contentString = ""
689        # NB, there must be content specified in an atom
690        if not self.content:
691            return "Metadata document"
692       
693        for content_line in self.content:
694            contentString += content_line + "\n"
695
696        return contentString
697
698    def __setContent(self, content):
699        logging.debug("Adding content data")
700        self.content = []
701        if not content:
702            return
703       
704        for content_line in content.split('\n'):
705            self.content.append(content_line)
706           
707    Content = property(fset=__setContent, fget=__getContent, doc="Atom content")
708
709           
710    def fromString(self, xmlString):
711        '''
712        Initialise Atom object using an xmlString
713        @param xmlString: representation of atom as an XML string
714        '''
715        logging.info("Ingesting data from XML string")
716        logging.debug("Create elementtree instance with XML string")
717        tree = ET.fromstring(xmlString)
718        title = tree.findtext('{%s}title' %ndgObject.ATOM_NS)
719        if title:
720            logging.debug("Adding title data")
721            self.title = title
722
723        summary = tree.findtext('{%s}summary' %ndgObject.ATOM_NS)
724        if summary:
725            self.Summary = summary#.decode('unicode_escape')
726
727        authorElement = tree.find('{%s}author' %ndgObject.ATOM_NS)
728        if authorElement:
729            logging.debug("Adding author data")
730            author = Person()
731            author.fromETElement(authorElement)
732            self.author = author
733
734        contributorElements = tree.findall('{%s}contributor' %ndgObject.ATOM_NS)
735        for contributorElement in contributorElements:
736            logging.debug("Adding contributor data")
737            contributor = Person(personType = Person.CONTRIBUTOR_TYPE)
738            contributor.fromETElement(contributorElement)
739            self.contributors.append(contributor)
740
741        molesElement = tree.find('{%s}entity' %ndgObject.MOLES_NS)
742        if molesElement:
743            self.ME.fromET(molesElement)
744               
745        atomID = tree.findtext('{%s}id' %ndgObject.ATOM_NS)
746        self.__parseAtomID(atomID)
747       
748        self._parseCategoryData(tree.findall('{%s}category' %ndgObject.ATOM_NS))
749
750        self._parseLinksData(tree.findall('{%s}link' %ndgObject.ATOM_NS))
751           
752        contentTag = tree.find('{%s}content' %ndgObject.ATOM_NS)
753        if contentTag != None:
754            logging.debug("Found content tag - checking for CSML/CDML file data")
755            file = contentTag.attrib.get('src')
756            if file:
757                # NB, the path will reveal more reliably whether we're dealing with CSML and CDML files
758                if file.upper().find('CSML') > -1:
759                    logging.debug("Adding CSML file data")
760                    self.csmlFile = file
761                elif file.upper().find('CDML') > -1:
762                    logging.debug("Adding CDML file data")
763                    self.cdmlFile = file
764                self.contentFile = file
765            else:
766                logging.debug("No file data - adding contents of element instead")
767                div = contentTag.findtext('{%s}div'%ndgObject.XHTML_NS)
768                self.Content = div
769       
770        range = tree.findtext('{%s}temporalRange' %ndgObject.MOLES_NS)
771        if range:
772            logging.debug("Adding temporal range data")
773            timeData = range.split('/')
774            self.t1 = timeData[0]
775            if len(timeData) > 1:
776                self.t2 = timeData[1]
777       
778        where = tree.find('{%s}where' %ndgObject.GEOSS_NS)
779        if where:
780            # NB, this parser won't mind if we're dealing with Envelope or EnvelopeWithTimePeriod
781            minBBox = where.findall('.//{%s}lowerCorner' %ndgObject.GML_NS)
782            if minBBox:
783                logging.debug("Adding min spatial range data")
784                minBBox = minBBox[0]
785                spatialData = minBBox.text.split()
786                self.minX = spatialData[0]
787                if len(spatialData) > 1:
788                    self.minY = spatialData[1]
789           
790            maxBBox = where.findall('.//{%s}upperCorner' %ndgObject.GML_NS)
791            if maxBBox:
792                maxBBox = maxBBox[0]
793                logging.debug("Adding max spatial range data")
794                spatialData = maxBBox.text.split()
795                self.maxX = spatialData[0]
796                if len(spatialData) > 1:
797                    self.maxY = spatialData[1]
798               
799        publishedDate = tree.findtext('{%s}published' %ndgObject.ATOM_NS)
800        if publishedDate:
801            logging.debug("Adding published date")
802            self.publishedDate = publishedDate
803               
804        updatedDate = tree.findtext('{%s}updated' %ndgObject.ATOM_NS)
805        if updatedDate:
806            logging.debug("Adding updated date")
807            self.updatedDate = updatedDate
808           
809        logging.info("Completed data ingest")
810   
811   
812    def _parseCategoryData(self, categories):
813        logging.debug("Adding category/parameters data")
814        for category in categories:
815            cat = Category()
816            cat.fromETElement(category)
817           
818            if cat.term == self.ATOM_TYPE:
819                logging.debug("Found atom type data")
820                self.atomTypeID = cat.label
821                self.atomTypeName = self.VTD.TERM_DATA[cat.label].title
822                continue
823            elif cat.term == self.ATOM_SUBTYPE:
824                logging.debug("Found atom subtype data")
825                self.subtypeID = cat.label
826                self.subtype = cat.scheme
827                continue
828
829            self.parameters.append(cat)
830
831
832    def __parseAtomID(self, atomID):
833        '''
834        Given an atom ID, extract the useful bits of info and set these on
835        the relevant atom attributes
836        @param atomID: an atom ID in the 'tag' format
837        '''
838        logging.debug("Extracting atom info from ID, '%s'" %atomID)
839        self.atomID = atomID
840        self.datasetID = atomID.split("__ATOM__")[-1]
841        self._generateAtomName(self.datasetID)
842        logging.debug("- all info extracted")
843   
844
845    def setDatasetID(self, datasetID):
846        '''
847        Set the dataset ID for the atom - and generate an appropriate atom name using this
848        @param datasetID: ID to set for the atom
849        '''
850        self.datasetID = datasetID
851        self._generateAtomName(datasetID) 
852        self.atomID = self.createAtomID(datasetID)
853
854
855    def createAtomID(self, datasetID):
856        '''
857        Create a unique ID, conforming to atom standards, for atom
858        NB, see http://diveintomark.org/archives/2004/05/28/howto-atom-id
859        @param datasetID: ID of atom's dataset
860        @return: unique ID
861        '''
862        logging.info("Creating unique ID for atom")
863        if not self.atomBrowseURL:
864            self._generateAtomName(datasetID)
865        urlBit = self.atomBrowseURL.split('://')[1]
866        urlBit = urlBit.replace('#', '')
867        urlBits = urlBit.split('/')
868        host = urlBits[0].split(':')[0] # avoid the port colon - as this breaks the ID format
869        dateBit = datetime.datetime.today().strftime("%Y-%m-%d")
870       
871        id = "tag:" + host + "," + dateBit + ":/" + "/".join(urlBits[1:])
872        logging.info("- unique ID created for atom")
873        logging.debug(" - '%s'" %id)
874        return id
875       
876       
877    def _generateAtomName(self, datasetID):
878        '''
879        Generate a consistent name for the atom - with full eXist doc path
880        @param datasetID: ID of atom's dataset
881        '''
882        self.atomName = datasetID + ".atom"
883        if not self.ME.providerID:
884            raise ValueError("Provider ID has not been specified for atom - please add this and retry")
885        self.ndgURI = self.ME.providerID + "__ATOM__" + datasetID
886        self.atomBrowseURL = VTD.BROWSE_ROOT_URL + self.ndgURI
887
888
889    def _parseLinksData(self, links):
890        '''
891        Extract links and atom data from array of link elements in the XML representation of the atom
892        @param links: an array of <link> elements
893        '''
894        # firstly, get all data to start with, so we can properly process it afterwards
895        linkData = {}
896        logging.debug("Getting link data")
897        for linkTag in links:
898            link = Link()
899            link.fromETElement(linkTag)
900
901            if not linkData.has_key(link.rel):
902                linkData[link.rel] = []
903           
904            linkData[link.rel].append(link)
905
906        # there should be one self referencing link - which will provide info on the atom itself
907        if not linkData.has_key('self'):
908            errorMessage = "Atom does not have self referencing link - " + \
909                "cannot ascertain datasetID without this - please fix"
910            logging.error(errorMessage)
911            raise ValueError(errorMessage)
912       
913        # this is the link describing the atom itself
914        self.atomBrowseURL = linkData['self'][0].href
915       
916        self.datasetID = self.atomBrowseURL.split("__ATOM__")[-1]
917        self.atomName = self.datasetID + ".atom"
918        # NB, only split on the stem, since the browse host may not be
919        # the same as that defined in VTD
920        self.ndgURI = self.atomBrowseURL.split(VTD.BROWSE_STEM_URL)[-1]
921       
922        # now remove this value and the associated moles doc link
923        del linkData['self']
924        molesDoc = self.atomBrowseURL.replace('ATOM', 'NDG-B1')
925        if linkData.has_key('related'):
926            relatedLinks = []
927            for link in linkData['related']:
928                if link.href != molesDoc:
929                    relatedLinks.append(link)
930           
931            linkData['related'] = relatedLinks
932               
933        # now add the remaining links to the atom
934        for key in linkData:
935            for link in linkData[key]:
936                logging.debug("Adding link data")
937                self.relatedLinks.append(link)
938       
939
940    def _addSpatialData(self, element):
941        '''
942        Add spatial coverage element to an input element
943        @param element: element to add coverage data to
944        '''
945        logging.info("Adding spatial data to Atom")
946        if not self.minX:
947            logging.info("No spatial data specified")
948            return
949        bbox = ET.SubElement(element, "georss:where")
950        envelope = ET.SubElement(bbox, "gml:Envelope")
951        lc = ET.SubElement(envelope, "gml:lowerCorner")
952        lc.text = str(self.minX) + " " + str(self.minY)
953        uc = ET.SubElement(envelope, "gml:upperCorner")
954        uc.text = str(self.maxX) + " " + str(self.maxY)
955
956       
957    def setAttribute(self, attributeName, attributeValue):
958        '''
959        Set the value of an atom attribute - and do some basic tidying up of the string content
960        - to escape any XML unfriendly characters
961        @param attributeName: name of the attribute whose value to set
962        @param attributeValue: value to set the attribute to 
963        '''
964        logging.debug("Setting attribute, %s, to %s" %(attributeName, attributeValue))
965        origValue = attributeValue
966       
967        # escape any special characters if a value has been specified
968        # NB, need to cope with both single values and arrays
969        if attributeValue:
970            if type(attributeValue) is list:
971                newVals = []
972                for val in attributeValue:
973                    newVals.append(self.objectify(utilities.escapeSpecialCharacters(val), attributeName))
974                attributeValue = newVals
975                   
976            else:
977                attributeValue = self.objectify(utilities.escapeSpecialCharacters(attributeValue), attributeName)
978
979        # handle the special case of authors; only one author is allowed per atom
980        # - the others should be treated as contributors
981        if attributeName == "authors":
982            setattr(self, "author", attributeValue[0])
983            if len(attributeValue) > 1:
984                setattr(self, "contributors", attributeValue[1:])
985        elif attributeName == "atomAuthors":
986            if isinstance(attributeValue, list):
987                for val in attributeValue:
988                    self.ME.responsibleParties.append(val)
989            else:
990                self.ME.responsibleParties.append(attributeValue)
991        elif attributeName == "files":
992            self.addUniqueRelatedLinks(attributeValue)
993        else:
994            setattr(self, attributeName, attributeValue)
995
996
997    def objectify(self, objectVals, attributeName):
998        '''
999        Some inputs are specified as strings but need to be converted into
1000        objects - do this here
1001        @param objectVals: a '|' delimited string of values
1002        @param attributeName: name of attribute the values belong to
1003        '''
1004        obj = None
1005        if type(objectVals) != str:
1006            return objectVals
1007       
1008        if attributeName == "relatedLinks":
1009            obj = Link()
1010        elif attributeName == "atomAuthors":
1011            obj = Person(personType = Person.RESPONSIBLE_PARTY_TYPE)
1012        elif attributeName == "authors":
1013            # NB, ensure there is only one author tag - extra authors are contributors
1014            authorType = Person.AUTHOR_TYPE
1015            if self.author and self.author.hasValue():
1016                authorType= Person.CONTRIBUTOR_TYPE
1017            obj = Person(personType = authorType)
1018        elif attributeName == 'files':
1019            obj = Link()
1020            objectVals = '%s|%s|%s' \
1021                %(self.VTD.getTermCurrentVocabURL(VTD.METADATA_SOURCE_TERM), objectVals, VTD.METADATA_SOURCE_TERM)
1022
1023        if obj:
1024            obj.fromString(objectVals)
1025            # NB, need to set it now, just in case we don't set it before coming back
1026            if attributeName == "authors" and (not self.author or not self.author.hasValue()):
1027                self.author = obj
1028            return obj
1029       
1030        return objectVals
1031
1032
1033    def toPrettyXML(self):
1034        '''
1035        Returns nicely formatted XML as string
1036        '''
1037        atomXML = self.toXML()
1038
1039        # create the string
1040        logging.debug("Converting the elementtree object into a string")
1041        prettyXML = et2text(atomXML.getroot())
1042
1043        # add XML version tag
1044        prettyXML = "<?xml version=\"1.0\" encoding=\"utf-8\" ?>\n" + prettyXML
1045        logging.info("Created formatted version of XML object")
1046        return prettyXML
1047
1048
1049    def getLinksOfType(self, termID):
1050        '''
1051        Returns links in the atom related links attribute which match the specified
1052        term ID
1053        @param termID: the termID to look for in the related links - NB, this is
1054        matched to the end of the link.rel value
1055        @return links: array of Link objects with matching term type
1056        '''
1057        logging.debug("Getting atom links of type, '%s'" %termID)
1058        matchingLinks = []
1059        for link in self.relatedLinks:
1060            # firstly, handle special case where we only want the online ref type links
1061            # returned
1062            if termID == self.ONLINE_REF_LABEL:
1063                if not link.isChildAtom():
1064                    logging.debug("- found link with matching term type")
1065                    matchingLinks.append(link)
1066               
1067            elif link and link.rel and link.rel.lower().endswith(termID.lower()):
1068                logging.debug("- found link with matching term type")
1069                matchingLinks.append(link)
1070               
1071        logging.debug("Returning matched links")
1072        return matchingLinks
1073       
1074       
1075    def getLogos(self):
1076        '''
1077        Return related links that are logos
1078        @return: array of Links containing the logos for the atom
1079        '''
1080        logos = []
1081        for link in self.relatedLinks:
1082            if link.rel.lower().endswith(VTD.LOGO_TERM.lower()):
1083                logos.append(link)
1084               
1085        return logos
1086   
1087   
1088    def isGranule(self):
1089        if self.atomTypeID == VTD.GRANULE_TERM:
1090            return True
1091        return False
1092   
1093   
1094    def isDE(self):
1095        if self.atomTypeID == VTD.DE_TERM:
1096            return True
1097        return False
1098   
1099    def isDeployment(self):
1100        if self.subtypeID and self.subtypeID == VTD.DEPLOYMENT_TERM:
1101            return True
1102        return False
1103   
1104    def isDeployable(self):
1105        if (self.atomTypeID == VTD.ACTIVITY_TERM and self.subtypeID != VTD.DEPLOYMENT_TERM) or \
1106            self.atomTypeID == VTD.DPT_TERM or \
1107            self.atomTypeID == VTD.OBS_TERM:
1108            return True
1109        return False
1110   
1111    def isPublished(self):
1112        '''
1113        Check state of atom doc - if published or Published return True,
1114        otherwise return False
1115        '''
1116        return self.state.isPublishedState()
1117       
1118       
1119    def addCSMLData(self, csmlName, csmlContent, aggregateCoverage=False, useCSMLID=False):
1120        '''
1121        Parse CSML data and add extracted info to the atom
1122        @param csmlName: name of the csml file
1123        @param csmlContent: content of the csml file - NB, if this is set to None and the
1124        file, csmlName, is available locally, CsmlParser.Dataset will read in the file
1125        directly
1126        @keyword aggregateCoverage: if set to True, only coverage data that extends the
1127        atom coverage data will be added
1128        @keyword useCSMLID: if True, use the CSML doc ID as the dataset ID - NB,
1129        this should only be True if creating a new atom - e.g. from a granulite
1130        @return csmlDoc: the CsmlParser.Dataset object with the csml data in
1131        '''
1132        logging.info("Creating CSML data model")
1133        self.csmlFile = csmlName
1134        self.contentFile = csmlName
1135        content = csmlContent or csmlName
1136   
1137        csmlDoc = CsmlParser.Dataset(file=content)
1138       
1139        logging.info("Extracting info from CSML file")
1140        logging.debug("Got dataset ID: %s" %csmlDoc.id)
1141        if useCSMLID:
1142            logging.debug(" - using this ID for the atom")
1143            self.setDatasetID(VTD.GRANULE_TERM + '_' + csmlDoc.id)
1144       
1145        title = csmlDoc.name.CONTENT
1146        logging.debug("Got dataset name (title): '%s'" %title)
1147        # NB, if a title is specified (and not as the default value), it automatically is used in
1148        # place of anything in the granulite file
1149        if title and title != "NAME OF DATASET GOES HERE":
1150            logging.info("Title, '%s', extracted from CSML file" %title)
1151            if self.title:
1152                logging.info("- NB, this will override the title specified in the granulite file ('%s')" \
1153                             %self.title)
1154            self.title = title
1155               
1156        bbox1 = csmlDoc.getBoundingBox()
1157        bbox2 = csmlDoc.getCSMLBoundingBox()
1158
1159        time = None
1160        if bbox2:
1161            time = bbox2.getTimeLimits()
1162   
1163        # now check for other parameters to add to granule
1164        # Firstly, extract the bounding envelope
1165        if bbox1:
1166            w, e = utilities.normaliseLongitude(bbox1[0],bbox1[2])
1167            n, s = (bbox1[3], bbox1[1])
1168   
1169            if not aggregateCoverage or (not self.maxY or float(n) > float(self.maxY)):
1170                self.maxY = n
1171               
1172            if not aggregateCoverage or (not self.minY or float(s) < float(self.minY)):
1173                self.minY = s
1174           
1175            if not aggregateCoverage or (not self.minX or float(w) < float(self.minX)):
1176                self.minX = w
1177   
1178            if not aggregateCoverage or (not self.maxX or float(e) > float(self.maxX)):
1179                self.maxX = e
1180           
1181            logging.debug("Got bounding box data from file: (%s, %s) , (%s, %s)" \
1182                          %(w, s, e, n))
1183           
1184            logging.debug("Updated atom bounding box data: (%s, %s) , (%s, %s)" \
1185                          %(self.minX, self.minY, self.maxX, self.maxY))
1186        else:
1187            logging.debug("No valid bounding box data found")
1188   
1189        if time:
1190            t1 = utilities.formatDateYYYYMMDD(time[0])
1191            if not aggregateCoverage or \
1192                (not self.t1 or datetime.datetime.strptime(t1, YEAR_FORMAT) < \
1193                    datetime.datetime.strptime(self.t1, YEAR_FORMAT)):
1194                self.t1 = t1
1195   
1196            t2 = time[1]
1197            if t2 and t2 != 'None':
1198                t2 = utilities.formatDateYYYYMMDD(t2)
1199                if not aggregateCoverage or \
1200                    (not self.t2 or datetime.datetime.strptime(t2, YEAR_FORMAT) > \
1201                        datetime.datetime.strptime(self.t2, YEAR_FORMAT)):
1202                    self.t2 = t2
1203           
1204            logging.debug("Got time range: %s -> %s" %(self.t1, self.t2))
1205        else:
1206            logging.debug("No valid time range data found")
1207   
1208        #create parameter summaries:
1209        #set up list to hold the parameters data
1210        parameters = []
1211        for feature in csmlDoc.featureCollection.featureMembers:
1212            if hasattr(feature.parameter, 'href'):
1213                paramTriple = ""
1214                if hasattr(feature, 'description'):
1215                    paramTriple = feature.description.CONTENT
1216                    paramTriple += " | " + feature.parameter.href
1217                   
1218                    term = ""
1219                    if hasattr(feature, 'name'):
1220                        term = feature.name.CONTENT
1221   
1222                    paramTriple += " | " + term
1223                   
1224                    logging.debug("Got parameter info: %s" %paramTriple)
1225                    parameters.append(paramTriple)
1226       
1227        # update the atom with the extracted parameters
1228        logging.info("Adding CSML parameters to granule atom")
1229        self.addParameters(parameters)
1230        logging.info("Finished adding CSML data")
1231        return csmlDoc
1232
1233
1234    def lookupAssociatedData(self, type, searchClient, lookupIndirectReferences=False):
1235        '''
1236        Check through the atom links and retrieve any associated data of the
1237        specified type
1238        @param type: type of associated data to lookup - currently VTD.DEPLOYMENT_TERM
1239        or VTD.DE_TERM
1240        @param searchClient: Client implementing the AbstractSearchXMLDBClient class
1241        @keyword lookupIndirectReferences: if True, the atom ID is used to search
1242        defined deployments to find those which reference it, otherwise only
1243        deployments data featured in the atom related links are processed
1244        '''
1245        logging.info("Looking up %s info" %type)
1246        self.allActivities = []
1247        self.allObs = []
1248        self.allDpts = []
1249
1250        if type != VTD.DE_TERM and type != VTD.DEPLOYMENT_TERM:
1251            raise ValueError('Unrecognised associated data type: %s' %type)
1252       
1253        # avoid duplicating lookup effort
1254        if (type == VTD.DEPLOYMENT_TERM and self.deployments) or \
1255            (type == VTD.DE_TERM and self.dataEntities):
1256            logging.info("- this info has already been looked up - returning")
1257            return
1258
1259        # firstly, collect all the references to the info required
1260        if lookupIndirectReferences:
1261            logging.info("Looking up indirect references")
1262           
1263            # if we're looking up DE data for deployments data, need to have the
1264            # deployments info looked up first
1265            if type == VTD.DE_TERM and self.isDeployable() and not self.deployments:
1266                self.lookupAssociatedData(VTD.DEPLOYMENT_TERM, searchClient, 
1267                                          lookupIndirectReferences = lookupIndirectReferences)
1268           
1269            logging.info("Looking up references to this atom from other %s" %type)
1270           
1271            # NB, if we're looking up deployments info, we only look up references
1272            # to this atom - if we're looking up DEs, we need to look up references
1273            # to the deployments referenced by this atom
1274            urls = [self.atomBrowseURL]
1275           
1276            if type == VTD.DE_TERM and self.isDeployable():
1277                urls = []
1278                for dep in self.deployments:
1279                    urls.append(dep.browseURL)
1280                   
1281            links = []
1282            for url in urls:
1283                doc = searchClient.getNDGDoc(type, ndgObject.ASSOCIATED_ATOM_DOC_TYPE, url,
1284                                             targetCollection = dc.ATOM_COLLECTION_PATH)
1285                # now need to turn this results set into actual atoms
1286                tree = ET.fromstring(doc)
1287                for atom in tree:
1288                    logging.debug("- found reference in %s" %type)
1289                    links.append(ET.tostring(atom))
1290                   
1291            logging.info("Finished looking up indirect references")
1292        else:
1293            links = self.getLinksOfType(self.VTD.DEPLOYMENT_TERM)
1294
1295        # now retrieve the references and extract the required data
1296        logging.info("Retrieving info from %s references" %type)
1297        if type == VTD.DEPLOYMENT_TERM:
1298            logging.info("Extracting links data to deployment entitites")
1299            self.deployments = []
1300            for link in links:
1301                if lookupIndirectReferences:
1302                    deploymentAtom = link
1303                else:
1304                    localID = link.href.split("__ATOM__")[-1]
1305                    deploymentAtom = searchClient.getNDGDoc('', 
1306                                                            'ATOM', localID, 
1307                                                            targetCollection = dc.ATOM_COLLECTION_PATH)
1308   
1309                deployment = Deployment.Deployment(Atom(xmlString=str(deploymentAtom)))
1310                self.deployments.append(deployment)
1311               
1312                self.addUniqueLinks(self.allActivities, deployment.activities)
1313                self.addUniqueLinks(self.allObs, deployment.obs)
1314                self.addUniqueLinks(self.allDpts, deployment.dpts)
1315        else:
1316            # for DE data, just store the title + link in a Link object
1317            self.dataEntities = []
1318            logging.info("Extracting links data to data entitites")
1319            for data in links:
1320                atom = Atom(xmlString=str(data))
1321                link = Link()
1322                link.title = atom.title
1323                link.href = atom.atomBrowseURL
1324                link.rel = atom.datasetID
1325               
1326                # NB, different deployments may be used by the same DE - so
1327                # avoid duplication
1328                self.addUniqueLinks(self.dataEntities, link)
1329           
1330        logging.info("Finished looking up %s info" %type)
1331
1332
1333    def addUniqueLinks(self, dataArray, links):
1334        '''
1335        Add links to specified array - if they are not already included
1336        @param dataArray: a list, potentially arlready containing links
1337        @param links: a Link or array of Links to add to the dataArray
1338        '''
1339        logging.debug("Adding new links")
1340        if not links:
1341            return
1342       
1343        if type(links) is not list:
1344            links = [links]
1345       
1346        for link in links:
1347            if type(link) is not Link:
1348                logging.warning("Link is not of 'Link' object type (type='%s') - skipping" %type(link))
1349                continue
1350            if link not in dataArray:
1351                logging.debug("- adding unique link")
1352                dataArray.append(link)
1353        logging.debug("Finished adding links")
1354
1355       
1356    def getFullPath(self):
1357        '''
1358        Return full path to atom in eXist, if it exists, or None, otherwise
1359        @return fullPath: string - collection + filename of atom in eXist
1360        '''
1361        # NB, name assigned when atom created in eXist - so if not defined, not
1362        # in eXist
1363        logging.debug("Getting full path to atom")
1364        if self.atomName:
1365            logging.debug("Return full path to atom in eXist")
1366            return self.getDefaultCollectionPath() + self.atomName
1367        logging.debug("Atom doesn't currently exist in eXist - return 'None'")
1368        return None
Note: See TracBrowser for help on using the repository browser.