source: exist/trunk/python/ndgUtils/models/Atom.py @ 4494

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/exist/trunk/python/ndgUtils/models/Atom.py@4494
Revision 4494, 43.9 KB checked in by cbyrom, 12 years ago (diff)

Add new methods to look up simple URLs and vocab term URLs. NB, problems were encountered using the checkURL method, which uses httplib, when running behind a proxy server. Implement usage of the new methods, fix a small bug that kept too many related links, and tidy up unused imports.

Line 
1'''
2 Class representing data in  atom format - allowing moles data to be stored and accessed in a web feed compatible way
3 
4 @author: C Byrom, Tessella Jun 2008
5'''
6try: #python 2.5
7    from xml.etree import cElementTree as ET
8except ImportError:
9    try:
10        # if you've installed it yourself it comes this way
11        import cElementTree as ET
12    except ImportError:
13        # if you've egged it this is the way it comes
14        from ndgUtils.elementtree import cElementTree as ET
15import sys, logging, re, datetime
16from ndgUtils.eXistConnector import eXistConnector
17from ndgUtils.ETxmlView import et2text
18from ndgUtils.lib.utilities import getTripleData, escapeSpecialCharacters, \
19    tidyUpParameters, getISO8601Date, normaliseLongitude
20from ndgUtils.vocabtermdata import VocabTermData as VTD
21from ndgUtils.models import MolesEntity as ME
22import csml.parser as CsmlParser
23from ndgUtils.models import Deployment as Deployment
24
25
class AtomError(Exception):
    """
    Error raised by the atom model classes.

    NB, the message is written to the error log when the exception object is
    created - i.e. before it is actually raised.
    """
    def __init__(self, msg):
        # record the problem in the log, then initialise as a normal exception
        logging.error(msg)
        super(AtomError, self).__init__(msg)
33
34
class Person(object):
    '''
    Class representing atom author type data - with name, uri and role attributes

    The person type selects the element name used in the XML output (see
    ELEMENT_NAMES) and the name of the tag used to hold the uri value.
    '''
    AUTHOR_TYPE = 0
    CONTRIBUTOR_TYPE = 1
    RESPONSIBLE_PARTY_TYPE = 2
    # XML element names - indexed by the type constants above
    ELEMENT_NAMES = ["author", "contributor", "responsibleParty"]

    def __init__(self, personType = AUTHOR_TYPE, namespace = None):
        '''
        @keyword personType: one of the *_TYPE constants (default, AUTHOR_TYPE)
        @keyword namespace: namespace prefix to use for the XML element names
        - NB, this is overridden with 'moles' for responsible parties
        '''
        self.type = personType
        self.ns = namespace
        self.name = ""
        self.uri = ""
        self.role = ""

        # NB, the atom format specifies slightly different data contents
        self.uriTagName = "email"
        # NB, responsible party data is always stored in the moles section
        if self.type == self.RESPONSIBLE_PARTY_TYPE:
            self.ns = 'moles'
            self.uriTagName = "uri"

    def __str__(self):
        '''
        @return: 'name | uri | role' triple string, or an empty string if none
        of these values are set
        '''
        if self.name or self.uri or self.role:
            return self.name + " | " + self.uri + " | " + self.role
        return ""

    def fromString(self, personString):
        '''
        Set up the person from a triple string - as split by getTripleData
        @param personString: triple string holding the name, uri and role
        '''
        (self.name, self.uri, self.role) = getTripleData(personString)

    def fromETElement(self, personTag):
        '''
        Set up the person from an elementtree element
        NB, the child tags are looked up without any namespace prefix
        @param personTag: element with name, role and email/uri child tags
        '''
        self.name = personTag.findtext('name') or ""
        self.role = personTag.findtext('role') or ""
        self.uri = personTag.findtext(self.uriTagName) or ""
        logging.debug("Added name: '%s', role: '%s', %s: '%s'" \
                      %(self.name, self.role, self.uriTagName, self.uri))

    def toXML(self):
        '''
        Create the XML representation of the person - NB, child tags are only
        added for the values which are actually set
        @return: elementtree element for the person
        '''
        prefix = ""
        if self.ns:
            prefix = self.ns + ':'

        author = ET.Element(prefix + self.ELEMENT_NAMES[self.type])

        if self.name:
            name = ET.SubElement(author, prefix + "name")
            name.text = self.name

        if self.uri:
            uri = ET.SubElement(author, prefix + self.uriTagName)
            uri.text = self.uri

        if self.role:
            role = ET.SubElement(author, prefix + "role")
            role.text = self.role

        return author

    def __eq__(self, person1):
        '''
        Persons are equal if their uri, name, role and type all match
        - this allows proper object comparison when checking if Person objects
        are in an array already - i.e. if person in personArray...
        NB, __cmp__ alone is not enough for this since it is ignored by python 3
        '''
        if self is person1:
            return True
        if not isinstance(person1, Person):
            return False
        return self.uri == person1.uri and self.name == person1.name and \
            self.role == person1.role and self.type == person1.type

    def __ne__(self, person1):
        return not self.__eq__(person1)

    # NB, Person objects are mutable and have always been hashed on identity;
    # keep this explicitly since python 3 removes the default hash from
    # classes defining __eq__
    __hash__ = object.__hash__

    def __cmp__(self, person1):
        '''
        Retained for python 2 callers - returns 0 for equal persons, -1 if the
        input is empty and 1 otherwise
        NB, this does not provide a meaningful ordering of persons
        '''
        if not person1:
            return -1

        if self.__eq__(person1):
            return 0
        return 1
108
109
class Link(object):
    '''
    Class representing an atom link - with href, title and rel attributes
    '''

    def __init__(self):
        self.href = ""
        self.title = ""
        self.rel = ""

    def fromString(self, linkString):
        '''
        Set up the link from a triple string - as split by getTripleData
        @param linkString: triple string holding the href, title and rel
        '''
        (self.href, self.title, self.rel) = getTripleData(linkString)

    def fromETElement(self, linkTag):
        '''
        Set up the link from the attributes of an elementtree element
        - NB, missing attributes are stored as empty strings
        @param linkTag: link element to extract the data from
        '''
        self.href = linkTag.attrib.get('href') or ""
        self.rel = linkTag.attrib.get('rel') or ""
        self.title = linkTag.attrib.get('title') or ""

    def toXML(self):
        '''
        @return: elementtree 'link' element with href, title and rel attributes
        '''
        link = ET.Element("link")
        link.attrib["href"] = self.href
        link.attrib["title"] = self.title
        link.attrib["rel"] = self.rel
        return link

    def hasValue(self):
        '''
        @return True if the link has a href or title; False otherwise
        '''
        # NB, just a rel on its own is meaningless - so ignore
        return bool(self.href or self.title)

    def __str__(self):
        '''
        @return: 'href | title | rel' triple string, or an empty string if none
        of these values are set
        '''
        if self.href or self.title or self.rel:
            return self.href + " | " + self.title + " | " + self.rel
        return ""

    def isChildAtom(self):
        '''
        Determines whether the link refers to another atom - e.g. a link to
        a data granule
        @return True, if so; False otherwise
        '''
        # NB, endswith accepts a tuple of suffixes to check against
        childTerms = (VTD.GRANULE_TERM, VTD.DEPLOYMENT_TERM, VTD.ACTIVITY_TERM, \
                      VTD.DPT_TERM, VTD.OBS_TERM)
        return self.rel.endswith(childTerms)

    def __eq__(self, link1):
        '''
        Links are equal if their href, title and rel all match
        - this allows proper object comparison when checking if Link objects
        are in an array already - i.e. if link in linkArray...
        NB, __cmp__ alone is not enough for this since it is ignored by python 3
        '''
        if self is link1:
            return True
        if not isinstance(link1, Link):
            return False
        return self.href == link1.href and self.title == link1.title and \
            self.rel == link1.rel

    def __ne__(self, link1):
        return not self.__eq__(link1)

    # NB, Link objects are mutable and have always been hashed on identity;
    # keep this explicitly since python 3 removes the default hash from
    # classes defining __eq__
    __hash__ = object.__hash__

    def __cmp__(self, link1):
        '''
        Retained for python 2 callers - returns 0 for equal links, -1 if the
        input is empty and 1 otherwise
        NB, this does not provide a meaningful ordering of links
        '''
        if not link1:
            return -1

        if self.__eq__(link1):
            return 0
        return 1
175
176
class Category(object):
    '''
    Atom category element - holding term, scheme and label attributes
    '''
    def __init__(self):
        self.term = self.scheme = self.label = ""

    def fromString(self, linkString, escapeSpecialCharacters=True):
        '''
        Populate the category from a triple string of format, 'label | scheme | term'
        @param linkString: triple string to create category with
        @keyword escapeSpecialCharacters: if set to True, special characters in
        triple string are escaped (default)
        '''
        triple = getTripleData(linkString, doEscape=escapeSpecialCharacters)
        (self.label, self.scheme, self.term) = triple

    def fromETElement(self, linkTag):
        '''
        Populate the category from the attributes of an elementtree element
        - NB, missing attributes end up as empty strings
        @param linkTag: category element to extract the data from
        '''
        attributes = linkTag.attrib
        for attributeName in ('term', 'label', 'scheme'):
            setattr(self, attributeName, attributes.get(attributeName) or "")

    def toXML(self):
        '''
        @return: elementtree 'category' element with term, scheme and label attributes
        '''
        element = ET.Element("category")
        for attributeName in ("term", "scheme", "label"):
            element.attrib[attributeName] = getattr(self, attributeName)
        return element

    def hasValue(self):
        '''
        @return True if any of scheme, label or term are set; False otherwise
        '''
        return bool(self.scheme or self.label or self.term)
212
213
class Atom(object):
    '''
    Class representing data in atom format - allowing moles data to be stored
    and accessed in a web feed compatible way
    '''

    # labels for use with the atom categories
    # - these are used as the category 'term' to mark the categories holding
    # the atom type and subtype; other categories are treated as parameters
    ATOM_TYPE = "ATOM_TYPE"
    ATOM_SUBTYPE = "ATOM_SUBTYPE"

    # labels for use with the templates to set/extract specific inputs
    # NOTE(review): these are not referenced in the code visible here -
    # presumably they are used by the templates/controllers; confirm before changing
    ONLINE_REF_LABEL = "online_ref"
    PARAMETER_LABEL = "parameter"
    ATOM_REF_LABEL = "atom_ref"
    DELIMITER = "---"
    REMOVE_LABEL = "remove"
   
    # format to use for t1-t2 date range
    YEAR_FORMAT = '%Y-%m-%d'
229
230    def __init__(self, atomType = None, vocabTermData = None, ndgObject = None, \
231                 xmlString = None, state = eXistConnector.WORKING_COLLECTION_PATH, **inputs):
232        '''
233        Constructor - initialise the atom variables
234        '''
235        logging.info("Initialising atom")
236        if atomType:
237            logging.info(" - of type '%s'" %atomType)
238        self.atomTypeID = atomType
239
240        # some data have further subtypes specified
241        self.subtypeID = None # this should be the termID
242        self.subtype = None # and this should be the fully formed vocab URL
243       
244        self.ndgObject = ndgObject
245
246        self.atomName = None
247        self.files = []
248        self.author = None
249        self.contributors = []
250        self.atomAuthors = []
251        self.parameters = []
252        self.spatialData = []
253        self.temporalData = []
254        self.relatedLinks = []
255        self.summary = []
256        self.content = []
257        # NB, this deployments data duplicates other atom data - and is only used for a
258        # convenient way to collect the info (by lookupDeploymentsInfo()) for use in templates
259        self.deployments = []   
260        self.csmlFile = None
261        self.cdmlFile = None
262        # general variable to use for setting the atom content - NB, if a csmlFile is specified
263        # (either directly or via a cdmlFile specification), this will be the content by default
264        # for this purpose
265        self.contentFile = None     
266        self.title = None
267        self.datasetID = None        # NB, the dataset id ends up in the atomName - <path><datasetID>.atom
268        self.atomID = None
269   
270        # boundary box info - to replace spatial/temporalData?
271        self.minX = None
272        self.minY = None
273        self.maxX = None
274        self.maxY = None
275        self.t1 = None
276        self.t2 = None
277
278        self.ME = ME.MolesEntity()
279       
280        # date when the atom was first ingested
281        self.publishedDate = None
282
283        # last update date
284        self.updatedDate = None
285
286        # assume atom in working state by default - this is used to define what collection
287        # in eXist the atom is stored in
288        self.state = state
289       
290        # additional, non standard atom data can be included in the molesExtra element
291        if vocabTermData:
292            self.VTD = vocabTermData
293        else:
294            self.VTD = VTD()
295       
296        if xmlString:
297            self.fromString(xmlString)
298
299        # if inputs passed in as dict, add these now
300        if inputs:
301            logging.info("Adding info to atom from input dict")
302            logging.debug(inputs)
303            self.__dict__.update(inputs)
304           
305            # NB, this doesn't trigger the Summary Property, so do this
306            # explicitly, if need be
307            if inputs.has_key('Summary'):
308                self.Summary = inputs.get('Summary')
309            if inputs.has_key('Content'):
310                self.Content = inputs.get('Content')
311           
312            # also pass any moles data up to the moles entity object
313            if inputs.get('providerID'):
314                self.ME.providerID = inputs.get('providerID')
315               
316            if inputs.get('abbreviation'):
317                self.ME.abbreviation = inputs.get('abbreviation')
318
319        if self.atomTypeID:
320            self.atomTypeName = self.VTD.TERM_DATA[self.atomTypeID].title
321
322        logging.info("Atom initialised")
323
324
325    def addOnlineReferences(self, links):
326        '''
327        Add online reference data associated with the atom
328        - NB, care needs to be taken here since this data is stored in the atom
329        link elements and these are also used for the various atom associations
330        @param links: a Link or array of Links to add to the relatedLinks attribute
331        '''
332        logging.debug("Adding online references")
333        if not links:
334            return
335       
336        if type(links) is not list:
337            links = [links]
338       
339        # firstly clear out any online refs data from the existing related links
340        newLinks = []
341        for link in self.relatedLinks:
342            if link.isChildAtom():
343                newLinks.append(link)
344       
345        newLinks.extend(links)
346        self.relatedLinks = newLinks
347        logging.debug("Online references added")
348
349
350    def addUniqueRelatedLinks(self, links):
351        '''
352        Add links to relatedLinks array - if they are not already included
353        @param links: a Link or array of Links to add to the relatedLinks attribute
354        '''
355        self.addUniqueLinks(self.relatedLinks, links)
356       
357
358    def removeRelatedLinks(self, linksToDelete):
359        '''
360        Remove any links in the input list from the atom's related links list
361        @param linksToDelete: array of Link objects to remove from atom
362        '''
363        logging.debug("Removing related links from atom")
364        if not linksToDelete:
365            return
366       
367        if type(linksToDelete) is not list:
368            linksToDelete = [linksToDelete]
369       
370        updatedLinks = []
371        for link in self.relatedLinks:
372            if type(link) is not Link:
373                logging.warning("Link is not of 'Link' object type (type='%s') - skipping" %type(link))
374                continue
375            if link in linksToDelete:
376                logging.debug("- found link to remove")
377            else:
378                updatedLinks.append(link)
379
380        self.relatedLinks = updatedLinks
381        logging.debug("Links removed")
382       
383
384    def getDefaultCollectionPath(self):
385        '''
386        Determine the correct collection to use for the atom in eXist
387        '''
388        collectionPath = eXistConnector.BASE_COLLECTION_PATH + self.state
389       
390        if self.atomTypeID == VTD.DE_TERM:
391            collectionPath += eXistConnector.DE_COLLECTION_PATH
392        elif self.atomTypeID == VTD.GRANULE_TERM:
393            collectionPath += eXistConnector.GRANULE_COLLECTION_PATH
394        elif self.atomTypeID == VTD.ACTIVITY_TERM and \
395            self.subtypeID == VTD.DEPLOYMENT_TERM:
396            collectionPath += eXistConnector.DEPLOYMENTS_COLLECTION_PATH
397        else:
398            collectionPath += eXistConnector.DEPLOYMENT_COLLECTION_PATH
399       
400        if not self.ME.providerID:
401            raise AtomError("Error: cannot determine atom collection path because " + \
402                            "the provider ID is not defined")
403           
404        collectionPath += self.ME.providerID + "/"
405        return collectionPath
406
407
408    def __addAtomTypeDataXML(self, root):
409        '''
410        Add the atom type, and subtype data, if available, to atom categories
411        - and lookup and add the appropriate vocab term data
412        '''
413        if self.atomTypeID:
414            logging.info("Adding atom type info to XML output")
415            category = Category()
416            category.label = self.atomTypeID
417            # look up the appropriate vocab term data
418            category.scheme = self.VTD.getTermCurrentVocabURL(self.atomTypeID)
419            category.term = self.ATOM_TYPE
420            root.append(category.toXML())
421
422        if self.subtypeID:
423            logging.info("Adding atom subtype info to XML output")
424            # NB subtypes not all defined, so leave this out for the moment
425            category.label = self.subtypeID
426            # look up the appropriate vocab term data
427            category.scheme = self.VTD.getTermCurrentVocabURL(self.subtypeID)
428            category.term = self.ATOM_SUBTYPE
429            root.append(category.toXML())
430
431
432    def addMolesEntityData(self, abbreviation, provider_id, object_creation_time):
433        '''
434        Add data to include in the moles entity element
435        '''
436        logging.debug('Adding moles entity information')
437        self.ME.abbreviation = abbreviation
438        self.ME.providerID = provider_id
439        self.ME.createdDate = getISO8601Date(object_creation_time)
440        logging.debug('Moles entity information added')
441
442
443    def addAuthors(self, authors):
444        '''
445        Add author data appropriately to the atom
446        NB, these will overwrite any existing authors of the same type
447        @param authors: list of Person objects with the author data
448        '''
449        logging.debug('Adding authors data to Atom')
450        isFirstAuthor = {}
451        authorArray = None
452        for author in authors:
453            # NB, we're only allowed one atom author
454            if author.type == Person.AUTHOR_TYPE:
455                self.author = author
456                if isFirstAuthor.has_key(author.type):
457                    raise AtomError("Error: an atom can only have one author specified")
458                isFirstAuthor[author.type] = 1
459                continue
460            elif author.type == Person.CONTRIBUTOR_TYPE:
461                authorArray = self.contributors
462            elif author.type == Person.RESPONSIBLE_PARTY_TYPE:
463                authorArray = self.ME.responsibleParties
464               
465            # check if this is the first addition - if so, clear out the
466            # array in advance
467            if not isFirstAuthor.has_key(author.type):
468                logging.debug("Clearing out author array")
469                # NB, need to be careful to clear the array, not create a ref
470                # to a new array
471                del authorArray[:]
472                isFirstAuthor[author.type] = 1
473
474            if str(author) != "" and author not in authorArray:
475                logging.debug("Adding author (type:'%s', name:'%s', uri:'%s', role:'%s')" \
476                              %(author.type, author.name, author.uri, author.role))
477                authorArray.append(author)
478
479        logging.debug('Finished adding authors data')
480
481
482    def _isNewParameter(self, param):
483        '''
484        Check if a parameter is already specified in the atom, return False if
485        so, otherwise return True
486        '''
487        for p in self.parameters:
488            if p.term == param.term and \
489                p.scheme == param.scheme and \
490                p.label == param.label:
491                return False
492        return True
493
494
495    def addRelatedLinks(self, linkVals):
496        '''
497        Add related links in string format - converting to Link objects
498        @param linkVals: string of format, 'uri | title | vocabServerURL'
499        '''
500        self.relatedLinks.append(self.objectify(linkVals, 'relatedLinks'))
501
502
503    def addParameters(self, params):
504        '''
505        Add a parameter to list - ensuring it is unique and has been formatted and tidied appropriately
506        @params param: parameter, as string array, to add to atom parameters collection
507        '''
508        # avoid strings being parsed character by character
509        if type(params) is str:
510            params = [params]
511           
512        for param in params:
513            # firstly tidy parameter
514            param = tidyUpParameters(param)
515            category = Category()
516            # NB, data already tidied up here, so set keyword to avoid this happening again
517            category.fromString(param, escapeSpecialCharacters=False)
518
519            # now check for uniqueness
520            if self._isNewParameter(category):
521                logging.debug("Adding new parameter: %s" %param)
522                self.parameters.append(category)
523   
524   
525    def _linksToXML(self, root):
526        '''
527        Add required links to the input element
528        @param root: element to add links to - NB, should be the root element of the atom
529        '''
530        selfLink = ET.SubElement(root, "link")
531        selfLink.attrib["href"] = self.atomBrowseURL
532        selfLink.attrib["rel"] = "self"
533        if self.subtypeID != VTD.DEPLOYMENT_TERM:
534            molesLink = ET.SubElement(root, "link")
535            molesDoc = re.sub('ATOM','NDG-B1', self.atomBrowseURL)
536            molesLink.attrib["href"] = molesDoc
537            molesLink.attrib["rel"] = 'related'
538       
539        for relatedLink in self.relatedLinks:
540            if relatedLink.hasValue():
541                root.append(relatedLink.toXML())
542   
543    def toXML(self):
544        '''
545        Convert the atom into XML representation and return this
546        @return: xml version of atom
547        '''
548        logging.info("Creating formatted XML version of Atom")
549        root = ET.Element("entry")
550        root.attrib["xmlns"] = "http://www.w3.org/2005/Atom"
551        root.attrib["xmlns:moles"] = "http://ndg.nerc.ac.uk/schema/moles2beta"
552        root.attrib["xmlns:georss"] = "http://www.georss.org/georss/10"
553        root.attrib["xmlns:gml"] = "http://www.opengis.net/gml"
554        id = ET.SubElement(root, "id")
555        id.text = self.atomID
556        title = ET.SubElement(root, "title")
557        title.text = self.title
558        self._linksToXML(root)
559
560        # NB, the author tag is mandatory for atoms - so if an explicit
561        # author has not been set, just take the author to be the provider
562        if not self.author:
563            author = Person()
564            author.name = self.ME.providerID
565            #author.uri = self.ME.providerID
566            self.author = author
567
568        root.append(self.author.toXML())
569           
570        for contributor in self.contributors:
571            root.append(contributor.toXML())
572
573        # add the moles entity section, if it is required
574        if self.ME:
575            root.append(self.ME.toXML())
576
577        # add parameters data
578        for param in self.parameters:
579            if param.hasValue():
580                root.append(param.toXML())
581
582        # add the type and subtype data
583        self.__addAtomTypeDataXML(root)
584                   
585        summary = ET.SubElement(root, "summary")
586        summary.text = self.Summary
587                   
588        # add link to content, if required - NB, can only have one content element in atom
589        # - and this is mandatory
590        content = ET.SubElement(root, "content")
591        if self.contentFile:
592            content.attrib["type"] = "application/xml"
593            content.attrib["src"] = self.contentFile
594        else:
595            content.text = self.Content
596            content.attrib["type"] = "xhtml"
597       
598        # if there's a published date already defined, assume we're doing an update now
599        # NB, update element is mandatory
600        currentDate = datetime.datetime.today().strftime("%Y-%m-%dT%H:%M:%SZ")
601        if not self.publishedDate:
602            self.publishedDate = currentDate
603
604        updated = ET.SubElement(root, "updated")
605        if not self.updatedDate:
606            self.updatedDate = currentDate
607        updated.text = self.updatedDate
608
609        published = ET.SubElement(root, "published")
610        published.text = self.publishedDate
611
612        # add temporal range data, if available
613        temporalRange = ET.SubElement(root, "moles:temporalRange")
614        if self.t1:
615            temporalRange.text = self.t1
616            if self.t2:
617                temporalRange.text += "/" + self.t2
618
619        # add spatial range data, if available
620        self._addSpatialData(root)
621
622        tree = ET.ElementTree(root)
623        logging.info("XML version of Atom created")
624        return tree
625
626
627    def __getSummary(self):
628        logging.debug("Getting summary data")
629        summaryString = ""
630        for summary_line in self.summary:
631            summaryString += summary_line + "\n"
632
633        return summaryString
634
635    def __setSummary(self, summary):
636        logging.debug("Adding summary data")
637        self.summary = []
638        for summary_line in summary.split('\n'):
639            self.summary.append(escapeSpecialCharacters(summary_line))
640           
641    Summary = property(fset=__setSummary, fget=__getSummary, doc="Atom summary")
642
643
644    def __getContent(self):
645        logging.debug("Getting content data")
646        contentString = ""
647        # NB, there must be content specified in an atom
648        if not self.content:
649            return "Metadata document"
650       
651        for content_line in self.content:
652            contentString += content_line + "\n"
653
654        return contentString
655
656    def __setContent(self, content):
657        logging.debug("Adding content data")
658        self.content = []
659        for content_line in content.split('\n'):
660            self.content.append(content_line)
661           
662    Content = property(fset=__setContent, fget=__getContent, doc="Atom content")
663
664           
665    def fromString(self, xmlString):
666        '''
667        Initialise Atom object using an xmlString
668        @param xmlString: representation of atom as an XML string
669        '''
670        logging.info("Ingesting data from XML string")
671       
672        # firstly, remove any namespaces used - to avoid problems with elementtree
673        logging.debug("Stripping moles namespace from string to allow easy handling with elementtree")
674        xmlString = xmlString.replace('moles:', '')
675        xmlString = xmlString.replace('georss:', '')
676        xmlString = xmlString.replace('gml:', '')
677        xmlString = xmlString.replace('xmlns="http://www.w3.org/2005/Atom"', '')
678        xmlString = xmlString.replace('default:', '')
679
680        # now create elementtree with the XML string
681        logging.debug("Create elementtree instance with XML string")
682        tree = ET.fromstring(xmlString)
683       
684        title = tree.findtext('title')
685        if title:
686            logging.debug("Adding title data")
687            self.title = title
688
689        summary = tree.findtext('summary')
690        if summary:
691            self.Summary = summary#.decode('unicode_escape')
692
693        authorElement = tree.find('author')
694        logging.debug("Adding author data")
695        author = Person()
696        author.fromETElement(authorElement)
697        self.author = author
698
699        contributorElements = tree.findall('contributor')
700        for contributorElement in contributorElements:
701            logging.debug("Adding contributor data")
702            contributor = Person(personType = Person.CONTRIBUTOR_TYPE)
703            contributor.fromETElement(contributorElement)
704            self.contributors.append(contributor)
705
706        molesElement = tree.find('entity')
707        if molesElement:
708            self.ME.fromET(molesElement)
709               
710        self.atomID = tree.findtext('id')
711
712        self._parseCategoryData(tree.findall('category'))
713
714        self._parseLinksData(tree.findall('link'))
715           
716        contentTag = tree.find('content')
717        if contentTag != None:
718            logging.debug("Found content tag - checking for CSML/CDML file data")
719            file = contentTag.attrib.get('src')
720            if file:
721                # NB, the path will reveal more reliably whether we're dealing with CSML and CDML files
722                if file.upper().find('CSML') > -1:
723                    logging.debug("Adding CSML file data")
724                    self.csmlFile = file
725                elif file.upper().find('CDML') > -1:
726                    logging.debug("Adding CDML file data")
727                    self.cdmlFile = file
728                self.contentFile = file
729            else:
730                logging.debug("No file data - adding contents of element instead")
731                self.Content = contentTag.text
732       
733        range = tree.findtext('temporalRange')
734        if range:
735            logging.debug("Adding temporal range data")
736            timeData = range.split('/')
737            self.t1 = timeData[0]
738            if len(timeData) > 1:
739                self.t2 = timeData[1]
740       
741        # NB, this parser won't mind if we're dealing with Envelope or EnvelopeWithTimePeriod
742        minBBox = tree.findall('.//lowerCorner')
743        if minBBox:
744            logging.debug("Adding min spatial range data")
745            minBBox = minBBox[0]
746            spatialData = minBBox.text.split()
747            self.minX = spatialData[0]
748            if len(spatialData) > 1:
749                self.minY = spatialData[1]
750       
751        maxBBox = tree.findall('.//upperCorner')
752        if maxBBox:
753            maxBBox = maxBBox[0]
754            logging.debug("Adding max spatial range data")
755            spatialData = maxBBox.text.split()
756            self.maxX = spatialData[0]
757            if len(spatialData) > 1:
758                self.maxY = spatialData[1]
759               
760        publishedDate = tree.findtext('published')
761        if publishedDate:
762            logging.debug("Adding published date")
763            self.publishedDate = publishedDate
764               
765        updatedDate = tree.findtext('updated')
766        if updatedDate:
767            logging.debug("Adding updated date")
768            self.updatedDate = updatedDate
769           
770        logging.info("Completed data ingest")
771   
772   
773    def _parseCategoryData(self, categories):
774        logging.debug("Adding category/parameters data")
775        for category in categories:
776            cat = Category()
777            cat.fromETElement(category)
778           
779            if cat.term == self.ATOM_TYPE:
780                logging.debug("Found atom type data")
781                self.atomTypeID = cat.label
782                self.atomTypeName = self.VTD.TERM_DATA[cat.label].title
783                continue
784            elif cat.term == self.ATOM_SUBTYPE:
785                logging.debug("Found atom subtype data")
786                self.subtypeID = cat.label
787                self.subtype = cat.scheme
788                continue
789
790            self.parameters.append(cat)
791   
792
793    def setDatasetID(self, datasetID):
794        '''
795        Set the dataset ID for the atom - and generate an appropriate atom name using this
796        @param datasetID: ID to set for the atom
797        '''
798        self.datasetID = datasetID
799        self._generateAtomName(datasetID) 
800        self.atomID = self.createAtomID(datasetID)
801
802
803    def createAtomID(self, datasetID):
804        '''
805        Create a unique ID, conforming to atom standards, for atom
806        NB, see http://diveintomark.org/archives/2004/05/28/howto-atom-id
807        @param datasetID: ID of atom's dataset
808        @return: unique ID
809        '''
810        logging.info("Creating unique ID for atom")
811        if not self.atomBrowseURL:
812            self._generateAtomName(datasetID)
813        urlBit = self.atomBrowseURL.split('://')[1]
814        urlBit = urlBit.replace('#', '')
815        urlBits = urlBit.split('/')
816        dateBit = datetime.datetime.today().strftime("%Y-%m-%d")
817       
818        id = "tag:" + urlBits[0] + "," + dateBit + ":/" + "/".join(urlBits[1:])
819        logging.info("- unique ID created for atom")
820        logging.debug(" - '%s'" %id)
821        return id
822       
823       
824    def _generateAtomName(self, datasetID):
825        '''
826        Generate a consistent name for the atom - with full eXist doc path
827        @param datasetID: ID of atom's dataset
828        '''
829        self.atomName = datasetID + ".atom"
830        self.ndgURI = self.ME.providerID + "__ATOM__" + datasetID
831        self.atomBrowseURL = VTD.BROWSE_ROOT_URL + self.ndgURI
832
833
834    def _parseLinksData(self, links):
835        '''
836        Extract links and atom data from array of link elements in the XML representation of the atom
837        @param links: an array of <link> elements
838        '''
839        # firstly, get all data to start with, so we can properly process it afterwards
840        linkData = {}
841        logging.debug("Getting link data")
842        for linkTag in links:
843            link = Link()
844            link.fromETElement(linkTag)
845
846            if not linkData.has_key(link.rel):
847                linkData[link.rel] = []
848           
849            linkData[link.rel].append(link)
850
851        # there should be one self referencing link - which will provide info on the atom itself
852        if not linkData.has_key('self'):
853            errorMessage = "Atom does not have self referencing link - " + \
854                "cannot ascertain datasetID without this - please fix"
855            logging.error(errorMessage)
856            raise ValueError(errorMessage)
857       
858        # this is the link describing the atom itself
859        self.atomBrowseURL = linkData['self'][0].href
860       
861        self.datasetID = self.atomBrowseURL.split("__ATOM__")[-1]
862        self.atomName = self.datasetID + ".atom"
863        self.ndgURI = self.atomBrowseURL.split(VTD.BROWSE_ROOT_URL)[1]
864       
865        # now remove this value and the associated moles doc link
866        del linkData['self']
867        molesDoc = self.atomBrowseURL.replace('ATOM', 'NDG-B1')
868        if linkData.has_key('related'):
869            relatedLinks = []
870            for link in linkData['related']:
871                if link.href != molesDoc:
872                    relatedLinks.append(link)
873           
874            linkData['related'] = relatedLinks
875               
876        # now add the remaining links to the atom
877        for key in linkData:
878            for link in linkData[key]:
879                logging.debug("Adding link data")
880                self.relatedLinks.append(link)
881       
882
883    def _addSpatialData(self, element):
884        '''
885        Add spatial coverage element to an input element
886        @param element: element to add coverage data to
887        '''
888        logging.info("Adding spatial data to Atom")
889        if not self.minX:
890            logging.info("No spatial data specified")
891            return
892        bbox = ET.SubElement(element, "georss:where")
893        envelope = ET.SubElement(bbox, "gml:Envelope")
894        lc = ET.SubElement(envelope, "gml:lowerCorner")
895        lc.text = self.minX + " " + self.minY
896        uc = ET.SubElement(envelope, "gml:upperCorner")
897        uc.text = self.maxX + " " + self.maxY
898
899       
900    def setAttribute(self, attributeName, attributeValue):
901        '''
902        Set the value of an atom attribute - and do some basic tidying up of the string content
903        - to escape any XML unfriendly characters
904        @param attributeName: name of the attribute whose value to set
905        @param attributeValue: value to set the attribute to 
906        '''
907        logging.debug("Setting attribute, %s, to %s" %(attributeName, attributeValue))
908        origValue = attributeValue
909       
910        # escape any special characters if a value has been specified
911        # NB, need to cope with both single values and arrays
912        if attributeValue:
913            if type(attributeValue) is list:
914                newVals = []
915                for val in attributeValue:
916                    newVals.append(self.objectify(escapeSpecialCharacters(val), attributeName))
917                attributeValue = newVals
918                   
919            else:
920                attributeValue = self.objectify(escapeSpecialCharacters(attributeValue), attributeName)
921
922        # handle the special case of authors; only one author is allowed per atom
923        # - the others should be treated as contributors
924        if attributeName == "authors":
925            setattr(self, "author", attributeValue[0])
926            if len(attributeValue) > 1:
927                setattr(self, "contributors", attributeValue[1:])
928        elif attributeName == "atomAuthors":
929            self.ME.responsibleParties.append(attributeValue)
930        else:
931            setattr(self, attributeName, attributeValue)
932
933
934    def objectify(self, objectVals, attributeName):
935        '''
936        Some inputs are specified as strings but need to be converted into
937        objects - do this here
938        @param objectVals: a '|' delimited string of values
939        @param attributeName: name of attribute the values belong to
940        '''
941        obj = None
942        if type(objectVals) != str:
943            return objectVals
944       
945        if attributeName == "relatedLinks":
946            obj = Link()
947        elif attributeName == "atomAuthors" or attributeName == "authors":
948            obj = Person()
949
950        if obj:
951            obj.fromString(objectVals)
952            return obj
953       
954        return objectVals
955
956
957    def toPrettyXML(self):
958        '''
959        Returns nicely formatted XML as string
960        '''
961        atomXML = self.toXML()
962
963        # create the string
964        logging.debug("Converting the elementtree object into a string")
965        prettyXML = et2text(atomXML.getroot())
966
967        # add XML version tag
968        prettyXML = "<?xml version=\"1.0\" encoding=\"utf-8\" ?>\n" + prettyXML
969        logging.info("Created formatted version of XML object")
970        return prettyXML
971
972
973    def getLinksOfType(self, termID):
974        '''
975        Returns links in the atom related links attribute which match the specified
976        term ID
977        @param termID: the termID to look for in the related links - NB, this is
978        matched to the end of the link.rel value
979        @return links: array of Link objects with matching term type
980        '''
981        logging.debug("Getting atom links of type, '%s'" %termID)
982        matchingLinks = []
983        for link in self.relatedLinks:
984            # firstly, handle special case where we only want the online ref type links
985            # returned
986            if termID == self.ONLINE_REF_LABEL:
987                if not link.isChildAtom():
988                    logging.debug("- found link with matching term type")
989                    matchingLinks.append(link)
990               
991            elif link and link.rel and link.rel.lower().endswith(termID.lower()):
992                logging.debug("- found link with matching term type")
993                matchingLinks.append(link)
994               
995        logging.debug("Returning matched links")
996        return matchingLinks
997       
998       
999    def getLogos(self):
1000        '''
1001        Return related links that are logos
1002        @return: array of Links containing the logos for the atom
1003        '''
1004        logos = []
1005        for link in self.relatedLinks:
1006            if link.rel.lower().endswith(VTD.LOGO_TERM.lower()):
1007                logos.append(link)
1008               
1009        return logos
1010   
1011   
1012    def isGranule(self):
1013        if self.atomTypeID == VTD.GRANULE_TERM:
1014            return True
1015        return False
1016   
1017   
1018    def isDE(self):
1019        if self.atomTypeID == VTD.DE_TERM:
1020            return True
1021        return False
1022   
1023    def isDeployment(self):
1024        if self.subtypeID and self.subtypeID == VTD.DEPLOYMENT_TERM:
1025            return True
1026        return False
1027   
1028    def isDeployable(self):
1029        if (self.atomTypeID == VTD.ACTIVITY_TERM and self.subtypeID != VTD.DEPLOYMENT_TERM) or \
1030            self.atomTypeID == VTD.DPT_TERM or \
1031            self.atomTypeID == VTD.OBS_TERM:
1032            return True
1033        return False
1034
1035       
1036    def addCSMLData(self, csml, aggregateCoverage=False):
1037        '''
1038        Parse CSML data and add extracted info to the atom
1039        @param csml: csml file contents - or path to csml file
1040        @keyword aggregateCoverage: if set to True, only coverage data that extends the
1041        atom coverage data will be added
1042        '''
1043        logging.info("Creating CSML data model")
1044        self.csmlFile = csml.filename
1045        csmlDoc = CsmlParser.Dataset(file=csml.value)
1046       
1047        logging.info("Extracting info from CSML file")
1048        logging.debug("Got dataset ID: %s" %csmlDoc.id)
1049        self.setDatasetID(csmlDoc.id)
1050       
1051        title = csmlDoc.name.CONTENT
1052        logging.debug("Got dataset name (title): '%s'" %title)
1053        # NB, if a title is specified (and not as the default value), it automatically is used in
1054        # place of anything in the granulite file
1055        if title and title != "NAME OF DATASET GOES HERE":
1056            logging.info("Title, '%s', extracted from CSML file" %title)
1057            if self.title:
1058                logging.info("- NB, this will override the title specified in the granulite file ('%s')" \
1059                             %self.title)
1060            self.title = title
1061               
1062        bbox1 = csmlDoc.getBoundingBox()
1063        bbox2 = csmlDoc.getCSMLBoundingBox()
1064        time = bbox2.getTimeLimits()
1065   
1066        # now check for other parameters to add to granule
1067        # Firstly, extract the bounding envelope
1068        if bbox1:
1069            w, e = normaliseLongitude(bbox1[0],bbox1[2])
1070            n, s = (bbox1[3], bbox1[1])
1071   
1072            if not aggregateCoverage or (not self.maxY or float(n) > float(self.maxY)):
1073                self.maxY = n
1074               
1075            if not aggregateCoverage or (not self.minY or float(s) < float(self.minY)):
1076                self.minY = s
1077           
1078            if not aggregateCoverage or (not self.minX or float(w) < float(self.minX)):
1079                self.minX = w
1080   
1081            if not aggregateCoverage or (not self.maxX or float(e) > float(self.maxX)):
1082                self.maxX = e
1083           
1084            logging.debug("Got bounding box data from file: (%s, %s) , (%s, %s)" \
1085                          %(w, s, e, n))
1086           
1087            logging.debug("Updated atom bounding box data: (%s, %s) , (%s, %s)" \
1088                          %(self.minX, self.minY, self.maxX, self.maxY))
1089        else:
1090            logging.debug("No valid bounding box data found")
1091   
1092        if time:
1093            t1 = formatDateYYYYMMDD(time[0])
1094            if not aggregateCoverage or \
1095                (not self.t1 or datetime.datetime.strptime(t1, YEAR_FORMAT) < \
1096                    datetime.datetime.strptime(self.t1, YEAR_FORMAT)):
1097                self.t1 = t1
1098   
1099            t2 = time[1]
1100            if t2 and t2 != 'None':
1101                t2 = formatDateYYYYMMDD(t2)
1102                if not aggregateCoverage or \
1103                    (not self.t2 or datetime.datetime.strptime(t2, YEAR_FORMAT) > \
1104                        datetime.datetime.strptime(self.t2, YEAR_FORMAT)):
1105                    self.t2 = t2
1106           
1107            logging.debug("Got time range: %s -> %s" %(self.t1, self.t2))
1108        else:
1109            logging.debug("No valid time range data found")
1110   
1111        #create parameter summaries:
1112        #set up list to hold the parameters data
1113        parameters = []
1114        for feature in csmlDoc.featureCollection.featureMembers:
1115            if hasattr(feature.parameter, 'href'):
1116                paramTriple = ""
1117                if hasattr(feature, 'description'):
1118                    paramTriple = feature.description.CONTENT
1119                    paramTriple += " | " + feature.parameter.href
1120                   
1121                    term = ""
1122                    if hasattr(feature, 'name'):
1123                        term = feature.name.CONTENT
1124   
1125                    paramTriple += " | " + term
1126                   
1127                    logging.debug("Got parameter info: %s" %paramTriple)
1128                    parameters.append(paramTriple)
1129       
1130        # update the atom with the extracted parameters
1131        logging.info("Adding CSML parameters to granule atom")
1132        self.addParameters(parameters)
1133        logging.info("Finished adding CSML data")
1134
1135
1136    def lookupDeploymentsInfo(self, dr, lookupIndirectReferences=False):
1137        '''
1138        Check through the atom links and retrieve any associated deployments
1139        data
1140        @param dr: Instance of DocumentRetrieve object - NB, this requires eXist
1141        config details which are not available to the Atom object
1142        @keyword lookupIndirectReferences: if True, the atom ID is used to search
1143        defined deployments to find those which reference it, otherwise only
1144        deployments data featured in the atom related links are processed
1145        '''
1146        logging.info("Looking up deployments info")
1147        self.deployments = []
1148        self.allActivities = []
1149        self.allObs = []
1150        self.allDpts = []
1151
1152        if lookupIndirectReferences:
1153            logging.info("Looking up references to this atom from other deployments")
1154            doc = dr.get(self.ME.providerID, dr.ATOM_DEPLOYMENTS, self.atomBrowseURL, \
1155                                        targetCollection='/db/atoms/')
1156            # now need to turn this results set into actual atoms
1157            tree = ET.fromstring(doc)
1158            links = []
1159            for atom in tree:
1160                logging.debug("- found reference in deployment")
1161                links.append(ET.tostring(atom))
1162           
1163        else:
1164            links = self.getLinksOfType(self.VTD.DEPLOYMENT_TERM)
1165
1166        for link in links:
1167            if lookupIndirectReferences:
1168                deploymentAtom = link
1169            else:
1170                localID = link.href.split("__ATOM__")[-1]
1171                deploymentAtom = dr.get(self.ME.providerID, 'ATOM', localID, \
1172                                        targetCollection='/db/atoms/')
1173
1174            deployment = Deployment.Deployment(Atom(xmlString=str(deploymentAtom)))
1175            self.deployments.append(deployment)
1176           
1177            self.addUniqueLinks(self.allActivities, deployment.activities)
1178            self.addUniqueLinks(self.allObs, deployment.obs)
1179            self.addUniqueLinks(self.allDpts, deployment.dpts)
1180       
1181        logging.info("Finished looking up deployments info")
1182
1183
1184    def addUniqueLinks(self, dataArray, links):
1185        '''
1186        Add links to specified array - if they are not already included
1187        @param dataArray: a list, potentially arlready containing links
1188        @param links: a Link or array of Links to add to the dataArray
1189        '''
1190        logging.debug("Adding new links")
1191        if not links:
1192            return
1193       
1194        if type(links) is not list:
1195            links = [links]
1196       
1197        for link in links:
1198            if type(link) is not Link:
1199                logging.warning("Link is not of 'Link' object type (type='%s') - skipping" %type(link))
1200                continue
1201            if link not in dataArray:
1202                logging.debug("- adding unique link")
1203                dataArray.append(link)
1204        logging.debug("Finished adding links")
Note: See TracBrowser for help on using the repository browser.