source: exist/branches/proglue_production_rev4605_xquery4884/python/ndgUtils/models/Atom.py @ 4886

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/exist/branches/proglue_production_rev4605_xquery4884/python/ndgUtils/models/Atom.py@4886
Revision 4886, 48.5 KB checked in by sdonegan, 10 years ago (diff)

Creating operational branch for ndgUtils as used on proglue

Line 
1'''
2 Class representing data in  atom format - allowing moles data to be stored and accessed in a web feed compatible way
3 
4 @author: C Byrom, Tessella Jun 2008
5'''
6try: #python 2.5
7    from xml.etree import cElementTree as ET
8except ImportError:
9    try:
10        # if you've installed it yourself it comes this way
11        import cElementTree as ET
12    except ImportError:
13        # if you've egged it this is the way it comes
14        from ndgUtils.elementtree import cElementTree as ET
15import sys, logging, re, datetime
16from ndgUtils.eXistConnector import eXistConnector
17from ndgUtils.ETxmlView import et2text
18import ndgUtils.lib.utilities as utilities
19from ndgUtils.vocabtermdata import VocabTermData as VTD
20from ndgUtils.models import MolesEntity as ME
21import csml.parser as CsmlParser
22from ndgUtils.models import Deployment as Deployment
23
24
class AtomError(Exception):
    '''
    Error type raised by the Atom model code.

    The message is written to the error log as soon as the exception object
    is constructed, before it is handed to the base Exception.
    '''
    def __init__(self, msg):
        # log at construction time, so the message is recorded even if the
        # exception is subsequently caught and discarded
        logging.error(msg)
        super(AtomError, self).__init__(msg)
32
33
class Person(object):
    '''
    Class representing atom author type data - with name, uri and role attributes

    Two Person objects are regarded as equal when their uri, name, role and
    type all match - this is what membership checks such as
    'if person in personArray' rely upon.
    '''
    AUTHOR_TYPE = 0
    CONTRIBUTOR_TYPE = 1
    RESPONSIBLE_PARTY_TYPE = 2
    # element names to use in the XML output - indexed by the type constants above
    ELEMENT_NAMES = ["author", "contributor", "responsibleParty"]

    def __init__(self, personType = AUTHOR_TYPE, namespace = None):
        '''
        @keyword personType: one of the *_TYPE class constants (default, AUTHOR_TYPE)
        @keyword namespace: prefix to add to the element names created by toXML
        - NB, this is overridden with 'moles' for responsible parties
        '''
        self.type = personType
        self.ns = namespace
        self.name = ""
        self.uri = ""
        self.role = ""

        # NB, the atom format specifies slightly different data contents
        self.uriTagName = "email"
        # NB, responsible party data is always stored in the moles section
        if self.type == self.RESPONSIBLE_PARTY_TYPE:
            self.ns = 'moles'
            self.uriTagName = "uri"

    def __str__(self):
        '''
        @return: 'name | uri | role' triple string - or an empty string if
        none of the three values are set
        '''
        if self.hasValue():
            return self.name + " | " + self.uri + " | " + self.role
        return ""

    def hasValue(self):
        '''
        @return: True if any of name, uri or role are set; False otherwise
        '''
        return bool(self.name or self.uri or self.role)

    def fromString(self, personString):
        '''
        Set the name, uri and role - in that order - from a triple string
        @param personString: string to parse with utilities.getTripleData
        '''
        (self.name, self.uri, self.role) = utilities.getTripleData(personString)

    def fromETElement(self, personTag):
        '''
        Set the person data from an elementtree element
        @param personTag: element with (un-prefixed) name, role and
        email/uri child elements; missing children result in empty strings
        '''
        self.name = personTag.findtext('name') or ""
        self.role = personTag.findtext('role') or ""
        self.uri = personTag.findtext(self.uriTagName) or ""
        logging.debug("Added name: '%s', role: '%s', %s: '%s'" \
                      %(self.name, self.role, self.uriTagName, self.uri))

    def toXML(self):
        '''
        Create the XML representation of the person
        @return: elementtree element, named according to the person type and
        with a child element for each of name, uri and role that is set
        '''
        prefix = ""
        if self.ns:
            prefix = self.ns + ':'

        author = ET.Element(prefix + self.ELEMENT_NAMES[self.type])

        if self.name:
            name = ET.SubElement(author, prefix + "name")
            name.text = self.name

        if self.uri:
            uri = ET.SubElement(author, prefix + self.uriTagName)
            uri.text = self.uri

        if self.role:
            role = ET.SubElement(author, prefix + "role")
            role.text = self.role

        return author

    def __cmp__(self, person1):
        '''
        Override comparison to allow proper object comparison when checking
        if Person objects are in an array already - i.e. if person in personArray...
        @return: 0 if the objects match, -1 if person1 is empty/None, 1 otherwise
        '''
        if not person1:
            return -1

        if self is person1:
            return 0

        try:
            if self.uri == person1.uri and self.name == person1.name and \
                    self.role == person1.role and self.type == person1.type:
                return 0
        except AttributeError:
            # person1 is not a Person-like object - so it cannot be a match
            pass
        return 1

    # NB, __cmp__ is only consulted by python 2; the rich comparison methods
    # below give the same answer on interpreters that ignore __cmp__.
    # __hash__ is deliberately left alone, so python 2 hashing is unchanged.
    def __eq__(self, person1):
        return self.__cmp__(person1) == 0

    def __ne__(self, person1):
        return not self.__eq__(person1)
113
114
class Link(object):
    '''
    Class representing an atom link - with href, title and rel attributes

    Two Link objects are regarded as equal when their href, title and rel
    all match - this is what membership checks such as
    'if link in linkArray' rely upon.
    '''

    def __init__(self):
        self.href = ""
        self.title = ""
        self.rel = ""

    def fromString(self, linkString):
        '''
        Set the href, title and rel - in that order - from a triple string
        @param linkString: string to parse with utilities.getTripleData
        '''
        (self.href, self.title, self.rel) = utilities.getTripleData(linkString)

    def fromETElement(self, linkTag):
        '''
        Set the link data from the attributes of an elementtree element
        @param linkTag: <link> element; missing attributes result in empty strings
        '''
        self.href = linkTag.attrib.get('href') or ""
        self.rel = linkTag.attrib.get('rel') or ""
        self.title = linkTag.attrib.get('title') or ""

    def toXML(self):
        '''
        @return: <link> elementtree element with href, title and rel
        attributes - NB, these are always included, even when empty
        '''
        link = ET.Element("link")
        link.attrib["href"] = self.href
        link.attrib["title"] = self.title
        link.attrib["rel"] = self.rel
        return link

    def hasValue(self):
        '''
        @return: True if the link has a href or title; False otherwise
        '''
        # NB, just a rel on its own is meaningless - so ignore
        return bool(self.href or self.title)

    def __str__(self):
        '''
        @return: 'href | title | rel' triple string - or an empty string if
        none of the three values are set
        '''
        if self.href or self.title or self.rel:
            return self.href + " | " + self.title + " | " + self.rel
        return ""

    def isChildAtom(self):
        '''
        Determines whether the link refers to another atom - e.g. a link to
        a data granule
        @return True, if so; False otherwise
        '''
        # the rel holds a vocab reference - so check what term it ends with
        return (self.rel.endswith(VTD.GRANULE_TERM) or
                self.rel.endswith(VTD.DEPLOYMENT_TERM) or
                self.rel.endswith(VTD.ACTIVITY_TERM) or
                self.rel.endswith(VTD.DPT_TERM) or
                self.rel.endswith(VTD.OBS_TERM))

    def __cmp__(self, link1):
        '''
        Override comparison to allow proper object comparison when checking
        if Link objects are in an array already - i.e. if link in linkArray...
        @return: 0 if the objects match, -1 if link1 is empty/None, 1 otherwise
        '''
        if not link1:
            return -1

        if self is link1:
            return 0

        try:
            if self.href == link1.href and self.title == link1.title and \
                    self.rel == link1.rel:
                return 0
        except AttributeError:
            # link1 is not a Link-like object - so it cannot be a match
            pass
        return 1

    # NB, __cmp__ is only consulted by python 2; the rich comparison methods
    # below give the same answer on interpreters that ignore __cmp__.
    # __hash__ is deliberately left alone, so python 2 hashing is unchanged.
    def __eq__(self, link1):
        return self.__cmp__(link1) == 0

    def __ne__(self, link1):
        return not self.__eq__(link1)
180
181
class Category(object):
    '''
    Class representing an atom category - with term, scheme and label attributes
    '''
    def __init__(self):
        self.term = ""
        self.scheme = ""
        self.label = ""

    def fromString(self, linkString, escapeSpecialCharacters=True):
        '''
        Populate the category from a triple string of format, 'label | scheme | term'
        @param linkString: triple string holding the category data
        @keyword escapeSpecialCharacters: passed on to utilities.getTripleData
        as its doEscape keyword (default, True)
        '''
        triple = utilities.getTripleData(linkString, \
            doEscape=escapeSpecialCharacters)
        (self.label, self.scheme, self.term) = triple

    def fromETElement(self, linkTag):
        '''
        Populate the category from the attributes of an elementtree element
        @param linkTag: <category> element; missing attributes give empty strings
        '''
        attributes = linkTag.attrib
        self.term = attributes.get('term') or ""
        self.label = attributes.get('label') or ""
        self.scheme = attributes.get('scheme') or ""

    def toXML(self):
        '''
        @return: <category> elementtree element with term, scheme and label
        attributes - NB, these are always included, even when empty
        '''
        categoryElement = ET.Element("category")
        categoryElement.set("term", self.term)
        categoryElement.set("scheme", self.scheme)
        categoryElement.set("label", self.label)
        return categoryElement

    def hasValue(self):
        '''
        @return: True if any of scheme, label or term are set; False otherwise
        '''
        return bool(self.scheme or self.label or self.term)
217
218
219class Atom(object):
220
221    # labels for use with the atom categories
222    ATOM_TYPE = "ATOM_TYPE"
223    ATOM_SUBTYPE = "ATOM_SUBTYPE"
224
225    # labels for use with the templates to set/extract specific inputs
226    ONLINE_REF_LABEL = "online_ref"
227    PARAMETER_LABEL = "parameter"
228    ATOM_REF_LABEL = "atom_ref"
229    DELIMITER = "---"
230    REMOVE_LABEL = "remove"
231   
232    # format to use for t1-t2 date range
233    YEAR_FORMAT = '%Y-%m-%d'
234
235    def __init__(self, atomType = None, vocabTermData = None, ndgObject = None, \
236                 xmlString = None, state = eXistConnector.WORKING_COLLECTION_PATH, **inputs):
237        '''
238        Constructor - initialise the atom variables
239        '''
240        logging.info("Initialising atom")
241        if atomType:
242            logging.info(" - of type '%s'" %atomType)
243        self.atomTypeID = atomType
244
245        # some data have further subtypes specified
246        self.subtypeID = None # this should be the termID
247        self.subtype = None # and this should be the fully formed vocab URL
248       
249        self.ndgObject = ndgObject
250
251        self.atomName = None
252        self.files = []
253        self.author = Person()
254        self.contributors = []
255        self.atomAuthors = []
256        self.parameters = []
257        self.spatialData = []
258        self.temporalData = []
259        self.relatedLinks = []
260        self.summary = []
261        self.content = []
262        # NB, this deployments data duplicates other atom data - and is only used for a
263        # convenient way to collect the info (by lookupAssociatedData()) for use in templates
264        self.deployments = []
265        # ditto for the following field
266        self.dataEntities = []
267           
268        self.csmlFile = None
269        self.cdmlFile = None
270        # general variable to use for setting the atom content - NB, if a csmlFile is specified
271        # (either directly or via a cdmlFile specification), this will be the content by default
272        # for this purpose
273        self.contentFile = None     
274        self.title = None
275        self.datasetID = None        # NB, the dataset id ends up in the atomName - <path><datasetID>.atom
276        self.atomID = None
277   
278        # boundary box info - to replace spatial/temporalData?
279        self.minX = None
280        self.minY = None
281        self.maxX = None
282        self.maxY = None
283        self.t1 = None
284        self.t2 = None
285
286        self.ME = ME.MolesEntity()
287       
288        # date when the atom was first ingested
289        self.publishedDate = None
290
291        # last update date
292        self.updatedDate = None
293
294        # assume atom in working state by default - this is used to define what collection
295        # in eXist the atom is stored in
296        self.state = state
297       
298        # additional, non standard atom data can be included in the molesExtra element
299        if vocabTermData:
300            self.VTD = vocabTermData
301        else:
302            self.VTD = VTD()
303       
304        if xmlString:
305            self.fromString(xmlString)
306
307        # if inputs passed in as dict, add these now
308        if inputs:
309            logging.info("Adding info to atom from input dict")
310            logging.debug(inputs)
311            self.__dict__.update(inputs)
312           
313            # NB, this doesn't trigger the Summary Property, so do this
314            # explicitly, if need be
315            if inputs.has_key('Summary'):
316                self.Summary = inputs.get('Summary')
317            if inputs.has_key('Content'):
318                self.Content = inputs.get('Content')
319            if inputs.has_key('author'):
320                name = inputs.get('author')
321                author = Person()
322                author.fromString(name)
323                self.author = author
324           
325            # also pass any moles data up to the moles entity object
326            if inputs.get('providerID'):
327                self.ME.providerID = inputs.get('providerID')
328               
329            if inputs.get('abbreviation'):
330                self.ME.abbreviation = inputs.get('abbreviation')
331
332        if self.atomTypeID:
333            self.atomTypeName = self.VTD.TERM_DATA[self.atomTypeID].title
334
335        logging.info("Atom initialised")
336
337
338    def addOnlineReferences(self, links):
339        '''
340        Add online reference data associated with the atom
341        - NB, care needs to be taken here since this data is stored in the atom
342        link elements and these are also used for the various atom associations
343        @param links: a Link or array of Links to add to the relatedLinks attribute
344        '''
345        logging.debug("Adding online references")
346        if not links:
347            return
348       
349        if type(links) is not list:
350            links = [links]
351       
352        # firstly clear out any online refs data from the existing related links
353        newLinks = []
354        for link in self.relatedLinks:
355            if link.isChildAtom():
356                newLinks.append(link)
357       
358        newLinks.extend(links)
359        self.relatedLinks = newLinks
360        logging.debug("Online references added")
361
362
363    def addUniqueRelatedLinks(self, links):
364        '''
365        Add links to relatedLinks array - if they are not already included
366        @param links: a Link or array of Links to add to the relatedLinks attribute
367        '''
368        self.addUniqueLinks(self.relatedLinks, links)
369       
370
371    def removeRelatedLinks(self, linksToDelete):
372        '''
373        Remove any links in the input list from the atom's related links list
374        @param linksToDelete: array of Link objects to remove from atom
375        '''
376        logging.debug("Removing related links from atom")
377        if not linksToDelete:
378            return
379       
380        if type(linksToDelete) is not list:
381            linksToDelete = [linksToDelete]
382       
383        updatedLinks = []
384        for link in self.relatedLinks:
385            if type(link) is not Link:
386                logging.warning("Link is not of 'Link' object type (type='%s') - skipping" %type(link))
387                continue
388            if link in linksToDelete:
389                logging.debug("- found link to remove")
390            else:
391                updatedLinks.append(link)
392
393        self.relatedLinks = updatedLinks
394        logging.debug("Links removed")
395       
396
397    def getDefaultCollectionPath(self):
398        '''
399        Determine the correct collection to use for the atom in eXist
400        '''
401        collectionPath = eXistConnector.BASE_COLLECTION_PATH + self.state
402       
403        if self.atomTypeID == VTD.DE_TERM:
404            collectionPath += eXistConnector.DE_COLLECTION_PATH
405        elif self.atomTypeID == VTD.GRANULE_TERM:
406            collectionPath += eXistConnector.GRANULE_COLLECTION_PATH
407        elif self.atomTypeID == VTD.ACTIVITY_TERM and \
408            self.subtypeID == VTD.DEPLOYMENT_TERM:
409            collectionPath += eXistConnector.DEPLOYMENTS_COLLECTION_PATH
410        else:
411            collectionPath += eXistConnector.DEPLOYMENT_COLLECTION_PATH
412       
413        if not self.ME.providerID:
414            raise AtomError("Error: cannot determine atom collection path because " + \
415                            "the provider ID is not defined")
416           
417        collectionPath += self.ME.providerID + "/"
418        return collectionPath
419
420
421    def __addAtomTypeDataXML(self, root):
422        '''
423        Add the atom type, and subtype data, if available, to atom categories
424        - and lookup and add the appropriate vocab term data
425        '''
426        if self.atomTypeID:
427            logging.info("Adding atom type info to XML output")
428            category = Category()
429            category.label = self.atomTypeID
430            # look up the appropriate vocab term data
431            category.scheme = self.VTD.getTermCurrentVocabURL(self.atomTypeID)
432            category.term = self.ATOM_TYPE
433            root.append(category.toXML())
434
435        if self.subtypeID:
436            logging.info("Adding atom subtype info to XML output")
437            # NB subtypes not all defined, so leave this out for the moment
438            category.label = self.subtypeID
439            # look up the appropriate vocab term data
440            category.scheme = self.VTD.getTermCurrentVocabURL(self.subtypeID)
441            category.term = self.ATOM_SUBTYPE
442            root.append(category.toXML())
443
444
445    def addMolesEntityData(self, abbreviation, provider_id, object_creation_time):
446        '''
447        Add data to include in the moles entity element
448        '''
449        logging.debug('Adding moles entity information')
450        self.ME.abbreviation = abbreviation
451        self.ME.providerID = provider_id
452        self.ME.createdDate = utilities.getISO8601Date(object_creation_time)
453        logging.debug('Moles entity information added')
454
455
456    def addAuthors(self, authors):
457        '''
458        Add author data appropriately to the atom
459        NB, these will overwrite any existing authors of the same type
460        @param authors: list of Person objects with the author data
461        '''
462        logging.debug('Adding authors data to Atom')
463        isFirstAuthor = {}
464        authorArray = None
465        for author in authors:
466            # NB, we're only allowed one atom author
467            if author.type == Person.AUTHOR_TYPE:
468                self.author = author
469                   
470                if isFirstAuthor.has_key(author.type):
471                    raise AtomError("Error: an atom can only have one author specified")
472                isFirstAuthor[author.type] = 1
473                continue
474            elif author.type == Person.CONTRIBUTOR_TYPE:
475                authorArray = self.contributors
476            elif author.type == Person.RESPONSIBLE_PARTY_TYPE:
477                authorArray = self.ME.responsibleParties
478               
479            # check if this is the first addition - if so, clear out the
480            # array in advance
481            if not isFirstAuthor.has_key(author.type):
482                logging.debug("Clearing out author array")
483                # NB, need to be careful to clear the array, not create a ref
484                # to a new array
485                del authorArray[:]
486                isFirstAuthor[author.type] = 1
487
488            if author.hasValue() and author not in authorArray:
489                logging.debug("Adding author (type:'%s', name:'%s', uri:'%s', role:'%s')" \
490                              %(author.type, author.name, author.uri, author.role))
491                authorArray.append(author)
492
493        logging.debug('Finished adding authors data')
494
495
496    def _isNewParameter(self, param):
497        '''
498        Check if a parameter is already specified in the atom, return False if
499        so, otherwise return True
500        '''
501        for p in self.parameters:
502            if p.term == param.term and \
503                p.scheme == param.scheme and \
504                p.label == param.label:
505                return False
506        return True
507
508
509    def addRelatedLinks(self, linkVals):
510        '''
511        Add related links in string format - converting to Link objects
512        NB, only add the link if it is unique
513       
514        @param linkVals: string of format, 'uri | title | vocabServerURL'
515        '''
516        link = self.objectify(linkVals, 'relatedLinks')
517        if link not in self.relatedLinks:
518            self.relatedLinks.append(link)
519
520
521    def addParameters(self, params):
522        '''
523        Add a parameter to list - ensuring it is unique and has been formatted and tidied appropriately
524        @params param: parameter, as string array, to add to atom parameters collection
525        '''
526        # avoid strings being parsed character by character
527        if type(params) is str:
528            params = [params]
529           
530        for param in params:
531            # firstly tidy parameter
532            param = utilities.tidyUpParameters(param)
533            category = Category()
534            # NB, data already tidied up here, so set keyword to avoid this happening again
535            category.fromString(param, escapeSpecialCharacters=True)
536
537            # now check for uniqueness
538            if self._isNewParameter(category):
539                logging.debug("Adding new parameter: %s" %param)
540                self.parameters.append(category)
541   
542   
543    def _linksToXML(self, root):
544        '''
545        Add required links to the input element
546        @param root: element to add links to - NB, should be the root element of the atom
547        '''
548        selfLink = ET.SubElement(root, "link")
549        selfLink.attrib["href"] = self.atomBrowseURL
550        selfLink.attrib["rel"] = "self"
551        if self.subtypeID != VTD.DEPLOYMENT_TERM:
552            molesLink = ET.SubElement(root, "link")
553            molesDoc = re.sub('ATOM','NDG-B1', self.atomBrowseURL)
554            molesLink.attrib["href"] = molesDoc
555            molesLink.attrib["rel"] = 'related'
556       
557        for relatedLink in self.relatedLinks:
558            if relatedLink.hasValue():
559                root.append(relatedLink.toXML())
560   
561    def toXML(self):
562        '''
563        Convert the atom into XML representation and return this
564        @return: xml version of atom
565        '''
566        logging.info("Creating formatted XML version of Atom")
567        root = ET.Element("entry")
568        root.attrib["xmlns"] = "http://www.w3.org/2005/Atom"
569        root.attrib["xmlns:moles"] = "http://ndg.nerc.ac.uk/schema/moles2beta"
570        root.attrib["xmlns:georss"] = "http://www.georss.org/georss/10"
571        root.attrib["xmlns:gml"] = "http://www.opengis.net/gml"
572        id = ET.SubElement(root, "id")
573        id.text = self.atomID
574        title = ET.SubElement(root, "title")
575        title.text = self.title
576        self._linksToXML(root)
577
578        if self.author and self.author.hasValue():
579            root.append(self.author.toXML())
580           
581        for contributor in self.contributors:
582            root.append(contributor.toXML())
583
584        # add parameters data
585        for param in self.parameters:
586            if param.hasValue():
587                root.append(param.toXML())
588
589        # add the type and subtype data
590        self.__addAtomTypeDataXML(root)
591                   
592        summary = ET.SubElement(root, "summary")
593        summary.text = self.Summary
594                   
595        # add link to content, if required - NB, can only have one content element in atom
596        # - and this is mandatory
597        content = ET.SubElement(root, "content")
598        contentFile = self.contentFile or self.csmlFile or self.cdmlFile
599        if contentFile:
600            content.attrib["type"] = "application/xml"
601            content.attrib["src"] = contentFile
602        else:
603            content.attrib["type"] = "xhtml"
604            div = ET.SubElement(content, 'div')
605            div.attrib["xmlns"] = "http://www.w3.org/1999/xhtml"
606            div.text = self.Content
607       
608        # if there's a published date already defined, assume we're doing an update now
609        # NB, update element is mandatory
610        currentDate = datetime.datetime.today().strftime("%Y-%m-%dT%H:%M:%SZ")
611        if not self.publishedDate:
612            self.publishedDate = currentDate
613
614        updated = ET.SubElement(root, "updated")
615        if not self.updatedDate:
616            self.updatedDate = currentDate
617        updated.text = self.updatedDate
618
619        published = ET.SubElement(root, "published")
620        published.text = self.publishedDate
621
622        # add the moles entity section, if it is required
623        if self.ME:
624            root.append(self.ME.toXML())
625
626        # add temporal range data, if available
627        temporalRange = ET.SubElement(root, "moles:temporalRange")
628        if self.t1:
629            temporalRange.text = self.t1
630            if self.t2:
631                temporalRange.text += "/" + self.t2
632
633        # add spatial range data, if available
634        self._addSpatialData(root)
635
636        tree = ET.ElementTree(root)
637        logging.info("XML version of Atom created")
638        return tree
639
640
641    def __getSummary(self):
642        logging.debug("Getting summary data")
643        summaryString = ""
644        for summary_line in self.summary:
645            summaryString += summary_line + "\n"
646
647        return summaryString
648
649    def __setSummary(self, summary):
650        logging.debug("Adding summary data")
651        self.summary = []
652        for summary_line in summary.split('\n'):
653            self.summary.append(utilities.escapeSpecialCharacters(summary_line))
654           
655    Summary = property(fset=__setSummary, fget=__getSummary, doc="Atom summary")
656
657
658    def __getContent(self):
659        logging.debug("Getting content data")
660        contentString = ""
661        # NB, there must be content specified in an atom
662        if not self.content:
663            return "Metadata document"
664       
665        for content_line in self.content:
666            contentString += content_line + "\n"
667
668        return contentString
669
670    def __setContent(self, content):
671        logging.debug("Adding content data")
672        self.content = []
673        for content_line in content.split('\n'):
674            self.content.append(content_line)
675           
676    Content = property(fset=__setContent, fget=__getContent, doc="Atom content")
677
678           
679    def fromString(self, xmlString):
680        '''
681        Initialise Atom object using an xmlString
682        @param xmlString: representation of atom as an XML string
683        '''
684        logging.info("Ingesting data from XML string")
685       
686        # firstly, remove any namespaces used - to avoid problems with elementtree
687        logging.debug("Stripping moles namespace from string to allow easy handling with elementtree")
688        xmlString = xmlString.replace('moles:', '')
689        xmlString = xmlString.replace('georss:', '')
690        xmlString = xmlString.replace('gml:', '')
691        xmlString = xmlString.replace('xmlns="http://www.w3.org/2005/Atom"', '')
692        xmlString = xmlString.replace('default:', '')
693        xmlString = xmlString.replace('xs:', '')
694
695        # now create elementtree with the XML string
696        logging.debug("Create elementtree instance with XML string")
697        tree = ET.fromstring(xmlString)
698       
699        title = tree.findtext('title')
700        if title:
701            logging.debug("Adding title data")
702            self.title = title
703
704        summary = tree.findtext('summary')
705        if summary:
706            self.Summary = summary#.decode('unicode_escape')
707
708        authorElement = tree.find('author')
709        if authorElement:
710            logging.debug("Adding author data")
711            author = Person()
712            author.fromETElement(authorElement)
713            self.author = author
714
715        contributorElements = tree.findall('contributor')
716        for contributorElement in contributorElements:
717            logging.debug("Adding contributor data")
718            contributor = Person(personType = Person.CONTRIBUTOR_TYPE)
719            contributor.fromETElement(contributorElement)
720            self.contributors.append(contributor)
721
722        molesElement = tree.find('entity')
723        if molesElement:
724            self.ME.fromET(molesElement)
725               
726        self.atomID = tree.findtext('id')
727
728        self._parseCategoryData(tree.findall('category'))
729
730        self._parseLinksData(tree.findall('link'))
731           
732        contentTag = tree.find('content')
733        if contentTag != None:
734            logging.debug("Found content tag - checking for CSML/CDML file data")
735            file = contentTag.attrib.get('src')
736            if file:
737                # NB, the path will reveal more reliably whether we're dealing with CSML and CDML files
738                if file.upper().find('CSML') > -1:
739                    logging.debug("Adding CSML file data")
740                    self.csmlFile = file
741                elif file.upper().find('CDML') > -1:
742                    logging.debug("Adding CDML file data")
743                    self.cdmlFile = file
744                self.contentFile = file
745            else:
746                logging.debug("No file data - adding contents of element instead")
747                div = contentTag.find('{http://www.w3.org/1999/xhtml}div')
748                self.Content = div.text
749       
750        range = tree.findtext('temporalRange')
751        if range:
752            logging.debug("Adding temporal range data")
753            timeData = range.split('/')
754            self.t1 = timeData[0]
755            if len(timeData) > 1:
756                self.t2 = timeData[1]
757       
758        # NB, this parser won't mind if we're dealing with Envelope or EnvelopeWithTimePeriod
759        minBBox = tree.findall('.//lowerCorner')
760        if minBBox:
761            logging.debug("Adding min spatial range data")
762            minBBox = minBBox[0]
763            spatialData = minBBox.text.split()
764            self.minX = spatialData[0]
765            if len(spatialData) > 1:
766                self.minY = spatialData[1]
767       
768        maxBBox = tree.findall('.//upperCorner')
769        if maxBBox:
770            maxBBox = maxBBox[0]
771            logging.debug("Adding max spatial range data")
772            spatialData = maxBBox.text.split()
773            self.maxX = spatialData[0]
774            if len(spatialData) > 1:
775                self.maxY = spatialData[1]
776               
777        publishedDate = tree.findtext('published')
778        if publishedDate:
779            logging.debug("Adding published date")
780            self.publishedDate = publishedDate
781               
782        updatedDate = tree.findtext('updated')
783        if updatedDate:
784            logging.debug("Adding updated date")
785            self.updatedDate = updatedDate
786           
787        logging.info("Completed data ingest")
788   
789   
790    def _parseCategoryData(self, categories):
791        logging.debug("Adding category/parameters data")
792        for category in categories:
793            cat = Category()
794            cat.fromETElement(category)
795           
796            if cat.term == self.ATOM_TYPE:
797                logging.debug("Found atom type data")
798                self.atomTypeID = cat.label
799                self.atomTypeName = self.VTD.TERM_DATA[cat.label].title
800                continue
801            elif cat.term == self.ATOM_SUBTYPE:
802                logging.debug("Found atom subtype data")
803                self.subtypeID = cat.label
804                self.subtype = cat.scheme
805                continue
806
807            self.parameters.append(cat)
808   
809
810    def setDatasetID(self, datasetID):
811        '''
812        Set the dataset ID for the atom - and generate an appropriate atom name using this
813        @param datasetID: ID to set for the atom
814        '''
815        self.datasetID = datasetID
816        self._generateAtomName(datasetID) 
817        self.atomID = self.createAtomID(datasetID)
818
819
820    def createAtomID(self, datasetID):
821        '''
822        Create a unique ID, conforming to atom standards, for atom
823        NB, see http://diveintomark.org/archives/2004/05/28/howto-atom-id
824        @param datasetID: ID of atom's dataset
825        @return: unique ID
826        '''
827        logging.info("Creating unique ID for atom")
828        if not self.atomBrowseURL:
829            self._generateAtomName(datasetID)
830        urlBit = self.atomBrowseURL.split('://')[1]
831        urlBit = urlBit.replace('#', '')
832        urlBits = urlBit.split('/')
833        host = urlBits[0].split(':')[0] # avoid the port colon - as this breaks the ID format
834        dateBit = datetime.datetime.today().strftime("%Y-%m-%d")
835       
836        id = "tag:" + host + "," + dateBit + ":/" + "/".join(urlBits[1:])
837        logging.info("- unique ID created for atom")
838        logging.debug(" - '%s'" %id)
839        return id
840       
841       
842    def _generateAtomName(self, datasetID):
843        '''
844        Generate a consistent name for the atom - with full eXist doc path
845        @param datasetID: ID of atom's dataset
846        '''
847        self.atomName = datasetID + ".atom"
848        self.ndgURI = self.ME.providerID + "__ATOM__" + datasetID
849        self.atomBrowseURL = VTD.BROWSE_ROOT_URL + self.ndgURI
850
851
852    def _parseLinksData(self, links):
853        '''
854        Extract links and atom data from array of link elements in the XML representation of the atom
855        @param links: an array of <link> elements
856        '''
857        # firstly, get all data to start with, so we can properly process it afterwards
858        linkData = {}
859        logging.debug("Getting link data")
860        for linkTag in links:
861            link = Link()
862            link.fromETElement(linkTag)
863
864            if not linkData.has_key(link.rel):
865                linkData[link.rel] = []
866           
867            linkData[link.rel].append(link)
868
869        # there should be one self referencing link - which will provide info on the atom itself
870        if not linkData.has_key('self'):
871            errorMessage = "Atom does not have self referencing link - " + \
872                "cannot ascertain datasetID without this - please fix"
873            logging.error(errorMessage)
874            raise ValueError(errorMessage)
875       
876        # this is the link describing the atom itself
877        self.atomBrowseURL = linkData['self'][0].href
878       
879        self.datasetID = self.atomBrowseURL.split("__ATOM__")[-1]
880        self.atomName = self.datasetID + ".atom"
881        self.ndgURI = self.atomBrowseURL.split(VTD.BROWSE_ROOT_URL)[1]
882       
883        # now remove this value and the associated moles doc link
884        del linkData['self']
885        molesDoc = self.atomBrowseURL.replace('ATOM', 'NDG-B1')
886        if linkData.has_key('related'):
887            relatedLinks = []
888            for link in linkData['related']:
889                if link.href != molesDoc:
890                    relatedLinks.append(link)
891           
892            linkData['related'] = relatedLinks
893               
894        # now add the remaining links to the atom
895        for key in linkData:
896            for link in linkData[key]:
897                logging.debug("Adding link data")
898                self.relatedLinks.append(link)
899       
900
901    def _addSpatialData(self, element):
902        '''
903        Add spatial coverage element to an input element
904        @param element: element to add coverage data to
905        '''
906        logging.info("Adding spatial data to Atom")
907        if not self.minX:
908            logging.info("No spatial data specified")
909            return
910        bbox = ET.SubElement(element, "georss:where")
911        envelope = ET.SubElement(bbox, "gml:Envelope")
912        lc = ET.SubElement(envelope, "gml:lowerCorner")
913        lc.text = str(self.minX) + " " + str(self.minY)
914        uc = ET.SubElement(envelope, "gml:upperCorner")
915        uc.text = str(self.maxX) + " " + str(self.maxY)
916
917       
918    def setAttribute(self, attributeName, attributeValue):
919        '''
920        Set the value of an atom attribute - and do some basic tidying up of the string content
921        - to escape any XML unfriendly characters
922        @param attributeName: name of the attribute whose value to set
923        @param attributeValue: value to set the attribute to 
924        '''
925        logging.debug("Setting attribute, %s, to %s" %(attributeName, attributeValue))
926        origValue = attributeValue
927       
928        # escape any special characters if a value has been specified
929        # NB, need to cope with both single values and arrays
930        if attributeValue:
931            if type(attributeValue) is list:
932                newVals = []
933                for val in attributeValue:
934                    newVals.append(self.objectify(utilities.escapeSpecialCharacters(val), attributeName))
935                attributeValue = newVals
936                   
937            else:
938                attributeValue = self.objectify(utilities.escapeSpecialCharacters(attributeValue), attributeName)
939
940        # handle the special case of authors; only one author is allowed per atom
941        # - the others should be treated as contributors
942        if attributeName == "authors":
943            setattr(self, "author", attributeValue[0])
944            if len(attributeValue) > 1:
945                setattr(self, "contributors", attributeValue[1:])
946        elif attributeName == "atomAuthors":
947            if isinstance(attributeValue, list):
948                for val in attributeValue:
949                    self.ME.responsibleParties.append(val)
950            else:
951                self.ME.responsibleParties.append(attributeValue)
952        elif attributeName == "files":
953            self.addUniqueRelatedLinks(attributeValue)
954        else:
955            setattr(self, attributeName, attributeValue)
956
957
958    def objectify(self, objectVals, attributeName):
959        '''
960        Some inputs are specified as strings but need to be converted into
961        objects - do this here
962        @param objectVals: a '|' delimited string of values
963        @param attributeName: name of attribute the values belong to
964        '''
965        obj = None
966        if type(objectVals) != str:
967            return objectVals
968       
969        if attributeName == "relatedLinks":
970            obj = Link()
971        elif attributeName == "atomAuthors" or attributeName == "authors":
972            # NB, ensure there is only one author tag - extra authors are contributors
973            authorType = Person.AUTHOR_TYPE
974            if self.author and self.author.hasValue():
975                authorType= Person.CONTRIBUTOR_TYPE
976            obj = Person(personType = authorType)
977        elif attributeName == 'files':
978            obj = Link()
979            objectVals = '%s|%s|%s' \
980                %(self.VTD.getTermCurrentVocabURL(VTD.METADATA_SOURCE_TERM), objectVals, VTD.METADATA_SOURCE_TERM)
981
982        if obj:
983            obj.fromString(objectVals)
984            # NB, need to set it now, just in case we don't set it before coming back
985            if attributeName == "authors" and (not self.author or not self.author.hasValue()):
986                self.author = obj
987            return obj
988       
989        return objectVals
990
991
992    def toPrettyXML(self):
993        '''
994        Returns nicely formatted XML as string
995        '''
996        atomXML = self.toXML()
997
998        # create the string
999        logging.debug("Converting the elementtree object into a string")
1000        prettyXML = et2text(atomXML.getroot())
1001
1002        # add XML version tag
1003        prettyXML = "<?xml version=\"1.0\" encoding=\"utf-8\" ?>\n" + prettyXML
1004        logging.info("Created formatted version of XML object")
1005        return prettyXML
1006
1007
1008    def getLinksOfType(self, termID):
1009        '''
1010        Returns links in the atom related links attribute which match the specified
1011        term ID
1012        @param termID: the termID to look for in the related links - NB, this is
1013        matched to the end of the link.rel value
1014        @return links: array of Link objects with matching term type
1015        '''
1016        logging.debug("Getting atom links of type, '%s'" %termID)
1017        matchingLinks = []
1018        for link in self.relatedLinks:
1019            # firstly, handle special case where we only want the online ref type links
1020            # returned
1021            if termID == self.ONLINE_REF_LABEL:
1022                if not link.isChildAtom():
1023                    logging.debug("- found link with matching term type")
1024                    matchingLinks.append(link)
1025               
1026            elif link and link.rel and link.rel.lower().endswith(termID.lower()):
1027                logging.debug("- found link with matching term type")
1028                matchingLinks.append(link)
1029               
1030        logging.debug("Returning matched links")
1031        return matchingLinks
1032       
1033       
1034    def getLogos(self):
1035        '''
1036        Return related links that are logos
1037        @return: array of Links containing the logos for the atom
1038        '''
1039        logos = []
1040        for link in self.relatedLinks:
1041            if link.rel.lower().endswith(VTD.LOGO_TERM.lower()):
1042                logos.append(link)
1043               
1044        return logos
1045   
1046   
1047    def isGranule(self):
1048        if self.atomTypeID == VTD.GRANULE_TERM:
1049            return True
1050        return False
1051   
1052   
1053    def isDE(self):
1054        if self.atomTypeID == VTD.DE_TERM:
1055            return True
1056        return False
1057   
1058    def isDeployment(self):
1059        if self.subtypeID and self.subtypeID == VTD.DEPLOYMENT_TERM:
1060            return True
1061        return False
1062   
1063    def isDeployable(self):
1064        if (self.atomTypeID == VTD.ACTIVITY_TERM and self.subtypeID != VTD.DEPLOYMENT_TERM) or \
1065            self.atomTypeID == VTD.DPT_TERM or \
1066            self.atomTypeID == VTD.OBS_TERM:
1067            return True
1068        return False
1069       
1070       
1071    def addCSMLData(self, csmlName, csmlContent, aggregateCoverage=False, useCSMLID=False):
1072        '''
1073        Parse CSML data and add extracted info to the atom
1074        @param csmlName: name of the csml file
1075        @param csmlContent: content of the csml file - NB, if this is set to None and the
1076        file, csmlName, is available locally, CsmlParser.Dataset will read in the file
1077        directly
1078        @keyword aggregateCoverage: if set to True, only coverage data that extends the
1079        atom coverage data will be added
1080        @keyword useCSMLID: if True, use the CSML doc ID as the dataset ID - NB,
1081        this should only be True if creating a new atom - e.g. from a granulite
1082        @return csmlDoc: the CsmlParser.Dataset object with the csml data in
1083        '''
1084        logging.info("Creating CSML data model")
1085        self.csmlFile = csmlName
1086        self.contentFile = csmlName
1087        content = csmlContent or csmlName
1088   
1089        csmlDoc = CsmlParser.Dataset(file=content)
1090       
1091        logging.info("Extracting info from CSML file")
1092        logging.debug("Got dataset ID: %s" %csmlDoc.id)
1093        if useCSMLID:
1094            logging.debug(" - using this ID for the atom")
1095            self.setDatasetID(VTD.GRANULE_TERM + '_' + csmlDoc.id)
1096       
1097        title = csmlDoc.name.CONTENT
1098        logging.debug("Got dataset name (title): '%s'" %title)
1099        # NB, if a title is specified (and not as the default value), it automatically is used in
1100        # place of anything in the granulite file
1101        if title and title != "NAME OF DATASET GOES HERE":
1102            logging.info("Title, '%s', extracted from CSML file" %title)
1103            if self.title:
1104                logging.info("- NB, this will override the title specified in the granulite file ('%s')" \
1105                             %self.title)
1106            self.title = title
1107               
1108        bbox1 = csmlDoc.getBoundingBox()
1109        bbox2 = csmlDoc.getCSMLBoundingBox()
1110
1111        time = None
1112        if bbox2:
1113            time = bbox2.getTimeLimits()
1114   
1115        # now check for other parameters to add to granule
1116        # Firstly, extract the bounding envelope
1117        if bbox1:
1118            w, e = utilities.normaliseLongitude(bbox1[0],bbox1[2])
1119            n, s = (bbox1[3], bbox1[1])
1120   
1121            if not aggregateCoverage or (not self.maxY or float(n) > float(self.maxY)):
1122                self.maxY = n
1123               
1124            if not aggregateCoverage or (not self.minY or float(s) < float(self.minY)):
1125                self.minY = s
1126           
1127            if not aggregateCoverage or (not self.minX or float(w) < float(self.minX)):
1128                self.minX = w
1129   
1130            if not aggregateCoverage or (not self.maxX or float(e) > float(self.maxX)):
1131                self.maxX = e
1132           
1133            logging.debug("Got bounding box data from file: (%s, %s) , (%s, %s)" \
1134                          %(w, s, e, n))
1135           
1136            logging.debug("Updated atom bounding box data: (%s, %s) , (%s, %s)" \
1137                          %(self.minX, self.minY, self.maxX, self.maxY))
1138        else:
1139            logging.debug("No valid bounding box data found")
1140   
1141        if time:
1142            t1 = utilities.formatDateYYYYMMDD(time[0])
1143            if not aggregateCoverage or \
1144                (not self.t1 or datetime.datetime.strptime(t1, YEAR_FORMAT) < \
1145                    datetime.datetime.strptime(self.t1, YEAR_FORMAT)):
1146                self.t1 = t1
1147   
1148            t2 = time[1]
1149            if t2 and t2 != 'None':
1150                t2 = utilities.formatDateYYYYMMDD(t2)
1151                if not aggregateCoverage or \
1152                    (not self.t2 or datetime.datetime.strptime(t2, YEAR_FORMAT) > \
1153                        datetime.datetime.strptime(self.t2, YEAR_FORMAT)):
1154                    self.t2 = t2
1155           
1156            logging.debug("Got time range: %s -> %s" %(self.t1, self.t2))
1157        else:
1158            logging.debug("No valid time range data found")
1159   
1160        #create parameter summaries:
1161        #set up list to hold the parameters data
1162        parameters = []
1163        for feature in csmlDoc.featureCollection.featureMembers:
1164            if hasattr(feature.parameter, 'href'):
1165                paramTriple = ""
1166                if hasattr(feature, 'description'):
1167                    paramTriple = feature.description.CONTENT
1168                    paramTriple += " | " + feature.parameter.href
1169                   
1170                    term = ""
1171                    if hasattr(feature, 'name'):
1172                        term = feature.name.CONTENT
1173   
1174                    paramTriple += " | " + term
1175                   
1176                    logging.debug("Got parameter info: %s" %paramTriple)
1177                    parameters.append(paramTriple)
1178       
1179        # update the atom with the extracted parameters
1180        logging.info("Adding CSML parameters to granule atom")
1181        self.addParameters(parameters)
1182        logging.info("Finished adding CSML data")
1183        return csmlDoc
1184
1185
1186    def lookupAssociatedData(self, type, dr, lookupIndirectReferences=False):
1187        '''
1188        Check through the atom links and retrieve any associated data of the
1189        specified type
1190        @param type: type of associated data to lookup - currently VTD.DEPLOYMENT_TERM
1191        or VTD.DE_TERM
1192        @param dr: Instance of DocumentRetrieve object - NB, this requires eXist
1193        config details which are not available to the Atom object
1194        @keyword lookupIndirectReferences: if True, the atom ID is used to search
1195        defined deployments to find those which reference it, otherwise only
1196        deployments data featured in the atom related links are processed
1197        '''
1198        logging.info("Looking up %s info" %type)
1199       
1200        self.allActivities = []
1201        self.allObs = []
1202        self.allDpts = []
1203
1204        if type != VTD.DE_TERM and type != VTD.DEPLOYMENT_TERM:
1205            raise ValueError('Unrecognised associated data type: %s' %type)
1206       
1207        # avoid duplicating lookup effort
1208        if (type == VTD.DEPLOYMENT_TERM and self.deployments) or \
1209            (type == VTD.DE_TERM and self.dataEntities):
1210            logging.info("- this info has already been looked up - returning")
1211            return
1212
1213        # firstly, collect all the references to the info required
1214        if lookupIndirectReferences:
1215            logging.info("Looking up indirect references")
1216           
1217            # if we're looking up DE data for deployments data, need to have the
1218            # deployments info looked up first
1219            if type == VTD.DE_TERM and self.isDeployable() and not self.deployments:
1220                self.lookupAssociatedData(VTD.DEPLOYMENT_TERM, dr, lookupIndirectReferences)
1221           
1222            logging.info("Looking up references to this atom from other %s" %type)
1223           
1224            # NB, if we're looking up deployments info, we only look up references
1225            # to this atom - if we're looking up DEs, we need to look up references
1226            # to the deployments referenced by this atom
1227            urls = [self.atomBrowseURL]
1228           
1229            if type == VTD.DE_TERM and self.isDeployable():
1230                urls = []
1231                for dep in self.deployments:
1232                    urls.append(dep.browseURL)
1233                   
1234            links = []
1235            for url in urls:
1236                doc = dr.get(type, dr.ATOM_TYPE, url, \
1237                             targetCollection = eXistConnector.BASE_COLLECTION_PATH)
1238                # now need to turn this results set into actual atoms
1239                tree = ET.fromstring(doc)
1240                for atom in tree:
1241                    logging.debug("- found reference in %s" %type)
1242                    links.append(ET.tostring(atom))
1243                   
1244            logging.info("Finished looking up indirect references")
1245        else:
1246            links = self.getLinksOfType(self.VTD.DEPLOYMENT_TERM)
1247
1248        # now retrieve the references and extract the required data
1249        logging.info("Retrieving info from %s references" %type)
1250        if type == VTD.DEPLOYMENT_TERM:
1251            self.deployments = []
1252            for link in links:
1253                if lookupIndirectReferences:
1254                    deploymentAtom = link
1255                else:
1256                    localID = link.href.split("__ATOM__")[-1]
1257                    deploymentAtom = dr.get(self.ME.providerID, 'ATOM', localID, \
1258                                            targetCollection = eXistConnector.BASE_COLLECTION_PATH)
1259   
1260                deployment = Deployment.Deployment(Atom(xmlString=str(deploymentAtom)))
1261                self.deployments.append(deployment)
1262               
1263                self.addUniqueLinks(self.allActivities, deployment.activities)
1264                self.addUniqueLinks(self.allObs, deployment.obs)
1265                self.addUniqueLinks(self.allDpts, deployment.dpts)
1266        else:
1267            # for DE data, just store the title + link in a Link object
1268            self.dataEntities = []
1269            for data in links:
1270                atom = Atom(xmlString=str(data))
1271                link = Link()
1272                link.title = atom.title
1273                link.href = atom.atomBrowseURL
1274               
1275                # NB, different deployments may be used by the same DE - so
1276                # avoid duplication
1277                self.addUniqueLinks(self.dataEntities, link)
1278           
1279        logging.info("Finished looking up %s info" %type)
1280
1281
1282    def addUniqueLinks(self, dataArray, links):
1283        '''
1284        Add links to specified array - if they are not already included
1285        @param dataArray: a list, potentially arlready containing links
1286        @param links: a Link or array of Links to add to the dataArray
1287        '''
1288        logging.debug("Adding new links")
1289        if not links:
1290            return
1291       
1292        if type(links) is not list:
1293            links = [links]
1294       
1295        for link in links:
1296            if type(link) is not Link:
1297                logging.warning("Link is not of 'Link' object type (type='%s') - skipping" %type(link))
1298                continue
1299            if link not in dataArray:
1300                logging.debug("- adding unique link")
1301                dataArray.append(link)
1302        logging.debug("Finished adding links")
Note: See TracBrowser for help on using the repository browser.