source: exist/trunk/python/ndgUtils/models/Atom.py @ 4555

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/exist/trunk/python/ndgUtils/models/Atom.py@4555
Revision 4555, 47.7 KB checked in by cbyrom, 11 years ago (diff)

Move existbdclient to lib package; extend it to make use of DocumentRetrieve so that atoms can be retrieved by ID; fix handling of
authors vs contributors when doing Atom-to-XML exports.

Line 
1'''
2 Class representing data in atom format - allowing moles data to be stored and accessed in a web-feed-compatible way
3 
4 @author: C Byrom, Tessella Jun 2008
5'''
6try: #python 2.5
7    from xml.etree import cElementTree as ET
8except ImportError:
9    try:
10        # if you've installed it yourself it comes this way
11        import cElementTree as ET
12    except ImportError:
13        # if you've egged it this is the way it comes
14        from ndgUtils.elementtree import cElementTree as ET
15import sys, logging, re, datetime
16from ndgUtils.eXistConnector import eXistConnector
17from ndgUtils.ETxmlView import et2text
18from ndgUtils.lib.utilities import getTripleData, escapeSpecialCharacters, \
19    tidyUpParameters, getISO8601Date, normaliseLongitude, formatDateYYYYMMDD
20from ndgUtils.vocabtermdata import VocabTermData as VTD
21from ndgUtils.models import MolesEntity as ME
22import csml.parser as CsmlParser
23from ndgUtils.models import Deployment as Deployment
24
25
class AtomError(Exception):
    """
    Error type raised by the Atom model classes.

    The message is written to the error log when the exception object is
    created, so it is recorded even if a caller later swallows the exception.
    """
    def __init__(self, msg):
        # log first, then hand the message on to the base Exception
        logging.error(msg)
        super(AtomError, self).__init__(msg)
33
34
class Person(object):
    '''
    Class representing atom author type data - with name, uri and role attributes

    The same class is used for atom authors, atom contributors and moles
    responsible parties; the personType passed to the constructor selects the
    element name used by toXML() and the tag used to hold the uri value.
    '''
    AUTHOR_TYPE = 0
    CONTRIBUTOR_TYPE = 1
    RESPONSIBLE_PARTY_TYPE = 2
    # element names, indexed by the type constants above
    ELEMENT_NAMES = ["author", "contributor", "responsibleParty"]

    def __init__(self, personType = AUTHOR_TYPE, namespace = None):
        '''
        @keyword personType: one of AUTHOR_TYPE, CONTRIBUTOR_TYPE or
        RESPONSIBLE_PARTY_TYPE
        @keyword namespace: optional prefix to use for the elements produced
        by toXML(); ignored for responsible parties, which always use 'moles'
        '''
        self.type = personType
        self.ns = namespace
        self.name = ""
        self.uri = ""
        self.role = ""

        # NB, the atom format specifies slightly different data contents
        self.uriTagName = "email"
        # NB, responsible party data is always stored in the moles section
        if self.type == self.RESPONSIBLE_PARTY_TYPE:
            self.ns = 'moles'
            self.uriTagName = "uri"

    def __str__(self):
        '''
        @return: 'name | uri | role' triple string, or an empty string if
        none of the three values are set
        '''
        if self.name or self.uri or self.role:
            return self.name + " | " + self.uri + " | " + self.role
        return ""

    def fromString(self, personString):
        '''
        Populate name, uri and role from a triple string
        @param personString: string to pass to getTripleData
        '''
        (self.name, self.uri, self.role) = getTripleData(personString)

    def fromETElement(self, personTag):
        '''
        Populate name, role and uri from the child elements of an element
        @param personTag: elementtree element with (optional) name, role and
        email/uri child elements; missing children are stored as ""
        '''
        self.name = personTag.findtext('name') or ""
        self.role = personTag.findtext('role') or ""
        self.uri = personTag.findtext(self.uriTagName) or ""
        logging.debug("Added name: '%s', role: '%s', %s: '%s'" \
                      %(self.name, self.role, self.uriTagName, self.uri))

    def toXML(self):
        '''
        @return: elementtree element representing the person; sub elements
        are only created for the values that are set
        '''
        prefix = ""
        if self.ns:
            prefix = self.ns + ':'

        author = ET.Element(prefix + self.ELEMENT_NAMES[self.type])

        for tagName, value in (("name", self.name),
                               (self.uriTagName, self.uri),
                               ("role", self.role)):
            if value:
                ET.SubElement(author, prefix + tagName).text = value

        return author

    def __eq__(self, other):
        '''
        Value equality - two people are equal if their uri, name, role and
        type all match.  This is what makes 'if person in personArray' work;
        __cmp__ alone is ignored by Python 3.
        NB, no __hash__ is defined: Python 2 keeps the inherited identity
        hash, whilst Python 3 treats these mutable objects as unhashable.
        '''
        if self is other:
            return True
        try:
            return self.uri == other.uri and self.name == other.name and \
                self.role == other.role and self.type == other.type
        except AttributeError:
            # not a person like object (e.g. None) - so cannot be equal
            return False

    def __ne__(self, other):
        return not self.__eq__(other)

    def __cmp__(self, person1):
        '''
        Override comparison to allow proper object comparison when checking
        if Person objects are in an array already - i.e. if person in personArray...
        NB, retained for Python 2 callers using cmp(); only 0 (equal) is
        meaningful - the non-zero values do not define an ordering
        '''
        if not person1:
            return -1

        if self.__eq__(person1):
            return 0
        return 1
108
109
class Link(object):
    '''
    Class representing an atom link - with href, title and rel attributes
    '''

    def __init__(self):
        self.href = ""
        self.title = ""
        self.rel = ""

    def fromString(self, linkString):
        '''
        Populate href, title and rel from a triple string
        @param linkString: string to pass to getTripleData
        '''
        (self.href, self.title, self.rel) = getTripleData(linkString)

    def fromETElement(self, linkTag):
        '''
        Populate the link from the attributes of a link element
        @param linkTag: elementtree element; missing attributes are stored as ""
        '''
        self.href = linkTag.attrib.get('href') or ""
        self.rel = linkTag.attrib.get('rel') or ""
        self.title = linkTag.attrib.get('title') or ""

    def toXML(self):
        '''
        @return: elementtree 'link' element with href, title and rel attributes
        '''
        link = ET.Element("link")
        link.attrib["href"] = self.href
        link.attrib["title"] = self.title
        link.attrib["rel"] = self.rel
        return link

    def hasValue(self):
        '''
        @return True if the link has an href or a title; False otherwise
        '''
        # NB, just a rel on its own is meaningless - so ignore
        return bool(self.href or self.title)

    def __str__(self):
        '''
        @return: 'href | title | rel' triple string, or an empty string if
        none of the three values are set
        '''
        if self.href or self.title or self.rel:
            return self.href + " | " + self.title + " | " + self.rel
        return ""

    def isChildAtom(self):
        '''
        Determines whether the link refers to another atom - e.g. a link to
        a data granule
        @return True, if so; False otherwise
        '''
        atomTerms = (VTD.GRANULE_TERM, VTD.DEPLOYMENT_TERM, VTD.ACTIVITY_TERM,
                     VTD.DPT_TERM, VTD.OBS_TERM)
        for term in atomTerms:
            if self.rel.endswith(term):
                return True

        return False

    def __eq__(self, other):
        '''
        Value equality - two links are equal if their href, title and rel
        all match.  This is what makes 'if link in linkArray' work; __cmp__
        alone is ignored by Python 3.
        NB, no __hash__ is defined: Python 2 keeps the inherited identity
        hash, whilst Python 3 treats these mutable objects as unhashable.
        '''
        if self is other:
            return True
        try:
            return self.href == other.href and self.title == other.title and \
                self.rel == other.rel
        except AttributeError:
            # not a link like object (e.g. None) - so cannot be equal
            return False

    def __ne__(self, other):
        return not self.__eq__(other)

    def __cmp__(self, link1):
        '''
        Override comparison to allow proper object comparison when checking
        if Link objects are in an array already - i.e. if link in linkArray...
        NB, retained for Python 2 callers using cmp(); only 0 (equal) is
        meaningful - the non-zero values do not define an ordering
        '''
        if not link1:
            return -1

        if self.__eq__(link1):
            return 0
        return 1
175
176
class Category(object):
    '''
    An atom category element - holds the term, scheme and label attribute
    values, all defaulting to empty strings
    '''
    def __init__(self):
        self.term = self.scheme = self.label = ""

    def fromString(self, linkString, escapeSpecialCharacters=True):
        '''
        Populate the category from a triple string of format,
        'label | scheme | term'
        @param linkString: triple string to create category with
        @keyword escapeSpecialCharacters: passed to getTripleData as its
        doEscape keyword - if True (default), special characters in the
        triple string are escaped
        '''
        triple = getTripleData(linkString, doEscape=escapeSpecialCharacters)
        (self.label, self.scheme, self.term) = triple

    def fromETElement(self, linkTag):
        '''
        Populate the category from the attributes of a category element
        @param linkTag: elementtree element; missing attributes are stored as ""
        '''
        attributes = linkTag.attrib
        self.term = attributes.get('term') or ""
        self.label = attributes.get('label') or ""
        self.scheme = attributes.get('scheme') or ""

    def toXML(self):
        '''
        @return: elementtree 'category' element with term, scheme and label
        attributes
        '''
        element = ET.Element("category")
        for key, value in (("term", self.term),
                           ("scheme", self.scheme),
                           ("label", self.label)):
            element.attrib[key] = value
        return element

    def hasValue(self):
        '''
        @return True if any of scheme, label or term is set; False otherwise
        '''
        return bool(self.scheme or self.label or self.term)
212
213
214class Atom(object):
215
216    # labels for use with the atom categories
217    ATOM_TYPE = "ATOM_TYPE"
218    ATOM_SUBTYPE = "ATOM_SUBTYPE"
219
220    # labels for use with the templates to set/extract specific inputs
221    ONLINE_REF_LABEL = "online_ref"
222    PARAMETER_LABEL = "parameter"
223    ATOM_REF_LABEL = "atom_ref"
224    DELIMITER = "---"
225    REMOVE_LABEL = "remove"
226   
227    # format to use for t1-t2 date range
228    YEAR_FORMAT = '%Y-%m-%d'
229
230    def __init__(self, atomType = None, vocabTermData = None, ndgObject = None, \
231                 xmlString = None, state = eXistConnector.WORKING_COLLECTION_PATH, **inputs):
232        '''
233        Constructor - initialise the atom variables
234        '''
235        logging.info("Initialising atom")
236        if atomType:
237            logging.info(" - of type '%s'" %atomType)
238        self.atomTypeID = atomType
239
240        # some data have further subtypes specified
241        self.subtypeID = None # this should be the termID
242        self.subtype = None # and this should be the fully formed vocab URL
243       
244        self.ndgObject = ndgObject
245
246        self.atomName = None
247        self.files = []
248        self.author = None
249        self.contributors = []
250        self.atomAuthors = []
251        self.parameters = []
252        self.spatialData = []
253        self.temporalData = []
254        self.relatedLinks = []
255        self.summary = []
256        self.content = []
257        # NB, this deployments data duplicates other atom data - and is only used for a
258        # convenient way to collect the info (by lookupAssociatedData()) for use in templates
259        self.deployments = []
260        # ditto for the following field
261        self.dataEntities = []
262           
263        self.csmlFile = None
264        self.cdmlFile = None
265        # general variable to use for setting the atom content - NB, if a csmlFile is specified
266        # (either directly or via a cdmlFile specification), this will be the content by default
267        # for this purpose
268        self.contentFile = None     
269        self.title = None
270        self.datasetID = None        # NB, the dataset id ends up in the atomName - <path><datasetID>.atom
271        self.atomID = None
272   
273        # boundary box info - to replace spatial/temporalData?
274        self.minX = None
275        self.minY = None
276        self.maxX = None
277        self.maxY = None
278        self.t1 = None
279        self.t2 = None
280
281        self.ME = ME.MolesEntity()
282       
283        # date when the atom was first ingested
284        self.publishedDate = None
285
286        # last update date
287        self.updatedDate = None
288
289        # assume atom in working state by default - this is used to define what collection
290        # in eXist the atom is stored in
291        self.state = state
292       
293        # additional, non standard atom data can be included in the molesExtra element
294        if vocabTermData:
295            self.VTD = vocabTermData
296        else:
297            self.VTD = VTD()
298       
299        if xmlString:
300            self.fromString(xmlString)
301
302        # if inputs passed in as dict, add these now
303        if inputs:
304            logging.info("Adding info to atom from input dict")
305            logging.debug(inputs)
306            self.__dict__.update(inputs)
307           
308            # NB, this doesn't trigger the Summary Property, so do this
309            # explicitly, if need be
310            if inputs.has_key('Summary'):
311                self.Summary = inputs.get('Summary')
312            if inputs.has_key('Content'):
313                self.Content = inputs.get('Content')
314           
315            # also pass any moles data up to the moles entity object
316            if inputs.get('providerID'):
317                self.ME.providerID = inputs.get('providerID')
318               
319            if inputs.get('abbreviation'):
320                self.ME.abbreviation = inputs.get('abbreviation')
321
322        if self.atomTypeID:
323            self.atomTypeName = self.VTD.TERM_DATA[self.atomTypeID].title
324
325        logging.info("Atom initialised")
326
327
328    def addOnlineReferences(self, links):
329        '''
330        Add online reference data associated with the atom
331        - NB, care needs to be taken here since this data is stored in the atom
332        link elements and these are also used for the various atom associations
333        @param links: a Link or array of Links to add to the relatedLinks attribute
334        '''
335        logging.debug("Adding online references")
336        if not links:
337            return
338       
339        if type(links) is not list:
340            links = [links]
341       
342        # firstly clear out any online refs data from the existing related links
343        newLinks = []
344        for link in self.relatedLinks:
345            if link.isChildAtom():
346                newLinks.append(link)
347       
348        newLinks.extend(links)
349        self.relatedLinks = newLinks
350        logging.debug("Online references added")
351
352
353    def addUniqueRelatedLinks(self, links):
354        '''
355        Add links to relatedLinks array - if they are not already included
356        @param links: a Link or array of Links to add to the relatedLinks attribute
357        '''
358        self.addUniqueLinks(self.relatedLinks, links)
359       
360
361    def removeRelatedLinks(self, linksToDelete):
362        '''
363        Remove any links in the input list from the atom's related links list
364        @param linksToDelete: array of Link objects to remove from atom
365        '''
366        logging.debug("Removing related links from atom")
367        if not linksToDelete:
368            return
369       
370        if type(linksToDelete) is not list:
371            linksToDelete = [linksToDelete]
372       
373        updatedLinks = []
374        for link in self.relatedLinks:
375            if type(link) is not Link:
376                logging.warning("Link is not of 'Link' object type (type='%s') - skipping" %type(link))
377                continue
378            if link in linksToDelete:
379                logging.debug("- found link to remove")
380            else:
381                updatedLinks.append(link)
382
383        self.relatedLinks = updatedLinks
384        logging.debug("Links removed")
385       
386
387    def getDefaultCollectionPath(self):
388        '''
389        Determine the correct collection to use for the atom in eXist
390        '''
391        collectionPath = eXistConnector.BASE_COLLECTION_PATH + self.state
392       
393        if self.atomTypeID == VTD.DE_TERM:
394            collectionPath += eXistConnector.DE_COLLECTION_PATH
395        elif self.atomTypeID == VTD.GRANULE_TERM:
396            collectionPath += eXistConnector.GRANULE_COLLECTION_PATH
397        elif self.atomTypeID == VTD.ACTIVITY_TERM and \
398            self.subtypeID == VTD.DEPLOYMENT_TERM:
399            collectionPath += eXistConnector.DEPLOYMENTS_COLLECTION_PATH
400        else:
401            collectionPath += eXistConnector.DEPLOYMENT_COLLECTION_PATH
402       
403        if not self.ME.providerID:
404            raise AtomError("Error: cannot determine atom collection path because " + \
405                            "the provider ID is not defined")
406           
407        collectionPath += self.ME.providerID + "/"
408        return collectionPath
409
410
411    def __addAtomTypeDataXML(self, root):
412        '''
413        Add the atom type, and subtype data, if available, to atom categories
414        - and lookup and add the appropriate vocab term data
415        '''
416        if self.atomTypeID:
417            logging.info("Adding atom type info to XML output")
418            category = Category()
419            category.label = self.atomTypeID
420            # look up the appropriate vocab term data
421            category.scheme = self.VTD.getTermCurrentVocabURL(self.atomTypeID)
422            category.term = self.ATOM_TYPE
423            root.append(category.toXML())
424
425        if self.subtypeID:
426            logging.info("Adding atom subtype info to XML output")
427            # NB subtypes not all defined, so leave this out for the moment
428            category.label = self.subtypeID
429            # look up the appropriate vocab term data
430            category.scheme = self.VTD.getTermCurrentVocabURL(self.subtypeID)
431            category.term = self.ATOM_SUBTYPE
432            root.append(category.toXML())
433
434
435    def addMolesEntityData(self, abbreviation, provider_id, object_creation_time):
436        '''
437        Add data to include in the moles entity element
438        '''
439        logging.debug('Adding moles entity information')
440        self.ME.abbreviation = abbreviation
441        self.ME.providerID = provider_id
442        self.ME.createdDate = getISO8601Date(object_creation_time)
443        logging.debug('Moles entity information added')
444
445
446    def addAuthors(self, authors):
447        '''
448        Add author data appropriately to the atom
449        NB, these will overwrite any existing authors of the same type
450        @param authors: list of Person objects with the author data
451        '''
452        logging.debug('Adding authors data to Atom')
453        isFirstAuthor = {}
454        authorArray = None
455        for author in authors:
456            # NB, we're only allowed one atom author
457            if author.type == Person.AUTHOR_TYPE:
458                self.author = author
459                if isFirstAuthor.has_key(author.type):
460                    raise AtomError("Error: an atom can only have one author specified")
461                isFirstAuthor[author.type] = 1
462                continue
463            elif author.type == Person.CONTRIBUTOR_TYPE:
464                authorArray = self.contributors
465            elif author.type == Person.RESPONSIBLE_PARTY_TYPE:
466                authorArray = self.ME.responsibleParties
467               
468            # check if this is the first addition - if so, clear out the
469            # array in advance
470            if not isFirstAuthor.has_key(author.type):
471                logging.debug("Clearing out author array")
472                # NB, need to be careful to clear the array, not create a ref
473                # to a new array
474                del authorArray[:]
475                isFirstAuthor[author.type] = 1
476
477            if str(author) != "" and author not in authorArray:
478                logging.debug("Adding author (type:'%s', name:'%s', uri:'%s', role:'%s')" \
479                              %(author.type, author.name, author.uri, author.role))
480                authorArray.append(author)
481
482        logging.debug('Finished adding authors data')
483
484
485    def _isNewParameter(self, param):
486        '''
487        Check if a parameter is already specified in the atom, return False if
488        so, otherwise return True
489        '''
490        for p in self.parameters:
491            if p.term == param.term and \
492                p.scheme == param.scheme and \
493                p.label == param.label:
494                return False
495        return True
496
497
498    def addRelatedLinks(self, linkVals):
499        '''
500        Add related links in string format - converting to Link objects
501        NB, only add the link if it is unique
502       
503        @param linkVals: string of format, 'uri | title | vocabServerURL'
504        '''
505        link = self.objectify(linkVals, 'relatedLinks')
506        if link not in self.relatedLinks:
507            self.relatedLinks.append(link)
508
509
510    def addParameters(self, params):
511        '''
512        Add a parameter to list - ensuring it is unique and has been formatted and tidied appropriately
513        @params param: parameter, as string array, to add to atom parameters collection
514        '''
515        # avoid strings being parsed character by character
516        if type(params) is str:
517            params = [params]
518           
519        for param in params:
520            # firstly tidy parameter
521            param = tidyUpParameters(param)
522            category = Category()
523            # NB, data already tidied up here, so set keyword to avoid this happening again
524            category.fromString(param, escapeSpecialCharacters=True)
525
526            # now check for uniqueness
527            if self._isNewParameter(category):
528                logging.debug("Adding new parameter: %s" %param)
529                self.parameters.append(category)
530   
531   
532    def _linksToXML(self, root):
533        '''
534        Add required links to the input element
535        @param root: element to add links to - NB, should be the root element of the atom
536        '''
537        selfLink = ET.SubElement(root, "link")
538        selfLink.attrib["href"] = self.atomBrowseURL
539        selfLink.attrib["rel"] = "self"
540        if self.subtypeID != VTD.DEPLOYMENT_TERM:
541            molesLink = ET.SubElement(root, "link")
542            molesDoc = re.sub('ATOM','NDG-B1', self.atomBrowseURL)
543            molesLink.attrib["href"] = molesDoc
544            molesLink.attrib["rel"] = 'related'
545       
546        for relatedLink in self.relatedLinks:
547            if relatedLink.hasValue():
548                root.append(relatedLink.toXML())
549   
550    def toXML(self):
551        '''
552        Convert the atom into XML representation and return this
553        @return: xml version of atom
554        '''
555        logging.info("Creating formatted XML version of Atom")
556        root = ET.Element("entry")
557        root.attrib["xmlns"] = "http://www.w3.org/2005/Atom"
558        root.attrib["xmlns:moles"] = "http://ndg.nerc.ac.uk/schema/moles2beta"
559        root.attrib["xmlns:georss"] = "http://www.georss.org/georss/10"
560        root.attrib["xmlns:gml"] = "http://www.opengis.net/gml"
561        id = ET.SubElement(root, "id")
562        id.text = self.atomID
563        title = ET.SubElement(root, "title")
564        title.text = self.title
565        self._linksToXML(root)
566
567        # NB, the author tag is mandatory for atoms - so if an explicit
568        # author has not been set, just take the author to be the provider
569        if not self.author:
570            author = Person()
571            author.name = self.ME.providerID
572            #author.uri = self.ME.providerID
573            self.author = author
574
575        root.append(self.author.toXML())
576           
577        for contributor in self.contributors:
578            root.append(contributor.toXML())
579
580        # add the moles entity section, if it is required
581        if self.ME:
582            root.append(self.ME.toXML())
583
584        # add parameters data
585        for param in self.parameters:
586            if param.hasValue():
587                root.append(param.toXML())
588
589        # add the type and subtype data
590        self.__addAtomTypeDataXML(root)
591                   
592        summary = ET.SubElement(root, "summary")
593        summary.text = self.Summary
594                   
595        # add link to content, if required - NB, can only have one content element in atom
596        # - and this is mandatory
597        content = ET.SubElement(root, "content")
598        contentFile = self.contentFile or self.csmlFile or self.cdmlFile
599        if contentFile:
600            content.attrib["type"] = "application/xml"
601            content.attrib["src"] = contentFile
602        else:
603            content.text = self.Content
604            content.attrib["type"] = "xhtml"
605       
606        # if there's a published date already defined, assume we're doing an update now
607        # NB, update element is mandatory
608        currentDate = datetime.datetime.today().strftime("%Y-%m-%dT%H:%M:%SZ")
609        if not self.publishedDate:
610            self.publishedDate = currentDate
611
612        updated = ET.SubElement(root, "updated")
613        if not self.updatedDate:
614            self.updatedDate = currentDate
615        updated.text = self.updatedDate
616
617        published = ET.SubElement(root, "published")
618        published.text = self.publishedDate
619
620        # add temporal range data, if available
621        temporalRange = ET.SubElement(root, "moles:temporalRange")
622        if self.t1:
623            temporalRange.text = self.t1
624            if self.t2:
625                temporalRange.text += "/" + self.t2
626
627        # add spatial range data, if available
628        self._addSpatialData(root)
629
630        tree = ET.ElementTree(root)
631        logging.info("XML version of Atom created")
632        return tree
633
634
635    def __getSummary(self):
636        logging.debug("Getting summary data")
637        summaryString = ""
638        for summary_line in self.summary:
639            summaryString += summary_line + "\n"
640
641        return summaryString
642
643    def __setSummary(self, summary):
644        logging.debug("Adding summary data")
645        self.summary = []
646        for summary_line in summary.split('\n'):
647            self.summary.append(escapeSpecialCharacters(summary_line))
648           
649    Summary = property(fset=__setSummary, fget=__getSummary, doc="Atom summary")
650
651
652    def __getContent(self):
653        logging.debug("Getting content data")
654        contentString = ""
655        # NB, there must be content specified in an atom
656        if not self.content:
657            return "Metadata document"
658       
659        for content_line in self.content:
660            contentString += content_line + "\n"
661
662        return contentString
663
664    def __setContent(self, content):
665        logging.debug("Adding content data")
666        self.content = []
667        for content_line in content.split('\n'):
668            self.content.append(content_line)
669           
670    Content = property(fset=__setContent, fget=__getContent, doc="Atom content")
671
672           
673    def fromString(self, xmlString):
674        '''
675        Initialise Atom object using an xmlString
676        @param xmlString: representation of atom as an XML string
677        '''
678        logging.info("Ingesting data from XML string")
679       
680        # firstly, remove any namespaces used - to avoid problems with elementtree
681        logging.debug("Stripping moles namespace from string to allow easy handling with elementtree")
682        xmlString = xmlString.replace('moles:', '')
683        xmlString = xmlString.replace('georss:', '')
684        xmlString = xmlString.replace('gml:', '')
685        xmlString = xmlString.replace('xmlns="http://www.w3.org/2005/Atom"', '')
686        xmlString = xmlString.replace('default:', '')
687
688        # now create elementtree with the XML string
689        logging.debug("Create elementtree instance with XML string")
690        tree = ET.fromstring(xmlString)
691       
692        title = tree.findtext('title')
693        if title:
694            logging.debug("Adding title data")
695            self.title = title
696
697        summary = tree.findtext('summary')
698        if summary:
699            self.Summary = summary#.decode('unicode_escape')
700
701        authorElement = tree.find('author')
702        logging.debug("Adding author data")
703        author = Person()
704        author.fromETElement(authorElement)
705        self.author = author
706
707        contributorElements = tree.findall('contributor')
708        for contributorElement in contributorElements:
709            logging.debug("Adding contributor data")
710            contributor = Person(personType = Person.CONTRIBUTOR_TYPE)
711            contributor.fromETElement(contributorElement)
712            self.contributors.append(contributor)
713
714        molesElement = tree.find('entity')
715        if molesElement:
716            self.ME.fromET(molesElement)
717               
718        self.atomID = tree.findtext('id')
719
720        self._parseCategoryData(tree.findall('category'))
721
722        self._parseLinksData(tree.findall('link'))
723           
724        contentTag = tree.find('content')
725        if contentTag != None:
726            logging.debug("Found content tag - checking for CSML/CDML file data")
727            file = contentTag.attrib.get('src')
728            if file:
729                # NB, the path will reveal more reliably whether we're dealing with CSML and CDML files
730                if file.upper().find('CSML') > -1:
731                    logging.debug("Adding CSML file data")
732                    self.csmlFile = file
733                elif file.upper().find('CDML') > -1:
734                    logging.debug("Adding CDML file data")
735                    self.cdmlFile = file
736                self.contentFile = file
737            else:
738                logging.debug("No file data - adding contents of element instead")
739                self.Content = contentTag.text
740       
741        range = tree.findtext('temporalRange')
742        if range:
743            logging.debug("Adding temporal range data")
744            timeData = range.split('/')
745            self.t1 = timeData[0]
746            if len(timeData) > 1:
747                self.t2 = timeData[1]
748       
749        # NB, this parser won't mind if we're dealing with Envelope or EnvelopeWithTimePeriod
750        minBBox = tree.findall('.//lowerCorner')
751        if minBBox:
752            logging.debug("Adding min spatial range data")
753            minBBox = minBBox[0]
754            spatialData = minBBox.text.split()
755            self.minX = spatialData[0]
756            if len(spatialData) > 1:
757                self.minY = spatialData[1]
758       
759        maxBBox = tree.findall('.//upperCorner')
760        if maxBBox:
761            maxBBox = maxBBox[0]
762            logging.debug("Adding max spatial range data")
763            spatialData = maxBBox.text.split()
764            self.maxX = spatialData[0]
765            if len(spatialData) > 1:
766                self.maxY = spatialData[1]
767               
768        publishedDate = tree.findtext('published')
769        if publishedDate:
770            logging.debug("Adding published date")
771            self.publishedDate = publishedDate
772               
773        updatedDate = tree.findtext('updated')
774        if updatedDate:
775            logging.debug("Adding updated date")
776            self.updatedDate = updatedDate
777           
778        logging.info("Completed data ingest")
779   
780   
781    def _parseCategoryData(self, categories):
782        logging.debug("Adding category/parameters data")
783        for category in categories:
784            cat = Category()
785            cat.fromETElement(category)
786           
787            if cat.term == self.ATOM_TYPE:
788                logging.debug("Found atom type data")
789                self.atomTypeID = cat.label
790                self.atomTypeName = self.VTD.TERM_DATA[cat.label].title
791                continue
792            elif cat.term == self.ATOM_SUBTYPE:
793                logging.debug("Found atom subtype data")
794                self.subtypeID = cat.label
795                self.subtype = cat.scheme
796                continue
797
798            self.parameters.append(cat)
799   
800
801    def setDatasetID(self, datasetID):
802        '''
803        Set the dataset ID for the atom - and generate an appropriate atom name using this
804        @param datasetID: ID to set for the atom
805        '''
806        self.datasetID = datasetID
807        self._generateAtomName(datasetID) 
808        self.atomID = self.createAtomID(datasetID)
809
810
811    def createAtomID(self, datasetID):
812        '''
813        Create a unique ID, conforming to atom standards, for atom
814        NB, see http://diveintomark.org/archives/2004/05/28/howto-atom-id
815        @param datasetID: ID of atom's dataset
816        @return: unique ID
817        '''
818        logging.info("Creating unique ID for atom")
819        if not self.atomBrowseURL:
820            self._generateAtomName(datasetID)
821        urlBit = self.atomBrowseURL.split('://')[1]
822        urlBit = urlBit.replace('#', '')
823        urlBits = urlBit.split('/')
824        dateBit = datetime.datetime.today().strftime("%Y-%m-%d")
825       
826        id = "tag:" + urlBits[0] + "," + dateBit + ":/" + "/".join(urlBits[1:])
827        logging.info("- unique ID created for atom")
828        logging.debug(" - '%s'" %id)
829        return id
830       
831       
832    def _generateAtomName(self, datasetID):
833        '''
834        Generate a consistent name for the atom - with full eXist doc path
835        @param datasetID: ID of atom's dataset
836        '''
837        self.atomName = datasetID + ".atom"
838        self.ndgURI = self.ME.providerID + "__ATOM__" + datasetID
839        self.atomBrowseURL = VTD.BROWSE_ROOT_URL + self.ndgURI
840
841
842    def _parseLinksData(self, links):
843        '''
844        Extract links and atom data from array of link elements in the XML representation of the atom
845        @param links: an array of <link> elements
846        '''
847        # firstly, get all data to start with, so we can properly process it afterwards
848        linkData = {}
849        logging.debug("Getting link data")
850        for linkTag in links:
851            link = Link()
852            link.fromETElement(linkTag)
853
854            if not linkData.has_key(link.rel):
855                linkData[link.rel] = []
856           
857            linkData[link.rel].append(link)
858
859        # there should be one self referencing link - which will provide info on the atom itself
860        if not linkData.has_key('self'):
861            errorMessage = "Atom does not have self referencing link - " + \
862                "cannot ascertain datasetID without this - please fix"
863            logging.error(errorMessage)
864            raise ValueError(errorMessage)
865       
866        # this is the link describing the atom itself
867        self.atomBrowseURL = linkData['self'][0].href
868       
869        self.datasetID = self.atomBrowseURL.split("__ATOM__")[-1]
870        self.atomName = self.datasetID + ".atom"
871        self.ndgURI = self.atomBrowseURL.split(VTD.BROWSE_ROOT_URL)[1]
872       
873        # now remove this value and the associated moles doc link
874        del linkData['self']
875        molesDoc = self.atomBrowseURL.replace('ATOM', 'NDG-B1')
876        if linkData.has_key('related'):
877            relatedLinks = []
878            for link in linkData['related']:
879                if link.href != molesDoc:
880                    relatedLinks.append(link)
881           
882            linkData['related'] = relatedLinks
883               
884        # now add the remaining links to the atom
885        for key in linkData:
886            for link in linkData[key]:
887                logging.debug("Adding link data")
888                self.relatedLinks.append(link)
889       
890
891    def _addSpatialData(self, element):
892        '''
893        Add spatial coverage element to an input element
894        @param element: element to add coverage data to
895        '''
896        logging.info("Adding spatial data to Atom")
897        if not self.minX:
898            logging.info("No spatial data specified")
899            return
900        bbox = ET.SubElement(element, "georss:where")
901        envelope = ET.SubElement(bbox, "gml:Envelope")
902        lc = ET.SubElement(envelope, "gml:lowerCorner")
903        lc.text = str(self.minX) + " " + str(self.minY)
904        uc = ET.SubElement(envelope, "gml:upperCorner")
905        uc.text = str(self.maxX) + " " + str(self.maxY)
906
907       
908    def setAttribute(self, attributeName, attributeValue):
909        '''
910        Set the value of an atom attribute - and do some basic tidying up of the string content
911        - to escape any XML unfriendly characters
912        @param attributeName: name of the attribute whose value to set
913        @param attributeValue: value to set the attribute to 
914        '''
915        logging.debug("Setting attribute, %s, to %s" %(attributeName, attributeValue))
916        origValue = attributeValue
917       
918        # escape any special characters if a value has been specified
919        # NB, need to cope with both single values and arrays
920        if attributeValue:
921            if type(attributeValue) is list:
922                newVals = []
923                for val in attributeValue:
924                    newVals.append(self.objectify(escapeSpecialCharacters(val), attributeName))
925                attributeValue = newVals
926                   
927            else:
928                attributeValue = self.objectify(escapeSpecialCharacters(attributeValue), attributeName)
929
930        # handle the special case of authors; only one author is allowed per atom
931        # - the others should be treated as contributors
932        if attributeName == "authors":
933            setattr(self, "author", attributeValue[0])
934            if len(attributeValue) > 1:
935                setattr(self, "contributors", attributeValue[1:])
936        elif attributeName == "atomAuthors":
937            if isinstance(attributeValue, list):
938                for val in attributeValue:
939                    self.ME.responsibleParties.append(val)
940            else:
941                self.ME.responsibleParties.append(attributeValue)
942        else:
943            setattr(self, attributeName, attributeValue)
944
945
946    def objectify(self, objectVals, attributeName):
947        '''
948        Some inputs are specified as strings but need to be converted into
949        objects - do this here
950        @param objectVals: a '|' delimited string of values
951        @param attributeName: name of attribute the values belong to
952        '''
953        obj = None
954        if type(objectVals) != str:
955            return objectVals
956       
957        if attributeName == "relatedLinks":
958            obj = Link()
959        elif attributeName == "atomAuthors" or attributeName == "authors":
960            # NB, ensure there is only one author tag - extra authors are contributors
961            authorType = Person.AUTHOR_TYPE
962            if self.author:
963                authorType= Person.CONTRIBUTOR_TYPE
964            obj = Person(personType = authorType)
965
966        if obj:
967            obj.fromString(objectVals)
968            # NB, need to set it now, just in case we don't set it before coming back
969            if attributeName == "authors" and not self.author:
970                self.author = obj
971            return obj
972       
973        return objectVals
974
975
976    def toPrettyXML(self):
977        '''
978        Returns nicely formatted XML as string
979        '''
980        atomXML = self.toXML()
981
982        # create the string
983        logging.debug("Converting the elementtree object into a string")
984        prettyXML = et2text(atomXML.getroot())
985
986        # add XML version tag
987        prettyXML = "<?xml version=\"1.0\" encoding=\"utf-8\" ?>\n" + prettyXML
988        logging.info("Created formatted version of XML object")
989        return prettyXML
990
991
992    def getLinksOfType(self, termID):
993        '''
994        Returns links in the atom related links attribute which match the specified
995        term ID
996        @param termID: the termID to look for in the related links - NB, this is
997        matched to the end of the link.rel value
998        @return links: array of Link objects with matching term type
999        '''
1000        logging.debug("Getting atom links of type, '%s'" %termID)
1001        matchingLinks = []
1002        for link in self.relatedLinks:
1003            # firstly, handle special case where we only want the online ref type links
1004            # returned
1005            if termID == self.ONLINE_REF_LABEL:
1006                if not link.isChildAtom():
1007                    logging.debug("- found link with matching term type")
1008                    matchingLinks.append(link)
1009               
1010            elif link and link.rel and link.rel.lower().endswith(termID.lower()):
1011                logging.debug("- found link with matching term type")
1012                matchingLinks.append(link)
1013               
1014        logging.debug("Returning matched links")
1015        return matchingLinks
1016       
1017       
1018    def getLogos(self):
1019        '''
1020        Return related links that are logos
1021        @return: array of Links containing the logos for the atom
1022        '''
1023        logos = []
1024        for link in self.relatedLinks:
1025            if link.rel.lower().endswith(VTD.LOGO_TERM.lower()):
1026                logos.append(link)
1027               
1028        return logos
1029   
1030   
1031    def isGranule(self):
1032        if self.atomTypeID == VTD.GRANULE_TERM:
1033            return True
1034        return False
1035   
1036   
1037    def isDE(self):
1038        if self.atomTypeID == VTD.DE_TERM:
1039            return True
1040        return False
1041   
1042    def isDeployment(self):
1043        if self.subtypeID and self.subtypeID == VTD.DEPLOYMENT_TERM:
1044            return True
1045        return False
1046   
1047    def isDeployable(self):
1048        if (self.atomTypeID == VTD.ACTIVITY_TERM and self.subtypeID != VTD.DEPLOYMENT_TERM) or \
1049            self.atomTypeID == VTD.DPT_TERM or \
1050            self.atomTypeID == VTD.OBS_TERM:
1051            return True
1052        return False
1053
1054       
1055    def addCSMLData(self, csmlName, csmlContent, aggregateCoverage=False, useCSMLID=False):
1056        '''
1057        Parse CSML data and add extracted info to the atom
1058        @param csmlName: name of the csml file
1059        @param csmlContent: content of the csml file - NB, if this is set to None and the
1060        file, csmlName, is available locally, CsmlParser.Dataset will read in the file
1061        directly
1062        @keyword aggregateCoverage: if set to True, only coverage data that extends the
1063        @keyword useCSMLID: if True, use the CSML doc ID as the dataset ID - NB,
1064        this should only be True if creating a new atom - e.g. from a granulite
1065        atom coverage data will be added
1066        @return csmlDoc: the CsmlParser.Dataset object with the csml data in
1067        '''
1068        logging.info("Creating CSML data model")
1069        self.csmlFile = csmlName
1070        self.contentFile = csmlName
1071        content = csmlContent or csmlName
1072   
1073        csmlDoc = CsmlParser.Dataset(file=content)
1074       
1075        logging.info("Extracting info from CSML file")
1076        logging.debug("Got dataset ID: %s" %csmlDoc.id)
1077        if useCSMLID:
1078            logging.debug(" - using this ID for the atom")
1079            self.setDatasetID(VTD.GRANULE_TERM + '_' + csmlDoc.id)
1080       
1081        title = csmlDoc.name.CONTENT
1082        logging.debug("Got dataset name (title): '%s'" %title)
1083        # NB, if a title is specified (and not as the default value), it automatically is used in
1084        # place of anything in the granulite file
1085        if title and title != "NAME OF DATASET GOES HERE":
1086            logging.info("Title, '%s', extracted from CSML file" %title)
1087            if self.title:
1088                logging.info("- NB, this will override the title specified in the granulite file ('%s')" \
1089                             %self.title)
1090            self.title = title
1091               
1092        bbox1 = csmlDoc.getBoundingBox()
1093        bbox2 = csmlDoc.getCSMLBoundingBox()
1094        time = bbox2.getTimeLimits()
1095   
1096        # now check for other parameters to add to granule
1097        # Firstly, extract the bounding envelope
1098        if bbox1:
1099            w, e = normaliseLongitude(bbox1[0],bbox1[2])
1100            n, s = (bbox1[3], bbox1[1])
1101   
1102            if not aggregateCoverage or (not self.maxY or float(n) > float(self.maxY)):
1103                self.maxY = n
1104               
1105            if not aggregateCoverage or (not self.minY or float(s) < float(self.minY)):
1106                self.minY = s
1107           
1108            if not aggregateCoverage or (not self.minX or float(w) < float(self.minX)):
1109                self.minX = w
1110   
1111            if not aggregateCoverage or (not self.maxX or float(e) > float(self.maxX)):
1112                self.maxX = e
1113           
1114            logging.debug("Got bounding box data from file: (%s, %s) , (%s, %s)" \
1115                          %(w, s, e, n))
1116           
1117            logging.debug("Updated atom bounding box data: (%s, %s) , (%s, %s)" \
1118                          %(self.minX, self.minY, self.maxX, self.maxY))
1119        else:
1120            logging.debug("No valid bounding box data found")
1121   
1122        if time:
1123            t1 = formatDateYYYYMMDD(time[0])
1124            if not aggregateCoverage or \
1125                (not self.t1 or datetime.datetime.strptime(t1, YEAR_FORMAT) < \
1126                    datetime.datetime.strptime(self.t1, YEAR_FORMAT)):
1127                self.t1 = t1
1128   
1129            t2 = time[1]
1130            if t2 and t2 != 'None':
1131                t2 = formatDateYYYYMMDD(t2)
1132                if not aggregateCoverage or \
1133                    (not self.t2 or datetime.datetime.strptime(t2, YEAR_FORMAT) > \
1134                        datetime.datetime.strptime(self.t2, YEAR_FORMAT)):
1135                    self.t2 = t2
1136           
1137            logging.debug("Got time range: %s -> %s" %(self.t1, self.t2))
1138        else:
1139            logging.debug("No valid time range data found")
1140   
1141        #create parameter summaries:
1142        #set up list to hold the parameters data
1143        parameters = []
1144        for feature in csmlDoc.featureCollection.featureMembers:
1145            if hasattr(feature.parameter, 'href'):
1146                paramTriple = ""
1147                if hasattr(feature, 'description'):
1148                    paramTriple = feature.description.CONTENT
1149                    paramTriple += " | " + feature.parameter.href
1150                   
1151                    term = ""
1152                    if hasattr(feature, 'name'):
1153                        term = feature.name.CONTENT
1154   
1155                    paramTriple += " | " + term
1156                   
1157                    logging.debug("Got parameter info: %s" %paramTriple)
1158                    parameters.append(paramTriple)
1159       
1160        # update the atom with the extracted parameters
1161        logging.info("Adding CSML parameters to granule atom")
1162        self.addParameters(parameters)
1163        logging.info("Finished adding CSML data")
1164        return csmlDoc
1165
1166
1167    def lookupAssociatedData(self, type, dr, lookupIndirectReferences=False):
1168        '''
1169        Check through the atom links and retrieve any associated data of the
1170        specified type
1171        @param type: type of associated data to lookup - currently VTD.DEPLOYMENT_TERM
1172        or VTD.DE_TERM
1173        @param dr: Instance of DocumentRetrieve object - NB, this requires eXist
1174        config details which are not available to the Atom object
1175        @keyword lookupIndirectReferences: if True, the atom ID is used to search
1176        defined deployments to find those which reference it, otherwise only
1177        deployments data featured in the atom related links are processed
1178        '''
1179        logging.info("Looking up %s info" %type)
1180       
1181        self.allActivities = []
1182        self.allObs = []
1183        self.allDpts = []
1184
1185        if type != VTD.DE_TERM and type != VTD.DEPLOYMENT_TERM:
1186            raise ValueError('Unrecognised associated data type: %s' %type)
1187       
1188        # avoid duplicating lookup effort
1189        if (type == VTD.DEPLOYMENT_TERM and self.deployments) or \
1190            (type == VTD.DE_TERM and self.dataEntities):
1191            logging.info("- this info has already been looked up - returning")
1192            return
1193
1194        # firstly, collect all the references to the info required
1195        if lookupIndirectReferences:
1196            logging.info("Looking up indirect references")
1197           
1198            # if we're looking up DE data for deployments data, need to have the
1199            # deployments info looked up first
1200            if type == VTD.DE_TERM and self.isDeployable() and not self.deployments:
1201                self.lookupAssociatedData(VTD.DEPLOYMENT_TERM, dr, lookupIndirectReferences)
1202           
1203            logging.info("Looking up references to this atom from other %s" %type)
1204           
1205            # NB, if we're looking up deployments info, we only look up references
1206            # to this atom - if we're looking up DEs, we need to look up references
1207            # to the deployments referenced by this atom
1208            urls = [self.atomBrowseURL]
1209           
1210            if type == VTD.DE_TERM and self.isDeployable():
1211                urls = []
1212                for dep in self.deployments:
1213                    urls.append(dep.browseURL)
1214                   
1215            links = []
1216            for url in urls:
1217                doc = dr.get(type, dr.ATOM_TYPE, url, \
1218                             targetCollection = eXistConnector.BASE_COLLECTION_PATH)
1219                # now need to turn this results set into actual atoms
1220                tree = ET.fromstring(doc)
1221                for atom in tree:
1222                    logging.debug("- found reference in %s" %type)
1223                    links.append(ET.tostring(atom))
1224                   
1225            logging.info("Finished looking up indirect references")
1226        else:
1227            links = self.getLinksOfType(self.VTD.DEPLOYMENT_TERM)
1228
1229        # now retrieve the references and extract the required data
1230        logging.info("Retrieving info from %s references" %type)
1231        if type == VTD.DEPLOYMENT_TERM:
1232            self.deployments = []
1233            for link in links:
1234                if lookupIndirectReferences:
1235                    deploymentAtom = link
1236                else:
1237                    localID = link.href.split("__ATOM__")[-1]
1238                    deploymentAtom = dr.get(self.ME.providerID, 'ATOM', localID, \
1239                                            targetCollection = eXistConnector.BASE_COLLECTION_PATH)
1240   
1241                deployment = Deployment.Deployment(Atom(xmlString=str(deploymentAtom)))
1242                self.deployments.append(deployment)
1243               
1244                self.addUniqueLinks(self.allActivities, deployment.activities)
1245                self.addUniqueLinks(self.allObs, deployment.obs)
1246                self.addUniqueLinks(self.allDpts, deployment.dpts)
1247        else:
1248            # for DE data, just store the title + link in a Link object
1249            self.dataEntities = []
1250            for data in links:
1251                atom = Atom(xmlString=str(data))
1252                link = Link()
1253                link.title = atom.title
1254                link.href = atom.atomBrowseURL
1255               
1256                # NB, different deployments may be used by the same DE - so
1257                # avoid duplication
1258                self.addUniqueLinks(self.dataEntities, link)
1259           
1260        logging.info("Finished looking up %s info" %type)
1261
1262
1263    def addUniqueLinks(self, dataArray, links):
1264        '''
1265        Add links to specified array - if they are not already included
1266        @param dataArray: a list, potentially arlready containing links
1267        @param links: a Link or array of Links to add to the dataArray
1268        '''
1269        logging.debug("Adding new links")
1270        if not links:
1271            return
1272       
1273        if type(links) is not list:
1274            links = [links]
1275       
1276        for link in links:
1277            if type(link) is not Link:
1278                logging.warning("Link is not of 'Link' object type (type='%s') - skipping" %type(link))
1279                continue
1280            if link not in dataArray:
1281                logging.debug("- adding unique link")
1282                dataArray.append(link)
1283        logging.debug("Finished adding links")
Note: See TracBrowser for help on using the repository browser.