source: exist/trunk/python/ndgUtils/models/Atom.py @ 4679

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/exist/trunk/python/ndgUtils/models/Atom.py@4679
Revision 4679, 48.4 KB checked in by cbyrom, 11 years ago (diff)

Extend granulite to allow command line operation - with input options
to specify logging level and 'replace atom' mode - which command line
inputs to ask users if they want to replace duplicated data.

Line 
1'''
2 Class representing data in  atom format - allowing moles data to be stored and accessed in a web feed compatible way
3 
4 @author: C Byrom, Tessella Jun 2008
5'''
6try: #python 2.5
7    from xml.etree import cElementTree as ET
8except ImportError:
9    try:
10        # if you've installed it yourself it comes this way
11        import cElementTree as ET
12    except ImportError:
13        # if you've egged it this is the way it comes
14        from ndgUtils.elementtree import cElementTree as ET
15import sys, logging, re, datetime
16from ndgUtils.eXistConnector import eXistConnector
17from ndgUtils.ETxmlView import et2text
18import ndgUtils.lib.utilities as utilities
19from ndgUtils.models.vocabtermdata import VocabTermData as VTD
20from ndgUtils.models import MolesEntity as ME
21import csml.parser as CsmlParser
22from ndgUtils.models import Deployment as Deployment
23
24
25class AtomError(Exception):
26    """
27    Exception handling for Atom class.
28    """
29    def __init__(self, msg):
30        logging.error(msg)
31        Exception.__init__(self, msg)
32
33
34class Person(object):
35    '''
36    Class representing atom author type data - with name, uri and role attributes
37    '''
38    AUTHOR_TYPE = 0
39    CONTRIBUTOR_TYPE = 1
40    RESPONSIBLE_PARTY_TYPE = 2
41    ELEMENT_NAMES = ["author", "contributor", "responsibleParty"]
42   
43    def __init__(self, personType = AUTHOR_TYPE, namespace = None):
44        self.type = personType
45        self.ns = namespace
46        self.name = ""
47        self.uri = ""
48        self.role = ""
49       
50        # NB, the atom format specifies slightly different data contents
51        self.uriTagName = "email"
52        # NB, responsible party data is always stored in the moles section
53        if self.type == self.RESPONSIBLE_PARTY_TYPE:
54            self.ns = 'moles'
55            self.uriTagName = "uri"
56
57    def __str__(self):
58        if self.name or self.uri or self.role:
59            return self.name + " | " + self.uri + " | " + self.role
60        return ""
61
62
63    def hasValue(self):
64        if self.name or self.uri or self.role:
65            return True
66        return False
67   
68    def fromString(self, personString):
69        (self.name, self.uri, self.role) = utilities.getTripleData(personString)
70       
71    def fromETElement(self, personTag):
72        self.name = personTag.findtext('name') or ""
73        self.role = personTag.findtext('role') or ""
74        self.uri = personTag.findtext(self.uriTagName) or ""
75        logging.debug("Added name: '%s', role: '%s', %s: '%s'" \
76                      %(self.name, self.role, self.uriTagName, self.uri))
77
78    def toXML(self):
79        prefix = ""
80        if self.ns:
81            prefix = self.ns + ':'
82
83        author = ET.Element(prefix + self.ELEMENT_NAMES[self.type])
84
85        if self.name:
86            name = ET.SubElement(author, prefix + "name")
87            name.text = self.name
88       
89        if self.uri:
90            uri = ET.SubElement(author, prefix + self.uriTagName)
91            uri.text = self.uri
92       
93        if self.role:
94            role = ET.SubElement(author, prefix + "role")
95            role.text = self.role
96
97        return author
98   
99    def __cmp__(self, person1):
100        '''
101        Override comparison to allow proper object comparison when checking
102        if Person objects are in an array already - i.e. if person in personArray...
103        '''
104        if not person1:
105            return -1
106       
107        if self is person1:
108            return 0
109        elif self.uri == person1.uri and self.name == person1.name and \
110                self.role == person1.role and self.type == person1.type:
111            return 0
112        return 1
113
114
115class Link(object):
116    '''
117    Class representing an atom link - with href, title and rel attributes
118    '''
119
120    def __init__(self):
121        self.href = ""
122        self.title = ""
123        self.rel = ""
124
125    def fromString(self, linkString):
126        (self.href, self.title, self.rel) = utilities.getTripleData(linkString)
127       
128    def fromETElement(self, linkTag):
129        self.href = linkTag.attrib.get('href') or ""
130        self.rel = linkTag.attrib.get('rel') or ""
131        self.title = linkTag.attrib.get('title') or ""
132
133    def toXML(self):
134        link = ET.Element("link")
135        link.attrib["href"] = self.href
136        link.attrib["title"] = self.title
137        link.attrib["rel"] = self.rel
138        return link
139
140    def hasValue(self):
141        # NB, just a rel on its own is meaningless - so ignore
142        if self.href or self.title:
143            return True
144        return False
145   
146    def __str__(self):
147        if self.href or self.title or self.rel:
148            return self.href + " | " + self.title + " | " + self.rel
149        return ""
150   
151    def isChildAtom(self):
152        '''
153        Determines whether the link refers to another atom - e.g. a link to
154        a data granule
155        @return True, if so; False otherwise
156        '''
157        if self.rel.endswith(VTD.GRANULE_TERM) or \
158            self.rel.endswith(VTD.DEPLOYMENT_TERM) or \
159            self.rel.endswith(VTD.ACTIVITY_TERM) or \
160            self.rel.endswith(VTD.DPT_TERM) or \
161            self.rel.endswith(VTD.OBS_TERM):
162            return True
163       
164        return False
165   
166    def __cmp__(self, link1):
167        '''
168        Override comparison to allow proper object comparison when checking
169        if Link objects are in an array already - i.e. if link in linkArray...
170        '''
171        if not link1:
172            return -1
173       
174        if self is link1:
175            return 0
176        elif self.href == link1.href and self.title == link1.title and \
177                self.rel == link1.rel:
178            return 0
179        return 1
180
181
182class Category(object):
183    '''
184    Class representing an atom category - with term, scheme and label attributes
185    '''
186    def __init__(self):
187        self.term = ""
188        self.scheme = ""
189        self.label = ""
190
191    def fromString(self, linkString, escapeSpecialCharacters=True):
192        '''
193        Create Category from triple string of format, 'label | scheme | term'
194        @param linkString: triple string to create category with
195        @keyword escapeSpecialCharacters: if set to True, special characters in
196        triple string are escaped (default)
197        '''
198        (self.label, self.scheme, self.term) = utilities.getTripleData(linkString, \
199            doEscape=escapeSpecialCharacters)
200       
201    def fromETElement(self, linkTag):
202        self.term = linkTag.attrib.get('term') or ""
203        self.label = linkTag.attrib.get('label') or ""
204        self.scheme = linkTag.attrib.get('scheme') or ""
205
206    def toXML(self):
207        link = ET.Element("category")
208        link.attrib["term"] = self.term
209        link.attrib["scheme"] = self.scheme
210        link.attrib["label"] = self.label
211        return link
212   
213    def hasValue(self):
214        if self.scheme or self.label or self.term:
215            return True
216        return False
217
218
219class Atom(object):
220
221    # labels for use with the atom categories
222    ATOM_TYPE = "ATOM_TYPE"
223    ATOM_SUBTYPE = "ATOM_SUBTYPE"
224
225    # labels for use with the templates to set/extract specific inputs
226    ONLINE_REF_LABEL = "online_ref"
227    PARAMETER_LABEL = "parameter"
228    ATOM_REF_LABEL = "atom_ref"
229    DELIMITER = "---"
230    REMOVE_LABEL = "remove"
231   
232    # format to use for t1-t2 date range
233    YEAR_FORMAT = '%Y-%m-%d'
234
235    def __init__(self, atomType = None, vocabTermData = None, ndgObject = None, \
236                 xmlString = None, state = eXistConnector.WORKING_COLLECTION_PATH, **inputs):
237        '''
238        Constructor - initialise the atom variables
239        '''
240        logging.info("Initialising atom")
241        if atomType:
242            logging.info(" - of type '%s'" %atomType)
243        self.atomTypeID = atomType
244
245        # some data have further subtypes specified
246        self.subtypeID = None # this should be the termID
247        self.subtype = None # and this should be the fully formed vocab URL
248       
249        self.ndgObject = ndgObject
250
251        self.atomName = None
252        self.files = []
253        self.author = Person()
254        self.contributors = []
255        self.atomAuthors = []
256        self.parameters = []
257        self.spatialData = []
258        self.temporalData = []
259        self.relatedLinks = []
260        self.summary = []
261        self.content = []
262        # NB, this deployments data duplicates other atom data - and is only used for a
263        # convenient way to collect the info (by lookupAssociatedData()) for use in templates
264        self.deployments = []
265        # ditto for the following field
266        self.dataEntities = []
267           
268        self.csmlFile = None
269        self.cdmlFile = None
270        # general variable to use for setting the atom content - NB, if a csmlFile is specified
271        # (either directly or via a cdmlFile specification), this will be the content by default
272        # for this purpose
273        self.contentFile = None     
274        self.title = None
275        self.datasetID = None        # NB, the dataset id ends up in the atomName - <path><datasetID>.atom
276        self.atomID = None
277   
278        # boundary box info - to replace spatial/temporalData?
279        self.minX = None
280        self.minY = None
281        self.maxX = None
282        self.maxY = None
283        self.t1 = None
284        self.t2 = None
285
286        self.ME = ME.MolesEntity()
287       
288        # date when the atom was first ingested
289        self.publishedDate = None
290
291        # last update date
292        self.updatedDate = None
293
294        # assume atom in working state by default - this is used to define what collection
295        # in eXist the atom is stored in
296        self.state = state
297       
298        # additional, non standard atom data can be included in the molesExtra element
299        if vocabTermData:
300            self.VTD = vocabTermData
301        else:
302            self.VTD = VTD()
303       
304        if xmlString:
305            self.fromString(xmlString)
306
307        # if inputs passed in as dict, add these now
308        if inputs:
309            logging.info("Adding info to atom from input dict")
310            logging.debug(inputs)
311            self.__dict__.update(inputs)
312           
313            # NB, this doesn't trigger the Summary Property, so do this
314            # explicitly, if need be
315            if inputs.has_key('Summary'):
316                self.Summary = inputs.get('Summary')
317            if inputs.has_key('Content'):
318                self.Content = inputs.get('Content')
319            if inputs.has_key('author'):
320                name = inputs.get('author')
321                author = Person()
322                author.fromString(name)
323                self.author = author
324           
325            # also pass any moles data up to the moles entity object
326            if inputs.has_key('providerID'):
327                self.ME.providerID = inputs.get('providerID')
328               
329            if inputs.has_key('abbreviation'):
330                self.ME.abbreviation = inputs.get('abbreviation')
331
332        if self.atomTypeID:
333            self.atomTypeName = self.VTD.TERM_DATA[self.atomTypeID].title
334
335        logging.info("Atom initialised")
336
337
338    def addOnlineReferences(self, links):
339        '''
340        Add online reference data associated with the atom
341        - NB, care needs to be taken here since this data is stored in the atom
342        link elements and these are also used for the various atom associations
343        @param links: a Link or array of Links to add to the relatedLinks attribute
344        '''
345        logging.debug("Adding online references")
346        if not links:
347            return
348       
349        if type(links) is not list:
350            links = [links]
351       
352        # firstly clear out any online refs data from the existing related links
353        newLinks = []
354        for link in self.relatedLinks:
355            if link.isChildAtom():
356                newLinks.append(link)
357       
358        newLinks.extend(links)
359        self.relatedLinks = newLinks
360        logging.debug("Online references added")
361
362
363    def addUniqueRelatedLinks(self, links):
364        '''
365        Add links to relatedLinks array - if they are not already included
366        @param links: a Link or array of Links to add to the relatedLinks attribute
367        '''
368        self.addUniqueLinks(self.relatedLinks, links)
369       
370
371    def removeRelatedLinks(self, linksToDelete):
372        '''
373        Remove any links in the input list from the atom's related links list
374        @param linksToDelete: array of Link objects to remove from atom
375        '''
376        logging.debug("Removing related links from atom")
377        if not linksToDelete:
378            return
379       
380        if type(linksToDelete) is not list:
381            linksToDelete = [linksToDelete]
382       
383        updatedLinks = []
384        for link in self.relatedLinks:
385            if type(link) is not Link:
386                logging.warning("Link is not of 'Link' object type (type='%s') - skipping" %type(link))
387                continue
388            if link in linksToDelete:
389                logging.debug("- found link to remove")
390            else:
391                updatedLinks.append(link)
392
393        self.relatedLinks = updatedLinks
394        logging.debug("Links removed")
395       
396
397    def getDefaultCollectionPath(self):
398        '''
399        Determine the correct collection to use for the atom in eXist
400        '''
401        collectionPath = eXistConnector.BASE_COLLECTION_PATH + self.state
402       
403        if self.atomTypeID == VTD.DE_TERM:
404            collectionPath += eXistConnector.DE_COLLECTION_PATH
405        elif self.atomTypeID == VTD.GRANULE_TERM:
406            collectionPath += eXistConnector.GRANULE_COLLECTION_PATH
407        elif self.atomTypeID == VTD.ACTIVITY_TERM and \
408            self.subtypeID == VTD.DEPLOYMENT_TERM:
409            collectionPath += eXistConnector.DEPLOYMENTS_COLLECTION_PATH
410        else:
411            collectionPath += eXistConnector.DEPLOYMENT_COLLECTION_PATH
412       
413        if not self.ME.providerID:
414            raise AtomError("Error: cannot determine atom collection path because " + \
415                            "the provider ID is not defined")
416           
417        collectionPath += self.ME.providerID + "/"
418        return collectionPath
419
420
421    def __addAtomTypeDataXML(self, root):
422        '''
423        Add the atom type, and subtype data, if available, to atom categories
424        - and lookup and add the appropriate vocab term data
425        '''
426        if self.atomTypeID:
427            logging.info("Adding atom type info to XML output")
428            category = Category()
429            category.label = self.atomTypeID
430            # look up the appropriate vocab term data
431            category.scheme = self.VTD.getTermCurrentVocabURL(self.atomTypeID)
432            category.term = self.ATOM_TYPE
433            root.append(category.toXML())
434
435        if self.subtypeID:
436            logging.info("Adding atom subtype info to XML output")
437            # NB subtypes not all defined, so leave this out for the moment
438            category.label = self.subtypeID
439            # look up the appropriate vocab term data
440            category.scheme = self.VTD.getTermCurrentVocabURL(self.subtypeID)
441            category.term = self.ATOM_SUBTYPE
442            root.append(category.toXML())
443
444
445    def addMolesEntityData(self, abbreviation, provider_id, object_creation_time):
446        '''
447        Add data to include in the moles entity element
448        '''
449        logging.debug('Adding moles entity information')
450        self.ME.abbreviation = abbreviation
451        self.ME.providerID = provider_id
452        self.ME.createdDate = utilities.getISO8601Date(object_creation_time)
453        logging.debug('Moles entity information added')
454
455
456    def addAuthors(self, authors):
457        '''
458        Add author data appropriately to the atom
459        NB, these will overwrite any existing authors of the same type
460        @param authors: list of Person objects with the author data
461        '''
462        logging.debug('Adding authors data to Atom')
463        isFirstAuthor = {}
464        authorArray = None
465        for author in authors:
466            # NB, we're only allowed one atom author
467            if author.type == Person.AUTHOR_TYPE:
468                self.author = author
469                   
470                if isFirstAuthor.has_key(author.type):
471                    raise AtomError("Error: an atom can only have one author specified")
472                isFirstAuthor[author.type] = 1
473                continue
474            elif author.type == Person.CONTRIBUTOR_TYPE:
475                authorArray = self.contributors
476            elif author.type == Person.RESPONSIBLE_PARTY_TYPE:
477                authorArray = self.ME.responsibleParties
478               
479            # check if this is the first addition - if so, clear out the
480            # array in advance
481            if not isFirstAuthor.has_key(author.type):
482                logging.debug("Clearing out author array")
483                # NB, need to be careful to clear the array, not create a ref
484                # to a new array
485                del authorArray[:]
486                isFirstAuthor[author.type] = 1
487
488            if author.hasValue() and author not in authorArray:
489                logging.debug("Adding author (type:'%s', name:'%s', uri:'%s', role:'%s')" \
490                              %(author.type, author.name, author.uri, author.role))
491                authorArray.append(author)
492
493        logging.debug('Finished adding authors data')
494
495
496    def _isNewParameter(self, param):
497        '''
498        Check if a parameter is already specified in the atom, return False if
499        so, otherwise return True
500        '''
501        for p in self.parameters:
502            if p.term == param.term and \
503                p.scheme == param.scheme and \
504                p.label == param.label:
505                return False
506        return True
507
508
509    def addRelatedLinks(self, linkVals):
510        '''
511        Add related links in string format - converting to Link objects
512        NB, only add the link if it is unique
513       
514        @param linkVals: string of format, 'uri | title | vocabServerURL'
515        '''
516        link = self.objectify(linkVals, 'relatedLinks')
517        if link not in self.relatedLinks:
518            self.relatedLinks.append(link)
519
520
521    def addParameters(self, params):
522        '''
523        Add a parameter to list - ensuring it is unique and has been formatted and tidied appropriately
524        @params param: parameter, as string array, to add to atom parameters collection
525        '''
526        # avoid strings being parsed character by character
527        if type(params) is str:
528            params = [params]
529           
530        for param in params:
531            # firstly tidy parameter
532            param = utilities.tidyUpParameters(param)
533            category = Category()
534            # NB, data already tidied up here, so set keyword to avoid this happening again
535            category.fromString(param, escapeSpecialCharacters=True)
536
537            # now check for uniqueness
538            if self._isNewParameter(category):
539                logging.debug("Adding new parameter: %s" %param)
540                self.parameters.append(category)
541   
542   
543    def _linksToXML(self, root):
544        '''
545        Add required links to the input element
546        @param root: element to add links to - NB, should be the root element of the atom
547        '''
548        selfLink = ET.SubElement(root, "link")
549        selfLink.attrib["href"] = self.atomBrowseURL
550        selfLink.attrib["rel"] = "self"
551       
552        for relatedLink in self.relatedLinks:
553            if relatedLink.hasValue():
554                root.append(relatedLink.toXML())
555   
556    def toXML(self):
557        '''
558        Convert the atom into XML representation and return this
559        @return: xml version of atom
560        '''
561        logging.info("Creating formatted XML version of Atom")
562        root = ET.Element("entry")
563        root.attrib["xmlns"] = "http://www.w3.org/2005/Atom"
564        root.attrib["xmlns:moles"] = "http://ndg.nerc.ac.uk/schema/moles2beta"
565        root.attrib["xmlns:georss"] = "http://www.georss.org/georss/10"
566        root.attrib["xmlns:gml"] = "http://www.opengis.net/gml"
567        id = ET.SubElement(root, "id")
568        id.text = self.atomID
569        title = ET.SubElement(root, "title")
570        title.text = self.title
571        self._linksToXML(root)
572
573        if self.author and self.author.hasValue():
574            root.append(self.author.toXML())
575           
576        for contributor in self.contributors:
577            root.append(contributor.toXML())
578
579        # add parameters data
580        for param in self.parameters:
581            if param.hasValue():
582                root.append(param.toXML())
583
584        # add the type and subtype data
585        self.__addAtomTypeDataXML(root)
586                   
587        summary = ET.SubElement(root, "summary")
588        summary.text = self.Summary
589                   
590        # add link to content, if required - NB, can only have one content element in atom
591        # - and this is mandatory
592        content = ET.SubElement(root, "content")
593        contentFile = self.contentFile or self.csmlFile or self.cdmlFile
594        if contentFile:
595            content.attrib["type"] = "application/xml"
596            content.attrib["src"] = contentFile
597        else:
598            content.attrib["type"] = "xhtml"
599            div = ET.SubElement(content, 'div')
600            div.attrib["xmlns"] = "http://www.w3.org/1999/xhtml"
601            div.text = self.Content
602       
603        # if there's a published date already defined, assume we're doing an update now
604        # NB, update element is mandatory
605        currentDate = datetime.datetime.today().strftime("%Y-%m-%dT%H:%M:%SZ")
606        if not self.publishedDate:
607            self.publishedDate = currentDate
608
609        updated = ET.SubElement(root, "updated")
610        if not self.updatedDate:
611            self.updatedDate = currentDate
612        updated.text = self.updatedDate
613
614        published = ET.SubElement(root, "published")
615        published.text = self.publishedDate
616
617        # add the moles entity section, if it is required
618        if self.ME:
619            root.append(self.ME.toXML())
620
621        # add temporal range data, if available
622        temporalRange = ET.SubElement(root, "moles:temporalRange")
623        if self.t1:
624            temporalRange.text = self.t1
625            if self.t2:
626                temporalRange.text += "/" + self.t2
627
628        # add spatial range data, if available
629        self._addSpatialData(root)
630
631        tree = ET.ElementTree(root)
632        logging.info("XML version of Atom created")
633        return tree
634
635
636    def __getSummary(self):
637        logging.debug("Getting summary data")
638        summaryString = ""
639        for summary_line in self.summary:
640            summaryString += summary_line + "\n"
641
642        return summaryString
643
644    def __setSummary(self, summary):
645        logging.debug("Adding summary data")
646        self.summary = []
647        for summary_line in summary.split('\n'):
648            self.summary.append(utilities.escapeSpecialCharacters(summary_line))
649           
650    Summary = property(fset=__setSummary, fget=__getSummary, doc="Atom summary")
651
652
653    def __getContent(self):
654        logging.debug("Getting content data")
655        contentString = ""
656        # NB, there must be content specified in an atom
657        if not self.content:
658            return "Metadata document"
659       
660        for content_line in self.content:
661            contentString += content_line + "\n"
662
663        return contentString
664
665    def __setContent(self, content):
666        logging.debug("Adding content data")
667        self.content = []
668        for content_line in content.split('\n'):
669            self.content.append(content_line)
670           
671    Content = property(fset=__setContent, fget=__getContent, doc="Atom content")
672
673           
674    def fromString(self, xmlString):
675        '''
676        Initialise Atom object using an xmlString
677        @param xmlString: representation of atom as an XML string
678        '''
679        logging.info("Ingesting data from XML string")
680       
681        # firstly, remove any namespaces used - to avoid problems with elementtree
682        logging.debug("Stripping moles namespace from string to allow easy handling with elementtree")
683        xmlString = xmlString.replace('moles:', '')
684        xmlString = xmlString.replace('georss:', '')
685        xmlString = xmlString.replace('gml:', '')
686        xmlString = xmlString.replace('xmlns="http://www.w3.org/2005/Atom"', '')
687        xmlString = xmlString.replace('default:', '')
688        xmlString = xmlString.replace('xs:', '')
689
690        # now create elementtree with the XML string
691        logging.debug("Create elementtree instance with XML string")
692        tree = ET.fromstring(xmlString)
693       
694        title = tree.findtext('title')
695        if title:
696            logging.debug("Adding title data")
697            self.title = title
698
699        summary = tree.findtext('summary')
700        if summary:
701            self.Summary = summary#.decode('unicode_escape')
702
703        authorElement = tree.find('author')
704        if authorElement:
705            logging.debug("Adding author data")
706            author = Person()
707            author.fromETElement(authorElement)
708            self.author = author
709
710        contributorElements = tree.findall('contributor')
711        for contributorElement in contributorElements:
712            logging.debug("Adding contributor data")
713            contributor = Person(personType = Person.CONTRIBUTOR_TYPE)
714            contributor.fromETElement(contributorElement)
715            self.contributors.append(contributor)
716
717        molesElement = tree.find('entity')
718        if molesElement:
719            self.ME.fromET(molesElement)
720               
721        self.atomID = tree.findtext('id')
722
723        self._parseCategoryData(tree.findall('category'))
724
725        self._parseLinksData(tree.findall('link'))
726           
727        contentTag = tree.find('content')
728        if contentTag != None:
729            logging.debug("Found content tag - checking for CSML/CDML file data")
730            file = contentTag.attrib.get('src')
731            if file:
732                # NB, the path will reveal more reliably whether we're dealing with CSML and CDML files
733                if file.upper().find('CSML') > -1:
734                    logging.debug("Adding CSML file data")
735                    self.csmlFile = file
736                elif file.upper().find('CDML') > -1:
737                    logging.debug("Adding CDML file data")
738                    self.cdmlFile = file
739                self.contentFile = file
740            else:
741                logging.debug("No file data - adding contents of element instead")
742                div = contentTag.find('{http://www.w3.org/1999/xhtml}div')
743                self.Content = div.text
744       
745        range = tree.findtext('temporalRange')
746        if range:
747            logging.debug("Adding temporal range data")
748            timeData = range.split('/')
749            self.t1 = timeData[0]
750            if len(timeData) > 1:
751                self.t2 = timeData[1]
752       
753        # NB, this parser won't mind if we're dealing with Envelope or EnvelopeWithTimePeriod
754        minBBox = tree.findall('.//lowerCorner')
755        if minBBox:
756            logging.debug("Adding min spatial range data")
757            minBBox = minBBox[0]
758            spatialData = minBBox.text.split()
759            self.minX = spatialData[0]
760            if len(spatialData) > 1:
761                self.minY = spatialData[1]
762       
763        maxBBox = tree.findall('.//upperCorner')
764        if maxBBox:
765            maxBBox = maxBBox[0]
766            logging.debug("Adding max spatial range data")
767            spatialData = maxBBox.text.split()
768            self.maxX = spatialData[0]
769            if len(spatialData) > 1:
770                self.maxY = spatialData[1]
771               
772        publishedDate = tree.findtext('published')
773        if publishedDate:
774            logging.debug("Adding published date")
775            self.publishedDate = publishedDate
776               
777        updatedDate = tree.findtext('updated')
778        if updatedDate:
779            logging.debug("Adding updated date")
780            self.updatedDate = updatedDate
781           
782        logging.info("Completed data ingest")
783   
784   
785    def _parseCategoryData(self, categories):
786        logging.debug("Adding category/parameters data")
787        for category in categories:
788            cat = Category()
789            cat.fromETElement(category)
790           
791            if cat.term == self.ATOM_TYPE:
792                logging.debug("Found atom type data")
793                self.atomTypeID = cat.label
794                self.atomTypeName = self.VTD.TERM_DATA[cat.label].title
795                continue
796            elif cat.term == self.ATOM_SUBTYPE:
797                logging.debug("Found atom subtype data")
798                self.subtypeID = cat.label
799                self.subtype = cat.scheme
800                continue
801
802            self.parameters.append(cat)
803   
804
805    def setDatasetID(self, datasetID):
806        '''
807        Set the dataset ID for the atom - and generate an appropriate atom name using this
808        @param datasetID: ID to set for the atom
809        '''
810        self.datasetID = datasetID
811        self._generateAtomName(datasetID) 
812        self.atomID = self.createAtomID(datasetID)
813
814
815    def createAtomID(self, datasetID):
816        '''
817        Create a unique ID, conforming to atom standards, for atom
818        NB, see http://diveintomark.org/archives/2004/05/28/howto-atom-id
819        @param datasetID: ID of atom's dataset
820        @return: unique ID
821        '''
822        logging.info("Creating unique ID for atom")
823        if not self.atomBrowseURL:
824            self._generateAtomName(datasetID)
825        urlBit = self.atomBrowseURL.split('://')[1]
826        urlBit = urlBit.replace('#', '')
827        urlBits = urlBit.split('/')
828        host = urlBits[0].split(':')[0] # avoid the port colon - as this breaks the ID format
829        dateBit = datetime.datetime.today().strftime("%Y-%m-%d")
830       
831        id = "tag:" + host + "," + dateBit + ":/" + "/".join(urlBits[1:])
832        logging.info("- unique ID created for atom")
833        logging.debug(" - '%s'" %id)
834        return id
835       
836       
837    def _generateAtomName(self, datasetID):
838        '''
839        Generate a consistent name for the atom - with full eXist doc path
840        @param datasetID: ID of atom's dataset
841        '''
842        self.atomName = datasetID + ".atom"
843        if not self.ME.providerID:
844            raise ValueError("Provider ID has not been specified for atom - please add this and retry")
845        self.ndgURI = self.ME.providerID + "__ATOM__" + datasetID
846        self.atomBrowseURL = VTD.BROWSE_ROOT_URL + self.ndgURI
847
848
849    def _parseLinksData(self, links):
850        '''
851        Extract links and atom data from array of link elements in the XML representation of the atom
852        @param links: an array of <link> elements
853        '''
854        # firstly, get all data to start with, so we can properly process it afterwards
855        linkData = {}
856        logging.debug("Getting link data")
857        for linkTag in links:
858            link = Link()
859            link.fromETElement(linkTag)
860
861            if not linkData.has_key(link.rel):
862                linkData[link.rel] = []
863           
864            linkData[link.rel].append(link)
865
866        # there should be one self referencing link - which will provide info on the atom itself
867        if not linkData.has_key('self'):
868            errorMessage = "Atom does not have self referencing link - " + \
869                "cannot ascertain datasetID without this - please fix"
870            logging.error(errorMessage)
871            raise ValueError(errorMessage)
872       
873        # this is the link describing the atom itself
874        self.atomBrowseURL = linkData['self'][0].href
875       
876        self.datasetID = self.atomBrowseURL.split("__ATOM__")[-1]
877        self.atomName = self.datasetID + ".atom"
878        self.ndgURI = self.atomBrowseURL.split(VTD.BROWSE_ROOT_URL)[1]
879       
880        # now remove this value and the associated moles doc link
881        del linkData['self']
882        molesDoc = self.atomBrowseURL.replace('ATOM', 'NDG-B1')
883        if linkData.has_key('related'):
884            relatedLinks = []
885            for link in linkData['related']:
886                if link.href != molesDoc:
887                    relatedLinks.append(link)
888           
889            linkData['related'] = relatedLinks
890               
891        # now add the remaining links to the atom
892        for key in linkData:
893            for link in linkData[key]:
894                logging.debug("Adding link data")
895                self.relatedLinks.append(link)
896       
897
898    def _addSpatialData(self, element):
899        '''
900        Add spatial coverage element to an input element
901        @param element: element to add coverage data to
902        '''
903        logging.info("Adding spatial data to Atom")
904        if not self.minX:
905            logging.info("No spatial data specified")
906            return
907        bbox = ET.SubElement(element, "georss:where")
908        envelope = ET.SubElement(bbox, "gml:Envelope")
909        lc = ET.SubElement(envelope, "gml:lowerCorner")
910        lc.text = str(self.minX) + " " + str(self.minY)
911        uc = ET.SubElement(envelope, "gml:upperCorner")
912        uc.text = str(self.maxX) + " " + str(self.maxY)
913
914       
915    def setAttribute(self, attributeName, attributeValue):
916        '''
917        Set the value of an atom attribute - and do some basic tidying up of the string content
918        - to escape any XML unfriendly characters
919        @param attributeName: name of the attribute whose value to set
920        @param attributeValue: value to set the attribute to 
921        '''
922        logging.debug("Setting attribute, %s, to %s" %(attributeName, attributeValue))
923        origValue = attributeValue
924       
925        # escape any special characters if a value has been specified
926        # NB, need to cope with both single values and arrays
927        if attributeValue:
928            if type(attributeValue) is list:
929                newVals = []
930                for val in attributeValue:
931                    newVals.append(self.objectify(utilities.escapeSpecialCharacters(val), attributeName))
932                attributeValue = newVals
933                   
934            else:
935                attributeValue = self.objectify(utilities.escapeSpecialCharacters(attributeValue), attributeName)
936
937        # handle the special case of authors; only one author is allowed per atom
938        # - the others should be treated as contributors
939        if attributeName == "authors":
940            setattr(self, "author", attributeValue[0])
941            if len(attributeValue) > 1:
942                setattr(self, "contributors", attributeValue[1:])
943        elif attributeName == "atomAuthors":
944            if isinstance(attributeValue, list):
945                for val in attributeValue:
946                    self.ME.responsibleParties.append(val)
947            else:
948                self.ME.responsibleParties.append(attributeValue)
949        elif attributeName == "files":
950            self.addUniqueRelatedLinks(attributeValue)
951        else:
952            setattr(self, attributeName, attributeValue)
953
954
955    def objectify(self, objectVals, attributeName):
956        '''
957        Some inputs are specified as strings but need to be converted into
958        objects - do this here
959        @param objectVals: a '|' delimited string of values
960        @param attributeName: name of attribute the values belong to
961        '''
962        obj = None
963        if type(objectVals) != str:
964            return objectVals
965       
966        if attributeName == "relatedLinks":
967            obj = Link()
968        elif attributeName == "atomAuthors" or attributeName == "authors":
969            # NB, ensure there is only one author tag - extra authors are contributors
970            authorType = Person.AUTHOR_TYPE
971            if self.author and self.author.hasValue():
972                authorType= Person.CONTRIBUTOR_TYPE
973            obj = Person(personType = authorType)
974        elif attributeName == 'files':
975            obj = Link()
976            objectVals = '%s|%s|%s' \
977                %(self.VTD.getTermCurrentVocabURL(VTD.METADATA_SOURCE_TERM), objectVals, VTD.METADATA_SOURCE_TERM)
978
979        if obj:
980            obj.fromString(objectVals)
981            # NB, need to set it now, just in case we don't set it before coming back
982            if attributeName == "authors" and (not self.author or not self.author.hasValue()):
983                self.author = obj
984            return obj
985       
986        return objectVals
987
988
989    def toPrettyXML(self):
990        '''
991        Returns nicely formatted XML as string
992        '''
993        atomXML = self.toXML()
994
995        # create the string
996        logging.debug("Converting the elementtree object into a string")
997        prettyXML = et2text(atomXML.getroot())
998
999        # add XML version tag
1000        prettyXML = "<?xml version=\"1.0\" encoding=\"utf-8\" ?>\n" + prettyXML
1001        logging.info("Created formatted version of XML object")
1002        return prettyXML
1003
1004
1005    def getLinksOfType(self, termID):
1006        '''
1007        Returns links in the atom related links attribute which match the specified
1008        term ID
1009        @param termID: the termID to look for in the related links - NB, this is
1010        matched to the end of the link.rel value
1011        @return links: array of Link objects with matching term type
1012        '''
1013        logging.debug("Getting atom links of type, '%s'" %termID)
1014        matchingLinks = []
1015        for link in self.relatedLinks:
1016            # firstly, handle special case where we only want the online ref type links
1017            # returned
1018            if termID == self.ONLINE_REF_LABEL:
1019                if not link.isChildAtom():
1020                    logging.debug("- found link with matching term type")
1021                    matchingLinks.append(link)
1022               
1023            elif link and link.rel and link.rel.lower().endswith(termID.lower()):
1024                logging.debug("- found link with matching term type")
1025                matchingLinks.append(link)
1026               
1027        logging.debug("Returning matched links")
1028        return matchingLinks
1029       
1030       
1031    def getLogos(self):
1032        '''
1033        Return related links that are logos
1034        @return: array of Links containing the logos for the atom
1035        '''
1036        logos = []
1037        for link in self.relatedLinks:
1038            if link.rel.lower().endswith(VTD.LOGO_TERM.lower()):
1039                logos.append(link)
1040               
1041        return logos
1042   
1043   
1044    def isGranule(self):
1045        if self.atomTypeID == VTD.GRANULE_TERM:
1046            return True
1047        return False
1048   
1049   
1050    def isDE(self):
1051        if self.atomTypeID == VTD.DE_TERM:
1052            return True
1053        return False
1054   
1055    def isDeployment(self):
1056        if self.subtypeID and self.subtypeID == VTD.DEPLOYMENT_TERM:
1057            return True
1058        return False
1059   
1060    def isDeployable(self):
1061        if (self.atomTypeID == VTD.ACTIVITY_TERM and self.subtypeID != VTD.DEPLOYMENT_TERM) or \
1062            self.atomTypeID == VTD.DPT_TERM or \
1063            self.atomTypeID == VTD.OBS_TERM:
1064            return True
1065        return False
1066       
1067       
1068    def addCSMLData(self, csmlName, csmlContent, aggregateCoverage=False, useCSMLID=False):
1069        '''
1070        Parse CSML data and add extracted info to the atom
1071        @param csmlName: name of the csml file
1072        @param csmlContent: content of the csml file - NB, if this is set to None and the
1073        file, csmlName, is available locally, CsmlParser.Dataset will read in the file
1074        directly
1075        @keyword aggregateCoverage: if set to True, only coverage data that extends the
1076        atom coverage data will be added
1077        @keyword useCSMLID: if True, use the CSML doc ID as the dataset ID - NB,
1078        this should only be True if creating a new atom - e.g. from a granulite
1079        @return csmlDoc: the CsmlParser.Dataset object with the csml data in
1080        '''
1081        logging.info("Creating CSML data model")
1082        self.csmlFile = csmlName
1083        self.contentFile = csmlName
1084        content = csmlContent or csmlName
1085   
1086        csmlDoc = CsmlParser.Dataset(file=content)
1087       
1088        logging.info("Extracting info from CSML file")
1089        logging.debug("Got dataset ID: %s" %csmlDoc.id)
1090        if useCSMLID:
1091            logging.debug(" - using this ID for the atom")
1092            self.setDatasetID(VTD.GRANULE_TERM + '_' + csmlDoc.id)
1093       
1094        title = csmlDoc.name.CONTENT
1095        logging.debug("Got dataset name (title): '%s'" %title)
1096        # NB, if a title is specified (and not as the default value), it automatically is used in
1097        # place of anything in the granulite file
1098        if title and title != "NAME OF DATASET GOES HERE":
1099            logging.info("Title, '%s', extracted from CSML file" %title)
1100            if self.title:
1101                logging.info("- NB, this will override the title specified in the granulite file ('%s')" \
1102                             %self.title)
1103            self.title = title
1104               
1105        bbox1 = csmlDoc.getBoundingBox()
1106        bbox2 = csmlDoc.getCSMLBoundingBox()
1107
1108        time = None
1109        if bbox2:
1110            time = bbox2.getTimeLimits()
1111   
1112        # now check for other parameters to add to granule
1113        # Firstly, extract the bounding envelope
1114        if bbox1:
1115            w, e = utilities.normaliseLongitude(bbox1[0],bbox1[2])
1116            n, s = (bbox1[3], bbox1[1])
1117   
1118            if not aggregateCoverage or (not self.maxY or float(n) > float(self.maxY)):
1119                self.maxY = n
1120               
1121            if not aggregateCoverage or (not self.minY or float(s) < float(self.minY)):
1122                self.minY = s
1123           
1124            if not aggregateCoverage or (not self.minX or float(w) < float(self.minX)):
1125                self.minX = w
1126   
1127            if not aggregateCoverage or (not self.maxX or float(e) > float(self.maxX)):
1128                self.maxX = e
1129           
1130            logging.debug("Got bounding box data from file: (%s, %s) , (%s, %s)" \
1131                          %(w, s, e, n))
1132           
1133            logging.debug("Updated atom bounding box data: (%s, %s) , (%s, %s)" \
1134                          %(self.minX, self.minY, self.maxX, self.maxY))
1135        else:
1136            logging.debug("No valid bounding box data found")
1137   
1138        if time:
1139            t1 = utilities.formatDateYYYYMMDD(time[0])
1140            if not aggregateCoverage or \
1141                (not self.t1 or datetime.datetime.strptime(t1, YEAR_FORMAT) < \
1142                    datetime.datetime.strptime(self.t1, YEAR_FORMAT)):
1143                self.t1 = t1
1144   
1145            t2 = time[1]
1146            if t2 and t2 != 'None':
1147                t2 = utilities.formatDateYYYYMMDD(t2)
1148                if not aggregateCoverage or \
1149                    (not self.t2 or datetime.datetime.strptime(t2, YEAR_FORMAT) > \
1150                        datetime.datetime.strptime(self.t2, YEAR_FORMAT)):
1151                    self.t2 = t2
1152           
1153            logging.debug("Got time range: %s -> %s" %(self.t1, self.t2))
1154        else:
1155            logging.debug("No valid time range data found")
1156   
1157        #create parameter summaries:
1158        #set up list to hold the parameters data
1159        parameters = []
1160        for feature in csmlDoc.featureCollection.featureMembers:
1161            if hasattr(feature.parameter, 'href'):
1162                paramTriple = ""
1163                if hasattr(feature, 'description'):
1164                    paramTriple = feature.description.CONTENT
1165                    paramTriple += " | " + feature.parameter.href
1166                   
1167                    term = ""
1168                    if hasattr(feature, 'name'):
1169                        term = feature.name.CONTENT
1170   
1171                    paramTriple += " | " + term
1172                   
1173                    logging.debug("Got parameter info: %s" %paramTriple)
1174                    parameters.append(paramTriple)
1175       
1176        # update the atom with the extracted parameters
1177        logging.info("Adding CSML parameters to granule atom")
1178        self.addParameters(parameters)
1179        logging.info("Finished adding CSML data")
1180        return csmlDoc
1181
1182
1183    def lookupAssociatedData(self, type, dr, lookupIndirectReferences=False):
1184        '''
1185        Check through the atom links and retrieve any associated data of the
1186        specified type
1187        @param type: type of associated data to lookup - currently VTD.DEPLOYMENT_TERM
1188        or VTD.DE_TERM
1189        @param dr: Instance of DocumentRetrieve object - NB, this requires eXist
1190        config details which are not available to the Atom object
1191        @keyword lookupIndirectReferences: if True, the atom ID is used to search
1192        defined deployments to find those which reference it, otherwise only
1193        deployments data featured in the atom related links are processed
1194        '''
1195        logging.info("Looking up %s info" %type)
1196       
1197        self.allActivities = []
1198        self.allObs = []
1199        self.allDpts = []
1200
1201        if type != VTD.DE_TERM and type != VTD.DEPLOYMENT_TERM:
1202            raise ValueError('Unrecognised associated data type: %s' %type)
1203       
1204        # avoid duplicating lookup effort
1205        if (type == VTD.DEPLOYMENT_TERM and self.deployments) or \
1206            (type == VTD.DE_TERM and self.dataEntities):
1207            logging.info("- this info has already been looked up - returning")
1208            return
1209
1210        # firstly, collect all the references to the info required
1211        if lookupIndirectReferences:
1212            logging.info("Looking up indirect references")
1213           
1214            # if we're looking up DE data for deployments data, need to have the
1215            # deployments info looked up first
1216            if type == VTD.DE_TERM and self.isDeployable() and not self.deployments:
1217                self.lookupAssociatedData(VTD.DEPLOYMENT_TERM, dr, lookupIndirectReferences)
1218           
1219            logging.info("Looking up references to this atom from other %s" %type)
1220           
1221            # NB, if we're looking up deployments info, we only look up references
1222            # to this atom - if we're looking up DEs, we need to look up references
1223            # to the deployments referenced by this atom
1224            urls = [self.atomBrowseURL]
1225           
1226            if type == VTD.DE_TERM and self.isDeployable():
1227                urls = []
1228                for dep in self.deployments:
1229                    urls.append(dep.browseURL)
1230                   
1231            links = []
1232            for url in urls:
1233                doc = dr.get(type, dr.ATOM_TYPE, url, \
1234                             targetCollection = eXistConnector.BASE_COLLECTION_PATH)
1235                # now need to turn this results set into actual atoms
1236                tree = ET.fromstring(doc)
1237                for atom in tree:
1238                    logging.debug("- found reference in %s" %type)
1239                    links.append(ET.tostring(atom))
1240                   
1241            logging.info("Finished looking up indirect references")
1242        else:
1243            links = self.getLinksOfType(self.VTD.DEPLOYMENT_TERM)
1244
1245        # now retrieve the references and extract the required data
1246        logging.info("Retrieving info from %s references" %type)
1247        if type == VTD.DEPLOYMENT_TERM:
1248            self.deployments = []
1249            for link in links:
1250                if lookupIndirectReferences:
1251                    deploymentAtom = link
1252                else:
1253                    localID = link.href.split("__ATOM__")[-1]
1254                    deploymentAtom = dr.get(self.ME.providerID, 'ATOM', localID, \
1255                                            targetCollection = eXistConnector.BASE_COLLECTION_PATH)
1256   
1257                deployment = Deployment.Deployment(Atom(xmlString=str(deploymentAtom)))
1258                self.deployments.append(deployment)
1259               
1260                self.addUniqueLinks(self.allActivities, deployment.activities)
1261                self.addUniqueLinks(self.allObs, deployment.obs)
1262                self.addUniqueLinks(self.allDpts, deployment.dpts)
1263        else:
1264            # for DE data, just store the title + link in a Link object
1265            self.dataEntities = []
1266            for data in links:
1267                atom = Atom(xmlString=str(data))
1268                link = Link()
1269                link.title = atom.title
1270                link.href = atom.atomBrowseURL
1271               
1272                # NB, different deployments may be used by the same DE - so
1273                # avoid duplication
1274                self.addUniqueLinks(self.dataEntities, link)
1275           
1276        logging.info("Finished looking up %s info" %type)
1277
1278
1279    def addUniqueLinks(self, dataArray, links):
1280        '''
1281        Add links to specified array - if they are not already included
1282        @param dataArray: a list, potentially arlready containing links
1283        @param links: a Link or array of Links to add to the dataArray
1284        '''
1285        logging.debug("Adding new links")
1286        if not links:
1287            return
1288       
1289        if type(links) is not list:
1290            links = [links]
1291       
1292        for link in links:
1293            if type(link) is not Link:
1294                logging.warning("Link is not of 'Link' object type (type='%s') - skipping" %type(link))
1295                continue
1296            if link not in dataArray:
1297                logging.debug("- adding unique link")
1298                dataArray.append(link)
1299        logging.debug("Finished adding links")
Note: See TracBrowser for help on using the repository browser.