source: exist/trunk/python/ndgUtils/models/Atom.py @ 4592

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/exist/trunk/python/ndgUtils/models/Atom.py@4592
Revision 4592, 48.3 KB checked in by cbyrom, 11 years ago (diff)

Extend the Author class with a hasValue method and implement usage
throughout - NB, treat the main author more specifically - not
including an initial dummy value so that schema validation will fail if
not specified. Fix atom id format - remove ':' from hostname since
this is a delimiter used by the id. Adjust 'content' content to
make schema compliant - xhtml content must be wrapped in div tags.
Adjust schemas accordingly - and update these to use sequences rather
than choices - to properly enforce checking for required elements (at
the marginal expense of enforcing an order on the atom contents).

Line 
1'''
2 Class representing data in  atom format - allowing moles data to be stored and accessed in a web feed compatible way
3 
4 @author: C Byrom, Tessella Jun 2008
5'''
6try: #python 2.5
7    from xml.etree import cElementTree as ET
8except ImportError:
9    try:
10        # if you've installed it yourself it comes this way
11        import cElementTree as ET
12    except ImportError:
13        # if you've egged it this is the way it comes
14        from ndgUtils.elementtree import cElementTree as ET
15import sys, logging, re, datetime
16from ndgUtils.eXistConnector import eXistConnector
17from ndgUtils.ETxmlView import et2text
18import ndgUtils.lib.utilities as utilities
19from ndgUtils.vocabtermdata import VocabTermData as VTD
20from ndgUtils.models import MolesEntity as ME
21import csml.parser as CsmlParser
22from ndgUtils.models import Deployment as Deployment
23
24
class AtomError(Exception):
    """
    Exception raised for problems encountered when handling atoms.

    NB, the message is written to the error log as soon as the exception is
    created - i.e. regardless of whether it is subsequently caught.
    """
    def __init__(self, msg):
        # log first, then let the base class store the message
        logging.error(msg)
        super(AtomError, self).__init__(msg)
32
33
class Person(object):
    '''
    Class representing atom author type data - with name, uri and role attributes

    The person type (author, contributor or responsible party) selects the
    element name used in the XML output; responsible parties are always
    written in the 'moles' namespace with a 'uri' tag instead of 'email'.
    '''
    AUTHOR_TYPE = 0
    CONTRIBUTOR_TYPE = 1
    RESPONSIBLE_PARTY_TYPE = 2
    # element names used in the XML output - indexed by the type constants above
    ELEMENT_NAMES = ["author", "contributor", "responsibleParty"]

    def __init__(self, personType = AUTHOR_TYPE, namespace = None):
        '''
        @keyword personType: one of the *_TYPE class constants (default AUTHOR_TYPE)
        @keyword namespace: namespace prefix to use for the XML element names
        '''
        self.type = personType
        self.ns = namespace
        self.name = ""
        self.uri = ""
        self.role = ""

        # NB, the atom format specifies slightly different data contents
        self.uriTagName = "email"
        # NB, responsible party data is always stored in the moles section
        if self.type == self.RESPONSIBLE_PARTY_TYPE:
            self.ns = 'moles'
            self.uriTagName = "uri"

    def __str__(self):
        '''
        @return: 'name | uri | role' triple string - or an empty string if no
        values are set
        '''
        if self.hasValue():
            return self.name + " | " + self.uri + " | " + self.role
        return ""

    def hasValue(self):
        '''
        @return: True if any of name, uri or role is set; False otherwise
        '''
        return bool(self.name or self.uri or self.role)

    def fromString(self, personString):
        '''
        Set the name, uri and role from a triple string
        @param personString: triple string of format, 'name | uri | role'
        '''
        (self.name, self.uri, self.role) = utilities.getTripleData(personString)

    def fromETElement(self, personTag):
        '''
        Set the name, role and uri from the child elements of an elementtree element
        @param personTag: element with 'name', 'role' and uri/email sub-elements
        '''
        self.name = personTag.findtext('name') or ""
        self.role = personTag.findtext('role') or ""
        self.uri = personTag.findtext(self.uriTagName) or ""
        logging.debug("Added name: '%s', role: '%s', %s: '%s'" \
                      %(self.name, self.role, self.uriTagName, self.uri))

    def toXML(self):
        '''
        @return: elementtree element representing the person; sub-elements are
        only included for values that are set
        '''
        prefix = ""
        if self.ns:
            prefix = self.ns + ':'

        author = ET.Element(prefix + self.ELEMENT_NAMES[self.type])

        if self.name:
            name = ET.SubElement(author, prefix + "name")
            name.text = self.name

        if self.uri:
            uri = ET.SubElement(author, prefix + self.uriTagName)
            uri.text = self.uri

        if self.role:
            role = ET.SubElement(author, prefix + "role")
            role.text = self.role

        return author

    def __eq__(self, person1):
        '''
        Value based equality - to allow proper object comparison when checking
        if Person objects are in an array already - i.e. if person in personArray...
        NB, __cmp__ alone is not sufficient here since it is ignored by python 3
        '''
        if self is person1:
            return True
        try:
            return self.uri == person1.uri and self.name == person1.name and \
                self.role == person1.role and self.type == person1.type
        except AttributeError:
            # not a person-like object (e.g. None) - so cannot be equal
            return False

    def __ne__(self, person1):
        return not self.__eq__(person1)

    # NB, Person objects are mutable, so keep hashing by identity - this is
    # what applied when only __cmp__ was defined; do not rely on equal Person
    # objects being merged in sets or dict keys
    __hash__ = object.__hash__

    def __cmp__(self, person1):
        '''
        Override comparison (python 2 only) - retained for backward
        compatibility; returns 0 for equal objects, -1 if the other object
        is empty and 1 otherwise
        '''
        if not person1:
            return -1

        if self.__eq__(person1):
            return 0
        return 1
113
114
class Link(object):
    '''
    Class representing an atom link - with href, title and rel attributes
    '''

    def __init__(self):
        self.href = ""
        self.title = ""
        self.rel = ""

    def fromString(self, linkString):
        '''
        Set the href, title and rel from a triple string
        @param linkString: triple string of format, 'href | title | rel'
        '''
        (self.href, self.title, self.rel) = utilities.getTripleData(linkString)

    def fromETElement(self, linkTag):
        '''
        Set the href, rel and title from the attributes of an elementtree element
        @param linkTag: <link> element to read the attributes from
        '''
        self.href = linkTag.attrib.get('href') or ""
        self.rel = linkTag.attrib.get('rel') or ""
        self.title = linkTag.attrib.get('title') or ""

    def toXML(self):
        '''
        @return: elementtree <link> element with href, title and rel attributes
        '''
        link = ET.Element("link")
        link.attrib["href"] = self.href
        link.attrib["title"] = self.title
        link.attrib["rel"] = self.rel
        return link

    def hasValue(self):
        '''
        @return: True if the link has an href or a title; False otherwise
        '''
        # NB, just a rel on its own is meaningless - so ignore
        return bool(self.href or self.title)

    def __str__(self):
        '''
        @return: 'href | title | rel' triple string - or an empty string if no
        values are set
        '''
        if self.href or self.title or self.rel:
            return self.href + " | " + self.title + " | " + self.rel
        return ""

    def isChildAtom(self):
        '''
        Determines whether the link refers to another atom - e.g. a link to
        a data granule
        @return True, if so; False otherwise
        '''
        if self.rel.endswith(VTD.GRANULE_TERM) or \
            self.rel.endswith(VTD.DEPLOYMENT_TERM) or \
            self.rel.endswith(VTD.ACTIVITY_TERM) or \
            self.rel.endswith(VTD.DPT_TERM) or \
            self.rel.endswith(VTD.OBS_TERM):
            return True

        return False

    def __eq__(self, link1):
        '''
        Value based equality - to allow proper object comparison when checking
        if Link objects are in an array already - i.e. if link in linkArray...
        NB, __cmp__ alone is not sufficient here since it is ignored by python 3
        '''
        if self is link1:
            return True
        try:
            return self.href == link1.href and self.title == link1.title and \
                self.rel == link1.rel
        except AttributeError:
            # not a link-like object (e.g. None) - so cannot be equal
            return False

    def __ne__(self, link1):
        return not self.__eq__(link1)

    # NB, Link objects are mutable, so keep hashing by identity - this is
    # what applied when only __cmp__ was defined; do not rely on equal Link
    # objects being merged in sets or dict keys
    __hash__ = object.__hash__

    def __cmp__(self, link1):
        '''
        Override comparison (python 2 only) - retained for backward
        compatibility; returns 0 for equal objects, -1 if the other object
        is empty and 1 otherwise
        '''
        if not link1:
            return -1

        if self.__eq__(link1):
            return 0
        return 1
180
181
class Category(object):
    '''
    Class representing an atom category - with term, scheme and label attributes
    '''
    def __init__(self):
        self.term = self.scheme = self.label = ""

    def fromString(self, linkString, escapeSpecialCharacters=True):
        '''
        Populate the category from a triple string of format, 'label | scheme | term'
        @param linkString: triple string to create category with
        @keyword escapeSpecialCharacters: if set to True, special characters in
        triple string are escaped (default)
        '''
        triple = utilities.getTripleData(linkString, \
            doEscape=escapeSpecialCharacters)
        (self.label, self.scheme, self.term) = triple

    def fromETElement(self, linkTag):
        '''
        Populate the category from the attributes of an elementtree element;
        missing attributes are stored as empty strings
        @param linkTag: <category> element to read the attributes from
        '''
        attributes = linkTag.attrib
        self.term = attributes.get('term') or ""
        self.label = attributes.get('label') or ""
        self.scheme = attributes.get('scheme') or ""

    def toXML(self):
        '''
        @return: elementtree <category> element with term, scheme and label attributes
        '''
        element = ET.Element("category")
        element.set("term", self.term)
        element.set("scheme", self.scheme)
        element.set("label", self.label)
        return element

    def hasValue(self):
        '''
        @return: True if any of scheme, label or term is set; False otherwise
        '''
        return bool(self.scheme or self.label or self.term)
217
218
219class Atom(object):
220
221    # labels for use with the atom categories
222    ATOM_TYPE = "ATOM_TYPE"
223    ATOM_SUBTYPE = "ATOM_SUBTYPE"
224
225    # labels for use with the templates to set/extract specific inputs
226    ONLINE_REF_LABEL = "online_ref"
227    PARAMETER_LABEL = "parameter"
228    ATOM_REF_LABEL = "atom_ref"
229    DELIMITER = "---"
230    REMOVE_LABEL = "remove"
231   
232    # format to use for t1-t2 date range
233    YEAR_FORMAT = '%Y-%m-%d'
234
235    def __init__(self, atomType = None, vocabTermData = None, ndgObject = None, \
236                 xmlString = None, state = eXistConnector.WORKING_COLLECTION_PATH, **inputs):
237        '''
238        Constructor - initialise the atom variables
239        '''
240        logging.info("Initialising atom")
241        if atomType:
242            logging.info(" - of type '%s'" %atomType)
243        self.atomTypeID = atomType
244
245        # some data have further subtypes specified
246        self.subtypeID = None # this should be the termID
247        self.subtype = None # and this should be the fully formed vocab URL
248       
249        self.ndgObject = ndgObject
250
251        self.atomName = None
252        self.files = []
253        self.author = None
254        self.contributors = []
255        self.atomAuthors = []
256        self.parameters = []
257        self.spatialData = []
258        self.temporalData = []
259        self.relatedLinks = []
260        self.summary = []
261        self.content = []
262        # NB, this deployments data duplicates other atom data - and is only used for a
263        # convenient way to collect the info (by lookupAssociatedData()) for use in templates
264        self.deployments = []
265        # ditto for the following field
266        self.dataEntities = []
267           
268        self.csmlFile = None
269        self.cdmlFile = None
270        # general variable to use for setting the atom content - NB, if a csmlFile is specified
271        # (either directly or via a cdmlFile specification), this will be the content by default
272        # for this purpose
273        self.contentFile = None     
274        self.title = None
275        self.datasetID = None        # NB, the dataset id ends up in the atomName - <path><datasetID>.atom
276        self.atomID = None
277   
278        # boundary box info - to replace spatial/temporalData?
279        self.minX = None
280        self.minY = None
281        self.maxX = None
282        self.maxY = None
283        self.t1 = None
284        self.t2 = None
285
286        self.ME = ME.MolesEntity()
287       
288        # date when the atom was first ingested
289        self.publishedDate = None
290
291        # last update date
292        self.updatedDate = None
293
294        # assume atom in working state by default - this is used to define what collection
295        # in eXist the atom is stored in
296        self.state = state
297       
298        # additional, non standard atom data can be included in the molesExtra element
299        if vocabTermData:
300            self.VTD = vocabTermData
301        else:
302            self.VTD = VTD()
303       
304        if xmlString:
305            self.fromString(xmlString)
306
307        # if inputs passed in as dict, add these now
308        if inputs:
309            logging.info("Adding info to atom from input dict")
310            logging.debug(inputs)
311            self.__dict__.update(inputs)
312           
313            # NB, this doesn't trigger the Summary Property, so do this
314            # explicitly, if need be
315            if inputs.has_key('Summary'):
316                self.Summary = inputs.get('Summary')
317            if inputs.has_key('Content'):
318                self.Content = inputs.get('Content')
319           
320            # also pass any moles data up to the moles entity object
321            if inputs.get('providerID'):
322                self.ME.providerID = inputs.get('providerID')
323               
324            if inputs.get('abbreviation'):
325                self.ME.abbreviation = inputs.get('abbreviation')
326
327        if self.atomTypeID:
328            self.atomTypeName = self.VTD.TERM_DATA[self.atomTypeID].title
329
330        logging.info("Atom initialised")
331
332
333    def addOnlineReferences(self, links):
334        '''
335        Add online reference data associated with the atom
336        - NB, care needs to be taken here since this data is stored in the atom
337        link elements and these are also used for the various atom associations
338        @param links: a Link or array of Links to add to the relatedLinks attribute
339        '''
340        logging.debug("Adding online references")
341        if not links:
342            return
343       
344        if type(links) is not list:
345            links = [links]
346       
347        # firstly clear out any online refs data from the existing related links
348        newLinks = []
349        for link in self.relatedLinks:
350            if link.isChildAtom():
351                newLinks.append(link)
352       
353        newLinks.extend(links)
354        self.relatedLinks = newLinks
355        logging.debug("Online references added")
356
357
358    def addUniqueRelatedLinks(self, links):
359        '''
360        Add links to relatedLinks array - if they are not already included
361        @param links: a Link or array of Links to add to the relatedLinks attribute
362        '''
363        self.addUniqueLinks(self.relatedLinks, links)
364       
365
366    def removeRelatedLinks(self, linksToDelete):
367        '''
368        Remove any links in the input list from the atom's related links list
369        @param linksToDelete: array of Link objects to remove from atom
370        '''
371        logging.debug("Removing related links from atom")
372        if not linksToDelete:
373            return
374       
375        if type(linksToDelete) is not list:
376            linksToDelete = [linksToDelete]
377       
378        updatedLinks = []
379        for link in self.relatedLinks:
380            if type(link) is not Link:
381                logging.warning("Link is not of 'Link' object type (type='%s') - skipping" %type(link))
382                continue
383            if link in linksToDelete:
384                logging.debug("- found link to remove")
385            else:
386                updatedLinks.append(link)
387
388        self.relatedLinks = updatedLinks
389        logging.debug("Links removed")
390       
391
392    def getDefaultCollectionPath(self):
393        '''
394        Determine the correct collection to use for the atom in eXist
395        '''
396        collectionPath = eXistConnector.BASE_COLLECTION_PATH + self.state
397       
398        if self.atomTypeID == VTD.DE_TERM:
399            collectionPath += eXistConnector.DE_COLLECTION_PATH
400        elif self.atomTypeID == VTD.GRANULE_TERM:
401            collectionPath += eXistConnector.GRANULE_COLLECTION_PATH
402        elif self.atomTypeID == VTD.ACTIVITY_TERM and \
403            self.subtypeID == VTD.DEPLOYMENT_TERM:
404            collectionPath += eXistConnector.DEPLOYMENTS_COLLECTION_PATH
405        else:
406            collectionPath += eXistConnector.DEPLOYMENT_COLLECTION_PATH
407       
408        if not self.ME.providerID:
409            raise AtomError("Error: cannot determine atom collection path because " + \
410                            "the provider ID is not defined")
411           
412        collectionPath += self.ME.providerID + "/"
413        return collectionPath
414
415
416    def __addAtomTypeDataXML(self, root):
417        '''
418        Add the atom type, and subtype data, if available, to atom categories
419        - and lookup and add the appropriate vocab term data
420        '''
421        if self.atomTypeID:
422            logging.info("Adding atom type info to XML output")
423            category = Category()
424            category.label = self.atomTypeID
425            # look up the appropriate vocab term data
426            category.scheme = self.VTD.getTermCurrentVocabURL(self.atomTypeID)
427            category.term = self.ATOM_TYPE
428            root.append(category.toXML())
429
430        if self.subtypeID:
431            logging.info("Adding atom subtype info to XML output")
432            # NB subtypes not all defined, so leave this out for the moment
433            category.label = self.subtypeID
434            # look up the appropriate vocab term data
435            category.scheme = self.VTD.getTermCurrentVocabURL(self.subtypeID)
436            category.term = self.ATOM_SUBTYPE
437            root.append(category.toXML())
438
439
440    def addMolesEntityData(self, abbreviation, provider_id, object_creation_time):
441        '''
442        Add data to include in the moles entity element
443        '''
444        logging.debug('Adding moles entity information')
445        self.ME.abbreviation = abbreviation
446        self.ME.providerID = provider_id
447        self.ME.createdDate = utilities.getISO8601Date(object_creation_time)
448        logging.debug('Moles entity information added')
449
450
451    def addAuthors(self, authors):
452        '''
453        Add author data appropriately to the atom
454        NB, these will overwrite any existing authors of the same type
455        @param authors: list of Person objects with the author data
456        '''
457        logging.debug('Adding authors data to Atom')
458        isFirstAuthor = {}
459        authorArray = None
460        for author in authors:
461            # NB, we're only allowed one atom author
462            if author.type == Person.AUTHOR_TYPE:
463                self.author = author
464                   
465                if isFirstAuthor.has_key(author.type):
466                    raise AtomError("Error: an atom can only have one author specified")
467                isFirstAuthor[author.type] = 1
468                continue
469            elif author.type == Person.CONTRIBUTOR_TYPE:
470                authorArray = self.contributors
471            elif author.type == Person.RESPONSIBLE_PARTY_TYPE:
472                authorArray = self.ME.responsibleParties
473               
474            # check if this is the first addition - if so, clear out the
475            # array in advance
476            if not isFirstAuthor.has_key(author.type):
477                logging.debug("Clearing out author array")
478                # NB, need to be careful to clear the array, not create a ref
479                # to a new array
480                del authorArray[:]
481                isFirstAuthor[author.type] = 1
482
483            if author.hasValue() and author not in authorArray:
484                logging.debug("Adding author (type:'%s', name:'%s', uri:'%s', role:'%s')" \
485                              %(author.type, author.name, author.uri, author.role))
486                authorArray.append(author)
487
488        logging.debug('Finished adding authors data')
489
490
491    def _isNewParameter(self, param):
492        '''
493        Check if a parameter is already specified in the atom, return False if
494        so, otherwise return True
495        '''
496        for p in self.parameters:
497            if p.term == param.term and \
498                p.scheme == param.scheme and \
499                p.label == param.label:
500                return False
501        return True
502
503
504    def addRelatedLinks(self, linkVals):
505        '''
506        Add related links in string format - converting to Link objects
507        NB, only add the link if it is unique
508       
509        @param linkVals: string of format, 'uri | title | vocabServerURL'
510        '''
511        link = self.objectify(linkVals, 'relatedLinks')
512        if link not in self.relatedLinks:
513            self.relatedLinks.append(link)
514
515
516    def addParameters(self, params):
517        '''
518        Add a parameter to list - ensuring it is unique and has been formatted and tidied appropriately
519        @params param: parameter, as string array, to add to atom parameters collection
520        '''
521        # avoid strings being parsed character by character
522        if type(params) is str:
523            params = [params]
524           
525        for param in params:
526            # firstly tidy parameter
527            param = utilities.tidyUpParameters(param)
528            category = Category()
529            # NB, data already tidied up here, so set keyword to avoid this happening again
530            category.fromString(param, escapeSpecialCharacters=True)
531
532            # now check for uniqueness
533            if self._isNewParameter(category):
534                logging.debug("Adding new parameter: %s" %param)
535                self.parameters.append(category)
536   
537   
538    def _linksToXML(self, root):
539        '''
540        Add required links to the input element
541        @param root: element to add links to - NB, should be the root element of the atom
542        '''
543        selfLink = ET.SubElement(root, "link")
544        selfLink.attrib["href"] = self.atomBrowseURL
545        selfLink.attrib["rel"] = "self"
546        if self.subtypeID != VTD.DEPLOYMENT_TERM:
547            molesLink = ET.SubElement(root, "link")
548            molesDoc = re.sub('ATOM','NDG-B1', self.atomBrowseURL)
549            molesLink.attrib["href"] = molesDoc
550            molesLink.attrib["rel"] = 'related'
551       
552        for relatedLink in self.relatedLinks:
553            if relatedLink.hasValue():
554                root.append(relatedLink.toXML())
555   
556    def toXML(self):
557        '''
558        Convert the atom into XML representation and return this
559        @return: xml version of atom
560        '''
561        logging.info("Creating formatted XML version of Atom")
562        root = ET.Element("entry")
563        root.attrib["xmlns"] = "http://www.w3.org/2005/Atom"
564        root.attrib["xmlns:moles"] = "http://ndg.nerc.ac.uk/schema/moles2beta"
565        root.attrib["xmlns:georss"] = "http://www.georss.org/georss/10"
566        root.attrib["xmlns:gml"] = "http://www.opengis.net/gml"
567        id = ET.SubElement(root, "id")
568        id.text = self.atomID
569        title = ET.SubElement(root, "title")
570        title.text = self.title
571        self._linksToXML(root)
572
573        if self.author.hasValue():
574            root.append(self.author.toXML())
575           
576        for contributor in self.contributors:
577            root.append(contributor.toXML())
578
579        # add parameters data
580        for param in self.parameters:
581            if param.hasValue():
582                root.append(param.toXML())
583
584        # add the type and subtype data
585        self.__addAtomTypeDataXML(root)
586                   
587        summary = ET.SubElement(root, "summary")
588        summary.text = self.Summary
589                   
590        # add link to content, if required - NB, can only have one content element in atom
591        # - and this is mandatory
592        content = ET.SubElement(root, "content")
593        contentFile = self.contentFile or self.csmlFile or self.cdmlFile
594        if contentFile:
595            content.attrib["type"] = "application/xml"
596            content.attrib["src"] = contentFile
597        else:
598            content.attrib["type"] = "xhtml"
599            div = ET.SubElement(content, 'div')
600            div.attrib["xmlns"] = "http://www.w3.org/1999/xhtml"
601            div.text = self.Content
602       
603        # if there's a published date already defined, assume we're doing an update now
604        # NB, update element is mandatory
605        currentDate = datetime.datetime.today().strftime("%Y-%m-%dT%H:%M:%SZ")
606        if not self.publishedDate:
607            self.publishedDate = currentDate
608
609        updated = ET.SubElement(root, "updated")
610        if not self.updatedDate:
611            self.updatedDate = currentDate
612        updated.text = self.updatedDate
613
614        published = ET.SubElement(root, "published")
615        published.text = self.publishedDate
616
617        # add the moles entity section, if it is required
618        if self.ME:
619            root.append(self.ME.toXML())
620
621        # add temporal range data, if available
622        temporalRange = ET.SubElement(root, "moles:temporalRange")
623        if self.t1:
624            temporalRange.text = self.t1
625            if self.t2:
626                temporalRange.text += "/" + self.t2
627
628        # add spatial range data, if available
629        self._addSpatialData(root)
630
631        tree = ET.ElementTree(root)
632        logging.info("XML version of Atom created")
633        return tree
634
635
636    def __getSummary(self):
637        logging.debug("Getting summary data")
638        summaryString = ""
639        for summary_line in self.summary:
640            summaryString += summary_line + "\n"
641
642        return summaryString
643
644    def __setSummary(self, summary):
645        logging.debug("Adding summary data")
646        self.summary = []
647        for summary_line in summary.split('\n'):
648            self.summary.append(utilities.escapeSpecialCharacters(summary_line))
649           
650    Summary = property(fset=__setSummary, fget=__getSummary, doc="Atom summary")
651
652
653    def __getContent(self):
654        logging.debug("Getting content data")
655        contentString = ""
656        # NB, there must be content specified in an atom
657        if not self.content:
658            return "Metadata document"
659       
660        for content_line in self.content:
661            contentString += content_line + "\n"
662
663        return contentString
664
665    def __setContent(self, content):
666        logging.debug("Adding content data")
667        self.content = []
668        for content_line in content.split('\n'):
669            self.content.append(content_line)
670           
671    Content = property(fset=__setContent, fget=__getContent, doc="Atom content")
672
673           
674    def fromString(self, xmlString):
675        '''
676        Initialise Atom object using an xmlString
677        @param xmlString: representation of atom as an XML string
678        '''
679        logging.info("Ingesting data from XML string")
680       
681        # firstly, remove any namespaces used - to avoid problems with elementtree
682        logging.debug("Stripping moles namespace from string to allow easy handling with elementtree")
683        xmlString = xmlString.replace('moles:', '')
684        xmlString = xmlString.replace('georss:', '')
685        xmlString = xmlString.replace('gml:', '')
686        xmlString = xmlString.replace('xmlns="http://www.w3.org/2005/Atom"', '')
687        xmlString = xmlString.replace('default:', '')
688        xmlString = xmlString.replace('xs:', '')
689
690        # now create elementtree with the XML string
691        logging.debug("Create elementtree instance with XML string")
692        tree = ET.fromstring(xmlString)
693       
694        title = tree.findtext('title')
695        if title:
696            logging.debug("Adding title data")
697            self.title = title
698
699        summary = tree.findtext('summary')
700        if summary:
701            self.Summary = summary#.decode('unicode_escape')
702
703        authorElement = tree.find('author')
704        if authorElement:
705            logging.debug("Adding author data")
706            author = Person()
707            author.fromETElement(authorElement)
708            self.author = author
709
710        contributorElements = tree.findall('contributor')
711        for contributorElement in contributorElements:
712            logging.debug("Adding contributor data")
713            contributor = Person(personType = Person.CONTRIBUTOR_TYPE)
714            contributor.fromETElement(contributorElement)
715            self.contributors.append(contributor)
716
717        molesElement = tree.find('entity')
718        if molesElement:
719            self.ME.fromET(molesElement)
720               
721        self.atomID = tree.findtext('id')
722
723        self._parseCategoryData(tree.findall('category'))
724
725        self._parseLinksData(tree.findall('link'))
726           
727        contentTag = tree.find('content')
728        if contentTag != None:
729            logging.debug("Found content tag - checking for CSML/CDML file data")
730            file = contentTag.attrib.get('src')
731            if file:
732                # NB, the path will reveal more reliably whether we're dealing with CSML and CDML files
733                if file.upper().find('CSML') > -1:
734                    logging.debug("Adding CSML file data")
735                    self.csmlFile = file
736                elif file.upper().find('CDML') > -1:
737                    logging.debug("Adding CDML file data")
738                    self.cdmlFile = file
739                self.contentFile = file
740            else:
741                logging.debug("No file data - adding contents of element instead")
742                div = contentTag.find('{http://www.w3.org/1999/xhtml}div')
743                self.Content = div.text
744       
745        range = tree.findtext('temporalRange')
746        if range:
747            logging.debug("Adding temporal range data")
748            timeData = range.split('/')
749            self.t1 = timeData[0]
750            if len(timeData) > 1:
751                self.t2 = timeData[1]
752       
753        # NB, this parser won't mind if we're dealing with Envelope or EnvelopeWithTimePeriod
754        minBBox = tree.findall('.//lowerCorner')
755        if minBBox:
756            logging.debug("Adding min spatial range data")
757            minBBox = minBBox[0]
758            spatialData = minBBox.text.split()
759            self.minX = spatialData[0]
760            if len(spatialData) > 1:
761                self.minY = spatialData[1]
762       
763        maxBBox = tree.findall('.//upperCorner')
764        if maxBBox:
765            maxBBox = maxBBox[0]
766            logging.debug("Adding max spatial range data")
767            spatialData = maxBBox.text.split()
768            self.maxX = spatialData[0]
769            if len(spatialData) > 1:
770                self.maxY = spatialData[1]
771               
772        publishedDate = tree.findtext('published')
773        if publishedDate:
774            logging.debug("Adding published date")
775            self.publishedDate = publishedDate
776               
777        updatedDate = tree.findtext('updated')
778        if updatedDate:
779            logging.debug("Adding updated date")
780            self.updatedDate = updatedDate
781           
782        logging.info("Completed data ingest")
783   
784   
785    def _parseCategoryData(self, categories):
786        logging.debug("Adding category/parameters data")
787        for category in categories:
788            cat = Category()
789            cat.fromETElement(category)
790           
791            if cat.term == self.ATOM_TYPE:
792                logging.debug("Found atom type data")
793                self.atomTypeID = cat.label
794                self.atomTypeName = self.VTD.TERM_DATA[cat.label].title
795                continue
796            elif cat.term == self.ATOM_SUBTYPE:
797                logging.debug("Found atom subtype data")
798                self.subtypeID = cat.label
799                self.subtype = cat.scheme
800                continue
801
802            self.parameters.append(cat)
803   
804
805    def setDatasetID(self, datasetID):
806        '''
807        Set the dataset ID for the atom - and generate an appropriate atom name using this
808        @param datasetID: ID to set for the atom
809        '''
810        self.datasetID = datasetID
811        self._generateAtomName(datasetID) 
812        self.atomID = self.createAtomID(datasetID)
813
814
815    def createAtomID(self, datasetID):
816        '''
817        Create a unique ID, conforming to atom standards, for atom
818        NB, see http://diveintomark.org/archives/2004/05/28/howto-atom-id
819        @param datasetID: ID of atom's dataset
820        @return: unique ID
821        '''
822        logging.info("Creating unique ID for atom")
823        if not self.atomBrowseURL:
824            self._generateAtomName(datasetID)
825        urlBit = self.atomBrowseURL.split('://')[1]
826        urlBit = urlBit.replace('#', '')
827        urlBits = urlBit.split('/')
828        host = urlBits[0].split(':')[0] # avoid the port colon - as this breaks the ID format
829        dateBit = datetime.datetime.today().strftime("%Y-%m-%d")
830       
831        id = "tag:" + host + "," + dateBit + ":/" + "/".join(urlBits[1:])
832        logging.info("- unique ID created for atom")
833        logging.debug(" - '%s'" %id)
834        return id
835       
836       
837    def _generateAtomName(self, datasetID):
838        '''
839        Generate a consistent name for the atom - with full eXist doc path
840        @param datasetID: ID of atom's dataset
841        '''
842        self.atomName = datasetID + ".atom"
843        self.ndgURI = self.ME.providerID + "__ATOM__" + datasetID
844        self.atomBrowseURL = VTD.BROWSE_ROOT_URL + self.ndgURI
845
846
847    def _parseLinksData(self, links):
848        '''
849        Extract links and atom data from array of link elements in the XML representation of the atom
850        @param links: an array of <link> elements
851        '''
852        # firstly, get all data to start with, so we can properly process it afterwards
853        linkData = {}
854        logging.debug("Getting link data")
855        for linkTag in links:
856            link = Link()
857            link.fromETElement(linkTag)
858
859            if not linkData.has_key(link.rel):
860                linkData[link.rel] = []
861           
862            linkData[link.rel].append(link)
863
864        # there should be one self referencing link - which will provide info on the atom itself
865        if not linkData.has_key('self'):
866            errorMessage = "Atom does not have self referencing link - " + \
867                "cannot ascertain datasetID without this - please fix"
868            logging.error(errorMessage)
869            raise ValueError(errorMessage)
870       
871        # this is the link describing the atom itself
872        self.atomBrowseURL = linkData['self'][0].href
873       
874        self.datasetID = self.atomBrowseURL.split("__ATOM__")[-1]
875        self.atomName = self.datasetID + ".atom"
876        self.ndgURI = self.atomBrowseURL.split(VTD.BROWSE_ROOT_URL)[1]
877       
878        # now remove this value and the associated moles doc link
879        del linkData['self']
880        molesDoc = self.atomBrowseURL.replace('ATOM', 'NDG-B1')
881        if linkData.has_key('related'):
882            relatedLinks = []
883            for link in linkData['related']:
884                if link.href != molesDoc:
885                    relatedLinks.append(link)
886           
887            linkData['related'] = relatedLinks
888               
889        # now add the remaining links to the atom
890        for key in linkData:
891            for link in linkData[key]:
892                logging.debug("Adding link data")
893                self.relatedLinks.append(link)
894       
895
896    def _addSpatialData(self, element):
897        '''
898        Add spatial coverage element to an input element
899        @param element: element to add coverage data to
900        '''
901        logging.info("Adding spatial data to Atom")
902        if not self.minX:
903            logging.info("No spatial data specified")
904            return
905        bbox = ET.SubElement(element, "georss:where")
906        envelope = ET.SubElement(bbox, "gml:Envelope")
907        lc = ET.SubElement(envelope, "gml:lowerCorner")
908        lc.text = str(self.minX) + " " + str(self.minY)
909        uc = ET.SubElement(envelope, "gml:upperCorner")
910        uc.text = str(self.maxX) + " " + str(self.maxY)
911
912       
913    def setAttribute(self, attributeName, attributeValue):
914        '''
915        Set the value of an atom attribute - and do some basic tidying up of the string content
916        - to escape any XML unfriendly characters
917        @param attributeName: name of the attribute whose value to set
918        @param attributeValue: value to set the attribute to 
919        '''
920        logging.debug("Setting attribute, %s, to %s" %(attributeName, attributeValue))
921        origValue = attributeValue
922       
923        # escape any special characters if a value has been specified
924        # NB, need to cope with both single values and arrays
925        if attributeValue:
926            if type(attributeValue) is list:
927                newVals = []
928                for val in attributeValue:
929                    newVals.append(self.objectify(utilities.escapeSpecialCharacters(val), attributeName))
930                attributeValue = newVals
931                   
932            else:
933                attributeValue = self.objectify(utilities.escapeSpecialCharacters(attributeValue), attributeName)
934
935        # handle the special case of authors; only one author is allowed per atom
936        # - the others should be treated as contributors
937        if attributeName == "authors":
938            setattr(self, "author", attributeValue[0])
939            if len(attributeValue) > 1:
940                setattr(self, "contributors", attributeValue[1:])
941        elif attributeName == "atomAuthors":
942            if isinstance(attributeValue, list):
943                for val in attributeValue:
944                    self.ME.responsibleParties.append(val)
945            else:
946                self.ME.responsibleParties.append(attributeValue)
947        elif attributeName == "files":
948            self.addUniqueRelatedLinks(attributeValue)
949        else:
950            setattr(self, attributeName, attributeValue)
951
952
953    def objectify(self, objectVals, attributeName):
954        '''
955        Some inputs are specified as strings but need to be converted into
956        objects - do this here
957        @param objectVals: a '|' delimited string of values
958        @param attributeName: name of attribute the values belong to
959        '''
960        obj = None
961        if type(objectVals) != str:
962            return objectVals
963       
964        if attributeName == "relatedLinks":
965            obj = Link()
966        elif attributeName == "atomAuthors" or attributeName == "authors":
967            # NB, ensure there is only one author tag - extra authors are contributors
968            authorType = Person.AUTHOR_TYPE
969            if self.author:
970                authorType= Person.CONTRIBUTOR_TYPE
971            obj = Person(personType = authorType)
972        elif attributeName == 'files':
973            obj = Link()
974            objectVals = '%s|%s|%s' \
975                %(self.VTD.getTermCurrentVocabURL(VTD.METADATA_SOURCE_TERM), objectVals, VTD.METADATA_SOURCE_TERM)
976
977        if obj:
978            obj.fromString(objectVals)
979            # NB, need to set it now, just in case we don't set it before coming back
980            if attributeName == "authors" and not self.author:
981                self.author = obj
982            return obj
983       
984        return objectVals
985
986
987    def toPrettyXML(self):
988        '''
989        Returns nicely formatted XML as string
990        '''
991        atomXML = self.toXML()
992
993        # create the string
994        logging.debug("Converting the elementtree object into a string")
995        prettyXML = et2text(atomXML.getroot())
996
997        # add XML version tag
998        prettyXML = "<?xml version=\"1.0\" encoding=\"utf-8\" ?>\n" + prettyXML
999        logging.info("Created formatted version of XML object")
1000        return prettyXML
1001
1002
1003    def getLinksOfType(self, termID):
1004        '''
1005        Returns links in the atom related links attribute which match the specified
1006        term ID
1007        @param termID: the termID to look for in the related links - NB, this is
1008        matched to the end of the link.rel value
1009        @return links: array of Link objects with matching term type
1010        '''
1011        logging.debug("Getting atom links of type, '%s'" %termID)
1012        matchingLinks = []
1013        for link in self.relatedLinks:
1014            # firstly, handle special case where we only want the online ref type links
1015            # returned
1016            if termID == self.ONLINE_REF_LABEL:
1017                if not link.isChildAtom():
1018                    logging.debug("- found link with matching term type")
1019                    matchingLinks.append(link)
1020               
1021            elif link and link.rel and link.rel.lower().endswith(termID.lower()):
1022                logging.debug("- found link with matching term type")
1023                matchingLinks.append(link)
1024               
1025        logging.debug("Returning matched links")
1026        return matchingLinks
1027       
1028       
1029    def getLogos(self):
1030        '''
1031        Return related links that are logos
1032        @return: array of Links containing the logos for the atom
1033        '''
1034        logos = []
1035        for link in self.relatedLinks:
1036            if link.rel.lower().endswith(VTD.LOGO_TERM.lower()):
1037                logos.append(link)
1038               
1039        return logos
1040   
1041   
1042    def isGranule(self):
1043        if self.atomTypeID == VTD.GRANULE_TERM:
1044            return True
1045        return False
1046   
1047   
1048    def isDE(self):
1049        if self.atomTypeID == VTD.DE_TERM:
1050            return True
1051        return False
1052   
1053    def isDeployment(self):
1054        if self.subtypeID and self.subtypeID == VTD.DEPLOYMENT_TERM:
1055            return True
1056        return False
1057   
1058    def isDeployable(self):
1059        if (self.atomTypeID == VTD.ACTIVITY_TERM and self.subtypeID != VTD.DEPLOYMENT_TERM) or \
1060            self.atomTypeID == VTD.DPT_TERM or \
1061            self.atomTypeID == VTD.OBS_TERM:
1062            return True
1063        return False
1064       
1065       
1066    def addCSMLData(self, csmlName, csmlContent, aggregateCoverage=False, useCSMLID=False):
1067        '''
1068        Parse CSML data and add extracted info to the atom
1069        @param csmlName: name of the csml file
1070        @param csmlContent: content of the csml file - NB, if this is set to None and the
1071        file, csmlName, is available locally, CsmlParser.Dataset will read in the file
1072        directly
1073        @keyword aggregateCoverage: if set to True, only coverage data that extends the
1074        atom coverage data will be added
1075        @keyword useCSMLID: if True, use the CSML doc ID as the dataset ID - NB,
1076        this should only be True if creating a new atom - e.g. from a granulite
1077        @return csmlDoc: the CsmlParser.Dataset object with the csml data in
1078        '''
1079        logging.info("Creating CSML data model")
1080        self.csmlFile = csmlName
1081        self.contentFile = csmlName
1082        content = csmlContent or csmlName
1083   
1084        csmlDoc = CsmlParser.Dataset(file=content)
1085       
1086        logging.info("Extracting info from CSML file")
1087        logging.debug("Got dataset ID: %s" %csmlDoc.id)
1088        if useCSMLID:
1089            logging.debug(" - using this ID for the atom")
1090            self.setDatasetID(VTD.GRANULE_TERM + '_' + csmlDoc.id)
1091       
1092        title = csmlDoc.name.CONTENT
1093        logging.debug("Got dataset name (title): '%s'" %title)
1094        # NB, if a title is specified (and not as the default value), it automatically is used in
1095        # place of anything in the granulite file
1096        if title and title != "NAME OF DATASET GOES HERE":
1097            logging.info("Title, '%s', extracted from CSML file" %title)
1098            if self.title:
1099                logging.info("- NB, this will override the title specified in the granulite file ('%s')" \
1100                             %self.title)
1101            self.title = title
1102               
1103        bbox1 = csmlDoc.getBoundingBox()
1104        bbox2 = csmlDoc.getCSMLBoundingBox()
1105
1106        time = None
1107        if bbox2:
1108            time = bbox2.getTimeLimits()
1109   
1110        # now check for other parameters to add to granule
1111        # Firstly, extract the bounding envelope
1112        if bbox1:
1113            w, e = utilities.normaliseLongitude(bbox1[0],bbox1[2])
1114            n, s = (bbox1[3], bbox1[1])
1115   
1116            if not aggregateCoverage or (not self.maxY or float(n) > float(self.maxY)):
1117                self.maxY = n
1118               
1119            if not aggregateCoverage or (not self.minY or float(s) < float(self.minY)):
1120                self.minY = s
1121           
1122            if not aggregateCoverage or (not self.minX or float(w) < float(self.minX)):
1123                self.minX = w
1124   
1125            if not aggregateCoverage or (not self.maxX or float(e) > float(self.maxX)):
1126                self.maxX = e
1127           
1128            logging.debug("Got bounding box data from file: (%s, %s) , (%s, %s)" \
1129                          %(w, s, e, n))
1130           
1131            logging.debug("Updated atom bounding box data: (%s, %s) , (%s, %s)" \
1132                          %(self.minX, self.minY, self.maxX, self.maxY))
1133        else:
1134            logging.debug("No valid bounding box data found")
1135   
1136        if time:
1137            t1 = utilities.formatDateYYYYMMDD(time[0])
1138            if not aggregateCoverage or \
1139                (not self.t1 or datetime.datetime.strptime(t1, YEAR_FORMAT) < \
1140                    datetime.datetime.strptime(self.t1, YEAR_FORMAT)):
1141                self.t1 = t1
1142   
1143            t2 = time[1]
1144            if t2 and t2 != 'None':
1145                t2 = utilities.formatDateYYYYMMDD(t2)
1146                if not aggregateCoverage or \
1147                    (not self.t2 or datetime.datetime.strptime(t2, YEAR_FORMAT) > \
1148                        datetime.datetime.strptime(self.t2, YEAR_FORMAT)):
1149                    self.t2 = t2
1150           
1151            logging.debug("Got time range: %s -> %s" %(self.t1, self.t2))
1152        else:
1153            logging.debug("No valid time range data found")
1154   
1155        #create parameter summaries:
1156        #set up list to hold the parameters data
1157        parameters = []
1158        for feature in csmlDoc.featureCollection.featureMembers:
1159            if hasattr(feature.parameter, 'href'):
1160                paramTriple = ""
1161                if hasattr(feature, 'description'):
1162                    paramTriple = feature.description.CONTENT
1163                    paramTriple += " | " + feature.parameter.href
1164                   
1165                    term = ""
1166                    if hasattr(feature, 'name'):
1167                        term = feature.name.CONTENT
1168   
1169                    paramTriple += " | " + term
1170                   
1171                    logging.debug("Got parameter info: %s" %paramTriple)
1172                    parameters.append(paramTriple)
1173       
1174        # update the atom with the extracted parameters
1175        logging.info("Adding CSML parameters to granule atom")
1176        self.addParameters(parameters)
1177        logging.info("Finished adding CSML data")
1178        return csmlDoc
1179
1180
1181    def lookupAssociatedData(self, type, dr, lookupIndirectReferences=False):
1182        '''
1183        Check through the atom links and retrieve any associated data of the
1184        specified type
1185        @param type: type of associated data to lookup - currently VTD.DEPLOYMENT_TERM
1186        or VTD.DE_TERM
1187        @param dr: Instance of DocumentRetrieve object - NB, this requires eXist
1188        config details which are not available to the Atom object
1189        @keyword lookupIndirectReferences: if True, the atom ID is used to search
1190        defined deployments to find those which reference it, otherwise only
1191        deployments data featured in the atom related links are processed
1192        '''
1193        logging.info("Looking up %s info" %type)
1194       
1195        self.allActivities = []
1196        self.allObs = []
1197        self.allDpts = []
1198
1199        if type != VTD.DE_TERM and type != VTD.DEPLOYMENT_TERM:
1200            raise ValueError('Unrecognised associated data type: %s' %type)
1201       
1202        # avoid duplicating lookup effort
1203        if (type == VTD.DEPLOYMENT_TERM and self.deployments) or \
1204            (type == VTD.DE_TERM and self.dataEntities):
1205            logging.info("- this info has already been looked up - returning")
1206            return
1207
1208        # firstly, collect all the references to the info required
1209        if lookupIndirectReferences:
1210            logging.info("Looking up indirect references")
1211           
1212            # if we're looking up DE data for deployments data, need to have the
1213            # deployments info looked up first
1214            if type == VTD.DE_TERM and self.isDeployable() and not self.deployments:
1215                self.lookupAssociatedData(VTD.DEPLOYMENT_TERM, dr, lookupIndirectReferences)
1216           
1217            logging.info("Looking up references to this atom from other %s" %type)
1218           
1219            # NB, if we're looking up deployments info, we only look up references
1220            # to this atom - if we're looking up DEs, we need to look up references
1221            # to the deployments referenced by this atom
1222            urls = [self.atomBrowseURL]
1223           
1224            if type == VTD.DE_TERM and self.isDeployable():
1225                urls = []
1226                for dep in self.deployments:
1227                    urls.append(dep.browseURL)
1228                   
1229            links = []
1230            for url in urls:
1231                doc = dr.get(type, dr.ATOM_TYPE, url, \
1232                             targetCollection = eXistConnector.BASE_COLLECTION_PATH)
1233                # now need to turn this results set into actual atoms
1234                tree = ET.fromstring(doc)
1235                for atom in tree:
1236                    logging.debug("- found reference in %s" %type)
1237                    links.append(ET.tostring(atom))
1238                   
1239            logging.info("Finished looking up indirect references")
1240        else:
1241            links = self.getLinksOfType(self.VTD.DEPLOYMENT_TERM)
1242
1243        # now retrieve the references and extract the required data
1244        logging.info("Retrieving info from %s references" %type)
1245        if type == VTD.DEPLOYMENT_TERM:
1246            self.deployments = []
1247            for link in links:
1248                if lookupIndirectReferences:
1249                    deploymentAtom = link
1250                else:
1251                    localID = link.href.split("__ATOM__")[-1]
1252                    deploymentAtom = dr.get(self.ME.providerID, 'ATOM', localID, \
1253                                            targetCollection = eXistConnector.BASE_COLLECTION_PATH)
1254   
1255                deployment = Deployment.Deployment(Atom(xmlString=str(deploymentAtom)))
1256                self.deployments.append(deployment)
1257               
1258                self.addUniqueLinks(self.allActivities, deployment.activities)
1259                self.addUniqueLinks(self.allObs, deployment.obs)
1260                self.addUniqueLinks(self.allDpts, deployment.dpts)
1261        else:
1262            # for DE data, just store the title + link in a Link object
1263            self.dataEntities = []
1264            for data in links:
1265                atom = Atom(xmlString=str(data))
1266                link = Link()
1267                link.title = atom.title
1268                link.href = atom.atomBrowseURL
1269               
1270                # NB, different deployments may be used by the same DE - so
1271                # avoid duplication
1272                self.addUniqueLinks(self.dataEntities, link)
1273           
1274        logging.info("Finished looking up %s info" %type)
1275
1276
1277    def addUniqueLinks(self, dataArray, links):
1278        '''
1279        Add links to specified array - if they are not already included
1280        @param dataArray: a list, potentially arlready containing links
1281        @param links: a Link or array of Links to add to the dataArray
1282        '''
1283        logging.debug("Adding new links")
1284        if not links:
1285            return
1286       
1287        if type(links) is not list:
1288            links = [links]
1289       
1290        for link in links:
1291            if type(link) is not Link:
1292                logging.warning("Link is not of 'Link' object type (type='%s') - skipping" %type(link))
1293                continue
1294            if link not in dataArray:
1295                logging.debug("- adding unique link")
1296                dataArray.append(link)
1297        logging.debug("Finished adding links")
Note: See TracBrowser for help on using the repository browser.