source: exist/trunk/python/ndgUtils/models/Atom.py @ 4427

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/exist/trunk/python/ndgUtils/models/Atom.py@4427
Revision 4427, 43.3 KB checked in by cbyrom, 11 years ago (diff)

Upgrade the various 'list' xqueries - to nest all results in a single
root element - so that only one document need be retrieved to get all
results + adjust ndgDirectory to cope with processing the new results +
fix the various namespaces mentioned in the codebase to map to the
current atom/moles ones.

Line 
1'''
2 Class representing data in  atom format - allowing moles data to be stored and accessed in a web feed compatible way
3 
4 @author: C Byrom, Tessella Jun 2008
5'''
6try: #python 2.5
7    from xml.etree import cElementTree as ET
8except ImportError:
9    try:
10        # if you've installed it yourself it comes this way
11        import cElementTree as ET
12    except ImportError:
13        # if you've egged it this is the way it comes
14        from ndgUtils.elementtree import cElementTree as ET
15import sys, logging, re, datetime
16from ndgUtils.eXistConnector import eXistConnector
17from ndgUtils.ETxmlView import et2text
18from utilities import getTripleData, escapeSpecialCharacters, \
19    tidyUpParameters, getISO8601Date
20from ndgUtils.vocabtermdata import VocabTermData as VTD
21from ndgUtils.models import MolesEntity as ME
22import csml.parser as CsmlParser
23
24
class AtomError(Exception):
    """
    Error raised when a problem is encountered handling an atom.

    NB, the message is written to the error log as the exception is created.
    """
    def __init__(self, msg):
        Exception.__init__(self, msg)
        logging.error(msg)
32
33
class ValidationError(Exception):
    """
    Error raised when data fails validation.

    The exception message is always "Data validation error"; the individual
    problems are carried in the dictionary passed to the constructor, each
    value of which is written to the error log.
    """
    def __init__(self, errorDict):
        '''
        @param errorDict: dictionary whose values describe the validation errors
        '''
        msg = "Data validation error"
        logging.error(msg)
        Exception.__init__(self, msg)
        # NB, use values() rather than the python 2 only itervalues() - the
        # two are equivalent here and values() works on all python versions
        for val in errorDict.values():
            logging.error(val)
        self._errorDict = errorDict

    def unpack_errors(self):
        '''
        @return: the error dictionary passed to the constructor
        '''
        return self._errorDict
48
49
class Person(object):
    '''
    Class representing atom author type data - with name, uri and role attributes

    The person type (author, contributor or responsible party) determines
    the name of the element produced by toXML and the name of the child
    element holding the uri.
    '''
    AUTHOR_TYPE = 0
    CONTRIBUTOR_TYPE = 1
    RESPONSIBLE_PARTY_TYPE = 2
    # element names - indexed by the type constants above
    ELEMENT_NAMES = ["author", "contributor", "responsibleParty"]

    def __init__(self, personType = AUTHOR_TYPE, namespace = None):
        '''
        @keyword personType: one of the *_TYPE constants of this class
        @keyword namespace: prefix to add to the element names in toXML
        '''
        self.type = personType
        self.ns = namespace
        self.name = ""
        self.uri = ""
        self.role = ""

        # NB, the atom format specifies slightly different data contents
        self.uriTagName = "email"
        # NB, responsible party data is always stored in the moles section
        if self.type == self.RESPONSIBLE_PARTY_TYPE:
            self.ns = 'moles'
            self.uriTagName = "uri"

    def __str__(self):
        '''
        @return: 'name | uri | role' - or an empty string if none are set
        '''
        if self.name or self.uri or self.role:
            return self.name + " | " + self.uri + " | " + self.role
        return ""

    def fromString(self, personString):
        '''
        Set the name, uri and role from a string, using getTripleData
        @param personString: string to extract the data from
        '''
        (self.name, self.uri, self.role) = getTripleData(personString)

    def fromETElement(self, personTag):
        '''
        Set the name, role and uri from the child elements of an element
        - NB, missing values are stored as empty strings
        @param personTag: elementtree element with the person data
        '''
        self.name = personTag.findtext('name') or ""
        self.role = personTag.findtext('role') or ""
        self.uri = personTag.findtext(self.uriTagName) or ""
        logging.debug("Added name: '%s', role: '%s', %s: '%s'" \
                      %(self.name, self.role, self.uriTagName, self.uri))

    def toXML(self):
        '''
        Create an element for the person - NB, child elements are only
        added for the values that have been set
        @return: elementtree element representing the person
        '''
        prefix = ""
        if self.ns:
            prefix = self.ns + ':'

        author = ET.Element(prefix + self.ELEMENT_NAMES[self.type])

        if self.name:
            name = ET.SubElement(author, prefix + "name")
            name.text = self.name

        if self.uri:
            uri = ET.SubElement(author, prefix + self.uriTagName)
            uri.text = self.uri

        if self.role:
            role = ET.SubElement(author, prefix + "role")
            role.text = self.role

        return author

    def __eq__(self, other):
        '''
        Two Person objects are equal if their uri, name, role and type match
        - NB, this is what makes 'if person in personArray' checks work on
        python versions that ignore __cmp__
        '''
        if self is other:
            return True
        if not isinstance(other, Person):
            return NotImplemented
        return self.uri == other.uri and self.name == other.name and \
                self.role == other.role and self.type == other.type

    def __ne__(self, other):
        isEqual = self.__eq__(other)
        if isEqual is NotImplemented:
            return isEqual
        return not isEqual

    # NB, defining __eq__ would otherwise make instances unhashable under
    # python 3 - so explicitly retain the identity based hashing
    __hash__ = object.__hash__

    def __cmp__(self, person1):
        '''
        Override comparison to allow proper object comparison when checking
        if Person objects are in an array already - i.e. if person in personArray...
        NB, retained for python 2; returns 0 if equal, -1 if the input is
        empty and 1 otherwise - so it is not suitable for sorting
        '''
        if not person1:
            return -1

        if self.__eq__(person1) is True:
            return 0
        return 1
123
124
class Link(object):
    '''
    Class representing an atom link - with href, title and rel attributes
    '''

    def __init__(self):
        self.href = ""
        self.title = ""
        self.rel = ""

    def fromString(self, linkString):
        '''
        Set the href, title and rel from a string, using getTripleData
        @param linkString: string to extract the data from
        '''
        (self.href, self.title, self.rel) = getTripleData(linkString)

    def fromETElement(self, linkTag):
        '''
        Set the href, rel and title from the attributes of an element
        - NB, missing attributes are stored as empty strings
        @param linkTag: elementtree element with the link data
        '''
        self.href = linkTag.attrib.get('href') or ""
        self.rel = linkTag.attrib.get('rel') or ""
        self.title = linkTag.attrib.get('title') or ""

    def toXML(self):
        '''
        @return: 'link' elementtree element with href, title and rel attributes
        '''
        link = ET.Element("link")
        link.attrib["href"] = self.href
        link.attrib["title"] = self.title
        link.attrib["rel"] = self.rel
        return link

    def hasValue(self):
        '''
        @return True, if the link has a href or a title; False otherwise
        '''
        # NB, just a rel on its own is meaningless - so ignore
        if self.href or self.title:
            return True
        return False

    def __str__(self):
        '''
        @return: 'href | title | rel' - or an empty string if none are set
        '''
        if self.href or self.title or self.rel:
            return self.href + " | " + self.title + " | " + self.rel
        return ""

    def isChildAtom(self):
        '''
        Determines whether the link refers to another atom - e.g. a link to
        a data granule
        @return True, if so; False otherwise
        '''
        # NB, endswith accepts a tuple of alternatives to check
        atomTerms = (VTD.GRANULE_TERM, VTD.DEPLOYMENT_TERM, VTD.ACTIVITY_TERM,
                     VTD.DPT_TERM, VTD.OBS_TERM)
        return self.rel.endswith(atomTerms)

    def __eq__(self, other):
        '''
        Two Link objects are equal if their href, title and rel match
        - NB, this is what makes 'if link in linkArray' checks work on
        python versions that ignore __cmp__
        '''
        if self is other:
            return True
        if not isinstance(other, Link):
            return NotImplemented
        return self.href == other.href and self.title == other.title and \
                self.rel == other.rel

    def __ne__(self, other):
        isEqual = self.__eq__(other)
        if isEqual is NotImplemented:
            return isEqual
        return not isEqual

    # NB, defining __eq__ would otherwise make instances unhashable under
    # python 3 - so explicitly retain the identity based hashing
    __hash__ = object.__hash__

    def __cmp__(self, link1):
        '''
        Override comparison to allow proper object comparison when checking
        if Link objects are in an array already - i.e. if link in linkArray...
        NB, retained for python 2; returns 0 if equal, -1 if the input is
        empty and 1 otherwise - so it is not suitable for sorting
        '''
        if not link1:
            return -1

        if self.__eq__(link1) is True:
            return 0
        return 1
190
191
class Category(object):
    '''
    Class representing an atom category - with term, scheme and label attributes
    '''
    def __init__(self):
        self.term = self.scheme = self.label = ""

    def fromString(self, linkString):
        '''
        Set the label, scheme and term - in that order - from the three
        values returned by getTripleData for the input string
        '''
        tripleData = getTripleData(linkString)
        (self.label, self.scheme, self.term) = tripleData

    def fromETElement(self, linkTag):
        '''
        Set the term, label and scheme from the attributes of an element
        - NB, missing attributes are stored as empty strings
        '''
        attributes = linkTag.attrib
        self.term = attributes.get('term') or ""
        self.label = attributes.get('label') or ""
        self.scheme = attributes.get('scheme') or ""

    def toXML(self):
        '''
        @return: 'category' elementtree element with term, scheme and label attributes
        '''
        categoryElement = ET.Element("category")
        for attributeName in ("term", "scheme", "label"):
            categoryElement.attrib[attributeName] = getattr(self, attributeName)
        return categoryElement

    def hasValue(self):
        '''
        @return True, if any of scheme, label or term are set; False otherwise
        '''
        return bool(self.scheme or self.label or self.term)
220
221
222class Atom(object):
223
224    # labels for use with the atom categories
225    ATOM_TYPE = "ATOM_TYPE"
226    ATOM_SUBTYPE = "ATOM_SUBTYPE"
227
228    # labels for use with the templates to set/extract specific inputs
229    ONLINE_REF_LABEL = "online_ref"
230    PARAMETER_LABEL = "parameter"
231    ATOM_REF_LABEL = "atom_ref"
232    DELIMITER = "---"
233    REMOVE_LABEL = "remove"
234   
235    # format to use for t1-t2 date range
236    YEAR_FORMAT = '%Y-%m-%d'
237
238    def __init__(self, atomType = None, vocabTermData = None, ndgObject = None, \
239                 xmlString = None, state = eXistConnector.WORKING_COLLECTION_PATH, **inputs):
240        '''
241        Constructor - initialise the atom variables
242        '''
243        logging.info("Initialising atom")
244        if atomType:
245            logging.info(" - of type '%s'" %atomType)
246        self.atomTypeID = atomType
247
248        # some data have further subtypes specified
249        self.subtypeID = None # this should be the termID
250        self.subtype = None # and this should be the fully formed vocab URL
251       
252        self.ndgObject = ndgObject
253
254        self.atomName = None
255        self.files = []
256        self.author = None
257        self.contributors = []
258        self.atomAuthors = []
259        self.parameters = []
260        self.spatialData = []
261        self.temporalData = []
262        self.relatedLinks = []
263        self.summary = []
264        self.content = []
265        self.csmlFile = None
266        self.cdmlFile = None
267        # general variable to use for setting the atom content - NB, if a csmlFile is specified
268        # (either directly or via a cdmlFile specification), this will be the content by default
269        # for this purpose
270        self.contentFile = None     
271        self.title = None
272        self.datasetID = None        # NB, the dataset id ends up in the atomName - <path><datasetID>.atom
273        self.atomID = None
274   
275        # boundary box info - to replace spatial/temporalData?
276        self.minX = None
277        self.minY = None
278        self.maxX = None
279        self.maxY = None
280        self.t1 = None
281        self.t2 = None
282
283        self.ME = ME.MolesEntity()
284       
285        # date when the atom was first ingested
286        self.publishedDate = None
287
288        # last update date
289        self.updatedDate = None
290
291        # assume atom in working state by default - this is used to define what collection
292        # in eXist the atom is stored in
293        self.state = state
294       
295        # additional, non standard atom data can be included in the molesExtra element
296        if vocabTermData:
297            self.VTD = vocabTermData
298        else:
299            self.VTD = VTD()
300       
301        if xmlString:
302            self.fromString(xmlString)
303
304        # if inputs passed in as dict, add these now
305        if inputs:
306            logging.info("Adding info to atom from input dict")
307            logging.debug(inputs)
308            self.__dict__.update(inputs)
309           
310            # NB, this doesn't trigger the Summary Property, so do this
311            # explicitly, if need be
312            if inputs.has_key('Summary'):
313                self.Summary = inputs.get('Summary')
314            if inputs.has_key('Content'):
315                self.Content = inputs.get('Content')
316           
317            # also pass any moles data up to the moles entity object
318            if inputs.get('providerID'):
319                self.ME.providerID = inputs.get('providerID')
320               
321            if inputs.get('abbreviation'):
322                self.ME.abbreviation = inputs.get('abbreviation')
323
324        if self.atomTypeID:
325            self.atomTypeName = self.VTD.TERM_DATA[self.atomTypeID].title
326
327        logging.info("Atom initialised")
328
329
330    def addOnlineReferences(self, links):
331        '''
332        Add online reference data associated with the atom
333        - NB, care needs to be taken here since this data is stored in the atom
334        link elements and these are also used for the various atom associations
335        @param links: a Link or array of Links to add to the relatedLinks attribute
336        '''
337        logging.debug("Adding online references")
338        if not links:
339            return
340       
341        if type(links) is not list:
342            links = [links]
343       
344        # firstly clear out any online refs data from the existing related links
345        newLinks = []
346        for link in self.relatedLinks:
347            if link.isChildAtom():
348                newLinks.append(link)
349       
350        newLinks.extend(links)
351        self.relatedLinks = newLinks
352        logging.debug("Online references added")
353
354
355
356    def addUniqueRelatedLinks(self, links):
357        '''
358        Add links to relatedLinks array - if they are not already included
359        @param links: a Link or array of Links to add to the relatedLinks attribute
360        '''
361        logging.debug("Adding new related links")
362        if not links:
363            return
364       
365        if type(links) is not list:
366            links = [links]
367       
368        for link in links:
369            if type(link) is not Link:
370                logging.warning("Link is not of 'Link' object type (type='%s') - skipping" %type(link))
371                continue
372            if link not in self.relatedLinks:
373                logging.debug("- adding unique link")
374                self.relatedLinks.append(link)
375        logging.debug("Finished adding links")
376       
377
378    def removeRelatedLinks(self, linksToDelete):
379        '''
380        Remove any links in the input list from the atom's related links list
381        @param linksToDelete: array of Link objects to remove from atom
382        '''
383        logging.debug("Removing related links from atom")
384        if not linksToDelete:
385            return
386       
387        if type(linksToDelete) is not list:
388            linksToDelete = [linksToDelete]
389       
390        updatedLinks = []
391        for link in self.relatedLinks:
392            if type(link) is not Link:
393                logging.warning("Link is not of 'Link' object type (type='%s') - skipping" %type(link))
394                continue
395            if link in linksToDelete:
396                logging.debug("- found link to remove")
397            else:
398                updatedLinks.append(link)
399
400        self.relatedLinks = updatedLinks
401        logging.debug("Links removed")
402       
403
404    def getDefaultCollectionPath(self):
405        '''
406        Determine the correct collection to use for the atom in eXist
407        '''
408        collectionPath = eXistConnector.BASE_COLLECTION_PATH + self.state
409       
410        if self.atomTypeID == VTD.DE_TERM:
411            collectionPath += eXistConnector.DE_COLLECTION_PATH
412        elif self.atomTypeID == VTD.GRANULE_TERM:
413            collectionPath += eXistConnector.GRANULE_COLLECTION_PATH
414        elif self.atomTypeID == VTD.ACTIVITY_TERM and \
415            self.subtypeID == VTD.DEPLOYMENT_TERM:
416            collectionPath += eXistConnector.DEPLOYMENTS_COLLECTION_PATH
417        else:
418            collectionPath += eXistConnector.DEPLOYMENT_COLLECTION_PATH
419       
420        if not self.ME.providerID:
421            raise AtomError("Error: cannot determine atom collection path because " + \
422                            "the provider ID is not defined")
423           
424        collectionPath += self.ME.providerID + "/"
425        return collectionPath
426           
427
428    def __addAtomTypeDataXML(self, root):
429        '''
430        Add the atom type, and subtype data, if available, to atom categories
431        - and lookup and add the appropriate vocab term data
432        '''
433        if self.atomTypeID:
434            logging.info("Adding atom type info to XML output")
435            category = Category()
436            category.label = self.atomTypeID
437            # look up the appropriate vocab term data
438            category.scheme = self.VTD.getTermCurrentVocabURL(self.atomTypeID)
439            category.term = self.ATOM_TYPE
440            root.append(category.toXML())
441
442        if self.subtypeID:
443            logging.info("Adding atom subtype info to XML output")
444            # NB subtypes not all defined, so leave this out for the moment
445            category.label = self.subtypeID
446            # look up the appropriate vocab term data
447            category.scheme = self.VTD.getTermCurrentVocabURL(self.subtypeID)
448            category.term = self.ATOM_SUBTYPE
449            root.append(category.toXML())
450
451
452    def addMolesEntityData(self, abbreviation, provider_id, object_creation_time):
453        '''
454        Add data to include in the moles entity element
455        '''
456        logging.debug('Adding moles entity information')
457        self.ME.abbreviation = abbreviation
458        self.ME.providerID = provider_id
459        self.ME.createdDate = getISO8601Date(object_creation_time)
460        logging.debug('Moles entity information added')
461
462
463    def addAuthors(self, authors):
464        '''
465        Add author data appropriately to the atom
466        NB, these will overwrite any existing authors of the same type
467        @param authors: list of Person objects with the author data
468        '''
469        logging.debug('Adding authors data to Atom')
470        isFirstAuthor = {}
471        authorArray = None
472        for author in authors:
473            # NB, we're only allowed one atom author
474            if author.type == Person.AUTHOR_TYPE:
475                self.author = author
476                if isFirstAuthor.has_key(author.type):
477                    raise AtomError("Error: an atom can only have one author specified")
478                isFirstAuthor[author.type] = 1
479                continue
480            elif author.type == Person.CONTRIBUTOR_TYPE:
481                authorArray = self.contributors
482            elif author.type == Person.RESPONSIBLE_PARTY_TYPE:
483                authorArray = self.ME.responsibleParties
484               
485            # check if this is the first addition - if so, clear out the
486            # array in advance
487            if not isFirstAuthor.has_key(author.type):
488                logging.debug("Clearing out author array")
489                # NB, need to be careful to clear the array, not create a ref
490                # to a new array
491                del authorArray[:]
492                isFirstAuthor[author.type] = 1
493
494            if str(author) != "" and author not in authorArray:
495                logging.debug("Adding author (type:'%s', name:'%s', uri:'%s', role:'%s')" \
496                              %(author.type, author.name, author.uri, author.role))
497                authorArray.append(author)
498
499        logging.debug('Finished adding authors data')
500
501
502    def _isNewParameter(self, param):
503        '''
504        Check if a parameter is already specified in the atom, return False if
505        so, otherwise return True
506        '''
507        for p in self.parameters:
508            if p.term == param.term and \
509                p.scheme == param.scheme and \
510                p.label == param.label:
511                return False
512        return True
513
514
515    def addRelatedLinks(self, linkVals):
516        '''
517        Add related links in string format - converting to Link objects
518        @param linkVals: string of format, 'uri | title | vocabServerURL'
519        '''
520        self.relatedLinks.append(self.objectify(linkVals, 'relatedLinks'))
521
522
523    def addParameters(self, params):
524        '''
525        Add a parameter to list - ensuring it is unique and has been formatted and tidied appropriately
526        @params param: parameter, as string array, to add to atom parameters collection
527        '''
528        # avoid strings being parsed character by character
529        if type(params) is str:
530            params = [params]
531           
532        for param in params:
533            # firstly tidy parameter
534            param = tidyUpParameters(param)
535            category = Category()
536            category.fromString(param)
537
538            # now check for uniqueness
539            if self._isNewParameter(category):
540                logging.debug("Adding new parameter: %s" %param)
541                self.parameters.append(category)
542   
543   
544    def _linksToXML(self, root):
545        '''
546        Add required links to the input element
547        @param root: element to add links to - NB, should be the root element of the atom
548        '''
549        selfLink = ET.SubElement(root, "link")
550        selfLink.attrib["href"] = self.atomBrowseURL
551        selfLink.attrib["rel"] = "self"
552        if self.subtypeID != VTD.DEPLOYMENT_TERM:
553            molesLink = ET.SubElement(root, "link")
554            molesDoc = re.sub('ATOM','NDG-B1', self.atomBrowseURL)
555            molesLink.attrib["href"] = molesDoc
556            molesLink.attrib["rel"] = 'related'
557       
558        for relatedLink in self.relatedLinks:
559            if relatedLink.hasValue():
560                root.append(relatedLink.toXML())
561   
562    def toXML(self):
563        '''
564        Convert the atom into XML representation and return this
565        @return: xml version of atom
566        '''
567        logging.info("Creating formatted XML version of Atom")
568        root = ET.Element("entry")
569        root.attrib["xmlns"] = "http://www.w3.org/2005/Atom"
570        root.attrib["xmlns:moles"] = "http://ndg.nerc.ac.uk/schema/moles2beta"
571        root.attrib["xmlns:georss"] = "http://www.georss.org/georss/10"
572        root.attrib["xmlns:gml"] = "http://www.opengis.net/gml"
573        id = ET.SubElement(root, "id")
574        id.text = self.atomID
575        title = ET.SubElement(root, "title")
576        title.text = self.title
577        self._linksToXML(root)
578
579        # NB, the author tag is mandatory for atoms - so if an explicit
580        # author has not been set, just take the author to be the provider
581        if not self.author:
582            author = Person()
583            author.name = self.ME.providerID
584            author.uri = self.ME.providerID
585            self.author = author
586
587        root.append(self.author.toXML())
588           
589        for contributor in self.contributors:
590            root.append(contributor.toXML())
591
592        # add the moles entity section, if it is required
593        if self.ME:
594            root.append(self.ME.toXML())
595
596        # add parameters data
597        for param in self.parameters:
598            if param.hasValue():
599                root.append(param.toXML())
600
601        # add the type and subtype data
602        self.__addAtomTypeDataXML(root)
603                   
604        summary = ET.SubElement(root, "summary")
605        summary.text = self.Summary
606                   
607        # add link to content, if required - NB, can only have one content element in atom
608        # - and this is mandatory
609        content = ET.SubElement(root, "content")
610        if self.contentFile:
611            content.attrib["type"] = "application/xml"
612            content.attrib["src"] = self.contentFile
613        else:
614            content.text = self.Content
615            content.attrib["type"] = "xhtml"
616       
617        # if there's a published date already defined, assume we're doing an update now
618        # NB, update element is mandatory
619        currentDate = datetime.datetime.today().strftime("%Y-%m-%dT%H:%M:%SZ")
620        if not self.publishedDate:
621            self.publishedDate = currentDate
622
623        updated = ET.SubElement(root, "updated")
624        if not self.updatedDate:
625            self.updatedDate = currentDate
626        updated.text = self.updatedDate
627
628        published = ET.SubElement(root, "published")
629        published.text = self.publishedDate
630
631        # add temporal range data, if available
632        temporalRange = ET.SubElement(root, "moles:temporalRange")
633        if self.t1:
634            temporalRange.text = self.t1
635            if self.t2:
636                temporalRange.text += "/" + self.t2
637
638        # add spatial range data, if available
639        self._addSpatialData(root)
640
641        tree = ET.ElementTree(root)
642        logging.info("XML version of Atom created")
643        return tree
644
645
646    def __getSummary(self):
647        logging.debug("Getting summary data")
648        summaryString = ""
649        for summary_line in self.summary:
650            summaryString += summary_line + "\n"
651
652        return summaryString
653
654    def __setSummary(self, summary):
655        logging.debug("Adding summary data")
656        self.summary = []
657        for summary_line in summary.split('\n'):
658            self.summary.append(escapeSpecialCharacters(summary_line))
659           
660    Summary = property(fset=__setSummary, fget=__getSummary, doc="Atom summary")
661
662
663    def __getContent(self):
664        logging.debug("Getting content data")
665        contentString = ""
666        # NB, there must be content specified in an atom
667        if not self.content:
668            return "Metadata document"
669       
670        for content_line in self.content:
671            contentString += content_line + "\n"
672
673        return contentString
674
675    def __setContent(self, content):
676        logging.debug("Adding content data")
677        self.content = []
678        for content_line in content.split('\n'):
679            self.content.append(escapeSpecialCharacters(content_line))
680           
681    Content = property(fset=__setContent, fget=__getContent, doc="Atom content")
682
683           
684    def fromString(self, xmlString):
685        '''
686        Initialise Atom object using an xmlString
687        @param xmlString: representation of atom as an XML string
688        '''
689        logging.info("Ingesting data from XML string")
690       
691        # firstly, remove any namespaces used - to avoid problems with elementtree
692        logging.debug("Stripping moles namespace from string to allow easy handling with elementtree")
693        xmlString = xmlString.replace('moles:', '')
694        xmlString = xmlString.replace('georss:', '')
695        xmlString = xmlString.replace('gml:', '')
696        xmlString = xmlString.replace('xmlns="http://www.w3.org/2005/Atom"', '')
697
698        # now create elementtree with the XML string
699        logging.debug("Create elementtree instance with XML string")
700        tree = ET.fromstring(xmlString)
701       
702        title = tree.findtext('title')
703        if title:
704            logging.debug("Adding title data")
705            self.title = title
706
707        summary = tree.findtext('summary')
708        if summary:
709            self.Summary = summary
710
711        authorElement = tree.find('author')
712        logging.debug("Adding author data")
713        author = Person()
714        author.fromETElement(authorElement)
715        self.author = author
716
717        contributorElements = tree.findall('contributor')
718        for contributorElement in contributorElements:
719            logging.debug("Adding contributor data")
720            contributor = Person(personType = Person.CONTRIBUTOR_TYPE)
721            contributor.fromETElement(contributorElement)
722            self.contributors.append(contributor)
723
724        molesElement = tree.find('entity')
725        if molesElement:
726            self.ME.fromET(molesElement)
727               
728        self.atomID = tree.findtext('id')
729
730        self._parseCategoryData(tree.findall('category'))
731
732        self._parseLinksData(tree.findall('link'))
733           
734        contentTag = tree.find('content')
735        if contentTag != None:
736            logging.debug("Found content tag - checking for CSML/CDML file data")
737            file = contentTag.attrib.get('src')
738            if file:
739                # NB, the path will reveal more reliably whether we're dealing with CSML and CDML files
740                if file.upper().find('CSML') > -1:
741                    logging.debug("Adding CSML file data")
742                    self.csmlFile = file
743                elif file.upper().find('CDML') > -1:
744                    logging.debug("Adding CDML file data")
745                    self.cdmlFile = file
746                self.contentFile = file
747            else:
748                logging.debug("No file data - adding contents of element instead")
749                self.Content = contentTag.text
750       
751        range = tree.findtext('temporalRange')
752        if range:
753            logging.debug("Adding temporal range data")
754            timeData = range.split('/')
755            self.t1 = timeData[0]
756            if len(timeData) > 1:
757                self.t2 = timeData[1]
758       
759        # NB, this parser won't mind if we're dealing with Envelope or EnvelopeWithTimePeriod
760        minBBox = tree.findall('.//lowerCorner')
761        if minBBox:
762            logging.debug("Adding min spatial range data")
763            minBBox = minBBox[0]
764            spatialData = minBBox.text.split()
765            self.minX = spatialData[0]
766            if len(spatialData) > 1:
767                self.minY = spatialData[1]
768       
769        maxBBox = tree.findall('.//upperCorner')
770        if maxBBox:
771            maxBBox = maxBBox[0]
772            logging.debug("Adding max spatial range data")
773            spatialData = maxBBox.text.split()
774            self.maxX = spatialData[0]
775            if len(spatialData) > 1:
776                self.maxY = spatialData[1]
777               
778        publishedDate = tree.findtext('published')
779        if publishedDate:
780            logging.debug("Adding published date")
781            self.publishedDate = publishedDate
782               
783        updatedDate = tree.findtext('updated')
784        if updatedDate:
785            logging.debug("Adding updated date")
786            self.updatedDate = updatedDate
787           
788        logging.info("Completed data ingest")
789   
790   
791    def _parseCategoryData(self, categories):
792        logging.debug("Adding category/parameters data")
793        for category in categories:
794            cat = Category()
795            cat.fromETElement(category)
796           
797            if cat.term == self.ATOM_TYPE:
798                logging.debug("Found atom type data")
799                self.atomTypeID = cat.label
800                self.atomTypeName = self.VTD.TERM_DATA[cat.label].title
801                continue
802            elif cat.term == self.ATOM_SUBTYPE:
803                logging.debug("Found atom subtype data")
804                self.subtypeID = cat.label
805                self.subtype = cat.scheme
806                continue
807
808            self.parameters.append(cat)
809   
810
811    def setDatasetID(self, datasetID):
812        '''
813        Set the dataset ID for the atom - and generate an appropriate atom name using this
814        @param datasetID: ID to set for the atom
815        '''
816        self.datasetID = datasetID
817        self._generateAtomName(datasetID) 
818        self.atomID = self.createAtomID(datasetID)
819
820
821    def createAtomID(self, datasetID):
822        '''
823        Create a unique ID, conforming to atom standards, for atom
824        NB, see http://diveintomark.org/archives/2004/05/28/howto-atom-id
825        @param datasetID: ID of atom's dataset
826        @return: unique ID
827        '''
828        logging.info("Creating unique ID for atom")
829        if not self.atomBrowseURL:
830            self._generateAtomName(datasetID)
831        urlBit = self.atomBrowseURL.split('://')[1]
832        urlBit = urlBit.replace('#', '')
833        urlBits = urlBit.split('/')
834        dateBit = datetime.datetime.today().strftime("%Y-%m-%d")
835       
836        id = "tag:" + urlBits[0] + "," + dateBit + ":/" + "/".join(urlBits[1:])
837        logging.info("- unique ID created for atom")
838        logging.debug(" - '%s'" %id)
839        return id
840       
841       
842    def _generateAtomName(self, datasetID):
843        '''
844        Generate a consistent name for the atom - with full eXist doc path
845        @param datasetID: ID of atom's dataset
846        '''
847        self.atomName = datasetID + ".atom"
848        self.ndgURI = self.ME.providerID + "__ATOM__" + datasetID
849        self.atomBrowseURL = VTD.BROWSE_ROOT_URL + self.ndgURI
850
851
852    def _parseLinksData(self, links):
853        '''
854        Extract links and atom data from array of link elements in the XML representation of the atom
855        @param links: an array of <link> elements
856        '''
857        # firstly, get all data to start with, so we can properly process it afterwards
858        linkData = {}
859        logging.debug("Getting link data")
860        for linkTag in links:
861            link = Link()
862            link.fromETElement(linkTag)
863
864            if not linkData.has_key(link.rel):
865                linkData[link.rel] = []
866           
867            linkData[link.rel].append(link)
868
869        # there should be one self referencing link - which will provide info on the atom itself
870        if not linkData.has_key('self'):
871            errorMessage = "Atom does not have self referencing link - " + \
872                "cannot ascertain datasetID without this - please fix"
873            logging.error(errorMessage)
874            raise ValueError(errorMessage)
875       
876        # this is the link describing the atom itself
877        self.atomBrowseURL = linkData['self'][0].href
878       
879        self.datasetID = self.atomBrowseURL.split("__ATOM__")[-1]
880        self.atomName = self.datasetID + ".atom"
881        self.ndgURI = self.atomBrowseURL.split(VTD.BROWSE_ROOT_URL)[1]
882       
883        # now remove this value and the associated moles doc link
884        del linkData['self']
885        molesDoc = self.atomBrowseURL.replace('ATOM', 'NDG-B1')
886        if linkData.has_key('related'):
887            relatedLinks = []
888            for link in linkData['related']:
889                if link.href != molesDoc:
890                    relatedLinks.append(link)
891           
892            linkData['related'] = relatedLinks
893               
894        # now add the remaining links to the atom
895        for key in linkData:
896            for link in linkData[key]:
897                logging.debug("Adding link data")
898                self.relatedLinks.append(link)
899       
900
901    def _addSpatialData(self, element):
902        '''
903        Add spatial coverage element to an input element
904        @param element: element to add coverage data to
905        '''
906        logging.info("Adding spatial data to Atom")
907        bbox = ET.SubElement(element, "georss:where")
908        if not self.minX:
909            logging.info("No spatial data specified")
910            return
911       
912        envelope = ET.SubElement(bbox, "gml:Envelope")
913        lc = ET.SubElement(envelope, "gml:lowerCorner")
914        lc.text = self.minX + " " + self.minY
915        uc = ET.SubElement(envelope, "gml:upperCorner")
916        uc.text = self.maxX + " " + self.maxY
917
918       
919    def setAttribute(self, attributeName, attributeValue):
920        '''
921        Set the value of an atom attribute - and do some basic tidying up of the string content
922        - to escape any XML unfriendly characters
923        @param attributeName: name of the attribute whose value to set
924        @param attributeValue: value to set the attribute to 
925        '''
926        logging.debug("Setting attribute, %s, to %s" %(attributeName, attributeValue))
927        origValue = attributeValue
928       
929        # escape any special characters if a value has been specified
930        # NB, need to cope with both single values and arrays
931        if attributeValue:
932            if type(attributeValue) is list:
933                newVals = []
934                for val in attributeValue:
935                    newVals.append(self.objectify(escapeSpecialCharacters(val), attributeName))
936                attributeValue = newVals
937                   
938            else:
939                attributeValue = self.objectify(escapeSpecialCharacters(attributeValue), attributeName)
940
941        # handle the special case of authors; only one author is allowed per atom
942        # - the others should be treated as contributors
943        if attributeName == "authors":
944            setattr(self, "author", attributeValue[0])
945            if len(attributeValue) > 1:
946                setattr(self, "contributors", attributeValue[1:])
947        elif attributeName == "atomAuthors":
948            self.ME.responsibleParties.append(attributeValue)
949        else:
950            setattr(self, attributeName, attributeValue)
951
952
953    def objectify(self, objectVals, attributeName):
954        '''
955        Some inputs are specified as strings but need to be converted into
956        objects - do this here
957        @param objectVals: a '|' delimited string of values
958        @param attributeName: name of attribute the values belong to
959        '''
960        obj = None
961        if type(objectVals) != str:
962            return objectVals
963       
964        if attributeName == "relatedLinks":
965            obj = Link()
966        elif attributeName == "atomAuthors" or attributeName == "authors":
967            obj = Person()
968
969        if obj:
970            obj.fromString(objectVals)
971            return obj
972       
973        return objectVals
974
975
976    def toPrettyXML(self):
977        '''
978        Returns nicely formatted XML as string
979        '''
980        atomXML = self.toXML()
981
982        # create the string
983        logging.debug("Converting the elementtree object into a string")
984        prettyXML = et2text(atomXML.getroot())
985
986        # add XML version tag
987        prettyXML = "<?xml version=\"1.0\" encoding=\"utf-8\" ?>\n" + prettyXML
988        logging.info("Created formatted version of XML object")
989        return prettyXML
990
991
992    def getLinksOfType(self, termID):
993        '''
994        Returns links in the atom related links attribute which match the specified
995        term ID
996        @param termID: the termID to look for in the related links - NB, this is
997        matched to the end of the link.rel value
998        @return links: array of Link objects with matching term type
999        '''
1000        logging.debug("Getting atom links of type, '%s'" %termID)
1001        matchingLinks = []
1002        for link in self.relatedLinks:
1003            # firstly, handle special case where we only want the online ref type links
1004            # returned
1005            if termID == self.ONLINE_REF_LABEL:
1006                if not link.isChildAtom():
1007                    logging.debug("- found link with matching term type")
1008                    matchingLinks.append(link)
1009               
1010            elif link and link.rel and link.rel.lower().endswith(termID.lower()):
1011                logging.debug("- found link with matching term type")
1012                matchingLinks.append(link)
1013               
1014        logging.debug("Returning matched links")
1015        return matchingLinks
1016   
1017   
1018    def validate(self):
1019        '''
1020        Check the various values of the various atom attributes; if an error with any of
1021        these is found, raise a ValueError
1022        @raise ValueError: if any atom attributes have a problem
1023        '''
1024        logging.info("Validating the atom data model")
1025        errors = {}
1026        if not self.title:
1027            errors['title'] = "Title attribute cannot be empty"
1028           
1029        if self.minX or self.maxX or self.minY or self.maxY:
1030            missingVals = False
1031            incorrectFormat = False 
1032            for val in [self.minX, self.maxX, self.minY, self.maxY]:
1033                if val == '':
1034                    missingVals = True
1035                else:
1036                    try:
1037                        float(val)
1038                    except:
1039                        incorrectFormat = True
1040           
1041            if missingVals or incorrectFormat:
1042                errors['spatialcoverage'] = ""
1043            if missingVals:
1044                errors['spatialcoverage'] += "Incomplete spatial coverage data.\n"
1045            if incorrectFormat:
1046                errors['spatialcoverage'] += "Spatial coverage data not in numerical format."
1047
1048        if self.t1 or self.t2:
1049            timeErrors = ''
1050            d1 = None
1051            d2 = None
1052            if self.t1:
1053                try:
1054                    d1 = datetime.datetime.strptime(self.t1, self.YEAR_FORMAT)
1055                except:
1056                    timeErrors += "Incorrect start date format - '%s' - c.f. '2008-04-12. \n'" %self.t1
1057            if self.t2:
1058                try:
1059                    d2 = datetime.datetime.strptime(self.t2, self.YEAR_FORMAT)
1060                except:
1061                    timeErrors += "Incorrect end date format - '%s' - c.f. '2008-04-12. \n'" %self.t2
1062
1063            if d1 and d2:
1064                if d1 > d2 or d2 < d1:
1065                    timeErrors += "Inconsistent date range - '%s' is not before '%s'" \
1066                        %(d1.strftime(self.YEAR_FORMAT), d2.strftime(self.YEAR_FORMAT))
1067
1068            if timeErrors:
1069                errors['temporalrange'] = timeErrors
1070
1071           
1072        # do a quick recursion over all the attributes to look for ascii characters
1073        for key, val in self.__dict__.items():
1074            if val:
1075                if type(val) == str:
1076                    try:
1077                        # NB, the latin coding accepts unicode up to 255
1078                        correctedString = val.decode('latin-1')
1079                    except:
1080                        if not errors.has_key(key):
1081                            errors[key] = ''
1082                        errors[key] += "Illegal unicode found in string: '%s'.\n" %val
1083               
1084        if errors:
1085            logging.warning("Errors found in atom data: %s" %errors)
1086            raise ValidationError(errors)
1087        logging.info("Atom model validated successfully")
1088       
1089       
1090    def getLogos(self):
1091        '''
1092        Return related links that are logos
1093        @return: array of Links containing the logos for the atom
1094        '''
1095        logos = []
1096        for link in self.relatedLinks:
1097            if link.rel.lower().endswith(VTD.LOGO_TERM.lower()):
1098                logos.append(link)
1099               
1100        return logos
1101   
1102   
1103    def isGranule(self):
1104        if self.atomTypeID == VTD.GRANULE_TERM:
1105            return True
1106        return False
1107   
1108   
1109    def isDE(self):
1110        if self.atomTypeID == VTD.DE_TERM:
1111            return True
1112        return False
1113   
1114    def isDeployment(self):
1115        if self.subtypeID and self.subtypeID == VTD.DEPLOYMENT_TERM:
1116            return True
1117        return False
1118
1119       
1120   
def addCSMLData(csml, aggregateCoverage=False):
    '''
    Parse CSML data and add extracted info to the atom; the dataset ID, title,
    bounding box, time range and parameter summaries are taken from the CSML doc
    NOTE(review): this is defined at module level but the body reads and writes
    'self', which is not a parameter - as written, any call will fail as soon as
    'self' is looked up.  It looks like it was meant to be a method of the atom
    class - confirm and fix.
    NOTE(review): the 'csml' input is never used - the CSML doc is loaded from
    'self._csmlFile' instead.
    NOTE(review): 'YEAR_FORMAT' and 'formatDateYYYYMMDD' are used unqualified and
    are not among the names imported at the top of the file - confirm they are
    defined somewhere in the module.
    @param csml: csml file contents - or path to csml file
    @keyword aggregateCoverage: if set to True, only coverage data that extends the
    atom coverage data will be added
    '''
    logging.info("Creating CSML data model")
    csmlDoc = CsmlParser.Dataset(file=self._csmlFile)

    logging.info("Extracting info from CSML file")
    logging.debug("Got dataset ID: %s" %csmlDoc.id)
    self.setDatasetID(csmlDoc.id)

    title = csmlDoc.name.CONTENT
    logging.debug("Got dataset name (title): '%s'" %title)
    # NB, if a title is specified (and not as the default value), it automatically is used in
    # place of anything in the granulite file
    if title and title != "NAME OF DATASET GOES HERE":
        logging.info("Title, '%s', extracted from CSML file" %title)
        if self.title:
            logging.info("- NB, this will override the title specified in the granulite file ('%s')" \
                         %self.title)
        self.title = title

    bbox1 = csmlDoc.getBoundingBox()
    bbox2 = csmlDoc.getCSMLBoundingBox()
    time = bbox2.getTimeLimits()

    # now check for other parameters to add to granule
    # Firstly, extract the bounding envelope
    if bbox1:
        # bbox1 is indexed as [0]=west, [1]=south, [2]=east, [3]=north below; the
        # west/east values are passed through moveBox before use
        w, e = self.moveBox(bbox1[0],bbox1[2])
        n, s = (bbox1[3], bbox1[1])

        # when aggregating, a value only replaces the existing one if none is set
        # or if it extends the current coverage; otherwise it always replaces it
        if not aggregateCoverage or (not self.maxY or float(n) > float(self.maxY)):
            self.maxY = n

        if not aggregateCoverage or (not self.minY or float(s) < float(self.minY)):
            self.minY = s

        if not aggregateCoverage or (not self.minX or float(w) < float(self.minX)):
            self.minX = w

        if not aggregateCoverage or (not self.maxX or float(e) > float(self.maxX)):
            self.maxX = e

        logging.debug("Got bounding box data from file: (%s, %s) , (%s, %s)" \
                      %(w, s, e, n))

        logging.debug("Updated atom bounding box data: (%s, %s) , (%s, %s)" \
                      %(self.minX, self.minY, self.maxX, self.maxY))
    else:
        logging.debug("No valid bounding box data found")

    if time:
        # time[0] is used as the start and time[1] as the end of the range; the
        # same aggregation rule as for the bounding box is applied to each
        t1 = formatDateYYYYMMDD(time[0])
        if not aggregateCoverage or \
            (not self.t1 or datetime.datetime.strptime(t1, YEAR_FORMAT) < \
                datetime.datetime.strptime(self.t1, YEAR_FORMAT)):
            self.t1 = t1

        t2 = time[1]
        # the end time may be missing - or come through as the string 'None'
        if t2 and t2 != 'None':
            t2 = formatDateYYYYMMDD(t2)
            if not aggregateCoverage or \
                (not self.t2 or datetime.datetime.strptime(t2, YEAR_FORMAT) > \
                    datetime.datetime.strptime(self.t2, YEAR_FORMAT)):
                self.t2 = t2

        logging.debug("Got time range: %s -> %s" %(self.t1, self.t2))
    else:
        logging.debug("No valid time range data found")

    #create parameter summaries:
    #set up list to hold the parameters data
    parameters = []
    for feature in csmlDoc.featureCollection.featureMembers:
        if hasattr(feature.parameter, 'href'):
            paramTriple = ""
            # NB, only features with a description produce a parameter entry, in
            # the form 'description | parameter href | name' - the name part is
            # left empty if the feature has no name
            if hasattr(feature, 'description'):
                paramTriple = feature.description.CONTENT
                paramTriple += " | " + feature.parameter.href

                term = ""
                if hasattr(feature, 'name'):
                    term = feature.name.CONTENT

                paramTriple += " | " + term

                logging.debug("Got parameter info: %s" %paramTriple)
                parameters.append(paramTriple)

    # update the atom with the extracted parameters
    logging.info("Adding CSML parameters to granule atom")
    self.addParameters(parameters)
    logging.info("Finished adding CSML data")
Note: See TracBrowser for help on using the repository browser.