source: exist/trunk/python/ndgUtils/models/Atom.py @ 4444

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/exist/trunk/python/ndgUtils/models/Atom.py@4444
Revision 4444, 47.1 KB checked in by cbyrom, 12 years ago (diff)

Add new xquery to lookup deployment atoms associated with an atom ID +
add code to models to allow the use of this to retrieve this information
+ update tests + avoid doubly escaping special characters.

Line 
1'''
2 Class representing data in  atom format - allowing moles data to be stored and accessed in a web feed compatible way
3 
4 @author: C Byrom, Tessella Jun 2008
5'''
6try: #python 2.5
7    from xml.etree import cElementTree as ET
8except ImportError:
9    try:
10        # if you've installed it yourself it comes this way
11        import cElementTree as ET
12    except ImportError:
13        # if you've egged it this is the way it comes
14        from ndgUtils.elementtree import cElementTree as ET
15import sys, logging, re, datetime
16from ndgUtils.eXistConnector import eXistConnector
17from ndgUtils.ETxmlView import et2text
18from utilities import getTripleData, escapeSpecialCharacters, \
19    tidyUpParameters, getISO8601Date
20from ndgUtils.vocabtermdata import VocabTermData as VTD
21from ndgUtils.models import MolesEntity as ME
22import csml.parser as CsmlParser
23from ndgUtils.models import Deployment as Deployment
24
25
class AtomError(Exception):
    """
    Error raised by the Atom model code.

    The message is written to the error log when the exception is created.
    """
    def __init__(self, msg):
        # log at construction time, before the exception is initialised
        logging.error(msg)
        super(AtomError, self).__init__(msg)
33
34
class ValidationError(Exception):
    """
    Exception raised when data validation fails.

    The exception message is always "Data validation error"; the individual
    problems are held in the dictionary passed to the constructor, each value
    of which is written to the error log.
    """
    def __init__(self, errorDict):
        """
        @param errorDict: dictionary whose values describe the validation errors
        """
        msg = "Data validation error"
        logging.error(msg)
        Exception.__init__(self, msg)
        # values() rather than the python 2 only itervalues()
        for val in errorDict.values():
            logging.error(val)
        self._errorDict = errorDict

    def unpack_errors(self):
        """
        @return: the error dictionary supplied when the exception was created
        """
        return self._errorDict
49
50
class Person(object):
    '''
    Class representing atom author type data - with name, uri and role attributes
    '''
    AUTHOR_TYPE = 0
    CONTRIBUTOR_TYPE = 1
    RESPONSIBLE_PARTY_TYPE = 2
    # element names to use in the XML output - indexed by the type constants above
    ELEMENT_NAMES = ["author", "contributor", "responsibleParty"]

    def __init__(self, personType = AUTHOR_TYPE, namespace = None):
        '''
        @keyword personType: one of the *_TYPE constants (default AUTHOR_TYPE)
        @keyword namespace: namespace prefix to use for the XML element names;
        NB, this is overridden with 'moles' for responsible parties
        '''
        self.type = personType
        self.ns = namespace
        self.name = ""
        self.uri = ""
        self.role = ""

        # NB, the atom format specifies slightly different data contents
        self.uriTagName = "email"
        # NB, responsible party data is always stored in the moles section
        if self.type == self.RESPONSIBLE_PARTY_TYPE:
            self.ns = 'moles'
            self.uriTagName = "uri"

    def __str__(self):
        '''
        @return: 'name | uri | role' - or an empty string if none of these are set
        '''
        if self.name or self.uri or self.role:
            return self.name + " | " + self.uri + " | " + self.role
        return ""

    def fromString(self, personString):
        '''
        Set name, uri and role (in that order) from the triple returned by
        getTripleData for the input string
        @param personString: string to extract the person data from
        '''
        (self.name, self.uri, self.role) = getTripleData(personString)

    def fromETElement(self, personTag):
        '''
        Set name, role and uri from the child elements of an elementtree element
        - missing elements result in empty strings
        @param personTag: element with 'name', 'role' and uri/email children
        '''
        self.name = personTag.findtext('name') or ""
        self.role = personTag.findtext('role') or ""
        self.uri = personTag.findtext(self.uriTagName) or ""
        logging.debug("Added name: '%s', role: '%s', %s: '%s'" \
                      %(self.name, self.role, self.uriTagName, self.uri))

    def toXML(self):
        '''
        Create an elementtree element for the person - child elements are
        only added for the attributes that are set
        @return: element representing the person
        '''
        prefix = ""
        if self.ns:
            prefix = self.ns + ':'

        author = ET.Element(prefix + self.ELEMENT_NAMES[self.type])

        if self.name:
            name = ET.SubElement(author, prefix + "name")
            name.text = self.name

        if self.uri:
            uri = ET.SubElement(author, prefix + self.uriTagName)
            uri.text = self.uri

        if self.role:
            role = ET.SubElement(author, prefix + "role")
            role.text = self.role

        return author

    def __eq__(self, other):
        '''
        Persons are equal if their uri, name, role and type all match - this
        allows proper object comparison when checking if Person objects are
        in an array already - i.e. if person in personArray...
        '''
        if self is other:
            return True
        # NB, leave comparison with other object types to python
        if not isinstance(other, Person):
            return NotImplemented
        return self.uri == other.uri and self.name == other.name and \
            self.role == other.role and self.type == other.type

    def __ne__(self, other):
        # NB, python 2 does not derive this from __eq__
        result = self.__eq__(other)
        if result is NotImplemented:
            return result
        return not result

    def __cmp__(self, person1):
        '''
        Retained for python 2 code that calls cmp() - returns 0 if the
        persons are equal, -1 if the input is empty and 1 otherwise
        '''
        if not person1:
            return -1

        if self.__eq__(person1) is True:
            return 0
        return 1
124
125
class Link(object):
    '''
    Class representing an atom link - with href, title and rel attributes
    '''

    def __init__(self):
        self.href = ""
        self.title = ""
        self.rel = ""

    def fromString(self, linkString):
        '''
        Set href, title and rel (in that order) from the triple returned by
        getTripleData for the input string
        @param linkString: string to extract the link data from
        '''
        (self.href, self.title, self.rel) = getTripleData(linkString)

    def fromETElement(self, linkTag):
        '''
        Set href, rel and title from the attributes of an elementtree element
        - missing attributes result in empty strings
        @param linkTag: <link> element to extract the data from
        '''
        self.href = linkTag.attrib.get('href') or ""
        self.rel = linkTag.attrib.get('rel') or ""
        self.title = linkTag.attrib.get('title') or ""

    def toXML(self):
        '''
        @return: <link> element with href, title and rel attributes; NB, all
        three attributes are always set, even when empty
        '''
        link = ET.Element("link")
        link.attrib["href"] = self.href
        link.attrib["title"] = self.title
        link.attrib["rel"] = self.rel
        return link

    def hasValue(self):
        '''
        @return True if the link has an href or a title; False otherwise
        '''
        # NB, just a rel on its own is meaningless - so ignore
        if self.href or self.title:
            return True
        return False

    def __str__(self):
        '''
        @return: 'href | title | rel' - or an empty string if none of these are set
        '''
        if self.href or self.title or self.rel:
            return self.href + " | " + self.title + " | " + self.rel
        return ""

    def isChildAtom(self):
        '''
        Determines whether the link refers to another atom - e.g. a link to
        a data granule
        @return True, if so; False otherwise
        '''
        # NB, endswith accepts a tuple of suffixes to test against
        return self.rel.endswith((VTD.GRANULE_TERM, VTD.DEPLOYMENT_TERM,
                                  VTD.ACTIVITY_TERM, VTD.DPT_TERM,
                                  VTD.OBS_TERM))

    def __eq__(self, other):
        '''
        Links are equal if their href, title and rel all match - this allows
        proper object comparison when checking if Link objects are in an
        array already - i.e. if link in linkArray...
        '''
        if self is other:
            return True
        # NB, leave comparison with other object types to python
        if not isinstance(other, Link):
            return NotImplemented
        return self.href == other.href and self.title == other.title and \
            self.rel == other.rel

    def __ne__(self, other):
        # NB, python 2 does not derive this from __eq__
        result = self.__eq__(other)
        if result is NotImplemented:
            return result
        return not result

    def __cmp__(self, link1):
        '''
        Retained for python 2 code that calls cmp() - returns 0 if the
        links are equal, -1 if the input is empty and 1 otherwise
        '''
        if not link1:
            return -1

        if self.__eq__(link1) is True:
            return 0
        return 1
191
192
class Category(object):
    '''
    An atom category, described by its term, scheme and label attributes
    '''
    def __init__(self):
        self.term = ""
        self.scheme = ""
        self.label = ""

    def fromString(self, linkString, escapeSpecialCharacters=True):
        '''
        Populate the category from a triple string of format, 'label | scheme | term'
        @param linkString: triple string to create category with
        @keyword escapeSpecialCharacters: passed to getTripleData as its
        doEscape keyword (default True)
        '''
        triple = getTripleData(linkString, doEscape=escapeSpecialCharacters)
        (self.label, self.scheme, self.term) = triple

    def fromETElement(self, linkTag):
        '''
        Populate the category from the attributes of an elementtree element
        - any attribute that is missing is set to an empty string
        @param linkTag: <category> element to read
        '''
        attributes = linkTag.attrib
        self.term = attributes.get('term') or ""
        self.label = attributes.get('label') or ""
        self.scheme = attributes.get('scheme') or ""

    def toXML(self):
        '''
        @return: <category> element with term, scheme and label attributes set
        '''
        element = ET.Element("category")
        for attName, attValue in (("term", self.term),
                                  ("scheme", self.scheme),
                                  ("label", self.label)):
            element.attrib[attName] = attValue
        return element

    def hasValue(self):
        '''
        @return True if any of scheme, label or term is set; False otherwise
        '''
        return bool(self.scheme or self.label or self.term)
228
229
230class Atom(object):
231
232    # labels for use with the atom categories
233    ATOM_TYPE = "ATOM_TYPE"
234    ATOM_SUBTYPE = "ATOM_SUBTYPE"
235
236    # labels for use with the templates to set/extract specific inputs
237    ONLINE_REF_LABEL = "online_ref"
238    PARAMETER_LABEL = "parameter"
239    ATOM_REF_LABEL = "atom_ref"
240    DELIMITER = "---"
241    REMOVE_LABEL = "remove"
242   
243    # format to use for t1-t2 date range
244    YEAR_FORMAT = '%Y-%m-%d'
245
246    def __init__(self, atomType = None, vocabTermData = None, ndgObject = None, \
247                 xmlString = None, state = eXistConnector.WORKING_COLLECTION_PATH, **inputs):
248        '''
249        Constructor - initialise the atom variables
250        '''
251        logging.info("Initialising atom")
252        if atomType:
253            logging.info(" - of type '%s'" %atomType)
254        self.atomTypeID = atomType
255
256        # some data have further subtypes specified
257        self.subtypeID = None # this should be the termID
258        self.subtype = None # and this should be the fully formed vocab URL
259       
260        self.ndgObject = ndgObject
261
262        self.atomName = None
263        self.files = []
264        self.author = None
265        self.contributors = []
266        self.atomAuthors = []
267        self.parameters = []
268        self.spatialData = []
269        self.temporalData = []
270        self.relatedLinks = []
271        self.summary = []
272        self.content = []
273        # NB, this deployments data duplicates other atom data - and is only used for a
274        # convenient way to collect the info (by lookupDeploymentsInfo()) for use in templates
275        self.deployments = []   
276        self.csmlFile = None
277        self.cdmlFile = None
278        # general variable to use for setting the atom content - NB, if a csmlFile is specified
279        # (either directly or via a cdmlFile specification), this will be the content by default
280        # for this purpose
281        self.contentFile = None     
282        self.title = None
283        self.datasetID = None        # NB, the dataset id ends up in the atomName - <path><datasetID>.atom
284        self.atomID = None
285   
286        # boundary box info - to replace spatial/temporalData?
287        self.minX = None
288        self.minY = None
289        self.maxX = None
290        self.maxY = None
291        self.t1 = None
292        self.t2 = None
293
294        self.ME = ME.MolesEntity()
295       
296        # date when the atom was first ingested
297        self.publishedDate = None
298
299        # last update date
300        self.updatedDate = None
301
302        # assume atom in working state by default - this is used to define what collection
303        # in eXist the atom is stored in
304        self.state = state
305       
306        # additional, non standard atom data can be included in the molesExtra element
307        if vocabTermData:
308            self.VTD = vocabTermData
309        else:
310            self.VTD = VTD()
311       
312        if xmlString:
313            self.fromString(xmlString)
314
315        # if inputs passed in as dict, add these now
316        if inputs:
317            logging.info("Adding info to atom from input dict")
318            logging.debug(inputs)
319            self.__dict__.update(inputs)
320           
321            # NB, this doesn't trigger the Summary Property, so do this
322            # explicitly, if need be
323            if inputs.has_key('Summary'):
324                self.Summary = inputs.get('Summary')
325            if inputs.has_key('Content'):
326                self.Content = inputs.get('Content')
327           
328            # also pass any moles data up to the moles entity object
329            if inputs.get('providerID'):
330                self.ME.providerID = inputs.get('providerID')
331               
332            if inputs.get('abbreviation'):
333                self.ME.abbreviation = inputs.get('abbreviation')
334
335        if self.atomTypeID:
336            self.atomTypeName = self.VTD.TERM_DATA[self.atomTypeID].title
337
338        logging.info("Atom initialised")
339
340
341    def addOnlineReferences(self, links):
342        '''
343        Add online reference data associated with the atom
344        - NB, care needs to be taken here since this data is stored in the atom
345        link elements and these are also used for the various atom associations
346        @param links: a Link or array of Links to add to the relatedLinks attribute
347        '''
348        logging.debug("Adding online references")
349        if not links:
350            return
351       
352        if type(links) is not list:
353            links = [links]
354       
355        # firstly clear out any online refs data from the existing related links
356        newLinks = []
357        for link in self.relatedLinks:
358            if link.isChildAtom():
359                newLinks.append(link)
360       
361        newLinks.extend(links)
362        self.relatedLinks = newLinks
363        logging.debug("Online references added")
364
365
366    def addUniqueRelatedLinks(self, links):
367        '''
368        Add links to relatedLinks array - if they are not already included
369        @param links: a Link or array of Links to add to the relatedLinks attribute
370        '''
371        self.addUniqueLinks(self.relatedLinks, links)
372       
373
374    def removeRelatedLinks(self, linksToDelete):
375        '''
376        Remove any links in the input list from the atom's related links list
377        @param linksToDelete: array of Link objects to remove from atom
378        '''
379        logging.debug("Removing related links from atom")
380        if not linksToDelete:
381            return
382       
383        if type(linksToDelete) is not list:
384            linksToDelete = [linksToDelete]
385       
386        updatedLinks = []
387        for link in self.relatedLinks:
388            if type(link) is not Link:
389                logging.warning("Link is not of 'Link' object type (type='%s') - skipping" %type(link))
390                continue
391            if link in linksToDelete:
392                logging.debug("- found link to remove")
393            else:
394                updatedLinks.append(link)
395
396        self.relatedLinks = updatedLinks
397        logging.debug("Links removed")
398       
399
400    def getDefaultCollectionPath(self):
401        '''
402        Determine the correct collection to use for the atom in eXist
403        '''
404        collectionPath = eXistConnector.BASE_COLLECTION_PATH + self.state
405       
406        if self.atomTypeID == VTD.DE_TERM:
407            collectionPath += eXistConnector.DE_COLLECTION_PATH
408        elif self.atomTypeID == VTD.GRANULE_TERM:
409            collectionPath += eXistConnector.GRANULE_COLLECTION_PATH
410        elif self.atomTypeID == VTD.ACTIVITY_TERM and \
411            self.subtypeID == VTD.DEPLOYMENT_TERM:
412            collectionPath += eXistConnector.DEPLOYMENTS_COLLECTION_PATH
413        else:
414            collectionPath += eXistConnector.DEPLOYMENT_COLLECTION_PATH
415       
416        if not self.ME.providerID:
417            raise AtomError("Error: cannot determine atom collection path because " + \
418                            "the provider ID is not defined")
419           
420        collectionPath += self.ME.providerID + "/"
421        return collectionPath
422
423
424    def __addAtomTypeDataXML(self, root):
425        '''
426        Add the atom type, and subtype data, if available, to atom categories
427        - and lookup and add the appropriate vocab term data
428        '''
429        if self.atomTypeID:
430            logging.info("Adding atom type info to XML output")
431            category = Category()
432            category.label = self.atomTypeID
433            # look up the appropriate vocab term data
434            category.scheme = self.VTD.getTermCurrentVocabURL(self.atomTypeID)
435            category.term = self.ATOM_TYPE
436            root.append(category.toXML())
437
438        if self.subtypeID:
439            logging.info("Adding atom subtype info to XML output")
440            # NB subtypes not all defined, so leave this out for the moment
441            category.label = self.subtypeID
442            # look up the appropriate vocab term data
443            category.scheme = self.VTD.getTermCurrentVocabURL(self.subtypeID)
444            category.term = self.ATOM_SUBTYPE
445            root.append(category.toXML())
446
447
448    def addMolesEntityData(self, abbreviation, provider_id, object_creation_time):
449        '''
450        Add data to include in the moles entity element
451        '''
452        logging.debug('Adding moles entity information')
453        self.ME.abbreviation = abbreviation
454        self.ME.providerID = provider_id
455        self.ME.createdDate = getISO8601Date(object_creation_time)
456        logging.debug('Moles entity information added')
457
458
459    def addAuthors(self, authors):
460        '''
461        Add author data appropriately to the atom
462        NB, these will overwrite any existing authors of the same type
463        @param authors: list of Person objects with the author data
464        '''
465        logging.debug('Adding authors data to Atom')
466        isFirstAuthor = {}
467        authorArray = None
468        for author in authors:
469            # NB, we're only allowed one atom author
470            if author.type == Person.AUTHOR_TYPE:
471                self.author = author
472                if isFirstAuthor.has_key(author.type):
473                    raise AtomError("Error: an atom can only have one author specified")
474                isFirstAuthor[author.type] = 1
475                continue
476            elif author.type == Person.CONTRIBUTOR_TYPE:
477                authorArray = self.contributors
478            elif author.type == Person.RESPONSIBLE_PARTY_TYPE:
479                authorArray = self.ME.responsibleParties
480               
481            # check if this is the first addition - if so, clear out the
482            # array in advance
483            if not isFirstAuthor.has_key(author.type):
484                logging.debug("Clearing out author array")
485                # NB, need to be careful to clear the array, not create a ref
486                # to a new array
487                del authorArray[:]
488                isFirstAuthor[author.type] = 1
489
490            if str(author) != "" and author not in authorArray:
491                logging.debug("Adding author (type:'%s', name:'%s', uri:'%s', role:'%s')" \
492                              %(author.type, author.name, author.uri, author.role))
493                authorArray.append(author)
494
495        logging.debug('Finished adding authors data')
496
497
498    def _isNewParameter(self, param):
499        '''
500        Check if a parameter is already specified in the atom, return False if
501        so, otherwise return True
502        '''
503        for p in self.parameters:
504            if p.term == param.term and \
505                p.scheme == param.scheme and \
506                p.label == param.label:
507                return False
508        return True
509
510
511    def addRelatedLinks(self, linkVals):
512        '''
513        Add related links in string format - converting to Link objects
514        @param linkVals: string of format, 'uri | title | vocabServerURL'
515        '''
516        self.relatedLinks.append(self.objectify(linkVals, 'relatedLinks'))
517
518
519    def addParameters(self, params):
520        '''
521        Add a parameter to list - ensuring it is unique and has been formatted and tidied appropriately
522        @params param: parameter, as string array, to add to atom parameters collection
523        '''
524        # avoid strings being parsed character by character
525        if type(params) is str:
526            params = [params]
527           
528        for param in params:
529            # firstly tidy parameter
530            param = tidyUpParameters(param)
531            category = Category()
532            # NB, data already tidied up here, so set keyword to avoid this happening again
533            category.fromString(param, escapeSpecialCharacters=False)
534
535            # now check for uniqueness
536            if self._isNewParameter(category):
537                logging.debug("Adding new parameter: %s" %param)
538                self.parameters.append(category)
539   
540   
541    def _linksToXML(self, root):
542        '''
543        Add required links to the input element
544        @param root: element to add links to - NB, should be the root element of the atom
545        '''
546        selfLink = ET.SubElement(root, "link")
547        selfLink.attrib["href"] = self.atomBrowseURL
548        selfLink.attrib["rel"] = "self"
549        if self.subtypeID != VTD.DEPLOYMENT_TERM:
550            molesLink = ET.SubElement(root, "link")
551            molesDoc = re.sub('ATOM','NDG-B1', self.atomBrowseURL)
552            molesLink.attrib["href"] = molesDoc
553            molesLink.attrib["rel"] = 'related'
554       
555        for relatedLink in self.relatedLinks:
556            if relatedLink.hasValue():
557                root.append(relatedLink.toXML())
558   
559    def toXML(self):
560        '''
561        Convert the atom into XML representation and return this
562        @return: xml version of atom
563        '''
564        logging.info("Creating formatted XML version of Atom")
565        root = ET.Element("entry")
566        root.attrib["xmlns"] = "http://www.w3.org/2005/Atom"
567        root.attrib["xmlns:moles"] = "http://ndg.nerc.ac.uk/schema/moles2beta"
568        root.attrib["xmlns:georss"] = "http://www.georss.org/georss/10"
569        root.attrib["xmlns:gml"] = "http://www.opengis.net/gml"
570        id = ET.SubElement(root, "id")
571        id.text = self.atomID
572        title = ET.SubElement(root, "title")
573        title.text = self.title
574        self._linksToXML(root)
575
576        # NB, the author tag is mandatory for atoms - so if an explicit
577        # author has not been set, just take the author to be the provider
578        if not self.author:
579            author = Person()
580            author.name = self.ME.providerID
581            author.uri = self.ME.providerID
582            self.author = author
583
584        root.append(self.author.toXML())
585           
586        for contributor in self.contributors:
587            root.append(contributor.toXML())
588
589        # add the moles entity section, if it is required
590        if self.ME:
591            root.append(self.ME.toXML())
592
593        # add parameters data
594        for param in self.parameters:
595            if param.hasValue():
596                root.append(param.toXML())
597
598        # add the type and subtype data
599        self.__addAtomTypeDataXML(root)
600                   
601        summary = ET.SubElement(root, "summary")
602        summary.text = self.Summary
603                   
604        # add link to content, if required - NB, can only have one content element in atom
605        # - and this is mandatory
606        content = ET.SubElement(root, "content")
607        if self.contentFile:
608            content.attrib["type"] = "application/xml"
609            content.attrib["src"] = self.contentFile
610        else:
611            content.text = self.Content
612            content.attrib["type"] = "xhtml"
613       
614        # if there's a published date already defined, assume we're doing an update now
615        # NB, update element is mandatory
616        currentDate = datetime.datetime.today().strftime("%Y-%m-%dT%H:%M:%SZ")
617        if not self.publishedDate:
618            self.publishedDate = currentDate
619
620        updated = ET.SubElement(root, "updated")
621        if not self.updatedDate:
622            self.updatedDate = currentDate
623        updated.text = self.updatedDate
624
625        published = ET.SubElement(root, "published")
626        published.text = self.publishedDate
627
628        # add temporal range data, if available
629        temporalRange = ET.SubElement(root, "moles:temporalRange")
630        if self.t1:
631            temporalRange.text = self.t1
632            if self.t2:
633                temporalRange.text += "/" + self.t2
634
635        # add spatial range data, if available
636        self._addSpatialData(root)
637
638        tree = ET.ElementTree(root)
639        logging.info("XML version of Atom created")
640        return tree
641
642
643    def __getSummary(self):
644        logging.debug("Getting summary data")
645        summaryString = ""
646        for summary_line in self.summary:
647            summaryString += summary_line + "\n"
648
649        return summaryString
650
651    def __setSummary(self, summary):
652        logging.debug("Adding summary data")
653        self.summary = []
654        for summary_line in summary.split('\n'):
655            self.summary.append(escapeSpecialCharacters(summary_line))
656           
657    Summary = property(fset=__setSummary, fget=__getSummary, doc="Atom summary")
658
659
660    def __getContent(self):
661        logging.debug("Getting content data")
662        contentString = ""
663        # NB, there must be content specified in an atom
664        if not self.content:
665            return "Metadata document"
666       
667        for content_line in self.content:
668            contentString += content_line + "\n"
669
670        return contentString
671
672    def __setContent(self, content):
673        logging.debug("Adding content data")
674        self.content = []
675        for content_line in content.split('\n'):
676            self.content.append(escapeSpecialCharacters(content_line))
677           
678    Content = property(fset=__setContent, fget=__getContent, doc="Atom content")
679
680           
681    def fromString(self, xmlString):
682        '''
683        Initialise Atom object using an xmlString
684        @param xmlString: representation of atom as an XML string
685        '''
686        logging.info("Ingesting data from XML string")
687       
688        # firstly, remove any namespaces used - to avoid problems with elementtree
689        logging.debug("Stripping moles namespace from string to allow easy handling with elementtree")
690        xmlString = xmlString.replace('moles:', '')
691        xmlString = xmlString.replace('georss:', '')
692        xmlString = xmlString.replace('gml:', '')
693        xmlString = xmlString.replace('xmlns="http://www.w3.org/2005/Atom"', '')
694        xmlString = xmlString.replace('default:', '')
695
696        # now create elementtree with the XML string
697        logging.debug("Create elementtree instance with XML string")
698        tree = ET.fromstring(xmlString)
699       
700        title = tree.findtext('title')
701        if title:
702            logging.debug("Adding title data")
703            self.title = title
704
705        summary = tree.findtext('summary')
706        if summary:
707            self.Summary = summary
708
709        authorElement = tree.find('author')
710        logging.debug("Adding author data")
711        author = Person()
712        author.fromETElement(authorElement)
713        self.author = author
714
715        contributorElements = tree.findall('contributor')
716        for contributorElement in contributorElements:
717            logging.debug("Adding contributor data")
718            contributor = Person(personType = Person.CONTRIBUTOR_TYPE)
719            contributor.fromETElement(contributorElement)
720            self.contributors.append(contributor)
721
722        molesElement = tree.find('entity')
723        if molesElement:
724            self.ME.fromET(molesElement)
725               
726        self.atomID = tree.findtext('id')
727
728        self._parseCategoryData(tree.findall('category'))
729
730        self._parseLinksData(tree.findall('link'))
731           
732        contentTag = tree.find('content')
733        if contentTag != None:
734            logging.debug("Found content tag - checking for CSML/CDML file data")
735            file = contentTag.attrib.get('src')
736            if file:
737                # NB, the path will reveal more reliably whether we're dealing with CSML and CDML files
738                if file.upper().find('CSML') > -1:
739                    logging.debug("Adding CSML file data")
740                    self.csmlFile = file
741                elif file.upper().find('CDML') > -1:
742                    logging.debug("Adding CDML file data")
743                    self.cdmlFile = file
744                self.contentFile = file
745            else:
746                logging.debug("No file data - adding contents of element instead")
747                self.Content = contentTag.text
748       
749        range = tree.findtext('temporalRange')
750        if range:
751            logging.debug("Adding temporal range data")
752            timeData = range.split('/')
753            self.t1 = timeData[0]
754            if len(timeData) > 1:
755                self.t2 = timeData[1]
756       
757        # NB, this parser won't mind if we're dealing with Envelope or EnvelopeWithTimePeriod
758        minBBox = tree.findall('.//lowerCorner')
759        if minBBox:
760            logging.debug("Adding min spatial range data")
761            minBBox = minBBox[0]
762            spatialData = minBBox.text.split()
763            self.minX = spatialData[0]
764            if len(spatialData) > 1:
765                self.minY = spatialData[1]
766       
767        maxBBox = tree.findall('.//upperCorner')
768        if maxBBox:
769            maxBBox = maxBBox[0]
770            logging.debug("Adding max spatial range data")
771            spatialData = maxBBox.text.split()
772            self.maxX = spatialData[0]
773            if len(spatialData) > 1:
774                self.maxY = spatialData[1]
775               
776        publishedDate = tree.findtext('published')
777        if publishedDate:
778            logging.debug("Adding published date")
779            self.publishedDate = publishedDate
780               
781        updatedDate = tree.findtext('updated')
782        if updatedDate:
783            logging.debug("Adding updated date")
784            self.updatedDate = updatedDate
785           
786        logging.info("Completed data ingest")
787   
788   
789    def _parseCategoryData(self, categories):
790        logging.debug("Adding category/parameters data")
791        for category in categories:
792            cat = Category()
793            cat.fromETElement(category)
794           
795            if cat.term == self.ATOM_TYPE:
796                logging.debug("Found atom type data")
797                self.atomTypeID = cat.label
798                self.atomTypeName = self.VTD.TERM_DATA[cat.label].title
799                continue
800            elif cat.term == self.ATOM_SUBTYPE:
801                logging.debug("Found atom subtype data")
802                self.subtypeID = cat.label
803                self.subtype = cat.scheme
804                continue
805
806            self.parameters.append(cat)
807   
808
809    def setDatasetID(self, datasetID):
810        '''
811        Set the dataset ID for the atom - and generate an appropriate atom name using this
812        @param datasetID: ID to set for the atom
813        '''
814        self.datasetID = datasetID
815        self._generateAtomName(datasetID) 
816        self.atomID = self.createAtomID(datasetID)
817
818
819    def createAtomID(self, datasetID):
820        '''
821        Create a unique ID, conforming to atom standards, for atom
822        NB, see http://diveintomark.org/archives/2004/05/28/howto-atom-id
823        @param datasetID: ID of atom's dataset
824        @return: unique ID
825        '''
826        logging.info("Creating unique ID for atom")
827        if not self.atomBrowseURL:
828            self._generateAtomName(datasetID)
829        urlBit = self.atomBrowseURL.split('://')[1]
830        urlBit = urlBit.replace('#', '')
831        urlBits = urlBit.split('/')
832        dateBit = datetime.datetime.today().strftime("%Y-%m-%d")
833       
834        id = "tag:" + urlBits[0] + "," + dateBit + ":/" + "/".join(urlBits[1:])
835        logging.info("- unique ID created for atom")
836        logging.debug(" - '%s'" %id)
837        return id
838       
839       
840    def _generateAtomName(self, datasetID):
841        '''
842        Generate a consistent name for the atom - with full eXist doc path
843        @param datasetID: ID of atom's dataset
844        '''
845        self.atomName = datasetID + ".atom"
846        self.ndgURI = self.ME.providerID + "__ATOM__" + datasetID
847        self.atomBrowseURL = VTD.BROWSE_ROOT_URL + self.ndgURI
848
849
850    def _parseLinksData(self, links):
851        '''
852        Extract links and atom data from array of link elements in the XML representation of the atom
853        @param links: an array of <link> elements
854        '''
855        # firstly, get all data to start with, so we can properly process it afterwards
856        linkData = {}
857        logging.debug("Getting link data")
858        for linkTag in links:
859            link = Link()
860            link.fromETElement(linkTag)
861
862            if not linkData.has_key(link.rel):
863                linkData[link.rel] = []
864           
865            linkData[link.rel].append(link)
866
867        # there should be one self referencing link - which will provide info on the atom itself
868        if not linkData.has_key('self'):
869            errorMessage = "Atom does not have self referencing link - " + \
870                "cannot ascertain datasetID without this - please fix"
871            logging.error(errorMessage)
872            raise ValueError(errorMessage)
873       
874        # this is the link describing the atom itself
875        self.atomBrowseURL = linkData['self'][0].href
876       
877        self.datasetID = self.atomBrowseURL.split("__ATOM__")[-1]
878        self.atomName = self.datasetID + ".atom"
879        self.ndgURI = self.atomBrowseURL.split(VTD.BROWSE_ROOT_URL)[1]
880       
881        # now remove this value and the associated moles doc link
882        del linkData['self']
883        molesDoc = self.atomBrowseURL.replace('ATOM', 'NDG-B1')
884        if linkData.has_key('related'):
885            relatedLinks = []
886            for link in linkData['related']:
887                if link.href != molesDoc:
888                    relatedLinks.append(link)
889           
890            linkData['related'] = relatedLinks
891               
892        # now add the remaining links to the atom
893        for key in linkData:
894            for link in linkData[key]:
895                logging.debug("Adding link data")
896                self.relatedLinks.append(link)
897       
898
899    def _addSpatialData(self, element):
900        '''
901        Add spatial coverage element to an input element
902        @param element: element to add coverage data to
903        '''
904        logging.info("Adding spatial data to Atom")
905        bbox = ET.SubElement(element, "georss:where")
906        if not self.minX:
907            logging.info("No spatial data specified")
908            return
909       
910        envelope = ET.SubElement(bbox, "gml:Envelope")
911        lc = ET.SubElement(envelope, "gml:lowerCorner")
912        lc.text = self.minX + " " + self.minY
913        uc = ET.SubElement(envelope, "gml:upperCorner")
914        uc.text = self.maxX + " " + self.maxY
915
916       
917    def setAttribute(self, attributeName, attributeValue):
918        '''
919        Set the value of an atom attribute - and do some basic tidying up of the string content
920        - to escape any XML unfriendly characters
921        @param attributeName: name of the attribute whose value to set
922        @param attributeValue: value to set the attribute to 
923        '''
924        logging.debug("Setting attribute, %s, to %s" %(attributeName, attributeValue))
925        origValue = attributeValue
926       
927        # escape any special characters if a value has been specified
928        # NB, need to cope with both single values and arrays
929        if attributeValue:
930            if type(attributeValue) is list:
931                newVals = []
932                for val in attributeValue:
933                    newVals.append(self.objectify(escapeSpecialCharacters(val), attributeName))
934                attributeValue = newVals
935                   
936            else:
937                attributeValue = self.objectify(escapeSpecialCharacters(attributeValue), attributeName)
938
939        # handle the special case of authors; only one author is allowed per atom
940        # - the others should be treated as contributors
941        if attributeName == "authors":
942            setattr(self, "author", attributeValue[0])
943            if len(attributeValue) > 1:
944                setattr(self, "contributors", attributeValue[1:])
945        elif attributeName == "atomAuthors":
946            self.ME.responsibleParties.append(attributeValue)
947        else:
948            setattr(self, attributeName, attributeValue)
949
950
951    def objectify(self, objectVals, attributeName):
952        '''
953        Some inputs are specified as strings but need to be converted into
954        objects - do this here
955        @param objectVals: a '|' delimited string of values
956        @param attributeName: name of attribute the values belong to
957        '''
958        obj = None
959        if type(objectVals) != str:
960            return objectVals
961       
962        if attributeName == "relatedLinks":
963            obj = Link()
964        elif attributeName == "atomAuthors" or attributeName == "authors":
965            obj = Person()
966
967        if obj:
968            obj.fromString(objectVals)
969            return obj
970       
971        return objectVals
972
973
974    def toPrettyXML(self):
975        '''
976        Returns nicely formatted XML as string
977        '''
978        atomXML = self.toXML()
979
980        # create the string
981        logging.debug("Converting the elementtree object into a string")
982        prettyXML = et2text(atomXML.getroot())
983
984        # add XML version tag
985        prettyXML = "<?xml version=\"1.0\" encoding=\"utf-8\" ?>\n" + prettyXML
986        logging.info("Created formatted version of XML object")
987        return prettyXML
988
989
990    def getLinksOfType(self, termID):
991        '''
992        Returns links in the atom related links attribute which match the specified
993        term ID
994        @param termID: the termID to look for in the related links - NB, this is
995        matched to the end of the link.rel value
996        @return links: array of Link objects with matching term type
997        '''
998        logging.debug("Getting atom links of type, '%s'" %termID)
999        matchingLinks = []
1000        for link in self.relatedLinks:
1001            # firstly, handle special case where we only want the online ref type links
1002            # returned
1003            if termID == self.ONLINE_REF_LABEL:
1004                if not link.isChildAtom():
1005                    logging.debug("- found link with matching term type")
1006                    matchingLinks.append(link)
1007               
1008            elif link and link.rel and link.rel.lower().endswith(termID.lower()):
1009                logging.debug("- found link with matching term type")
1010                matchingLinks.append(link)
1011               
1012        logging.debug("Returning matched links")
1013        return matchingLinks
1014   
1015   
1016    def validate(self):
1017        '''
1018        Check the various values of the various atom attributes; if an error with any of
1019        these is found, raise a ValueError
1020        @raise ValueError: if any atom attributes have a problem
1021        '''
1022        logging.info("Validating the atom data model")
1023        errors = {}
1024        if not self.title:
1025            errors['title'] = "Title attribute cannot be empty"
1026           
1027        if self.minX or self.maxX or self.minY or self.maxY:
1028            missingVals = False
1029            incorrectFormat = False 
1030            for val in [self.minX, self.maxX, self.minY, self.maxY]:
1031                if val == '':
1032                    missingVals = True
1033                else:
1034                    try:
1035                        float(val)
1036                    except:
1037                        incorrectFormat = True
1038           
1039            if missingVals or incorrectFormat:
1040                errors['spatialcoverage'] = ""
1041            if missingVals:
1042                errors['spatialcoverage'] += "Incomplete spatial coverage data.\n"
1043            if incorrectFormat:
1044                errors['spatialcoverage'] += "Spatial coverage data not in numerical format."
1045
1046        if self.t1 or self.t2:
1047            timeErrors = ''
1048            d1 = None
1049            d2 = None
1050            if self.t1:
1051                try:
1052                    d1 = datetime.datetime.strptime(self.t1, self.YEAR_FORMAT)
1053                except:
1054                    timeErrors += "Incorrect start date format - '%s' - c.f. '2008-04-12. \n'" %self.t1
1055            if self.t2:
1056                try:
1057                    d2 = datetime.datetime.strptime(self.t2, self.YEAR_FORMAT)
1058                except:
1059                    timeErrors += "Incorrect end date format - '%s' - c.f. '2008-04-12. \n'" %self.t2
1060
1061            if d1 and d2:
1062                if d1 > d2 or d2 < d1:
1063                    timeErrors += "Inconsistent date range - '%s' is not before '%s'" \
1064                        %(d1.strftime(self.YEAR_FORMAT), d2.strftime(self.YEAR_FORMAT))
1065
1066            if timeErrors:
1067                errors['temporalrange'] = timeErrors
1068
1069           
1070        # do a quick recursion over all the attributes to look for ascii characters
1071        for key, val in self.__dict__.items():
1072            if val:
1073                if type(val) == str:
1074                    try:
1075                        # NB, the latin coding accepts unicode up to 255
1076                        correctedString = val.decode('latin-1')
1077                    except:
1078                        if not errors.has_key(key):
1079                            errors[key] = ''
1080                        errors[key] += "Illegal unicode found in string: '%s'.\n" %val
1081               
1082        if errors:
1083            logging.warning("Errors found in atom data: %s" %errors)
1084            raise ValidationError(errors)
1085        logging.info("Atom model validated successfully")
1086       
1087       
1088    def getLogos(self):
1089        '''
1090        Return related links that are logos
1091        @return: array of Links containing the logos for the atom
1092        '''
1093        logos = []
1094        for link in self.relatedLinks:
1095            if link.rel.lower().endswith(VTD.LOGO_TERM.lower()):
1096                logos.append(link)
1097               
1098        return logos
1099   
1100   
1101    def isGranule(self):
1102        if self.atomTypeID == VTD.GRANULE_TERM:
1103            return True
1104        return False
1105   
1106   
1107    def isDE(self):
1108        if self.atomTypeID == VTD.DE_TERM:
1109            return True
1110        return False
1111   
1112    def isDeployment(self):
1113        if self.subtypeID and self.subtypeID == VTD.DEPLOYMENT_TERM:
1114            return True
1115        return False
1116   
1117    def isDeployable(self):
1118        if (self.atomTypeID == VTD.ACTIVITY_TERM and self.subtypeID != VTD.DEPLOYMENT_TERM) or \
1119            self.atomTypeID == VTD.DPT_TERM or \
1120            self.atomTypeID == VTD.OBS_TERM:
1121            return True
1122        return False
1123
1124       
1125    def addCSMLData(csml, aggregateCoverage=False):
1126        '''
1127        Parse CSML data and add extracted info to the atom
1128        @param csml: csml file contents - or path to csml file
1129        @keyword aggregateCoverage: if set to True, only coverage data that extends the
1130        atom coverage data will be added
1131        '''
1132        logging.info("Creating CSML data model")
1133        csmlDoc = CsmlParser.Dataset(file=self._csmlFile)
1134       
1135        logging.info("Extracting info from CSML file")
1136        logging.debug("Got dataset ID: %s" %csmlDoc.id)
1137        self.setDatasetID(csmlDoc.id)
1138       
1139        title = csmlDoc.name.CONTENT
1140        logging.debug("Got dataset name (title): '%s'" %title)
1141        # NB, if a title is specified (and not as the default value), it automatically is used in
1142        # place of anything in the granulite file
1143        if title and title != "NAME OF DATASET GOES HERE":
1144            logging.info("Title, '%s', extracted from CSML file" %title)
1145            if self.title:
1146                logging.info("- NB, this will override the title specified in the granulite file ('%s')" \
1147                             %self.title)
1148            self.title = title
1149               
1150        bbox1 = csmlDoc.getBoundingBox()
1151        bbox2 = csmlDoc.getCSMLBoundingBox()
1152        time = bbox2.getTimeLimits()
1153   
1154        # now check for other parameters to add to granule
1155        # Firstly, extract the bounding envelope
1156        if bbox1:
1157            w, e = self.moveBox(bbox1[0],bbox1[2])
1158            n, s = (bbox1[3], bbox1[1])
1159   
1160            if not aggregateCoverage or (not self.maxY or float(n) > float(self.maxY)):
1161                self.maxY = n
1162               
1163            if not aggregateCoverage or (not self.minY or float(s) < float(self.minY)):
1164                self.minY = s
1165           
1166            if not aggregateCoverage or (not self.minX or float(w) < float(self.minX)):
1167                self.minX = w
1168   
1169            if not aggregateCoverage or (not self.maxX or float(e) > float(self.maxX)):
1170                self.maxX = e
1171           
1172            logging.debug("Got bounding box data from file: (%s, %s) , (%s, %s)" \
1173                          %(w, s, e, n))
1174           
1175            logging.debug("Updated atom bounding box data: (%s, %s) , (%s, %s)" \
1176                          %(self.minX, self.minY, self.maxX, self.maxY))
1177        else:
1178            logging.debug("No valid bounding box data found")
1179   
1180        if time:
1181            t1 = formatDateYYYYMMDD(time[0])
1182            if not aggregateCoverage or \
1183                (not self.t1 or datetime.datetime.strptime(t1, YEAR_FORMAT) < \
1184                    datetime.datetime.strptime(self.t1, YEAR_FORMAT)):
1185                self.t1 = t1
1186   
1187            t2 = time[1]
1188            if t2 and t2 != 'None':
1189                t2 = formatDateYYYYMMDD(t2)
1190                if not aggregateCoverage or \
1191                    (not self.t2 or datetime.datetime.strptime(t2, YEAR_FORMAT) > \
1192                        datetime.datetime.strptime(self.t2, YEAR_FORMAT)):
1193                    self.t2 = t2
1194           
1195            logging.debug("Got time range: %s -> %s" %(self.t1, self.t2))
1196        else:
1197            logging.debug("No valid time range data found")
1198   
1199        #create parameter summaries:
1200        #set up list to hold the parameters data
1201        parameters = []
1202        for feature in csmlDoc.featureCollection.featureMembers:
1203            if hasattr(feature.parameter, 'href'):
1204                paramTriple = ""
1205                if hasattr(feature, 'description'):
1206                    paramTriple = feature.description.CONTENT
1207                    paramTriple += " | " + feature.parameter.href
1208                   
1209                    term = ""
1210                    if hasattr(feature, 'name'):
1211                        term = feature.name.CONTENT
1212   
1213                    paramTriple += " | " + term
1214                   
1215                    logging.debug("Got parameter info: %s" %paramTriple)
1216                    parameters.append(paramTriple)
1217       
1218        # update the atom with the extracted parameters
1219        logging.info("Adding CSML parameters to granule atom")
1220        self.addParameters(parameters)
1221        logging.info("Finished adding CSML data")
1222
1223
1224    def lookupDeploymentsInfo(self, dr, lookupIndirectReferences=False):
1225        '''
1226        Check through the atom links and retrieve any associated deployments
1227        data
1228        @param dr: Instance of DocumentRetrieve object - NB, this requires eXist
1229        config details which are not available to the Atom object
1230        @keyword lookupIndirectReferences: if True, the atom ID is used to search
1231        defined deployments to find those which reference it, otherwise only
1232        deployments data featured in the atom related links are processed
1233        '''
1234        logging.info("Looking up deployments info")
1235        self.deployments = []
1236        self.allActivities = []
1237        self.allObs = []
1238        self.allDpts = []
1239
1240        if lookupIndirectReferences:
1241            logging.info("Looking up references to this atom from other deployments")
1242            doc = dr.get(self.ME.providerID, dr.ATOM_DEPLOYMENTS, self.atomBrowseURL, \
1243                                        targetCollection='/db/atoms/')
1244            # now need to turn this results set into actual atoms
1245            tree = ET.fromstring(doc)
1246            links = []
1247            for atom in tree:
1248                logging.debug("- found reference in deployment")
1249                links.append(ET.tostring(atom))
1250           
1251        else:
1252            links = self.getLinksOfType(self.VTD.DEPLOYMENT_TERM)
1253
1254        for link in links:
1255            if lookupIndirectReferences:
1256                deploymentAtom = link
1257            else:
1258                localID = link.href.split("__ATOM__")[-1]
1259                deploymentAtom = dr.get(self.ME.providerID, 'ATOM', localID, \
1260                                        targetCollection='/db/atoms/')
1261
1262            deployment = Deployment.Deployment(Atom(xmlString=str(deploymentAtom)))
1263            self.deployments.append(deployment)
1264           
1265            self.addUniqueLinks(self.allActivities, deployment.activities)
1266            self.addUniqueLinks(self.allObs, deployment.obs)
1267            self.addUniqueLinks(self.allDpts, deployment.dpts)
1268       
1269        logging.info("Finished looking up deployments info")
1270
1271
1272
1273    def addUniqueLinks(self, dataArray, links):
1274        '''
1275        Add links to specified array - if they are not already included
1276        @param dataArray: a list, potentially arlready containing links
1277        @param links: a Link or array of Links to add to the dataArray
1278        '''
1279        logging.debug("Adding new links")
1280        if not links:
1281            return
1282       
1283        if type(links) is not list:
1284            links = [links]
1285       
1286        for link in links:
1287            if type(link) is not Link:
1288                logging.warning("Link is not of 'Link' object type (type='%s') - skipping" %type(link))
1289                continue
1290            if link not in dataArray:
1291                logging.debug("- adding unique link")
1292                dataArray.append(link)
1293        logging.debug("Finished adding links")
Note: See TracBrowser for help on using the repository browser.