source: exist/trunk/python/ndgUtils/models/Atom.py @ 4780

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/exist/trunk/python/ndgUtils/models/Atom.py@4780
Revision 4780, 51.4 KB checked in by cbyrom, 11 years ago (diff)

Create AtomState object to represent the different atom publication
states - tying together the associated name and collection paths.
Update the Atom model to use this.

Line 
1'''
2 Class representing data in  atom format - allowing moles data to be stored and accessed in a web feed compatible way
3 
4 @author: C Byrom, Tessella Jun 2008
5'''
6try: #python 2.5
7    from xml.etree import cElementTree as ET
8except ImportError:
9    try:
10        # if you've installed it yourself it comes this way
11        import cElementTree as ET
12    except ImportError:
13        # if you've egged it this is the way it comes
14        from ndgUtils.elementtree import cElementTree as ET
15import sys, logging, re, datetime
16from ndgUtils import ndgObject
17from ndgUtils.eXistConnector import eXistConnector
18from ndgUtils.ETxmlView import et2text
19import ndgUtils.lib.utilities as utilities
20from ndgUtils.models.vocabtermdata import VocabTermData as VTD
21from ndgUtils.models import MolesEntity as ME
22import csml.parser as CsmlParser
23from ndgUtils.models import Deployment as Deployment
24from ndgUtils.models import AtomState
25
class AtomError(Exception):
    """
    Error raised by the Atom model classes.

    The message is written to the error log at construction time, so the
    problem is recorded even if a caller swallows the exception.
    """
    def __init__(self, msg):
        # log first, then initialise the standard exception state
        logging.error(msg)
        super(AtomError, self).__init__(msg)
33
34
class Person(object):
    '''
    Class representing atom author type data - with name, uri and role attributes
    @keyword personType: Type of person to create - specified using the Person.._Type
    values.  Default is AUTHOR_TYPE.
    @keyword namespace: a two value array of format, ['short_namespace_name', 'full_namespace_name']
    - e.g. ['moles', 'http://ndg.nerc.ac.uk/schema/moles2beta']
    '''
    AUTHOR_TYPE = 0
    CONTRIBUTOR_TYPE = 1
    RESPONSIBLE_PARTY_TYPE = 2
    # element names, indexed by the person type values above
    ELEMENT_NAMES = ["author", "contributor", "responsibleParty"]

    def __init__(self, personType = AUTHOR_TYPE, namespace = None):
        self.type = personType
        if namespace:
            self.ns_shortname = namespace[0]
            self.ns_fullname = namespace[1]
        else:
            self.ns_shortname = ""
            self.ns_fullname = ndgObject.ATOM_NS

        self.name = ""
        self.uri = ""
        self.role = ""

        # NB, the atom format specifies slightly different data contents
        self.uriTagName = "email"
        # NB, responsible party data is always stored in the moles section
        # - this overrides any namespace passed in
        if self.type == self.RESPONSIBLE_PARTY_TYPE:
            self.ns_shortname = 'moles'
            self.ns_fullname = ndgObject.MOLES_NS
            self.uriTagName = "uri"

    def __str__(self):
        '''
        @return: 'name | uri | role' triple string, or an empty string if
        none of the three values is set
        '''
        if self.hasValue():
            return self.name + " | " + self.uri + " | " + self.role
        return ""

    def hasValue(self):
        '''
        @return: True if any of name, uri or role is set; False otherwise
        '''
        return bool(self.name or self.uri or self.role)

    def fromString(self, personString):
        '''
        Populate name, uri and role from a triple string
        @param personString: string handed to utilities.getTripleData, whose
        three returned values are stored as name, uri and role, in that order
        '''
        (self.name, self.uri, self.role) = utilities.getTripleData(personString)

    def fromETElement(self, personTag):
        '''
        Populate name, role and uri from the child elements of an element
        @param personTag: element whose children are looked up in this
        person's namespace; missing children leave the value as ""
        '''
        self.name = personTag.findtext('{%s}name' %self.ns_fullname) or ""
        self.role = personTag.findtext('{%s}role' %self.ns_fullname) or ""
        self.uri = personTag.findtext('{%s}%s' %(self.ns_fullname, self.uriTagName)) or ""
        logging.debug("Added name: '%s', role: '%s', %s: '%s'" \
                      %(self.name, self.role, self.uriTagName, self.uri))

    def toXML(self):
        '''
        Create an element for the person, named after the person type, with a
        child element for each of name, uri and role that has a value
        @return: the new element
        '''
        prefix = ""
        if self.ns_shortname:
            prefix = self.ns_shortname + ':'

        author = ET.Element(prefix + self.ELEMENT_NAMES[self.type])

        if self.name:
            name = ET.SubElement(author, prefix + "name")
            name.text = self.name

        if self.uri:
            uri = ET.SubElement(author, prefix + self.uriTagName)
            uri.text = self.uri

        if self.role:
            role = ET.SubElement(author, prefix + "role")
            role.text = self.role

        return author

    def __cmp__(self, person1):
        '''
        Override comparison to allow proper object comparison when checking
        if Person objects are in an array already - i.e. if person in personArray...
        @return: 0 if the uri, name, role and type all match; -1 if person1
        is empty/None; 1 otherwise
        '''
        if not person1:
            return -1

        if self is person1:
            return 0
        elif self.uri == person1.uri and self.name == person1.name and \
                self.role == person1.role and self.type == person1.type:
            return 0
        return 1

    def __eq__(self, other):
        '''
        Value equality, delegating to __cmp__.  __cmp__ alone is ignored by
        Python 3, where 'person in personArray' would otherwise silently fall
        back to an identity check.
        '''
        return self.__cmp__(other) == 0

    def __ne__(self, other):
        return not self.__eq__(other)

    # Instances are mutable, so keep the identity based hash they have always
    # had (defining __eq__ would otherwise make them unhashable in Python 3)
    __hash__ = object.__hash__
125
126
class Link(object):
    '''
    Class representing an atom link - with href, title and rel attributes
    '''

    def __init__(self):
        self.href = ""
        self.title = ""
        self.rel = ""

    def fromString(self, linkString):
        '''
        Populate href, title and rel from a triple string
        @param linkString: string handed to utilities.getTripleData, whose
        three returned values are stored as href, title and rel, in that order
        '''
        (self.href, self.title, self.rel) = utilities.getTripleData(linkString)

    def fromETElement(self, linkTag):
        '''
        Populate the link from the href, rel and title attributes of an element
        @param linkTag: element to read; missing attributes are stored as ""
        '''
        self.href = linkTag.attrib.get('href') or ""
        self.rel = linkTag.attrib.get('rel') or ""
        self.title = linkTag.attrib.get('title') or ""

    def toXML(self):
        '''
        @return: a 'link' element carrying the href, title and rel attributes
        '''
        link = ET.Element("link")
        link.attrib["href"] = self.href
        link.attrib["title"] = self.title
        link.attrib["rel"] = self.rel
        return link

    def hasValue(self):
        '''
        @return: True if the link has an href or a title; False otherwise
        '''
        # NB, just a rel on its own is meaningless - so ignore
        return bool(self.href or self.title)

    def __str__(self):
        '''
        @return: 'href | title | rel' triple string, or an empty string if
        none of the three values is set
        '''
        if self.href or self.title or self.rel:
            return self.href + " | " + self.title + " | " + self.rel
        return ""

    def isChildAtom(self):
        '''
        Determines whether the link refers to another atom - e.g. a link to
        a data granule
        @return True, if so; False otherwise
        '''
        # str.endswith accepts a tuple of suffixes - one call for all the terms
        childTerms = (VTD.GRANULE_TERM, VTD.DEPLOYMENT_TERM, VTD.ACTIVITY_TERM, \
                      VTD.DPT_TERM, VTD.OBS_TERM)
        return self.rel.endswith(childTerms)

    def __cmp__(self, link1):
        '''
        Override comparison to allow proper object comparison when checking
        if Link objects are in an array already - i.e. if link in linkArray...
        @return: 0 if the href, title and rel all match; -1 if link1 is
        empty/None; 1 otherwise
        '''
        if not link1:
            return -1

        if self is link1:
            return 0
        elif self.href == link1.href and self.title == link1.title and \
                self.rel == link1.rel:
            return 0
        return 1

    def __eq__(self, other):
        '''
        Value equality, delegating to __cmp__.  __cmp__ alone is ignored by
        Python 3, where 'link in linkArray' would otherwise silently fall
        back to an identity check.
        '''
        return self.__cmp__(other) == 0

    def __ne__(self, other):
        return not self.__eq__(other)

    # Instances are mutable, so keep the identity based hash they have always
    # had (defining __eq__ would otherwise make them unhashable in Python 3)
    __hash__ = object.__hash__
192
193
class Category(object):
    '''
    An atom category element, held as its term, scheme and label attributes
    '''
    def __init__(self):
        self.term = ""
        self.scheme = ""
        self.label = ""

    def fromString(self, linkString, escapeSpecialCharacters=True):
        '''
        Populate the category from a triple string of format, 'label | scheme | term'
        @param linkString: triple string holding the category values
        @keyword escapeSpecialCharacters: passed to utilities.getTripleData as
        its doEscape keyword (default True)
        '''
        triple = utilities.getTripleData(linkString, \
                                         doEscape=escapeSpecialCharacters)
        (self.label, self.scheme, self.term) = triple

    def fromETElement(self, linkTag):
        '''
        Populate the category from the attributes of a category element
        @param linkTag: element to read; missing attributes are stored as ""
        '''
        attributes = linkTag.attrib
        self.term = attributes.get('term') or ""
        self.label = attributes.get('label') or ""
        self.scheme = attributes.get('scheme') or ""

    def toXML(self):
        '''
        @return: a 'category' element carrying the term, scheme and label attributes
        '''
        element = ET.Element("category")
        element.attrib["term"] = self.term
        element.attrib["scheme"] = self.scheme
        element.attrib["label"] = self.label
        return element

    def hasValue(self):
        '''
        @return: True if any of scheme, label or term is set; False otherwise
        '''
        return bool(self.scheme or self.label or self.term)
229
230
class Atom(object):
    '''
    Class representing data in atom format - allowing moles data to be stored
    and accessed in a web feed compatible way
    '''

    # labels for use with the atom categories - these are written to, and
    # matched against, the 'term' attribute of the category elements
    ATOM_TYPE = "ATOM_TYPE"
    ATOM_SUBTYPE = "ATOM_SUBTYPE"

    # labels for use with the templates to set/extract specific inputs
    ONLINE_REF_LABEL = "online_ref"
    PARAMETER_LABEL = "parameter"
    ATOM_REF_LABEL = "atom_ref"
    DELIMITER = "---"
    REMOVE_LABEL = "remove"
   
    # format to use for t1-t2 date range (a strftime style pattern)
    YEAR_FORMAT = '%Y-%m-%d'
246
247    def __init__(self, atomType = None, vocabTermData = None, ndgObject = None, \
248                 xmlString = None, state = AtomState.WORKING_STATE, **inputs):
249        '''
250        Constructor - initialise the atom variables
251        @keyword atomType: type of atom to set up
252        @keyword vocabTermData: instance of VocabTermData object to use with atom
253        @keywork ndgObject: instance of ndgObject to use with atom
254        @keyword xmlString: XML representation of atom - will be parsed to populate
255        the atom data
256        @keyword state:  AtomState object representing the state of the atom
257        '''
258        logging.info("Initialising atom")
259        if atomType:
260            logging.info(" - of type '%s'" %atomType)
261        self.atomTypeID = atomType
262
263        # some data have further subtypes specified
264        self.subtypeID = None # this should be the termID
265        self.subtype = None # and this should be the fully formed vocab URL
266       
267        self.ndgObject = ndgObject
268
269        self.atomName = None
270        self.files = []
271        self.author = Person()
272        self.contributors = []
273        self.atomAuthors = []
274        self.parameters = []
275        self.spatialData = []
276        self.temporalData = []
277        self.relatedLinks = []
278        self.summary = []
279        self.content = []
280        # NB, this deployments data duplicates other atom data - and is only used for a
281        # convenient way to collect the info (by lookupAssociatedData()) for use in templates
282        self.deployments = []
283        # ditto for the following field
284        self.dataEntities = []
285           
286        self.csmlFile = None
287        self.cdmlFile = None
288        # general variable to use for setting the atom content - NB, if a csmlFile is specified
289        # (either directly or via a cdmlFile specification), this will be the content by default
290        # for this purpose
291        self.contentFile = None     
292        self.title = None
293        self.datasetID = None        # NB, the dataset id ends up in the atomName - <path><datasetID>.atom
294        self.atomID = None
295   
296        # boundary box info - to replace spatial/temporalData?
297        self.minX = None
298        self.minY = None
299        self.maxX = None
300        self.maxY = None
301        self.t1 = None
302        self.t2 = None
303
304        self.ME = ME.MolesEntity()
305       
306        # date when the atom was first ingested
307        self.publishedDate = None
308
309        # last update date
310        self.updatedDate = None
311
312        # assume atom in working state by default - this is used to define what collection
313        # in eXist the atom is stored in
314        self.state = state
315       
316        # additional, non standard atom data can be included in the molesExtra element
317        if vocabTermData:
318            self.VTD = vocabTermData
319        else:
320            self.VTD = VTD()
321       
322        if xmlString:
323            self.fromString(xmlString)
324
325        # if inputs passed in as dict, add these now
326        if inputs:
327            logging.info("Adding info to atom from input dict")
328            logging.debug(inputs)
329            self.__dict__.update(inputs)
330           
331            # NB, this doesn't trigger the Summary Property, so do this
332            # explicitly, if need be
333            if inputs.has_key('Summary'):
334                self.Summary = inputs.get('Summary')
335            if inputs.has_key('Content'):
336                self.Content = inputs.get('Content')
337            if inputs.has_key('author'):
338                name = inputs.get('author')
339                author = Person()
340                author.fromString(name)
341                self.author = author
342           
343            # also pass any moles data up to the moles entity object
344            if inputs.has_key('providerID'):
345                self.ME.providerID = inputs.get('providerID')
346               
347            if inputs.has_key('abbreviation'):
348                self.ME.abbreviation = inputs.get('abbreviation')
349
350        if self.atomTypeID:
351            self.atomTypeName = self.VTD.TERM_DATA[self.atomTypeID].title
352
353        logging.info("Atom initialised")
354
355
356    def addOnlineReferences(self, links):
357        '''
358        Add online reference data associated with the atom
359        - NB, care needs to be taken here since this data is stored in the atom
360        link elements and these are also used for the various atom associations
361        @param links: a Link or array of Links to add to the relatedLinks attribute
362        '''
363        logging.debug("Adding online references")
364        if not links:
365            return
366       
367        if type(links) is not list:
368            links = [links]
369       
370        # firstly clear out any online refs data from the existing related links
371        newLinks = []
372        for link in self.relatedLinks:
373            if link.isChildAtom():
374                newLinks.append(link)
375       
376        newLinks.extend(links)
377        self.relatedLinks = newLinks
378        logging.debug("Online references added")
379
380
381    def addUniqueRelatedLinks(self, links):
382        '''
383        Add links to relatedLinks array - if they are not already included
384        @param links: a Link or array of Links to add to the relatedLinks attribute
385        '''
386        self.addUniqueLinks(self.relatedLinks, links)
387       
388
389    def removeRelatedLinks(self, linksToDelete):
390        '''
391        Remove any links in the input list from the atom's related links list
392        @param linksToDelete: array of Link objects to remove from atom
393        '''
394        logging.debug("Removing related links from atom")
395        if not linksToDelete:
396            return
397       
398        if type(linksToDelete) is not list:
399            linksToDelete = [linksToDelete]
400       
401        updatedLinks = []
402        for link in self.relatedLinks:
403            if type(link) is not Link:
404                logging.warning("Link is not of 'Link' object type (type='%s') - skipping" %type(link))
405                continue
406            if link in linksToDelete:
407                logging.debug("- found link to remove")
408            else:
409                updatedLinks.append(link)
410
411        self.relatedLinks = updatedLinks
412        logging.debug("Links removed")
413
414    def getPublicationStatePath(self):
415        '''
416        Determine the correct publication state collection for the atom
417        @return collectionPath: collection path for the publication state of the atom
418        '''
419        logging.debug("Getting collection path for atom publication state")
420        collectionPath = eXistConnector.BASE_COLLECTION_PATH + self.state.collectionPath
421        logging.debug("Returning publication state collection, '%s'" %collectionPath)
422        return collectionPath
423       
424
425    def getDefaultEntityCollectionPath(self):
426        '''
427        Determine the correct collection for the entity type of the atom
428        @return entityPath: collection path for the data type of the atom
429        '''
430        logging.debug("Getting collection path for atom entity type")
431        collectionPath = self.getPublicationStatePath()
432       
433        if self.atomTypeID == VTD.DE_TERM:
434            collectionPath += eXistConnector.DE_COLLECTION_PATH
435        elif self.atomTypeID == VTD.GRANULE_TERM:
436            collectionPath += eXistConnector.GRANULE_COLLECTION_PATH
437        elif self.atomTypeID == VTD.ACTIVITY_TERM and \
438            self.subtypeID == VTD.DEPLOYMENT_TERM:
439            collectionPath += eXistConnector.DEPLOYMENTS_COLLECTION_PATH
440        else:
441            collectionPath += eXistConnector.DEPLOYMENT_COLLECTION_PATH
442       
443        logging.debug("Returning entity collection, '%s'" %collectionPath)
444        return collectionPath
445       
446
447    def getDefaultCollectionPath(self):
448        '''
449        Determine the correct collection to use for the atom in eXist
450        '''
451        logging.debug("Getting default collection path for atom")
452        collectionPath = self.getDefaultEntityCollectionPath()
453        if not self.ME.providerID:
454            raise AtomError("Error: cannot determine atom collection path because " + \
455                            "the provider ID is not defined")
456           
457        collectionPath += self.ME.providerID + "/"
458        logging.debug("Returning collection, '%s'" %collectionPath)
459        return collectionPath
460
461
462    def __addAtomTypeDataXML(self, root):
463        '''
464        Add the atom type, and subtype data, if available, to atom categories
465        - and lookup and add the appropriate vocab term data
466        '''
467        if self.atomTypeID:
468            logging.info("Adding atom type info to XML output")
469            category = Category()
470            category.label = self.atomTypeID
471            # look up the appropriate vocab term data
472            category.scheme = self.VTD.getTermCurrentVocabURL(self.atomTypeID)
473            category.term = self.ATOM_TYPE
474            root.append(category.toXML())
475
476        if self.subtypeID:
477            logging.info("Adding atom subtype info to XML output")
478            # NB subtypes not all defined, so leave this out for the moment
479            category.label = self.subtypeID
480            # look up the appropriate vocab term data
481            category.scheme = self.VTD.getTermCurrentVocabURL(self.subtypeID)
482            category.term = self.ATOM_SUBTYPE
483            root.append(category.toXML())
484
485
486    def addMolesEntityData(self, abbreviation, provider_id, object_creation_time):
487        '''
488        Add data to include in the moles entity element
489        '''
490        logging.debug('Adding moles entity information')
491        self.ME.abbreviation = abbreviation
492        self.ME.providerID = provider_id
493        self.ME.createdDate = utilities.getISO8601Date(object_creation_time)
494        logging.debug('Moles entity information added')
495
496
497    def addAuthors(self, authors):
498        '''
499        Add author data appropriately to the atom
500        NB, these will overwrite any existing authors of the same type
501        @param authors: list of Person objects with the author data
502        '''
503        logging.debug('Adding authors data to Atom')
504        isFirstAuthor = {}
505        authorArray = None
506        for author in authors:
507            # NB, we're only allowed one atom author
508            if author.type == Person.AUTHOR_TYPE:
509                self.author = author
510                   
511                if isFirstAuthor.has_key(author.type):
512                    raise AtomError("Error: an atom can only have one author specified")
513                isFirstAuthor[author.type] = 1
514                continue
515            elif author.type == Person.CONTRIBUTOR_TYPE:
516                authorArray = self.contributors
517            elif author.type == Person.RESPONSIBLE_PARTY_TYPE:
518                authorArray = self.ME.responsibleParties
519               
520            # check if this is the first addition - if so, clear out the
521            # array in advance
522            if not isFirstAuthor.has_key(author.type):
523                logging.debug("Clearing out author array")
524                # NB, need to be careful to clear the array, not create a ref
525                # to a new array
526                del authorArray[:]
527                isFirstAuthor[author.type] = 1
528
529            if author.hasValue() and author not in authorArray:
530                logging.debug("Adding author (type:'%s', name:'%s', uri:'%s', role:'%s')" \
531                              %(author.type, author.name, author.uri, author.role))
532                authorArray.append(author)
533
534        logging.debug('Finished adding authors data')
535
536
537    def _isNewParameter(self, param):
538        '''
539        Check if a parameter is already specified in the atom, return False if
540        so, otherwise return True
541        '''
542        for p in self.parameters:
543            if p.term == param.term and \
544                p.scheme == param.scheme and \
545                p.label == param.label:
546                return False
547        return True
548
549
550    def addRelatedLinks(self, linkVals):
551        '''
552        Add related links in string format - converting to Link objects
553        NB, only add the link if it is unique
554       
555        @param linkVals: string of format, 'uri | title | vocabServerURL'
556        '''
557        link = self.objectify(linkVals, 'relatedLinks')
558        if link not in self.relatedLinks:
559            self.relatedLinks.append(link)
560
561
562    def addParameters(self, params):
563        '''
564        Add a parameter to list - ensuring it is unique and has been formatted and tidied appropriately
565        @params param: parameter, as string array, to add to atom parameters collection
566        '''
567        # avoid strings being parsed character by character
568        if type(params) is str:
569            params = [params]
570           
571        for param in params:
572            # firstly tidy parameter
573            param = utilities.tidyUpParameters(param)
574            category = Category()
575            # NB, data already tidied up here, so set keyword to avoid this happening again
576            category.fromString(param, escapeSpecialCharacters=True)
577
578            # now check for uniqueness
579            if self._isNewParameter(category):
580                logging.debug("Adding new parameter: %s" %param)
581                self.parameters.append(category)
582   
583   
584    def _linksToXML(self, root):
585        '''
586        Add required links to the input element
587        @param root: element to add links to - NB, should be the root element of the atom
588        '''
589        selfLink = ET.SubElement(root, "link")
590        selfLink.attrib["href"] = self.atomBrowseURL
591        selfLink.attrib["rel"] = "self"
592       
593        for relatedLink in self.relatedLinks:
594            if relatedLink.hasValue():
595                root.append(relatedLink.toXML())
596   
597    def toXML(self):
598        '''
599        Convert the atom into XML representation and return this
600        @return: xml version of atom
601        '''
602        logging.info("Creating formatted XML version of Atom")
603        root = ET.Element("entry")
604        root.attrib["xmlns"] = ndgObject.ATOM_NS
605        root.attrib["xmlns:moles"] = ndgObject.MOLES_NS
606        root.attrib["xmlns:georss"] = ndgObject.GEOSS_NS
607        root.attrib["xmlns:gml"] = ndgObject.GML_NS
608        id = ET.SubElement(root, "id")
609        id.text = self.atomID
610        title = ET.SubElement(root, "title")
611        title.text = self.title
612        self._linksToXML(root)
613
614        if self.author and self.author.hasValue():
615            root.append(self.author.toXML())
616           
617        for contributor in self.contributors:
618            root.append(contributor.toXML())
619
620        # add parameters data
621        for param in self.parameters:
622            if param.hasValue():
623                root.append(param.toXML())
624
625        # add the type and subtype data
626        self.__addAtomTypeDataXML(root)
627                   
628        summary = ET.SubElement(root, "summary")
629        summary.text = self.Summary
630                   
631        # add link to content, if required - NB, can only have one content element in atom
632        # - and this is mandatory
633        content = ET.SubElement(root, "content")
634        contentFile = self.contentFile or self.csmlFile or self.cdmlFile
635        if contentFile:
636            content.attrib["type"] = "application/xml"
637            content.attrib["src"] = contentFile
638        else:
639            content.attrib["type"] = "xhtml"
640            div = ET.SubElement(content, 'div')
641            div.attrib["xmlns"] = ndgObject.XHTML_NS
642            div.text = self.Content
643       
644        # if there's a published date already defined, assume we're doing an update now
645        # NB, update element is mandatory
646        currentDate = datetime.datetime.today().strftime("%Y-%m-%dT%H:%M:%SZ")
647        if not self.publishedDate:
648            self.publishedDate = currentDate
649
650        updated = ET.SubElement(root, "updated")
651        if not self.updatedDate:
652            self.updatedDate = currentDate
653        updated.text = self.updatedDate
654
655        published = ET.SubElement(root, "published")
656        published.text = self.publishedDate
657
658        # add the moles entity section, if it is required
659        if self.ME:
660            root.append(self.ME.toXML())
661
662        # add temporal range data, if available
663        temporalRange = ET.SubElement(root, "moles:temporalRange")
664        if self.t1:
665            temporalRange.text = self.t1
666            if self.t2:
667                temporalRange.text += "/" + self.t2
668
669        # add spatial range data, if available
670        self._addSpatialData(root)
671
672        tree = ET.ElementTree(root)
673        logging.info("XML version of Atom created")
674        return tree
675
676
677    def __getSummary(self):
678        logging.debug("Getting summary data")
679        summaryString = ""
680        for summary_line in self.summary:
681            summaryString += summary_line + "\n"
682
683        return summaryString
684
685    def __setSummary(self, summary):
686        logging.debug("Adding summary data")
687        self.summary = []
688        for summary_line in summary.split('\n'):
689            self.summary.append(utilities.escapeSpecialCharacters(summary_line))
690           
691    Summary = property(fset=__setSummary, fget=__getSummary, doc="Atom summary")
692
693
694    def __getContent(self):
695        logging.debug("Getting content data")
696        contentString = ""
697        # NB, there must be content specified in an atom
698        if not self.content:
699            return "Metadata document"
700       
701        for content_line in self.content:
702            contentString += content_line + "\n"
703
704        return contentString
705
706    def __setContent(self, content):
707        logging.debug("Adding content data")
708        self.content = []
709        for content_line in content.split('\n'):
710            self.content.append(content_line)
711           
712    Content = property(fset=__setContent, fget=__getContent, doc="Atom content")
713
714           
715    def fromString(self, xmlString):
716        '''
717        Initialise Atom object using an xmlString
718        @param xmlString: representation of atom as an XML string
719        '''
720        logging.info("Ingesting data from XML string")
721        logging.debug("Create elementtree instance with XML string")
722        tree = ET.fromstring(xmlString)
723        title = tree.findtext('{%s}title' %ndgObject.ATOM_NS)
724        if title:
725            logging.debug("Adding title data")
726            self.title = title
727
728        summary = tree.findtext('{%s}summary' %ndgObject.ATOM_NS)
729        if summary:
730            self.Summary = summary#.decode('unicode_escape')
731
732        authorElement = tree.find('{%s}author' %ndgObject.ATOM_NS)
733        if authorElement:
734            logging.debug("Adding author data")
735            author = Person()
736            author.fromETElement(authorElement)
737            self.author = author
738
739        contributorElements = tree.findall('{%s}contributor' %ndgObject.ATOM_NS)
740        for contributorElement in contributorElements:
741            logging.debug("Adding contributor data")
742            contributor = Person(personType = Person.CONTRIBUTOR_TYPE)
743            contributor.fromETElement(contributorElement)
744            self.contributors.append(contributor)
745
746        molesElement = tree.find('{%s}entity' %ndgObject.MOLES_NS)
747        if molesElement:
748            self.ME.fromET(molesElement)
749               
750        self.atomID = tree.findtext('{%s}id' %ndgObject.ATOM_NS)
751
752        self._parseCategoryData(tree.findall('{%s}category' %ndgObject.ATOM_NS))
753
754        self._parseLinksData(tree.findall('{%s}link' %ndgObject.ATOM_NS))
755           
756        contentTag = tree.find('{%s}content' %ndgObject.ATOM_NS)
757        if contentTag != None:
758            logging.debug("Found content tag - checking for CSML/CDML file data")
759            file = contentTag.attrib.get('src')
760            if file:
761                # NB, the path will reveal more reliably whether we're dealing with CSML and CDML files
762                if file.upper().find('CSML') > -1:
763                    logging.debug("Adding CSML file data")
764                    self.csmlFile = file
765                elif file.upper().find('CDML') > -1:
766                    logging.debug("Adding CDML file data")
767                    self.cdmlFile = file
768                self.contentFile = file
769            else:
770                logging.debug("No file data - adding contents of element instead")
771                div = contentTag.find('{http://www.w3.org/1999/xhtml}div')
772                self.Content = div.text
773       
774        range = tree.findtext('{%s}temporalRange' %ndgObject.MOLES_NS)
775        if range:
776            logging.debug("Adding temporal range data")
777            timeData = range.split('/')
778            self.t1 = timeData[0]
779            if len(timeData) > 1:
780                self.t2 = timeData[1]
781       
782        where = tree.find('{%s}where' %ndgObject.GEOSS_NS)
783        if where:
784            # NB, this parser won't mind if we're dealing with Envelope or EnvelopeWithTimePeriod
785            minBBox = where.findall('.//{%s}lowerCorner' %ndgObject.GML_NS)
786            if minBBox:
787                logging.debug("Adding min spatial range data")
788                minBBox = minBBox[0]
789                spatialData = minBBox.text.split()
790                self.minX = spatialData[0]
791                if len(spatialData) > 1:
792                    self.minY = spatialData[1]
793           
794            maxBBox = where.findall('.//{%s}upperCorner' %ndgObject.GML_NS)
795            if maxBBox:
796                maxBBox = maxBBox[0]
797                logging.debug("Adding max spatial range data")
798                spatialData = maxBBox.text.split()
799                self.maxX = spatialData[0]
800                if len(spatialData) > 1:
801                    self.maxY = spatialData[1]
802               
803        publishedDate = tree.findtext('{%s}published' %ndgObject.ATOM_NS)
804        if publishedDate:
805            logging.debug("Adding published date")
806            self.publishedDate = publishedDate
807               
808        updatedDate = tree.findtext('{%s}updated' %ndgObject.ATOM_NS)
809        if updatedDate:
810            logging.debug("Adding updated date")
811            self.updatedDate = updatedDate
812           
813        logging.info("Completed data ingest")
814   
815   
816    def _parseCategoryData(self, categories):
817        logging.debug("Adding category/parameters data")
818        for category in categories:
819            cat = Category()
820            cat.fromETElement(category)
821           
822            if cat.term == self.ATOM_TYPE:
823                logging.debug("Found atom type data")
824                self.atomTypeID = cat.label
825                self.atomTypeName = self.VTD.TERM_DATA[cat.label].title
826                continue
827            elif cat.term == self.ATOM_SUBTYPE:
828                logging.debug("Found atom subtype data")
829                self.subtypeID = cat.label
830                self.subtype = cat.scheme
831                continue
832
833            self.parameters.append(cat)
834   
835
836    def setDatasetID(self, datasetID):
837        '''
838        Set the dataset ID for the atom - and generate an appropriate atom name using this
839        @param datasetID: ID to set for the atom
840        '''
841        self.datasetID = datasetID
842        self._generateAtomName(datasetID) 
843        self.atomID = self.createAtomID(datasetID)
844
845
846    def createAtomID(self, datasetID):
847        '''
848        Create a unique ID, conforming to atom standards, for atom
849        NB, see http://diveintomark.org/archives/2004/05/28/howto-atom-id
850        @param datasetID: ID of atom's dataset
851        @return: unique ID
852        '''
853        logging.info("Creating unique ID for atom")
854        if not self.atomBrowseURL:
855            self._generateAtomName(datasetID)
856        urlBit = self.atomBrowseURL.split('://')[1]
857        urlBit = urlBit.replace('#', '')
858        urlBits = urlBit.split('/')
859        host = urlBits[0].split(':')[0] # avoid the port colon - as this breaks the ID format
860        dateBit = datetime.datetime.today().strftime("%Y-%m-%d")
861       
862        id = "tag:" + host + "," + dateBit + ":/" + "/".join(urlBits[1:])
863        logging.info("- unique ID created for atom")
864        logging.debug(" - '%s'" %id)
865        return id
866       
867       
868    def _generateAtomName(self, datasetID):
869        '''
870        Generate a consistent name for the atom - with full eXist doc path
871        @param datasetID: ID of atom's dataset
872        '''
873        self.atomName = datasetID + ".atom"
874        if not self.ME.providerID:
875            raise ValueError("Provider ID has not been specified for atom - please add this and retry")
876        self.ndgURI = self.ME.providerID + "__ATOM__" + datasetID
877        self.atomBrowseURL = VTD.BROWSE_ROOT_URL + self.ndgURI
878
879
880    def _parseLinksData(self, links):
881        '''
882        Extract links and atom data from array of link elements in the XML representation of the atom
883        @param links: an array of <link> elements
884        '''
885        # firstly, get all data to start with, so we can properly process it afterwards
886        linkData = {}
887        logging.debug("Getting link data")
888        for linkTag in links:
889            link = Link()
890            link.fromETElement(linkTag)
891
892            if not linkData.has_key(link.rel):
893                linkData[link.rel] = []
894           
895            linkData[link.rel].append(link)
896
897        # there should be one self referencing link - which will provide info on the atom itself
898        if not linkData.has_key('self'):
899            errorMessage = "Atom does not have self referencing link - " + \
900                "cannot ascertain datasetID without this - please fix"
901            logging.error(errorMessage)
902            raise ValueError(errorMessage)
903       
904        # this is the link describing the atom itself
905        self.atomBrowseURL = linkData['self'][0].href
906       
907        self.datasetID = self.atomBrowseURL.split("__ATOM__")[-1]
908        self.atomName = self.datasetID + ".atom"
909        self.ndgURI = self.atomBrowseURL.split(VTD.BROWSE_ROOT_URL)[1]
910       
911        # now remove this value and the associated moles doc link
912        del linkData['self']
913        molesDoc = self.atomBrowseURL.replace('ATOM', 'NDG-B1')
914        if linkData.has_key('related'):
915            relatedLinks = []
916            for link in linkData['related']:
917                if link.href != molesDoc:
918                    relatedLinks.append(link)
919           
920            linkData['related'] = relatedLinks
921               
922        # now add the remaining links to the atom
923        for key in linkData:
924            for link in linkData[key]:
925                logging.debug("Adding link data")
926                self.relatedLinks.append(link)
927       
928
929    def _addSpatialData(self, element):
930        '''
931        Add spatial coverage element to an input element
932        @param element: element to add coverage data to
933        '''
934        logging.info("Adding spatial data to Atom")
935        if not self.minX:
936            logging.info("No spatial data specified")
937            return
938        bbox = ET.SubElement(element, "georss:where")
939        envelope = ET.SubElement(bbox, "gml:Envelope")
940        lc = ET.SubElement(envelope, "gml:lowerCorner")
941        lc.text = str(self.minX) + " " + str(self.minY)
942        uc = ET.SubElement(envelope, "gml:upperCorner")
943        uc.text = str(self.maxX) + " " + str(self.maxY)
944
945       
946    def setAttribute(self, attributeName, attributeValue):
947        '''
948        Set the value of an atom attribute - and do some basic tidying up of the string content
949        - to escape any XML unfriendly characters
950        @param attributeName: name of the attribute whose value to set
951        @param attributeValue: value to set the attribute to 
952        '''
953        logging.debug("Setting attribute, %s, to %s" %(attributeName, attributeValue))
954        origValue = attributeValue
955       
956        # escape any special characters if a value has been specified
957        # NB, need to cope with both single values and arrays
958        if attributeValue:
959            if type(attributeValue) is list:
960                newVals = []
961                for val in attributeValue:
962                    newVals.append(self.objectify(utilities.escapeSpecialCharacters(val), attributeName))
963                attributeValue = newVals
964                   
965            else:
966                attributeValue = self.objectify(utilities.escapeSpecialCharacters(attributeValue), attributeName)
967
968        # handle the special case of authors; only one author is allowed per atom
969        # - the others should be treated as contributors
970        if attributeName == "authors":
971            setattr(self, "author", attributeValue[0])
972            if len(attributeValue) > 1:
973                setattr(self, "contributors", attributeValue[1:])
974        elif attributeName == "atomAuthors":
975            if isinstance(attributeValue, list):
976                for val in attributeValue:
977                    self.ME.responsibleParties.append(val)
978            else:
979                self.ME.responsibleParties.append(attributeValue)
980        elif attributeName == "files":
981            self.addUniqueRelatedLinks(attributeValue)
982        else:
983            setattr(self, attributeName, attributeValue)
984
985
986    def objectify(self, objectVals, attributeName):
987        '''
988        Some inputs are specified as strings but need to be converted into
989        objects - do this here
990        @param objectVals: a '|' delimited string of values
991        @param attributeName: name of attribute the values belong to
992        '''
993        obj = None
994        if type(objectVals) != str:
995            return objectVals
996       
997        if attributeName == "relatedLinks":
998            obj = Link()
999        elif attributeName == "atomAuthors":
1000            obj = Person(personType = Person.RESPONSIBLE_PARTY_TYPE)
1001        elif attributeName == "authors":
1002            # NB, ensure there is only one author tag - extra authors are contributors
1003            authorType = Person.AUTHOR_TYPE
1004            if self.author and self.author.hasValue():
1005                authorType= Person.CONTRIBUTOR_TYPE
1006            obj = Person(personType = authorType)
1007        elif attributeName == 'files':
1008            obj = Link()
1009            objectVals = '%s|%s|%s' \
1010                %(self.VTD.getTermCurrentVocabURL(VTD.METADATA_SOURCE_TERM), objectVals, VTD.METADATA_SOURCE_TERM)
1011
1012        if obj:
1013            obj.fromString(objectVals)
1014            # NB, need to set it now, just in case we don't set it before coming back
1015            if attributeName == "authors" and (not self.author or not self.author.hasValue()):
1016                self.author = obj
1017            return obj
1018       
1019        return objectVals
1020
1021
1022    def toPrettyXML(self):
1023        '''
1024        Returns nicely formatted XML as string
1025        '''
1026        atomXML = self.toXML()
1027
1028        # create the string
1029        logging.debug("Converting the elementtree object into a string")
1030        prettyXML = et2text(atomXML.getroot())
1031
1032        # add XML version tag
1033        prettyXML = "<?xml version=\"1.0\" encoding=\"utf-8\" ?>\n" + prettyXML
1034        logging.info("Created formatted version of XML object")
1035        return prettyXML
1036
1037
1038    def getLinksOfType(self, termID):
1039        '''
1040        Returns links in the atom related links attribute which match the specified
1041        term ID
1042        @param termID: the termID to look for in the related links - NB, this is
1043        matched to the end of the link.rel value
1044        @return links: array of Link objects with matching term type
1045        '''
1046        logging.debug("Getting atom links of type, '%s'" %termID)
1047        matchingLinks = []
1048        for link in self.relatedLinks:
1049            # firstly, handle special case where we only want the online ref type links
1050            # returned
1051            if termID == self.ONLINE_REF_LABEL:
1052                if not link.isChildAtom():
1053                    logging.debug("- found link with matching term type")
1054                    matchingLinks.append(link)
1055               
1056            elif link and link.rel and link.rel.lower().endswith(termID.lower()):
1057                logging.debug("- found link with matching term type")
1058                matchingLinks.append(link)
1059               
1060        logging.debug("Returning matched links")
1061        return matchingLinks
1062       
1063       
1064    def getLogos(self):
1065        '''
1066        Return related links that are logos
1067        @return: array of Links containing the logos for the atom
1068        '''
1069        logos = []
1070        for link in self.relatedLinks:
1071            if link.rel.lower().endswith(VTD.LOGO_TERM.lower()):
1072                logos.append(link)
1073               
1074        return logos
1075   
1076   
1077    def isGranule(self):
1078        if self.atomTypeID == VTD.GRANULE_TERM:
1079            return True
1080        return False
1081   
1082   
1083    def isDE(self):
1084        if self.atomTypeID == VTD.DE_TERM:
1085            return True
1086        return False
1087   
1088    def isDeployment(self):
1089        if self.subtypeID and self.subtypeID == VTD.DEPLOYMENT_TERM:
1090            return True
1091        return False
1092   
1093    def isDeployable(self):
1094        if (self.atomTypeID == VTD.ACTIVITY_TERM and self.subtypeID != VTD.DEPLOYMENT_TERM) or \
1095            self.atomTypeID == VTD.DPT_TERM or \
1096            self.atomTypeID == VTD.OBS_TERM:
1097            return True
1098        return False
1099   
1100    def isPublished(self):
1101        '''
1102        Check state of atom doc - if published or Published return True,
1103        otherwise return False
1104        '''
1105        return self.state.isPublishedState()
1106       
1107       
1108    def addCSMLData(self, csmlName, csmlContent, aggregateCoverage=False, useCSMLID=False):
1109        '''
1110        Parse CSML data and add extracted info to the atom
1111        @param csmlName: name of the csml file
1112        @param csmlContent: content of the csml file - NB, if this is set to None and the
1113        file, csmlName, is available locally, CsmlParser.Dataset will read in the file
1114        directly
1115        @keyword aggregateCoverage: if set to True, only coverage data that extends the
1116        atom coverage data will be added
1117        @keyword useCSMLID: if True, use the CSML doc ID as the dataset ID - NB,
1118        this should only be True if creating a new atom - e.g. from a granulite
1119        @return csmlDoc: the CsmlParser.Dataset object with the csml data in
1120        '''
1121        logging.info("Creating CSML data model")
1122        self.csmlFile = csmlName
1123        self.contentFile = csmlName
1124        content = csmlContent or csmlName
1125   
1126        csmlDoc = CsmlParser.Dataset(file=content)
1127       
1128        logging.info("Extracting info from CSML file")
1129        logging.debug("Got dataset ID: %s" %csmlDoc.id)
1130        if useCSMLID:
1131            logging.debug(" - using this ID for the atom")
1132            self.setDatasetID(VTD.GRANULE_TERM + '_' + csmlDoc.id)
1133       
1134        title = csmlDoc.name.CONTENT
1135        logging.debug("Got dataset name (title): '%s'" %title)
1136        # NB, if a title is specified (and not as the default value), it automatically is used in
1137        # place of anything in the granulite file
1138        if title and title != "NAME OF DATASET GOES HERE":
1139            logging.info("Title, '%s', extracted from CSML file" %title)
1140            if self.title:
1141                logging.info("- NB, this will override the title specified in the granulite file ('%s')" \
1142                             %self.title)
1143            self.title = title
1144               
1145        bbox1 = csmlDoc.getBoundingBox()
1146        bbox2 = csmlDoc.getCSMLBoundingBox()
1147
1148        time = None
1149        if bbox2:
1150            time = bbox2.getTimeLimits()
1151   
1152        # now check for other parameters to add to granule
1153        # Firstly, extract the bounding envelope
1154        if bbox1:
1155            w, e = utilities.normaliseLongitude(bbox1[0],bbox1[2])
1156            n, s = (bbox1[3], bbox1[1])
1157   
1158            if not aggregateCoverage or (not self.maxY or float(n) > float(self.maxY)):
1159                self.maxY = n
1160               
1161            if not aggregateCoverage or (not self.minY or float(s) < float(self.minY)):
1162                self.minY = s
1163           
1164            if not aggregateCoverage or (not self.minX or float(w) < float(self.minX)):
1165                self.minX = w
1166   
1167            if not aggregateCoverage or (not self.maxX or float(e) > float(self.maxX)):
1168                self.maxX = e
1169           
1170            logging.debug("Got bounding box data from file: (%s, %s) , (%s, %s)" \
1171                          %(w, s, e, n))
1172           
1173            logging.debug("Updated atom bounding box data: (%s, %s) , (%s, %s)" \
1174                          %(self.minX, self.minY, self.maxX, self.maxY))
1175        else:
1176            logging.debug("No valid bounding box data found")
1177   
1178        if time:
1179            t1 = utilities.formatDateYYYYMMDD(time[0])
1180            if not aggregateCoverage or \
1181                (not self.t1 or datetime.datetime.strptime(t1, YEAR_FORMAT) < \
1182                    datetime.datetime.strptime(self.t1, YEAR_FORMAT)):
1183                self.t1 = t1
1184   
1185            t2 = time[1]
1186            if t2 and t2 != 'None':
1187                t2 = utilities.formatDateYYYYMMDD(t2)
1188                if not aggregateCoverage or \
1189                    (not self.t2 or datetime.datetime.strptime(t2, YEAR_FORMAT) > \
1190                        datetime.datetime.strptime(self.t2, YEAR_FORMAT)):
1191                    self.t2 = t2
1192           
1193            logging.debug("Got time range: %s -> %s" %(self.t1, self.t2))
1194        else:
1195            logging.debug("No valid time range data found")
1196   
1197        #create parameter summaries:
1198        #set up list to hold the parameters data
1199        parameters = []
1200        for feature in csmlDoc.featureCollection.featureMembers:
1201            if hasattr(feature.parameter, 'href'):
1202                paramTriple = ""
1203                if hasattr(feature, 'description'):
1204                    paramTriple = feature.description.CONTENT
1205                    paramTriple += " | " + feature.parameter.href
1206                   
1207                    term = ""
1208                    if hasattr(feature, 'name'):
1209                        term = feature.name.CONTENT
1210   
1211                    paramTriple += " | " + term
1212                   
1213                    logging.debug("Got parameter info: %s" %paramTriple)
1214                    parameters.append(paramTriple)
1215       
1216        # update the atom with the extracted parameters
1217        logging.info("Adding CSML parameters to granule atom")
1218        self.addParameters(parameters)
1219        logging.info("Finished adding CSML data")
1220        return csmlDoc
1221
1222
1223    def lookupAssociatedData(self, type, dr, lookupIndirectReferences=False):
1224        '''
1225        Check through the atom links and retrieve any associated data of the
1226        specified type
1227        @param type: type of associated data to lookup - currently VTD.DEPLOYMENT_TERM
1228        or VTD.DE_TERM
1229        @param dr: Instance of DocumentRetrieve object - NB, this requires eXist
1230        config details which are not available to the Atom object
1231        @keyword lookupIndirectReferences: if True, the atom ID is used to search
1232        defined deployments to find those which reference it, otherwise only
1233        deployments data featured in the atom related links are processed
1234        '''
1235        logging.info("Looking up %s info" %type)
1236       
1237        self.allActivities = []
1238        self.allObs = []
1239        self.allDpts = []
1240
1241        if type != VTD.DE_TERM and type != VTD.DEPLOYMENT_TERM:
1242            raise ValueError('Unrecognised associated data type: %s' %type)
1243       
1244        # avoid duplicating lookup effort
1245        if (type == VTD.DEPLOYMENT_TERM and self.deployments) or \
1246            (type == VTD.DE_TERM and self.dataEntities):
1247            logging.info("- this info has already been looked up - returning")
1248            return
1249
1250        # firstly, collect all the references to the info required
1251        if lookupIndirectReferences:
1252            logging.info("Looking up indirect references")
1253           
1254            # if we're looking up DE data for deployments data, need to have the
1255            # deployments info looked up first
1256            if type == VTD.DE_TERM and self.isDeployable() and not self.deployments:
1257                self.lookupAssociatedData(VTD.DEPLOYMENT_TERM, dr, lookupIndirectReferences)
1258           
1259            logging.info("Looking up references to this atom from other %s" %type)
1260           
1261            # NB, if we're looking up deployments info, we only look up references
1262            # to this atom - if we're looking up DEs, we need to look up references
1263            # to the deployments referenced by this atom
1264            urls = [self.atomBrowseURL]
1265           
1266            if type == VTD.DE_TERM and self.isDeployable():
1267                urls = []
1268                for dep in self.deployments:
1269                    urls.append(dep.browseURL)
1270                   
1271            links = []
1272            for url in urls:
1273                doc = dr.get(type, dr.ATOM_TYPE, url, \
1274                             targetCollection = eXistConnector.BASE_COLLECTION_PATH)
1275                # now need to turn this results set into actual atoms
1276                tree = ET.fromstring(doc)
1277                for atom in tree:
1278                    logging.debug("- found reference in %s" %type)
1279                    links.append(ET.tostring(atom))
1280                   
1281            logging.info("Finished looking up indirect references")
1282        else:
1283            links = self.getLinksOfType(self.VTD.DEPLOYMENT_TERM)
1284
1285        # now retrieve the references and extract the required data
1286        logging.info("Retrieving info from %s references" %type)
1287        if type == VTD.DEPLOYMENT_TERM:
1288            logging.info("Extracting links data to deployment entitites")
1289            self.deployments = []
1290            for link in links:
1291                if lookupIndirectReferences:
1292                    deploymentAtom = link
1293                else:
1294                    localID = link.href.split("__ATOM__")[-1]
1295                    deploymentAtom = dr.get(self.ME.providerID, 'ATOM', localID, \
1296                                            targetCollection = eXistConnector.BASE_COLLECTION_PATH)
1297   
1298                deployment = Deployment.Deployment(Atom(xmlString=str(deploymentAtom)))
1299                self.deployments.append(deployment)
1300               
1301                self.addUniqueLinks(self.allActivities, deployment.activities)
1302                self.addUniqueLinks(self.allObs, deployment.obs)
1303                self.addUniqueLinks(self.allDpts, deployment.dpts)
1304        else:
1305            # for DE data, just store the title + link in a Link object
1306            self.dataEntities = []
1307            logging.info("Extracting links data to data entitites")
1308            for data in links:
1309                atom = Atom(xmlString=str(data))
1310                link = Link()
1311                link.title = atom.title
1312                link.href = atom.atomBrowseURL
1313                link.rel = atom.datasetID
1314               
1315                # NB, different deployments may be used by the same DE - so
1316                # avoid duplication
1317                self.addUniqueLinks(self.dataEntities, link)
1318           
1319        logging.info("Finished looking up %s info" %type)
1320
1321
1322    def addUniqueLinks(self, dataArray, links):
1323        '''
1324        Add links to specified array - if they are not already included
1325        @param dataArray: a list, potentially arlready containing links
1326        @param links: a Link or array of Links to add to the dataArray
1327        '''
1328        logging.debug("Adding new links")
1329        if not links:
1330            return
1331       
1332        if type(links) is not list:
1333            links = [links]
1334       
1335        for link in links:
1336            if type(link) is not Link:
1337                logging.warning("Link is not of 'Link' object type (type='%s') - skipping" %type(link))
1338                continue
1339            if link not in dataArray:
1340                logging.debug("- adding unique link")
1341                dataArray.append(link)
1342        logging.debug("Finished adding links")
1343
1344       
1345    def getFullPath(self):
1346        '''
1347        Return full path to atom in eXist, if it exists, or None, otherwise
1348        @return fullPath: string - collection + filename of atom in eXist
1349        '''
1350        # NB, name assigned when atom created in eXist - so if not defined, not
1351        # in eXist
1352        logging.debug("Getting full path to atom")
1353        if self.atomName:
1354            logging.debug("Return full path to atom in eXist")
1355            return self.getDefaultCollectionPath() + self.atomName
1356        logging.debug("Atom doesn't currently exist in eXist - return 'None'")
1357        return None
Note: See TracBrowser for help on using the repository browser.