source: exist/trunk/python/ndgUtils/models/Atom.py @ 4217

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/exist/trunk/python/ndgUtils/models/Atom.py@4217
Revision 4217, 25.0 KB checked in by cbyrom, 11 years ago (diff)

Adjust atom to include atom type term ID as well as the vocab url - and
update the vocab data to include the terms for the various different
atom subtypes.

Line 
1'''
2 Class representing data in  atom format - allowing moles data to be stored and accessed in a web feed compatible way
3 
4 @author: C Byrom, Tessella Jun 2008
5'''
6try: #python 2.5
7    from xml.etree import cElementTree as ET
8except ImportError:
9    try:
10        # if you've installed it yourself it comes this way
11        import cElementTree as ET
12    except ImportError:
13        # if you've egged it this is the way it comes
14        from ndgUtils.elementtree import cElementTree as ET
15import sys, logging, re, datetime
16from ndgUtils.eXistConnector import eXistConnector
17from ndgUtils.ETxmlView import et2text
18from utilities import getTripleData, escapeSpecialCharacters, \
19    tidyUpParameters, getISO8601Date
20from ndgUtils.vocabtermdata import VocabTermData as VTD
21from ndgUtils.models import MolesEntity as ME
22
class Person():
    '''
    Atom person construct (e.g. author or contributor) holding name, uri and
    role values, together with the tag name and optional namespace prefix to
    use when it is written out as XML
    '''
    def __init__(self, tagName = "author", namespace = None):
        '''
        @keyword tagName: name of the element produced by toXML
        @keyword namespace: optional prefix to put in front of every tag name
        '''
        self.tagName, self.ns = tagName, namespace
        self.name = self.uri = self.role = ""

    def fromString(self, personString):
        '''
        Populate the person from a string; the three values returned by
        getTripleData are taken, in order, as name, uri and role
        @param personString: string to hand to getTripleData
        '''
        triple = getTripleData(personString)
        (self.name, self.uri, self.role) = triple

    def fromETElement(self, personTag):
        '''
        Populate the person from the name, role and uri child elements of an
        elementtree element - missing or empty children give empty strings
        @param personTag: element to read the values from
        '''
        for field in ('name', 'role', 'uri'):
            setattr(self, field, personTag.findtext(field) or "")
        logging.debug("Added name: '%s', role: '%s', uri: '%s'" \
                      %(self.name, self.role, self.uri))

    def toXML(self):
        '''
        Create an element for the person; only non-empty values get a child
        element, in the order name, uri, role
        @return: elementtree element representing the person
        '''
        if self.ns:
            prefix = self.ns + ':'
        else:
            prefix = ""

        person = ET.Element(prefix + self.tagName)
        for field in ("name", "uri", "role"):
            value = getattr(self, field)
            if value:
                ET.SubElement(person, prefix + field).text = value

        return person
64   
65
class Link():
    '''
    Class representing an atom link - with href, title and rel attributes
    '''
    def __init__(self):
        self.href = ""
        self.title = ""
        self.rel = ""

    def fromString(self, linkString):
        '''
        Populate the link from a string; the three values returned by
        getTripleData are taken, in order, as href, title and rel
        @param linkString: string to hand to getTripleData
        '''
        # NB, the third value must go into 'rel' - the attribute that
        # toXML and fromETElement use; it was previously written to a
        # mis-spelt 'ref' attribute, leaving rel permanently empty
        (self.href, self.title, self.rel) = getTripleData(linkString)

    def fromETElement(self, linkTag):
        '''
        Populate the link from the href, rel and title attributes of an
        elementtree element - missing attributes give empty strings
        @param linkTag: element to read the attributes from
        '''
        self.href = linkTag.attrib.get('href') or ""
        self.rel = linkTag.attrib.get('rel') or ""
        self.title = linkTag.attrib.get('title') or ""

    def toXML(self):
        '''
        Create a 'link' element with href, title and rel attributes
        @return: elementtree element representing the link
        '''
        link = ET.Element("link")
        link.attrib["href"] = self.href
        link.attrib["title"] = self.title
        link.attrib["rel"] = self.rel
        return link
89
90
class Category():
    '''
    Atom category construct - made up of a term, a scheme and a label
    '''
    def __init__(self):
        self.term = self.scheme = self.label = ""

    def fromString(self, linkString):
        '''
        Populate the category from a string; the three values returned by
        getTripleData are taken, in order, as label, scheme and term
        @param linkString: string to hand to getTripleData
        '''
        triple = getTripleData(linkString)
        (self.label, self.scheme, self.term) = triple

    def fromETElement(self, linkTag):
        '''
        Populate the category from the term, label and scheme attributes of an
        elementtree element - missing attributes give empty strings
        @param linkTag: element to read the attributes from
        '''
        attributes = linkTag.attrib
        self.term = attributes.get('term') or ""
        self.label = attributes.get('label') or ""
        self.scheme = attributes.get('scheme') or ""

    def toXML(self):
        '''
        Create a 'category' element with term, scheme and label attributes
        @return: elementtree element representing the category
        '''
        categoryElement = ET.Element("category")
        for attributeName in ("term", "scheme", "label"):
            categoryElement.set(attributeName, getattr(self, attributeName))
        return categoryElement
114
115
116class Atom(object):
117
    # value of the 'term' attribute on the category element carrying the atom type
    ATOM_TYPE = "ATOM_TYPE"
    # value of the 'term' attribute on the category element carrying the atom subtype
    ATOM_SUBTYPE = "ATOM_SUBTYPE"
120
121    def __init__(self, atomType = None, vocabTermData = None, ndgObject = None, \
122                 xmlString = None):
123        '''
124        Constructor - initialise the atom variables
125        '''
126        logging.info("Initialising atom")
127        if atomType:
128            logging.info(" - of type '%s'" %atomType)
129        self.atomTypeID = atomType
130
131        # some data have further subtypes specified
132        self.subtype = None
133       
134        self.ndgObject = ndgObject
135
136        self.atomName = None
137        self.files = []
138        self.author = None
139        self.contributors = []
140        self.atomAuthors = []
141        self.parameters = []
142        self.spatialData = []
143        self.temporalData = []
144        self.relatedLinks = []
145        self.summary = []
146        self.csmlFile = None
147        self.cdmlFile = None
148        # general variable to use for setting the atom content - NB, if a csmlFile is specified
149        # (either directly or via a cdmlFile specification), this will be the content by default
150        # for this purpose
151        self.contentFile = None     
152        self.logos = []
153        self.title = None
154        self.datasetID = None        # NB, the dataset id ends up in the atomName - <path><datasetID>.atom
155        self.atomID = None
156   
157        # boundary box info - to replace spatial/temporalData?
158        self.minX = None
159        self.minY = None
160        self.maxX = None
161        self.maxY = None
162        self.t1 = None
163        self.t2 = None
164       
165        # additional, non standard atom data can be included in the molesExtra element
166        if vocabTermData:
167            self.VTD = vocabTermData
168        else:
169            self.VTD = VTD()
170
171        if self.atomTypeID:
172            self.atomTypeName = self.VTD.TERM_DATA[self.atomTypeID].title
173
174        self.ME = ME.MolesEntity()
175       
176        # date when the atom was first ingested
177        self.publishedDate = None
178
179        # last update date
180        self.updateDate = None
181
182        # assume atom in working state by default - this is used to define what collection
183        # in eXist the atom is stored in
184        self.state = eXistConnector.WORKING_COLLECTION_PATH
185       
186        if xmlString:
187            self.fromString(xmlString)
188           
189        logging.info("Atom initialised")
190
191
192    def getValidSubTypes(self):
193        '''
194        Get list of subtypes that are valid wrt this atom type
195        '''
196        logging.debug("Lookup up subtypes for type, '%s'" %self.atomTypeID)
197        subTypes = self.VTD.SUBTYPE_TERMS.get(self.atomTypeID) or []
198        logging.debug(subTypes)
199        return subTypes
200       
201           
202
203    def __addAtomTypeDataXML(self, root):
204        '''
205        Add the atom type, and subtype data, if available, to atom categories
206        - and lookup and add the appropriate vocab term data
207        '''
208        if self.atomTypeID:
209            logging.info("Adding atom type info to XML output")
210            category = Category()
211            category.label = self.atomTypeID
212            # look up the appropriate vocab term data
213            category.scheme = self.VTD.getTermCurrentVocabURL(self.atomTypeID)
214            category.term = self.ATOM_TYPE
215            root.append(category.toXML())
216
217        if self.subtype:
218            logging.info("Adding atom subtype info to XML output")
219            # NB subtypes not all defined, so leave this out for the moment
220            category.label = self.subtype
221            # look up the appropriate vocab term data
222            category.scheme = self.VTD.getTermCurrentVocabURL(self.subtype)
223            category.term = self.ATOM_SUBTYPE
224            root.append(category.toXML())
225
226
227    def addMolesEntityData(self, abbreviation, provider_id, object_creation_time):
228        '''
229        Add data to include in the moles entity element
230        '''
231        logging.debug('Adding moles entity information')
232        self.ME.abbreviation = abbreviation
233        self.ME.providerID = provider_id
234        self.ME.createdDate = getISO8601Date(object_creation_time)
235        logging.debug('Moles entity information added')
236
237
238    def _isNewParameter(self, param):
239        '''
240        Check if a parameter is already specified in the atom, return False if
241        so, otherwise return True
242        '''
243        for p in self.parameters:
244            if p.term == param.term and \
245                p.scheme == param.scheme and \
246                p.label == param.label:
247                return False
248        return True
249
250
251    def addRelatedLinks(self, linkVals):
252        '''
253        Add related links in string format - converting to Link objects
254        @param linkVals: string of format, 'uri | title | vocabServerURL'
255        '''
256        self.relatedLinks.append(self.objectify(linkVals, 'relatedLinks'))
257
258
259    def addLogos(self, logoVals):
260        '''
261        Add related logos in string format - converting to Link objects
262        @param linkVals: string of format, 'uri | title | vocabServerURL'
263        '''
264        self.relatedLinks.append(self.objectify(logoVals, 'logo'))
265
266
267    def addParameters(self, params):
268        '''
269        Add a parameter to list - ensuring it is unique and has been formatted and tidied appropriately
270        @params param: parameter, as string array, to add to atom parameters collection
271        '''
272        # avoid strings being parsed character by character
273        if type(params) is str:
274            params = [params]
275           
276        for param in params:
277            # firstly tidy parameter
278            param = tidyUpParameters(param)
279            category = Category()
280            category.fromString(param)
281
282            # now check for uniqueness
283            if self._isNewParameter(category):
284                logging.debug("Adding new parameter: %s" %param)
285                self.parameters.append(category)
286   
287   
288    def _linksToXML(self, root):
289        '''
290        Add required links to the input element
291        @param root: element to add links to - NB, should be the root element of the atom
292        '''
293        selfLink = ET.SubElement(root, "link")
294        selfLink.attrib["href"] = self.atomBrowseURL
295        selfLink.attrib["rel"] = "self"
296        molesLink = ET.SubElement(root, "link")
297        molesDoc = re.sub('ATOM','NDG-B1', self.atomBrowseURL)
298        molesLink.attrib["href"] = molesDoc
299        molesLink.attrib["rel"] = "related"
300       
301        logging.debug(self.relatedLinks)
302        for relatedLink in self.relatedLinks:
303            root.append(relatedLink.toXML())
304       
305        for logo in self.logos:
306            root.append(logo.toXML())
307   
308    def toXML(self):
309        '''
310        Convert the atom into XML representation and return this
311        @return: xml version of atom
312        '''
313        logging.info("Creating formatted XML version of Atom")
314        root = ET.Element("entry")
315        root.attrib["xmlns"] = "http://www.w3.org/2005/Atom"
316        root.attrib["xmlns:moles"] = "http://ndg.nerc.ac.uk/schema/moles2alpha"
317        root.attrib["xmlns:georss"] = "http://www.georss.org/georss"
318        root.attrib["xmlns:gml"] = "http://www.opengis.net/gml"
319        id = ET.SubElement(root, "id")
320        id.text = self.atomID
321        title = ET.SubElement(root, "title")
322        title.text = self.title
323        self._linksToXML(root)
324
325        # NB, the author tag is mandatory for atoms - so if an explicit
326        # author has not been set, just take the author to be the provider
327        if not self.author:
328            author = Person()
329            author.name = self.ME.providerID
330            author.uri = self.ME.providerID
331            self.author = author
332
333        root.append(self.author.toXML())
334           
335        # NB, only the first author in the list is the author; the rest are contrinbutors
336        for contributor in self.contributors:
337            root.append(contributor.toXML())
338
339        # add the moles entity section, if it is required
340        if self.ME:
341            # add any authors info
342            for author in self.atomAuthors:
343                if author not in self.ME.responsibleParties:
344                    self.ME.responsibleParties.append(author)
345            root.append(self.ME.toXML())
346
347        # add parameters data
348        for param in self.parameters:
349            root.append(param.toXML())
350
351        # add the type and subtype data
352        self.__addAtomTypeDataXML(root)
353                   
354        summary = ET.SubElement(root, "summary")
355        summary.text = self.Summary
356
357        # add link to content, if required - NB, can only have one content element in atom
358        # - and this is mandatory
359        content = ET.SubElement(root, "content")
360        if self.contentFile:
361            content.attrib["type"] = "application/xml"
362            content.attrib["src"] = self.contentFile
363        else:
364            content.text = "Metadata document"
365       
366        # if there's a published date already defined, assume we're doing an update now
367        # NB, update element is mandatory
368        currentDate = datetime.datetime.today().strftime("%Y-%m-%dT%H:%M:%SZ")
369        if not self.publishedDate:
370            self.publishedDate = currentDate
371
372        updated = ET.SubElement(root, "updated")
373        if not self.updateDate:
374            self.updateDate = currentDate
375        updated.text = self.updateDate
376
377        published = ET.SubElement(root, "published")
378        published.text = self.publishedDate
379
380        # add temporal range data, if available
381        temporalRange = ET.SubElement(root, "moles:temporalRange")
382        if self.t1:
383            temporalRange.text = self.t1
384            if self.t2:
385                temporalRange.text += "/" + self.t2
386
387        # add spatial range data, if available
388        self._addSpatialData(root)
389
390        tree = ET.ElementTree(root)
391        logging.info("XML version of Atom created")
392        return tree
393
394
395    def __getSummary(self):
396        logging.debug("Getting summary data")
397        summaryString = ""
398        for summary_line in self.summary:
399            summaryString += summary_line + "\n"
400
401        return summaryString
402
403    def __setSummary(self, summary):
404        logging.debug("Adding summary data")
405        self.summary = []
406        for summary_line in summary.split('\n'):
407            self.summary.append(summary_line)
408           
409    Summary = property(fset=__setSummary, fget=__getSummary, doc="Atom summary")
410
411           
412    def fromString(self, xmlString):
413        '''
414        Initialise Atom object using an xmlString
415        @param xmlString: representation of atom as an XML string
416        '''
417        logging.info("Ingesting data from XML string")
418       
419        # firstly, remove any namespaces used - to avoid problems with elementtree
420        logging.debug("Stripping moles namespace from string to allow easy handling with elementtree")
421        xmlString = xmlString.replace('moles:', '')
422        xmlString = xmlString.replace('georss:', '')
423        xmlString = xmlString.replace('gml:', '')
424        xmlString = xmlString.replace('xmlns="http://www.w3.org/2005/Atom"', '')
425
426        # now create elementtree with the XML string
427        logging.debug("Create elementtree instance with XML string")
428        tree = ET.fromstring(xmlString)
429       
430        title = tree.findtext('title')
431        if title:
432            logging.debug("Adding title data")
433            self.title = title
434
435        summary = tree.findtext('summary')
436        if summary:
437            self.Summary = summary
438
439        authorElement = tree.find('author')
440        logging.debug("Adding author data")
441        author = Person()
442        author.fromETElement(authorElement)
443        self.author = author
444
445        contributorElements = tree.findall('contributor')
446        for contributorElement in contributorElements:
447            logging.debug("Adding contributor data")
448            contributor = Person(tagName = 'contributor')
449            contributor.fromETElement(contributorElement)
450            self.contributors.append(contributor)
451
452        molesElement = tree.find('entity')
453        if molesElement:
454            self.ME.fromET(molesElement)
455            for author in self.ME.responsibleParties:
456                self.atomAuthors.append(author)
457               
458            # NB, must lookup the ID following the ME lookup
459            # - since the provider ID from this is required to construct
460            # the browse URL
461            id = tree.findtext('id')
462            if id:
463                id = id.split('__ATOM__')[1]
464                self.setDatasetID(id)
465
466        self._parseCategoryData(tree.findall('category'))
467
468        self._parseLinksData(tree.findall('link'))
469           
470        contentTag = tree.find('content')
471        if contentTag != None:
472            logging.debug("Found content tag - checking for CSML/CDML file data")
473            file = contentTag.attrib.get('src')
474            if file:
475                # NB, the path will reveal more reliably whether we're dealing with CSML and CDML files
476                if file.upper().find('CSML') > -1:
477                    logging.debug("Adding CSML file data")
478                    self.csmlFile = file
479                elif file.upper().find('CDML') > -1:
480                    logging.debug("Adding CDML file data")
481                    self.cdmlFile = file
482                self.contentFile = file
483       
484        range = tree.findtext('temporalRange')
485        if range:
486            logging.debug("Adding temporal range data")
487            timeData = range.split('/')
488            self.t1 = timeData[0]
489            if len(timeData) > 1:
490                self.t2 = timeData[1]
491       
492        # NB, this parser won't mind if we're dealing with Envelope or EnvelopeWithTimePeriod
493        minBBox = tree.findall('.//lowerCorner')
494        if minBBox:
495            logging.debug("Adding min spatial range data")
496            minBBox = minBBox[0]
497            spatialData = minBBox.text.split()
498            self.minX = spatialData[0]
499            if len(spatialData) > 1:
500                self.minY = spatialData[1]
501       
502        maxBBox = tree.findall('.//upperCorner')
503        if maxBBox:
504            maxBBox = maxBBox[0]
505            logging.debug("Adding max spatial range data")
506            spatialData = maxBBox.text.split()
507            self.maxX = spatialData[0]
508            if len(spatialData) > 1:
509                self.maxY = spatialData[1]
510               
511        publishedDate = tree.findtext('published')
512        if publishedDate:
513            logging.debug("Adding published date")
514            self.publishedDate = publishedDate
515           
516        logging.info("Completed data ingest")
517   
518   
519    def _parseCategoryData(self, categories):
520        logging.debug("Adding category/parameters data")
521        for category in categories:
522            cat = Category()
523            cat.fromETElement(category)
524           
525            if cat.term == self.ATOM_TYPE:
526                logging.debug("Found atom type data")
527                self.atomTypeID = cat.label
528                self.atomTypeName = self.VTD.TERM_DATA[cat.label].title
529                continue
530            elif cat.term == self.ATOM_SUBTYPE:
531                logging.debug("Found atom subtype data")
532                self.subtype = cat.label
533                continue
534
535            self.parameters.append(cat)
536   
537
538    def setDatasetID(self, datasetID):
539        '''
540        Set the dataset ID for the atom - and generate an appropriate atom name using this
541        @param datasetID: ID to set for the atom
542        '''
543        self.datasetID = datasetID
544        self._generateAtomName(datasetID) 
545        self.atomID = self.createAtomID(datasetID)
546
547
548    def createAtomID(self, datasetID):
549        '''
550        Create a unique ID, conforming to atom standards, for atom
551        NB, see http://diveintomark.org/archives/2004/05/28/howto-atom-id
552        @param datasetID: ID of atom's dataset
553        @return: unique ID
554        '''
555        logging.info("Creating unique ID for atom")
556        if not self.atomBrowseURL:
557            self._generateAtomName(datasetID)
558        urlBit = self.atomBrowseURL.split('://')[1]
559        urlBit = urlBit.replace('#', '')
560        urlBits = urlBit.split('/')
561        dateBit = datetime.datetime.today().strftime("%Y-%m-%d")
562       
563        id = "tag:" + urlBits[0] + "," + dateBit + ":/" + "/".join(urlBits[1:])
564        logging.info("- unique ID created for atom")
565        logging.debug(" - '%s'" %id)
566        return id
567       
568       
569    def _generateAtomName(self, datasetID):
570        '''
571        Generate a consistent name for the atom - with full eXist doc path
572        @param datasetID: ID of atom's dataset
573        '''
574        self.atomName = datasetID + ".atom"
575        self.atomBrowseURL = VTD.BROWSE_ROOT_URL + \
576            self.ME.providerID + "__ATOM__" + datasetID
577
578
579    def _parseLinksData(self, links):
580        '''
581        Extract links and atom data from array of link elements in the XML representation of the atom
582        @param links: an array of <link> elements
583        '''
584        # firstly, get all data to start with, so we can properly process it afterwards
585        linkData = {}
586        logging.debug("Getting link data")
587        for linkTag in links:
588            link = Link()
589            link.fromETElement(linkTag)
590
591            if not linkData.has_key(link.rel):
592                linkData[link.rel] = []
593            if link.title == VTD.TERM_DATA[VTD.LOGO_TERM].title:
594                self.logos.append(link)
595            else:
596                linkData[link.rel].append(link)
597
598        # there should be one self referencing link - which will provide info on the atom itself
599        if not linkData.has_key('self'):
600            errorMessage = "Atom does not have self referencing link - " + \
601                "cannot ascertain datasetID without this - please fix"
602            logging.error(errorMessage)
603            raise ValueError(errorMessage)
604       
605        # this is the link describing the atom itself
606        self.atomBrowseURL = linkData['self'][0].href
607       
608        # now remove this value and the associated moles doc link
609        del linkData['self']
610        molesDoc = self.atomBrowseURL.replace('ATOM', 'NDG-B1')
611        if linkData.has_key('related'):
612            relatedLinks = []
613            for link in linkData['related']:
614                if link.href != molesDoc:
615                    relatedLinks.append(link)
616           
617            linkData['related'] = relatedLinks
618               
619        # now add the remaining links to the atom
620        for key in linkData:
621            for link in linkData[key]:
622                logging.debug("Adding link data")
623                self.relatedLinks.append(link)
624       
625
626    def _addSpatialData(self, element):
627        '''
628        Add spatial coverage element to an input element
629        @param element: element to add coverage data to
630        '''
631        logging.info("Adding spatial data to Atom")
632        bbox = ET.SubElement(element, "georss:where")
633        if not self.minX:
634            logging.info("No spatial data specified")
635            return
636       
637        envelope = ET.SubElement(bbox, "gml:Envelope")
638        lc = ET.SubElement(envelope, "gml:lowerCorner")
639        lc.text = self.minX + " " + self.minY
640        uc = ET.SubElement(envelope, "gml:upperCorner")
641        uc.text = self.maxX + " " + self.maxY
642
643       
644    def setAttribute(self, attributeName, attributeValue):
645        '''
646        Set the value of an atom attribute - and do some basic tidying up of the string content
647        - to escape any XML unfriendly characters
648        @param attributeName: name of the attribute whose value to set
649        @param attributeValue: value to set the attribute to 
650        '''
651        logging.debug("Setting attribute, %s, to %s" %(attributeName, attributeValue))
652        origValue = attributeValue
653       
654        # escape any special characters if a value has been specified
655        # NB, need to cope with both single values and arrays
656        if attributeValue:
657            if type(attributeValue) is list:
658                newVals = []
659                for val in attributeValue:
660                    newVals.append(objectify(escapeSpecialCharacters(val)), attributeName)
661                attributeValue = newVals
662                   
663            else:
664                attributeValue = objectify(escapeSpecialCharacters(attributeValue), attributeName)
665
666        # handle the special case of authors; only one author is allowed per atom
667        # - the others should be treated as contributors
668        if attributeName == "authors":
669            setattr(self, "author", attributeValue[0])
670            if len(attributeValue) > 1:
671                setattr(self, "contributors", attributeValue[1:])
672        else:
673            setattr(self, attributeName, attributeValue)
674
675
676    def objectify(self, objectVals, attributeName):
677        '''
678        Some inputs are specified as strings but need to be converted into
679        objects - do this here
680        @param objectVals: a '|' delimited string of values
681        @param attributeName: name of attribute the values belong to
682        '''
683        obj = None
684        if type(objectVals) != str:
685            return objectVals
686       
687        if attributeName == "relatedLinks" or attributeName == "logo":
688            obj = Link()
689        elif attributeName == "atomAuthors" or attributeName == "authors":
690            obj = Person()
691
692        if obj:
693            obj.fromString(objectVals)
694            return obj
695       
696        return objectVals
697
698
699    def toPrettyXML(self):
700        '''
701        Returns nicely formatted XML as string
702        '''
703        atomXML = self.toXML()
704
705        # create the string
706        logging.debug("Converting the elementtree object into a string")
707        prettyXML = et2text(atomXML.getroot())
708
709        # add XML version tag
710        prettyXML = "<?xml version=\"1.0\" encoding=\"utf-8\" ?>\n" + prettyXML
711        logging.info("Created formatted version of XML object")
712        return prettyXML
Note: See TracBrowser for help on using the repository browser.