source: exist/trunk/python/ndgUtils/models/Atom.py @ 4214

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/exist/trunk/python/ndgUtils/models/Atom.py@4214
Revision 4214, 24.6 KB checked in by cbyrom, 11 years ago (diff)

Add cmp method to DeploymentLink to allow for object equality tests +
add new method to MolesEntity to ensure that only unique deployment
links are added to the helper arrays + add new constructor input to
Atom to allow the content to be defined at creation.

Line 
1'''
2 Class representing data in  atom format - allowing moles data to be stored and accessed in a web feed compatible way
3 
4 @author: C Byrom, Tessella Jun 2008
5'''
6try: #python 2.5
7    from xml.etree import cElementTree as ET
8except ImportError:
9    try:
10        # if you've installed it yourself it comes this way
11        import cElementTree as ET
12    except ImportError:
13        # if you've egged it this is the way it comes
14        from ndgUtils.elementtree import cElementTree as ET
15import sys, logging, re, datetime
16from ndgUtils.eXistConnector import eXistConnector
17from ndgUtils.ETxmlView import et2text
18from utilities import getTripleData, escapeSpecialCharacters, \
19    tidyUpParameters, getISO8601Date
20from ndgUtils.vocabtermdata import VocabTermData as VTD
21from ndgUtils.models import MolesEntity as ME
22
class Person(object):
    '''
    Class representing atom author type data - with name, uri and role attributes

    NB, declared as a new-style class for consistency with the Atom class
    '''
    def __init__(self, tagName = "author", namespace = None):
        '''
        Constructor - all person data default to empty strings
        @param tagName: name of the XML element created by toXML
        @param namespace: optional prefix; when set, it is prepended, with
        a ':', to the name of the element and of all its child elements
        '''
        self.tagName = tagName
        self.ns = namespace
        self.name = ""
        self.uri = ""
        self.role = ""

    def fromString(self, personString):
        '''
        Set the person data from a single string
        @param personString: string from which getTripleData extracts the
        three values stored, in order, as name, uri and role
        (presumably 'name | uri | role' - confirm against getTripleData)
        '''
        (self.name, self.uri, self.role) = getTripleData(personString)

    def fromETElement(self, personTag):
        '''
        Set the person data from an elementtree element
        @param personTag: element with optional 'name', 'role' and 'uri'
        child elements; missing or empty children give empty strings
        '''
        self.name = personTag.findtext('name') or ""
        self.role = personTag.findtext('role') or ""
        self.uri = personTag.findtext('uri') or ""
        # pass the values as arguments so the message is only formatted
        # when debug logging is actually enabled
        logging.debug("Added name: '%s', role: '%s', uri: '%s'",
                      self.name, self.role, self.uri)

    def toXML(self):
        '''
        Convert the person into an elementtree element
        @return: element named after tagName, with name, uri and role
        child elements (in that order) for each value that is not empty
        '''
        prefix = ""
        if self.ns:
            prefix = self.ns + ':'

        author = ET.Element(prefix + self.tagName)

        # only non-empty values are written out
        for childName, value in (("name", self.name),
                                 ("uri", self.uri),
                                 ("role", self.role)):
            if value:
                ET.SubElement(author, prefix + childName).text = value

        return author
64   
65
class Link(object):
    '''
    Class representing an atom link - with href, title and rel attributes
    '''
    def __init__(self):
        '''
        Constructor - all link data default to empty strings
        '''
        self.href = ""
        self.title = ""
        self.rel = ""

    def fromString(self, linkString):
        '''
        Set the link data from a single string
        @param linkString: string from which getTripleData extracts the
        three values stored, in order, as href, title and rel
        '''
        # NB, the third value must go into 'rel' - the attribute written out
        # by toXML; it was previously stored in an unused 'ref' attribute,
        # so links created from strings always had an empty rel
        (self.href, self.title, self.rel) = getTripleData(linkString)

    def fromETElement(self, linkTag):
        '''
        Set the link data from an elementtree element
        @param linkTag: element with optional 'href', 'rel' and 'title'
        attributes; missing or empty attributes give empty strings
        '''
        self.href = linkTag.attrib.get('href') or ""
        self.rel = linkTag.attrib.get('rel') or ""
        self.title = linkTag.attrib.get('title') or ""

    def toXML(self):
        '''
        Convert the link into an elementtree element
        @return: 'link' element with href, title and rel attributes - NB, all
        three attributes are always written, even when empty
        '''
        link = ET.Element("link")
        link.attrib["href"] = self.href
        link.attrib["title"] = self.title
        link.attrib["rel"] = self.rel
        return link
89
90
class Category(object):
    '''
    Class representing an atom category - with term, scheme and label attributes

    NB, declared as a new-style class for consistency with the Atom class
    '''
    def __init__(self):
        '''
        Constructor - all category data default to empty strings
        '''
        self.term = ""
        self.scheme = ""
        self.label = ""

    def fromString(self, linkString):
        '''
        Set the category data from a single string
        @param linkString: string from which getTripleData extracts the
        three values stored, in order, as label, scheme and term
        '''
        (self.label, self.scheme, self.term) = getTripleData(linkString)

    def fromETElement(self, linkTag):
        '''
        Set the category data from an elementtree element
        @param linkTag: element with optional 'term', 'label' and 'scheme'
        attributes; missing or empty attributes give empty strings
        '''
        self.term = linkTag.attrib.get('term') or ""
        self.label = linkTag.attrib.get('label') or ""
        self.scheme = linkTag.attrib.get('scheme') or ""

    def toXML(self):
        '''
        Convert the category into an elementtree element
        @return: 'category' element with term, scheme and label attributes
        - NB, all three attributes are always written, even when empty
        '''
        category = ET.Element("category")
        category.attrib["term"] = self.term
        category.attrib["scheme"] = self.scheme
        category.attrib["label"] = self.label
        return category
114
115
class Atom(object):
    '''
    Class representing a single atom entry document, together with the moles
    specific extensions (moles entity, temporal range and bounding box) added
    to it. An atom can be built up programmatically and exported with
    toXML/toPrettyXML, or initialised from an existing document via fromString.
    '''

    # category terms flagging the categories which carry the atom type/subtype
    ATOM_TYPE = "ATOM_TYPE"
    ATOM_SUBTYPE = "ATOM_SUBTYPE"

    def __init__(self, atomType = None, vocabTermData = None, ndgObject = None,
                 xmlString = None):
        '''
        Constructor - initialise the atom variables
        @param atomType: type of the atom; used as a key into the vocab term
        data when exporting the atom
        @param vocabTermData: vocab term data object to use; if not set, a
        new VocabTermData instance is created
        @param ndgObject: stored on the atom as is
        @param xmlString: if set, the atom content is loaded from this XML
        string, via fromString
        '''
        logging.info("Initialising atom")
        if atomType:
            logging.info(" - of type '%s'", atomType)
        self.atomType = atomType

        # some data have further subtypes specified
        self.subtype = None

        self.ndgObject = ndgObject

        self.atomName = None
        # NB, initialised here since createAtomID checks whether it is set
        # - previously this raised an AttributeError on a fresh atom
        self.atomBrowseURL = None
        self.files = []
        self.author = None
        self.contributors = []
        self.atomAuthors = []
        self.parameters = []
        self.spatialData = []
        self.temporalData = []
        self.relatedLinks = []
        self.summary = []
        self.csmlFile = None
        self.cdmlFile = None
        # general variable to use for setting the atom content - NB, if a csmlFile is specified
        # (either directly or via a cdmlFile specification), this will be the content by default
        # for this purpose
        self.contentFile = None
        self.logos = []
        self.title = None
        self.datasetID = None        # NB, the dataset id ends up in the atomName - <path><datasetID>.atom
        self.atomID = None

        # boundary box info - to replace spatial/temporalData?
        self.minX = None
        self.minY = None
        self.maxX = None
        self.maxY = None
        self.t1 = None
        self.t2 = None

        # additional, non standard atom data can be included in the molesExtra element
        if vocabTermData:
            self.VTD = vocabTermData
        else:
            self.VTD = VTD()

        self.ME = ME.MolesEntity()

        # date when the atom was first ingested
        self.publishedDate = None

        # last update date
        self.updateDate = None

        # assume atom in working state by default - this is used to define what collection
        # in eXist the atom is stored in
        self.state = eXistConnector.WORKING_COLLECTION_PATH

        if xmlString:
            self.fromString(xmlString)

        logging.info("Atom initialised")


    def __addAtomTypeDataXML(self, root):
        '''
        Add the atom type, and subtype data, if available, to atom categories
        - and lookup and add the appropriate vocab term data
        @param root: element to append the category elements to
        '''
        if self.atomType:
            logging.info("Adding atom type info to XML output")
            category = Category()
            category.label = self.VTD.TERM_DATA[self.atomType].title
            # look up the appropriate vocab term data
            category.scheme = self.VTD.getTermCurrentVocabURL(self.atomType)
            category.term = self.ATOM_TYPE
            root.append(category.toXML())

        if self.subtype:
            logging.info("Adding atom subtype info to XML output")
            # NB, use a separate category here - the subtype can be set when
            # the atom type is not, in which case no category exists yet
            category = Category()
            # NB subtypes not all defined in the vocab term data, so use
            # the raw subtype and a placeholder scheme for the moment
            category.label = self.subtype
            category.scheme = 'NOT SET UP AT PRESENT'
            category.term = self.ATOM_SUBTYPE
            root.append(category.toXML())


    def addMolesEntityData(self, abbreviation, provider_id, object_creation_time):
        '''
        Add data to include in the moles entity element
        @param abbreviation: stored as the moles entity abbreviation
        @param provider_id: stored as the moles entity provider ID
        @param object_creation_time: passed through getISO8601Date and stored
        as the moles entity created date
        '''
        logging.debug('Adding moles entity information')
        self.ME.abbreviation = abbreviation
        self.ME.providerID = provider_id
        self.ME.createdDate = getISO8601Date(object_creation_time)
        logging.debug('Moles entity information added')


    def _isNewParameter(self, param):
        '''
        Check if a parameter is already specified in the atom, return False if
        so, otherwise return True
        @param param: object with term, scheme and label attributes
        @return: False if a parameter with the same term, scheme and label
        is already held by the atom, True otherwise
        '''
        for p in self.parameters:
            if p.term == param.term and \
                p.scheme == param.scheme and \
                p.label == param.label:
                return False
        return True


    def addRelatedLinks(self, linkVals):
        '''
        Add related links in string format - converting to Link objects
        @param linkVals: string of format, 'uri | title | vocabServerURL'
        '''
        self.relatedLinks.append(self.objectify(linkVals, 'relatedLinks'))


    def addLogos(self, logoVals):
        '''
        Add related logos in string format - converting to Link objects
        @param logoVals: string of format, 'uri | title | vocabServerURL'
        '''
        # NB, logos are kept in their own list - as is done when parsing an
        # existing atom; previously they were appended to relatedLinks here
        self.logos.append(self.objectify(logoVals, 'logo'))


    def addParameters(self, params):
        '''
        Add a parameter to list - ensuring it is unique and has been formatted and tidied appropriately
        @param params: parameter string, or array of parameter strings, to
        add to atom parameters collection
        '''
        # avoid strings being parsed character by character
        if isinstance(params, str):
            params = [params]

        for param in params:
            # firstly tidy parameter
            param = tidyUpParameters(param)
            category = Category()
            category.fromString(param)

            # now check for uniqueness
            if self._isNewParameter(category):
                logging.debug("Adding new parameter: %s", param)
                self.parameters.append(category)


    def _linksToXML(self, root):
        '''
        Add required links to the input element
        @param root: element to add links to - NB, should be the root element of the atom
        '''
        selfLink = ET.SubElement(root, "link")
        selfLink.attrib["href"] = self.atomBrowseURL
        selfLink.attrib["rel"] = "self"

        # the associated moles doc has the same URL as the atom, but with
        # 'ATOM' replaced by 'NDG-B1' (as is assumed in _parseLinksData)
        molesLink = ET.SubElement(root, "link")
        molesLink.attrib["href"] = self.atomBrowseURL.replace('ATOM', 'NDG-B1')
        molesLink.attrib["rel"] = "related"

        logging.debug(self.relatedLinks)
        for relatedLink in self.relatedLinks:
            root.append(relatedLink.toXML())

        for logo in self.logos:
            root.append(logo.toXML())

    def toXML(self):
        '''
        Convert the atom into XML representation and return this
        NB, this also updates the atom: the author defaults to the provider
        if not set, any atomAuthors are added to the moles entity responsible
        parties and the published/update dates are set if not yet defined
        @return: xml version of atom - as an elementtree ElementTree
        '''
        logging.info("Creating formatted XML version of Atom")
        root = ET.Element("entry")
        root.attrib["xmlns"] = "http://www.w3.org/2005/Atom"
        root.attrib["xmlns:moles"] = "http://ndg.nerc.ac.uk/schema/moles2alpha"
        root.attrib["xmlns:georss"] = "http://www.georss.org/georss"
        root.attrib["xmlns:gml"] = "http://www.opengis.net/gml"
        idElement = ET.SubElement(root, "id")
        idElement.text = self.atomID
        title = ET.SubElement(root, "title")
        title.text = self.title
        self._linksToXML(root)

        # NB, the author tag is mandatory for atoms - so if an explicit
        # author has not been set, just take the author to be the provider
        if not self.author:
            author = Person()
            author.name = self.ME.providerID
            author.uri = self.ME.providerID
            self.author = author

        root.append(self.author.toXML())

        # NB, only the first author in the list is the author; the rest are contributors
        for contributor in self.contributors:
            root.append(contributor.toXML())

        # add the moles entity section, if it is required
        if self.ME:
            # add any authors info
            for author in self.atomAuthors:
                if author not in self.ME.responsibleParties:
                    self.ME.responsibleParties.append(author)
            root.append(self.ME.toXML())

        # add parameters data
        for param in self.parameters:
            root.append(param.toXML())

        # add the type and subtype data
        self.__addAtomTypeDataXML(root)

        summary = ET.SubElement(root, "summary")
        summary.text = self.Summary

        # add link to content, if required - NB, can only have one content element in atom
        # - and this is mandatory
        content = ET.SubElement(root, "content")
        if self.contentFile:
            content.attrib["type"] = "application/xml"
            content.attrib["src"] = self.contentFile
        else:
            content.text = "Metadata document"

        # NB, the 'Z' suffix declares the time to be UTC - so the UTC time
        # must be used here, rather than the local time
        currentDate = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ")
        if not self.publishedDate:
            self.publishedDate = currentDate

        # NB, update element is mandatory
        updated = ET.SubElement(root, "updated")
        if not self.updateDate:
            self.updateDate = currentDate
        updated.text = self.updateDate

        published = ET.SubElement(root, "published")
        published.text = self.publishedDate

        # add temporal range data, if available - NB, the element itself is
        # always added; the range is written as 't1' or 't1/t2'
        temporalRange = ET.SubElement(root, "moles:temporalRange")
        if self.t1:
            temporalRange.text = self.t1
            if self.t2:
                temporalRange.text += "/" + self.t2

        # add spatial range data, if available
        self._addSpatialData(root)

        tree = ET.ElementTree(root)
        logging.info("XML version of Atom created")
        return tree


    def __getSummary(self):
        '''
        @return: the summary lines as a single string, with each line
        terminated by a newline
        '''
        logging.debug("Getting summary data")
        return "".join([summaryLine + "\n" for summaryLine in self.summary])

    def __setSummary(self, summary):
        '''
        Set the summary lines from a single string
        @param summary: summary text, with lines separated by newlines; None
        clears the summary
        '''
        logging.debug("Adding summary data")
        if summary is None:
            self.summary = []
            return

        # the getter terminates every line with a newline, so ignore a
        # single trailing newline here - otherwise every get/set round trip
        # (e.g. export followed by ingest) adds an extra empty line
        if summary.endswith('\n'):
            summary = summary[:-1]
        self.summary = summary.split('\n')

    Summary = property(fset=__setSummary, fget=__getSummary, doc="Atom summary")


    def fromString(self, xmlString):
        '''
        Initialise Atom object using an xmlString
        @param xmlString: representation of atom as an XML string
        @raise ValueError: if the atom has no self referencing link
        '''
        logging.info("Ingesting data from XML string")

        # firstly, remove any namespaces used - to avoid problems with elementtree
        # NB, this is a plain text replace, so it also alters any element
        # text or attribute values containing these prefixes
        logging.debug("Stripping moles namespace from string to allow easy handling with elementtree")
        for prefix in ('moles:', 'georss:', 'gml:'):
            xmlString = xmlString.replace(prefix, '')
        xmlString = xmlString.replace('xmlns="http://www.w3.org/2005/Atom"', '')

        # now create elementtree with the XML string
        logging.debug("Create elementtree instance with XML string")
        tree = ET.fromstring(xmlString)

        title = tree.findtext('title')
        if title:
            logging.debug("Adding title data")
            self.title = title

        summary = tree.findtext('summary')
        if summary:
            self.Summary = summary

        # NB, cope with a missing author element - toXML will default the
        # author to the provider in this case
        authorElement = tree.find('author')
        if authorElement is not None:
            logging.debug("Adding author data")
            author = Person()
            author.fromETElement(authorElement)
            self.author = author

        for contributorElement in tree.findall('contributor'):
            logging.debug("Adding contributor data")
            contributor = Person(tagName = 'contributor')
            contributor.fromETElement(contributorElement)
            self.contributors.append(contributor)

        # NB, only process the entity element if it has child elements - this
        # is the explicit form of the truth test on the element
        molesElement = tree.find('entity')
        if molesElement is not None and len(molesElement) > 0:
            self.ME.fromET(molesElement)
            for author in self.ME.responsibleParties:
                self.atomAuthors.append(author)

            # NB, must lookup the ID following the ME lookup
            # - since the provider ID from this is required to construct
            # the browse URL
            atomID = tree.findtext('id')
            if atomID:
                # the dataset ID is the part of the ID after the '__ATOM__' marker
                self.setDatasetID(atomID.split('__ATOM__')[1])

        self._parseCategoryData(tree.findall('category'))

        self._parseLinksData(tree.findall('link'))

        contentTag = tree.find('content')
        if contentTag is not None:
            logging.debug("Found content tag - checking for CSML/CDML file data")
            contentSrc = contentTag.attrib.get('src')
            if contentSrc:
                # NB, the path will reveal more reliably whether we're dealing with CSML and CDML files
                upperSrc = contentSrc.upper()
                if 'CSML' in upperSrc:
                    logging.debug("Adding CSML file data")
                    self.csmlFile = contentSrc
                elif 'CDML' in upperSrc:
                    logging.debug("Adding CDML file data")
                    self.cdmlFile = contentSrc
                self.contentFile = contentSrc

        temporalRange = tree.findtext('temporalRange')
        if temporalRange:
            logging.debug("Adding temporal range data")
            timeData = temporalRange.split('/')
            self.t1 = timeData[0]
            if len(timeData) > 1:
                self.t2 = timeData[1]

        # NB, this parser won't mind if we're dealing with Envelope or EnvelopeWithTimePeriod
        # - only the first corner element of each type is used; empty corner
        # elements are ignored
        lowerCorners = tree.findall('.//lowerCorner')
        if lowerCorners:
            logging.debug("Adding min spatial range data")
            spatialData = (lowerCorners[0].text or "").split()
            if spatialData:
                self.minX = spatialData[0]
            if len(spatialData) > 1:
                self.minY = spatialData[1]

        upperCorners = tree.findall('.//upperCorner')
        if upperCorners:
            logging.debug("Adding max spatial range data")
            spatialData = (upperCorners[0].text or "").split()
            if spatialData:
                self.maxX = spatialData[0]
            if len(spatialData) > 1:
                self.maxY = spatialData[1]

        publishedDate = tree.findtext('published')
        if publishedDate:
            logging.debug("Adding published date")
            self.publishedDate = publishedDate

        logging.info("Completed data ingest")


    def _parseCategoryData(self, categories):
        '''
        Extract the atom type, subtype and parameters from category elements
        @param categories: an array of <category> elements; those with the
        ATOM_TYPE/ATOM_SUBTYPE term set the atom type/subtype from their
        label, all others are added to the atom parameters
        '''
        logging.debug("Adding category/parameters data")
        for category in categories:
            cat = Category()
            cat.fromETElement(category)

            if cat.term == self.ATOM_TYPE:
                logging.debug("Found atom type data")
                self.atomType = cat.label
            elif cat.term == self.ATOM_SUBTYPE:
                logging.debug("Found atom subtype data")
                self.subtype = cat.label
            else:
                self.parameters.append(cat)


    def setDatasetID(self, datasetID):
        '''
        Set the dataset ID for the atom - and generate an appropriate atom name using this
        @param datasetID: ID to set for the atom
        '''
        self.datasetID = datasetID
        self._generateAtomName(datasetID)
        self.atomID = self.createAtomID(datasetID)


    def createAtomID(self, datasetID):
        '''
        Create a unique ID, conforming to atom standards, for atom
        NB, see http://diveintomark.org/archives/2004/05/28/howto-atom-id
        @param datasetID: ID of atom's dataset
        @return: unique ID - of the form 'tag:<host>,<date>:/<path>', where
        host and path come from the atom browse URL and date is today's date
        '''
        logging.info("Creating unique ID for atom")
        if not self.atomBrowseURL:
            self._generateAtomName(datasetID)

        # drop the URL scheme and any '#' characters, then split off the host
        urlBit = self.atomBrowseURL.split('://')[1]
        urlBits = urlBit.replace('#', '').split('/')
        dateBit = datetime.datetime.today().strftime("%Y-%m-%d")

        atomID = "tag:" + urlBits[0] + "," + dateBit + ":/" + "/".join(urlBits[1:])
        logging.info("- unique ID created for atom")
        logging.debug(" - '%s'", atomID)
        return atomID


    def _generateAtomName(self, datasetID):
        '''
        Generate a consistent name for the atom - with full eXist doc path
        NB, the browse URL includes the moles entity provider ID - so this
        must be set before calling this method
        @param datasetID: ID of atom's dataset
        '''
        self.atomName = datasetID + ".atom"
        self.atomBrowseURL = VTD.BROWSE_ROOT_URL + \
            self.ME.providerID + "__ATOM__" + datasetID


    def _parseLinksData(self, links):
        '''
        Extract links and atom data from array of link elements in the XML representation of the atom
        @param links: an array of <link> elements
        @raise ValueError: if there is no link with rel of 'self'
        '''
        # firstly, get all data to start with, so we can properly process it afterwards
        # - NB, the links are grouped by their rel values
        linkData = {}
        logging.debug("Getting link data")
        for linkTag in links:
            link = Link()
            link.fromETElement(linkTag)

            relLinks = linkData.setdefault(link.rel, [])
            # logos are recognised by their title and stored separately
            if link.title == VTD.TERM_DATA[VTD.LOGO_TERM].title:
                self.logos.append(link)
            else:
                relLinks.append(link)

        # there should be one self referencing link - which will provide info on the atom itself
        if 'self' not in linkData:
            errorMessage = "Atom does not have self referencing link - " + \
                "cannot ascertain datasetID without this - please fix"
            logging.error(errorMessage)
            raise ValueError(errorMessage)

        # this is the link describing the atom itself
        self.atomBrowseURL = linkData['self'][0].href

        # now remove this value and the associated moles doc link
        # - since both are regenerated when the atom is exported
        del linkData['self']
        molesDoc = self.atomBrowseURL.replace('ATOM', 'NDG-B1')
        if 'related' in linkData:
            linkData['related'] = [link for link in linkData['related']
                                   if link.href != molesDoc]

        # now add the remaining links to the atom
        for key in linkData:
            for link in linkData[key]:
                logging.debug("Adding link data")
                self.relatedLinks.append(link)


    def _addSpatialData(self, element):
        '''
        Add spatial coverage element to an input element
        NB, the georss:where element is always added; it is left empty if
        minX is not set
        @param element: element to add coverage data to
        '''
        logging.info("Adding spatial data to Atom")
        bbox = ET.SubElement(element, "georss:where")
        if not self.minX:
            logging.info("No spatial data specified")
            return

        envelope = ET.SubElement(bbox, "gml:Envelope")
        lc = ET.SubElement(envelope, "gml:lowerCorner")
        lc.text = self.minX + " " + self.minY
        uc = ET.SubElement(envelope, "gml:upperCorner")
        uc.text = self.maxX + " " + self.maxY


    def setAttribute(self, attributeName, attributeValue):
        '''
        Set the value of an atom attribute - and do some basic tidying up of the string content
        - to escape any XML unfriendly characters
        @param attributeName: name of the attribute whose value to set
        @param attributeValue: value to set the attribute to - either a
        single value or a list of values
        '''
        logging.debug("Setting attribute, %s, to %s", attributeName, attributeValue)

        # escape any special characters if a value has been specified
        # NB, need to cope with both single values and arrays
        if attributeValue:
            if isinstance(attributeValue, list):
                attributeValue = [self.objectify(escapeSpecialCharacters(val),
                                                 attributeName)
                                  for val in attributeValue]
            else:
                attributeValue = self.objectify(escapeSpecialCharacters(attributeValue),
                                                attributeName)

        if attributeName != "authors":
            setattr(self, attributeName, attributeValue)
            return

        # handle the special case of authors; only one author is allowed per atom
        # - the others should be treated as contributors
        if not attributeValue:
            # no author given - toXML will default this to the provider
            self.author = None
            return

        if not isinstance(attributeValue, list):
            attributeValue = [attributeValue]

        self.author = attributeValue[0]
        if len(attributeValue) > 1:
            contributors = attributeValue[1:]
            # ensure these are exported as contributor, not author, elements
            # - as is expected when the atom is parsed by fromString
            for contributor in contributors:
                if isinstance(contributor, Person):
                    contributor.tagName = "contributor"
            self.contributors = contributors


    def objectify(self, objectVals, attributeName):
        '''
        Some inputs are specified as strings but need to be converted into
        objects - do this here
        @param objectVals: a '|' delimited string of values
        @param attributeName: name of attribute the values belong to
        @return: Link for the 'relatedLinks' and 'logo' attributes, Person
        for the 'atomAuthors' and 'authors' attributes; otherwise - or if
        the input is not a string - the input is returned unchanged
        '''
        if not isinstance(objectVals, str):
            return objectVals

        if attributeName in ("relatedLinks", "logo"):
            obj = Link()
        elif attributeName in ("atomAuthors", "authors"):
            obj = Person()
        else:
            return objectVals

        obj.fromString(objectVals)
        return obj


    def toPrettyXML(self):
        '''
        Returns nicely formatted XML as string
        @return: output of et2text for the atom, preceded by an XML
        declaration line
        '''
        atomXML = self.toXML()

        # create the string
        logging.debug("Converting the elementtree object into a string")
        prettyXML = et2text(atomXML.getroot())

        # add XML version tag
        prettyXML = "<?xml version=\"1.0\" encoding=\"utf-8\" ?>\n" + prettyXML
        logging.info("Created formatted version of XML object")
        return prettyXML
Note: See TracBrowser for help on using the repository browser.