source: exist/trunk/python/ndgUtils/models/Atom.py @ 4219

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/exist/trunk/python/ndgUtils/models/Atom.py@4219
Revision 4219, 25.8 KB checked in by cbyrom, 11 years ago (diff)

Simplify vocab data by only returning termIDs for the subtypes + fix
a typo + extend Atom, allowing dicts of values to be set in the constructor +
add method to determine the eXist collection relating to the atom state.

Line 
1'''
2 Class representing data in  atom format - allowing moles data to be stored and accessed in a web feed compatible way
3 
4 @author: C Byrom, Tessella Jun 2008
5'''
6try: #python 2.5
7    from xml.etree import cElementTree as ET
8except ImportError:
9    try:
10        # if you've installed it yourself it comes this way
11        import cElementTree as ET
12    except ImportError:
13        # if you've egged it this is the way it comes
14        from ndgUtils.elementtree import cElementTree as ET
15import sys, logging, re, datetime
16from ndgUtils.eXistConnector import eXistConnector
17from ndgUtils.ETxmlView import et2text
18from utilities import getTripleData, escapeSpecialCharacters, \
19    tidyUpParameters, getISO8601Date
20from ndgUtils.vocabtermdata import VocabTermData as VTD
21from ndgUtils.models import MolesEntity as ME
22
class Person():
    '''
    Atom person construct (e.g. author or contributor) - holding name, uri and role values
    '''
    def __init__(self, tagName = "author", namespace = None):
        '''
        @param tagName: name of the XML element produced by toXML
        @param namespace: optional prefix applied to all element names produced by toXML
        '''
        self.tagName = tagName
        self.ns = namespace
        self.name = self.uri = self.role = ""

    def fromString(self, personString):
        '''
        Populate the person from a string
        @param personString: string handed to getTripleData; the three values
        returned are taken to be name, uri and role, in that order
        '''
        personName, personURI, personRole = getTripleData(personString)
        self.name = personName
        self.uri = personURI
        self.role = personRole

    def fromETElement(self, personTag):
        '''
        Populate the person from an elementtree element
        @param personTag: element with optional name, role and uri child elements;
        missing or empty children result in empty strings
        '''
        for field in ('name', 'role', 'uri'):
            setattr(self, field, personTag.findtext(field) or "")
        logging.debug("Added name: '%s', role: '%s', uri: '%s'" \
                      %(self.name, self.role, self.uri))

    def toXML(self):
        '''
        Create an elementtree element for the person - only non-empty values
        are added as child elements
        @return: element representing the person
        '''
        prefix = self.ns + ':' if self.ns else ""
        person = ET.Element(prefix + self.tagName)

        for field in ("name", "uri", "role"):
            value = getattr(self, field)
            if value:
                ET.SubElement(person, prefix + field).text = value

        return person
64   
65
class Link():
    '''
    Class representing an atom link - with href, title and rel attributes
    '''
    def __init__(self):
        self.href = ""
        self.title = ""
        self.rel = ""

    def fromString(self, linkString):
        '''
        Populate the link from a string
        @param linkString: string handed to getTripleData; the three values
        returned are taken to be href, title and rel, in that order
        '''
        # NB, the third value must end up in 'rel' - this is the attribute that
        # toXML serialises (it was previously stored in an unused 'ref' attribute)
        (self.href, self.title, self.rel) = getTripleData(linkString)

    def fromETElement(self, linkTag):
        '''
        Populate the link from an elementtree element
        @param linkTag: element with optional href, rel and title attributes;
        missing or empty attributes result in empty strings
        '''
        self.href = linkTag.attrib.get('href') or ""
        self.rel = linkTag.attrib.get('rel') or ""
        self.title = linkTag.attrib.get('title') or ""

    def toXML(self):
        '''
        Create a 'link' elementtree element - NB, all three attributes are
        always written, even when empty
        @return: element representing the link
        '''
        link = ET.Element("link")
        link.attrib["href"] = self.href
        link.attrib["title"] = self.title
        link.attrib["rel"] = self.rel
        return link
89
90
class Category():
    '''
    Atom category construct - holding term, scheme and label values
    '''
    def __init__(self):
        self.term = self.scheme = self.label = ""

    def fromString(self, linkString):
        '''
        Populate the category from a string
        @param linkString: string handed to getTripleData; the three values
        returned are taken to be label, scheme and term, in that order
        '''
        categoryLabel, categoryScheme, categoryTerm = getTripleData(linkString)
        self.label = categoryLabel
        self.scheme = categoryScheme
        self.term = categoryTerm

    def fromETElement(self, linkTag):
        '''
        Populate the category from an elementtree element
        @param linkTag: element with optional term, label and scheme attributes;
        missing or empty attributes result in empty strings
        '''
        attributes = linkTag.attrib
        for field in ('term', 'label', 'scheme'):
            setattr(self, field, attributes.get(field) or "")

    def toXML(self):
        '''
        Create a 'category' elementtree element - all three attributes are
        always written, even when empty
        @return: element representing the category
        '''
        categoryElement = ET.Element("category")
        for field in ("term", "scheme", "label"):
            categoryElement.set(field, getattr(self, field))
        return categoryElement
114
115
116class Atom(object):
117
118    ATOM_TYPE = "ATOM_TYPE"
119    ATOM_SUBTYPE = "ATOM_SUBTYPE"
120
121    def __init__(self, atomType = None, vocabTermData = None, ndgObject = None, \
122                 xmlString = None, state = eXistConnector.WORKING_COLLECTION_PATH, **inputs):
123        '''
124        Constructor - initialise the atom variables
125        '''
126        logging.info("Initialising atom")
127        if atomType:
128            logging.info(" - of type '%s'" %atomType)
129        self.atomTypeID = atomType
130
131        # some data have further subtypes specified
132        self.subtype = None
133       
134        self.ndgObject = ndgObject
135
136        self.atomName = None
137        self.files = []
138        self.author = None
139        self.contributors = []
140        self.atomAuthors = []
141        self.parameters = []
142        self.spatialData = []
143        self.temporalData = []
144        self.relatedLinks = []
145        self.summary = []
146        self.csmlFile = None
147        self.cdmlFile = None
148        # general variable to use for setting the atom content - NB, if a csmlFile is specified
149        # (either directly or via a cdmlFile specification), this will be the content by default
150        # for this purpose
151        self.contentFile = None     
152        self.logos = []
153        self.title = None
154        self.datasetID = None        # NB, the dataset id ends up in the atomName - <path><datasetID>.atom
155        self.atomID = None
156   
157        # boundary box info - to replace spatial/temporalData?
158        self.minX = None
159        self.minY = None
160        self.maxX = None
161        self.maxY = None
162        self.t1 = None
163        self.t2 = None
164
165        self.ME = ME.MolesEntity()
166       
167        # date when the atom was first ingested
168        self.publishedDate = None
169
170        # last update date
171        self.updateDate = None
172
173        # assume atom in working state by default - this is used to define what collection
174        # in eXist the atom is stored in
175        self.state = state
176       
177        # additional, non standard atom data can be included in the molesExtra element
178        if vocabTermData:
179            self.VTD = vocabTermData
180        else:
181            self.VTD = VTD()
182       
183        if xmlString:
184            self.fromString(xmlString)
185
186        # if inputs passed in as dict, add these now
187        self.__dict__.update(inputs)
188
189        if self.atomTypeID:
190            self.atomTypeName = self.VTD.TERM_DATA[self.atomTypeID].title
191
192        logging.info("Atom initialised")
193
194
195    def getEXistCollectionPath(self):
196        '''
197        Determine the correct collection to use for the atom in eXist
198        '''
199        collectionPath = eXistConnector.BASE_COLLECTION_PATH + self.state
200       
201        if self.atomTypeID == VTD.DE_TERM:
202            collectionPath += eXistConnector.DE_COLLECTION_PATH
203        elif self.atomTypeID == VTD.GRANULE_TERM:
204            collectionPath += eXistConnector.GRANULE_COLLECTION_PATH
205        else:
206            collectionPath += eXistConnector.DEPLOYMENT_COLLECTION_PATH
207       
208        if self.ME:
209            collectionPath += self.ME.providerID + "/"
210        return collectionPath
211
212
213
214    def getValidSubTypes(self):
215        '''
216        Get list of subtypes that are valid wrt this atom type
217        '''
218        logging.debug("Lookup up subtypes for type, '%s'" %self.atomTypeID)
219        subTypes = self.VTD.SUBTYPE_TERMS.get(self.atomTypeID) or []
220        types = []
221        for st in subTypes:
222            types.append(self.VTD.TERM_DATA[st])
223        logging.debug("Found subtypes: %s" %subTypes)
224        return types
225       
226           
227
228    def __addAtomTypeDataXML(self, root):
229        '''
230        Add the atom type, and subtype data, if available, to atom categories
231        - and lookup and add the appropriate vocab term data
232        '''
233        if self.atomTypeID:
234            logging.info("Adding atom type info to XML output")
235            category = Category()
236            category.label = self.atomTypeID
237            # look up the appropriate vocab term data
238            category.scheme = self.VTD.getTermCurrentVocabURL(self.atomTypeID)
239            category.term = self.ATOM_TYPE
240            root.append(category.toXML())
241
242        if self.subtype:
243            logging.info("Adding atom subtype info to XML output")
244            # NB subtypes not all defined, so leave this out for the moment
245            category.label = self.subtype
246            # look up the appropriate vocab term data
247            category.scheme = self.VTD.getTermCurrentVocabURL(self.subtype)
248            category.term = self.ATOM_SUBTYPE
249            root.append(category.toXML())
250
251
252    def addMolesEntityData(self, abbreviation, provider_id, object_creation_time):
253        '''
254        Add data to include in the moles entity element
255        '''
256        logging.debug('Adding moles entity information')
257        self.ME.abbreviation = abbreviation
258        self.ME.providerID = provider_id
259        self.ME.createdDate = getISO8601Date(object_creation_time)
260        logging.debug('Moles entity information added')
261
262
263    def _isNewParameter(self, param):
264        '''
265        Check if a parameter is already specified in the atom, return False if
266        so, otherwise return True
267        '''
268        for p in self.parameters:
269            if p.term == param.term and \
270                p.scheme == param.scheme and \
271                p.label == param.label:
272                return False
273        return True
274
275
276    def addRelatedLinks(self, linkVals):
277        '''
278        Add related links in string format - converting to Link objects
279        @param linkVals: string of format, 'uri | title | vocabServerURL'
280        '''
281        self.relatedLinks.append(self.objectify(linkVals, 'relatedLinks'))
282
283
284    def addLogos(self, logoVals):
285        '''
286        Add related logos in string format - converting to Link objects
287        @param linkVals: string of format, 'uri | title | vocabServerURL'
288        '''
289        self.relatedLinks.append(self.objectify(logoVals, 'logo'))
290
291
292    def addParameters(self, params):
293        '''
294        Add a parameter to list - ensuring it is unique and has been formatted and tidied appropriately
295        @params param: parameter, as string array, to add to atom parameters collection
296        '''
297        # avoid strings being parsed character by character
298        if type(params) is str:
299            params = [params]
300           
301        for param in params:
302            # firstly tidy parameter
303            param = tidyUpParameters(param)
304            category = Category()
305            category.fromString(param)
306
307            # now check for uniqueness
308            if self._isNewParameter(category):
309                logging.debug("Adding new parameter: %s" %param)
310                self.parameters.append(category)
311   
312   
313    def _linksToXML(self, root):
314        '''
315        Add required links to the input element
316        @param root: element to add links to - NB, should be the root element of the atom
317        '''
318        selfLink = ET.SubElement(root, "link")
319        selfLink.attrib["href"] = self.atomBrowseURL
320        selfLink.attrib["rel"] = "self"
321        molesLink = ET.SubElement(root, "link")
322        molesDoc = re.sub('ATOM','NDG-B1', self.atomBrowseURL)
323        molesLink.attrib["href"] = molesDoc
324        molesLink.attrib["rel"] = "related"
325       
326        logging.debug(self.relatedLinks)
327        for relatedLink in self.relatedLinks:
328            root.append(relatedLink.toXML())
329       
330        for logo in self.logos:
331            root.append(logo.toXML())
332   
333    def toXML(self):
334        '''
335        Convert the atom into XML representation and return this
336        @return: xml version of atom
337        '''
338        logging.info("Creating formatted XML version of Atom")
339        root = ET.Element("entry")
340        root.attrib["xmlns"] = "http://www.w3.org/2005/Atom"
341        root.attrib["xmlns:moles"] = "http://ndg.nerc.ac.uk/schema/moles2alpha"
342        root.attrib["xmlns:georss"] = "http://www.georss.org/georss"
343        root.attrib["xmlns:gml"] = "http://www.opengis.net/gml"
344        id = ET.SubElement(root, "id")
345        id.text = self.atomID
346        title = ET.SubElement(root, "title")
347        title.text = self.title
348        self._linksToXML(root)
349
350        # NB, the author tag is mandatory for atoms - so if an explicit
351        # author has not been set, just take the author to be the provider
352        if not self.author:
353            author = Person()
354            author.name = self.ME.providerID
355            author.uri = self.ME.providerID
356            self.author = author
357
358        root.append(self.author.toXML())
359           
360        # NB, only the first author in the list is the author; the rest are contrinbutors
361        for contributor in self.contributors:
362            root.append(contributor.toXML())
363
364        # add the moles entity section, if it is required
365        if self.ME:
366            # add any authors info
367            for author in self.atomAuthors:
368                if author not in self.ME.responsibleParties:
369                    self.ME.responsibleParties.append(author)
370            root.append(self.ME.toXML())
371
372        # add parameters data
373        for param in self.parameters:
374            root.append(param.toXML())
375
376        # add the type and subtype data
377        self.__addAtomTypeDataXML(root)
378                   
379        summary = ET.SubElement(root, "summary")
380        summary.text = self.Summary
381
382        # add link to content, if required - NB, can only have one content element in atom
383        # - and this is mandatory
384        content = ET.SubElement(root, "content")
385        if self.contentFile:
386            content.attrib["type"] = "application/xml"
387            content.attrib["src"] = self.contentFile
388        else:
389            content.text = "Metadata document"
390       
391        # if there's a published date already defined, assume we're doing an update now
392        # NB, update element is mandatory
393        currentDate = datetime.datetime.today().strftime("%Y-%m-%dT%H:%M:%SZ")
394        if not self.publishedDate:
395            self.publishedDate = currentDate
396
397        updated = ET.SubElement(root, "updated")
398        if not self.updateDate:
399            self.updateDate = currentDate
400        updated.text = self.updateDate
401
402        published = ET.SubElement(root, "published")
403        published.text = self.publishedDate
404
405        # add temporal range data, if available
406        temporalRange = ET.SubElement(root, "moles:temporalRange")
407        if self.t1:
408            temporalRange.text = self.t1
409            if self.t2:
410                temporalRange.text += "/" + self.t2
411
412        # add spatial range data, if available
413        self._addSpatialData(root)
414
415        tree = ET.ElementTree(root)
416        logging.info("XML version of Atom created")
417        return tree
418
419
420    def __getSummary(self):
421        logging.debug("Getting summary data")
422        summaryString = ""
423        for summary_line in self.summary:
424            summaryString += summary_line + "\n"
425
426        return summaryString
427
428    def __setSummary(self, summary):
429        logging.debug("Adding summary data")
430        self.summary = []
431        for summary_line in summary.split('\n'):
432            self.summary.append(summary_line)
433           
434    Summary = property(fset=__setSummary, fget=__getSummary, doc="Atom summary")
435
436           
437    def fromString(self, xmlString):
438        '''
439        Initialise Atom object using an xmlString
440        @param xmlString: representation of atom as an XML string
441        '''
442        logging.info("Ingesting data from XML string")
443       
444        # firstly, remove any namespaces used - to avoid problems with elementtree
445        logging.debug("Stripping moles namespace from string to allow easy handling with elementtree")
446        xmlString = xmlString.replace('moles:', '')
447        xmlString = xmlString.replace('georss:', '')
448        xmlString = xmlString.replace('gml:', '')
449        xmlString = xmlString.replace('xmlns="http://www.w3.org/2005/Atom"', '')
450
451        # now create elementtree with the XML string
452        logging.debug("Create elementtree instance with XML string")
453        tree = ET.fromstring(xmlString)
454       
455        title = tree.findtext('title')
456        if title:
457            logging.debug("Adding title data")
458            self.title = title
459
460        summary = tree.findtext('summary')
461        if summary:
462            self.Summary = summary
463
464        authorElement = tree.find('author')
465        logging.debug("Adding author data")
466        author = Person()
467        author.fromETElement(authorElement)
468        self.author = author
469
470        contributorElements = tree.findall('contributor')
471        for contributorElement in contributorElements:
472            logging.debug("Adding contributor data")
473            contributor = Person(tagName = 'contributor')
474            contributor.fromETElement(contributorElement)
475            self.contributors.append(contributor)
476
477        molesElement = tree.find('entity')
478        if molesElement:
479            self.ME.fromET(molesElement)
480            for author in self.ME.responsibleParties:
481                self.atomAuthors.append(author)
482               
483            # NB, must lookup the ID following the ME lookup
484            # - since the provider ID from this is required to construct
485            # the browse URL
486            id = tree.findtext('id')
487            if id:
488                id = id.split('__ATOM__')[1]
489                self.setDatasetID(id)
490
491        self._parseCategoryData(tree.findall('category'))
492
493        self._parseLinksData(tree.findall('link'))
494           
495        contentTag = tree.find('content')
496        if contentTag != None:
497            logging.debug("Found content tag - checking for CSML/CDML file data")
498            file = contentTag.attrib.get('src')
499            if file:
500                # NB, the path will reveal more reliably whether we're dealing with CSML and CDML files
501                if file.upper().find('CSML') > -1:
502                    logging.debug("Adding CSML file data")
503                    self.csmlFile = file
504                elif file.upper().find('CDML') > -1:
505                    logging.debug("Adding CDML file data")
506                    self.cdmlFile = file
507                self.contentFile = file
508       
509        range = tree.findtext('temporalRange')
510        if range:
511            logging.debug("Adding temporal range data")
512            timeData = range.split('/')
513            self.t1 = timeData[0]
514            if len(timeData) > 1:
515                self.t2 = timeData[1]
516       
517        # NB, this parser won't mind if we're dealing with Envelope or EnvelopeWithTimePeriod
518        minBBox = tree.findall('.//lowerCorner')
519        if minBBox:
520            logging.debug("Adding min spatial range data")
521            minBBox = minBBox[0]
522            spatialData = minBBox.text.split()
523            self.minX = spatialData[0]
524            if len(spatialData) > 1:
525                self.minY = spatialData[1]
526       
527        maxBBox = tree.findall('.//upperCorner')
528        if maxBBox:
529            maxBBox = maxBBox[0]
530            logging.debug("Adding max spatial range data")
531            spatialData = maxBBox.text.split()
532            self.maxX = spatialData[0]
533            if len(spatialData) > 1:
534                self.maxY = spatialData[1]
535               
536        publishedDate = tree.findtext('published')
537        if publishedDate:
538            logging.debug("Adding published date")
539            self.publishedDate = publishedDate
540           
541        logging.info("Completed data ingest")
542   
543   
544    def _parseCategoryData(self, categories):
545        logging.debug("Adding category/parameters data")
546        for category in categories:
547            cat = Category()
548            cat.fromETElement(category)
549           
550            if cat.term == self.ATOM_TYPE:
551                logging.debug("Found atom type data")
552                self.atomTypeID = cat.label
553                self.atomTypeName = self.VTD.TERM_DATA[cat.label].title
554                continue
555            elif cat.term == self.ATOM_SUBTYPE:
556                logging.debug("Found atom subtype data")
557                self.subtype = cat.label
558                continue
559
560            self.parameters.append(cat)
561   
562
563    def setDatasetID(self, datasetID):
564        '''
565        Set the dataset ID for the atom - and generate an appropriate atom name using this
566        @param datasetID: ID to set for the atom
567        '''
568        self.datasetID = datasetID
569        self._generateAtomName(datasetID) 
570        self.atomID = self.createAtomID(datasetID)
571
572
573    def createAtomID(self, datasetID):
574        '''
575        Create a unique ID, conforming to atom standards, for atom
576        NB, see http://diveintomark.org/archives/2004/05/28/howto-atom-id
577        @param datasetID: ID of atom's dataset
578        @return: unique ID
579        '''
580        logging.info("Creating unique ID for atom")
581        if not self.atomBrowseURL:
582            self._generateAtomName(datasetID)
583        urlBit = self.atomBrowseURL.split('://')[1]
584        urlBit = urlBit.replace('#', '')
585        urlBits = urlBit.split('/')
586        dateBit = datetime.datetime.today().strftime("%Y-%m-%d")
587       
588        id = "tag:" + urlBits[0] + "," + dateBit + ":/" + "/".join(urlBits[1:])
589        logging.info("- unique ID created for atom")
590        logging.debug(" - '%s'" %id)
591        return id
592       
593       
594    def _generateAtomName(self, datasetID):
595        '''
596        Generate a consistent name for the atom - with full eXist doc path
597        @param datasetID: ID of atom's dataset
598        '''
599        self.atomName = datasetID + ".atom"
600        self.atomBrowseURL = VTD.BROWSE_ROOT_URL + \
601            self.ME.providerID + "__ATOM__" + datasetID
602
603
604    def _parseLinksData(self, links):
605        '''
606        Extract links and atom data from array of link elements in the XML representation of the atom
607        @param links: an array of <link> elements
608        '''
609        # firstly, get all data to start with, so we can properly process it afterwards
610        linkData = {}
611        logging.debug("Getting link data")
612        for linkTag in links:
613            link = Link()
614            link.fromETElement(linkTag)
615
616            if not linkData.has_key(link.rel):
617                linkData[link.rel] = []
618            if link.title == VTD.TERM_DATA[VTD.LOGO_TERM].title:
619                self.logos.append(link)
620            else:
621                linkData[link.rel].append(link)
622
623        # there should be one self referencing link - which will provide info on the atom itself
624        if not linkData.has_key('self'):
625            errorMessage = "Atom does not have self referencing link - " + \
626                "cannot ascertain datasetID without this - please fix"
627            logging.error(errorMessage)
628            raise ValueError(errorMessage)
629       
630        # this is the link describing the atom itself
631        self.atomBrowseURL = linkData['self'][0].href
632       
633        # now remove this value and the associated moles doc link
634        del linkData['self']
635        molesDoc = self.atomBrowseURL.replace('ATOM', 'NDG-B1')
636        if linkData.has_key('related'):
637            relatedLinks = []
638            for link in linkData['related']:
639                if link.href != molesDoc:
640                    relatedLinks.append(link)
641           
642            linkData['related'] = relatedLinks
643               
644        # now add the remaining links to the atom
645        for key in linkData:
646            for link in linkData[key]:
647                logging.debug("Adding link data")
648                self.relatedLinks.append(link)
649       
650
651    def _addSpatialData(self, element):
652        '''
653        Add spatial coverage element to an input element
654        @param element: element to add coverage data to
655        '''
656        logging.info("Adding spatial data to Atom")
657        bbox = ET.SubElement(element, "georss:where")
658        if not self.minX:
659            logging.info("No spatial data specified")
660            return
661       
662        envelope = ET.SubElement(bbox, "gml:Envelope")
663        lc = ET.SubElement(envelope, "gml:lowerCorner")
664        lc.text = self.minX + " " + self.minY
665        uc = ET.SubElement(envelope, "gml:upperCorner")
666        uc.text = self.maxX + " " + self.maxY
667
668       
669    def setAttribute(self, attributeName, attributeValue):
670        '''
671        Set the value of an atom attribute - and do some basic tidying up of the string content
672        - to escape any XML unfriendly characters
673        @param attributeName: name of the attribute whose value to set
674        @param attributeValue: value to set the attribute to 
675        '''
676        logging.debug("Setting attribute, %s, to %s" %(attributeName, attributeValue))
677        origValue = attributeValue
678       
679        # escape any special characters if a value has been specified
680        # NB, need to cope with both single values and arrays
681        if attributeValue:
682            if type(attributeValue) is list:
683                newVals = []
684                for val in attributeValue:
685                    newVals.append(objectify(escapeSpecialCharacters(val)), attributeName)
686                attributeValue = newVals
687                   
688            else:
689                attributeValue = objectify(escapeSpecialCharacters(attributeValue), attributeName)
690
691        # handle the special case of authors; only one author is allowed per atom
692        # - the others should be treated as contributors
693        if attributeName == "authors":
694            setattr(self, "author", attributeValue[0])
695            if len(attributeValue) > 1:
696                setattr(self, "contributors", attributeValue[1:])
697        else:
698            setattr(self, attributeName, attributeValue)
699
700
701    def objectify(self, objectVals, attributeName):
702        '''
703        Some inputs are specified as strings but need to be converted into
704        objects - do this here
705        @param objectVals: a '|' delimited string of values
706        @param attributeName: name of attribute the values belong to
707        '''
708        obj = None
709        if type(objectVals) != str:
710            return objectVals
711       
712        if attributeName == "relatedLinks" or attributeName == "logo":
713            obj = Link()
714        elif attributeName == "atomAuthors" or attributeName == "authors":
715            obj = Person()
716
717        if obj:
718            obj.fromString(objectVals)
719            return obj
720       
721        return objectVals
722
723
724    def toPrettyXML(self):
725        '''
726        Returns nicely formatted XML as string
727        '''
728        atomXML = self.toXML()
729
730        # create the string
731        logging.debug("Converting the elementtree object into a string")
732        prettyXML = et2text(atomXML.getroot())
733
734        # add XML version tag
735        prettyXML = "<?xml version=\"1.0\" encoding=\"utf-8\" ?>\n" + prettyXML
736        logging.info("Created formatted version of XML object")
737        return prettyXML
Note: See TracBrowser for help on using the repository browser.