source: ndgCommon/trunk/ndg/common/src/lib/atomutilities.py @ 4981

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/ndgCommon/trunk/ndg/common/src/lib/atomutilities.py@4991
Revision 4981, 5.8 KB checked in by cbyrom, 11 years ago (diff)

Fix a couple of small formatting problems in atomutilities + create
test suite to properly test this + add a fix to the dif.xq and improve
the efficiency of the atom2DIF xquery.

Line 
'''
Various helper methods for use with atom data - mainly for
adding missing info when doing an atom -> DIF transform

@author: C Byrom, Tessella Jan 09
'''
import logging
from xml.etree import ElementTree as ET

import ndg.common.src.clients.xmldb.eXist.dbconstants as dc
from ndg.common.src.lib.utilities import findElementIndex
from ndg.common.src.models.ndgObject import ndgObject
12
# Marker appended (in brackets) after a vocab URI when the term it refers to
# could not be looked up and replaced by its title
NEED_TO_RESOLVE_STRING = 'NEED TO RESOLVE'
14
def addStandardKeywords(doc):
    '''
    Add the keywords common to all data sets to a DIF doc

    One DIF:Keyword element is created per standard keyword and inserted
    into the doc root just after the position reported by findElementIndex
    for the DIF:Parameters element (presumably the last one, given the
    isLast flag - confirm against findElementIndex).
    @param doc: string representation of DIF doc
    @return: doc in string form with added data
    @raise ValueError: if no Parameters element can be found in the doc
    '''
    logging.info("Adding common keywords")
    difDoc = ET.fromstring(doc)

    elementName = '{' + ndgObject.DIF_NS + '}Parameters'
    index = findElementIndex(difDoc, elementName, isLast = True)

    if index < 0:
        errorMessage = "Could not find '%s' element in doc to insert standard keyword data after - skipping doc" %elementName
        logging.error(errorMessage)
        raise ValueError(errorMessage)

    # NB, each insert is offset by the number of keywords already added -
    # inserting them all at the same position would reverse their order
    keywords = ['NERC_DDC'] #'climatologyMeteorologyAtmosphere'
    for offset, keyword in enumerate(keywords):
        keywordElement = ET.Element('{' + ndgObject.DIF_NS + '}Keyword')
        keywordElement.text = keyword
        difDoc.insert(index + 1 + offset, keywordElement)

    logging.info("Keywords added")
    return ET.tostring(difDoc)
39
40
def expandParametersData(doc):
    '''
    Expand the vocab references held in the DIF:Parameters elements of a doc

    The parameters data is taken from the atom category elements, so the
    DIF:Topic of each Parameters element holds 'URI---TERM'.  This is looked
    up on the vocab server and replaced by the title of the term found; if
    the lookup fails for any reason the Topic is set to the URI followed by
    the NEED_TO_RESOLVE_STRING marker instead.

    Parameters elements with no Topic, an empty Topic, or a Topic that does
    not split into exactly two parts on '---' are left unchanged.
    @param doc: string representation of DIF doc
    @return: doc in string form with added data
    '''
    logging.info("Expanding Parameters data")
    difDoc = ET.fromstring(doc)

    parametersName = '{' + ndgObject.DIF_NS + '}Parameters'
    topicName = '{' + ndgObject.DIF_NS + '}Topic'
    for element in difDoc:
        if element.tag != parametersName:
            continue
        logging.debug("Parameters element found")

        # extract the topic data - this is where the vocab data is stored
        # NB, find() returns None when there is no Topic child, so this must
        # be checked before the text is accessed
        topicElement = element.find(topicName)
        if topicElement is None or not topicElement.text:
            continue

        # NB, the data is concatenated as URI---TERM
        data = topicElement.text.split('---')
        if len(data) != 2:
            continue
        uri, term = data

        try:
            # NOTE(review): vocabUtil is not defined or imported in the
            # visible part of this module - if that is really the case, this
            # lookup always raises NameError and the fallback is always used
            VTI = vocabUtil.getTermItemfromURIAndTerm(uri, term)
            topicElement.text = VTI.title
        except Exception:
            # NB, this will currently fail since the vocab data is incomplete;
            # the failure is logged so that genuine errors are not hidden
            logging.debug("Could not resolve term, '%s', from vocab, '%s'",
                          term, uri, exc_info = True)
            topicElement.text = uri + " (%s)" %NEED_TO_RESOLVE_STRING

    logging.info("Parameters expanded")
    return ET.tostring(difDoc)
74
75
def addOrgData(doc, targetCollection, dataCentre):
    '''
    Add organisation data to a DIF doc

    The data centre element is inserted into the doc root just after the
    position reported by findElementIndex for DIF:Data_Set_Language.  If the
    target collection name contains 'Published', a DIF:Version element of
    '1.0' is also appended to the DIF:Data_Set_Citation element.
    @param doc: string representation of DIF doc
    @param targetCollection: Collection that the data is from
    @param dataCentre: ElementTree element representing the DIF:Data_Center
    element - with the required organisation data to add
    @return: doc in string form with added data
    @raise ValueError: if no data centre element is provided, or if the doc
    lacks the Data_Set_Language element or - for published data - the
    Data_Set_Citation element
    '''
    logging.info("Adding organisation data")

    # NB, must compare with None here - an Element with no children is falsy
    if dataCentre is None:
        errorMessage = "No data centre element provided to add to doc - skipping doc"
        logging.error(errorMessage)
        raise ValueError(errorMessage)

    difDoc = ET.fromstring(doc)

    elementName = '{' + ndgObject.DIF_NS + '}Data_Set_Language'
    index = findElementIndex(difDoc, elementName)

    if index < 0:
        errorMessage = "Could not find '%s' element in doc to insert organisation data after - skipping doc" %elementName
        logging.error(errorMessage)
        raise ValueError(errorMessage)

    difDoc.insert(index + 1, dataCentre)

    # add additional tag for Published (not published) data
    if 'Published' in targetCollection:
        logging.info("Adding published version info")
        # TODO: when proper versioning is implemented in the Atoms, this
        # tag should extract the relevant info
        citationName = '{' + ndgObject.DIF_NS + '}Data_Set_Citation'
        citationElement = difDoc.find(citationName)
        # NB, find() returns None if there is no citation to add the version to
        if citationElement is None:
            errorMessage = "Could not find '%s' element in doc to add version info to - skipping doc" %citationName
            logging.error(errorMessage)
            raise ValueError(errorMessage)

        versionElement = ET.Element('{' + ndgObject.DIF_NS + '}Version')
        versionElement.text = '1.0'
        citationElement.append(versionElement)
        logging.info("Finished adding version info")

    logging.info("Finished adding organisation info")
    return ET.tostring(difDoc)
113
114
def getDataCentreDIFElement(repositoryID, atomClient):
    '''
    Organisation data needs to be added to DIF records created from atoms
    - this data is stored in the eXist DB - and is retrieved using this method
    @param repositoryID: ID of the organisation where the data is stored.  Currently
    valid IDs are 'badc.nerc.ac.uk' and 'neodc.nerc.ac.uk'
    @param atomClient: xml db client implementing the InterfaceXMLDBClient interface
    @return: ElementTree Element containing the specified Data_Center element data
    @raise ValueError: if an unrecognised repository ID is specified, if no
    organisation data is returned for it, or if the returned data has no
    Data_Center element
    '''
    logging.info("Getting organisation data for repository, '%s'" %repositoryID)
    if repositoryID not in ('badc.nerc.ac.uk', 'neodc.nerc.ac.uk'):
        raise ValueError("Unrecognised repository ID, '%s'" %repositoryID)

    # NB, using the query() method (via the useChunked keyword) here results
    # in the namespaces being expanded - which ET doesn't like
    orgData = atomClient.buildAndRunQuery('dif', dc.RESOURCES_COLLECTION_PATH,
                                          repositoryID, 'organisation',
                                          useChunked = False)

    if not orgData:
        raise ValueError("Could not find organisation data for provider, '%s'" %repositoryID)

    logging.info("Data retrieved - turning this into Elementtree Element data")

    # strip out the content we want - only the first result is used
    tree = ET.fromstring(orgData[0])
    dataCentre = tree.find('{' + ndgObject.DIF_NS + '}Data_Center')

    # NB, find() returns None if the element is missing - fail here, rather
    # than handing None back to a caller expecting an Element
    if dataCentre is None:
        raise ValueError("Could not find Data_Center element in organisation data for provider, '%s'" %repositoryID)

    logging.info("- returning Element data for data centre")
    return dataCentre
Note: See TracBrowser for help on using the repository browser.