source: mauRepo/MolesManager/trunk/src/libs/migration/processor/commons.py @ 8144

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/mauRepo/MolesManager/trunk/src/libs/migration/processor/commons.py@8144
Revision 8144, 14.6 KB checked in by mnagni, 8 years ago (diff)

Implementation of tickets #22386, #22395

Line 
1'''
2Created on 15 Nov 2011
3
4@author: mnagni
5'''
6from ea_model.moles3_4.utilities.mo_responsiblepartyinfo import MO_ResponsiblePartyInfo
7from ea_model.iso_19115_2006_metadata_corrigendum.reference_system_information.md_identifier import MD_Identifier
8from httplib import HTTPConnection
9from xml.etree.ElementTree import XML, ElementTree, fromstring, dump, tostring
10import time, datetime
11from ea_model.upcomingiso.ci_organisation import CI_Organisation
12from ea_model.upcomingiso.ci_party import CI_Party
13from ea_model.upcomingiso.ci_individual import CI_Individual
14from libs.migration.exception.exceptions import NoDataLineage
15from ea_model.iso_19115_2006_metadata_corrigendum.citation_and_responsible_party_information.ci_citation import CI_Citation
16from ea_model.iso_19115_2006_metadata_corrigendum.data_quality_information.dq_element import DQ_Element
17from ea_model.iso_19115_2006_metadata_corrigendum.data_quality_information.dq_conformanceresult import DQ_ConformanceResult
18from hashlib import md5
19from xml.sax.saxutils import unescape, escape
20import html5lib
21from html5lib import sanitizer, treebuilders
22
# Root collection of the atom documents in the eXist REST interface
base = '/exist/rest/atoms'

# Document status values; note that two spellings of "published" are listed
DS_pUBLISHED = 'published'
DS_WORKING = 'working'
DS_PUBLISHED = 'Published'
docStatus = (DS_pUBLISHED, DS_WORKING, DS_PUBLISHED)

# Document type (collection) names
DT_DEPLOYMENTS = 'deployments'
DT_DATA_ENTITIES = 'data_entities'
DT_DEPLOYMENT_DATA = 'deployment_data'
DT_DATA_GRANULES = 'data_granules'
docTypes = (DT_DEPLOYMENTS, DT_DATA_ENTITIES, DT_DEPLOYMENT_DATA, DT_DATA_GRANULES)

# Document owner names
DO_BADC = 'badc.nerc.ac.uk'
DO_NEODC = 'neodc.nerc.ac.uk'
DO_UKSSDC = 'ukssdc.ac.uk'
docOwners = (DO_BADC, DO_NEODC, DO_UKSSDC)

CEDA = 'Centre for Environmental Data Archive'

# XML namespaces, already wrapped in braces for ElementTree path expressions
# (htmlNS is the bare namespace URI)
atomNS = "{http://www.w3.org/2005/Atom}"
existNS = "{http://exist.sourceforge.net/NS/exist}"
molesNS = "{http://ndg.nerc.ac.uk/schema/moles2beta}"
htmlNS = "http://www.w3.org/1999/xhtml"

# strptime format used by stringToTimestamp
time_format = '%Y-%m-%dT%H:%M:%SZ'

# Host and port used by _getDocument to reach the server
ihost = 'bora.badc.rl.ac.uk'
iport = '8080'

# MD_Identifier codes
MD_CODE_MOLES2_CITATION = 'MOLES2_CITATION'
53
# Module-level html5lib parser producing etree elements without the XHTML
# namespace on element names; used by extractContent
htmlParser = html5lib.HTMLParser(
    tree=treebuilders.getTreeBuilder("etree"),
    namespaceHTMLElements=False
)
55
def calculateHash(text):
    """
        Returns an md5 hexadecimal representation of the given text
        @param text: the string to encode. Byte strings are hashed unchanged,
                     text (unicode) strings are encoded as UTF-8 first
        @return: the hexadecimal md5 value of the given text
    """
    if not isinstance(text, (bytes, bytearray)):
        # md5 only digests bytes: encode explicitly so that non-ASCII text
        # does not fail on an implicit conversion
        text = text.encode('utf-8')
    return md5(text).hexdigest()
65
def buildExistDocPath(docStatus, docType, docOwner, docName):
    '''
        Builds the path of a single document
        @param docStatus: one value from commons.docStatus
        @param docType: one value from commons.docTypes
        @param docOwner: one value from commons.docOwners
        @param docName: the name of the document inside the owner collection
        @return: the owner path followed by '/' and docName
    '''
    ownerPath = buildExistOwnerPath(docStatus, docType, docOwner)
    return '%s/%s' % (ownerPath, docName)
74
def buildExistOwnerPath(docStatus, docType, docOwner):
    '''
        Builds the path of an owner collection
        @param docStatus: one value from commons.docStatus
        @param docType: one value from commons.docTypes
        @param docOwner: one value from commons.docOwners
        @return: the type path followed by '/' and docOwner
    '''
    typePath = buildExistTypePath(docStatus, docType)
    return '%s/%s' % (typePath, docOwner)
82
def buildExistTypePath(docStatus, docType):
    '''
        Builds the path of a document type collection
        @param docStatus: one value from commons.docStatus
        @param docType: one value from commons.docTypes
        @return: the status path followed by '/' and docType
    '''
    statusPath = buildExistStatusPath(docStatus)
    return '%s/%s' % (statusPath, docType)
89
def buildExistStatusPath(docStatus):
    '''
        Builds the path of a document status collection
        @param docStatus: one value from commons.docStatus
        @return: '/exist/rest/atoms/' followed by docStatus
    '''
    statusPath = '/exist/rest/atoms/%s' % (docStatus)
    return statusPath
95
def getAtomDocumentByMO(migrationObject):
    '''
        Retrieves the atom document referenced by a migration object
        @param migrationObject: an instance whose class is named
               'DeploymentsMigration', 'DataEntityMigration' or 'DeploymentDataMigration'
               and which exposes doc_status, doc_owner and doc_name
        @return: the result of getAtomDocumentAsElementtree for that document
        @raise KeyError: if the class name is not one of the three above
    '''
    docTypeByClassName = {
        'DeploymentsMigration': DT_DEPLOYMENTS,
        'DataEntityMigration': DT_DATA_ENTITIES,
        'DeploymentDataMigration': DT_DEPLOYMENT_DATA,
    }
    return getAtomDocumentAsElementtree(
        migrationObject.doc_status,
        docTypeByClassName[type(migrationObject).__name__],
        migrationObject.doc_owner,
        migrationObject.doc_name)
99
def getAtomDocumentHashByMO(migrationObject):
    '''
        Calculates the hash of the atom document referenced by a migration object
        @param migrationObject: an instance whose class is named
               'DeploymentsMigration', 'DataEntityMigration' or 'DeploymentDataMigration'
               and which exposes doc_status, doc_owner and doc_name
        @return: the value returned by calculateHash for the document text
        @raise KeyError: if the class name is not one of the three above
    '''
    docTypeByClassName = {
        'DeploymentsMigration': DT_DEPLOYMENTS,
        'DataEntityMigration': DT_DATA_ENTITIES,
        'DeploymentDataMigration': DT_DEPLOYMENT_DATA,
    }
    documentText = _getAtomDocumentAsText(
        migrationObject.doc_status,
        docTypeByClassName[type(migrationObject).__name__],
        migrationObject.doc_owner,
        migrationObject.doc_name)
    return calculateHash(documentText)
104
def getAtomDocumentHash(docStatus, docType, docOwner, docName):
    '''
        Calculates the hash of the document identified by the given coordinates
        @param docStatus: one value from commons.docStatus
        @param docType: one value from commons.docTypes
        @param docOwner: one value from commons.docOwners
        @param docName: the name of the document
        @return: the value returned by calculateHash for the document text
    '''
    documentPath = buildExistDocPath(docStatus, docType, docOwner, docName)
    return calculateHash(_getDocument(documentPath))
109
def hasAtomDocumentSameHash(migrationObject):
    '''
        Checks whether the stored hash still matches the remote document
        @param migrationObject: a migration object exposing doc_hash
        @return: True if getAtomDocumentHashByMO(migrationObject) equals
                 migrationObject.doc_hash, False otherwise
    '''
    currentHash = getAtomDocumentHashByMO(migrationObject)
    return currentHash == migrationObject.doc_hash
112
def getAtomDocumentByType(migrationObject, docType):
    '''
        Retrieves the atom document of a migration object using an explicit type
        @param migrationObject: an object exposing doc_status, doc_owner and doc_name
        @param docType: one value from commons.docTypes
        @return: the result of getAtomDocumentAsElementtree for that document
    '''
    return getAtomDocumentAsElementtree(
        migrationObject.doc_status,
        docType,
        migrationObject.doc_owner,
        migrationObject.doc_name)
115
def _getAtomDocumentAsText(docStatus, docType, docOwner, docName):
    '''
        Retrieves the raw text of the document identified by the given coordinates
        @param docStatus: one value from commons.docStatus
        @param docType: one value from commons.docTypes
        @param docOwner: one value from commons.docOwners
        @param docName: the name of the document
        @return: the value returned by _getDocument for the document path
    '''
    return _getDocument(buildExistDocPath(docStatus, docType, docOwner, docName))
119
def getAtomDocumentAsElementtree(docStatus, docType, docOwner, docName):
    '''
        Retrieves and parses the document identified by the given coordinates
        @param docStatus: one value from commons.docStatus
        @param docType: one value from commons.docTypes
        @param docOwner: one value from commons.docOwners
        @param docName: the name of the document
        @return: the value returned by _getXMLDocument for the document path
    '''
    return _getXMLDocument(buildExistDocPath(docStatus, docType, docOwner, docName))
123
def _getXMLDocument(source):
    '''
        Retrieves a document and parses it as XML
        @param source: the path handed to _getDocument
        @return: the element returned by ElementTree's XML() for the document text
    '''
    documentText = _getDocument(source)
    return XML(documentText)
126
def stringToTimestamp(timestring):
    '''
        Converts a formatted time string into a naive datetime instance
        @param timestring: a string formatted according to commons.time_format
        @return: a naive datetime.datetime holding the fields parsed from the string
        @raise ValueError: if timestring does not match commons.time_format
    '''
    # Parse directly into a datetime. Going through time.mktime() and
    # datetime.fromtimestamp() reinterprets the parsed fields as local time,
    # which can shift the result around daylight-saving transitions and can
    # fail for dates the platform's mktime() cannot represent.
    return datetime.datetime.strptime(timestring, time_format)
129
def _getDocument(source):
    '''
        Retrieves a document with an HTTP GET from the server at commons.ihost:commons.iport
        @param source: the path of the resource to request
        @return: the body of the HTTP response
    '''
    conn = HTTPConnection(host = ihost, port = iport)
    try:
        conn.connect()
        conn.request('GET', source)
        # NOTE(review): the response status is not checked, so the body of an
        # error response is returned to the caller like any other document
        return conn.getresponse().read()
    finally:
        # always release the connection, even if the request or the read fails
        conn.close()
141
142def _returnNotNoneText(element):
143    if element is None:
144        return None
145    return element.text
146
def findMolesCreationDate(resourceXML):
    '''
        Looks up the entity/molesISO/created element (moles namespace)
        @param resourceXML: the parsed document to search
        @return: the text of the element, or None if it is not found
    '''
    createdPath = '%sentity/%smolesISO/%screated' % (molesNS, molesNS, molesNS)
    return _returnNotNoneText(resourceXML.find(createdPath))
150
def findMolesLineage(dataEntityMigration):
    '''
        Looks up the entity/molesISO/lineage element (moles namespace) in the
        document of the given migration object
        @param dataEntityMigration: the migration object handed to getAtomDocumentByMO
        @return: the text of the lineage element
        @raise NoDataLineage: if the document has no lineage element
    '''
    lineagePath = '%sentity/%smolesISO/%slineage' % (molesNS, molesNS, molesNS)
    lineage = getAtomDocumentByMO(dataEntityMigration).find(lineagePath)
    if lineage is not None:
        return lineage.text
    raise NoDataLineage(dataEntityMigration)
157
def extractQuality(dataEntityMigration):
    '''
        Looks up the entity/molesISO/quality element (moles namespace) in the
        document of the given migration object
        @param dataEntityMigration: the migration object handed to getAtomDocumentByMO
        @return: the text of the quality element, or None if it is not found
    '''
    qualityPath = '%sentity/%smolesISO/%squality' % (molesNS, molesNS, molesNS)
    return _returnNotNoneText(getAtomDocumentByMO(dataEntityMigration).find(qualityPath))
162
def extractContent(dataEntityMigration):
    """
        Returns a dictionary containing the div composing the
        <content> element in a dataentity document.
        @param dataEntityMigration: the migration object handed to getAtomDocumentByMO
        @return: a dictionary mapping the part of the 'property' attribute
                 following the first ':' to the escaped serialization of the div.
                 Only div whose property starts with 'cedacat' are included; the
                 dictionary is empty if the document has no <content> text
    """
    resourceXML = getAtomDocumentByMO(dataEntityMigration)
    text = _returnNotNoneText(resourceXML.find('%scontent' % (atomNS)))
    contentDict = {}
    if text is None:
        # a missing or empty <content> element has nothing to parse
        return contentDict
    doc = htmlParser.parse(unescape(text))
    for el in doc.findall('body//div'):
        prop = el.get('property')
        if not prop or not prop.startswith('cedacat'):
            continue
        propParts = prop.split(':')
        if len(propParts) < 2:
            # no ':' separator, hence no key to store the div under
            continue
        contentDict[propParts[1]] = escape(tostring(el))
    return contentDict
179
180def _extractAuthors(authorsCSV):
181    if authorsCSV is None:
182        return []
183    authors = authorsCSV.split(',')
184    for index in range(len(authors)):
185        authors[index] = authors[index].strip()
186        if len(authors[index]) == 0:
187            authors.remove(authors[index])
188    return authors
189
def findAuthorsInResource(resourceMigration):
    '''
        Returns a dictionary with the following keys:
        'authors': a list of string representing the authors
        'contributors': a list of string representing the contributors
        @param resourceMigration: the migration object handed to getAtomDocumentByMO
    '''
    resourceXML = getAtomDocumentByMO(resourceMigration)
    authors = _extractAuthors(findAuthorInResource(resourceXML))
    contributors = _extractAuthors(findContributorInResource(resourceXML))
    return {'authors': authors, 'contributors': contributors}
201
def findAuthorInResource(resourceXML):
    '''
        Looks up the first author/name element (atom namespace)
        @param resourceXML: the parsed document to search
        @return: the text of the element, or None if it is not found
    '''
    authorNamePath = '%sauthor/%sname' % (atomNS, atomNS)
    return _returnNotNoneText(resourceXML.find(authorNamePath))
205
def findContributorInResource(resourceXML):
    '''
        Looks up the first contributor/name element (atom namespace)
        @param resourceXML: the parsed document to search
        @return: the text of the element, or None if it is not found
    '''
    contributorNamePath = '%scontributor/%sname' % (atomNS, atomNS)
    return _returnNotNoneText(resourceXML.find(contributorNamePath))
209
def findID(dataEntXML):
    '''
        Looks up the id element (atom namespace)
        @param dataEntXML: the parsed document to search
        @return: the text of the element, or None if it is not found
    '''
    return _returnNotNoneText(dataEntXML.find('%sid' % (atomNS)))
213
def extractLinks(dataEntXML, markers):
    '''
        Groups the href of the link elements (atom namespace) by marker
        @param dataEntXML: the parsed document to search
        @param markers: the markers each link is checked against with _updateLinksDict
        @return: the dictionary filled by _updateLinksDict; markers without a
                 matching link are not present as keys
    '''
    linksDict = {}
    for link in dataEntXML.findall('%slink' % (atomNS)):
        for marker in markers:
            _updateLinksDict(linksDict, link, marker)
    return linksDict
221
def _extractDataEntityLinks(dataEntXML):
    '''
        Groups the links of a document by the markers
        'Deployment', 'DOWNLOAD', 'DOCUMENTATION', 'ACCESS' and 'LOGO'
        @param dataEntXML: the parsed document to search
        @return: the dictionary returned by extractLinks for those markers
    '''
    return extractLinks(dataEntXML, ['Deployment', 'DOWNLOAD', 'DOCUMENTATION', 'ACCESS', 'LOGO'])
225
226def _updateLinksDict(linksDict, link, linkMarker):
227    rel = link.get('rel')
228    if rel and rel.endswith('/' + linkMarker):
229        if not linksDict.has_key(linkMarker):
230            linksDict[linkMarker] = []
231        linksDict[linkMarker].append(link.get('href'))
232
def findDeploymentsInDE(dataEntityMigration):
    '''
        Lists the names extracted from the 'Deployment' links of a document
        @param dataEntityMigration: the migration object handed to getAtomDocumentByMO
        @return: a list of the names returned by _extractLinksByMarker for the
                 'Deployment' marker, each with '.atom' appended
    '''
    linksDict = _extractDataEntityLinks(getAtomDocumentByMO(dataEntityMigration))
    deploymentNames = []
    for depName in _extractLinksByMarker(linksDict, 'Deployment'):
        deploymentNames.append(depName + '.atom')
    return deploymentNames
238
def findSubTypeInDPT(resourceMigration):
    '''
        Looks up the label of the first category element (atom namespace)
        whose 'term' attribute is "ATOM_SUBTYPE"
        @param resourceMigration: the migration object handed to getAtomDocumentByMO
        @return: the 'label' attribute of that category, or None if no category matches
    '''
    resourceXML = getAtomDocumentByMO(resourceMigration)
    for category in resourceXML.findall('%scategory' % (atomNS)):
        if category.get("term") != "ATOM_SUBTYPE":
            continue
        return category.get("label")
    return None
245       
def extractAuthor(deploymentMigration):
    '''
        Looks up the title element (atom namespace) in the document of the
        given migration object
        NOTE(review): despite its name this function reads the atom title,
        not an author - confirm against the callers
        @param deploymentMigration: the migration object handed to getAtomDocumentByMO
        @return: the text of the title element, or None if it is not found
    '''
    titleElement = getAtomDocumentByMO(deploymentMigration).find('%stitle' % (atomNS))
    return _returnNotNoneText(titleElement)
250
def extractSummary(deploymentMigration, dataEntityMigration):
    '''
        Returns the summary (atom namespace) of the deployment document, falling
        back on the data entity document when the former is missing or empty
        @param deploymentMigration: the migration object tried first
        @param dataEntityMigration: the migration object used as fallback; its
               document is fetched only when needed
        @return: the summary text; the fallback value is returned as it is,
                 even if None or empty
    '''
    summaryPath = '%ssummary' % (atomNS)
    deploymentXML = getAtomDocumentByMO(deploymentMigration)
    deploymentSummary = _returnNotNoneText(deploymentXML.find(summaryPath))
    if deploymentSummary:
        return deploymentSummary
    dataEntityXML = getAtomDocumentByMO(dataEntityMigration)
    return _returnNotNoneText(dataEntityXML.find(summaryPath))
260
def findLinksInDeployment(deploymentMigration):
    """
        Returns a dictionary of links owned by the given deployment document
        @param deploymentMigration: the migration object handed to getAtomDocumentByMO
        @return: a dictionary of links. The keys are always ['ACTIVITY', 'DPT', 'OBS'],
                 each mapped to the list returned by _extractLinksByMarker
    """
    markers = ['ACTIVITY', 'DPT', 'OBS']
    linksDict = extractLinks(getAtomDocumentByMO(deploymentMigration), markers)
    return dict((marker, _extractLinksByMarker(linksDict, marker)) for marker in markers)
274
275def _extractLinksByMarker(linksDict, marker):
276    dpt = []
277    if linksDict.has_key(marker):
278        for link in linksDict[marker]:
279            try:
280                linkLongName = link.split('/')[-1]
281                linkName = linkLongName.rsplit('__ATOM__')[1]
282                dpt.append(linkName)
283            except Exception as ex:
284                print ex
285    return dpt
286
287
def getResourceRefs(deploymentRefs):
    '''
        Returns a list of Elements representing the inner resource reference items
        @param deploymentRefs: the path of the collection document to retrieve
        @return: the collection/resource elements (exist namespace) of that document
    '''
    resourcePath = '%scollection/%sresource' % (existNS, existNS)
    return _getXMLDocument(deploymentRefs).findall(resourcePath)
295
def getOwnerRefs(docStatus, docType, docOwner):
    '''
        Returns a list of Elements representing the resource reference items
        of an owner collection
        @param docStatus: one value from commons.docStatus
        @param docType: one value from commons.docTypes
        @param docOwner: one value from commons.docOwners
        @return: the collection/resource elements (exist namespace) of the
                 document found at the owner path
    '''
    ownerPath = buildExistOwnerPath(docStatus, docType, docOwner)
    resourcePath = '%scollection/%sresource' % (existNS, existNS)
    return _getXMLDocument(ownerPath).findall(resourcePath)
303
def getTypeRefs(docStatus, docType):
    '''
        Returns a list of Elements representing the resource reference items
        of a document type collection
        @param docStatus: one value from commons.docStatus
        @param docType: one value from commons.docTypes
        @return: the collection/resource elements (exist namespace) of the
                 document found at the type path
    '''
    typePath = buildExistTypePath(docStatus, docType)
    resourcePath = '%scollection/%sresource' % (existNS, existNS)
    return _getXMLDocument(typePath).findall(resourcePath)
311
312
313
def getCollectionRefs(publishedRefs):
    '''
        Returns a list of Elements representing the inner collection reference items
        @param publishedRefs: the path of the collection document to retrieve
        @return: the collection/collection elements (exist namespace) of that document
    '''
    collectionPath = '%scollection/%scollection' % (existNS, existNS)
    return _getXMLDocument(publishedRefs).findall(collectionPath)
321
def getResource(source, resourceName):
    '''
        Retrieves and parses a resource located below a given path
        @param source: the path containing the resource
        @param resourceName: the name of the resource, appended to source after a '/'
        @return: the element returned by ElementTree's XML() for the resource text
    '''
    return XML(_getDocument('%s/%s' % (source, resourceName)))
326
def createMO_ResponsiblePartyInfo(role, name):
    '''
        Creates a MO_ResponsiblePartyInfo whose parties are CI_Individual instances
        @param role: a CI_RoleCode/MO_RoleValue assigned to this ResponsibleParty
        @param name: forwarded as the 'names' argument of
               createMO_ResponsiblePartyInfoAsCI_Individual
        @return: the value returned by createMO_ResponsiblePartyInfoAsCI_Individual
    '''
    responsibleParty = createMO_ResponsiblePartyInfoAsCI_Individual(role, name)
    return responsibleParty
334
def createMO_ResponsiblePartyInfoAsCI_Organization(role, names):
    '''
        Creates a MO_ResponsiblePartyInfo whose parties are CI_Organisation instances
        @param role: a CI_RoleCode/MO_RoleValue assigned to this ResponsibleParty
        @param names: the name assigned to each CI_Party
        @return: the value returned by _createMO_ResponsiblePartyInfo
    '''
    return _createMO_ResponsiblePartyInfo(role, names, partyType = CI_Organisation)
341
def createMO_ResponsiblePartyInfoAsCI_Individual(role, names):
    '''
        Creates a MO_ResponsiblePartyInfo whose parties are CI_Individual instances
        @param role: a CI_RoleCode/MO_RoleValue assigned to this ResponsibleParty
        @param names: the name assigned to each CI_Party
        @return: the value returned by _createMO_ResponsiblePartyInfo
    '''
    return _createMO_ResponsiblePartyInfo(role, names, partyType = CI_Individual)
348
def _createMO_ResponsiblePartyInfo(role, names, partyType = CI_Party):
    """
        Creates a MO_ResponsiblePartyInfo with one party for each given name
        @param role: a CI_RoleCode/MO_RoleValue assigned to this ResponsibleParty
        @param names: the name assigned to each CI_Party; a single string is
                      treated as one name
        @param partyType: the CI_Party implementation to use
        @return: a MO_ResponsiblePartyInfo whose 'role' is the given role and whose
                 'party' is the list of the created partyType instances
    """
    if isinstance(names, (str, type(u''))):
        # iterating a bare string would create one party for each character
        names = [names]
    mo_responsableInfo = MO_ResponsiblePartyInfo()
    mo_responsableInfo.role = role
    parties = []
    for name in names:
        ci_party = partyType()
        ci_party.name = name
        parties.append(ci_party)
    mo_responsableInfo.party = parties
    return mo_responsableInfo
365
def createCI_Citation(title = ""):
    '''
        Creates a CI_Citation instance
        @param title: the value assigned to the 'title' attribute of the citation
        @return: the new CI_Citation
    '''
    citation = CI_Citation()
    citation.title = title
    return citation
370
def createMD_Identifier(title = "", code = ""):
    '''
        Creates a MD_Identifier instance
        @param title: the title of the CI_Citation assigned to the 'authority' attribute
        @param code: the value assigned to the 'code' attribute
        @return: the new MD_Identifier
    '''
    identifier = MD_Identifier()
    identifier.code = code
    authority = createCI_Citation(title)
    identifier.authority = authority
    return identifier
376
def createDQ_ConformanceResult(explaination = ""):
    '''
        Creates a DQ_ConformanceResult instance
        @param explaination: the value assigned to the 'explanation' attribute
        @return: the new DQ_ConformanceResult
    '''
    conformanceResult = DQ_ConformanceResult()
    conformanceResult.explanation = explaination
    return conformanceResult
381
def createDQ_Element(explaination = ""):
    '''
        Creates a DQ_Element holding a single DQ_ConformanceResult
        @param explaination: the explanation handed to createDQ_ConformanceResult
        @return: the new DQ_Element; its 'result' attribute is set to a new list
                 to which the conformance result is appended
    '''
    element = DQ_Element()
    element.result = []
    conformanceResult = createDQ_ConformanceResult(explaination)
    element.result.append(conformanceResult)
    return element
388
389
390
391
392
393
394
Note: See TracBrowser for help on using the repository browser.