source: mauRepo/MolesManager/trunk/src/libs/migration/processor/commons.py @ 8147

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/mauRepo/MolesManager/trunk/src/libs/migration/processor/commons.py@8147
Revision 8147, 14.3 KB checked in by mnagni, 8 years ago (diff)

Implementation of tickets #22390

Line 
1'''
2Created on 15 Nov 2011
3
4@author: mnagni
5'''
6from ea_model.moles3_4.utilities.mo_responsiblepartyinfo import MO_ResponsiblePartyInfo
7from ea_model.iso_19115_2006_metadata_corrigendum.reference_system_information.md_identifier import MD_Identifier
8from httplib import HTTPConnection
9from xml.etree.ElementTree import XML, ElementTree, fromstring, dump, tostring
10import time, datetime
11from ea_model.upcomingiso.ci_organisation import CI_Organisation
12from ea_model.upcomingiso.ci_party import CI_Party
13from ea_model.upcomingiso.ci_individual import CI_Individual
14from libs.migration.exception.exceptions import NoDataLineage,\
15    NoAssociatedAuthor
16from ea_model.iso_19115_2006_metadata_corrigendum.citation_and_responsible_party_information.ci_citation import CI_Citation
17from ea_model.iso_19115_2006_metadata_corrigendum.data_quality_information.dq_element import DQ_Element
18from ea_model.iso_19115_2006_metadata_corrigendum.data_quality_information.dq_conformanceresult import DQ_ConformanceResult
19from hashlib import md5
20from xml.sax.saxutils import unescape, escape
21import html5lib
22from html5lib import sanitizer, treebuilders
23from libs.migration.authors import authors
24
# Root of the atoms collection on the eXist REST interface.
# NOTE(review): not referenced by the path builders visible in this module;
# buildExistStatusPath hard-codes the same literal.
base = '/exist/rest/atoms'

# Document status values. Two spellings of "published" are kept side by side:
# the lower-case one (DS_pUBLISHED) and the capitalised one (DS_PUBLISHED).
DS_pUBLISHED = 'published'
DS_WORKING = 'working'
DS_PUBLISHED = 'Published'
docStatus = (DS_pUBLISHED, DS_WORKING, DS_PUBLISHED)

# Document type values, used as path segments by buildExistTypePath
DT_DEPLOYMENTS = 'deployments'
DT_DATA_ENTITIES = 'data_entities'
DT_DEPLOYMENT_DATA = 'deployment_data'
DT_DATA_GRANULES = 'data_granules'
docTypes = (DT_DEPLOYMENTS, DT_DATA_ENTITIES, DT_DEPLOYMENT_DATA, DT_DATA_GRANULES)

# Document owner values, used as path segments by buildExistOwnerPath
DO_BADC = 'badc.nerc.ac.uk'
DO_NEODC = 'neodc.nerc.ac.uk'
DO_UKSSDC = 'ukssdc.ac.uk'

CEDA = 'Centre for Environmental Data Archive'
docOwners = (DO_BADC, DO_NEODC, DO_UKSSDC)

# XML namespaces in ElementTree "{uri}" notation, ready to be prefixed to tag names
atomNS = "{http://www.w3.org/2005/Atom}"
existNS = "{http://exist.sourceforge.net/NS/exist}"
molesNS = "{http://ndg.nerc.ac.uk/schema/moles2beta}"
# Unlike the three namespaces above this one is a bare URI, without braces
htmlNS = "http://www.w3.org/1999/xhtml"
# Format used by stringToTimestamp to parse timestamps
time_format = '%Y-%m-%dT%H:%M:%SZ'
# Host and port of the server queried by _getDocument (the port is kept as a string)
ihost = 'bora.badc.rl.ac.uk'
iport = '8080'

#MD_Identifier codes
MD_CODE_MOLES2_CITATION = 'MOLES2_CITATION'

# Shared html5lib parser building an etree without namespaced HTML elements;
# used by extractContent
htmlParser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("etree"), namespaceHTMLElements=False)
57
def calculateHash(text):
    """
        Returns an md5 hexadecimal representation of the given text
        @param text: the string to encode. Byte strings are hashed as they are,
                unicode strings are encoded as UTF-8 before being hashed
        @return: the hexadecimal md5 value of the given text
    """
    # md5 only digests bytes: encode text explicitly rather than relying on an
    # implicit conversion which fails on non-ASCII characters.
    if not isinstance(text, bytes):
        text = text.encode('utf-8')
    return md5(text).hexdigest()
67
def buildExistDocPath(docStatus, docType, docOwner, docName):
    '''
        Builds the path of a single document by appending its name to the
        path built by buildExistOwnerPath.
        @param docStatus: one value from commons.docStatus
        @param docType: one value from commons.docTypes
        @param docOwner: one value from commons.docOwners
        @param docName: the name of the document, used as last path segment
        @return: a string formatted as '<owner path>/<docName>'
    '''
    ownerPath = buildExistOwnerPath(docStatus, docType, docOwner)
    return '%s/%s' % (ownerPath, docName)
76
def buildExistOwnerPath(docStatus, docType, docOwner):
    '''
        Builds the path of an owner collection by appending the owner to the
        path built by buildExistTypePath.
        @param docStatus: one value from commons.docStatus
        @param docType: one value from commons.docTypes
        @param docOwner: one value from commons.docOwners
        @return: a string formatted as '<type path>/<docOwner>'
    '''
    typePath = buildExistTypePath(docStatus, docType)
    return '%s/%s' % (typePath, docOwner)
84
def buildExistTypePath(docStatus, docType):
    '''
        Builds the path of a document type collection by appending the type
        to the path built by buildExistStatusPath.
        @param docStatus: one value from commons.docStatus
        @param docType: one value from commons.docTypes
        @return: a string formatted as '<status path>/<docType>'
    '''
    statusPath = buildExistStatusPath(docStatus)
    return '%s/%s' % (statusPath, docType)
91
def buildExistStatusPath(docStatus):
    '''
        Builds the path of a status collection below '/exist/rest/atoms'.
        @param docStatus: one value from commons.docStatus
        @return: a string formatted as '/exist/rest/atoms/<docStatus>'
    '''
    statusPath = '/exist/rest/atoms/%s' % (docStatus)
    return statusPath
97
def getAtomDocumentByMO(migrationObject):
    '''
        Retrieves the atom document described by a migration object. The
        document type is selected by the class name of the given object.
        @param migrationObject: an object whose class is named DeploymentsMigration,
                DataEntityMigration or DeploymentDataMigration and which exposes
                doc_status, doc_owner and doc_name
        @return: whatever getAtomDocumentAsElementtree returns for that document
        @raise KeyError: if the class name is not one of the three above
    '''
    typeByClassName = {
        'DeploymentsMigration': DT_DEPLOYMENTS,
        'DataEntityMigration': DT_DATA_ENTITIES,
        'DeploymentDataMigration': DT_DEPLOYMENT_DATA,
    }
    className = type(migrationObject).__name__
    return getAtomDocumentAsElementtree(
        migrationObject.doc_status,
        typeByClassName[className],
        migrationObject.doc_owner,
        migrationObject.doc_name)
101
def getAtomDocumentHashByMO(migrationObject):
    '''
        Calculates the hash of the atom document described by a migration
        object. The document type is selected by the class name of the object.
        @param migrationObject: an object whose class is named DeploymentsMigration,
                DataEntityMigration or DeploymentDataMigration and which exposes
                doc_status, doc_owner and doc_name
        @return: the value returned by calculateHash for the document text
        @raise KeyError: if the class name is not one of the three above
    '''
    typeByClassName = {
        'DeploymentsMigration': DT_DEPLOYMENTS,
        'DataEntityMigration': DT_DATA_ENTITIES,
        'DeploymentDataMigration': DT_DEPLOYMENT_DATA,
    }
    className = type(migrationObject).__name__
    documentText = _getAtomDocumentAsText(
        migrationObject.doc_status,
        typeByClassName[className],
        migrationObject.doc_owner,
        migrationObject.doc_name)
    return calculateHash(documentText)
106
def getAtomDocumentHash(docStatus, docType, docOwner, docName):
    '''
        Calculates the hash of the document identified by the given coordinates.
        @param docStatus: one value from commons.docStatus
        @param docType: one value from commons.docTypes
        @param docOwner: one value from commons.docOwners
        @param docName: the name of the document
        @return: the value returned by calculateHash for the document text
    '''
    docPath = buildExistDocPath(docStatus, docType, docOwner, docName)
    return calculateHash(_getDocument(docPath))
111
def hasAtomDocumentSameHash(migrationObject):
    '''
        Tells whether the hash calculated now for the document described by
        the migration object equals the one stored in its doc_hash attribute.
        @param migrationObject: the migration object to check
        @return: True if the two hashes are equal, False otherwise
    '''
    currentHash = getAtomDocumentHashByMO(migrationObject)
    return currentHash == migrationObject.doc_hash
114
def getAtomDocumentByType(migrationObject, docType):
    '''
        Retrieves an atom document using status, owner and name of the given
        migration object but an explicitly given document type.
        @param migrationObject: an object exposing doc_status, doc_owner and doc_name
        @param docType: one value from commons.docTypes
        @return: whatever getAtomDocumentAsElementtree returns for that document
    '''
    return getAtomDocumentAsElementtree(
        migrationObject.doc_status,
        docType,
        migrationObject.doc_owner,
        migrationObject.doc_name)
117
def _getAtomDocumentAsText(docStatus, docType, docOwner, docName):
    '''
        Retrieves the raw text of the document identified by the given coordinates.
        @param docStatus: one value from commons.docStatus
        @param docType: one value from commons.docTypes
        @param docOwner: one value from commons.docOwners
        @param docName: the name of the document
        @return: the document as returned by _getDocument
    '''
    return _getDocument(buildExistDocPath(docStatus, docType, docOwner, docName))
121
def getAtomDocumentAsElementtree(docStatus, docType, docOwner, docName):
    '''
        Retrieves, parsed as XML, the document identified by the given coordinates.
        @param docStatus: one value from commons.docStatus
        @param docType: one value from commons.docTypes
        @param docOwner: one value from commons.docOwners
        @param docName: the name of the document
        @return: the parsed document as returned by _getXMLDocument
    '''
    return _getXMLDocument(buildExistDocPath(docStatus, docType, docOwner, docName))
125
def _getXMLDocument(source):
    '''
        Retrieves a document and parses it as XML.
        @param source: the path of the document, as accepted by _getDocument
        @return: the root element produced by xml.etree.ElementTree.XML
    '''
    rawDocument = _getDocument(source)
    return XML(rawDocument)
128
def stringToTimestamp(timestring):
    '''
        Converts a timestamp string into a datetime instance.
        @param timestring: a string formatted according to commons.time_format
        @return: a naive datetime.datetime carrying exactly the parsed fields
        @raise ValueError: if timestring does not match commons.time_format
    '''
    # Parse straight into a datetime. The previous strptime -> mktime ->
    # fromtimestamp round trip went through the local timezone: the result
    # could be shifted for wall-clock times skipped by a DST change and mktime
    # could fail on dates the platform cannot represent.
    return datetime.datetime.strptime(timestring, time_format)
131
def _getDocument(source):
    '''
        Retrieves with an HTTP GET the body of a resource from the server
        identified by commons.ihost and commons.iport.
        @param source: the path of the resource on the server
        @return: the body of the HTTP response
    '''
    conn = HTTPConnection(host = ihost, port = iport)
    try:
        conn.connect()
        conn.request('GET', source)
        # NOTE(review): the HTTP status is not checked, so the body of an error
        # response is handed to the caller as if it were the document.
        return conn.getresponse().read()
    finally:
        # Release the socket even when the request or the read fails
        conn.close()
143
144def _returnNotNoneText(element):
145    if element is None:
146        return None
147    return element.text
148
def findMolesCreationDate(resourceXML):
    '''
        Extracts the text of the moles 'entity/molesISO/created' element.
        @param resourceXML: the element to search in
        @return: the text of the first matching element, None if there is none
    '''
    createdPath = '%sentity/%smolesISO/%screated' % ((molesNS,) * 3)
    return _returnNotNoneText(resourceXML.find(createdPath))
152
def findMolesLineage(dataEntityMigration):
    '''
        Extracts the text of the moles 'entity/molesISO/lineage' element from
        the document described by the given migration object.
        @param dataEntityMigration: the migration object describing the document
        @return: the text of the first matching element
        @raise NoDataLineage: if the document has no such element
    '''
    lineagePath = '%sentity/%smolesISO/%slineage' % ((molesNS,) * 3)
    lineageElement = getAtomDocumentByMO(dataEntityMigration).find(lineagePath)
    if lineageElement is not None:
        return lineageElement.text
    raise NoDataLineage(dataEntityMigration)
159
def extractQuality(dataEntityMigration):
    '''
        Extracts the text of the moles 'entity/molesISO/quality' element from
        the document described by the given migration object.
        @param dataEntityMigration: the migration object describing the document
        @return: the text of the first matching element, None if there is none
    '''
    qualityPath = '%sentity/%smolesISO/%squality' % ((molesNS,) * 3)
    return _returnNotNoneText(getAtomDocumentByMO(dataEntityMigration).find(qualityPath))
164
def extractContent(dataEntityMigration):
    """
        Returns a dictionary containing the div composing the
        <content> element in a dataentity document.
        @param dataEntityMigration: the migration object describing the document
        @return: a dictionary mapping the second ':'-separated token of the
                'property' attribute of each div whose property starts with
                'cedacat' to the escaped serialization of that div. The
                dictionary is empty if the document has no (or an empty)
                <content> element
    """
    resourceXML = getAtomDocumentByMO(dataEntityMigration)
    text = _returnNotNoneText(resourceXML.find('%scontent' % (atomNS)))
    contentDict = {}
    if not text:
        # No <content> element, or no text in it: unescape() would fail on None
        return contentDict
    # The content is stored as escaped HTML: unescape before parsing
    doc = htmlParser.parse(unescape(text))
    for el in doc.findall('body//div'):
        prop = el.get('property')
        if not prop or not prop.startswith('cedacat'):
            continue
        parts = prop.split(':')
        if len(parts) < 2:
            # A 'cedacat' property without a ':' carries no key to store under
            continue
        contentDict[parts[1]] = escape(tostring(el))
    return contentDict
181
182def _extractAuthors(authorsCSV):
183    if authorsCSV is None:
184        return []
185    authors = authorsCSV.split(',')
186    for index in range(len(authors)):
187        authors[index] = authors[index].strip()
188        if len(authors[index]) == 0:
189            authors.remove(authors[index])
190    return authors
191
def findAuthorsInResource(resourceMigration):
    '''
        Returns a dictionary with the following keys:
        'authors': the value returned by findAuthorInResource
        'contributors': the value returned by findContributorInResource
        Both values are the text of the first matching name element or None.
        @param resourceMigration: the migration object describing the document
    '''
    resourceXML = getAtomDocumentByMO(resourceMigration)
    return {
        'authors': findAuthorInResource(resourceXML),
        'contributors': findContributorInResource(resourceXML),
    }
203
def findAuthorInResource(resourceXML):
    '''
        Extracts the text of the atom 'author/name' element.
        @param resourceXML: the element to search in
        @return: the text of the first matching element, None if there is none
    '''
    authorPath = '%sauthor/%sname' % (atomNS, atomNS)
    return _returnNotNoneText(resourceXML.find(authorPath))
207
def findContributorInResource(resourceXML):
    '''
        Extracts the text of the atom 'contributor/name' element.
        @param resourceXML: the element to search in
        @return: the text of the first matching element, None if there is none
    '''
    contributorPath = '%scontributor/%sname' % (atomNS, atomNS)
    return _returnNotNoneText(resourceXML.find(contributorPath))
211
def findID(dataEntXML):
    '''
        Extracts the text of the atom 'id' element.
        @param dataEntXML: the element to search in
        @return: the text of the first matching element, None if there is none
    '''
    idPath = '%sid' % (atomNS)
    return _returnNotNoneText(dataEntXML.find(idPath))
215
def extractLinks(dataEntXML, markers):
    '''
        Collects the atom 'link' elements of a document, grouped by marker.
        Every link is offered to _updateLinksDict once for each marker.
        @param dataEntXML: the element to search in
        @param markers: the markers to group the links by
        @return: the dictionary filled by _updateLinksDict
    '''
    collected = {}
    for atomLink in dataEntXML.findall('%slink' % (atomNS)):
        for linkMarker in markers:
            _updateLinksDict(collected, atomLink, linkMarker)
    return collected
223
def _extractDataEntityLinks(dataEntXML):
    '''
        Collects the links of a document for the markers 'Deployment',
        'DOWNLOAD', 'DOCUMENTATION', 'ACCESS' and 'LOGO'.
        @param dataEntXML: the element to search in
        @return: the dictionary returned by extractLinks for those markers
    '''
    return extractLinks(dataEntXML, ['Deployment', 'DOWNLOAD', 'DOCUMENTATION', 'ACCESS', 'LOGO'])
227
228def _updateLinksDict(linksDict, link, linkMarker):
229    rel = link.get('rel')
230    if rel and rel.endswith('/' + linkMarker):
231        if not linksDict.has_key(linkMarker):
232            linksDict[linkMarker] = []
233        linksDict[linkMarker].append(link.get('href'))
234
def findDeploymentsInDE(dataEntityMigration):
    '''
        Lists the documents referenced by the 'Deployment' links of the
        document described by the given migration object.
        @param dataEntityMigration: the migration object describing the document
        @return: a list with the names extracted by _extractLinksByMarker, each
                followed by '.atom'
    '''
    dataEntXML = getAtomDocumentByMO(dataEntityMigration)
    deploymentNames = _extractLinksByMarker(_extractDataEntityLinks(dataEntXML), 'Deployment')
    documentNames = []
    for deploymentName in deploymentNames:
        documentNames.append(deploymentName + '.atom')
    return documentNames
240
def findSubTypeInDPT(resourceMigration):
    '''
        Looks in the document described by the given migration object for the
        first atom 'category' element whose 'term' is "ATOM_SUBTYPE".
        @param resourceMigration: the migration object describing the document
        @return: the 'label' attribute of that element, None if there is no
                such element
    '''
    resourceXML = getAtomDocumentByMO(resourceMigration)
    for atomCategory in resourceXML.findall('%scategory' % (atomNS)):
        if atomCategory.get("term") != "ATOM_SUBTYPE":
            continue
        return atomCategory.get("label")
    return None
247       
def extractTitle(deploymentMigration):
    '''
        Extracts the text of the atom 'title' element from the document
        described by the given migration object.
        @param deploymentMigration: the migration object describing the document
        @return: the text of the first matching element, None if there is none
    '''
    titlePath = '%stitle' % (atomNS)
    return _returnNotNoneText(getAtomDocumentByMO(deploymentMigration).find(titlePath))
252
def extractSummary(deploymentMigration, dataEntityMigration):
    '''
        Extracts the text of the atom 'summary' element. The deployment
        document is tried first; the data entity document is retrieved only
        if the deployment has no summary or an empty one.
        @param deploymentMigration: the migration object of the deployment
        @param dataEntityMigration: the migration object of the data entity
        @return: the summary text, None if the fallback has none either
    '''
    summaryPath = '%ssummary' % (atomNS)
    summaryText = _returnNotNoneText(getAtomDocumentByMO(deploymentMigration).find(summaryPath))
    if not summaryText:
        summaryText = _returnNotNoneText(getAtomDocumentByMO(dataEntityMigration).find(summaryPath))
    return summaryText
262
def findLinksInDeployment(deploymentMigration):
    """
        Returns a dictionary of links owned by the document described by the
        given migration object
        @param deploymentMigration: the migration object describing the document
        @return: a dictionary of links. The keys are always 'ACTIVITY', 'DPT'
                and 'OBS', each mapped to the list returned by
                _extractLinksByMarker
    """
    markers = ['ACTIVITY', 'DPT', 'OBS']
    linksDict = extractLinks(getAtomDocumentByMO(deploymentMigration), markers)
    return dict((marker, _extractLinksByMarker(linksDict, marker)) for marker in markers)
276
277def _extractLinksByMarker(linksDict, marker):
278    dpt = []
279    if linksDict.has_key(marker):
280        for link in linksDict[marker]:
281            try:
282                linkLongName = link.split('/')[-1]
283                linkName = linkLongName.rsplit('__ATOM__')[1]
284                dpt.append(linkName)
285            except Exception as ex:
286                print ex
287    return dpt
288
289
def getResourceRefs(deploymentRefs):
    '''
        Returns a list of Elements representing the inner resource reference items
        @param deploymentRefs: the path of the document to retrieve, as accepted
                by _getXMLDocument
        @return: the eXist 'collection/resource' elements found in that document
    '''
    resourcePath = '%scollection/%sresource' % (existNS, existNS)
    return _getXMLDocument(deploymentRefs).findall(resourcePath)
297
def getOwnerRefs(docStatus, docType, docOwner):
    '''
        Returns a list of Elements representing the resource reference items
        listed by the document at the path built by buildExistOwnerPath
        @param docStatus: one value from commons.docStatus
        @param docType: one value from commons.docTypes
        @param docOwner: one value from commons.docOwners
        @return: the eXist 'collection/resource' elements found in that document
    '''
    ownerPath = buildExistOwnerPath(docStatus, docType, docOwner)
    resourcePath = '%scollection/%sresource' % (existNS, existNS)
    return _getXMLDocument(ownerPath).findall(resourcePath)
305
def getTypeRefs(docStatus, docType):
    '''
        Returns a list of Elements representing the resource reference items
        listed by the document at the path built by buildExistTypePath
        @param docStatus: one value from commons.docStatus
        @param docType: one value from commons.docTypes
        @return: the eXist 'collection/resource' elements found in that document
    '''
    typePath = buildExistTypePath(docStatus, docType)
    resourcePath = '%scollection/%sresource' % (existNS, existNS)
    return _getXMLDocument(typePath).findall(resourcePath)
313
314
315
def getCollectionRefs(publishedRefs):
    '''
        Returns a list of Elements representing the inner collection reference items
        @param publishedRefs: the path of the document to retrieve, as accepted
                by _getXMLDocument
        @return: the eXist 'collection/collection' elements found in that document
    '''
    collectionPath = '%scollection/%scollection' % (existNS, existNS)
    return _getXMLDocument(publishedRefs).findall(collectionPath)
323
def getResource(source, resourceName):
    '''
        Retrieves, parsed as XML, a resource located below a given path.
        @param source: the path containing the resource
        @param resourceName: the name of the resource, appended to source
        @return: the root element produced by xml.etree.ElementTree.XML
    '''
    return XML(_getDocument('%s/%s' % (source, resourceName)))
328
def createMO_ResponsiblePartyInfoAsCI_Organization(role, names):
    '''
        Creates a MO_ResponsiblePartyInfo whose parties are CI_Organisation instances
        @param role: a CI_RoleCode/MO_RoleValue assigned to this ResponsibleParty
        @param names: the name assigned to each CI_Party
        @return: the instance built by createMO_ResponsiblePartyInfo
    '''
    return createMO_ResponsiblePartyInfo(role, names, partyType = CI_Organisation)
335
def createMO_ResponsiblePartyInfoAsCI_Individual(role, names):
    '''
        Creates a MO_ResponsiblePartyInfo whose parties are CI_Individual instances
        @param role: a CI_RoleCode/MO_RoleValue assigned to this ResponsibleParty
        @param names: the name assigned to each CI_Party
        @return: the instance built by createMO_ResponsiblePartyInfo
    '''
    return createMO_ResponsiblePartyInfo(role, names, partyType = CI_Individual)
342
def createMO_ResponsiblePartyInfo(role, names, partyType = CI_Party):
    """
        Creates a MO_ResponsiblePartyInfo with one party for each given name
        @param role: a CI_RoleCode/MO_RoleValue assigned to this ResponsibleParty
        @param names: the name assigned to each CI_Party
        @param partyType: the CI_Party implementation to use
        @return: a new MO_ResponsiblePartyInfo whose 'party' attribute is a list
                of partyType instances, one for each name and in the same order
    """
    mo_responsableInfo = MO_ResponsiblePartyInfo()
    mo_responsableInfo.role = role
    parties = []
    for name in names:
        # partyType is instantiated without arguments, the name is set afterwards
        ci_party = partyType()
        ci_party.name = name
        parties.append(ci_party)
    mo_responsableInfo.party = parties
    return mo_responsableInfo
359
def createCI_Citation(title = ""):
    '''
        Creates a CI_Citation
        @param title: the value assigned to the 'title' attribute
        @return: a new CI_Citation instance
    '''
    citation = CI_Citation()
    citation.title = title
    return citation
364
def createMD_Identifier(title = "", code = ""):
    '''
        Creates a MD_Identifier
        @param title: the title of the CI_Citation assigned to the 'authority' attribute
        @param code: the value assigned to the 'code' attribute
        @return: a new MD_Identifier instance
    '''
    identifier = MD_Identifier()
    identifier.code = code
    identifier.authority = createCI_Citation(title)
    return identifier
370
def createDQ_ConformanceResult(explaination = ""):
    '''
        Creates a DQ_ConformanceResult
        @param explaination: the value assigned to the 'explanation' attribute
        @return: a new DQ_ConformanceResult instance
    '''
    conformanceResult = DQ_ConformanceResult()
    conformanceResult.explanation = explaination
    return conformanceResult
375
def createDQ_Element(explaination = ""):
    '''
        Creates a DQ_Element holding a single DQ_ConformanceResult
        @param explaination: the explanation passed to createDQ_ConformanceResult
        @return: a new DQ_Element instance whose 'result' attribute has been
                set to an empty list and then appended the conformance result
    '''
    element = DQ_Element()
    element.result = []
    element.result.append(createDQ_ConformanceResult(explaination))
    return element
382
383
384
385
386
387
388
Note: See TracBrowser for help on using the repository browser.