source: mauRepo/MolesManager/trunk/src/libs/migration/processor/commons.py @ 8182

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/mauRepo/MolesManager/trunk/src/libs/migration/processor/commons.py@8182
Revision 8182, 20.1 KB checked in by mnagni, 9 years ago (diff)

Complete - # 22397: CEDA Observation geographicExtent
 http://team.ceda.ac.uk/trac/ceda/ticket/22397

Line 
1'''
2Created on 15 Nov 2011
3
4@author: mnagni
5'''
6from ea_model.moles3_4.utilities.mo_responsiblepartyinfo import MO_ResponsiblePartyInfo
7from ea_model.iso_19115_2006_metadata_corrigendum.reference_system_information.md_identifier import MD_Identifier
8from httplib import HTTPConnection
9from xml.etree.ElementTree import XML, tostring
10import time, datetime
11from ea_model.upcomingiso.ci_organisation import CI_Organisation
12from ea_model.upcomingiso.ci_party import CI_Party
13from ea_model.upcomingiso.ci_individual import CI_Individual
14from libs.migration.exception.exceptions import NoDataLineage
15from ea_model.iso_19115_2006_metadata_corrigendum.citation_and_responsible_party_information.ci_citation import CI_Citation
16from ea_model.iso_19115_2006_metadata_corrigendum.data_quality_information.dq_element import DQ_Element
17from ea_model.iso_19115_2006_metadata_corrigendum.data_quality_information.dq_conformanceresult import DQ_ConformanceResult
18from hashlib import md5
19from xml.sax.saxutils import unescape, escape
20import html5lib
21from html5lib import treebuilders
22from ea_model.ceda_metadatamodel.ceda_result.ceda_result import CEDA_Result
23from ea_model.moles3_4.result.mo_onlineresource import MO_OnlineResource
24from ea_model.iso_19115_2006_metadata_corrigendum.citation_and_responsible_party_information.url import URL
25from ea_model.iso_19115_2006_metadata_corrigendum.extent_information.ex_geographicboundingbox import EX_GeographicBoundingBox
26from ea_model.ceda_metadatamodel.ceda_computation.ceda_processing import CEDA_Processing
27from ea_model.ceda_metadatamodel.ceda_acquisition.ceda_instrument import CEDA_Instrument
28from ea_model.ceda_metadatamodel.ceda_observationprocess.ceda_compositeprocess import CEDA_CompositeProcess
29from ea_model.ceda_metadatamodel.ceda_acquisition.ceda_acquisition import CEDA_Acquisition
30
# Path prefix of the atoms collection in the eXist REST interface
# (the same literal is hard-coded in buildExistStatusPath).
base = '/exist/rest/atoms'

# Document status values. NOTE: DS_pUBLISHED and DS_PUBLISHED are two distinct
# constants whose values differ only by the case of the first letter.
DS_pUBLISHED = 'published'
DS_WORKING = 'working'
DS_PUBLISHED = 'Published'
docStatus = (DS_pUBLISHED, DS_WORKING, DS_PUBLISHED)

# Document type values, used as a path segment by buildExistTypePath.
DT_DEPLOYMENTS = 'deployments'
DT_DATA_ENTITIES = 'data_entities'
DT_DEPLOYMENT_DATA = 'deployment_data'
DT_DATA_GRANULES = 'data_granules'
docTypes = (DT_DEPLOYMENTS, DT_DATA_ENTITIES, DT_DEPLOYMENT_DATA, DT_DATA_GRANULES)

# Document owner values, used as a path segment by buildExistOwnerPath.
DO_BADC = 'badc.nerc.ac.uk'
DO_NEODC = 'neodc.nerc.ac.uk'
DO_UKSSDC = 'ukssdc.ac.uk'

# Presumably the organisation name used by the migration code; not referenced
# by any function in this module - TODO confirm usage in callers.
CEDA = 'Centre for Environmental Data Archive'
docOwners = (DO_BADC, DO_NEODC, DO_UKSSDC)

# XML namespaces in ElementTree "{uri}" notation, prepended to tag names
# when building find()/findall() paths.
atomNS = "{http://www.w3.org/2005/Atom}"
existNS = "{http://exist.sourceforge.net/NS/exist}"
molesNS = "{http://ndg.nerc.ac.uk/schema/moles2beta}"
htmlNS = "{http://www.w3.org/1999/xhtml}"
georssNS="{http://www.georss.org/georss/10}"
gmlNS="{http://www.opengis.net/gml}"
# strptime format used by stringToTimestamp
time_format = '%Y-%m-%dT%H:%M:%SZ'
# Host and port of the HTTP server queried by _getDocument
ihost = 'bora.badc.rl.ac.uk'
iport = '8080'

#MD_Identifier codes
MD_CODE_MOLES2_CITATION = 'ceda_moles2_citation'

# Shared html5lib parser (etree tree builder, no XHTML namespace on elements),
# used by extractContent to parse the atom <content> text.
htmlParser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("etree"), namespaceHTMLElements=False)
65
def calculateHash(text):
    """
        Computes the md5 digest of the given text.
        @param text: the string to digest
        @return: the digest as a hexadecimal string
    """
    return md5(text).hexdigest()
75
def buildExistDocPath(docStatus, docType, docOwner, docName):
    """
        Builds the eXist REST path of a single document by appending the
        document name to the owner path.
        @param docStatus: one value from commons.docStatus
        @param docType: one value from commons.docTypes
        @param docOwner: one value from commons.docOwners
        @param docName: the name of the document inside the owner collection
        @return: the path '<owner path>/<docName>'
    """
    ownerPath = buildExistOwnerPath(docStatus, docType, docOwner)
    return '%s/%s' % (ownerPath, docName)
84
def buildExistOwnerPath(docStatus, docType, docOwner):
    """
        Builds the eXist REST path of an owner collection by appending the
        owner to the type path.
        @param docStatus: one value from commons.docStatus
        @param docType: one value from commons.docTypes
        @param docOwner: one value from commons.docOwners
        @return: the path '<type path>/<docOwner>'
    """
    typePath = buildExistTypePath(docStatus, docType)
    return '%s/%s' % (typePath, docOwner)
92
def buildExistTypePath(docStatus, docType):
    """
        Builds the eXist REST path of a document type collection by appending
        the type to the status path.
        @param docStatus: one value from commons.docStatus
        @param docType: one value from commons.docTypes
        @return: the path '<status path>/<docType>'
    """
    statusPath = buildExistStatusPath(docStatus)
    return '%s/%s' % (statusPath, docType)
99
def buildExistStatusPath(docStatus):
    """
        Builds the eXist REST path of a document status collection.
        @param docStatus: one value from commons.docStatus
        @return: the path '/exist/rest/atoms/<docStatus>'
    """
    statusTemplate = '/exist/rest/atoms/%s'
    return statusTemplate % docStatus
105
def getAtomDocumentByMO(migrationObject):
    """
        Retrieves the atom document referenced by a migration object. The
        document type is selected from the class name of the migration object.
        @param migrationObject: a DeploymentsMigration, DataEntityMigration or
        DeploymentDataMigration instance
        @return: the result of getAtomDocumentAsElementtree for the object's
        doc_status, document type, doc_owner and doc_name
        @raise Exception: if migrationObject is None
        @raise KeyError: if the class name is not one of the three listed above
    """
    if migrationObject is None:
        raise Exception("migrationObject is None")
    className = type(migrationObject).__name__
    docTypeByClass = {
        'DeploymentsMigration': DT_DEPLOYMENTS,
        'DataEntityMigration': DT_DATA_ENTITIES,
        'DeploymentDataMigration': DT_DEPLOYMENT_DATA,
    }
    return getAtomDocumentAsElementtree(migrationObject.doc_status,
                                        docTypeByClass[className],
                                        migrationObject.doc_owner,
                                        migrationObject.doc_name)
111
def getAtomDocumentHashByMO(migrationObject):
    """
        Retrieves the text of the atom document referenced by a migration
        object and returns its hash. The document type is selected from the
        class name of the migration object.
        @param migrationObject: a DeploymentsMigration, DataEntityMigration or
        DeploymentDataMigration instance
        @return: the value of calculateHash applied to the document text
        @raise Exception: if migrationObject is None
        @raise KeyError: if the class name is not one of the three listed above
    """
    if migrationObject is None:
        raise Exception("migrationObject is None")
    className = type(migrationObject).__name__
    docTypeByClass = {
        'DeploymentsMigration': DT_DEPLOYMENTS,
        'DataEntityMigration': DT_DATA_ENTITIES,
        'DeploymentDataMigration': DT_DEPLOYMENT_DATA,
    }
    documentText = _getAtomDocumentAsText(migrationObject.doc_status,
                                          docTypeByClass[className],
                                          migrationObject.doc_owner,
                                          migrationObject.doc_name)
    return calculateHash(documentText)
118
def getAtomDocumentHash(docStatus, docType, docOwner, docName):
    """
        Retrieves the text of an atom document and returns its hash.
        @param docStatus: one value from commons.docStatus
        @param docType: one value from commons.docTypes
        @param docOwner: one value from commons.docOwners
        @param docName: the name of the document
        @return: the value of calculateHash applied to the document text
    """
    documentPath = buildExistDocPath(docStatus, docType, docOwner, docName)
    return calculateHash(_getDocument(documentPath))
123
def hasAtomDocumentSameHash(migrationObject):
    """
        Tells whether the hash of the atom document currently retrieved for the
        migration object equals the hash stored in migrationObject.doc_hash.
        @param migrationObject: the migration instance to check
        @return: True if the two hashes are equal, False otherwise
    """
    currentHash = getAtomDocumentHashByMO(migrationObject)
    storedHash = migrationObject.doc_hash
    return currentHash == storedHash
126
def getAtomDocumentByType(migrationObject, docType):
    """
        Retrieves the atom document referenced by a migration object using an
        explicitly given document type.
        @param migrationObject: the migration instance providing doc_status,
        doc_owner and doc_name
        @param docType: one value from commons.docTypes
        @return: the result of getAtomDocumentAsElementtree
        @raise Exception: if migrationObject is None
    """
    if migrationObject is None:
        raise Exception("migrationObject is None")
    status = migrationObject.doc_status
    owner = migrationObject.doc_owner
    name = migrationObject.doc_name
    return getAtomDocumentAsElementtree(status, docType, owner, name)
131
def _getAtomDocumentAsText(docStatus, docType, docOwner, docName):
    """
        Retrieves an atom document as raw text.
        @param docStatus: one value from commons.docStatus
        @param docType: one value from commons.docTypes
        @param docOwner: one value from commons.docOwners
        @param docName: the name of the document
        @return: the body returned by _getDocument for the document path
    """
    return _getDocument(buildExistDocPath(docStatus, docType, docOwner, docName))
135
def getAtomDocumentAsElementtree(docStatus, docType, docOwner, docName):
    """
        Retrieves an atom document and parses it as XML.
        @param docStatus: one value from commons.docStatus
        @param docType: one value from commons.docTypes
        @param docOwner: one value from commons.docOwners
        @param docName: the name of the document
        @return: the element returned by _getXMLDocument for the document path
    """
    return _getXMLDocument(buildExistDocPath(docStatus, docType, docOwner, docName))
139
def _getXMLDocument(source):
    """
        Retrieves the document at the given path and parses it as XML.
        @param source: the path of the document to retrieve
        @return: the root element produced by xml.etree.ElementTree.XML
    """
    documentText = _getDocument(source)
    return XML(documentText)
142
def stringToTimestamp(timestring):
    """
        Converts a string formatted as commons.time_format into a naive
        datetime.datetime carrying the same year, month, day, hour, minute
        and second that appear in the string.
        @param timestring: the string to parse
        @return: a datetime.datetime instance without tzinfo
        @raise ValueError: if timestring does not match commons.time_format
    """
    # Parse directly instead of going through time.mktime()/fromtimestamp():
    # that round trip interprets the fields as local time, so it shifts values
    # falling into a local DST gap and may overflow for dates outside the
    # platform time_t range.
    return datetime.datetime.strptime(timestring, time_format)
145
def _getDocument(source):
    """
        Performs an HTTP GET of the given path on the server identified by
        commons.ihost and commons.iport and returns the response body.
        The HTTP status of the response is not checked.
        @param source: the path to request
        @return: the response body as returned by the response read() method
    """
    conn = HTTPConnection(host = ihost, port = iport)
    try:
        conn.connect()
        conn.request('GET', source)
        res = conn.getresponse()
        return res.read()
    finally:
        # always release the connection, even when the request or the read fails
        conn.close()
157
158def _returnNotNoneText(element):
159    if element is None:
160        return None
161    return element.text
162
def findMolesCreationDate(resourceXML):
    """
        Looks for the moles entity/molesISO/created element in the document.
        @param resourceXML: the parsed document to search
        @return: the text of the element, None if the element does not exist
    """
    createdPath = '%(ns)sentity/%(ns)smolesISO/%(ns)screated' % {'ns': molesNS}
    return _returnNotNoneText(resourceXML.find(createdPath))
166
def findMolesLineage(dataEntityMigration):
    """
        Looks for the moles entity/molesISO/lineage element in the document
        referenced by the given migration object.
        @param dataEntityMigration: the migration instance to retrieve and parse
        @return: the text of the lineage element
        @raise NoDataLineage: if the document has no lineage element
    """
    resourceXML = getAtomDocumentByMO(dataEntityMigration)
    lineagePath = '%(ns)sentity/%(ns)smolesISO/%(ns)slineage' % {'ns': molesNS}
    lineage = resourceXML.find(lineagePath)
    if lineage is not None:
        return lineage.text
    raise NoDataLineage(dataEntityMigration)
173
def extractQuality(dataEntityMigration):
    """
        Looks for the moles entity/molesISO/quality element in the document
        referenced by the given migration object.
        @param dataEntityMigration: the migration instance to retrieve and parse
        @return: the text of the element, None if the element does not exist
    """
    qualityPath = '%(ns)sentity/%(ns)smolesISO/%(ns)squality' % {'ns': molesNS}
    resourceXML = getAtomDocumentByMO(dataEntityMigration)
    return _returnNotNoneText(resourceXML.find(qualityPath))
178
def extractContent(dataEntityMigration):
    """
        Returns a dictionary containing the div composing the
        <content> element in a dataentity document.
        Only the div elements whose 'property' attribute starts with 'cedacat'
        are collected: the key is the second ':'-separated token of the
        attribute, the value is the escaped serialization of the div.
        @param dataEntityMigration: the migration instance to retrieve and parse
        @return: a dictionary, empty if the document has no <content> element
        or the element has no text
    """
    resourceXML = getAtomDocumentByMO(dataEntityMigration)
    content = resourceXML.find('%scontent' % (atomNS))
    text = _returnNotNoneText(content)
    contentDict = {}
    if not text:
        # unescape(None) would raise; a document without content has no div
        return contentDict
    doc = htmlParser.parse(unescape(text))
    for el in doc.findall('body//div'):
        prop = el.get('property')
        if prop and prop.startswith('cedacat'):
            contentDict[prop.split(':')[1]] = escape(tostring(el))
    return contentDict
195
196def _extractAuthors(authorsCSV):
197    if authorsCSV is None:
198        return []
199    authors = authorsCSV.split(',')
200    for index in range(len(authors)):
201        authors[index] = authors[index].strip()
202        if len(authors[index]) == 0:
203            authors.remove(authors[index])
204    return authors
205
def findAuthorsInResource(resourceMigration):
    """
        Retrieves the document referenced by the migration object and returns
        a dictionary with the following keys:
        'authors': the value returned by findAuthorInResource
        'contributors': the value returned by findContributorInResource
        @param resourceMigration: the migration instance to retrieve and parse
        @return: a dictionary with the two keys described above
    """
    resourceXML = getAtomDocumentByMO(resourceMigration)
    authors = findAuthorInResource(resourceXML)
    contributors = findContributorInResource(resourceXML)
    return {'authors': authors, 'contributors': contributors}
217
def findAuthorInResource(resourceXML):
    """
        Looks for the first atom author/name element in the document.
        @param resourceXML: the parsed document to search
        @return: the text of the element, None if the element does not exist
    """
    authorPath = '%(ns)sauthor/%(ns)sname' % {'ns': atomNS}
    return _returnNotNoneText(resourceXML.find(authorPath))
221
def findContributorInResource(resourceXML):
    """
        Looks for the first atom contributor/name element in the document.
        @param resourceXML: the parsed document to search
        @return: the text of the element, None if the element does not exist
    """
    contributorPath = '%(ns)scontributor/%(ns)sname' % {'ns': atomNS}
    return _returnNotNoneText(resourceXML.find(contributorPath))
225
def findID(dataEntXML):
    """
        Looks for the atom id element in the document.
        @param dataEntXML: the parsed document to search
        @return: the text of the element, None if the element does not exist
    """
    idElement = dataEntXML.find('%sid' % (atomNS))
    if idElement is None:
        return None
    return idElement.text
229
def _extractLinks(dataEntXML, markers):
    """
        Collects the atom link elements of the document, grouped by marker.
        Every link is offered to _updateLinksDict once per marker.
        @param dataEntXML: the parsed document to search
        @param markers: a list of marker names
        @return: the dictionary filled by _updateLinksDict; it is empty if the
        document has no atom link element
    """
    linksByMarker = {}
    for atomLink in dataEntXML.findall('%slink' % (atomNS)):
        for markerName in markers:
            _updateLinksDict(linksByMarker, atomLink, markerName)
    return linksByMarker
237
def _extractMigrationObjectLinks(dataEntXML):
    """
        Collects the atom links of the document for the markers
        'Deployment', 'DOWNLOAD', 'DOCUMENTATION', 'ACCESS' and 'LOGO'.
        @param dataEntXML: the parsed document to search
        @return: the dictionary returned by _extractLinks
    """
    return _extractLinks(dataEntXML, ['Deployment', 'DOWNLOAD', 'DOCUMENTATION', 'ACCESS', 'LOGO'])
241
242def _updateLinksDict(linksDict, link, linkMarker):
243    if not linksDict.has_key(linkMarker):
244        linksDict[linkMarker] = []
245    rel = link.get('rel')
246    if rel and rel.endswith('/' + linkMarker):
247        linksDict[linkMarker].append({'href': link.get('href'), 'title': link.get('title')})
248
def _findLinksInMigrationDocument(dataEntityMigration):
    """
        Retrieves the document referenced by the migration object and
        collects its links with _extractMigrationObjectLinks.
        @param dataEntityMigration: the migration instance to retrieve and parse
        @return: the dictionary returned by _extractMigrationObjectLinks
    """
    return _extractMigrationObjectLinks(getAtomDocumentByMO(dataEntityMigration))
253
def findDownloadLinksInMigrationDocument(migrationObject):
    """
        Return a list of dictionaries describing a <link rel="...DOWNLOAD..."> tag type
        Each dictionary has two keys: 'href' and 'title'
        @param migrationObject: the migration instance to retrieve and parse
        @return: a list of dictionaries, empty if the document has no such link
    """
    linksDict = _findLinksInMigrationDocument(migrationObject)
    # the links dictionary is empty (no 'DOWNLOAD' key) when the document
    # does not contain any atom link element
    return linksDict.get('DOWNLOAD', [])
263
def findDOIInMigrationDocument(migrationObject):
    """
        Return a dictionary describing a <link rel="...DOCUMENTATION..."> tag type
        whose 'href' starts with 'doi:'.
        The dictionary has two keys: 'href' and 'title'
        @param migrationObject: the migration instance to retrieve and parse
        @return: the first dictionary relative to a DOI, None otherwise
    """
    linksDict = _findLinksInMigrationDocument(migrationObject)
    # the links dictionary is empty (no 'DOCUMENTATION' key) when the document
    # does not contain any atom link element
    for link in linksDict.get('DOCUMENTATION', []):
        href = link['href']
        # href is None for a link element without the 'href' attribute
        if href and href.startswith('doi:'):
            return link
    return None
276
def findDeploymentsInDE(dataEntityMigration):
    """
        Returns the document names of the deployments linked by the document
        referenced by the given migration object.
        @param dataEntityMigration: the migration instance to retrieve and parse
        @return: a list made of each name extracted by _extractLinksByMarker
        for the 'Deployment' marker, with '.atom' appended
    """
    linksDict = _findLinksInMigrationDocument(dataEntityMigration)
    deploymentDocs = []
    for depName in _extractLinksByMarker(linksDict, 'Deployment'):
        deploymentDocs.append(depName + '.atom')
    return deploymentDocs
281
def findSubTypeInDPT(resourceMigration):
    """
        Looks, in the document referenced by the given migration object, for
        the first atom category element whose 'term' is "ATOM_SUBTYPE".
        @param resourceMigration: the migration instance to retrieve and parse
        @return: the 'label' attribute of that category, None if no such
        category exists
    """
    resourceXML = getAtomDocumentByMO(resourceMigration)
    for category in resourceXML.findall('%scategory' % (atomNS)):
        if category.get("term") != "ATOM_SUBTYPE":
            continue
        return category.get("label")
    return None
288       
def extractTitle(deploymentMigration):
    """
        Looks for the atom title element in the document referenced by the
        given migration object.
        @param deploymentMigration: the migration instance to retrieve and parse
        @return: the text of the element, None if the element does not exist
    """
    titleElement = getAtomDocumentByMO(deploymentMigration).find('%stitle' % (atomNS))
    return _returnNotNoneText(titleElement)
293
def extractSummary(deploymentMigration, dataEntityMigration):
    """
        Looks for the atom summary element, first in the deployment document
        and, if that one is missing or empty, in the data entity document.
        The data entity document is retrieved only when needed.
        @param deploymentMigration: the migration instance searched first
        @param dataEntityMigration: the migration instance used as fallback
        @return: the summary text, possibly None or empty if the fallback
        document has no usable summary either
    """
    summaryPath = '%ssummary' % (atomNS)
    deploymentDoc = getAtomDocumentByMO(deploymentMigration)
    deploymentSummary = _returnNotNoneText(deploymentDoc.find(summaryPath))
    if deploymentSummary:
        return deploymentSummary
    dataEntityDoc = getAtomDocumentByMO(dataEntityMigration)
    return _returnNotNoneText(dataEntityDoc.find(summaryPath))
303
def extractGeographicExtentInMigrationDocument(migrationObject):
    """
        Extracts, if existing, the georss:where/gml:Envelope upperCorner and
        lowerCorner elements.
        The first two whitespace separated values of upperCorner are returned
        as 'east' and 'north', those of lowerCorner as 'west' and 'south'.
        NOTE(review): this assumes the corners are written as "lon lat" -
        confirm against the stored documents.
        @param migrationObject: a migration object to retrieve and parse for data
        @return: None if one of the corners is missing, otherwise a dictionary
        with keys: 'east', 'north', 'west', 'south' where the values are float
    """
    resourceXML = getAtomDocumentByMO(migrationObject)
    envelopePath = '%swhere/%sEnvelope/' % (georssNS, gmlNS)
    upperCorner = resourceXML.find(envelopePath + '%supperCorner' % (gmlNS))
    lowerCorner = resourceXML.find(envelopePath + '%slowerCorner' % (gmlNS))
    if upperCorner is None or lowerCorner is None:
        return None
    upperValues = upperCorner.text.split()
    lowerValues = lowerCorner.text.split()
    extent = {}
    extent['east'] = float(upperValues[0])
    extent['north'] = float(upperValues[1])
    extent['west'] = float(lowerValues[0])
    extent['south'] = float(lowerValues[1])
    return extent
320
def findLinksInDeployment(deploymentMigration):
    """
        Returns a dictionary of the link names owned by the given deployment document
        @param deploymentMigration: a DeploymentMigration instance
        @return: a dictionary whose keys are 'ACTIVITY', 'DPT' and 'OBS' and whose
        values are the lists returned by _extractLinksByMarker for each key
    """
    markers = ['ACTIVITY', 'DPT', 'OBS']
    linksDict = _extractLinks(getAtomDocumentByMO(deploymentMigration), markers)
    return dict((marker, _extractLinksByMarker(linksDict, marker)) for marker in markers)
334
335def _extractLinksByMarker(linksDict, marker):
336    dpt = []
337    if linksDict.has_key(marker):
338        for link in linksDict[marker]:
339            try:
340                linkLongName = link['href'].split('/')[-1]
341                linkName = linkLongName.rsplit('__ATOM__')[1]
342                dpt.append(linkName)
343            except Exception as ex:
344                print ex
345    return dpt
346
347
def getResourceRefs(deploymentRefs):
    """
        Returns a list of Elements representing the inner resource reference items
        @param deploymentRefs: the path of the eXist collection to retrieve
        @return: the exist collection/resource elements of the retrieved document
    """
    resourcePath = '%(ns)scollection/%(ns)sresource' % {'ns': existNS}
    return _getXMLDocument(deploymentRefs).findall(resourcePath)
355
def getOwnerRefs(docStatus, docType, docOwner):
    """
        Returns a list of Elements representing the resource reference items
        of an owner collection
        @param docStatus: one value from commons.docStatus
        @param docType: one value from commons.docTypes
        @param docOwner: one value from commons.docOwners
        @return: the exist collection/resource elements of the retrieved document
    """
    ownerPath = buildExistOwnerPath(docStatus, docType, docOwner)
    ownerDoc = _getXMLDocument(ownerPath)
    resourcePath = '%(ns)scollection/%(ns)sresource' % {'ns': existNS}
    return ownerDoc.findall(resourcePath)
363
def getTypeRefs(docStatus, docType):
    """
        Returns a list of Elements representing the resource reference items
        of a document type collection
        @param docStatus: one value from commons.docStatus
        @param docType: one value from commons.docTypes
        @return: the exist collection/resource elements of the retrieved document
    """
    typePath = buildExistTypePath(docStatus, docType)
    typeDoc = _getXMLDocument(typePath)
    resourcePath = '%(ns)scollection/%(ns)sresource' % {'ns': existNS}
    return typeDoc.findall(resourcePath)
371
372
373
def getCollectionRefs(publishedRefs):
    """
        Returns a list of Elements representing the inner collection reference items
        @param publishedRefs: the path of the eXist collection to retrieve
        @return: the exist collection/collection elements of the retrieved document
    """
    collectionPath = '%(ns)scollection/%(ns)scollection' % {'ns': existNS}
    return _getXMLDocument(publishedRefs).findall(collectionPath)
381
def getResource(source, resourceName):
    """
        Retrieves the document '<source>/<resourceName>' and parses it as XML.
        @param source: the path of the collection owning the resource
        @param resourceName: the name of the resource
        @return: the root element produced by xml.etree.ElementTree.XML
    """
    return XML(_getDocument('%s/%s' % (source, resourceName)))
386
def createMO_ResponsiblePartyInfoAsCI_Organization(role, names):
    """
        Creates a MO_ResponsiblePartyInfo whose parties are CI_Organisation instances.
        @param role: a CI_RoleCode/MO_RoleValue assigned to this ResponsibleParty
        @param names: the name assigned to each CI_Organisation
        @return: the value returned by createMO_ResponsiblePartyInfo
    """
    return createMO_ResponsiblePartyInfo(role, names, partyType = CI_Organisation)
393
def createMO_ResponsiblePartyInfoAsCI_Individual(role, names):
    """
        Creates a MO_ResponsiblePartyInfo whose parties are CI_Individual instances.
        @param role: a CI_RoleCode/MO_RoleValue assigned to this ResponsibleParty
        @param names: the name assigned to each CI_Individual
        @return: the value returned by createMO_ResponsiblePartyInfo
    """
    return createMO_ResponsiblePartyInfo(role, names, partyType = CI_Individual)
400
def createMO_ResponsiblePartyInfo(role, names, partyType = CI_Party):
    """
        Creates a MO_ResponsiblePartyInfo with one party per given name.
        @param role: a CI_RoleCode/MO_RoleValue assigned to this ResponsibleParty
        @param names: the name assigned to each CI_Party
        @param partyType: the CI_Party implementation to use
        @return: the new MO_ResponsiblePartyInfo instance
    """
    mo_responsableInfo = MO_ResponsiblePartyInfo()
    mo_responsableInfo.role = role
    parties = []
    for name in names:
        party = partyType()
        party.name = name
        parties.append(party)
    mo_responsableInfo.party = parties
    return mo_responsableInfo
417
def createCI_Citation(title = ""):
    """
        Creates a new CI_Citation
        @param title: the CI_Citation.title field
        @return: the new CI_Citation instance
    """
    citation = CI_Citation()
    setattr(citation, 'title', title)
    return citation
422
def createMD_Identifier(title = "", code = ""):
    """
        Creates a new MD_Identifier whose authority is a new CI_Citation
        @param title: the MD_Identifier.authority.title field
        @param code: the MD_Identifier.code field
        @return: the new MD_Identifier instance
    """
    identifier = MD_Identifier()
    identifier.code = code
    authority = createCI_Citation(title)
    identifier.authority = authority
    return identifier
433
def createMO_OnlineResource(linkage, name = None, function = None, description = None, applicationProfile = None):
    """
        Creates a new MO_OnlineResource whose linkage is a new, empty URL instance.
        The optional fields are set only when the given value is not empty.
        @param linkage: currently NOT stored: the assignment to the URL instance
        is still to be defined (see the commented line below)
        @param name: the MO_OnlineResource.name field
        @param function: the MO_OnlineResource.function field
        @param description: the MO_OnlineResource.description field
        @param applicationProfile: the MO_OnlineResource.applicationProfile field
        @return: the new MO_OnlineResource instance
    """
    onlineResource = MO_OnlineResource()
    resourceURL = URL()
    #resourceURL.???? = linkage
    onlineResource.linkage = resourceURL
    optionalFields = (('name', name),
                      ('function', function),
                      ('description', description),
                      ('applicationProfile', applicationProfile))
    for fieldName, fieldValue in optionalFields:
        if fieldValue:
            setattr(onlineResource, fieldName, fieldValue)
    return onlineResource
456
def createCEDA_Result(linkage, name = None, function = None, description = None, applicationProfile = None):
    """
        Creates a new CEDA_Result and appends to its source a MO_OnlineResource
        built by createMO_OnlineResource with the given arguments
        @param linkage: passed to createMO_OnlineResource
        @param name: passed to createMO_OnlineResource
        @param function: passed to createMO_OnlineResource
        @param description: passed to createMO_OnlineResource
        @param applicationProfile: passed to createMO_OnlineResource
        @return: the new CEDA_Result instance
    """
    result = CEDA_Result()
    result.source.append(createMO_OnlineResource(linkage, name, function, description, applicationProfile))
    return result
468
469
470
def createDQ_ConformanceResult(explaination = ""):
    """
        Creates a new DQ_ConformanceResult
        @param explaination: the DQ_ConformanceResult.explanation field
        @return: the new DQ_ConformanceResult instance
    """
    conformance = DQ_ConformanceResult()
    setattr(conformance, 'explanation', explaination)
    return conformance
475
def createDQ_Element(explaination = ""):
    """
        Creates a new DQ_Element whose result list contains a single
        DQ_ConformanceResult built by createDQ_ConformanceResult
        @param explaination: the explanation of the DQ_ConformanceResult
        @return: the new DQ_Element instance
    """
    element = DQ_Element()
    element.result = []
    element.result.append(createDQ_ConformanceResult(explaination))
    return element
482
def createEX_GeographicBoundingBox(east, north, west, south):
    """
        Creates an EX_GeographicBoundingBox instance
        @param east: the eastBoundLongitude attribute as float
        @param north: the northBoundLatitude attribute as float
        @param west: the westBoundLongitude attribute as float
        @param south: the southBoundLatitude attribute as float
        @return: the new EX_GeographicBoundingBox instance
    """
    boundingBox = EX_GeographicBoundingBox()
    bounds = (('eastBoundLongitude', east),
              ('northBoundLatitude', north),
              ('westBoundLongitude', west),
              ('southBoundLatitude', south))
    for attributeName, value in bounds:
        setattr(boundingBox, attributeName, value)
    return boundingBox
497   
def createCEDA_Processing():
    """
        Creates a new CEDA_Processing
        @return: a CEDA_Processing instance built with no arguments
    """
    return CEDA_Processing()
501
502
def createCEDA_Instrument():
    """
        Creates a new CEDA_Instrument
        @return: a CEDA_Instrument instance built with no arguments
    """
    return CEDA_Instrument()
506
def createCEDA_CompositeProcess():
    """
        Creates a new CEDA_CompositeProcess
        @return: a CEDA_CompositeProcess instance built with no arguments
    """
    return CEDA_CompositeProcess()
510
def createCEDA_Acquisition():
    """
        Creates a new CEDA_Acquisition
        @return: a CEDA_Acquisition instance built with no arguments
    """
    return CEDA_Acquisition()
514   
Note: See TracBrowser for help on using the repository browser.