source: TI01-discovery/tags/stable-TI01-ingestAutomation_Proglue_upgradesAndReporting/temp/OAIBatch/DIF.py @ 5040

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/TI01-discovery/tags/stable-TI01-ingestAutomation_Proglue_upgradesAndReporting/temp/OAIBatch/DIF.py@5040
Revision 5040, 7.4 KB checked in by sdonegan, 11 years ago (diff)

Debug new ingest classes - previous commit had problems with mdip records.

Line 
1# Python class exposing the content of a DIF record as attributes, so that
2# it conforms with what renderEntity etc. expect ...
3#
4from Utilities import *
5from geoUtilities import *
6from People import *
7from AccessControl import AccessControl
8from ndgUtils.ETxmlView import loadET, nsdumb
9from renderEntity import renderEntity
10try: #python 2.5
11    from xml.etree import ElementTree as ET
12except ImportError:
13    try:
14        # if you've installed it yourself it comes this way
15        import ElementTree as ET
16    except ImportError:
17        # if you've egged it this is the way it comes
18        from elementtree import ElementTree as ET
19
class DIFService:
    ''' A DIF only knows about a related URL '''
    def __init__(self,c,u,d):
        ''' Take a related url tuple (content_type,url,description) and store
        it as self.contentType, self.url and self.description. The values are
        stored exactly as given; no modification is applied here. '''
        self.contentType,self.url,self.description=c,u,d
    def __str__(self):
        ''' Return the service as an html anchor: the url is the link target,
        the description is the tooltip (title) and the content type is the
        visible link text (falling back to the url if the content type is
        empty, so the link never renders with no text). '''
        # The format arguments have to be one tuple: without the parentheses
        # only the first value is bound to the % operator and formatting
        # fails with "not enough arguments for format string".
        # NOTE(review): values are inserted unescaped, as they were before.
        linkText=self.contentType or self.url
        return '<a href="%s" title="%s">%s</a>'%(self.url,self.description,linkText)
28   
class DIF:
    ''' Supports the NASA GCMD DIF format for python operations,
    note ... not a complete implementation, currently minimum to
    show a reasonable piece of content '''

    # number of characters of an unparseable document echoed in the error message
    _MAX_ECHO=300

    def __init__(self,xml,et=0,debug=0,ndgObject=None):
        '''Initialise a python dif instance based on an xml document.

        xml       -- the document: an input string if et is false, otherwise
                     an ElementTree instance
        et        -- true if xml is already an ElementTree instance
        debug     -- if true, the entry id and the xml are appended to the
                     file 'difs.log' in the current directory
        ndgObject -- optional object sorted out externally; if it carries a
                     discoveryURL that is exposed as self.binding

        Raises TypeError if the input is of the wrong kind (not serialisable
        as an ElementTree when et is set, not a string otherwise) and
        ValueError if a string cannot be parsed or holds no DIF element. '''
        if et:
            try:
                self.tree=xml
                self.xml=ET.tostring(xml)
            except Exception:
                raise TypeError('DIF input is not a valid ElementTree instance')
        else:
            try:
                self.tree=loadET(xml)
                self.xml=xml
            except Exception:
                # for some reason we can't parse the document, and it's worth knowing why
                if isinstance(xml,str):
                    # echo short documents in full; only cut (and mark) long ones
                    if len(xml)>self._MAX_ECHO:
                        xml=xml[:self._MAX_ECHO]+'\n...'
                    raise ValueError('DIF input cannot be parsed into an ElementTree instance:\n%s'%xml)
                else:
                    raise TypeError('DIF input of type [%s] needs to be a string!'%type(xml))

        self.debug=debug

        # if this is an ndgObject that'll have been sorted externally ...
        self.ndgObject=ndgObject

        # now try and interpret it

        helper=nsdumb(self.tree)
        self.metadataType='DIF'
        # the DIF element may be the root itself or sit somewhere below it
        if helper.strip(self.tree.tag)!=self.metadataType:
            self.tree=helper.find(self.tree,self.metadataType)
            if self.tree is None:
                raise ValueError('DIF input does not include a DIF element:\n%s'%self.xml)

        self.entryID=helper.getText(self.tree,'Entry_ID')
        self.abstract=helper.getText(self.tree,'Summary')
        self.name=helper.getText(self.tree,'Entry_Title')
        # first five characters of the title (fewer if the title is shorter)
        self.abbreviation=self.name[:5]

        #add some extra parameters here to extract further columns to aid in ranking & ordering
        self.metadataCreationDate=helper.getText(self.tree,'DIF_Creation_Date')
        self.datacentreName=helper.getText(self.tree,'Data_Center/Data_Center_Name/Short_Name')

        # looked up once and reused for both datasetTitle and title below
        citationTitle=helper.getText(self.tree,'Data_Set_Citation/Dataset_Title')
        if len(citationTitle) < 1:
            # no citation title, so fall back on the entry title
            self.datasetTitle=self.name
        else:
            self.datasetTitle=citationTitle

        #logging.info('EXTRA INFORMATION for ORDERING= dataset title:  ' + self.datasetTitle + '  dataset creation date: ' + self.metadataCreationDate + '  datacentre name: ' + self.datacentreName)

        #Note that entity.constraints.html is about access control on the metadata,
        #and so we don't populate this here ...
        self.constraints=AccessControl(None)

        #need entity.parameters, entity.bbox, entity.timeCoverage, entity.curator, entity.creators

        # each parameter becomes 'Category/Topic/Term/...' with empty trailing levels dropped
        levels=['Category','Topic','Term','Variable','Detailed_Variable']
        self.parameters=[]
        for parameter in helper.findall(self.tree,'Parameters'):
            name='/'.join([helper.getText(parameter,level) for level in levels])
            self.parameters.append(name.rstrip('/'))

        #load up information about spatial bounding box
        self.bbox=Bounding(self.tree,entity='DIF',getter=helper.getText)

        #load up information about temporal extent
        tc=(
            helper.getText(self.tree,'Temporal_Coverage/Start_Date'),
            helper.getText(self.tree,'Temporal_Coverage/Stop_Date'),
            helper.getText(self.tree,'Data_Set_Progress') )
        self.timeCoverage=TimeCoverage(tc)

        #load up those silly paleo keywords
        self.paleoKeywords=[
            helper.getText(e,'Chronostratigraphic_Unit')
            for e in helper.findall(self.tree,'Paleo_Temporal_Coverage')]

        #Data curator information
        self.centre=DIFcontact(helper.find(self.tree,'Data_Center'),ctype='centre',helper=helper)
        self.curator=DIFcontact(self.tree)

        #Data Creators
        self.creators=[]
        # use author here because a full dif entry for creator wont necessarily exist in citation ...
        self.authors=DIFAuthors(self.tree,helper)
        self.date=dateParse(helper.getText(self.tree,'Data_Set_Citation/Dataset_Release_Date'),'YYYY')
        if self.date=='': self.date='XXXX'
        self.title=citationTitle
        self.briefCitation=None
        if (self.authors!='' and self.date!='' and self.title!=''):
            self.briefCitation='%s (%s): %s'%(self.authors,self.date,self.title)

        #services: one DIFService per Related_URL element
        self.services=[]
        for item in helper.findall(self.tree,'Related_URL'):
            self.services.append(
                DIFService(
                 helper.getText(item,'URL_Content_type'),
                 helper.getText(item,'URL'),
                 helper.getText(item,'Description') ))

        # the binding is only available when an ndgObject supplies a discovery url
        self.binding=None
        if self.ndgObject is not None and self.ndgObject.discoveryURL is not None:
            self.binding=DIFService('DISCOVERY',self.ndgObject.discoveryURL,'Discovery record')

        if self.debug:
            f=open('difs.log','a')
            try:
                f.write('%s##\n%s\n##################################\n'%(self.entryID,self.xml))
            finally:
                # release the handle even if the write fails
                f.close()

    def toHTML(self,config):
        ''' Return the html produced by renderEntity(config) for this DIF, or
        a short placeholder paragraph if there is no tree to render '''
        if self.tree is not None:
            renderer=renderEntity(config)
            return renderer.render(self)
        else:
            return '<p>No Valid DIF</p>'
159
160
161import unittest
162
class TestCase(unittest.TestCase):
    """ Smoke tests that load the example DIF and config files, print a few
    extracted entries and write the rendered html to disk. """

    inputFile = 'examples/neodc.eg1.dif'
    configFile='examples/example.config'

    def setUp(self):
        ''' Load example config and DIF files for testing '''
        f=open(self.inputFile,'r')
        try:
            xml=f.read()
        finally:
            # don't leave the example file open between tests
            f.close()
        self.dif=DIF(xml)
        self.config=myConfig(self.configFile)

    def testEntries(self):
        ''' Testing the DIF object can be loaded and some key entries extracted '''
        print('Entry ID [%s]'%self.dif.entryID)
        print('Author [%s]'%self.dif.authors)

    def testrenderDIF(self):
        ''' Testing the conversion to html '''
        print(self.dif.timeCoverage)
        html=self.dif.toHTML(self.config)
        g=open('difOutput.html','w')
        try:
            g.write(html)
        finally:
            # closing flushes the output so the html file is complete on disk
            g.close()
188
# Run the unit tests above when this module is executed as a script
# (nothing happens on a plain import).
if __name__ == '__main__':
    unittest.main()
191
192       
193       
Note: See TracBrowser for help on using the repository browser.