source: TI07-MOLES/trunk/PythonCode/wsgi/ndgSearch.py @ 2364

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/TI07-MOLES/trunk/PythonCode/wsgi/ndgSearch.py
Revision 2364, 9.3 KB checked in by lawrence, 13 years ago (diff)

Sundry modifications to get browse working again and to use a
config file to identify the appropriate repositories (for
a given ndg uri, what repository holds the browse info? this
is by way of a temporary expedient).

Line 
1# these imports are autogenerated by wsdl2py ...
2from DiscoveryService_services_types import *
3from DiscoveryService_services import *
4
5#normal imports
6import sys, time
7try: #python 2.5
8    from xml.etree import ElementTree as ET
9except ImportError:
10    try:
11        # if you've installed it yourself it comes this way
12        import ElementTree as ET
13    except ImportError:
14        # if you've egged it this is the way it comes
15        from elementtree import ElementTree as ET
16   
17from ETxmlView import loadET,et2text
18
19from xml.dom import expatbuilder
20
class ExpatReaderClass:
    ''' Reader adaptor handed to the generated service port as ``readerclass``.

    Exposes the two expatbuilder entry points under the names fromString and
    fromStream; both simply forward their arguments and return whatever
    expatbuilder returns. '''

    @staticmethod
    def fromStream(*args, **kwargs):
        ''' Parse a file (name or file-like object) via expatbuilder.parse '''
        return expatbuilder.parse(*args, **kwargs)

    @staticmethod
    def fromString(*args, **kwargs):
        ''' Parse an in-memory XML string via expatbuilder.parseString '''
        return expatbuilder.parseString(*args, **kwargs)
24 
25
class ndgSearch:
    ''' Provides a client to the NDG discovery services methods exposed by the Discovery.wsdl

    After a call to search() the instance carries the outcome of that call:
        documents        - the document names returned (None if the search failed)
        hits             - the hit count reported by the service
        status           - one-element list holding the service status message
        response         - the raw response object
        serverSessionID  - the result id of the last successful search
    '''

    # Maps the ``target`` names accepted by search() onto the TermType sent to the service.
    _TERM_TYPES = {'Authors': 'author', 'Params': 'parameter', 'All': 'fullText'}

    _XML_SUFFIX = '.xml'

    def __init__(self, logger=None, tracefile=None):
        ''' Get an instance of the service.

        logger    - optional object with an info(message) method, used for timings and diagnostics
        tracefile - passed straight through to the generated port type
        '''
        # TODO: how do we get a different backend provider?
        loc = DiscoveryServiceLocator()
        self.server = loc.getDiscoveryServicePortType(readerclass=ExpatReaderClass, tracefile=tracefile)
        self.serverSessionID = None
        self.logger = logger
        self.__reset()

    def __reset(self):
        ''' Initialise and/or make sure we have no hangovers from a previous call '''
        self.documents = None
        self.hits = None
        self.error = None
        self.response = None
        self.status = None

    def _warn(self, message):
        ''' Report a diagnostic through the logger if there is one, otherwise on stderr.

        stdout is deliberately avoided: this module is used from WSGI code. Only
        logger.info is used because that is the only logger method this class relies on. '''
        if self.logger:
            self.logger.info(message)
        else:
            sys.stderr.write('%s\n' % (message,))

    @staticmethod
    def _stripXmlSuffix(name):
        ''' Return name without a trailing ".xml" (and otherwise unchanged).

        str.strip/rstrip('.xml') remove any of the characters '.', 'x', 'm', 'l'
        rather than the suffix, which mangles names that start or end with them. '''
        suffix = ndgSearch._XML_SUFFIX
        if name.endswith(suffix):
            return name[:-len(suffix)]
        return name

    @staticmethod
    def _toTimeTuple(date):
        ''' Reverse the first three entries of date and pad with zeros to a 7-tuple.

        NOTE(review): presumably date is (day, month, year), giving
        (year, month, day, 0, 0, 0, 0) - confirm against callers. '''
        return (int(date[2]), int(date[1]), int(date[0]), 0, 0, 0, 0)

    def _present(self, documents, format, label):
        ''' Issue a doPresent request for a list of document names and return the
        list of documents from the response.

        label is what gets quoted in the error message. Raises ValueError when the
        service reports a false status. '''
        request = doPresentRequest()
        holder = request.new_documents()
        request.Documents = holder
        holder.Document = documents
        request.Format = format
        result = self.server.doPresent(request)
        if not result._status:
            raise ValueError('Error retrieving [%s] was [%s]' % (label, result._statusMessage))
        return result.Documents.Document

    def search(self, term, start=1, howmany=20, target='All', scope=None, bbox=None, dateRange=None):
        ''' Get a list of documents for a specific term using a free text search.

        term      - the search term
        start     - value sent as the request Start
        howmany   - value sent as the request HowMany
        target    - one of 'Authors', 'Params' or 'All' (anything else raises KeyError)
        scope     - optional value sent as the request Scope
        bbox      - optional four values, in the order north, south, west, east
        dateRange - optional pair (start, end); each entry is indexed [0], [1], [2]
                    (see _toTimeTuple)

        Returns the list of document names, or None if the service reported a
        false status. Also updates hits, status, response and serverSessionID.
        '''
        self.__reset()
        request = doSearchRequest()
        request.Term = term
        request.Start = start
        request.HowMany = howmany
        request.TermType = self._TERM_TYPES[target]

        if bbox is not None:
            box = request.new_BoundingBox()
            box.LimitNorth, box.LimitSouth, box.LimitWest, box.LimitEast = map(float, bbox)
            request.BoundingBox = box

        if scope is not None:
            request.Scope = scope

        if dateRange is not None:
            dRange = request.new_DateRange()
            dRange.DateRangeStart = self._toTimeTuple(dateRange[0])
            dRange.DateRangeEnd = self._toTimeTuple(dateRange[1])
            request.DateRange = dRange

        itime = time.time()
        response = self.server.doSearch(request)
        if self.logger:
            etime = time.time() - itime
            self.logger.info('Search Request [%s] took [%ss]' % (term, etime))

        # The hit count is recorded whether or not the search succeeded.
        self.hits = response.Hits
        if response._status:
            self.serverSessionID = response._resultId
            self.documents = response.Documents.Document
        self.status = [response._statusMessage, ]
        self.response = response
        return self.documents

    def getDoc(self, document, format='original'):
        ''' Return a single document from the backend database.

        Raises ValueError if the service reports a false status. '''
        return self._present([document, ], format, document)[0]

    def getAllDocs(self, format='original'):
        ''' Retrieve every document named by the last search.

        Returns [] when there is no document list (no search yet, or the last
        search failed); raises ValueError if the service reports a false status. '''
        if self.documents is None:
            return []
        itime = time.time()
        docs = self._present(self.documents, format, self.documents)
        if self.logger:
            etime = time.time() - itime
            self.logger.info('Document Load [n=%s] took [%ss]' % (len(self.documents), etime))
        return docs

    def __xmlerror(self, docmessage):
        ''' Build an <Error><Document/><Message/></Error> element from a
        (document name, message) pair.

        The element is assembled directly rather than by parsing a formatted
        string, so a message containing markup or '&' cannot break the error path. '''
        name, message = docmessage
        self._warn('%s' % (docmessage,))
        error = ET.Element('Error')
        ET.SubElement(error, 'Document').text = '%s' % (name,)
        ET.SubElement(error, 'Message').text = '%s' % (message,)
        return error

    def getDocElement(self, document):
        ''' Takes a document path (maybe from a previous call to ndgSearch) and extracts that document
        into an ElementTree instance.

        If the document cannot be loaded an <Error> element is returned instead. '''
        # we stick it straight into element tree because we need to use et to get the actual document
        # we want, not the envelope xml elements
        doc = self.getDoc(document)
        try:
            return loadET(doc)
        except Exception:
            return self.__xmlerror((self._stripXmlSuffix(document), doc))

    def getAllDocsAsElements(self):
        ''' Get all the documents and load them into a list of ET instances.

        A document that cannot be loaded contributes an <Error> element in its place. '''
        docs = self.getAllDocs()
        names = self.documents
        # Names and documents are paired by position (as getLabelledDocs does);
        # only trust the pairing when the two lists are the same length.
        paired = names is not None and len(names) == len(docs)
        result = []
        for index, doc in enumerate(docs):
            try:
                result.append(loadET(doc))
            except Exception:
                if paired:
                    name = self._stripXmlSuffix(names[index])
                else:
                    name = 'unknown'
                result.append(self.__xmlerror((name, doc)))
        return result

    def getLabelledDocs(self, format='original'):
        ''' Returns all the documents in sequence as a list of (label, document) tuples,
        where the label is the document name without its ".xml" suffix.

        Returns [] when there is nothing to fetch; raises ValueError if the number
        of documents returned differs from the number requested. '''
        if not self.hits or self.documents is None:
            return []
        responses = self.getAllDocs(format)
        filenames = self.documents
        i = len(filenames)
        j = len(responses)
        if i != j:
            self._warn('%s' % (filenames,))
            raise ValueError('Internal inconsistency in search return [hits:%s!=responses:%s]' % (i, j))
        return [(self._stripXmlSuffix(name), doc) for name, doc in zip(filenames, responses)]

    def get(self, repository, schema, localID, format='original', targetCollection=None):
        ''' Obtain a document via its NDG id split up.

        The three parts are joined as repository__schema__localID and ".xml" is appended. '''
        # nb argument targetCollection is here to provide same API as exist xmlrpc interface
        uri = '%s__%s__%s' % (repository, schema, localID)
        return self.getDoc(uri + self._XML_SUFFIX, format)
197       
198       
199import unittest
200
class TestCase(unittest.TestCase):
    ''' Exercises ndgSearch end to end; every test builds a real ndgSearch client,
    so these only pass when the discovery service it connects to is reachable. '''

    def testSearch(self):
        ''' A free text search for a common term runs and reports its hits '''
        wanted=10
        client=ndgSearch()
        client.search('temperature',start=1,howmany=wanted)
        print(' '.join(['Asked for ',str(wanted),' documents (there were %s hits):'%client.hits]))
        print(client.documents)

    def testgetLabelledDocs(self):
        ''' Labelled documents can be fetched after a search '''
        client=ndgSearch()
        client.search('CD97')
        print(client.status)
        print(client.documents)
        client.getLabelledDocs()

    def testNoReturn(self):
        ''' A search that (hopefully) finds nothing yields no labelled documents '''
        client=ndgSearch()
        client.search('xpabnl')
        print('%s %s'%('Hopefully this is zero: if not, expect the NoReturn test to fail:',client.hits))
        labelled=client.getLabelledDocs()
        self.assertEqual(len(labelled),0)

    def testGetDoc(self):
        ''' A specific document, which had better exist, can be retrieved '''
        # alternative document name: 'ndg.noc.soton.ac.uk__DIF__NOCSDAT274.xml'
        client=ndgSearch()
        client.getDoc('noc.soton.ac.uk__DIF__NOCSDAT100.xml')

    def testSequence(self):
        ''' Repeated searches work and can support zero responses in the middle '''
        # this was a bug we found and needed a test case for
        client=ndgSearch()
        # hopefully just get a couple of hits for now.
        client.search('CD97')
        print(client.hits)
        # fetching the labelled docs between two searches used to break the
        # second search (getAllDocs was the culprit); it now seems ok
        client.getLabelledDocs()
        client.search('xpabnl')
        print(client.hits)
        print('%s %s'%('Hopefully this is also zero: if not, expect the Sequence test to fail:',client.hits))

    def testGet(self):
        ''' A document can be fetched from its uri components '''
        repository,schema,localID='neodc.nerc.ac.uk','DIF','NEODC_ARSF_ATM_DAED'
        client=ndgSearch()
        client.get(repository,schema,localID,format='DC')
258
# Run the unittest suite defined in this module when the file is executed
# directly as a script (nothing happens on import).
if __name__=="__main__":
    unittest.main()
Note: See TracBrowser for help on using the repository browser.