source: TI07-MOLES/trunk/PythonCode/wsgi/ndgSearch.py @ 2315

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/TI07-MOLES/trunk/PythonCode/wsgi/ndgSearch.py@2315
Revision 2315, 8.6 KB checked in by lawrence, 14 years ago (diff)

Improved parameter handling, and better test cases.

Line 
1# these imports are autogenerated by wsdl2py ...
2from DiscoveryService_services_types import *
3from DiscoveryService_services import *
4
5#normal imports
6import sys, time
7try: #python 2.5
8    from xml.etree import ElementTree as ET
9except ImportError:
10    try:
11        # if you've installed it yourself it comes this way
12        import ElementTree as ET
13    except ImportError:
14        # if you've egged it this is the way it comes
15        from elementtree import ElementTree as ET
16   
17from ETxmlView import loadET,et2text
18
19from xml.dom import expatbuilder
20
class ExpatReaderClass:
    ''' Reader adapter exposing the xml.dom.expatbuilder parsers under the
    fromStream/fromString names; ndgSearch hands this class to the service
    port as its readerclass. '''
    # staticmethod() keeps the module-level functions from being bound as
    # instance methods, so they work from the class or from an instance.
    fromStream = staticmethod(expatbuilder.parse)
    fromString = staticmethod(expatbuilder.parseString)
24 
25
class ndgSearch:
    ''' Provides a client to the NDG discovery services methods exposed by the Discovery.wsdl

    The outcome of the most recent search() call is kept on the instance:
      documents       -- the document list from the response (None when the search failed)
      hits            -- the Hits value from the response
      status          -- one-element list holding the response status message
      response        -- the raw response object
      serverSessionID -- the result id of the last successful search
    '''

    # Maps the ``target`` argument of search() onto the TermType sent in the request.
    _termTypes={'Authors':'author','Params':'parameter','All':'fullText'}

    def __init__(self,logger=None,tracefile=None):
        ''' Get an instance of the service.

        logger    -- optional; when set, its info() method receives timing messages
        tracefile -- passed straight through to the generated service port
        '''
        #how do we get a different backend provider?
        loc=DiscoveryServiceLocator()
        self.server=loc.getDiscoveryServicePortType(readerclass=ExpatReaderClass,tracefile=tracefile)
        self.serverSessionID=None
        self.logger=logger
        self.__reset()

    def __reset(self):
        ''' Initialise and/or making sure we have no hangovers from a previous call '''
        self.documents=None
        self.hits=None
        self.error=None
        self.response=None
        self.status=None

    @staticmethod
    def _stripXmlSuffix(name):
        ''' Return name without a trailing '.xml'.

        str.strip('.xml')/str.rstrip('.xml') are NOT equivalent: they remove any run of
        the characters '.', 'x', 'm' and 'l', so 'metoffice.xml' would lose its leading 'm'.
        '''
        if name.endswith('.xml'):
            return name[:-4]
        return name

    def search(self,term,start=1,howmany=20,target='All',scope=None,bbox=None,dateRange=None):
        ''' Get a list of documents for a specific term using a free text search.

        term      -- the search term
        start     -- sent as the request's Start value
        howmany   -- sent as the request's HowMany value
        target    -- one of 'All', 'Authors' or 'Params' (anything else raises KeyError)
        scope     -- optional, sent as the request's Scope
        bbox      -- optional sequence of four values convertible to float, taken in the
                     order north, south, west, east
        dateRange -- optional pair (start,end); each item is a three-element sequence whose
                     elements are int()-ed and sent in the order [2],[1],[0]
                     (presumably day,month,year in, year,month,day out -- TODO confirm)

        Returns the document list from the response, or None if the response
        status was false. The instance attributes are updated either way.
        '''
        self.__reset()
        request=doSearchRequest()
        request.Term=term
        request.Start=start
        request.HowMany=howmany
        request.TermType=self._termTypes[target]

        if bbox is not None:
            box=request.new_BoundingBox()
            box.LimitNorth,box.LimitSouth,box.LimitWest,box.LimitEast=map(float,bbox)
            request.BoundingBox=box

        if scope is not None:
            request.Scope=scope

        if dateRange is not None:
            # distinct local names: the originals reused ``start`` and shadowed the parameter
            first,last=dateRange[0],dateRange[1]
            rangeStart=int(first[2]),int(first[1]),int(first[0]),0,0,0,0
            rangeEnd=int(last[2]),int(last[1]),int(last[0]),0,0,0,0

            dRange=request.new_DateRange()
            dRange.DateRangeStart=rangeStart
            dRange.DateRangeEnd=rangeEnd
            request.DateRange=dRange

        if self.logger: itime=time.time()
        response=self.server.doSearch(request)

        if self.logger:
            etime=time.time()-itime
            self.logger.info('Search Request [%s] took [%ss]'%(term,etime))

        # hits are recorded whatever the status; documents only on success
        self.hits=response.Hits
        if response._status:
            self.serverSessionID=response._resultId
            self.documents=response.Documents.Document
        self.status=[response._statusMessage,]
        self.response=response
        return self.documents

    def getDoc(self,document):
        ''' Return a single document from the backend database.

        document -- the document name to present
        Raises ValueError (carrying the service status message) if the
        response status is false.
        '''
        #create a request object
        request=doPresentRequest()
        #get an instance of the Documents holder class
        DocList=request.new_documents()
        request.Documents=DocList
        DocList.Document=[document,]
        request.Format='original'
        searchResult=self.server.doPresent(request)
        if not searchResult._status:
            raise ValueError('Error retrieving [%s] was [%s]'%(document,searchResult._statusMessage))
        return searchResult.Documents.Document[0]

    def getAllDocs(self,format='original'):
        ''' Retrieve every document listed by the last search, in the given format.

        Returns [] when there is no document list or when the response status is false.
        '''
        if self.documents is None: return []
        if self.logger: itime=time.time()

        #create a request object
        request=doPresentRequest()
        #get an instance of the Documents holder class
        DocList=request.new_documents()
        request.Documents=DocList
        DocList.Document=self.documents
        request.Format=format

        result=self.server.doPresent(request)
        if result._status:
            docs=result.Documents.Document
        else:
            docs=[]

        if self.logger:
            etime=time.time()-itime
            self.logger.info('Document Load [n=%s] took [%ss]'%(len(self.documents),etime))

        return docs

    def __xmlerror(self,docmessage):
        ''' Build an <Error><Document/><Message/></Error> element describing a failed load.

        docmessage -- either a (document,message) pair or a bare message.
        The element is assembled directly rather than by parsing a formatted string:
        the message is normally the unparseable document itself, so feeding it back
        through the XML parser would just fail again.
        '''
        print(docmessage)
        if isinstance(docmessage,(tuple,list)) and len(docmessage)==2:
            document,message=docmessage
        else:
            document,message='',docmessage
        error=ET.Element('Error')
        ET.SubElement(error,'Document').text='%s'%(document,)
        ET.SubElement(error,'Message').text='%s'%(message,)
        return error

    def getDocElement(self,document):
        ''' Takes a document path (maybe from a previous call to ndgSearch) and extracts that document
        into an ElementTree instance. If the document cannot be loaded an <Error> element is
        returned instead; failures in getDoc itself still propagate. '''
        #we stick it straight into element tree because we need to use et to get the actual document
        #we want, not the envelope xml elements
        doc=self.getDoc(document)
        path=self._stripXmlSuffix(document)
        try:
            return loadET(doc)
        except Exception:
            # Exception rather than a bare except, so Ctrl-C and SystemExit still get through
            return self.__xmlerror((path,doc))

    def getAllDocsAsElements(self):
        ''' Get all the documents and load them into a list of ET instances; a document that
        fails to load is represented by an <Error> element in its place '''
        result=[]
        for doc in self.getAllDocs():
            try:
                result.append(loadET(doc))
            except Exception:
                result.append(self.__xmlerror(doc))
        return result

    def getLabelledDocs(self,format='original'):
        ''' Returns all the documents in sequence in a labelled list of (name,document) pairs,
        where name is the document name without its '.xml' suffix.

        Returns [] when there were no hits or there is no document list (no search yet, or
        the last search failed). Raises ValueError if the number of documents returned does
        not match the number requested.
        '''
        if self.hits==0 or self.documents is None: return []
        responses=self.getAllDocs(format)
        filenames=self.documents
        i=len(filenames)
        j=len(responses)
        if i!=j:
            print(filenames)
            raise ValueError('Internal inconsistency in search return [hits:%s!=responses:%s]'%(i,j))
        return [(self._stripXmlSuffix(name),response) for name,response in zip(filenames,responses)]
192       
193       
194import unittest
195
class TestCase(unittest.TestCase):
    ''' Exercises ndgSearch end to end. Every test builds a real ndgSearch client, so
    running them presumably needs the discovery service to be reachable. '''

    def testSearch(self):
        ''' Test fundamental search capability '''
        howmany=10
        client=ndgSearch()
        client.search('temperature',start=1,howmany=howmany)
        # joined with single spaces to reproduce the comma-separated print output
        print(' '.join(['Asked for ',str(howmany),' documents (there were %s hits):'%client.hits]))
        print(client.documents)

    def testgetLabelledDocs(self):
        ''' Test returning of labelled documents '''
        client=ndgSearch()
        client.search('CD97')
        print(client.status)
        print(client.documents)
        client.getLabelledDocs()

    def testNoReturn(self):
        ''' Tests a search return with (hopefully nothing to be found)'''
        client=ndgSearch()
        client.search('xpabnl')
        print(' '.join(['Hopefully this is zero: if not, expect the NoReturn test to fail:',str(client.hits)]))
        labelled=client.getLabelledDocs()
        self.assertEqual(len(labelled),0)

    def testGetDoc(self):
        ''' Test obtaining a specific document which had better exist '''
        #alternative: 'ndg.noc.soton.ac.uk__DIF__NOCSDAT274.xml'
        name='noc.soton.ac.uk__DIF__NOCSDAT100.xml'
        client=ndgSearch()
        client.getDoc(name)

    def testSequence(self):
        ''' Tests that repeated searches work and can support zero responses in the middle '''
        # this was a bug we found and needed a test case for
        client=ndgSearch()
        # hopefully just get a couple of hits for now.
        client.search('CD97')
        print(client.hits)
        # fetching the labelled documents between the two searches is what used to break
        client.getLabelledDocs()
        client.search('xpabnl')
        print(client.hits)
        print(' '.join(['Hopefully this is also zero: if not, expect the Sequence test to fail:',str(client.hits)]))
247
248       
249
# Run the test cases above when this module is executed as a script.
if __name__ == '__main__':
    unittest.main()
Note: See TracBrowser for help on using the repository browser.