source: TI07-MOLES/trunk/PythonCode/wsgi/ndgSearch.py @ 2097

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/TI07-MOLES/trunk/PythonCode/wsgi/ndgSearch.py@2097
Revision 2097, 8.4 KB checked in by lawrence, 12 years ago (diff)

Sundry modifications associated with the deployment on glue, and better
options for viewing and downloading the underlying xml records (not yet
complete)

Line 
1# these imports are autogenerated by wsdl2py ...
2from DiscoveryService_services_types import *
3from DiscoveryService_services import *
4
5#normal imports
6import sys, time
7try: #python 2.5
8    from xml.etree import ElementTree as ET
9except ImportError:
10    try:
11        # if you've installed it yourself it comes this way
12        import ElementTree as ET
13    except ImportError:
14        # if you've egged it this is the way it comes
15        from elementtree import ElementTree as ET
16   
17from ETxmlView import loadET,et2text
18
19from xml.dom import expatbuilder
20
class ExpatReaderClass:
    ''' Reader class handed to the service port as its readerclass; both entry
    points delegate to xml.dom.expatbuilder. '''

    @staticmethod
    def fromString(*args, **kwargs):
        ''' Parse XML held in a string (arguments pass straight to expatbuilder.parseString) '''
        return expatbuilder.parseString(*args, **kwargs)

    @staticmethod
    def fromStream(*args, **kwargs):
        ''' Parse XML from a file name or file object (arguments pass straight to expatbuilder.parse) '''
        return expatbuilder.parse(*args, **kwargs)
24 
25
class ndgSearch:
    ''' Provides a client to the NDG discovery services methods exposed by the Discovery.wsdl

    A call to search() remembers the returned document list on the instance;
    getAllDocs(), getAllDocsAsElements() and getLabelledDocs() then operate on
    that remembered list.
    '''

    # maps the public ``target`` argument of search() onto the request's TermType
    _TERM_TYPES = {'Authors': 'author', 'Params': 'parameter', 'All': 'fullText'}

    def __init__(self, logger=None, tracefile=None):
        ''' Get an instance of the service.

        logger    -- optional object with an info(message) method; when given,
                     request timings are reported through it
        tracefile -- passed unchanged to the service port as its tracefile
        '''
        #how do we get a different backend provider?
        locator = DiscoveryServiceLocator()
        self.server = locator.getDiscoveryServicePortType(
            readerclass=ExpatReaderClass, tracefile=tracefile)
        self.serverSessionID = None
        self.logger = logger
        self.__reset()

    def __reset(self):
        ''' Initialise and/or make sure we have no hangovers from a previous call '''
        self.documents = None
        self.hits = None
        self.error = None
        self.response = None
        self.status = None

    @staticmethod
    def _stripXmlSuffix(name):
        ''' Return name without one trailing '.xml'; other names come back unchanged.

        str.strip/rstrip('.xml') remove any of the characters '.', 'x', 'm', 'l'
        rather than the suffix, which mangles names such as 'index.xml'.
        '''
        if name.endswith('.xml'):
            return name[:-len('.xml')]
        return name

    def search(self, term, start=1, howmany=20, target='All', scope=None, bbox=None, dateRange=None):
        ''' Get a list of documents for a specific term using a free text search.

        term      -- the search term
        start     -- sent as the request's Start
        howmany   -- sent as the request's HowMany
        target    -- 'Authors', 'Params' or 'All'; any other value raises KeyError
        scope     -- optional, copied to the request's Scope
        bbox      -- optional sequence of four float-convertible values, assigned
                     in the order north, south, west, east
        dateRange -- optional pair (start, end); items [2], [1], [0] of each are
                     converted to int in that order
                     (presumably input is day, month, year - confirm against callers)

        Returns the response's documents, or None when the response status is
        false. Also updates hits, status, response and (on success)
        serverSessionID on the instance.
        '''
        self.__reset()

        request = doSearchRequest()
        request.Term = term
        request.Start = start
        request.HowMany = howmany
        request.TermType = self._TERM_TYPES[target]

        if bbox is not None:
            box = request.new_BoundingBox()
            box.LimitNorth, box.LimitSouth, box.LimitWest, box.LimitEast = map(float, bbox)
            request.BoundingBox = box

        if scope is not None:
            request.Scope = scope

        if dateRange is not None:
            # named so as not to shadow the ``start`` parameter
            first, last = dateRange[0], dateRange[1]
            rangeStart = int(first[2]), int(first[1]), int(first[0]), 0, 0, 0, 0
            rangeEnd = int(last[2]), int(last[1]), int(last[0]), 0, 0, 0, 0

            dRange = request.new_DateRange()
            dRange.DateRangeStart = rangeStart
            dRange.DateRangeEnd = rangeEnd
            request.DateRange = dRange

        if self.logger:
            itime = time.time()
        response = self.server.doSearch(request)
        if self.logger:
            etime = time.time() - itime
            self.logger.info('Search Request [%s] took [%ss]'%(term,etime))

        # hits are recorded whatever the status; the rest only on success
        self.hits = response.Hits
        if response._status:
            self.serverSessionID = response._resultId
            self.documents = response._documents
        self.status = [response._statusMessage,]
        self.response = response
        return self.documents

    def getDoc(self, document):
        ''' Return a single document from the backend database.

        Returns the first returned document when the response status is true,
        otherwise the response's status message. Exceptions raised by the
        service call propagate unchanged.
        '''
        request = doPresentRequest()
        request._documents = [document,]
        request._format = 'original'
        # The former try/except re-raised the same exception instance and only
        # discarded its traceback, so the call is now made directly.
        searchResult = self.server.doPresent(request)
        if searchResult._status:
            # NOTE(review): getAllDocs reads _documents._document instead - confirm which is right
            return searchResult._documents[0]
        return searchResult._statusMessage

    def getAllDocs(self, format='original'):
        ''' Retrieve the documents found by the last search directly.

        format -- sent as the request's Format

        Returns the list of returned documents; an empty list when there is no
        remembered document list or the response status is false. Resets
        self.status to an empty list.
        '''
        self.status = []
        if self.documents is None:
            return []

        request = doPresentRequest()
        if self.logger:
            itime = time.time()

        request.Documents = self.documents
        request.Format = format
        result = self.server.doPresent(request)
        docs = []
        if result._status:
            docs = result._documents._document

        if self.logger:
            etime = time.time() - itime
            self.logger.info('Document Load [n=%s] took [%ss]'%(len(self.documents._document),etime))
        #nb the result is deliberately not kept on self (reference removed Jan 31st);
        #returning a complete copy of the list didn't fix the problems with repeated calls
        return docs

    def __xmlerror(self, docmessage):
        ''' Build an <Error> element describing a document that could not be loaded.

        docmessage -- either a (document, message) pair or a bare message

        The element is built directly rather than by parsing an interpolated
        string, so content containing markup characters is carried as text.
        '''
        print(docmessage)
        if isinstance(docmessage, (tuple, list)):
            name, message = docmessage
        else:
            name, message = '', docmessage
        error = ET.Element('Error')
        ET.SubElement(error, 'Document').text = '%s' % (name,)
        ET.SubElement(error, 'Message').text = '%s' % (message,)
        return error

    def getDocElement(self, document):
        ''' Takes a document path (maybe from a previous call to ndgSearch) and extracts that document
        into an ElementTree instance; if it cannot be loaded an <Error> element is returned instead '''
        #we stick it straight into element tree because we need to use et to get the actual document
        #we want, not the envelope xml elements
        doc = self.getDoc(document)
        path = self._stripXmlSuffix(document)
        try:
            return loadET(doc)
        except Exception:
            return self.__xmlerror((path, doc))

    def getAllDocsAsElements(self):
        ''' Get all the documents and load them into a list of ET instances; a document
        that fails to load is represented by an <Error> element in its place '''
        result = []
        for doc in self.getAllDocs():
            try:
                result.append(loadET(doc))
            except Exception:
                result.append(self.__xmlerror(doc))
        return result

    def getLabelledDocs(self, format='original'):
        ''' Returns all the documents in sequence in a labelled list of strings.

        Each entry is a (name, document) pair where name is the document's file
        name without its '.xml' suffix. Returns an empty list when there were no
        hits or there is no remembered document list. Raises ValueError if the
        number of documents returned differs from the number of names.
        '''
        if self.hits == 0 or self.documents is None:
            return []
        responses = self.getAllDocs(format)
        # NOTE(review): getAllDocs reads self.documents._document - confirm both spellings exist
        filenames = self.documents.Document
        i = len(filenames)
        j = len(responses)
        if i != j:
            raise ValueError('Internal inconsistency in search return [%s!=%s]'%(i,j))
        return [(self._stripXmlSuffix(name), body)
                for name, body in zip(filenames, responses)]
187       
188       
189import unittest
190
class TestCase(unittest.TestCase):
    ''' Tests of ndgSearch; each one constructs ndgSearch and calls the discovery service '''

    def testSearch(self):
        ''' Test fundamental search capability '''
        term = 'temperature'
        # Previously ndgSearch(tracefile): no such name exists at module level
        # (NameError), and the first positional argument is the logger anyway.
        s = ndgSearch()
        howmany = 10
        s.search(term, start=1, howmany=howmany)
        print('Asked for  %s  documents (there were %s hits):' % (howmany, s.hits))
        print(s.documents.Document)

    def testgetLabelledDocs(self):
        ''' Test returning of labelled documents '''
        term = 'neodc'
        s = ndgSearch()
        s.search(term)
        print(s.status)
        s.getLabelledDocs()

    def testNoReturn(self):
        ''' Tests a search return with (hopefully nothing to be found)'''
        term = 'xpabnl'
        s = ndgSearch()
        s.search(term)
        print('Hopefully this is zero: if not, expect the NoReturn test to fail: %s' % (s.hits,))
        output = s.getLabelledDocs()
        self.assertEqual(len(output), 0)

    def testSequence(self):
        ''' Tests that repeated searches work and can support zero responses in the middle '''
        # this was a bug we found and needed a test case for
        s = ndgSearch(tracefile=sys.stdout)
        term = 'neodc'
        s.search(term)
        print(s.hits)
        #calling getLabelledDocs() here breaks the sequence; the dependency has
        #been isolated to the getAllDocs() call it makes, so that is called directly:
        s.getAllDocs()
        term = 'xpabnl'
        s.search(term)
        print(s.hits)
        print('Hopefully this is also zero: if not, expect the Sequence test to fail: %s' % (s.hits,))
        #output=s.getLabelledDocs()
        #self.assertEqual(len(output),0)
238       
239
# Run the unit tests above when this file is executed directly as a script.
if __name__ == '__main__':
    unittest.main()
Note: See TracBrowser for help on using the repository browser.