source: TI07-MOLES/trunk/PythonCode/wsgi/ndgSearch.py @ 2260

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/TI07-MOLES/trunk/PythonCode/wsgi/ndgSearch.py@2260
Revision 2260, 8.4 KB checked in by lawrence, 14 years ago (diff)

Slight changes to support modified xqueries, and modified xquery
itself ... first cut at MDIP (based on DC, doesn't actually do
anything proper for now).

Line 
1# these imports are autogenerated by wsdl2py ...
2from DiscoveryService_services_types import *
3from DiscoveryService_services import *
4
5#normal imports
6import sys, time
7try: #python 2.5
8    from xml.etree import ElementTree as ET
9except ImportError:
10    try:
11        # if you've installed it yourself it comes this way
12        import ElementTree as ET
13    except ImportError:
14        # if you've egged it this is the way it comes
15        from elementtree import ElementTree as ET
16   
17from ETxmlView import loadET,et2text
18
19from xml.dom import expatbuilder
20
class ExpatReaderClass:
    ''' Reader adapter built on xml.dom.expatbuilder.

    Exposes the two entry points (fromStream / fromString) under the names
    used when this class is handed to the service port type as its
    ``readerclass``. '''
    # parse an already opened stream / file
    fromStream = staticmethod(expatbuilder.parse)
    # parse a document held in a string
    fromString = staticmethod(expatbuilder.parseString)
24 
25
class ndgSearch:
    ''' Provides a client to the NDG discovery services methods exposed by the Discovery.wsdl

    State left behind by the most recent call to search():
        documents       -- the documents named in the response, or None
        hits            -- the hit count reported by the service, or None
        status          -- one-element list holding the service status message
        response        -- the raw doSearch response object
        serverSessionID -- result id of the last successful search
    '''

    def __init__(self, logger=None, tracefile=None):
        ''' Get an instance of the service.

        logger    -- optional object with an info() method; when supplied,
                     request timings are reported through it
        tracefile -- handed unchanged to the generated port-type factory
        '''
        #how do we get a different backend provider?
        loc = DiscoveryServiceLocator()
        self.server = loc.getDiscoveryServicePortType(readerclass=ExpatReaderClass,
                                                      tracefile=tracefile)
        self.serverSessionID = None
        self.logger = logger
        self.__reset()

    def __reset(self):
        ''' Initialise and/or make sure we have no hangovers from a previous call '''
        self.documents = None
        self.hits = None
        self.error = None
        self.response = None
        self.status = None

    @staticmethod
    def _stripXmlSuffix(name):
        ''' Return name without one trailing '.xml'; other names come back unchanged.

        str.strip('.xml') / str.rstrip('.xml') must not be used for this: they
        remove any run of the characters '.', 'x', 'm', 'l' from the ends
        rather than the literal suffix. '''
        suffix = '.xml'
        if name.endswith(suffix):
            return name[:-len(suffix)]
        return name

    @staticmethod
    def _dateTuple(parts):
        ''' Turn one dateRange endpoint into the 7-tuple placed on the request.

        The first three items of parts are converted with int() and emitted in
        reverse order, followed by four zeros.
        NOTE(review): presumably (day, month, year) in and
        (year, month, day, 0, 0, 0, 0) out -- confirm against the caller. '''
        return int(parts[2]), int(parts[1]), int(parts[0]), 0, 0, 0, 0

    def search(self, term, start=1, howmany=20, target='All', scope=None, bbox=None, dateRange=None):
        ''' Get a list of documents for a specific term using a free text search.

        term      -- the search term
        start     -- value sent as the request Start
        howmany   -- value sent as the request HowMany
        target    -- 'All', 'Authors' or 'Params'; anything else raises KeyError
        scope     -- optional value sent as the request Scope
        bbox      -- optional four values, converted to float and assigned in
                     the order north, south, west, east
        dateRange -- optional pair of endpoints, each converted by _dateTuple

        Returns self.documents, which stays None when the service reports a
        false status. '''
        self.__reset()
        request = doSearchRequest()
        request.Term = term
        request.Start = start
        request.HowMany = howmany
        request.TermType = {'Authors': 'author', 'Params': 'parameter', 'All': 'fullText'}[target]

        if bbox is not None:
            box = request.new_BoundingBox()
            box.LimitNorth, box.LimitSouth, box.LimitWest, box.LimitEast = map(float, bbox)
            request.BoundingBox = box

        if scope is not None:
            request.Scope = scope

        if dateRange is not None:
            # Separate local names so the ``start`` parameter is not shadowed.
            rangeStart = self._dateTuple(dateRange[0])
            rangeEnd = self._dateTuple(dateRange[1])
            dRange = request.new_DateRange()
            dRange.DateRangeStart = rangeStart
            dRange.DateRangeEnd = rangeEnd
            request.DateRange = dRange

        if self.logger: itime = time.time()
        response = self.server.doSearch(request)
        if self.logger:
            etime = time.time() - itime
            self.logger.info('Search Request [%s] took [%ss]' % (term, etime))

        # Hits is recorded whatever the status; the rest only on success.
        self.hits = response.Hits
        if response._status:
            self.serverSessionID = response._resultId
            self.documents = response.Documents.Document
        self.status = [response._statusMessage, ]
        self.response = response
        return self.documents

    def getDoc(self, document):
        ''' Return a single document from the backend database.

        On a false service status the status message is returned in place of
        the document. '''
        #create a request object
        request = doPresentRequest()
        #get an instance of the Documents holder class
        DocList = request.new_documents()
        request.Documents = DocList
        DocList.Document = [document, ]
        request.Format = 'original'
        searchResult = self.server.doPresent(request)
        if searchResult._status:
            return searchResult.Documents.Document[0]
        return searchResult._statusMessage

    def getAllDocs(self, format='original'):
        ''' Retrieve, in one request, every document named by the last search.

        Returns [] when there is no document list or the service reports a
        false status. '''
        if self.documents is None: return []
        if self.logger: itime = time.time()

        #create a request object
        request = doPresentRequest()
        #get an instance of the Documents holder class
        DocList = request.new_documents()
        request.Documents = DocList
        DocList.Document = self.documents
        request.Format = format

        result = self.server.doPresent(request)
        if result._status:
            docs = result.Documents.Document
        else:
            docs = []

        if self.logger:
            etime = time.time() - itime
            self.logger.info('Document Load [n=%s] took [%ss]' % (len(self.documents), etime))

        return docs

    def __xmlerror(self, docmessage):
        ''' Build an <Error> element from a (document, message) pair.

        The element is assembled with the ElementTree API instead of parsing
        an interpolated string, so a message containing markup characters
        (typically the very document that failed to parse) is carried as text
        and cannot break the handler. '''
        print(docmessage)
        document, message = docmessage
        error = ET.Element('Error')
        ET.SubElement(error, 'Document').text = '%s' % (document,)
        ET.SubElement(error, 'Message').text = '%s' % (message,)
        return error

    def getDocElement(self, document):
        ''' Takes a document path (maybe from a previous call to ndgSearch) and extracts that document
        into an ElementTree instance; an <Error> element is returned if it cannot be loaded '''
        #we stick it straight into element tree because we need to use et to get the actual document
        #we want, not the envelope xml elements
        doc = self.getDoc(document)
        path = self._stripXmlSuffix(document)
        try:
            return loadET(doc)
        except Exception:
            return self.__xmlerror((path, doc))

    def getAllDocsAsElements(self):
        ''' Get all the documents and load them into a list of ET instances.

        A document that fails to load is represented by an <Error> element,
        labelled with the matching entry of self.documents when there is one
        and with its position otherwise. '''
        result = []
        names = self.documents or []
        for index, doc in enumerate(self.getAllDocs()):
            try:
                result.append(loadET(doc))
            except Exception:
                if index < len(names):
                    label = names[index]
                else:
                    label = index
                result.append(self.__xmlerror((label, doc)))
        return result

    def getLabelledDocs(self, format='original'):
        ''' Returns all the documents in sequence in a labelled list of
        (name without '.xml', document) tuples.

        Returns [] when the last search had no hits or left no document list.
        Raises ValueError when the number of retrieved documents differs from
        the number of document names. '''
        if self.hits == 0 or self.documents is None: return []
        responses = self.getAllDocs(format)
        filenames = self.documents
        i = len(filenames)
        j = len(responses)
        if i != j:
            raise ValueError('Internal inconsistency in search return [hits:%s!=responses:%s]' % (i, j))
        return [(self._stripXmlSuffix(name), body) for name, body in zip(filenames, responses)]
190       
191       
192import unittest
193
194class TestCase(unittest.TestCase):
195   
196    def testSearch(self):
197        ''' Test fundamental search capability '''
198        term='temperature'
199        s=ndgSearch()
200        howmany=10
201        docs=s.search(term,start=1,howmany=howmany)
202        print 'Asked for ',howmany,' documents (there were %s hits):'%s.hits
203        print s.documents
204       
205    def testgetLabelledDocs(self):
206        ''' Test returning of labelled documents '''
207        term='CD97'
208        s=ndgSearch()
209        r=s.search(term)
210        print s.status
211        print s.documents
212        output=s.getLabelledDocs()
213       
214    def testNoReturn(self):
215        ''' Tests a search return with (hopefully nothing to be found)'''
216        term='xpabnl'
217        s=ndgSearch()
218        r=s.search(term)
219        print 'Hopefully this is zero: if not, expect the NoReturn test to fail:',s.hits
220        output=s.getLabelledDocs()
221        self.assertEqual(len(output),0)
222       
223    def testGetDoc(self):
224        ''' Test obtaining a specific document which had better exist '''
225        doc='noc.soton.ac.uk__DIF__NOCSDAT100.xml'
226        s=ndgSearch()
227        r=s.getDoc(doc)
228       
229    def testSequence(self):
230        ''' Tests that repeated searches work and can support zero responses in the middle '''
231        # this was a bug we found and needed a test case for
232        s=ndgSearch()
233        term='CD97' # hopefully just get a couple of hits for now.
234        r=s.search(term)
235        print s.hits
236        #if we uncomment this it used to break, and not if uncommented, now it seems ok...
237        output=s.getLabelledDocs() 
238        #this failed as well, and we isolated the dependency in the previous call to this one.
239        #res=s.getAllDocs()
240        term='xpabnl'
241        r=s.search(term)
242        print s.hits
243        print 'Hopefully this is also zero: if not, expect the Sequence test to fail:',s.hits
244
245       
246
# Run the unit tests defined above when this file is executed as a script.
if __name__ == '__main__':
    unittest.main()
Note: See TracBrowser for help on using the repository browser.