source: TI05-delivery/ows_framework/trunk/ows_server/ows_server/models/ndgSearch.py @ 2820

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/TI05-delivery/ows_framework/trunk/ows_server/ows_server/models/ndgSearch.py@2820
Revision 2820, 9.6 KB checked in by lawrence, 12 years ago (diff)

Fixing the strip problem (the string method strip doesn't
do what you intuitively think it will do!)

Line 
1# these imports are autogenerated by wsdl2py ...
2from DiscoveryService_services_types import *
3from DiscoveryService_services import *
4import os.path
5
6#normal imports
7import sys, time
8try: #python 2.5
9    from xml.etree import ElementTree as ET
10except ImportError:
11    try:
12        # if you've installed it yourself it comes this way
13        import ElementTree as ET
14    except ImportError:
15        # if you've egged it this is the way it comes
16        from elementtree import ElementTree as ET
17   
18from ETxmlView import loadET,et2text
19
20from xml.dom import expatbuilder
21
class ExpatReaderClass:
    ''' Reader adaptor exposing the two expatbuilder parsers under the names
    fromString / fromStream.

    The class itself (not an instance) is handed to the service port type as
    its readerclass, so both entries are static methods.
    '''
    # parse XML from a file name or an open file-like object
    fromStream = staticmethod(expatbuilder.parse)
    # parse XML that is already held in a string
    fromString = staticmethod(expatbuilder.parseString)
25 
26
class ndgSearch:
    ''' Provides a client to the NDG discovery services methods exposed by the Discovery.wsdl

    search() remembers the names of the documents it found in self.documents;
    getAllDocs, getAllDocsAsElements and getLabelledDocs then work on that
    remembered list.  getDoc, getDocElement and get fetch a single named
    document and do not depend on a previous search.
    '''

    def __init__(self, logger=None, tracefile=None):
        ''' Get an instance of the service.

        logger    -- optional; when set, its info() method is used to report
                     how long search and document loading took
        tracefile -- handed unchanged to the generated port type
        '''
        #how do we get a different backend provider?
        loc = DiscoveryServiceLocator()
        self.server = loc.getDiscoveryServicePortType(readerclass=ExpatReaderClass,
                                                      tracefile=tracefile)
        self.serverSessionID = None
        self.logger = logger
        self.__reset()

    def __reset(self):
        ''' Initialise and/or making sure we have no hangovers from a previous call '''
        self.documents = None
        self.hits = None
        self.error = None
        self.response = None
        self.status = None

    @staticmethod
    def _docLabel(filename):
        ''' Return filename without its extension (e.g. 'higem.xml' -> 'higem').

        str.strip/rstrip treat their argument as a *set of characters*, so
        'higem.xml'.rstrip('.xml') gives 'hige'; os.path.splitext removes
        exactly the text from the last dot onwards instead.
        '''
        return os.path.splitext(filename)[0]

    @staticmethod
    def _dateTuple(date):
        ''' Turn a three-item date into the 7-tuple placed on the request.

        The items are converted with int() and emitted in reverse order,
        followed by four zeros.  (Presumably the input is day, month, year
        and the output year, month, day, h, m, s, ms -- confirm against the
        callers and the generated DateRange type.)
        '''
        return (int(date[2]), int(date[1]), int(date[0]), 0, 0, 0, 0)

    def __present(self, names, format):
        ''' Ask the server to present the named documents; returns the raw response. '''
        request = doPresentRequest()
        #get an instance of the Documents holder class
        holder = request.new_documents()
        request.Documents = holder
        holder.Document = names
        request.Format = format
        return self.server.doPresent(request)

    def search(self, term, start=1, howmany=20, target='All', scope=None, bbox=None, dateRange=None):
        ''' Get a list of documents for a specific term using a free text search

        term      -- the search term
        start     -- sent as the request's Start value
        howmany   -- sent as the request's HowMany value
        target    -- 'All', 'Authors' or 'Params'; selects the request TermType
                     ('fullText', 'author' or 'parameter').  Any other value
                     raises KeyError.
        scope     -- optional, sent as the request's Scope
        bbox      -- optional sequence of four numbers, in the order
                     north, south, west, east
        dateRange -- optional pair (start, end) of three-item dates, see _dateTuple

        Returns self.documents: the document list from the response, or None
        when the response status is false.  Also updates self.hits,
        self.status, self.response and (on success) self.serverSessionID.
        '''
        self.__reset()
        request = doSearchRequest()
        request.Term = term
        request.Start = start
        request.HowMany = howmany
        request.TermType = {'Authors': 'author', 'Params': 'parameter', 'All': 'fullText'}[target]

        if bbox is not None:
            box = request.new_BoundingBox()
            box.LimitNorth, box.LimitSouth, box.LimitWest, box.LimitEast = map(float, bbox)
            request.BoundingBox = box

        if scope is not None:
            request.Scope = scope

        if dateRange is not None:
            # built via a helper so the 'start' argument is no longer shadowed
            rangeStart = self._dateTuple(dateRange[0])
            rangeEnd = self._dateTuple(dateRange[1])
            dRange = request.new_DateRange()
            dRange.DateRangeStart = rangeStart
            dRange.DateRangeEnd = rangeEnd
            request.DateRange = dRange

        if self.logger: itime = time.time()
        response = self.server.doSearch(request)
        if self.logger:
            etime = time.time() - itime
            self.logger.info('Search Request [%s] took [%ss]' % (term, etime))

        # the hit count is recorded whether or not the search succeeded
        self.hits = response.Hits
        if response._status:
            self.serverSessionID = response._resultId
            self.documents = response.Documents.Document
        self.status = [response._statusMessage]
        self.response = response
        return self.documents

    def getDoc(self, document, format='original'):
        ''' Return a single document from the backend database

        document -- name of the document to fetch
        format   -- sent as the request's Format

        Raises ValueError (carrying the server's status message) when the
        response status is false.
        '''
        searchResult = self.__present([document], format)
        if not searchResult._status:
            raise ValueError('Error retrieving [%s] was [%s]' % (document, searchResult._statusMessage))
        return searchResult.Documents.Document[0]

    def getAllDocs(self, format='original'):
        ''' Retrieve every document named by the last search, in one request.

        Returns [] when there is no remembered document list, otherwise the
        document list from the response.  Raises ValueError when the response
        status is false.
        '''
        if self.documents is None: return []
        if self.logger: itime = time.time()

        result = self.__present(self.documents, format)
        if not result._status:
            raise ValueError('Error retrieving [%s] was [%s]' %
                             (self.documents, result._statusMessage))
        docs = result.Documents.Document
        if self.logger:
            etime = time.time() - itime
            self.logger.info('Document Load [n=%s] took [%ss]' % (len(self.documents), etime))
        return docs

    def __xmlerror(self, docmessage):
        ''' Build an <Error> element with <Document> and <Message> children.

        docmessage -- either a (document name, message) pair or just a message,
                      in which case the document name is left empty.

        The element is assembled directly rather than by parsing a formatted
        string: the message is typically the very text that just failed to
        parse, so re-parsing it would fail again.
        '''
        print(docmessage)
        if isinstance(docmessage, (tuple, list)) and len(docmessage) == 2:
            name, message = docmessage
        else:
            name, message = '', docmessage
        error = ET.Element('Error')
        ET.SubElement(error, 'Document').text = '%s' % (name,)
        ET.SubElement(error, 'Message').text = '%s' % (message,)
        return error

    def getDocElement(self, document):
        ''' Takes a document path (maybe from a previous call to ndgSearch) and extracts that document
        into an ElementTree instance.

        If the retrieved text cannot be loaded, an <Error> element (see
        __xmlerror) labelled with the document name minus its extension is
        returned instead.  Failures of the retrieval itself propagate from
        getDoc.
        '''
        #we stick it straight into element tree because we need to use et to get the actual document
        #we want, not the envelope xml elements
        doc = self.getDoc(document)
        try:
            return loadET(doc)
        except Exception:
            return self.__xmlerror((self._docLabel(document), doc))

    def getAllDocsAsElements(self):
        ''' Get all the documents and load them into a list of ET instances.

        A document that cannot be loaded is represented by an <Error> element
        in the same position.
        '''
        docs = self.getAllDocs()
        # getAllDocs requested exactly self.documents, so names pair up by position
        names = self.documents or []
        result = []
        for position, doc in enumerate(docs):
            try:
                result.append(loadET(doc))
            except Exception:
                name = ''
                if position < len(names):
                    name = self._docLabel(names[position])
                result.append(self.__xmlerror((name, doc)))
        return result

    def getLabelledDocs(self, format='original'):
        ''' Returns all the documents in sequence in a labelled list of strings.

        Each entry is (document name without extension, document).  Returns []
        when the last search had no hits or left no document list.  Raises
        ValueError if the number of documents returned differs from the
        number requested.
        '''
        if self.hits == 0 or self.documents is None: return []
        responses = self.getAllDocs(format)
        filenames = self.documents
        wanted, got = len(filenames), len(responses)
        if wanted != got:
            print(filenames)
            raise ValueError('Internal inconsistency in search return [hits:%s!=responses:%s]' % (wanted, got))
        return [(self._docLabel(name), body) for name, body in zip(filenames, responses)]

    def get(self, repository, schema, localID, format='original', targetCollection=None):
        ''' Obtain a document via its NDG id split up.

        The document name requested is '<repository>__<schema>__<localID>.xml'.
        '''
        #nb argument targetCollection is here to provide same API as exist xmlrpc interface
        uri = '%s__%s__%s' % (repository, schema, localID)
        return self.getDoc(uri + '.xml', format)
201       
202       
203import unittest
204
class TestCase(unittest.TestCase):
    ''' Tests for ndgSearch.

    Every test builds a real ndgSearch client and calls the service through
    it, so the discovery service must be reachable and the expectations
    depend on its current holdings.
    '''

    def testSearch(self):
        ''' Test fundamental search capability '''
        term = 'temperature'
        s = ndgSearch()
        howmany = 10
        s.search(term, start=1, howmany=howmany)
        print('Asked for  %s  documents (there were %s hits):' % (howmany, s.hits))
        print(s.documents)

    def testgetLabelledDocs(self):
        ''' Test returning of labelled documents '''
        term = 'CD97'
        s = ndgSearch()
        s.search(term)
        print(s.status)
        print(s.documents)
        s.getLabelledDocs()

    def testNoReturn(self):
        ''' Tests a search return with (hopefully nothing to be found)'''
        term = 'xpabnl'
        s = ndgSearch()
        s.search(term)
        print('Hopefully this is zero: if not, expect the NoReturn test to fail: %s' % (s.hits,))
        output = s.getLabelledDocs()
        self.assertEqual(len(output), 0)

    def testGetDoc(self):
        ''' Test obtaining a specific document which had better exist '''
        doc = 'noc.soton.ac.uk__DIF__NOCSDAT100.xml'
        #doc='ndg.noc.soton.ac.uk__DIF__NOCSDAT274.xml'
        s = ndgSearch()
        s.getDoc(doc)

    def testSequence(self):
        ''' Tests that repeated searches work and can support zero responses in the middle '''
        # this was a bug we found and needed a test case for
        s = ndgSearch()
        term = 'CD97' # hopefully just get a couple of hits for now.
        s.search(term)
        print(s.hits)
        # fetching the labelled documents between the two searches is what
        # used to trigger the failure
        s.getLabelledDocs()
        term = 'xpabnl'
        s.search(term)
        print(s.hits)
        print('Hopefully this is also zero: if not, expect the Sequence test to fail: %s' % (s.hits,))

    def testGet(self):
        ''' Tests getting via uri components '''
        (r, s, l) = 'neodc.nerc.ac.uk', 'DIF', 'NEODC_ARSF_ATM_DAED'
        ss = ndgSearch()
        ss.get(r, s, l, format='DC')

    def testFailedGet(self):
        ''' Tests that asking for a document which should not exist is reported as an error '''
        doc = 'abc'
        s = ndgSearch()
        try:
            r = s.getDoc(doc)
        except ValueError:
            # getDoc raises ValueError when the server reports a failure
            return
        # otherwise the server answered, and the answer should describe an error
        self.assertTrue('Error' in r)
268
if __name__ == "__main__":
    # executed as a script: run the test cases defined above
    unittest.main()
Note: See TracBrowser for help on using the repository browser.