source: exist/trunk/python/ndgUtils/ndgSearch.py @ 4196

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/exist/trunk/python/ndgUtils/ndgSearch.py@5371
Revision 4196, 9.4 KB checked in by cbyrom, 12 years ago (diff)

Extend xmlHandler to allow specification of different namespaces when
using the _distrbutens method to decorate xpath expressions with namespaces
+ add logging + add namespace info to ndgObject - for NSs used in the
various doc types + fix bug in ndgSearch to check response Hits properly.

Line 
1# these imports are autogenerated by wsdl2py ...
2from DiscoveryService_services_types import *
3from DiscoveryService_services import *
4import os.path, logging
5
6#normal imports
7import sys, time
8try: #python 2.5
9    from xml.etree import ElementTree as ET
10except ImportError:
11    try:
12        # if you've installed it yourself it comes this way
13        import ElementTree as ET
14    except ImportError:
15        # if you've egged it this is the way it comes
16        from elementtree import ElementTree as ET
17   
18from ETxmlView import loadET,et2text
19
20from xml.dom import expatbuilder
21
class NDGSearchError(Exception):
    """Error raised by the NDG Search client.

    The message is written to the error log when the exception object is
    constructed (not when it is raised or caught).
    """

    def __init__(self, msg):
        # Record the problem first, then behave like a normal Exception.
        logging.error(msg)
        super(NDGSearchError, self).__init__(msg)
27
class ExpatReaderClass:
    """Reader class handed to the generated port type (see ndgSearch.__init__).

    Exposes the two expatbuilder entry points under the names
    fromString / fromStream.
    """
    # parse an XML document held in a string
    fromString = staticmethod(expatbuilder.parseString)
    # parse an XML document from a file name or file-like object
    fromStream = staticmethod(expatbuilder.parse)
31#
32# Locator (this mimics the code which lives in the wsdl2py generated DiscoveryService_services,
33# but explicitly allows a different port address) ...
34#
35
# Default URL of the SOAP discovery web service; used by ndgSearch.__init__
# when the caller does not pass HostAndPort.
default_HostAndPort="http://ndg.badc.rl.ac.uk:8080/axis2/services/DiscoveryService"
37
38class ndgSearch:
39    ''' Provides a client to the NDG discovery services methods exposed by the Discovery.wsdl '''
40   
41    def __init__(self, tracefile=None,HostAndPort=default_HostAndPort):
42        '''Get an instance of the NDG Discovery Service.
43            Optional Keywords are:
44                tracefile - for collecting debug output
45                HostAndPort - the location of the SOAP web service (default is ndg Glue)
46        '''
47        #Get a different backend provider via the url argument
48        loc=DiscoveryServiceLocator()
49        self.server=loc.getDiscoveryServicePortType(url=HostAndPort,
50                                    readerclass=ExpatReaderClass,tracefile=tracefile)
51        self.serverSessionID=None
52        logging.info('Discovery web service connection to %s'%HostAndPort)
53        self.__reset()
54       
55    def __reset(self):
56        ''' Initialise and/or making sure we have no hangovers from a previous call '''
57        self.documents=None
58        self.hits=None
59        self.error=None
60        self.response=None
61        self.status=None
62   
63    def search(self,term,start=1,howmany=20,target='All',scope=None,bbox=None,dateRange=None,geoSearchType='overlaps'):
64        ''' Get a list of documents for a specific term using a free text search'''
65        logging.info("Running search - with term, '%s'" %term)
66        self.__reset()
67        #if target=='Authors':
68        #    request=doAuthorSearchRequest()
69        #elif target=='Params':
70        #    request=doParameterSearchRequest()
71        #else:
72        #    request=doFullTextSearchRequest()
73        request=doSearchRequest()
74        request.Term=term
75        request.Start=start
76        request.HowMany=howmany
77        request.TermType={'Authors':'author','Params':'parameter','All':'fullText'}[target]
78        if bbox is not None:
79            box=request.new_BoundingBox()
80            box.LimitNorth,box.LimitWest,box.LimitEast,box.LimitSouth=map(float,bbox)
81            request.BoundingBox=box
82            request.SpatialOperator=geoSearchType
83       
84        if scope is not None:
85            request.Scope=scope
86       
87        if dateRange is not None:
88            start=int(dateRange[0][2]),int(dateRange[0][1]),int(dateRange[0][0]),0,0,0,0
89            end=int(dateRange[1][2]),int(dateRange[1][1]),int(dateRange[1][0]),0,0,0,0
90           
91            dRange=request.new_DateRange()
92            dRange.DateRangeStart=start
93            dRange.DateRangeEnd=end
94            request.DateRange=dRange
95
96        itime=time.time()
97        response=self.server.doSearch(request)
98        etime=time.time()-itime
99        logging.debug('Search Request [%s] took [%ss]'%(term,etime))
100
101        # NB, set default and only change this if response lists Hits - this
102        # can return 'None' - so isn't reliable var to set to without checking
103        self.hits = 0
104        if response.Hits:
105            self.hits = response.Hits
106           
107        if response._status:
108            self.serverSessionID=response._resultId
109            self.documents=response.Documents.Document
110       
111        logging.info("Search returned %s results: %s" %(self.hits, self.documents))
112        self.status=[response._statusMessage,]
113        self.response=response
114        logging.info("Search complete")
115        return self.documents
116
117
118    def __setUpRequestObject(self, documents, format):
119        '''
120        Set up a request object and set it up with the appropriate
121        Documents holder class and format
122        @param documents: documents to hold in request
123        @type documents: list
124        @param format: document format to use
125        @type format: str
126        @return: request object with Documents holder and format set up
127        '''
128        logging.debug("Setting up request object")
129        request=doPresentRequest()
130        request.Documents = request.new_documents()
131        request.Documents.Document = documents
132        request.Format = format
133        logging.debug("Request object set up")
134        return request
135
136
137    def __runDocumentPresent(self, request):
138        '''
139        Run a document retrieval, provided the request to use
140        @param request: request object to run the doPresent call with
141        @type request: request
142        @return: result - array of documents returned from call
143        @raise ValueError if document not retrieved properly
144        '''
145        logging.debug("Running 'doPresent()'")
146        itime=time.time()
147        searchResult=self.server.doPresent(request)
148        if searchResult._status:
149            result=searchResult.Documents.Document
150        else:
151            raise ValueError('Error during document retrieval: %s' \
152                             %searchResult._statusMessage)
153        etime=time.time()-itime
154        logging.debug('Document Load took [%ss]' %etime)
155        logging.debug("'doPresent() completed")
156        return result
157         
158
159    def getDoc(self,document,format='original'):
160        '''
161        Return a single document from the backend database
162        '''
163        logging.info("Retrieving document, '%s' in %s format" %(document, format))
164
165        request = self.__setUpRequestObject([document], format)
166
167        docs = self.__runDocumentPresent(request)
168        logging.info("Document retrieved successfully")
169        return docs[0]
170
171                   
172    def getAllDocs(self,format='original'):           
173        '''
174        Parse the list of documents, and retrieve them directly
175        '''
176        logging.info("Retrieving all documents specified")
177        logging.debug("- %s" %self.documents)
178        if self.documents is None:
179            logging.info("No documents specified to retrieve - returning")
180            return []
181       
182        request = self.__setUpRequestObject(self.documents, format)
183        docs = self.__runDocumentPresent(request)
184        logging.info("Documents retrieved successfully")
185        return docs
186           
187    def __xmlerror(self,docmessage):
188        print docmessage
189        return ET.fromstring('<Error><Document>%s</Document><Message>%s</Message></Error>'%docmessage)
190           
191    def getDocElement(self,document):
192        ''' Takes a document path (maybe from a previous call to ndgSearch) and extracts that document
193        into an ElementTree instance '''
194        #we stick it straight into element tree because we need to use et to get the actual document
195        #we want, not the envelope xml elements
196       
197        doc=self.getDoc(document)
198        path=document.rstrip('.xml')
199        try:
200            r=loadET(doc)
201            #return r.find(path)
202            return r
203        except:
204            return self.__xmlerror((path,doc))
205           
206    def getAllDocsAsElements(self):
207        '''
208        Get all the documents and load them into a list of ET instances
209        '''
210        result=[]
211        docs=self.getAllDocs()
212        for doc in docs: 
213            try:
214                r=loadET(doc)
215                result.append(r)#result.append(r.find(path))
216            except:
217                result.append(self.__xmlerror(doc))
218        return result
219           
220           
221    def getLabelledDocs(self,format='original'):
222        '''
223        Returns all the documents in sequence in a labelled list of strings
224        '''
225        logging.info("Retrieving all labelled documents")
226        if self.hits==0: return []
227        #filenames=self.documents.Document
228        #so we know that the following call is the problem ...
229        responses=self.getAllDocs(format)
230       
231        i=len(self.documents)
232        j=len(responses)
233        if i!=j:
234            logging.debug("Requested docs: %s\nActual results: %s" \
235                          %(self.documents, responses))
236            raise NDGSearchError('Internal inconsistency in search \
237                return [hits:%s!=responses:%s]'%(i,j))
238        indices=range(i)
239        results=[]
240        for i in indices:
241            ff=os.path.splitext(self.documents[i])
242            results.append((ff[0],responses[i]))
243        logging.info("Documents retrieved")
244        return results
245       
246    def get(self,repository,schema,localID,format='DIF',**kw):
247        ''' Obtain a document via it's NDG id split up '''
248        #nb argument targetCollection is here to provide same API as exist xmlrpc interface
249        uri='%s__%s__%s'%(repository,schema,localID)
250        fileName=uri+'.xml'
251        return self.getDoc(fileName,format)
252       
253       
254
Note: See TracBrowser for help on using the repository browser.