source: exist/trunk/python/ndgUtils/ndgSearch.py @ 3127

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/exist/trunk/python/ndgUtils/ndgSearch.py@3127
Revision 3127, 7.4 KB checked in by lawrence, 12 years ago (diff)

This version of ndgUtils does build to an egg. Next we need to
get dif2moles round trip testing working ... and solve
the related URL problem, and then we can work on the changes
to ndgObject which arise ... before trying to replace
the ows_server code.

Line 
1# these imports are autogenerated by wsdl2py ...
2from DiscoveryService_services_types import *
3from DiscoveryService_services import *
4import os.path
5
6#normal imports
7import sys, time
8try: #python 2.5
9    from xml.etree import ElementTree as ET
10except ImportError:
11    try:
12        # if you've installed it yourself it comes this way
13        import ElementTree as ET
14    except ImportError:
15        # if you've egged it this is the way it comes
16        from elementtree import ElementTree as ET
17   
18from ETxmlView import loadET,et2text
19
20from xml.dom import expatbuilder
21
class ExpatReaderClass:
    ''' Reader adapter handed to the generated service port (see ndgSearch.__init__):
    exposes the expatbuilder parsers under the names fromStream and fromString '''
    # parse from a file name or an open file object
    fromStream = staticmethod(expatbuilder.parse)
    # parse from an in-memory string
    fromString = staticmethod(expatbuilder.parseString)
25 
26
class ndgSearch:
    ''' Provides a client to the NDG discovery services methods exposed by the Discovery.wsdl.

    State left behind by the most recent search() call:
        documents - the document names returned by the search (None if none/failed)
        hits      - the hit count reported by the service
        status    - one element list holding the service status message
        response  - the raw response object
        serverSessionID - the result id of the last successful search
    '''

    def __init__(self,logger=None,tracefile=None):
        ''' Get an instance of the service.
        logger, if given, is used (via logger.info) to report request timings;
        tracefile is passed straight through to the generated port type. '''
        #how do we get a different backend provider?
        loc=DiscoveryServiceLocator()
        self.server=loc.getDiscoveryServicePortType(readerclass=ExpatReaderClass,tracefile=tracefile)
        self.serverSessionID=None
        self.logger=logger
        self.__reset()

    def __reset(self):
        ''' Initialise and/or make sure we have no hangovers from a previous call '''
        self.documents=None
        self.hits=None
        self.error=None
        self.response=None
        self.status=None

    def __stripXmlSuffix(self,fileName):
        ''' Return fileName without a trailing ".xml" (and unchanged if it has none).
        NB: str.rstrip('.xml') is NOT a suffix strip, it removes any trailing run of
        the characters ".", "x", "m", "l" (so "higem.xml" would become "hige"). '''
        if fileName.endswith('.xml'):
            return fileName[:-len('.xml')]
        return fileName

    def search(self,term,start=1,howmany=20,target='All',scope=None,bbox=None,dateRange=None,geoSearchType='overlaps'):
        ''' Get a list of documents for a specific term using a free text search.

        term     - the search term
        start, howmany - copied into the request Start and HowMany fields
        target   - one of 'Authors', 'Params' or 'All' (anything else raises KeyError)
        scope    - optional, copied into the request Scope
        bbox     - optional sequence of four values, converted to float and assigned
                   in the order LimitNorth, LimitWest, LimitEast, LimitSouth
        dateRange - optional pair (startDate,endDate); each date is indexed [0],[1],[2]
                   and sent reversed as (d[2],d[1],d[0],0,0,0,0)
                   (presumably (day,month,year) in -> time tuple out; confirm with callers)
        geoSearchType - spatial operator, only sent when bbox is given

        Returns self.documents: the document list on success, None otherwise. '''

        self.__reset()
        request=doSearchRequest()
        request.Term=term
        request.Start=start
        request.HowMany=howmany
        request.TermType={'Authors':'author','Params':'parameter','All':'fullText'}[target]

        if bbox is not None:
            box=request.new_BoundingBox()
            box.LimitNorth,box.LimitWest,box.LimitEast,box.LimitSouth=map(float,bbox)
            request.BoundingBox=box
            request.SpatialOperator=geoSearchType

        if scope is not None:
            request.Scope=scope

        if dateRange is not None:
            # local names deliberately differ from the "start" argument so that
            # the paging parameter is never shadowed
            first,last=dateRange[0],dateRange[1]
            dRange=request.new_DateRange()
            dRange.DateRangeStart=(int(first[2]),int(first[1]),int(first[0]),0,0,0,0)
            dRange.DateRangeEnd=(int(last[2]),int(last[1]),int(last[0]),0,0,0,0)
            request.DateRange=dRange

        # only bother timing the call when there is somewhere to report it
        if self.logger: itime=time.time()
        response=self.server.doSearch(request)
        if self.logger:
            etime=time.time()-itime
            self.logger.info('Search Request [%s] took [%ss]'%(term,etime))

        if response._status:
            self.serverSessionID=response._resultId
            self.documents=response.Documents.Document
        # the hit count is recorded whether or not the search succeeded
        self.hits=response.Hits
        self.status=[response._statusMessage,]
        self.response=response
        return self.documents

    def getDoc(self,document,format='original'):
        ''' Return a single document from the backend database.
        Raises ValueError (carrying the service status message) if the
        service reports a failure. '''

        #create a request object
        request=doPresentRequest()
        #get an instance of the Documents holder class
        DocList=request.new_documents()
        request.Documents=DocList
        DocList.Document=[document,]
        request.Format=format
        searchResult=self.server.doPresent(request)
        if not searchResult._status:
            raise ValueError('Error retrieving [%s] was [%s]'%(document,searchResult._statusMessage))
        return searchResult.Documents.Document[0]

    def getAllDocs(self,format='original'):
        ''' Retrieve, in one request, every document named by the last search.
        Returns [] if there are no documents to fetch; raises ValueError
        (carrying the service status message) if the service reports a failure. '''

        if self.documents is None: return []
        if self.logger: itime=time.time()

        #create a request object
        request=doPresentRequest()
        #get an instance of the Documents holder class
        DocList=request.new_documents()
        request.Documents=DocList
        DocList.Document=self.documents
        request.Format=format
        result=self.server.doPresent(request)
        if not result._status:
            raise ValueError('Error retrieving [%s] was [%s]'%
                        (self.documents,result._statusMessage))
        docs=result.Documents.Document
        if self.logger:
            etime=time.time()-itime
            self.logger.info('Document Load [n=%s] took [%ss]'%(len(self.documents),etime))

        return docs

    def __xmlerror(self,docmessage):
        ''' Build an <Error><Document/><Message/></Error> element describing a failure.
        docmessage is either a (document,message) pair or a bare message.
        The element is constructed directly rather than parsed from a formatted
        string: the message is typically the very text that failed to parse, so
        re-parsing it would fail again. '''
        if isinstance(docmessage,(tuple,list)) and len(docmessage)==2:
            document,message=docmessage
        else:
            document,message='',docmessage
        print(docmessage)
        error=ET.Element('Error')
        ET.SubElement(error,'Document').text='%s'%(document,)
        ET.SubElement(error,'Message').text='%s'%(message,)
        return error

    def getDocElement(self,document):
        ''' Takes a document path (maybe from a previous call to ndgSearch) and extracts that document
        into an ElementTree instance. If the document cannot be loaded an <Error> element
        (see __xmlerror) is returned instead. '''
        #we stick it straight into element tree because we need to use et to get the actual document
        #we want, not the envelope xml elements

        doc=self.getDoc(document)
        path=self.__stripXmlSuffix(document)
        try:
            return loadET(doc)
        except Exception:
            return self.__xmlerror((path,doc))

    def getAllDocsAsElements(self):
        ''' Get all the documents and load them into a list of ET instances.
        A document that cannot be loaded contributes an <Error> element
        (see __xmlerror) in its place, so the list keeps its order and length. '''
        result=[]
        names=self.documents
        if names is None: names=[]
        index=0
        for doc in self.getAllDocs():
            try:
                result.append(loadET(doc))
            except Exception:
                # label the error with the matching document name when we have one
                name=''
                if index<len(names): name=self.__stripXmlSuffix(names[index])
                result.append(self.__xmlerror((name,doc)))
            index+=1
        return result

    def getLabelledDocs(self,format='original'):
        ''' Returns all the documents in sequence as a list of (name,document) pairs,
        where name is the document file name without its extension.
        Returns [] when the last search had no hits or produced no document list;
        raises ValueError if the number of documents returned differs from
        the number requested. '''
        if self.hits==0 or self.documents is None: return []
        responses=self.getAllDocs(format)
        filenames=self.documents
        i=len(filenames)
        j=len(responses)
        if i!=j:
            print(filenames)
            raise ValueError('Internal inconsistency in search return [hits:%s!=responses:%s]'%(i,j))
        # splitext rather than strip('.xml'): 'higem.xml'.strip('.xml') gives 'hige'
        return [(os.path.splitext(name)[0],doc) for name,doc in zip(filenames,responses)]

    def get(self,repository,schema,localID,format='DIF',**kw):
        ''' Obtain a document via its NDG id split up into repository, schema and localID.
        Extra keyword arguments are accepted and ignored. '''
        #nb argument targetCollection is here to provide same API as exist xmlrpc interface
        uri='%s__%s__%s'%(repository,schema,localID)
        fileName=uri+'.xml'
        return self.getDoc(fileName,format)
202       
203       
204
Note: See TracBrowser for help on using the repository browser.