source: TI05-delivery/ows_framework/trunk/ows_server/ows_server/models/DocumentRetrieve.py @ 3070

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/TI05-delivery/ows_framework/trunk/ows_server/ows_server/models/DocumentRetrieve.py@3070
Revision 3070, 8.4 KB checked in by lawrence, 12 years ago (diff)

Support for synonyms in semantic search

Line 
1from eXistInterface import ndg_eXist
2from ndg_xqueries import *
3from xml.dom import minidom
4try:
5    from xml.etree import ElementTree as ET
6except:
7    import ElementTree as ET
8import urllib2
9#from stripped_xqueries import strip_de_xquery
10
11# The MOLES document retrieval is a python port of :
12# TI07-MOLES/trunk/JavaCode/returnmolesxmldb/ndg/services/returnmoles/Main.java
13# Note that ndgRetreive essentially provides test cases for this code.
14
# Module-wide diagnostics switch.  Any true value makes DocumentRetrieve.get
# print the xqueries it builds and the object type it finds, and dump a
# document xquery that does not return exactly one hit to a local
# 'xquery.<format>.fails.xq' file.
debug = 0
16
def queryReplace(xquery,repository,localID,targetCollection):
    ''' Turn a template xquery into one that targets a specific document.

    The first 'RepositoryID' and the first 'LocalID' placeholders are
    swapped for repository and localID; every 'TargetCollection'
    placeholder is swapped for targetCollection.  The substitutions are
    applied one after another, in that order, and the resulting string is
    returned. '''
    return (xquery
            .replace('RepositoryID',repository,1)
            .replace('LocalID',localID,1)
            .replace('TargetCollection',targetCollection))
23   
class DocumentRetrieve (ndg_eXist):
    ''' This class provides a document retrieval service via the NDG exist interfaces.

    Every result set obtained from executeQuery is handed back through
    sessionRelease once it has been used, including when an error is raised. '''

    def __init__(self,repository,pwfile='passwords.txt'):
        ''' Initialise the ndg_eXist base with repository as its db and pwfile
        as its password file, and remember repository on the instance. '''
        ndg_eXist.__init__(self,db=repository,passwordFile=pwfile)
        self.repository=repository

    def get(self,repository,schema,localID,format='NDG-B0',targetCollection='/db/discovery/moles'):
        ''' Return the document identified by repository, schema and localID.

        schema 'NDG-A0' or 'NumSim':
            the csml/numsim xquery is run and its single hit returned;
            format is not consulted.
        schema starting with 'NDG-B', 'DIF' or 'MDIP':
            the object type of the entry is looked up first, then the xquery
            matching format (one of DIF, DC, ISO19139, NDG-B0, NDG-B1, MDIP)
            is run and its single hit returned.

        Raises ValueError when a query does not return exactly one hit or
        when the stored object type is not one of 1-4, and TypeError for an
        unknown schema, an invalid format, or a non-NDG output format
        requested for an object that is not of type 4. '''
        # We are making the assumption for now that everything is stored as
        # MOLES documents and that we can retrieve from MOLES anything we like
        # in another format.
        if schema in ['NDG-A0','NumSim']:
            return self._getBySchemaQuery(repository,schema,localID,targetCollection)
        if schema[0:5]=='NDG-B' or schema[0:3]=='DIF' or schema[0:4]=='MDIP':
            return self._getFromMoles(repository,localID,format,targetCollection)
        raise TypeError('Unknown Schema "%s" in URI'%schema)

    def _getBySchemaQuery(self,repository,schema,localID,targetCollection):
        ''' Fetch an NDG-A0 or NumSim document with its dedicated xquery. '''
        xquery={'NDG-A0':csmlQuery,'NumSim':numsimQuery}[schema]
        # only the NumSim template gets the repository substituted
        if schema=='NumSim': xquery=xquery.replace('RepositoryID',repository)
        xquery=xquery.replace('TargetCollection',targetCollection)
        xquery=xquery.replace('LocalID',localID)
        if debug: print(xquery)
        resultID,summary=self.executeQuery(xquery)
        # NOTE(review): assumes sessionRelease accepts the id of a result set
        # with zero or several hits - confirm against ndg_eXist.
        try:
            if summary['hits']!=1:
                raise ValueError('Unable to obtain %s document [%s] (hits=%s)'%(schema,localID,summary['hits']))
            return self.retrieve(resultID,0,{})
        finally:
            self.sessionRelease(resultID)

    def _getFromMoles(self,repository,localID,format,targetCollection):
        ''' Fetch a document held as MOLES, rendered in the requested format. '''
        # find out what type of object actually exists of this sort
        # 0 - None, 1 - Activity, 2 - DPT, 3 - ObsStn, 4 - DE
        # if the schema is a DIF, we expect to find a DE from the DIF ingestion to MOLES ...
        #
        # (listingQuery can be used instead to generate a listing of the
        # database contents ...)
        xquery=queryReplace(ObjectTypeQuery,repository,localID,targetCollection)
        resultID,summary=self.executeQuery(xquery)
        try:
            hits=summary['hits']
            if hits!=1:
                raise ValueError('%s documents returned for uri %s:%s:%s (in %s)'%(hits,repository,'NDG-B0',localID,targetCollection))

            # check output formats
            allowed=['DIF','DC','ISO19139','NDG-B0','NDG-B1','MDIP']
            if format not in allowed:
                raise TypeError('Invalid document output format [%s]'%format)

            r=self.retrieve(resultID,0,{})
        finally:
            # this result set used to be orphaned when the id was reused
            # for the document query below
            self.sessionRelease(resultID)

        # typical response looks like this:
        #        <objectType xmlns="http://ndg.nerc.ac.uk/moles">1</objectType>
        xml=minidom.parseString(r)
        objectType=int(xml.getElementsByTagNameNS(
                        "http://ndg.nerc.ac.uk/moles", "objectType").item(0).firstChild.data)
        if debug: print('objectType=%s'%objectType)

        stubQueries={4:StubBDEQuery,
                     3:StubBObsStnQuery,
                     2:StubBDPTQuery,
                     1:StubBActQuery}
        # Anything outside 1-4 (e.g. 0, "None") used to surface as a bare
        # KeyError from the lookups below; report it explicitly instead.
        if objectType not in stubQueries:
            raise ValueError('Unexpected object type [%s] for uri %s:%s:%s (in %s)'%(objectType,repository,'NDG-B0',localID,targetCollection))

        # only type 4 objects can be rendered in the non-NDG formats
        if format.find('NDG')==-1 and objectType!=4:
            msg={1:'Activity',2:'Data Production Tool',3:'Observation Station'}[objectType]
            raise TypeError('Document URI type [%s] not valid for output format [%s]'%(msg,format))

        # select the correct xquery
        if format=='NDG-B1':
            xquery=stubQueries[objectType]
        else:
            xquery={'DIF':DIFQuery,
                    'DC':DublinCoreDEQuery,
                    'ISO19139':ISO19139Query,
                    'NDG-B0':MOLESQuery,
                    'MDIP':MDIPQuery}[format]
        xquery=queryReplace(xquery,repository,localID,targetCollection)

        try:
            resultID,summary=self.executeQuery(xquery)
        except Exception:
            # show the offending query, then let the original error (and
            # its traceback) propagate untouched
            print(xquery)
            raise

        try:
            hits=summary['hits']
            # should only be the one document in the result set
            if hits!=1:
                if debug:
                    f=open('xquery.%s.fails.xq'%format,'w')
                    try:
                        f.write(xquery)
                    finally:
                        f.close()
                raise ValueError('Actual Document Query returned [%s] hits - internal error!'%hits)
            # now let's get it and return it
            return self.retrieve(resultID,0,{})
        finally:
            self.sessionRelease(resultID)
123           
class genericHTTP(object):
    ''' Provides a generic HTTP request class '''
    def __init__(self,proxyServer=None):
        ''' Build the urllib2 opener used by get.

        proxyServer is an optional proxy address for http requests; "http://"
        is prepended when the value does not already start with "http".
        With no proxyServer the opener is built with an empty proxy map. '''
        if proxyServer is None:
            proxyHandler=urllib2.ProxyHandler({})
        else:
            proxy=proxyServer
            if proxy[0:4]!='http':proxy='http://'+proxy
            proxyHandler=urllib2.ProxyHandler({'http':proxy})
        self.opener=urllib2.build_opener(proxyHandler)

    def get(self,url):
        ''' Return the body found at url.

        Any failure to open the url is reported as an IOError whose message
        starts with "Error obtaining remote file: ". '''
        # socket was referenced in the except clause below without ever being
        # imported, which turned every non-URLError failure into a NameError.
        import socket
        # sys.exc_info() fetches the active exception in a way that parses
        # on every Python version this module may run under.
        import sys
        request=urllib2.Request(url)
        response='Error obtaining remote file: '
        try:
            f=self.opener.open(request)
        except urllib2.URLError:
            e=sys.exc_info()[1]
            # Test 'code' first: an HTTPError is a URLError too and may carry
            # both attributes, in which case the status code is the more
            # useful thing to report.
            if hasattr(e,'code'):
                response+='Response code [%s]'%e.code
            elif hasattr(e,'reason'):
                response+='No access to server [%s]'%e.reason
        except socket.error:
            response+='Network Socket problem'
        except Exception:
            response+='[%s]'%str(sys.exc_info()[1])
        else:
            # opened fine: read the payload and always close the connection
            try:
                return f.read()
            finally:
                f.close()
        raise IOError(response)
154           
class ndgHTTP(genericHTTP):
    ''' Fetches xml documents from a remote NDG repository over HTTP '''
    def __init__(self,remoteHost,proxyServer=None):
        ''' remoteHost is the prefix of every url requested by get;
        proxyServer is passed straight on to genericHTTP. '''
        genericHTTP.__init__(self,proxyServer)
        self.remoteHost=remoteHost

    def uriget(self,uri):
        ''' Split uri with ndgObject and fetch the document it names. '''
        # NOTE(review): ndgObject is not imported by name in this module;
        # presumably it arrives through one of the star imports - confirm.
        target=ndgObject(uri)
        return self.get(target.repository,target.schema,target.localID)

    def get(self,repository,schema,localID,format='NDG-B0',targetCollection=None):
        ''' Request <remoteHost>/view/<repository>__<schema>__<localID> in raw
        form with outputSchema set to format, and return the response body.

        targetCollection is accepted only for compatibility with the API; it
        is not used in the restful get. '''
        #TODO what about security? Probably means we need to get the headers of our responses sorted ...
        #http://docs.python.org/lib/urllib2-examples.html
        parts=(self.remoteHost,repository,schema,localID,format)
        return genericHTTP.get(self,'%s/view/%s__%s__%s?format=raw&outputSchema=%s'%parts)

    def setSecurity(self,location,usercode,password):
        ''' Use a usercode password to set security credentials at a specific location
        (placeholder: currently does nothing) '''
        pass
175   
class ndgVocabPOX(genericHTTP):
    ''' Provides a POX interface to the vocab server '''
    def __init__(self,path="http://vocab.ndg.nerc.ac.uk/axis2/services/vocab/",proxyServer=None):
        ''' path is the base url of the vocab service (a trailing "/" is added
        when missing); proxyServer is passed on to genericHTTP. '''
        genericHTTP.__init__(self,proxyServer)
        # honour the caller's path: it used to be ignored in favour of a
        # hard-coded copy of the default
        if not path.endswith('/'): path+='/'
        self.path=path
        self.ns="http://vocab.ndg.nerc.ac.uk/"

    def getRelated(self,subject):
        ''' Get the records related to subject.

        Returns [broader, narrower, synonyms], three lists holding the
        entryTerm text of the broadMatch, narrowMatch and exactMatch elements
        of the response.  The lists are also kept on self.broader,
        self.narrower and self.synonyms, and the url requested on self.url. '''
        # NOTE(review): subject goes into the query string as-is; confirm
        # whether callers url-quote it beforehand.
        url='%sgetRelatedRecordByCriteria?subjectText=%s&predicate=255&inferences=True&objectList=%slist/P211/current'%(self.path,subject,self.ns)
        self.url=url
        doc=genericHTTP.get(self,url)
        root=ET.fromstring(doc)
        self.broader=self._entryTerms(root,'broadMatch')
        self.narrower=self._entryTerms(root,'narrowMatch')
        self.synonyms=self._entryTerms(root,'exactMatch')
        return [self.broader,self.narrower,self.synonyms]

    def _entryTerms(self,root,relation):
        ''' List the entryTerm text of every <relation> element found one
        level below root's children; a missing or empty entryTerm gives ''. '''
        types='{urn:vocab/types}'
        return [(match.findtext(types+'entryTerm') or '')
                for match in root.findall('*/'+types+relation)]
Note: See TracBrowser for help on using the repository browser.