source: ndgCommon/trunk/ndg/common/src/dal/DocumentRetrieve.py @ 4834

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/ndgCommon/trunk/ndg/common/src/dal/DocumentRetrieve.py@4834
Revision 4834, 7.2 KB checked in by cbyrom, 11 years ago (diff)

Add support for retrieving DIF documents produced in provider eXist DB

  • i.e. not via the discovery service - to allow retrieval of DIF docs produced when atom docs are published; this will aid harvesting of info from feeds.

Line 
1'''
2 Extend eXistInterface class - to add document retrieval functionality
3 
4 @author: B Lawrence?
5'''
6from ndg.common.src.clients.xmldb.eXist.eXistInterface import ndg_eXist
7from ndg.common.src.lib.ndgXqueries import ndgXqueries
8from ndg.common.src.models.ndgObject import ndgObject as no
9from xml.etree import ElementTree as ET
10import urllib2, logging, socket
11
12
def httpify(url):
    '''
    Ensure a url has an http(s) scheme prefix

    @param url: url, or bare host/path, to check
    @return: url unchanged if it already starts with 'http://' or 'https://'
    (compared case-insensitively); otherwise url with 'http://' prepended
    '''
    # NB, test for the full scheme separator rather than just the first four
    # characters - otherwise hosts such as 'httpd.example.org' are mistaken
    # for urls that already carry a scheme, and 'HTTP://...' gets prefixed twice
    if not url.lower().startswith(('http://', 'https://')):
        url = 'http://' + url
    return url
20   
21       
class DocumentRetrieve(ndg_eXist):
    '''
    This class provides a document retrieval service via the NDG exist interfaces
    '''

    ATOM_TYPE = 'ATOM-TYPE'
    ATOM_BACKUP_TYPE = 'ATOM-BACKUP'
    ATOM = 'ATOM'

    def __init__(self, repository, pwfile='passwords.txt'):
        '''
        Set up the DB connection and the table of supported schemas
        @param repository: passed to ndg_eXist as the db to connect to
        @keyword pwfile: passed to ndg_eXist as the password file
        '''
        logging.info("Using repository, '%s'", repository)

        ndg_eXist.__init__(self, db=repository, passwordFile=pwfile)
        logging.info("DB connection initialised")
        self.repository = repository
        self.xq = ndgXqueries()
        # schema name -> XQuery type name handed to ndgXqueries.actual()
        self.knownQueries = {
            'DIF': 'moles2dif',
            'DC': 'moles2DC',
            'ISO19139': 'moles2iso19139',
            'NDG-B0': 'moles',
            'NDG-B1': 'molesObjectType',
            'MDIP': 'moles2mdip',
            'NDG-A0': 'csml',
            'NumSim': 'numsim',
            self.ATOM: 'atom',
            self.ATOM_BACKUP_TYPE: 'atom',
            self.ATOM_TYPE: 'atomTypeList',
            no.BROWSE_DIF_DOC_TYPE: 'dif',
        }

    def _retrieveDoc(self, schema, xqtype, targetCollection, repository, localID):
        '''
        Retrieve doc using specified XQuery type
        @param schema: schema name; only used to relax the single hit check
        for ATOM_BACKUP_TYPE and to build the error message
        @param xqtype: XQuery type name to pass to ndgXqueries.actual()
        @param targetCollection: collection to run the query against
        @param repository: repository ID to look up
        @param localID: local ID of doc to look up
        @return: docName, docContents
        @raise ValueError: if the query does not return exactly one hit and
        the schema is not ATOM_BACKUP_TYPE
        '''
        logging.debug("Retrieving doc - type, '%s', coll, '%s', rep:'%s', localID:'%s'",
                      xqtype, targetCollection, repository, localID)
        xquery = self.xq.actual(xqtype, targetCollection, repository, localID)

        sessionID, summary = self.executeQuery(xquery)
        # NB, release the query session whatever happens below - otherwise a
        # failed hit check or retrieve leaves the session open
        try:
            # NB, backups will inevitably return lots of docs - only retrieve the top one
            # for the moment - since this is not really needed atm
            if summary['hits'] != 1 and schema != self.ATOM_BACKUP_TYPE:
                raise ValueError('Unable to obtain single %s document [%s] (hits=%s)'
                                 % (schema, localID, summary['hits']))

            docName = summary['documents'][0][0]
            docContents = self.retrieve(sessionID, 0, {})
        finally:
            self.sessionRelease(sessionID)
        return docName, docContents

    def get(self, repository, schema, localID, targetCollection='/db/discovery/moles',
            includeDocNameData=False):
        '''
        Retrieve the document with the given ID in the given schema
        @param repository: repository ID of the doc
        @param schema: schema to return the doc in; must be a key of knownQueries
        @param localID: local ID of the doc
        @keyword targetCollection: collection to run the query against
        @keyword includeDocNameData: if True, a dictionary is returned, instead of the dataset, with the key
        being the name of the document and the entry being the dataset
        @raise TypeError: if schema is not a key of knownQueries
        @raise KeyError: if schema is 'NDG-B1' and the object type looked up
        for the doc is not one of 1 to 4
        @raise ValueError: if the doc cannot be retrieved (see _retrieveDoc)
        '''
        logging.debug("Get called with rep:'%s', schema:'%s', localID:'%s', collection:'%s'",
                      repository, schema, localID, targetCollection)
        if schema not in self.knownQueries:
            raise TypeError('Unknown Schema "%s" in URI' % schema)

        xqtype = self.knownQueries[schema]

        if schema == 'NDG-B1':
            # this is a general moles object - so need to further establish the type of moles doc
            # it is to get the correct XQuery to use
            typeDoc = self._retrieveDoc(schema, xqtype, targetCollection,
                                        repository, localID)[1]
            # NB, an empty result element is treated as type 0 - which is not
            # in the lookup below, so leads to a KeyError
            otype = int(ET.fromstring(typeDoc).text or 0)

            xqtype = {4: 'stubB_dataEntity',
                      3: 'stubB_observationStation',
                      2: 'stubB_DPT',
                      1: 'stubB_activity'}[otype]

        docName, docContents = self._retrieveDoc(schema, xqtype, targetCollection,
                                                 repository, localID)
        if includeDocNameData:
            return {docName: docContents}
        return docContents

    def error(self, string, t, r, s, l):
        '''
        Raise a ValueError with message, '<string> for <r>:<s>:<l> in <t>'
        @raise ValueError: always
        '''
        raise ValueError(string + ' for %s:%s:%s in %s' % (r, s, l, t))
101
102           
class genericHTTP(object):
    ''' Provides a generic HTTP request class '''
    def __init__(self,proxyServer=None):
        '''
        Build the url opener used by get()
        @keyword proxyServer: proxy to use for http requests ('http://' is
        prefixed, via httpify, if missing); if None, the opener is built with
        an empty proxy map
        '''
        if proxyServer is None:
            proxies = {}
        else:
            proxies = {'http': httpify(proxyServer)}
        self.opener = urllib2.build_opener(urllib2.ProxyHandler(proxies))

    def get(self,url):
        '''
        Retrieve the contents of a url
        @param url: url to get ('http://' is prefixed, via httpify, if missing)
        @return: the data read from the response
        @raise IOError: if the url cannot be opened; the message starts with
        'Cannot obtain remote file: ' followed by the cause
        '''
        url = httpify(url)
        request = urllib2.Request(url)
        logging.info("Getting data from url: %s", url)
        problem = 'Cannot obtain remote file: '
        try:
            remote = self.opener.open(request)
        except urllib2.URLError as e:
            # NB, HTTPError is a URLError subclass which carries a code and,
            # on later python versions, a reason too - so check for the code
            # first, otherwise the response code is never reported
            if hasattr(e, 'code'):
                problem += 'Response code [%s]' % e.code
            elif hasattr(e, 'reason'):
                problem += 'No access to server [%s]' % e.reason
            raise IOError(problem)
        except socket.error:
            raise IOError(problem + 'Network Socket problem')
        except Exception as e:
            # any other failure to open is also reported as an IOError
            raise IOError(problem + '[%s]' % str(e))

        # ensure the connection is closed once the data has been read
        try:
            return remote.read()
        finally:
            remote.close()
135
136           
class ndgHTTP(genericHTTP):
    ''' Fetches xml documents from a remote NDG repository over http '''
    def __init__(self,remoteHost,proxyServer=None):
        '''
        @param remoteHost: host (and any base path) of the remote repository;
        '/retrieve/...' is appended to this to build doc urls
        @keyword proxyServer: handed on to genericHTTP
        '''
        genericHTTP.__init__(self, proxyServer)
        self.remoteHost = remoteHost

    def uriget(self,uri):
        ''' Return the remote ndg document identified by an NDG uri '''
        # NB, this import is deliberately done here - at the module level it
        # can cause problems with resolving imports when using this module,
        # e.g. from the ndgObject level
        from ndg.common.src.models.ndgObject import ndgObject
        parsed = ndgObject(uri)
        return self.get(parsed.repository, parsed.schema, parsed.localID)

    def get(self,repository,schema,localID,**kw):
        '''
        Return a remote ndg document
        NB, any additional keywords are accepted but not used
        '''
        #TODO what about security? Probably means we need to get the headers of our responses sorted ...
        urlParts = (self.remoteHost, repository, schema, localID)
        docUrl = '%s/retrieve/%s__%s__%s' % urlParts
        return genericHTTP.get(self, docUrl)

    def setSecurity(self,location,usercode,password):
        '''
        Use a usercode password to set security credentials at a specific location
        NB, not implemented - currently does nothing
        '''
160
161   
class ndgVocabPOX(genericHTTP):
    ''' Provides a POX interface to the vocab server '''

    # namespace, in ElementTree notation, of the match elements in responses
    _TYPES_NS = '{urn:vocab/types}'

    def __init__(self,path="http://vocab.ndg.nerc.ac.uk/axis2/services/vocab/",proxyServer=None):
        '''
        @keyword path: base url of the vocab service; the operation name is
        appended directly to this, so it should end with a '/'
        @keyword proxyServer: handed on to genericHTTP
        '''
        genericHTTP.__init__(self,proxyServer)
        # NB, use the path supplied by the caller rather than always falling
        # back on the default server
        self.path = path
        self.ns = "http://vocab.ndg.nerc.ac.uk/"

    def _entryTerms(self, root, matchType):
        '''
        Return the entryTerm text ('' if empty) of each element of the given
        match type found two levels below the root element
        '''
        termTag = self._TYPES_NS + 'entryTerm'
        matches = root.findall('*/%s%s' % (self._TYPES_NS, matchType))
        return [(match.find(termTag).text or '') for match in matches]

    def getRelated(self,subject):
        '''
        Get a related record
        @param subject: text to look up; inserted into the query string as is
        @return: list of three lists of terms - [broader, narrower, synonyms];
        these, and the url used, are also stored on the instance
        @raise IOError: if the vocab server cannot be accessed
        '''
        # NOTE(review): subject is not url-escaped here - presumably callers
        # pass text that is already safe to use in a query string; confirm
        url = '%sgetRelatedRecordByCriteria?subjectText=%s&predicate=255&inferences=True&objectList=%slist/P211/current'%(self.path,subject,self.ns)
        self.url = url
        doc = genericHTTP.get(self,url)
        root = ET.fromstring(doc)
        self.broader = self._entryTerms(root, 'broadMatch')
        self.narrower = self._entryTerms(root, 'narrowMatch')
        self.synonyms = self._entryTerms(root, 'exactMatch')
        return [self.broader,self.narrower,self.synonyms]
181
Note: See TracBrowser for help on using the repository browser.