source: TI01-discovery/branches/ingestAutomation-upgrade/OAIBatch/RecordCollection.py @ 3800

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/TI01-discovery/branches/ingestAutomation-upgrade/OAIBatch/RecordCollection.py@3800
Revision 3800, 3.6 KB checked in by cbyrom, 11 years ago (diff)

Upgraded version of the ingest code branch - including a major refactoring of the ingest
scripts to make them more object-oriented, allowing re-use and simplification of the code, and removing the
reliance on the eXist DB to store data; all data will now be stored in, and looked up from, the Postgres DB

Line 
1#!/usr/bin/env python
2'''
3Class that wraps a collection of metadata records - to allow re-use of the various shared
4 xqueries, etc
5C Byrom Apr 08
6'''
7try: #python 2.5
8    from xml.etree import ElementTree as ET
9except ImportError:
10    try:
11        # if you've installed it yourself it comes this way
12        import ElementTree as ET
13    except ImportError:
14        # if you've egged it this is the way it comes
15        from elementtree import ElementTree as ET
16
17from ETxmlView import loadET, nsdumb
18import molesReadWrite as MRW
19from ndgUtils.ndgXqueries import ndgXqueries
20import os, sys
21
class RecordCollection:
    '''
    Wraps the documents contained in a particular metadata collection so that
    the shared xquery handling can be re-used.

    @param targetCollection: The collection to use, as stored in eXist
    @param ndgDataProvider: If True, data has come from a NDG dataprovider, otherwise False
    @param host: Host handed to ndgDirectory when the collection is listed
    @param datacentre_format: Format of the collection's records; handed to
        ndgDirectory as docType.  Anything other than 'DIF' triggers an error
        message in processCollection
    '''
    def __init__(self, targetCollection, ndgDataProvider, host, datacentre_format):
        self._targetCollection = targetCollection
        self._ndgDataProvider = ndgDataProvider
        # processCollection reads both of these from the instance, so they
        # have to be kept here rather than dropped after construction
        self._host = host
        self._datacentre_format = datacentre_format

        # NOTE(review): no record file is loaded here.  The constructor has no
        # filename argument to load from, and this class defines neither
        # getSpatioTemporalData nor doRecordTransforms, so per-record loading
        # and transformation cannot be driven from the collection wrapper.
        self._originalFormat = None

        # initialise the various record fields
        self.db_id = None    # the DB ID of the record, for easy reference when it is created
        self._molesFormat = None
        self._dcFormat = None

    def processCollection(self):
        '''
        Create a mini-moles document for every Discovery record in the
        collection and write each one to ./DIF2MOLES/<fileName>.

        For each member listed by ndgDirectory a 'dif2moles' xquery is built,
        executed through xmldb, and the retrieved result is written to disk.

        NOTE(review): ndgDirectory, ndgObject, xq, xmldb and
        datacentre_namespace are not defined or imported in the code visible
        here; they must be available at module level for this method to run.
        '''
        datacentre_format = self._datacentre_format
        targetCollection = "/db/discovery/original/" + datacentre_format + "/" + datacentre_namespace

        if datacentre_format != 'DIF':
            print('ERROR: mini-moles creation does not handle MDIP yet! So this WILL FAIL (probably)')

        ndgDir = ndgDirectory(self._targetCollection, self._host, docType=datacentre_format)

        # make sure the output directory is there before the first write
        outdir = './DIF2MOLES'
        if not os.path.isdir(outdir):
            os.makedirs(outdir)

        # create the mini-moles for each Discovery record in the collection
        for member in ndgDir.members:
            filename = member['fileName']
            disc_id = member['EntryID']
            print("INFO: internal id = %s" % disc_id)
            print("INFO: discovery filename = %s" % filename)

            # now create the xquery, sorting out the output ID stuff ...
            if self._ndgDataProvider:
                discObj = ndgObject(disc_id)
                xquery = xq.actual('dif2moles', targetCollection,
                                   discObj.repository, discObj.localID)
            else:
                xquery = xq.actual('dif2moles', targetCollection,
                                   datacentre_namespace, disc_id)

            # ... and then the input ID stuff
            xquery = xquery.replace('Input_Entry_ID', disc_id)
            xquery = xquery.replace('repository_localid', datacentre_namespace)

            molesid, _ = xmldb.executeQuery(xquery)
            moles_from_dif = xmldb.retrieve(molesid, 0)

            # now write out xml to file; try/finally keeps this usable on
            # Python 2.5 while still closing the handle if the write fails
            f = open(outdir + "/" + filename, 'w')
            try:
                f.write(moles_from_dif)
            finally:
                f.close()
Note: See TracBrowser for help on using the repository browser.