source: TI01-discovery/branches/ingestAutomation-upgrade/OAIBatch/SpaceTimeIngestFromMOLES.py @ 3821

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/TI01-discovery/branches/ingestAutomation-upgrade/OAIBatch/SpaceTimeIngestFromMOLES.py@3821
Revision 3821, 9.6 KB checked in by cbyrom, 11 years ago (diff)

Fix a few problems - including referencing the xquery libraries; these
have now been added to the ndgUtils egg and are extracted locally and
referenced directly. Also add functionality to deal with the moles -> other
transforms + add new utility methods and tidy up code and add more logging.

Line 
1#!/usr/bin/env python
2try: #python 2.5
3    from xml.etree import ElementTree as etree
4except ImportError:
5    try:
6        # if you've installed it yourself it comes this way
7        import elementtree.ElementTree as etree
8#        import ElementTree as etree
9    except ImportError:
10        # if you've egged it this is the way it comes
11        from ndgUtils.elementtree import ElementTree as etree
12
13try: #python 2.5
14    from xml.etree import cElementTree
15except ImportError:
16    try:
17        # if you've installed it yourself it comes this way
18        import cElementTree
19    except ImportError:
20        # if you've egged it this is the way it comes
21        from ndgUtils.elementtree import cElementTree
22
23import molesReadWrite as MRW
24import sys
25import db_funcs
26import os
27
# Open the single database connection used by id_exists(), do_insert() and
# do_update() below. This runs at import time; how the connection is made
# lives in the separate db_funcs module.
connection = db_funcs.db_connect()
30
def listify(item):
    """Return ``item`` as a list.

    A list (or an instance of a list subclass) is returned unchanged, as the
    very same object; anything else, including None, tuples and strings, is
    wrapped in a new one-element list. The result is therefore always a list.
    """
    # isinstance rather than an exact type check so list subclasses are not
    # wrapped in a second list.
    if isinstance(item, list):
        return item
    return [item]
37
def id_exists(fileName):
    """Report whether the ``spatiotemp`` table already has a row for ``fileName``.

    fileName -- value compared against the table's ``id`` column.

    Returns True when the query yields at least one row, False otherwise.
    If the query itself fails, the error is printed and then re-raised, since
    no meaningful answer can be read from a cursor whose statement failed.
    """
    # The file name is passed as a bound parameter instead of being spliced
    # into the statement, so quotes in the name cannot change the SQL.
    # NOTE(review): assumes the DB-API driver behind db_funcs accepts the
    # '%s' placeholder style (psycopg2 and PyGreSQL's pgdb both do) - confirm.
    sql = "select id from spatiotemp where id = %s;"
    cursor = connection.cursor()
    try:
        cursor.execute(sql, (fileName,))
    except Exception:
        print("Error: database error %s %s" % sys.exc_info()[:2])
        raise
    return len(cursor.fetchall()) > 0
49
50
def do_insert(fileName,west,south,east,north,startdate,enddate):
    """Insert a new row for ``fileName`` into the ``spatiotemp`` table.

    fileName            -- value stored in the ``id`` column.
    west, south,
    east, north         -- bounding box limits in decimal degrees; if any of
                           them is the string 'null' the ``coordinates``
                           column is left out of the insert.
    startdate, enddate  -- date values; the sentinels 'nostartdate' and
                           'noenddate' mean "leave that column out".

    The statement and its parameters are printed before execution. On a
    database error the error is printed and the transaction rolled back;
    otherwise the insert is committed. Nothing is returned.
    """
    # Column names and their values are collected side by side so the two
    # lists can never get out of step, whichever optional fields are missing.
    columns = ["id"]
    values = ["%s"]
    params = [fileName]

    limits = (west, south, east, north)
    if 'null' not in [str(limit) for limit in limits]:
        # The sbox literal is written into the statement text, so make sure
        # every limit really is a plain number before doing that.
        try:
            for limit in limits:
                float(limit)
        except (TypeError, ValueError):
            print("Error: invalid bounding box limits %s, not inserting %s" % (limits, fileName))
            return
        columns.append("coordinates")
        values.append("sbox'((%sd , %sd), (%sd , %sd))'" % limits)

    if str(startdate) != 'nostartdate':
        columns.append("startdate")
        values.append("%s")
        params.append(startdate)
    if str(enddate) != 'noenddate':
        columns.append("enddate")
        values.append("%s")
        params.append(enddate)

    # NOTE(review): assumes the DB-API driver behind db_funcs accepts the
    # '%s' placeholder style (psycopg2 and PyGreSQL's pgdb both do) - confirm.
    sql = "INSERT INTO spatiotemp (%s) VALUES (%s);" % (", ".join(columns), ", ".join(values))
    params = tuple(params)
    print("%s %r" % (sql, params))

    cursor = connection.cursor()
    try:
        cursor.execute(sql, params)
    except Exception:
        print("Error: database error %s %s" % sys.exc_info()[:2])
        # Nothing useful to commit after a failed statement.
        connection.rollback()
        return
    connection.commit()
74
def do_update(fileName,west,south,east,north,startdate,enddate):
    """Update the existing ``spatiotemp`` row whose id is ``fileName``.

    fileName            -- id of the row to update.
    west, south,
    east, north         -- bounding box limits in decimal degrees; if any of
                           them is the string 'null' the ``coordinates``
                           column is not touched.
    startdate, enddate  -- date values; the sentinels 'nostartdate' and
                           'noenddate' mean "do not touch that column".

    ``update_time`` is always set to now(). The statement and its parameters
    are printed before execution. On a database error the error is printed
    and the transaction rolled back; otherwise the update is committed.
    Nothing is returned.
    """
    assignments = []
    params = []

    limits = (west, south, east, north)
    if 'null' not in [str(limit) for limit in limits]:
        # The sbox literal is written into the statement text, so make sure
        # every limit really is a plain number before doing that.
        try:
            for limit in limits:
                float(limit)
        except (TypeError, ValueError):
            print("Error: invalid bounding box limits %s, not updating %s" % (limits, fileName))
            return
        assignments.append("coordinates = sbox'((%sd , %sd), (%sd , %sd))'" % limits)

    if str(startdate) != 'nostartdate':
        assignments.append("startdate = %s")
        params.append(startdate)
    if str(enddate) != 'noenddate':
        assignments.append("enddate = %s")
        params.append(enddate)
    assignments.append("update_time = now()")

    # NOTE(review): assumes the DB-API driver behind db_funcs accepts the
    # '%s' placeholder style (psycopg2 and PyGreSQL's pgdb both do) - confirm.
    sql = "UPDATE spatiotemp SET " + ", ".join(assignments) + " WHERE id = %s;"
    params.append(fileName)
    params = tuple(params)
    print("%s %r" % (sql, params))

    cursor = connection.cursor()
    try:
        cursor.execute(sql, params)
    except Exception:
        print("Error: database error %s %s" % sys.exc_info()[:2])
        # Nothing useful to commit after a failed statement.
        connection.rollback()
        return
    connection.commit()
90
def _parse_limit(raw, positive_suffix, negative_suffix):
    """Normalise one bounding box limit to a signed decimal string.

    raw             -- the limit as read from the record, e.g. '10', '10E',
                       '10W' or '-10W'.
    positive_suffix -- hemisphere letter that leaves the sign alone ('E'/'N').
    negative_suffix -- hemisphere letter that makes the value negative
                       ('W'/'S') unless it already starts with '-'.

    Returns the cleaned string, or None when ``raw`` is not a string or the
    result does not parse as a float.
    """
    try:
        text = raw.strip()
    except AttributeError:
        return None
    if text.endswith(positive_suffix):
        text = text[:-1].strip()
    elif text.endswith(negative_suffix):
        text = text[:-1].strip()
        if not text.startswith('-'):
            text = "-" + text
    try:
        float(text)
    except ValueError:
        return None
    return text


def main(fileName):
    """Read one MOLES XML record and store its spatiotemporal extent.

    fileName -- path of the MOLES record; it is also used as the row id in
                the ``spatiotemp`` table.

    The first bounding box and the date range are pulled out of the parsed
    record and handed to do_update() when id_exists() reports a row for
    ``fileName``, or to do_insert() otherwise. Progress and problems are
    reported with print. The function returns early, writing nothing, when
    the record cannot be parsed, has neither a bounding box nor dates, or has
    a bounding box limit that is not a number. An empty ``fileName`` exits
    the interpreter with a usage message.
    """
    if fileName == "":
        sys.exit("USAGE: argument 1 = filename of MOLES record")
    print("INFO: getting spatiotemporal data from moles records, %s" % fileName)

    # this is a fix to the ElementTree namespace problem that namespaces are
    # usually represented as ns0, ns1, ns2 etc.
    etree._namespace_map.update({'http://ndg.nerc.ac.uk/moles': 'moles', 'http://www.w3.org/1999/xlink':'xlink'})
    numfilesproc = 0

    if fileName.find('.xml') != -1:
        no_bbox = False
        no_dates = False
        # Sentinel values understood by do_insert() / do_update() as
        # "this field is absent".
        limits = {'West': 'null', 'East': 'null', 'North': 'null', 'South': 'null'}
        startdate = 'nostartdate'
        enddate = 'noenddate'

        dgMeta = MRW.dgMetadata()
        try:
            dgMeta.fromXML(cElementTree.ElementTree(file=fileName).getroot())
        except Exception:
            print("WARNING: Cannot parse the XML moles document %s. Will not process" % fileName)
            return

        try:
            bbox_list = listify(dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage.dgSpatialCoverage.BoundingBox)
            # only use the first bounding box; an empty list counts as none.
            bbox = bbox_list[0]
        except Exception:
            print("INFO: XML moles document %s does not contain a bounding box." % fileName)
            no_bbox = True

        try:
            dates = dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage.dgTemporalCoverage.DateRange
            print("startdate = %s" % dates.DateRangeStart)
            print("enddate = %s" % dates.DateRangeEnd)
        except Exception:
            print("INFO: XML moles document %s does not contain temporal info." % fileName)
            no_dates = True

        if no_bbox and no_dates:
            print("INFO: XML moles document %s does not contain any spatiotemporal info." % fileName)
            return

        if not no_dates:
            startdate = dates.DateRangeStart
            enddate = dates.DateRangeEnd
            if startdate is None or startdate == 'None':
                startdate = "nostartdate"
            if enddate is None or enddate == 'None':
                enddate = "noenddate"

        if not no_bbox:
            # (limit name, suffix keeping the sign, suffix negating the value)
            for name, positive_suffix, negative_suffix in (
                    ('West', 'E', 'W'),
                    ('East', 'E', 'W'),
                    ('North', 'N', 'S'),
                    ('South', 'N', 'S')):
                value = _parse_limit(getattr(bbox, 'Limit' + name, None), positive_suffix, negative_suffix)
                if value is None:
                    print("ERROR:  Will not process File %s. Contains incorrect %s bounding box limit." % (fileName, name))
                    return
                limits[name] = value

        west = limits['West']
        south = limits['South']
        east = limits['East']
        north = limits['North']

        print("west= %s,south %s, east %s, north %s, startdate %s, enddate %s" % (west,south,east,north,startdate,enddate))
        if id_exists( fileName ):
            print("INFO: doc %s exists, updating\n" % fileName)
            do_update( fileName, west, south, east, north, startdate, enddate )
        else:
            print("INFO: doc %s does not exist, inserting new record\n" % fileName)
            do_insert( fileName, west, south, east, north, startdate, enddate )
        numfilesproc += 1
    else:
        print("WARNING: File %s appears not to be XML. Will not be processed." % fileName)

    print('INFO: SpaceTimeIngestFromMOLES.py ran to end. files processed= %s' % (numfilesproc))
245
if __name__=='__main__':
    # Script entry point: the first command line argument is the MOLES file
    # to ingest. With no argument an empty name is passed on, so main() exits
    # with its usage message instead of this line raising IndexError.
    if len(sys.argv) > 1:
        indir = sys.argv[1]
    else:
        indir = ""
    main(indir)
Note: See TracBrowser for help on using the repository browser.