source: TI01-discovery/branches/ingestAutomation-upgrade/OAIBatch/SpaceTimeIngestPostgisFromMOLES.py @ 3797

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/TI01-discovery/branches/ingestAutomation-upgrade/OAIBatch/SpaceTimeIngestPostgisFromMOLES.py@3797
Revision 3797, 14.6 KB checked in by cbyrom, 11 years ago (diff)

Upgraded version of ingest codebranch - including major refactoring of the ingest
scripts to make more OO - allowing re-use and simplification of code + removal of reliance
on eXist DB to store data; this will now all be stored and looked up from the Postgres DB

Line 
1#!/usr/bin/env python
2try: #python 2.5
3    from xml.etree import ElementTree as etree
4except ImportError:
5    try:
6        # if you've installed it yourself it comes this way
7        import elementtree.ElementTree as etree
8#        import ElementTree as etree
9    except ImportError:
10        # if you've egged it this is the way it comes
11        from ndgUtils.elementtree import ElementTree as etree
12
13try: #python 2.5
14    from xml.etree import cElementTree
15except ImportError:
16    try:
17        # if you've installed it yourself it comes this way
18        import cElementTree
19    except ImportError:
20        # if you've egged it this is the way it comes
21        from ndgUtils.elementtree import cElementTree
22import molesReadWrite as MRW
23import sys
24import db_funcs
25import os
26
27#connect to db (in separate db functions module)
28connection = db_funcs.db_connect()
29#set up table names
30item_table = 'test_itemtimelocation'
31location_table = 'test_location'
32time_table = 'test_time'
33time_id_seq = time_table+'_id_seq'
34location_id_seq = location_table+'_id_seq'
35
36def listify(item):
37        ''' listify checks if an item is a list, if it isn't it puts it inside a list and returns it. Always returns a list object.'''
38        if type(item) is list:
39            return item
40        else:
41            return [item]
42
43def commavaluesfromsublist(list):
44        '''creates an comma-separated string of values from a list of sublists, taking the first element of each sublist'''
45        count=0
46        arraystr=""
47        for item in list:
48                if item != None:
49                        arraystr += "%s" % item[0]
50                        if count < len(list)-1:
51                                arraystr += "," 
52                        count = count+1
53               
54        return arraystr
55
56def id_exists(Mid):
57        sql = "select itemid from "+item_table+" where itemid = '"+Mid+"';"
58        cursor = connection.cursor()
59        try:
60            cursor.execute(sql)
61        except:
62            print "Error: database error %s %s" %(sys.exc_type, sys.exc_value)
63        if len(cursor.fetchall()) <1:
64            return False
65        else:
66            return True
67
68def clear_items(Mid):
69        # Clear items having this id
70        # First find location ids and timeids associated with this itemid. Set ids to null in joining table, then delete the entries in the location and time tables
71        find_timeid_sql = "SELECT timeid from "+item_table+" WHERE itemid='"+Mid+"';"
72        find_locationid_sql = "SELECT locationid from "+item_table+" WHERE itemid='"+Mid+"';"
73        cursor = connection.cursor()
74        clear_times = False
75        clear_locations = False
76        try:
77                cursor.execute(find_timeid_sql)
78                if( cursor.rowcount > 0 ):
79                        clear_times = True
80                        timeidlist = commavaluesfromsublist(cursor.fetchall())
81                        clear_times_sql = "DELETE FROM "+time_table+" WHERE id IN ("+timeidlist+");"
82               
83                cursor.execute(find_locationid_sql)
84                if( cursor.rowcount > 0 ):
85                        clear_locations = True
86                        locationidlist = commavaluesfromsublist(cursor.fetchall())
87                        clear_locations_sql = "DELETE FROM "+location_table+" WHERE id IN ("+locationidlist+");"
88
89                clear_items_sql = "DELETE FROM "+item_table+" WHERE itemid='"+Mid+"';"
90                print clear_items_sql
91                cursor.execute(clear_items_sql)
92
93                if clear_locations:
94                        print clear_locations_sql
95                        cursor.execute(clear_locations_sql)
96                if clear_times:
97                        print clear_times_sql
98                        cursor.execute(clear_times_sql)
99        except:
100            print "Error: database error %s %s" %(sys.exc_type, sys.exc_value)
101        connection.commit()
102       
103
104def link_spatial_and_temporal(Mid,west,south,east,north,startdate,enddate):
105        '''Creates an entry in the item/time/location table linking an item, time and location.'''
106        locationid = insert_spatial_coverage(Mid,west,south,east,north)
107        timeid = insert_temporal_coverage(Mid,startdate,enddate)
108        if (locationid > -1):
109                locationidstr = "%s" % locationid
110        else:
111                locationidstr = "NULL"
112
113        if (timeid > -1):
114                timeidstr = "%s" % timeid
115        else:
116                timeidstr = "NULL"
117
118        itemtimelocation_sql = "INSERT INTO "+item_table+" (itemid, locationid, timeid) values ('"+Mid+"', "+locationidstr+", "+timeidstr+" );"
119
120        print "ItemTimeLocation:\t"+itemtimelocation_sql
121        cursor = connection.cursor()
122        try:
123            cursor.execute(itemtimelocation_sql)
124        except:
125            print "Error: database error %s %s" %(sys.exc_type, sys.exc_value)
126        connection.commit()
127
128def link_spatial(Mid,west,south,east,north):
129        '''Creates an entry in the item/time/location table linking an item and location only'''
130        locationid = insert_spatial_coverage(Mid,west,south,east,north)
131        if (locationid > -1):
132                locationidstr = "%s" % locationid
133        else:
134                locationidstr = "NULL"
135
136        itemtimelocation_sql = "INSERT INTO "+item_table+" (itemid, locationid) values ('"+Mid+"', "+locationidstr+" );"
137
138        print "ItemTimeLocation:\t"+itemtimelocation_sql
139        cursor = connection.cursor()
140        try:
141            cursor.execute(itemtimelocation_sql)
142        except:
143            print "Error: database error %s %s" %(sys.exc_type, sys.exc_value)
144        connection.commit()
145
146def link_temporal(Mid,startdate,enddate):
147        '''Creates an entry in the item/time/location table linking an item and time only'''
148        timeid = insert_temporal_coverage(Mid,startdate,enddate)
149        if (timeid > -1):
150                timeidstr = "%s" % timeid
151        else:
152                timeidstr = "NULL"
153
154        itemtimelocation_sql = "INSERT INTO "+item_table+" (itemid, timeid) values ('"+Mid+"', "+timeidstr+" );"
155
156        print "ItemTimeLocation:\t"+itemtimelocation_sql
157        cursor = connection.cursor()
158        try:
159            cursor.execute(itemtimelocation_sql)
160        except:
161            print "Error: database error %s %s" %(sys.exc_type, sys.exc_value)
162        connection.commit()
163
164
165def insert_spatial_coverage(Mid,west,south,east,north):
166        '''Attempts to insert spatial coverage into spatial table. If successful, returns newly created location id, -1 otherwise'''
167        returnid = -1
168        if str(west)!='null' and str(south)!='null' and str(east)!='null' and str(north)!='null':
169                insert_sql="INSERT INTO "+location_table+" (geom) VALUES ( setsrid('BOX3D("+west+" "+south+","+east+" "+north+")'::box3d,4326) );"
170                print insert_sql
171        cursor = connection.cursor()
172        try:
173                cursor.execute(insert_sql)
174                if (cursor.rowcount > 0):
175                        getid_sql="select currval('"+location_id_seq+"');"
176                        cursor.execute(getid_sql)
177                        id = commavaluesfromsublist(cursor.fetchall())
178                        return int(id)
179                else:
180                        return -1
181        except:
182                print "Error: database error %s %s" %(sys.exc_type, sys.exc_value)
183        connection.commit()
184
185def insert_temporal_coverage(Mid,startdate,enddate):
186        '''Attempts to insert temporal coverage (only) into time table. If successful, returns timeid to be used in item table, -1 otherwise'''
187        returnid = -1
188        if (startdate=='nostartdate' and enddate=='noenddate'):
189                # Skip if both are missing, although it's OK if only 1 is given
190                return -1
191        else:
192                if (startdate=='nostartdate'): startdate='NULL'
193                if (enddate=='noenddate'): enddate='NULL'
194                insert_sql =  "INSERT INTO "+time_table+" (start_time, end_time) VALUES('"+startdate+"', '"+enddate+"');"
195                print insert_sql
196        cursor = connection.cursor()
197        try:
198                cursor.execute(insert_sql)
199                if (cursor.rowcount > 0):
200                        getid_sql="select currval('"+time_id_seq+"');"
201                        cursor.execute(getid_sql)
202                        id = commavaluesfromsublist(cursor.fetchall())
203                        return int(id)
204                else:
205                        return -1
206        except:
207                print "Error: database error %s %s" %(sys.exc_type, sys.exc_value)
208        connection.commit()
209
210
211def main(indir):
212    '''Iterates through MOLES XML files in specified directory and processes them to extract spatio-temporal coverage info for inserting into PostGIS database. Currently picks out *first* BoundingBox found within dgDataEntity/dgDataSummary/dgDataCoverage/dgSpatialCoverage and temporal coverage from dgDataEntityr/dgDataSummary/dgDataCoverage/dgTemporalCoverage/DateRange.'''
213    if indir == "":
214        sys.exit("USAGE: argument 1 = full path of directory where MOLES records reside")
215    else:
216        print "INFO: moles records are in %s" %indir
217
218#this is a fix to the  ElementTree namespace problem that namespaces are usually represented as ns0, ns1, ns2 etc.
219    etree._namespace_map.update({'http://ndg.nerc.ac.uk/moles': 'moles', 'http://www.w3.org/1999/xlink':'xlink'})
220    numfilesproc = 0
221
222    filenames = os.listdir(indir)
223    for filename in filenames:
224        Mid = filename
225        print Mid
226        if filename.find('.xml') != -1:
227            full_filename = indir + "/" + filename
228            #print full_filename
229            no_bbox = False
230            no_dates = False
231            east = 'null'
232            west = 'null'
233            north = 'null'
234            south = 'null'
235            startdate='nostartdate'
236            enddate='noenddate'
237            dgMeta=MRW.dgMetadata()
238            try:
239                dgMeta.fromXML(cElementTree.ElementTree(file=full_filename).getroot())
240            except:
241                print "WARNING: Cannot parse the XML moles document %s. Will not process" %full_filename
242                continue
243            try:
244                bbox_list=listify(dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage.dgSpatialCoverage.BoundingBox)
245            except:
246                print "INFO: XML moles document %s does not contain a bounding box." %full_filename
247                no_bbox=True
248
249            try:
250                dates=dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage.dgTemporalCoverage.DateRange
251                print "startdate = %s" %dates.DateRangeStart
252                print "enddate = %s" %dates.DateRangeEnd
253            except:
254                print "INFO: XML moles document %s does not contain temporal info." %full_filename
255                no_dates=True
256            #if type(dates) =='NoneType':
257            #    no_dates=True
258
259            if no_bbox and no_dates:
260                print "INFO: XML moles document %s does not contain any spatiotemporal info." %full_filename
261                continue
262
263            if no_dates:
264                pass
265            else:
266                startdate=dates.DateRangeStart
267                enddate= dates.DateRangeEnd
268                if startdate==None or startdate=='None':
269                    startdate="nostartdate"
270                if enddate==None or enddate=='None':
271                    enddate="noenddate"
272
273            if no_bbox:
274                pass
275            else:
276                # for bbox in bbox_list:
277                #parse the coordinates somewhat - only use the first bounding box.
278                #print bbox_list
279                bbox=bbox_list[0]
280                #print bbox
281                #west
282                try:
283                    west = bbox.LimitWest.strip()
284                except:
285                    print "ERROR:  Will not process File %s. Contains incorrect West bounding box limit." %full_filename
286                    continue
287                if west.endswith('E'):
288                    west=bbox.LimitWest.split('E')[0]
289                elif west.endswith('W'):
290                    if west.startswith('-'):
291                        west = bbox.LimitWest.split('W')[0]
292                    else:
293                        west = "-" +bbox.LimitWest.split('W')[0]
294                try:
295                    west = '%s' % float(west)
296                except:
297                    print "ERROR:  Will not process File %s. Contains incorrect West bounding box limit." %full_filename
298                    continue
299                #print "West = %s" %west
300                #east
301                try:
302                    east = bbox.LimitEast.strip()
303                except:
304                    print "ERROR:  Will not process File %s. Contains incorrect East bounding box limit." %full_filename
305                    continue
306                if east.endswith('E'):
307                    east=bbox.LimitEast.split('E')[0]
308                elif east.endswith('W'):
309                    if east.startswith('-'):
310                        east = bbox.LimitEast.split('W')[0]
311                    else:
312                        east = "-" +bbox.LimitEast.split('W')[0]
313                try:
314                    east = '%s' % float(east)
315                except:
316                    print "ERROR:  Will not process File %s. Contains incorrect East bounding box limit." %full_filename
317                    continue
318                #print "East = %s" %east
319                #north
320                try:
321                    north = bbox.LimitNorth.strip()
322                except:
323                    print "ERROR:  Will not process File %s. Contains incorrect North bounding box limit." %full_filename
324                    continue
325                if north.endswith('N'):
326                    north=bbox.LimitNorth.split('N')[0]
327                elif north.endswith('S'):
328                    if north.startswith('-'):
329                        north = bbox.LimitNorth.split('S')[0]
330                    else:
331                        north = "-" +bbox.LimitNorth.split('S')[0]
332                try:
333                    north = '%s' % float(north)
334                except:
335                    print "ERROR: Will not process File %s. Contains incorrect North bounding box limit." %full_filename
336                    continue
337                #print "North = %s" %north
338                #south
339                try:
340                    south = bbox.LimitSouth.strip()
341                except:
342                    print "ERROR:  Will not process File %s. Contains incorrect South bounding box limit." %full_filename
343                    continue
344                if south.endswith('N'):
345                    south=bbox.LimitSouth.split('N')[0]
346                elif south.endswith('S'):
347                    if south.startswith('-'):
348                        south = bbox.LimitSouth.split('S')[0]
349                    else:
350                        south = "-" +bbox.LimitSouth.split('S')[0]
351                try:
352                    south = '%s' % float(south)
353                except:
354                    print "ERROR: Will not process File %s. Contains incorrect North bounding box limit." %full_filename
355                    continue
356                #print "North = %s" %south
357
358            print "west= %s,south %s, east %s, north %s, startdate %s, enddate %s" %(west,south,east,north,startdate,enddate)
359            print "INFO: doc %s\n" %Mid
360            clear_items(Mid)
361            link_spatial_and_temporal( Mid, west, south, east, north, startdate, enddate )
362            numfilesproc += 1
363        else:
364            print "WARNING: File %s appears not to be XML. Will not be processed." %filename
365
366    print 'INFO: SpaceTimeIngestPostgisFromMOLES.py ran to end. files processed= %s' %(numfilesproc)
367
368if __name__=='__main__':
369    indir=sys.argv[1]
370    main(indir)
Note: See TracBrowser for help on using the repository browser.