source: TI01-discovery/trunk/ingestAutomation/OAIBatch/SpaceTimeIngestPostgisFromMOLES.py @ 3177

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/TI01-discovery/trunk/ingestAutomation/OAIBatch/SpaceTimeIngestPostgisFromMOLES.py@3785
Revision 3177, 14.0 KB checked in by selatham, 12 years ago (diff)

adding new coverages extraction to postgis tables.

Line 
1#!/usr/bin/env python
2import cElementTree
3import elementtree.ElementTree as etree
4import molesReadWrite as MRW
5import sys
6import db_funcs
7import os
8
9#connect to db (in separate db functions module)
10connection = db_funcs.db_connect()
11#set up table names
12item_table = 'test_itemtimelocation'
13location_table = 'test_location'
14time_table = 'test_time'
15time_id_seq = time_table+'_id_seq'
16location_id_seq = location_table+'_id_seq'
17
18def listify(item):
19        ''' listify checks if an item is a list, if it isn't it puts it inside a list and returns it. Always returns a list object.'''
20        if type(item) is list:
21            return item
22        else:
23            return [item]
24
25def commavaluesfromsublist(list):
26        '''creates an comma-separated string of values from a list of sublists, taking the first element of each sublist'''
27        count=0
28        arraystr=""
29        for item in list:
30                if item != None:
31                        arraystr += "%s" % item[0]
32                        if count < len(list)-1:
33                                arraystr += "," 
34                        count = count+1
35               
36        return arraystr
37
38def id_exists(Mid):
39        sql = "select itemid from "+item_table+" where itemid = '"+Mid+"';"
40        cursor = connection.cursor()
41        try:
42            cursor.execute(sql)
43        except:
44            print "Error: database error %s %s" %(sys.exc_type, sys.exc_value)
45        if len(cursor.fetchall()) <1:
46            return False
47        else:
48            return True
49
50def clear_items(Mid):
51        # Clear items having this id
52        # First find location ids and timeids associated with this itemid. Set ids to null in joining table, then delete the entries in the location and time tables
53        find_timeid_sql = "SELECT timeid from "+item_table+" WHERE itemid='"+Mid+"';"
54        find_locationid_sql = "SELECT locationid from "+item_table+" WHERE itemid='"+Mid+"';"
55        cursor = connection.cursor()
56        clear_times = False
57        clear_locations = False
58        try:
59                cursor.execute(find_timeid_sql)
60                if( cursor.rowcount > 0 ):
61                        clear_times = True
62                        timeidlist = commavaluesfromsublist(cursor.fetchall())
63                        clear_times_sql = "DELETE FROM "+time_table+" WHERE id IN ("+timeidlist+");"
64               
65                cursor.execute(find_locationid_sql)
66                if( cursor.rowcount > 0 ):
67                        clear_locations = True
68                        locationidlist = commavaluesfromsublist(cursor.fetchall())
69                        clear_locations_sql = "DELETE FROM "+location_table+" WHERE id IN ("+locationidlist+");"
70
71                clear_items_sql = "DELETE FROM "+item_table+" WHERE itemid='"+Mid+"';"
72                print clear_items_sql
73                cursor.execute(clear_items_sql)
74
75                if clear_locations:
76                        print clear_locations_sql
77                        cursor.execute(clear_locations_sql)
78                if clear_times:
79                        print clear_times_sql
80                        cursor.execute(clear_times_sql)
81        except:
82            print "Error: database error %s %s" %(sys.exc_type, sys.exc_value)
83        connection.commit()
84       
85
86def link_spatial_and_temporal(Mid,west,south,east,north,startdate,enddate):
87        '''Creates an entry in the item/time/location table linking an item, time and location.'''
88        locationid = insert_spatial_coverage(Mid,west,south,east,north)
89        timeid = insert_temporal_coverage(Mid,startdate,enddate)
90        if (locationid > -1):
91                locationidstr = "%s" % locationid
92        else:
93                locationidstr = "NULL"
94
95        if (timeid > -1):
96                timeidstr = "%s" % timeid
97        else:
98                timeidstr = "NULL"
99
100        itemtimelocation_sql = "INSERT INTO "+item_table+" (itemid, locationid, timeid) values ('"+Mid+"', "+locationidstr+", "+timeidstr+" );"
101
102        print "ItemTimeLocation:\t"+itemtimelocation_sql
103        cursor = connection.cursor()
104        try:
105            cursor.execute(itemtimelocation_sql)
106        except:
107            print "Error: database error %s %s" %(sys.exc_type, sys.exc_value)
108        connection.commit()
109
110def link_spatial(Mid,west,south,east,north):
111        '''Creates an entry in the item/time/location table linking an item and location only'''
112        locationid = insert_spatial_coverage(Mid,west,south,east,north)
113        if (locationid > -1):
114                locationidstr = "%s" % locationid
115        else:
116                locationidstr = "NULL"
117
118        itemtimelocation_sql = "INSERT INTO "+item_table+" (itemid, locationid) values ('"+Mid+"', "+locationidstr+" );"
119
120        print "ItemTimeLocation:\t"+itemtimelocation_sql
121        cursor = connection.cursor()
122        try:
123            cursor.execute(itemtimelocation_sql)
124        except:
125            print "Error: database error %s %s" %(sys.exc_type, sys.exc_value)
126        connection.commit()
127
128def link_temporal(Mid,startdate,enddate):
129        '''Creates an entry in the item/time/location table linking an item and time only'''
130        timeid = insert_temporal_coverage(Mid,startdate,enddate)
131        if (timeid > -1):
132                timeidstr = "%s" % timeid
133        else:
134                timeidstr = "NULL"
135
136        itemtimelocation_sql = "INSERT INTO "+item_table+" (itemid, timeid) values ('"+Mid+"', "+timeidstr+" );"
137
138        print "ItemTimeLocation:\t"+itemtimelocation_sql
139        cursor = connection.cursor()
140        try:
141            cursor.execute(itemtimelocation_sql)
142        except:
143            print "Error: database error %s %s" %(sys.exc_type, sys.exc_value)
144        connection.commit()
145
146
147def insert_spatial_coverage(Mid,west,south,east,north):
148        '''Attempts to insert spatial coverage into spatial table. If successful, returns newly created location id, -1 otherwise'''
149        returnid = -1
150        if str(west)!='null' and str(south)!='null' and str(east)!='null' and str(north)!='null':
151                insert_sql="INSERT INTO "+location_table+" (geom) VALUES ( setsrid('BOX3D("+west+" "+south+","+east+" "+north+")'::box3d,4326) );"
152                print insert_sql
153        cursor = connection.cursor()
154        try:
155                cursor.execute(insert_sql)
156                if (cursor.rowcount > 0):
157                        getid_sql="select currval('"+location_id_seq+"');"
158                        cursor.execute(getid_sql)
159                        id = commavaluesfromsublist(cursor.fetchall())
160                        return int(id)
161                else:
162                        return -1
163        except:
164                print "Error: database error %s %s" %(sys.exc_type, sys.exc_value)
165        connection.commit()
166
167def insert_temporal_coverage(Mid,startdate,enddate):
168        '''Attempts to insert temporal coverage (only) into time table. If successful, returns timeid to be used in item table, -1 otherwise'''
169        returnid = -1
170        if (startdate=='nostartdate' and enddate=='noenddate'):
171                # Skip if both are missing, although it's OK if only 1 is given
172                return -1
173        else:
174                if (startdate=='nostartdate'): startdate='NULL'
175                if (enddate=='noenddate'): enddate='NULL'
176                insert_sql =  "INSERT INTO "+time_table+" (start_time, end_time) VALUES('"+startdate+"', '"+enddate+"');"
177                print insert_sql
178        cursor = connection.cursor()
179        try:
180                cursor.execute(insert_sql)
181                if (cursor.rowcount > 0):
182                        getid_sql="select currval('"+time_id_seq+"');"
183                        cursor.execute(getid_sql)
184                        id = commavaluesfromsublist(cursor.fetchall())
185                        return int(id)
186                else:
187                        return -1
188        except:
189                print "Error: database error %s %s" %(sys.exc_type, sys.exc_value)
190        connection.commit()
191
192
193def main(indir):
194    '''Iterates through MOLES XML files in specified directory and processes them to extract spatio-temporal coverage info for inserting into PostGIS database. Currently picks out *first* BoundingBox found within dgDataEntity/dgDataSummary/dgDataCoverage/dgSpatialCoverage and temporal coverage from dgDataEntityr/dgDataSummary/dgDataCoverage/dgTemporalCoverage/DateRange.'''
195    if indir == "":
196        sys.exit("USAGE: argument 1 = full path of directory where MOLES records reside")
197    else:
198        print "INFO: moles records are in %s" %indir
199
200#this is a fix to the  ElementTree namespace problem that namespaces are usually represented as ns0, ns1, ns2 etc.
201    etree._namespace_map.update({'http://ndg.nerc.ac.uk/moles': 'moles', 'http://www.w3.org/1999/xlink':'xlink'})
202    numfilesproc = 0
203
204    filenames = os.listdir(indir)
205    for filename in filenames:
206        Mid = filename
207        print Mid
208        if filename.find('.xml') != -1:
209            full_filename = indir + "/" + filename
210            #print full_filename
211            no_bbox = False
212            no_dates = False
213            east = 'null'
214            west = 'null'
215            north = 'null'
216            south = 'null'
217            startdate='nostartdate'
218            enddate='noenddate'
219            dgMeta=MRW.dgMetadata()
220            try:
221                dgMeta.fromXML(cElementTree.ElementTree(file=full_filename).getroot())
222            except:
223                print "WARNING: Cannot parse the XML moles document %s. Will not process" %full_filename
224                continue
225            try:
226                bbox_list=listify(dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage.dgSpatialCoverage.BoundingBox)
227            except:
228                print "INFO: XML moles document %s does not contain a bounding box." %full_filename
229                no_bbox=True
230
231            try:
232                dates=dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage.dgTemporalCoverage.DateRange
233                print "startdate = %s" %dates.DateRangeStart
234                print "enddate = %s" %dates.DateRangeEnd
235            except:
236                print "INFO: XML moles document %s does not contain temporal info." %full_filename
237                no_dates=True
238            #if type(dates) =='NoneType':
239            #    no_dates=True
240
241            if no_bbox and no_dates:
242                print "INFO: XML moles document %s does not contain any spatiotemporal info." %full_filename
243                continue
244
245            if no_dates:
246                pass
247            else:
248                startdate=dates.DateRangeStart
249                enddate= dates.DateRangeEnd
250                if startdate==None or startdate=='None':
251                    startdate="nostartdate"
252                if enddate==None or enddate=='None':
253                    enddate="noenddate"
254
255            if no_bbox:
256                pass
257            else:
258                # for bbox in bbox_list:
259                #parse the coordinates somewhat - only use the first bounding box.
260                #print bbox_list
261                bbox=bbox_list[0]
262                #print bbox
263                #west
264                try:
265                    west = bbox.LimitWest.strip()
266                except:
267                    print "ERROR:  Will not process File %s. Contains incorrect West bounding box limit." %full_filename
268                    continue
269                if west.endswith('E'):
270                    west=bbox.LimitWest.split('E')[0]
271                elif west.endswith('W'):
272                    if west.startswith('-'):
273                        west = bbox.LimitWest.split('W')[0]
274                    else:
275                        west = "-" +bbox.LimitWest.split('W')[0]
276                try:
277                    west = '%s' % float(west)
278                except:
279                    print "ERROR:  Will not process File %s. Contains incorrect West bounding box limit." %full_filename
280                    continue
281                #print "West = %s" %west
282                #east
283                try:
284                    east = bbox.LimitEast.strip()
285                except:
286                    print "ERROR:  Will not process File %s. Contains incorrect East bounding box limit." %full_filename
287                    continue
288                if east.endswith('E'):
289                    east=bbox.LimitEast.split('E')[0]
290                elif east.endswith('W'):
291                    if east.startswith('-'):
292                        east = bbox.LimitEast.split('W')[0]
293                    else:
294                        east = "-" +bbox.LimitEast.split('W')[0]
295                try:
296                    east = '%s' % float(east)
297                except:
298                    print "ERROR:  Will not process File %s. Contains incorrect East bounding box limit." %full_filename
299                    continue
300                #print "East = %s" %east
301                #north
302                try:
303                    north = bbox.LimitNorth.strip()
304                except:
305                    print "ERROR:  Will not process File %s. Contains incorrect North bounding box limit." %full_filename
306                    continue
307                if north.endswith('N'):
308                    north=bbox.LimitNorth.split('N')[0]
309                elif north.endswith('S'):
310                    if north.startswith('-'):
311                        north = bbox.LimitNorth.split('S')[0]
312                    else:
313                        north = "-" +bbox.LimitNorth.split('S')[0]
314                try:
315                    north = '%s' % float(north)
316                except:
317                    print "ERROR: Will not process File %s. Contains incorrect North bounding box limit." %full_filename
318                    continue
319                #print "North = %s" %north
320                #south
321                try:
322                    south = bbox.LimitSouth.strip()
323                except:
324                    print "ERROR:  Will not process File %s. Contains incorrect South bounding box limit." %full_filename
325                    continue
326                if south.endswith('N'):
327                    south=bbox.LimitSouth.split('N')[0]
328                elif south.endswith('S'):
329                    if south.startswith('-'):
330                        south = bbox.LimitSouth.split('S')[0]
331                    else:
332                        south = "-" +bbox.LimitSouth.split('S')[0]
333                try:
334                    south = '%s' % float(south)
335                except:
336                    print "ERROR: Will not process File %s. Contains incorrect North bounding box limit." %full_filename
337                    continue
338                #print "North = %s" %south
339
340            print "west= %s,south %s, east %s, north %s, startdate %s, enddate %s" %(west,south,east,north,startdate,enddate)
341            print "INFO: doc %s\n" %Mid
342            clear_items(Mid)
343            link_spatial_and_temporal( Mid, west, south, east, north, startdate, enddate )
344            numfilesproc += 1
345        else:
346            print "WARNING: File %s appears not to be XML. Will not be processed." %filename
347
348    print 'INFO: SpaceTimeIngestPostgisFromMOLES.py ran to end. files processed= %s' %(numfilesproc)
349
350if __name__=='__main__':
351    indir=sys.argv[1]
352    main(indir)
Note: See TracBrowser for help on using the repository browser.