Changeset 3799 for TI01-discovery
- Timestamp: 17/04/08 17:19:49 (13 years ago)
- Location: TI01-discovery/branches/ingestAutomation-upgrage
- Files: 6 edited, 1 copied
Legend:
- Unmodified
- Added
- Removed
-
TI01-discovery/branches/ingestAutomation-upgrage/OAIBatch/SchemaNameSpace.py
r2508 r3799 1 # python class to change/correct namespaces to the latest ones used by NDG discovery2 #3 1 import sys 4 2 5 3 class SchemaNameSpace: 6 ''' ''' 4 ''' 5 Class to change/correct namespaces to the latest ones used by NDG discovery 6 NB, only currently handles correction of DIF files 7 ''' 7 8 def __init__(self,infile,outfile,format): 9 ''' 10 Constructor - with the logic to do the namespace change 11 @param infile: file to correct namespaces in 12 @param outfile: file to create with the corrected namespaces 13 @param format: Format of file being processed. DIF is the only format which currently is processed. 14 ''' 8 15 self.ff=open(infile,'r') 9 16 self.ww=open(outfile,'w') 10 17 self.format= format 11 #print dir(self.ff)12 #print dir(self.ww)13 #print "format: %s" %self.format14 18 self.lines=self.ff.readlines() 15 19 for self.line in self.lines: -
TI01-discovery/branches/ingestAutomation-upgrage/OAIBatch/SpaceTimeIngestFromMOLES.py
r3205 r3799 1 1 #!/usr/bin/env python 2 2 try: #python 2.5 3 from xml.etree import cElementTree4 3 from xml.etree import ElementTree as etree 5 4 except ImportError: 6 5 try: 7 6 # if you've installed it yourself it comes this way 7 import elementtree.ElementTree as etree 8 # import ElementTree as etree 9 except ImportError: 10 # if you've egged it this is the way it comes 11 from ndgUtils.elementtree import ElementTree as etree 12 13 try: #python 2.5 14 from xml.etree import cElementTree 15 except ImportError: 16 try: 17 # if you've installed it yourself it comes this way 8 18 import cElementTree 9 import ElementTree as etree10 19 except ImportError: 11 20 # if you've egged it this is the way it comes 12 21 from ndgUtils.elementtree import cElementTree 13 from ndgUtils.elementtree import ElementTree as etree14 22 15 23 import molesReadWrite as MRW … … 17 25 import db_funcs 18 26 import os 27 import PostgresDBUtils 19 28 20 29 #connect to db (in separate db functions module) … … 28 37 return [item] 29 38 30 def id_exists( Mid):31 sql = "select id from spatiotemp where id = '"+ Mid+"';"39 def id_exists(fileName): 40 sql = "select id from spatiotemp where id = '"+fileName+"';" 32 41 cursor = connection.cursor() 33 42 try: … … 41 50 42 51 43 def do_insert( Mid,west,south,east,north,startdate,enddate):44 sql = "INSERT INTO spatiotemp (id, coordinates, startdate, enddate) VALUES ( '"+ Mid+ "', sbox'(("+west+"d , "+south+"d), ("+east+"d , "+north+"d))', '"+startdate+"', '"+enddate+"');"52 def do_insert(fileName,west,south,east,north,startdate,enddate): 53 sql = "INSERT INTO spatiotemp (id, coordinates, startdate, enddate) VALUES ( '"+fileName+ "', sbox'(("+west+"d , "+south+"d), ("+east+"d , "+north+"d))', '"+startdate+"', '"+enddate+"');" 45 54 if str(startdate)=='nostartdate': 46 55 sql=sql.replace(", startdate,"," ") … … 65 74 connection.commit() 66 75 67 def do_update( Mid,west,south,east,north,startdate,enddate):68 sql = "UPDATE spatiotemp SET coordinates = 
sbox'(("+west+"d , "+south+"d), ("+east+"d , "+north+"d))', startdate='"+startdate+"', enddate= '"+enddate+"', update_time= now() WHERE id='"+ Mid+"';"76 def do_update(fileName,west,south,east,north,startdate,enddate): 77 sql = "UPDATE spatiotemp SET coordinates = sbox'(("+west+"d , "+south+"d), ("+east+"d , "+north+"d))', startdate='"+startdate+"', enddate= '"+enddate+"', update_time= now() WHERE id='"+fileName+"';" 69 78 if str(startdate)=='nostartdate': 70 79 sql=sql.replace("startdate='"+startdate+"',"," ") … … 81 90 connection.commit() 82 91 83 def main( indir):84 if indir== "":85 sys.exit("USAGE: argument 1 = f ull path of directory where MOLES records reside")92 def main(fileName): 93 if fileName == "": 94 sys.exit("USAGE: argument 1 = filename of MOLES record") 86 95 else: 87 print "INFO: moles records are in %s" %indir 88 89 #this is a fix to the ElementTree namespace problem that namespaces are usually represented as ns0, ns1, ns2 etc. 96 print "INFO: getting spatiotemporal data from moles records, %s" %fileName 97 98 #this is a fix to the ElementTree namespace problem that namespaces are usually 99 # represented as ns0, ns1, ns2 etc. 90 100 etree._namespace_map.update({'http://ndg.nerc.ac.uk/moles': 'moles', 'http://www.w3.org/1999/xlink':'xlink'}) 91 101 numfilesproc = 0 92 102 93 filenames = os.listdir(indir) 94 for filename in filenames: 95 Mid = filename 96 print Mid 97 if filename.find('.xml') != -1: 98 full_filename = indir + "/" + filename 99 #print full_filename 100 no_bbox = False 101 no_dates = False 102 east = 'null' 103 west = 'null' 104 north = 'null' 105 south = 'null' 106 startdate='nostartdate' 107 enddate='noenddate' 108 dgMeta=MRW.dgMetadata() 109 try: 110 dgMeta.fromXML(cElementTree.ElementTree(file=full_filename).getroot()) 111 except: 112 print "WARNING: Cannot parse the XML moles document %s. 
Will not process" %full_filename 113 continue 114 try: 115 bbox_list=listify(dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage.dgSpatialCoverage.BoundingBox) 116 except: 117 print "INFO: XML moles document %s does not contain a bounding box." %full_filename 118 no_bbox=True 119 120 try: 121 dates=dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage.dgTemporalCoverage.DateRange 122 print "startdate = %s" %dates.DateRangeStart 123 print "enddate = %s" %dates.DateRangeEnd 124 except: 125 print "INFO: XML moles document %s does not contain temporal info." %full_filename 126 no_dates=True 127 #if type(dates) =='NoneType': 128 # no_dates=True 129 130 if no_bbox and no_dates: 131 print "INFO: XML moles document %s does not contain any spatiotemporal info." %full_filename 132 continue 133 134 if no_dates: 135 pass 136 else: 137 startdate=dates.DateRangeStart 138 enddate= dates.DateRangeEnd 139 if startdate==None or startdate=='None': 140 startdate="nostartdate" 141 if enddate==None or enddate=='None': 142 enddate="noenddate" 143 144 if no_bbox: 145 pass 146 else: 147 # for bbox in bbox_list: 148 #parse the coordinates somewhat - only use the first bounding box. 149 #print bbox_list 150 bbox=bbox_list[0] 151 #print bbox 152 #west 153 try: 154 west = bbox.LimitWest.strip() 155 except: 156 print "ERROR: Will not process File %s. Contains incorrect West bounding box limit." %full_filename 157 continue 158 if west.endswith('E'): 159 west=bbox.LimitWest.split('E')[0] 160 elif west.endswith('W'): 161 if west.startswith('-'): 162 west = bbox.LimitWest.split('W')[0] 163 else: 164 west = "-" +bbox.LimitWest.split('W')[0] 165 try: 166 float(west) 167 except: 168 print "ERROR: Will not process File %s. Contains incorrect West bounding box limit." %full_filename 169 continue 170 #print "West = %s" %west 171 #east 172 try: 173 east = bbox.LimitEast.strip() 174 except: 175 print "ERROR: Will not process File %s. Contains incorrect East bounding box limit." 
%full_filename 176 continue 177 if east.endswith('E'): 178 east=bbox.LimitEast.split('E')[0] 179 elif east.endswith('W'): 180 if east.startswith('-'): 181 east = bbox.LimitEast.split('W')[0] 182 else: 183 east = "-" +bbox.LimitEast.split('W')[0] 184 try: 185 float(east) 186 except: 187 print "ERROR: Will not process File %s. Contains incorrect East bounding box limit." %full_filename 188 continue 189 #print "East = %s" %east 190 #north 191 try: 192 north = bbox.LimitNorth.strip() 193 except: 194 print "ERROR: Will not process File %s. Contains incorrect North bounding box limit." %full_filename 195 continue 196 if north.endswith('N'): 197 north=bbox.LimitNorth.split('N')[0] 198 elif north.endswith('S'): 199 if north.startswith('-'): 200 north = bbox.LimitNorth.split('S')[0] 201 else: 202 north = "-" +bbox.LimitNorth.split('S')[0] 203 try: 204 float(north) 205 except: 206 print "ERROR: Will not process File %s. Contains incorrect North bounding box limit." %full_filename 207 continue 208 #print "North = %s" %north 209 #south 210 try: 211 south = bbox.LimitSouth.strip() 212 except: 213 print "ERROR: Will not process File %s. Contains incorrect South bounding box limit." %full_filename 214 continue 215 if south.endswith('N'): 216 south=bbox.LimitSouth.split('N')[0] 217 elif south.endswith('S'): 218 if south.startswith('-'): 219 south = bbox.LimitSouth.split('S')[0] 220 else: 221 south = "-" +bbox.LimitSouth.split('S')[0] 222 try: 223 float(south) 224 except: 225 print "ERROR: Will not process File %s. Contains incorrect North bounding box limit." 
%full_filename 226 continue 227 #print "North = %s" %south 228 229 print "west= %s,south %s, east %s, north %s, startdate %s, enddate %s" %(west,south,east,north,startdate,enddate) 230 if id_exists( Mid ): 231 print "INFO: doc %s exists, updating\n" %Mid 232 do_update( Mid, west, south, east, north, startdate, enddate ) 233 else: 234 print "INFO: doc %s does not exist, inserting new record\n" %Mid 235 236 do_insert( Mid, west, south, east, north, startdate, enddate ) 237 numfilesproc += 1 238 else: 239 print "WARNING: File %s appears not to be XML. Will not be processed." %filename 103 if fileName.find('.xml') != -1: 104 no_bbox = False 105 no_dates = False 106 east = 'null' 107 west = 'null' 108 north = 'null' 109 south = 'null' 110 startdate='nostartdate' 111 enddate='noenddate' 112 dgMeta=MRW.dgMetadata() 113 try: 114 dgMeta.fromXML(cElementTree.ElementTree(file=fileName).getroot()) 115 except: 116 print "WARNING: Cannot parse the XML moles document %s. Will not process" %fileName 117 return 118 try: 119 bbox_list=listify(dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage.dgSpatialCoverage.BoundingBox) 120 except: 121 print "INFO: XML moles document %s does not contain a bounding box." %fileName 122 no_bbox=True 123 124 try: 125 dates=dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage.dgTemporalCoverage.DateRange 126 print "startdate = %s" %dates.DateRangeStart 127 print "enddate = %s" %dates.DateRangeEnd 128 except: 129 print "INFO: XML moles document %s does not contain temporal info." %fileName 130 no_dates=True 131 #if type(dates) =='NoneType': 132 # no_dates=True 133 134 if no_bbox and no_dates: 135 print "INFO: XML moles document %s does not contain any spatiotemporal info." 
%fileName 136 return 137 138 if no_dates: 139 pass 140 else: 141 startdate=dates.DateRangeStart 142 enddate= dates.DateRangeEnd 143 if startdate==None or startdate=='None': 144 startdate="nostartdate" 145 if enddate==None or enddate=='None': 146 enddate="noenddate" 147 148 if no_bbox: 149 pass 150 else: 151 # for bbox in bbox_list: 152 #parse the coordinates somewhat - only use the first bounding box. 153 #print bbox_list 154 bbox=bbox_list[0] 155 #print bbox 156 #west 157 try: 158 west = bbox.LimitWest.strip() 159 except: 160 print "ERROR: Will not process File %s. Contains incorrect West bounding box limit." %fileName 161 return 162 if west.endswith('E'): 163 west=bbox.LimitWest.split('E')[0] 164 elif west.endswith('W'): 165 if west.startswith('-'): 166 west = bbox.LimitWest.split('W')[0] 167 else: 168 west = "-" +bbox.LimitWest.split('W')[0] 169 try: 170 float(west) 171 except: 172 print "ERROR: Will not process File %s. Contains incorrect West bounding box limit." %fileName 173 return 174 #print "West = %s" %west 175 #east 176 try: 177 east = bbox.LimitEast.strip() 178 except: 179 print "ERROR: Will not process File %s. Contains incorrect East bounding box limit." %fileName 180 return 181 if east.endswith('E'): 182 east=bbox.LimitEast.split('E')[0] 183 elif east.endswith('W'): 184 if east.startswith('-'): 185 east = bbox.LimitEast.split('W')[0] 186 else: 187 east = "-" +bbox.LimitEast.split('W')[0] 188 try: 189 float(east) 190 except: 191 print "ERROR: Will not process File %s. Contains incorrect East bounding box limit." %fileName 192 return 193 #print "East = %s" %east 194 #north 195 try: 196 north = bbox.LimitNorth.strip() 197 except: 198 print "ERROR: Will not process File %s. Contains incorrect North bounding box limit." 
%fileName 199 return 200 if north.endswith('N'): 201 north=bbox.LimitNorth.split('N')[0] 202 elif north.endswith('S'): 203 if north.startswith('-'): 204 north = bbox.LimitNorth.split('S')[0] 205 else: 206 north = "-" +bbox.LimitNorth.split('S')[0] 207 try: 208 float(north) 209 except: 210 print "ERROR: Will not process File %s. Contains incorrect North bounding box limit." %fileName 211 return 212 #print "North = %s" %north 213 #south 214 try: 215 south = bbox.LimitSouth.strip() 216 except: 217 print "ERROR: Will not process File %s. Contains incorrect South bounding box limit." %fileName 218 return 219 if south.endswith('N'): 220 south=bbox.LimitSouth.split('N')[0] 221 elif south.endswith('S'): 222 if south.startswith('-'): 223 south = bbox.LimitSouth.split('S')[0] 224 else: 225 south = "-" +bbox.LimitSouth.split('S')[0] 226 try: 227 float(south) 228 except: 229 print "ERROR: Will not process File %s. Contains incorrect North bounding box limit." %fileName 230 return 231 #print "North = %s" %south 232 233 print "west= %s,south %s, east %s, north %s, startdate %s, enddate %s" %(west,south,east,north,startdate,enddate) 234 if id_exists( fileName ): 235 print "INFO: doc %s exists, updating\n" %fileName 236 do_update( fileName, west, south, east, north, startdate, enddate ) 237 else: 238 print "INFO: doc %s does not exist, inserting new record\n" %fileName 239 240 do_insert( fileName, west, south, east, north, startdate, enddate ) 241 numfilesproc += 1 242 else: 243 print "WARNING: File %s appears not to be XML. Will not be processed." %fileName 240 244 241 245 print 'INFO: SpaceTimeIngestFromMOLES.py ran to end. files processed= %s' %(numfilesproc) -
TI01-discovery/branches/ingestAutomation-upgrage/OAIBatch/SpaceTimeIngestPostgisFromMOLES.py
r3177 r3799 1 1 #!/usr/bin/env python 2 import cElementTree 3 import elementtree.ElementTree as etree 2 try: #python 2.5 3 from xml.etree import ElementTree as etree 4 except ImportError: 5 try: 6 # if you've installed it yourself it comes this way 7 import elementtree.ElementTree as etree 8 # import ElementTree as etree 9 except ImportError: 10 # if you've egged it this is the way it comes 11 from ndgUtils.elementtree import ElementTree as etree 12 13 try: #python 2.5 14 from xml.etree import cElementTree 15 except ImportError: 16 try: 17 # if you've installed it yourself it comes this way 18 import cElementTree 19 except ImportError: 20 # if you've egged it this is the way it comes 21 from ndgUtils.elementtree import cElementTree 4 22 import molesReadWrite as MRW 5 23 import sys -
TI01-discovery/branches/ingestAutomation-upgrage/OAIBatch/db_funcs.py
r1794 r3799 3 3 # functions for use with NDG discovery postgres db 4 4 5 # db setup6 5 import pgdb 7 6 8 7 def db_connect(): 9 8 # Open a Postgres database connection 10 DATABASE = 'xxxx' 11 HOST = 'xxx.xxx.uk' 12 USER = 'xxx' 13 PW = 'xxxxxxx' 14 SCHEMA = 'xxx' 9 DATABASE = 'test' 10 HOST = 'localhost' 11 USER = 'postgres' 12 PW = 'pass01word' 13 SCHEMA = 'public' 14 # DATABASE = 'xxxx' 15 # HOST = 'xxx.xxx.uk' 16 # USER = 'xxx' 17 # PW = 'xxxxxxx' 18 # SCHEMA = 'xxx' 15 19 connection_string = HOST + ':' + DATABASE + ':' + USER + ':' + PW 16 20 connection = pgdb.connect(connection_string) … … 18 22 19 23 24 def runSQLCommand(connection, sqlCmd): 25 ''' 26 Run a SQL command against a specified DB connection 27 @param connection: a postgres DB connection 28 @param sqlCmd: a SQL command to execute with the postgres connection 29 ''' 30 print "Running SQL command: %s" %sqlCmd 31 cursor = connection.cursor() 32 try: 33 cursor.execute(sqlCmd) 34 except: 35 print "Error: database error %s %s" %(sys.exc_type, sys.exc_value) 36 connection.commit() 37 return cursor.fetchall() 20 38 -
TI01-discovery/branches/ingestAutomation-upgrage/OAIBatch/keywordAdder.py
r2088 r3799 1 1 #!/usr/bin/env python 2 2 # keywordAdder - adds Structered Keywords to moles documents 3 import cElementTree 4 import elementtree.ElementTree as etree 3 try: #python 2.5 4 from xml.etree import ElementTree as etree 5 except ImportError: 6 try: 7 # if you've installed it yourself it comes this way 8 import elementtree.ElementTree as etree 9 # import ElementTree as etree 10 except ImportError: 11 # if you've egged it this is the way it comes 12 from ndgUtils.elementtree import ElementTree as etree 13 14 try: #python 2.5 15 from xml.etree import cElementTree 16 except ImportError: 17 try: 18 # if you've installed it yourself it comes this way 19 import cElementTree 20 except ImportError: 21 # if you've egged it this is the way it comes 22 from ndgUtils.elementtree import cElementTree 5 23 import molesReadWrite as MRW 6 24 import sys -
TI01-discovery/branches/ingestAutomation-upgrage/OAIBatch/oai_ingest.py
r3777 r3799 217 217 ident = ident.replace(":","-") 218 218 ident = ident.replace("/","-") 219 new_filename = outdir + "/" +datacentre_namespace+ "__"+datacentre_format+ "__"+ ident +".xml" 219 # new_filename = outdir + "/" +datacentre_namespace+ "__"+datacentre_format+ "__"+ ident +".xml" 220 new_filename = indir + "/" +datacentre_namespace+ "__"+datacentre_format+ "__"+ ident +".xml" 221 220 222 print "INFO: original file = %s, newfile = %s" %(original_filename, new_filename) 221 commandline = "cp "+original_filename+ " " +new_filename 223 224 try: 225 os.rename(original_filename, new_filename) 226 except: 227 sys.exit("ERROR: Failed to rename file %s to %s" %(original_filename, new_filename)) 228 # commandline = "cp "+original_filename+ " " +new_filename 222 229 #print "Executing : " + commandline 223 status = os.system(commandline)224 if status !=0:225 sys.exit("ERROR: Failed at re-naming file stage")230 # status = os.system(commandline) 231 # if status !=0: 232 # sys.exit("ERROR: Failed at re-naming file stage") 226 233 numfilesproc += 1 227 234 else:
Note: See TracChangeset for help on using the changeset viewer.