from dreqPy import dreq
import hashlib
import collections, string, os, utils_wb
import htmlTemplates as tmpl
import xml, re, uuid
import xml.dom, xml.dom.minidom
import sets
import xlsxwriter

mipTableMap = {'cfMon':'CFmon' }

knowna = [ 'LS3MIP [LWday]', 'LS3MIP [LEday]', 'CFMIP [cf1hrClimMon]', 'HighResMIP [3hr_cloud]', 'CFMIP [cf3hr_sim_new]',
           'C4MIP [L_3hr]', 'DAMIP [DAMIP_day]', 'DAMIP [DAMIP_3hr_p2]', 'DynVar [DYVR_daily_c]', 'PMIP [PMIP-6hr]',
           'HighResMIP [1hrLev]', 'PMIP [PMIP-Amon-02]', 'PMIP [PMIP-day-02]', 'PMIP [PMIP-aero-02]']
knownl = [ 'PMIP [PMIP-Lmon-02]']
knowno = [ 'DAMIP [DAMIP_Omon_p2]', 'FAFMIP [fafOyr]', 'PMIP [PMIP-Omon-02]']

from utils_wb import uniCleanFunc
from utils_wb import workbook

if os.path.isfile( 'refDefaultP.txt' ):
  refpix = {}
  for l in open( 'refDefaultP.txt' ).readlines():
    bits = string.split( string.strip(l), '\t' )
    assert len(bits) == 4, 'Bad record found in %s' % 'refDefaultP.txt'
    refpix[bits[0]] = (bits[1],bits[2], int(bits[3]) )
else:
  refpix = None

empty = re.compile('^$')

src1 = '../workbook/trial2_20150831.xml'
dq = dreq.loadDreq(dreqXML=src1, manifest=None)
inx = dq.inx
##inx.makeVarRefs()

ix_rql_uid = {}
ix_rqvg_uid = {}
ix_ovar_uid = {}
ix_gpi_uid = {}
list_gp_ovar = collections.defaultdict( list )
xr_var_ovar = collections.defaultdict( list )
xr_var_gpi = collections.defaultdict( list )
rql_by_name = collections.defaultdict( list )

def makeVarRefs(uid, var, iref_by_uid):
  varRefs = {}
  for thisuid in var.uid.keys():
    if iref_by_uid.has_key(thisuid):
      ee1 = collections.defaultdict( list )
      for k,i in iref_by_uid[thisuid]:
        thisi = uid[i]
        sect = thisi._h.label
        if sect == 'groupItem':
          ee1[sect].append( '%s.%s' % (thisi.mip, thisi.group) )
        elif sect == 'ovar':
          ee1[sect].append( thisi.mipTable )
        elif sect == 'revisedTabItem':
          ee1[sect].append( '%s.%s' % (thisi.mip, thisi.table) )
      varRefs[thisuid] = ee1
  return varRefs

varRefs = makeVarRefs( inx.uid, inx.var, inx.iref_by_uid)

class addUnits(object):
  def __init__(self,dq,wbi='units/units.xlsx'):
    eqs = [('N m-1', 'kg s-2', 'J m-2'), ('Pa', 'N m-2', 'kg m-1 s-2'), ('cm-1','km-1','m-1') ]
    eqss = ['N m-1','Pa','m-1','m','s','1','m s-2','m2','m3' ]
    wb = workbook( wbi )
    sh = wb.book.sheet_by_name('units')
    self.repl = {}
    self.uu = {}
    for j in range(1,sh.nrows):
      r = sh.row(j)
      if len(r) > 5 and r[5].value != u'':
        self.repl[ r[0].value ] = r[5].value
      else:
        self.uu[r[0].value] = [x.value for x in r]
    for k in self.repl:
      if self.repl[k] not in self.uu:
        print 'Bad replacement found: %s --> %s' % (k,self.repl[k])
    for i in dq.coll['var'].items:
      if i.units in self.repl:
        u = self.repl[i.units]
      else:
        u = i.units
      u = string.strip(str(u))
      if str(u) == '1.0':
        u = '1'
      if u not in self.uu:
        if u == "1.0":
          if float(u) not in self.uu:
            print 'UNITS NOT FOUND: %s (%s)' % (u,i.label)
        else:
          print 'UNITS NOT FOUND: %s (%s)' % (u,i.label)

  def uid(self,u0):
    if u0 in self.repl:
      u = self.repl[u0]
    else:
      u = u0
    if u in self.uu:
      return self.uu[u][7]
    elif u == '1.0' and float(u) in self.uu:
      return self.uu[float(u)][7]
    else:
      return None

class updates(object):
  delToks = sets.Set( ['inc','omit'] )

  def __init__(self,fndup,fnmult,idir='rev1'):
    assert os.path.isdir( idir ), 'Directory %s not found' % idir
    self.fdup = '%s/%s' % (idir,fndup)
    self.fmult = '%s/%s' % (idir,fnmult)
    for p in [self.fdup,self.fmult]:
      assert os.path.isfile( p ), 'File %s not found' % p
    self.repl = {}
    self.upd = {}
    self.twins = []
    self.ddel = {}
    self.cmvrepl = {}

  def writeCmvUpd( self, inx, fnrp='CMVreplace.csv'):
    oo = open( fnrp, 'w' )
    for k in self.cmvrepl.keys():
      if inx.iref_by_uid.has_key(k):
        kn = self.cmvrepl[k]
        for tag,ki in inx.iref_by_uid[k]:
          vu = [ inx.uid.has_key(kk) for kk in [k,kn,ki] ]
          if all( vu ):
            oo.write( '%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % (k,kn,tag,ki, inx.uid[k].label, inx.uid[kn].label, inx.uid[ki].label) )
          else:
            print 'ERROR.088.0003: Bad index in replace info: %s .. %s .. %s' % ( str([k,kn,ki]), str(vu), tag )
      else:
        print 'ERROR.088.0004: Bad index in replace info: %s' % k
    oo.close()

  def writeVarUpd(self, inx, fnrp='uuidreplace.csv', fnrm='uuidremove.csv', fnup='uuidupdate.csv'):
    oo = open( fnrp, 'w' )
    oo2 = open( fnrm, 'w' )
    for k in self.repl.keys():
      if inx.iref_by_uid.has_key(k):
        kn = self.repl[k]
        for tag,ki in inx.iref_by_uid[k]:
          vu = [ inx.uid.has_key(kk) for kk in [k,kn,ki] ]
          if all( vu ):
            oo.write( '%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % (k,kn,tag,ki, inx.uid[k].label, inx.uid[kn].label, inx.uid[ki].label) )
          else:
            print 'ERROR.088.0001: Bad index in replace info: %s .. %s .. %s' % ( str([k,kn,ki]), str(vu), tag )
      else:
        oo2.write( k + '\n' )
    oo.close()
    oo2.close()
    oo = open( fnup, 'w' )
    for k in self.upd.keys():
      ee = self.upd[k]
      oo.write( string.join( [k,ee['provNote'],string.join(ee['tags']),ee['label'], ee['title'] ], '\t') + '\n' )
    oo.close()

  def scandup(self):
    ii = open( self.fdup ).readlines()
    nn = (len(ii)-1)/2
    for i in range(nn):
      l1 = string.split( ii[i*2+1], '\t' )
      l2 = string.split( ii[i*2+2], '\t' )
      xx = l1[8:10]
      yy = l2[8:10]
      if xx[1] == '' and yy[1] == xx[0]:
        ths = 0
        assert not self.repl.has_key( yy[0] ), 'duplicate replacement request for %s' % yy[0]
        self.repl[ yy[0] ] = yy[1]
      elif yy[1] == '' and xx[1] == yy[0]:
        ths = 1
        assert not self.repl.has_key( xx[0] ), 'duplicate replacement request for %s' % xx[0]
        self.repl[ xx[0] ] = xx[1]
      elif l1[10] == 'twin' and l2[10] == 'twin':
        ths = 2
        self.twins.append( l1[8] )
        self.twins.append( l2[8] )
      elif l1[10] in self.delToks and l2[10] in self.delToks:
        ths = 3
        self.ddel[ l1[8] ] = (l1[10],l1[11])
        self.ddel[ l2[8] ] = (l2[10],l2[11])
      elif xx[1] == '' and yy[1] == "":
        print 'WARN.087.00001: uncorrected duplication ..... %s ' % str( l1[:5] )
      else:
        ths = -1
        print 'ERROR.xxx.0001: Match failed'
        print l1
        print l2
        assert False

  def scancmvdup(self):
    wb = workbook( 'csv2/CMORvar.xls' )
    sht = wb.book.sheet_by_name( 'Sheet1' )
    for i in range(sht.nrows):
      rr = sht.row(i)
      if len(rr) == 21 and str( rr[20].value ) != '':
        kn = rr[20].value
        ko = rr[18].value
        vn = rr[0].value
        self.cmvrepl[ ko ] = kn
        print '%s: replace %s with %s' % (vn,ko,kn)

  def scanmult(self):
    ii = open( self.fmult ).readlines()
    nn = (len(ii)-1)/3
    for i in range(nn):
      l1 = string.split( ii[i*3+1], '\t' )
      l2 = string.split( ii[i*3+2], '\t' )
      l3 = string.split( ii[i*3+3], '\t' )
      yy = [l1[9],l2[9],l3[9]]
      xx = [l1[8],l2[8],l3[8]]
      zz = (l1,l2,l3)
      for j in range(3):
        if yy[j] != '':
          assert yy[j] in xx, 'Invalid replacement option, %s' % yy[j]
          assert not self.repl.has_key( xx[j] ), 'duplicate replacement request for %s' % xx[j]
          self.repl[ xx[j] ] = yy[j]
        elif zz[j][10] == 'twin':
          self.twins.append( zz[j][8] )
        elif zz[j][11] == 'update':
          tags = map( string.strip, string.split( zz[j][13], ',' ) )
          self.upd[ xx[j] ] = { 'provNote':zz[j][12], 'tags':tags, 'label':zz[j][0], 'title':zz[j][1] }

###
### varDup and varMult created in first parse ----- then edited to select options
### 2nd pass through then generates the replace and remove options -- taking into account cross references
### the results of the 2nd pass go back to ../workbook to generate a new set of inputs.
###
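### ----------------------------------------------------------------------
### Illustrative sketch (added; not called anywhere in the workflow above).
### A minimal sanity check of an edited varDup.csv before it is handed to
### updates().scandup().  The column layout (uid in field 8, replacement uid
### in field 9, action token in field 10) is taken from scandup(); the
### function name and its argument are hypothetical.
def sketchTallyDupChoices( fdup ):
  """Tally the resolution choices recorded in an edited duplicates file."""
  cc = collections.defaultdict( int )
  for l in open( fdup ).readlines()[1:]:
    bits = string.split( l, '\t' )
    if len(bits) > 10 and bits[9] != '':
      cc['replace'] += 1
    elif len(bits) > 10:
      cc[bits[10]] += 1
  return cc
### ----------------------------------------------------------------------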
up = updates('varDup.csv', 'varMult.csv', idir='rev2')
up.scandup()
up.scancmvdup()
up.scanmult()

urep = False
urep = True
if urep:
  up.writeVarUpd( inx, fnrp='uuidreplace.csv', fnrm='uuidremove.csv', fnup='uuidupdate.csv')
  up.writeCmvUpd( inx, fnrp='CMVreplace.csv')
else:
  oo2 = open( 'uuidremove2.csv', 'w' )
  for i in dq.coll['var'].items:
    if not inx.iref_by_uid.has_key(i.uid):
      oo2.write( string.join( [i.uid,i.label,i.title,i.prov,i.description], '\t') + '\n' )
  oo2.close()

### check back references.
nbr = 0
lbr = []
for k in inx.iref_by_uid.keys():
  if not inx.uid.has_key(k):
    nbr += 1
    lbr.append(k)
print 'Missing references: ', nbr

### can now apply mappings, create updated records and write to new xml?
for i in dq.coll['requestLink'].items:
  rql_by_name[i.label].append( i.uid )
  ix_rql_uid[i.uid] = i

for i in dq.coll['requestVarGroup'].items:
  ix_rqvg_uid[i.uid] = i

if dq.coll.has_key( 'revisedTabItem' ):
  thisk = 'revisedTabItem'
else:
  thisk = 'requestVar'

oo = open( 'uuidinsert.csv', 'w' )
for i in dq.coll[thisk].items:
  if i.uid == '__new__':
    if inx.var.label.has_key( i.label ):
      if len( inx.var.label[i.label] ) == 1:
        v = inx.uid[ inx.var.label[i.label][0] ]
        oo.write( string.join( ['unique',i.label,v.label,v.uid,v.prov,i.mip], '\t' ) + '\n' )
      else:
        oo.write( string.join( ['ambiguous',i.label,i.mip,str(len(inx.var.label[i.label] ) ) ], '\t' ) + '\n' )
oo.close()

oo = open( 'varMult.csv', 'w' )
oo2 = open( 'varDup.csv', 'w' )
oo3 = open( 'varStar.csv', 'w' )
hs = ['label','title','sn','units','description','prov','procnote','procComment','uid']
oo.write( string.join(hs, '\t' ) + '\n' )
oo2.write( string.join(hs, '\t' ) + '\n' )
oo3.write( string.join(hs, '\t' ) + '\n' )
ks = inx.var.label.keys()
ks.sort()
emptySet = sets.Set( ['','unset'] )

def entryEq(a,b):
  return a == b or (a in emptySet and b in emptySet)

deferredRecs = []
for k in ks:
  if len(inx.var.label[k]) == 2:
    v1 = inx.var.uid[inx.var.label[k][0]]
    v2 = inx.var.uid[inx.var.label[k][1]]
    cc = map( lambda x: entryEq( v1.__dict__[x], v2.__dict__[x]), ['title','sn','units','description'] )
    if all(cc):
      ### where duplicates are identical, collect and output at end of file.
      pv1 = string.find( v1.__dict__['prov'], 'OMIP.' ) != -1
      pv2 = string.find( v2.__dict__['prov'], 'OMIP.' ) != -1
      if pv2:
        vp = v2
        vo = v1
      else:
        if not pv1:
          print 'WARN.088.00002: no preference: %s, %s, %s' % (v1.__dict__['label'],v1.__dict__['prov'],v2.__dict__['prov'])
        vp = v1
        vo = v2
      deferredRecs.append( string.join(map( lambda x: vo.__dict__[x], hs) + [vp.uid,'identical'], '\t' ) + '\n' )
      deferredRecs.append( string.join(map( lambda x: vp.__dict__[x], hs) + ['',''], '\t' ) + '\n' )
    else:
      try:
        oo2.write( string.join(map( lambda x: str(v1.__dict__[x]), hs) + ['',''], '\t' ) + '\n' )
        oo2.write( string.join(map( lambda x: str(v2.__dict__[x]), hs) + ['',''], '\t' ) + '\n' )
      except:
        print 'SEVERE.oo2.00001',v1.__dict__
        print 'SEVERE.oo2.00002',v2.__dict__
  elif len(inx.var.label[k]) > 1:
    for i in inx.var.label[k]:
      oo.write( string.join(map( lambda x: inx.var.uid[i].__dict__[x], hs), '\t' ) + '\n' )
  if k[-2:] == '--':
    for i in (inx.var.label[k] + inx.var.label[k[:-2]]):
      oo3.write( string.join(map( lambda x: inx.var.uid[i].__dict__[x], hs), '\t' ) + '\n' )
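## ----------------------------------------------------------------------
## Illustrative sketch (added; not used by the scan above).  Lists the
## attributes on which two duplicate variable records disagree, reusing the
## entryEq() tolerance for empty/unset values and the same attribute list
## "hs" that is written to varDup.csv.  The function name is hypothetical.
def sketchDiffAttrs( uid1, uid2 ):
  """Return the attributes in hs (other than uid) that differ between two var records."""
  v1 = inx.var.uid[uid1]
  v2 = inx.var.uid[uid2]
  return [a for a in hs if a != 'uid' and not entryEq( v1.__dict__[a], v2.__dict__[a] )]
## ----------------------------------------------------------------------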
## output auto-filled records for identical duplicates at end of varDup file.
for r in deferredRecs:
  oo2.write( r )

oo.close()
oo2.close()
oo3.close()

vns = inx.var.label.keys()
vns.sort()
for v in vns:
  if len( inx.var.label[v] ) > 1:
    print 'INFO.001.0001:',v, string.join( map( lambda x: inx.var.uid[x].sn, inx.var.label[v] ), ';' )

nok = 0
nerr = 0
if dq.coll.has_key( 'ovar' ):
  thisk = 'ovar'
else:
  thisk = 'CMORvar'
for i in dq.coll[thisk].items:
  vid = i.vid
  ix_ovar_uid[i.uid] = i
  xr_var_ovar[vid].append( i.uid )
  if not inx.var.uid.has_key(vid):
    print 'missing key:',i.label, i.prov, vid
    nerr += 1
  else:
    nok += 1

class rqHtml(object):

  def __init__(self,odir='./htmlSc/'):
    self.odir = odir
    if not os.path.isdir(odir):
      os.mkdir(odir)

  def mkRqlHtml(self,name):
    ## [u'comment', u'uid', u'tab', u'title', u'label', u'grid', 'defaults', u'objective', u'mip', 'globalDefault', u'gridreq']
    if len( rql_by_name[name] ) == 1:
      self.mkRqlHtml01(rql_by_name[name][0], name )
    else:
      self.mkRqlHtmlGp(name)

  def mkRqlHtmlGp(self,name):
    ee = {}
    ee['title'] = 'CMIP Request Link %s (with multiple definitions)' % name
    self.pageName = 'rql__%s.html' % name
    al =[]
    for i in range( len( rql_by_name[name] ) ):
      this = ix_rql_uid[rql_by_name[name][i]]
      al.append( tmpl.item % {'item':'<a href="rql__%s__%s.html">[%s]: %s</a>' % (name,i,i,this.title) } )
    ee['items'] = string.join(al, '\n' )
    ee['introduction'] = ''
    ee['htmlBody'] = tmpl.indexWrapper % ee
    ee['htmlHead'] = '''<title>%(title)s</title>''' % ee
    self.pageHtml = tmpl.pageWrapper % ee
    self.write()
    for i in range( len( rql_by_name[name] ) ):
      self.mkRqlHtml01(rql_by_name[name][i],i)

  def mkRqlHtml01(self,id, tag):
    this = ix_rql_uid[id]
    ee = {}
    if this.label == tag:
      ee['title'] = 'CMIP Request Link %s' % tag
      self.pageName = 'rql__%s.html' % tag
    else:
      ee['title'] = 'CMIP Request Link %s[%s]' % (this.label,tag)
      self.pageName = 'rql__%s__%s.html' % (this.label,tag)
    atts = this.__dict__.keys()
    atts.sort()
    al =[]
    for a in atts:
      if a not in ['defaults','globalDefault']:
        al.append( tmpl.item % {'item':'%s: %s' % (a,this.__dict__.get(a,'-- Not Set --')) } )
    ee['items'] = string.join(al, '\n' )
    ee['introduction'] = ''
    ee['htmlBody'] = tmpl.indexWrapper % ee
    ee['htmlHead'] = '''<title>%(title)s</title>''' % ee
    self.pageHtml = tmpl.pageWrapper % ee
    self.write()

  def mkVarHtml(self,name):
    if len( inx.var.label[name] ) == 1:
      self.mkVarHtml01(inx.var.label[name][0], name )
    else:
      self.mkVarHtmlGp(name)

  def mkVarHtmlGp(self,name):
    ee = {}
    ee['title'] = 'CMIP Variable %s (with multiple definitions)' % name
    self.pageName = 'var__%s.html' % name
    al =[]
    for i in range( len( inx.var.label[name] ) ):
      this = inx.var.uid[inx.var.label[name][i]]
      al.append( tmpl.item % {'item':'<a href="var__%s__%s.html">[%s]: %s</a>' % (name,i,i,this.title) } )
    ee['items'] = string.join(al, '\n' )
    ee['introduction'] = ''
    ee['htmlBody'] = tmpl.indexWrapper % ee
    ee['htmlHead'] = '''<title>%(title)s</title>''' % ee
    self.pageHtml = tmpl.pageWrapper % ee
    self.write()
    ##print 'Multi var: %s' % name
    for i in range( len( inx.var.label[name] ) ):
      self.mkVarHtml01(inx.var.label[name][i],i)

  def mkVarHtml01(self,id, tag):
    this = inx.var.uid[id]
    ee = {}
    if this.label == tag:
      ee['title'] = 'CMIP Variable %s' % tag
      self.pageName = 'var__%s.html' % tag
    else:
      ee['title'] = 'CMIP Variable %s[%s]' % (this.label,tag)
      self.pageName = 'var__%s__%s.html' % (this.label,tag)
    atts = this.__dict__.keys()
    atts.sort()
    al =[]
    for a in atts:
      if a not in ['defaults','globalDefault']:
        al.append( tmpl.item % {'item':'%s: %s' % (a,this.__dict__.get(a,'-- Not Set --')) } )
    if inx.iref_by_uid.has_key(this.uid):
      assert varRefs.has_key(this.uid), 'Problem with collected references'
      ee1 = varRefs[this.uid]
      ks = ee1.keys()
      ks.sort()
      for k in ks:
        al.append( tmpl.item % {'item':'%s: %s' % (k,string.join(ee1[k])) } )
    ee['items'] = string.join(al, '\n' )
    ee['introduction'] = ''
    ee['htmlBody'] = tmpl.indexWrapper % ee
    ee['htmlHead'] = '''<title>%(title)s</title>''' % ee
    self.pageHtml = tmpl.pageWrapper % ee
    self.write()

  def varHtml(self):
    for k in inx.var.label.keys():
      self.mkVarHtml(k)

  def rqlHtml(self):
    for k in rql_by_name.keys():
      self.mkRqlHtml(k)

  def write(self):
    oo = open( '%s/%s' % (self.odir,self.pageName), 'w' )
    oo.write( self.pageHtml )
    oo.close()

vh = rqHtml()
vh.varHtml()
vh.rqlHtml()

if nerr == 0:
  print 'CHECK 001: %s records checked, no missing references' % nok

shps = {'': 64, 'XYZKT': 13, '4-element vector': 2, 'XYT': 476, '2D vector field ': 2, 'KZT': 4, '2D vector field': 2,
        'XYZ': 27, 'XYZT': 204, '2D': 83, 'scalar': 14, 'XY': 88, '?': 21, '2D ': 1, 'XYKT': 3, 'YZT': 16, 'ZST1': 15,
        'XKT': 2, 'BasinYT': 1}

vshpchkMap = {'':'', u'all model levels above 400hPa':'alevStrat', u'all':'Xlev', 3.0:'plev3', '4.0':'plev4',
              36.0:'plev36', u'soil levels':'sdepth',
              1.0:'sfc?',
              16.0:'plev16', 7.0:'plev7', 40.0:'plev40', u'all*':'Xlev', 14.0:'plev14', u'Model levels or 27Plevs':'alev|plev27',
              u'17 (or 23 )':'plev17|plev23', u'17 (or 23)':'plev17|plev23',
              27.0:'plev27', 17.0:'plev17', u'17 (or23)':'plev17|plev23', 8.0:'plev8', u'all model levels':'alev', 5.0:'plev5'}

ks = vshpchkMap.keys()
for k in ks:
  if type(k) == type(0.):
    vshpchkMap[str(k)] = vshpchkMap[k]

tsmap = { 'mean':[u'daily mean', u'time mean', u'time: day', u'Cumulative annual fraction', u'Time mean', u'weighted time mean', u'time: mean', u'mean', u'Mean'],
          '__unknown__':['','dummyAt'],
          'point':[ u'Instantaneous (end of year)', u'point', u'Synoptic', u'instantaneous', u'time: point', u'synoptic'] }
tsmap2 = {}
for k in tsmap.keys():
  for i in tsmap[k]:
    tsmap2[i] = k

if dq.coll.has_key( 'groupItem' ):
  ee = collections.defaultdict( int )
  for i in dq.coll['groupItem'].items:
    tst = tsmap2[ i.tstyle ]
    dd = ''
    if 'X' in i.shape:
      dd += 'latitude '
    if 'Y' in i.shape:
      dd += 'longitude '
    if 'Z' in i.shape:
      if i.levels == '':
        print 'ERROR.001.0001: no levels specified', i.label, i.title
      else:
        zdim = vshpchkMap[i.levels]
        dd += zdim
    ## print '%s::%s::%s|%s' % (i.shape, i.levels, i.tstyle, dd)

  for i in dq.coll['groupItem'].items:
    list_gp_ovar[i.gpid].append( i.uid )

  nok = 0
  nerr = 0
  for i in dq.coll['groupItem'].items:
    vid = i.vid
    ix_gpi_uid[i.uid] = i
    xr_var_gpi[vid].append( i.uid )
    if not inx.var.uid.has_key(vid):
      nerr += 1
    else:
      nok += 1
  ##print 'groupItem to var crossref: nok = %s, nerr = %s' % (nok, nerr)

class tcmp(object):
  def __init__(self):
    pass
  def cmp(self,x,y):
    return cmp(x.title,y.title)

def atRepr(l,x,optional=False):
  if x != None:
    if optional:
      v = l.__dict__.get(x, '' )
    else:
      v = l.__dict__[x]
  else:
    v = l
  if v == '__unset__':
    return ''
  elif type(v) in [ type([]), type(())]:
    return string.join([str(i) for i in v])
  else:
    return v

class xlsx(object):
  def __init__(self,fn):
    self.wb = xlsxwriter.Workbook(fn)

  def newSheet(self,name):
    self.worksheet = self.wb.add_worksheet(name=name)
    return self.worksheet

  def close(self):
    self.wb.close()

def dumpxlsx( fn, key, atl):
  wb = xlsx( fn )
  sht = wb.newSheet( key )
  j = 0
  for i in range(len(atl)):
    sht.write( j,i, atl[i] )
  ll = dq.coll[key].items[:]
  ll.sort( tcmp().cmp )
  for l in ll:
    uid = atRepr(l,'uid')
    j+=1
    i=0
    for x in atl:
      sht.write( j,i, atRepr(l,x) )
      i+=1
    if key == 'var' and refpix != None:
      if uid in refpix:
        p = refpix[uid][2]
      else:
        p = 201
      sht.write( j,i, p )
  wb.close()

def dumpcsv( fn, key, atl, optionalSet='' ):
  oo = open(fn, 'w' )
  ll = dq.coll[key].items[:]
  ll.sort( tcmp().cmp )
  oo.write( string.join( atl, '\t' ) + '\n' )
  for l in ll:
    try:
      oo.write( string.join( map( lambda x: str(atRepr(l,x,x in optionalSet)), atl), '\t' ) + '\n' )
    except:
      print 'SEVERE.090.0001: print %s' % str(atl)
      print l
      print key
      raise
  oo.close()

def atlSort( ll ):
  oo = []
  l1 = ['label','title']
  l2 = ['uid','defaults','globalDefault']
  for i in l1:
    if i in ll:
      oo.append(i)
  ll.sort()
  for i in ll:
    if i not in l1 + l2:
      oo.append(i)
  if 'uid' in ll:
    oo.append( 'uid' )
  return oo

for k in dq.coll.keys():
  if len( dq.coll[k].items ) > 0:
    expl = dq.coll[k].items[0]
    atl = atlSort( expl.__dict__.keys() )
    atl1 = [a for a in atl if a != 'parent' and a[0] != '_']
    ###print k, atl1
    optionalSet = set( [a for a in atl1 if not expl.__class__.__dict__[a].required] )
    dumpcsv( 'csv2/%s.csv' % k, k, atl1, optionalSet=optionalSet )
    if k == 'var':
      dumpxlsx( 'csv2/var.xlsx', k, atl1 )

class annotate(object):
  def __init__(self,src,dreq):
    assert os.path.isfile( src), '%s not found' % src
    self.doc = xml.dom.minidom.parse( src )
    self.dreq = dreq
    self.removedUids = {}

  def iniVar(self,dq):
    a = addUnits(dq)
    this = self.doc.getElementsByTagName('var')[0]
    dil = this.getElementsByTagName('item')
    self.vid = {}
    for item in dil:
      uid = item.getAttribute( 'uid' )
      title = item.getAttribute( 'title' )
      label = item.getAttribute( 'label' )
      self.vid[uid] = (label,title)
      units = item.getAttribute( 'units' )
      if units in a.repl:
        print 'INFO.units.0001: replacing %s --> %s' % (units,a.repl[ units ])
        units = a.repl[ units ]
      u2 = a.uid( units )
      if u2 == None:
        units = string.strip( str(units) )
        u2 = a.uid( units )
      if u2 != None:
        item.setAttribute( 'unid', u2 )
      else:
        print 'Units not recognised: %s (%s) -- %s' % (units,label, type(units))

  def tableMap(self,dq):
    that = self.doc.getElementsByTagName('CMORvar')[0]
    dil = that.getElementsByTagName('item')
    ee1 = {}
    for item in dil:
      mt = item.getAttribute( 'mipTable' )
      if mt in mipTableMap:
        mt1 = mipTableMap[mt]
        item.setAttribute( 'mipTable', mt1 )
        item.setAttribute( 'mtid', 'MIPtable::%s' % mt1 )

  def strTtl(self,dq):
    this = self.doc.getElementsByTagName('cellMethods')[0]
    dil = this.getElementsByTagName('item')
    eecm = {}
    for item in dil:
      eecm[item.getAttribute( 'uid' )] = item.getAttribute( 'label' )
    that = self.doc.getElementsByTagName('spatialShape')[0]
    dil = that.getElementsByTagName('item')
    ee1 = {}
    for item in dil:
      uid = item.getAttribute( 'uid' )
      lab = item.getAttribute( 'label' )
      ttl = item.getAttribute( 'title' )
      ee1[uid] = (lab,ttl)
    this = self.doc.getElementsByTagName('structure')[0]
    dil = this.getElementsByTagName('item')
    filterStr = True
    estr = []
    strForce = ['str-a076','str-x269','str-x100']
    for item in dil:
      uid = item.getAttribute( 'uid' )
      lab = item.getAttribute( 'label' )
      if len( dq.inx.iref_by_uid[uid] ) == 0 and filterStr:
        if lab not in strForce:
          print 'UNUSED STRUCTURE: %s, %s' % (lab,uid)
          this.removeChild( item )
          self.removedUids[uid] = 'structure: %s' % lab
      else:
        tmid = item.getAttribute( 'tmid' )
        spid = item.getAttribute( 'spid' )
        cmid = item.getAttribute( 'cmid' )
        cml = eecm.get( cmid, '' )
        if cml != '':
          cml = ' [%s]' % cml
        o = item.getAttribute( 'odims' )
        c = item.getAttribute( 'coords' )
        if spid not in ee1:
          print 'SEVERE:spid.0001: spid not found: %s' % spid
          sl = '__unknown__'
          st = '__unknown__'
        else:
          sl,st = ee1[spid]
        if tmid not in dq.inx.uid:
          print 'BAD time record uid: ',tmid
          print lab, uid
          raise
        title = '%s, %s [%s]' % (dq.inx.uid[tmid].title, st, sl)
        if string.strip( c ) != '' or string.strip( o ) != '':
          title += ' {%s:%s}' % (o,c)
        title += cml
        if title != item.getAttribute( 'title' ):
          print 'STRUCT: %s:: %s' % (title, item.getAttribute( 'title' ) )
          if item.getAttribute( 'title' )[:4] == 'str-':
            item.setAttribute( 'title', title )
            estr.append( item.getAttribute( 'label' ) )
    if len(estr) > 0:
      oo = open( 'scandreq_estr.txt', 'w' )
      for i in estr:
        oo.write( i + '\n' )
      oo.close()

  def rvgCheck(self,dq):
    """Remove request variable groups which have no requestLink"""
    this = self.doc.getElementsByTagName('requestVarGroup')[0]
    dil = this.getElementsByTagName('item')
    nn = 0
    for item in dil:
      uid = item.getAttribute( 'uid' )
      if ('requestLink' not in dq.inx.iref_by_sect[uid].a) and ('tableSection' not in dq.inx.iref_by_sect[uid].a):
        if item.getAttribute( 'label' ) in ['aermonthly']:
          print 'WARN.0010: overriding variable group pruning: ',item.getAttribute( 'label' )
        else:
          ##self.removedUids[ uid ] = 'requestVarGroup: %s' % item.getAttribute( 'label' )
          ##this.removeChild(item)
          print 'INFO.rvg.0001: New rvg?? ',uid, item.getAttribute( 'label' )
          nn+=1
    ##print 'WARN.Unused variable groups removed: %s' % nn

    this = self.doc.getElementsByTagName('requestVar')[0]
    dil = this.getElementsByTagName('item')
    nn = 0
    s1 = {i.uid for i in dq.coll['requestVarGroup'].items if i.uid not in self.removedUids}
    self.usedCmv = set()
    for item in dil:
      uid = item.getAttribute( 'uid' )
      vid = item.getAttribute( 'vid' )
      vgid = item.getAttribute( 'vgid' )
      if vgid not in s1:
        this.removeChild(item)
        self.removedUids[ uid ] = 'requestVar: %s' % item.getAttribute( 'label' )
        nn+=1
      else:
        self.usedCmv.add( vid )
    print 'Unused request variables removed: %s' % nn

  def rqvCheck(self,dq):
    wb = workbook( 'ingest/rqv_cmv_remap.xls' )
    rqvredirect = {('aermonthly','siconc'):'SImon' }
    sh = wb.book.sheet_by_name('maps')
    self.repl = {}
    self.uu = {}
    for j in range(sh.nrows):
      r = [str(x.value) for x in sh.row(j)]
      rqvredirect[ ( r[0], r[1] ) ] = r[2]
    this = self.doc.getElementsByTagName('requestVar')[0]
    dil = this.getElementsByTagName('item')
    badrqv = set()
    self.removedCmv = set()
    for item in dil:
      uid = item.getAttribute( 'uid' )
      vid = item.getAttribute( 'vid' )
      if vid not in dq.inx.uid:
        badrqv.add( uid )
      elif dq.inx.uid[vid]._h.label == 'remarks':
        badrqv.add( uid )
      else:
        cmv = (dq.inx.uid[vid].mipTable,dq.inx.uid[vid].label)
        if cmv in rqvredirect:
          targ = (rqvredirect[cmv],dq.inx.uid[vid].label)
          if targ not in self.cmvLookUp:
            print 'SEVERE.rgvrdi.00001: attempt to redirect to non-existent var: %s --> %s' % (str(cmv),str(targ))
          else:
            nid = self.cmvLookUp[targ]
            item.setAttribute( 'vid', nid )
            self.removedCmv.add( vid )
            print 'INFO.rgvrdi.00002: redirect rql to new CMOR var: %s --> %s' % (str(cmv),str(targ))

  def cmvCheck2(self,dq):
    this = self.doc.getElementsByTagName('CMORvar')[0]
    dil = this.getElementsByTagName('item')
    self.usedVar = set()
    self.keepVar = {'ugrido',}
    for item in dil:
      uid = item.getAttribute( 'uid' )
      if ((uid not in self.usedCmv) or (uid in self.removedCmv)) and (item.getAttribute( 'label' ) not in self.keepVar):
        print 'INFO.cmv.04004: removing unused CMORvar: %s, %s, %s, %s: ' % (uid,item.getAttribute( 'label' ),item.getAttribute( 'table' ),item.getAttribute( 'mipTable' ))
        this.removeChild(item)
        self.removedUids[ uid ] = 'CMORvar: %s' % item.getAttribute( 'label' )
      else:
        self.usedVar.add( item.getAttribute( 'vid' ) )

    this = self.doc.getElementsByTagName('var')[0]
    dil = this.getElementsByTagName('item')
    self.varLookUp = {}
    for item in dil:
      uid = item.getAttribute( 'uid' )
      self.varLookUp[ item.getAttribute( 'label' ) ] = uid
      if uid not in self.usedVar and item.getAttribute( 'label' ) not in self.keepVar:
        print 'INFO.var.04004: removing unused var: %s, %s, %s, %s: ' % (uid,item.getAttribute( 'label' ),item.getAttribute( 'table' ),item.getAttribute( 'units' ))
        this.removeChild(item)
        self.removedUids[ uid ] = 'var: %s' % item.getAttribute( 'label' )

    this = self.doc.getElementsByTagName('CMORvar')[0]
    wb = workbook( 'ingest/extraCmv.xls' )
    sh = wb.book.sheet_by_name('extra')
    extra = []
    self.uu = {}
    for j in range(1,sh.nrows):
      extra.append( [str(x.value) for x in sh.row(j)] + ['',''] )
    extra_old = [('E6hrZ','ps','str-a076','ps','atmos','Surface Pressure','Surface Pressure .. needed for vertical coordinates','6hr', 'CMIP extra','scanDreq.py','1','float',''),
                 ('AERmon','ps','str-013','ps','atmos','Surface Pressure','Surface Pressure .. needed for vertical coordinates','mon', 'CMIP extra','scanDreq.py','1','float',''),
                 ('E3hrPt','ps','str-d11','ps','atmos','Surface Pressure','Surface Pressure .. needed for vertical coordinates','3hr', 'CMIP extra','scanDreq.py','1','float',''),]
    eh = ['label','modeling_realm','title','description','frequency','provNote','prov','defaultPriority','type','positive']
    hids = set()
    idfp = eh.index('defaultPriority') + 3
    for e in extra:
      assert e[11] in ['float'], 'bad type in cmv extra ... see ingest/extraCmv.xls: %s' % e[11]
      if (e[0],e[1]) not in self.cmvLookUp:
        e[idfp] = str( int( float( e[idfp] ) ) )
        stid = self.strLookUp[e[2]]
        vid = self.varLookUp[e[1]]
        new = self.doc.createElement( 'item' )
        new.setAttribute( 'mipTable', e[0] )
        new.setAttribute( 'mtid', 'MIPtable::%s' % e[0] )
        new.setAttribute( 'stid', stid )
        new.setAttribute( 'vid', vid )
        new.setAttribute( 'rowIndex', '0' )
        for a in ['deflate','deflate_level','shuffle']:
          new.setAttribute( a, '' )
        hid = hashlib.new( 'sha1', 'extra-fields::' + ':'.join(e[:8]) ).hexdigest()
        assert hid not in hids
        new.setAttribute( 'uid', hid )
        print 'INFO.extra.00011: ',e
        for k in range(len(eh) ):
          new.setAttribute( eh[k], e[3+k] )
        this.appendChild( new )
      else:
        print 'SKIPPING extraCMV: %s.%s' % (e[0],e[1])

  def cmvCheck(self,dq):
    this = self.doc.getElementsByTagName('CMORvar')[0]
    dil = this.getElementsByTagName('item')
    kk = 0
    kka = 0
    nrm0 = 0
    nrm1 = 0
    self.cmvLookUp = {}
    for item in dil:
      title = item.getAttribute( 'title' )
      if title[:6] == '__from':
        kka += 1
        vid = item.getAttribute( 'vid' )
        if vid in self.vid:
          title2 = self.vid[vid][1]
          item.setAttribute( 'title', title2 )
          kk += 1
      realm = item.getAttribute( 'modeling_realm' )
      if realm in ['','?']:
        stid = item.getAttribute( 'stid' )
        st = dq.inx.uid[stid]
        odims = st.odims
        this = None
        if odims == 'iceband':
          this = 'seaIce'
        elif odims not in ['','?']:
          this = 'atmos'
        else:
          sp = dq.inx.uid[st.spid]
          if sp.label in ['TR-na','XY-O', 'YB-O', 'YB-R']:
            this = 'ocean'
          elif sp.label != 'XY-na':
            this = 'atmos'
          else:
            prov = item.getAttribute( 'prov' )
            if prov in knowna:
              this = 'atmos'
            elif prov in knowno:
              this = 'ocean'
            elif prov in knownl:
              this = 'land'
            else:
              lab = item.getAttribute( 'label' )
              print 'ERROR.cmv.00006: no realm: %s, %s, %s, %s ..' % (lab,odims, sp.label, prov)
        if this == None:
          nrm1 += 1
        else:
          nrm0 += 1
          item.setAttribute( 'modeling_realm', this )
      self.cmvLookUp[(item.getAttribute( 'mipTable' ),item.getAttribute( 'label' ) )] = item.getAttribute( 'uid' )
    print ('CMOR Var realms set: %s' % nrm0 )
    if nrm1 > 0:
      print ('SEVERE.cmv.00005: realm unset: %s' % nrm1)
    print ('CMOR Var titles reset: %s [%s]' % (kk,kka))

  def mipProv(self,dq):
    s1 = re.compile( '\[([a-zA-Z0-9]*)\]' )
    cc = collections.defaultdict(list)
    dd = collections.defaultdict(int)
    mips = set()
    for i in dq.coll['mip'].items:
      mips.add( i.uid )
    for i in dq.coll['var'].items:
      cc[i.prov].append( i.label )
    ee = {}
    for i in sorted( cc.keys() ):
      if i[:9] == 'CMIP6 end':
        m = s1.findall( i )
        assert len( m ) == 1, 'FAILED TO PARSE: %s' % i
        this = m[0]
      else:
        i5 = i.find( 'CMIP5' ) != -1
        io = i.find( 'OMIP' ) != -1
        icx = i.find( 'CORDEX' ) != -1
        ip = i.find( 'PMIP' ) != -1
        icc = i.find( 'CCMI' ) != -1
        isp = i.find( 'SPECS' ) != -1
        icf = i.find( 'CFMIP' ) != -1
        iac = i.find( 'AerChemMIP' ) != -1
        if i5 and io:
          print 'WARNING .. unclear provenance: ',i,cc[i]
          this = 'CMIP5/OMIP'
        elif i5:
          this = 'CMIP5'
        elif io:
          this = 'OMIP'
        elif icx:
          this = 'CORDEX'
        elif ip:
          this = 'PMIP'
        elif icc:
          this = 'CCMI'
        elif isp:
          this = 'SPECS'
        elif icf:
          this = 'CFMIP'
        elif iac:
          this = 'AerChemMIP'
        else:
          print 'WARNING .. unclear provenance [2]: ',i,cc[i]
          this = 'unknown'
      ee[i] = this
      dd[this] += len( cc[i] )
    self.dd = dd
    self.ee = ee

    this = self.doc.getElementsByTagName('var')[0]
    dil = this.getElementsByTagName('item')
    print 'FIXING var provmip attribute, %s items' % len(dil)
    kk = 0
    for item in dil:
      kk += 1
      p = item.getAttribute( 'prov' )
      p0 = item.getAttribute( 'provmip' )
      if p0 not in mips:
        if p in mips:
          item.setAttribute( 'provmip', p )
        else:
          assert ee.has_key(p), 'Unmatched key: %s' % p
          assert ee[p] in mips, 'Unknown provenance: %s, %s' % (p,ee[p])
          item.setAttribute( 'provmip', ee[p] )

  def fixCellMethods(self,dq):
    this = self.doc.getElementsByTagName('structure')[0]
    dil = this.getElementsByTagName('item')
    cmrep = collections.defaultdict( set )
    cmc = collections.defaultdict( int )
    self.strLookUp = {}
    nrep = 0
    for item in dil:
      self.strLookUp[ item.getAttribute( 'label' ) ] = item.getAttribute( 'uid' )
      cm = item.getAttribute( 'cell_methods' )
      if string.find( cm, "area: where" ) != -1:
        cm1 = string.replace( cm, "area: where", "area: mean where" )
        item.setAttribute( 'cell_methods', cm1 )
        cmrep[cm].add(cm1)
        cmc[cm1] += 1
        nrep += 1
      elif string.find( cm, "time:mean" ) != -1:
        cm1 = string.replace( cm, "time:mean", "time: mean" )
        item.setAttribute( 'cell_methods', cm1 )
        cmrep[cm].add(cm1)
        cmc[cm1] += 1
        nrep += 1
      elif string.find( cm, "weighted b " ) != -1:
        cm1 = string.replace( cm, "weighted b ", "weighted by " )
        item.setAttribute( 'cell_methods', cm1 )
        cmrep[cm].add(cm1)
        cmc[cm1] += 1
        nrep += 1
    print ('FIXED CELL METHODS .. %s' % nrep )
    for k in cmc:
      print ('%s: %s' % (k,cmc[k]) )

  ##
  ## defective code .. can not easily do structure mapping here .. need to think about that ...
  ##
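  ##
  ## sectionCopy() below clones selected Oyr CMORvar records (OMIP bgc entries
  ## with rowIndex < 65) into Omon copies at default priority 2 and attaches a
  ## matching requestVar record to the OMIP-Omon group for each copy.  It is
  ## not run by default: the an.sectionCopy(dq) call in the driver section at
  ## the end of this file is commented out.
  ##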
  def sectionCopy(self,dq):
    this = self.doc.getElementsByTagName('CMORvar')[0]
    thisRqv = self.doc.getElementsByTagName('requestVar')[0]
    xx = [i for i in dq.coll['requestVarGroup'].items if i.label == 'OMIP-Omon']
    assert len(xx) == 1, 'OMIP-Omon request variable group not found'
    omipOmonUid = xx[0].uid
    dil = this.getElementsByTagName('item')
    for item in dil:
      mipt = item.getAttribute( 'mipTable' )
      prov = item.getAttribute( 'prov' )
      provn = item.getAttribute( 'provNote' )
      if mipt == 'Oyr' and prov[:12] == "CMIP6 [OMIP]" and provn == 'bgc':
        rowix = int( item.getAttribute( 'rowIndex' ) )
        if rowix < 65:
          var = item.getAttribute( 'label' )
          new = item.cloneNode(True)
          new.setAttribute( 'defaultPriority', '2' )
          new.setAttribute( 'mipTable', 'Omon' )
          new.setAttribute( 'prov', 'Copy from Oyr' )
          new.setAttribute( 'provNote', 'sdq.001' )
          vid = str( uuid.uuid1() )
          new.setAttribute( 'uid', vid )
          this.appendChild( new )
          ##
          ## create request var
          ##
          new2 = self.doc.createElement( 'item' )
          uid = str( uuid.uuid1() )
          new2.setAttribute( 'uid', uid )
          new2.setAttribute( 'priority', '2' )
          new2.setAttribute( 'vid', vid )
          new2.setAttribute( 'vgid', omipOmonUid )
          new2.setAttribute( 'mip', 'OMIP' )
          new2.setAttribute( 'table', 'OMIP-Omon' )
          if omipOmonUid not in dq.inx.uid:
            print 'ERROR.005.0001: vgid %s not found' % omipOmonUid
          thisRqv.appendChild(new2)

  def missingRefs(self,mrefs,dq,clear=True):
    this = self.doc.getElementsByTagName('remarks')[0]
    if clear:
      dil = this.getElementsByTagName('item')
      for d in dil:
        this.removeChild(d)
    for k in mrefs.keys():
      if len( mrefs[k] ) == 1:
        tid = mrefs[k][0][2]
        tattr = mrefs[k][0][1]
        tn = None
      else:
        tid = None
        ee = collections.defaultdict(int)
        tn = str( len( mrefs[k] ) )
        for t in mrefs[k]:
          s = self.dreq.inx.uid[t[2]]._h.label
          ee['%s.%s' % (s,t[1])] += 1
        if len( ee.keys() ) == 1:
          tattr = ee.keys()[0]
        else:
          tattr = '__multiple__'
      if tid == None or (tid not in self.removedUids):
        item = self.doc.createElement( 'item' )
        assert type(k) == type( '' ), 'Attempt to set uid with bad type: %s' % str(k)
        item.setAttribute( 'uid', k )
        item.setAttribute( 'tattr', tattr )
        if tn != None:
          item.setAttribute( 'techNote', tn )
        if tid != None:
          item.setAttribute( 'tid', tid )
          if tid not in dq.inx.uid:
            print 'ERROR.005.0002: tid %s not found' % tid
        item.setAttribute( 'class', 'missingLink' )
        item.setAttribute( 'description', 'Missing links detected and marked for fixing' )
        item.setAttribute( 'prov', 'scanDreq.py:annotate' )
        this.appendChild( item )

    parent = self.doc.getElementsByTagName('annex')[0]
    for this in parent.childNodes:
      if this.nodeType == this.ELEMENT_NODE:
        dil = this.getElementsByTagName('item')
        print 'INFO.nodescan.00001: ',this.localName,len(dil)
        for item in dil:
          for k in item.attributes.keys():
            v = item.getAttribute( k )
            if type( v ) not in [type( '' ),type( u'' )]:
              print 'SEVERE.0001: tuple in attribute value',this.localName,k,v

    txt = self.doc.toprettyxml(indent='\t', newl='\n', encoding=None)
    oo = open( 'out/annotated_20150731_i1.xml', 'w' )
    lines = string.split( txt, '\n' )
    for line in lines:
      l = utils_wb.uniCleanFunc( string.strip(line) )
      if empty.match(l):
        continue
      else:
        oo.write(l + '\n')
    oo.close()

def anno():
  oo = open( 'var1.csv', 'w' )
  ks = ['label','title','sn','units','description','prov','procnote','procComment','uid']
  ks2 = [ 'ovar','groupItem','revisedTabItem']
  oo.write( string.join(ks + ks2, '\t' ) + '\n' )
  for i in dq.coll['var'].items:
    if i.label[-2:] != '--':
      ee1 = varRefs.get( i.uid, {} )
      r2 = map( lambda x: string.join( atRepr( ee1.get(x, [] ), None ) ), ks2 )
      oo.write( string.join(map( lambda x: atRepr(i,x), ks) + r2, '\t' ) + '\n' )
  oo.close()
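###
### Note on the driver sequence below: the ordering of the annotation passes
### matters.  cmvCheck() builds cmvLookUp, which rqvCheck() and cmvCheck2()
### both use; fixCellMethods() builds strLookUp and rvgCheck() builds usedCmv,
### both needed by cmvCheck2() when it prunes and extends the CMORvar section;
### iniVar() builds the uid-to-(label,title) map used by cmvCheck().
###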
print 'ANNOTATING: ', dq.c.docl[0][0]
an = annotate( dq.c.docl[0][0], dq )

###############################################################################
### mode 3: using exports/a1/ ...
### should add structures to exports/a1/ ... i.e. structures + spatial and temporal shapes + CMOR dimensions.
###
###an.sectionCopy(dq)
an.iniVar( dq )
an.fixCellMethods(dq)
an.mipProv(dq)
an.cmvCheck(dq)
an.rvgCheck(dq)
an.strTtl(dq)
an.rqvCheck(dq)
an.cmvCheck2(dq)

for k in an.removedUids:
  print 'WARN.REMOVED: %s: %s' % (k,an.removedUids[k])

an.missingRefs( dq.inx.missingIds, dq )

if __name__ == '__main__':
  anno()