from dreqPy import dreq
import hashlib
import collections, string, os, utils_wb
import htmlTemplates as tmpl
import xml, re, uuid
import xml.dom, xml.dom.minidom
import sets
import xlsxwriter
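## mipTableMap: legacy MIP table spellings mapped onto their corrected names (applied in annotate.tableMap).
## knowna/knownl/knowno: provenance strings whose modeling realm is known in advance
## (atmosphere, land, ocean respectively); used by annotate.cmvCheck when the realm cannot
## be inferred from the structure record.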
mipTableMap = {'cfMon':'CFmon' }
knowna = [ 'LS3MIP [LWday]', 'LS3MIP [LEday]', 'CFMIP [cf1hrClimMon]', 'HighResMIP [3hr_cloud]',
'CFMIP [cf3hr_sim_new]', 'C4MIP [L_3hr]', 'DAMIP [DAMIP_day]', 'DAMIP [DAMIP_3hr_p2]',
'DynVar [DYVR_daily_c]', 'PMIP [PMIP-6hr]', 'HighResMIP [1hrLev]', 'PMIP [PMIP-Amon-02]',
'PMIP [PMIP-day-02]', 'PMIP [PMIP-aero-02]']
knownl = [ 'PMIP [PMIP-Lmon-02]']
knowno = [ 'DAMIP [DAMIP_Omon_p2]', 'FAFMIP [fafOyr]', 'PMIP [PMIP-Omon-02]']
from utils_wb import uniCleanFunc
from utils_wb import workbook
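## refDefaultP.txt, if present, supplies a default priority per variable uid (tab separated,
## with the fourth field read as an integer priority); refpix is consulted when writing
## csv2/var.xlsx in dumpxlsx below.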
if os.path.isfile( 'refDefaultP.txt' ):
refpix = {}
for l in open( 'refDefaultP.txt' ).readlines():
bits = string.split( string.strip(l), '\t' )
assert len(bits) == 4, 'Bad record found in %s' % 'refDefaultP.txt'
refpix[bits[0]] = (bits[1],bits[2], int(bits[3]) )
else:
refpix = None
empty=re.compile('^$')
src1 = '../workbook/trial2_20150831.xml'
dq = dreq.loadDreq(dreqXML=src1, manifest=None)
inx = dq.inx
##inx.makeVarRefs()
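## Cross-reference dictionaries filled in below: uid indexes for requestLink, requestVarGroup,
## CMORvar ('ovar') and groupItem records; reverse maps from a variable uid to the CMORvar and
## groupItem records that use it; and request links grouped by label.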
ix_rql_uid = {}
ix_rqvg_uid = {}
ix_ovar_uid = {}
ix_gpi_uid = {}
list_gp_ovar = collections.defaultdict( list )
xr_var_ovar = collections.defaultdict( list )
xr_var_gpi = collections.defaultdict( list )
rql_by_name = collections.defaultdict( list )
def makeVarRefs(uid, var, iref_by_uid):
varRefs = {}
for thisuid in var.uid.keys():
if iref_by_uid.has_key(thisuid):
ee1 = collections.defaultdict( list )
for k,i in iref_by_uid[thisuid]:
thisi = uid[i]
sect = thisi._h.label
if sect == 'groupItem':
ee1[sect].append( '%s.%s' % (thisi.mip, thisi.group) )
elif sect == 'ovar':
ee1[sect].append( thisi.mipTable )
elif sect == 'revisedTabItem':
ee1[sect].append( '%s.%s' % (thisi.mip, thisi.table) )
varRefs[thisuid] = ee1
return varRefs
varRefs = makeVarRefs( inx.uid, inx.var, inx.iref_by_uid)
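## addUnits: reads units/units.xlsx and builds self.repl (unit string -> preferred replacement)
## and self.uu (one record per recognised unit). uid() returns the unit record identifier
## (column 8 of the sheet) for a given unit string, or None if the unit is not recognised.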
class addUnits(object):
def __init__(self,dq,wbi='units/units.xlsx'):
eqs = [('N m-1', 'kg s-2', 'J m-2'), ('Pa', 'N m-2', 'kg m-1 s-2'), ('cm-1','km-1','m-1') ]
eqss = ['N m-1','Pa','m-1','m','s','1','m s-2','m2','m3' ]
wb = workbook( wbi )
sh = wb.book.sheet_by_name('units')
self.repl = {}
self.uu = {}
for j in range(1,sh.nrows):
r = sh.row(j)
if len(r) > 5 and r[5].value != u'':
self.repl[ r[0].value ] = r[5].value
else:
self.uu[r[0].value] = [x.value for x in r]
for k in self.repl:
if self.repl[k] not in self.uu:
print 'Bad replacement found: %s --> %s' % (k,self.repl[k])
for i in dq.coll['var'].items:
if i.units in self.repl:
u = self.repl[i.units]
else:
u = i.units
u = string.strip(str(u))
if str(u) == '1.0':
u = '1'
if u not in self.uu:
if u == "1.0":
if float(u) not in self.uu:
print 'UNITS NOT FOUND: %s (%s)' % (u,i.label)
else:
print 'UNITS NOT FOUND: %s (%s)' % (u,i.label)
def uid(self,u0):
if u0 in self.repl:
u = self.repl[u0]
else:
u = u0
if u in self.uu:
return self.uu[u][7]
elif u == '1.0' and float(u) in self.uu:
return self.uu[float(u)][7]
else:
return None
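## updates: reads the hand-edited duplicate (varDup) and multiplicity (varMult) csv files plus
## csv2/CMORvar.xls, and collects replacement, twin, deletion and update instructions keyed by
## variable uid. writeVarUpd/writeCmvUpd dump these as uuidreplace/uuidremove/uuidupdate and
## CMVreplace csv files for the next build of the request.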
class updates(object):
delToks = sets.Set( ['inc','omit'] )
def __init__(self,fndup,fnmult,idir='rev1'):
assert os.path.isdir( idir ), 'Directory %s not found' % idir
self.fdup = '%s/%s' % (idir,fndup)
self.fmult = '%s/%s' % (idir,fnmult)
for p in [self.fdup,self.fmult]:
assert os.path.isfile( p ), 'File %s not found' % p
self.repl = {}
self.upd = {}
self.twins = []
self.ddel = {}
self.cmvrepl = {}
def writeCmvUpd( self, inx, fnrp='CMVreplace.csv'):
oo = open( fnrp, 'w' )
for k in self.cmvrepl.keys():
if inx.iref_by_uid.has_key(k):
kn = self.cmvrepl[k]
for tag,ki in inx.iref_by_uid[k]:
vu = [ inx.uid.has_key(kk) for kk in [k,kn,ki] ]
if all( vu ):
oo.write( '%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % (k,kn,tag,ki, inx.uid[k].label, inx.uid[kn].label, inx.uid[ki].label) )
else:
print 'ERROR.088.0003: Bad index in replace info: %s .. %s .. %s' % ( str([k,kn,ki]), str(vu), tag )
else:
print 'ERROR.088.0004: Bad index in replace info: %s' % k
oo.close()
def writeVarUpd(self, inx, fnrp='uuidreplace.csv', fnrm='uuidremove.csv', fnup='uuidupdate.csv'):
oo = open( fnrp, 'w' )
oo2 = open( fnrm, 'w' )
for k in self.repl.keys():
if inx.iref_by_uid.has_key(k):
kn = self.repl[k]
for tag,ki in inx.iref_by_uid[k]:
vu = [ inx.uid.has_key(kk) for kk in [k,kn,ki] ]
if all( vu ):
oo.write( '%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % (k,kn,tag,ki, inx.uid[k].label, inx.uid[kn].label, inx.uid[ki].label) )
else:
print 'ERROR.088.0001: Bad index in replace info: %s .. %s .. %s' % ( str([k,kn,ki]), str(vu), tag )
else:
oo2.write( k + '\n' )
oo.close()
oo2.close()
oo = open( fnup, 'w' )
for k in self.upd.keys():
ee = self.upd[k]
oo.write( string.join( [k,ee['provNote'],string.join(ee['tags']),ee['label'], ee['title'] ], '\t') + '\n' )
oo.close()
def scandup(self):
ii = open( self.fdup ).readlines()
nn = (len(ii)-1)/2
for i in range(nn):
l1 = string.split( ii[i*2+1], '\t' )
l2 = string.split( ii[i*2+2], '\t' )
xx = l1[8:10]
yy = l2[8:10]
if xx[1] == '' and yy[1] == xx[0]:
ths = 0
assert not self.repl.has_key( yy[0] ), 'duplicate replacement request for %s' % yy[0]
self.repl[ yy[0] ] = yy[1]
elif yy[1] == '' and xx[1] == yy[0]:
ths = 1
assert not self.repl.has_key( xx[0] ), 'duplicate replacement request for %s' % xx[0]
self.repl[ xx[0] ] = xx[1]
elif l1[10] == 'twin' and l2[10] == 'twin':
ths = 2
self.twins.append( l1[8] )
self.twins.append( l2[8] )
elif l1[10] in self.delToks and l2[10] in self.delToks:
ths = 3
self.ddel[ l1[8] ] = (l1[10],l1[11])
self.ddel[ l2[8] ] = (l2[10],l2[11])
elif xx[1] == '' and yy[1] == "":
print 'WARN.087.00001: uncorrected duplication ..... %s ' % str( l1[:5] )
else:
ths = -1
print 'ERROR.xxx.0001: Match failed'
print l1
print l2
assert False
def scancmvdup(self):
wb = workbook( 'csv2/CMORvar.xls' )
sht = wb.book.sheet_by_name( 'Sheet1' )
for i in range(sht.nrows):
rr = sht.row(i)
if len(rr) == 21 and str( rr[20].value ) != '':
kn = rr[20].value
ko = rr[18].value
vn = rr[0].value
self.cmvrepl[ ko ] = kn
print '%s: replace %s with %s' % (vn,ko,kn)
def scanmult(self):
ii = open( self.fmult ).readlines()
nn = (len(ii)-1)/3
for i in range(nn):
l1 = string.split( ii[i*3+1], '\t' )
l2 = string.split( ii[i*3+2], '\t' )
l3 = string.split( ii[i*3+3], '\t' )
yy = [l1[9],l2[9],l3[9]]
xx = [l1[8],l2[8],l3[8]]
zz = (l1,l2,l3)
for j in range(3):
if yy[j] != '':
assert yy[j] in xx, 'Invalid replacement option, %s' % yy[j]
assert not self.repl.has_key( xx[j] ), 'duplicate replacement request for %s' % xx[j]
self.repl[ xx[j] ] = yy[j]
elif zz[j][10] == 'twin':
self.twins.append( zz[j][8] )
elif zz[j][11] == 'update':
tags = map( string.strip, string.split( zz[j][13], ',' ) )
self.upd[ xx[j] ] = { 'provNote':zz[j][12], 'tags':tags, 'label':zz[j][0], 'title':zz[j][1] }
###
### varDup and varMult are created in the first parse, then edited by hand to select options.
### The 2nd pass then generates the replace and remove options, taking cross references into account;
### the results of the 2nd pass go back to ../workbook to generate a new set of inputs.
###
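## (Record layout inferred from scandup/scanmult below: fields are tab separated, with the
## variable uid in column 8, the selected replacement uid in column 9, and a disposition
## token such as 'twin', 'inc', 'omit' or 'update' in columns 10/11.)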
up = updates('varDup.csv', 'varMult.csv', idir='rev2')
up.scandup()
up.scancmvdup()
up.scanmult()
urep = False
urep = True
if urep:
up.writeVarUpd( inx, fnrp='uuidreplace.csv', fnrm='uuidremove.csv', fnup='uuidupdate.csv')
up.writeCmvUpd( inx, fnrp='CMVreplace.csv')
else:
oo2 = open( 'uuidremove2.csv', 'w' )
for i in dq.coll['var'].items:
if not inx.iref_by_uid.has_key(i.uid):
oo2.write( string.join( [i.uid,i.label,i.title,i.prov,i.description], '\t') + '\n' )
oo2.close()
### check back references.
nbr = 0
lbr = []
for k in inx.iref_by_uid.keys():
if not inx.uid.has_key(k):
nbr += 1
lbr.append(k)
print 'Missing references: ', nbr
### can now apply mappings, create updated records and write to new xml?
for i in dq.coll['requestLink'].items:
rql_by_name[i.label].append( i.uid )
ix_rql_uid[i.uid] = i
for i in dq.coll['requestVarGroup'].items:
ix_rqvg_uid[i.uid] = i
if dq.coll.has_key( 'revisedTabItem' ):
thisk = 'revisedTabItem'
else:
thisk = 'requestVar'
oo = open( 'uuidinsert.csv', 'w' )
for i in dq.coll[thisk].items:
if i.uid == '__new__':
if inx.var.label.has_key( i.label ):
if len( inx.var.label[i.label] ) == 1:
v = inx.uid[ inx.var.label[i.label][0] ]
oo.write( string.join( ['unique',i.label,v.label,v.uid,v.prov,i.mip], '\t' ) + '\n' )
else:
oo.write( string.join( ['ambiguous',i.label,i.mip,str(len(inx.var.label[i.label] ) ) ], '\t' ) + '\n' )
oo.close()
oo = open( 'varMult.csv', 'w' )
oo2 = open( 'varDup.csv', 'w' )
oo3 = open( 'varStar.csv', 'w' )
hs = ['label','title','sn','units','description','prov','procnote','procComment','uid']
oo.write( string.join(hs, '\t' ) + '\n' )
oo2.write( string.join(hs, '\t' ) + '\n' )
oo3.write( string.join(hs, '\t' ) + '\n' )
ks = inx.var.label.keys()
ks.sort()
emptySet = sets.Set( ['','unset'] )
def entryEq(a,b):
return a == b or (a in emptySet and b in emptySet)
deferredRecs = []
for k in ks:
if len(inx.var.label[k]) == 2:
v1 = inx.var.uid[inx.var.label[k][0]]
v2 = inx.var.uid[inx.var.label[k][1]]
cc = map( lambda x: entryEq( v1.__dict__[x], v2.__dict__[x]), ['title','sn','units','description'] )
if all(cc):
### where duplicates are identical, collect records and output them at the end of the file.
pv1 = string.find( v1.__dict__['prov'], 'OMIP.' ) != -1
pv2 = string.find( v2.__dict__['prov'], 'OMIP.' ) != -1
if pv2:
vp = v2
vo = v1
else:
if not pv1:
print 'WARN.088.00002: no preference: %s, %s, %s' % (v1.__dict__['label'],v1.__dict__['prov'],v2.__dict__['prov'])
vp = v1
vo = v2
deferredRecs.append( string.join(map( lambda x: vo.__dict__[x], hs) + [vp.uid,'identical'], '\t' ) + '\n' )
deferredRecs.append( string.join(map( lambda x: vp.__dict__[x], hs) + ['',''], '\t' ) + '\n' )
else:
try:
oo2.write( string.join(map( lambda x: str(v1.__dict__[x]), hs) + ['',''], '\t' ) + '\n' )
oo2.write( string.join(map( lambda x: str(v2.__dict__[x]), hs) + ['',''], '\t' ) + '\n' )
except:
print 'SEVERE.oo2.00001',v1.__dict__
print 'SEVERE.oo2.00002',v2.__dict__
elif len(inx.var.label[k]) > 1:
for i in inx.var.label[k]:
oo.write( string.join(map( lambda x: inx.var.uid[i].__dict__[x], hs), '\t' ) + '\n' )
if k[-2:] == '--':
for i in (inx.var.label[k] + inx.var.label[k[:-2]]):
oo3.write( string.join(map( lambda x: inx.var.uid[i].__dict__[x], hs), '\t' ) + '\n' )
## output auto-filled records for identical duplicates at end of varDup file.
for r in deferredRecs:
oo2.write( r )
oo.close()
oo2.close()
oo3.close()
vns = inx.var.label.keys()
vns.sort()
for v in vns:
if len( inx.var.label[v] ) > 1:
print 'INFO.001.0001:',v, string.join( map( lambda x: inx.var.uid[x].sn, inx.var.label[v] ), ';' )
nok = 0
nerr = 0
if dq.coll.has_key( 'ovar' ):
thisk = 'ovar'
else:
thisk = 'CMORvar'
for i in dq.coll[thisk].items:
vid = i.vid
ix_ovar_uid[i.uid] = i
xr_var_ovar[vid].append( i.uid )
if not inx.var.uid.has_key(vid):
print 'missing key:',i.label, i.prov, vid
nerr += 1
else:
nok += 1
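## rqHtml: writes a simple HTML page per variable (var__*.html) and per request link
## (rql__*.html) into htmlSc/, with an index page wherever a label has multiple definitions.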
class rqHtml(object):
def __init__(self,odir='./htmlSc/'):
self.odir = odir
if not os.path.isdir(odir):
os.mkdir(odir)
def mkRqlHtml(self,name):
## [u'comment', u'uid', u'tab', u'title', u'label', u'grid', 'defaults', u'objective', u'mip', 'globalDefault', u'gridreq']
if len( rql_by_name[name] ) == 1:
self.mkRqlHtml01(rql_by_name[name][0], name )
else:
self.mkRqlHtmlGp(name)
def mkRqlHtmlGp(self,name):
ee = {}
ee['title'] = 'CMIP Request Link %s (with multiple definitions)' % name
self.pageName = 'rql__%s.html' % name
al =[]
for i in range( len( rql_by_name[name] ) ):
this = ix_rql_uid[rql_by_name[name][i]]
al.append( tmpl.item % {'item':'<a href="rql__%s__%s.html">[%s]</a>: %s' % (name,i,i,this.title) } )
ee['items'] = string.join(al, '\n' )
ee['introduction'] = ''
ee['htmlBody'] = tmpl.indexWrapper % ee
ee['htmlHead'] = '''<title>%(title)s</title>''' % ee
self.pageHtml = tmpl.pageWrapper % ee
self.write()
for i in range( len( rql_by_name[name] ) ):
self.mkRqlHtml01(rql_by_name[name][i],i)
def mkRqlHtml01(self,id, tag):
this = ix_rql_uid[id]
ee = {}
if this.label == tag:
ee['title'] = 'CMIP Request Link %s' % tag
self.pageName = 'rql__%s.html' % tag
else:
ee['title'] = 'CMIP Request Link %s[%s]' % (this.label,tag)
self.pageName = 'rql__%s__%s.html' % (this.label,tag)
atts = this.__dict__.keys()
atts.sort()
al =[]
for a in atts:
if a not in ['defaults','globalDefault']:
al.append( tmpl.item % {'item':'%s: %s' % (a,this.__dict__.get(a,'-- Not Set --')) } )
ee['items'] = string.join(al, '\n' )
ee['introduction'] = ''
ee['htmlBody'] = tmpl.indexWrapper % ee
ee['htmlHead'] = '''<title>%(title)s</title>''' % ee
self.pageHtml = tmpl.pageWrapper % ee
self.write()
def mkVarHtml(self,name):
if len( inx.var.label[name] ) == 1:
self.mkVarHtml01(inx.var.label[name][0], name )
else:
self.mkVarHtmlGp(name)
def mkVarHtmlGp(self,name):
ee = {}
ee['title'] = 'CMIP Variable %s (with multiple definitions)' % name
self.pageName = 'var__%s.html' % name
al =[]
for i in range( len( inx.var.label[name] ) ):
this = inx.var.uid[inx.var.label[name][i]]
al.append( tmpl.item % {'item':'<a href="var__%s__%s.html">[%s]</a>: %s' % (name,i,i,this.title) } )
ee['items'] = string.join(al, '\n' )
ee['introduction'] = ''
ee['htmlBody'] = tmpl.indexWrapper % ee
ee['htmlHead'] = '''<title>%(title)s</title>''' % ee
self.pageHtml = tmpl.pageWrapper % ee
self.write()
##print 'Multi var: %s' % name
for i in range( len( inx.var.label[name] ) ):
self.mkVarHtml01(inx.var.label[name][i],i)
def mkVarHtml01(self,id, tag):
this = inx.var.uid[id]
ee = {}
if this.label == tag:
ee['title'] = 'CMIP Variable %s' % tag
self.pageName = 'var__%s.html' % tag
else:
ee['title'] = 'CMIP Variable %s[%s]' % (this.label,tag)
self.pageName = 'var__%s__%s.html' % (this.label,tag)
atts = this.__dict__.keys()
atts.sort()
al =[]
for a in atts:
if a not in ['defaults','globalDefault']:
al.append( tmpl.item % {'item':'%s: %s' % (a,this.__dict__.get(a,'-- Not Set --')) } )
if inx.iref_by_uid.has_key(this.uid):
assert varRefs.has_key(this.uid), 'Problem with collected references'
ee1 = varRefs[this.uid]
ks = ee1.keys()
ks.sort()
for k in ks:
al.append( tmpl.item % {'item':'%s: %s' % (k,string.join(ee1[k])) } )
ee['items'] = string.join(al, '\n' )
ee['introduction'] = ''
ee['htmlBody'] = tmpl.indexWrapper % ee
ee['htmlHead'] = '''<title>%(title)s</title>''' % ee
self.pageHtml = tmpl.pageWrapper % ee
self.write()
def varHtml(self):
for k in inx.var.label.keys():
self.mkVarHtml(k)
def rqlHtml(self):
for k in rql_by_name.keys():
self.mkRqlHtml(k)
def write(self):
oo = open( '%s/%s' % (self.odir,self.pageName), 'w' )
oo.write( self.pageHtml )
oo.close()
vh = rqHtml()
vh.varHtml()
vh.rqlHtml()
if nerr == 0:
print 'CHECK 001: %s records checked, no missing references' % nok
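## shps: inventory of spatial shape strings with record counts (reference only; not used below).
## vshpchkMap: maps the 'levels' attribute of groupItem records onto vertical dimension names.
## tsmap/tsmap2: group the many time-style strings into 'mean', 'point' or '__unknown__'.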
shps = {'': 64, 'XYZKT': 13, '4-element vector': 2, 'XYT': 476, '2D vector field ': 2, 'KZT': 4, '2D vector field': 2, 'XYZ': 27, 'XYZT': 204, '2D': 83, 'scalar': 14, 'XY': 88, '?': 21, '2D ': 1, 'XYKT': 3, 'YZT': 16, 'ZST1': 15, 'XKT': 2, 'BasinYT': 1}
vshpchkMap = {'':'', u'all model levels above 400hPa':'alevStrat', u'all':'Xlev', 3.0:'plev3', '4.0':'plev4', \
36.0:'plev36', u'soil levels':'sdepth', \
1.0:'sfc?', \
16.0:'plev16', 7.0:'plev7', 40.0:'plev40', u'all*':'Xlev', 14.0:'plev14', u'Model levels or 27Plevs':'alev|plev27', \
u'17 (or 23 )':'plev17|plev23', u'17 (or 23)':'plev17|plev23', \
27.0:'plev27', 17.0:'plev17', u'17 (or23)':'plev17|plev23', 8.0:'plev8', u'all model levels':'alev', 5.0:'plev5'}
ks = vshpchkMap.keys()
for k in ks:
if type(k) == type(0.):
vshpchkMap[str(k)] = vshpchkMap[k]
tsmap = { 'mean':[u'daily mean', u'time mean', u'time: day',
u'Cumulative annual fraction', u'Time mean', u'weighted time mean', u'time: mean', u'mean', u'Mean'],
'__unknown__':['','dummyAt'],
'point':[ u'Instantaneous (end of year)', u'point', u'Synoptic', u'instantaneous', u'time: point', u'synoptic'] }
tsmap2 = {}
for k in tsmap.keys():
for i in tsmap[k]:
tsmap2[i] = k
if dq.coll.has_key( 'groupItem' ):
ee = collections.defaultdict( int )
for i in dq.coll['groupItem'].items:
tst = tsmap2[ i.tstyle ]
dd = ''
if 'X' in i.shape:
dd += 'longitude '
if 'Y' in i.shape:
dd += 'latitude '
if 'Z' in i.shape:
if i.levels == '':
print 'ERROR.001.0001: no levels specified', i.label, i.title
else:
zdim = vshpchkMap[i.levels]
dd += zdim
## print '%s::%s::%s|%s' % (i.shape, i.levels, i.tstyle, dd)
for i in dq.coll['groupItem'].items:
list_gp_ovar[i.gpid].append( i.uid )
nok = 0
nerr = 0
for i in dq.coll['groupItem'].items:
vid = i.vid
ix_gpi_uid[i.uid] = i
xr_var_gpi[vid].append( i.uid )
if not inx.var.uid.has_key(vid):
nerr += 1
else:
nok += 1
##print 'groupItem to var crossref: nok = %s, nerr = %s' % (nok, nerr)
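## tcmp and atRepr: small helpers used when dumping collections -- sort items by title, and
## render attribute values (lists joined with spaces, '__unset__' written as blank).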
class tcmp(object):
def __init__(self):
pass
def cmp(self,x,y):
return cmp(x.title,y.title)
def atRepr(l,x,optional=False):
if x != None:
if optional:
v = l.__dict__.get(x, '' )
else:
v = l.__dict__[x]
else:
v = l
if v == '__unset__':
return ''
elif type(v) in [ type([]), type(())]:
return string.join([str(i) for i in v])
else:
return v
class xlsx(object):
def __init__(self,fn):
self.wb = xlsxwriter.Workbook(fn)
def newSheet(self,name):
self.worksheet = self.wb.add_worksheet(name=name)
return self.worksheet
def close(self):
self.wb.close()
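## dumpcsv/dumpxlsx: write one file (or sheet) per collection, with attributes ordered by
## atlSort (label, title, ..., uid); for the var collection the xlsx also gets a default
## priority column taken from refpix (201 where no entry is found).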
def dumpxlsx( fn, key, atl):
wb = xlsx( fn )
sht = wb.newSheet( key )
j = 0
for i in range(len(atl)):
sht.write( j,i, atl[i] )
ll = dq.coll[key].items[:]
ll.sort( tcmp().cmp )
for l in ll:
uid = atRepr(l,'uid')
j+=1
i=0
for x in atl:
sht.write( j,i, atRepr(l,x) )
i+=1
if key == 'var' and refpix != None:
if uid in refpix:
p = refpix[uid][2]
else:
p = 201
sht.write( j,i, p )
wb.close()
def dumpcsv( fn, key, atl, optionalSet='' ):
oo = open(fn, 'w' )
ll = dq.coll[key].items[:]
ll.sort( tcmp().cmp )
oo.write( string.join( atl, '\t' ) + '\n' )
for l in ll:
try:
oo.write( string.join( map( lambda x: str(atRepr(l,x,x in optionalSet)), atl), '\t' ) + '\n' )
except:
print 'SEVERE.090.0001: failed to write record for attributes %s' % str(atl)
print l
print key
raise
oo.close()
def atlSort( ll ):
oo = []
l1 = ['label','title']
l2 = ['uid','defaults','globalDefault']
for i in l1:
if i in ll:
oo.append(i)
ll.sort()
for i in ll:
if i not in l1 + l2:
oo.append(i)
if 'uid' in ll:
oo.append( 'uid' )
return oo
for k in dq.coll.keys():
if len( dq.coll[k].items ) > 0:
expl = dq.coll[k].items[0]
atl = atlSort( expl.__dict__.keys() )
atl1 = [a for a in atl if a != 'parent' and a[0] != '_']
###print k, atl1
optionalSet = set( [a for a in atl1 if not expl.__class__.__dict__[a].required] )
dumpcsv( 'csv2/%s.csv' % k, k, atl1, optionalSet=optionalSet )
if k == 'var':
dumpxlsx( 'csv2/var.xlsx', k, atl1 )
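## annotate: parses the request XML document and applies in-place fixes -- unit ids (iniVar),
## MIP table renaming (tableMap), structure titles (strTtl), cell_methods corrections
## (fixCellMethods), provenance and realm attributes (mipProv, cmvCheck), pruning of unused
## requestVar/CMORvar/var records (rvgCheck, rqvCheck, cmvCheck2) and remark records for
## missing references (missingRefs) -- before writing out/annotated_20150731_i1.xml.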
class annotate(object):
def __init__(self,src,dreq):
assert os.path.isfile( src), '%s not found' % src
self.doc = xml.dom.minidom.parse( src )
self.dreq = dreq
self.removedUids = {}
def iniVar(self,dq):
a = addUnits(dq)
this = self.doc.getElementsByTagName('var')[0]
dil = this.getElementsByTagName('item')
self.vid = {}
for item in dil:
uid = item.getAttribute( 'uid' )
title = item.getAttribute( 'title' )
label = item.getAttribute( 'label' )
self.vid[uid] = (label,title)
units = item.getAttribute( 'units' )
if units in a.repl:
print 'INFO.units.0001: replacing %s --> %s' % (units,a.repl[ units ])
units = a.repl[ units ]
u2 = a.uid( units )
if u2 == None:
units = string.strip( str(units) )
u2 = a.uid( units )
if u2 != None:
item.setAttribute( 'unid', u2 )
else:
print 'Units not recognised: %s (%s) -- %s' % (units,label, type(units))
def tableMap(self,dq):
that = self.doc.getElementsByTagName('CMORvar')[0]
dil = that.getElementsByTagName('item')
ee1 = {}
for item in dil:
mt = item.getAttribute( 'mipTable' )
if mt in mipTableMap:
mt1 = mipTableMap[mt]
item.setAttribute( 'mipTable', mt1 )
item.setAttribute( 'mtid', 'MIPtable::%s' % mt1 )
def strTtl(self,dq):
this = self.doc.getElementsByTagName('cellMethods')[0]
dil = this.getElementsByTagName('item')
eecm = {}
for item in dil:
eecm[item.getAttribute( 'uid' )] = item.getAttribute( 'label' )
that = self.doc.getElementsByTagName('spatialShape')[0]
dil = that.getElementsByTagName('item')
ee1 = {}
for item in dil:
uid = item.getAttribute( 'uid' )
lab = item.getAttribute( 'label' )
ttl = item.getAttribute( 'title' )
ee1[uid] = (lab,ttl)
this = self.doc.getElementsByTagName('structure')[0]
dil = this.getElementsByTagName('item')
filterStr = True
estr = []
strForce = ['str-a076','str-x269','str-x100']
for item in dil:
uid = item.getAttribute( 'uid' )
lab = item.getAttribute( 'label' )
if len( dq.inx.iref_by_uid[uid] ) == 0 and filterStr:
if lab not in strForce:
print 'UNUSED STRUCTURE: %s, %s' % (lab,uid)
this.removeChild( item )
self.removedUids[uid] = 'structure: %s' % lab
else:
tmid = item.getAttribute( 'tmid' )
spid = item.getAttribute( 'spid' )
cmid = item.getAttribute( 'cmid' )
cml = eecm.get( cmid, '' )
if cml != '':
cml = ' [%s]' % cml
o = item.getAttribute( 'odims' )
c = item.getAttribute( 'coords' )
if spid not in ee1:
print 'SEVERE:spid.0001: spid not found: %s' % spid
sl = '__unknown__'
st = '__unknown__'
else:
sl,st = ee1[spid]
if tmid not in dq.inx.uid:
print 'BAD time record uid: ',tmid
print lab, uid
raise
title = '%s, %s [%s]' % (dq.inx.uid[tmid].title, st, sl)
if string.strip( c ) != '' or string.strip( o ) != '':
title += ' {%s:%s}' % (o,c)
title += cml
if title != item.getAttribute( 'title' ):
print 'STRUCT: %s:: %s' % (title, item.getAttribute( 'title' ) )
if item.getAttribute( 'title' )[:4] == 'str-':
item.setAttribute( 'title', title )
estr.append( item.getAttribute( 'label' ) )
if len(estr) > 0:
oo = open( 'scandreq_estr.txt', 'w' )
for i in estr:
oo.write( i + '\n' )
oo.close()
def rvgCheck(self,dq):
"""Remove request variable groups which have no requestLink"""
this = self.doc.getElementsByTagName('requestVarGroup')[0]
dil = this.getElementsByTagName('item')
nn = 0
for item in dil:
uid = item.getAttribute( 'uid' )
if ('requestLink' not in dq.inx.iref_by_sect[uid].a) and ('tableSection' not in dq.inx.iref_by_sect[uid].a):
if item.getAttribute( 'label' ) in ['aermonthly']:
print 'WARN.0010: overriding variable group pruning: ',item.getAttribute( 'label' )
else:
##self.removedUids[ uid ] = 'requestVarGroup: %s' % item.getAttribute( 'label' )
##this.removeChild(item)
print 'INFO.rvg.0001: New rvg?? ',uid, item.getAttribute( 'label' )
nn+=1
##print 'WARN.Unused variable groups removed: %s' % nn
this = self.doc.getElementsByTagName('requestVar')[0]
dil = this.getElementsByTagName('item')
nn = 0
s1 = {i.uid for i in dq.coll['requestVarGroup'].items if i.uid not in self.removedUids}
self.usedCmv = set()
for item in dil:
uid = item.getAttribute( 'uid' )
vid = item.getAttribute( 'vid' )
vgid = item.getAttribute( 'vgid' )
if vgid not in s1:
this.removeChild(item)
self.removedUids[ uid ] = 'requestVar: %s' % item.getAttribute( 'label' )
nn+=1
else:
self.usedCmv.add( vid )
print 'Unused request variables removed: %s' % nn
def rqvCheck(self,dq):
wb = workbook( 'ingest/rqv_cmv_remap.xls' )
rqvredirect = {('aermonthly','siconc'):'SImon' }
sh = wb.book.sheet_by_name('maps')
self.repl = {}
self.uu = {}
for j in range(sh.nrows):
r = [str(x.value) for x in sh.row(j)]
rqvredirect[ ( r[0], r[1] ) ] = r[2]
this = self.doc.getElementsByTagName('requestVar')[0]
dil = this.getElementsByTagName('item')
badrqv = set()
self.removedCmv = set()
for item in dil:
uid = item.getAttribute( 'uid' )
vid = item.getAttribute( 'vid' )
if vid not in dq.inx.uid:
badrqv.add( uid )
elif dq.inx.uid[vid]._h.label == 'remarks':
badrqv.add( uid )
else:
cmv = (dq.inx.uid[vid].mipTable,dq.inx.uid[vid].label)
if cmv in rqvredirect:
targ = (rqvredirect[cmv],dq.inx.uid[vid].label)
if targ not in self.cmvLookUp:
print 'SEVERE.rgvrdi.00001: attempt to redirect to non-existent var: %s --> %s' % (str(cmv),str(targ))
else:
nid = self.cmvLookUp[targ]
item.setAttribute( 'vid', nid )
self.removedCmv.add( vid )
print 'INFO.rgvrdi.00002: redirect rql to new CMOR var: %s --> %s' % (str(cmv),str(targ))
def cmvCheck2(self,dq):
this = self.doc.getElementsByTagName('CMORvar')[0]
dil = this.getElementsByTagName('item')
self.usedVar = set()
self.keepVar = {'ugrido',}
for item in dil:
uid = item.getAttribute( 'uid' )
if ((uid not in self.usedCmv) or (uid in self.removedCmv)) and (item.getAttribute( 'label' ) not in self.keepVar):
print 'INFO.cmv.04004: removing unused CMORvar: %s, %s, %s, %s: ' % (uid,item.getAttribute( 'label' ),item.getAttribute( 'table' ),item.getAttribute( 'mipTable' ))
this.removeChild(item)
self.removedUids[ uid ] = 'CMORvar: %s' % item.getAttribute( 'label' )
else:
self.usedVar.add( item.getAttribute( 'vid' ) )
this = self.doc.getElementsByTagName('var')[0]
dil = this.getElementsByTagName('item')
self.varLookUp = {}
for item in dil:
uid = item.getAttribute( 'uid' )
self.varLookUp[ item.getAttribute( 'label' ) ] = uid
if uid not in self.usedVar and item.getAttribute( 'label' ) not in self.keepVar:
print 'INFO.var.04004: removing unused var: %s, %s, %s, %s: ' % (uid,item.getAttribute( 'label' ),item.getAttribute( 'table' ),item.getAttribute( 'units' ))
this.removeChild(item)
self.removedUids[ uid ] = 'var: %s' % item.getAttribute( 'label' )
this = self.doc.getElementsByTagName('CMORvar')[0]
wb = workbook( 'ingest/extraCmv.xls' )
sh = wb.book.sheet_by_name('extra')
extra = []
self.uu = {}
for j in range(1,sh.nrows):
extra.append( [str(x.value) for x in sh.row(j)] + ['',''] )
extra_old = [('E6hrZ','ps','str-a076','ps','atmos','Surface Pressure','Surface Pressure .. needed for vertical coordinates','6hr', 'CMIP extra','scanDreq.py','1','float',''),
('AERmon','ps','str-013','ps','atmos','Surface Pressure','Surface Pressure .. needed for vertical coordinates','mon', 'CMIP extra','scanDreq.py','1','float',''),
('E3hrPt','ps','str-d11','ps','atmos','Surface Pressure','Surface Pressure .. needed for vertical coordinates','3hr', 'CMIP extra','scanDreq.py','1','float',''),]
eh = ['label','modeling_realm','title','description','frequency','provNote','prov','defaultPriority','type','positive']
hids = set()
idfp = eh.index('defaultPriority') + 3
for e in extra:
assert e[11] in ['float'], 'bad type in cmv extra ... see ingest/extraCmv.xls: %s' % e[11]
if (e[0],e[1]) not in self.cmvLookUp:
e[idfp] = str( int( float( e[idfp] ) ) )
stid = self.strLookUp[e[2]]
vid = self.varLookUp[e[1]]
new = self.doc.createElement( 'item' )
new.setAttribute( 'mipTable', e[0] )
new.setAttribute( 'mtid', 'MIPtable::%s' % e[0] )
new.setAttribute( 'stid', stid )
new.setAttribute( 'vid', vid )
new.setAttribute( 'rowIndex', '0' )
for a in ['deflate','deflate_level','shuffle']:
new.setAttribute( a, '' )
hid = hashlib.new( 'sha1', 'extra-fields::' + ':'.join(e[:8]) ).hexdigest()
assert hid not in hids
new.setAttribute( 'uid', hid )
print 'INFO.extra.00011: ',e
for k in range(len(eh) ):
new.setAttribute( eh[k], e[3+k] )
this.appendChild( new )
else:
print 'SKIPPING extraCMV: %s.%s' % (e[0],e[1])
def cmvCheck(self,dq):
this = self.doc.getElementsByTagName('CMORvar')[0]
dil = this.getElementsByTagName('item')
kk = 0
kka = 0
nrm0 = 0
nrm1 = 0
self.cmvLookUp = {}
for item in dil:
title = item.getAttribute( 'title' )
if title[:6] == '__from':
kka += 1
vid = item.getAttribute( 'vid' )
if vid in self.vid:
title2 = self.vid[vid][1]
item.setAttribute( 'title', title2 )
kk += 1
realm = item.getAttribute( 'modeling_realm' )
if realm in ['','?']:
stid = item.getAttribute( 'stid' )
st = dq.inx.uid[stid]
odims = st.odims
this = None
if odims == 'iceband':
this = 'seaIce'
elif odims not in ['','?']:
this = 'atmos'
else:
sp = dq.inx.uid[st.spid]
if sp.label in ['TR-na','XY-O', 'YB-O', 'YB-R']:
this = 'ocean'
elif sp.label != 'XY-na':
this = 'atmos'
else:
prov = item.getAttribute( 'prov' )
if prov in knowna:
this = 'atmos'
elif prov in knowno:
this = 'ocean'
elif prov in knownl:
this = 'land'
else:
lab = item.getAttribute( 'label' )
print 'ERROR.cmv.00006: no realm: %s, %s, %s, %s ..' % (lab,odims, sp.label, prov)
if this == None:
nrm1 += 1
else:
nrm0 += 1
item.setAttribute( 'modeling_realm', this )
self.cmvLookUp[(item.getAttribute( 'mipTable' ),item.getAttribute( 'label' ) )] = item.getAttribute( 'uid' )
print ('CMOR Var realms set: %s' % nrm0 )
if nrm1 > 0:
print ('SEVERE.cmv.00005: realm unset: %s' % nrm1)
print ('CMOR Var titles reset: %s [%s]' % (kk,kka))
def mipProv(self,dq):
s1 = re.compile( '\[([a-zA-Z0-9]*)\]' )
cc = collections.defaultdict(list)
dd = collections.defaultdict(int)
mips = set()
for i in dq.coll['mip'].items:
mips.add( i.uid )
for i in dq.coll['var'].items:
cc[i.prov].append( i.label )
ee = {}
for i in sorted( cc.keys() ):
if i[:9] == 'CMIP6 end':
m = s1.findall( i )
assert len( m ) == 1, 'FAILED TO PARSE: %s' % i
this = m[0]
else:
i5 = i.find( 'CMIP5' ) != -1
io = i.find( 'OMIP' ) != -1
icx = i.find( 'CORDEX' ) != -1
ip = i.find( 'PMIP' ) != -1
icc = i.find( 'CCMI' ) != -1
isp = i.find( 'SPECS' ) != -1
icf = i.find( 'CFMIP' ) != -1
iac = i.find( 'AerChemMIP' ) != -1
if i5 and io:
print 'WARNING .. unclear provenance: ',i,cc[i]
this = 'CMIP5/OMIP'
elif i5:
this = 'CMIP5'
elif io:
this = 'OMIP'
elif icx:
this = 'CORDEX'
elif ip:
this = 'PMIP'
elif icc:
this = 'CCMI'
elif isp:
this = 'SPECS'
elif icf:
this = 'CFMIP'
elif iac:
this = 'AerChemMIP'
else:
print 'WARNING .. unclear provenance [2]: ',i,cc[i]
this = 'unknown'
ee[i] = this
dd[this] += len( cc[i] )
self.dd = dd
self.ee = ee
this = self.doc.getElementsByTagName('var')[0]
dil = this.getElementsByTagName('item')
print 'FIXING var provmip attribute, %s items' % len(dil)
kk = 0
for item in dil:
kk += 1
p = item.getAttribute( 'prov' )
p0 = item.getAttribute( 'provmip' )
if p0 not in mips:
if p in mips:
item.setAttribute( 'provmip', p )
else:
assert ee.has_key(p), 'Unmatched key: %s' % p
assert ee[p] in mips, 'Unknown provenance: %s, %s' % (p,ee[p])
item.setAttribute( 'provmip', ee[p] )
def fixCellMethods(self,dq):
this = self.doc.getElementsByTagName('structure')[0]
dil = this.getElementsByTagName('item')
cmrep = collections.defaultdict( set )
cmc = collections.defaultdict( int )
self.strLookUp = {}
nrep = 0
for item in dil:
self.strLookUp[ item.getAttribute( 'label' ) ] = item.getAttribute( 'uid' )
cm = item.getAttribute( 'cell_methods' )
if string.find( cm, "area: where" ) != -1:
cm1 = string.replace( cm, "area: where", "area: mean where" )
item.setAttribute( 'cell_methods', cm1 )
cmrep[cm].add(cm1)
cmc[cm1] += 1
nrep += 1
elif string.find( cm, "time:mean" ) != -1:
cm1 = string.replace( cm, "time:mean", "time: mean" )
item.setAttribute( 'cell_methods', cm1 )
cmrep[cm].add(cm1)
cmc[cm1] += 1
nrep += 1
elif string.find( cm, "weighted b " ) != -1:
cm1 = string.replace( cm, "weighted b ", "weighted by " )
item.setAttribute( 'cell_methods', cm1 )
cmrep[cm].add(cm1)
cmc[cm1] += 1
nrep += 1
print ('FIXED CELL METHODS .. %s' % nrep )
for k in cmc:
print ('%s: %s' % (k,cmc[k]) )
##
## defective code .. cannot easily do structure mapping here .. needs more thought ...
##
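## sectionCopy (not invoked below; see the commented call in anno): clones the Oyr 'bgc'
## CMORvar records from OMIP with rowIndex < 65 into Omon at default priority 2, and adds
## matching requestVar entries pointing at the OMIP-Omon variable group.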
def sectionCopy(self,dq):
this = self.doc.getElementsByTagName('CMORvar')[0]
thisRqv = self.doc.getElementsByTagName('requestVar')[0]
xx = [i for i in dq.coll['requestVarGroup'].items if i.label == 'OMIP-Omon']
assert len(xx) == 1, 'OMIP-Omon request variable group not found'
omipOmonUid = xx[0].uid
dil = this.getElementsByTagName('item')
for item in dil:
mipt = item.getAttribute( 'mipTable' )
prov = item.getAttribute( 'prov' )
provn = item.getAttribute( 'provNote' )
if mipt == 'Oyr' and prov[:12] == "CMIP6 [OMIP]" and provn == 'bgc':
rowix = int( item.getAttribute( 'rowIndex' ) )
if rowix < 65:
var = item.getAttribute( 'label' )
new = item.cloneNode(True)
new.setAttribute( 'defaultPriority', '2' )
new.setAttribute( 'mipTable', 'Omon' )
new.setAttribute( 'prov', 'Copy from Oyr' )
new.setAttribute( 'provNote', 'sdq.001' )
vid = str( uuid.uuid1() )
new.setAttribute( 'uid', vid )
this.appendChild( new )
##
## create request var
##
new2 = self.doc.createElement( 'item' )
uid = str( uuid.uuid1() )
new2.setAttribute( 'uid', uid )
new2.setAttribute( 'priority', '2' )
new2.setAttribute( 'vid', vid )
new2.setAttribute( 'vgid', omipOmonUid )
new2.setAttribute( 'mip', 'OMIP' )
new2.setAttribute( 'table', 'OMIP-Omon' )
if omipOmonUid not in dq.inx.uid:
print 'ERROR.005.0001: vgid %s not found' % omipOmonUid
thisRqv.appendChild(new2)
def missingRefs(self,mrefs,dq,clear=True):
this = self.doc.getElementsByTagName('remarks')[0]
if clear:
dil = this.getElementsByTagName('item')
for d in dil:
this.removeChild(d)
for k in mrefs.keys():
if len( mrefs[k] ) == 1:
tid = mrefs[k][0][2]
tattr = mrefs[k][0][1]
tn = None
else:
tid = None
ee = collections.defaultdict(int)
tn = str( len( mrefs[k] ) )
for t in mrefs[k]:
s = self.dreq.inx.uid[t[2]]._h.label
ee['%s.%s' % (s,t[1])] += 1
if len( ee.keys() ) == 1:
tattr = ee.keys()[0]
else:
tattr = '__multiple__'
if tid == None or (tid not in self.removedUids):
item = self.doc.createElement( 'item' )
assert type(k) == type( '' ), 'Attempt to set uid with bad type: %s' % str(k)
item.setAttribute( 'uid', k )
item.setAttribute( 'tattr', tattr )
if tn != None:
item.setAttribute( 'techNote', tn )
if tid != None:
item.setAttribute( 'tid', tid )
if tid not in dq.inx.uid:
print 'ERROR.005.0002: tid %s not found' % tid
item.setAttribute( 'class', 'missingLink' )
item.setAttribute( 'description', 'Missing links detected and marked for fixing' )
item.setAttribute( 'prov', 'scanDreq.py:annotate' )
this.appendChild( item )
parent = self.doc.getElementsByTagName('annex')[0]
for this in parent.childNodes:
if this.nodeType == this.ELEMENT_NODE:
dil = this.getElementsByTagName('item')
print 'INFO.nodescan.00001: ',this.localName,len(dil)
for item in dil:
for k in item.attributes.keys():
v = item.getAttribute( k )
if type( v ) not in [type( '' ),type( u'' )]:
print 'SEVERE.0001: tuple in attribute value',this.localName,k,v
txt = self.doc.toprettyxml(indent='\t', newl='\n', encoding=None)
oo = open( 'out/annotated_20150731_i1.xml', 'w' )
lines = string.split( txt, '\n' )
for line in lines:
l = utils_wb.uniCleanFunc( string.strip(line) )
if empty.match(l):
continue
else:
oo.write(l + '\n')
oo.close()
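## anno(): writes a var1.csv summary of the var collection (with its ovar/groupItem/
## revisedTabItem cross references) and then runs the annotation pipeline on the first
## document listed in dq.c.docl.
## Assumed usage (not stated in the source): run the script directly, e.g. `python scanDreq.py`,
## with ../workbook/trial2_20150831.xml and the rev2/, ingest/, units/, csv2/ and out/
## directories in place.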
def anno():
oo = open( 'var1.csv', 'w' )
ks = ['label','title','sn','units','description','prov','procnote','procComment','uid']
ks2 = [ 'ovar','groupItem','revisedTabItem']
oo.write( string.join(ks + ks2, '\t' ) + '\n' )
for i in dq.coll['var'].items:
if i.label[-2:] != '--':
ee1 = varRefs.get( i.uid, {} )
r2 = map( lambda x: string.join( atRepr( ee1.get(x, [] ), None ) ), ks2 )
oo.write( string.join(map( lambda x: atRepr(i,x), ks) + r2, '\t' ) + '\n' )
oo.close()
print 'ANNOTATING: ', dq.c.docl[0][0]
an = annotate( dq.c.docl[0][0], dq )
###############################################################################
### mode 3: using exports/a1/ ...
### should add structures to exports/a1/ ... i.e. structures + spatial and temporal shapes + CMOR dimensions.
###
###an.sectionCopy(dq)
an.iniVar( dq )
an.fixCellMethods(dq)
an.mipProv(dq)
an.cmvCheck(dq)
an.rvgCheck(dq)
an.strTtl(dq)
an.rqvCheck(dq)
an.cmvCheck2(dq)
for k in an.removedUids:
print 'WARN.REMOVED: %s: %s' % (k,an.removedUids[k])
an.missingRefs( dq.inx.missingIds, dq )
if __name__ == '__main__':
anno()