from dreqPy import dreq
from dreqPy import misc_utils
from ingest import mipTab
from ing02 import loadcf
import cmchecl
import collections, sys, os, json

def mapCoreAt(x):
  if type(x) != type( 'x' ):
    if type(x) in [type(z) for z in [[],()]] and all( [type(z) == type('x') for z in x] ):
      return ';'.join(x)
    else:
      ### print( 'no string: %s' % x )
      return '--no string--'
  else:
    return x

realms = [ "aerosol", "atmos", "atmosChem", "land", "landIce", "ocean", "ocnBgchem", "seaIce" ]
realm = { "aerosol":"Aerosol", "atmos":"Atmosphere", "atmosChem":"Atmospheric Chemistry", "land":"Land Surface",
          "landIce":"Land Ice", "ocean":"Ocean", "ocnBgchem":"Ocean Biogeochemistry", "seaIce":"Sea Ice" }

python2 = True
if sys.version_info[0] == 3:
  python2 = False

if len(sys.argv) > 1:
  lname = sys.argv[1]
else:
  lname = 'audit'

logFarm = misc_utils.dreqLog(dir='logs')
log = logFarm.getLog(lname)

class c1(object):
  def __init__(self):
    self.a = collections.defaultdict( list )

class checkUnits(object):
  def __init__(self, cc, extra=None):
    assert python2, 'The checkUnits module requires the python 2 version of cf'
    import cf
    if extra != None:
      for k in sorted( extra.keys() ):
        try:
          a = cf.units.Units( k )
        except:
          log.warn( 'WARN.001.0004: bad units %s (%s)' % (k,str(extra[k])) )

    for k in sorted( cc.keys() ):
      try:
        a = cf.units.Units( k )
      except:
        log.warn( 'Bad CF units: %s' % k )
        a = None
      try:
        if a != None:
          for k2 in cc[k].a:
            try:
              b = cf.units.Units( k2 )
            except:
              b = None
              log.warn( 'WARN.001.0003: unit invalid : %s .... %s: %s' % (k,k2,str( cc[k].a[k2]) ) )
            if b != None and not a.equivalent(b):
              log.warn( 'WARN.001.0002: unit mismatch: %s .... %s: %s' % (k,k2,str( cc[k].a[k2]) ) )
      except:
        log.error( 'ERROR: exception raised checking units: %s -- %s' % (k,type(a)) )
        raise

class checkUnits3(object):
  def __init__(self, cc, extra=None):
    assert not python2, 'The checkUnits3 module requires the python 3 version of cf'
    import cf
    if extra != None:
      for k in sorted( extra.keys() ):
        try:
          a = cf.units.Units( k )
        except:
          log.error( 'ERROR: exception raised by units %s (%s)' % (k,str(extra[k])) )
          continue
        if not a.isvalid:
          log.warn( 'WARN.001.0004: bad units %s (%s)' % (k,str(extra[k])) )

    for k in sorted( cc.keys() ):
      try:
        a = cf.units.Units( k )
        if not a.isvalid:
          log.warn( 'Bad CF units: %s' % k )
      except:
        log.error( 'ERROR: exception raised: Bad CF units: %s' % k )
        continue
      try:
        if a.isvalid:
          for k2 in cc[k].a:
            try:
              b = cf.units.Units( k2 )
            except:
              log.warn( 'ERROR.001.0003: exception raised by unit : %s .... %s: %s' % (k,k2,str( cc[k].a[k2]) ) )
              continue
            if not b.isvalid:
              log.warn( 'WARN.001.0003: unit invalid : %s .... %s: %s' % (k,k2,str( cc[k].a[k2]) ) )
              continue
            if not a.equivalent(b):
              log.warn( 'WARN.001.0002: unit mismatch: %s .... %s: %s' % (k,k2,str( cc[k].a[k2]) ) )
      except:
        log.error( 'ERROR: exception raised checking units: %s -- %s' % (k,type(a)) )
        raise
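# checkUnits / checkUnits3 are instantiated from Auditor.audit_var below with a mapping
# from standard-name units to the units declared on 'var' records; the cf package is
# only imported when one of them is constructed.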
def splitStr( s ):
  if s == '':
    return ['']
  if s.find( ' ') == -1:
    return [s,]
  return s.split()

class checkDims(object):
  def __init__(self, dq):
    """Generate list of used dimensions in ss; Check against defined dimensions [todo]"""
    ss = set()
    self.cellm = set()
    cc = collections.defaultdict(list)
    duds = ('', '****', '?',)
    for i in dq.coll['spatialShape'].items:
      if i.dimensions != '':
        for x in i.dimensions.split( '|' ):
          if x in duds:
            cc[x].append( i.uid )
          else:
            ss.add(x)
    for i in dq.coll['temporalShape'].items:
      if i.dimensions != '':
        for x in i.dimensions.split( '|' ):
          if x in duds:
            cc[x].append( i.uid )
          else:
            ss.add(x)
    for i in dq.coll['structure'].items:
      self.cellm.add( i.cell_methods )
      if 'odims' in i.__dict__:
        if i.odims != '':
          for x in i.odims.split( ' ' ):
            if x in duds:
              cc[x].append( i.uid )
            else:
              ss.add(x)
      if 'coords' in i.__dict__:
        if i.coords != '':
          for x in i.coords.split( ' ' ):
            if x in duds:
              cc[x].append( i.uid )
            else:
              ss.add(x)
    cellm = sorted( list( self.cellm ) )
    for c in cellm:
      log.info( 'cell_methods\t %s' % c )
    log.info( str( sorted( list(ss) ) ) )
    ll = set( [i.label for i in dq.coll['grids'].items] )
    for x in ss:
      if x not in ll:
        log.error( 'ERROR: dimension not found: %s' % x )

class Auditor(object):
  def __init__(self,dq):
    self.dq = dq
    self.run()

  def run(self):
    self.loadDraftNewVar()
    self.audit_var()
    self.audit_units()
    self.audit_sn()
    self.audit_cmv()
    self.audit_rqv()
    self.audit_rql()
    self.audit_rvg()
    self.audit_str()
    self.audit_cm()
    self.audit_expt()
    self.audit_mip()

  def audit_mip(self):
    file = '/home/mjuckes/Desktop/git/import/CMIP6_CVs/CMIP6_activity_id.json'
    ii = json.load( open( file ) )
    mips0 = ii['activity_id'].keys()
    mips1 = [i.label for i in self.dq.coll['mip'].items]
    ss = [m for m in mips0 if m not in mips1]
    ##assert len(ss) == 0, 'MIPS missing: %s' % ss
    if len(ss) != 0:
      log.error( 'ERROR.mips.00010: mips missing: %s' % ss )

    for u,i in self.dq.inx.uid.items():
      if "mip" in i.__dict__:
        if i.mip not in mips1:
          log.error( 'ERROR.mips.00900: bad mip attribute, %s.%s - %s::%s' % (i._h.label,i.label,i.uid,i.mip) )

  def audit_expt(self):
    file = '/home/mjuckes/Desktop/git/import/CMIP6_CVs/CMIP6_experiment_id.json'
    ex = json.load( open( file ) )
    ne = 0
    ssex = set()
    ee = dict()
    ff = dict()
    tlene = 0
    for i in self.dq.coll['experiment'].items:
      ssex.add(i.label)
      ee[i.label] = i.uid
      if i.label not in ex['experiment_id']:
        log.error( 'ERROR.expt.00001: experiment name not found: %s' % (i.label) )
        ne += 1
      if len( i.tier ) != len( i.ensz ):
        log.error( 'ERROR.expt.00010: ensemble and tier attribute inconsistency: %s: %s -- %s' % (i.label, str( i.tier ), str( i.ensz ) ) )
        tlene += 1
      rqi = set( self.dq.inx.iref_by_sect[i.uid].a['requestItem'] )
      rqi2 = set( self.dq.inx.iref_by_sect[i.egid].a['requestItem'] )
      rqi3 = set( self.dq.inx.iref_by_sect[i.mip].a['requestItem'] )
      ff[i.label] = rqi.union( rqi2 ).union( rqi3 )
      ss = set( [self.dq.inx.uid[x].nenmax for x in ff[i.label] ] )
      if len( ss ) == 0:
        log.error( 'ERROR.nenmax.0001: no nenmax values found: %s, %s' % (i.label, i.uid) )
      else:
        if max( ss ) > i.ensz[-1]:
          log.error( 'INFO.nenmax.0002: %s: nenmax > ensz: %s -- %s: %s, %s' % (i.mip, max(ss),i.ensz,i.label, i.uid) )
        elif -1 not in ss and max(ss) < i.ensz[-1]:
          log.error( 'ERROR.nenmax.0003: %s: max nenmax < ensz: %s -- %s: %s, %s' % (i.mip, max(ss),i.ensz,i.label, i.uid) )
          for x in ff[i.label]:
            this = self.dq.inx.uid[x]
            print( 'ERROR.nenmax.0003-ex: %s %s' % (this.uid,this.label) )
    if ne == 0:
      log.info( 'INFO.expt.00001: all experiment names valid' )
    if tlene != 0:
      log.error( 'ERROR.expt.00011: ensemble and tier attribute inconsistencies: %s' % tlene )
    else:
      log.error( 'INFO.expt.00011: zero ensemble and tier attribute inconsistencies: %s' % tlene )
    ne = 0
    for k in ex['experiment_id']:
      if k not in ssex:
        log.error( 'ERROR.expt.00002: experiment name not used: %s [%s]' % (k,ex['experiment_id'][k].get("activity_id",'__missing__') ) )
        ne += 1
    if ne == 0:
      log.info( 'INFO.expt.00002: all experiment names used' )

    ## check historical_ext request items
    thisrqi = ff['historical-ext']
    for uu in thisrqi:
      i = self.dq.inx.uid[uu]
      if 'tslice' in i.__dict__:
        ts = self.dq.inx.uid[i.tslice]
        if ts.label in ['hist20','hist65']:
          log.error( 'ERROR.tslice.001: inappropriate time slice: %s, %s, %s' % (i.label, i.uid, ts.label) )

  def audit_cm(self):
    log.info( 'Running Cell Methods Audit' )
    cc = cmchecl.check_cm()
    nf = 0
    for i in self.dq.coll['cellMethods'].items:
      res = cc.test( i.cell_methods )
      if not res:
        log.error( 'ERROR.cm.00001: cannot parse cell methods: "%s" [%s]' % (i.cell_methods,i.label) )
        nf += 1
    if nf == 0:
      log.info( 'INFO.cm.00001: all cell methods strings parsed by cmchecl' )

    l1 = [i for i in self.dq.coll['structure'].items if i.cell_methods == ""]
    ss = set()
    if len( l1 ) > 0:
      log.error( 'ERROR.cm.00010: %s blank cell methods in structure records: ' % len(l1) )
      for i in l1:
        log.error( 'ERROR.cm.00011: %s: %s' % ( i.label, i.title ) )
      for i in l1:
        if 'CMORvar' in self.dq.inx.iref_by_sect[i.uid].a:
          for u in dq.inx.iref_by_sect[i.uid].a['CMORvar']:
            ss.add(u)
      if len(ss) > 0:
        log.error( 'ERROR.cm.00020: %s CMOR vars affected ' % len(ss) )
        for u in ss:
          cmv = dq.inx.uid[u]
          log.error( 'ERROR.cm.00021: Invalid cell methods: %s, %s, %s [%s]' % (cmv.label, cmv.title, cmv.mipTable, cmv.prov) )

    aclim = dq.inx.uid['CellMethods::aclim'].uid
    for ustr in dq.inx.iref_by_sect[aclim].a['structure']:
      log.info( 'INFO.aclim: checking %s vars' % len( dq.inx.iref_by_sect[ustr].a['CMORvar'] ) )
      for ucmv in dq.inx.iref_by_sect[ustr].a['CMORvar']:
        if dq.inx.uid[ucmv].frequency != 'monC':
          log.error( 'ERROR.aclim.00001: %s, %s, %s, %s' % (ucmv,dq.inx.uid[ucmv].frequency,dq.inx.uid[ucmv].label,dq.inx.uid[ucmv].mipTable) )

  def audit_units(self):
    nn = 0
    for i in self.dq.coll['var'].items:
      ii = self.dq.inx.uid[ i.unid ]
      if ii.text != i.units:
        log.error( 'SEVERE.units.0001: %s, %s : %s, %s' % (i.label,i.units,ii.text,ii.label) )
        nn+=1
    if nn == 0:
      log.info( 'INFO.units.00100: unit link consistency OK' )

  def audit_var(self):
    cc = collections.defaultdict( list )
    ss = collections.defaultdict( c1 )
    ssx = collections.defaultdict( set )
    ##umap = { 'Wm-2':'W m-2', 'string':''}
    umap = { 'string':''}
    rtokens = [('sw','shortwave'),('lw','longwave')]
    antonyms = [('surface','below-ground'),('longwave','shortwave'),('upward','downward'),('lw','shortwave'),('sw','longwave')]
    for i in self.dq.coll['var'].items:
      cc[i.label].append(i)
      if i.sn not in self.dq.inx.uid:
        print( 'SEVERE: standard name uid not valid: %s' % i.sn )
      else:
        isn = self.dq.inx.uid[i.sn]
        if isn._h.label != 'remarks':
          u = umap.get( isn.units, isn.units )
          ss[u].a[i.units].append( i.label )
        else:
          ssx[i.units].add( i.label )
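        # Flag variables whose title tokens conflict with the linked standard name title
        # (e.g. one says 'upward' where the other says 'downward').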
        tws = [x.lower() for x in i.title.split() ]
        sws = [x.lower() for x in isn.title.split() ]
        for tt in antonyms:
          if (tt[0] in tws and tt[1] in sws ) or (tt[0] in sws and tt[1] in tws):
            log.error( 'ERROR.var.0401: antonym check: %s - %s [%s]' % (i.title,isn.title,tt) )
        for k in [0,1]:
          a = rtokens[k]
          b = rtokens[1-k]
          if i.label.lower().find( a[0] ) != -1:
            if i.title.lower().find( b[1] ) != -1:
              log.error( 'ERROR.var.0201: potential variable name inconsistency: %s, %s, %s' % (i.label, i.title, i.uid) )
            if i.sn.lower().find( b[1] ) != -1:
              log.error( 'ERROR.var.0202: potential variable name inconsistency: %s, %s, %s' % (i.label, i.sn, i.uid) )
          if i.sn.lower().find( a[1] ) != -1:
            if i.title.lower().find( b[1] ) != -1:
              log.error( 'ERROR.var.0203: potential variable title inconsistency: %s, %s, %s' % (i.sn, i.title, i.uid) )
      if i.unid in self.dq.inx.uid:
        if self.dq.inx.uid[i.unid].text != i.units:
          log.error( 'ERROR.units.0001: units do not match linked units record: %s, %s, %s' % (i.label, i.units, self.dq.inx.uid[i.unid].text) )

    ii = [k for k in cc if len(cc[k]) > 1]
    log.warn( 'var.0001: %48s [%s]: %s' % ('Duplicate variable names',len( self.dq.coll['var'].items ),len(ii)) )
    showDupVar=False
    showDupVar=True
    showDupVarBrief=False
    if showDupVar:
      for i in ii:
        log.info( '----- %s -----' % i )
        for x in cc[i]:
          log.info( '%s, %s, %s, %s' % ( x.label,x.title,x.sn,x.prov ))
    elif showDupVarBrief:
      log.info( str( ii ))
    self.cc = cc

    if not python2:
      ##log.warn( 'Skipping the units check .. not available in python3' )
      checkUnits3( ss, extra=ssx )
    else:
      checkUnits( ss, extra=ssx )

  def audit_str(self):
    nms = 0
    ss = set()
    cc = collections.defaultdict( list )
    for i in self.dq.coll['structure'].items:
      cc[ (i.spid, i.tmid,i.odims,i.coords,i.cmid) ].append( i )
      if i.cmid != '__unset__' and self.dq.inx.uid[i.cmid]._h.label == 'remarks' and i.cell_methods != '':
        ss.add(i.cmid)
        nms += 1
    if nms > 0:
      log.error( 'str.00010: missing cell methods records for %s records (%s)' % (nms, len(ss) ) )
      log.error( 'str.00011: %s' % str( sorted(list(ss)) ) )
    else:
      log.info( 'str.00010: all structure methods have valid cmid' )

    ks = [k for k in cc if len(cc[k]) > 1]
    log.info( 'str.00020: count of repeated space/time structure: %s' % len(ks) )
    if len(ks) > 0:
      oo = open( 'structureRepeats.csv', 'w' )
      for k in ks:
        oo.write( '\t'.join( [mapCoreAt(x) for x in ['NEXT', k[0], k[1], k[2], k[3], self.dq.inx.uid[k[0]].label, self.dq.inx.uid[k[1]].label]] ) + '\n' )
        for i in cc[k]:
          oo.write( '\t'.join( [mapCoreAt(x) for x in ['+',i.uid, i.label, i.title, i.odims, i.coords, i.cell_methods, i.cell_measures, i.flag_meanings, i.spid ] ] ) + '\n' )
      oo.close()

  def saveRevisedNewVar(self,ll):
    oo = open( 'revised_newvars.csv', 'w' )
    for v in sorted( list( ll ) ):
      oo.write( self.draftNewVar[v] )
    oo.close()

  def loadDraftNewVar(self):
    self.draftNewVar = {}
    for f in ['draft_newvars.csv','draft_newvars02.csv']:
      ii = open( f, 'r' ).readlines()
      ss = set()
      for l in ii:
        r = l.split( '\t' )
        vn = r[0].strip()
        assert vn not in ss, 'Duplicate variable name in %s: %s' % (f,vn)
        ss.add(vn)
        if vn in self.draftNewVar:
          log.info( 'INFO.newvar.0001: overwriting new var: %s' % vn )
        self.draftNewVar[vn] = l

  def audit_cmv(self):
    zmtabs = ['AERmonZ', 'E6hrZ', 'EdayZ', 'EmonZ']
    mipt = mipTab.mipt()
    zmtabsi = [mipt.invmap[x] for x in zmtabs]
    ccmv = collections.defaultdict( list )
    ccmv2 = collections.defaultdict( list )
    ccmv4 = collections.defaultdict( list )
    ccmv6 = collections.defaultdict( list )
    ccp = collections.defaultdict( set )
    cc = collections.defaultdict( set )
    cci = collections.defaultdict( int )
    ool = open( 'vlab_comp.csv', 'w' )
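    # Compare each CMORvar against its underlying MIP variable record: differing titles
    # are collected in cc, and recognised label suffixes such as '27' or 'Clim' are counted.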
    for i in self.dq.coll['CMORvar'].items:
      v = self.dq.inx.uid[i.vid]
      if i.title != v.title:
        cc[i.label].add( i.uid )
      if i.label != v.label:
        lv = len(v.label)
        if len(i.label) > lv and i.label[:lv] == v.label and i.label[lv:] in ['27','Clim','4','7h','2d']:
          cci[i.label[lv:]] += 1
        else:
          ool.write( '\t'.join( [i.label, v.label, i.mipTable, i.title, i.uid] ) + '\n' )
    ool.write( '\t'.join( ['%s:%s' % (k,cci[k]) for k in cci] ) + '\n' )
    ool.close()

    oo = open( 'vname_comp.csv', 'w' )
    oox = open( 'vname_compx.csv', 'w' )
    for k in sorted( cc.keys() ):
      for u in cc[k]:
        c = self.dq.inx.uid[u]
        v = self.dq.inx.uid[c.vid]
        if c.title.lower() == v.title.lower():
          oo.write( '\t'.join( [c.label, '', c.title, u] ) + '\n' )
          oo.write( '\t'.join( ['', '1', v.title, ''] ) + '\n' )
        else:
          oox.write( '\t'.join( [c.label, '', c.title, u] ) + '\n' )
          oox.write( '\t'.join( ['', '1', v.title, ''] ) + '\n' )
    oo.close()
    oox.close()

    cmlk = {}
    zmstr = set()
    estr = dict()
    for i in self.dq.coll['structure'].items:
      tt = self.dq.inx.uid[ i.tmid ].label
      sp = self.dq.inx.uid[ i.spid ].label
      estr[i.uid] = (tt,sp)
      cmlk[i.uid] = i.cell_methods
      if i.cell_methods.find( 'longitude: mean' ) != -1 or i.cell_methods.find( 'global: mean' ) != -1:
        zmstr.add( i.uid )
    ##
    ## check that vars in AERmonZ, E6hrZ, EdayZ, EmonZ have cell_methods with longitude mean.
    ##
    nms = 0
    ss = set()
    badRealm = set()
    badRealmNames = set()
    hasHeight = set( ['tas','uas','vas','huss','hurs','sfcwind','sfcwindMax','tasmax','tasmin','tasmaxCrop','tasminCrop','tasLut','hussLut'] )
    chh = collections.defaultdict( set )
    for i in self.dq.coll['CMORvar'].items:
      if i.frequency[-2:] == 'Pt' and estr[i.stid][0] != "time-point":
        log.error( 'cmv.01021: frequency -- time axis error: %s.%s, %s, %s, %s, %s' % (i.mipTable, i.label,i.uid, i.frequency, i.stid, estr[i.stid] ) )
      elif i.frequency[-2:] != 'Pt' and estr[i.stid][0] == "time-point":
        log.error( 'cmv.01022: frequency -- time axis error: %s.%s, %s, %s, %s, %s' % (i.mipTable, i.label,i.uid, i.frequency, i.stid, estr[i.stid] ) )

    badzm = set()
    badzmt = set()
    spidmap = {'XY-AH':'XY-A/AH','XY-A':'XY-A/AH'}
    for i in self.dq.coll['CMORvar'].items:
      if i.mipTable in zmtabsi and (i.stid not in zmstr):
        badzm.add( i.uid )
        badzmt.add( i.mipTable )
        log.error( 'cmv.00121: bad zonal mean variable: %s.%s, %s, %s, %s, %s' % (i.mipTable, i.label,i.uid, i.prov, i.stid, cmlk[i.stid]) )
      ccp[ i.label ].add( i.positive )
      ccmv[i.label].append(i)
      ccmv4[ (i.frequency,i.label,i.stid)].append(i)
      ccmv2[ (i.mipTable,i.label)].append(i)
      if i.stid not in self.dq.inx.uid or self.dq.inx.uid[i.stid]._h.label == 'remarks':
        ss.add( i.stid )
        nms += 1
      else:
        st = self.dq.inx.uid[i.stid]
        sp = self.dq.inx.uid[st.spid]
        ccmv6[ (i.frequency,i.label,spidmap.get(sp.label,sp.label),st.tmid)].append((i,st))
        if i.label in hasHeight:
          if st.coords.find( 'height' ) == -1:
            chh[ i.label ].add( i.uid )
      if i.modeling_realm.strip() == '':
        badRealm.add( (i.mipTable, i.label ) )
        badRealmNames.add( ('BLANK','') )
      else:
        for rr in i.modeling_realm.split():
          if rr not in realms:
            badRealm.add( (i.mipTable, i.label ) )
            badRealmNames.add( (i.modeling_realm,rr) )

    kkp = [k for k in ccp if len(ccp[k]) > 1]
    if len(kkp) > 0:
      log.error( 'cmv.00170: bad positive attributes: %s, %s' % (len( kkp ),str(kkp)) )
    else:
      log.info( 'cmv.00170: positive attributes OK' )
    if len( chh ) > 0:
      log.error( 'cmv.04170: near surface field without height coord: %s' % (len(chh) ) )
      for k in sorted( list( chh.keys() ) ):
        log.error( 'cmv.04171: without height coord: %s: %s' % (k,len(chh[k]) ) )
    else:
      log.info( 'cmv.04170: near surface fields OK (wrt height coord)' )
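    # Summarise the zonal-mean table, modeling_realm and structure-link problems collected above.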
    if len( badzm ) > 0:
      log.error( 'cmv.00120: bad zonal mean variables: %s, %s' % (len( badzm ),str(badzmt)) )
    if len( badRealm ) > 0:
      log.error( 'cmv.00030: bad modeling realms set in %s cmv records' % len( badRealm ) )
      log.error( 'cmv.00031: bad modeling realms: %s' % str( sorted( list( badRealmNames) ) ) )
      cc1 = collections.defaultdict( list )
      for tab, lab in badRealm:
        cc1[tab].append(lab)
      for tab in sorted( cc1.keys() ):
        log.error( 'cmv.00032: table %s: %s' % (tab,str(sorted(cc1[tab]))) )
    else:
      log.info( 'cmv.00030: modeling realms set ok cmv records' )
    assert len( badRealm ) == 0, 'Bad realms: %s' % str( sorted( list( badRealmNames) ) )

    if nms > 0:
      log.error( 'cmv.00010: missing variable structures for %s CMORvar records, bad stids: %s' % (nms,len(ss)) )
      log.error( 'cmv.00011: bad stids: %s' % str( sorted( list( ss ) ) ) )
    else:
      log.info( 'cmv.00010: variable structures for all CMORvar records present' )

    ii0 = [i for i in self.dq.coll['CMORvar'].items if i.mipTable == '']
    if len(ii0) > 0:
      log.error( 'cmv.00020: %48s [%s]: %s' % ('Empty MIP table in CMORvar',len(self.dq.coll['CMORvar'].items),len(ii0)) )
    else:
      log.info( 'cmv.00020: all CMORvar records have mip table defined' )

    ii = [i for i in self.dq.coll['CMORvar'].items if self.dq.inx.uid[i.vid]._h.label == 'remarks']
    if len(ii) == 0:
      log.info( 'cmv.00200: all CMORvar records have valid vid' )
    else:
      ii1 = [i for i in ii if 'requestVar' in self.dq.inx.iref_by_sect[i.uid].a]
      log.error( 'cmv.00200: %48s [%s]: %s (requested: %s)' % ('CMORvar records with bad vid',len(self.dq.coll['CMORvar'].items),len(ii),len(ii1)) )
      iix2 = []
      iix3 = []
      for i in ii1:
        ok = False
        ok3 = False
        for u in self.dq.inx.iref_by_sect[i.uid].a['requestVar']:
          vg = self.dq.inx.uid[ self.dq.inx.uid[u].vgid ]
          if vg._h.label != 'remarks':
            ok = True
            iix2.append(i)
            if 'requestLink' in self.dq.inx.iref_by_sect[vg.uid].a:
              iix3.append(i)
      log.error( 'cmv.0021: of these: valid requestVarGroup: %s; valid requestLink(s): %s' % (len(iix2),len(iix3)) )
      rnv = set()
      for i in iix3:
        if i.label in self.draftNewVar:
          log.error( 'cmv.0125: %s, %s, %s' % (i.uid,i.label,i.title) )
          rnv.add( i.label )
        else:
          log.error( 'cmv.0122: %s, %s, %s' % (i.uid,i.label,i.title) )
      if len( rnv ) > 0:
        self.saveRevisedNewVar( rnv )

    ii2 = [k for k in ccmv2 if len(ccmv2[k]) > 1]
    if len(ii2) == 0:
      log.info( 'cmv.0030: unique CMOR variable names OK' )
    else:
      log.error( 'cmv.0030: non-unique CMOR variable naming for %s records' % len(ii2) )
      log.error( 'cmv.0031: %s' % str(ii2) )

    ii4 = [k for k in ccmv4 if len(ccmv4[k]) > 1]
    ii6 = [k for k in ccmv6 if len(ccmv6[k]) > 1]
    ii4b = list()
    ii6b = list()
    for k in ii4:
      if len(ccmv4[k]) == 2 and set( [i.mipTable[-3:] for i in ccmv4[k]] ) == set(['Ant','Gre']):
        pass
      else:
        ii4b.append(k)
    for k in ii6:
      if len(ccmv6[k]) == 2 and set( [i.mipTable[-3:] for i,s in ccmv6[k]] ) == set(['Ant','Gre']):
        pass
      else:
        ii6b.append(k)
    if len(ii4b) == 0:
      log.info( 'cmv.0040: unique CMOR variable usage [4] OK' )
    else:
      log.error( 'cmv.0040: non-unique CMOR variable usage possible for %s records' % len(ii4b) )
      log.error( 'cmv.0041: %s' % str(ii4b) )

    kk=0
    oo = open( 'cmv_probDuplication.csv', 'w' )
    for k in ii4b:
      kk += 1
      rec = ('NEXT\t%s\t' % kk ) + str( '\t'.join( k ) )
      oo.write( rec + '\n' )
      vid = None
      for i in ccmv4[k]:
        assert vid == None or i.vid == vid, 'Unexpected vid variation ...: %s: %s %s' % (i.label, i.vid, vid)
        vid = i.vid
        rec = '+\t%s\t%s\t%s\t%s\t%s' % (i.uid,i.mipTable,i.title,i.description,i.prov)
        oo.write( rec + '\n' )
    oo.close()
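    # The [6] grouping key maps both XY-A and XY-AH spatial shapes to 'XY-A/AH', so
    # variables that differ only in that shape are treated as potential duplicates.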
    if len(ii6b) == 0:
      log.info( 'cmv.0060: unique CMOR variable usage [6] OK' )
    else:
      log.info( 'cmv.0060: unique CMOR variable issues [6] %s ' % len(ii6b) )
    kk=0
    ccx = collections.defaultdict( set )
    for k in ii6b:
      ss = set()
      freq,label,spid,tmid = k
      for i,st in ccmv6[k]:
        ss.add( (st.uid,st.label,st.title) )
      tt = tuple( sorted( list( ss ) ) )
      ccx[tt].add( (freq,label) )
    log.info( 'cmv.0061: unique CMOR variable issues [6.1] %s ' % len(ccx.keys()) )
    for tt in ccx:
      ll = [t[2] for t in tt]
      log.info( 'cmv.0062: unique CMOR variable issues [6.2] %s: %s ' % ('; '.join(ll),str(ccx[tt]) ) )

    oo = open( 'cmv_probDuplication6.csv', 'w' )
    for k in ii6b:
      kk += 1
      rec = ('NEXT\t%s\t' % kk ) + str( '\t'.join( k ) )
      oo.write( rec + '\n' )
      vid = None
      for i,st in ccmv6[k]:
        assert vid == None or i.vid == vid, 'Unexpected vid variation ...: %s: %s %s' % (i.label, i.vid, vid)
        vid = i.vid
        rec = '+\t%s\t%s\t%s\t%s\t%s\t%s' % (i.uid,i.mipTable,i.title,st.title,i.description,i.prov)
        oo.write( rec + '\n' )
    oo.close()

    ccmv3 = collections.defaultdict( list )
    for t in ccmv2:
      s = set()
      for i in ccmv2[t]:
        id = i.uid
        if 'varRelLnk' in self.dq.inx.iref_by_sect[id].a:
          for x in self.dq.inx.iref_by_sect[id].a['varRelLnk']:
            s.add( x )
      s = list(s)
      if len(s) > 0:
        ccmv3[t] = s
    ii5 = [k for k in ii2 if k not in ccmv3 or len(ccmv2[k]) > len(ccmv3[k])]
    cc4 = collections.defaultdict( list )
    for t,v in ii2:
      cc4[t].append( v )

    ncmx = 0
    for uid, cmorvar in self.dq.inx.CMORvar.uid.items():
      if cmorvar.modeling_realm in ['ocnBgchem', 'ocean']:
        # Cell measures are attached to the structure that the cmorvar links to
        cm = self.dq.inx.uid[cmorvar.stid].cell_measures
        # check for areacello and ignore some edge cases
        if 'areacello' not in cm and cm not in ['', '--OPT', '--UGRID']:
          print( '{0.mipTable}/{0.label} : {1}'.format(cmorvar, cm) )
          log.error( 'cmv.01001: cell_measures error in ocean/ocnBgChem realms {0.mipTable}/{0.label} : {1}'.format(cmorvar, cm) )
          ncmx += 1
    if ncmx == 0:
      log.info( 'cmv.01001: cell_measures checks passed' )

    log.warn( '%48s [%s]: %s [%s tables]' % ('Duplicate variables in CMORvar',len(self.dq.coll['CMORvar'].items),len(ii2), len( cc4.keys() ) ) )
    showAllCmvErrors=False
    showAllCmvErrors=True
    if showAllCmvErrors:
      rats = ['uid','label','title','description','frequency','mipTable','stid','prov','provNote']
      oocm = open( 'cmvDup.csv', 'w' )
      c1 = collections.defaultdict( list )
      ##for i in ii2:
        ##c1[i.prov].append( i.label )
      ##log.warn( 'No MIP var: %s ... %s' % (str( sorted( [i.label for i in ii if i.label in self.cc] ) ), str(c1)))
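      # Dump each duplicated (mipTable, label) group to cmvDup.csv, one block per label,
      # writing the shared MIP variable record once when all duplicates point to the same vid.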
      for k in sorted( cc4.keys() ):
        log.warn( '%16s:: %s' % (k,str(sorted(cc4[k]))) )
        for v in sorted(cc4[k]):
          sv = set()
          for i in ccmv2[ (k,v) ]:
            sv.add(i.vid)
          if len(list(sv)) == 1:
            vun = True
            vv = self.dq.inx.uid[ list(sv)[0] ]
            oocm.write( '@VAR:\t\t\t%s\t%s\t%s\n' % (vv.label,vv.title,vv.description) )
          else:
            vun = False
          for i in ccmv2[ (k,v) ]:
            stt = self.dq.inx.uid[i.stid].title
            if 'Temporal mean' in stt:
              if i.frequency == '3hr':
                i.title += ' [3 hour mean]'
            xtra = ''
            oocm.write( '\t\t' + '\t'.join( [str(i.__dict__[x]) for x in rats] ) + '\t%s%s\n' % (stt,xtra) )
            if not vun:
              vv = self.dq.inx.uid[i.vid]
              xtra = '\t\t\t\t%s\t%s\t%s\n' % (vv.label,vv.title,vv.description)
              oocm.write(xtra)
          oocm.write( '####\n' )
      oocm.close()

    t2_ii = ii[:]
    self.ccmv = ccmv

  def audit_sn(self):
    cf = loadcf.cf(base='ing02/')
    log.info( 'INFO: loading loadcf version: %s' % cf.version )
    sned = set()
    ##for l in open( 'cfeditor_july5.txt' ).readlines():
      ##sned.add( l.strip() )
    cc = collections.defaultdict( list )
    cc2 = collections.defaultdict( int )
    cc3 = collections.defaultdict( dict )
    nrem = 0
    snm = set()
    snmm = collections.defaultdict( int )
    nalias = 0
    nsne = 0
    nsnle = 0
    snset = set()
    for i in self.dq.coll['var'].items:
      if 'CMORvar' in self.dq.inx.iref_by_sect[i.uid].a:
        srq = set()
        for u in self.dq.inx.iref_by_sect[i.uid].a['CMORvar']:
          if 'requestVar' in self.dq.inx.iref_by_sect[u].a:
            for uu in self.dq.inx.iref_by_sect[u].a['requestVar']:
              rqv = self.dq.inx.uid[uu]
              try:
                srq.add( rqv.priority )
              except:
                print( [rqv.label, rqv.uid, rqv._h.label] )
                raise
        if len(srq) > 0:
          pm = min( srq )
          kk = i.sn
          if kk not in self.dq.inx.uid:
            print( 'SEVERE: missing uid: %s' % kk )
          elif self.dq.inx.uid[kk]._h.label == 'remarks':
            nrem += 1
            snm.add(kk)
            ## if pm == 1: print( 'MISSING STANDARD NAME: %s' % [i.label, i.title, i.sn, i.provmip] )
            snmm[pm] += 1
            ppp = set()
            for cmv in self.dq.inx.iref_by_sect[i.uid].a['CMORvar']:
              for rqv in self.dq.inx.iref_by_sect[cmv].a['requestVar']:
                ppp.add( self.dq.inx.uid[rqv].priority )
            cc2[i.provmip] += 1
            cc3[i.provmip][i.label]=(i.title,i.sn,str(min(ppp)))
          if kk in cf.alias:
            log.warn( 'sn.010: standard name alias: %s --> %s' % (kk,cf.alias[kk]) )
            nalias += 1
          elif kk not in cf.names:
            nsne += 1
          else:
            snset.add( kk )
          if i.procnote != []:
            ll = sorted( i.procnote )
            kk += ':' + '|'.join(ll )
          cc[kk].append( i.uid )
    assert nsne == nrem, 'Audit error: %s --- %s' % (nsne, nrem)
    print( 'sn count check: %s, Alias count: %s' % ([ nsne, nrem] ,nalias) )
    log.warn( 'sn.001: Number of missing standard names: %s [for %s variables] {%s}' % (len(snm),nrem,str(snmm)) )
    log.warn( 'sn.002: Number of missing standard names by MIP: %s ' % (str(cc2)) )
    if nalias > 0:
      log.warn( 'sn.020: Number of alias standard names: %s' % nalias )
    else:
      log.info( 'sn.020: no alias standard names' )

    ll = []
    for k in cc:
      if len( cc[k] ) > 1:
        ll.append( k )
    oo = open( 'audit_sn_errors.csv', 'w' )
    for k in sorted(cc3.keys()):
      for v in sorted( cc3[k].keys() ):
        t,s,p = cc3[k][v]
        if s in sned:
          m = 'Editor'
        else:
          m = ''
        oo.write( '\t'.join( [k,v,t,s,p,m] ) + '\n' )
    oo.close()
    log.warn( 'sn.003: Duplicate use of standard names: %s' % len(ll) )

    oo = open( 'audit_sn_repeats.csv', 'w' )
    for k in sorted( ll ):
      orc = [k,]
      l1 = cc[k][:]
      l2 = []
      l1b = []
      for u in cc[k]:
        i = self.dq.inx.uid[u]
        if i.prov == "CMIP6 endorsement [SIMIP]":
          l2.append(u)
        else:
          l1b.append(u)
      if len( l2) > 1:
        log.error( 'WARN.simip.00001: repeat use of standard name in SIMIP: %s:: %s' % (k,str(l2)) )
      if len(l2) == 1:
        orc.append( '*' )
      else:
        orc.append( '' )
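      # Build one output row per repeated name: labels suffixed '**' have no CMORvar link,
      # '*' marks those whose CMORvars carry no requestVar reference.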
      nnn = 0
      for u in l2 +l1b:
        i = self.dq.inx.uid[u]
        if "CMORvar" not in self.dq.inx.iref_by_sect[u].a:
          orc += [i.label +'**', i.title]
        else:
          nn = 0
          for uu in self.dq.inx.iref_by_sect[u].a['CMORvar']:
            cmv = self.dq.inx.uid[uu]
            if 'requestVar' in self.dq.inx.iref_by_sect[uu].a:
              nn += 1
          if nn == 0:
            orc += [i.label + '*', i.title]
          else:
            nnn += 1
            orc += [i.label, i.title]
      if nnn > 1:
        oo.write( '\t'.join( orc ) + '\n' )
    oo.close()

    for s in snset:
      i = self.dq.inx.uid[s]
      if i.label != cf.names[s][0]:
        log.warn( 'sn.060: standard name label error: %s: %s --- %s' % (s,i.label,cf.names[s][0]) )
        nsnle += 1
    if nsnle > 0:
      log.warn( 'sn.061: Number of standard name label errors: %s' % nsnle )
    else:
      log.info( 'sn.061: No standard name label errors' )

  def audit_rqv(self):
    cc = collections.defaultdict( list )
    ccc = collections.defaultdict( set )
    ccx = collections.defaultdict( set )
    vgs = set( [i.uid for i in self.dq.coll['requestVarGroup'].items] )
    eevgid1 = set()
    eevgid2 = set()
    for i in self.dq.coll['requestVar'].items:
      cc[ (i.vid,i.vgid) ].append( i )
      ccc[ i.vgid ].add( (i.vid,i.priority) )
      if i.vgid not in vgs:
        if i.vgid in self.dq.inx.uid:
          eevgid1.add( (i.uid,i.vgid) )
        else:
          eevgid2.add( (i.uid,i.vgid) )
    kk0 = [k for k in ccc if k not in self.dq.inx.uid]
    assert len(eevgid2) == 0, 'Request vars targeting invalid vgids: %s' % (str(eevgid2))
    assert len(eevgid1) == 0, 'Request var vgids targeting wrong record types : %s' % (str(eevgid1))

    for k in ccc:
      ccx[ tuple( sorted( list( ccc[k] ) ) ) ].add( k )
    kk = [ k for k in ccx.keys() if len( ccx[k] ) > 1 ]
    if len(kk) > 0:
      oo = open( 'purgeRqvGroup.csv', 'w' )
      for t in kk:
        s1 = [k for k in ccx[t] if 'requestLink' not in self.dq.inx.iref_by_sect[k].a or len(self.dq.inx.iref_by_sect[k].a['requestLink'] ) == 0]
        if len(s1) == len(ccx[t]):
          if len(s1) == 0:
            oo.write( '++++++++ pick [0]\n' )
          else:
            oo.write( '++++++++ pick\n' )
        elif len(s1) == len(ccx[t])-1:
          oo.write( '******** all\n' )
        elif len(s1) > 0:
          oo.write( '-------- all**\n' )
        for k in s1:
          i = self.dq.inx.uid[k]
          oo.write( '\t'.join( [i.__dict__[x] for x in ['mip','label','title','uid','ref','refNote'] ] ) + '\n' )
        if len( ccx[t] )-1 > len( s1 ):
          oo.write( 'xxxxxxxx review\n' )
          nn = 0
          for k in ccx[t]:
            if k not in s1:
              i = self.dq.inx.uid[k]
              nn += 1
              oo.write( '\t'.join( [i.__dict__[x] for x in ['mip','label','title','uid','ref','refNote'] ] ) + '\n' )
          if nn == 0:
            oo.write( '!!!!!!!!!!!\n' )
            oo.write( '\t'.join( s1 ) +'\n' )
            oo.write( '\t'.join( ccx[t] ) +'\n' )
        log.error( 'ERROR.rqv-group.00001: %s' % str(ccx[t]) )
      oo.close()
    else:
      log.error( 'INFO.rqv-group.00001: no request group duplication' )

    tt = [t for t in cc.keys() if len( cc[t] ) > 1]
    if len(tt) > 0:
      cc2 = collections.defaultdict( list )
      for t in tt:
        s1 = set( [i.label for i in cc[t]] )
        p1 = set( [i.priority for i in cc[t]] )
        if len(s1) > 1:
          log.error( 'ERROR.rqv-dup.9002: %s, %s' % (str(t),str(s1)) )
        elif len(p1) > 1:
          log.error( 'ERROR.rqv-dup.9003: %s, %s' % (str(t),str(p1)) )
        else:
          label = s1.pop()
          s1 = [i.mip for i in cc[t]]
          s2 = [m for m in s1 if m != 'CMIP5']
          if len(s2) == 0:
            cc2[label].append( (t,['CMIP5','CMIP5'] ) )
          elif len(s2) == 1:
            cc2[label].append( (t,[s2[0],'CMIP5'] ) )
          else:
            cc2[label].append( (t,sorted(s2) ) )
      oo = open( 'rqvPrune.csv', 'w' )
      for k in sorted( cc2.keys() ):
        for a,b in cc2[k]:
          log.error( 'ERROR.rqv-dup.0001: %s, %s' % (str(a),str(b)) )
          try:
            oo.write( '%s\t%s\t%s\t%s\t%s\n' % (k,a[0],a[1],b[0],b[1]) )
          except:
            print( 'SEVERE: FAILED TO WRITE: %s' % [k,a,b] )
      oo.close()

    ixx = [i for i in self.dq.coll['requestVar'].items if i.vid in self.dq.inx.uid]
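    # ii: requestVar records whose variable link resolves to 'remarks'; ii2: of those, the
    # label is not a known variable name; ii5/ii6: subsets still linking to a usable
    # requestVarGroup (see ii5Info below).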
    if len(ixx) < len( self.dq.inx.uid ):
      log.error( 'SEVERE.var.00301: request vars with bad links ....' )
    ii = [i for i in ixx if self.dq.inx.uid[i.vid]._h.label == 'remarks']
    ii2 = [i for i in ii if i.label not in self.cc]
    ii3 = [i.vgid for i in ii if self.dq.inx.uid[i.vgid]._h.label != 'remarks']
    ##ii4: set of invalid requestVarGroup records
    ii4 = [i for i in ii3 if 'requestLink' not in self.dq.inx.iref_by_sect[i].a ]
    ######
    ii5Info = 'Set of bad requestVar records linking to a valid requestVarGroup records.'
    ii5 = [i for i in ii if i.vgid not in ii4]
    ##ii6: set of bad requestVar records linking to a valid requestVarGroup records and no variable name match.
    ii6 = [i for i in ii5 if i.label not in self.cc]
    log.error( '%48s [%s]: %s [%s, %s, %s; %s]' % ('Bad variable links in requestVar',len(self.dq.coll['requestVar'].items),len(ii),len(ii2), len(ii5), len(ii6), len(ii4)) )
    for iix,m in [(ii2,2),(ii5,5),(ii6,6),(ii4,4)]:
      thisl = min( [len(iix),6] )
      if thisl > 0:
        print( '>>>>>>>>>>> %s' % m )
        for i in iix[:thisl]:
          if type(i) == type( 'x' ):
            print( i )
          else:
            print( [i.label, i.uid ] )
    log.info ( str( ii5Info ))
    svg = set()
    for i in ii5:
      log.info( 'ii5: %s' % str([i.label, i.title, i.uid, i.mip, self.dq.inx.uid[i.vgid].title] ) )
      svg.add( i.vgid )
    for i in ii2:
      if i.vgid in self.dq.inx.uid and self.dq.inx.uid[i.vgid]._h.label != 'remarks':
        xx = '%s [%s]' % (self.dq.inx.uid[i.vgid].title, i.vgid)
      else:
        xx = i.vgid
      log.info( 'ii2: %s' % str([i.label, i.title, i.uid, i.mip, xx] ) )
    log.warn( 'rvg.0010: requestVarGroups with bad request vars:' )
    for u in sorted( list(svg) ):
      log.warn( 'rvg.0011: %s: %s' % (u, self.dq.inx.uid[u].title ))
    log.info( ' [# records]: # broken [var name not known, link to valid group, valid group and no var name; valid request groups' )
    showAllRqvErrors=False
    if showAllRqvErrors:
      log.info( '%s\n%s' % (str( [i.label for i in ii if i.label in self.cc]), str( [i.label for i in ii2] ) ) )
    ii1 = [i for i in ii if i.label in self.ccmv]
    listBadRequestVar=False
    if listBadRequestVar:
      for i in ii1:
        log.info( str( [i.label,i.mip,i.table,[x.frequency for x in self.ccmv[i.label]]] ) )

  def audit_rql(self):
    ii0 = [i for i in self.dq.coll['requestLink'].items if i.refid not in self.dq.inx.uid]
    assert len(ii0) == 0, 'Broken requestLink references to requestVarGroups: %s' % str([(x.label,x.uid) for x in ii0])
    ii = [i for i in self.dq.coll['requestLink'].items if self.dq.inx.uid[i.refid]._h.label == 'remarks']
    log.info( '%48s: %s (from %s)' % ('ERROR.rql.001: Bad request group links in requestLink',len(ii), len(self.dq.coll['requestLink'].items)) )
    rql_detail=True
    if rql_detail:
      for i in ii:
        log.info( '%s: %s, %s [%s]' % ('ERROR.rql.002: ',i.mip,i.label,i.uid) )

  def audit_rvg(self):
    ii = [i for i in self.dq.coll['requestVarGroup'].items if len( dq.inx.iref_by_sect[i.uid].a['requestVar']) == 0]
    ii1 = [i for i in ii if len( dq.inx.iref_by_sect[i.uid].a['requestLink']) != 0]
    log.info( '%48s: %s/%s (from %s)' % ('Request groups with no request links:',len(ii) - len(ii1),len(ii1), len(self.dq.coll['requestVarGroup'].items)) )
    listEmptyRequestVarGroups=True
    if listEmptyRequestVarGroups:
      for i in ii1:
        log.info ( 'INFO.rvgempty.0001: %s: %s, %s [%s]' % (i.label, i.title, i.mip, i.uid) )

    ## check Oclim -- a timeSlice should be specified.
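    ## The hard-wired uid below identifies the Oclim group (per the comment above); every
    ## requestItem reached through its requestLinks is expected to carry a tslice attribute.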
    i = 'ef12f514-5629-11e6-9079-ac72891c3257'
    nn = nne = 0
    for x in dq.inx.iref_by_sect[i].a['requestLink']:
      rql = dq.inx.uid[ x ]
      for i in dq.inx.iref_by_sect[x].a['requestItem']:
        rqi = dq.inx.uid[ i ]
        nn += 1
        if 'tslice' not in rqi.__dict__.keys():
          nne += 1
          log.error( 'ERROR.rvg.00005: missing timeSlice: %s, %s, %s' % (rql.label, rql.uid, rqi.label ) )
    log.info( 'INFO.rvg.00005: timeSlice checks for Oclim --- error count %s (%s)' % (nne,nn) )

    ## check subhourly.
    i = '33b0f666-a27a-11e6-bfbb-ac72891c3257'
    cc = collections.defaultdict( dict )
    for x in dq.inx.iref_by_sect[i].a['requestVar']:
      cmv = dq.inx.uid[ dq.inx.uid[x].vid ]
      st = dq.inx.uid[ cmv.stid ]
      sp = dq.inx.uid[ st.spid ]
      cc[sp.label][cmv.label] = cmv.uid
    ne = 0
    for k in sorted( cc.keys() ):
      if k.find( 'XY' ) != -1:
        log.error( 'ERROR.rvg.00002: bad shape in subhourly data: %s, %s' % (k,sorted( cc[k].keys() ) ) )
        ne += 1
    if ne == 0:
      log.info( 'INFO.rvg.00002: sub hourly variable shapes OK' )

##dq = dreq.loadDreq(manifest=None)
dq = dreq.loadDreq(manifest='dreqManifest_audit.txt')
a = Auditor( dq )
cd = checkDims( dq )
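## Minimal usage sketch (assumptions: this module is saved as audit.py, dreqPy is installed,
## and the inputs referenced above -- dreqManifest_audit.txt, draft_newvars*.csv and the
## CMIP6_CVs json files -- exist at the hard-wired paths):
##   python audit.py            # logs written under logs/ with the default name 'audit'
##   python audit.py mycheck    # logs written under logs/ with the name 'mycheck'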