## The specification is not explicit, but the following assumes that there are
## no brackets inside a comment:
##
##   dim1: [dim2: [dim3: ...]] method [where type1 [over type2]] [within|over days|years] [(comment)]
##
## except that a comment can occur after every method ...
##
## Parse left to right:
##   each section starts with one or more "name:" dimension tokens,
##   followed by the method word,
##   optionally followed by clauses introduced by where|within|over|( .

import re

## s1 appears to work, splitting the string into dictionaries. Add "+$" to the
## end to see whether the string as a whole is a complete match.
## NOTE(review): the group names in this historical pattern were lost from the
## file; a/b/c/d below are a reconstruction -- confirm against project history.
##s1 = re.compile( '((?P<a>(([a-zA-Z][a-zA-Z0-9-_]*:\s+)+)([a-zA-Z]*\s*))(?P<b>where\s[a-zA-Z][a-zA-Z0-9-]*\s*){0,1}(?P<c>((over|within)\s(days|years)\s*){0,3})(?P<d>\(.*?\)){0,1})' )

cmparseString = r'((([a-zA-Z][a-zA-Z0-9-_]*:\s+)+)([a-zA-Z_ ]*[a-zA-Z](?=(\s|$)))(\s*(\(.*?\))){0,1})+'
## after re.compile( .. ) and findall, for each match,
## 0: matched string, 1: dimensions, 3: method, 6: comment

## Sample inputs: (cell_methods string, expected number of sections, expected validity)
a1 = ('dim: mid_range d2: point', 2, True)
a2 = ('d1: d2: mean d3: minimum (comment: test)', 2, True)
a3 = ('d1: d2: maximum d3: point', 2, True)
## The first a4 is immediately superseded by the second assignment below.
a4 = ('d1: d2: maximum d3: mean where x over days', 2, True)
a4 = ('d1: d2: maximum d3: mean where x over days within years (comment ...)', 2, True)
a5 = ('d1: d2: maximum where stuff d3: mean', 2, True)
a6 = ('d1: d2: maximum where stuff d3: point bad stuff here', 2, False)

## To check the "xxx" in "where xxx" we need to know about the data ... can't
## really check up front, unless the check is for conventional areas plus
## supplied additional terms ...


class check_cm(object):
    """Check whether a string is a valid CF cell_methods string.

    The first check is with a regular expression, check_cm.tstr, which parses
    a valid cell_methods string into sections. It does not check the validity
    of methods or dimensions, only words and generic syntax. The second stage
    validates the method of every section against check_cm.methods.

    Attributes set by __init__:
      methods -- set of accepted method names.
      tstr    -- regular expression source matching one section.
      s1      -- compiled "^(section)+$": matches only if the whole string
                 is a sequence of sections.
      s2      -- compiled single-section pattern, used to split a string
                 into its sections.
      res     -- result of the last successful test(): list of group
                 dictionaries (one per section), or None.
    """

    def __init__(self):
        self.methods = {'point', 'sum', 'mean', 'maximum', 'minimum',
                        'mid_range', 'standard_deviation', 'variance',
                        'mode', 'median'}
        # One section of a cell_methods string.  Named groups:
        #   a: dimensions plus method      e: method word (read by test())
        #   b: "where <type> [over <type>]"  f: the "over <type>" part of b
        #   c: up to three "over|within days|years|hours" clauses
        #   d: bracketed comment
        # NOTE(review): only the name 'e' is fixed by the code (test() reads
        # zz['e']); a, b, c, d and f are reconstructed names -- confirm
        # against any external consumer of self.res.
        self.tstr = (
            r'(('
            r'(?P<a>(([a-zA-Z][a-zA-Z0-9-_]*:\s+)+)(?P<e>[a-zA-Z_]*\s*))'
            r'(?P<b>where\s[a-zA-Z][a-zA-Z0-9_-]*\s*'
            r'(?P<f>over\s[a-zA-Z][a-zA-Z0-9_-]*\s*){0,1}){0,1}'
            r'(?P<c>((over|within)\s(days|years|hours)\s*){0,3})'
            r'(?P<d>\(.*?\)){0,1}'
            r')\s*)'
        )
        self.s1 = re.compile("^" + self.tstr + "+$")
        self.s2 = re.compile(self.tstr)
        # Defined up front so that reading .res after a failed first test()
        # does not raise AttributeError.
        self.res = None

    def test(self, a0, isv=True, nn=None, acceptEmpty=True):
        """Validate the cell_methods string a0; return True if it is accepted.

        a0          -- string to check; surrounding whitespace is ignored.
        isv         -- True if the caller expects a0 to be valid. It only
                       selects the wording of the printed message on failure
                       ("ERROR.xxx" versus "Bad string detected").
        nn          -- if not None, the exact number of sections the string
                       must parse into.
        acceptEmpty -- if True, an empty string is accepted (self.res is set
                       to None); otherwise it is reported as an error.

        On success self.res holds one group dictionary per section. On
        failure a message is printed, False is returned and self.res is left
        unchanged.
        """
        a = a0.strip()
        if a == '':
            if acceptEmpty:
                self.res = None
                return True
            print('ERROR.004: empty cell methods string')
            return False

        # Stage 1: the string as a whole must be a sequence of sections.
        if not self.s1.match(a):
            if isv:
                print('ERROR.002: match failed: %s' % a)
            else:
                print('Bad string detected: %s' % a)
            return False

        # Stage 2: split into sections and check each method name.
        z = [i.groupdict() for i in self.s2.finditer(a)]
        if nn is not None and len(z) != nn:
            print('ERROR.003: Failed to parse into sections: %s' % [z, len(z), nn])
            return False

        for zz in z:
            if zz['e'].strip() not in self.methods:
                if isv:
                    print('ERROR.004: bad method %s [%s]' % (zz['e'], zz))
                else:
                    print('Bad string detected (method wrong): %s' % a)
                return False

        self.res = z
        return True


if __name__ == '__main__':
    cc = check_cm()
    for a, nn, isv in [a1, a2, a3, a4, a5, a6]:
        cc.test(a, isv=isv, nn=nn)

    # Check every non-blank line of the file 'qq' in the working directory.
    with open('qq') as fh:
        for a in fh:
            l = a.strip()
            if len(l) != 0:
                r = cc.test(l)
                if not r:
                    print('ERROR: %s' % l)