source: nappy/trunk/nappy/utils/compare_na.py @ 3628

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/nappy/trunk/nappy/utils/compare_na.py@3628
Revision 3628, 8.2 KB checked in by astephen, 14 years ago (diff)

Fixing defaults for delimiter and float format throughout.

Line 
1#!/usr/bin/env python
2
3"""
4compare_na.py
5=============
6
7Tool to compare contents of NASA Ames files or directories full of files.
8Allows you to compare headers and data blocks in NASA Ames.
9
10Usage:
11======
12
13    compare_na.py [-h | --header-only]  [-b | --body-only]
14                  [-n | --number-strict] [-a | --approx-equal]
15                  [-1 <delimiter_1> | --delimiter-1=<delimiter_1>]
16                  [-2 <delimiter_2> | --delimiter-2=<delimiter_2>]
17                  <item1> <item2>
18                                                                   
19
20Where:
21======
22
23    <item1> and <item2>         can either be a text file or directory.
24    -h | --header-only          selects compare only header(s)
25    -b | --body-only            selects compare only body(s)
26    -n | --number-strict        compares exact formatting of numbers in data block
27                                (default is to compare them by value).
28    -a | --approx-equal         considers any two numbers being compared the same as long
29                                as the difference between them is less than 1%.
30    <delimiter_1>               delimiter to use for file 1.
31    <delimiter_2>               delimiter to use for file 2.
32
33"""
34
35# Import standard library modules
36import os
37import sys
38import re
39import getopt
40
41# Import local modules
42from compare import *
43
# Threshold used by the approximate-equality check in compareSections()
equality_threshold = 0.01 # i.e. within 1% of each other

# Patterns for file names that compNAFiles() skips (tested with re.match, so
# each pattern is anchored at the start of the name). Raw strings are used so
# that "\." reaches the regex engine as an escaped dot instead of relying on
# Python passing an unrecognised string escape through unchanged.
file_exclusion_patterns = (r".*CSV.*", r".*svn.*", r"\..*", r".*\.pyc$", r".*~$")
file_exclusions = [re.compile(pattn) for pattn in file_exclusion_patterns]

# Matches any ASCII letter; compareSections() uses it to spot lines holding text
letter_match = re.compile("[a-zA-Z]")
48
49
def exitNicely(msg):
    """
    Tidy exit: prints the module usage text (__doc__), then `msg`, then
    stops the program.

    Raises SystemExit through sys.exit() called with no argument.
    """
    # A single parenthesised argument prints the same text whether print is
    # the Python 2 statement or the Python 3 function.
    print(__doc__)
    print(msg)
    # NOTE(review): sys.exit() without an argument reports success to the
    # shell even though every caller in this module passes an error message;
    # left unchanged here in case existing wrappers rely on it.
    sys.exit()
55
56
def compareNA(i1, i2, **kwargs):
    """
    Compares two items, each of which may be a file or a directory.

    The type of `i1` decides what happens:

      * file      - both items and **kwargs are passed to compNAFiles() and
                    its result is returned.
      * directory - both items are passed to compDirs() and its result is
                    returned; **kwargs are not forwarded.
      * otherwise - the program stops through exitNicely().

    Differences are reported at the command line by the functions called.
    """
    if os.path.isfile(i1):
        # Direct call with ** unpacking replaces apply(), which Python 3 removed
        return compNAFiles(i1, i2, **kwargs)

    if os.path.isdir(i1):
        # NOTE(review): compDirs is presumably supplied by
        # "from compare import *" - confirm its signature and return value.
        return compDirs(i1, i2)

    exitNicely("Cannot recognise/find item '" + i1 + "'.")
71
72
def _itemsDiffer(item1, item2, approx_equal=False):
    """
    Returns True if two string tokens should be reported as different.

    Tokens are compared as floats when both convert, otherwise as plain
    strings. With approx_equal=True two numbers are treated as equal when
    the ratio of one to the other is within equality_threshold of 1.
    """
    try:
        a = float(item1)
        b = float(item2)
    except ValueError:
        # At least one token is not a number, so only an exact text match
        # counts (approximate equality is meaningless for text).
        return item1 != item2

    if a == b:
        return False
    if not approx_equal:
        return True

    # Swap exact zeros for a tiny value so both divisions below are defined
    if a == 0:
        a = 0.000000001
    if b == 0:
        b = 0.000000001

    # Fold the ratio so that it is at most 1; (1 - ratio) is then the
    # relative gap between the two values. Values of opposite sign give a
    # negative ratio and therefore always exceed the threshold.
    ratio = a / b
    if ratio > 1:
        ratio = b / a
    return (1 - ratio) > equality_threshold


def compareSections(l1, l2, number_clever=True, approx_equal=False,
                    delimiter_1=None, delimiter_2=None):
    """
    Compares sections of NASA Ames files (i.e. headers and bodies).

    `l1` and `l2` are lists of lines. Lines are compared pairwise up to the
    length of the shorter list; surplus lines in the longer list are not
    examined here. Each line is stripped before comparison (the input lists
    themselves are left untouched).

    number_clever - if true, tokens that parse as numbers are compared by
                    value; if false, tokens must match as text.
    approx_equal  - if true (and number_clever is true), numbers within
                    equality_threshold of each other count as equal.
    delimiter_1/2 - passed to str.split() for lines of l1/l2; None splits
                    on white-space.

    Each differing line is printed with its 1-based position in the section.
    Returns True if no compared line differed, otherwise False.
    """
    all_same = True

    # zip() stops at the end of the shorter section
    for index, (raw1, raw2) in enumerate(zip(l1, l2)):
        line1 = raw1.strip()
        line2 = raw2.strip()

        # A line holding letters is not purely numeric, so an identical
        # pair can be accepted without splitting it up.
        if letter_match.search(line1) and line1 == line2:
            continue

        items1 = line1.split(delimiter_1)
        items2 = line2.split(delimiter_2)

        if len(items1) != len(items2):
            # Check that a space delimiter hasn't just split identical
            # lines to different lengths
            if len(items1) == 1 and items1[0].split() == items2:
                continue
            if len(items2) == 1 and items2[0].split() == items1:
                continue
            same = False
        elif not number_clever:
            same = (items1 == items2)
        else:
            same = not any(_itemsDiffer(x, y, approx_equal)
                           for x, y in zip(items1, items2))

        if not same:
            all_same = False
            print("Line %s:" % (index + 1))
            print(">>> %s" % line1)
            print("<<< %s" % line2)

    return all_same
145
146
def _compareNamedSection(singular, plural, f1, f2, part1, part2,
                         number_clever, approx_equal, delimiter_1, delimiter_2):
    """
    Compares one section (header or body) of two files, printing a report.
    Returns True only if all shared lines match and the line counts agree.
    """
    print("Comparing %s:" % plural)
    print(">>> %s %s:" % (f1, singular))
    print("<<< %s %s:" % (f2, singular))
    same = compareSections(part1, part2, number_clever, approx_equal,
                           delimiter_1, delimiter_2)

    if len(part1) != len(part2):
        # compareSections() only looks at the lines both sections share, so a
        # length mismatch has to be counted as a difference here.
        print("%s lengths differ:\n>>> %s: %s\n<<< %s: %s"
              % (singular.capitalize(), f1, len(part1), f2, len(part2)))
        same = False
    elif same:
        print("%s ARE IDENTICAL." % plural.upper())

    return same


def compNAFiles(f1, f2, header=True, body=True, number_clever=True, approx_equal=False,
                delimiter_1=None, delimiter_2=None):
    """
    Compares contents of two NASA Ames files f1 and f2.

    header=False or body=False will not compare these sections of the files.
    number_clever=True will compare 5.00000 and 5 making them equal.
    If approx_equal is True then approximate equality is good enough to return
    two numbers as being equal (within equality_threshold set at top of this
    module).
    delimiter_1 and delimiter_2 are the delimiters used to split the lines of
    f1 and f2 respectively; None splits on white-space.

    The first item on the first line of each file is read as the number of
    header lines; everything after that many lines is treated as the body.

    Returns True if every section compared is identical (same lines and same
    number of lines), False otherwise, and None if the name of f1 matches one
    of file_exclusions. Stops the program through exitNicely() if either path
    is not an existing file. An empty file or a non-integer first item is not
    handled here, so IndexError or ValueError propagate to the caller.
    """
    name = os.path.split(f1)[-1]
    # Ignore anything that is in exclusion list
    for excl in file_exclusions:
        if excl.match(name):
            print("IGNORING EXCLUDED file: %s" % f1)
            return

    # Check they exist
    for f in (f1, f2):
        if not os.path.isfile(f):
            exitNicely("CANNOT compare files as item does not exist:" + f)

    # Read each file inside a with-block so the handle is closed promptly
    with open(f1) as handle:
        l1 = handle.readlines()
    with open(f2) as handle:
        l2 = handle.readlines()

    # Note delimiter set as None will do split on white-space (which we want!)
    head_len1 = int(l1[0].split(delimiter_1)[0])
    head_len2 = int(l2[0].split(delimiter_2)[0])

    same = True

    if header:
        if not _compareNamedSection("header", "headers", f1, f2,
                                    l1[:head_len1], l2[:head_len2],
                                    number_clever, approx_equal,
                                    delimiter_1, delimiter_2):
            same = False

    if body:
        # Kept as a separate flag update so that a matching body cannot hide
        # a header mismatch found above.
        if not _compareNamedSection("body", "bodies", f1, f2,
                                    l1[head_len1:], l2[head_len2:],
                                    number_clever, approx_equal,
                                    delimiter_1, delimiter_2):
            same = False

    return same
204
205
def parseArgs(args):
    """
    Parses command line arguments.

    `args` is the argument list without the program name. Returns a tuple
    (files, arg_dict) where `files` is the list of the two positional items
    and `arg_dict` holds the keys "header", "body", "number_clever",
    "approx_equal", "delimiter_1" and "delimiter_2".

    Stops the program through exitNicely() if an option is not recognised,
    if the number of positional items is not exactly two, or if both
    header-only and body-only are selected.
    """
    arg_dict = {
        "header": True,
        "body": True,
        "number_clever": True,
        "approx_equal": False,
        "delimiter_1": None,
        "delimiter_2": None,
    }

    try:
        (arg_list, files) = getopt.getopt(args, "hbna1:2:", ["header-only", "body-only",
                        "number-strict", "approx-equal", "delimiter-1=", "delimiter-2="])
    except getopt.GetoptError as err:
        # getopt raises for unknown options and missing option values; turn
        # that into the usage message instead of a traceback.
        exitNicely("Invalid command line arguments: %s" % err)

    for arg, value in arg_list:
        if arg in ("--header-only", "-h"):
            arg_dict["body"] = False
        elif arg in ("--body-only", "-b"):
            arg_dict["header"] = False
        elif arg in ("--number-strict", "-n"):
            arg_dict["number_clever"] = False
        elif arg in ("--approx-equal", "-a"):
            arg_dict["approx_equal"] = True
        elif arg in ("--delimiter-1", "-1"):
            arg_dict["delimiter_1"] = value
        elif arg in ("--delimiter-2", "-2"):
            arg_dict["delimiter_2"] = value
        else:
            exitNicely("Unrecognised argument provided: " + arg)

    if len(files) != 2:
        exitNicely("Must provide exactly two file or directory names as command line arguments.")

    if not arg_dict["header"] and not arg_dict["body"]:
        exitNicely("Invalid selection: header-only and body-only cannot be selected together.")

    return (files, arg_dict)
245
246
def main(args):
    """
    Main controller.

    Parses `args` (the command line without the program name) with
    parseArgs() and passes the two items and the option dictionary to
    compareNA(), returning whatever compareNA() returns.
    """
    files, arg_dict = parseArgs(args)
    # Argument unpacking is the direct replacement for apply(), which was
    # removed in Python 3.
    return compareNA(*files, **arg_dict)
251   
252 
if __name__ == "__main__":
    # Script entry point: hand everything after the program name to main()
    main(sys.argv[1:])
Note: See TracBrowser for help on using the repository browser.