source: nappy/trunk/nappy/utils/compare_na.py @ 3625

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/nappy/trunk/nappy/utils/compare_na.py@3625
Revision 3625, 7.8 KB checked in by astephen, 13 years ago (diff)

Cleaned up some of the data files and improved unit tests and comparison code.

Line 
1#!/usr/bin/env python
2
3"""
4compare_na.py
5=============
6
7Tool to compare contents of NASA Ames files or directories full of files.
8Allows you to compare headers and data blocks in NASA Ames.
9
10Usage:
11======
12
13    compare_na.py [-h | --header-only]  [-b | --body-only]
14                  [-n | --number-strict] [-a | --approx-equal]
15                  [-1 <delimiter_1> | --delimiter-1=<delimiter_1>]
16                  [-2 <delimiter_2> | --delimiter-2=<delimiter_2>]
17                  <item1> <item2>
18                                                                   
19
20Where:
21======
22
23    <item1> and <item2>         can either be a text file or directory.
24    -h | --header-only          selects compare only header(s)
25    -b | --body-only            selects compare only body(s)
26    -n | --number-strict        compares exact formatting of numbers in data block
27                                (default is to compare them by value).
28    -a | --approx-equal         considers any two numbers being compared the same as long
29                                as the difference between them is less than 1%.
30    <delimiter_1>               delimiter to use for file 1.
31    <delimiter_2>               delimiter to use for file 2.
32
33"""
34
35# Import standard library modules
36import os
37import sys
38import re
39import getopt
40
41# Import local modules
42from compare import *
43
# Relative tolerance used when numbers are compared with approx_equal=True
equality_threshold = 0.01 # i.e. within 1% of each other

# File names matching any of these patterns are skipped by compNAFiles().
# Patterns containing a backslash are raw strings so that "\." reaches the
# regex engine unchanged instead of being an invalid string escape.
file_exclusion_patterns = (".*CSV.*", ".*svn.*", r"\..*", r".*\.pyc$", ".*~$")
file_exclusions = [re.compile(pattn) for pattn in file_exclusion_patterns]
47
48
def exitNicely(msg):
    """
    Tidy exit: prints the module usage text followed by msg, then exits.

    Exits by calling sys.exit() with no argument (raising SystemExit), so
    the process exit status is 0 even when msg reports an error.
    """
    # A parenthesised single-argument print produces the same output under
    # the Python 2 print statement and the Python 3 print function.
    print(__doc__)
    print(msg)
    sys.exit()
54
55
def compareNA(i1, i2, **kwargs):
    """
    Compares two items, dispatching on whether i1 is a file or a directory.

    If i1 is a file, i1 and i2 are passed to compNAFiles() and **kwargs are
    forwarded to it as keyword arguments.
    If i1 is a directory, i1 and i2 are passed to compDirs(); the keyword
    arguments are NOT forwarded in that case.
    If i1 is neither, the usage text and an error are printed and the
    program exits.

    Returns whatever the function that was dispatched to returns.
    """
    if os.path.isfile(i1):
        # Argument unpacking replaces apply(), which no longer exists in Python 3
        return compNAFiles(i1, i2, **kwargs)
    elif os.path.isdir(i1):
        return compDirs(i1, i2)
    else:
        exitNicely("Cannot recognise/find item '" + i1 + "'.")
70
71
def _itemsMatch(item1, item2, approx_equal, threshold):
    """
    Returns True if two fields taken from a pair of lines match by value.

    If both fields convert to float they are compared numerically (so
    "5.00000" matches "5"), otherwise they are compared as text.
    If approx_equal is True, two unequal numbers still match when the
    larger-magnitude value exceeds the smaller by no more than the
    fraction given by threshold. Numbers of opposite sign never match.
    """
    try:
        a = float(item1)
        b = float(item2)
    except ValueError:
        # At least one field is not a number: text can only match exactly
        return item1 == item2

    if a == b:
        return True
    if not approx_equal:
        return False

    # Substitute a tiny value for zero so that the ratio can be computed
    if a == 0:  a = 0.000000001
    if b == 0:  b = 0.000000001

    ratio = a / b
    if ratio < 0:
        return False
    if ratio < 1:
        # Always express the ratio as larger/smaller, i.e. >= 1
        ratio = b / a
    return (ratio - 1) <= threshold


def compareSections(l1, l2, number_clever=True, approx_equal=False,
                    delimiter_1=None, delimiter_2=None, threshold=None):
    """
    Compares sections of NASA Ames files (i.e. headers and bodies).

    l1 and l2 are lists of lines. They are compared pair-wise and only as
    far as the end of the shorter list; any difference in the number of
    lines is left for the caller to detect.
    Each line is split on delimiter_1 (for l1) or delimiter_2 (for l2);
    None means split on white-space. The line terminator is removed before
    splitting so that it cannot end up as part of the last field.

    number_clever=True compares fields by value (5.00000 equals 5),
    number_clever=False requires the fields to be identical strings.
    approx_equal=True additionally treats numbers as equal if they are
    within threshold of each other; threshold defaults to the module
    level equality_threshold.

    Each pair of lines that differs is printed along with its line number
    (counting from 1 within the section).
    Returns True if no differing lines were found, otherwise False.
    """
    if approx_equal and threshold is None:
        threshold = equality_threshold

    all_same = True

    for i, (line1, line2) in enumerate(zip(l1, l2)):
        items1 = line1.rstrip("\r\n").split(delimiter_1)
        items2 = line2.rstrip("\r\n").split(delimiter_2)

        if len(items1) != len(items2):
            # Check that space delimiter hasn't just split identical lines to different lengths
            if len(items1) == 1 and items1[0].split() == items2:
                continue
            if len(items2) == 1 and items2[0].split() == items1:
                continue
            same = False
        elif not number_clever:
            same = (items1 == items2)
        else:
            same = all([_itemsMatch(item1, item2, approx_equal, threshold)
                        for (item1, item2) in zip(items1, items2)])

        if not same:
            all_same = False
            print("Line %s:" % (i + 1))
            print(">>> %s" % (line1,))
            print("<<< %s" % (line2,))

    return all_same
132
133
def compNAFiles(f1, f2, header=True, body=True, number_clever=True, approx_equal=False,
                delimiter_1=None, delimiter_2=None):
    """
    Compares contents of two NASA Ames files f1 and f2.
    header=False or body=False will not compare these sections of the files.
    number_clever=True will compare 5.00000 and 5 making them equal in the body.
    If approx_equal is True then approximate equality is good enough to return two
    numbers as being equal (within equality_threshold set at top of this module).
    delimiter_1 and delimiter_2 are the delimiters used to split the lines of
    f1 and f2 respectively; None means split on white-space.

    The number of header lines in each file is read from the first item on
    its first line. An empty file raises IndexError and a first item that is
    not an integer raises ValueError.

    Differences are printed as they are found.
    Returns None if the name of f1 matches one of the exclusion patterns.
    Otherwise returns True only if every section that was compared has the
    same number of lines in both files and no differing lines.
    Prints the usage text and exits if either file does not exist.
    """
    name = os.path.split(f1)[-1]
    # Ignore anything that is in exclusion list
    for excl in file_exclusions:
        if excl.match(name):
            print("IGNORING EXCLUDED file: %s" % (f1,))
            return

    # Check they exist
    for f in (f1, f2):
        if not os.path.isfile(f):
            exitNicely("CANNOT compare files as item does not exist:" + f)

    # Read both files, making sure that each file handle is closed again
    contents = []
    for f in (f1, f2):
        fobj = open(f)
        try:
            contents.append(fobj.readlines())
        finally:
            fobj.close()
    l1, l2 = contents

    # Note delimiter set as None will do split on white-space (which we want!)
    head_len1 = int(l1[0].split(delimiter_1)[0])
    head_len2 = int(l2[0].split(delimiter_2)[0])

    header1 = l1[:head_len1]
    header2 = l2[:head_len2]
    body1 = l1[head_len1:]
    body2 = l2[head_len2:]

    same = True
    if header:
        print("Comparing headers:")
        print(">>> %s header:" % f1)
        print("<<< %s header:" % f2)
        headers_same = compareSections(header1, header2, number_clever, approx_equal, delimiter_1, delimiter_2)
        # compareSections() only looks at the lines both sections have, so a
        # difference in length has to be counted as a difference here
        if len(header1) != len(header2):
            headers_same = False
            print("Header lengths differ:\n>>> %s: %s\n<<< %s: %s" % (f1, len(header1), f2, len(header2)))
        elif headers_same:
            print("HEADERS ARE IDENTICAL.")
        same = same and headers_same

    if body:
        print("Comparing bodies:")
        print(">>> %s body:" % f1)
        print("<<< %s body:" % f2)
        bodies_same = compareSections(body1, body2, number_clever, approx_equal, delimiter_1, delimiter_2)
        if len(body1) != len(body2):
            bodies_same = False
            print("Body lengths differ:\n>>> %s: %s\n<<< %s: %s" % (f1, len(body1), f2, len(body2)))
        elif bodies_same:
            print("BODIES ARE IDENTICAL.")
        # Combine with the header result rather than overwriting it
        same = same and bodies_same

    return same
191
192
def parseArgs(args):
    """
    Parses command-line arguments (without the program name).

    Returns a tuple of (files, arg_dict) where files is the list of the two
    positional arguments and arg_dict holds the keyword arguments for
    compNAFiles(): "header", "body", "number_clever", "approx_equal",
    "delimiter_1" and "delimiter_2".

    Prints the usage text and exits if the options cannot be parsed, if
    there are not exactly two positional arguments or if header-only and
    body-only are both selected.
    """
    arg_dict = {
        "header": True,
        "body": True,
        "number_clever": True,
        "approx_equal": False,
        "delimiter_1": None,
        "delimiter_2": None,
    }

    try:
        (arg_list, files) = getopt.getopt(args, "hbna1:2:", ["header-only", "body-only",
                        "number-strict", "approx-equal", "delimiter-1=", "delimiter-2="])
    except getopt.GetoptError:
        # sys.exc_info() is used to get hold of the error because it is
        # spelt the same way in Python 2 and Python 3
        exitNicely("Invalid command line arguments: %s" % (sys.exc_info()[1],))

    for arg, value in arg_list:
        if arg in ("--header-only", "-h"):
            arg_dict["body"] = False
        elif arg in ("--body-only", "-b"):
            arg_dict["header"] = False
        elif arg in ("--number-strict", "-n"):
            arg_dict["number_clever"] = False
        elif arg in ("--approx-equal", "-a"):
            arg_dict["approx_equal"] = True
        elif arg in ("--delimiter-1", "-1"):
            arg_dict["delimiter_1"] = value
        elif arg in ("--delimiter-2", "-2"):
            arg_dict["delimiter_2"] = value
        else:
            exitNicely("Unrecognised argument provided: " + arg)

    if len(files) != 2:
        exitNicely("Must provide exactly two file or directory names as command line arguments.")

    if not arg_dict["header"] and not arg_dict["body"]:
        exitNicely("Invalid selection: header-only and body-only cannot be selected together.")

    return (files, arg_dict)
232
233
def main(args):
    """
    Main controller: parses the command-line arguments in args and runs
    the comparison on the two items given.
    """
    files, arg_dict = parseArgs(args)
    # Argument unpacking replaces apply(), which no longer exists in Python 3
    compareNA(*files, **arg_dict)
238   
239 
if __name__ == "__main__":
    # Run as a script: hand everything after the program name to main()
    main(sys.argv[1:])
Note: See TracBrowser for help on using the repository browser.