source: TI02-CSML/trunk/csml/csmllibs/xmlEncoding.py @ 2633

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/TI02-CSML/trunk/csml/csmllibs/xmlEncoding.py@2633
Revision 2633, 2.8 KB checked in by domlowe, 12 years ago (diff)

added fix to check ordering of output axes, fixes latitude reversal problem

Line 
1import codecs, encodings
# original code by Paul Prescod, submitted to ASPN
3
4"""Caller will hand this library a buffer and ask it to either convert
5it or auto-detect the type."""
6
# Byte signatures used to guess the encoding family of an XML document from
# its first four bytes.  None represents a potentially variable byte ("##"
# in the XML spec); such patterns are only tried after an exact four-byte
# match has failed.
autodetect_dict = {
    # byte pattern            : codec name
    (0x00, 0x00, 0xFE, 0xFF): "utf_32_be",  # UCS-4 byte-order mark
    (0xFF, 0xFE, 0x00, 0x00): "utf_32_le",  # UCS-4 byte-order mark
    (0xFE, 0xFF, None, None): "utf_16_be",  # UTF-16 byte-order mark
    (0xFF, 0xFE, None, None): "utf_16_le",  # UTF-16 byte-order mark
    (0x00, 0x3C, 0x00, 0x3F): "utf_16_be",  # "<?" without a byte-order mark
    (0x3C, 0x00, 0x3F, 0x00): "utf_16_le",  # "<?" without a byte-order mark
    (0x3C, 0x3F, 0x78, 0x6D): "utf_8",      # "<?xm" in an ASCII superset
    (0x4C, 0x6F, 0xA7, 0x94): "EBCDIC",     # "<?xm" in EBCDIC (no such codec)
}

# Number of leading bytes decoded when looking for the XML declaration.  The
# declaration is only recognised at the very start of the document, so a
# short prefix is enough and the rest of the buffer is never decoded.
_XML_DECL_SCAN_BYTES = 1024


def _declared_encoding(declaration):
    """Return the quoted value following "encoding" in declaration, or None.

    None is returned when the keyword is absent, when no quote follows it,
    when the closing quote is missing, or when the quoted value is empty.
    """
    keyword_pos = declaration.find(u"encoding")
    if keyword_pos == -1:
        return None

    # Use whichever quote character comes FIRST after the keyword.  Preferring
    # double quotes unconditionally would pick up a later attribute, e.g.
    # encoding='x' standalone="yes" would yield "yes".
    quote_positions = [
        pos
        for pos in (declaration.find(quote, keyword_pos) for quote in (u'"', u"'"))
        if pos != -1
    ]
    if not quote_positions:
        return None

    open_pos = min(quote_positions)
    close_pos = declaration.find(declaration[open_pos], open_pos + 1)
    if close_pos == -1:
        # Unterminated value (e.g. a truncated buffer): do not guess.
        return None

    return declaration[open_pos + 1:close_pos].strip() or None


def autoDetectXMLEncoding(buffer):
    """Guess the character encoding of an XML document held in a byte buffer.

    buffer -> encoding_name

    The first four bytes are matched against autodetect_dict to find the
    encoding family.  If that succeeds and a decoder for the family is
    installed, the start of the buffer is decoded and the name given in the
    XML declaration (<?xml ... encoding="..."?>), if any, is returned instead
    of the family name.

    buffer must be a byte string (str on Python 2, bytes/bytearray on
    Python 3) and should hold at least the complete XML declaration.

    Returns "utf_8" (the XML default) when nothing can be detected, including
    for buffers that are empty or shorter than a signature.  The returned
    name might not have an installed decoder (e.g. "EBCDIC").
    """
    encoding = "utf_8"  # according to the XML spec, this is the default

    # bytearray() yields integer byte values on both Python 2 and Python 3.
    signature = tuple(bytearray(buffer[:4]))

    guess = None
    if len(signature) == 4:
        guess = autodetect_dict.get(signature)
    if not guess and len(signature) >= 2:
        # try autodetection again ignoring the potentially variable bytes
        guess = autodetect_dict.get(signature[:2] + (None, None))
    if not guess:
        return encoding
    encoding = guess

    try:
        decoder = codecs.getdecoder(encoding)
    except LookupError:
        # The family is known but cannot be decoded here, so the declaration
        # cannot be read; report the family name.
        return encoding

    # Decode the untruncated start of the buffer: the four signature bytes
    # alone can never contain the declaration.  "replace" keeps undecodable
    # bytes later in the prefix from aborting the detection.
    text = decoder(buffer[:_XML_DECL_SCAN_BYTES], "replace")[0]
    # A byte-order mark decodes to U+FEFF and would hide the "<?xml" prefix.
    text = text.lstrip(u"\ufeff")
    if not text.startswith(u"<?xml"):
        return encoding

    # Restrict the search to the declaration itself, which may span lines.
    declaration_end = text.find(u"?>")
    if declaration_end != -1:
        text = text[:declaration_end]

    return _declared_encoding(text) or encoding
Note: See TracBrowser for help on using the repository browser.