source: TI02-CSML/trunk/csml/csmllibs/xmlEncoding.py @ 1484

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/TI02-CSML/trunk/csml/csmllibs/xmlEncoding.py@1484
Revision 1484, 2.9 KB checked in by domlowe, 13 years ago (diff)

moving xmlEncoding and NetCDFwriter into csmllibs

Line 
1import codecs, encodings
2#original code by Paul Prescod, submitted to ASPN
3
4"""Caller will hand this library a buffer and ask it to either convert
5it or auto-detect the type."""
6
7# None represents a potentially variable byte. "##" in the XML spec...
# Signature table: the first four bytes of an XML document mapped to the
# name of the encoding they imply.  Keys whose last two entries are None
# are matched on the first two bytes only; the lookup code retries with
# bytes three and four replaced by None when the exact key is absent.
autodetect_dict = {
    # --- byte-order marks ---
    (0x00, 0x00, 0xFE, 0xFF): "ucs4_be",
    (0xFF, 0xFE, 0x00, 0x00): "ucs4_le",
    (0xFE, 0xFF, None, None): "utf_16_be",
    (0xFF, 0xFE, None, None): "utf_16_le",
    # --- no byte-order mark: the document opens directly with "<?" ---
    (0x00, 0x3C, 0x00, 0x3F): "utf_16_be",
    (0x3C, 0x00, 0x3F, 0x00): "utf_16_le",
    (0x3C, 0x3F, 0x78, 0x6D): "utf_8",    # ASCII bytes of "<?xm"
    (0x4C, 0x6F, 0xA7, 0x94): "EBCDIC",   # no decoder may be installed
}
18
def autoDetectXMLEncoding(buffer):
    """ buffer -> encoding_name

    Guess the character encoding of an XML document from its first bytes.

    The first four bytes are matched against the signatures in
    autodetect_dict (exact match first, then on the first two bytes
    only).  When a signature matches and a decoder is installed for it,
    the start of the buffer is decoded and the value of the "encoding"
    pseudo-attribute of the XML declaration, if present, replaces the
    signature-based guess.

    buffer -- byte string holding the beginning of the document.  It
              should contain the complete XML declaration for the
              declared encoding to be found; buffers shorter than four
              bytes are accepted.

    Returns the encoding name.  When no signature matches, "utf_8" (the
    XML default) is returned.  Note that the returned name might not
    have an installed decoder (e.g. EBCDIC).
    """
    encoding = "utf_8"  # according to the XML spec, this is the default

    # Only the first four bytes take part in signature matching.  Iterating
    # a byte string yields 1-character strings on Python 2 and ints on
    # Python 3, so normalise to ints before building the lookup key.
    signature = []
    for item in buffer[:4]:
        if not isinstance(item, int):
            item = ord(item)
        signature.append(item)
    signature = tuple(signature)

    guess = None
    if len(signature) == 4:
        guess = autodetect_dict.get(signature)
    if not guess and len(signature) >= 2:
        # try autodetection again, ignoring the potentially variable bytes
        guess = autodetect_dict.get((signature[0], signature[1], None, None))
    if not guess:
        return encoding
    encoding = guess  # we've got a guess; from here on we only refine it

    try:
        decoder = codecs.getdecoder(encoding)
    except LookupError:
        # No codec is installed under this name, so the declaration
        # cannot be read; report the signature-based guess as is.
        return encoding

    # The XML declaration sits at the very start of the document, so a
    # bounded prefix of the *untruncated* buffer is enough.  "replace"
    # keeps invalid bytes, or a character cut in half at the end of the
    # prefix, from raising.
    decoded = decoder(buffer[:1024], "replace")[0]
    if not decoded.startswith("<?xml"):
        return encoding

    # Search inside the declaration only.  It may legally span several
    # lines, so prefer its "?>" terminator and fall back to the first
    # line when the terminator is not inside the decoded prefix.
    decl_end = decoded.find("?>")
    if decl_end != -1:
        declaration = decoded[:decl_end]
    else:
        declaration = decoded.split("\n")[0]

    name_pos = declaration.find("encoding")
    if name_pos == -1:
        return encoding

    # The value may be delimited by either quote character.  Use whichever
    # opens first after "encoding" so that a later attribute quoted with
    # the other character is not picked up by mistake.
    candidates = [pos for pos in (declaration.find('"', name_pos),
                                  declaration.find("'", name_pos))
                  if pos != -1]
    if not candidates:
        return encoding
    open_pos = min(candidates)
    close_pos = declaration.find(declaration[open_pos], open_pos + 1)

    # Accept the declared name only when it is properly closed and
    # non-empty; otherwise keep the signature-based guess.
    if close_pos > open_pos + 1:
        encoding = declaration[open_pos + 1:close_pos]
    return encoding
Note: See TracBrowser for help on using the repository browser.