source: TI02-CSML/trunk/parser/API/xmlEncoding.py @ 1438

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/TI02-CSML/trunk/parser/API/xmlEncoding.py@1438
Revision 1438, 2.9 KB checked in by domlowe, 13 years ago (diff)

Fix for XML unicode problems (I think). Tests files to see what encoding they use, then passes the correct encoding information to elementtree. Requires further testing.

Line 
import codecs
import encodings
import re
#original code by Paul Prescod, submitted to ASPN
3
"""Caller will hand this library a buffer and ask it to either convert
it or auto-detect the type."""
6
# Leading byte patterns of an XML document mapped to the Python codec that can
# read its XML declaration.  None represents a potentially variable byte
# ("##" in the XML spec).
autodetect_dict = {
    # byte pattern            : codec name
    (0x00, 0x00, 0xFE, 0xFF): "utf_32_be",  # UCS-4 byte order mark
    (0xFF, 0xFE, 0x00, 0x00): "utf_32_le",  # UCS-4 byte order mark
    (0xFE, 0xFF, None, None): "utf_16_be",  # UTF-16 byte order mark
    (0xFF, 0xFE, None, None): "utf_16_le",  # UTF-16 byte order mark
    (0x00, 0x3C, 0x00, 0x3F): "utf_16_be",  # "<?" without a BOM
    (0x3C, 0x00, 0x3F, 0x00): "utf_16_le",  # "<?" without a BOM
    (0x3C, 0x3F, 0x78, 0x6D): "utf_8",      # "<?xm" in an ASCII superset
    (0x4C, 0x6F, 0xA7, 0x94): "EBCDIC",     # "<?xm"; no Python codec of this name
}

# The XML declaration sits at the very start of the document, so only a short
# prefix has to be decoded to read it.
_DECLARATION_SCAN_BYTES = 1024

# encoding="name" / encoding='name' inside the XML declaration.  The closing
# quote must be the same character as the opening one.
_ENCODING_DECL_RE = re.compile(
    r"""\sencoding\s*=\s*(["'])([A-Za-z][A-Za-z0-9._\-]*)\1""")


def autoDetectXMLEncoding(buffer):
    """Guess the character encoding of an XML document from its first bytes.

    buffer -> encoding_name

    buffer is the raw (undecoded) start of the document.  Four bytes are
    enough to recognise a byte pattern; pass more (ideally the whole first
    line) so that the encoding named in the XML declaration can be read.

    Returns "utf_8", the XML default, when nothing can be detected; this
    includes empty or very short buffers.  Otherwise returns the encoding
    named in the XML declaration when one is found, or else the codec name
    guessed from the leading bytes.  Note that the returned name might not
    have an installed decoder (e.g. EBCDIC).
    """
    encoding = "utf_8"  # according to the XML spec, this is the default
    if not buffer:
        return encoding

    # Indexing yields ints for bytes/bytearray and 1-char strings for a
    # Python 2 str; normalise both to ints.
    head = tuple([b if isinstance(b, int) else ord(b) for b in buffer[:4]])
    # Pad short input so that it simply fails to match instead of raising.
    head = head + (None,) * (4 - len(head))

    guess = autodetect_dict.get(head)
    if not guess:
        # try autodetection again removing potentially variable bytes
        guess = autodetect_dict.get(head[:2] + (None, None))
    if not guess:
        return encoding

    encoding = guess  # we've got a guess... this is the new default

    # Try to find a more precise encoding using the XML declaration.
    try:
        decode = codecs.getdecoder(guess)
    except LookupError:
        # No decoder installed for the guess: the guess is the best answer.
        return encoding

    # "replace" keeps a prefix cut in the middle of a character, or bytes
    # that are invalid in the guessed codec, from raising.
    text = decode(buffer[:_DECLARATION_SCAN_BYTES], "replace")[0]
    if text.startswith(u"\ufeff"):
        # The endian-specific codecs leave the byte order mark in the text.
        text = text[1:]
    if not text.startswith(u"<?xml"):
        return encoding

    # Only look inside the declaration itself, so that the word "encoding"
    # later in the document is never mistaken for the pseudo-attribute.
    end = text.find(u"?>")
    if end != -1:
        declaration = text[:end]
    else:
        declaration = text.split(u"\n")[0]

    match = _ENCODING_DECL_RE.search(declaration)
    if match:
        encoding = match.group(2)
    return encoding
Note: See TracBrowser for help on using the repository browser.