1 | #!/usr/bin/env python |
---|
2 | # -*- coding: utf-8 -*- |
---|
3 | |
---|
4 | """ |
---|
5 | Callers hand this module a byte buffer and ask it either to decode it |
---|
6 | to unicode or to auto-detect its character encoding. |
---|
7 | |
---|
8 | Based on http://code.activestate.com/recipes/52257/ |
---|
9 | |
---|
10 | Licensed under the PSF License |
---|
11 | """ |
---|
12 | from gluon._compat import to_unicode |
---|
13 | import codecs |
---|
14 | |
---|
# Maps the first four bytes of an XML document to the encoding they imply.
# The patterns follow the autodetection table in the XML spec, which writes
# "##" for a byte that may hold any value; None plays that role here.
#
# Values are Python codec names wherever one exists. UCS-4 is spelled
# "utf_32_be" / "utf_32_le" because Python has no codec called "ucs4_*",
# so codecs.lookup() would raise LookupError for the old names.
autodetect_dict = {
    # byte pattern: encoding name
    (0x00, 0x00, 0xFE, 0xFF): "utf_32_be",   # UCS-4 BOM, big-endian
    (0xFF, 0xFE, 0x00, 0x00): "utf_32_le",   # UCS-4 BOM, little-endian
    (0xFE, 0xFF, None, None): "utf_16_be",   # UTF-16 BOM, big-endian
    (0xFF, 0xFE, None, None): "utf_16_le",   # UTF-16 BOM, little-endian
    (0x00, 0x3C, 0x00, 0x3F): "utf_16_be",   # "<?" without BOM, big-endian
    (0x3C, 0x00, 0x3F, 0x00): "utf_16_le",   # "<?" without BOM, little-endian
    (0x3C, 0x3F, 0x78, 0x6D): "utf_8",       # "<?xm" in an ASCII superset
    # "<?xm" in EBCDIC. The code page cannot be told from these bytes and
    # Python has no codec by this name, so it is kept as a plain label.
    (0x4C, 0x6F, 0xA7, 0x94): "EBCDIC",
}
---|
26 | |
---|
27 | |
---|
def autoDetectXMLEncoding(buffer):
    """Guess the character encoding of the XML document in ``buffer``.

    The guess starts at the XML default of UTF-8, is refined from the first
    four bytes through ``autodetect_dict`` and, when the first line opens
    with an XML declaration, is replaced by the name given in that
    declaration's ``encoding`` pseudo-attribute.

    Args:
        buffer: the raw document, normally a byte string. Fewer than four
            bytes skips the byte-pattern step. Text (unicode) input is
            inspected as-is, without decoding.

    Returns:
        The encoding name. ``"utf_8"`` is returned when nothing more
        specific is found. The name is not guaranteed to have an installed
        Python codec; callers must be ready for ``LookupError`` when they
        use it.
    """
    encoding = "utf_8"  # the default mandated by the XML spec

    # Iterating a byte string yields ints on Python 3 and one-character
    # strings on Python 2; normalise both to ints for the table lookup.
    head = tuple(b if isinstance(b, int) else ord(b) for b in buffer[:4])
    if len(head) == 4:
        guess = autodetect_dict.get(head)
        if not guess:
            # Retry with the last two bytes treated as "any value".
            guess = autodetect_dict.get((head[0], head[1], None, None))
        if guess:
            encoding = guess

    # Decode with the current guess so the XML declaration can be read.
    if isinstance(buffer, type(u"")):
        text = buffer
    else:
        try:
            decode = codecs.lookup(encoding).decode
        except LookupError:
            # No codec for the guessed name: the declaration cannot be
            # read, so the guess is the best answer available.
            return encoding
        # Only the declaration matters here, so bytes that are invalid in
        # the guessed encoding must not abort detection.
        text = decode(buffer, "replace")[0]

    first_line = text.split("\n")[0]
    # A leading byte order mark decodes to U+FEFF, so such input does not
    # start with "<?xml" and the byte-pattern guess is returned.
    if not first_line.startswith("<?xml"):
        return encoding

    # Search the declaration only, not markup that follows on the same line.
    decl_end = first_line.find("?>")
    declaration = first_line if decl_end == -1 else first_line[:decl_end]

    attr_pos = declaration.find("encoding")
    if attr_pos == -1:
        return encoding

    # The value is delimited by whichever quote character comes first after
    # "encoding"; a later attribute may use the other kind of quote.
    candidates = [pos
                  for pos in (declaration.find('"', attr_pos),
                              declaration.find("'", attr_pos))
                  if pos != -1]
    if not candidates:
        return encoding

    open_pos = min(candidates)
    close_pos = declaration.find(declaration[open_pos], open_pos + 1)
    # Keep the guess if the value is unterminated or empty.
    if close_pos > open_pos + 1:
        encoding = declaration[open_pos + 1:close_pos]

    return encoding
---|
75 | |
---|
76 | |
---|
def decoder(buffer):
    """Decode ``buffer`` using the encoding detected for it.

    The encoding name comes from ``autoDetectXMLEncoding(buffer)`` and is
    passed to ``to_unicode`` as its ``charset`` argument; the result of
    ``to_unicode`` is returned unchanged.
    """
    return to_unicode(buffer, charset=autoDetectXMLEncoding(buffer))
---|