source: OpenRLabs-Git/deploy/rlabs-docker/web2py-rlabs/gluon/decoder.py

main
Last change on this file was 42bd667, checked in by David Fuertes <dfuertes@…>, 4 years ago

Historial Limpio

  • Property mode set to 100755
File size: 3.1 KB
Line 
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3
4"""
5Caller will hand this library a buffer and ask it to either convert
6it or auto-detect the type.
7
8Based on http://code.activestate.com/recipes/52257/
9
10Licensed under the PSF License
11"""
12from gluon._compat import to_unicode
13import codecs
14
# Leading-byte signatures from the XML specification's encoding
# auto-detection appendix, mapped to the name of the codec used to read the
# XML declaration.  None marks a byte whose value varies ("##" in the spec).
# The UCS-4 rows use Python's "utf_32_*" codec names so that codecs.lookup()
# can resolve them; "EBCDIC" is kept as a family name with no Python codec.
autodetect_dict = {
    (0x00, 0x00, 0xFE, 0xFF): "utf_32_be",
    (0xFF, 0xFE, 0x00, 0x00): "utf_32_le",
    (0xFE, 0xFF, None, None): "utf_16_be",
    (0xFF, 0xFE, None, None): "utf_16_le",
    (0x00, 0x3C, 0x00, 0x3F): "utf_16_be",
    (0x3C, 0x00, 0x3F, 0x00): "utf_16_le",
    (0x3C, 0x3F, 0x78, 0x6D): "utf_8",
    (0x4C, 0x6F, 0xA7, 0x94): "EBCDIC",
}


def _declared_encoding(text):
    """Return the encoding named in the XML declaration of ``text``, or None.

    ``text`` is already-decoded text.  None is returned when the text does
    not start with ``<?xml``, has no ``encoding`` pseudo-attribute, or the
    attribute value is empty or not properly quoted.
    """
    if not text.startswith("<?xml"):
        return None

    # Only look inside the declaration itself, so an attribute called
    # "encoding" further along the same line is not picked up by mistake.
    # Without a closing "?>" fall back to the first line.
    decl_end = text.find("?>")
    if decl_end != -1:
        declaration = text[:decl_end]
    else:
        declaration = text.split("\n", 1)[0]

    marker = declaration.find("encoding")
    if marker == -1:
        return None

    # The opening quote is whichever quote character comes first after the
    # attribute name; a later attribute may use the other kind of quote.
    quote_positions = [
        pos
        for pos in (declaration.find('"', marker), declaration.find("'", marker))
        if pos != -1
    ]
    if not quote_positions:
        return None
    start = min(quote_positions)
    end = declaration.find(declaration[start], start + 1)
    if end == -1:  # unterminated value: nothing trustworthy to return
        return None
    return declaration[start + 1:end] or None


def autoDetectXMLEncoding(buffer):
    """ buffer -> encoding_name

    Guess the encoding of an XML document from its first four bytes and,
    when possible, refine the guess with the ``encoding`` attribute of the
    XML declaration.

    ``buffer`` is normally a byte string; already-decoded text is accepted
    too and is inspected without decoding.

    Returns "utf_8" (the XML default) when the buffer is shorter than four
    bytes or matches no known signature.  The returned name might not have
    an installed decoder (e.g. "EBCDIC", or whatever the document declares);
    in that case the name is returned as-is rather than raising.
    """
    encoding = "utf_8"  # default according to the XML spec
    if len(buffer) < 4:
        return encoding

    is_text = isinstance(buffer, type(u""))
    if is_text:
        signature = tuple(map(ord, buffer[0:4]))
    else:
        # bytearray() yields integers on both Python 2 and Python 3, whereas
        # ord() fails on the ints produced by iterating Python 3 bytes.
        signature = tuple(bytearray(buffer[0:4]))

    guess = autodetect_dict.get(signature)
    if not guess:
        # try again treating the last two bytes as variable
        guess = autodetect_dict.get(signature[:2] + (None, None))
    if not guess:
        return encoding
    encoding = guess  # best guess so far; the declaration may override it

    if is_text:
        decoded = buffer
    else:
        try:
            decode = codecs.lookup(encoding)[1]
        except LookupError:
            # No decoder for the guessed family, so the declaration cannot
            # be read; report the guess itself.
            return encoding
        # "replace" keeps a body that is invalid in the guessed encoding
        # from aborting detection: only the ASCII declaration matters here.
        decoded = decode(buffer, "replace")[0]

    # A byte order mark decodes to U+FEFF and would hide the "<?xml" prefix.
    if decoded.startswith(u"\ufeff"):
        decoded = decoded[1:]

    return _declared_encoding(decoded) or encoding
75
76
def decoder(buffer):
    """Pass ``buffer`` to ``to_unicode`` with the charset that
    ``autoDetectXMLEncoding`` reports for it, and return the result.
    """
    return to_unicode(buffer, charset=autoDetectXMLEncoding(buffer))
Note: See TracBrowser for help on using the repository browser.