Context Navigation

← Previous Revision
Next Revision →
Blame
Revision Log

source: OpenRLabs-Git/deploy/rlabs-docker/web2py-rlabs/gluon/decoder.py

main

Last change on this file was 42bd667, checked in by David Fuertes <dfuertes@…>, 4 years ago
Historial Limpio
Property mode set to `100755`
File size: 3.1 KB

Line
1	#!/usr/bin/env python
2	# -*- coding: utf-8 -*-
3
4	"""
5	Caller will hand this library a buffer and ask it to either convert
6	it or auto-detect the type.
7
8	Based on http://code.activestate.com/recipes/52257/
9
10	Licensed under the PSF License
11	"""
12	from gluon._compat import to_unicode
13	import codecs
14
# Maps the first four bytes of an XML document to the name of the encoding
# they indicate (byte-order marks and encoded forms of "<?" / "<?xm").
# None stands for a byte whose value may vary ("##" in the XML spec).
# Every name except "EBCDIC" is a codec registered with Python's ``codecs``
# module; "EBCDIC" only signals the encoding family and has no decoder.
autodetect_dict = {
    # byte pattern: encoding name
    (0x00, 0x00, 0xFE, 0xFF): "utf_32_be",  # UCS-4 byte-order mark, big-endian
    (0xFF, 0xFE, 0x00, 0x00): "utf_32_le",  # UCS-4 byte-order mark, little-endian
    (0xFE, 0xFF, None, None): "utf_16_be",  # UTF-16 byte-order mark, big-endian
    (0xFF, 0xFE, None, None): "utf_16_le",  # UTF-16 byte-order mark, little-endian
    (0x00, 0x3C, 0x00, 0x3F): "utf_16_be",  # "<?" in UTF-16, big-endian, no mark
    (0x3C, 0x00, 0x3F, 0x00): "utf_16_le",  # "<?" in UTF-16, little-endian, no mark
    (0x3C, 0x3F, 0x78, 0x6D): "utf_8",      # "<?xm" in an ASCII-compatible encoding
    (0x4C, 0x6F, 0xA7, 0x94): "EBCDIC",     # "<?xm" in EBCDIC (code page unknown)
}
26
27
def autoDetectXMLEncoding(buffer):
    """Guess the character encoding of the XML document held in ``buffer``.

    The guess starts at UTF-8 (the XML default), is refined from the first
    four bytes through ``autodetect_dict``, and is finally replaced by the
    name given in the ``encoding`` pseudo-attribute when the first line is
    an XML declaration that carries one.  Whenever a refinement step fails,
    the most recent guess is kept.

    :param buffer: the raw document bytes.  Buffers shorter than 4 bytes
        skip the byte-pattern step.
    :returns: an encoding name (never ``None``).  The name is not
        guaranteed to have an installed decoder (e.g. ``"EBCDIC"``); when
        the guessed name has none, the declaration cannot be read and the
        byte-pattern guess is returned as is.
    """
    # According to the XML spec this is the default.
    encoding = "utf_8"

    if len(buffer) >= 4:
        head = buffer[:4]
        if isinstance(head, (bytes, bytearray)):
            # bytearray() yields integer byte values for Python 2 ``str``
            # as well as Python 3 ``bytes`` (where ord() on an item fails).
            byte1, byte2, byte3, byte4 = bytearray(head)
        else:
            byte1, byte2, byte3, byte4 = map(ord, head)
        enc_info = autodetect_dict.get((byte1, byte2, byte3, byte4))
        if not enc_info:
            # Retry with the potentially variable bytes wildcarded.
            enc_info = autodetect_dict.get((byte1, byte2, None, None))
        if enc_info:
            # We've got a guess... this is the new default.
            encoding = enc_info

    # Try to find a more precise encoding using the XML declaration.
    try:
        decode = codecs.lookup(encoding)[1]
    except LookupError:
        # No decoder for the guessed encoding: the declaration is
        # unreadable, so the guess is the best available answer.
        return encoding

    # Only the first line matters here, so undecodable bytes elsewhere in
    # the document must not abort detection; decode leniently.
    decoded = decode(buffer, "replace")[0]
    first_line = decoded.split("\n")[0]
    if not first_line.startswith("<?xml"):
        return encoding

    encoding_pos = first_line.find("encoding")
    if encoding_pos == -1:
        return encoding

    # The value is delimited by whichever quote character appears first
    # after the attribute name; a later attribute may use the other kind.
    candidates = [
        pos
        for pos in (first_line.find('"', encoding_pos),
                    first_line.find("'", encoding_pos))
        if pos != -1
    ]
    if not candidates:
        return encoding

    start = min(candidates)
    end = first_line.find(first_line[start], start + 1)
    # Ignore an unterminated or empty value rather than returning garbage.
    if end > start + 1:
        encoding = first_line[start + 1:end]

    return encoding
75
76
def decoder(buffer):
    """Convert ``buffer`` with ``to_unicode`` using its auto-detected encoding.

    The encoding name comes from ``autoDetectXMLEncoding(buffer)`` and is
    handed to ``to_unicode`` as the ``charset`` argument.
    """
    return to_unicode(buffer, charset=autoDetectXMLEncoding(buffer))

Note: See TracBrowser for help on using the repository browser.

Download in other formats: