Context Navigation

← Previous Revision
Next Revision →
Blame
Revision Log

source: OpenRLabs-Git/deploy/rlabs-docker/web2py-rlabs/gluon/contrib/pyuca/pyuca.py

main

Last change on this file was 42bd667, checked in by David Fuertes <dfuertes@…>, 4 years ago
Historial Limpio
Property mode set to `100755`
File size: 4.8 KB

Line
1	# pyuca - Unicode Collation Algorithm
2	# Version: 2012-06-21
3	#
4	# James Tauber
5	# http://jtauber.com/
6
7	# Copyright (c) 2006-2012 James Tauber and contributors
8	#
9	# Permission is hereby granted, free of charge, to any person obtaining a copy
10	# of this software and associated documentation files (the "Software"), to deal
11	# in the Software without restriction, including without limitation the rights
12	# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13	# copies of the Software, and to permit persons to whom the Software is
14	# furnished to do so, subject to the following conditions:
15	#
16	# The above copyright notice and this permission notice shall be included in
17	# all copies or substantial portions of the Software.
18	#
19	# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20	# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21	# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
22	# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23	# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
24	# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
25	# THE SOFTWARE.
26
27
28	"""
29	Preliminary implementation of the Unicode Collation Algorithm.
30
31	This only implements the simple parts of the algorithm but I have successfully
32	tested it using the Default Unicode Collation Element Table (DUCET) to collate
33	Ancient Greek correctly.
34
35	Usage example:
36
37	from pyuca import Collator
38	c = Collator("allkeys.txt")
39
40	sorted_words = sorted(words, key=c.sort_key)
41
42	allkeys.txt (1 MB) is available at
43
44	http://www.unicode.org/Public/UCA/latest/allkeys.txt
45
46	but you can always subset this for just the characters you are dealing with.
47	"""
48
49
50	class Node:
51
52	def __init__(self):
53	self.value = None
54	self.children = {}
55
56
57	class Trie:
58
59	def __init__(self):
60	self.root = Node()
61
62	def add(self, key, value):
63	curr_node = self.root
64	for part in key:
65	curr_node = curr_node.children.setdefault(part, Node())
66	curr_node.value = value
67
68	def find_prefix(self, key):
69	curr_node = self.root
70	remainder = key
71	for part in key:
72	if part not in curr_node.children:
73	break
74	curr_node = curr_node.children[part]
75	remainder = remainder[1:]
76	return (curr_node.value, remainder)
77
78
79	class Collator:
80
81	def __init__(self, filename):
82
83	self.table = Trie()
84	self.load(filename)
85
86	def load(self, filename):
87	for line in open(filename):
88	if line.startswith("#") or line.startswith("%"):
89	continue
90	if line.strip() == "":
91	continue
92	line = line[:line.find("#")] + "\n"
93	line = line[:line.find("%")] + "\n"
94	line = line.strip()
95
96	if line.startswith("@"):
97	pass
98	else:
99	semicolon = line.find(";")
100	charList = line[:semicolon].strip().split()
101	x = line[semicolon:]
102	collElements = []
103	while True:
104	begin = x.find("[")
105	if begin == -1:
106	break
107	end = x[begin:].find("]")
108	collElement = x[begin:begin+end+1]
109	x = x[begin + 1:]
110
111	alt = collElement[1]
112	chars = collElement[2:-1].split(".")
113
114	collElements.append((alt, chars))
115	integer_points = [int(ch, 16) for ch in charList]
116	self.table.add(integer_points, collElements)
117
118	def sort_key(self, string):
119
120	collation_elements = []
121
122	lookup_key = [ord(ch) for ch in string]
123	while lookup_key:
124	value, lookup_key = self.table.find_prefix(lookup_key)
125	if not value:
126	# Calculate implicit weighting for CJK Ideographs
127	# contributed by David Schneider 2009-07-27
128	# http://www.unicode.org/reports/tr10/#Implicit_Weights
129	value = []
130	value.append((".", ["%X" % (0xFB40 + (lookup_key[0] >> 15)), "0020", "0002", "0001"]))
131	value.append((".", ["%X" % ((lookup_key[0] & 0x7FFF) \| 0x8000), "0000", "0000", "0000"]))
132	lookup_key = lookup_key[1:]
133	collation_elements.extend(value)
134	sort_key = []
135
136	for level in range(4):
137	if level:
138	sort_key.append(0) # level separator
139	for element in collation_elements:
140	ce_l = int(element[1][level], 16)
141	if ce_l:
142	sort_key.append(ce_l)
143
144	return tuple(sort_key)

Note: See TracBrowser for help on using the repository browser.

Download in other formats: