1 | # pyuca - Unicode Collation Algorithm |
---|
2 | # Version: 2012-06-21 |
---|
3 | # |
---|
4 | # James Tauber |
---|
5 | # http://jtauber.com/ |
---|
6 | |
---|
7 | # Copyright (c) 2006-2012 James Tauber and contributors |
---|
8 | # |
---|
9 | # Permission is hereby granted, free of charge, to any person obtaining a copy |
---|
10 | # of this software and associated documentation files (the "Software"), to deal |
---|
11 | # in the Software without restriction, including without limitation the rights |
---|
12 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
---|
13 | # copies of the Software, and to permit persons to whom the Software is |
---|
14 | # furnished to do so, subject to the following conditions: |
---|
15 | # |
---|
16 | # The above copyright notice and this permission notice shall be included in |
---|
17 | # all copies or substantial portions of the Software. |
---|
18 | # |
---|
19 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
---|
20 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
---|
21 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
---|
22 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
---|
23 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
---|
24 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN |
---|
25 | # THE SOFTWARE. |
---|
26 | |
---|
27 | |
---|
28 | """ |
---|
29 | Preliminary implementation of the Unicode Collation Algorithm. |
---|
30 | |
---|
31 | This only implements the simple parts of the algorithm but I have successfully |
---|
32 | tested it using the Default Unicode Collation Element Table (DUCET) to collate |
---|
33 | Ancient Greek correctly. |
---|
34 | |
---|
35 | Usage example: |
---|
36 | |
---|
37 | from pyuca import Collator |
---|
38 | c = Collator("allkeys.txt") |
---|
39 | |
---|
40 | sorted_words = sorted(words, key=c.sort_key) |
---|
41 | |
---|
42 | allkeys.txt (1 MB) is available at |
---|
43 | |
---|
44 | http://www.unicode.org/Public/UCA/latest/allkeys.txt |
---|
45 | |
---|
46 | but you can always subset this for just the characters you are dealing with. |
---|
47 | """ |
---|
48 | |
---|
49 | |
---|
50 | class Node: |
---|
51 | |
---|
52 | def __init__(self): |
---|
53 | self.value = None |
---|
54 | self.children = {} |
---|
55 | |
---|
56 | |
---|
57 | class Trie: |
---|
58 | |
---|
59 | def __init__(self): |
---|
60 | self.root = Node() |
---|
61 | |
---|
62 | def add(self, key, value): |
---|
63 | curr_node = self.root |
---|
64 | for part in key: |
---|
65 | curr_node = curr_node.children.setdefault(part, Node()) |
---|
66 | curr_node.value = value |
---|
67 | |
---|
68 | def find_prefix(self, key): |
---|
69 | curr_node = self.root |
---|
70 | remainder = key |
---|
71 | for part in key: |
---|
72 | if part not in curr_node.children: |
---|
73 | break |
---|
74 | curr_node = curr_node.children[part] |
---|
75 | remainder = remainder[1:] |
---|
76 | return (curr_node.value, remainder) |
---|
77 | |
---|
78 | |
---|
79 | class Collator: |
---|
80 | |
---|
81 | def __init__(self, filename): |
---|
82 | |
---|
83 | self.table = Trie() |
---|
84 | self.load(filename) |
---|
85 | |
---|
86 | def load(self, filename): |
---|
87 | for line in open(filename): |
---|
88 | if line.startswith("#") or line.startswith("%"): |
---|
89 | continue |
---|
90 | if line.strip() == "": |
---|
91 | continue |
---|
92 | line = line[:line.find("#")] + "\n" |
---|
93 | line = line[:line.find("%")] + "\n" |
---|
94 | line = line.strip() |
---|
95 | |
---|
96 | if line.startswith("@"): |
---|
97 | pass |
---|
98 | else: |
---|
99 | semicolon = line.find(";") |
---|
100 | charList = line[:semicolon].strip().split() |
---|
101 | x = line[semicolon:] |
---|
102 | collElements = [] |
---|
103 | while True: |
---|
104 | begin = x.find("[") |
---|
105 | if begin == -1: |
---|
106 | break |
---|
107 | end = x[begin:].find("]") |
---|
108 | collElement = x[begin:begin+end+1] |
---|
109 | x = x[begin + 1:] |
---|
110 | |
---|
111 | alt = collElement[1] |
---|
112 | chars = collElement[2:-1].split(".") |
---|
113 | |
---|
114 | collElements.append((alt, chars)) |
---|
115 | integer_points = [int(ch, 16) for ch in charList] |
---|
116 | self.table.add(integer_points, collElements) |
---|
117 | |
---|
118 | def sort_key(self, string): |
---|
119 | |
---|
120 | collation_elements = [] |
---|
121 | |
---|
122 | lookup_key = [ord(ch) for ch in string] |
---|
123 | while lookup_key: |
---|
124 | value, lookup_key = self.table.find_prefix(lookup_key) |
---|
125 | if not value: |
---|
126 | # Calculate implicit weighting for CJK Ideographs |
---|
127 | # contributed by David Schneider 2009-07-27 |
---|
128 | # http://www.unicode.org/reports/tr10/#Implicit_Weights |
---|
129 | value = [] |
---|
130 | value.append((".", ["%X" % (0xFB40 + (lookup_key[0] >> 15)), "0020", "0002", "0001"])) |
---|
131 | value.append((".", ["%X" % ((lookup_key[0] & 0x7FFF) | 0x8000), "0000", "0000", "0000"])) |
---|
132 | lookup_key = lookup_key[1:] |
---|
133 | collation_elements.extend(value) |
---|
134 | sort_key = [] |
---|
135 | |
---|
136 | for level in range(4): |
---|
137 | if level: |
---|
138 | sort_key.append(0) # level separator |
---|
139 | for element in collation_elements: |
---|
140 | ce_l = int(element[1][level], 16) |
---|
141 | if ce_l: |
---|
142 | sort_key.append(ce_l) |
---|
143 | |
---|
144 | return tuple(sort_key) |
---|