source: OpenRLabs-Git/deploy/rlabs-docker/web2py-rlabs/gluon/contrib/pyuca/pyuca.py

main
Last change on this file was 42bd667, checked in by David Fuertes <dfuertes@…>, 4 years ago

Historial Limpio

  • Property mode set to 100755
File size: 4.8 KB
Line 
# pyuca - Unicode Collation Algorithm
# Version: 2012-06-21
#
# James Tauber
# http://jtauber.com/

# Copyright (c) 2006-2012 James Tauber and contributors
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
26
27
"""
Preliminary implementation of the Unicode Collation Algorithm.

This only implements the simple parts of the algorithm but I have successfully
tested it using the Default Unicode Collation Element Table (DUCET) to collate
Ancient Greek correctly.

Usage example:

    from pyuca import Collator
    c = Collator("allkeys.txt")

    sorted_words = sorted(words, key=c.sort_key)

allkeys.txt (1 MB) is available at

    http://www.unicode.org/Public/UCA/latest/allkeys.txt

but you can always subset this for just the characters you are dealing with.
"""
48
49
class Node:
    """A single trie node.

    ``value`` holds the payload stored for the key that ends at this node
    (``None`` when no key ends here); ``children`` maps the next key part to
    the child node reached through it.
    """

    def __init__(self):
        self.value = None
        self.children = {}
55
56
class Trie:
    """Prefix tree mapping sequences of hashable parts to values."""

    def __init__(self):
        self.root = Node()

    def add(self, key, value):
        """Store *value* under *key*, creating intermediate nodes as needed.

        *key* is any iterable of hashable parts; an existing value for the
        same key is overwritten.
        """
        node = self.root
        for part in key:
            node = node.children.setdefault(part, Node())
        node.value = value

    def find_prefix(self, key):
        """Return ``(value, remainder)`` for the longest stored prefix of *key*.

        *key* must be a sliceable sequence.  ``value`` is the payload of the
        longest prefix of *key* that has a value stored, and ``remainder`` is
        the part of *key* following that prefix.  When no prefix of *key* has
        a stored value the result is ``(None, key[0:])``, i.e. nothing is
        consumed.
        """
        node = self.root
        best_value = None
        best_length = 0
        for depth, part in enumerate(key, 1):
            node = node.children.get(part)
            if node is None:
                break
            # Only a node that actually carries a value counts as a match.
            # Walking part-way into a longer key must not discard a shorter
            # prefix that was already matched.
            if node.value is not None:
                best_value = node.value
                best_length = depth
        return (best_value, key[best_length:])
77
78
79class Collator:
80
81    def __init__(self, filename):
82
83        self.table = Trie()
84        self.load(filename)
85
86    def load(self, filename):
87        for line in open(filename):
88            if line.startswith("#") or line.startswith("%"):
89                continue
90            if line.strip() == "":
91                continue
92            line = line[:line.find("#")] + "\n"
93            line = line[:line.find("%")] + "\n"
94            line = line.strip()
95           
96            if line.startswith("@"):
97                pass
98            else:
99                semicolon = line.find(";")
100                charList = line[:semicolon].strip().split()
101                x = line[semicolon:]
102                collElements = []
103                while True:
104                    begin = x.find("[")
105                    if begin == -1:
106                        break               
107                    end = x[begin:].find("]")
108                    collElement = x[begin:begin+end+1]
109                    x = x[begin + 1:]
110                   
111                    alt = collElement[1]
112                    chars = collElement[2:-1].split(".")
113                   
114                    collElements.append((alt, chars))
115                integer_points = [int(ch, 16) for ch in charList]
116                self.table.add(integer_points, collElements)
117   
118    def sort_key(self, string):
119       
120        collation_elements = []
121       
122        lookup_key = [ord(ch) for ch in string]
123        while lookup_key:
124            value, lookup_key = self.table.find_prefix(lookup_key)
125            if not value:
126                # Calculate implicit weighting for CJK Ideographs
127                # contributed by David Schneider 2009-07-27
128                # http://www.unicode.org/reports/tr10/#Implicit_Weights
129                value = []
130                value.append((".", ["%X" % (0xFB40 + (lookup_key[0] >> 15)), "0020", "0002", "0001"]))
131                value.append((".", ["%X" % ((lookup_key[0] & 0x7FFF) | 0x8000), "0000", "0000", "0000"]))
132                lookup_key = lookup_key[1:]
133            collation_elements.extend(value)
134        sort_key = []
135       
136        for level in range(4):
137            if level:
138                sort_key.append(0) # level separator
139            for element in collation_elements:
140                ce_l = int(element[1][level], 16)
141                if ce_l:
142                    sort_key.append(ce_l)
143       
144        return tuple(sort_key)
Note: See TracBrowser for help on using the repository browser.