1 | #!/usr/bin/env python |
---|
2 | # -*- coding: utf-8 -*- |
---|
3 | """ |
---|
4 | | This file is part of the web2py Web Framework |
---|
5 | | Copyrighted by Massimo Di Pierro <mdipierro@cs.depaul.edu> |
---|
6 | | License: LGPLv3 (http://www.gnu.org/licenses/lgpl.html) |
---|
7 | | Created by Vladyslav Kozlovskyy (Ukraine) <dbdevelop©gmail.com> |
---|
8 | | for Web2py project |
---|
9 | |
---|
10 | Utilities and class for UTF8 strings managing |
---|
11 | ---------------------------------------------- |
---|
12 | """ |
---|
13 | from __future__ import print_function |
---|
14 | from gluon._compat import builtin as __builtin__, unicodeT, iteritems, to_unicode, to_native, reload |
---|
15 | |
---|
16 | __all__ = ['Utf8'] |
---|
17 | |
---|
# Translation tables (code point -> escaped unicode text) consumed by
# Utf8.__repr__ through unicode.translate():
#   * repr_escape_tab  - used when the result is wrapped in double quotes
#   * repr_escape_tab2 - used when the result is wrapped in single quotes,
#                        so the single quote itself is escaped as well
# FIXME PY3
repr_escape_tab = {}
for i in range(1, 32):
    # generic form for control characters: \xNN
    repr_escape_tab[i] = to_unicode("\\" + "x%02x" % i)
# control characters that have a short, conventional escape sequence,
# plus the backslash itself
repr_escape_tab.update({
    7: u'\\a',
    8: u'\\b',
    9: u'\\t',
    10: u'\\n',
    11: u'\\v',
    12: u'\\f',
    13: u'\\r',
    ord('\\'): u'\\\\',
})
repr_escape_tab2 = dict(repr_escape_tab)
repr_escape_tab2[ord('\'')] = u"\\'"
---|
32 | |
---|
33 | |
---|
def sort_key(s):
    """Return a collation key for the utf-8 or unicode string *s*.

    Unicode Collation Algorithm (UCA) (http://www.unicode.org/reports/tr10/)
    is used for utf-8 and unicode strings sorting and for utf-8 strings
    comparison.

    Note:
        pyuca is a very memory cost module! It loads the whole
        "allkeys.txt" file (~2mb!) into the memory. But this
        functionality is needed only when sort_key() is called as a
        part of sort() function or when Utf8 strings are compared.

        So, it is a lazy "sort_key" function which (ONLY ONCE, ON ITS
        FIRST CALL) imports pyuca and replaces itself (the module level
        name ``sort_key``) with a real sort_key() function.

        When pyuca cannot be loaded the replacement falls back to the
        lower-cased unicode text as the key.

    Args:
        s (str or unicode): native ``str`` values are decoded from utf-8,
            any other value is handed to the key function unchanged

    Returns:
        the key produced by pyuca, or the lower-cased text in fallback mode
    """
    global sort_key
    try:
        from gluon.contrib.pyuca import unicode_collator
        unicode_sort_key = unicode_collator.sort_key
    except Exception:
        # pyuca is optional. Catch Exception rather than using a bare
        # ``except`` so that KeyboardInterrupt / SystemExit raised while the
        # (large) collation table is loading are not silently swallowed.
        def _sort_key(s):
            return (to_unicode(s, 'utf-8') if isinstance(s, str) else s).lower()
    else:
        def _sort_key(s):
            return unicode_sort_key(
                to_unicode(s, 'utf-8') if isinstance(s, str) else s)
    # rebind the module level name: later calls go straight to the real key
    sort_key = _sort_key
    return sort_key(s)
---|
59 | |
---|
60 | |
---|
def ord(char):
    """Return the unicode code point of the single character *char*.

    *char* is SUPPOSED to be one utf-8 encoded or unicode character;
    anything that is not already unicode is converted with
    ``to_unicode(char, 'utf-8')`` before the builtin ord() is applied.
    """
    if not isinstance(char, unicodeT):
        char = to_unicode(char, 'utf-8')
    return __builtin__.ord(char)
---|
68 | |
---|
69 | |
---|
def chr(code):
    """Return the character with unicode id *code* as an Utf8 string.

    Args:
        code (int): unicode code point

    Returns:
        (Utf8): the corresponding character
    """
    # ``unichr`` exists only on Python 2. When the builtins module does not
    # provide it, fall back to the builtin chr(). The builtin is looked up
    # through ``__builtin__`` because this module redefines ``chr`` itself.
    to_char = getattr(__builtin__, 'unichr', None) or __builtin__.chr
    return Utf8(to_char(code))
---|
73 | |
---|
74 | |
---|
def size(string):
    """Return how many bytes *string* occupies as an utf-8 string.

    Note:
        *string* is converted with ``Utf8(string)`` first, so for a unicode
        string the size of the correspondent utf-8 string is returned.
    """
    packed = Utf8(string)
    return packed.__size__()
---|
82 | |
---|
83 | |
---|
def truncate(string, length, dots='...'):
    """Return *string* unchanged when it has at most *length* characters,
    otherwise cut it and append the *dots* suffix so that the result is
    exactly *length* characters long.

    Lengths are counted in characters, not in bytes.

    Args:
        string (str or unicode): utf-8 or unicode text to shorten
        length (int): max length of the result (in characters)
        dots (str or unicode): string suffix, used when string is cut

    Returns:
        (utf8-str): original or cut string

    Note:
        When *length* is smaller than ``len(dots)`` the suffix itself is
        cut, so the result never exceeds *length* characters.
    """
    text = to_unicode(string, 'utf-8')
    dots = to_unicode(dots, 'utf-8') if isinstance(dots, str) else dots
    if len(text) > length:
        # Clamp at 0: a negative slice bound would count from the END of the
        # text and produce a result longer than *length*.
        keep = max(length - len(dots), 0)
        text = (text[:keep] + dots)[:max(length, 0)]
    # to_native() (instead of .encode()) keeps the value a native str on both
    # Python 2 (utf-8 bytes) and Python 3 (text).
    return str.__new__(Utf8, to_native(text, 'utf-8'))
---|
100 | |
---|
101 | |
---|
class Utf8(str):
    """
    Class for utf8 string storing and manipulations

    The base presupposition of this class usage is:
    "ALL strings in the application are either of
    utf-8 or unicode type, even when simple str
    type is used. UTF-8 is only a "packed" version
    of unicode, so Utf-8 and unicode strings are
    interchangeable."

    CAUTION! This class is slower than str/unicode!
    Do NOT use it inside intensive loops. Simply
    decode string(s) to unicode before loop and
    encode it back to utf-8 string(s) after
    intensive calculation.

    Implementation notes:
      * methods whose result depends on character positions or character
        classes (len, indexing, case conversion, padding, find, ...) work
        on the decoded unicode text and pack the result back;
      * methods that only deal with whole sub-strings (split, partition,
        replace, ...) are delegated to ``str`` directly;
      * results are created with ``str.__new__(Utf8, ...)``, which skips
        ``Utf8.__new__`` because the value is already a native utf-8 str.

    You can see the benefit of this class in doctests() below
    """
    def __new__(cls, content='', codepage='utf-8'):
        """Create an Utf8 string.

        Args:
            content: unicode text, an utf-8 ``str``, or a ``str`` encoded
                with *codepage*
            codepage (str): encoding of *content* when it is not unicode
        """
        if isinstance(content, unicodeT):
            return str.__new__(cls, to_native(content, 'utf-8'))
        elif codepage in ('utf-8', 'utf8') or isinstance(content, cls):
            return str.__new__(cls, content)
        else:
            # other codepage: go through unicode to re-pack it as utf-8
            return str.__new__(cls, to_native(to_unicode(content, codepage), 'utf-8'))

    @staticmethod
    def _text(value):
        """Decode a native ``str`` from utf-8; return other values unchanged."""
        return to_unicode(value, 'utf-8') if isinstance(value, str) else value

    @staticmethod
    def _wrap(text):
        """Pack unicode *text* into a new Utf8 instance."""
        return str.__new__(Utf8, to_native(text, 'utf-8'))

    def __repr__(self):
        r''' # note that we use raw strings to avoid having to use double back slashes below
        NOTE! This function is a clone of web2py:gluon.languages.utf_repl() function::

        utf8.__repr__() works same as str.repr() when processing ascii string
        >>> repr(Utf8('abc')) == repr(Utf8("abc")) == repr('abc') == repr("abc") == "'abc'"
        True
        >>> repr(Utf8('a"b"c')) == repr('a"b"c') == '\'a"b"c\''
        True
        >>> repr(Utf8("a'b'c")) == repr("a'b'c") == '"a\'b\'c"'
        True
        >>> repr(Utf8('a\'b"c')) == repr('a\'b"c') == repr(Utf8("a'b\"c")) == repr("a'b\"c") == '\'a\\\'b"c\''
        True
        >>> repr(Utf8('a\r\nb')) == repr('a\r\nb') == "'a\\r\\nb'" # Test for \r, \n
        True

        Unlike str.repr(), Utf8.__repr__() remains utf8 content when processing utf8 string::

        >>> repr(Utf8('中文字')) == repr(Utf8("中文字")) == "'中文字'" != repr('中文字')
        True
        >>> repr(Utf8('中"文"字')) == "'中\"文\"字'" != repr('中"文"字')
        True
        >>> repr(Utf8("中'文'字")) == '"中\'文\'字"' != repr("中'文'字")
        True
        >>> repr(Utf8('中\'文"字')) == repr(Utf8("中'文\"字")) == '\'中\\\'文"字\'' != repr('中\'文"字') == repr("中'文\"字")
        True
        >>> repr(Utf8('中\r\n文')) == "'中\\r\\n文'" != repr('中\r\n文') # Test for \r, \n
        True
        '''
        if str.find(self, "'") >= 0 and str.find(self, '"') < 0:  # only single quote exists
            return '"' + to_native(to_unicode(self, 'utf-8').translate(repr_escape_tab), 'utf-8') + '"'
        else:
            return "'" + to_native(to_unicode(self, 'utf-8').translate(repr_escape_tab2), 'utf-8') + "'"

    def __size__(self):
        """ length of utf-8 string in bytes """
        return str.__len__(self)

    def __contains__(self, other):
        return str.__contains__(self, Utf8(other))

    def __getitem__(self, index):
        # index/slice by characters, not by bytes
        return Utf8._wrap(to_unicode(self, 'utf-8')[index])

    def __getslice__(self, begin, end):
        # Python 2 only protocol hook (simple slices s[a:b])
        return Utf8._wrap(to_unicode(self, 'utf-8')[begin:end])

    def __add__(self, other):
        return str.__new__(Utf8, str.__add__(self, to_native(other, 'utf-8')
                                             if isinstance(other, unicodeT) else other))

    def __len__(self):
        """ length of the string in characters (see __size__ for bytes) """
        return len(to_unicode(self, 'utf-8'))

    def __mul__(self, integer):
        return str.__new__(Utf8, str.__mul__(self, integer))

    def __eq__(self, string):
        # Only strings can be equal to an Utf8 string. Without this guard a
        # non-string is coerced by Utf8(), so Utf8('None') == None is True.
        if not isinstance(string, (str, unicodeT)):
            return NotImplemented
        return str.__eq__(self, Utf8(string))

    def __ne__(self, string):
        if not isinstance(string, (str, unicodeT)):
            return NotImplemented
        return str.__ne__(self, Utf8(string))

    # Defining __eq__ removes the inherited __hash__ on Python 3, which would
    # make Utf8 instances unusable as dict keys / set members.
    __hash__ = str.__hash__

    def capitalize(self):
        return Utf8._wrap(to_unicode(self, 'utf-8').capitalize())

    def center(self, length, fillchar=' '):
        return Utf8._wrap(to_unicode(self, 'utf-8').center(length, Utf8._text(fillchar)))

    def upper(self):
        return Utf8._wrap(to_unicode(self, 'utf-8').upper())

    def lower(self):
        return Utf8._wrap(to_unicode(self, 'utf-8').lower())

    def title(self):
        return Utf8._wrap(to_unicode(self, 'utf-8').title())

    def index(self, string):
        return to_unicode(self, 'utf-8').index(Utf8._text(string))

    def isalnum(self):
        return to_unicode(self, 'utf-8').isalnum()

    def isalpha(self):
        return to_unicode(self, 'utf-8').isalpha()

    def isdigit(self):
        return to_unicode(self, 'utf-8').isdigit()

    def islower(self):
        return to_unicode(self, 'utf-8').islower()

    def isspace(self):
        return to_unicode(self, 'utf-8').isspace()

    def istitle(self):
        return to_unicode(self, 'utf-8').istitle()

    def isupper(self):
        return to_unicode(self, 'utf-8').isupper()

    def zfill(self, length):
        return Utf8._wrap(to_unicode(self, 'utf-8').zfill(length))

    def join(self, iter):
        # a native str argument is joined character by character, not byte by byte
        return str.__new__(Utf8, str.join(self, [Utf8(c) for c in Utf8._text(iter)]))

    def lstrip(self, chars=None):
        if chars is None:
            return str.__new__(Utf8, str.lstrip(self))
        # Strip on decoded text: a byte-wise strip with multi-byte *chars*
        # can cut a character in half and leave invalid utf-8 behind.
        return Utf8._wrap(to_unicode(self, 'utf-8').lstrip(Utf8._text(chars)))

    def rstrip(self, chars=None):
        if chars is None:
            return str.__new__(Utf8, str.rstrip(self))
        return Utf8._wrap(to_unicode(self, 'utf-8').rstrip(Utf8._text(chars)))

    def strip(self, chars=None):
        if chars is None:
            return str.__new__(Utf8, str.strip(self))
        return Utf8._wrap(to_unicode(self, 'utf-8').strip(Utf8._text(chars)))

    def swapcase(self):
        return Utf8._wrap(to_unicode(self, 'utf-8').swapcase())

    def count(self, sub, start=0, end=None):
        unistr = to_unicode(self, 'utf-8')
        return unistr.count(
            Utf8._text(sub),
            start, len(unistr) if end is None else end)

    def decode(self, encoding='utf-8', errors='strict'):
        """ convert the utf-8 string to unicode """
        if hasattr(str, 'decode'):
            # Python 2. Call str.decode directly: to_unicode() would call
            # back into this method.
            return str.decode(self, encoding, errors)
        # no str.decode on this interpreter: let to_unicode() do the conversion
        return to_unicode(self, encoding, errors)

    def encode(self, encoding, errors='strict'):
        return to_unicode(self, 'utf-8').encode(encoding, errors)

    def expandtabs(self, tabsize=8):
        return Utf8._wrap(to_unicode(self, 'utf-8').expandtabs(tabsize))

    def find(self, sub, start=None, end=None):
        return to_unicode(self, 'utf-8').find(Utf8._text(sub), start, end)

    def ljust(self, width, fillchar=' '):
        return Utf8._wrap(to_unicode(self, 'utf-8').ljust(width, Utf8._text(fillchar)))

    def partition(self, sep):
        (head, sep, tail) = str.partition(self, Utf8(sep))
        return (str.__new__(Utf8, head),
                str.__new__(Utf8, sep),
                str.__new__(Utf8, tail))

    def replace(self, old, new, count=-1):
        return str.__new__(Utf8, str.replace(self, Utf8(old), Utf8(new), count))

    def rfind(self, sub, start=None, end=None):
        return to_unicode(self, 'utf-8').rfind(Utf8._text(sub), start, end)

    def rindex(self, string):
        return to_unicode(self, 'utf-8').rindex(Utf8._text(string))

    def rjust(self, width, fillchar=' '):
        return Utf8._wrap(to_unicode(self, 'utf-8').rjust(width, Utf8._text(fillchar)))

    def rpartition(self, sep):
        (head, sep, tail) = str.rpartition(self, Utf8(sep))
        return (str.__new__(Utf8, head),
                str.__new__(Utf8, sep),
                str.__new__(Utf8, tail))

    def rsplit(self, sep=None, maxsplit=-1):
        return [str.__new__(Utf8, part) for part in str.rsplit(self,
                None if sep is None else Utf8(sep), maxsplit)]

    def split(self, sep=None, maxsplit=-1):
        return [str.__new__(Utf8, part) for part in str.split(self,
                None if sep is None else Utf8(sep), maxsplit)]

    def splitlines(self, keepends=False):
        return [str.__new__(Utf8, part) for part in str.splitlines(self, keepends)]

    def startswith(self, prefix, start=0, end=None):
        unistr = to_unicode(self, 'utf-8')
        if isinstance(prefix, tuple):
            prefix = tuple(Utf8._text(s) for s in prefix)
        elif isinstance(prefix, str):
            prefix = to_unicode(prefix, 'utf-8')
        return unistr.startswith(prefix, start, len(unistr) if end is None else end)

    def translate(self, table, deletechars=''):
        if isinstance(table, dict):
            # unicode style table: {code point: code point / unicode / None}
            return Utf8._wrap(to_unicode(self, 'utf-8').translate(table))
        else:
            # NOTE: byte table + deletechars is the Python 2 str.translate()
            # signature; this branch is left untouched.
            return str.__new__(Utf8, str.translate(self, table, deletechars))

    def endswith(self, prefix, start=0, end=None):
        unistr = to_unicode(self, 'utf-8')
        if isinstance(prefix, tuple):
            prefix = tuple(Utf8._text(s) for s in prefix)
        elif isinstance(prefix, str):
            prefix = to_unicode(prefix, 'utf-8')
        return unistr.endswith(prefix, start, len(unistr) if end is None else end)

    if hasattr(str, 'format'):  # Python 2.5 hasn't got str.format() method
        def format(self, *args, **kwargs):
            args = [Utf8._text(s) for s in args]
            kwargs = dict((Utf8._text(k), Utf8._text(v))
                          for k, v in iteritems(kwargs))
            return Utf8._wrap(to_unicode(self, 'utf-8').format(*args, **kwargs))

    def __mod__(self, right):
        # decode every utf-8 str operand so that formatting happens on text
        if isinstance(right, tuple):
            right = tuple(Utf8._text(v) for v in right)
        elif isinstance(right, dict):
            right = dict((Utf8._text(k), Utf8._text(v))
                         for k, v in iteritems(right))
        elif isinstance(right, str):
            right = to_unicode(right, 'utf-8')
        return Utf8._wrap(to_unicode(self, 'utf-8').__mod__(right))

    # rich comparisons are ordered by sort_key() instead of by byte values

    def __ge__(self, string):
        return sort_key(self) >= sort_key(string)

    def __gt__(self, string):
        return sort_key(self) > sort_key(string)

    def __le__(self, string):
        return sort_key(self) <= sort_key(string)

    def __lt__(self, string):
        return sort_key(self) < sort_key(string)
---|
368 | |
---|
369 | |
---|
if __name__ == '__main__':
    # Self-test entry point: the docstring of doctests() is the test-suite.
    # NOTE: the examples are written in Python 2 syntax (print statement,
    # ``except Exception, e``, u'' reprs).
    def doctests():
        u"""
        doctests:
        >>> test_unicode=u'ПРоба Є PRobe'
        >>> test_unicode_word=u'ПРоба'
        >>> test_number_str='12345'
        >>> test_unicode
        u'\\u041f\\u0420\\u043e\\u0431\\u0430 \\u0404 PRobe'
        >>> print test_unicode
        ПРоба Є PRobe
        >>> test_word=test_unicode_word.encode('utf-8')
        >>> test_str=test_unicode.encode('utf-8')
        >>> s=Utf8(test_str)
        >>> s
        'ПРоба Є PRobe'
        >>> type(s)
        <class '__main__.Utf8'>
        >>> s == test_str
        True
        >>> len(test_str) # wrong length of utf8-string!
        19
        >>> len(test_unicode) # RIGHT!
        13
        >>> len(s) # RIGHT!
        13
        >>> size(test_str) # size of utf-8 string (in bytes) == len(str)
        19
        >>> size(test_unicode) # size of unicode string in bytes (packed to utf-8 string)
        19
        >>> size(s) # size of utf-8 string in bytes
        19
        >>> try: # utf-8 is a multibyte string. Convert it to unicode for use with builtin ord()
        ...     __builtin__.ord('б') # ascii string
        ... except Exception, e:
        ...     print 'Exception:', e
        Exception: ord() expected a character, but string of length 2 found
        >>> ord('б') # utf8.ord() is used(!!!)
        1073
        >>> ord(u'б') # utf8.ord() is used(!!!)
        1073
        >>> ord(s[3]) # utf8.ord() is used(!!!)
        1073
        >>> chr(ord(s[3])) # utf8.chr() and utf8.chr() is used(!!!)
        'б'
        >>> type(chr(1073)) # utf8.chr() is used(!!!)
        <class '__main__.Utf8'>
        >>> s=Utf8(test_unicode)
        >>> s
        'ПРоба Є PRobe'
        >>> s == test_str
        True
        >>> test_str == s
        True
        >>> s == test_unicode
        True
        >>> test_unicode == s
        True
        >>> print test_str.upper() # only ASCII characters uppered
        ПРоба Є PROBE
        >>> print test_unicode.upper() # unicode gives right result
        ПРОБА Є PROBE
        >>> s.upper() # utf8 class use unicode.upper()
        'ПРОБА Є PROBE'
        >>> type(s.upper())
        <class '__main__.Utf8'>
        >>> s.lower()
        'проба є probe'
        >>> type(s.lower())
        <class '__main__.Utf8'>
        >>> s.capitalize()
        'Проба є probe'
        >>> type(s.capitalize())
        <class '__main__.Utf8'>
        >>> len(s)
        13
        >>> len(test_unicode)
        13
        >>> s+'. Probe is проба'
        'ПРоба Є PRobe. Probe is проба'
        >>> type(s+'. Probe is проба')
        <class '__main__.Utf8'>
        >>> s+u'. Probe is проба'
        'ПРоба Є PRobe. Probe is проба'
        >>> type(s+u'. Probe is проба')
        <class '__main__.Utf8'>
        >>> s+s
        'ПРоба Є PRobeПРоба Є PRobe'
        >>> type(s+s)
        <class '__main__.Utf8'>
        >>> a=s
        >>> a+=s
        >>> a+=test_unicode
        >>> a+=test_str
        >>> a
        'ПРоба Є PRobeПРоба Є PRobeПРоба Є PRobeПРоба Є PRobe'
        >>> type(a)
        <class '__main__.Utf8'>
        >>> s*3
        'ПРоба Є PRobeПРоба Є PRobeПРоба Є PRobe'
        >>> type(s*3)
        <class '__main__.Utf8'>
        >>> a=Utf8("-проба-")
        >>> a*=10
        >>> a
        '-проба--проба--проба--проба--проба--проба--проба--проба--проба--проба-'
        >>> type(a)
        <class '__main__.Utf8'>
        >>> print "'"+test_str.center(17)+"'" # WRONG RESULT!
        'ПРоба Є PRobe'
        >>> s.center(17) # RIGHT!
        '  ПРоба Є PRobe  '
        >>> type(s.center(17))
        <class '__main__.Utf8'>
        >>> (test_word+test_number_str).isalnum() # WRONG RESULT! non ASCII chars are detected as non alpha
        False
        >>> Utf8(test_word+test_number_str).isalnum()
        True
        >>> s.isalnum()
        False
        >>> test_word.isalpha() # WRONG RESULT! Non ASCII characters are detected as non alpha
        False
        >>> Utf8(test_word).isalpha() # RIGHT!
        True
        >>> s.lower().islower()
        True
        >>> s.upper().isupper()
        True
        >>> print test_str.zfill(17) # WRONG RESULT!
        ПРоба Є PRobe
        >>> s.zfill(17) # RIGHT!
        '0000ПРоба Є PRobe'
        >>> type(s.zfill(17))
        <class '__main__.Utf8'>
        >>> s.istitle()
        False
        >>> s.title().istitle()
        True
        >>> Utf8('1234').isdigit()
        True
        >>> Utf8(' \t').isspace()
        True
        >>> s.join('•|•')
        '•ПРоба Є PRobe|ПРоба Є PRobe•'
        >>> s.join((str('(utf8 тест1)'), unicode('(unicode тест2)','utf-8'), '(ascii test3)'))
        '(utf8 тест1)ПРоба Є PRobe(unicode тест2)ПРоба Є PRobe(ascii test3)'
        >>> type(s)
        <class '__main__.Utf8'>
        >>> s==test_str
        True
        >>> s==test_unicode
        True
        >>> s.swapcase()
        'прОБА є prOBE'
        >>> type(s.swapcase())
        <class '__main__.Utf8'>
        >>> truncate(s, 10)
        'ПРоба Є...'
        >>> truncate(s, 20)
        'ПРоба Є PRobe'
        >>> truncate(s, 10, '•••') # utf-8 string as *dots*
        'ПРоба Є•••'
        >>> truncate(s, 10, u'®') # you can use unicode string as *dots*
        'ПРоба Є P®'
        >>> type(truncate(s, 10))
        <class '__main__.Utf8'>
        >>> Utf8(s.encode('koi8-u'), 'koi8-u')
        'ПРоба Є PRobe'
        >>> s.decode() # convert utf-8 string to unicode
        u'\\u041f\\u0420\\u043e\\u0431\\u0430 \\u0404 PRobe'
        >>> a='про\\tba'
        >>> str_tmp=a.expandtabs()
        >>> utf8_tmp=Utf8(a).expandtabs()
        >>> utf8_tmp.replace(' ','.') # RIGHT! (default tabsize is 8)
        'про.....ba'
        >>> utf8_tmp.index('b')
        8
        >>> print "'"+str_tmp.replace(' ','.')+"'" # WRONG STRING LENGTH!
        'про..ba'
        >>> str_tmp.index('b') # WRONG index of 'b' character
        8
        >>> print "'"+a.expandtabs(4).replace(' ','.')+"'" # WRONG RESULT!
        'про..ba'
        >>> Utf8(a).expandtabs(4).replace(' ','.') # RIGHT!
        'про.ba'
        >>> s.find('Є')
        6
        >>> s.find(u'Є')
        6
        >>> s.find(' ', 6)
        7
        >>> s.rfind(' ')
        7
        >>> s.partition('Є')
        ('ПРоба ', 'Є', ' PRobe')
        >>> s.partition(u'Є')
        ('ПРоба ', 'Є', ' PRobe')
        >>> (a,b,c) = s.partition('Є')
        >>> type(a), type(b), type(c)
        (<class '__main__.Utf8'>, <class '__main__.Utf8'>, <class '__main__.Utf8'>)
        >>> s.partition(' ')
        ('ПРоба', ' ', 'Є PRobe')
        >>> s.rpartition(' ')
        ('ПРоба Є', ' ', 'PRobe')
        >>> s.index('Є')
        6
        >>> s.rindex(u'Є')
        6
        >>> s.index(' ')
        5
        >>> s.rindex(' ')
        7
        >>> a=Utf8('а б ц д е а б ц д е а\\tб ц д е')
        >>> a.split()
        ['а', 'б', 'ц', 'д', 'е', 'а', 'б', 'ц', 'д', 'е', 'а', 'б', 'ц', 'д', 'е']
        >>> a.rsplit()
        ['а', 'б', 'ц', 'д', 'е', 'а', 'б', 'ц', 'д', 'е', 'а', 'б', 'ц', 'д', 'е']
        >>> a.expandtabs().split('б')
        ['а ', ' ц д е а ', ' ц д е а   ', ' ц д е']
        >>> a.expandtabs().rsplit('б')
        ['а ', ' ц д е а ', ' ц д е а   ', ' ц д е']
        >>> a.expandtabs().split(u'б', 1)
        ['а ', ' ц д е а б ц д е а   б ц д е']
        >>> a.expandtabs().rsplit(u'б', 1)
        ['а б ц д е а б ц д е а   ', ' ц д е']
        >>> a=Utf8("рядок1\\nрядок2\\nрядок3")
        >>> a.splitlines()
        ['рядок1', 'рядок2', 'рядок3']
        >>> a.splitlines(True)
        ['рядок1\\n', 'рядок2\\n', 'рядок3']
        >>> s[6]
        'Є'
        >>> s[0]
        'П'
        >>> s[-1]
        'e'
        >>> s[:10]
        'ПРоба Є PR'
        >>> s[2:-2:2]
        'оаЄPo'
        >>> s[::-1]
        'eboRP Є абоРП'
        >>> s.startswith('ПР')
        True
        >>> s.startswith(('ПР', u'об'),0)
        True
        >>> s.startswith(u'об', 2, 4)
        True
        >>> s.endswith('be')
        True
        >>> s.endswith(('be', 'PR', u'Є'))
        True
        >>> s.endswith('PR', 8, 10)
        True
        >>> s.endswith('Є', -7, -6)
        True
        >>> s.count(' ')
        2
        >>> s.count(' ',6)
        1
        >>> s.count(u'Є')
        1
        >>> s.count('Є', 0, 5)
        0
        >>> Utf8("Parameters: '%(проба)s', %(probe)04d, %(проба2)s") % { u"проба": s,
        ...      "not used": "???", "probe": 2, "проба2": u"ПРоба Probe" }
        "Parameters: 'ПРоба Є PRobe', 0002, ПРоба Probe"
        >>> a=Utf8(u"Параметр: (%s)-(%s)-[%s]")
        >>> a%=(s, s[::-1], 1000)
        >>> a
        'Параметр: (ПРоба Є PRobe)-(eboRP Є абоРП)-[1000]'
        >>> if hasattr(Utf8, 'format'):
        ...     Utf8("Проба <{0}>, {1}, {param1}, {param2}").format(s, u"中文字",
        ...           param1="барабан", param2=1000) == 'Проба <ПРоба Є PRobe>, 中文字, барабан, 1000'
        ... else: # format() method is not used in python with version <2.6:
        ...     print True
        True
        >>> u'Б'<u'Ї' # WRONG ORDER!
        False
        >>> 'Б'<'Ї' # WRONG ORDER!
        False
        >>> Utf8('Б')<'Ї' # RIGHT!
        True
        >>> u'д'>u'ґ' # WRONG ORDER!
        False
        >>> Utf8('д')>Utf8('ґ') # RIGHT!
        True
        >>> u'є'<=u'ж' # WRONG ORDER!
        False
        >>> Utf8('є')<=u'ж' # RIGHT!
        True
        >>> Utf8('є')<=u'є'
        True
        >>> u'Ї'>=u'И' # WRONG ORDER!
        False
        >>> Utf8(u'Ї') >= u'И' # RIGHT
        True
        >>> Utf8('Є') >= 'Є'
        True
        >>> a="яжертиуіопшщїасдфгґхйклчєзьцвбнмюЯЖЕРТИУІОПШЩЇАСДФГҐХЙКЛЧЗЬЦВБНМЮЄ" # str type
        >>> b=u"яжертиуіопшщїасдфгґхйклчєзьцвбнмюЯЖЕРТИУІОПШЩЇАСДФГҐХЙКЛЧЗЬЦВБНМЮЄ" # unicode type
        >>> c=Utf8("яжертиуіопшщїасдфгґхйклчєзьцвбнмюЯЖЕРТИУІОПШЩЇАСДФГҐХЙКЛЧЗЬЦВБНМЮЄ") # utf8 class
        >>> result = "".join(sorted(a))
        >>> result[0:20] # result is not utf8 string, because bytes, not utf8-characters were sorted
        '\\x80\\x81\\x82\\x83\\x84\\x84\\x85\\x86\\x86\\x87\\x87\\x88\\x89\\x8c\\x8e\\x8f\\x90\\x90\\x91\\x91'
        >>> try:
        ...     unicode(result, 'utf-8') # try to convert result (utf-8?) to unicode
        ... except Exception, e:
        ...     print 'Exception:', e
        Exception: 'utf8' codec can't decode byte 0x80 in position 0: unexpected code byte
        >>> try: # FAILED! (working with bytes, not with utf8-charactes)
        ...     "".join( sorted(a, key=sort_key) ) # utf8.sort_key may be used with utf8 or unicode strings only!
        ... except Exception, e:
        ...     print 'Exception:', e
        Exception: 'utf8' codec can't decode byte 0xd1 in position 0: unexpected end of data
        >>> print "".join( sorted(Utf8(a))) # converting *a* to unicode or utf8-string gives us correct result
        аАбБвВгГґҐдДеЕєЄжЖзЗиИіІїЇйЙкКлЛмМнНоОпПрРсСтТуУфФхХцЦчЧшШщЩьЬюЮяЯ
        >>> print u"".join( sorted(b) ) # WRONG ORDER! Default sort key is used
        ЄІЇАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЬЮЯабвгдежзийклмнопрстуфхцчшщьюяєіїҐґ
        >>> print u"".join( sorted(b, key=sort_key) ) # RIGHT ORDER! utf8.sort_key is used
        аАбБвВгГґҐдДеЕєЄжЖзЗиИіІїЇйЙкКлЛмМнНоОпПрРсСтТуУфФхХцЦчЧшШщЩьЬюЮяЯ
        >>> print "".join( sorted(c) ) # RIGHT ORDER! Utf8 "rich comparison" methods are used
        аАбБвВгГґҐдДеЕєЄжЖзЗиИіІїЇйЙкКлЛмМнНоОпПрРсСтТуУфФхХцЦчЧшШщЩьЬюЮяЯ
        >>> print "".join( sorted(c, key=sort_key) ) # RIGHT ORDER! utf8.sort_key is used
        аАбБвВгГґҐдДеЕєЄжЖзЗиИіІїЇйЙкКлЛмМнНоОпПрРсСтТуУфФхХцЦчЧшШщЩьЬюЮяЯ
        >>> Utf8().join(sorted(c.decode(), key=sort_key)) # convert to unicode for better performance
        'аАбБвВгГґҐдДеЕєЄжЖзЗиИіІїЇйЙкКлЛмМнНоОпПрРсСтТуУфФхХцЦчЧшШщЩьЬюЮяЯ'
        >>> for result in sorted(["Іа", "Астро", u"гала", Utf8("Гоша"), "Єва", "шовк", "аякс", "Їжа",
        ...                       "ґанок", Utf8("Дар'я"), "білінг", "веб", u"Жужа", "проба", u"тест",
        ...                       "абетка", "яблуко", "Юляся", "Київ", "лимонад", "ложка", "Матриця",
        ...                      ], key=sort_key):
        ...     print result.ljust(20), type(result)
        абетка         <type 'str'>
        Астро           <type 'str'>
        аякс             <type 'str'>
        білінг         <type 'str'>
        веб               <type 'str'>
        гала                 <type 'unicode'>
        ґанок           <type 'str'>
        Гоша                 <class '__main__.Utf8'>
        Дар'я                <class '__main__.Utf8'>
        Єва               <type 'str'>
        Жужа                 <type 'unicode'>
        Іа                 <type 'str'>
        Їжа               <type 'str'>
        Київ             <type 'str'>
        лимонад       <type 'str'>
        ложка           <type 'str'>
        Матриця       <type 'str'>
        проба           <type 'str'>
        тест                 <type 'unicode'>
        шовк             <type 'str'>
        Юляся           <type 'str'>
        яблуко         <type 'str'>

        >>> a=Utf8("中文字")
        >>> L=list(a)
        >>> L
        ['中', '文', '字']
        >>> a="".join(L)
        >>> print a
        中文字
        >>> type(a)
        <type 'str'>
        >>> a="中文字" # standard str type
        >>> L=list(a)
        >>> L
        ['\\xe4', '\\xb8', '\\xad', '\\xe6', '\\x96', '\\x87', '\\xe5', '\\xad', '\\x97']
        >>> from string import maketrans
        >>> str_tab=maketrans('PRobe','12345')
        >>> unicode_tab={ord(u'П'):ord(u'Ж'),
        ...              ord(u'Р') : u'Ш',
        ...              ord(Utf8('о')) : None, # utf8.ord() is used
        ...              ord('б') : None, # -//-//-
        ...              ord(u'а') : u"中文字",
        ...              ord(u'Є') : Utf8('•').decode(), # only unicode type is supported
        ...             }
        >>> s.translate(unicode_tab).translate(str_tab, deletechars=' ')
        'ЖШ中文字•12345'
        """
        import sys
        # Python 2: switch the interpreter default encoding to UTF-8 for the
        # examples that mix str and unicode values. reload(sys) is called
        # first so that setdefaultencoding() is reachable.
        reload(sys)
        # sys.setdefaultencoding() does not exist on Python 3: skip it there
        # instead of crashing before any test is run.
        if hasattr(sys, 'setdefaultencoding'):
            sys.setdefaultencoding("UTF-8")
        import doctest
        print("DOCTESTS STARTED...")
        doctest.testmod()
        print("DOCTESTS FINISHED")

    doctests()
---|