Context Navigation

← Previous Revision
Next Revision →
Blame
Revision Log

source: OpenRLabs-Git/deploy/rlabs-docker/web2py-rlabs/gluon/utf8.py

main

Last change on this file was 42bd667, checked in by David Fuertes <dfuertes@…>, 4 years ago
Historial Limpio
Property mode set to `100755`
File size: 29.5 KB

Line
1	#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
| This file is part of the web2py Web Framework
| Copyrighted by Massimo Di Pierro <mdipierro@cs.depaul.edu>
| License: LGPLv3 (http://www.gnu.org/licenses/lgpl.html)
| Created by Vladyslav Kozlovskyy (Ukraine) <dbdevelop©gmail.com>
| for Web2py project

Utilities and class for UTF8 strings managing
----------------------------------------------
"""
13	from __future__ import print_function
14	from gluon._compat import builtin as __builtin__, unicodeT, iteritems, to_unicode, to_native, reload
15
16	__all__ = ['Utf8']
17
# Translation tables used by Utf8.__repr__: they map a code point to the
# escaped text that replaces it in the repr() output.
repr_escape_tab = {}
#FIXME PY3
# Control characters 0x01-0x1f default to their \xNN hexadecimal escape.
for i in range(1, 32):
    repr_escape_tab[i] = to_unicode("\\x%02x" % i)
# Control characters with a conventional short escape, plus the backslash.
repr_escape_tab.update({
    7: u'\\a',
    8: u'\\b',
    9: u'\\t',
    10: u'\\n',
    11: u'\\v',
    12: u'\\f',
    13: u'\\r',
    ord('\\'): u'\\\\',
})
# Second table: identical, but single quotes are escaped as well (used when
# the repr is delimited by single quotes).
repr_escape_tab2 = dict(repr_escape_tab)
repr_escape_tab2[ord("'")] = u"\\'"
32
33
def sort_key(s):
    """Return a collation key for a utf-8 or unicode string.

    The Unicode Collation Algorithm (UCA)
    (http://www.unicode.org/reports/tr10/) is used for sorting utf-8 and
    unicode strings and for comparing Utf8 strings.

    Note:
        pyuca is a very memory-hungry module: it loads its whole collation
        table file (~2mb!) into memory. That cost is only worth paying when
        sort_key() is really used, i.e. as the key of a sort() call or when
        Utf8 strings are compared.

        So this is a lazy "sort_key" function which (ONLY ONCE, ON ITS
        FIRST CALL) imports pyuca and rebinds the module-level name
        ``sort_key`` to the real key function. If pyuca cannot be loaded,
        the replacement is a plain lower-cased-text key instead.

    Args:
        s: byte string (decoded as utf-8) or unicode string.

    Returns:
        The key computed by the function that replaced this one.
    """
    global sort_key
    try:
        from gluon.contrib.pyuca import unicode_collator
        collate = unicode_collator.sort_key
    except Exception:
        # Best effort: any failure to load the collator selects the fallback.
        # ``Exception`` (rather than a bare except) lets KeyboardInterrupt
        # and SystemExit propagate.
        collate = None

    def as_text(value):
        # Byte strings are decoded as utf-8; everything else is passed on.
        if isinstance(value, (bytes, str)):
            return to_unicode(value, 'utf-8')
        return value

    if collate is not None:
        def real_sort_key(s):
            return collate(as_text(s))
    else:
        def real_sort_key(s):
            return as_text(s).lower()

    sort_key = real_sort_key
    return sort_key(s)
59
60
def ord(char):
    """Return the unicode code point of a single character.

    ``char`` is SUPPOSED to be one character, given either as a unicode
    string or as its utf-8 encoded form; the utf-8 form is decoded before
    the builtin ord() is applied.
    """
    text = char if isinstance(char, unicodeT) else to_unicode(char, 'utf-8')
    return __builtin__.ord(text)
68
69
def chr(code):
    """Return the character with unicode id ``code`` as a Utf8 string.

    Args:
        code (int): unicode code point.

    Returns:
        (Utf8): the corresponding character.
    """
    # Python 2 builds a text character with unichr(); Python 3 has no
    # unichr() and its builtin chr() already returns text. The builtin is
    # reached through the builtins module because this function shadows
    # the name ``chr`` in this module.
    to_char = getattr(__builtin__, 'unichr', None) or __builtin__.chr
    return Utf8(to_char(code))
73
74
def size(string):
    """Return the length of a utf-8 string in bytes.

    Note:
        For a unicode string, the length of the corresponding utf-8
        string is returned.
    """
    wrapped = Utf8(string)
    return wrapped.__size__()
82
83
def truncate(string, length, dots='...'):
    """Return ``string`` unchanged if it has at most ``length`` characters,
    otherwise cut it and append the ``dots`` suffix so that the result has
    exactly ``length`` characters.

    Args:
        string (str or unicode): text to shorten (byte strings are decoded
            as utf-8)
        length (int): max length of the result, in characters
        dots (str or unicode): suffix appended when the string is cut

    Returns:
        (utf8-str): original or cut string
    """
    text = to_unicode(string, 'utf-8')
    dots = to_unicode(dots, 'utf-8') if isinstance(dots, str) else dots
    if len(text) > length:
        keep = length - len(dots)
        if keep >= 0:
            text = text[:keep] + dots
        else:
            # The suffix alone is longer than the allowed length. A negative
            # slice bound here would keep almost the whole text and return
            # more than ``length`` characters, so only the part of the
            # suffix that fits is returned.
            text = dots[:max(length, 0)]
    # to_native() is used the same way by Utf8.__getitem__ to turn text
    # into the native str type that Utf8 stores.
    return str.__new__(Utf8, to_native(text, 'utf-8'))
100
101
def _utf8_text(value):
    """Decode ``value`` from utf-8 if it is a byte string.

    Unicode text and non-string objects (numbers, None, ...) are returned
    unchanged, so formatting arguments keep their type.
    """
    if isinstance(value, unicodeT):
        return value
    if isinstance(value, (bytes, str)):
        return to_unicode(value, 'utf-8')
    return value


def _utf8_wrap(text):
    """Build a Utf8 instance directly from text, bypassing Utf8.__new__."""
    return str.__new__(Utf8, to_native(text, 'utf-8'))


class Utf8(str):
    """
    Class for utf8 string storing and manipulations

    The base presupposition of this class usage is:
    "ALL strings in the application are either of
    utf-8 or unicode type, even when simple str
    type is used. UTF-8 is only a "packed" version
    of unicode, so Utf-8 and unicode strings are
    interchangeable."

    CAUTION! This class is slower than str/unicode!
    Do NOT use it inside intensive loops. Simply
    decode string(s) to unicode before loop and
    encode it back to utf-8 string(s) after
    intensive calculation.

    Most methods decode the content to unicode, apply the unicode method of
    the same name and wrap the result back into a Utf8 instance, so lengths
    and indexes are counted in characters rather than in bytes.

    You can see the benefit of this class in doctests() below
    """
    def __new__(cls, content='', codepage='utf-8'):
        """Create a Utf8 string.

        Args:
            content: unicode text, a byte string encoded in ``codepage``,
                or any object accepted by str().
            codepage (str): encoding of ``content`` when it is a byte
                string; ignored for unicode text.
        """
        if isinstance(content, unicodeT):
            return str.__new__(cls, to_native(content, 'utf-8'))
        if codepage in ('utf-8', 'utf8') or isinstance(content, cls):
            if isinstance(content, bytes):
                # No-op on Python 2 (bytes is str). On Python 3 it avoids
                # str.__new__ storing the "b'...'" repr of the bytes.
                content = to_native(content, 'utf-8')
            return str.__new__(cls, content)
        return str.__new__(cls, to_native(to_unicode(content, codepage), 'utf-8'))

    def __repr__(self):
        r''' # note that we use raw strings to avoid having to use double back slashes below
        NOTE! This function is a clone of web2py:gluon.languages.utf_repl() function::

        utf8.__repr__() works same as str.repr() when processing ascii string
        >>> repr(Utf8('abc')) == repr(Utf8("abc")) == repr('abc') == repr("abc") == "'abc'"
        True
        >>> repr(Utf8('a"b"c')) == repr('a"b"c') == '\'a"b"c\''
        True
        >>> repr(Utf8("a'b'c")) == repr("a'b'c") == '"a\'b\'c"'
        True
        >>> repr(Utf8('a\'b"c')) == repr('a\'b"c') == repr(Utf8("a'b\"c")) == repr("a'b\"c") == '\'a\\\'b"c\''
        True
        >>> repr(Utf8('a\r\nb')) == repr('a\r\nb') == "'a\\r\\nb'" # Test for \r, \n
        True

        Unlike str.repr(), Utf8.__repr__() remains utf8 content when processing utf8 string::

        >>> repr(Utf8('中文字')) == repr(Utf8("中文字")) == "'中文字'" != repr('中文字')
        True
        >>> repr(Utf8('中"文"字')) == "'中\"文\"字'" != repr('中"文"字')
        True
        >>> repr(Utf8("中'文'字")) == '"中\'文\'字"' != repr("中'文'字")
        True
        >>> repr(Utf8('中\'文"字')) == repr(Utf8("中'文\"字")) == '\'中\\\'文"字\'' != repr('中\'文"字') == repr("中'文\"字")
        True
        >>> repr(Utf8('中\r\n文')) == "'中\\r\\n文'" != repr('中\r\n文') # Test for \r, \n
        True
        '''
        if str.find(self, "'") >= 0 and str.find(self, '"') < 0:  # only single quote exists
            return '"' + to_native(to_unicode(self, 'utf-8').translate(repr_escape_tab), 'utf-8') + '"'
        else:
            return "'" + to_native(to_unicode(self, 'utf-8').translate(repr_escape_tab2), 'utf-8') + "'"

    def __size__(self):
        """ length of utf-8 string in bytes """
        if isinstance(self, bytes):
            # Python 2: the stored str already is the utf-8 byte sequence.
            return str.__len__(self)
        # Python 3: the stored str is text, so measure its utf-8 encoding.
        return len(str.encode(self, 'utf-8'))

    def __contains__(self, other):
        """Return True if ``other`` (utf-8 or unicode) is a substring."""
        return str.__contains__(self, Utf8(other))

    def __getitem__(self, index):
        """Return the character (or slice) at ``index``, counted in characters."""
        return str.__new__(Utf8, to_native(to_unicode(self, 'utf-8')[index], 'utf-8'))

    def __getslice__(self, begin, end):
        """Return the characters in [begin:end] (slice hook used by Python 2)."""
        return str.__new__(Utf8, to_native(to_unicode(self, 'utf-8')[begin:end], 'utf-8'))

    def __add__(self, other):
        """Concatenate with a utf-8 or unicode string; the result is Utf8."""
        if isinstance(other, unicodeT):
            other = to_native(other, 'utf-8')
        return str.__new__(Utf8, str.__add__(self, other))

    def __len__(self):
        """Return the length in characters (not in bytes, see __size__)."""
        return len(to_unicode(self, 'utf-8'))

    def __mul__(self, integer):
        """Repeat the string ``integer`` times; the result is Utf8."""
        return str.__new__(Utf8, str.__mul__(self, integer))

    def __eq__(self, string):
        """Compare with a utf-8 or unicode string by stored content."""
        return str.__eq__(self, Utf8(string))

    def __ne__(self, string):
        """Negated counterpart of __eq__."""
        return str.__ne__(self, Utf8(string))

    def __hash__(self):
        # Python 3 sets __hash__ to None on a class that defines __eq__
        # without __hash__; restore the str hash so instances stay usable
        # as dict keys and set members.
        return str.__hash__(self)

    def capitalize(self):
        """Unicode-aware str.capitalize(); returns Utf8."""
        return _utf8_wrap(to_unicode(self, 'utf-8').capitalize())

    def center(self, length, fillchar=' '):
        """Unicode-aware str.center(); ``length`` is in characters.

        ``fillchar`` may be a utf-8 or unicode character, as for
        ljust()/rjust().
        """
        return _utf8_wrap(to_unicode(self, 'utf-8').center(length, _utf8_text(fillchar)))

    def upper(self):
        """Unicode-aware str.upper(); returns Utf8."""
        return _utf8_wrap(to_unicode(self, 'utf-8').upper())

    def lower(self):
        """Unicode-aware str.lower(); returns Utf8."""
        return _utf8_wrap(to_unicode(self, 'utf-8').lower())

    def title(self):
        """Unicode-aware str.title(); returns Utf8."""
        return _utf8_wrap(to_unicode(self, 'utf-8').title())

    def index(self, string):
        """Return the character index of ``string``; raises ValueError if absent."""
        return to_unicode(self, 'utf-8').index(_utf8_text(string))

    def isalnum(self):
        """Unicode-aware str.isalnum()."""
        return to_unicode(self, 'utf-8').isalnum()

    def isalpha(self):
        """Unicode-aware str.isalpha()."""
        return to_unicode(self, 'utf-8').isalpha()

    def isdigit(self):
        """Unicode-aware str.isdigit()."""
        return to_unicode(self, 'utf-8').isdigit()

    def islower(self):
        """Unicode-aware str.islower()."""
        return to_unicode(self, 'utf-8').islower()

    def isspace(self):
        """Unicode-aware str.isspace()."""
        return to_unicode(self, 'utf-8').isspace()

    def istitle(self):
        """Unicode-aware str.istitle()."""
        return to_unicode(self, 'utf-8').istitle()

    def isupper(self):
        """Unicode-aware str.isupper()."""
        return to_unicode(self, 'utf-8').isupper()

    def zfill(self, length):
        """Unicode-aware str.zfill(); ``length`` is in characters."""
        return _utf8_wrap(to_unicode(self, 'utf-8').zfill(length))

    def join(self, iter):
        """Join the items of ``iter`` with this string as separator.

        A byte string argument is decoded first, so it is joined character
        by character instead of byte by byte.
        """
        return str.__new__(Utf8, str.join(self, [Utf8(c) for c in list(_utf8_text(iter))]))

    def lstrip(self, chars=None):
        """str.lstrip() accepting utf-8 or unicode ``chars``; returns Utf8."""
        return str.__new__(Utf8, str.lstrip(self, None if chars is None else Utf8(chars)))

    def rstrip(self, chars=None):
        """str.rstrip() accepting utf-8 or unicode ``chars``; returns Utf8."""
        return str.__new__(Utf8, str.rstrip(self, None if chars is None else Utf8(chars)))

    def strip(self, chars=None):
        """str.strip() accepting utf-8 or unicode ``chars``; returns Utf8."""
        return str.__new__(Utf8, str.strip(self, None if chars is None else Utf8(chars)))

    def swapcase(self):
        """Unicode-aware str.swapcase(); returns Utf8."""
        return _utf8_wrap(to_unicode(self, 'utf-8').swapcase())

    def count(self, sub, start=0, end=None):
        """Count occurrences of ``sub``; ``start``/``end`` are character indexes."""
        unistr = to_unicode(self, 'utf-8')
        return unistr.count(_utf8_text(sub), start, len(unistr) if end is None else end)

    def decode(self, encoding='utf-8', errors='strict'):
        """Convert the utf-8 string to unicode."""
        if isinstance(self, bytes):
            # Python 2: a real byte-string decode.
            return str.decode(self, encoding, errors)
        # Python 3: the content already is text; return it as a plain str.
        return str.__str__(self)

    def encode(self, encoding, errors='strict'):
        """Re-encode the content into ``encoding``."""
        return to_unicode(self, 'utf-8').encode(encoding, errors)

    def expandtabs(self, tabsize=8):
        """Unicode-aware str.expandtabs(); columns are counted in characters."""
        return _utf8_wrap(to_unicode(self, 'utf-8').expandtabs(tabsize))

    def find(self, sub, start=None, end=None):
        """Return the lowest character index of ``sub``, or -1."""
        return to_unicode(self, 'utf-8').find(_utf8_text(sub), start, end)

    def ljust(self, width, fillchar=' '):
        """Unicode-aware str.ljust(); ``width`` is in characters."""
        return _utf8_wrap(to_unicode(self, 'utf-8').ljust(width, _utf8_text(fillchar)))

    def partition(self, sep):
        """str.partition() accepting utf-8 or unicode ``sep``; returns Utf8 parts."""
        (head, sep, tail) = str.partition(self, Utf8(sep))
        return (str.__new__(Utf8, head),
                str.__new__(Utf8, sep),
                str.__new__(Utf8, tail))

    def replace(self, old, new, count=-1):
        """str.replace() accepting utf-8 or unicode arguments; returns Utf8."""
        return str.__new__(Utf8, str.replace(self, Utf8(old), Utf8(new), count))

    def rfind(self, sub, start=None, end=None):
        """Return the highest character index of ``sub``, or -1."""
        return to_unicode(self, 'utf-8').rfind(_utf8_text(sub), start, end)

    def rindex(self, string):
        """Return the highest character index of ``string``; raises ValueError if absent."""
        return to_unicode(self, 'utf-8').rindex(_utf8_text(string))

    def rjust(self, width, fillchar=' '):
        """Unicode-aware str.rjust(); ``width`` is in characters."""
        return _utf8_wrap(to_unicode(self, 'utf-8').rjust(width, _utf8_text(fillchar)))

    def rpartition(self, sep):
        """str.rpartition() accepting utf-8 or unicode ``sep``; returns Utf8 parts."""
        (head, sep, tail) = str.rpartition(self, Utf8(sep))
        return (str.__new__(Utf8, head),
                str.__new__(Utf8, sep),
                str.__new__(Utf8, tail))

    def rsplit(self, sep=None, maxsplit=-1):
        """str.rsplit() accepting utf-8 or unicode ``sep``; returns a list of Utf8."""
        return [str.__new__(Utf8, part) for part in str.rsplit(self,
                None if sep is None else Utf8(sep), maxsplit)]

    def split(self, sep=None, maxsplit=-1):
        """str.split() accepting utf-8 or unicode ``sep``; returns a list of Utf8."""
        return [str.__new__(Utf8, part) for part in str.split(self,
                None if sep is None else Utf8(sep), maxsplit)]

    def splitlines(self, keepends=False):
        """str.splitlines(); returns a list of Utf8."""
        return [str.__new__(Utf8, part) for part in str.splitlines(self, keepends)]

    def startswith(self, prefix, start=0, end=None):
        """Unicode-aware str.startswith(); ``prefix`` may be a tuple of
        utf-8/unicode strings and ``start``/``end`` are character indexes."""
        unistr = to_unicode(self, 'utf-8')
        if isinstance(prefix, tuple):
            prefix = tuple(_utf8_text(s) for s in prefix)
        else:
            prefix = _utf8_text(prefix)
        return unistr.startswith(prefix, start, len(unistr) if end is None else end)

    def translate(self, table, deletechars=''):
        """Translate characters.

        A dict ``table`` is applied to the decoded text (unicode-style
        translation). Any other table is handed to str.translate() together
        with ``deletechars``; that two-argument form exists on Python 2
        only.
        """
        if isinstance(table, dict):
            return _utf8_wrap(to_unicode(self, 'utf-8').translate(table))
        else:
            return str.__new__(Utf8, str.translate(self, table, deletechars))

    def endswith(self, prefix, start=0, end=None):
        """Unicode-aware str.endswith(); ``prefix`` may be a tuple of
        utf-8/unicode strings and ``start``/``end`` are character indexes."""
        unistr = to_unicode(self, 'utf-8')
        if isinstance(prefix, tuple):
            prefix = tuple(_utf8_text(s) for s in prefix)
        else:
            prefix = _utf8_text(prefix)
        return unistr.endswith(prefix, start, len(unistr) if end is None else end)

    if hasattr(str, 'format'):  # Python 2.5 hasn't got str.format() method
        def format(self, *args, **kwargs):
            """Unicode-aware str.format(); byte-string arguments are decoded
            as utf-8 before formatting. Returns Utf8."""
            args = [_utf8_text(s) for s in args]
            kwargs = dict((_utf8_text(k), _utf8_text(v))
                          for k, v in iteritems(kwargs))
            return _utf8_wrap(to_unicode(self, 'utf-8').format(*args, **kwargs))

    def __mod__(self, right):
        """Unicode-aware ``%`` formatting; byte strings inside ``right``
        (a value, a tuple or a dict) are decoded as utf-8. Returns Utf8."""
        if isinstance(right, tuple):
            right = tuple(_utf8_text(v) for v in right)
        elif isinstance(right, dict):
            right = dict((_utf8_text(k), _utf8_text(v))
                         for k, v in iteritems(right))
        else:
            right = _utf8_text(right)
        return _utf8_wrap(to_unicode(self, 'utf-8').__mod__(right))

    # Ordering comparisons go through sort_key(), so Utf8 strings are
    # ordered by collation key rather than by raw byte value.
    def __ge__(self, string):
        return sort_key(self) >= sort_key(string)

    def __gt__(self, string):
        return sort_key(self) > sort_key(string)

    def __le__(self, string):
        return sort_key(self) <= sort_key(string)

    def __lt__(self, string):
        return sort_key(self) < sort_key(string)
368
369
if __name__ == '__main__':
    # Self-test entry point: the examples in the docstring of doctests() are
    # executed by doctest.testmod() when this file is run as a script.
    # NOTE(review): the examples use Python 2 syntax (print statement,
    # "except Exception, e"), so they can only pass on Python 2.
    # NOTE(review): several expected outputs look whitespace-damaged, e.g.
    # s.center(17) on a 13-character string shows a 15-character result, and
    # the "..." continuation lines carry no indentation — verify the
    # docstring against the upstream web2py file before relying on it.
    def doctests():
        u"""
        doctests:
        >>> test_unicode=u'ПРоба Є PRobe'
        >>> test_unicode_word=u'ПРоба'
        >>> test_number_str='12345'
        >>> test_unicode
        u'\\u041f\\u0420\\u043e\\u0431\\u0430 \\u0404 PRobe'
        >>> print test_unicode
        ПРоба Є PRobe
        >>> test_word=test_unicode_word.encode('utf-8')
        >>> test_str=test_unicode.encode('utf-8')
        >>> s=Utf8(test_str)
        >>> s
        'ПРоба Є PRobe'
        >>> type(s)
        <class '__main__.Utf8'>
        >>> s == test_str
        True
        >>> len(test_str) # wrong length of utf8-string!
        19
        >>> len(test_unicode) # RIGHT!
        13
        >>> len(s) # RIGHT!
        13
        >>> size(test_str) # size of utf-8 string (in bytes) == len(str)
        19
        >>> size(test_unicode) # size of unicode string in bytes (packed to utf-8 string)
        19
        >>> size(s) # size of utf-8 string in bytes
        19
        >>> try: # utf-8 is a multibyte string. Convert it to unicode for use with builtin ord()
        ... __builtin__.ord('б') # ascii string
        ... except Exception, e:
        ... print 'Exception:', e
        Exception: ord() expected a character, but string of length 2 found
        >>> ord('б') # utf8.ord() is used(!!!)
        1073
        >>> ord(u'б') # utf8.ord() is used(!!!)
        1073
        >>> ord(s[3]) # utf8.ord() is used(!!!)
        1073
        >>> chr(ord(s[3])) # utf8.chr() and utf8.chr() is used(!!!)
        'б'
        >>> type(chr(1073)) # utf8.chr() is used(!!!)
        <class '__main__.Utf8'>
        >>> s=Utf8(test_unicode)
        >>> s
        'ПРоба Є PRobe'
        >>> s == test_str
        True
        >>> test_str == s
        True
        >>> s == test_unicode
        True
        >>> test_unicode == s
        True
        >>> print test_str.upper() # only ASCII characters uppered
        ПРоба Є PROBE
        >>> print test_unicode.upper() # unicode gives right result
        ПРОБА Є PROBE
        >>> s.upper() # utf8 class use unicode.upper()
        'ПРОБА Є PROBE'
        >>> type(s.upper())
        <class '__main__.Utf8'>
        >>> s.lower()
        'проба є probe'
        >>> type(s.lower())
        <class '__main__.Utf8'>
        >>> s.capitalize()
        'Проба є probe'
        >>> type(s.capitalize())
        <class '__main__.Utf8'>
        >>> len(s)
        13
        >>> len(test_unicode)
        13
        >>> s+'. Probe is проба'
        'ПРоба Є PRobe. Probe is проба'
        >>> type(s+'. Probe is проба')
        <class '__main__.Utf8'>
        >>> s+u'. Probe is проба'
        'ПРоба Є PRobe. Probe is проба'
        >>> type(s+u'. Probe is проба')
        <class '__main__.Utf8'>
        >>> s+s
        'ПРоба Є PRobeПРоба Є PRobe'
        >>> type(s+s)
        <class '__main__.Utf8'>
        >>> a=s
        >>> a+=s
        >>> a+=test_unicode
        >>> a+=test_str
        >>> a
        'ПРоба Є PRobeПРоба Є PRobeПРоба Є PRobeПРоба Є PRobe'
        >>> type(a)
        <class '__main__.Utf8'>
        >>> s*3
        'ПРоба Є PRobeПРоба Є PRobeПРоба Є PRobe'
        >>> type(s*3)
        <class '__main__.Utf8'>
        >>> a=Utf8("-проба-")
        >>> a*=10
        >>> a
        '-проба--проба--проба--проба--проба--проба--проба--проба--проба--проба-'
        >>> type(a)
        <class '__main__.Utf8'>
        >>> print "'"+test_str.center(17)+"'" # WRONG RESULT!
        'ПРоба Є PRobe'
        >>> s.center(17) # RIGHT!
        ' ПРоба Є PRobe '
        >>> type(s.center(17))
        <class '__main__.Utf8'>
        >>> (test_word+test_number_str).isalnum() # WRONG RESULT! non ASCII chars are detected as non alpha
        False
        >>> Utf8(test_word+test_number_str).isalnum()
        True
        >>> s.isalnum()
        False
        >>> test_word.isalpha() # WRONG RESULT! Non ASCII characters are detected as non alpha
        False
        >>> Utf8(test_word).isalpha() # RIGHT!
        True
        >>> s.lower().islower()
        True
        >>> s.upper().isupper()
        True
        >>> print test_str.zfill(17) # WRONG RESULT!
        ПРоба Є PRobe
        >>> s.zfill(17) # RIGHT!
        '0000ПРоба Є PRobe'
        >>> type(s.zfill(17))
        <class '__main__.Utf8'>
        >>> s.istitle()
        False
        >>> s.title().istitle()
        True
        >>> Utf8('1234').isdigit()
        True
        >>> Utf8(' \t').isspace()
        True
        >>> s.join('•\|•')
        '•ПРоба Є PRobe\|ПРоба Є PRobe•'
        >>> s.join((str('(utf8 тест1)'), unicode('(unicode тест2)','utf-8'), '(ascii test3)'))
        '(utf8 тест1)ПРоба Є PRobe(unicode тест2)ПРоба Є PRobe(ascii test3)'
        >>> type(s)
        <class '__main__.Utf8'>
        >>> s==test_str
        True
        >>> s==test_unicode
        True
        >>> s.swapcase()
        'прОБА є prOBE'
        >>> type(s.swapcase())
        <class '__main__.Utf8'>
        >>> truncate(s, 10)
        'ПРоба Є...'
        >>> truncate(s, 20)
        'ПРоба Є PRobe'
        >>> truncate(s, 10, '•••') # utf-8 string as dots
        'ПРоба Є•••'
        >>> truncate(s, 10, u'®') # you can use unicode string as dots
        'ПРоба Є P®'
        >>> type(truncate(s, 10))
        <class '__main__.Utf8'>
        >>> Utf8(s.encode('koi8-u'), 'koi8-u')
        'ПРоба Є PRobe'
        >>> s.decode() # convert utf-8 string to unicode
        u'\\u041f\\u0420\\u043e\\u0431\\u0430 \\u0404 PRobe'
        >>> a='про\\tba'
        >>> str_tmp=a.expandtabs()
        >>> utf8_tmp=Utf8(a).expandtabs()
        >>> utf8_tmp.replace(' ','.') # RIGHT! (default tabsize is 8)
        'про.....ba'
        >>> utf8_tmp.index('b')
        8
        >>> print "'"+str_tmp.replace(' ','.')+"'" # WRONG STRING LENGTH!
        'про..ba'
        >>> str_tmp.index('b') # WRONG index of 'b' character
        8
        >>> print "'"+a.expandtabs(4).replace(' ','.')+"'" # WRONG RESULT!
        'про..ba'
        >>> Utf8(a).expandtabs(4).replace(' ','.') # RIGHT!
        'про.ba'
        >>> s.find('Є')
        6
        >>> s.find(u'Є')
        6
        >>> s.find(' ', 6)
        7
        >>> s.rfind(' ')
        7
        >>> s.partition('Є')
        ('ПРоба ', 'Є', ' PRobe')
        >>> s.partition(u'Є')
        ('ПРоба ', 'Є', ' PRobe')
        >>> (a,b,c) = s.partition('Є')
        >>> type(a), type(b), type(c)
        (<class '__main__.Utf8'>, <class '__main__.Utf8'>, <class '__main__.Utf8'>)
        >>> s.partition(' ')
        ('ПРоба', ' ', 'Є PRobe')
        >>> s.rpartition(' ')
        ('ПРоба Є', ' ', 'PRobe')
        >>> s.index('Є')
        6
        >>> s.rindex(u'Є')
        6
        >>> s.index(' ')
        5
        >>> s.rindex(' ')
        7
        >>> a=Utf8('а б ц д е а б ц д е а\\tб ц д е')
        >>> a.split()
        ['а', 'б', 'ц', 'д', 'е', 'а', 'б', 'ц', 'д', 'е', 'а', 'б', 'ц', 'д', 'е']
        >>> a.rsplit()
        ['а', 'б', 'ц', 'д', 'е', 'а', 'б', 'ц', 'д', 'е', 'а', 'б', 'ц', 'д', 'е']
        >>> a.expandtabs().split('б')
        ['а ', ' ц д е а ', ' ц д е а ', ' ц д е']
        >>> a.expandtabs().rsplit('б')
        ['а ', ' ц д е а ', ' ц д е а ', ' ц д е']
        >>> a.expandtabs().split(u'б', 1)
        ['а ', ' ц д е а б ц д е а б ц д е']
        >>> a.expandtabs().rsplit(u'б', 1)
        ['а б ц д е а б ц д е а ', ' ц д е']
        >>> a=Utf8("рядок1\\nрядок2\\nрядок3")
        >>> a.splitlines()
        ['рядок1', 'рядок2', 'рядок3']
        >>> a.splitlines(True)
        ['рядок1\\n', 'рядок2\\n', 'рядок3']
        >>> s[6]
        'Є'
        >>> s[0]
        'П'
        >>> s[-1]
        'e'
        >>> s[:10]
        'ПРоба Є PR'
        >>> s[2:-2:2]
        'оаЄPo'
        >>> s[::-1]
        'eboRP Є абоРП'
        >>> s.startswith('ПР')
        True
        >>> s.startswith(('ПР', u'об'),0)
        True
        >>> s.startswith(u'об', 2, 4)
        True
        >>> s.endswith('be')
        True
        >>> s.endswith(('be', 'PR', u'Є'))
        True
        >>> s.endswith('PR', 8, 10)
        True
        >>> s.endswith('Є', -7, -6)
        True
        >>> s.count(' ')
        2
        >>> s.count(' ',6)
        1
        >>> s.count(u'Є')
        1
        >>> s.count('Є', 0, 5)
        0
        >>> Utf8("Parameters: '%(проба)s', %(probe)04d, %(проба2)s") % { u"проба": s,
        ... "not used": "???", "probe": 2, "проба2": u"ПРоба Probe" }
        "Parameters: 'ПРоба Є PRobe', 0002, ПРоба Probe"
        >>> a=Utf8(u"Параметр: (%s)-(%s)-[%s]")
        >>> a%=(s, s[::-1], 1000)
        >>> a
        'Параметр: (ПРоба Є PRobe)-(eboRP Є абоРП)-[1000]'
        >>> if hasattr(Utf8, 'format'):
        ... Utf8("Проба <{0}>, {1}, {param1}, {param2}").format(s, u"中文字",
        ... param1="барабан", param2=1000) == 'Проба <ПРоба Є PRobe>, 中文字, барабан, 1000'
        ... else: # format() method is not used in python with version <2.6:
        ... print True
        True
        >>> u'Б'<u'Ї' # WRONG ORDER!
        False
        >>> 'Б'<'Ї' # WRONG ORDER!
        False
        >>> Utf8('Б')<'Ї' # RIGHT!
        True
        >>> u'д'>u'ґ' # WRONG ORDER!
        False
        >>> Utf8('д')>Utf8('ґ') # RIGHT!
        True
        >>> u'є'<=u'ж' # WRONG ORDER!
        False
        >>> Utf8('є')<=u'ж' # RIGHT!
        True
        >>> Utf8('є')<=u'є'
        True
        >>> u'Ї'>=u'И' # WRONG ORDER!
        False
        >>> Utf8(u'Ї') >= u'И' # RIGHT
        True
        >>> Utf8('Є') >= 'Є'
        True
        >>> a="яжертиуіопшщїасдфгґхйклчєзьцвбнмюЯЖЕРТИУІОПШЩЇАСДФГҐХЙКЛЧЗЬЦВБНМЮЄ" # str type
        >>> b=u"яжертиуіопшщїасдфгґхйклчєзьцвбнмюЯЖЕРТИУІОПШЩЇАСДФГҐХЙКЛЧЗЬЦВБНМЮЄ" # unicode type
        >>> c=Utf8("яжертиуіопшщїасдфгґхйклчєзьцвбнмюЯЖЕРТИУІОПШЩЇАСДФГҐХЙКЛЧЗЬЦВБНМЮЄ") # utf8 class
        >>> result = "".join(sorted(a))
        >>> result[0:20] # result is not utf8 string, because bytes, not utf8-characters were sorted
        '\\x80\\x81\\x82\\x83\\x84\\x84\\x85\\x86\\x86\\x87\\x87\\x88\\x89\\x8c\\x8e\\x8f\\x90\\x90\\x91\\x91'
        >>> try:
        ... unicode(result, 'utf-8') # try to convert result (utf-8?) to unicode
        ... except Exception, e:
        ... print 'Exception:', e
        Exception: 'utf8' codec can't decode byte 0x80 in position 0: unexpected code byte
        >>> try: # FAILED! (working with bytes, not with utf8-charactes)
        ... "".join( sorted(a, key=sort_key) ) # utf8.sort_key may be used with utf8 or unicode strings only!
        ... except Exception, e:
        ... print 'Exception:', e
        Exception: 'utf8' codec can't decode byte 0xd1 in position 0: unexpected end of data
        >>> print "".join( sorted(Utf8(a))) # converting a to unicode or utf8-string gives us correct result
        аАбБвВгГґҐдДеЕєЄжЖзЗиИіІїЇйЙкКлЛмМнНоОпПрРсСтТуУфФхХцЦчЧшШщЩьЬюЮяЯ
        >>> print u"".join( sorted(b) ) # WRONG ORDER! Default sort key is used
        ЄІЇАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЬЮЯабвгдежзийклмнопрстуфхцчшщьюяєіїҐґ
        >>> print u"".join( sorted(b, key=sort_key) ) # RIGHT ORDER! utf8.sort_key is used
        аАбБвВгГґҐдДеЕєЄжЖзЗиИіІїЇйЙкКлЛмМнНоОпПрРсСтТуУфФхХцЦчЧшШщЩьЬюЮяЯ
        >>> print "".join( sorted(c) ) # RIGHT ORDER! Utf8 "rich comparison" methods are used
        аАбБвВгГґҐдДеЕєЄжЖзЗиИіІїЇйЙкКлЛмМнНоОпПрРсСтТуУфФхХцЦчЧшШщЩьЬюЮяЯ
        >>> print "".join( sorted(c, key=sort_key) ) # RIGHT ORDER! utf8.sort_key is used
        аАбБвВгГґҐдДеЕєЄжЖзЗиИіІїЇйЙкКлЛмМнНоОпПрРсСтТуУфФхХцЦчЧшШщЩьЬюЮяЯ
        >>> Utf8().join(sorted(c.decode(), key=sort_key)) # convert to unicode for better performance
        'аАбБвВгГґҐдДеЕєЄжЖзЗиИіІїЇйЙкКлЛмМнНоОпПрРсСтТуУфФхХцЦчЧшШщЩьЬюЮяЯ'
        >>> for result in sorted(["Іа", "Астро", u"гала", Utf8("Гоша"), "Єва", "шовк", "аякс", "Їжа",
        ... "ґанок", Utf8("Дар'я"), "білінг", "веб", u"Жужа", "проба", u"тест",
        ... "абетка", "яблуко", "Юляся", "Київ", "лимонад", "ложка", "Матриця",
        ... ], key=sort_key):
        ... print result.ljust(20), type(result)
        абетка <type 'str'>
        Астро <type 'str'>
        аякс <type 'str'>
        білінг <type 'str'>
        веб <type 'str'>
        гала <type 'unicode'>
        ґанок <type 'str'>
        Гоша <class '__main__.Utf8'>
        Дар'я <class '__main__.Utf8'>
        Єва <type 'str'>
        Жужа <type 'unicode'>
        Іа <type 'str'>
        Їжа <type 'str'>
        Київ <type 'str'>
        лимонад <type 'str'>
        ложка <type 'str'>
        Матриця <type 'str'>
        проба <type 'str'>
        тест <type 'unicode'>
        шовк <type 'str'>
        Юляся <type 'str'>
        яблуко <type 'str'>

        >>> a=Utf8("中文字")
        >>> L=list(a)
        >>> L
        ['中', '文', '字']
        >>> a="".join(L)
        >>> print a
        中文字
        >>> type(a)
        <type 'str'>
        >>> a="中文字" # standard str type
        >>> L=list(a)
        >>> L
        ['\\xe4', '\\xb8', '\\xad', '\\xe6', '\\x96', '\\x87', '\\xe5', '\\xad', '\\x97']
        >>> from string import maketrans
        >>> str_tab=maketrans('PRobe','12345')
        >>> unicode_tab={ord(u'П'):ord(u'Ж'),
        ... ord(u'Р') : u'Ш',
        ... ord(Utf8('о')) : None, # utf8.ord() is used
        ... ord('б') : None, # -//-//-
        ... ord(u'а') : u"中文字",
        ... ord(u'Є') : Utf8('•').decode(), # only unicode type is supported
        ... }
        >>> s.translate(unicode_tab).translate(str_tab, deletechars=' ')
        'ЖШ中文字•12345'
        """
        import sys
        # reload(sys) comes right before setdefaultencoding(); presumably it
        # restores that attribute, which Python 2 removes from sys at
        # start-up — TODO confirm. The call makes UTF-8 the default
        # encoding while the examples run.
        reload(sys)
        sys.setdefaultencoding("UTF-8")
        import doctest
        print("DOCTESTS STARTED...")
        doctest.testmod()
        print("DOCTESTS FINISHED")

    doctests()

Note: See TracBrowser for help on using the repository browser.

Download in other formats: