1
# cython: language_level=3
3
""" Test script for the Unicode implementation.
5
Written by Marc-Andre Lemburg (mal@lemburg.com).
7
(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
18
# from test import support, string_tests
19
from contextlib import contextmanager
25
return unittest.skip("Ignoring CPython-only test")(func)
27
def run_with_locale(*args):
28
return support._ignore
30
cpython_only = _ignore
32
def check_free_after_iterating(*args):
36
def check_warnings(*args):
37
yield # ignore any warnings
41
include "test_unicode_string_tests.pxi"
44
############### ORIGINAL TESTS START HERE #################
47
# Error handling (bad decoder return)
48
def search_function(encoding):
    """Codec search hook that hands out deliberately malformed codecs.

    For the name "test.unicode1" the returned encoder/decoder give back a
    bare integer instead of a tuple; for "test.unicode2" they give back a
    tuple whose first item is not a string.  Any other name yields None.
    """
    # Codec pair for "test.unicode1".
    def encode1(input, errors="strict"):
        return 42 # not a tuple

    def decode1(input, errors="strict"):
        return 42 # not a tuple

    # Codec pair for "test.unicode2".
    def encode2(input, errors="strict"):
        return (42, 42) # no unicode

    def decode2(input, errors="strict"):
        return (42, 42) # no unicode

    if encoding == "test.unicode1":
        return (encode1, decode1, None, None)
    if encoding == "test.unicode2":
        return (encode2, decode2, None, None)
    return None
63
codecs.register(search_function)
65
def duplicate_string(text):
    """
    Best-effort attempt to return a fresh copy of *text*, i.e. a new
    object with a reference count of 1.

    Single latin1 letters and the empty string ('') are singletons, so
    they cannot be cloned.
    """
    # Round-trip through bytes (default codec) to build a new str object.
    encoded = text.encode()
    return encoded.decode()
75
class StrSubclass(str):
78
class UnicodeTest(CommonTest,
79
MixinStrUnicodeUserStringTest,
85
def checkequalnofix(self, result, object, methodname, *args):
86
method = getattr(object, methodname)
87
realresult = method(*args)
88
self.assertEqual(realresult, result)
89
self.assertTrue(type(realresult) is type(result))
91
# if the original is returned make sure that
92
# this doesn't happen with subclasses
93
if realresult is object:
96
return 'usub(%r)' % str.__repr__(self)
98
method = getattr(object, methodname)
99
realresult = method(*args)
100
self.assertEqual(realresult, result)
101
self.assertTrue(object is not realresult)
103
def test_literals(self):
104
self.assertEqual('\xff', '\u00ff')
105
self.assertEqual('\uffff', '\U0000ffff')
106
self.assertRaises(SyntaxError, eval, '\'\\Ufffffffe\'')
107
self.assertRaises(SyntaxError, eval, '\'\\Uffffffff\'')
108
self.assertRaises(SyntaxError, eval, '\'\\U%08x\'' % 0x110000)
109
# raw strings should not have unicode escapes
110
self.assertNotEqual(r"\u0020", " ")
112
def test_ascii(self):
113
if not sys.platform.startswith('java'):
114
# Test basic sanity of repr()
115
self.assertEqual(ascii('abc'), "'abc'")
116
self.assertEqual(ascii('ab\\c'), "'ab\\\\c'")
117
self.assertEqual(ascii('ab\\'), "'ab\\\\'")
118
self.assertEqual(ascii('\\c'), "'\\\\c'")
119
self.assertEqual(ascii('\\'), "'\\\\'")
120
self.assertEqual(ascii('\n'), "'\\n'")
121
self.assertEqual(ascii('\r'), "'\\r'")
122
self.assertEqual(ascii('\t'), "'\\t'")
123
self.assertEqual(ascii('\b'), "'\\x08'")
124
self.assertEqual(ascii("'\""), """'\\'"'""")
125
self.assertEqual(ascii("'\""), """'\\'"'""")
126
self.assertEqual(ascii("'"), '''"'"''')
127
self.assertEqual(ascii('"'), """'"'""")
129
"'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r"
130
"\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a"
131
"\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI"
132
"JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f"
133
"\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d"
134
"\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b"
135
"\\x9c\\x9d\\x9e\\x9f\\xa0\\xa1\\xa2\\xa3\\xa4\\xa5\\xa6\\xa7\\xa8\\xa9"
136
"\\xaa\\xab\\xac\\xad\\xae\\xaf\\xb0\\xb1\\xb2\\xb3\\xb4\\xb5\\xb6\\xb7"
137
"\\xb8\\xb9\\xba\\xbb\\xbc\\xbd\\xbe\\xbf\\xc0\\xc1\\xc2\\xc3\\xc4\\xc5"
138
"\\xc6\\xc7\\xc8\\xc9\\xca\\xcb\\xcc\\xcd\\xce\\xcf\\xd0\\xd1\\xd2\\xd3"
139
"\\xd4\\xd5\\xd6\\xd7\\xd8\\xd9\\xda\\xdb\\xdc\\xdd\\xde\\xdf\\xe0\\xe1"
140
"\\xe2\\xe3\\xe4\\xe5\\xe6\\xe7\\xe8\\xe9\\xea\\xeb\\xec\\xed\\xee\\xef"
141
"\\xf0\\xf1\\xf2\\xf3\\xf4\\xf5\\xf6\\xf7\\xf8\\xf9\\xfa\\xfb\\xfc\\xfd"
143
testrepr = ascii(''.join(map(chr, range(256))))
144
self.assertEqual(testrepr, latin1repr)
145
# Test ascii works on wide unicode escapes without overflow.
146
self.assertEqual(ascii("\U00010000" * 39 + "\uffff" * 4096),
147
ascii("\U00010000" * 39 + "\uffff" * 4096))
152
self.assertRaises(TypeError, ascii, WrongRepr())
155
if not sys.platform.startswith('java'):
156
# Test basic sanity of repr()
157
self.assertEqual(repr('abc'), "'abc'")
158
self.assertEqual(repr('ab\\c'), "'ab\\\\c'")
159
self.assertEqual(repr('ab\\'), "'ab\\\\'")
160
self.assertEqual(repr('\\c'), "'\\\\c'")
161
self.assertEqual(repr('\\'), "'\\\\'")
162
self.assertEqual(repr('\n'), "'\\n'")
163
self.assertEqual(repr('\r'), "'\\r'")
164
self.assertEqual(repr('\t'), "'\\t'")
165
self.assertEqual(repr('\b'), "'\\x08'")
166
self.assertEqual(repr("'\""), """'\\'"'""")
167
self.assertEqual(repr("'\""), """'\\'"'""")
168
self.assertEqual(repr("'"), '''"'"''')
169
self.assertEqual(repr('"'), """'"'""")
171
"'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r"
172
"\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a"
173
"\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI"
174
"JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f"
175
"\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d"
176
"\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b"
177
"\\x9c\\x9d\\x9e\\x9f\\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9"
178
"\xaa\xab\xac\\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7"
179
"\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5"
180
"\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3"
181
"\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1"
182
"\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef"
183
"\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd"
185
testrepr = repr(''.join(map(chr, range(256))))
186
self.assertEqual(testrepr, latin1repr)
187
# Test repr works on wide unicode escapes without overflow.
188
self.assertEqual(repr("\U00010000" * 39 + "\uffff" * 4096),
189
repr("\U00010000" * 39 + "\uffff" * 4096))
194
self.assertRaises(TypeError, repr, WrongRepr())
196
def test_iterators(self):
197
# Make sure unicode objects have an __iter__ method
198
it = "\u1111\u2222\u3333".__iter__()
199
self.assertEqual(next(it), "\u1111")
200
self.assertEqual(next(it), "\u2222")
201
self.assertEqual(next(it), "\u3333")
202
self.assertRaises(StopIteration, next, it)
204
def test_count(self):
    # Run the shared count() checks first, then the str-specific ones.
    CommonTest.test_count(self)
    # check mixed argument types
    typed_cases = (
        (3, ('a',)),
        (0, ('b',)),
        (3, ('a',)),
        (0, ('b',)),
        (0, ('b',)),
        (1, ('a', -1)),
        (3, ('a', -10)),
        (2, ('a', 0, -1)),
        (0, ('a', 0, -10)),
    )
    for expected, args in typed_cases:
        self.checkequalnofix(expected, 'aaa', 'count', *args)

    # Haystack/needle pairs that combine ASCII, BMP and non-BMP characters.
    mixed_cases = (
        (10, '\u0102' + 'a' * 10, 'a'),
        (10, '\U00100304' + 'a' * 10, 'a'),
        (10, '\U00100304' + '\u0102' * 10, '\u0102'),
        (0, 'a' * 10, '\u0102'),
        (0, 'a' * 10, '\U00100304'),
        (0, '\u0102' * 10, '\U00100304'),
        (10, '\u0102' + 'a_' * 10, 'a_'),
        (10, '\U00100304' + 'a_' * 10, 'a_'),
        (10, '\U00100304' + '\u0102_' * 10, '\u0102_'),
        (0, 'a' * 10, 'a\u0102'),
        (0, 'a' * 10, 'a\U00100304'),
        (0, '\u0102' * 10, '\u0102\U00100304'),
    )
    for expected, haystack, needle in mixed_cases:
        self.checkequal(expected, haystack, 'count', needle)
231
CommonTest.test_find(self)
232
# test implementation details of the memchr fast path
233
self.checkequal(100, 'a' * 100 + '\u0102', 'find', '\u0102')
234
self.checkequal(-1, 'a' * 100 + '\u0102', 'find', '\u0201')
235
self.checkequal(-1, 'a' * 100 + '\u0102', 'find', '\u0120')
236
self.checkequal(-1, 'a' * 100 + '\u0102', 'find', '\u0220')
237
self.checkequal(100, 'a' * 100 + '\U00100304', 'find', '\U00100304')
238
self.checkequal(-1, 'a' * 100 + '\U00100304', 'find', '\U00100204')
239
self.checkequal(-1, 'a' * 100 + '\U00100304', 'find', '\U00102004')
240
# check mixed argument types
241
self.checkequalnofix(0, 'abcdefghiabc', 'find', 'abc')
242
self.checkequalnofix(9, 'abcdefghiabc', 'find', 'abc', 1)
243
self.checkequalnofix(-1, 'abcdefghiabc', 'find', 'def', 4)
245
self.assertRaises(TypeError, 'hello'.find)
246
self.assertRaises(TypeError, 'hello'.find, 42)
248
self.checkequal(100, '\u0102' * 100 + 'a', 'find', 'a')
249
self.checkequal(100, '\U00100304' * 100 + 'a', 'find', 'a')
250
self.checkequal(100, '\U00100304' * 100 + '\u0102', 'find', '\u0102')
251
self.checkequal(-1, 'a' * 100, 'find', '\u0102')
252
self.checkequal(-1, 'a' * 100, 'find', '\U00100304')
253
self.checkequal(-1, '\u0102' * 100, 'find', '\U00100304')
254
self.checkequal(100, '\u0102' * 100 + 'a_', 'find', 'a_')
255
self.checkequal(100, '\U00100304' * 100 + 'a_', 'find', 'a_')
256
self.checkequal(100, '\U00100304' * 100 + '\u0102_', 'find', '\u0102_')
257
self.checkequal(-1, 'a' * 100, 'find', 'a\u0102')
258
self.checkequal(-1, 'a' * 100, 'find', 'a\U00100304')
259
self.checkequal(-1, '\u0102' * 100, 'find', '\u0102\U00100304')
261
def test_rfind(self):
    # Run the shared rfind() checks first, then the str-specific ones.
    CommonTest.test_rfind(self)
    # test implementation details of the memrchr fast path
    fast_path_cases = (
        (0, '\u0102' + 'a' * 100, '\u0102'),
        (-1, '\u0102' + 'a' * 100, '\u0201'),
        (-1, '\u0102' + 'a' * 100, '\u0120'),
        (-1, '\u0102' + 'a' * 100, '\u0220'),
        (0, '\U00100304' + 'a' * 100, '\U00100304'),
        (-1, '\U00100304' + 'a' * 100, '\U00100204'),
        (-1, '\U00100304' + 'a' * 100, '\U00102004'),
    )
    for expected, haystack, needle in fast_path_cases:
        self.checkequal(expected, haystack, 'rfind', needle)

    # check mixed argument types
    for expected, needle in ((9, 'abc'), (12, ''), (12, '')):
        self.checkequalnofix(expected, 'abcdefghiabc', 'rfind', needle)

    # Haystack/needle pairs that combine ASCII, BMP and non-BMP characters.
    mixed_cases = (
        (0, 'a' + '\u0102' * 100, 'a'),
        (0, 'a' + '\U00100304' * 100, 'a'),
        (0, '\u0102' + '\U00100304' * 100, '\u0102'),
        (-1, 'a' * 100, '\u0102'),
        (-1, 'a' * 100, '\U00100304'),
        (-1, '\u0102' * 100, '\U00100304'),
        (0, '_a' + '\u0102' * 100, '_a'),
        (0, '_a' + '\U00100304' * 100, '_a'),
        (0, '_\u0102' + '\U00100304' * 100, '_\u0102'),
        (-1, 'a' * 100, '\u0102a'),
        (-1, 'a' * 100, '\U00100304a'),
        (-1, '\u0102' * 100, '\U00100304\u0102'),
    )
    for expected, haystack, needle in mixed_cases:
        self.checkequal(expected, haystack, 'rfind', needle)
289
def test_index(self):
    # Run the shared index() checks first, then the str-specific ones.
    CommonTest.test_index(self)
    for expected, args in ((0, ('',)),
                           (3, ('def',)),
                           (0, ('abc',)),
                           (9, ('abc', 1))):
        self.checkequalnofix(expected, 'abcdefghiabc', 'index', *args)

    # Lookups that must fail with ValueError.
    for text, args in (('abcdefghiabc', ('hib',)),
                       ('abcdefghiab', ('abc', 1)),
                       ('abcdefghi', ('ghi', 8)),
                       ('abcdefghi', ('ghi', -1))):
        with self.assertRaises(ValueError):
            text.index(*args)

    # Single-character needles across ASCII, BMP and non-BMP text.
    for haystack, needle in (('\u0102' * 100 + 'a', 'a'),
                             ('\U00100304' * 100 + 'a', 'a'),
                             ('\U00100304' * 100 + '\u0102', '\u0102')):
        self.checkequal(100, haystack, 'index', needle)
    for haystack, needle in (('a' * 100, '\u0102'),
                             ('a' * 100, '\U00100304'),
                             ('\u0102' * 100, '\U00100304')):
        with self.assertRaises(ValueError):
            haystack.index(needle)

    # Two-character needles across ASCII, BMP and non-BMP text.
    for haystack, needle in (('\u0102' * 100 + 'a_', 'a_'),
                             ('\U00100304' * 100 + 'a_', 'a_'),
                             ('\U00100304' * 100 + '\u0102_', '\u0102_')):
        self.checkequal(100, haystack, 'index', needle)
    for haystack, needle in (('a' * 100, 'a\u0102'),
                             ('a' * 100, 'a\U00100304'),
                             ('\u0102' * 100, '\u0102\U00100304')):
        with self.assertRaises(ValueError):
            haystack.index(needle)
313
def test_rindex(self):
    # Run the shared rindex() checks first, then the str-specific ones.
    CommonTest.test_rindex(self)
    for expected, args in ((12, ('',)),
                           (3, ('def',)),
                           (9, ('abc',)),
                           (0, ('abc', 0, -1))):
        self.checkequalnofix(expected, 'abcdefghiabc', 'rindex', *args)

    # Lookups that must fail with ValueError.
    for text, args in (('abcdefghiabc', ('hib',)),
                       ('defghiabc', ('def', 1)),
                       ('defghiabc', ('abc', 0, -1)),
                       ('abcdefghi', ('ghi', 0, 8)),
                       ('abcdefghi', ('ghi', 0, -1))):
        with self.assertRaises(ValueError):
            text.rindex(*args)

    # Single-character needles across ASCII, BMP and non-BMP text.
    for haystack, needle in (('a' + '\u0102' * 100, 'a'),
                             ('a' + '\U00100304' * 100, 'a'),
                             ('\u0102' + '\U00100304' * 100, '\u0102')):
        self.checkequal(0, haystack, 'rindex', needle)
    for haystack, needle in (('a' * 100, '\u0102'),
                             ('a' * 100, '\U00100304'),
                             ('\u0102' * 100, '\U00100304')):
        with self.assertRaises(ValueError):
            haystack.rindex(needle)

    # Two-character needles across ASCII, BMP and non-BMP text.
    for haystack, needle in (('_a' + '\u0102' * 100, '_a'),
                             ('_a' + '\U00100304' * 100, '_a'),
                             ('_\u0102' + '\U00100304' * 100, '_\u0102')):
        self.checkequal(0, haystack, 'rindex', needle)
    for haystack, needle in (('a' * 100, '\u0102a'),
                             ('a' * 100, '\U00100304a'),
                             ('\u0102' * 100, '\U00100304\u0102')):
        with self.assertRaises(ValueError):
            haystack.rindex(needle)
339
def test_maketrans_translate(self):
340
# these work with plain translate()
341
self.checkequalnofix('bbbc', 'abababc', 'translate',
343
self.checkequalnofix('iiic', 'abababc', 'translate',
344
{ord('a'): None, ord('b'): ord('i')})
345
self.checkequalnofix('iiix', 'abababc', 'translate',
346
{ord('a'): None, ord('b'): ord('i'), ord('c'): 'x'})
347
self.checkequalnofix('c', 'abababc', 'translate',
348
{ord('a'): None, ord('b'): ''})
349
self.checkequalnofix('xyyx', 'xzx', 'translate',
352
# this needs maketrans()
353
self.checkequalnofix('abababc', 'abababc', 'translate',
355
tbl = self.type2test.maketrans({'a': None, 'b': '<i>'})
356
self.checkequalnofix('<i><i><i>c', 'abababc', 'translate', tbl)
357
# test alternative way of calling maketrans()
358
tbl = self.type2test.maketrans('abc', 'xyz', 'd')
359
self.checkequalnofix('xyzzy', 'abdcdcbdddd', 'translate', tbl)
361
# various tests switching from ASCII to latin1 or the opposite;
362
# same length, remove a letter, or replace with a longer string.
363
self.assertEqual("[a]".translate(str.maketrans('a', 'X')),
365
self.assertEqual("[a]".translate(str.maketrans({'a': 'X'})),
367
self.assertEqual("[a]".translate(str.maketrans({'a': None})),
369
self.assertEqual("[a]".translate(str.maketrans({'a': 'XXX'})),
371
self.assertEqual("[a]".translate(str.maketrans({'a': '\xe9'})),
373
self.assertEqual('axb'.translate(str.maketrans({'a': None, 'b': '123'})),
375
self.assertEqual('axb'.translate(str.maketrans({'a': None, 'b': '\xe9'})),
378
# test non-ASCII (don't take the fast-path)
379
self.assertEqual("[a]".translate(str.maketrans({'a': '<\xe9>'})),
381
self.assertEqual("[\xe9]".translate(str.maketrans({'\xe9': 'a'})),
383
self.assertEqual("[\xe9]".translate(str.maketrans({'\xe9': None})),
385
self.assertEqual("[\xe9]".translate(str.maketrans({'\xe9': '123'})),
387
self.assertEqual("[a\xe9]".translate(str.maketrans({'a': '<\u20ac>'})),
390
# invalid Unicode characters
391
invalid_char = 0x10ffff+1
392
for before in "a\xe9\u20ac\U0010ffff":
393
mapping = str.maketrans({before: invalid_char})
394
text = "[%s]" % before
395
self.assertRaises(ValueError, text.translate, mapping)
398
self.assertRaises(TypeError, self.type2test.maketrans)
399
self.assertRaises(ValueError, self.type2test.maketrans, 'abc', 'defg')
400
self.assertRaises(TypeError, self.type2test.maketrans, 2, 'def')
401
self.assertRaises(TypeError, self.type2test.maketrans, 'abc', 2)
402
self.assertRaises(TypeError, self.type2test.maketrans, 'abc', 'def', 2)
403
self.assertRaises(ValueError, self.type2test.maketrans, {'xy': 2})
404
self.assertRaises(TypeError, self.type2test.maketrans, {(1,): 2})
406
self.assertRaises(TypeError, 'hello'.translate)
407
self.assertRaises(TypeError, 'abababc'.translate, 'abc', 'xyz')
409
def test_split(self):
410
CommonTest.test_split(self)
413
for left, right in ('ba', '\u0101\u0100', '\U00010301\U00010300'):
416
for delim in ('c', '\u0102', '\U00010302'):
417
self.checkequal([left + right],
418
left + right, 'split', delim)
419
self.checkequal([left, right],
420
left + delim + right, 'split', delim)
421
self.checkequal([left + right],
422
left + right, 'split', delim * 2)
423
self.checkequal([left, right],
424
left + delim * 2 + right, 'split', delim *2)
426
def test_rsplit(self):
427
CommonTest.test_rsplit(self)
429
for left, right in ('ba', '\u0101\u0100', '\U00010301\U00010300'):
432
for delim in ('c', '\u0102', '\U00010302'):
433
self.checkequal([left + right],
434
left + right, 'rsplit', delim)
435
self.checkequal([left, right],
436
left + delim + right, 'rsplit', delim)
437
self.checkequal([left + right],
438
left + right, 'rsplit', delim * 2)
439
self.checkequal([left, right],
440
left + delim * 2 + right, 'rsplit', delim *2)
442
def test_partition(self):
443
MixinStrUnicodeUserStringTest.test_partition(self)
445
self.checkequal(('ABCDEFGH', '', ''), 'ABCDEFGH', 'partition', '\u4200')
446
for left, right in ('ba', '\u0101\u0100', '\U00010301\U00010300'):
449
for delim in ('c', '\u0102', '\U00010302'):
450
self.checkequal((left + right, '', ''),
451
left + right, 'partition', delim)
452
self.checkequal((left, delim, right),
453
left + delim + right, 'partition', delim)
454
self.checkequal((left + right, '', ''),
455
left + right, 'partition', delim * 2)
456
self.checkequal((left, delim * 2, right),
457
left + delim * 2 + right, 'partition', delim * 2)
459
def test_rpartition(self):
460
MixinStrUnicodeUserStringTest.test_rpartition(self)
462
self.checkequal(('', '', 'ABCDEFGH'), 'ABCDEFGH', 'rpartition', '\u4200')
463
for left, right in ('ba', '\u0101\u0100', '\U00010301\U00010300'):
466
for delim in ('c', '\u0102', '\U00010302'):
467
self.checkequal(('', '', left + right),
468
left + right, 'rpartition', delim)
469
self.checkequal((left, delim, right),
470
left + delim + right, 'rpartition', delim)
471
self.checkequal(('', '', left + right),
472
left + right, 'rpartition', delim * 2)
473
self.checkequal((left, delim * 2, right),
474
left + delim * 2 + right, 'rpartition', delim * 2)
477
MixinStrUnicodeUserStringTest.test_join(self)
480
def __init__(self, sval): self.sval = sval
481
def __str__(self): return self.sval
484
self.checkequalnofix('a b c d', ' ', 'join', ['a', 'b', 'c', 'd'])
485
self.checkequalnofix('abcd', '', 'join', ('a', 'b', 'c', 'd'))
486
self.checkequalnofix('w x y z', ' ', 'join', Sequence('wxyz'))
487
self.checkequalnofix('a b c d', ' ', 'join', ['a', 'b', 'c', 'd'])
488
self.checkequalnofix('a b c d', ' ', 'join', ['a', 'b', 'c', 'd'])
489
self.checkequalnofix('abcd', '', 'join', ('a', 'b', 'c', 'd'))
490
self.checkequalnofix('w x y z', ' ', 'join', Sequence('wxyz'))
491
self.checkraises(TypeError, ' ', 'join', ['1', '2', MyWrapper('foo')])
492
self.checkraises(TypeError, ' ', 'join', ['1', '2', '3', bytes()])
493
self.checkraises(TypeError, ' ', 'join', [1, 2, 3])
494
self.checkraises(TypeError, ' ', 'join', ['1', '2', 3])
496
@unittest.skipIf(sys.maxsize > 2**32,
497
'needs too much memory on a 64-bit platform')
498
def test_join_overflow(self):
499
size = int(sys.maxsize**0.5) + 1
500
seq = ('A' * size,) * size
501
self.assertRaises(OverflowError, ''.join, seq)
503
def test_replace(self):
504
CommonTest.test_replace(self)
506
# method call forwarded from str implementation because of unicode argument
507
self.checkequalnofix('one@two!three!', 'one!two!three!', 'replace', '!', '@', 1)
508
self.assertRaises(TypeError, 'replace'.replace, "r", 42)
510
for left, right in ('ba', '\u0101\u0100', '\U00010301\U00010300'):
513
for delim in ('c', '\u0102', '\U00010302'):
514
for repl in ('d', '\u0103', '\U00010303'):
515
self.checkequal(left + right,
516
left + right, 'replace', delim, repl)
517
self.checkequal(left + repl + right,
518
left + delim + right,
519
'replace', delim, repl)
520
self.checkequal(left + right,
521
left + right, 'replace', delim * 2, repl)
522
self.checkequal(left + repl + right,
523
left + delim * 2 + right,
524
'replace', delim * 2, repl)
526
@support.cpython_only
527
def test_replace_id(self):
530
self.assertIs(text.replace(pattern, pattern), text)
532
def test_bytes_comparison(self):
    # A str never compares equal to bytes or bytearray, even when the
    # contents look the same.  BytesWarning is silenced for the comparisons.
    with support.check_warnings():
        warnings.simplefilter('ignore', BytesWarning)
        equals_bytes = 'abc' == b'abc'
        differs_from_bytes = 'abc' != b'abc'
        equals_bytearray = 'abc' == bytearray(b'abc')
        differs_from_bytearray = 'abc' != bytearray(b'abc')
        self.assertEqual(equals_bytes, False)
        self.assertEqual(differs_from_bytes, True)
        self.assertEqual(equals_bytearray, False)
        self.assertEqual(differs_from_bytearray, True)
540
def test_comparison(self):
542
self.assertEqual('abc', 'abc')
543
self.assertTrue('abcd' > 'abc')
544
self.assertTrue('abc' < 'abcd')
547
# Move these tests to a Unicode collation module test...
548
# Testing UTF-16 code point order comparisons...
550
# No surrogates, no fixup required.
551
self.assertTrue('\u0061' < '\u20ac')
552
# Non surrogate below surrogate value, no fixup required
553
self.assertTrue('\u0061' < '\ud800\udc02')
555
# Non surrogate above surrogate value, fixup required
556
def test_lecmp(s, s2):
557
self.assertTrue(s < s2)
596
# Surrogates on both sides, no fixup required
597
self.assertTrue('\ud800\udc02' < '\ud84d\udc56')
599
def test_islower(self):
    # Inherited checks first, then individual BMP and non-BMP characters.
    super().test_islower()
    self.checkequalnofix(False, '\u1FFc', 'islower')
    cases = (
        ('\u2167', False),
        ('\u2177', True),
        ('\U00010401', False),
        ('\U00010427', False),
        ('\U00010429', True),
        ('\U0001044E', True),
        ('\U0001F40D', False),
        ('\U0001F46F', False),
    )
    for char, expected in cases:
        if expected:
            self.assertTrue(char.islower())
        else:
            self.assertFalse(char.islower())
614
def test_isupper(self):
# Runs the inherited isupper checks, then asserts str.isupper() results
# for individual BMP (U+1FFC, U+2167, U+2177) and non-BMP characters.
# NOTE(review): indentation was lost in this listing, so the extent of the
# non-Java guard below cannot be confirmed from here - verify against the
# original file before reformatting.
615
super().test_isupper()
616
if not sys.platform.startswith('java'):
617
self.checkequalnofix(False, '\u1FFc', 'isupper')
618
self.assertTrue('\u2167'.isupper())
619
self.assertFalse('\u2177'.isupper())
621
self.assertTrue('\U00010401'.isupper())
622
self.assertTrue('\U00010427'.isupper())
624
self.assertFalse('\U00010429'.isupper())
625
self.assertFalse('\U0001044E'.isupper())
627
self.assertFalse('\U0001F40D'.isupper())
628
self.assertFalse('\U0001F46F'.isupper())
630
def test_istitle(self):
    # Inherited checks first, then titlecase and non-BMP cases.
    super().test_istitle()
    for text in ('\u1FFc', 'Greek \u1FFcitlecases ...'):
        self.checkequalnofix(True, text, 'istitle')

    # non-BMP, uppercase + lowercase
    for pair in ('\U00010401\U00010429', '\U00010427\U0001044E'):
        self.assertTrue(pair.istitle())
    # apparently there are no titlecased (Lt) non-BMP chars in Unicode 6
    not_title = ['\U00010429', '\U0001044E', '\U0001F40D', '\U0001F46F']
    for ch in not_title:
        message = '{!a} is not title'.format(ch)
        self.assertFalse(ch.istitle(), message)
642
def test_isspace(self):
    # Inherited checks first, then BMP and non-BMP characters.
    super().test_isspace()
    for expected, char in ((True, '\u2000'),
                           (True, '\u200a'),
                           (False, '\u2014')):
        self.checkequalnofix(expected, char, 'isspace')
    # apparently there are no non-BMP spaces chars in Unicode 6
    non_spaces = ['\U00010401', '\U00010427', '\U00010429', '\U0001044E',
                  '\U0001F40D', '\U0001F46F']
    for ch in non_spaces:
        message = '{!a} is not space.'.format(ch)
        self.assertFalse(ch.isspace(), message)
652
def test_isalnum(self):
    # Inherited checks first, then non-BMP characters that must all be
    # reported as alphanumeric.
    super().test_isalnum()
    alnum_chars = ['\U00010401', '\U00010427', '\U00010429', '\U0001044E',
                   '\U0001D7F6', '\U00011066', '\U000104A0', '\U0001F107']
    for ch in alnum_chars:
        message = '{!a} is alnum.'.format(ch)
        self.assertTrue(ch.isalnum(), message)
658
def test_isalpha(self):
    # Inherited checks first, then BMP and non-BMP characters.
    super().test_isalpha()
    self.checkequalnofix(True, '\u1FFc', 'isalpha')
    for letter in ('\U00010401', '\U00010427', '\U00010429', '\U0001044E'):
        self.assertTrue(letter.isalpha())
    for symbol in ('\U0001F40D', '\U0001F46F'):
        self.assertFalse(symbol.isalpha())
670
def test_isascii(self):
    # Inherited checks first, then one BMP and one non-BMP character,
    # neither of which is ASCII.
    super().test_isascii()
    for non_ascii in ("\u20ac", "\U0010ffff"):
        self.assertFalse(non_ascii.isascii())
675
def test_isdecimal(self):
    # Expected isdecimal() results for ASCII and BMP inputs.
    cases = (
        (False, ''),
        (False, 'a'),
        (True, '0'),
        (False, '\u2460'),  # CIRCLED DIGIT ONE
        (False, '\xbc'),  # VULGAR FRACTION ONE QUARTER
        (True, '\u0660'),  # ARABIC-INDIC DIGIT ZERO
        (True, '0123456789'),
        (False, '0123456789a'),
    )
    for expected, text in cases:
        self.checkequalnofix(expected, text, 'isdecimal')

    # isdecimal() accepts no arguments.
    self.checkraises(TypeError, 'abc', 'isdecimal', 42)

    # Non-BMP characters, grouped by expected result.
    non_decimals = ['\U00010401', '\U00010427', '\U00010429', '\U0001044E',
                    '\U0001F40D', '\U0001F46F', '\U00011065', '\U0001F107']
    decimals = ['\U0001D7F6', '\U00011066', '\U000104A0']
    for ch in non_decimals:
        self.assertFalse(ch.isdecimal(), '{!a} is not decimal.'.format(ch))
    for ch in decimals:
        self.assertTrue(ch.isdecimal(), '{!a} is decimal.'.format(ch))
693
def test_isdigit(self):
    # Inherited checks first, then BMP and non-BMP characters.
    super().test_isdigit()
    for expected, char in ((True, '\u2460'),
                           (False, '\xbc'),
                           (True, '\u0660')):
        self.checkequalnofix(expected, char, 'isdigit')

    # Non-BMP characters, grouped by expected result.
    non_digits = ['\U00010401', '\U00010427', '\U00010429', '\U0001044E',
                  '\U0001F40D', '\U0001F46F', '\U00011065']
    digits = ['\U0001D7F6', '\U00011066', '\U000104A0', '\U0001F107']
    for ch in non_digits:
        self.assertFalse(ch.isdigit(), '{!a} is not a digit.'.format(ch))
    for ch in digits:
        self.assertTrue(ch.isdigit(), '{!a} is a digit.'.format(ch))
705
def test_isnumeric(self):
    # Expected isnumeric() results for ASCII and BMP inputs.
    cases = (
        (False, ''),
        (False, 'a'),
        (True, '0'),
        (True, '\u2460'),
        (True, '\xbc'),
        (True, '\u0660'),
        (True, '0123456789'),
        (False, '0123456789a'),
    )
    for expected, text in cases:
        self.checkequalnofix(expected, text, 'isnumeric')

    # isnumeric() accepts no arguments.
    with self.assertRaises(TypeError):
        "abc".isnumeric(42)

    # Non-BMP characters, grouped by expected result.
    non_numerics = ['\U00010401', '\U00010427', '\U00010429', '\U0001044E',
                    '\U0001F40D', '\U0001F46F']
    numerics = ['\U00011065', '\U0001D7F6', '\U00011066',
                '\U000104A0', '\U0001F107']
    for ch in non_numerics:
        self.assertFalse(ch.isnumeric(), '{!a} is not numeric.'.format(ch))
    for ch in numerics:
        self.assertTrue(ch.isnumeric(), '{!a} is numeric.'.format(ch))
724
def test_isidentifier(self):
725
self.assertTrue("a".isidentifier())
726
self.assertTrue("Z".isidentifier())
727
self.assertTrue("_".isidentifier())
728
self.assertTrue("b0".isidentifier())
729
self.assertTrue("bc".isidentifier())
730
self.assertTrue("b_".isidentifier())
731
self.assertTrue("µ".isidentifier())
732
self.assertTrue("𝔘𝔫𝔦𝔠𝔬𝔡𝔢".isidentifier())
734
self.assertFalse(" ".isidentifier())
735
self.assertFalse("[".isidentifier())
736
self.assertFalse("©".isidentifier())
737
self.assertFalse("0".isidentifier())
739
def test_isprintable(self):
740
self.assertTrue("".isprintable())
741
self.assertTrue(" ".isprintable())
742
self.assertTrue("abcdefg".isprintable())
743
self.assertFalse("abcdefg\n".isprintable())
744
# some defined Unicode character
745
self.assertTrue("\u0374".isprintable())
746
# undefined character
747
self.assertFalse("\u0378".isprintable())
748
# single surrogate character
749
self.assertFalse("\ud800".isprintable())
751
self.assertTrue('\U0001F46F'.isprintable())
752
self.assertFalse('\U000E0020'.isprintable())
754
def test_surrogates(self):
755
for s in ('a\uD800b\uDFFF', 'a\uDFFFb\uD800',
756
'a\uD800b\uDFFFa', 'a\uDFFFb\uD800a'):
757
self.assertTrue(s.islower())
758
self.assertFalse(s.isupper())
759
self.assertFalse(s.istitle())
760
for s in ('A\uD800B\uDFFF', 'A\uDFFFB\uD800',
761
'A\uD800B\uDFFFA', 'A\uDFFFB\uD800A'):
762
self.assertFalse(s.islower())
763
self.assertTrue(s.isupper())
764
self.assertTrue(s.istitle())
766
for meth_name in ('islower', 'isupper', 'istitle'):
767
meth = getattr(str, meth_name)
768
for s in ('\uD800', '\uDFFF', '\uD800\uD800', '\uDFFF\uDFFF'):
769
self.assertFalse(meth(s), '%a.%s() is False' % (s, meth_name))
771
for meth_name in ('isalpha', 'isalnum', 'isdigit', 'isspace',
772
'isdecimal', 'isnumeric',
773
'isidentifier', 'isprintable'):
774
meth = getattr(str, meth_name)
775
for s in ('\uD800', '\uDFFF', '\uD800\uD800', '\uDFFF\uDFFF',
776
'a\uD800b\uDFFF', 'a\uDFFFb\uD800',
777
'a\uD800b\uDFFFa', 'a\uDFFFb\uD800a'):
778
self.assertFalse(meth(s), '%a.%s() is False' % (s, meth_name))
781
def test_lower(self):
    # Run the shared lower() checks first, then the str-specific ones.
    CommonTest.test_lower(self)
    # (input, expected lower() result), checked in this order.
    cases = (
        ('\U00010427', '\U0001044F'),
        ('\U00010427\U00010427', '\U0001044F\U0001044F'),
        ('\U00010427\U0001044F', '\U0001044F\U0001044F'),
        ('X\U00010427x\U0001044F', 'x\U0001044Fx\U0001044F'),
        ('fi', 'fi'),
        ('\u0130', '\u0069\u0307'),
        # Special case for GREEK CAPITAL LETTER SIGMA U+03A3
        ('\u03a3', '\u03c3'),
        ('\u0345\u03a3', '\u0345\u03c3'),
        ('A\u0345\u03a3', 'a\u0345\u03c2'),
        ('A\u0345\u03a3a', 'a\u0345\u03c3a'),
        ('A\u0345\u03a3', 'a\u0345\u03c2'),
        ('A\u03a3\u0345', 'a\u03c2\u0345'),
        ('\u03a3\u0345 ', '\u03c3\u0345 '),
        ('\U0008fffe', '\U0008fffe'),
        ('\u2177', '\u2177'),
    )
    for text, lowered in cases:
        self.assertEqual(text.lower(), lowered)
803
def test_casefold(self):
804
self.assertEqual('hello'.casefold(), 'hello')
805
self.assertEqual('hELlo'.casefold(), 'hello')
806
self.assertEqual('ß'.casefold(), 'ss')
807
self.assertEqual('fi'.casefold(), 'fi')
808
self.assertEqual('\u03a3'.casefold(), '\u03c3')
809
self.assertEqual('A\u0345\u03a3'.casefold(), 'a\u03b9\u03c3')
810
self.assertEqual('\u00b5'.casefold(), '\u03bc')
812
def test_upper(self):
    # Run the shared upper() checks first, then the str-specific ones.
    CommonTest.test_upper(self)
    # (input, expected upper() result), checked in this order.
    cases = (
        ('\U0001044F', '\U00010427'),
        ('\U0001044F\U0001044F', '\U00010427\U00010427'),
        ('\U00010427\U0001044F', '\U00010427\U00010427'),
        ('X\U00010427x\U0001044F', 'X\U00010427X\U00010427'),
        ('fi', 'FI'),
        ('\u0130', '\u0130'),
        ('\u03a3', '\u03a3'),
        ('ß', 'SS'),
        ('\u1fd2', '\u0399\u0308\u0300'),
        ('\U0008fffe', '\U0008fffe'),
        ('\u2177', '\u2167'),
    )
    for text, uppered in cases:
        self.assertEqual(text.upper(), uppered)
829
def test_capitalize(self):
830
CommonTest.test_capitalize(self)
831
self.assertEqual('\U0001044F'.capitalize(), '\U00010427')
832
self.assertEqual('\U0001044F\U0001044F'.capitalize(),
833
'\U00010427\U0001044F')
834
self.assertEqual('\U00010427\U0001044F'.capitalize(),
835
'\U00010427\U0001044F')
836
self.assertEqual('\U0001044F\U00010427'.capitalize(),
837
'\U00010427\U0001044F')
838
self.assertEqual('X\U00010427x\U0001044F'.capitalize(),
839
'X\U0001044Fx\U0001044F')
840
self.assertEqual('h\u0130'.capitalize(), 'H\u0069\u0307')
841
exp = '\u0399\u0308\u0300\u0069\u0307'
842
self.assertEqual('\u1fd2\u0130'.capitalize(), exp)
843
if sys.version_info < (3, 8):
844
self.assertEqual('finnish'.capitalize(), 'FInnish')
846
self.assertEqual('finnish'.capitalize(), 'Finnish')
847
self.assertEqual('A\u0345\u03a3'.capitalize(), 'A\u0345\u03c2')
849
def test_title(self):
# Exercises str.title() on non-BMP letters (U+10427 / U+1044F), both as a
# single word and as two space-separated words, and on Greek sigma.
851
self.assertEqual('\U0001044F'.title(), '\U00010427')
852
self.assertEqual('\U0001044F\U0001044F'.title(),
853
'\U00010427\U0001044F')
854
self.assertEqual('\U0001044F\U0001044F \U0001044F\U0001044F'.title(),
855
'\U00010427\U0001044F \U00010427\U0001044F')
856
self.assertEqual('\U00010427\U0001044F \U00010427\U0001044F'.title(),
857
'\U00010427\U0001044F \U00010427\U0001044F')
858
self.assertEqual('\U0001044F\U00010427 \U0001044F\U00010427'.title(),
859
'\U00010427\U0001044F \U00010427\U0001044F')
860
self.assertEqual('X\U00010427x\U0001044F X\U00010427x\U0001044F'.title(),
861
'X\U0001044Fx\U0001044F X\U0001044Fx\U0001044F')
862
self.assertEqual('fiNNISH'.title(), 'Finnish')
863
self.assertEqual('A\u03a3 \u1fa1xy'.title(), 'A\u03c2 \u1fa9xy')
864
self.assertEqual('A\u03a3A'.title(), 'A\u03c3a')
866
def test_swapcase(self):
    # Shared checks first, then the str-specific table of
    # (input, expected str.swapcase() result) pairs, in the original order.
    CommonTest.test_swapcase(self)
    expectations = (
        ('\U0001044F', '\U00010427'),
        ('\U00010427', '\U0001044F'),
        ('\U0001044F\U0001044F', '\U00010427\U00010427'),
        ('\U00010427\U0001044F', '\U0001044F\U00010427'),
        ('\U0001044F\U00010427', '\U00010427\U0001044F'),
        ('X\U00010427x\U0001044F', 'x\U0001044FX\U00010427'),
        ('fi', 'FI'),
        ('\u0130', '\u0069\u0307'),
        # Special case for GREEK CAPITAL LETTER SIGMA U+03A3
        ('\u03a3', '\u03c3'),
        ('\u0345\u03a3', '\u0399\u03c3'),
        ('A\u0345\u03a3', 'a\u0399\u03c2'),
        ('A\u0345\u03a3a', 'a\u0399\u03c3A'),
        ('A\u0345\u03a3', 'a\u0399\u03c2'),
        ('A\u03a3\u0345', 'a\u03c2\u0399'),
        ('\u03a3\u0345 ', '\u03c3\u0399 '),
        ('\u03a3', '\u03c3'),
        ('ß', 'SS'),
        ('\u1fd2', '\u0399\u0308\u0300'),
    )
    for text, swapped in expectations:
        self.assertEqual(text.swapcase(), swapped)
892
def test_center(self):
# Runs the shared CommonTest checks, then str.center() with the non-BMP
# fill character U+10FFFF at widths 2, 3 and 4.
893
CommonTest.test_center(self)
894
self.assertEqual('x'.center(2, '\U0010FFFF'),
896
self.assertEqual('x'.center(3, '\U0010FFFF'),
897
'\U0010FFFFx\U0010FFFF')
898
self.assertEqual('x'.center(4, '\U0010FFFF'),
899
'\U0010FFFFx\U0010FFFF\U0010FFFF')
901
@unittest.skipUnless(sys.maxsize == 2**31 - 1, "requires 32-bit system")
902
@support.cpython_only
903
def test_case_operation_overflow(self):
# Runs only when sys.maxsize == 2**31 - 1 (see the skipUnless decorator).
# The test skips itself with a memory message, and otherwise expects
# s.upper to raise OverflowError.
909
self.skipTest('no enough memory (%.0f MiB required)' % (size / 2**20))
911
self.assertRaises(OverflowError, s.upper)
915
def test_contains(self):
916
# Testing Unicode contains method
# Needles are looked up in both str and tuple containers; the cases
# include the empty string and embedded NUL characters.
917
self.assertIn('a', 'abdb')
918
self.assertIn('a', 'bdab')
919
self.assertIn('a', 'bdaba')
920
self.assertIn('a', 'bdba')
921
self.assertNotIn('a', 'bdb')
922
self.assertIn('a', 'bdba')
923
self.assertIn('a', ('a',1,None))
924
self.assertIn('a', (1,None,'a'))
925
self.assertIn('a', ('a',1,None))
926
self.assertIn('a', (1,None,'a'))
927
self.assertNotIn('a', ('x',1,'y'))
928
self.assertNotIn('a', ('x',1,None))
929
self.assertNotIn('abcd', 'abcxxxx')
930
self.assertIn('ab', 'abcd')
931
self.assertIn('ab', 'abc')
932
self.assertIn('ab', (1,None,'ab'))
933
self.assertIn('', 'abc')
934
self.assertIn('', '')
935
self.assertIn('', 'abc')
936
self.assertNotIn('\0', 'abc')
937
self.assertIn('\0', '\0abc')
938
self.assertIn('\0', 'abc\0')
939
self.assertIn('a', '\0abc')
940
self.assertIn('asdf', 'asdf')
941
self.assertNotIn('asdf', 'asd')
942
self.assertNotIn('asdf', '')
944
# __contains__ called without an argument must raise TypeError.
self.assertRaises(TypeError, "abc".__contains__)
946
# Every combination of an ASCII, a BMP and a non-BMP character is used
# for the haystack filler and for the delimiter being searched.
for fill in ('a', '\u0100', '\U00010300'):
948
for delim in ('c', '\u0102', '\U00010302'):
949
self.assertNotIn(delim, fill)
950
self.assertIn(delim, fill + delim)
951
self.assertNotIn(delim * 2, fill)
952
self.assertIn(delim * 2, fill + delim * 2)
954
def test_issue18183(self):
955
'\U00010000\U00100000'.lower()
956
'\U00010000\U00100000'.casefold()
957
'\U00010000\U00100000'.upper()
958
'\U00010000\U00100000'.capitalize()
959
'\U00010000\U00100000'.title()
960
'\U00010000\U00100000'.swapcase()
961
'\U00100000'.center(3, '\U00010000')
962
'\U00100000'.ljust(3, '\U00010000')
963
'\U00100000'.rjust(3, '\U00010000')
965
def test_format(self):
# Broad coverage of str.format(): brace escaping, positional and keyword
# fields, attribute/index lookups, format specs, !r/!s/!a conversions,
# user-defined __format__ hooks, and the malformed-template error cases.
966
self.assertEqual(''.format(), '')
967
self.assertEqual('a'.format(), 'a')
968
self.assertEqual('ab'.format(), 'ab')
969
self.assertEqual('a{{'.format(), 'a{')
970
self.assertEqual('a}}'.format(), 'a}')
971
self.assertEqual('{{b'.format(), '{b')
972
self.assertEqual('}}b'.format(), '}b')
973
self.assertEqual('a{{b'.format(), 'a{b')
975
# examples from the PEP:
977
self.assertEqual("My name is {0}".format('Fred'), "My name is Fred")
978
self.assertEqual("My name is {0[name]}".format(dict(name='Fred')),
980
self.assertEqual("My name is {0} :-{{}}".format('Fred'),
981
"My name is Fred :-{}")
983
d = datetime.date(2007, 8, 18)
984
self.assertEqual("The year is {0.year}".format(d),
987
# classes we'll use for testing
989
def __init__(self, x=100):
991
def __format__(self, spec):
995
def __init__(self, x):
997
def __format__(self, spec):
1000
# class with __str__, but no __format__
1002
def __init__(self, x):
1005
return 'E(' + self.x + ')'
1007
# class with __repr__, but no __format__ or __str__
1009
def __init__(self, x):
1012
return 'F(' + self.x + ')'
1014
# class with __format__ that forwards to string, for some format_spec's
1016
def __init__(self, x):
1019
return "string is " + self.x
1020
def __format__(self, format_spec):
1021
if format_spec == 'd':
1022
return 'G(' + self.x + ')'
1023
return object.__format__(self, format_spec)
1025
class I(datetime.date):
1026
def __format__(self, format_spec):
1027
return self.strftime(format_spec)
1030
def __format__(self, format_spec):
1031
return int.__format__(self * 2, format_spec)
1034
def __init__(self, x):
1037
return 'M(' + self.x + ')'
1041
def __init__(self, x):
1044
return 'N(' + self.x + ')'
1047
self.assertEqual(''.format(), '')
1048
self.assertEqual('abc'.format(), 'abc')
1049
self.assertEqual('{0}'.format('abc'), 'abc')
1050
self.assertEqual('{0:}'.format('abc'), 'abc')
1051
# self.assertEqual('{ 0 }'.format('abc'), 'abc')
1052
self.assertEqual('X{0}'.format('abc'), 'Xabc')
1053
self.assertEqual('{0}X'.format('abc'), 'abcX')
1054
self.assertEqual('X{0}Y'.format('abc'), 'XabcY')
1055
self.assertEqual('{1}'.format(1, 'abc'), 'abc')
1056
self.assertEqual('X{1}'.format(1, 'abc'), 'Xabc')
1057
self.assertEqual('{1}X'.format(1, 'abc'), 'abcX')
1058
self.assertEqual('X{1}Y'.format(1, 'abc'), 'XabcY')
1059
self.assertEqual('{0}'.format(-15), '-15')
1060
self.assertEqual('{0}{1}'.format(-15, 'abc'), '-15abc')
1061
self.assertEqual('{0}X{1}'.format(-15, 'abc'), '-15Xabc')
1062
self.assertEqual('{{'.format(), '{')
1063
self.assertEqual('}}'.format(), '}')
1064
self.assertEqual('{{}}'.format(), '{}')
1065
self.assertEqual('{{x}}'.format(), '{x}')
1066
self.assertEqual('{{{0}}}'.format(123), '{123}')
1067
self.assertEqual('{{{{0}}}}'.format(), '{{0}}')
1068
self.assertEqual('}}{{'.format(), '}{')
1069
self.assertEqual('}}x{{'.format(), '}x{')
1072
self.assertEqual("{0[foo-bar]}".format({'foo-bar':'baz'}), 'baz')
1073
self.assertEqual("{0[foo bar]}".format({'foo bar':'baz'}), 'baz')
1074
self.assertEqual("{0[ ]}".format({' ':3}), '3')
1076
self.assertEqual('{foo._x}'.format(foo=C(20)), '20')
1077
self.assertEqual('{1}{0}'.format(D(10), D(20)), '2010')
1078
self.assertEqual('{0._x.x}'.format(C(D('abc'))), 'abc')
1079
self.assertEqual('{0[0]}'.format(['abc', 'def']), 'abc')
1080
self.assertEqual('{0[1]}'.format(['abc', 'def']), 'def')
1081
self.assertEqual('{0[1][0]}'.format(['abc', ['def']]), 'def')
1082
self.assertEqual('{0[1][0].x}'.format(['abc', [D('def')]]), 'def')
1085
self.assertEqual('{0:.3s}'.format('abc'), 'abc')
1086
self.assertEqual('{0:.3s}'.format('ab'), 'ab')
1087
self.assertEqual('{0:.3s}'.format('abcdef'), 'abc')
1088
self.assertEqual('{0:.0s}'.format('abcdef'), '')
1089
self.assertEqual('{0:3.3s}'.format('abc'), 'abc')
1090
self.assertEqual('{0:2.3s}'.format('abc'), 'abc')
1091
self.assertEqual('{0:2.2s}'.format('abc'), 'ab')
1092
self.assertEqual('{0:3.2s}'.format('abc'), 'ab ')
1093
self.assertEqual('{0:x<0s}'.format('result'), 'result')
1094
self.assertEqual('{0:x<5s}'.format('result'), 'result')
1095
self.assertEqual('{0:x<6s}'.format('result'), 'result')
1096
self.assertEqual('{0:x<7s}'.format('result'), 'resultx')
1097
self.assertEqual('{0:x<8s}'.format('result'), 'resultxx')
1098
self.assertEqual('{0: <7s}'.format('result'), 'result ')
1099
self.assertEqual('{0:<7s}'.format('result'), 'result ')
1100
self.assertEqual('{0:>7s}'.format('result'), ' result')
1101
self.assertEqual('{0:>8s}'.format('result'), ' result')
1102
self.assertEqual('{0:^8s}'.format('result'), ' result ')
1103
self.assertEqual('{0:^9s}'.format('result'), ' result ')
1104
self.assertEqual('{0:^10s}'.format('result'), ' result ')
1105
self.assertEqual('{0:10000}'.format('a'), 'a' + ' ' * 9999)
1106
self.assertEqual('{0:10000}'.format(''), ' ' * 10000)
1107
self.assertEqual('{0:10000000}'.format(''), ' ' * 10000000)
1109
# issue 12546: use \x00 as a fill character
1110
self.assertEqual('{0:\x00<6s}'.format('foo'), 'foo\x00\x00\x00')
1111
self.assertEqual('{0:\x01<6s}'.format('foo'), 'foo\x01\x01\x01')
1112
self.assertEqual('{0:\x00^6s}'.format('foo'), '\x00foo\x00\x00')
1113
self.assertEqual('{0:^6s}'.format('foo'), ' foo ')
1115
self.assertEqual('{0:\x00<6}'.format(3), '3\x00\x00\x00\x00\x00')
1116
self.assertEqual('{0:\x01<6}'.format(3), '3\x01\x01\x01\x01\x01')
1117
self.assertEqual('{0:\x00^6}'.format(3), '\x00\x003\x00\x00\x00')
1118
self.assertEqual('{0:<6}'.format(3), '3 ')
1120
self.assertEqual('{0:\x00<6}'.format(3.14), '3.14\x00\x00')
1121
self.assertEqual('{0:\x01<6}'.format(3.14), '3.14\x01\x01')
1122
self.assertEqual('{0:\x00^6}'.format(3.14), '\x003.14\x00')
1123
self.assertEqual('{0:^6}'.format(3.14), ' 3.14 ')
1125
self.assertEqual('{0:\x00<12}'.format(3+2.0j), '(3+2j)\x00\x00\x00\x00\x00\x00')
1126
self.assertEqual('{0:\x01<12}'.format(3+2.0j), '(3+2j)\x01\x01\x01\x01\x01\x01')
1127
self.assertEqual('{0:\x00^12}'.format(3+2.0j), '\x00\x00\x00(3+2j)\x00\x00\x00')
1128
self.assertEqual('{0:^12}'.format(3+2.0j), ' (3+2j) ')
1130
# format specifiers for user defined type
1131
self.assertEqual('{0:abc}'.format(C()), 'abc')
1133
# !r, !s and !a coercions
1134
self.assertEqual('{0!s}'.format('Hello'), 'Hello')
1135
self.assertEqual('{0!s:}'.format('Hello'), 'Hello')
1136
self.assertEqual('{0!s:15}'.format('Hello'), 'Hello ')
1137
self.assertEqual('{0!s:15s}'.format('Hello'), 'Hello ')
1138
self.assertEqual('{0!r}'.format('Hello'), "'Hello'")
1139
self.assertEqual('{0!r:}'.format('Hello'), "'Hello'")
1140
self.assertEqual('{0!r}'.format(F('Hello')), 'F(Hello)')
1141
self.assertEqual('{0!r}'.format('\u0378'), "'\\u0378'") # nonprintable
1142
self.assertEqual('{0!r}'.format('\u0374'), "'\u0374'") # printable
1143
self.assertEqual('{0!r}'.format(F('\u0374')), 'F(\u0374)')
1144
self.assertEqual('{0!a}'.format('Hello'), "'Hello'")
1145
self.assertEqual('{0!a}'.format('\u0378'), "'\\u0378'") # nonprintable
1146
self.assertEqual('{0!a}'.format('\u0374'), "'\\u0374'") # printable
1147
self.assertEqual('{0!a:}'.format('Hello'), "'Hello'")
1148
self.assertEqual('{0!a}'.format(F('Hello')), 'F(Hello)')
1149
self.assertEqual('{0!a}'.format(F('\u0374')), 'F(\\u0374)')
1151
# test fallback to object.__format__
1152
self.assertEqual('{0}'.format({}), '{}')
1153
self.assertEqual('{0}'.format([]), '[]')
1154
self.assertEqual('{0}'.format([1]), '[1]')
1156
self.assertEqual('{0:d}'.format(G('data')), 'G(data)')
1157
self.assertEqual('{0!s}'.format(G('data')), 'string is data')
1159
self.assertRaises(TypeError, '{0:^10}'.format, E('data'))
1160
self.assertRaises(TypeError, '{0:^10s}'.format, E('data'))
1161
self.assertRaises(TypeError, '{0:>15s}'.format, G('data'))
1163
self.assertEqual("{0:date: %Y-%m-%d}".format(I(year=2007,
1168
# test deriving from a builtin type and overriding __format__
1169
self.assertEqual("{0}".format(J(10)), "20")
1172
# string format specifiers
1173
self.assertEqual('{0:}'.format('a'), 'a')
1175
# computed format specifiers
1176
self.assertEqual("{0:.{1}}".format('hello world', 5), 'hello')
1177
self.assertEqual("{0:.{1}s}".format('hello world', 5), 'hello')
1178
self.assertEqual("{0:.{precision}s}".format('hello world', precision=5), 'hello')
1179
self.assertEqual("{0:{width}.{precision}s}".format('hello world', width=10, precision=5), 'hello ')
1180
self.assertEqual("{0:{width}.{precision}s}".format('hello world', width='10', precision='5'), 'hello ')
1182
# test various errors
1183
self.assertRaises(ValueError, '{'.format)
1184
self.assertRaises(ValueError, '}'.format)
1185
self.assertRaises(ValueError, 'a{'.format)
1186
self.assertRaises(ValueError, 'a}'.format)
1187
self.assertRaises(ValueError, '{a'.format)
1188
self.assertRaises(ValueError, '}a'.format)
1189
self.assertRaises(IndexError, '{0}'.format)
1190
self.assertRaises(IndexError, '{1}'.format, 'abc')
1191
self.assertRaises(KeyError, '{x}'.format)
1192
self.assertRaises(ValueError, "}{".format)
1193
self.assertRaises(ValueError, "abc{0:{}".format)
1194
self.assertRaises(ValueError, "{0".format)
1195
self.assertRaises(IndexError, "{0.}".format)
1196
self.assertRaises(ValueError, "{0.}".format, 0)
1197
self.assertRaises(ValueError, "{0[}".format)
1198
self.assertRaises(ValueError, "{0[}".format, [])
1199
self.assertRaises(KeyError, "{0]}".format)
1200
self.assertRaises(ValueError, "{0.[]}".format, 0)
1201
self.assertRaises(ValueError, "{0..foo}".format, 0)
1202
self.assertRaises(ValueError, "{0[0}".format, 0)
1203
self.assertRaises(ValueError, "{0[0:foo}".format, 0)
1204
self.assertRaises(KeyError, "{c]}".format)
1205
self.assertRaises(ValueError, "{{ {{{0}}".format, 0)
1206
self.assertRaises(ValueError, "{0}}".format, 0)
1207
self.assertRaises(KeyError, "{foo}".format, bar=3)
1208
self.assertRaises(ValueError, "{0!x}".format, 3)
1209
self.assertRaises(ValueError, "{0!}".format, 0)
1210
self.assertRaises(ValueError, "{0!rs}".format, 0)
1211
self.assertRaises(ValueError, "{!}".format)
1212
self.assertRaises(IndexError, "{:}".format)
1213
self.assertRaises(IndexError, "{:s}".format)
1214
self.assertRaises(IndexError, "{}".format)
1215
# A field index with this many digits is expected to raise ValueError.
big = "23098475029384702983476098230754973209482573"
1216
self.assertRaises(ValueError, ("{" + big + "}").format)
1217
self.assertRaises(ValueError, ("{[" + big + "]}").format, [0])
1220
self.assertRaises(ValueError, "{0[0]x}".format, [None])
1221
self.assertRaises(ValueError, "{0[0](10)}".format, [None])
1223
# can't have a replacement on the field name portion
1224
self.assertRaises(TypeError, '{0[{1}]}'.format, 'abcdefg', 4)
1226
# exceed maximum recursion depth
1227
self.assertRaises(ValueError, "{0:{1:{2}}}".format, 'abc', 's', '')
1228
self.assertRaises(ValueError, "{0:{1:{2:{3:{4:{5:{6}}}}}}}".format,
1229
0, 1, 2, 3, 4, 5, 6, 7)
1231
# string format spec errors
1232
self.assertRaises(ValueError, "{0:-s}".format, '')
1233
self.assertRaises(ValueError, format, "", "-")
1234
self.assertRaises(ValueError, "{0:=s}".format, '')
1236
# Alternate formatting is not supported
1237
self.assertRaises(ValueError, format, '', '#')
1238
self.assertRaises(ValueError, format, '', '#20')
1241
self.assertEqual("{0:s}{1:s}".format("ABC", "\u0410\u0411\u0412"),
1242
'ABC\u0410\u0411\u0412')
1243
self.assertEqual("{0:.3s}".format("ABC\u0410\u0411\u0412"),
1245
self.assertEqual("{0:.0s}".format("ABC\u0410\u0411\u0412"),
1248
self.assertEqual("{[{}]}".format({"{}": 5}), "5")
1249
self.assertEqual("{[{}]}".format({"{}" : "a"}), "a")
1250
self.assertEqual("{[{]}".format({"{" : "a"}), "a")
1251
self.assertEqual("{[}]}".format({"}" : "a"}), "a")
1252
self.assertEqual("{[[]}".format({"[" : "a"}), "a")
1253
self.assertEqual("{[!]}".format({"!" : "a"}), "a")
1254
self.assertRaises(ValueError, "{a{}b}".format, 42)
1255
self.assertRaises(ValueError, "{a{b}".format, 42)
1256
self.assertRaises(ValueError, "{[}".format, 42)
1258
self.assertEqual("0x{:0{:d}X}".format(0x0,16), "0x0000000000000000")
1262
self.assertEqual("{!r}".format(m), 'M(data)')
1263
self.assertRaises(TypeError, "{!s}".format, m)
1264
self.assertRaises(TypeError, "{}".format, m)
1266
self.assertEqual("{!r}".format(n), 'N(data)')
1267
self.assertEqual("{!s}".format(n), 'N(data)')
1268
self.assertRaises(TypeError, "{}".format, n)
1270
def test_format_map(self):
# Exercises str.format_map() with plain dicts, a dict subclass that
# defines __missing__, and an object that only offers __getitem__,
# followed by the error cases.
1271
self.assertEqual(''.format_map({}), '')
1272
self.assertEqual('a'.format_map({}), 'a')
1273
self.assertEqual('ab'.format_map({}), 'ab')
1274
self.assertEqual('a{{'.format_map({}), 'a{')
1275
self.assertEqual('a}}'.format_map({}), 'a}')
1276
self.assertEqual('{{b'.format_map({}), '{b')
1277
self.assertEqual('}}b'.format_map({}), '}b')
1278
self.assertEqual('a{{b'.format_map({}), 'a{b')
1281
class Mapping(dict):
1282
def __missing__(self, key):
1284
self.assertEqual('{hello}'.format_map(Mapping()), 'hello')
1285
self.assertEqual('{a} {world}'.format_map(Mapping(a='hello')), 'hello world')
1287
class InternalMapping:
1289
self.mapping = {'a': 'hello'}
1290
def __getitem__(self, key):
1291
return self.mapping[key]
1292
self.assertEqual('{a}'.format_map(InternalMapping()), 'hello')
1296
def __init__(self, x=100):
1298
def __format__(self, spec):
1300
self.assertEqual('{foo._x}'.format_map({'foo': C(20)}), '20')
1302
# test various errors
1303
self.assertRaises(TypeError, ''.format_map)
1304
self.assertRaises(TypeError, 'a'.format_map)
1306
self.assertRaises(ValueError, '{'.format_map, {})
1307
self.assertRaises(ValueError, '}'.format_map, {})
1308
self.assertRaises(ValueError, 'a{'.format_map, {})
1309
self.assertRaises(ValueError, 'a}'.format_map, {})
1310
self.assertRaises(ValueError, '{a'.format_map, {})
1311
self.assertRaises(ValueError, '}a'.format_map, {})
1313
# issue #12579: can't supply positional params to format_map
1314
self.assertRaises(ValueError, '{}'.format_map, {'a' : 2})
1315
self.assertRaises(ValueError, '{}'.format_map, 'a')
1316
self.assertRaises(ValueError, '{a} {}'.format_map, {"a" : 2, "b" : 1})
1320
def __getitem__(self, key):
1322
# A missing key gives KeyError, a list gives TypeError, and BadMapping is
# expected to surface ZeroDivisionError.
self.assertRaises(KeyError, '{a}'.format_map, {})
1323
self.assertRaises(TypeError, '{a}'.format_map, [])
1324
self.assertRaises(ZeroDivisionError, '{a}'.format_map, BadMapping())
1326
def test_format_huge_precision(self):
1327
format_string = ".{}f".format(sys.maxsize + 1)
1328
with self.assertRaises(ValueError):
1329
result = format(2.34, format_string)
1331
def test_format_huge_width(self):
1332
format_string = "{}f".format(sys.maxsize + 1)
1333
with self.assertRaises(ValueError):
1334
result = format(2.34, format_string)
1336
def test_format_huge_item_number(self):
1337
format_string = "{{{}:.6f}}".format(sys.maxsize + 1)
1338
with self.assertRaises(ValueError):
1339
result = format_string.format(2.34)
1341
def test_format_auto_numbering(self):
# Exercises automatically numbered '{}' fields in str.format(), alone,
# nested inside format specs, and combined with named fields.
1343
def __init__(self, x=100):
1345
def __format__(self, spec):
1348
self.assertEqual('{}'.format(10), '10')
1349
self.assertEqual('{:5}'.format('s'), 's ')
1350
self.assertEqual('{!r}'.format('s'), "'s'")
1351
self.assertEqual('{._x}'.format(C(10)), '10')
1352
self.assertEqual('{[1]}'.format([1, 2]), '2')
1353
self.assertEqual('{[a]}'.format({'a':4, 'b':2}), '4')
1354
self.assertEqual('a{}b{}c'.format(0, 1), 'a0b1c')
1356
self.assertEqual('a{:{}}b'.format('x', '^10'), 'a x b')
1357
self.assertEqual('a{:{}x}b'.format(20, '#'), 'a0x14b')
1359
# can't mix and match numbering and auto-numbering
1360
self.assertRaises(ValueError, '{}{1}'.format, 1, 2)
1361
self.assertRaises(ValueError, '{1}{}'.format, 1, 2)
1362
self.assertRaises(ValueError, '{:{1}}'.format, 1, 2)
1363
self.assertRaises(ValueError, '{0:{}}'.format, 1, 2)
1365
# can mix and match auto-numbering and named
1366
self.assertEqual('{f}{}'.format(4, f='test'), 'test4')
1367
self.assertEqual('{}{f}'.format(4, f='test'), '4test')
1368
self.assertEqual('{:{f}}{g}{}'.format(1, 3, g='g', f=2), ' 1g3')
1369
self.assertEqual('{f:{}}{}{g}'.format(2, 4, f=1, g='g'), ' 14g')
1371
def test_formatting(self):
# Runs the shared MixinStrUnicodeUserStringTest checks, then exercises
# printf-style '%' formatting on str: %s/%r/%a, mapping keys, %c,
# star width/precision, float specials and integer conversions.
1372
MixinStrUnicodeUserStringTest.test_formatting(self)
1373
# Testing Unicode formatting strings...
1374
self.assertEqual("%s, %s" % ("abc", "abc"), 'abc, abc')
1375
self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", 1, 2, 3), 'abc, abc, 1, 2.000000, 3.00')
1376
self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", 1, -2, 3), 'abc, abc, 1, -2.000000, 3.00')
1377
self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", -1, -2, 3.5), 'abc, abc, -1, -2.000000, 3.50')
1378
self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", -1, -2, 3.57), 'abc, abc, -1, -2.000000, 3.57')
1379
self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", -1, -2, 1003.57), 'abc, abc, -1, -2.000000, 1003.57')
1380
if not sys.platform.startswith('java'):
1381
self.assertEqual("%r, %r" % (b"abc", "abc"), "b'abc', 'abc'")
1382
self.assertEqual("%r" % ("\u1234",), "'\u1234'")
1383
self.assertEqual("%a" % ("\u1234",), "'\\u1234'")
1384
self.assertEqual("%(x)s, %(y)s" % {'x':"abc", 'y':"def"}, 'abc, def')
1385
self.assertEqual("%(x)s, %(\xfc)s" % {'x':"abc", '\xfc':"def"}, 'abc, def')
1387
# %c accepts a code point or a one-character string; 0x110000 is
# expected to overflow and a two-character string to raise TypeError.
self.assertEqual('%c' % 0x1234, '\u1234')
1388
self.assertEqual('%c' % 0x21483, '\U00021483')
1389
self.assertRaises(OverflowError, "%c".__mod__, (0x110000,))
1390
self.assertEqual('%c' % '\U00021483', '\U00021483')
1391
self.assertRaises(TypeError, "%c".__mod__, "aa")
1392
self.assertRaises(ValueError, "%.1\u1032f".__mod__, (1.0/3))
1393
self.assertRaises(TypeError, "%i".__mod__, "aa")
1395
# formatting jobs delegated from the string implementation:
1396
self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...')
1397
self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...')
1398
self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...')
1399
self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...')
1400
self.assertEqual('...%(foo)s...' % {'foo':"abc",'def':123}, '...abc...')
1401
self.assertEqual('...%(foo)s...' % {'foo':"abc",'def':123}, '...abc...')
1402
self.assertEqual('...%s...%s...%s...%s...' % (1,2,3,"abc"), '...1...2...3...abc...')
1403
self.assertEqual('...%%...%%s...%s...%s...%s...%s...' % (1,2,3,"abc"), '...%...%s...1...2...3...abc...')
1404
self.assertEqual('...%s...' % "abc", '...abc...')
1405
self.assertEqual('%*s' % (5,'abc',), ' abc')
1406
self.assertEqual('%*s' % (-5,'abc',), 'abc ')
1407
self.assertEqual('%*.*s' % (5,2,'abc',), ' ab')
1408
self.assertEqual('%*.*s' % (5,3,'abc',), ' abc')
1409
self.assertEqual('%i %*.*s' % (10, 5,3,'abc',), '10 abc')
1410
self.assertEqual('%i%s %*.*s' % (10, 3, 5, 3, 'abc',), '103 abc')
1411
self.assertEqual('%c' % 'a', 'a')
1415
self.assertEqual('%s' % Wrapper(), '\u1234')
1420
self.assertEqual('%f' % NAN, 'nan')
1421
self.assertEqual('%F' % NAN, 'NAN')
1422
self.assertEqual('%f' % INF, 'inf')
1423
self.assertEqual('%F' % INF, 'INF')
1426
self.assertEqual('%.1s' % "a\xe9\u20ac", 'a')
1427
self.assertEqual('%.2s' % "a\xe9\u20ac", 'a\xe9')
1431
def __init__(self, value):
1432
self.value = int(value)
1435
def __index__(self):
1438
def __init__(self, value):
1439
self.value = float(value)
1441
return int(self.value)
1442
pi = PseudoFloat(3.1415)
1443
letter_m = PseudoInt(109)
1444
self.assertEqual('%x' % 42, '2a')
1445
self.assertEqual('%X' % 15, 'F')
1446
self.assertEqual('%o' % 9, '11')
1447
self.assertEqual('%c' % 109, 'm')
1448
self.assertEqual('%x' % letter_m, '6d')
1449
self.assertEqual('%X' % letter_m, '6D')
1450
self.assertEqual('%o' % letter_m, '155')
1451
self.assertEqual('%c' % letter_m, 'm')
1452
# NOTE(review): each of the following statements ends with a trailing
# comma, which turns it into a one-element tuple expression. The
# assertion still runs; the comma looks unintended.
self.assertRaisesRegex(TypeError, '%x format: an integer is required, not float', operator.mod, '%x', 3.14),
1453
self.assertRaisesRegex(TypeError, '%X format: an integer is required, not float', operator.mod, '%X', 2.11),
1454
self.assertRaisesRegex(TypeError, '%o format: an integer is required, not float', operator.mod, '%o', 1.79),
1455
self.assertRaisesRegex(TypeError, '%x format: an integer is required, not PseudoFloat', operator.mod, '%x', pi),
1456
self.assertRaises(TypeError, operator.mod, '%c', pi),
1458
def test_formatting_with_enum(self):
# printf-style '%' formatting applied to enum members that also derive
# from float, int (IntEnum) or str.
1461
class Float(float, enum.Enum):
1463
class Int(enum.IntEnum):
1465
class Str(str, enum.Enum):
1467
# Testing Unicode formatting strings...
# The "Str." / "Int." prefixes are stripped from the formatted output
# before it is compared.
1468
self.assertEqual(("%s, %s" % (Str.ABC, Str.ABC)).replace("Str.", ""),
1470
self.assertEqual(("%s, %s, %d, %i, %u, %f, %5.2f" %
1472
Int.IDES, Int.IDES, Int.IDES,
1473
Float.PI, Float.PI)).replace("Str.", ""),
1474
'ABC, ABC, 15, 15, 15, 3.141593, 3.14')
1476
# formatting jobs delegated from the string implementation:
1477
self.assertEqual(('...%(foo)s...' % {'foo':Str.ABC}).replace("Str.", ""),
1479
self.assertEqual(('...%(foo)s...' % {'foo':Int.IDES}).replace("Int.", ""),
1480
'...IDES...' if sys.version_info < (3,11) else '...15...')
1481
self.assertEqual('...%(foo)i...' % {'foo':Int.IDES},
1483
self.assertEqual('...%(foo)d...' % {'foo':Int.IDES},
1485
self.assertEqual('...%(foo)u...' % {'foo':Int.IDES, 'def':Float.PI},
1487
self.assertEqual('...%(foo)f...' % {'foo':Float.PI,'def':123},
1490
def test_formatting_huge_precision(self):
1491
format_string = "%.{}f".format(sys.maxsize + 1)
1492
with self.assertRaises(ValueError):
1493
result = format_string % 2.34
1495
@unittest.skip('BROKEN!')
1496
def test_issue28598_strsubclass_rhs(self):
1497
# A subclass of str with an __rmod__ method should be able to hook
1498
# into the % operator
1499
class SubclassedStr(str):
1500
def __rmod__(self, other):
1501
return 'Success, self.__rmod__({!r}) was called'.format(other)
1502
self.assertEqual('lhs %% %r' % SubclassedStr('rhs'),
1503
"Success, self.__rmod__('lhs %% %r') was called")
1505
@support.cpython_only
def test_formatting_huge_precision_c_limits(self):
    # '%' formatting is expected to raise ValueError when the precision
    # is _testcapi.INT_MAX + 1.
    from _testcapi import INT_MAX
    oversized_format = "%.{}f".format(INT_MAX + 1)
    self.assertRaises(ValueError, oversized_format.__mod__, 2.34)
1512
def test_formatting_huge_width(self):
1513
format_string = "%{}f".format(sys.maxsize + 1)
1514
with self.assertRaises(ValueError):
1515
result = format_string % 2.34
1517
def test_startswith_endswith_errors(self):
# For both str.startswith and str.endswith, the TypeError raised inside
# the assertRaises block must mention 'str' and 'tuple' in its message.
1518
for meth in ('foo'.startswith, 'foo'.endswith):
1519
with self.assertRaises(TypeError) as cm:
1521
exc = str(cm.exception)
1522
self.assertIn('str', exc)
1523
self.assertIn('tuple', exc)
1525
@support.run_with_locale('LC_ALL', 'de_DE', 'fr_FR')
def test_format_float(self):
    # should not format with a comma, but always with C locale
    rendered = '%.1f' % 1.0
    self.assertEqual('1.0', rendered)
1530
def test_constructor(self):
# Exercises the str() constructor with str, str subclasses, arbitrary
# objects, numbers, bytes and memoryview arguments.
1531
# unicode(obj) tests (this maps to PyObject_Unicode() at C level)
1534
str('unicode remains unicode'),
1535
'unicode remains unicode'
1538
# Covers ASCII, Latin-1, BMP and non-BMP text through StrSubclass.
for text in ('ascii', '\xe9', '\u20ac', '\U0010FFFF'):
1539
subclass = StrSubclass(text)
1540
self.assertEqual(str(subclass), text)
1541
self.assertEqual(len(subclass), len(text))
1543
self.assertEqual(subclass.encode('ascii'), b'ascii')
1544
self.assertEqual(subclass.encode('utf-8'), b'ascii')
1547
str('strings are converted to unicode'),
1548
'strings are converted to unicode'
1552
def __init__(self, x):
1558
str(StringCompat('__str__ compatible objects are recognized')),
1559
'__str__ compatible objects are recognized'
1562
# unicode(obj) is compatible to str():
1564
o = StringCompat('unicode(obj) is compatible to str()')
1565
self.assertEqual(str(o), 'unicode(obj) is compatible to str()')
1566
self.assertEqual(str(o), 'unicode(obj) is compatible to str()')
1568
for obj in (123, 123.45, 123):
1569
self.assertEqual(str(obj), str(str(obj)))
1571
# unicode(obj, encoding, error) tests (this maps to
1572
# PyUnicode_FromEncodedObject() at C level)
1574
if not sys.platform.startswith('java'):
1578
'decoding unicode is not supported',
1584
str(b'strings are decoded to unicode', 'utf-8', 'strict'),
1585
'strings are decoded to unicode'
1588
if not sys.platform.startswith('java'):
1591
memoryview(b'character buffers are decoded to unicode'),
1595
'character buffers are decoded to unicode'
1598
# Three non-string positional arguments must be rejected with TypeError.
self.assertRaises(TypeError, str, 42, 42, 42)
1600
def test_constructor_keyword_args(self):
1601
"""Pass various keyword argument combinations to the constructor."""
1602
# The object argument can be passed as a keyword.
1603
self.assertEqual(str(object='foo'), 'foo')
1604
self.assertEqual(str(object=b'foo', encoding='utf-8'), 'foo')
1605
# The errors argument without encoding triggers "decode" mode.
1606
self.assertEqual(str(b'foo', errors='strict'), 'foo') # not "b'foo'"
1607
self.assertEqual(str(object=b'foo', errors='strict'), 'foo')
1609
def test_constructor_defaults(self):
1610
"""Check the constructor argument defaults."""
1611
# The object argument defaults to '' or b''.
1612
self.assertEqual(str(), '')
1613
self.assertEqual(str(errors='strict'), '')
1614
utf8_cent = '¢'.encode('utf-8')
1615
# The encoding argument defaults to utf-8.
1616
self.assertEqual(str(utf8_cent, errors='strict'), '¢')
1617
# The errors argument defaults to strict.
1618
self.assertRaises(UnicodeDecodeError, str, utf8_cent, encoding='ascii')
1620
def test_codecs_utf7(self):
# Exercises the 'utf-7' codec: a table of (text, encoded bytes) pairs,
# unpaired surrogates in both directions, and the directly encoded
# character sets.
1622
('A\u2262\u0391.', b'A+ImIDkQ.'), # RFC2152 example
1623
('Hi Mom -\u263a-!', b'Hi Mom -+Jjo--!'), # RFC2152 example
1624
('\u65E5\u672C\u8A9E', b'+ZeVnLIqe-'), # RFC2152 example
1625
('Item 3 is \u00a31.', b'Item 3 is +AKM-1.'), # RFC2152 example
1631
(r'\\?', b'+AFwAXA?'),
1632
(r'\\\?', b'+AFwAXABc?'),
1633
(r'++--', b'+-+---'),
1634
('\U000abcde', b'+2m/c3g-'), # surrogate pairs
1638
for (x, y) in utfTests:
1639
self.assertEqual(x.encode('utf-7'), y)
1641
# Unpaired surrogates are passed through
1642
self.assertEqual('\uD801'.encode('utf-7'), b'+2AE-')
1643
self.assertEqual('\uD801x'.encode('utf-7'), b'+2AE-x')
1644
self.assertEqual('\uDC01'.encode('utf-7'), b'+3AE-')
1645
self.assertEqual('\uDC01x'.encode('utf-7'), b'+3AE-x')
1646
self.assertEqual(b'+2AE-'.decode('utf-7'), '\uD801')
1647
self.assertEqual(b'+2AE-x'.decode('utf-7'), '\uD801x')
1648
self.assertEqual(b'+3AE-'.decode('utf-7'), '\uDC01')
1649
self.assertEqual(b'+3AE-x'.decode('utf-7'), '\uDC01x')
1651
self.assertEqual('\uD801\U000abcde'.encode('utf-7'), b'+2AHab9ze-')
1652
self.assertEqual(b'+2AHab9ze-'.decode('utf-7'), '\uD801\U000abcde')
1654
# Issue #2242: crash on some Windows/MSVC versions
1655
self.assertEqual(b'+\xc1'.decode('utf-7', 'ignore'), '')
1657
# Direct encoded characters
1658
set_d = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'(),-./:?"
1659
# Optional direct characters
1660
set_o = '!"#$%&*;<=>@[]^_`{|}'
1662
self.assertEqual(c.encode('utf7'), c.encode('ascii'))
1663
self.assertEqual(c.encode('ascii').decode('utf7'), c)
1665
self.assertEqual(c.encode('ascii').decode('utf7'), c)
1667
# The error-message check only runs on Python 3.8 and later.
if sys.version_info >= (3, 8):
1668
with self.assertRaisesRegex(UnicodeDecodeError,
1669
'ill-formed sequence'):
1670
b'+@'.decode('utf-7')
1672
def test_codecs_utf8(self):
# Exercises UTF-8 encoding of empty, BMP and non-BMP strings, lone
# surrogates under the 'surrogatepass' handler, a longer mixed
# CJK/ASCII text, and a few decoding round trips.
1673
self.assertEqual(''.encode('utf-8'), b'')
1674
self.assertEqual('\u20ac'.encode('utf-8'), b'\xe2\x82\xac')
1675
self.assertEqual('\U00010002'.encode('utf-8'), b'\xf0\x90\x80\x82')
1676
self.assertEqual('\U00023456'.encode('utf-8'), b'\xf0\xa3\x91\x96')
1677
self.assertEqual('\ud800'.encode('utf-8', 'surrogatepass'), b'\xed\xa0\x80')
1678
self.assertEqual('\udc00'.encode('utf-8', 'surrogatepass'), b'\xed\xb0\x80')
1679
self.assertEqual(('\U00010002'*10).encode('utf-8'),
1680
b'\xf0\x90\x80\x82'*10)
1682
'\u6b63\u78ba\u306b\u8a00\u3046\u3068\u7ffb\u8a33\u306f'
1683
'\u3055\u308c\u3066\u3044\u307e\u305b\u3093\u3002\u4e00'
1684
'\u90e8\u306f\u30c9\u30a4\u30c4\u8a9e\u3067\u3059\u304c'
1685
'\u3001\u3042\u3068\u306f\u3067\u305f\u3089\u3081\u3067'
1686
'\u3059\u3002\u5b9f\u969b\u306b\u306f\u300cWenn ist das'
1687
' Nunstuck git und'.encode('utf-8'),
1688
b'\xe6\xad\xa3\xe7\xa2\xba\xe3\x81\xab\xe8\xa8\x80\xe3\x81'
1689
b'\x86\xe3\x81\xa8\xe7\xbf\xbb\xe8\xa8\xb3\xe3\x81\xaf\xe3'
1690
b'\x81\x95\xe3\x82\x8c\xe3\x81\xa6\xe3\x81\x84\xe3\x81\xbe'
1691
b'\xe3\x81\x9b\xe3\x82\x93\xe3\x80\x82\xe4\xb8\x80\xe9\x83'
1692
b'\xa8\xe3\x81\xaf\xe3\x83\x89\xe3\x82\xa4\xe3\x83\x84\xe8'
1693
b'\xaa\x9e\xe3\x81\xa7\xe3\x81\x99\xe3\x81\x8c\xe3\x80\x81'
1694
b'\xe3\x81\x82\xe3\x81\xa8\xe3\x81\xaf\xe3\x81\xa7\xe3\x81'
1695
b'\x9f\xe3\x82\x89\xe3\x82\x81\xe3\x81\xa7\xe3\x81\x99\xe3'
1696
b'\x80\x82\xe5\xae\x9f\xe9\x9a\x9b\xe3\x81\xab\xe3\x81\xaf'
1697
b'\xe3\x80\x8cWenn ist das Nunstuck git und'
1700
# UTF-8 specific decoding tests
1701
self.assertEqual(str(b'\xf0\xa3\x91\x96', 'utf-8'), '\U00023456' )
1702
self.assertEqual(str(b'\xf0\x90\x80\x82', 'utf-8'), '\U00010002' )
1703
self.assertEqual(str(b'\xe2\x82\xac', 'utf-8'), '\u20ac' )
1705
# Other possible utf-8 test cases:
1706
# * strict decoding testing for all of the
1707
# UTF8_ERROR cases in PyUnicode_DecodeUTF8
1709
def test_utf8_decode_valid_sequences(self):
1712
(b'\x00', '\x00'), (b'a', 'a'), (b'\x7f', '\x7f'),
1714
(b'\xc2\x80', '\x80'), (b'\xdf\xbf', '\u07ff'),
1716
(b'\xe0\xa0\x80', '\u0800'), (b'\xed\x9f\xbf', '\ud7ff'),
1717
(b'\xee\x80\x80', '\uE000'), (b'\xef\xbf\xbf', '\uffff'),
1719
(b'\xF0\x90\x80\x80', '\U00010000'),
1720
(b'\xf4\x8f\xbf\xbf', '\U0010FFFF')
1722
for seq, res in sequences:
1723
self.assertEqual(seq.decode('utf-8'), res)
1726
def test_utf8_decode_invalid_sequences(self):
1727
# continuation bytes in a sequence of 2, 3, or 4 bytes
1728
continuation_bytes = [bytes([x]) for x in range(0x80, 0xC0)]
1729
# start bytes of a 2-byte sequence equivalent to code points < 0x7F
1730
invalid_2B_seq_start_bytes = [bytes([x]) for x in range(0xC0, 0xC2)]
1731
# start bytes of a 4-byte sequence equivalent to code points > 0x10FFFF
1732
invalid_4B_seq_start_bytes = [bytes([x]) for x in range(0xF5, 0xF8)]
1733
invalid_start_bytes = (
1734
continuation_bytes + invalid_2B_seq_start_bytes +
1735
invalid_4B_seq_start_bytes + [bytes([x]) for x in range(0xF7, 0x100)]
1738
for byte in invalid_start_bytes:
1739
self.assertRaises(UnicodeDecodeError, byte.decode, 'utf-8')
1741
for sb in invalid_2B_seq_start_bytes:
1742
for cb in continuation_bytes:
1743
self.assertRaises(UnicodeDecodeError, (sb+cb).decode, 'utf-8')
1745
for sb in invalid_4B_seq_start_bytes:
1746
for cb1 in continuation_bytes[:3]:
1747
for cb3 in continuation_bytes[:3]:
1748
self.assertRaises(UnicodeDecodeError,
1749
(sb+cb1+b'\x80'+cb3).decode, 'utf-8')
1751
for cb in [bytes([x]) for x in range(0x80, 0xA0)]:
1752
self.assertRaises(UnicodeDecodeError,
1753
(b'\xE0'+cb+b'\x80').decode, 'utf-8')
1754
self.assertRaises(UnicodeDecodeError,
1755
(b'\xE0'+cb+b'\xBF').decode, 'utf-8')
1757
for cb in [bytes([x]) for x in range(0xA0, 0xC0)]:
1758
self.assertRaises(UnicodeDecodeError,
1759
(b'\xED'+cb+b'\x80').decode, 'utf-8')
1760
self.assertRaises(UnicodeDecodeError,
1761
(b'\xED'+cb+b'\xBF').decode, 'utf-8')
1762
for cb in [bytes([x]) for x in range(0x80, 0x90)]:
1763
self.assertRaises(UnicodeDecodeError,
1764
(b'\xF0'+cb+b'\x80\x80').decode, 'utf-8')
1765
self.assertRaises(UnicodeDecodeError,
1766
(b'\xF0'+cb+b'\xBF\xBF').decode, 'utf-8')
1767
for cb in [bytes([x]) for x in range(0x90, 0xC0)]:
1768
self.assertRaises(UnicodeDecodeError,
1769
(b'\xF4'+cb+b'\x80\x80').decode, 'utf-8')
1770
self.assertRaises(UnicodeDecodeError,
1771
(b'\xF4'+cb+b'\xBF\xBF').decode, 'utf-8')
1773
def test_issue8271(self):
1774
# Issue #8271: during the decoding of an invalid UTF-8 byte sequence,
1775
# only the start byte and the continuation byte(s) are now considered
1776
# invalid, instead of the number of bytes specified by the start byte.
1777
# See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf (page 95,
1778
# table 3-8, Row 2) for more information about the algorithm used.
1781
# invalid start bytes
1782
(b'\x80', FFFD), # continuation byte
1783
(b'\x80\x80', FFFD*2), # 2 continuation bytes
1785
(b'\xc0\xc0', FFFD*2),
1787
(b'\xc1\xc0', FFFD*2),
1788
(b'\xc0\xc1', FFFD*2),
1789
# with start byte of a 2-byte sequence
1790
(b'\xc2', FFFD), # only the start byte
1791
(b'\xc2\xc2', FFFD*2), # 2 start bytes
1792
(b'\xc2\xc2\xc2', FFFD*3), # 3 start bytes
1793
(b'\xc2\x41', FFFD+'A'), # invalid continuation byte
1794
# with start byte of a 3-byte sequence
1795
(b'\xe1', FFFD), # only the start byte
1796
(b'\xe1\xe1', FFFD*2), # 2 start bytes
1797
(b'\xe1\xe1\xe1', FFFD*3), # 3 start bytes
1798
(b'\xe1\xe1\xe1\xe1', FFFD*4), # 4 start bytes
1799
(b'\xe1\x80', FFFD), # only 1 continuation byte
1800
(b'\xe1\x41', FFFD+'A'), # invalid continuation byte
1801
(b'\xe1\x41\x80', FFFD+'A'+FFFD), # invalid cb followed by valid cb
1802
(b'\xe1\x41\x41', FFFD+'AA'), # 2 invalid continuation bytes
1803
(b'\xe1\x80\x41', FFFD+'A'), # only 1 valid continuation byte
1804
(b'\xe1\x80\xe1\x41', FFFD*2+'A'), # 1 valid and the other invalid
1805
(b'\xe1\x41\xe1\x80', FFFD+'A'+FFFD), # 1 invalid and the other valid
1806
# with start byte of a 4-byte sequence
1807
(b'\xf1', FFFD), # only the start byte
1808
(b'\xf1\xf1', FFFD*2), # 2 start bytes
1809
(b'\xf1\xf1\xf1', FFFD*3), # 3 start bytes
1810
(b'\xf1\xf1\xf1\xf1', FFFD*4), # 4 start bytes
1811
(b'\xf1\xf1\xf1\xf1\xf1', FFFD*5), # 5 start bytes
1812
(b'\xf1\x80', FFFD), # only 1 continuation bytes
1813
(b'\xf1\x80\x80', FFFD), # only 2 continuation bytes
1814
(b'\xf1\x80\x41', FFFD+'A'), # 1 valid cb and 1 invalid
1815
(b'\xf1\x80\x41\x41', FFFD+'AA'), # 1 valid cb and 1 invalid
1816
(b'\xf1\x80\x80\x41', FFFD+'A'), # 2 valid cb and 1 invalid
1817
(b'\xf1\x41\x80', FFFD+'A'+FFFD), # 1 invalid cv and 1 valid
1818
(b'\xf1\x41\x80\x80', FFFD+'A'+FFFD*2), # 1 invalid cb and 2 invalid
1819
(b'\xf1\x41\x80\x41', FFFD+'A'+FFFD+'A'), # 2 invalid cb and 1 invalid
1820
(b'\xf1\x41\x41\x80', FFFD+'AA'+FFFD), # 1 valid cb and 1 invalid
1821
(b'\xf1\x41\xf1\x80', FFFD+'A'+FFFD),
1822
(b'\xf1\x41\x80\xf1', FFFD+'A'+FFFD*2),
1823
(b'\xf1\xf1\x80\x41', FFFD*2+'A'),
1824
(b'\xf1\x41\xf1\xf1', FFFD+'A'+FFFD*2),
1825
# with invalid start byte of a 4-byte sequence (rfc2279)
1826
(b'\xf5', FFFD), # only the start byte
1827
(b'\xf5\xf5', FFFD*2), # 2 start bytes
1828
(b'\xf5\x80', FFFD*2), # only 1 continuation byte
1829
(b'\xf5\x80\x80', FFFD*3), # only 2 continuation byte
1830
(b'\xf5\x80\x80\x80', FFFD*4), # 3 continuation bytes
1831
(b'\xf5\x80\x41', FFFD*2+'A'), # 1 valid cb and 1 invalid
1832
(b'\xf5\x80\x41\xf5', FFFD*2+'A'+FFFD),
1833
(b'\xf5\x41\x80\x80\x41', FFFD+'A'+FFFD*2+'A'),
1834
# with invalid start byte of a 5-byte sequence (rfc2279)
1835
(b'\xf8', FFFD), # only the start byte
1836
(b'\xf8\xf8', FFFD*2), # 2 start bytes
1837
(b'\xf8\x80', FFFD*2), # only one continuation byte
1838
(b'\xf8\x80\x41', FFFD*2 + 'A'), # 1 valid cb and 1 invalid
1839
(b'\xf8\x80\x80\x80\x80', FFFD*5), # invalid 5 bytes seq with 5 bytes
1840
# with invalid start byte of a 6-byte sequence (rfc2279)
1841
(b'\xfc', FFFD), # only the start byte
1842
(b'\xfc\xfc', FFFD*2), # 2 start bytes
1843
(b'\xfc\x80\x80', FFFD*3), # only 2 continuation bytes
1844
(b'\xfc\x80\x80\x80\x80\x80', FFFD*6), # 6 continuation bytes
1845
# invalid start byte
1847
(b'\xfe\x80\x80', FFFD*3),
1849
(b'\xf1\x80\x41\x42\x43', '\ufffd\x41\x42\x43'),
1850
(b'\xf1\x80\xff\x42\x43', '\ufffd\ufffd\x42\x43'),
1851
(b'\xf1\x80\xc2\x81\x43', '\ufffd\x81\x43'),
1852
(b'\x61\xF1\x80\x80\xE1\x80\xC2\x62\x80\x63\x80\xBF\x64',
1853
'\x61\uFFFD\uFFFD\uFFFD\x62\uFFFD\x63\uFFFD\uFFFD\x64'),
1855
for n, (seq, res) in enumerate(sequences):
1856
self.assertRaises(UnicodeDecodeError, seq.decode, 'utf-8', 'strict')
1857
self.assertEqual(seq.decode('utf-8', 'replace'), res)
1858
self.assertEqual((seq+b'b').decode('utf-8', 'replace'), res+'b')
1859
self.assertEqual(seq.decode('utf-8', 'ignore'),
1860
res.replace('\uFFFD', ''))
1862
def assertCorrectUTF8Decoding(self, seq, res, err):
1864
Check that an invalid UTF-8 sequence raises a UnicodeDecodeError when
1865
'strict' is used, returns res when 'replace' is used, and that doesn't
1866
return anything when 'ignore' is used.
1868
with self.assertRaises(UnicodeDecodeError) as cm:
1872
self.assertIn(err, str(exc))
1873
self.assertEqual(seq.decode('utf-8', 'replace'), res)
1874
self.assertEqual((b'aaaa' + seq + b'bbbb').decode('utf-8', 'replace'),
1875
'aaaa' + res + 'bbbb')
1876
res = res.replace('\ufffd', '')
1877
self.assertEqual(seq.decode('utf-8', 'ignore'), res)
1878
self.assertEqual((b'aaaa' + seq + b'bbbb').decode('utf-8', 'ignore'),
1879
'aaaa' + res + 'bbbb')
1881
def test_invalid_start_byte(self):
    """
    Check that a byte which is neither ASCII nor a valid start byte of a
    2-, 3-, or 4-bytes sequence is reported as an 'invalid start byte',
    and that errors='replace' turns it into exactly one U+FFFD.
    E.g. <80> is a continuation byte and may only follow a start byte.
    """
    # Continuation bytes, then start bytes that are never valid.
    for value in (0x80, 0xA0, 0x9F, 0xBF, 0xC0, 0xC1, 0xF5, 0xFF):
        lone_byte = bytes([value])
        self.assertCorrectUTF8Decoding(lone_byte, '\ufffd',
                                       'invalid start byte')
1894
def test_unexpected_end_of_data(self):
1896
Test that an 'unexpected end of data' error is raised when the string
1897
ends after a start byte of a 2-, 3-, or 4-bytes sequence without having
1898
enough continuation bytes. The incomplete sequence is replaced with a
1899
single U+FFFD when errors='replace'.
1900
E.g. in the sequence <F3 80 80>, F3 is the start byte of a 4-bytes
1901
sequence, but it's followed by only 2 valid continuation bytes and the
1902
last continuation bytes is missing.
1903
Note: the continuation bytes must be all valid, if one of them is
1904
invalid another error will be raised.
1908
'E0 A0', 'E0 BF', 'E1 80', 'E1 BF', 'EC 80', 'EC BF',
1909
'ED 80', 'ED 9F', 'EE 80', 'EE BF', 'EF 80', 'EF BF',
1910
'F0 90', 'F0 BF', 'F0 90 80', 'F0 90 BF', 'F0 BF 80', 'F0 BF BF',
1911
'F1 80', 'F1 BF', 'F1 80 80', 'F1 80 BF', 'F1 BF 80', 'F1 BF BF',
1912
'F3 80', 'F3 BF', 'F3 80 80', 'F3 80 BF', 'F3 BF 80', 'F3 BF BF',
1913
'F4 80', 'F4 8F', 'F4 80 80', 'F4 80 BF', 'F4 8F 80', 'F4 8F BF'
1916
for seq in sequences:
1917
self.assertCorrectUTF8Decoding(bytes.fromhex(seq), '\ufffd',
1918
'unexpected end of data')
1920
def test_invalid_cb_for_2bytes_seq(self):
1922
Test that an 'invalid continuation byte' error is raised when the
1923
continuation byte of a 2-bytes sequence is invalid. The start byte
1924
is replaced by a single U+FFFD and the second byte is handled
1925
separately when errors='replace'.
1926
E.g. in the sequence <C2 41>, C2 is the start byte of a 2-bytes
1927
sequence, but 41 is not a valid continuation byte because it's the
1933
('C2 00', FFFD+'\x00'), ('C2 7F', FFFD+'\x7f'),
1934
('C2 C0', FFFDx2), ('C2 FF', FFFDx2),
1935
('DF 00', FFFD+'\x00'), ('DF 7F', FFFD+'\x7f'),
1936
('DF C0', FFFDx2), ('DF FF', FFFDx2),
1938
for seq, res in sequences:
1939
self.assertCorrectUTF8Decoding(bytes.fromhex(seq), res,
1940
'invalid continuation byte')
1942
def test_invalid_cb_for_3bytes_seq(self):
1944
Test that an 'invalid continuation byte' error is raised when the
1945
continuation byte(s) of a 3-bytes sequence are invalid. When
1946
errors='replace', if the first continuation byte is valid, the first
1947
two bytes (start byte + 1st cb) are replaced by a single U+FFFD and the
1948
third byte is handled separately, otherwise only the start byte is
1949
replaced with a U+FFFD and the other continuation bytes are handled
1951
E.g. in the sequence <E1 80 41>, E1 is the start byte of a 3-bytes
1952
sequence, 80 is a valid continuation byte, but 41 is not a valid cb
1953
because it's the ASCII letter 'A'.
1954
Note: when the start byte is E0 or ED, the valid ranges for the first
1955
continuation byte are limited to A0..BF and 80..9F respectively.
1956
Python 2 used to consider all the bytes in range 80..BF valid when the
1957
start byte was ED. This is fixed in Python 3.
1962
('E0 00', FFFD+'\x00'), ('E0 7F', FFFD+'\x7f'), ('E0 80', FFFDx2),
1963
('E0 9F', FFFDx2), ('E0 C0', FFFDx2), ('E0 FF', FFFDx2),
1964
('E0 A0 00', FFFD+'\x00'), ('E0 A0 7F', FFFD+'\x7f'),
1965
('E0 A0 C0', FFFDx2), ('E0 A0 FF', FFFDx2),
1966
('E0 BF 00', FFFD+'\x00'), ('E0 BF 7F', FFFD+'\x7f'),
1967
('E0 BF C0', FFFDx2), ('E0 BF FF', FFFDx2), ('E1 00', FFFD+'\x00'),
1968
('E1 7F', FFFD+'\x7f'), ('E1 C0', FFFDx2), ('E1 FF', FFFDx2),
1969
('E1 80 00', FFFD+'\x00'), ('E1 80 7F', FFFD+'\x7f'),
1970
('E1 80 C0', FFFDx2), ('E1 80 FF', FFFDx2),
1971
('E1 BF 00', FFFD+'\x00'), ('E1 BF 7F', FFFD+'\x7f'),
1972
('E1 BF C0', FFFDx2), ('E1 BF FF', FFFDx2), ('EC 00', FFFD+'\x00'),
1973
('EC 7F', FFFD+'\x7f'), ('EC C0', FFFDx2), ('EC FF', FFFDx2),
1974
('EC 80 00', FFFD+'\x00'), ('EC 80 7F', FFFD+'\x7f'),
1975
('EC 80 C0', FFFDx2), ('EC 80 FF', FFFDx2),
1976
('EC BF 00', FFFD+'\x00'), ('EC BF 7F', FFFD+'\x7f'),
1977
('EC BF C0', FFFDx2), ('EC BF FF', FFFDx2), ('ED 00', FFFD+'\x00'),
1978
('ED 7F', FFFD+'\x7f'),
1979
('ED A0', FFFDx2), ('ED BF', FFFDx2), # see note ^
1980
('ED C0', FFFDx2), ('ED FF', FFFDx2), ('ED 80 00', FFFD+'\x00'),
1981
('ED 80 7F', FFFD+'\x7f'), ('ED 80 C0', FFFDx2),
1982
('ED 80 FF', FFFDx2), ('ED 9F 00', FFFD+'\x00'),
1983
('ED 9F 7F', FFFD+'\x7f'), ('ED 9F C0', FFFDx2),
1984
('ED 9F FF', FFFDx2), ('EE 00', FFFD+'\x00'),
1985
('EE 7F', FFFD+'\x7f'), ('EE C0', FFFDx2), ('EE FF', FFFDx2),
1986
('EE 80 00', FFFD+'\x00'), ('EE 80 7F', FFFD+'\x7f'),
1987
('EE 80 C0', FFFDx2), ('EE 80 FF', FFFDx2),
1988
('EE BF 00', FFFD+'\x00'), ('EE BF 7F', FFFD+'\x7f'),
1989
('EE BF C0', FFFDx2), ('EE BF FF', FFFDx2), ('EF 00', FFFD+'\x00'),
1990
('EF 7F', FFFD+'\x7f'), ('EF C0', FFFDx2), ('EF FF', FFFDx2),
1991
('EF 80 00', FFFD+'\x00'), ('EF 80 7F', FFFD+'\x7f'),
1992
('EF 80 C0', FFFDx2), ('EF 80 FF', FFFDx2),
1993
('EF BF 00', FFFD+'\x00'), ('EF BF 7F', FFFD+'\x7f'),
1994
('EF BF C0', FFFDx2), ('EF BF FF', FFFDx2),
1996
for seq, res in sequences:
1997
self.assertCorrectUTF8Decoding(bytes.fromhex(seq), res,
1998
'invalid continuation byte')
2000
def test_invalid_cb_for_4bytes_seq(self):
2002
Test that an 'invalid continuation byte' error is raised when the
2003
continuation byte(s) of a 4-bytes sequence are invalid. When
2004
errors='replace',the start byte and all the following valid
2005
continuation bytes are replaced with a single U+FFFD, and all the bytes
2006
starting from the first invalid continuation bytes (included) are
2008
E.g. in the sequence <E1 80 41>, E1 is the start byte of a 3-bytes
2009
sequence, 80 is a valid continuation byte, but 41 is not a valid cb
2010
because it's the ASCII letter 'A'.
2011
Note: when the start byte is E0 or ED, the valid ranges for the first
2012
continuation byte are limited to A0..BF and 80..9F respectively.
2013
However, when the start byte is ED, Python 2 considers all the bytes
2014
in range 80..BF valid. This is fixed in Python 3.
2019
('F0 00', FFFD+'\x00'), ('F0 7F', FFFD+'\x7f'), ('F0 80', FFFDx2),
2020
('F0 8F', FFFDx2), ('F0 C0', FFFDx2), ('F0 FF', FFFDx2),
2021
('F0 90 00', FFFD+'\x00'), ('F0 90 7F', FFFD+'\x7f'),
2022
('F0 90 C0', FFFDx2), ('F0 90 FF', FFFDx2),
2023
('F0 BF 00', FFFD+'\x00'), ('F0 BF 7F', FFFD+'\x7f'),
2024
('F0 BF C0', FFFDx2), ('F0 BF FF', FFFDx2),
2025
('F0 90 80 00', FFFD+'\x00'), ('F0 90 80 7F', FFFD+'\x7f'),
2026
('F0 90 80 C0', FFFDx2), ('F0 90 80 FF', FFFDx2),
2027
('F0 90 BF 00', FFFD+'\x00'), ('F0 90 BF 7F', FFFD+'\x7f'),
2028
('F0 90 BF C0', FFFDx2), ('F0 90 BF FF', FFFDx2),
2029
('F0 BF 80 00', FFFD+'\x00'), ('F0 BF 80 7F', FFFD+'\x7f'),
2030
('F0 BF 80 C0', FFFDx2), ('F0 BF 80 FF', FFFDx2),
2031
('F0 BF BF 00', FFFD+'\x00'), ('F0 BF BF 7F', FFFD+'\x7f'),
2032
('F0 BF BF C0', FFFDx2), ('F0 BF BF FF', FFFDx2),
2033
('F1 00', FFFD+'\x00'), ('F1 7F', FFFD+'\x7f'), ('F1 C0', FFFDx2),
2034
('F1 FF', FFFDx2), ('F1 80 00', FFFD+'\x00'),
2035
('F1 80 7F', FFFD+'\x7f'), ('F1 80 C0', FFFDx2),
2036
('F1 80 FF', FFFDx2), ('F1 BF 00', FFFD+'\x00'),
2037
('F1 BF 7F', FFFD+'\x7f'), ('F1 BF C0', FFFDx2),
2038
('F1 BF FF', FFFDx2), ('F1 80 80 00', FFFD+'\x00'),
2039
('F1 80 80 7F', FFFD+'\x7f'), ('F1 80 80 C0', FFFDx2),
2040
('F1 80 80 FF', FFFDx2), ('F1 80 BF 00', FFFD+'\x00'),
2041
('F1 80 BF 7F', FFFD+'\x7f'), ('F1 80 BF C0', FFFDx2),
2042
('F1 80 BF FF', FFFDx2), ('F1 BF 80 00', FFFD+'\x00'),
2043
('F1 BF 80 7F', FFFD+'\x7f'), ('F1 BF 80 C0', FFFDx2),
2044
('F1 BF 80 FF', FFFDx2), ('F1 BF BF 00', FFFD+'\x00'),
2045
('F1 BF BF 7F', FFFD+'\x7f'), ('F1 BF BF C0', FFFDx2),
2046
('F1 BF BF FF', FFFDx2), ('F3 00', FFFD+'\x00'),
2047
('F3 7F', FFFD+'\x7f'), ('F3 C0', FFFDx2), ('F3 FF', FFFDx2),
2048
('F3 80 00', FFFD+'\x00'), ('F3 80 7F', FFFD+'\x7f'),
2049
('F3 80 C0', FFFDx2), ('F3 80 FF', FFFDx2),
2050
('F3 BF 00', FFFD+'\x00'), ('F3 BF 7F', FFFD+'\x7f'),
2051
('F3 BF C0', FFFDx2), ('F3 BF FF', FFFDx2),
2052
('F3 80 80 00', FFFD+'\x00'), ('F3 80 80 7F', FFFD+'\x7f'),
2053
('F3 80 80 C0', FFFDx2), ('F3 80 80 FF', FFFDx2),
2054
('F3 80 BF 00', FFFD+'\x00'), ('F3 80 BF 7F', FFFD+'\x7f'),
2055
('F3 80 BF C0', FFFDx2), ('F3 80 BF FF', FFFDx2),
2056
('F3 BF 80 00', FFFD+'\x00'), ('F3 BF 80 7F', FFFD+'\x7f'),
2057
('F3 BF 80 C0', FFFDx2), ('F3 BF 80 FF', FFFDx2),
2058
('F3 BF BF 00', FFFD+'\x00'), ('F3 BF BF 7F', FFFD+'\x7f'),
2059
('F3 BF BF C0', FFFDx2), ('F3 BF BF FF', FFFDx2),
2060
('F4 00', FFFD+'\x00'), ('F4 7F', FFFD+'\x7f'), ('F4 90', FFFDx2),
2061
('F4 BF', FFFDx2), ('F4 C0', FFFDx2), ('F4 FF', FFFDx2),
2062
('F4 80 00', FFFD+'\x00'), ('F4 80 7F', FFFD+'\x7f'),
2063
('F4 80 C0', FFFDx2), ('F4 80 FF', FFFDx2),
2064
('F4 8F 00', FFFD+'\x00'), ('F4 8F 7F', FFFD+'\x7f'),
2065
('F4 8F C0', FFFDx2), ('F4 8F FF', FFFDx2),
2066
('F4 80 80 00', FFFD+'\x00'), ('F4 80 80 7F', FFFD+'\x7f'),
2067
('F4 80 80 C0', FFFDx2), ('F4 80 80 FF', FFFDx2),
2068
('F4 80 BF 00', FFFD+'\x00'), ('F4 80 BF 7F', FFFD+'\x7f'),
2069
('F4 80 BF C0', FFFDx2), ('F4 80 BF FF', FFFDx2),
2070
('F4 8F 80 00', FFFD+'\x00'), ('F4 8F 80 7F', FFFD+'\x7f'),
2071
('F4 8F 80 C0', FFFDx2), ('F4 8F 80 FF', FFFDx2),
2072
('F4 8F BF 00', FFFD+'\x00'), ('F4 8F BF 7F', FFFD+'\x7f'),
2073
('F4 8F BF C0', FFFDx2), ('F4 8F BF FF', FFFDx2)
2075
for seq, res in sequences:
2076
self.assertCorrectUTF8Decoding(bytes.fromhex(seq), res,
2077
'invalid continuation byte')
2079
def test_codecs_idna(self):
2080
# Test whether trailing dot is preserved
2081
self.assertEqual("www.python.org.".encode("idna"), b"www.python.org.")
2083
def test_codecs_errors(self):
2084
# Error handling (encoding)
2085
self.assertRaises(UnicodeError, 'Andr\202 x'.encode, 'ascii')
2086
self.assertRaises(UnicodeError, 'Andr\202 x'.encode, 'ascii','strict')
2087
self.assertEqual('Andr\202 x'.encode('ascii','ignore'), b"Andr x")
2088
self.assertEqual('Andr\202 x'.encode('ascii','replace'), b"Andr? x")
2089
self.assertEqual('Andr\202 x'.encode('ascii', 'replace'),
2090
'Andr\202 x'.encode('ascii', errors='replace'))
2091
self.assertEqual('Andr\202 x'.encode('ascii', 'ignore'),
2092
'Andr\202 x'.encode(encoding='ascii', errors='ignore'))
2094
# Error handling (decoding)
2095
self.assertRaises(UnicodeError, str, b'Andr\202 x', 'ascii')
2096
self.assertRaises(UnicodeError, str, b'Andr\202 x', 'ascii', 'strict')
2097
self.assertEqual(str(b'Andr\202 x', 'ascii', 'ignore'), "Andr x")
2098
self.assertEqual(str(b'Andr\202 x', 'ascii', 'replace'), 'Andr\uFFFD x')
2099
self.assertEqual(str(b'\202 x', 'ascii', 'replace'), '\uFFFD x')
2101
# Error handling (unknown character names)
2102
self.assertEqual(b"\\N{foo}xx".decode("unicode-escape", "ignore"), "xx")
2104
# Error handling (truncated escape sequence)
2105
self.assertRaises(UnicodeError, b"\\".decode, "unicode-escape")
2107
self.assertRaises(TypeError, b"hello".decode, "test.unicode1")
2108
self.assertRaises(TypeError, str, b"hello", "test.unicode2")
2109
self.assertRaises(TypeError, "hello".encode, "test.unicode1")
2110
self.assertRaises(TypeError, "hello".encode, "test.unicode2")
2112
# Error handling (wrong arguments)
2113
self.assertRaises(TypeError, "hello".encode, 42, 42, 42)
2115
# Error handling (lone surrogate in
2116
# _PyUnicode_TransformDecimalAndSpaceToASCII())
2117
self.assertRaises(ValueError, int, "\ud800")
2118
self.assertRaises(ValueError, int, "\udf00")
2119
self.assertRaises(ValueError, float, "\ud800")
2120
self.assertRaises(ValueError, float, "\udf00")
2121
self.assertRaises(ValueError, complex, "\ud800")
2122
self.assertRaises(ValueError, complex, "\udf00")
2124
def test_codecs(self):
2126
self.assertEqual('hello'.encode('ascii'), b'hello')
2127
self.assertEqual('hello'.encode('utf-7'), b'hello')
2128
self.assertEqual('hello'.encode('utf-8'), b'hello')
2129
self.assertEqual('hello'.encode('utf-8'), b'hello')
2130
self.assertEqual('hello'.encode('utf-16-le'), b'h\000e\000l\000l\000o\000')
2131
self.assertEqual('hello'.encode('utf-16-be'), b'\000h\000e\000l\000l\000o')
2132
self.assertEqual('hello'.encode('latin-1'), b'hello')
2134
# Default encoding is utf-8
2135
self.assertEqual('\u2603'.encode(), b'\xe2\x98\x83')
2137
# Roundtrip safety for BMP (just the first 1024 chars)
2138
for c in range(1024):
2140
for encoding in ('utf-7', 'utf-8', 'utf-16', 'utf-16-le',
2141
'utf-16-be', 'raw_unicode_escape',
2143
self.assertEqual(str(u.encode(encoding),encoding), u)
2145
# Roundtrip safety for BMP (just the first 256 chars)
2146
for c in range(256):
2148
for encoding in ('latin-1',):
2149
self.assertEqual(str(u.encode(encoding),encoding), u)
2151
# Roundtrip safety for BMP (just the first 128 chars)
2152
for c in range(128):
2154
for encoding in ('ascii',):
2155
self.assertEqual(str(u.encode(encoding),encoding), u)
2157
# Roundtrip safety for non-BMP (just a few chars)
2158
with warnings.catch_warnings():
2159
u = '\U00010001\U00020002\U00030003\U00040004\U00050005'
2160
for encoding in ('utf-8', 'utf-16', 'utf-16-le', 'utf-16-be',
2161
'raw_unicode_escape', 'unicode_escape'):
2162
self.assertEqual(str(u.encode(encoding),encoding), u)
2164
# UTF-8 must be roundtrip safe for all code points
2165
# (except surrogates, which are forbidden).
2166
u = ''.join(map(chr, list(range(0, 0xd800)) +
2167
list(range(0xe000, 0x110000))))
2168
for encoding in ('utf-8',):
2169
self.assertEqual(str(u.encode(encoding),encoding), u)
2171
def test_codecs_charmap(self):
2173
s = bytes(range(128))
2175
'cp037', 'cp1026', 'cp273',
2176
'cp437', 'cp500', 'cp720', 'cp737', 'cp775', 'cp850',
2177
'cp852', 'cp855', 'cp858', 'cp860', 'cp861', 'cp862',
2178
'cp863', 'cp865', 'cp866', 'cp1125',
2179
'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
2180
'iso8859_2', 'iso8859_3', 'iso8859_4', 'iso8859_5', 'iso8859_6',
2181
'iso8859_7', 'iso8859_9',
2182
'koi8_r', 'koi8_t', 'koi8_u', 'kz1048', 'latin_1',
2183
'mac_cyrillic', 'mac_latin2',
2185
'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
2186
'cp1256', 'cp1257', 'cp1258',
2187
'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
2189
'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
2190
'cp1006', 'iso8859_8',
2192
### These have undefined mappings:
2195
### These fail the round-trip:
2199
self.assertEqual(str(s, encoding).encode(encoding), s)
2202
s = bytes(range(128, 256))
2204
'cp037', 'cp1026', 'cp273',
2205
'cp437', 'cp500', 'cp720', 'cp737', 'cp775', 'cp850',
2206
'cp852', 'cp855', 'cp858', 'cp860', 'cp861', 'cp862',
2207
'cp863', 'cp865', 'cp866', 'cp1125',
2208
'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
2209
'iso8859_2', 'iso8859_4', 'iso8859_5',
2210
'iso8859_9', 'koi8_r', 'koi8_u', 'latin_1',
2211
'mac_cyrillic', 'mac_latin2',
2213
### These have undefined mappings:
2214
#'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
2215
#'cp1256', 'cp1257', 'cp1258',
2216
#'cp424', 'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
2217
#'iso8859_3', 'iso8859_6', 'iso8859_7', 'koi8_t', 'kz1048',
2218
#'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
2220
### These fail the round-trip:
2221
#'cp1006', 'cp875', 'iso8859_8',
2224
self.assertEqual(str(s, encoding).encode(encoding), s)
2226
def test_concatenation(self):
2227
self.assertEqual(("abc" "def"), "abcdef")
2228
self.assertEqual(("abc" "def"), "abcdef")
2229
self.assertEqual(("abc" "def"), "abcdef")
2230
self.assertEqual(("abc" "def" "ghi"), "abcdefghi")
2231
self.assertEqual(("abc" "def" "ghi"), "abcdefghi")
2233
def test_printing(self):
2235
def write(self, text):
2239
print('abc', file=out)
2240
print('abc', 'def', file=out)
2241
print('abc', 'def', file=out)
2242
print('abc', 'def', file=out)
2243
print('abc\n', file=out)
2244
print('abc\n', end=' ', file=out)
2245
print('abc\n', end=' ', file=out)
2246
print('def\n', file=out)
2247
print('def\n', file=out)
2249
def test_ucs4(self):
2251
y = x.encode("raw-unicode-escape").decode("raw-unicode-escape")
2252
self.assertEqual(x, y)
2255
x = y.decode("raw-unicode-escape").encode("raw-unicode-escape")
2256
self.assertEqual(x, y)
2258
x = y.decode("raw-unicode-escape").encode("raw-unicode-escape")
2259
self.assertEqual(x, y)
2262
br'\U11111111'.decode("raw-unicode-escape")
2263
except UnicodeDecodeError as e:
2264
self.assertEqual(e.start, 0)
2265
self.assertEqual(e.end, 10)
2267
self.fail("Should have raised UnicodeDecodeError")
2269
def test_conversion(self):
2270
# Make sure __str__() works properly
2275
class StrSubclassToStr(str):
2279
class StrSubclassToStrSubclass(str):
2280
def __new__(cls, content=""):
2281
return str.__new__(cls, 2*content)
2285
self.assertEqual(str(ObjectToStr()), "foo")
2286
self.assertEqual(str(StrSubclassToStr("bar")), "foo")
2287
s = str(StrSubclassToStrSubclass("foo"))
2288
self.assertEqual(s, "foofoo")
2289
self.assertIs(type(s), StrSubclassToStrSubclass)
2290
s = StrSubclass(StrSubclassToStrSubclass("foo"))
2291
self.assertEqual(s, "foofoo")
2292
self.assertIs(type(s), StrSubclass)
2294
def test_unicode_repr(self):
2303
self.assertEqual(repr(s1()), '\\n')
2304
self.assertEqual(repr(s2()), '\\n')
2306
def test_printable_repr(self):
2307
self.assertEqual(repr('\U00010000'), "'%c'" % (0x10000,)) # printable
2308
self.assertEqual(repr('\U00014000'), "'\\U00014000'") # nonprintable
2310
# This test only affects 32-bit platforms because expandtabs can only take
2311
# an int as the max value, not a 64-bit C long. If expandtabs is changed
2312
# to take a 64-bit long, this test should apply to all platforms.
2313
@unittest.skipIf(sys.maxsize > (1 << 32) or struct.calcsize('P') != 4,
2314
'only applies to 32-bit platforms')
2315
def test_expandtabs_overflows_gracefully(self):
2316
self.assertRaises(OverflowError, 't\tt\t'.expandtabs, sys.maxsize)
2318
@support.cpython_only
2319
def test_expandtabs_optimization(self):
2321
self.assertIs(s.expandtabs(), s)
2324
# Not useful for Cython: struct sizes change between versions
2325
# so it's hard to keep reliably up-to-date, and it's largely checking
2326
# a CPython implementation detail
2327
def test_raiseMemError(self):
2328
if struct.calcsize('P') == 8:
2330
ascii_struct_size = 48
2331
compact_struct_size = 72
2334
ascii_struct_size = 24
2335
compact_struct_size = 36
2337
for char in ('a', '\xe9', '\u20ac', '\U0010ffff'):
2340
char_size = 1 # sizeof(Py_UCS1)
2341
struct_size = ascii_struct_size
2342
elif code < 0x10000:
2343
char_size = 2 # sizeof(Py_UCS2)
2344
struct_size = compact_struct_size
2346
char_size = 4 # sizeof(Py_UCS4)
2347
struct_size = compact_struct_size
2348
# Note: sys.maxsize is half of the actual max allocation because of
2349
# the signedness of Py_ssize_t. Strings of maxlen-1 should in principle
2350
# be allocatable, given enough memory.
2351
maxlen = ((sys.maxsize - struct_size) // char_size)
2352
alloc = lambda: char * maxlen
2353
self.assertRaises(MemoryError, alloc)
2354
self.assertRaises(MemoryError, alloc)
2357
def test_format_subclass(self):
2360
return '__str__ overridden'
2362
self.assertEqual("%s" % s, '__str__ overridden')
2363
self.assertEqual("{}".format(s), '__str__ overridden')
2365
def test_subclass_add(self):
2367
def __add__(self, o):
2369
self.assertEqual(S("4") + S("5"), "3")
2371
def __iadd__(self, o):
2375
self.assertEqual(s, "3")
2377
def _test_getnewargs(self):
2379
args = text.__getnewargs__()
2380
self.assertIsNot(args[0], text)
2381
self.assertEqual(args[0], text)
2382
self.assertEqual(len(args), 1)
2384
@unittest.skipIf(sys.version_info < (3, 8), 'resize test requires Py3.8+')
2385
@support.cpython_only
2386
def test_resize(self):
2387
from _testcapi import getargs_u
2388
for length in range(1, 100, 7):
2389
# generate a fresh string (refcount=1)
2390
text = 'a' * length + 'b'
2392
# fill wstr internal field
2393
abc = getargs_u(text)
2394
self.assertEqual(abc, text)
2396
# resize text: wstr field must be cleared and then recomputed
2398
abcdef = getargs_u(text)
2399
self.assertNotEqual(abc, abcdef)
2400
self.assertEqual(abcdef, text)
2402
def test_compare(self):
2411
astral = '\U00100000' * N
2412
astral2 = '\U0010ffff' * N
2418
for text1, text2 in itertools.combinations(strings, 2):
2419
equal = (text1 is text2)
2420
self.assertEqual(text1 == text2, equal)
2421
self.assertEqual(text1 != text2, not equal)
2424
self.assertTrue(text1 <= text2)
2425
self.assertTrue(text1 >= text2)
2427
# text1 is text2: duplicate strings to skip the "str1 == str2"
2428
# optimization in unicode_compare_eq() and really compare
2429
# character per character
2430
copy1 = duplicate_string(text1)
2431
copy2 = duplicate_string(text2)
2432
self.assertIsNot(copy1, copy2)
2434
self.assertTrue(copy1 == copy2)
2435
self.assertFalse(copy1 != copy2)
2437
self.assertTrue(copy1 <= copy2)
2438
self.assertTrue(copy2 >= copy2)
2440
self.assertTrue(ascii < ascii2)
2441
self.assertTrue(ascii < latin)
2442
self.assertTrue(ascii < bmp)
2443
self.assertTrue(ascii < astral)
2444
self.assertFalse(ascii >= ascii2)
2445
self.assertFalse(ascii >= latin)
2446
self.assertFalse(ascii >= bmp)
2447
self.assertFalse(ascii >= astral)
2449
self.assertFalse(latin < ascii)
2450
self.assertTrue(latin < latin2)
2451
self.assertTrue(latin < bmp)
2452
self.assertTrue(latin < astral)
2453
self.assertTrue(latin >= ascii)
2454
self.assertFalse(latin >= latin2)
2455
self.assertFalse(latin >= bmp)
2456
self.assertFalse(latin >= astral)
2458
self.assertFalse(bmp < ascii)
2459
self.assertFalse(bmp < latin)
2460
self.assertTrue(bmp < bmp2)
2461
self.assertTrue(bmp < astral)
2462
self.assertTrue(bmp >= ascii)
2463
self.assertTrue(bmp >= latin)
2464
self.assertFalse(bmp >= bmp2)
2465
self.assertFalse(bmp >= astral)
2467
self.assertFalse(astral < ascii)
2468
self.assertFalse(astral < latin)
2469
self.assertFalse(astral < bmp2)
2470
self.assertTrue(astral < astral2)
2471
self.assertTrue(astral >= ascii)
2472
self.assertTrue(astral >= latin)
2473
self.assertTrue(astral >= bmp2)
2474
self.assertFalse(astral >= astral2)
2476
def test_free_after_iterating(self):
    """Run support.check_free_after_iterating() on str for iter() and reversed()."""
    for make_iterator in (iter, reversed):
        support.check_free_after_iterating(self, make_iterator, str)
2482
class CAPITest(unittest.TestCase):
2484
# Test PyUnicode_FromFormat()
2485
def test_from_format(self):
    # Exercise the C function PyUnicode_FromFormat() through ctypes.  Each
    # check_format() call formats `format` with the given C-level arguments
    # and compares the resulting str with the expected text.
    #
    # NOTE(review): argument lines and padded expected strings that were
    # missing or whitespace-collapsed in this copy of the test have been
    # restored -- confirm against the upstream CPython test.
    support.import_module('ctypes')
    from ctypes import (
        pythonapi, py_object, sizeof,
        c_int, c_long, c_longlong, c_ssize_t,
        c_uint, c_ulong, c_ulonglong, c_size_t, c_void_p)
    name = "PyUnicode_FromFormat"
    _PyUnicode_FromFormat = getattr(pythonapi, name)
    _PyUnicode_FromFormat.restype = py_object

    def PyUnicode_FromFormat(format, *args):
        # str arguments are wrapped in py_object; every other argument
        # (bytes, ctypes integers, None) is forwarded unchanged.
        cargs = tuple(
            py_object(arg) if isinstance(arg, str) else arg
            for arg in args)
        return _PyUnicode_FromFormat(format, *cargs)

    def check_format(expected, format, *args):
        text = PyUnicode_FromFormat(format, *args)
        self.assertEqual(expected, text)

    # ascii format, non-ascii argument
    check_format('ascii\x7f=unicode\xe9',
                 b'ascii\x7f=%U', 'unicode\xe9')

    # non-ascii format, ascii argument: ensure that PyUnicode_FromFormatV()
    # raises an error
    self.assertRaisesRegex(ValueError,
        r'^PyUnicode_FromFormatV\(\) expects an ASCII-encoded format '
        'string, got a non-ASCII byte: 0xe9$',
        PyUnicode_FromFormat, b'unicode\xe9=%s', 'ascii')

    # test "%c"
    check_format('\uabcd',
                 b'%c', c_int(0xabcd))
    check_format('\U0010ffff',
                 b'%c', c_int(0x10ffff))
    with self.assertRaises(OverflowError):
        PyUnicode_FromFormat(b'%c', c_int(0x110000))
    # Issue #18183
    check_format('\U00010000\U00100000',
                 b'%c%c', c_int(0x10000), c_int(0x100000))

    # test "%"
    check_format('%',
                 b'%')
    check_format('%',
                 b'%%')
    check_format('%s',
                 b'%%s')
    check_format('[%]',
                 b'[%%]')
    check_format('%abc',
                 b'%%%s', b'abc')

    # truncated string
    check_format('abc',
                 b'%.3s', b'abcdef')
    check_format('abc[\ufffd',
                 b'%.5s', 'abc[\u20ac]'.encode('utf8'))
    check_format("'\\u20acABC'",
                 b'%A', '\u20acABC')
    check_format("'\\u20",
                 b'%.5A', '\u20acABCDEF')
    check_format("'\u20acABC'",
                 b'%R', '\u20acABC')
    check_format("'\u20acA",
                 b'%.3R', '\u20acABCDEF')
    check_format('\u20acAB',
                 b'%.3S', '\u20acABCDEF')
    check_format('\u20acAB',
                 b'%.3U', '\u20acABCDEF')
    check_format('\u20acAB',
                 b'%.3V', '\u20acABCDEF', None)
    check_format('abc[\ufffd',
                 b'%.5V', None, 'abc[\u20ac]'.encode('utf8'))

    # The following tests come from issue #7330.
    # A width pads the value on the left with spaces, so the expected
    # strings below contain runs of spaces that must be kept exact.

    # test width modifier and precision modifier with %S
    check_format("repr=  abc",
                 b'repr=%5S', 'abc')
    check_format("repr=ab",
                 b'repr=%.2S', 'abc')
    check_format("repr=   ab",
                 b'repr=%5.2S', 'abc')

    # test width modifier and precision modifier with %R
    check_format("repr=   'abc'",
                 b'repr=%8R', 'abc')
    check_format("repr='ab",
                 b'repr=%.3R', 'abc')
    check_format("repr=  'ab",
                 b'repr=%5.3R', 'abc')

    # test width modifier and precision modifier with %A
    check_format("repr=   'abc'",
                 b'repr=%8A', 'abc')
    check_format("repr='ab",
                 b'repr=%.3A', 'abc')
    check_format("repr=  'ab",
                 b'repr=%5.3A', 'abc')

    # test width modifier and precision modifier with %s
    check_format("repr=  abc",
                 b'repr=%5s', b'abc')
    check_format("repr=ab",
                 b'repr=%.2s', b'abc')
    check_format("repr=   ab",
                 b'repr=%5.2s', b'abc')

    # test width modifier and precision modifier with %U
    check_format("repr=  abc",
                 b'repr=%5U', 'abc')
    check_format("repr=ab",
                 b'repr=%.2U', 'abc')
    check_format("repr=   ab",
                 b'repr=%5.2U', 'abc')

    # test width modifier and precision modifier with %V
    check_format("repr=  abc",
                 b'repr=%5V', 'abc', b'123')
    check_format("repr=ab",
                 b'repr=%.2V', 'abc', b'123')
    check_format("repr=   ab",
                 b'repr=%5.2V', 'abc', b'123')
    check_format("repr=  123",
                 b'repr=%5V', None, b'123')
    check_format("repr=12",
                 b'repr=%.2V', None, b'123')
    check_format("repr=   12",
                 b'repr=%5.2V', None, b'123')

    # test integer formats (%i, %d, %u)
    check_format('010',
                 b'%03i', c_int(10))
    check_format('0010',
                 b'%0.4i', c_int(10))
    check_format('-123',
                 b'%i', c_int(-123))
    check_format('-123',
                 b'%li', c_long(-123))
    check_format('-123',
                 b'%lli', c_longlong(-123))
    check_format('-123',
                 b'%zi', c_ssize_t(-123))

    check_format('-123',
                 b'%d', c_int(-123))
    check_format('-123',
                 b'%ld', c_long(-123))
    check_format('-123',
                 b'%lld', c_longlong(-123))
    check_format('-123',
                 b'%zd', c_ssize_t(-123))

    check_format('123',
                 b'%u', c_uint(123))
    check_format('123',
                 b'%lu', c_ulong(123))
    check_format('123',
                 b'%llu', c_ulonglong(123))
    check_format('123',
                 b'%zu', c_size_t(123))

    # test long output
    min_longlong = -(2 ** (8 * sizeof(c_longlong) - 1))
    max_longlong = -min_longlong - 1
    check_format(str(min_longlong),
                 b'%lld', c_longlong(min_longlong))
    check_format(str(max_longlong),
                 b'%lld', c_longlong(max_longlong))
    max_ulonglong = 2 ** (8 * sizeof(c_ulonglong)) - 1
    check_format(str(max_ulonglong),
                 b'%llu', c_ulonglong(max_ulonglong))
    # Only checks that formatting a pointer does not fail; the result is
    # not compared with anything.
    PyUnicode_FromFormat(b'%p', c_void_p(-1))

    # test padding (width and/or precision)
    check_format('123'.rjust(10, '0'),
                 b'%010i', c_int(123))
    check_format('123'.rjust(100),
                 b'%100i', c_int(123))
    check_format('123'.rjust(100, '0'),
                 b'%.100i', c_int(123))
    check_format('123'.rjust(80, '0').rjust(100),
                 b'%100.80i', c_int(123))

    check_format('123'.rjust(10, '0'),
                 b'%010u', c_uint(123))
    check_format('123'.rjust(100),
                 b'%100u', c_uint(123))
    check_format('123'.rjust(100, '0'),
                 b'%.100u', c_uint(123))
    check_format('123'.rjust(80, '0').rjust(100),
                 b'%100.80u', c_uint(123))

    check_format('123'.rjust(10, '0'),
                 b'%010x', c_int(0x123))
    check_format('123'.rjust(100),
                 b'%100x', c_int(0x123))
    check_format('123'.rjust(100, '0'),
                 b'%.100x', c_int(0x123))
    check_format('123'.rjust(80, '0').rjust(100),
                 b'%100.80x', c_int(0x123))

    # test %A
    check_format(r"%A:'abc\xe9\uabcd\U0010ffff'",
                 b'%%A:%A', 'abc\xe9\uabcd\U0010ffff')

    # test %V
    check_format('repr=abc',
                 b'repr=%V', 'abc', b'xyz')

    # Test string decode from parameter of %s using utf-8.
    # b'\xe4\xba\xba\xe6\xb0\x91' is utf-8 encoded byte sequence of
    # '\u4eba\u6c11'
    check_format('repr=\u4eba\u6c11',
                 b'repr=%V', None, b'\xe4\xba\xba\xe6\xb0\x91')

    # Test replace error handler.
    check_format('repr=abc\ufffd',
                 b'repr=%V', None, b'abc\xff')

    # not supported: copy the raw format string. these tests are just here
    # to check for crashes and should not be considered as specifications
    check_format('%s',
                 b'%1%s', b'abc')
    check_format('%1abc',
                 b'%1abc')
    check_format('%+i',
                 b'%+i', c_int(10))
    check_format('%.%s',
                 b'%.%s', b'abc')

    # Issue #33817: empty strings
    check_format('',
                 b'')
    check_format('',
                 b'%s', b'')
2723
# Test PyUnicode_AsWideChar()
2724
@support.cpython_only
def test_aswidechar(self):
    # unicode_aswidechar(text, buflen) returns a (buffer, size) pair; the
    # assertions pin both for a truncating buffer, an exactly sized one
    # and oversized ones.
    from _testcapi import unicode_aswidechar
    support.import_module('ctypes')
    from ctypes import c_wchar, sizeof

    wchar, size = unicode_aswidechar('abcdef', 2)
    self.assertEqual(size, 2)
    self.assertEqual(wchar, 'ab')

    wchar, size = unicode_aswidechar('abc', 3)
    self.assertEqual(size, 3)
    self.assertEqual(wchar, 'abc')

    wchar, size = unicode_aswidechar('abc', 4)
    self.assertEqual(size, 3)
    self.assertEqual(wchar, 'abc\0')

    wchar, size = unicode_aswidechar('abc', 10)
    self.assertEqual(size, 3)
    self.assertEqual(wchar, 'abc\0')

    wchar, size = unicode_aswidechar('abc\0def', 20)
    self.assertEqual(size, 7)
    self.assertEqual(wchar, 'abc\0def\0')

    nonbmp = chr(0x10ffff)
    if sizeof(c_wchar) == 2:
        # 16-bit wchar_t: a non-BMP character takes two units (surrogate
        # pair); one more slot is needed for the trailing null that the
        # final assertion expects.
        buflen = 3
        nchar = 2
    else: # sizeof(c_wchar) == 4
        # 32-bit wchar_t: one unit for the character plus the null.
        buflen = 2
        nchar = 1
    wchar, size = unicode_aswidechar(nonbmp, buflen)
    self.assertEqual(size, nchar)
    self.assertEqual(wchar, nonbmp + '\0')
2761
# Test PyUnicode_AsWideCharString()
2762
@support.cpython_only
def test_aswidecharstring(self):
    # unicode_aswidecharstring(text) returns a (buffer, size) pair; the
    # buffer is expected to end with a null that size does not count.
    from _testcapi import unicode_aswidecharstring
    support.import_module('ctypes')
    from ctypes import c_wchar, sizeof

    wchar, size = unicode_aswidecharstring('abc')
    self.assertEqual(size, 3)
    self.assertEqual(wchar, 'abc\0')

    wchar, size = unicode_aswidecharstring('abc\0def')
    self.assertEqual(size, 7)
    self.assertEqual(wchar, 'abc\0def\0')

    nonbmp = chr(0x10ffff)
    if sizeof(c_wchar) == 2:
        # 16-bit wchar_t: a non-BMP character is a surrogate pair.
        nchar = 2
    else: # sizeof(c_wchar) == 4
        nchar = 1
    wchar, size = unicode_aswidecharstring(nonbmp)
    self.assertEqual(size, nchar)
    self.assertEqual(wchar, nonbmp + '\0')
2785
# Test PyUnicode_AsUCS4()
2786
@support.cpython_only
def test_asucs4(self):
    # For several sample strings, check unicode_asucs4() with buffers that
    # are exactly as long as the string, one longer, and too short
    # (SystemError), for both values (1 and 0) of the last argument.
    from _testcapi import unicode_asucs4
    for s in ['abc', '\xa1\xa2', '\u4f60\u597d', 'a\U0001f600',
              'a\ud800b\udfffc', '\ud834\udd1e']:
        length = len(s)
        self.assertEqual(unicode_asucs4(s, length, 1), s+'\0')
        self.assertEqual(unicode_asucs4(s, length, 0), s+'\uffff')
        self.assertEqual(unicode_asucs4(s, length+1, 1), s+'\0\uffff')
        self.assertEqual(unicode_asucs4(s, length+1, 0), s+'\0\uffff')
        self.assertRaises(SystemError, unicode_asucs4, s, length-1, 1)
        self.assertRaises(SystemError, unicode_asucs4, s, length-2, 0)
        # Same checks on a string with an embedded null character.
        s = '\0'.join([s, s])
        self.assertEqual(unicode_asucs4(s, len(s), 1), s+'\0')
        self.assertEqual(unicode_asucs4(s, len(s), 0), s+'\uffff')
2802
# Test PyUnicode_FindChar()
2803
@support.cpython_only
def test_findchar(self):
    # Every lookup is made twice, with the final argument of
    # unicode_findchar() set to 1 and to -1.
    from _testcapi import unicode_findchar

    # `text` is used instead of `str` so the builtin is not shadowed.
    for text in "\xa1", "\u8000\u8080", "\ud800\udc02", "\U0001f100\U0001f1f1":
        for i, ch in enumerate(text):
            self.assertEqual(unicode_findchar(text, ord(ch), 0, len(text), 1), i)
            self.assertEqual(unicode_findchar(text, ord(ch), 0, len(text), -1), i)

    # NOTE(review): the assertions below need '!' at indexes 0 and 4 only;
    # the literal was restored to match that -- confirm against upstream.
    text = "!>_<!"
    self.assertEqual(unicode_findchar(text, 0x110000, 0, len(text), 1), -1)
    self.assertEqual(unicode_findchar(text, 0x110000, 0, len(text), -1), -1)
    # start < end
    self.assertEqual(unicode_findchar(text, ord('!'), 1, len(text)+1, 1), 4)
    self.assertEqual(unicode_findchar(text, ord('!'), 1, len(text)+1, -1), 4)
    # start >= end
    self.assertEqual(unicode_findchar(text, ord('!'), 0, 0, 1), -1)
    self.assertEqual(unicode_findchar(text, ord('!'), len(text), 0, 1), -1)
    # negative
    self.assertEqual(unicode_findchar(text, ord('!'), -len(text), -1, 1), 0)
    self.assertEqual(unicode_findchar(text, ord('!'), -len(text), -1, -1), 0)
2825
# Test PyUnicode_CopyCharacters()
2826
@support.cpython_only
def test_copycharacters(self):
    # Called as unicode_copycharacters(to, to_start, from_, from_start,
    # how_many); the result is compared with a (string, count) pair.
    # NOTE(review): the `strings` list wrapper, the assert call heads and
    # the expected counts were restored -- confirm against upstream.
    from _testcapi import unicode_copycharacters

    # Five-character samples, ordered from narrowest to widest characters.
    strings = [
        'abcde', '\xa1\xa2\xa3\xa4\xa5',
        '\u4f60\u597d\u4e16\u754c\uff01',
        '\U0001f600\U0001f601\U0001f602\U0001f603\U0001f604'
    ]

    for idx, from_ in enumerate(strings):
        # wide -> narrow: exceed maxchar limitation
        for to in strings[:idx]:
            self.assertRaises(
                SystemError,
                unicode_copycharacters, to, 0, from_, 0, 5
            )
        # same kind
        for from_start in range(5):
            self.assertEqual(
                unicode_copycharacters(from_, 0, from_, from_start, 5),
                (from_[from_start:from_start+5].ljust(5, '\0'),
                 5-from_start)
            )
        for to_start in range(5):
            self.assertEqual(
                unicode_copycharacters(from_, to_start, from_, to_start, 5),
                (from_[to_start:to_start+5].rjust(5, '\0'),
                 5-to_start)
            )

    # narrow -> wide
    # Tests omitted since this creates invalid strings.

    s = strings[0]
    self.assertRaises(IndexError, unicode_copycharacters, s, 6, s, 0, 5)
    self.assertRaises(IndexError, unicode_copycharacters, s, -1, s, 0, 5)
    self.assertRaises(IndexError, unicode_copycharacters, s, 0, s, 6, 5)
    self.assertRaises(IndexError, unicode_copycharacters, s, 0, s, -1, 5)
    self.assertRaises(SystemError, unicode_copycharacters, s, 1, s, 0, 5)
    self.assertRaises(SystemError, unicode_copycharacters, s, 0, s, 0, -1)
    self.assertRaises(SystemError, unicode_copycharacters, s, 0, b'', 0, 0)
2868
@support.cpython_only
def test_encode_decimal(self):
    # NOTE(review): the expected byte strings and the ValueError class
    # were restored -- confirm against upstream.
    from _testcapi import unicode_encodedecimal
    self.assertEqual(unicode_encodedecimal('123'),
                     b'123')
    # Arabic-Indic digits are expected to come out as ASCII digits.
    self.assertEqual(unicode_encodedecimal('\u0663.\u0661\u0664'),
                     b'3.14')
    # Unicode spaces are expected to come out as ASCII spaces.
    self.assertEqual(unicode_encodedecimal("\N{EM SPACE}3.14\N{EN SPACE}"),
                     b' 3.14 ')
    self.assertRaises(UnicodeEncodeError,
                      unicode_encodedecimal, "123\u20ac", "strict")
    self.assertRaisesRegex(
        ValueError,
        "^'decimal' codec can't encode character",
        unicode_encodedecimal, "123\u20ac", "replace")
2884
@support.cpython_only
def test_transform_decimal(self):
    # NOTE(review): the expected strings on the first, second and last
    # assertions were restored -- confirm against upstream.
    from _testcapi import unicode_transformdecimaltoascii as transform_decimal
    self.assertEqual(transform_decimal('123'),
                     '123')
    # Arabic-Indic digits are expected to become ASCII digits.
    self.assertEqual(transform_decimal('\u0663.\u0661\u0664'),
                     '3.14')
    # Spaces and other non-digit characters are expected to pass through.
    self.assertEqual(transform_decimal("\N{EM SPACE}3.14\N{EN SPACE}"),
                     "\N{EM SPACE}3.14\N{EN SPACE}")
    self.assertEqual(transform_decimal('123\u20ac'),
                     '123\u20ac')
2896
@support.cpython_only
def test_pep393_utf8_caching_bug(self):
    # Issue #25709: Problem with string concatenation and utf-8 cache
    from _testcapi import getargs_s_hash
    # One sample code point per UTF-8 length (1 to 4 bytes).
    for k in 0x24, 0xa4, 0x20ac, 0x1f40d:
        s = ''
        for i in range(5):
            # Due to CPython specific optimization the 's' string can be
            # resized in-place.
            s += chr(k)
            # Parsing with the "s#" format code calls indirectly
            # PyUnicode_AsUTF8AndSize() which creates the UTF-8
            # encoded string cached in the Unicode object.
            self.assertEqual(getargs_s_hash(s), chr(k).encode() * (i + 1))
            # Check that the second call returns the same result
            self.assertEqual(getargs_s_hash(s), chr(k).encode() * (i + 1))
2916
class StringModuleTest(unittest.TestCase):
    # Tests for _string.formatter_parser() and
    # _string.formatter_field_name_split().

    def test_formatter_parser(self):
        # Each parsed item is compared as a 4-tuple:
        # (literal text, field name, format spec, conversion).
        def parse(format):
            return list(_string.formatter_parser(format))

        formatter = parse("prefix {2!s}xxx{0:^+10.3f}{obj.attr!s} {z[0]!s:10}")
        self.assertEqual(formatter, [
            ('prefix ', '2', '', 's'),
            ('xxx', '0', '^+10.3f', None),
            ('', 'obj.attr', '', 's'),
            (' ', 'z[0]', '10', 's'),
        ])

        formatter = parse("prefix {} suffix")
        self.assertEqual(formatter, [
            ('prefix ', '', '', None),
            (' suffix', None, None, None),
        ])

        formatter = parse("str")
        self.assertEqual(formatter, [
            ('str', None, None, None),
        ])

        formatter = parse("")
        self.assertEqual(formatter, [])

        formatter = parse("{0}")
        self.assertEqual(formatter, [
            ('', '0', '', None),
        ])

        # A non-string argument is rejected.
        self.assertRaises(TypeError, _string.formatter_parser, 1)

    def test_formatter_field_name_split(self):
        def split(name):
            # The second element of the result is an iterator; turn it
            # into a list so it can be compared with ==.
            items = list(_string.formatter_field_name_split(name))
            items[1] = list(items[1])
            return items

        # In the (flag, key) pairs the flag is True for ".attr" access and
        # False for "[key]" access.
        self.assertEqual(split("obj"), ["obj", []])
        self.assertEqual(split("obj.arg"), ["obj", [(True, 'arg')]])
        self.assertEqual(split("obj[key]"), ["obj", [(False, 'key')]])
        self.assertEqual(split("obj.arg[key1][key2]"), [
            "obj",
            [(True, 'arg'),
             (False, 'key1'),
             (False, 'key2'),
            ]])
        # A non-string argument is rejected.
        self.assertRaises(TypeError, _string.formatter_field_name_split, 1)
2968
if __name__ == "__main__":