# Language: Cython
#
# File: test_unicode.pyx
# 2969 lines · 130.3 KB
# (code-viewer option: word wrap)
1
# cython: language_level=3
2

3
""" Test script for the Unicode implementation.
4

5
Written by Marc-Andre Lemburg (mal@lemburg.com).
6

7
(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
8

9
"""
10
#import _string
11
import codecs
12
import itertools
13
import operator
14
#import struct
15
#import sys
16
#import unittest
17
import warnings
18
# from test import support, string_tests
19
from contextlib import contextmanager
20

21

22
class support(object):
    """Stand-in for the ``test.support`` helpers used by the tests below.

    The ``from test import support`` import is commented out in this file,
    so this class supplies the few names the tests reference.  The class is
    replaced by a single instance of itself right after its definition.
    """

    @staticmethod
    def _ignore(func):
        """Return *func* wrapped so that unittest reports it as skipped."""
        return unittest.skip("Ignoring CPython-only test")(func)

    def run_with_locale(*args):
        """Ignore every argument and return the skipping decorator."""
        # ``support`` is the module-level instance by the time this runs.
        return support._ignore

    # Tests decorated with ``cpython_only`` are skipped via ``_ignore``.
    cpython_only = _ignore

    def check_free_after_iterating(*args):
        """Accept any arguments and do nothing."""

    @contextmanager
    def check_warnings(*args):
        """Context manager that accepts any arguments and yields ``None``."""
        yield  # ignore any warnings


support = support()
40

41
include "test_unicode_string_tests.pxi"
42

43

44
############### ORIGINAL TESTS START HERE #################
45

46

47
# Error handling (bad decoder return)
48
def search_function(encoding):
    """Codec search hook serving two deliberately broken test codecs.

    Returns an ``(encode, decode, None, None)`` tuple for the names
    ``"test.unicode1"`` (both callables return ``42`` instead of a tuple)
    and ``"test.unicode2"`` (both callables return ``(42, 42)``, i.e. a
    tuple whose first item is not a string).  Any other name yields
    ``None``.
    """
    def encode1(input, errors="strict"):
        return 42  # not a tuple

    def decode1(input, errors="strict"):
        return 42  # not a tuple

    def encode2(input, errors="strict"):
        return (42, 42)  # no unicode

    def decode2(input, errors="strict"):
        return (42, 42)  # no unicode

    if encoding == "test.unicode1":
        return (encode1, decode1, None, None)
    if encoding == "test.unicode2":
        return (encode2, decode2, None, None)
    return None


# Make the broken codecs reachable through the normal codec lookup.
codecs.register(search_function)
64

65
def duplicate_string(text):
    """
    Try to get a fresh clone of the specified text:
    new object with a reference count of 1.

    This is a best-effort: latin1 single letters and the empty
    string ('') are singletons and cannot be cloned.
    """
    # Round-trip through bytes (default encoding) to build a new str.
    encoded = text.encode()
    return encoded.decode()
74

75
class StrSubclass(str):
    """Plain ``str`` subclass that adds nothing to ``str``."""
77

78
class UnicodeTest(CommonTest,
        MixinStrUnicodeUserStringTest,
        MixinStrUnicodeTest,
        unittest.TestCase):
    # The base classes other than unittest.TestCase are not defined in the
    # visible part of this file -- presumably they come from the included
    # test_unicode_string_tests.pxi (TODO confirm).

    # Type used by tests that call ``self.type2test`` (e.g. maketrans).
    type2test = str
84

85
    def checkequalnofix(self, result, object, methodname, *args):
        """Check ``object.methodname(*args)`` for both value and exact type.

        The call result must compare equal to *result* and have exactly the
        same type.  If the method returned *object* itself, the call is
        repeated on an instance of a local ``str`` subclass and must then
        return an equal value that is a different object.
        """
        realresult = getattr(object, methodname)(*args)
        self.assertEqual(realresult, result)
        self.assertIs(type(realresult), type(result))

        # if the original is returned make sure that
        # this doesn't happen with subclasses
        if realresult is object:
            class usub(str):
                def __repr__(self):
                    return 'usub(%r)' % str.__repr__(self)
            object = usub(object)
            realresult = getattr(object, methodname)(*args)
            self.assertEqual(realresult, result)
            self.assertIsNot(object, realresult)
102

103
    def test_literals(self):
        # Escape sequences in str literals: equivalent spellings compare
        # equal, and \U escapes above 0x10ffff are rejected by eval().
        self.assertEqual('\xff', '\u00ff')
        self.assertEqual('\uffff', '\U0000ffff')
        self.assertRaises(SyntaxError, eval, '\'\\Ufffffffe\'')
        self.assertRaises(SyntaxError, eval, '\'\\Uffffffff\'')
        self.assertRaises(SyntaxError, eval, '\'\\U%08x\'' % 0x110000)
        # raw strings should not have unicode escapes
        self.assertNotEqual(r"\u0020", " ")
111

112
    def test_ascii(self):
        # ascii() of str objects: quoting, backslash escapes and \xNN
        # escapes for every code point below 256.  Skipped on platforms
        # whose sys.platform starts with 'java'.
        if not sys.platform.startswith('java'):
            # Test basic sanity of repr()
            self.assertEqual(ascii('abc'), "'abc'")
            self.assertEqual(ascii('ab\\c'), "'ab\\\\c'")
            self.assertEqual(ascii('ab\\'), "'ab\\\\'")
            self.assertEqual(ascii('\\c'), "'\\\\c'")
            self.assertEqual(ascii('\\'), "'\\\\'")
            self.assertEqual(ascii('\n'), "'\\n'")
            self.assertEqual(ascii('\r'), "'\\r'")
            self.assertEqual(ascii('\t'), "'\\t'")
            self.assertEqual(ascii('\b'), "'\\x08'")
            self.assertEqual(ascii("'\""), """'\\'"'""")
            self.assertEqual(ascii("'\""), """'\\'"'""")
            self.assertEqual(ascii("'"), '''"'"''')
            self.assertEqual(ascii('"'), """'"'""")
            latin1repr = (
                "'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r"
                "\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a"
                "\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI"
                "JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f"
                "\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d"
                "\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b"
                "\\x9c\\x9d\\x9e\\x9f\\xa0\\xa1\\xa2\\xa3\\xa4\\xa5\\xa6\\xa7\\xa8\\xa9"
                "\\xaa\\xab\\xac\\xad\\xae\\xaf\\xb0\\xb1\\xb2\\xb3\\xb4\\xb5\\xb6\\xb7"
                "\\xb8\\xb9\\xba\\xbb\\xbc\\xbd\\xbe\\xbf\\xc0\\xc1\\xc2\\xc3\\xc4\\xc5"
                "\\xc6\\xc7\\xc8\\xc9\\xca\\xcb\\xcc\\xcd\\xce\\xcf\\xd0\\xd1\\xd2\\xd3"
                "\\xd4\\xd5\\xd6\\xd7\\xd8\\xd9\\xda\\xdb\\xdc\\xdd\\xde\\xdf\\xe0\\xe1"
                "\\xe2\\xe3\\xe4\\xe5\\xe6\\xe7\\xe8\\xe9\\xea\\xeb\\xec\\xed\\xee\\xef"
                "\\xf0\\xf1\\xf2\\xf3\\xf4\\xf5\\xf6\\xf7\\xf8\\xf9\\xfa\\xfb\\xfc\\xfd"
                "\\xfe\\xff'")
            testrepr = ascii(''.join(map(chr, range(256))))
            self.assertEqual(testrepr, latin1repr)
            # Test ascii works on wide unicode escapes without overflow.
            self.assertEqual(ascii("\U00010000" * 39 + "\uffff" * 4096),
                             ascii("\U00010000" * 39 + "\uffff" * 4096))

            # A __repr__ that returns bytes must make ascii() raise.
            class WrongRepr:
                def __repr__(self):
                    return b'byte-repr'
            self.assertRaises(TypeError, ascii, WrongRepr())
153

154
    def test_repr(self):
        # repr() of str objects: quoting and backslash escapes; in the
        # expected text most code points from 0xa1 up appear unescaped.
        # Skipped on platforms whose sys.platform starts with 'java'.
        if not sys.platform.startswith('java'):
            # Test basic sanity of repr()
            self.assertEqual(repr('abc'), "'abc'")
            self.assertEqual(repr('ab\\c'), "'ab\\\\c'")
            self.assertEqual(repr('ab\\'), "'ab\\\\'")
            self.assertEqual(repr('\\c'), "'\\\\c'")
            self.assertEqual(repr('\\'), "'\\\\'")
            self.assertEqual(repr('\n'), "'\\n'")
            self.assertEqual(repr('\r'), "'\\r'")
            self.assertEqual(repr('\t'), "'\\t'")
            self.assertEqual(repr('\b'), "'\\x08'")
            self.assertEqual(repr("'\""), """'\\'"'""")
            self.assertEqual(repr("'\""), """'\\'"'""")
            self.assertEqual(repr("'"), '''"'"''')
            self.assertEqual(repr('"'), """'"'""")
            latin1repr = (
                "'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r"
                "\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a"
                "\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI"
                "JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f"
                "\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d"
                "\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b"
                "\\x9c\\x9d\\x9e\\x9f\\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9"
                "\xaa\xab\xac\\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7"
                "\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5"
                "\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3"
                "\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1"
                "\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef"
                "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd"
                "\xfe\xff'")
            testrepr = repr(''.join(map(chr, range(256))))
            self.assertEqual(testrepr, latin1repr)
            # Test repr works on wide unicode escapes without overflow.
            self.assertEqual(repr("\U00010000" * 39 + "\uffff" * 4096),
                             repr("\U00010000" * 39 + "\uffff" * 4096))

            # A __repr__ that returns bytes must make repr() raise.
            class WrongRepr:
                def __repr__(self):
                    return b'byte-repr'
            self.assertRaises(TypeError, repr, WrongRepr())
195

196
    def test_iterators(self):
        # Make sure unicode objects have an __iter__ method that yields
        # one character at a time and then raises StopIteration.
        it = "\u1111\u2222\u3333".__iter__()
        self.assertEqual(next(it), "\u1111")
        self.assertEqual(next(it), "\u2222")
        self.assertEqual(next(it), "\u3333")
        self.assertRaises(StopIteration, next, it)
203

204
    def test_count(self):
        # Shared count() tests first, then str-specific cases: negative
        # start/end arguments and haystack/needle pairs that mix ASCII,
        # BMP and non-BMP characters.
        CommonTest.test_count(self)
        # check mixed argument types
        self.checkequalnofix(3, 'aaa', 'count', 'a')
        self.checkequalnofix(0, 'aaa', 'count', 'b')
        self.checkequalnofix(3, 'aaa', 'count', 'a')
        self.checkequalnofix(0, 'aaa', 'count', 'b')
        self.checkequalnofix(0, 'aaa', 'count', 'b')
        self.checkequalnofix(1, 'aaa', 'count', 'a', -1)
        self.checkequalnofix(3, 'aaa', 'count', 'a', -10)
        self.checkequalnofix(2, 'aaa', 'count', 'a', 0, -1)
        self.checkequalnofix(0, 'aaa', 'count', 'a', 0, -10)
        # test mixed kinds
        self.checkequal(10, '\u0102' + 'a' * 10, 'count', 'a')
        self.checkequal(10, '\U00100304' + 'a' * 10, 'count', 'a')
        self.checkequal(10, '\U00100304' + '\u0102' * 10, 'count', '\u0102')
        self.checkequal(0, 'a' * 10, 'count', '\u0102')
        self.checkequal(0, 'a' * 10, 'count', '\U00100304')
        self.checkequal(0, '\u0102' * 10, 'count', '\U00100304')
        self.checkequal(10, '\u0102' + 'a_' * 10, 'count', 'a_')
        self.checkequal(10, '\U00100304' + 'a_' * 10, 'count', 'a_')
        self.checkequal(10, '\U00100304' + '\u0102_' * 10, 'count', '\u0102_')
        self.checkequal(0, 'a' * 10, 'count', 'a\u0102')
        self.checkequal(0, 'a' * 10, 'count', 'a\U00100304')
        self.checkequal(0, '\u0102' * 10, 'count', '\u0102\U00100304')
229

230
    def test_find(self):
        # Shared find() tests first, then str-specific cases: needles that
        # differ from the present character in one byte only, argument
        # errors, and strings mixing ASCII, BMP and non-BMP characters.
        CommonTest.test_find(self)
        # test implementation details of the memchr fast path
        self.checkequal(100, 'a' * 100 + '\u0102', 'find', '\u0102')
        self.checkequal(-1, 'a' * 100 + '\u0102', 'find', '\u0201')
        self.checkequal(-1, 'a' * 100 + '\u0102', 'find', '\u0120')
        self.checkequal(-1, 'a' * 100 + '\u0102', 'find', '\u0220')
        self.checkequal(100, 'a' * 100 + '\U00100304', 'find', '\U00100304')
        self.checkequal(-1, 'a' * 100 + '\U00100304', 'find', '\U00100204')
        self.checkequal(-1, 'a' * 100 + '\U00100304', 'find', '\U00102004')
        # check mixed argument types
        self.checkequalnofix(0, 'abcdefghiabc', 'find', 'abc')
        self.checkequalnofix(9, 'abcdefghiabc', 'find', 'abc', 1)
        self.checkequalnofix(-1, 'abcdefghiabc', 'find', 'def', 4)

        self.assertRaises(TypeError, 'hello'.find)
        self.assertRaises(TypeError, 'hello'.find, 42)
        # test mixed kinds
        self.checkequal(100, '\u0102' * 100 + 'a', 'find', 'a')
        self.checkequal(100, '\U00100304' * 100 + 'a', 'find', 'a')
        self.checkequal(100, '\U00100304' * 100 + '\u0102', 'find', '\u0102')
        self.checkequal(-1, 'a' * 100, 'find', '\u0102')
        self.checkequal(-1, 'a' * 100, 'find', '\U00100304')
        self.checkequal(-1, '\u0102' * 100, 'find', '\U00100304')
        self.checkequal(100, '\u0102' * 100 + 'a_', 'find', 'a_')
        self.checkequal(100, '\U00100304' * 100 + 'a_', 'find', 'a_')
        self.checkequal(100, '\U00100304' * 100 + '\u0102_', 'find', '\u0102_')
        self.checkequal(-1, 'a' * 100, 'find', 'a\u0102')
        self.checkequal(-1, 'a' * 100, 'find', 'a\U00100304')
        self.checkequal(-1, '\u0102' * 100, 'find', '\u0102\U00100304')
260

261
    def test_rfind(self):
        # Shared rfind() tests first, then str-specific cases: needles that
        # differ from the present character in one byte only, the empty
        # needle, and strings mixing ASCII, BMP and non-BMP characters.
        CommonTest.test_rfind(self)
        # test implementation details of the memrchr fast path
        self.checkequal(0, '\u0102' + 'a' * 100, 'rfind', '\u0102')
        self.checkequal(-1, '\u0102' + 'a' * 100, 'rfind', '\u0201')
        self.checkequal(-1, '\u0102' + 'a' * 100, 'rfind', '\u0120')
        self.checkequal(-1, '\u0102' + 'a' * 100, 'rfind', '\u0220')
        self.checkequal(0, '\U00100304' + 'a' * 100, 'rfind', '\U00100304')
        self.checkequal(-1, '\U00100304' + 'a' * 100, 'rfind', '\U00100204')
        self.checkequal(-1, '\U00100304' + 'a' * 100, 'rfind', '\U00102004')
        # check mixed argument types
        self.checkequalnofix(9, 'abcdefghiabc', 'rfind', 'abc')
        self.checkequalnofix(12, 'abcdefghiabc', 'rfind', '')
        self.checkequalnofix(12, 'abcdefghiabc', 'rfind', '')
        # test mixed kinds
        self.checkequal(0, 'a' + '\u0102' * 100, 'rfind', 'a')
        self.checkequal(0, 'a' + '\U00100304' * 100, 'rfind', 'a')
        self.checkequal(0, '\u0102' + '\U00100304' * 100, 'rfind', '\u0102')
        self.checkequal(-1, 'a' * 100, 'rfind', '\u0102')
        self.checkequal(-1, 'a' * 100, 'rfind', '\U00100304')
        self.checkequal(-1, '\u0102' * 100, 'rfind', '\U00100304')
        self.checkequal(0, '_a' + '\u0102' * 100, 'rfind', '_a')
        self.checkequal(0, '_a' + '\U00100304' * 100, 'rfind', '_a')
        self.checkequal(0, '_\u0102' + '\U00100304' * 100, 'rfind', '_\u0102')
        self.checkequal(-1, 'a' * 100, 'rfind', '\u0102a')
        self.checkequal(-1, 'a' * 100, 'rfind', '\U00100304a')
        self.checkequal(-1, '\u0102' * 100, 'rfind', '\U00100304\u0102')
288

289
    def test_index(self):
        # Shared index() tests first, then str-specific cases: found and
        # not-found (ValueError) lookups, including strings that mix ASCII,
        # BMP and non-BMP characters.
        CommonTest.test_index(self)
        self.checkequalnofix(0, 'abcdefghiabc', 'index', '')
        self.checkequalnofix(3, 'abcdefghiabc', 'index', 'def')
        self.checkequalnofix(0, 'abcdefghiabc', 'index', 'abc')
        self.checkequalnofix(9, 'abcdefghiabc', 'index', 'abc', 1)
        self.assertRaises(ValueError, 'abcdefghiabc'.index, 'hib')
        self.assertRaises(ValueError, 'abcdefghiab'.index, 'abc', 1)
        self.assertRaises(ValueError, 'abcdefghi'.index, 'ghi', 8)
        self.assertRaises(ValueError, 'abcdefghi'.index, 'ghi', -1)
        # test mixed kinds
        self.checkequal(100, '\u0102' * 100 + 'a', 'index', 'a')
        self.checkequal(100, '\U00100304' * 100 + 'a', 'index', 'a')
        self.checkequal(100, '\U00100304' * 100 + '\u0102', 'index', '\u0102')
        self.assertRaises(ValueError, ('a' * 100).index, '\u0102')
        self.assertRaises(ValueError, ('a' * 100).index, '\U00100304')
        self.assertRaises(ValueError, ('\u0102' * 100).index, '\U00100304')
        self.checkequal(100, '\u0102' * 100 + 'a_', 'index', 'a_')
        self.checkequal(100, '\U00100304' * 100 + 'a_', 'index', 'a_')
        self.checkequal(100, '\U00100304' * 100 + '\u0102_', 'index', '\u0102_')
        self.assertRaises(ValueError, ('a' * 100).index, 'a\u0102')
        self.assertRaises(ValueError, ('a' * 100).index, 'a\U00100304')
        self.assertRaises(ValueError, ('\u0102' * 100).index, '\u0102\U00100304')
312

313
    def test_rindex(self):
        # Shared rindex() tests first, then str-specific cases: found and
        # not-found (ValueError) lookups, including strings that mix ASCII,
        # BMP and non-BMP characters.
        CommonTest.test_rindex(self)
        self.checkequalnofix(12, 'abcdefghiabc', 'rindex', '')
        self.checkequalnofix(3, 'abcdefghiabc', 'rindex', 'def')
        self.checkequalnofix(9, 'abcdefghiabc', 'rindex', 'abc')
        self.checkequalnofix(0, 'abcdefghiabc', 'rindex', 'abc', 0, -1)

        self.assertRaises(ValueError, 'abcdefghiabc'.rindex, 'hib')
        self.assertRaises(ValueError, 'defghiabc'.rindex, 'def', 1)
        self.assertRaises(ValueError, 'defghiabc'.rindex, 'abc', 0, -1)
        self.assertRaises(ValueError, 'abcdefghi'.rindex, 'ghi', 0, 8)
        self.assertRaises(ValueError, 'abcdefghi'.rindex, 'ghi', 0, -1)
        # test mixed kinds
        self.checkequal(0, 'a' + '\u0102' * 100, 'rindex', 'a')
        self.checkequal(0, 'a' + '\U00100304' * 100, 'rindex', 'a')
        self.checkequal(0, '\u0102' + '\U00100304' * 100, 'rindex', '\u0102')
        self.assertRaises(ValueError, ('a' * 100).rindex, '\u0102')
        self.assertRaises(ValueError, ('a' * 100).rindex, '\U00100304')
        self.assertRaises(ValueError, ('\u0102' * 100).rindex, '\U00100304')
        self.checkequal(0, '_a' + '\u0102' * 100, 'rindex', '_a')
        self.checkequal(0, '_a' + '\U00100304' * 100, 'rindex', '_a')
        self.checkequal(0, '_\u0102' + '\U00100304' * 100, 'rindex', '_\u0102')
        self.assertRaises(ValueError, ('a' * 100).rindex, '\u0102a')
        self.assertRaises(ValueError, ('a' * 100).rindex, '\U00100304a')
        self.assertRaises(ValueError, ('\u0102' * 100).rindex, '\U00100304\u0102')
338

339
    def test_maketrans_translate(self):
        # str.translate() with hand-built ordinal tables and with tables
        # produced by maketrans(), followed by the error cases of both.

        # these work with plain translate()
        self.checkequalnofix('bbbc', 'abababc', 'translate',
                             {ord('a'): None})
        self.checkequalnofix('iiic', 'abababc', 'translate',
                             {ord('a'): None, ord('b'): ord('i')})
        self.checkequalnofix('iiix', 'abababc', 'translate',
                             {ord('a'): None, ord('b'): ord('i'), ord('c'): 'x'})
        self.checkequalnofix('c', 'abababc', 'translate',
                             {ord('a'): None, ord('b'): ''})
        self.checkequalnofix('xyyx', 'xzx', 'translate',
                             {ord('z'): 'yy'})

        # this needs maketrans()
        self.checkequalnofix('abababc', 'abababc', 'translate',
                             {'b': '<i>'})
        tbl = self.type2test.maketrans({'a': None, 'b': '<i>'})
        self.checkequalnofix('<i><i><i>c', 'abababc', 'translate', tbl)
        # test alternative way of calling maketrans()
        tbl = self.type2test.maketrans('abc', 'xyz', 'd')
        self.checkequalnofix('xyzzy', 'abdcdcbdddd', 'translate', tbl)

        # various tests switching from ASCII to latin1 or the opposite;
        # same length, remove a letter, or replace with a longer string.
        self.assertEqual("[a]".translate(str.maketrans('a', 'X')),
                         "[X]")
        self.assertEqual("[a]".translate(str.maketrans({'a': 'X'})),
                         "[X]")
        self.assertEqual("[a]".translate(str.maketrans({'a': None})),
                         "[]")
        self.assertEqual("[a]".translate(str.maketrans({'a': 'XXX'})),
                         "[XXX]")
        self.assertEqual("[a]".translate(str.maketrans({'a': '\xe9'})),
                         "[\xe9]")
        self.assertEqual('axb'.translate(str.maketrans({'a': None, 'b': '123'})),
                         "x123")
        self.assertEqual('axb'.translate(str.maketrans({'a': None, 'b': '\xe9'})),
                         "x\xe9")

        # test non-ASCII (don't take the fast-path)
        self.assertEqual("[a]".translate(str.maketrans({'a': '<\xe9>'})),
                         "[<\xe9>]")
        self.assertEqual("[\xe9]".translate(str.maketrans({'\xe9': 'a'})),
                         "[a]")
        self.assertEqual("[\xe9]".translate(str.maketrans({'\xe9': None})),
                         "[]")
        self.assertEqual("[\xe9]".translate(str.maketrans({'\xe9': '123'})),
                         "[123]")
        self.assertEqual("[a\xe9]".translate(str.maketrans({'a': '<\u20ac>'})),
                         "[<\u20ac>\xe9]")

        # invalid Unicode characters
        invalid_char = 0x10ffff+1
        for before in "a\xe9\u20ac\U0010ffff":
            mapping = str.maketrans({before: invalid_char})
            text = "[%s]" % before
            self.assertRaises(ValueError, text.translate, mapping)

        # errors
        self.assertRaises(TypeError, self.type2test.maketrans)
        self.assertRaises(ValueError, self.type2test.maketrans, 'abc', 'defg')
        self.assertRaises(TypeError, self.type2test.maketrans, 2, 'def')
        self.assertRaises(TypeError, self.type2test.maketrans, 'abc', 2)
        self.assertRaises(TypeError, self.type2test.maketrans, 'abc', 'def', 2)
        self.assertRaises(ValueError, self.type2test.maketrans, {'xy': 2})
        self.assertRaises(TypeError, self.type2test.maketrans, {(1,): 2})

        self.assertRaises(TypeError, 'hello'.translate)
        self.assertRaises(TypeError, 'abababc'.translate, 'abc', 'xyz')
408

409
    def test_split(self):
        # Shared split() tests first, then every combination of ASCII, BMP
        # and non-BMP text with one- and two-character delimiters.
        CommonTest.test_split(self)

        # test mixed kinds
        for left, right in ('ba', '\u0101\u0100', '\U00010301\U00010300'):
            left *= 9
            right *= 9
            for delim in ('c', '\u0102', '\U00010302'):
                self.checkequal([left + right],
                                left + right, 'split', delim)
                self.checkequal([left, right],
                                left + delim + right, 'split', delim)
                self.checkequal([left + right],
                                left + right, 'split', delim * 2)
                self.checkequal([left, right],
                                left + delim * 2 + right, 'split', delim * 2)
425

426
    def test_rsplit(self):
        # Shared rsplit() tests first, then every combination of ASCII, BMP
        # and non-BMP text with one- and two-character delimiters.
        CommonTest.test_rsplit(self)
        # test mixed kinds
        for left, right in ('ba', '\u0101\u0100', '\U00010301\U00010300'):
            left *= 9
            right *= 9
            for delim in ('c', '\u0102', '\U00010302'):
                self.checkequal([left + right],
                                left + right, 'rsplit', delim)
                self.checkequal([left, right],
                                left + delim + right, 'rsplit', delim)
                self.checkequal([left + right],
                                left + right, 'rsplit', delim * 2)
                self.checkequal([left, right],
                                left + delim * 2 + right, 'rsplit', delim * 2)
441

442
    def test_partition(self):
        # Shared partition() tests first, then every combination of ASCII,
        # BMP and non-BMP text with one- and two-character separators.
        MixinStrUnicodeUserStringTest.test_partition(self)
        # test mixed kinds
        self.checkequal(('ABCDEFGH', '', ''), 'ABCDEFGH', 'partition', '\u4200')
        for left, right in ('ba', '\u0101\u0100', '\U00010301\U00010300'):
            left *= 9
            right *= 9
            for delim in ('c', '\u0102', '\U00010302'):
                self.checkequal((left + right, '', ''),
                                left + right, 'partition', delim)
                self.checkequal((left, delim, right),
                                left + delim + right, 'partition', delim)
                self.checkequal((left + right, '', ''),
                                left + right, 'partition', delim * 2)
                self.checkequal((left, delim * 2, right),
                                left + delim * 2 + right, 'partition', delim * 2)
458

459
    def test_rpartition(self):
        # Shared rpartition() tests first, then every combination of ASCII,
        # BMP and non-BMP text with one- and two-character separators.
        MixinStrUnicodeUserStringTest.test_rpartition(self)
        # test mixed kinds
        self.checkequal(('', '', 'ABCDEFGH'), 'ABCDEFGH', 'rpartition', '\u4200')
        for left, right in ('ba', '\u0101\u0100', '\U00010301\U00010300'):
            left *= 9
            right *= 9
            for delim in ('c', '\u0102', '\U00010302'):
                self.checkequal(('', '', left + right),
                                left + right, 'rpartition', delim)
                self.checkequal((left, delim, right),
                                left + delim + right, 'rpartition', delim)
                self.checkequal(('', '', left + right),
                                left + right, 'rpartition', delim * 2)
                self.checkequal((left, delim * 2, right),
                                left + delim * 2 + right, 'rpartition', delim * 2)
475

476
    def test_join(self):
        # Shared join() tests first, then joins over lists, tuples and a
        # Sequence object, and TypeError for items that are not str.
        MixinStrUnicodeUserStringTest.test_join(self)

        # Defines __str__ but is not a str, so join() must reject it.
        class MyWrapper:
            def __init__(self, sval): self.sval = sval
            def __str__(self): return self.sval

        # mixed arguments
        self.checkequalnofix('a b c d', ' ', 'join', ['a', 'b', 'c', 'd'])
        self.checkequalnofix('abcd', '', 'join', ('a', 'b', 'c', 'd'))
        self.checkequalnofix('w x y z', ' ', 'join', Sequence('wxyz'))
        self.checkequalnofix('a b c d', ' ', 'join', ['a', 'b', 'c', 'd'])
        self.checkequalnofix('a b c d', ' ', 'join', ['a', 'b', 'c', 'd'])
        self.checkequalnofix('abcd', '', 'join', ('a', 'b', 'c', 'd'))
        self.checkequalnofix('w x y z', ' ', 'join', Sequence('wxyz'))
        self.checkraises(TypeError, ' ', 'join', ['1', '2', MyWrapper('foo')])
        self.checkraises(TypeError, ' ', 'join', ['1', '2', '3', bytes()])
        self.checkraises(TypeError, ' ', 'join', [1, 2, 3])
        self.checkraises(TypeError, ' ', 'join', ['1', '2', 3])
495

496
    @unittest.skipIf(sys.maxsize > 2**32,
        'needs too much memory on a 64-bit platform')
    def test_join_overflow(self):
        # Joining `size` references to one `size`-character string would
        # give a result longer than sys.maxsize, so join() must raise.
        size = int(sys.maxsize**0.5) + 1
        seq = ('A' * size,) * size
        self.assertRaises(OverflowError, ''.join, seq)
502

503
    def test_replace(self):
        # Shared replace() tests first, then a count-limited replace, a
        # non-str replacement (TypeError), and every combination of ASCII,
        # BMP and non-BMP text, pattern and replacement.
        CommonTest.test_replace(self)

        # method call forwarded from str implementation because of unicode argument
        self.checkequalnofix('one@two!three!', 'one!two!three!', 'replace', '!', '@', 1)
        self.assertRaises(TypeError, 'replace'.replace, "r", 42)
        # test mixed kinds
        for left, right in ('ba', '\u0101\u0100', '\U00010301\U00010300'):
            left *= 9
            right *= 9
            for delim in ('c', '\u0102', '\U00010302'):
                for repl in ('d', '\u0103', '\U00010303'):
                    self.checkequal(left + right,
                                    left + right, 'replace', delim, repl)
                    self.checkequal(left + repl + right,
                                    left + delim + right,
                                    'replace', delim, repl)
                    self.checkequal(left + right,
                                    left + right, 'replace', delim * 2, repl)
                    self.checkequal(left + repl + right,
                                    left + delim * 2 + right,
                                    'replace', delim * 2, repl)
525

526
    @support.cpython_only
    def test_replace_id(self):
        # Replacing a pattern with itself is expected to return the very
        # same string object.
        pattern = 'abc'
        text = 'abc def'
        self.assertIs(text.replace(pattern, pattern), text)
531

532
    def test_bytes_comparison(self):
        # A str never compares equal to bytes or bytearray, even when the
        # characters match; BytesWarning is silenced for the comparisons.
        with support.check_warnings():
            warnings.simplefilter('ignore', BytesWarning)
            self.assertEqual('abc' == b'abc', False)
            self.assertEqual('abc' != b'abc', True)
            self.assertEqual('abc' == bytearray(b'abc'), False)
            self.assertEqual('abc' != bytearray(b'abc'), True)
539

540
    def test_comparison(self):
        # Equality and ordering of str objects.  The surrogate "fixup"
        # cases live under ``if 0:`` and are never executed.

        # Comparisons:
        self.assertEqual('abc', 'abc')
        self.assertTrue('abcd' > 'abc')
        self.assertTrue('abc' < 'abcd')

        if 0:
            # Move these tests to a Unicode collation module test...
            # Testing UTF-16 code point order comparisons...

            # No surrogates, no fixup required.
            self.assertTrue('\u0061' < '\u20ac')
            # Non surrogate below surrogate value, no fixup required
            self.assertTrue('\u0061' < '\ud800\udc02')

            # Non surrogate above surrogate value, fixup required
            def test_lecmp(s, s2):
                self.assertTrue(s < s2)

            def test_fixup(s):
                # Compare *s* against each surrogate pair in turn.
                for s2 in ('\ud800\udc01', '\ud900\udc01',
                           '\uda00\udc01', '\udb00\udc01',
                           '\ud800\udd01', '\ud900\udd01',
                           '\uda00\udd01', '\udb00\udd01',
                           '\ud800\ude01', '\ud900\ude01',
                           '\uda00\ude01', '\udb00\ude01',
                           '\ud800\udfff', '\ud900\udfff',
                           '\uda00\udfff', '\udb00\udfff'):
                    test_lecmp(s, s2)

            # Call the helper from the enclosing block; calling it from
            # inside its own body would recurse without end.
            test_fixup('\ue000')
            test_fixup('\uff61')

        # Surrogates on both sides, no fixup required
        self.assertTrue('\ud800\udc02' < '\ud84d\udc56')
598

599
    def test_islower(self):
        # Inherited islower() tests first, then BMP and non-BMP characters.
        super().test_islower()
        self.checkequalnofix(False, '\u1FFc', 'islower')
        self.assertFalse('\u2167'.islower())
        self.assertTrue('\u2177'.islower())
        # non-BMP, uppercase
        self.assertFalse('\U00010401'.islower())
        self.assertFalse('\U00010427'.islower())
        # non-BMP, lowercase
        self.assertTrue('\U00010429'.islower())
        self.assertTrue('\U0001044E'.islower())
        # non-BMP, non-cased
        self.assertFalse('\U0001F40D'.islower())
        self.assertFalse('\U0001F46F'.islower())
613

614
    def test_isupper(self):
        # Inherited isupper() tests first, then BMP and non-BMP characters.
        super().test_isupper()
        # The '\u1FFc' check is skipped when sys.platform starts with 'java'.
        if not sys.platform.startswith('java'):
            self.checkequalnofix(False, '\u1FFc', 'isupper')
        self.assertTrue('\u2167'.isupper())
        self.assertFalse('\u2177'.isupper())
        # non-BMP, uppercase
        self.assertTrue('\U00010401'.isupper())
        self.assertTrue('\U00010427'.isupper())
        # non-BMP, lowercase
        self.assertFalse('\U00010429'.isupper())
        self.assertFalse('\U0001044E'.isupper())
        # non-BMP, non-cased
        self.assertFalse('\U0001F40D'.isupper())
        self.assertFalse('\U0001F46F'.isupper())
629

630
    def test_istitle(self):
        # Inherited istitle() tests first, then BMP and non-BMP characters.
        super().test_istitle()
        self.checkequalnofix(True, '\u1FFc', 'istitle')
        self.checkequalnofix(True, 'Greek \u1FFcitlecases ...', 'istitle')

        # non-BMP, uppercase + lowercase
        self.assertTrue('\U00010401\U00010429'.istitle())
        self.assertTrue('\U00010427\U0001044E'.istitle())
        # apparently there are no titlecased (Lt) non-BMP chars in Unicode 6
        for ch in ['\U00010429', '\U0001044E', '\U0001F40D', '\U0001F46F']:
            self.assertFalse(ch.istitle(), '{!a} is not title'.format(ch))
641

642
    def test_isspace(self):
        # Runs the inherited isspace() test, then checks str.isspace() on
        # additional BMP code points and on several non-BMP ones.
        super().test_isspace()
        self.checkequalnofix(True, '\u2000', 'isspace')
        self.checkequalnofix(True, '\u200a', 'isspace')
        self.checkequalnofix(False, '\u2014', 'isspace')
        # apparently there are no non-BMP spaces chars in Unicode 6
        for ch in ['\U00010401', '\U00010427', '\U00010429', '\U0001044E',
                   '\U0001F40D', '\U0001F46F']:
            self.assertFalse(ch.isspace(), '{!a} is not space.'.format(ch))
651

652
    def test_isalnum(self):
        # Runs the inherited isalnum() test, then checks that each listed
        # non-BMP code point is reported as alphanumeric.
        super().test_isalnum()
        for ch in ['\U00010401', '\U00010427', '\U00010429', '\U0001044E',
                   '\U0001D7F6', '\U00011066', '\U000104A0', '\U0001F107']:
            self.assertTrue(ch.isalnum(), '{!a} is alnum.'.format(ch))
657

658
    def test_isalpha(self):
        # Runs the inherited isalpha() test, then checks str.isalpha() on
        # U+1FFC and on non-BMP cased and non-cased code points.
        super().test_isalpha()
        self.checkequalnofix(True, '\u1FFc', 'isalpha')
        # non-BMP, cased
        self.assertTrue('\U00010401'.isalpha())
        self.assertTrue('\U00010427'.isalpha())
        self.assertTrue('\U00010429'.isalpha())
        self.assertTrue('\U0001044E'.isalpha())
        # non-BMP, non-cased
        self.assertFalse('\U0001F40D'.isalpha())
        self.assertFalse('\U0001F46F'.isalpha())
669

670
    def test_isascii(self):
        # Runs the inherited isascii() test, then checks that a BMP and a
        # non-BMP non-ASCII code point are both reported as not ASCII.
        super().test_isascii()
        self.assertFalse("\u20ac".isascii())
        self.assertFalse("\U0010ffff".isascii())
674

675
    def test_isdecimal(self):
        # Checks str.isdecimal() on ASCII, BMP and non-BMP inputs, and that
        # passing an extra argument raises TypeError.
        self.checkequalnofix(False, '', 'isdecimal')
        self.checkequalnofix(False, 'a', 'isdecimal')
        self.checkequalnofix(True, '0', 'isdecimal')
        self.checkequalnofix(False, '\u2460', 'isdecimal') # CIRCLED DIGIT ONE
        self.checkequalnofix(False, '\xbc', 'isdecimal') # VULGAR FRACTION ONE QUARTER
        self.checkequalnofix(True, '\u0660', 'isdecimal') # ARABIC-INDIC DIGIT ZERO
        self.checkequalnofix(True, '0123456789', 'isdecimal')
        self.checkequalnofix(False, '0123456789a', 'isdecimal')

        self.checkraises(TypeError, 'abc', 'isdecimal', 42)

        for ch in ['\U00010401', '\U00010427', '\U00010429', '\U0001044E',
                   '\U0001F40D', '\U0001F46F', '\U00011065', '\U0001F107']:
            self.assertFalse(ch.isdecimal(), '{!a} is not decimal.'.format(ch))
        for ch in ['\U0001D7F6', '\U00011066', '\U000104A0']:
            self.assertTrue(ch.isdecimal(), '{!a} is decimal.'.format(ch))
692

693
    def test_isdigit(self):
        # Runs the inherited isdigit() test, then checks str.isdigit() on
        # additional BMP and non-BMP code points.
        super().test_isdigit()
        self.checkequalnofix(True, '\u2460', 'isdigit')
        self.checkequalnofix(False, '\xbc', 'isdigit')
        self.checkequalnofix(True, '\u0660', 'isdigit')

        for ch in ['\U00010401', '\U00010427', '\U00010429', '\U0001044E',
                   '\U0001F40D', '\U0001F46F', '\U00011065']:
            self.assertFalse(ch.isdigit(), '{!a} is not a digit.'.format(ch))
        for ch in ['\U0001D7F6', '\U00011066', '\U000104A0', '\U0001F107']:
            self.assertTrue(ch.isdigit(), '{!a} is a digit.'.format(ch))
704

705
    def test_isnumeric(self):
        # Checks str.isnumeric() on ASCII, BMP and non-BMP inputs, and that
        # passing an extra argument raises TypeError.
        self.checkequalnofix(False, '', 'isnumeric')
        self.checkequalnofix(False, 'a', 'isnumeric')
        self.checkequalnofix(True, '0', 'isnumeric')
        self.checkequalnofix(True, '\u2460', 'isnumeric')
        self.checkequalnofix(True, '\xbc', 'isnumeric')
        self.checkequalnofix(True, '\u0660', 'isnumeric')
        self.checkequalnofix(True, '0123456789', 'isnumeric')
        self.checkequalnofix(False, '0123456789a', 'isnumeric')

        self.assertRaises(TypeError, "abc".isnumeric, 42)

        for ch in ['\U00010401', '\U00010427', '\U00010429', '\U0001044E',
                   '\U0001F40D', '\U0001F46F']:
            self.assertFalse(ch.isnumeric(), '{!a} is not numeric.'.format(ch))
        for ch in ['\U00011065', '\U0001D7F6', '\U00011066',
                   '\U000104A0', '\U0001F107']:
            self.assertTrue(ch.isnumeric(), '{!a} is numeric.'.format(ch))
723

724
    def test_isidentifier(self):
725
        self.assertTrue("a".isidentifier())
726
        self.assertTrue("Z".isidentifier())
727
        self.assertTrue("_".isidentifier())
728
        self.assertTrue("b0".isidentifier())
729
        self.assertTrue("bc".isidentifier())
730
        self.assertTrue("b_".isidentifier())
731
        self.assertTrue("µ".isidentifier())
732
        self.assertTrue("𝔘𝔫𝔦𝔠𝔬𝔡𝔢".isidentifier())
733

734
        self.assertFalse(" ".isidentifier())
735
        self.assertFalse("[".isidentifier())
736
        self.assertFalse("©".isidentifier())
737
        self.assertFalse("0".isidentifier())
738

739
    def test_isprintable(self):
740
        self.assertTrue("".isprintable())
741
        self.assertTrue(" ".isprintable())
742
        self.assertTrue("abcdefg".isprintable())
743
        self.assertFalse("abcdefg\n".isprintable())
744
        # some defined Unicode character
745
        self.assertTrue("\u0374".isprintable())
746
        # undefined character
747
        self.assertFalse("\u0378".isprintable())
748
        # single surrogate character
749
        self.assertFalse("\ud800".isprintable())
750

751
        self.assertTrue('\U0001F46F'.isprintable())
752
        self.assertFalse('\U000E0020'.isprintable())
753

754
    def test_surrogates(self):
755
        for s in ('a\uD800b\uDFFF', 'a\uDFFFb\uD800',
756
                  'a\uD800b\uDFFFa', 'a\uDFFFb\uD800a'):
757
            self.assertTrue(s.islower())
758
            self.assertFalse(s.isupper())
759
            self.assertFalse(s.istitle())
760
        for s in ('A\uD800B\uDFFF', 'A\uDFFFB\uD800',
761
                  'A\uD800B\uDFFFA', 'A\uDFFFB\uD800A'):
762
            self.assertFalse(s.islower())
763
            self.assertTrue(s.isupper())
764
            self.assertTrue(s.istitle())
765

766
        for meth_name in ('islower', 'isupper', 'istitle'):
767
            meth = getattr(str, meth_name)
768
            for s in ('\uD800', '\uDFFF', '\uD800\uD800', '\uDFFF\uDFFF'):
769
                self.assertFalse(meth(s), '%a.%s() is False' % (s, meth_name))
770

771
        for meth_name in ('isalpha', 'isalnum', 'isdigit', 'isspace',
772
                          'isdecimal', 'isnumeric',
773
                          'isidentifier', 'isprintable'):
774
            meth = getattr(str, meth_name)
775
            for s in ('\uD800', '\uDFFF', '\uD800\uD800', '\uDFFF\uDFFF',
776
                      'a\uD800b\uDFFF', 'a\uDFFFb\uD800',
777
                      'a\uD800b\uDFFFa', 'a\uDFFFb\uD800a'):
778
                self.assertFalse(meth(s), '%a.%s() is False' % (s, meth_name))
779

780

781
    def test_lower(self):
        # Runs CommonTest.test_lower, then checks str.lower() on non-BMP
        # letters and on code points with special lowercase mappings.
        CommonTest.test_lower(self)
        self.assertEqual('\U00010427'.lower(), '\U0001044F')
        self.assertEqual('\U00010427\U00010427'.lower(),
                         '\U0001044F\U0001044F')
        self.assertEqual('\U00010427\U0001044F'.lower(),
                         '\U0001044F\U0001044F')
        self.assertEqual('X\U00010427x\U0001044F'.lower(),
                         'x\U0001044Fx\U0001044F')
        self.assertEqual('ﬁ'.lower(), 'ﬁ')
        self.assertEqual('\u0130'.lower(), '\u0069\u0307')
        # Special case for GREEK CAPITAL LETTER SIGMA U+03A3
        self.assertEqual('\u03a3'.lower(), '\u03c3')
        self.assertEqual('\u0345\u03a3'.lower(), '\u0345\u03c3')
        self.assertEqual('A\u0345\u03a3'.lower(), 'a\u0345\u03c2')
        self.assertEqual('A\u0345\u03a3a'.lower(), 'a\u0345\u03c3a')
        self.assertEqual('A\u03a3\u0345'.lower(), 'a\u03c2\u0345')
        self.assertEqual('\u03a3\u0345 '.lower(), '\u03c3\u0345 ')
        self.assertEqual('\U0008fffe'.lower(), '\U0008fffe')
        self.assertEqual('\u2177'.lower(), '\u2177')
802

803
    def test_casefold(self):
804
        self.assertEqual('hello'.casefold(), 'hello')
805
        self.assertEqual('hELlo'.casefold(), 'hello')
806
        self.assertEqual('ß'.casefold(), 'ss')
807
        self.assertEqual('ﬁ'.casefold(), 'fi')
808
        self.assertEqual('\u03a3'.casefold(), '\u03c3')
809
        self.assertEqual('A\u0345\u03a3'.casefold(), 'a\u03b9\u03c3')
810
        self.assertEqual('\u00b5'.casefold(), '\u03bc')
811

812
    def test_upper(self):
        # Runs CommonTest.test_upper, then checks str.upper() on non-BMP
        # letters and on code points with multi-character uppercase forms.
        CommonTest.test_upper(self)
        self.assertEqual('\U0001044F'.upper(), '\U00010427')
        self.assertEqual('\U0001044F\U0001044F'.upper(),
                         '\U00010427\U00010427')
        self.assertEqual('\U00010427\U0001044F'.upper(),
                         '\U00010427\U00010427')
        self.assertEqual('X\U00010427x\U0001044F'.upper(),
                         'X\U00010427X\U00010427')
        self.assertEqual('ﬁ'.upper(), 'FI')
        self.assertEqual('\u0130'.upper(), '\u0130')
        self.assertEqual('\u03a3'.upper(), '\u03a3')
        self.assertEqual('ß'.upper(), 'SS')
        self.assertEqual('\u1fd2'.upper(), '\u0399\u0308\u0300')
        self.assertEqual('\U0008fffe'.upper(), '\U0008fffe')
        self.assertEqual('\u2177'.upper(), '\u2167')
828

829
    def test_capitalize(self):
        # Runs CommonTest.test_capitalize, then checks str.capitalize() on
        # non-BMP letters and on code points with special case mappings.
        CommonTest.test_capitalize(self)
        self.assertEqual('\U0001044F'.capitalize(), '\U00010427')
        self.assertEqual('\U0001044F\U0001044F'.capitalize(),
                         '\U00010427\U0001044F')
        self.assertEqual('\U00010427\U0001044F'.capitalize(),
                         '\U00010427\U0001044F')
        self.assertEqual('\U0001044F\U00010427'.capitalize(),
                         '\U00010427\U0001044F')
        self.assertEqual('X\U00010427x\U0001044F'.capitalize(),
                         'X\U0001044Fx\U0001044F')
        self.assertEqual('h\u0130'.capitalize(), 'H\u0069\u0307')
        exp = '\u0399\u0308\u0300\u0069\u0307'
        self.assertEqual('\u1fd2\u0130'.capitalize(), exp)
        # The expected result for the leading 'ﬁ' ligature depends on the
        # running Python version: 'FI' before 3.8, 'Fi' from 3.8 on.
        if sys.version_info < (3, 8):
            self.assertEqual('ﬁnnish'.capitalize(), 'FInnish')
        else:
            self.assertEqual('ﬁnnish'.capitalize(), 'Finnish')
        self.assertEqual('A\u0345\u03a3'.capitalize(), 'A\u0345\u03c2')
848

849
    def test_title(self):
        # Runs the inherited title() test, then checks str.title() on
        # non-BMP letters, the 'ﬁ' ligature and Greek sigma handling.
        super().test_title()
        self.assertEqual('\U0001044F'.title(), '\U00010427')
        self.assertEqual('\U0001044F\U0001044F'.title(),
                         '\U00010427\U0001044F')
        self.assertEqual('\U0001044F\U0001044F \U0001044F\U0001044F'.title(),
                         '\U00010427\U0001044F \U00010427\U0001044F')
        self.assertEqual('\U00010427\U0001044F \U00010427\U0001044F'.title(),
                         '\U00010427\U0001044F \U00010427\U0001044F')
        self.assertEqual('\U0001044F\U00010427 \U0001044F\U00010427'.title(),
                         '\U00010427\U0001044F \U00010427\U0001044F')
        self.assertEqual('X\U00010427x\U0001044F X\U00010427x\U0001044F'.title(),
                         'X\U0001044Fx\U0001044F X\U0001044Fx\U0001044F')
        self.assertEqual('ﬁNNISH'.title(), 'Finnish')
        self.assertEqual('A\u03a3 \u1fa1xy'.title(), 'A\u03c2 \u1fa9xy')
        self.assertEqual('A\u03a3A'.title(), 'A\u03c3a')
865

866
    def test_swapcase(self):
        # Runs CommonTest.test_swapcase, then checks str.swapcase() on
        # non-BMP letters and on code points with special case mappings.
        CommonTest.test_swapcase(self)
        self.assertEqual('\U0001044F'.swapcase(), '\U00010427')
        self.assertEqual('\U00010427'.swapcase(), '\U0001044F')
        self.assertEqual('\U0001044F\U0001044F'.swapcase(),
                         '\U00010427\U00010427')
        self.assertEqual('\U00010427\U0001044F'.swapcase(),
                         '\U0001044F\U00010427')
        self.assertEqual('\U0001044F\U00010427'.swapcase(),
                         '\U00010427\U0001044F')
        self.assertEqual('X\U00010427x\U0001044F'.swapcase(),
                         'x\U0001044FX\U00010427')
        self.assertEqual('ﬁ'.swapcase(), 'FI')
        self.assertEqual('\u0130'.swapcase(), '\u0069\u0307')
        # Special case for GREEK CAPITAL LETTER SIGMA U+03A3
        self.assertEqual('\u03a3'.swapcase(), '\u03c3')
        self.assertEqual('\u0345\u03a3'.swapcase(), '\u0399\u03c3')
        self.assertEqual('A\u0345\u03a3'.swapcase(), 'a\u0399\u03c2')
        self.assertEqual('A\u0345\u03a3a'.swapcase(), 'a\u0399\u03c3A')
        self.assertEqual('A\u03a3\u0345'.swapcase(), 'a\u03c2\u0399')
        self.assertEqual('\u03a3\u0345 '.swapcase(), '\u03c3\u0399 ')
        self.assertEqual('ß'.swapcase(), 'SS')
        self.assertEqual('\u1fd2'.swapcase(), '\u0399\u0308\u0300')
891

892
    def test_center(self):
        # Runs CommonTest.test_center, then checks str.center() with a
        # non-BMP fill character for even and odd padding widths.
        CommonTest.test_center(self)
        self.assertEqual('x'.center(2, '\U0010FFFF'),
                         'x\U0010FFFF')
        self.assertEqual('x'.center(3, '\U0010FFFF'),
                         '\U0010FFFFx\U0010FFFF')
        self.assertEqual('x'.center(4, '\U0010FFFF'),
                         '\U0010FFFFx\U0010FFFF\U0010FFFF')
900

901
    @unittest.skipUnless(sys.maxsize == 2**31 - 1, "requires 32-bit system")
    @support.cpython_only
    def test_case_operation_overflow(self):
        # Issue #22643
        # Builds a string of 2**32//12 + 1 'ü' characters and expects
        # upper() on it to raise OverflowError.  The test skips itself when
        # the string cannot be allocated.
        size = 2**32//12 + 1
        try:
            s = "ü" * size
        except MemoryError:
            self.skipTest('not enough memory (%.0f MiB required)' % (size / 2**20))
        try:
            self.assertRaises(OverflowError, s.upper)
        finally:
            # Release the large string even if the assertion fails.
            del s
914

915
    def test_contains(self):
916
        # Testing Unicode contains method
917
        self.assertIn('a', 'abdb')
918
        self.assertIn('a', 'bdab')
919
        self.assertIn('a', 'bdaba')
920
        self.assertIn('a', 'bdba')
921
        self.assertNotIn('a', 'bdb')
922
        self.assertIn('a', 'bdba')
923
        self.assertIn('a', ('a',1,None))
924
        self.assertIn('a', (1,None,'a'))
925
        self.assertIn('a', ('a',1,None))
926
        self.assertIn('a', (1,None,'a'))
927
        self.assertNotIn('a', ('x',1,'y'))
928
        self.assertNotIn('a', ('x',1,None))
929
        self.assertNotIn('abcd', 'abcxxxx')
930
        self.assertIn('ab', 'abcd')
931
        self.assertIn('ab', 'abc')
932
        self.assertIn('ab', (1,None,'ab'))
933
        self.assertIn('', 'abc')
934
        self.assertIn('', '')
935
        self.assertIn('', 'abc')
936
        self.assertNotIn('\0', 'abc')
937
        self.assertIn('\0', '\0abc')
938
        self.assertIn('\0', 'abc\0')
939
        self.assertIn('a', '\0abc')
940
        self.assertIn('asdf', 'asdf')
941
        self.assertNotIn('asdf', 'asd')
942
        self.assertNotIn('asdf', '')
943

944
        self.assertRaises(TypeError, "abc".__contains__)
945
        # test mixed kinds
946
        for fill in ('a', '\u0100', '\U00010300'):
947
            fill *= 9
948
            for delim in ('c', '\u0102', '\U00010302'):
949
                self.assertNotIn(delim, fill)
950
                self.assertIn(delim, fill + delim)
951
                self.assertNotIn(delim * 2, fill)
952
                self.assertIn(delim * 2, fill + delim * 2)
953

954
    def test_issue18183(self):
955
        '\U00010000\U00100000'.lower()
956
        '\U00010000\U00100000'.casefold()
957
        '\U00010000\U00100000'.upper()
958
        '\U00010000\U00100000'.capitalize()
959
        '\U00010000\U00100000'.title()
960
        '\U00010000\U00100000'.swapcase()
961
        '\U00100000'.center(3, '\U00010000')
962
        '\U00100000'.ljust(3, '\U00010000')
963
        '\U00100000'.rjust(3, '\U00010000')
964

965
    def test_format(self):
966
        self.assertEqual(''.format(), '')
967
        self.assertEqual('a'.format(), 'a')
968
        self.assertEqual('ab'.format(), 'ab')
969
        self.assertEqual('a{{'.format(), 'a{')
970
        self.assertEqual('a}}'.format(), 'a}')
971
        self.assertEqual('{{b'.format(), '{b')
972
        self.assertEqual('}}b'.format(), '}b')
973
        self.assertEqual('a{{b'.format(), 'a{b')
974

975
        # examples from the PEP:
976
        import datetime
977
        self.assertEqual("My name is {0}".format('Fred'), "My name is Fred")
978
        self.assertEqual("My name is {0[name]}".format(dict(name='Fred')),
979
                         "My name is Fred")
980
        self.assertEqual("My name is {0} :-{{}}".format('Fred'),
981
                         "My name is Fred :-{}")
982

983
        d = datetime.date(2007, 8, 18)
984
        self.assertEqual("The year is {0.year}".format(d),
985
                         "The year is 2007")
986

987
        # classes we'll use for testing
988
        class C:
989
            def __init__(self, x=100):
990
                self._x = x
991
            def __format__(self, spec):
992
                return spec
993

994
        class D:
995
            def __init__(self, x):
996
                self.x = x
997
            def __format__(self, spec):
998
                return str(self.x)
999

1000
        # class with __str__, but no __format__
1001
        class E:
1002
            def __init__(self, x):
1003
                self.x = x
1004
            def __str__(self):
1005
                return 'E(' + self.x + ')'
1006

1007
        # class with __repr__, but no __format__ or __str__
1008
        class F:
1009
            def __init__(self, x):
1010
                self.x = x
1011
            def __repr__(self):
1012
                return 'F(' + self.x + ')'
1013

1014
        # class with __format__ that forwards to string, for some format_spec's
1015
        class G:
1016
            def __init__(self, x):
1017
                self.x = x
1018
            def __str__(self):
1019
                return "string is " + self.x
1020
            def __format__(self, format_spec):
1021
                if format_spec == 'd':
1022
                    return 'G(' + self.x + ')'
1023
                return object.__format__(self, format_spec)
1024

1025
        class I(datetime.date):
1026
            def __format__(self, format_spec):
1027
                return self.strftime(format_spec)
1028

1029
        class J(int):
1030
            def __format__(self, format_spec):
1031
                return int.__format__(self * 2, format_spec)
1032

1033
        class M:
1034
            def __init__(self, x):
1035
                self.x = x
1036
            def __repr__(self):
1037
                return 'M(' + self.x + ')'
1038
            __str__ = None
1039

1040
        class N:
1041
            def __init__(self, x):
1042
                self.x = x
1043
            def __repr__(self):
1044
                return 'N(' + self.x + ')'
1045
            __format__ = None
1046

1047
        self.assertEqual(''.format(), '')
1048
        self.assertEqual('abc'.format(), 'abc')
1049
        self.assertEqual('{0}'.format('abc'), 'abc')
1050
        self.assertEqual('{0:}'.format('abc'), 'abc')
1051
#        self.assertEqual('{ 0 }'.format('abc'), 'abc')
1052
        self.assertEqual('X{0}'.format('abc'), 'Xabc')
1053
        self.assertEqual('{0}X'.format('abc'), 'abcX')
1054
        self.assertEqual('X{0}Y'.format('abc'), 'XabcY')
1055
        self.assertEqual('{1}'.format(1, 'abc'), 'abc')
1056
        self.assertEqual('X{1}'.format(1, 'abc'), 'Xabc')
1057
        self.assertEqual('{1}X'.format(1, 'abc'), 'abcX')
1058
        self.assertEqual('X{1}Y'.format(1, 'abc'), 'XabcY')
1059
        self.assertEqual('{0}'.format(-15), '-15')
1060
        self.assertEqual('{0}{1}'.format(-15, 'abc'), '-15abc')
1061
        self.assertEqual('{0}X{1}'.format(-15, 'abc'), '-15Xabc')
1062
        self.assertEqual('{{'.format(), '{')
1063
        self.assertEqual('}}'.format(), '}')
1064
        self.assertEqual('{{}}'.format(), '{}')
1065
        self.assertEqual('{{x}}'.format(), '{x}')
1066
        self.assertEqual('{{{0}}}'.format(123), '{123}')
1067
        self.assertEqual('{{{{0}}}}'.format(), '{{0}}')
1068
        self.assertEqual('}}{{'.format(), '}{')
1069
        self.assertEqual('}}x{{'.format(), '}x{')
1070

1071
        # weird field names
1072
        self.assertEqual("{0[foo-bar]}".format({'foo-bar':'baz'}), 'baz')
1073
        self.assertEqual("{0[foo bar]}".format({'foo bar':'baz'}), 'baz')
1074
        self.assertEqual("{0[ ]}".format({' ':3}), '3')
1075

1076
        self.assertEqual('{foo._x}'.format(foo=C(20)), '20')
1077
        self.assertEqual('{1}{0}'.format(D(10), D(20)), '2010')
1078
        self.assertEqual('{0._x.x}'.format(C(D('abc'))), 'abc')
1079
        self.assertEqual('{0[0]}'.format(['abc', 'def']), 'abc')
1080
        self.assertEqual('{0[1]}'.format(['abc', 'def']), 'def')
1081
        self.assertEqual('{0[1][0]}'.format(['abc', ['def']]), 'def')
1082
        self.assertEqual('{0[1][0].x}'.format(['abc', [D('def')]]), 'def')
1083

1084
        # strings
1085
        self.assertEqual('{0:.3s}'.format('abc'), 'abc')
1086
        self.assertEqual('{0:.3s}'.format('ab'), 'ab')
1087
        self.assertEqual('{0:.3s}'.format('abcdef'), 'abc')
1088
        self.assertEqual('{0:.0s}'.format('abcdef'), '')
1089
        self.assertEqual('{0:3.3s}'.format('abc'), 'abc')
1090
        self.assertEqual('{0:2.3s}'.format('abc'), 'abc')
1091
        self.assertEqual('{0:2.2s}'.format('abc'), 'ab')
1092
        self.assertEqual('{0:3.2s}'.format('abc'), 'ab ')
1093
        self.assertEqual('{0:x<0s}'.format('result'), 'result')
1094
        self.assertEqual('{0:x<5s}'.format('result'), 'result')
1095
        self.assertEqual('{0:x<6s}'.format('result'), 'result')
1096
        self.assertEqual('{0:x<7s}'.format('result'), 'resultx')
1097
        self.assertEqual('{0:x<8s}'.format('result'), 'resultxx')
1098
        self.assertEqual('{0: <7s}'.format('result'), 'result ')
1099
        self.assertEqual('{0:<7s}'.format('result'), 'result ')
1100
        self.assertEqual('{0:>7s}'.format('result'), ' result')
1101
        self.assertEqual('{0:>8s}'.format('result'), '  result')
1102
        self.assertEqual('{0:^8s}'.format('result'), ' result ')
1103
        self.assertEqual('{0:^9s}'.format('result'), ' result  ')
1104
        self.assertEqual('{0:^10s}'.format('result'), '  result  ')
1105
        self.assertEqual('{0:10000}'.format('a'), 'a' + ' ' * 9999)
1106
        self.assertEqual('{0:10000}'.format(''), ' ' * 10000)
1107
        self.assertEqual('{0:10000000}'.format(''), ' ' * 10000000)
1108

1109
        # issue 12546: use \x00 as a fill character
1110
        self.assertEqual('{0:\x00<6s}'.format('foo'), 'foo\x00\x00\x00')
1111
        self.assertEqual('{0:\x01<6s}'.format('foo'), 'foo\x01\x01\x01')
1112
        self.assertEqual('{0:\x00^6s}'.format('foo'), '\x00foo\x00\x00')
1113
        self.assertEqual('{0:^6s}'.format('foo'), ' foo  ')
1114

1115
        self.assertEqual('{0:\x00<6}'.format(3), '3\x00\x00\x00\x00\x00')
1116
        self.assertEqual('{0:\x01<6}'.format(3), '3\x01\x01\x01\x01\x01')
1117
        self.assertEqual('{0:\x00^6}'.format(3), '\x00\x003\x00\x00\x00')
1118
        self.assertEqual('{0:<6}'.format(3), '3     ')
1119

1120
        self.assertEqual('{0:\x00<6}'.format(3.14), '3.14\x00\x00')
1121
        self.assertEqual('{0:\x01<6}'.format(3.14), '3.14\x01\x01')
1122
        self.assertEqual('{0:\x00^6}'.format(3.14), '\x003.14\x00')
1123
        self.assertEqual('{0:^6}'.format(3.14), ' 3.14 ')
1124

1125
        self.assertEqual('{0:\x00<12}'.format(3+2.0j), '(3+2j)\x00\x00\x00\x00\x00\x00')
1126
        self.assertEqual('{0:\x01<12}'.format(3+2.0j), '(3+2j)\x01\x01\x01\x01\x01\x01')
1127
        self.assertEqual('{0:\x00^12}'.format(3+2.0j), '\x00\x00\x00(3+2j)\x00\x00\x00')
1128
        self.assertEqual('{0:^12}'.format(3+2.0j), '   (3+2j)   ')
1129

1130
        # format specifiers for user defined type
1131
        self.assertEqual('{0:abc}'.format(C()), 'abc')
1132

1133
        # !r, !s and !a coercions
1134
        self.assertEqual('{0!s}'.format('Hello'), 'Hello')
1135
        self.assertEqual('{0!s:}'.format('Hello'), 'Hello')
1136
        self.assertEqual('{0!s:15}'.format('Hello'), 'Hello          ')
1137
        self.assertEqual('{0!s:15s}'.format('Hello'), 'Hello          ')
1138
        self.assertEqual('{0!r}'.format('Hello'), "'Hello'")
1139
        self.assertEqual('{0!r:}'.format('Hello'), "'Hello'")
1140
        self.assertEqual('{0!r}'.format(F('Hello')), 'F(Hello)')
1141
        self.assertEqual('{0!r}'.format('\u0378'), "'\\u0378'") # nonprintable
1142
        self.assertEqual('{0!r}'.format('\u0374'), "'\u0374'")  # printable
1143
        self.assertEqual('{0!r}'.format(F('\u0374')), 'F(\u0374)')
1144
        self.assertEqual('{0!a}'.format('Hello'), "'Hello'")
1145
        self.assertEqual('{0!a}'.format('\u0378'), "'\\u0378'") # nonprintable
1146
        self.assertEqual('{0!a}'.format('\u0374'), "'\\u0374'") # printable
1147
        self.assertEqual('{0!a:}'.format('Hello'), "'Hello'")
1148
        self.assertEqual('{0!a}'.format(F('Hello')), 'F(Hello)')
1149
        self.assertEqual('{0!a}'.format(F('\u0374')), 'F(\\u0374)')
1150

1151
        # test fallback to object.__format__
1152
        self.assertEqual('{0}'.format({}), '{}')
1153
        self.assertEqual('{0}'.format([]), '[]')
1154
        self.assertEqual('{0}'.format([1]), '[1]')
1155

1156
        self.assertEqual('{0:d}'.format(G('data')), 'G(data)')
1157
        self.assertEqual('{0!s}'.format(G('data')), 'string is data')
1158

1159
        self.assertRaises(TypeError, '{0:^10}'.format, E('data'))
1160
        self.assertRaises(TypeError, '{0:^10s}'.format, E('data'))
1161
        self.assertRaises(TypeError, '{0:>15s}'.format, G('data'))
1162

1163
        self.assertEqual("{0:date: %Y-%m-%d}".format(I(year=2007,
1164
                                                       month=8,
1165
                                                       day=27)),
1166
                         "date: 2007-08-27")
1167

1168
        # test deriving from a builtin type and overriding __format__
1169
        self.assertEqual("{0}".format(J(10)), "20")
1170

1171

1172
        # string format specifiers
1173
        self.assertEqual('{0:}'.format('a'), 'a')
1174

1175
        # computed format specifiers
1176
        self.assertEqual("{0:.{1}}".format('hello world', 5), 'hello')
1177
        self.assertEqual("{0:.{1}s}".format('hello world', 5), 'hello')
1178
        self.assertEqual("{0:.{precision}s}".format('hello world', precision=5), 'hello')
1179
        self.assertEqual("{0:{width}.{precision}s}".format('hello world', width=10, precision=5), 'hello     ')
1180
        self.assertEqual("{0:{width}.{precision}s}".format('hello world', width='10', precision='5'), 'hello     ')
1181

1182
        # test various errors
1183
        self.assertRaises(ValueError, '{'.format)
1184
        self.assertRaises(ValueError, '}'.format)
1185
        self.assertRaises(ValueError, 'a{'.format)
1186
        self.assertRaises(ValueError, 'a}'.format)
1187
        self.assertRaises(ValueError, '{a'.format)
1188
        self.assertRaises(ValueError, '}a'.format)
1189
        self.assertRaises(IndexError, '{0}'.format)
1190
        self.assertRaises(IndexError, '{1}'.format, 'abc')
1191
        self.assertRaises(KeyError,   '{x}'.format)
1192
        self.assertRaises(ValueError, "}{".format)
1193
        self.assertRaises(ValueError, "abc{0:{}".format)
1194
        self.assertRaises(ValueError, "{0".format)
1195
        self.assertRaises(IndexError, "{0.}".format)
1196
        self.assertRaises(ValueError, "{0.}".format, 0)
1197
        self.assertRaises(ValueError, "{0[}".format)
1198
        self.assertRaises(ValueError, "{0[}".format, [])
1199
        self.assertRaises(KeyError,   "{0]}".format)
1200
        self.assertRaises(ValueError, "{0.[]}".format, 0)
1201
        self.assertRaises(ValueError, "{0..foo}".format, 0)
1202
        self.assertRaises(ValueError, "{0[0}".format, 0)
1203
        self.assertRaises(ValueError, "{0[0:foo}".format, 0)
1204
        self.assertRaises(KeyError,   "{c]}".format)
1205
        self.assertRaises(ValueError, "{{ {{{0}}".format, 0)
1206
        self.assertRaises(ValueError, "{0}}".format, 0)
1207
        self.assertRaises(KeyError,   "{foo}".format, bar=3)
1208
        self.assertRaises(ValueError, "{0!x}".format, 3)
1209
        self.assertRaises(ValueError, "{0!}".format, 0)
1210
        self.assertRaises(ValueError, "{0!rs}".format, 0)
1211
        self.assertRaises(ValueError, "{!}".format)
1212
        self.assertRaises(IndexError, "{:}".format)
1213
        self.assertRaises(IndexError, "{:s}".format)
1214
        self.assertRaises(IndexError, "{}".format)
1215
        big = "23098475029384702983476098230754973209482573"
1216
        self.assertRaises(ValueError, ("{" + big + "}").format)
1217
        self.assertRaises(ValueError, ("{[" + big + "]}").format, [0])
1218

1219
        # issue 6089
1220
        self.assertRaises(ValueError, "{0[0]x}".format, [None])
1221
        self.assertRaises(ValueError, "{0[0](10)}".format, [None])
1222

1223
        # can't have a replacement on the field name portion
1224
        self.assertRaises(TypeError, '{0[{1}]}'.format, 'abcdefg', 4)
1225

1226
        # exceed maximum recursion depth
1227
        self.assertRaises(ValueError, "{0:{1:{2}}}".format, 'abc', 's', '')
1228
        self.assertRaises(ValueError, "{0:{1:{2:{3:{4:{5:{6}}}}}}}".format,
1229
                          0, 1, 2, 3, 4, 5, 6, 7)
1230

1231
        # string format spec errors
1232
        self.assertRaises(ValueError, "{0:-s}".format, '')
1233
        self.assertRaises(ValueError, format, "", "-")
1234
        self.assertRaises(ValueError, "{0:=s}".format, '')
1235

1236
        # Alternate formatting is not supported
1237
        self.assertRaises(ValueError, format, '', '#')
1238
        self.assertRaises(ValueError, format, '', '#20')
1239

1240
        # Non-ASCII
1241
        self.assertEqual("{0:s}{1:s}".format("ABC", "\u0410\u0411\u0412"),
1242
                         'ABC\u0410\u0411\u0412')
1243
        self.assertEqual("{0:.3s}".format("ABC\u0410\u0411\u0412"),
1244
                         'ABC')
1245
        self.assertEqual("{0:.0s}".format("ABC\u0410\u0411\u0412"),
1246
                         '')
1247

1248
        self.assertEqual("{[{}]}".format({"{}": 5}), "5")
1249
        self.assertEqual("{[{}]}".format({"{}" : "a"}), "a")
1250
        self.assertEqual("{[{]}".format({"{" : "a"}), "a")
1251
        self.assertEqual("{[}]}".format({"}" : "a"}), "a")
1252
        self.assertEqual("{[[]}".format({"[" : "a"}), "a")
1253
        self.assertEqual("{[!]}".format({"!" : "a"}), "a")
1254
        self.assertRaises(ValueError, "{a{}b}".format, 42)
1255
        self.assertRaises(ValueError, "{a{b}".format, 42)
1256
        self.assertRaises(ValueError, "{[}".format, 42)
1257

1258
        self.assertEqual("0x{:0{:d}X}".format(0x0,16), "0x0000000000000000")
1259

1260
        # Blocking fallback
1261
        m = M('data')
1262
        self.assertEqual("{!r}".format(m), 'M(data)')
1263
        self.assertRaises(TypeError, "{!s}".format, m)
1264
        self.assertRaises(TypeError, "{}".format, m)
1265
        n = N('data')
1266
        self.assertEqual("{!r}".format(n), 'N(data)')
1267
        self.assertEqual("{!s}".format(n), 'N(data)')
1268
        self.assertRaises(TypeError, "{}".format, n)
1269

1270
    def test_format_map(self):
1271
        self.assertEqual(''.format_map({}), '')
1272
        self.assertEqual('a'.format_map({}), 'a')
1273
        self.assertEqual('ab'.format_map({}), 'ab')
1274
        self.assertEqual('a{{'.format_map({}), 'a{')
1275
        self.assertEqual('a}}'.format_map({}), 'a}')
1276
        self.assertEqual('{{b'.format_map({}), '{b')
1277
        self.assertEqual('}}b'.format_map({}), '}b')
1278
        self.assertEqual('a{{b'.format_map({}), 'a{b')
1279

1280
        # using mappings
1281
        class Mapping(dict):
1282
            def __missing__(self, key):
1283
                return key
1284
        self.assertEqual('{hello}'.format_map(Mapping()), 'hello')
1285
        self.assertEqual('{a} {world}'.format_map(Mapping(a='hello')), 'hello world')
1286

1287
        class InternalMapping:
1288
            def __init__(self):
1289
                self.mapping = {'a': 'hello'}
1290
            def __getitem__(self, key):
1291
                return self.mapping[key]
1292
        self.assertEqual('{a}'.format_map(InternalMapping()), 'hello')
1293

1294

1295
        class C:
1296
            def __init__(self, x=100):
1297
                self._x = x
1298
            def __format__(self, spec):
1299
                return spec
1300
        self.assertEqual('{foo._x}'.format_map({'foo': C(20)}), '20')
1301

1302
        # test various errors
1303
        self.assertRaises(TypeError, ''.format_map)
1304
        self.assertRaises(TypeError, 'a'.format_map)
1305

1306
        self.assertRaises(ValueError, '{'.format_map, {})
1307
        self.assertRaises(ValueError, '}'.format_map, {})
1308
        self.assertRaises(ValueError, 'a{'.format_map, {})
1309
        self.assertRaises(ValueError, 'a}'.format_map, {})
1310
        self.assertRaises(ValueError, '{a'.format_map, {})
1311
        self.assertRaises(ValueError, '}a'.format_map, {})
1312

1313
        # issue #12579: can't supply positional params to format_map
1314
        self.assertRaises(ValueError, '{}'.format_map, {'a' : 2})
1315
        self.assertRaises(ValueError, '{}'.format_map, 'a')
1316
        self.assertRaises(ValueError, '{a} {}'.format_map, {"a" : 2, "b" : 1})
1317

1318
        ZERO = 0
1319
        class BadMapping:
1320
            def __getitem__(self, key):
1321
                return 1 / ZERO
1322
        self.assertRaises(KeyError, '{a}'.format_map, {})
1323
        self.assertRaises(TypeError, '{a}'.format_map, [])
1324
        self.assertRaises(ZeroDivisionError, '{a}'.format_map, BadMapping())
1325

1326
    def test_format_huge_precision(self):
1327
        format_string = ".{}f".format(sys.maxsize + 1)
1328
        with self.assertRaises(ValueError):
1329
            result = format(2.34, format_string)
1330

1331
    def test_format_huge_width(self):
1332
        format_string = "{}f".format(sys.maxsize + 1)
1333
        with self.assertRaises(ValueError):
1334
            result = format(2.34, format_string)
1335

1336
    def test_format_huge_item_number(self):
1337
        format_string = "{{{}:.6f}}".format(sys.maxsize + 1)
1338
        with self.assertRaises(ValueError):
1339
            result = format_string.format(2.34)
1340

1341
    def test_format_auto_numbering(self):
1342
        class C:
1343
            def __init__(self, x=100):
1344
                self._x = x
1345
            def __format__(self, spec):
1346
                return spec
1347

1348
        self.assertEqual('{}'.format(10), '10')
1349
        self.assertEqual('{:5}'.format('s'), 's    ')
1350
        self.assertEqual('{!r}'.format('s'), "'s'")
1351
        self.assertEqual('{._x}'.format(C(10)), '10')
1352
        self.assertEqual('{[1]}'.format([1, 2]), '2')
1353
        self.assertEqual('{[a]}'.format({'a':4, 'b':2}), '4')
1354
        self.assertEqual('a{}b{}c'.format(0, 1), 'a0b1c')
1355

1356
        self.assertEqual('a{:{}}b'.format('x', '^10'), 'a    x     b')
1357
        self.assertEqual('a{:{}x}b'.format(20, '#'), 'a0x14b')
1358

1359
        # can't mix and match numbering and auto-numbering
1360
        self.assertRaises(ValueError, '{}{1}'.format, 1, 2)
1361
        self.assertRaises(ValueError, '{1}{}'.format, 1, 2)
1362
        self.assertRaises(ValueError, '{:{1}}'.format, 1, 2)
1363
        self.assertRaises(ValueError, '{0:{}}'.format, 1, 2)
1364

1365
        # can mix and match auto-numbering and named
1366
        self.assertEqual('{f}{}'.format(4, f='test'), 'test4')
1367
        self.assertEqual('{}{f}'.format(4, f='test'), '4test')
1368
        self.assertEqual('{:{f}}{g}{}'.format(1, 3, g='g', f=2), ' 1g3')
1369
        self.assertEqual('{f:{}}{}{g}'.format(2, 4, f=1, g='g'), ' 14g')
1370

1371
    def test_formatting(self):
        """Exercise %-style formatting of str after the shared mixin checks."""
        MixinStrUnicodeUserStringTest.test_formatting(self)
        # Testing Unicode formatting strings...
        self.assertEqual("%s, %s" % ("abc", "abc"), 'abc, abc')
        self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", 1, 2, 3), 'abc, abc, 1, 2.000000,  3.00')
        self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", 1, -2, 3), 'abc, abc, 1, -2.000000,  3.00')
        self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", -1, -2, 3.5), 'abc, abc, -1, -2.000000,  3.50')
        self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", -1, -2, 3.57), 'abc, abc, -1, -2.000000,  3.57')
        self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", -1, -2, 1003.57), 'abc, abc, -1, -2.000000, 1003.57')
        if not sys.platform.startswith('java'):
            self.assertEqual("%r, %r" % (b"abc", "abc"), "b'abc', 'abc'")
            self.assertEqual("%r" % ("\u1234",), "'\u1234'")
            self.assertEqual("%a" % ("\u1234",), "'\\u1234'")
        self.assertEqual("%(x)s, %(y)s" % {'x':"abc", 'y':"def"}, 'abc, def')
        self.assertEqual("%(x)s, %(\xfc)s" % {'x':"abc", '\xfc':"def"}, 'abc, def')

        self.assertEqual('%c' % 0x1234, '\u1234')
        self.assertEqual('%c' % 0x21483, '\U00021483')
        self.assertRaises(OverflowError, "%c".__mod__, (0x110000,))
        self.assertEqual('%c' % '\U00021483', '\U00021483')
        self.assertRaises(TypeError, "%c".__mod__, "aa")
        self.assertRaises(ValueError, "%.1\u1032f".__mod__, (1.0/3))
        self.assertRaises(TypeError, "%i".__mod__, "aa")

        # formatting jobs delegated from the string implementation:
        # (each distinct assertion is kept once; verbatim repeats were dropped)
        self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...')
        self.assertEqual('...%(foo)s...' % {'foo':"abc",'def':123}, '...abc...')
        self.assertEqual('...%s...%s...%s...%s...' % (1,2,3,"abc"), '...1...2...3...abc...')
        self.assertEqual('...%%...%%s...%s...%s...%s...%s...' % (1,2,3,"abc"), '...%...%s...1...2...3...abc...')
        self.assertEqual('...%s...' % "abc", '...abc...')
        self.assertEqual('%*s' % (5,'abc',), '  abc')
        self.assertEqual('%*s' % (-5,'abc',), 'abc  ')
        self.assertEqual('%*.*s' % (5,2,'abc',), '   ab')
        self.assertEqual('%*.*s' % (5,3,'abc',), '  abc')
        self.assertEqual('%i %*.*s' % (10, 5,3,'abc',), '10   abc')
        self.assertEqual('%i%s %*.*s' % (10, 3, 5, 3, 'abc',), '103   abc')
        self.assertEqual('%c' % 'a', 'a')
        class Wrapper:
            def __str__(self):
                return '\u1234'
        self.assertEqual('%s' % Wrapper(), '\u1234')

        # issue 3382
        NAN = float('nan')
        INF = float('inf')
        self.assertEqual('%f' % NAN, 'nan')
        self.assertEqual('%F' % NAN, 'NAN')
        self.assertEqual('%f' % INF, 'inf')
        self.assertEqual('%F' % INF, 'INF')

        # PEP 393
        self.assertEqual('%.1s' % "a\xe9\u20ac", 'a')
        self.assertEqual('%.2s' % "a\xe9\u20ac", 'a\xe9')

        #issue 19995
        class PseudoInt:
            def __init__(self, value):
                self.value = int(value)
            def __int__(self):
                return self.value
            def __index__(self):
                return self.value
        # defines __int__ only (no __index__), so integer formats must reject it
        class PseudoFloat:
            def __init__(self, value):
                self.value = float(value)
            def __int__(self):
                return int(self.value)
        pi = PseudoFloat(3.1415)
        letter_m = PseudoInt(109)
        self.assertEqual('%x' % 42, '2a')
        self.assertEqual('%X' % 15, 'F')
        self.assertEqual('%o' % 9, '11')
        self.assertEqual('%c' % 109, 'm')
        self.assertEqual('%x' % letter_m, '6d')
        self.assertEqual('%X' % letter_m, '6D')
        self.assertEqual('%o' % letter_m, '155')
        self.assertEqual('%c' % letter_m, 'm')
        self.assertRaisesRegex(TypeError, '%x format: an integer is required, not float', operator.mod, '%x', 3.14)
        self.assertRaisesRegex(TypeError, '%X format: an integer is required, not float', operator.mod, '%X', 2.11)
        self.assertRaisesRegex(TypeError, '%o format: an integer is required, not float', operator.mod, '%o', 1.79)
        self.assertRaisesRegex(TypeError, '%x format: an integer is required, not PseudoFloat', operator.mod, '%x', pi)
        self.assertRaises(TypeError, operator.mod, '%c', pi)

    def test_formatting_with_enum(self):
1459
        # issue18780
1460
        import enum
1461
        class Float(float, enum.Enum):
1462
            PI = 3.1415926
1463
        class Int(enum.IntEnum):
1464
            IDES = 15
1465
        class Str(str, enum.Enum):
1466
            ABC = 'abc'
1467
        # Testing Unicode formatting strings...
1468
        self.assertEqual(("%s, %s" % (Str.ABC, Str.ABC)).replace("Str.", ""),
1469
                         'ABC, ABC')
1470
        self.assertEqual(("%s, %s, %d, %i, %u, %f, %5.2f" %
1471
                        (Str.ABC, Str.ABC,
1472
                         Int.IDES, Int.IDES, Int.IDES,
1473
                         Float.PI, Float.PI)).replace("Str.", ""),
1474
                         'ABC, ABC, 15, 15, 15, 3.141593,  3.14')
1475

1476
        # formatting jobs delegated from the string implementation:
1477
        self.assertEqual(('...%(foo)s...' % {'foo':Str.ABC}).replace("Str.", ""),
1478
                         '...ABC...')
1479
        self.assertEqual(('...%(foo)s...' % {'foo':Int.IDES}).replace("Int.", ""),
1480
                         '...IDES...' if sys.version_info < (3,11) else '...15...')
1481
        self.assertEqual('...%(foo)i...' % {'foo':Int.IDES},
1482
                         '...15...')
1483
        self.assertEqual('...%(foo)d...' % {'foo':Int.IDES},
1484
                         '...15...')
1485
        self.assertEqual('...%(foo)u...' % {'foo':Int.IDES, 'def':Float.PI},
1486
                         '...15...')
1487
        self.assertEqual('...%(foo)f...' % {'foo':Float.PI,'def':123},
1488
                         '...3.141593...')
1489

1490
    def test_formatting_huge_precision(self):
1491
        format_string = "%.{}f".format(sys.maxsize + 1)
1492
        with self.assertRaises(ValueError):
1493
            result = format_string % 2.34
1494

1495
    @unittest.skip('BROKEN!')
1496
    def test_issue28598_strsubclass_rhs(self):
1497
        # A subclass of str with an __rmod__ method should be able to hook
1498
        # into the % operator
1499
        class SubclassedStr(str):
1500
            def __rmod__(self, other):
1501
                return 'Success, self.__rmod__({!r}) was called'.format(other)
1502
        self.assertEqual('lhs %% %r' % SubclassedStr('rhs'),
1503
                         "Success, self.__rmod__('lhs %% %r') was called")
1504

1505
    @support.cpython_only
    def test_formatting_huge_precision_c_limits(self):
        """%-formatting must raise ValueError for a precision of INT_MAX + 1."""
        from _testcapi import INT_MAX
        format_string = "%.{}f".format(INT_MAX + 1)
        with self.assertRaises(ValueError):
            format_string % 2.34

    def test_formatting_huge_width(self):
1513
        format_string = "%{}f".format(sys.maxsize + 1)
1514
        with self.assertRaises(ValueError):
1515
            result = format_string % 2.34
1516

1517
    def test_startswith_endswith_errors(self):
1518
        for meth in ('foo'.startswith, 'foo'.endswith):
1519
            with self.assertRaises(TypeError) as cm:
1520
                meth(['f'])
1521
            exc = str(cm.exception)
1522
            self.assertIn('str', exc)
1523
            self.assertIn('tuple', exc)
1524

1525
    @support.run_with_locale('LC_ALL', 'de_DE', 'fr_FR')
    def test_format_float(self):
        """Check that '%.1f' uses '.' as the decimal separator."""
        # should not format with a comma, but always with C locale
        self.assertEqual('1.0', '%.1f' % 1.0)

    def test_constructor(self):
        """Check str(obj) and str(obj, encoding, errors) construction."""
        # unicode(obj) tests (this maps to PyObject_Unicode() at C level)

        self.assertEqual(
            str('unicode remains unicode'),
            'unicode remains unicode'
        )

        for text in ('ascii', '\xe9', '\u20ac', '\U0010FFFF'):
            subclass = StrSubclass(text)
            self.assertEqual(str(subclass), text)
            self.assertEqual(len(subclass), len(text))
            if text == 'ascii':
                self.assertEqual(subclass.encode('ascii'), b'ascii')
                self.assertEqual(subclass.encode('utf-8'), b'ascii')

        self.assertEqual(
            str('strings are converted to unicode'),
            'strings are converted to unicode'
        )

        class StringCompat:
            def __init__(self, x):
                self.x = x
            def __str__(self):
                return self.x

        self.assertEqual(
            str(StringCompat('__str__ compatible objects are recognized')),
            '__str__ compatible objects are recognized'
        )

        # unicode(obj) is compatible to str():

        o = StringCompat('unicode(obj) is compatible to str()')
        self.assertEqual(str(o), 'unicode(obj) is compatible to str()')

        for obj in (123, 123.45):
            self.assertEqual(str(obj), str(str(obj)))

        # unicode(obj, encoding, error) tests (this maps to
        # PyUnicode_FromEncodedObject() at C level)

        if not sys.platform.startswith('java'):
            self.assertRaises(
                TypeError,
                str,
                'decoding unicode is not supported',
                'utf-8',
                'strict'
            )

        self.assertEqual(
            str(b'strings are decoded to unicode', 'utf-8', 'strict'),
            'strings are decoded to unicode'
        )

        if not sys.platform.startswith('java'):
            self.assertEqual(
                str(
                    memoryview(b'character buffers are decoded to unicode'),
                    'utf-8',
                    'strict'
                ),
                'character buffers are decoded to unicode'
            )

        self.assertRaises(TypeError, str, 42, 42, 42)

    def test_constructor_keyword_args(self):
1601
        """Pass various keyword argument combinations to the constructor."""
1602
        # The object argument can be passed as a keyword.
1603
        self.assertEqual(str(object='foo'), 'foo')
1604
        self.assertEqual(str(object=b'foo', encoding='utf-8'), 'foo')
1605
        # The errors argument without encoding triggers "decode" mode.
1606
        self.assertEqual(str(b'foo', errors='strict'), 'foo')  # not "b'foo'"
1607
        self.assertEqual(str(object=b'foo', errors='strict'), 'foo')
1608

1609
    def test_constructor_defaults(self):
1610
        """Check the constructor argument defaults."""
1611
        # The object argument defaults to '' or b''.
1612
        self.assertEqual(str(), '')
1613
        self.assertEqual(str(errors='strict'), '')
1614
        utf8_cent = '¢'.encode('utf-8')
1615
        # The encoding argument defaults to utf-8.
1616
        self.assertEqual(str(utf8_cent, errors='strict'), '¢')
1617
        # The errors argument defaults to strict.
1618
        self.assertRaises(UnicodeDecodeError, str, utf8_cent, encoding='ascii')
1619

1620
    def test_codecs_utf7(self):
1621
        utfTests = [
1622
            ('A\u2262\u0391.', b'A+ImIDkQ.'),             # RFC2152 example
1623
            ('Hi Mom -\u263a-!', b'Hi Mom -+Jjo--!'),     # RFC2152 example
1624
            ('\u65E5\u672C\u8A9E', b'+ZeVnLIqe-'),        # RFC2152 example
1625
            ('Item 3 is \u00a31.', b'Item 3 is +AKM-1.'), # RFC2152 example
1626
            ('+', b'+-'),
1627
            ('+-', b'+--'),
1628
            ('+?', b'+-?'),
1629
            (r'\?', b'+AFw?'),
1630
            ('+?', b'+-?'),
1631
            (r'\\?', b'+AFwAXA?'),
1632
            (r'\\\?', b'+AFwAXABc?'),
1633
            (r'++--', b'+-+---'),
1634
            ('\U000abcde', b'+2m/c3g-'),                  # surrogate pairs
1635
            ('/', b'/'),
1636
        ]
1637

1638
        for (x, y) in utfTests:
1639
            self.assertEqual(x.encode('utf-7'), y)
1640

1641
        # Unpaired surrogates are passed through
1642
        self.assertEqual('\uD801'.encode('utf-7'), b'+2AE-')
1643
        self.assertEqual('\uD801x'.encode('utf-7'), b'+2AE-x')
1644
        self.assertEqual('\uDC01'.encode('utf-7'), b'+3AE-')
1645
        self.assertEqual('\uDC01x'.encode('utf-7'), b'+3AE-x')
1646
        self.assertEqual(b'+2AE-'.decode('utf-7'), '\uD801')
1647
        self.assertEqual(b'+2AE-x'.decode('utf-7'), '\uD801x')
1648
        self.assertEqual(b'+3AE-'.decode('utf-7'), '\uDC01')
1649
        self.assertEqual(b'+3AE-x'.decode('utf-7'), '\uDC01x')
1650

1651
        self.assertEqual('\uD801\U000abcde'.encode('utf-7'), b'+2AHab9ze-')
1652
        self.assertEqual(b'+2AHab9ze-'.decode('utf-7'), '\uD801\U000abcde')
1653

1654
        # Issue #2242: crash on some Windows/MSVC versions
1655
        self.assertEqual(b'+\xc1'.decode('utf-7', 'ignore'), '')
1656

1657
        # Direct encoded characters
1658
        set_d = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'(),-./:?"
1659
        # Optional direct characters
1660
        set_o = '!"#$%&*;<=>@[]^_`{|}'
1661
        for c in set_d:
1662
            self.assertEqual(c.encode('utf7'), c.encode('ascii'))
1663
            self.assertEqual(c.encode('ascii').decode('utf7'), c)
1664
        for c in set_o:
1665
            self.assertEqual(c.encode('ascii').decode('utf7'), c)
1666

1667
        if sys.version_info >= (3, 8):
1668
            with self.assertRaisesRegex(UnicodeDecodeError,
1669
                                        'ill-formed sequence'):
1670
                b'+@'.decode('utf-7')
1671

1672
    def test_codecs_utf8(self):
1673
        self.assertEqual(''.encode('utf-8'), b'')
1674
        self.assertEqual('\u20ac'.encode('utf-8'), b'\xe2\x82\xac')
1675
        self.assertEqual('\U00010002'.encode('utf-8'), b'\xf0\x90\x80\x82')
1676
        self.assertEqual('\U00023456'.encode('utf-8'), b'\xf0\xa3\x91\x96')
1677
        self.assertEqual('\ud800'.encode('utf-8', 'surrogatepass'), b'\xed\xa0\x80')
1678
        self.assertEqual('\udc00'.encode('utf-8', 'surrogatepass'), b'\xed\xb0\x80')
1679
        self.assertEqual(('\U00010002'*10).encode('utf-8'),
1680
                         b'\xf0\x90\x80\x82'*10)
1681
        self.assertEqual(
1682
            '\u6b63\u78ba\u306b\u8a00\u3046\u3068\u7ffb\u8a33\u306f'
1683
            '\u3055\u308c\u3066\u3044\u307e\u305b\u3093\u3002\u4e00'
1684
            '\u90e8\u306f\u30c9\u30a4\u30c4\u8a9e\u3067\u3059\u304c'
1685
            '\u3001\u3042\u3068\u306f\u3067\u305f\u3089\u3081\u3067'
1686
            '\u3059\u3002\u5b9f\u969b\u306b\u306f\u300cWenn ist das'
1687
            ' Nunstuck git und'.encode('utf-8'),
1688
            b'\xe6\xad\xa3\xe7\xa2\xba\xe3\x81\xab\xe8\xa8\x80\xe3\x81'
1689
            b'\x86\xe3\x81\xa8\xe7\xbf\xbb\xe8\xa8\xb3\xe3\x81\xaf\xe3'
1690
            b'\x81\x95\xe3\x82\x8c\xe3\x81\xa6\xe3\x81\x84\xe3\x81\xbe'
1691
            b'\xe3\x81\x9b\xe3\x82\x93\xe3\x80\x82\xe4\xb8\x80\xe9\x83'
1692
            b'\xa8\xe3\x81\xaf\xe3\x83\x89\xe3\x82\xa4\xe3\x83\x84\xe8'
1693
            b'\xaa\x9e\xe3\x81\xa7\xe3\x81\x99\xe3\x81\x8c\xe3\x80\x81'
1694
            b'\xe3\x81\x82\xe3\x81\xa8\xe3\x81\xaf\xe3\x81\xa7\xe3\x81'
1695
            b'\x9f\xe3\x82\x89\xe3\x82\x81\xe3\x81\xa7\xe3\x81\x99\xe3'
1696
            b'\x80\x82\xe5\xae\x9f\xe9\x9a\x9b\xe3\x81\xab\xe3\x81\xaf'
1697
            b'\xe3\x80\x8cWenn ist das Nunstuck git und'
1698
        )
1699

1700
        # UTF-8 specific decoding tests
1701
        self.assertEqual(str(b'\xf0\xa3\x91\x96', 'utf-8'), '\U00023456' )
1702
        self.assertEqual(str(b'\xf0\x90\x80\x82', 'utf-8'), '\U00010002' )
1703
        self.assertEqual(str(b'\xe2\x82\xac', 'utf-8'), '\u20ac' )
1704

1705
        # Other possible utf-8 test cases:
1706
        # * strict decoding testing for all of the
1707
        #   UTF8_ERROR cases in PyUnicode_DecodeUTF8
1708

1709
    def test_utf8_decode_valid_sequences(self):
1710
        sequences = [
1711
            # single byte
1712
            (b'\x00', '\x00'), (b'a', 'a'), (b'\x7f', '\x7f'),
1713
            # 2 bytes
1714
            (b'\xc2\x80', '\x80'), (b'\xdf\xbf', '\u07ff'),
1715
            # 3 bytes
1716
            (b'\xe0\xa0\x80', '\u0800'), (b'\xed\x9f\xbf', '\ud7ff'),
1717
            (b'\xee\x80\x80', '\uE000'), (b'\xef\xbf\xbf', '\uffff'),
1718
            # 4 bytes
1719
            (b'\xF0\x90\x80\x80', '\U00010000'),
1720
            (b'\xf4\x8f\xbf\xbf', '\U0010FFFF')
1721
        ]
1722
        for seq, res in sequences:
1723
            self.assertEqual(seq.decode('utf-8'), res)
1724

1725

1726
    def test_utf8_decode_invalid_sequences(self):
1727
        # continuation bytes in a sequence of 2, 3, or 4 bytes
1728
        continuation_bytes = [bytes([x]) for x in range(0x80, 0xC0)]
1729
        # start bytes of a 2-byte sequence equivalent to code points < 0x7F
1730
        invalid_2B_seq_start_bytes = [bytes([x]) for x in range(0xC0, 0xC2)]
1731
        # start bytes of a 4-byte sequence equivalent to code points > 0x10FFFF
1732
        invalid_4B_seq_start_bytes = [bytes([x]) for x in range(0xF5, 0xF8)]
1733
        invalid_start_bytes = (
1734
            continuation_bytes + invalid_2B_seq_start_bytes +
1735
            invalid_4B_seq_start_bytes + [bytes([x]) for x in range(0xF7, 0x100)]
1736
        )
1737

1738
        for byte in invalid_start_bytes:
1739
            self.assertRaises(UnicodeDecodeError, byte.decode, 'utf-8')
1740

1741
        for sb in invalid_2B_seq_start_bytes:
1742
            for cb in continuation_bytes:
1743
                self.assertRaises(UnicodeDecodeError, (sb+cb).decode, 'utf-8')
1744

1745
        for sb in invalid_4B_seq_start_bytes:
1746
            for cb1 in continuation_bytes[:3]:
1747
                for cb3 in continuation_bytes[:3]:
1748
                    self.assertRaises(UnicodeDecodeError,
1749
                                      (sb+cb1+b'\x80'+cb3).decode, 'utf-8')
1750

1751
        for cb in [bytes([x]) for x in range(0x80, 0xA0)]:
1752
            self.assertRaises(UnicodeDecodeError,
1753
                              (b'\xE0'+cb+b'\x80').decode, 'utf-8')
1754
            self.assertRaises(UnicodeDecodeError,
1755
                              (b'\xE0'+cb+b'\xBF').decode, 'utf-8')
1756
        # surrogates
1757
        for cb in [bytes([x]) for x in range(0xA0, 0xC0)]:
1758
            self.assertRaises(UnicodeDecodeError,
1759
                              (b'\xED'+cb+b'\x80').decode, 'utf-8')
1760
            self.assertRaises(UnicodeDecodeError,
1761
                              (b'\xED'+cb+b'\xBF').decode, 'utf-8')
1762
        for cb in [bytes([x]) for x in range(0x80, 0x90)]:
1763
            self.assertRaises(UnicodeDecodeError,
1764
                              (b'\xF0'+cb+b'\x80\x80').decode, 'utf-8')
1765
            self.assertRaises(UnicodeDecodeError,
1766
                              (b'\xF0'+cb+b'\xBF\xBF').decode, 'utf-8')
1767
        for cb in [bytes([x]) for x in range(0x90, 0xC0)]:
1768
            self.assertRaises(UnicodeDecodeError,
1769
                              (b'\xF4'+cb+b'\x80\x80').decode, 'utf-8')
1770
            self.assertRaises(UnicodeDecodeError,
1771
                              (b'\xF4'+cb+b'\xBF\xBF').decode, 'utf-8')
1772

1773
    def test_issue8271(self):
1774
        # Issue #8271: during the decoding of an invalid UTF-8 byte sequence,
1775
        # only the start byte and the continuation byte(s) are now considered
1776
        # invalid, instead of the number of bytes specified by the start byte.
1777
        # See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf (page 95,
1778
        # table 3-8, Row 2) for more information about the algorithm used.
1779
        FFFD = '\ufffd'
1780
        sequences = [
1781
            # invalid start bytes
1782
            (b'\x80', FFFD), # continuation byte
1783
            (b'\x80\x80', FFFD*2), # 2 continuation bytes
1784
            (b'\xc0', FFFD),
1785
            (b'\xc0\xc0', FFFD*2),
1786
            (b'\xc1', FFFD),
1787
            (b'\xc1\xc0', FFFD*2),
1788
            (b'\xc0\xc1', FFFD*2),
1789
            # with start byte of a 2-byte sequence
1790
            (b'\xc2', FFFD), # only the start byte
1791
            (b'\xc2\xc2', FFFD*2), # 2 start bytes
1792
            (b'\xc2\xc2\xc2', FFFD*3), # 3 start bytes
1793
            (b'\xc2\x41', FFFD+'A'), # invalid continuation byte
1794
            # with start byte of a 3-byte sequence
1795
            (b'\xe1', FFFD), # only the start byte
1796
            (b'\xe1\xe1', FFFD*2), # 2 start bytes
1797
            (b'\xe1\xe1\xe1', FFFD*3), # 3 start bytes
1798
            (b'\xe1\xe1\xe1\xe1', FFFD*4), # 4 start bytes
1799
            (b'\xe1\x80', FFFD), # only 1 continuation byte
1800
            (b'\xe1\x41', FFFD+'A'), # invalid continuation byte
1801
            (b'\xe1\x41\x80', FFFD+'A'+FFFD), # invalid cb followed by valid cb
1802
            (b'\xe1\x41\x41', FFFD+'AA'), # 2 invalid continuation bytes
1803
            (b'\xe1\x80\x41', FFFD+'A'), # only 1 valid continuation byte
1804
            (b'\xe1\x80\xe1\x41', FFFD*2+'A'), # 1 valid and the other invalid
1805
            (b'\xe1\x41\xe1\x80', FFFD+'A'+FFFD), # 1 invalid and the other valid
1806
            # with start byte of a 4-byte sequence
1807
            (b'\xf1', FFFD), # only the start byte
1808
            (b'\xf1\xf1', FFFD*2), # 2 start bytes
1809
            (b'\xf1\xf1\xf1', FFFD*3), # 3 start bytes
1810
            (b'\xf1\xf1\xf1\xf1', FFFD*4), # 4 start bytes
1811
            (b'\xf1\xf1\xf1\xf1\xf1', FFFD*5), # 5 start bytes
1812
            (b'\xf1\x80', FFFD), # only 1 continuation bytes
1813
            (b'\xf1\x80\x80', FFFD), # only 2 continuation bytes
1814
            (b'\xf1\x80\x41', FFFD+'A'), # 1 valid cb and 1 invalid
1815
            (b'\xf1\x80\x41\x41', FFFD+'AA'), # 1 valid cb and 1 invalid
1816
            (b'\xf1\x80\x80\x41', FFFD+'A'), # 2 valid cb and 1 invalid
1817
            (b'\xf1\x41\x80', FFFD+'A'+FFFD), # 1 invalid cv and 1 valid
1818
            (b'\xf1\x41\x80\x80', FFFD+'A'+FFFD*2), # 1 invalid cb and 2 invalid
1819
            (b'\xf1\x41\x80\x41', FFFD+'A'+FFFD+'A'), # 2 invalid cb and 1 invalid
1820
            (b'\xf1\x41\x41\x80', FFFD+'AA'+FFFD), # 1 valid cb and 1 invalid
1821
            (b'\xf1\x41\xf1\x80', FFFD+'A'+FFFD),
1822
            (b'\xf1\x41\x80\xf1', FFFD+'A'+FFFD*2),
1823
            (b'\xf1\xf1\x80\x41', FFFD*2+'A'),
1824
            (b'\xf1\x41\xf1\xf1', FFFD+'A'+FFFD*2),
1825
            # with invalid start byte of a 4-byte sequence (rfc2279)
1826
            (b'\xf5', FFFD), # only the start byte
1827
            (b'\xf5\xf5', FFFD*2), # 2 start bytes
1828
            (b'\xf5\x80', FFFD*2), # only 1 continuation byte
1829
            (b'\xf5\x80\x80', FFFD*3), # only 2 continuation byte
1830
            (b'\xf5\x80\x80\x80', FFFD*4), # 3 continuation bytes
1831
            (b'\xf5\x80\x41', FFFD*2+'A'), #  1 valid cb and 1 invalid
1832
            (b'\xf5\x80\x41\xf5', FFFD*2+'A'+FFFD),
1833
            (b'\xf5\x41\x80\x80\x41', FFFD+'A'+FFFD*2+'A'),
1834
            # with invalid start byte of a 5-byte sequence (rfc2279)
1835
            (b'\xf8', FFFD), # only the start byte
1836
            (b'\xf8\xf8', FFFD*2), # 2 start bytes
1837
            (b'\xf8\x80', FFFD*2), # only one continuation byte
1838
            (b'\xf8\x80\x41', FFFD*2 + 'A'), # 1 valid cb and 1 invalid
1839
            (b'\xf8\x80\x80\x80\x80', FFFD*5), # invalid 5 bytes seq with 5 bytes
1840
            # with invalid start byte of a 6-byte sequence (rfc2279)
1841
            (b'\xfc', FFFD), # only the start byte
1842
            (b'\xfc\xfc', FFFD*2), # 2 start bytes
1843
            (b'\xfc\x80\x80', FFFD*3), # only 2 continuation bytes
1844
            (b'\xfc\x80\x80\x80\x80\x80', FFFD*6), # 6 continuation bytes
1845
            # invalid start byte
1846
            (b'\xfe', FFFD),
1847
            (b'\xfe\x80\x80', FFFD*3),
1848
            # other sequences
1849
            (b'\xf1\x80\x41\x42\x43', '\ufffd\x41\x42\x43'),
1850
            (b'\xf1\x80\xff\x42\x43', '\ufffd\ufffd\x42\x43'),
1851
            (b'\xf1\x80\xc2\x81\x43', '\ufffd\x81\x43'),
1852
            (b'\x61\xF1\x80\x80\xE1\x80\xC2\x62\x80\x63\x80\xBF\x64',
1853
             '\x61\uFFFD\uFFFD\uFFFD\x62\uFFFD\x63\uFFFD\uFFFD\x64'),
1854
        ]
1855
        for n, (seq, res) in enumerate(sequences):
1856
            self.assertRaises(UnicodeDecodeError, seq.decode, 'utf-8', 'strict')
1857
            self.assertEqual(seq.decode('utf-8', 'replace'), res)
1858
            self.assertEqual((seq+b'b').decode('utf-8', 'replace'), res+'b')
1859
            self.assertEqual(seq.decode('utf-8', 'ignore'),
1860
                             res.replace('\uFFFD', ''))
1861

1862
    def assertCorrectUTF8Decoding(self, seq, res, err):
        """
        Check that the invalid UTF-8 sequence seq raises a UnicodeDecodeError
        whose message contains err when 'strict' is used, decodes to res when
        'replace' is used, and decodes to res with every U+FFFD removed when
        'ignore' is used.  The 'replace' and 'ignore' checks are repeated with
        seq embedded between b'aaaa' and b'bbbb'.
        """
        with self.assertRaises(UnicodeDecodeError) as cm:
            seq.decode('utf-8')
        exc = cm.exception

        self.assertIn(err, str(exc))
        self.assertEqual(seq.decode('utf-8', 'replace'), res)
        self.assertEqual((b'aaaa' + seq + b'bbbb').decode('utf-8', 'replace'),
                         'aaaa' + res + 'bbbb')
        # with 'ignore' the replacement characters are simply absent
        ignored = res.replace('\ufffd', '')
        self.assertEqual(seq.decode('utf-8', 'ignore'), ignored)
        self.assertEqual((b'aaaa' + seq + b'bbbb').decode('utf-8', 'ignore'),
                          'aaaa' + ignored + 'bbbb')

    def test_invalid_start_byte(self):
1882
        """
1883
        Test that an 'invalid start byte' error is raised when the first byte
1884
        is not in the ASCII range or is not a valid start byte of a 2-, 3-, or
1885
        4-bytes sequence. The invalid start byte is replaced with a single
1886
        U+FFFD when errors='replace'.
1887
        E.g. <80> is a continuation byte and can appear only after a start byte.
1888
        """
1889
        FFFD = '\ufffd'
1890
        for byte in b'\x80\xA0\x9F\xBF\xC0\xC1\xF5\xFF':
1891
            self.assertCorrectUTF8Decoding(bytes([byte]), '\ufffd',
1892
                                           'invalid start byte')
1893

1894
    def test_unexpected_end_of_data(self):
1895
        """
1896
        Test that an 'unexpected end of data' error is raised when the string
1897
        ends after a start byte of a 2-, 3-, or 4-bytes sequence without having
1898
        enough continuation bytes.  The incomplete sequence is replaced with a
1899
        single U+FFFD when errors='replace'.
1900
        E.g. in the sequence <F3 80 80>, F3 is the start byte of a 4-bytes
1901
        sequence, but it's followed by only 2 valid continuation bytes and the
1902
        last continuation bytes is missing.
1903
        Note: the continuation bytes must be all valid, if one of them is
1904
        invalid another error will be raised.
1905
        """
1906
        sequences = [
1907
            'C2', 'DF',
1908
            'E0 A0', 'E0 BF', 'E1 80', 'E1 BF', 'EC 80', 'EC BF',
1909
            'ED 80', 'ED 9F', 'EE 80', 'EE BF', 'EF 80', 'EF BF',
1910
            'F0 90', 'F0 BF', 'F0 90 80', 'F0 90 BF', 'F0 BF 80', 'F0 BF BF',
1911
            'F1 80', 'F1 BF', 'F1 80 80', 'F1 80 BF', 'F1 BF 80', 'F1 BF BF',
1912
            'F3 80', 'F3 BF', 'F3 80 80', 'F3 80 BF', 'F3 BF 80', 'F3 BF BF',
1913
            'F4 80', 'F4 8F', 'F4 80 80', 'F4 80 BF', 'F4 8F 80', 'F4 8F BF'
1914
        ]
1915
        FFFD = '\ufffd'
1916
        for seq in sequences:
1917
            self.assertCorrectUTF8Decoding(bytes.fromhex(seq), '\ufffd',
1918
                                           'unexpected end of data')
1919

1920
    def test_invalid_cb_for_2bytes_seq(self):
1921
        """
1922
        Test that an 'invalid continuation byte' error is raised when the
1923
        continuation byte of a 2-bytes sequence is invalid.  The start byte
1924
        is replaced by a single U+FFFD and the second byte is handled
1925
        separately when errors='replace'.
1926
        E.g. in the sequence <C2 41>, C2 is the start byte of a 2-bytes
1927
        sequence, but 41 is not a valid continuation byte because it's the
1928
        ASCII letter 'A'.
1929
        """
1930
        FFFD = '\ufffd'
1931
        FFFDx2 = FFFD * 2
1932
        sequences = [
1933
            ('C2 00', FFFD+'\x00'), ('C2 7F', FFFD+'\x7f'),
1934
            ('C2 C0', FFFDx2), ('C2 FF', FFFDx2),
1935
            ('DF 00', FFFD+'\x00'), ('DF 7F', FFFD+'\x7f'),
1936
            ('DF C0', FFFDx2), ('DF FF', FFFDx2),
1937
        ]
1938
        for seq, res in sequences:
1939
            self.assertCorrectUTF8Decoding(bytes.fromhex(seq), res,
1940
                                           'invalid continuation byte')
1941

1942
    def test_invalid_cb_for_3bytes_seq(self):
1943
        """
1944
        Test that an 'invalid continuation byte' error is raised when the
1945
        continuation byte(s) of a 3-bytes sequence are invalid.  When
1946
        errors='replace', if the first continuation byte is valid, the first
1947
        two bytes (start byte + 1st cb) are replaced by a single U+FFFD and the
1948
        third byte is handled separately, otherwise only the start byte is
1949
        replaced with a U+FFFD and the other continuation bytes are handled
1950
        separately.
1951
        E.g. in the sequence <E1 80 41>, E1 is the start byte of a 3-bytes
1952
        sequence, 80 is a valid continuation byte, but 41 is not a valid cb
1953
        because it's the ASCII letter 'A'.
1954
        Note: when the start byte is E0 or ED, the valid ranges for the first
1955
        continuation byte are limited to A0..BF and 80..9F respectively.
1956
        Python 2 used to consider all the bytes in range 80..BF valid when the
1957
        start byte was ED.  This is fixed in Python 3.
1958
        """
1959
        FFFD = '\ufffd'
1960
        FFFDx2 = FFFD * 2
1961
        sequences = [
1962
            ('E0 00', FFFD+'\x00'), ('E0 7F', FFFD+'\x7f'), ('E0 80', FFFDx2),
1963
            ('E0 9F', FFFDx2), ('E0 C0', FFFDx2), ('E0 FF', FFFDx2),
1964
            ('E0 A0 00', FFFD+'\x00'), ('E0 A0 7F', FFFD+'\x7f'),
1965
            ('E0 A0 C0', FFFDx2), ('E0 A0 FF', FFFDx2),
1966
            ('E0 BF 00', FFFD+'\x00'), ('E0 BF 7F', FFFD+'\x7f'),
1967
            ('E0 BF C0', FFFDx2), ('E0 BF FF', FFFDx2), ('E1 00', FFFD+'\x00'),
1968
            ('E1 7F', FFFD+'\x7f'), ('E1 C0', FFFDx2), ('E1 FF', FFFDx2),
1969
            ('E1 80 00', FFFD+'\x00'), ('E1 80 7F', FFFD+'\x7f'),
1970
            ('E1 80 C0', FFFDx2), ('E1 80 FF', FFFDx2),
1971
            ('E1 BF 00', FFFD+'\x00'), ('E1 BF 7F', FFFD+'\x7f'),
1972
            ('E1 BF C0', FFFDx2), ('E1 BF FF', FFFDx2), ('EC 00', FFFD+'\x00'),
1973
            ('EC 7F', FFFD+'\x7f'), ('EC C0', FFFDx2), ('EC FF', FFFDx2),
1974
            ('EC 80 00', FFFD+'\x00'), ('EC 80 7F', FFFD+'\x7f'),
1975
            ('EC 80 C0', FFFDx2), ('EC 80 FF', FFFDx2),
1976
            ('EC BF 00', FFFD+'\x00'), ('EC BF 7F', FFFD+'\x7f'),
1977
            ('EC BF C0', FFFDx2), ('EC BF FF', FFFDx2), ('ED 00', FFFD+'\x00'),
1978
            ('ED 7F', FFFD+'\x7f'),
1979
            ('ED A0', FFFDx2), ('ED BF', FFFDx2), # see note ^
1980
            ('ED C0', FFFDx2), ('ED FF', FFFDx2), ('ED 80 00', FFFD+'\x00'),
1981
            ('ED 80 7F', FFFD+'\x7f'), ('ED 80 C0', FFFDx2),
1982
            ('ED 80 FF', FFFDx2), ('ED 9F 00', FFFD+'\x00'),
1983
            ('ED 9F 7F', FFFD+'\x7f'), ('ED 9F C0', FFFDx2),
1984
            ('ED 9F FF', FFFDx2), ('EE 00', FFFD+'\x00'),
1985
            ('EE 7F', FFFD+'\x7f'), ('EE C0', FFFDx2), ('EE FF', FFFDx2),
1986
            ('EE 80 00', FFFD+'\x00'), ('EE 80 7F', FFFD+'\x7f'),
1987
            ('EE 80 C0', FFFDx2), ('EE 80 FF', FFFDx2),
1988
            ('EE BF 00', FFFD+'\x00'), ('EE BF 7F', FFFD+'\x7f'),
1989
            ('EE BF C0', FFFDx2), ('EE BF FF', FFFDx2), ('EF 00', FFFD+'\x00'),
1990
            ('EF 7F', FFFD+'\x7f'), ('EF C0', FFFDx2), ('EF FF', FFFDx2),
1991
            ('EF 80 00', FFFD+'\x00'), ('EF 80 7F', FFFD+'\x7f'),
1992
            ('EF 80 C0', FFFDx2), ('EF 80 FF', FFFDx2),
1993
            ('EF BF 00', FFFD+'\x00'), ('EF BF 7F', FFFD+'\x7f'),
1994
            ('EF BF C0', FFFDx2), ('EF BF FF', FFFDx2),
1995
        ]
1996
        for seq, res in sequences:
1997
            self.assertCorrectUTF8Decoding(bytes.fromhex(seq), res,
1998
                                           'invalid continuation byte')
1999

2000
    def test_invalid_cb_for_4bytes_seq(self):
2001
        """
2002
        Test that an 'invalid continuation byte' error is raised when the
2003
        continuation byte(s) of a 4-bytes sequence are invalid.  When
2004
        errors='replace',the start byte and all the following valid
2005
        continuation bytes are replaced with a single U+FFFD, and all the bytes
2006
        starting from the first invalid continuation bytes (included) are
2007
        handled separately.
2008
        E.g. in the sequence <E1 80 41>, E1 is the start byte of a 3-bytes
2009
        sequence, 80 is a valid continuation byte, but 41 is not a valid cb
2010
        because it's the ASCII letter 'A'.
2011
        Note: when the start byte is E0 or ED, the valid ranges for the first
2012
        continuation byte are limited to A0..BF and 80..9F respectively.
2013
        However, when the start byte is ED, Python 2 considers all the bytes
2014
        in range 80..BF valid.  This is fixed in Python 3.
2015
        """
2016
        FFFD = '\ufffd'
2017
        FFFDx2 = FFFD * 2
2018
        sequences = [
2019
            ('F0 00', FFFD+'\x00'), ('F0 7F', FFFD+'\x7f'), ('F0 80', FFFDx2),
2020
            ('F0 8F', FFFDx2), ('F0 C0', FFFDx2), ('F0 FF', FFFDx2),
2021
            ('F0 90 00', FFFD+'\x00'), ('F0 90 7F', FFFD+'\x7f'),
2022
            ('F0 90 C0', FFFDx2), ('F0 90 FF', FFFDx2),
2023
            ('F0 BF 00', FFFD+'\x00'), ('F0 BF 7F', FFFD+'\x7f'),
2024
            ('F0 BF C0', FFFDx2), ('F0 BF FF', FFFDx2),
2025
            ('F0 90 80 00', FFFD+'\x00'), ('F0 90 80 7F', FFFD+'\x7f'),
2026
            ('F0 90 80 C0', FFFDx2), ('F0 90 80 FF', FFFDx2),
2027
            ('F0 90 BF 00', FFFD+'\x00'), ('F0 90 BF 7F', FFFD+'\x7f'),
2028
            ('F0 90 BF C0', FFFDx2), ('F0 90 BF FF', FFFDx2),
2029
            ('F0 BF 80 00', FFFD+'\x00'), ('F0 BF 80 7F', FFFD+'\x7f'),
2030
            ('F0 BF 80 C0', FFFDx2), ('F0 BF 80 FF', FFFDx2),
2031
            ('F0 BF BF 00', FFFD+'\x00'), ('F0 BF BF 7F', FFFD+'\x7f'),
2032
            ('F0 BF BF C0', FFFDx2), ('F0 BF BF FF', FFFDx2),
2033
            ('F1 00', FFFD+'\x00'), ('F1 7F', FFFD+'\x7f'), ('F1 C0', FFFDx2),
2034
            ('F1 FF', FFFDx2), ('F1 80 00', FFFD+'\x00'),
2035
            ('F1 80 7F', FFFD+'\x7f'), ('F1 80 C0', FFFDx2),
2036
            ('F1 80 FF', FFFDx2), ('F1 BF 00', FFFD+'\x00'),
2037
            ('F1 BF 7F', FFFD+'\x7f'), ('F1 BF C0', FFFDx2),
2038
            ('F1 BF FF', FFFDx2), ('F1 80 80 00', FFFD+'\x00'),
2039
            ('F1 80 80 7F', FFFD+'\x7f'), ('F1 80 80 C0', FFFDx2),
2040
            ('F1 80 80 FF', FFFDx2), ('F1 80 BF 00', FFFD+'\x00'),
2041
            ('F1 80 BF 7F', FFFD+'\x7f'), ('F1 80 BF C0', FFFDx2),
2042
            ('F1 80 BF FF', FFFDx2), ('F1 BF 80 00', FFFD+'\x00'),
2043
            ('F1 BF 80 7F', FFFD+'\x7f'), ('F1 BF 80 C0', FFFDx2),
2044
            ('F1 BF 80 FF', FFFDx2), ('F1 BF BF 00', FFFD+'\x00'),
2045
            ('F1 BF BF 7F', FFFD+'\x7f'), ('F1 BF BF C0', FFFDx2),
2046
            ('F1 BF BF FF', FFFDx2), ('F3 00', FFFD+'\x00'),
2047
            ('F3 7F', FFFD+'\x7f'), ('F3 C0', FFFDx2), ('F3 FF', FFFDx2),
2048
            ('F3 80 00', FFFD+'\x00'), ('F3 80 7F', FFFD+'\x7f'),
2049
            ('F3 80 C0', FFFDx2), ('F3 80 FF', FFFDx2),
2050
            ('F3 BF 00', FFFD+'\x00'), ('F3 BF 7F', FFFD+'\x7f'),
2051
            ('F3 BF C0', FFFDx2), ('F3 BF FF', FFFDx2),
2052
            ('F3 80 80 00', FFFD+'\x00'), ('F3 80 80 7F', FFFD+'\x7f'),
2053
            ('F3 80 80 C0', FFFDx2), ('F3 80 80 FF', FFFDx2),
2054
            ('F3 80 BF 00', FFFD+'\x00'), ('F3 80 BF 7F', FFFD+'\x7f'),
2055
            ('F3 80 BF C0', FFFDx2), ('F3 80 BF FF', FFFDx2),
2056
            ('F3 BF 80 00', FFFD+'\x00'), ('F3 BF 80 7F', FFFD+'\x7f'),
2057
            ('F3 BF 80 C0', FFFDx2), ('F3 BF 80 FF', FFFDx2),
2058
            ('F3 BF BF 00', FFFD+'\x00'), ('F3 BF BF 7F', FFFD+'\x7f'),
2059
            ('F3 BF BF C0', FFFDx2), ('F3 BF BF FF', FFFDx2),
2060
            ('F4 00', FFFD+'\x00'), ('F4 7F', FFFD+'\x7f'), ('F4 90', FFFDx2),
2061
            ('F4 BF', FFFDx2), ('F4 C0', FFFDx2), ('F4 FF', FFFDx2),
2062
            ('F4 80 00', FFFD+'\x00'), ('F4 80 7F', FFFD+'\x7f'),
2063
            ('F4 80 C0', FFFDx2), ('F4 80 FF', FFFDx2),
2064
            ('F4 8F 00', FFFD+'\x00'), ('F4 8F 7F', FFFD+'\x7f'),
2065
            ('F4 8F C0', FFFDx2), ('F4 8F FF', FFFDx2),
2066
            ('F4 80 80 00', FFFD+'\x00'), ('F4 80 80 7F', FFFD+'\x7f'),
2067
            ('F4 80 80 C0', FFFDx2), ('F4 80 80 FF', FFFDx2),
2068
            ('F4 80 BF 00', FFFD+'\x00'), ('F4 80 BF 7F', FFFD+'\x7f'),
2069
            ('F4 80 BF C0', FFFDx2), ('F4 80 BF FF', FFFDx2),
2070
            ('F4 8F 80 00', FFFD+'\x00'), ('F4 8F 80 7F', FFFD+'\x7f'),
2071
            ('F4 8F 80 C0', FFFDx2), ('F4 8F 80 FF', FFFDx2),
2072
            ('F4 8F BF 00', FFFD+'\x00'), ('F4 8F BF 7F', FFFD+'\x7f'),
2073
            ('F4 8F BF C0', FFFDx2), ('F4 8F BF FF', FFFDx2)
2074
        ]
2075
        for seq, res in sequences:
2076
            self.assertCorrectUTF8Decoding(bytes.fromhex(seq), res,
2077
                                           'invalid continuation byte')
2078

2079
    def test_codecs_idna(self):
2080
        # Test whether trailing dot is preserved
2081
        self.assertEqual("www.python.org.".encode("idna"), b"www.python.org.")
2082

2083
    def test_codecs_errors(self):
        # Exercise the error paths of str.encode(), bytes.decode() and str():
        # error handlers, broken codecs and bad arguments.
        # NOTE: '\202' is an octal escape, i.e. the non-ASCII character U+0082
        # (byte 0x82 in the bytes literals).

        # Error handling (encoding)
        self.assertRaises(UnicodeError, 'Andr\202 x'.encode, 'ascii')
        self.assertRaises(UnicodeError, 'Andr\202 x'.encode, 'ascii','strict')
        self.assertEqual('Andr\202 x'.encode('ascii','ignore'), b"Andr x")
        self.assertEqual('Andr\202 x'.encode('ascii','replace'), b"Andr? x")
        # positional and keyword spellings of the arguments must agree
        self.assertEqual('Andr\202 x'.encode('ascii', 'replace'),
                         'Andr\202 x'.encode('ascii', errors='replace'))
        self.assertEqual('Andr\202 x'.encode('ascii', 'ignore'),
                         'Andr\202 x'.encode(encoding='ascii', errors='ignore'))

        # Error handling (decoding)
        self.assertRaises(UnicodeError, str, b'Andr\202 x', 'ascii')
        self.assertRaises(UnicodeError, str, b'Andr\202 x', 'ascii', 'strict')
        self.assertEqual(str(b'Andr\202 x', 'ascii', 'ignore'), "Andr x")
        self.assertEqual(str(b'Andr\202 x', 'ascii', 'replace'), 'Andr\uFFFD x')
        self.assertEqual(str(b'\202 x', 'ascii', 'replace'), '\uFFFD x')

        # Error handling (unknown character names)
        self.assertEqual(b"\\N{foo}xx".decode("unicode-escape", "ignore"), "xx")

        # Error handling (truncated escape sequence)
        self.assertRaises(UnicodeError, b"\\".decode, "unicode-escape")

        # "test.unicode1" and "test.unicode2" are the deliberately broken
        # codecs registered at module level by search_function(): their
        # encoders/decoders return 42 or (42, 42) instead of a proper result
        self.assertRaises(TypeError, b"hello".decode, "test.unicode1")
        self.assertRaises(TypeError, str, b"hello", "test.unicode2")
        self.assertRaises(TypeError, "hello".encode, "test.unicode1")
        self.assertRaises(TypeError, "hello".encode, "test.unicode2")

        # Error handling (wrong arguments)
        self.assertRaises(TypeError, "hello".encode, 42, 42, 42)

        # Error handling (lone surrogate in
        # _PyUnicode_TransformDecimalAndSpaceToASCII())
        self.assertRaises(ValueError, int, "\ud800")
        self.assertRaises(ValueError, int, "\udf00")
        self.assertRaises(ValueError, float, "\ud800")
        self.assertRaises(ValueError, float, "\udf00")
        self.assertRaises(ValueError, complex, "\ud800")
        self.assertRaises(ValueError, complex, "\udf00")
2123

2124
    def test_codecs(self):
2125
        # Encoding
2126
        self.assertEqual('hello'.encode('ascii'), b'hello')
2127
        self.assertEqual('hello'.encode('utf-7'), b'hello')
2128
        self.assertEqual('hello'.encode('utf-8'), b'hello')
2129
        self.assertEqual('hello'.encode('utf-8'), b'hello')
2130
        self.assertEqual('hello'.encode('utf-16-le'), b'h\000e\000l\000l\000o\000')
2131
        self.assertEqual('hello'.encode('utf-16-be'), b'\000h\000e\000l\000l\000o')
2132
        self.assertEqual('hello'.encode('latin-1'), b'hello')
2133

2134
        # Default encoding is utf-8
2135
        self.assertEqual('\u2603'.encode(), b'\xe2\x98\x83')
2136

2137
        # Roundtrip safety for BMP (just the first 1024 chars)
2138
        for c in range(1024):
2139
            u = chr(c)
2140
            for encoding in ('utf-7', 'utf-8', 'utf-16', 'utf-16-le',
2141
                             'utf-16-be', 'raw_unicode_escape',
2142
                             'unicode_escape'):
2143
                self.assertEqual(str(u.encode(encoding),encoding), u)
2144

2145
        # Roundtrip safety for BMP (just the first 256 chars)
2146
        for c in range(256):
2147
            u = chr(c)
2148
            for encoding in ('latin-1',):
2149
                self.assertEqual(str(u.encode(encoding),encoding), u)
2150

2151
        # Roundtrip safety for BMP (just the first 128 chars)
2152
        for c in range(128):
2153
            u = chr(c)
2154
            for encoding in ('ascii',):
2155
                self.assertEqual(str(u.encode(encoding),encoding), u)
2156

2157
        # Roundtrip safety for non-BMP (just a few chars)
2158
        with warnings.catch_warnings():
2159
            u = '\U00010001\U00020002\U00030003\U00040004\U00050005'
2160
            for encoding in ('utf-8', 'utf-16', 'utf-16-le', 'utf-16-be',
2161
                             'raw_unicode_escape', 'unicode_escape'):
2162
                self.assertEqual(str(u.encode(encoding),encoding), u)
2163

2164
        # UTF-8 must be roundtrip safe for all code points
2165
        # (except surrogates, which are forbidden).
2166
        u = ''.join(map(chr, list(range(0, 0xd800)) +
2167
                             list(range(0xe000, 0x110000))))
2168
        for encoding in ('utf-8',):
2169
            self.assertEqual(str(u.encode(encoding),encoding), u)
2170

2171
    def test_codecs_charmap(self):
2172
        # 0-127
2173
        s = bytes(range(128))
2174
        for encoding in (
2175
            'cp037', 'cp1026', 'cp273',
2176
            'cp437', 'cp500', 'cp720', 'cp737', 'cp775', 'cp850',
2177
            'cp852', 'cp855', 'cp858', 'cp860', 'cp861', 'cp862',
2178
            'cp863', 'cp865', 'cp866', 'cp1125',
2179
            'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
2180
            'iso8859_2', 'iso8859_3', 'iso8859_4', 'iso8859_5', 'iso8859_6',
2181
            'iso8859_7', 'iso8859_9',
2182
            'koi8_r', 'koi8_t', 'koi8_u', 'kz1048', 'latin_1',
2183
            'mac_cyrillic', 'mac_latin2',
2184

2185
            'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
2186
            'cp1256', 'cp1257', 'cp1258',
2187
            'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
2188

2189
            'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
2190
            'cp1006', 'iso8859_8',
2191

2192
            ### These have undefined mappings:
2193
            #'cp424',
2194

2195
            ### These fail the round-trip:
2196
            #'cp875'
2197

2198
            ):
2199
            self.assertEqual(str(s, encoding).encode(encoding), s)
2200

2201
        # 128-255
2202
        s = bytes(range(128, 256))
2203
        for encoding in (
2204
            'cp037', 'cp1026', 'cp273',
2205
            'cp437', 'cp500', 'cp720', 'cp737', 'cp775', 'cp850',
2206
            'cp852', 'cp855', 'cp858', 'cp860', 'cp861', 'cp862',
2207
            'cp863', 'cp865', 'cp866', 'cp1125',
2208
            'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
2209
            'iso8859_2', 'iso8859_4', 'iso8859_5',
2210
            'iso8859_9', 'koi8_r', 'koi8_u', 'latin_1',
2211
            'mac_cyrillic', 'mac_latin2',
2212

2213
            ### These have undefined mappings:
2214
            #'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
2215
            #'cp1256', 'cp1257', 'cp1258',
2216
            #'cp424', 'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
2217
            #'iso8859_3', 'iso8859_6', 'iso8859_7', 'koi8_t', 'kz1048',
2218
            #'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
2219

2220
            ### These fail the round-trip:
2221
            #'cp1006', 'cp875', 'iso8859_8',
2222

2223
            ):
2224
            self.assertEqual(str(s, encoding).encode(encoding), s)
2225

2226
    def test_concatenation(self):
2227
        self.assertEqual(("abc" "def"), "abcdef")
2228
        self.assertEqual(("abc" "def"), "abcdef")
2229
        self.assertEqual(("abc" "def"), "abcdef")
2230
        self.assertEqual(("abc" "def" "ghi"), "abcdefghi")
2231
        self.assertEqual(("abc" "def" "ghi"), "abcdefghi")
2232

2233
    def test_printing(self):
2234
        class BitBucket:
2235
            def write(self, text):
2236
                pass
2237

2238
        out = BitBucket()
2239
        print('abc', file=out)
2240
        print('abc', 'def', file=out)
2241
        print('abc', 'def', file=out)
2242
        print('abc', 'def', file=out)
2243
        print('abc\n', file=out)
2244
        print('abc\n', end=' ', file=out)
2245
        print('abc\n', end=' ', file=out)
2246
        print('def\n', file=out)
2247
        print('def\n', file=out)
2248

2249
    def test_ucs4(self):
2250
        x = '\U00100000'
2251
        y = x.encode("raw-unicode-escape").decode("raw-unicode-escape")
2252
        self.assertEqual(x, y)
2253

2254
        y = br'\U00100000'
2255
        x = y.decode("raw-unicode-escape").encode("raw-unicode-escape")
2256
        self.assertEqual(x, y)
2257
        y = br'\U00010000'
2258
        x = y.decode("raw-unicode-escape").encode("raw-unicode-escape")
2259
        self.assertEqual(x, y)
2260

2261
        try:
2262
            br'\U11111111'.decode("raw-unicode-escape")
2263
        except UnicodeDecodeError as e:
2264
            self.assertEqual(e.start, 0)
2265
            self.assertEqual(e.end, 10)
2266
        else:
2267
            self.fail("Should have raised UnicodeDecodeError")
2268

2269
    def test_conversion(self):
        # Make sure __str__() works properly

        # plain object whose __str__ returns an exact str
        class ObjectToStr:
            def __str__(self):
                return "foo"

        # str subclass whose __str__ returns an exact str
        class StrSubclassToStr(str):
            def __str__(self):
                return "foo"

        # str subclass whose value is the doubled argument and whose
        # __str__ returns the instance itself
        class StrSubclassToStrSubclass(str):
            def __new__(cls, content=""):
                return str.__new__(cls, 2*content)
            def __str__(self):
                return self

        self.assertEqual(str(ObjectToStr()), "foo")
        self.assertEqual(str(StrSubclassToStr("bar")), "foo")

        # str() hands back whatever __str__ returned, subclass type included
        converted = str(StrSubclassToStrSubclass("foo"))
        self.assertEqual(converted, "foofoo")
        self.assertIs(type(converted), StrSubclassToStrSubclass)

        # constructing another str subclass yields that subclass
        rewrapped = StrSubclass(StrSubclassToStrSubclass("foo"))
        self.assertEqual(rewrapped, "foofoo")
        self.assertIs(type(rewrapped), StrSubclass)
2293

2294
    def test_unicode_repr(self):
2295
        class s1:
2296
            def __repr__(self):
2297
                return '\\n'
2298

2299
        class s2:
2300
            def __repr__(self):
2301
                return '\\n'
2302

2303
        self.assertEqual(repr(s1()), '\\n')
2304
        self.assertEqual(repr(s2()), '\\n')
2305

2306
    def test_printable_repr(self):
2307
        self.assertEqual(repr('\U00010000'), "'%c'" % (0x10000,)) # printable
2308
        self.assertEqual(repr('\U00014000'), "'\\U00014000'")     # nonprintable
2309

2310
    # This test only affects 32-bit platforms because expandtabs can only take
2311
    # an int as the max value, not a 64-bit C long.  If expandtabs is changed
2312
    # to take a 64-bit long, this test should apply to all platforms.
2313
    @unittest.skipIf(sys.maxsize > (1 << 32) or struct.calcsize('P') != 4,
2314
                     'only applies to 32-bit platforms')
2315
    def test_expandtabs_overflows_gracefully(self):
2316
        self.assertRaises(OverflowError, 't\tt\t'.expandtabs, sys.maxsize)
2317

2318
    @support.cpython_only
    def test_expandtabs_optimization(self):
        # expandtabs() on a string without tabs is expected to hand back
        # the very same object
        original = 'abc'
        expanded = original.expandtabs()
        self.assertIs(expanded, original)
2322

2323
    """
2324
    # Not useful for Cython: struct sizes change between versions
2325
    # so it's hard to keep reliably up-to-date, and it's largely checking
2326
    # a CPython implementation detail
2327
    def test_raiseMemError(self):
2328
        if struct.calcsize('P') == 8:
2329
            # 64 bits pointers
2330
            ascii_struct_size = 48
2331
            compact_struct_size = 72
2332
        else:
2333
            # 32 bits pointers
2334
            ascii_struct_size = 24
2335
            compact_struct_size = 36
2336

2337
        for char in ('a', '\xe9', '\u20ac', '\U0010ffff'):
2338
            code = ord(char)
2339
            if code < 0x100:
2340
                char_size = 1  # sizeof(Py_UCS1)
2341
                struct_size = ascii_struct_size
2342
            elif code < 0x10000:
2343
                char_size = 2  # sizeof(Py_UCS2)
2344
                struct_size = compact_struct_size
2345
            else:
2346
                char_size = 4  # sizeof(Py_UCS4)
2347
                struct_size = compact_struct_size
2348
            # Note: sys.maxsize is half of the actual max allocation because of
2349
            # the signedness of Py_ssize_t. Strings of maxlen-1 should in principle
2350
            # be allocatable, given enough memory.
2351
            maxlen = ((sys.maxsize - struct_size) // char_size)
2352
            alloc = lambda: char * maxlen
2353
            self.assertRaises(MemoryError, alloc)
2354
            self.assertRaises(MemoryError, alloc)
2355
        """
2356

2357
    def test_format_subclass(self):
2358
        class S(str):
2359
            def __str__(self):
2360
                return '__str__ overridden'
2361
        s = S('xxx')
2362
        self.assertEqual("%s" % s, '__str__ overridden')
2363
        self.assertEqual("{}".format(s), '__str__ overridden')
2364

2365
    def test_subclass_add(self):
2366
        class S(str):
2367
            def __add__(self, o):
2368
                return "3"
2369
        self.assertEqual(S("4") + S("5"), "3")
2370
        class S(str):
2371
            def __iadd__(self, o):
2372
                return "3"
2373
        s = S("1")
2374
        s += "4"
2375
        self.assertEqual(s, "3")
2376

2377
    def _test_getnewargs(self):
        # __getnewargs__() must return a 1-tuple holding an equal string
        # that is a different object from the original.
        # (The leading underscore keeps unittest from collecting this test.)
        text = 'abc'
        newargs = text.__getnewargs__()
        clone = newargs[0]
        self.assertIsNot(clone, text)
        self.assertEqual(clone, text)
        self.assertEqual(len(newargs), 1)
2383

2384
    @unittest.skipIf(sys.version_info < (3, 8), 'resize test requires Py3.8+')
    @support.cpython_only
    def test_resize(self):
        # Appending to a string after _testcapi.getargs_u() has processed it
        # must be reflected in what getargs_u() returns afterwards
        from _testcapi import getargs_u
        for length in range(1, 100, 7):
            # generate a fresh string (refcount=1)
            text = 'a' * length + 'b'

            # fill wstr internal field
            before = getargs_u(text)
            self.assertEqual(before, text)

            # resize text: wstr field must be cleared and then recomputed
            text += 'c'
            after = getargs_u(text)
            self.assertNotEqual(before, after)
            self.assertEqual(after, text)
2401

2402
    def test_compare(self):
2403
        # Issue #17615
2404
        N = 10
2405
        ascii = 'a' * N
2406
        ascii2 = 'z' * N
2407
        latin = '\x80' * N
2408
        latin2 = '\xff' * N
2409
        bmp = '\u0100' * N
2410
        bmp2 = '\uffff' * N
2411
        astral = '\U00100000' * N
2412
        astral2 = '\U0010ffff' * N
2413
        strings = (
2414
            ascii, ascii2,
2415
            latin, latin2,
2416
            bmp, bmp2,
2417
            astral, astral2)
2418
        for text1, text2 in itertools.combinations(strings, 2):
2419
            equal = (text1 is text2)
2420
            self.assertEqual(text1 == text2, equal)
2421
            self.assertEqual(text1 != text2, not equal)
2422

2423
            if equal:
2424
                self.assertTrue(text1 <= text2)
2425
                self.assertTrue(text1 >= text2)
2426

2427
                # text1 is text2: duplicate strings to skip the "str1 == str2"
2428
                # optimization in unicode_compare_eq() and really compare
2429
                # character per character
2430
                copy1 = duplicate_string(text1)
2431
                copy2 = duplicate_string(text2)
2432
                self.assertIsNot(copy1, copy2)
2433

2434
                self.assertTrue(copy1 == copy2)
2435
                self.assertFalse(copy1 != copy2)
2436

2437
                self.assertTrue(copy1 <= copy2)
2438
                self.assertTrue(copy2 >= copy2)
2439

2440
        self.assertTrue(ascii < ascii2)
2441
        self.assertTrue(ascii < latin)
2442
        self.assertTrue(ascii < bmp)
2443
        self.assertTrue(ascii < astral)
2444
        self.assertFalse(ascii >= ascii2)
2445
        self.assertFalse(ascii >= latin)
2446
        self.assertFalse(ascii >= bmp)
2447
        self.assertFalse(ascii >= astral)
2448

2449
        self.assertFalse(latin < ascii)
2450
        self.assertTrue(latin < latin2)
2451
        self.assertTrue(latin < bmp)
2452
        self.assertTrue(latin < astral)
2453
        self.assertTrue(latin >= ascii)
2454
        self.assertFalse(latin >= latin2)
2455
        self.assertFalse(latin >= bmp)
2456
        self.assertFalse(latin >= astral)
2457

2458
        self.assertFalse(bmp < ascii)
2459
        self.assertFalse(bmp < latin)
2460
        self.assertTrue(bmp < bmp2)
2461
        self.assertTrue(bmp < astral)
2462
        self.assertTrue(bmp >= ascii)
2463
        self.assertTrue(bmp >= latin)
2464
        self.assertFalse(bmp >= bmp2)
2465
        self.assertFalse(bmp >= astral)
2466

2467
        self.assertFalse(astral < ascii)
2468
        self.assertFalse(astral < latin)
2469
        self.assertFalse(astral < bmp2)
2470
        self.assertTrue(astral < astral2)
2471
        self.assertTrue(astral >= ascii)
2472
        self.assertTrue(astral >= latin)
2473
        self.assertTrue(astral >= bmp2)
2474
        self.assertFalse(astral >= astral2)
2475

2476
    def test_free_after_iterating(self):
        # Run the shared free-after-iterating check for forward and for
        # reversed iteration over str
        for make_iterator in (iter, reversed):
            support.check_free_after_iterating(self, make_iterator, str)
2479

2480

2481
# DISABLED: the whole CAPITest class below is wrapped in an unused string
# literal, so none of these tests are defined or collected.
# NOTE(review): presumably disabled because the tests rely on helpers that the
# stub ``support`` object at the top of this file does not define
# (``support.import_module``) and on CPython-internal modules (``_testcapi``,
# ``ctypes.pythonapi``) -- confirm before re-enabling.
u"""
class CAPITest(unittest.TestCase):

    # Test PyUnicode_FromFormat()
    def test_from_format(self):
        support.import_module('ctypes')
        from ctypes import (
            pythonapi, py_object, sizeof,
            c_int, c_long, c_longlong, c_ssize_t,
            c_uint, c_ulong, c_ulonglong, c_size_t, c_void_p)
        name = "PyUnicode_FromFormat"
        _PyUnicode_FromFormat = getattr(pythonapi, name)
        _PyUnicode_FromFormat.restype = py_object

        def PyUnicode_FromFormat(format, *args):
            cargs = tuple(
                py_object(arg) if isinstance(arg, str) else arg
                for arg in args)
            return _PyUnicode_FromFormat(format, *cargs)

        def check_format(expected, format, *args):
            text = PyUnicode_FromFormat(format, *args)
            self.assertEqual(expected, text)

        # ascii format, non-ascii argument
        check_format('ascii\x7f=unicode\xe9',
                     b'ascii\x7f=%U', 'unicode\xe9')

        # non-ascii format, ascii argument: ensure that PyUnicode_FromFormatV()
        # raises an error
        self.assertRaisesRegex(ValueError,
            r'^PyUnicode_FromFormatV\(\) expects an ASCII-encoded format '
            'string, got a non-ASCII byte: 0xe9$',
            PyUnicode_FromFormat, b'unicode\xe9=%s', 'ascii')

        # test "%c"
        check_format('\uabcd',
                     b'%c', c_int(0xabcd))
        check_format('\U0010ffff',
                     b'%c', c_int(0x10ffff))
        with self.assertRaises(OverflowError):
            PyUnicode_FromFormat(b'%c', c_int(0x110000))
        # Issue #18183
        check_format('\U00010000\U00100000',
                     b'%c%c', c_int(0x10000), c_int(0x100000))

        # test "%"
        check_format('%',
                     b'%')
        check_format('%',
                     b'%%')
        check_format('%s',
                     b'%%s')
        check_format('[%]',
                     b'[%%]')
        check_format('%abc',
                     b'%%%s', b'abc')

        # truncated string
        check_format('abc',
                     b'%.3s', b'abcdef')
        check_format('abc[\ufffd',
                     b'%.5s', 'abc[\u20ac]'.encode('utf8'))
        check_format("'\\u20acABC'",
                     b'%A', '\u20acABC')
        check_format("'\\u20",
                     b'%.5A', '\u20acABCDEF')
        check_format("'\u20acABC'",
                     b'%R', '\u20acABC')
        check_format("'\u20acA",
                     b'%.3R', '\u20acABCDEF')
        check_format('\u20acAB',
                     b'%.3S', '\u20acABCDEF')
        check_format('\u20acAB',
                     b'%.3U', '\u20acABCDEF')
        check_format('\u20acAB',
                     b'%.3V', '\u20acABCDEF', None)
        check_format('abc[\ufffd',
                     b'%.5V', None, 'abc[\u20ac]'.encode('utf8'))

        # following tests comes from #7330
        # test width modifier and precision modifier with %S
        check_format("repr=  abc",
                     b'repr=%5S', 'abc')
        check_format("repr=ab",
                     b'repr=%.2S', 'abc')
        check_format("repr=   ab",
                     b'repr=%5.2S', 'abc')

        # test width modifier and precision modifier with %R
        check_format("repr=   'abc'",
                     b'repr=%8R', 'abc')
        check_format("repr='ab",
                     b'repr=%.3R', 'abc')
        check_format("repr=  'ab",
                     b'repr=%5.3R', 'abc')

        # test width modifier and precision modifier with %A
        check_format("repr=   'abc'",
                     b'repr=%8A', 'abc')
        check_format("repr='ab",
                     b'repr=%.3A', 'abc')
        check_format("repr=  'ab",
                     b'repr=%5.3A', 'abc')

        # test width modifier and precision modifier with %s
        check_format("repr=  abc",
                     b'repr=%5s', b'abc')
        check_format("repr=ab",
                     b'repr=%.2s', b'abc')
        check_format("repr=   ab",
                     b'repr=%5.2s', b'abc')

        # test width modifier and precision modifier with %U
        check_format("repr=  abc",
                     b'repr=%5U', 'abc')
        check_format("repr=ab",
                     b'repr=%.2U', 'abc')
        check_format("repr=   ab",
                     b'repr=%5.2U', 'abc')

        # test width modifier and precision modifier with %V
        check_format("repr=  abc",
                     b'repr=%5V', 'abc', b'123')
        check_format("repr=ab",
                     b'repr=%.2V', 'abc', b'123')
        check_format("repr=   ab",
                     b'repr=%5.2V', 'abc', b'123')
        check_format("repr=  123",
                     b'repr=%5V', None, b'123')
        check_format("repr=12",
                     b'repr=%.2V', None, b'123')
        check_format("repr=   12",
                     b'repr=%5.2V', None, b'123')

        # test integer formats (%i, %d, %u)
        check_format('010',
                     b'%03i', c_int(10))
        check_format('0010',
                     b'%0.4i', c_int(10))
        check_format('-123',
                     b'%i', c_int(-123))
        check_format('-123',
                     b'%li', c_long(-123))
        check_format('-123',
                     b'%lli', c_longlong(-123))
        check_format('-123',
                     b'%zi', c_ssize_t(-123))

        check_format('-123',
                     b'%d', c_int(-123))
        check_format('-123',
                     b'%ld', c_long(-123))
        check_format('-123',
                     b'%lld', c_longlong(-123))
        check_format('-123',
                     b'%zd', c_ssize_t(-123))

        check_format('123',
                     b'%u', c_uint(123))
        check_format('123',
                     b'%lu', c_ulong(123))
        check_format('123',
                     b'%llu', c_ulonglong(123))
        check_format('123',
                     b'%zu', c_size_t(123))

        # test long output
        min_longlong = -(2 ** (8 * sizeof(c_longlong) - 1))
        max_longlong = -min_longlong - 1
        check_format(str(min_longlong),
                     b'%lld', c_longlong(min_longlong))
        check_format(str(max_longlong),
                     b'%lld', c_longlong(max_longlong))
        max_ulonglong = 2 ** (8 * sizeof(c_ulonglong)) - 1
        check_format(str(max_ulonglong),
                     b'%llu', c_ulonglong(max_ulonglong))
        PyUnicode_FromFormat(b'%p', c_void_p(-1))

        # test padding (width and/or precision)
        check_format('123'.rjust(10, '0'),
                     b'%010i', c_int(123))
        check_format('123'.rjust(100),
                     b'%100i', c_int(123))
        check_format('123'.rjust(100, '0'),
                     b'%.100i', c_int(123))
        check_format('123'.rjust(80, '0').rjust(100),
                     b'%100.80i', c_int(123))

        check_format('123'.rjust(10, '0'),
                     b'%010u', c_uint(123))
        check_format('123'.rjust(100),
                     b'%100u', c_uint(123))
        check_format('123'.rjust(100, '0'),
                     b'%.100u', c_uint(123))
        check_format('123'.rjust(80, '0').rjust(100),
                     b'%100.80u', c_uint(123))

        check_format('123'.rjust(10, '0'),
                     b'%010x', c_int(0x123))
        check_format('123'.rjust(100),
                     b'%100x', c_int(0x123))
        check_format('123'.rjust(100, '0'),
                     b'%.100x', c_int(0x123))
        check_format('123'.rjust(80, '0').rjust(100),
                     b'%100.80x', c_int(0x123))

        # test %A
        check_format(r"%A:'abc\xe9\uabcd\U0010ffff'",
                     b'%%A:%A', 'abc\xe9\uabcd\U0010ffff')

        # test %V
        check_format('repr=abc',
                     b'repr=%V', 'abc', b'xyz')

        # Test string decode from parameter of %s using utf-8.
        # b'\xe4\xba\xba\xe6\xb0\x91' is utf-8 encoded byte sequence of
        # '\u4eba\u6c11'
        check_format('repr=\u4eba\u6c11',
                     b'repr=%V', None, b'\xe4\xba\xba\xe6\xb0\x91')

        #Test replace error handler.
        check_format('repr=abc\ufffd',
                     b'repr=%V', None, b'abc\xff')

        # not supported: copy the raw format string. these tests are just here
        # to check for crashes and should not be considered as specifications
        check_format('%s',
                     b'%1%s', b'abc')
        check_format('%1abc',
                     b'%1abc')
        check_format('%+i',
                     b'%+i', c_int(10))
        check_format('%.%s',
                     b'%.%s', b'abc')

        # Issue #33817: empty strings
        check_format('',
                     b'')
        check_format('',
                     b'%s', b'')

    # Test PyUnicode_AsWideChar()
    @support.cpython_only
    def test_aswidechar(self):
        from _testcapi import unicode_aswidechar
        support.import_module('ctypes')
        from ctypes import c_wchar, sizeof

        wchar, size = unicode_aswidechar('abcdef', 2)
        self.assertEqual(size, 2)
        self.assertEqual(wchar, 'ab')

        wchar, size = unicode_aswidechar('abc', 3)
        self.assertEqual(size, 3)
        self.assertEqual(wchar, 'abc')

        wchar, size = unicode_aswidechar('abc', 4)
        self.assertEqual(size, 3)
        self.assertEqual(wchar, 'abc\0')

        wchar, size = unicode_aswidechar('abc', 10)
        self.assertEqual(size, 3)
        self.assertEqual(wchar, 'abc\0')

        wchar, size = unicode_aswidechar('abc\0def', 20)
        self.assertEqual(size, 7)
        self.assertEqual(wchar, 'abc\0def\0')

        nonbmp = chr(0x10ffff)
        if sizeof(c_wchar) == 2:
            buflen = 3
            nchar = 2
        else: # sizeof(c_wchar) == 4
            buflen = 2
            nchar = 1
        wchar, size = unicode_aswidechar(nonbmp, buflen)
        self.assertEqual(size, nchar)
        self.assertEqual(wchar, nonbmp + '\0')

    # Test PyUnicode_AsWideCharString()
    @support.cpython_only
    def test_aswidecharstring(self):
        from _testcapi import unicode_aswidecharstring
        support.import_module('ctypes')
        from ctypes import c_wchar, sizeof

        wchar, size = unicode_aswidecharstring('abc')
        self.assertEqual(size, 3)
        self.assertEqual(wchar, 'abc\0')

        wchar, size = unicode_aswidecharstring('abc\0def')
        self.assertEqual(size, 7)
        self.assertEqual(wchar, 'abc\0def\0')

        nonbmp = chr(0x10ffff)
        if sizeof(c_wchar) == 2:
            nchar = 2
        else: # sizeof(c_wchar) == 4
            nchar = 1
        wchar, size = unicode_aswidecharstring(nonbmp)
        self.assertEqual(size, nchar)
        self.assertEqual(wchar, nonbmp + '\0')

    # Test PyUnicode_AsUCS4()
    @support.cpython_only
    def test_asucs4(self):
        from _testcapi import unicode_asucs4
        for s in ['abc', '\xa1\xa2', '\u4f60\u597d', 'a\U0001f600',
                  'a\ud800b\udfffc', '\ud834\udd1e']:
            l = len(s)
            self.assertEqual(unicode_asucs4(s, l, 1), s+'\0')
            self.assertEqual(unicode_asucs4(s, l, 0), s+'\uffff')
            self.assertEqual(unicode_asucs4(s, l+1, 1), s+'\0\uffff')
            self.assertEqual(unicode_asucs4(s, l+1, 0), s+'\0\uffff')
            self.assertRaises(SystemError, unicode_asucs4, s, l-1, 1)
            self.assertRaises(SystemError, unicode_asucs4, s, l-2, 0)
            s = '\0'.join([s, s])
            self.assertEqual(unicode_asucs4(s, len(s), 1), s+'\0')
            self.assertEqual(unicode_asucs4(s, len(s), 0), s+'\uffff')

    # Test PyUnicode_FindChar()
    @support.cpython_only
    def test_findchar(self):
        from _testcapi import unicode_findchar

        for str in "\xa1", "\u8000\u8080", "\ud800\udc02", "\U0001f100\U0001f1f1":
            for i, ch in enumerate(str):
                self.assertEqual(unicode_findchar(str, ord(ch), 0, len(str), 1), i)
                self.assertEqual(unicode_findchar(str, ord(ch), 0, len(str), -1), i)

        str = "!>_<!"
        self.assertEqual(unicode_findchar(str, 0x110000, 0, len(str), 1), -1)
        self.assertEqual(unicode_findchar(str, 0x110000, 0, len(str), -1), -1)
        # start < end
        self.assertEqual(unicode_findchar(str, ord('!'), 1, len(str)+1, 1), 4)
        self.assertEqual(unicode_findchar(str, ord('!'), 1, len(str)+1, -1), 4)
        # start >= end
        self.assertEqual(unicode_findchar(str, ord('!'), 0, 0, 1), -1)
        self.assertEqual(unicode_findchar(str, ord('!'), len(str), 0, 1), -1)
        # negative
        self.assertEqual(unicode_findchar(str, ord('!'), -len(str), -1, 1), 0)
        self.assertEqual(unicode_findchar(str, ord('!'), -len(str), -1, -1), 0)

    # Test PyUnicode_CopyCharacters()
    @support.cpython_only
    def test_copycharacters(self):
        from _testcapi import unicode_copycharacters

        strings = [
            'abcde', '\xa1\xa2\xa3\xa4\xa5',
            '\u4f60\u597d\u4e16\u754c\uff01',
            '\U0001f600\U0001f601\U0001f602\U0001f603\U0001f604'
        ]

        for idx, from_ in enumerate(strings):
            # wide -> narrow: exceed maxchar limitation
            for to in strings[:idx]:
                self.assertRaises(
                    SystemError,
                    unicode_copycharacters, to, 0, from_, 0, 5
                )
            # same kind
            for from_start in range(5):
                self.assertEqual(
                    unicode_copycharacters(from_, 0, from_, from_start, 5),
                    (from_[from_start:from_start+5].ljust(5, '\0'),
                     5-from_start)
                )
            for to_start in range(5):
                self.assertEqual(
                    unicode_copycharacters(from_, to_start, from_, to_start, 5),
                    (from_[to_start:to_start+5].rjust(5, '\0'),
                     5-to_start)
                )
            # narrow -> wide
            # Tests omitted since this creates invalid strings.

        s = strings[0]
        self.assertRaises(IndexError, unicode_copycharacters, s, 6, s, 0, 5)
        self.assertRaises(IndexError, unicode_copycharacters, s, -1, s, 0, 5)
        self.assertRaises(IndexError, unicode_copycharacters, s, 0, s, 6, 5)
        self.assertRaises(IndexError, unicode_copycharacters, s, 0, s, -1, 5)
        self.assertRaises(SystemError, unicode_copycharacters, s, 1, s, 0, 5)
        self.assertRaises(SystemError, unicode_copycharacters, s, 0, s, 0, -1)
        self.assertRaises(SystemError, unicode_copycharacters, s, 0, b'', 0, 0)

    @support.cpython_only
    def test_encode_decimal(self):
        from _testcapi import unicode_encodedecimal
        self.assertEqual(unicode_encodedecimal('123'),
                         b'123')
        self.assertEqual(unicode_encodedecimal('\u0663.\u0661\u0664'),
                         b'3.14')
        self.assertEqual(unicode_encodedecimal("\N{EM SPACE}3.14\N{EN SPACE}"),
                         b' 3.14 ')
        self.assertRaises(UnicodeEncodeError,
                          unicode_encodedecimal, "123\u20ac", "strict")
        self.assertRaisesRegex(
            ValueError,
            "^'decimal' codec can't encode character",
            unicode_encodedecimal, "123\u20ac", "replace")

    @support.cpython_only
    def test_transform_decimal(self):
        from _testcapi import unicode_transformdecimaltoascii as transform_decimal
        self.assertEqual(transform_decimal('123'),
                         '123')
        self.assertEqual(transform_decimal('\u0663.\u0661\u0664'),
                         '3.14')
        self.assertEqual(transform_decimal("\N{EM SPACE}3.14\N{EN SPACE}"),
                         "\N{EM SPACE}3.14\N{EN SPACE}")
        self.assertEqual(transform_decimal('123\u20ac'),
                         '123\u20ac')

    @support.cpython_only
    def test_pep393_utf8_caching_bug(self):
        # Issue #25709: Problem with string concatenation and utf-8 cache
        from _testcapi import getargs_s_hash
        for k in 0x24, 0xa4, 0x20ac, 0x1f40d:
            s = ''
            for i in range(5):
                # Due to CPython specific optimization the 's' string can be
                # resized in-place.
                s += chr(k)
                # Parsing with the "s#" format code calls indirectly
                # PyUnicode_AsUTF8AndSize() which creates the UTF-8
                # encoded string cached in the Unicode object.
                self.assertEqual(getargs_s_hash(s), chr(k).encode() * (i + 1))
                # Check that the second call returns the same result
                self.assertEqual(getargs_s_hash(s), chr(k).encode() * (i + 1))
"""
2913

2914

2915
# DISABLED: the whole StringModuleTest class below is wrapped in an unused
# string literal, so none of these tests are defined or collected.
# NOTE(review): the tests call ``_string.formatter_parser`` and
# ``_string.formatter_field_name_split``, but ``import _string`` is commented
# out at the top of this file -- presumably that is why the class is disabled;
# restore the import before re-enabling.
u"""
class StringModuleTest(unittest.TestCase):
    def test_formatter_parser(self):
        def parse(format):
            return list(_string.formatter_parser(format))

        formatter = parse("prefix {2!s}xxx{0:^+10.3f}{obj.attr!s} {z[0]!s:10}")
        self.assertEqual(formatter, [
            ('prefix ', '2', '', 's'),
            ('xxx', '0', '^+10.3f', None),
            ('', 'obj.attr', '', 's'),
            (' ', 'z[0]', '10', 's'),
        ])

        formatter = parse("prefix {} suffix")
        self.assertEqual(formatter, [
            ('prefix ', '', '', None),
            (' suffix', None, None, None),
        ])

        formatter = parse("str")
        self.assertEqual(formatter, [
            ('str', None, None, None),
        ])

        formatter = parse("")
        self.assertEqual(formatter, [])

        formatter = parse("{0}")
        self.assertEqual(formatter, [
            ('', '0', '', None),
        ])

        self.assertRaises(TypeError, _string.formatter_parser, 1)

    def test_formatter_field_name_split(self):
        def split(name):
            items = list(_string.formatter_field_name_split(name))
            items[1] = list(items[1])
            return items
        self.assertEqual(split("obj"), ["obj", []])
        self.assertEqual(split("obj.arg"), ["obj", [(True, 'arg')]])
        self.assertEqual(split("obj[key]"), ["obj", [(False, 'key')]])
        self.assertEqual(split("obj.arg[key1][key2]"), [
            "obj",
            [(True, 'arg'),
             (False, 'key1'),
             (False, 'key2'),
            ]])
        self.assertRaises(TypeError, _string.formatter_field_name_split, 1)
"""
2966

2967

2968
# Script entry point: run every test case defined in this module.
# NOTE(review): ``import unittest`` is commented out at the top of this file;
# the name is presumably brought into scope by the included
# "test_unicode_string_tests.pxi" -- confirm.
if __name__ == "__main__":
    unittest.main()
2970
cython

Использование cookies