12
class UnicodeLiteralBuilder:
13
"""Assemble a unicode string.
18
def append(self, characters):
    """Append a chunk of text to the literal being assembled.

    ``characters`` must be a ``str``; any other type trips the assertion.
    """
    is_text = isinstance(characters, str)
    assert is_text, f"Expected str, got {type(characters)}"
    self.chars.append(characters)
22
if sys.maxunicode == 65535:
23
def append_charval(self, char_number):
24
if char_number > 65535:
27
char_number -= 0x10000
28
self.chars.append( chr((char_number // 1024) + 0xD800) )
29
self.chars.append( chr((char_number % 1024) + 0xDC00) )
31
self.chars.append( chr(char_number) )
33
def append_charval(self, char_number):
34
self.chars.append( chr(char_number) )
36
def append_uescape(self, char_number, escape_string):
    """Append the character denoted by a unicode escape.

    Only ``char_number`` is used; ``escape_string`` is accepted but not
    read by this method.
    """
    add_code_point = self.append_charval
    add_code_point(char_number)
40
return EncodedString(''.join(self.chars))
43
return (None, self.getstring())
46
class BytesLiteralBuilder:
47
"""Assemble a byte string or char value.
49
def __init__(self, target_encoding):
51
self.target_encoding = target_encoding
53
def append(self, characters):
    """Append a chunk to the byte literal being assembled.

    A ``str`` argument is first encoded with ``self.target_encoding``;
    after that step the value must be ``bytes`` or the assertion fails.
    """
    data = characters
    if isinstance(data, str):
        data = data.encode(self.target_encoding)
    assert isinstance(data, bytes), str(type(data))
    self.chars.append(data)
59
def append_charval(self, char_number):
    """Append the character with code ``char_number`` as a byte.

    The character is encoded with ISO-8859-1, so codes 0-255 map to the
    byte of the same value.
    """
    as_text = chr(char_number)
    self.chars.append(as_text.encode('ISO-8859-1'))
62
def append_uescape(self, char_number, escape_string):
    """Append a unicode escape to the byte literal as its source text.

    The escape is forwarded to ``self.append`` as written;
    ``char_number`` is accepted but not read by this method.
    """
    add_chunk = self.append
    add_chunk(escape_string)
67
return bytes_literal(join_bytes(self.chars), self.target_encoding)
71
return self.getstring()
74
return (self.getstring(), None)
77
class StrLiteralBuilder:
78
"""Assemble both a bytes and a unicode representation of a string.
80
def __init__(self, target_encoding):
    """Create the paired bytes and unicode builders.

    ``target_encoding`` is forwarded to the bytes builder only.
    """
    bytes_side = BytesLiteralBuilder(target_encoding)
    self._bytes = bytes_side
    self._unicode = UnicodeLiteralBuilder()
84
def append(self, characters):
    """Feed the same characters to the bytes builder, then the unicode builder."""
    for builder in (self._bytes, self._unicode):
        builder.append(characters)
88
def append_charval(self, char_number):
    """Append one character code to the bytes builder, then the unicode builder."""
    for builder in (self._bytes, self._unicode):
        builder.append_charval(char_number)
92
def append_uescape(self, char_number, escape_string):
    """Append a unicode escape to both representations.

    The bytes builder receives the escape text as written, while the
    unicode builder receives the character code it stands for.
    """
    raw_sink = self._bytes
    raw_sink.append(escape_string)
    text_sink = self._unicode
    text_sink.append_charval(char_number)
97
return (self._bytes.getstring(), self._unicode.getstring())
100
class EncodedString(str):
106
def __deepcopy__(self, memo):
109
def byteencode(self):
    """Encode this string using its own ``encoding`` attribute.

    Asserts that ``self.encoding`` is not None before encoding.
    """
    codec = self.encoding
    assert codec is not None
    return self.encode(codec)
113
def utf8encode(self):
    """Encode this string as UTF-8.

    Asserts that ``self.encoding`` is None before encoding.
    """
    has_no_codec = self.encoding is None
    assert has_no_codec
    return self.encode("UTF-8")
118
def is_unicode(self):
    """Return True when no encoding is attached (``self.encoding`` is None)."""
    codec = self.encoding
    return codec is None
121
def contains_surrogates(self):
    """Return the result of ``string_contains_surrogates`` for this string."""
    has_surrogates = string_contains_surrogates(self)
    return has_surrogates
124
def as_utf8_string(self):
    """Return the UTF-8 encoded form of this string via ``bytes_literal``.

    The encoding label passed to ``bytes_literal`` is ``'utf8'``.
    """
    utf8_bytes = self.utf8encode()
    return bytes_literal(utf8_bytes, 'utf8')
127
def as_c_string_literal(self):
129
if self.encoding is None:
130
s = self.as_utf8_string()
132
s = bytes_literal(self.byteencode(), self.encoding)
133
return s.as_c_string_literal()
136
def string_contains_surrogates(ustring):
138
Check if the unicode string contains surrogate code points
139
on a CPython platform with wide (UCS-4) or narrow (UTF-16)
140
Unicode, i.e. characters that would be spelled as two
141
separate code units on a narrow platform.
143
for c in map(ord, ustring):
146
if 0xD800 <= c <= 0xDFFF:
151
def string_contains_lone_surrogates(ustring):
153
Check if the unicode string contains lone surrogate code points
154
on a CPython platform with wide (UCS-4) or narrow (UTF-16)
155
Unicode, i.e. characters that would be spelled as two
156
separate code units on a narrow platform, but that do not form a pair.
158
last_was_start = False
159
unicode_uses_surrogate_encoding = sys.maxunicode == 65535
160
for c in map(ord, ustring):
162
if c < 0xD800 or c > 0xDFFF:
165
elif not unicode_uses_surrogate_encoding:
171
last_was_start = True
173
if not last_was_start:
175
last_was_start = False
176
return last_was_start
179
class BytesLiteral(bytes):
183
def __deepcopy__(self, memo):
186
def byteencode(self):
189
def utf8encode(self):
    """Always fail: this object is a byte string, not a unicode string.

    Raises:
        AssertionError: unconditionally; the message includes the repr
            of this object.
    """
    # Raised explicitly rather than via ``assert False`` so the failure is
    # not compiled away (leaving a silent ``None`` return) under ``python -O``.
    raise AssertionError("this is not a unicode string: %r" % self)
193
"""Fake-decode the byte string to unicode to support %
194
formatting of unicode strings.
196
return self.decode('ISO-8859-1')
200
def as_c_string_literal(self):
    """Return this byte string wrapped in double quotes for C source.

    The bytes are passed through ``escape_byte_string`` and the result
    through ``split_string_literal`` before the quotes are added.
    """
    escaped = escape_byte_string(self)
    chunked = split_string_literal(escaped)
    return '"%s"' % chunked
205
def bytes_literal(s, encoding):
206
assert isinstance(s, bytes)
208
s.encoding = encoding
212
def encoded_string(s, encoding):
213
assert isinstance(s, (str, bytes))
215
if encoding is not None:
216
s.encoding = encoding
219
def encoded_string_or_bytes_literal(s, encoding):
220
if isinstance(s, bytes):
221
return bytes_literal(s, encoding)
223
return encoded_string(s, encoding)
226
char_from_escape_sequence = {
236
_c_special = ('\\', '??', '"') + tuple(map(chr, range(32)))
239
def _to_escape_sequence(s):
248
return ''.join([f'\\{ord(c):03o}' for c in s])
251
def _build_specials_replacer():
254
for special in _c_special:
255
regexp = ''.join(['[%s]' % c.replace('\\', '\\\\') for c in special])
256
subexps.append(regexp)
257
replacements[special.encode('ASCII')] = _to_escape_sequence(special).encode('ASCII')
258
sub = re.compile(('(%s)' % '|'.join(subexps)).encode('ASCII')).sub
259
def replace_specials(m):
260
return replacements[m.group(1)]
262
return sub(replace_specials, s)
265
# Built once when the module is loaded; the result is later called as
# _replace_specials(s) on the byte string being escaped.
_replace_specials = _build_specials_replacer()
269
c = c.decode('ISO-8859-1')
275
if n < 32 or n > 127:
281
def escape_byte_string(s):
282
"""Escape a byte string so that it can be written into C code.
283
Note that this returns a Unicode string instead which, when
284
encoded as ASCII, will result in the correct byte sequence
287
s = _replace_specials(s)
289
return s.decode("ASCII")
290
except UnicodeDecodeError:
293
append, extend = s_new.append, s_new.extend
296
extend((f'\\{b:03o}').encode('ASCII'))
299
return s_new.decode('ASCII')
301
def split_string_literal(s, limit=2000):
308
while start < len(s):
310
if len(s) > end-4 and '\\' in s[end-4:end]:
311
end -= 4 - s[end-4:end].find('\\')
312
while s[end-1] == '\\':
316
end = start + limit - (limit % 2) - 4
318
chunks.append(s[start:end])
320
return '""'.join(chunks)
322
def encode_pyunicode_string(s):
323
"""Create Py_UNICODE[] representation of a given unicode string.
325
s = list(map(ord, s)) + [0]
327
if sys.maxunicode >= 0x10000:
330
if code_point >= 0x10000:
331
high, low = divmod(code_point - 0x10000, 1024)
332
utf16.append(high + 0xD800)
333
utf16.append(low + 0xDC00)
335
utf16.append(code_point)
339
if 0xDC00 <= code_unit <= 0xDFFF and utf32 and 0xD800 <= utf32[-1] <= 0xDBFF:
340
high, low = utf32[-1], code_unit
341
utf32[-1] = ((high & 0x3FF) << 10) + (low & 0x3FF) + 0x10000
343
utf32.append(code_unit)
347
return ",".join(map(str, utf16)), ",".join(map(str, utf32))