cython

StringEncoding.py
347 строк · 10.0 Кб
Перенос по словам
1
#
2
#   Cython -- encoding related tools
3
#
4

5

6
import re
7
import sys
8

9
join_bytes = b''.join
10

11

12
class UnicodeLiteralBuilder:
    """Collect text fragments and code points and join them into one
    unicode string.
    """

    def __init__(self):
        # fragments in the order they were appended
        self.chars = []

    def append(self, characters):
        """Append a piece of text; only ``str`` is accepted."""
        assert isinstance(characters, str), f"Expected str, got {type(characters)}"
        self.chars.append(characters)

    # The implementation of append_charval() is selected once, when the
    # class body is executed, based on the interpreter's Unicode range.
    if sys.maxunicode == 65535:
        def append_charval(self, char_number):
            """Append the character for ``char_number``, splitting code
            points above 0xFFFF into a surrogate pair.
            """
            if char_number <= 65535:
                self.chars.append(chr(char_number))
                return
            # wide Unicode character on narrow platform => replace
            # by surrogate pair
            high, low = divmod(char_number - 0x10000, 1024)
            self.chars.append(chr(high + 0xD800))
            self.chars.append(chr(low + 0xDC00))
    else:
        def append_charval(self, char_number):
            """Append the character for the code point ``char_number``."""
            self.chars.append(chr(char_number))

    def append_uescape(self, char_number, escape_string):
        """Append an escaped character; only the code point is used,
        ``escape_string`` is ignored here.
        """
        self.append_charval(char_number)

    def getstring(self):
        """Return everything appended so far as one EncodedString."""
        text = ''.join(self.chars)
        return EncodedString(text)

    def getstrings(self):
        """Return a ``(None, unicode_value)`` pair."""
        return (None, self.getstring())
44

45

46
class BytesLiteralBuilder:
    """Collect byte fragments that make up a byte string or char value."""

    def __init__(self, target_encoding):
        # encoding applied to any text passed to append()
        self.target_encoding = target_encoding
        # byte fragments in the order they were appended
        self.chars = []

    def append(self, characters):
        """Append bytes as-is, or text after encoding it with the target
        encoding.
        """
        encoded = (characters.encode(self.target_encoding)
                   if isinstance(characters, str) else characters)
        assert isinstance(encoded, bytes), str(type(encoded))
        self.chars.append(encoded)

    def append_charval(self, char_number):
        """Append the single byte obtained by encoding the character with
        code ``char_number`` as ISO-8859-1.
        """
        char = chr(char_number)
        self.chars.append(char.encode('ISO-8859-1'))

    def append_uescape(self, char_number, escape_string):
        """Append the escape sequence text itself; the code point is
        not used for byte strings.
        """
        self.append(escape_string)

    def getstring(self):
        """Join the fragments and wrap them via bytes_literal()."""
        # this *must* return a byte string!
        return bytes_literal(b''.join(self.chars), self.target_encoding)

    def getchar(self):
        """Same value as getstring()."""
        # this *must* return a byte string!
        return self.getstring()

    def getstrings(self):
        """Return a ``(bytes_value, None)`` pair."""
        return (self.getstring(), None)
75

76

77
class StrLiteralBuilder:
    """Build the bytes and the unicode form of one string side by side."""

    def __init__(self, target_encoding):
        self._bytes = BytesLiteralBuilder(target_encoding)
        self._unicode = UnicodeLiteralBuilder()

    def append(self, characters):
        """Append the same characters to both builders (bytes first)."""
        for builder in (self._bytes, self._unicode):
            builder.append(characters)

    def append_charval(self, char_number):
        """Append the same character value to both builders (bytes first)."""
        for builder in (self._bytes, self._unicode):
            builder.append_charval(char_number)

    def append_uescape(self, char_number, escape_string):
        """Append an escaped character: the bytes side receives the escape
        text, the unicode side receives the code point.
        """
        self._bytes.append(escape_string)
        self._unicode.append_charval(char_number)

    def getstrings(self):
        """Return a ``(bytes_value, unicode_value)`` pair."""
        byte_value = self._bytes.getstring()
        text_value = self._unicode.getstring()
        return (byte_value, text_value)
98

99

100
class EncodedString(str):
    """Text string that keeps track of its original encoding.

    ``encoding`` is None for unicode strings and names the source
    encoding otherwise.
    """
    encoding = None

    def __deepcopy__(self, memo):
        # deep copies share the same object
        return self

    @property
    def is_unicode(self):
        """True when no source encoding is recorded."""
        return self.encoding is None

    def byteencode(self):
        """Encode with the recorded source encoding (which must be set)."""
        assert self.encoding is not None
        return self.encode(self.encoding)

    def utf8encode(self):
        """Encode as UTF-8; only valid when no source encoding is set."""
        assert self.encoding is None
        return self.encode("UTF-8")

    def contains_surrogates(self):
        """Delegate to string_contains_surrogates() for this string."""
        return string_contains_surrogates(self)

    def as_utf8_string(self):
        """Return the UTF-8 encoded value wrapped via bytes_literal()."""
        return bytes_literal(self.utf8encode(), 'utf8')

    def as_c_string_literal(self):
        """Encode the string, then render it as a C string literal."""
        if self.encoding is not None:
            literal = bytes_literal(self.byteencode(), self.encoding)
        else:
            literal = self.as_utf8_string()
        return literal.as_c_string_literal()
134

135

136
def string_contains_surrogates(ustring):
    """
    Tell whether the unicode string holds any code point that a narrow
    (UTF-16) CPython build would spell as two separate code units:
    either a code point above 0xFFFF (only possible on wide builds) or
    a code point inside the surrogate range 0xD800-0xDFFF.
    """
    return any(
        code > 65535 or 0xD800 <= code <= 0xDFFF
        for code in map(ord, ustring)
    )
149

150

151
def string_contains_lone_surrogates(ustring):
    """
    Tell whether the unicode string holds surrogate code points that do
    not form a proper high/low pair, on a CPython platform with wide
    (UCS-4) or narrow (UTF-16) Unicode.
    """
    narrow_build = sys.maxunicode == 65535
    pending_high = False  # previous code unit was an unmatched high surrogate
    for code in map(ord, ustring):
        # surrogates tend to be rare, so handle ordinary characters first
        if not (0xD800 <= code <= 0xDFFF):
            if pending_high:
                return True
            continue
        if not narrow_build:
            # on 32bit Unicode platforms, there is never a pair
            return True
        is_high = code <= 0xDBFF
        if is_high == pending_high:
            # either a second high surrogate in a row (lone start)
            # or a low surrogate without a preceding high one (lone end)
            return True
        pending_high = is_high
    return pending_high
177

178

179
class BytesLiteral(bytes):
    """bytes subclass that is compatible with EncodedString.

    It offers the same method names (``byteencode``, ``utf8encode``,
    ``as_c_string_literal``) and the same ``encoding`` / ``is_unicode``
    attributes so that both kinds of literal can be handled alike.
    """
    # encoding associated with the byte content; None until assigned
    encoding = None

    # a byte string is never a unicode string
    is_unicode = False

    def __deepcopy__(self, memo):
        # deep copies share the same object
        return self

    def byteencode(self):
        """Return the content as a plain ``bytes`` object."""
        return bytes(self)

    def utf8encode(self):
        """Always fail: a byte string cannot be UTF-8 encoded.

        Raises:
            AssertionError: on every call.
        """
        # Raised explicitly instead of using ``assert False`` so that the
        # misuse is still reported when Python runs with -O (which strips
        # assert statements and would make this method return None).
        raise AssertionError("this is not a unicode string: %r" % self)

    def __str__(self):
        """Fake-decode the byte string to unicode to support %
        formatting of unicode strings.
        """
        return self.decode('ISO-8859-1')

    def as_c_string_literal(self):
        """Return the escaped content as a double-quoted C string literal."""
        value = split_string_literal(escape_byte_string(self))
        return '"%s"' % value
203

204

205
def bytes_literal(s, encoding):
    """Wrap the byte string ``s`` in a BytesLiteral tagged with ``encoding``."""
    assert isinstance(s, bytes)
    literal = BytesLiteral(s)
    literal.encoding = encoding
    return literal
210

211

212
def encoded_string(s, encoding):
    """Wrap ``s`` in an EncodedString tagged with ``encoding``.

    ``s`` may be text or bytes.  The ``encoding`` attribute of the result
    is only assigned when ``encoding`` is not None, so it otherwise keeps
    the class default.
    """
    assert isinstance(s, (str, bytes))
    if isinstance(s, bytes):
        # Passing bytes straight to the str constructor would store the
        # repr text ("b'...'") instead of the content, so decode first.
        # Without a declared encoding, ISO-8859-1 maps every byte to the
        # code point of the same value, like BytesLiteral.__str__ does.
        s = s.decode(encoding if encoding is not None else 'ISO-8859-1')
    s = EncodedString(s)
    if encoding is not None:
        s.encoding = encoding
    return s
218

219
def encoded_string_or_bytes_literal(s, encoding):
    """Wrap ``s`` via bytes_literal() when it is bytes, otherwise via
    encoded_string().
    """
    wrap = bytes_literal if isinstance(s, bytes) else encoded_string
    return wrap(s, encoding)
224

225

226
# Lookup function (a bound dict.get) mapping a two-character escape
# sequence such as backslash + 'n' to the character it stands for;
# unknown sequences give None (or the supplied default).
char_from_escape_sequence = {
    '\\' + letter: char
    for letter, char in zip('abfnrtv', '\a\b\f\n\r\t\v')
}.get
235

236
# Sequences that get escaped in C string literals: backslash, '??',
# the double quote, and the characters with codes 0..31.
_c_special = ('\\', '??', '"', *map(chr, range(32)))
237

238

239
def _to_escape_sequence(s):
    """Return the escape sequence text used for the special sequence ``s``."""
    if s in '\n\r\t':
        # repr() gives the familiar backslash form; strip its quotes
        return repr(s)[1:-1]
    if s == '"':
        return r'\"'
    if s == '\\':
        return r'\\'
    # within a character sequence, oct passes much better than hex
    octal_escapes = ['\\' + format(ord(char), '03o') for char in s]
    return ''.join(octal_escapes)
249

250

251
def _build_specials_replacer():
    """Build and return a function that escapes every sequence listed in
    ``_c_special`` inside a byte string.
    """
    # One alternative per special sequence; each character is put into
    # its own character class, with backslashes doubled for the regex.
    alternatives = [
        ''.join('[%s]' % char.replace('\\', '\\\\') for char in special)
        for special in _c_special
    ]
    # bytes of the special sequence -> bytes of its escape sequence
    escaped_for = {
        special.encode('ASCII'): _to_escape_sequence(special).encode('ASCII')
        for special in _c_special
    }
    pattern = ('(%s)' % '|'.join(alternatives)).encode('ASCII')
    substitute = re.compile(pattern).sub

    def escape_match(match):
        return escaped_for[match.group(1)]

    def replace(s):
        return substitute(escape_match, s)

    return replace
264

265
# Replacer function built once at import time: it takes a byte string and
# returns it with every sequence from _c_special replaced by its escape.
_replace_specials = _build_specials_replacer()
266

267

268
def escape_char(c):
    """Return the escaped text form of the single byte ``c``.

    Newline, carriage return, tab and backslash use their backslash
    form, the single quote is backslash-escaped, and codes below 32 or
    above 127 become a two-digit upper-case ``\\xNN`` hex escape.
    Anything else is returned unchanged.
    """
    char = c.decode('ISO-8859-1')
    if char in '\n\r\t\\':
        return repr(char)[1:-1]
    if char == "'":
        return "\\'"
    code = ord(char)
    if 32 <= code <= 127:
        return char
    # hex works well for characters
    return "\\x%02X" % code
280

281
def escape_byte_string(s):
    """Escape a byte string so that it can be written into C code.

    The result is a unicode string which, when encoded as ASCII, gives
    the byte sequence to write out.
    """
    escaped = _replace_specials(s)
    try:
        # trial decoding: plain ASCII => nothing more to escape
        return escaped.decode("ASCII")
    except UnicodeDecodeError:
        pass
    # At least one byte is >= 128: spell those as three-digit octal
    # escapes and keep all other bytes as they are.
    return ''.join(
        chr(byte) if byte < 128 else f'\\{byte:03o}'
        for byte in escaped
    )
300

301
def split_string_literal(s, limit=2000):
    """Break the literal text ``s`` into pieces of at most ``limit``
    characters.

    Strings shorter than ``limit`` are returned unchanged.  Otherwise the
    pieces are joined with ``""``, i.e. a closing quote directly followed
    by an opening quote once the result is wrapped in double quotes.  A cut
    is moved backwards when a backslash occurs within the last four
    characters of a piece, so that it lands before that backslash.
    """
    # MSVC can't handle long string literals.
    if len(s) < limit:
        return s
    else:
        start = 0
        chunks = []
        while start < len(s):
            end = start + limit
            if len(s) > end-4 and '\\' in s[end-4:end]:
                end -= 4 - s[end-4:end].find('\\')  # just before the backslash
                # step back over any backslashes directly before the cut
                while s[end-1] == '\\':
                    end -= 1
                    if end == start:
                        # must have been a long line of backslashes
                        end = start + limit - (limit % 2) - 4
                        break
            chunks.append(s[start:end])
            start = end
        return '""'.join(chunks)
321

322
def encode_pyunicode_string(s):
    """Create the Py_UNICODE[] representation of a given unicode string.

    Returns a pair of strings holding comma-separated decimal numbers:
    the UTF-16 code units and the UTF-32 code points, each followed by
    a terminating 0.  The UTF-16 part is empty when both sequences are
    identical.
    """
    values = [ord(char) for char in s]
    values.append(0)

    if sys.maxunicode >= 0x10000:  # Wide build or Py3.3
        utf32 = values
        utf16 = []
        for code_point in utf32:
            if code_point < 0x10000:
                utf16.append(code_point)
                continue
            # outside of BMP => split into a surrogate pair
            high, low = divmod(code_point - 0x10000, 1024)
            utf16.append(high + 0xD800)
            utf16.append(low + 0xDC00)
    else:
        utf16 = values
        utf32 = []
        for code_unit in utf16:
            completes_pair = (
                0xDC00 <= code_unit <= 0xDFFF
                and utf32
                and 0xD800 <= utf32[-1] <= 0xDBFF
            )
            if completes_pair:
                # merge the preceding high surrogate with this low one
                high, low = utf32[-1], code_unit
                utf32[-1] = ((high & 0x3FF) << 10) + (low & 0x3FF) + 0x10000
            else:
                utf32.append(code_unit)

    if utf16 == utf32:
        utf16 = []
    utf16_text = ",".join(map(str, utf16))
    utf32_text = ",".join(map(str, utf32))
    return utf16_text, utf32_text
348
cython

Использование cookies