efl

Форк
0
/
graphemebreak.c 
305 строк · 10.2 Кб
1
/*
2
 * Grapheme breaking in a Unicode sequence.  Designed to be used in a
3
 * generic text renderer.
4
 *
5
 * Copyright (C) 2016-2019 Andreas Röver <roever at users dot sf dot net>
6
 *
7
 * This software is provided 'as-is', without any express or implied
8
 * warranty.  In no event will the author be held liable for any damages
9
 * arising from the use of this software.
10
 *
11
 * Permission is granted to anyone to use this software for any purpose,
12
 * including commercial applications, and to alter it and redistribute
13
 * it freely, subject to the following restrictions:
14
 *
15
 * 1. The origin of this software must not be misrepresented; you must
16
 *    not claim that you wrote the original software.  If you use this
17
 *    software in a product, an acknowledgement in the product
18
 *    documentation would be appreciated but is not required.
19
 * 2. Altered source versions must be plainly marked as such, and must
20
 *    not be misrepresented as being the original software.
21
 * 3. This notice may not be removed or altered from any source
22
 *    distribution.
23
 *
24
 * The main reference is Unicode Standard Annex 29 (UAX #29):
25
 *      <URL:http://unicode.org/reports/tr29>
26
 *
27
 * When this library was designed, this annex was at Revision 29, for
28
 * Unicode 9.0.0:
29
 *      <URL:http://www.unicode.org/reports/tr29/tr29-29.html>
30
 *
31
 * This library has been updated according to Revision 35, for
32
 * Unicode 12.0.0:
33
 *      <URL:http://www.unicode.org/reports/tr29/tr29-35.html>
34
 *
35
 * The Unicode Terms of Use are available at
36
 *      <URL:http://www.unicode.org/copyright.html>
37
 */
38

39
/**
40
 * @file    graphemebreak.c
41
 *
42
 * Implementation of the grapheme breaking algorithm as described in Unicode
43
 * Standard Annex 29.
44
 *
45
 * @author  Andreas Röver
46
 */
47

48
#include <string.h>
49
#include "graphemebreak.h"
50
#include "graphemebreakdata.c"
51
#include "unibreakdef.h"
52
#include "emojidef.h"
53

54
/**
 * Initializes the grapheme break internals.
 *
 * There is currently nothing to set up, so the function body is empty;
 * it exists so that callers have an initialization hook should one
 * become necessary in the future.
 */
void init_graphemebreak(void)
{
    /* intentionally empty */
}
61

62
/**
63
 * Gets the grapheme breaking class of a character.
64
 *
65
 * @param[in] ch  character to check
66
 * @return        the grapheme breaking class if found; \c GBP_Other otherwise
67
 */
68
static enum GraphemeBreakClass get_char_gb_class(utf32_t ch)
69
{
70
    int min = 0;
71
    int max = ARRAY_LEN(gb_prop_default) - 1;
72
    int mid;
73

74
    do
75
    {
76
        mid = (min + max) / 2;
77

78
        if (ch < gb_prop_default[mid].start)
79
            max = mid - 1;
80
        else if (ch > gb_prop_default[mid].end)
81
            min = mid + 1;
82
        else
83
            return gb_prop_default[mid].prop;
84
    } while (min <= max);
85

86
    return GBP_Other;
87
}
88

89
/**
90
 * Sets the grapheme breaking information for a generic input string.
91
 * It uses the extended grapheme cluster ruleset.
92
 *
93
 * @param[in]  s             input string
94
 * @param[in]  len           length of the input
95
 * @param[out] brks          pointer to the output breaking data, containing
96
 *                           #GRAPHEMEBREAK_BREAK or #GRAPHEMEBREAK_NOBREAK
97
 * @param[in] get_next_char  function to get the next UTF-32 character
98
 */
99
static void set_graphemebreaks(const void *s, size_t len, char *brks,
100
                               get_next_char_t get_next_char)
101
{
102
    size_t posNext = 0;
103
    int rule11Detector = 0;
104
    bool evenRegionalIndicators = true;  // is the number of preceeding
105
                                         // GBP_RegionalIndicator characters
106
                                         // even
107

108
    utf32_t ch = get_next_char(s, len, &posNext);
109
    enum GraphemeBreakClass current_class = get_char_gb_class(ch);
110

111
    // initialize whole output to inside char
112
    memset(brks, GRAPHEMEBREAK_INSIDEACHAR, len);
113

114
    while (true)
115
    {
116

117
        // this state-machine recognizes the following pattern:
118
        // extended_pictograph Extended* ZWJ
119
        // when that pattern has been detected rule11Detector will be
120
        // 3 and rule 11 can be applied below
121
        switch (current_class)
122
        {
123
            case GBP_ZWJ:
124
                if (rule11Detector == 1 || rule11Detector == 2)
125
                {
126
                    rule11Detector = 3;
127
                }
128
                else
129
                {
130
                    rule11Detector = 0;
131
                }
132
                break;
133

134
            case GBP_Extend:
135
                if (rule11Detector == 1 || rule11Detector == 2)
136
                {
137
                    rule11Detector = 2;
138
                }
139
                else
140
                {
141
                    rule11Detector = 0;
142
                }
143
                break;
144

145
            default:
146
                if (ub_is_extended_pictographic(ch))
147
                {
148
                    rule11Detector = 1;
149
                }
150
                else
151
                {
152
                    rule11Detector = 0;
153
                }
154
                break;
155
        }
156

157
        enum GraphemeBreakClass prev_class = current_class;
158

159
        // safe position if current character so that we can store the
160
        // result there later on
161
        size_t brksPos = posNext - 1;
162

163
        // get nect character
164
        ch = get_next_char(s, len, &posNext);
165

166
        if (ch == EOS)
167
        {
168
            // done, place one final break after the last character as per
169
            // algorithm rule GB1
170
            brks[brksPos] = GRAPHEMEBREAK_BREAK;
171
            break;
172
        }
173

174
        // get class of current character
175
        current_class = get_char_gb_class(ch);
176

177
        if (prev_class == GBP_Regional_Indicator)
178
        {
179
            evenRegionalIndicators = !evenRegionalIndicators;
180
        }
181
        else
182
        {
183
            evenRegionalIndicators = true;
184
        }
185

186
        // check all rules
187
        if (prev_class == GBP_CR && current_class == GBP_LF)
188
        {
189
            brks[brksPos] = GRAPHEMEBREAK_NOBREAK;  // Rule: GB3
190
        }
191
        else if ((prev_class == GBP_CR) || (prev_class == GBP_LF) ||
192
                 (prev_class == GBP_Control) || (current_class == GBP_CR) ||
193
                 (current_class == GBP_LF) ||
194
                 (current_class == GBP_Control))
195
        {
196
            brks[brksPos] = GRAPHEMEBREAK_BREAK;  // Rule: GB4 + GB5
197
        }
198
        else if ((prev_class == GBP_L) &&
199
                 ((current_class == GBP_L) || (current_class == GBP_V) ||
200
                  (current_class == GBP_LV) || (current_class == GBP_LVT)))
201
        {
202
            brks[brksPos] = GRAPHEMEBREAK_NOBREAK;  // Rule: GB6
203
        }
204
        else if (((prev_class == GBP_LV) || (prev_class == GBP_V)) &&
205
                 ((current_class == GBP_V) || (current_class == GBP_T)))
206
        {
207
            brks[brksPos] = GRAPHEMEBREAK_NOBREAK;  // Rule: GB7
208
        }
209
        else if (((prev_class == GBP_LVT) || (prev_class == GBP_T)) &&
210
                 (current_class == GBP_T))
211
        {
212
            brks[brksPos] = GRAPHEMEBREAK_NOBREAK;  // Rule: GB8
213
        }
214
        else if ((current_class == GBP_Extend) ||
215
                 (current_class == GBP_ZWJ) ||
216
                 (current_class == GBP_Virama))
217
        {
218
            brks[brksPos] = GRAPHEMEBREAK_NOBREAK;  // Rule: GB9
219
        }
220
        else if (current_class == GBP_SpacingMark)
221
        {
222
            brks[brksPos] = GRAPHEMEBREAK_NOBREAK;  // Rule: GB9a
223
        }
224
        else if (prev_class == GBP_Prepend)
225
        {
226
            brks[brksPos] = GRAPHEMEBREAK_NOBREAK;  // Rule: GB9b
227
        }
228
        else if ((rule11Detector == 3) && ub_is_extended_pictographic(ch))
229
        {
230
            brks[brksPos] = GRAPHEMEBREAK_NOBREAK;  // Rule: GB11
231
        }
232
        else if (!evenRegionalIndicators &&
233
                 (current_class == GBP_Regional_Indicator))
234
        {
235
            brks[brksPos] = GRAPHEMEBREAK_NOBREAK;  // Rule: GB12 + GB13
236
        }
237
        else
238
        {
239
            brks[brksPos] = GRAPHEMEBREAK_BREAK;  // Rule: GB999
240
        }
241
    }
242
}
243

244
/**
245
 * Sets the grapheme breaking information for a UTF-8 input string.
246
 *
247
 * @param[in]  s     input UTF-8 string
248
 * @param[in]  len   length of the input
249
 * @param[in]  lang  language of the input (reserved for future use)
250
 * @param[out] brks  pointer to the output breaking data, containing
251
 *                   #GRAPHEMEBREAK_BREAK or #GRAPHEMEBREAK_NOBREAK.
252
 *                   First element in output array is for the break behind
253
 *                   the first character the pointer must point to an
254
 *                   array with at least as many elements as there
255
 *                   are characters in the string
256
 */
257
void set_graphemebreaks_utf8(const utf8_t *s, size_t len, const char *lang,
258
                             char *brks)
259
{
260
    (void)lang;
261
    set_graphemebreaks(s, len, brks,
262
                       (get_next_char_t)ub_get_next_char_utf8);
263
}
264

265
/**
266
 * Sets the grapheme breaking information for a UTF-16 input string.
267
 *
268
 * @param[in]  s     input UTF-16 string
269
 * @param[in]  len   length of the input
270
 * @param[in]  lang  language of the input (reserved for future use)
271
 * @param[out] brks  pointer to the output breaking data, containing
272
 *                   #GRAPHEMEBREAK_BREAK or #GRAPHEMEBREAK_NOBREAK.
273
 *                   First element in output array is for the break behind
274
 *                   the first character the pointer must point to an
275
 *                   array with at least as many elements as there
276
 *                   are characters in the string
277
 */
278
void set_graphemebreaks_utf16(const utf16_t *s, size_t len,
279
                              const char *lang, char *brks)
280
{
281
    (void)lang;
282
    set_graphemebreaks(s, len, brks,
283
                       (get_next_char_t)ub_get_next_char_utf16);
284
}
285

286
/**
287
 * Sets the grapheme breaking information for a UTF-32 input string.
288
 *
289
 * @param[in]  s     input UTF-32 string
290
 * @param[in]  len   length of the input
291
 * @param[in]  lang  language of the input (reserved for future use)
292
 * @param[out] brks  pointer to the output breaking data, containing
293
 *                   #GRAPHEMEBREAK_BREAK or #GRAPHEMEBREAK_NOBREAK.
294
 *                   First element in output array is for the break behind
295
 *                   the first character the pointer must point to an
296
 *                   array with at least as many elements as there
297
 *                   are characters in the string
298
 */
299
void set_graphemebreaks_utf32(const utf32_t *s, size_t len,
300
                              const char *lang, char *brks)
301
{
302
    (void)lang;
303
    set_graphemebreaks(s, len, brks,
304
                       (get_next_char_t)ub_get_next_char_utf32);
305
}
306

Использование cookies

Мы используем файлы cookie в соответствии с Политикой конфиденциальности и Политикой использования cookies.

Нажимая кнопку «Принимаю», Вы даете АО «СберТех» согласие на обработку Ваших персональных данных в целях совершенствования нашего веб-сайта и Сервиса GitVerse, а также повышения удобства их использования.

Запретить использование cookies Вы можете самостоятельно в настройках Вашего браузера.