efl

wordbreak.c
550 строк · 17.2 Кб
Перенос по словам
1
/* vim: set expandtab tabstop=4 softtabstop=4 shiftwidth=4: */
2

3
/*
 * Word breaking in a Unicode sequence.  Designed to be used in a
 * generic text renderer.
 *
 * Copyright (C) 2013-2019 Tom Hacohen <tom at stosb dot com>
 * Copyright (C) 2018 Wu Yongwei <wuyongwei at gmail dot com>
 *
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the author be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute
 * it freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must
 *    not claim that you wrote the original software.  If you use this
 *    software in a product, an acknowledgement in the product
 *    documentation would be appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must
 *    not be misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source
 *    distribution.
 *
 * The main reference is Unicode Standard Annex 29 (UAX #29):
 *      <URL:http://unicode.org/reports/tr29>
 *
 * When this library was designed, this annex was at Revision 17, for
 * Unicode 6.0.0:
 *      <URL:http://www.unicode.org/reports/tr29/tr29-17.html>
 *
 * This library has been updated according to Revision 35, for
 * Unicode 12.0.0:
 *      <URL:http://www.unicode.org/reports/tr29/tr29-35.html>
 *
 * The Unicode Terms of Use are available at
 *      <URL:http://www.unicode.org/copyright.html>
 */
41

42
/**
 * @file    wordbreak.c
 *
 * Implementation of the word breaking algorithm as described in Unicode
 * Standard Annex 29.
 *
 * @author  Tom Hacohen
 */
50

51
#include <assert.h>
52
#include <stddef.h>
53
#include <string.h>
54
#include "unibreakdef.h"
55
#include "wordbreak.h"
56
#include "wordbreakdata.c"
57
#include "emojidef.h"
58

59
/**
 * Initializes the wordbreak internals.
 *
 * There is currently no state to set up, so the call has no effect; the
 * entry point is kept so that initialization work can be added later.
 */
void init_wordbreak(void)
{
    /* Intentionally empty. */
}
66

67
/**
68
 * Gets the word breaking class of a character.
69
 *
70
 * @param ch   character to check
71
 * @param wbp  pointer to the wbp breaking properties array
72
 * @param len  size of the wbp array in number of items
73
 * @return     the word breaking class if found; \c WBP_Any otherwise
74
 */
75
static enum WordBreakClass get_char_wb_class(
76
        utf32_t ch,
77
        const struct WordBreakProperties *wbp,
78
        size_t len)
79
{
80
    int min = 0;
81
    int max = len - 1;
82
    int mid;
83

84
    do
85
    {
86
        mid = (min + max) / 2;
87

88
        if (ch < wbp[mid].start)
89
            max = mid - 1;
90
        else if (ch > wbp[mid].end)
91
            min = mid + 1;
92
        else
93
            return wbp[mid].prop;
94
    }
95
    while (min <= max);
96

97
    return WBP_Any;
98
}
99

100
/**
101
 * Sets the word break types to a specific value in a range.
102
 *
103
 * It sets the inside chars to #WORDBREAK_INSIDEACHAR and the rest to brkType.
104
 * Assumes \a brks is initialized - all the cells with #WORDBREAK_NOBREAK are
105
 * cells that we really don't want to break after.
106
 *
107
 * @param[in]  s             input string
108
 * @param[out] brks          breaks array to fill
109
 * @param[in]  posStart      start position
110
 * @param[in]  posEnd        end position (exclusive)
111
 * @param[in]  len           length of the string
112
 * @param[in]  brkType       breaks type to use
113
 * @param[in] get_next_char  function to get the next UTF-32 character
114
 */
115
static void set_brks_to(
116
        const void *s,
117
        char *brks,
118
        size_t posStart,
119
        size_t posEnd,
120
        size_t len,
121
        char brkType,
122
        get_next_char_t get_next_char)
123
{
124
    size_t posNext = posStart;
125
    while (posNext < posEnd)
126
    {
127
        utf32_t ch;
128
        ch = get_next_char(s, len, &posNext);
129
        (void)ch;
130
        assert(ch != EOS);
131
        for (; posStart < posNext - 1; ++posStart)
132
            brks[posStart] = WORDBREAK_INSIDEACHAR;
133
        assert(posStart == posNext - 1);
134

135
        /* Only set it if we haven't set it not to break before. */
136
        if (brks[posStart] != WORDBREAK_NOBREAK)
137
            brks[posStart] = brkType;
138
        posStart = posNext;
139
    }
140
}
141

142
/* Checks to see if the class is newline, CR, or LF (rules WB3a and b).
 * The argument is parenthesized so that any expression may be passed;
 * note that it is evaluated up to three times. */
#define IS_WB3ab(cls) (((cls) == WBP_Newline) || ((cls) == WBP_CR) || \
                       ((cls) == WBP_LF))
145

146
/**
147
 * Sets the word breaking information for a generic input string.
148
 *
149
 * @param[in]  s             input string
150
 * @param[in]  len           length of the input
151
 * @param[in]  lang          language of the input (reserved for future use)
152
 * @param[out] brks          pointer to the output breaking data, containing
153
 *                           #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or
154
 *                           #WORDBREAK_INSIDEACHAR
155
 * @param[in] get_next_char  function to get the next UTF-32 character
156
 */
157
static void set_wordbreaks(
158
        const void *s,
159
        size_t len,
160
        const char *lang,
161
        char *brks,
162
        get_next_char_t get_next_char)
163
{
164
    /* Counter of how many time we cam across RI */
165
    int riCounter = 0;
166
    enum WordBreakClass wbcLast = WBP_Undefined;
167
    /* wbcSeqStart is the class that started the current sequence.
168
     * WBP_Undefined is a special case that means "sot".
169
     * This value is the class that is at the start of the current rule
170
     * matching sequence. For example, in case of Numeric+MidNum+Numeric
171
     * it'll be Numeric all the way.
172
     */
173
    enum WordBreakClass wbcSeqStart = WBP_Undefined;
174
    utf32_t ch;
175
    size_t posNext = 0;
176
    size_t posCur = 0;
177
    size_t posLast = 0;
178

179
    /* TODO: Language-specific specialization. */
180
    (void) lang;
181

182
    /* Init brks. */
183
    memset(brks, WORDBREAK_BREAK, len);
184

185
    ch = get_next_char(s, len, &posNext);
186

187
    while (ch != EOS)
188
    {
189
        enum WordBreakClass wbcCur;
190
        wbcCur = get_char_wb_class(ch, wb_prop_default,
191
                                   ARRAY_LEN(wb_prop_default));
192

193
        switch (wbcCur)
194
        {
195
        case WBP_CR:
196
            /* WB3b */
197
            set_brks_to(s, brks, posLast, posCur, len,
198
                        WORDBREAK_BREAK, get_next_char);
199
            wbcSeqStart = wbcCur;
200
            posLast = posCur;
201
            break;
202

203
        case WBP_LF:
204
            if (wbcSeqStart == WBP_CR) /* WB3 */
205
            {
206
                set_brks_to(s, brks, posLast, posCur, len,
207
                            WORDBREAK_NOBREAK, get_next_char);
208
                wbcSeqStart = wbcCur;
209
                posLast = posCur;
210
                break;
211
            }
212
#ifndef __has_attribute
213
# define __has_attribute(x) 0
214
#endif
215
#if __has_attribute(fallthrough)
216
           __attribute__((fallthrough));
217
#endif
218
            /* Fall through */
219

220
        case WBP_Newline:
221
            /* WB3a,3b */
222
            set_brks_to(s, brks, posLast, posCur, len,
223
                        WORDBREAK_BREAK, get_next_char);
224
            wbcSeqStart = wbcCur;
225
            posLast = posCur;
226
            break;
227

228
        case WBP_ZWJ:
229
        case WBP_Extend:
230
        case WBP_Format:
231
            /* WB4 - If not the first char/after a newline (WB3a,3b), skip
232
             * this class, set it to be the same as the prev, and mark
233
             * brks not to break before them. */
234
            if ((wbcSeqStart == WBP_Undefined) || IS_WB3ab(wbcSeqStart))
235
            {
236
                set_brks_to(s, brks, posLast, posCur, len,
237
                            WORDBREAK_BREAK, get_next_char);
238
                wbcSeqStart = wbcCur;
239
                posLast = posCur;
240
            }
241
            else
242
            {
243
                /* It's surely not the first */
244
                brks[posCur - 1] = WORDBREAK_NOBREAK;
245
                /* WB3c and WB3d precede 4, so no intervening Extend
246
                 * chars allowed. */
247
                if (wbcCur != WBP_ZWJ && wbcSeqStart != WBP_ZWJ &&
248
                    wbcSeqStart != WBP_WSegSpace)
249
                {
250
                    /* "inherit" the previous class. */
251
                    wbcCur = wbcLast;
252
                }
253
            }
254
            break;
255

256
        case WBP_Katakana:
257
            if ((wbcSeqStart == WBP_Katakana) || /* WB13 */
258
                    (wbcSeqStart == WBP_ExtendNumLet)) /* WB13b */
259
            {
260
                set_brks_to(s, brks, posLast, posCur, len,
261
                            WORDBREAK_NOBREAK, get_next_char);
262
            }
263
            /* No rule found, reset */
264
            else
265
            {
266
                set_brks_to(s, brks, posLast, posCur, len,
267
                            WORDBREAK_BREAK, get_next_char);
268
            }
269
            wbcSeqStart = wbcCur;
270
            posLast = posCur;
271
            break;
272

273
        case WBP_Hebrew_Letter:
274
        case WBP_ALetter:
275
            if ((wbcSeqStart == WBP_Hebrew_Letter) &&
276
                    (wbcLast == WBP_Double_Quote)) /* WB7b,c */
277
            {
278
               if (wbcCur == WBP_Hebrew_Letter)
279
                 {
280
                     set_brks_to(s, brks, posLast, posCur, len,
281
                             WORDBREAK_NOBREAK, get_next_char);
282
                 }
283
               else
284
                 {
285
                     set_brks_to(s, brks, posLast, posCur, len,
286
                             WORDBREAK_BREAK, get_next_char);
287
                 }
288
            }
289
            else if (((wbcSeqStart == WBP_ALetter) ||
290
                        (wbcSeqStart == WBP_Hebrew_Letter)) || /* WB5,6,7 */
291
                    (wbcLast == WBP_Numeric) || /* WB10 */
292
                    (wbcSeqStart == WBP_ExtendNumLet)) /* WB13b */
293
            {
294
                set_brks_to(s, brks, posLast, posCur, len,
295
                            WORDBREAK_NOBREAK, get_next_char);
296
            }
297
            /* No rule found, reset */
298
            else
299
            {
300
                set_brks_to(s, brks, posLast, posCur, len,
301
                            WORDBREAK_BREAK, get_next_char);
302
            }
303
            wbcSeqStart = wbcCur;
304
            posLast = posCur;
305
            break;
306

307
        case WBP_Single_Quote:
308
            if (wbcLast == WBP_Hebrew_Letter) /* WB7a */
309
            {
310
                set_brks_to(s, brks, posLast, posCur, len,
311
                            WORDBREAK_NOBREAK, get_next_char);
312
                wbcSeqStart = wbcCur;
313
                posLast = posCur;
314
            }
315
#ifndef __has_attribute
316
# define __has_attribute(x) 0
317
#endif
318
#if __has_attribute(fallthrough)
319
           __attribute__((fallthrough));
320
#endif
321
            /* Fall through */
322

323
        case WBP_MidNumLet:
324
            if (((wbcLast == WBP_ALetter) ||
325
                        (wbcLast == WBP_Hebrew_Letter)) || /* WB6,7 */
326
                    (wbcLast == WBP_Numeric)) /* WB11,12 */
327
            {
328
                /* Go on */
329
            }
330
            else
331
            {
332
                set_brks_to(s, brks, posLast, posCur, len,
333
                            WORDBREAK_BREAK, get_next_char);
334
                wbcSeqStart = wbcCur;
335
                posLast = posCur;
336
            }
337
            break;
338

339
        case WBP_MidLetter:
340
            if ((wbcLast == WBP_ALetter) ||
341
                    (wbcLast == WBP_Hebrew_Letter)) /* WB6,7 */
342
            {
343
                /* Go on */
344
            }
345
            else
346
            {
347
                set_brks_to(s, brks, posLast, posCur, len,
348
                            WORDBREAK_BREAK, get_next_char);
349
                wbcSeqStart = wbcCur;
350
                posLast = posCur;
351
            }
352
            break;
353

354
        case WBP_MidNum:
355
            if (wbcLast == WBP_Numeric) /* WB11,12 */
356
            {
357
                /* Go on */
358
            }
359
            else
360
            {
361
                set_brks_to(s, brks, posLast, posCur, len,
362
                            WORDBREAK_BREAK, get_next_char);
363
                wbcSeqStart = wbcCur;
364
                posLast = posCur;
365
            }
366
            break;
367

368
        case WBP_Numeric:
369
            if ((wbcSeqStart == WBP_Numeric) || /* WB8,11,12 */
370
                    ((wbcLast == WBP_ALetter) ||
371
                     (wbcLast == WBP_Hebrew_Letter)) || /* WB9 */
372
                    (wbcSeqStart == WBP_ExtendNumLet)) /* WB13b */
373
            {
374
                set_brks_to(s, brks, posLast, posCur, len,
375
                            WORDBREAK_NOBREAK, get_next_char);
376
            }
377
            /* No rule found, reset */
378
            else
379
            {
380
                set_brks_to(s, brks, posLast, posCur, len,
381
                            WORDBREAK_BREAK, get_next_char);
382
            }
383
            wbcSeqStart = wbcCur;
384
            posLast = posCur;
385
            break;
386

387
        case WBP_ExtendNumLet:
388
            /* WB13a,13b */
389
            if ((wbcSeqStart == wbcLast) &&
390
                ((wbcLast == WBP_ALetter) ||
391
                 (wbcLast == WBP_Hebrew_Letter) ||
392
                 (wbcLast == WBP_Numeric) ||
393
                 (wbcLast == WBP_Katakana) ||
394
                 (wbcLast == WBP_ExtendNumLet)))
395
            {
396
                set_brks_to(s, brks, posLast, posCur, len,
397
                            WORDBREAK_NOBREAK, get_next_char);
398
            }
399
            /* No rule found, reset */
400
            else
401
            {
402
                set_brks_to(s, brks, posLast, posCur, len,
403
                            WORDBREAK_BREAK, get_next_char);
404
            }
405
            wbcSeqStart = wbcCur;
406
            posLast = posCur;
407
            break;
408

409
        case WBP_Regional_Indicator:
410
            /* WB15,16 */
411
            if ((wbcSeqStart == WBP_Regional_Indicator) &&
412
                ((riCounter % 2) == 1))
413
            {
414
                set_brks_to(s, brks, posLast, posCur, len,
415
                        WORDBREAK_NOBREAK, get_next_char);
416
                riCounter = 0; /* Reset the sequence */
417
            }
418
            /* No rule found, reset */
419
            else
420
            {
421
                set_brks_to(s, brks, posLast, posCur, len,
422
                            WORDBREAK_BREAK, get_next_char);
423
                riCounter = 1;
424
            }
425
            wbcSeqStart = wbcCur;
426
            posLast = posCur;
427
            break;
428

429
        case WBP_Double_Quote:
430
            if (wbcLast == WBP_Hebrew_Letter) /* WB7b,c */
431
            {
432
               /* Go on */
433
            }
434
            else
435
            {
436
                set_brks_to(s, brks, posLast, posCur, len,
437
                            WORDBREAK_BREAK, get_next_char);
438
                wbcSeqStart = wbcCur;
439
                posLast = posCur;
440
            }
441
            break;
442

443
        case WBP_WSegSpace:
444
            if (wbcLast == WBP_WSegSpace) /* WB3d */
445
            {
446
                set_brks_to(s, brks, posLast, posCur, len,
447
                            WORDBREAK_NOBREAK, get_next_char);
448
                posLast = posCur;
449
                break;
450
            }
451
#ifndef __has_attribute
452
# define __has_attribute(x) 0
453
#endif
454
#if __has_attribute(fallthrough)
455
           __attribute__((fallthrough));
456
#endif
457
            /* Fall through */
458

459
        case WBP_Any:
460
            /* Check for rule WB3c */
461
            if (wbcLast == WBP_ZWJ && ub_is_extended_pictographic(ch))
462
            {
463
                set_brks_to(s, brks, posLast, posCur, len,
464
                            WORDBREAK_NOBREAK, get_next_char);
465
                posLast = posCur;
466
                break;
467
            }
468

469
            /* Allow breaks and reset */
470
            set_brks_to(s, brks, posLast, posCur, len,
471
                        WORDBREAK_BREAK, get_next_char);
472
            wbcSeqStart = wbcCur;
473
            posLast = posCur;
474
            break;
475

476
        default:
477
            /* Error, should never get here! */
478
            assert(0);
479
            break;
480
        }
481

482
        wbcLast = wbcCur;
483
        posCur = posNext;
484
        ch = get_next_char(s, len, &posNext);
485
    }
486

487
    /* WB2 */
488
    set_brks_to(s, brks, posLast, posNext, len,
489
                WORDBREAK_BREAK, get_next_char);
490
}
491

492
/**
493
 * Sets the word breaking information for a UTF-8 input string.
494
 *
495
 * @param[in]  s     input UTF-8 string
496
 * @param[in]  len   length of the input
497
 * @param[in]  lang  language of the input (reserved for future use)
498
 * @param[out] brks  pointer to the output breaking data, containing
499
 *                   #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or
500
 *                   #WORDBREAK_INSIDEACHAR
501
 */
502
void set_wordbreaks_utf8(
503
        const utf8_t *s,
504
        size_t len,
505
        const char *lang,
506
        char *brks)
507
{
508
    set_wordbreaks(s, len, lang, brks,
509
                   (get_next_char_t)ub_get_next_char_utf8);
510
}
511

512
/**
513
 * Sets the word breaking information for a UTF-16 input string.
514
 *
515
 * @param[in]  s     input UTF-16 string
516
 * @param[in]  len   length of the input
517
 * @param[in]  lang  language of the input (reserved for future use)
518
 * @param[out] brks  pointer to the output breaking data, containing
519
 *                   #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or
520
 *                   #WORDBREAK_INSIDEACHAR
521
 */
522
void set_wordbreaks_utf16(
523
        const utf16_t *s,
524
        size_t len,
525
        const char *lang,
526
        char *brks)
527
{
528
    set_wordbreaks(s, len, lang, brks,
529
                   (get_next_char_t)ub_get_next_char_utf16);
530
}
531

532
/**
533
 * Sets the word breaking information for a UTF-32 input string.
534
 *
535
 * @param[in]  s     input UTF-32 string
536
 * @param[in]  len   length of the input
537
 * @param[in]  lang  language of the input (reserved for future use)
538
 * @param[out] brks  pointer to the output breaking data, containing
539
 *                   #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or
540
 *                   #WORDBREAK_INSIDEACHAR
541
 */
542
void set_wordbreaks_utf32(
543
        const utf32_t *s,
544
        size_t len,
545
        const char *lang,
546
        char *brks)
547
{
548
    set_wordbreaks(s, len, lang, brks,
549
                   (get_next_char_t)ub_get_next_char_utf32);
550
}
551
efl

Использование cookies