/* vim: set expandtab tabstop=4 softtabstop=4 shiftwidth=4: */

/*
 * Word breaking in a Unicode sequence. Designed to be used in a
 * generic text renderer.
 *
 * Copyright (C) 2013-2019 Tom Hacohen <tom at stosb dot com>
 * Copyright (C) 2018 Wu Yongwei <wuyongwei at gmail dot com>
 *
 * This software is provided 'as-is', without any express or implied
 * warranty. In no event will the author be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute
 * it freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must
 *    not claim that you wrote the original software. If you use this
 *    software in a product, an acknowledgement in the product
 *    documentation would be appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must
 *    not be misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source
 *    distribution.
 *
 * The main reference is Unicode Standard Annex 29 (UAX #29):
 *      <URL:http://unicode.org/reports/tr29>
 *
 * When this library was designed, this annex was at Revision 17, for
 * Unicode 6.0.0:
 *      <URL:http://www.unicode.org/reports/tr29/tr29-17.html>
 *
 * This library has been updated according to Revision 35, for
 * Unicode 12.0.0:
 *      <URL:http://www.unicode.org/reports/tr29/tr29-35.html>
 *
 * The Unicode Terms of Use are available at
 *      <URL:http://www.unicode.org/copyright.html>
 */

/**
 * @file    wordbreak.c
 *
 * Implementation of the word breaking algorithm as described in Unicode
 * Standard Annex 29.
 *
 * @author  Tom Hacohen
 */

#include <assert.h>
#include <stddef.h>
#include <string.h>
#include "unibreakdef.h"
#include "wordbreak.h"
#include "wordbreakdata.c"
#include "emojidef.h"

/**
 * Initializes the wordbreak internals.  It currently does nothing, but
 * it may in the future.
 */
void init_wordbreak(void)
{
}

67/**
68* Gets the word breaking class of a character.
69*
70* @param ch character to check
71* @param wbp pointer to the wbp breaking properties array
72* @param len size of the wbp array in number of items
73* @return the word breaking class if found; \c WBP_Any otherwise
74*/
75static enum WordBreakClass get_char_wb_class(76utf32_t ch,77const struct WordBreakProperties *wbp,78size_t len)79{
80int min = 0;81int max = len - 1;82int mid;83
84do85{86mid = (min + max) / 2;87
88if (ch < wbp[mid].start)89max = mid - 1;90else if (ch > wbp[mid].end)91min = mid + 1;92else93return wbp[mid].prop;94}95while (min <= max);96
97return WBP_Any;98}
99
100/**
101* Sets the word break types to a specific value in a range.
102*
103* It sets the inside chars to #WORDBREAK_INSIDEACHAR and the rest to brkType.
104* Assumes \a brks is initialized - all the cells with #WORDBREAK_NOBREAK are
105* cells that we really don't want to break after.
106*
107* @param[in] s input string
108* @param[out] brks breaks array to fill
109* @param[in] posStart start position
110* @param[in] posEnd end position (exclusive)
111* @param[in] len length of the string
112* @param[in] brkType breaks type to use
113* @param[in] get_next_char function to get the next UTF-32 character
114*/
115static void set_brks_to(116const void *s,117char *brks,118size_t posStart,119size_t posEnd,120size_t len,121char brkType,122get_next_char_t get_next_char)123{
124size_t posNext = posStart;125while (posNext < posEnd)126{127utf32_t ch;128ch = get_next_char(s, len, &posNext);129(void)ch;130assert(ch != EOS);131for (; posStart < posNext - 1; ++posStart)132brks[posStart] = WORDBREAK_INSIDEACHAR;133assert(posStart == posNext - 1);134
135/* Only set it if we haven't set it not to break before. */136if (brks[posStart] != WORDBREAK_NOBREAK)137brks[posStart] = brkType;138posStart = posNext;139}140}
141
/* Checks to see if the class is newline, CR, or LF (rules WB3a and b).
 * The argument is expanded three times, so it must not have side effects. */
#define IS_WB3ab(cls) (((cls) == WBP_Newline) || ((cls) == WBP_CR) || \
                       ((cls) == WBP_LF))

146/**
147* Sets the word breaking information for a generic input string.
148*
149* @param[in] s input string
150* @param[in] len length of the input
151* @param[in] lang language of the input (reserved for future use)
152* @param[out] brks pointer to the output breaking data, containing
153* #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or
154* #WORDBREAK_INSIDEACHAR
155* @param[in] get_next_char function to get the next UTF-32 character
156*/
157static void set_wordbreaks(158const void *s,159size_t len,160const char *lang,161char *brks,162get_next_char_t get_next_char)163{
164/* Counter of how many time we cam across RI */165int riCounter = 0;166enum WordBreakClass wbcLast = WBP_Undefined;167/* wbcSeqStart is the class that started the current sequence.168* WBP_Undefined is a special case that means "sot".
169* This value is the class that is at the start of the current rule
170* matching sequence. For example, in case of Numeric+MidNum+Numeric
171* it'll be Numeric all the way.
172*/
173enum WordBreakClass wbcSeqStart = WBP_Undefined;174utf32_t ch;175size_t posNext = 0;176size_t posCur = 0;177size_t posLast = 0;178
179/* TODO: Language-specific specialization. */180(void) lang;181
182/* Init brks. */183memset(brks, WORDBREAK_BREAK, len);184
185ch = get_next_char(s, len, &posNext);186
187while (ch != EOS)188{189enum WordBreakClass wbcCur;190wbcCur = get_char_wb_class(ch, wb_prop_default,191ARRAY_LEN(wb_prop_default));192
193switch (wbcCur)194{195case WBP_CR:196/* WB3b */197set_brks_to(s, brks, posLast, posCur, len,198WORDBREAK_BREAK, get_next_char);199wbcSeqStart = wbcCur;200posLast = posCur;201break;202
203case WBP_LF:204if (wbcSeqStart == WBP_CR) /* WB3 */205{206set_brks_to(s, brks, posLast, posCur, len,207WORDBREAK_NOBREAK, get_next_char);208wbcSeqStart = wbcCur;209posLast = posCur;210break;211}212#ifndef __has_attribute213# define __has_attribute(x) 0214#endif215#if __has_attribute(fallthrough)216__attribute__((fallthrough));217#endif218/* Fall through */219
220case WBP_Newline:221/* WB3a,3b */222set_brks_to(s, brks, posLast, posCur, len,223WORDBREAK_BREAK, get_next_char);224wbcSeqStart = wbcCur;225posLast = posCur;226break;227
228case WBP_ZWJ:229case WBP_Extend:230case WBP_Format:231/* WB4 - If not the first char/after a newline (WB3a,3b), skip232* this class, set it to be the same as the prev, and mark
233* brks not to break before them. */
234if ((wbcSeqStart == WBP_Undefined) || IS_WB3ab(wbcSeqStart))235{236set_brks_to(s, brks, posLast, posCur, len,237WORDBREAK_BREAK, get_next_char);238wbcSeqStart = wbcCur;239posLast = posCur;240}241else242{243/* It's surely not the first */244brks[posCur - 1] = WORDBREAK_NOBREAK;245/* WB3c and WB3d precede 4, so no intervening Extend246* chars allowed. */
247if (wbcCur != WBP_ZWJ && wbcSeqStart != WBP_ZWJ &&248wbcSeqStart != WBP_WSegSpace)249{250/* "inherit" the previous class. */251wbcCur = wbcLast;252}253}254break;255
256case WBP_Katakana:257if ((wbcSeqStart == WBP_Katakana) || /* WB13 */258(wbcSeqStart == WBP_ExtendNumLet)) /* WB13b */259{260set_brks_to(s, brks, posLast, posCur, len,261WORDBREAK_NOBREAK, get_next_char);262}263/* No rule found, reset */264else265{266set_brks_to(s, brks, posLast, posCur, len,267WORDBREAK_BREAK, get_next_char);268}269wbcSeqStart = wbcCur;270posLast = posCur;271break;272
273case WBP_Hebrew_Letter:274case WBP_ALetter:275if ((wbcSeqStart == WBP_Hebrew_Letter) &&276(wbcLast == WBP_Double_Quote)) /* WB7b,c */277{278if (wbcCur == WBP_Hebrew_Letter)279{280set_brks_to(s, brks, posLast, posCur, len,281WORDBREAK_NOBREAK, get_next_char);282}283else284{285set_brks_to(s, brks, posLast, posCur, len,286WORDBREAK_BREAK, get_next_char);287}288}289else if (((wbcSeqStart == WBP_ALetter) ||290(wbcSeqStart == WBP_Hebrew_Letter)) || /* WB5,6,7 */291(wbcLast == WBP_Numeric) || /* WB10 */292(wbcSeqStart == WBP_ExtendNumLet)) /* WB13b */293{294set_brks_to(s, brks, posLast, posCur, len,295WORDBREAK_NOBREAK, get_next_char);296}297/* No rule found, reset */298else299{300set_brks_to(s, brks, posLast, posCur, len,301WORDBREAK_BREAK, get_next_char);302}303wbcSeqStart = wbcCur;304posLast = posCur;305break;306
307case WBP_Single_Quote:308if (wbcLast == WBP_Hebrew_Letter) /* WB7a */309{310set_brks_to(s, brks, posLast, posCur, len,311WORDBREAK_NOBREAK, get_next_char);312wbcSeqStart = wbcCur;313posLast = posCur;314}315#ifndef __has_attribute316# define __has_attribute(x) 0317#endif318#if __has_attribute(fallthrough)319__attribute__((fallthrough));320#endif321/* Fall through */322
323case WBP_MidNumLet:324if (((wbcLast == WBP_ALetter) ||325(wbcLast == WBP_Hebrew_Letter)) || /* WB6,7 */326(wbcLast == WBP_Numeric)) /* WB11,12 */327{328/* Go on */329}330else331{332set_brks_to(s, brks, posLast, posCur, len,333WORDBREAK_BREAK, get_next_char);334wbcSeqStart = wbcCur;335posLast = posCur;336}337break;338
339case WBP_MidLetter:340if ((wbcLast == WBP_ALetter) ||341(wbcLast == WBP_Hebrew_Letter)) /* WB6,7 */342{343/* Go on */344}345else346{347set_brks_to(s, brks, posLast, posCur, len,348WORDBREAK_BREAK, get_next_char);349wbcSeqStart = wbcCur;350posLast = posCur;351}352break;353
354case WBP_MidNum:355if (wbcLast == WBP_Numeric) /* WB11,12 */356{357/* Go on */358}359else360{361set_brks_to(s, brks, posLast, posCur, len,362WORDBREAK_BREAK, get_next_char);363wbcSeqStart = wbcCur;364posLast = posCur;365}366break;367
368case WBP_Numeric:369if ((wbcSeqStart == WBP_Numeric) || /* WB8,11,12 */370((wbcLast == WBP_ALetter) ||371(wbcLast == WBP_Hebrew_Letter)) || /* WB9 */372(wbcSeqStart == WBP_ExtendNumLet)) /* WB13b */373{374set_brks_to(s, brks, posLast, posCur, len,375WORDBREAK_NOBREAK, get_next_char);376}377/* No rule found, reset */378else379{380set_brks_to(s, brks, posLast, posCur, len,381WORDBREAK_BREAK, get_next_char);382}383wbcSeqStart = wbcCur;384posLast = posCur;385break;386
387case WBP_ExtendNumLet:388/* WB13a,13b */389if ((wbcSeqStart == wbcLast) &&390((wbcLast == WBP_ALetter) ||391(wbcLast == WBP_Hebrew_Letter) ||392(wbcLast == WBP_Numeric) ||393(wbcLast == WBP_Katakana) ||394(wbcLast == WBP_ExtendNumLet)))395{396set_brks_to(s, brks, posLast, posCur, len,397WORDBREAK_NOBREAK, get_next_char);398}399/* No rule found, reset */400else401{402set_brks_to(s, brks, posLast, posCur, len,403WORDBREAK_BREAK, get_next_char);404}405wbcSeqStart = wbcCur;406posLast = posCur;407break;408
409case WBP_Regional_Indicator:410/* WB15,16 */411if ((wbcSeqStart == WBP_Regional_Indicator) &&412((riCounter % 2) == 1))413{414set_brks_to(s, brks, posLast, posCur, len,415WORDBREAK_NOBREAK, get_next_char);416riCounter = 0; /* Reset the sequence */417}418/* No rule found, reset */419else420{421set_brks_to(s, brks, posLast, posCur, len,422WORDBREAK_BREAK, get_next_char);423riCounter = 1;424}425wbcSeqStart = wbcCur;426posLast = posCur;427break;428
429case WBP_Double_Quote:430if (wbcLast == WBP_Hebrew_Letter) /* WB7b,c */431{432/* Go on */433}434else435{436set_brks_to(s, brks, posLast, posCur, len,437WORDBREAK_BREAK, get_next_char);438wbcSeqStart = wbcCur;439posLast = posCur;440}441break;442
443case WBP_WSegSpace:444if (wbcLast == WBP_WSegSpace) /* WB3d */445{446set_brks_to(s, brks, posLast, posCur, len,447WORDBREAK_NOBREAK, get_next_char);448posLast = posCur;449break;450}451#ifndef __has_attribute452# define __has_attribute(x) 0453#endif454#if __has_attribute(fallthrough)455__attribute__((fallthrough));456#endif457/* Fall through */458
459case WBP_Any:460/* Check for rule WB3c */461if (wbcLast == WBP_ZWJ && ub_is_extended_pictographic(ch))462{463set_brks_to(s, brks, posLast, posCur, len,464WORDBREAK_NOBREAK, get_next_char);465posLast = posCur;466break;467}468
469/* Allow breaks and reset */470set_brks_to(s, brks, posLast, posCur, len,471WORDBREAK_BREAK, get_next_char);472wbcSeqStart = wbcCur;473posLast = posCur;474break;475
476default:477/* Error, should never get here! */478assert(0);479break;480}481
482wbcLast = wbcCur;483posCur = posNext;484ch = get_next_char(s, len, &posNext);485}486
487/* WB2 */488set_brks_to(s, brks, posLast, posNext, len,489WORDBREAK_BREAK, get_next_char);490}
491
492/**
493* Sets the word breaking information for a UTF-8 input string.
494*
495* @param[in] s input UTF-8 string
496* @param[in] len length of the input
497* @param[in] lang language of the input (reserved for future use)
498* @param[out] brks pointer to the output breaking data, containing
499* #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or
500* #WORDBREAK_INSIDEACHAR
501*/
502void set_wordbreaks_utf8(503const utf8_t *s,504size_t len,505const char *lang,506char *brks)507{
508set_wordbreaks(s, len, lang, brks,509(get_next_char_t)ub_get_next_char_utf8);510}
511
512/**
513* Sets the word breaking information for a UTF-16 input string.
514*
515* @param[in] s input UTF-16 string
516* @param[in] len length of the input
517* @param[in] lang language of the input (reserved for future use)
518* @param[out] brks pointer to the output breaking data, containing
519* #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or
520* #WORDBREAK_INSIDEACHAR
521*/
522void set_wordbreaks_utf16(523const utf16_t *s,524size_t len,525const char *lang,526char *brks)527{
528set_wordbreaks(s, len, lang, brks,529(get_next_char_t)ub_get_next_char_utf16);530}
531
532/**
533* Sets the word breaking information for a UTF-32 input string.
534*
535* @param[in] s input UTF-32 string
536* @param[in] len length of the input
537* @param[in] lang language of the input (reserved for future use)
538* @param[out] brks pointer to the output breaking data, containing
539* #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or
540* #WORDBREAK_INSIDEACHAR
541*/
542void set_wordbreaks_utf32(543const utf32_t *s,544size_t len,545const char *lang,546char *brks)547{
548set_wordbreaks(s, len, lang, brks,549(get_next_char_t)ub_get_next_char_utf32);550}
551