efl
305 строк · 10.2 Кб
1/*
2* Grapheme breaking in a Unicode sequence. Designed to be used in a
3* generic text renderer.
4*
5* Copyright (C) 2016-2019 Andreas Röver <roever at users dot sf dot net>
6*
7* This software is provided 'as-is', without any express or implied
8* warranty. In no event will the author be held liable for any damages
9* arising from the use of this software.
10*
11* Permission is granted to anyone to use this software for any purpose,
12* including commercial applications, and to alter it and redistribute
13* it freely, subject to the following restrictions:
14*
15* 1. The origin of this software must not be misrepresented; you must
16* not claim that you wrote the original software. If you use this
17* software in a product, an acknowledgement in the product
18* documentation would be appreciated but is not required.
19* 2. Altered source versions must be plainly marked as such, and must
20* not be misrepresented as being the original software.
21* 3. This notice may not be removed or altered from any source
22* distribution.
23*
24* The main reference is Unicode Standard Annex 29 (UAX #29):
25* <URL:http://unicode.org/reports/tr29>
26*
27* When this library was designed, this annex was at Revision 29, for
28* Unicode 9.0.0:
29* <URL:http://www.unicode.org/reports/tr29/tr29-29.html>
30*
31* This library has been updated according to Revision 35, for
32* Unicode 12.0.0:
33* <URL:http://www.unicode.org/reports/tr29/tr29-35.html>
34*
35* The Unicode Terms of Use are available at
36* <URL:http://www.unicode.org/copyright.html>
37*/
38
39/**
40* @file graphemebreak.c
41*
42* Implementation of the grapheme breaking algorithm as described in Unicode
43* Standard Annex 29.
44*
45* @author Andreas Röver
46*/
47
48#include <string.h>
49#include "graphemebreak.h"
50#include "graphemebreakdata.c"
51#include "unibreakdef.h"
52#include "emojidef.h"
53
/**
 * Initializes the grapheme-break internals.
 *
 * The function body is currently empty: no set-up work is performed.
 * It is kept as an entry point so that initialization can be added
 * later without changing callers.
 */
void init_graphemebreak(void)
{
    /* intentionally empty */
}
61
62/**
63* Gets the grapheme breaking class of a character.
64*
65* @param[in] ch character to check
66* @return the grapheme breaking class if found; \c GBP_Other otherwise
67*/
68static enum GraphemeBreakClass get_char_gb_class(utf32_t ch)
69{
70int min = 0;
71int max = ARRAY_LEN(gb_prop_default) - 1;
72int mid;
73
74do
75{
76mid = (min + max) / 2;
77
78if (ch < gb_prop_default[mid].start)
79max = mid - 1;
80else if (ch > gb_prop_default[mid].end)
81min = mid + 1;
82else
83return gb_prop_default[mid].prop;
84} while (min <= max);
85
86return GBP_Other;
87}
88
89/**
90* Sets the grapheme breaking information for a generic input string.
91* It uses the extended grapheme cluster ruleset.
92*
93* @param[in] s input string
94* @param[in] len length of the input
95* @param[out] brks pointer to the output breaking data, containing
96* #GRAPHEMEBREAK_BREAK or #GRAPHEMEBREAK_NOBREAK
97* @param[in] get_next_char function to get the next UTF-32 character
98*/
99static void set_graphemebreaks(const void *s, size_t len, char *brks,
100get_next_char_t get_next_char)
101{
102size_t posNext = 0;
103int rule11Detector = 0;
104bool evenRegionalIndicators = true; // is the number of preceeding
105// GBP_RegionalIndicator characters
106// even
107
108utf32_t ch = get_next_char(s, len, &posNext);
109enum GraphemeBreakClass current_class = get_char_gb_class(ch);
110
111// initialize whole output to inside char
112memset(brks, GRAPHEMEBREAK_INSIDEACHAR, len);
113
114while (true)
115{
116
117// this state-machine recognizes the following pattern:
118// extended_pictograph Extended* ZWJ
119// when that pattern has been detected rule11Detector will be
120// 3 and rule 11 can be applied below
121switch (current_class)
122{
123case GBP_ZWJ:
124if (rule11Detector == 1 || rule11Detector == 2)
125{
126rule11Detector = 3;
127}
128else
129{
130rule11Detector = 0;
131}
132break;
133
134case GBP_Extend:
135if (rule11Detector == 1 || rule11Detector == 2)
136{
137rule11Detector = 2;
138}
139else
140{
141rule11Detector = 0;
142}
143break;
144
145default:
146if (ub_is_extended_pictographic(ch))
147{
148rule11Detector = 1;
149}
150else
151{
152rule11Detector = 0;
153}
154break;
155}
156
157enum GraphemeBreakClass prev_class = current_class;
158
159// safe position if current character so that we can store the
160// result there later on
161size_t brksPos = posNext - 1;
162
163// get nect character
164ch = get_next_char(s, len, &posNext);
165
166if (ch == EOS)
167{
168// done, place one final break after the last character as per
169// algorithm rule GB1
170brks[brksPos] = GRAPHEMEBREAK_BREAK;
171break;
172}
173
174// get class of current character
175current_class = get_char_gb_class(ch);
176
177if (prev_class == GBP_Regional_Indicator)
178{
179evenRegionalIndicators = !evenRegionalIndicators;
180}
181else
182{
183evenRegionalIndicators = true;
184}
185
186// check all rules
187if (prev_class == GBP_CR && current_class == GBP_LF)
188{
189brks[brksPos] = GRAPHEMEBREAK_NOBREAK; // Rule: GB3
190}
191else if ((prev_class == GBP_CR) || (prev_class == GBP_LF) ||
192(prev_class == GBP_Control) || (current_class == GBP_CR) ||
193(current_class == GBP_LF) ||
194(current_class == GBP_Control))
195{
196brks[brksPos] = GRAPHEMEBREAK_BREAK; // Rule: GB4 + GB5
197}
198else if ((prev_class == GBP_L) &&
199((current_class == GBP_L) || (current_class == GBP_V) ||
200(current_class == GBP_LV) || (current_class == GBP_LVT)))
201{
202brks[brksPos] = GRAPHEMEBREAK_NOBREAK; // Rule: GB6
203}
204else if (((prev_class == GBP_LV) || (prev_class == GBP_V)) &&
205((current_class == GBP_V) || (current_class == GBP_T)))
206{
207brks[brksPos] = GRAPHEMEBREAK_NOBREAK; // Rule: GB7
208}
209else if (((prev_class == GBP_LVT) || (prev_class == GBP_T)) &&
210(current_class == GBP_T))
211{
212brks[brksPos] = GRAPHEMEBREAK_NOBREAK; // Rule: GB8
213}
214else if ((current_class == GBP_Extend) ||
215(current_class == GBP_ZWJ) ||
216(current_class == GBP_Virama))
217{
218brks[brksPos] = GRAPHEMEBREAK_NOBREAK; // Rule: GB9
219}
220else if (current_class == GBP_SpacingMark)
221{
222brks[brksPos] = GRAPHEMEBREAK_NOBREAK; // Rule: GB9a
223}
224else if (prev_class == GBP_Prepend)
225{
226brks[brksPos] = GRAPHEMEBREAK_NOBREAK; // Rule: GB9b
227}
228else if ((rule11Detector == 3) && ub_is_extended_pictographic(ch))
229{
230brks[brksPos] = GRAPHEMEBREAK_NOBREAK; // Rule: GB11
231}
232else if (!evenRegionalIndicators &&
233(current_class == GBP_Regional_Indicator))
234{
235brks[brksPos] = GRAPHEMEBREAK_NOBREAK; // Rule: GB12 + GB13
236}
237else
238{
239brks[brksPos] = GRAPHEMEBREAK_BREAK; // Rule: GB999
240}
241}
242}
243
244/**
245* Sets the grapheme breaking information for a UTF-8 input string.
246*
247* @param[in] s input UTF-8 string
248* @param[in] len length of the input
249* @param[in] lang language of the input (reserved for future use)
250* @param[out] brks pointer to the output breaking data, containing
251* #GRAPHEMEBREAK_BREAK or #GRAPHEMEBREAK_NOBREAK.
252* First element in output array is for the break behind
253* the first character the pointer must point to an
254* array with at least as many elements as there
255* are characters in the string
256*/
257void set_graphemebreaks_utf8(const utf8_t *s, size_t len, const char *lang,
258char *brks)
259{
260(void)lang;
261set_graphemebreaks(s, len, brks,
262(get_next_char_t)ub_get_next_char_utf8);
263}
264
265/**
266* Sets the grapheme breaking information for a UTF-16 input string.
267*
268* @param[in] s input UTF-16 string
269* @param[in] len length of the input
270* @param[in] lang language of the input (reserved for future use)
271* @param[out] brks pointer to the output breaking data, containing
272* #GRAPHEMEBREAK_BREAK or #GRAPHEMEBREAK_NOBREAK.
273* First element in output array is for the break behind
274* the first character the pointer must point to an
275* array with at least as many elements as there
276* are characters in the string
277*/
278void set_graphemebreaks_utf16(const utf16_t *s, size_t len,
279const char *lang, char *brks)
280{
281(void)lang;
282set_graphemebreaks(s, len, brks,
283(get_next_char_t)ub_get_next_char_utf16);
284}
285
286/**
287* Sets the grapheme breaking information for a UTF-32 input string.
288*
289* @param[in] s input UTF-32 string
290* @param[in] len length of the input
291* @param[in] lang language of the input (reserved for future use)
292* @param[out] brks pointer to the output breaking data, containing
293* #GRAPHEMEBREAK_BREAK or #GRAPHEMEBREAK_NOBREAK.
294* First element in output array is for the break behind
295* the first character the pointer must point to an
296* array with at least as many elements as there
297* are characters in the string
298*/
299void set_graphemebreaks_utf32(const utf32_t *s, size_t len,
300const char *lang, char *brks)
301{
302(void)lang;
303set_graphemebreaks(s, len, brks,
304(get_next_char_t)ub_get_next_char_utf32);
305}
306