efl

Форк
0
896 строк · 31.9 Кб
1
/* vim: set expandtab tabstop=4 softtabstop=4 shiftwidth=4: */
2

3
/*
4
 * Line breaking in a Unicode sequence.  Designed to be used in a
5
 * generic text renderer.
6
 *
7
 * Copyright (C) 2008-2019 Wu Yongwei <wuyongwei at gmail dot com>
8
 * Copyright (C) 2013 Petr Filipsky <philodej at gmail dot com>
9
 *
10
 * This software is provided 'as-is', without any express or implied
11
 * warranty.  In no event will the author be held liable for any damages
12
 * arising from the use of this software.
13
 *
14
 * Permission is granted to anyone to use this software for any purpose,
15
 * including commercial applications, and to alter it and redistribute
16
 * it freely, subject to the following restrictions:
17
 *
18
 * 1. The origin of this software must not be misrepresented; you must
19
 *    not claim that you wrote the original software.  If you use this
20
 *    software in a product, an acknowledgement in the product
21
 *    documentation would be appreciated but is not required.
22
 * 2. Altered source versions must be plainly marked as such, and must
23
 *    not be misrepresented as being the original software.
24
 * 3. This notice may not be removed or altered from any source
25
 *    distribution.
26
 *
27
 * The main reference is Unicode Standard Annex 14 (UAX #14):
28
 *      <URL:http://www.unicode.org/reports/tr14/>
29
 *
30
 * When this library was designed, this annex was at Revision 19, for
31
 * Unicode 5.0.0:
32
 *      <URL:http://www.unicode.org/reports/tr14/tr14-19.html>
33
 *
34
 * This library has been updated according to Revision 43, for
35
 * Unicode 12.0.0:
36
 *      <URL:http://www.unicode.org/reports/tr14/tr14-43.html>
37
 *
38
 * The Unicode Terms of Use are available at
39
 *      <URL:http://www.unicode.org/copyright.html>
40
 */
41

42
/**
43
 * @file    linebreak.c
44
 *
45
 * Implementation of the line breaking algorithm as described in Unicode
46
 * Standard Annex 14.
47
 *
48
 * @author  Wu Yongwei
49
 * @author  Petr Filipsky
50
 */
51

52
#include <assert.h>
53
#include <stddef.h>
54
#include <string.h>
55
#include "linebreak.h"
56
#include "linebreakdef.h"
57

58
/**
 * Special value used internally to indicate an undefined break result.
 * The replacement list is parenthesized so that the negative literal
 * stays a single operand wherever the macro is expanded.
 */
#define LINEBREAK_UNDEFINED (-1)

/**
 * Size of the second-level index to the line breaking properties.
 */
#define LINEBREAK_INDEX_SIZE 40
67

68
/**
 * Break actions, i.e. the kinds of entry stored in the break action
 * pair table #baTable.  The enumerators are numbered consecutively
 * starting from zero.
 */
enum BreakAction
{
    /** Direct break opportunity */
    DIR_BRK = 0,
    /** Indirect break opportunity */
    IND_BRK = 1,
    /** Indirect break opportunity for combining marks */
    CMI_BRK = 2,
    /** Prohibited break for combining marks */
    CMP_BRK = 3,
    /** Prohibited break */
    PRH_BRK = 4
};
80

81
/**
82
 * Break action pair table.  This is a direct mapping of Table 2 of
83
 * Unicode Standard Annex 14, Revision 37, except for ZWJ (manually
84
 * adjusted after special processing as per LB8a of Revision 41) and CB
85
 * (manually added as per LB20).
86
 */
87
static enum BreakAction baTable[LBP_CB][LBP_CB] = {
88
    {   /* OP */
89
        PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK,
90
        PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK,
91
        PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK,
92
        CMP_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK,
93
        PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK },
94
    {   /* CL */
95
        DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, PRH_BRK, PRH_BRK,
96
        PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
97
        DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
98
        CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
99
        DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK },
100
    {   /* CP */
101
        DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, PRH_BRK, PRH_BRK,
102
        PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
103
        DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
104
        CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
105
        DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK },
106
    {   /* QU */
107
        PRH_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
108
        PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
109
        IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
110
        CMI_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
111
        IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK },
112
    {   /* GL */
113
        IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
114
        PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
115
        IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
116
        CMI_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
117
        IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK },
118
    {   /* NS */
119
        DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
120
        PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
121
        DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
122
        CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
123
        DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK },
124
    {   /* EX */
125
        DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
126
        PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
127
        DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
128
        CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
129
        DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK },
130
    {   /* SY */
131
        DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
132
        PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK, IND_BRK,
133
        DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
134
        CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
135
        DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK },
136
    {   /* IS */
137
        DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
138
        PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, IND_BRK,
139
        DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
140
        CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
141
        DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK },
142
    {   /* PR */
143
        IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
144
        PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, IND_BRK,
145
        IND_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
146
        CMI_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
147
        DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK },
148
    {   /* PO */
149
        IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
150
        PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, IND_BRK,
151
        DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
152
        CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
153
        DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK },
154
    {   /* NU */
155
        IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
156
        PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
157
        DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
158
        CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
159
        DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK },
160
    {   /* AL */
161
        IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
162
        PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
163
        DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
164
        CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
165
        DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK },
166
    {   /* HL */
167
        IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
168
        PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
169
        DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
170
        CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
171
        DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK },
172
    {   /* ID */
173
        DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
174
        PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
175
        DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
176
        CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
177
        DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK },
178
    {   /* IN */
179
        DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
180
        PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
181
        DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
182
        CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
183
        DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK },
184
    {   /* HY */
185
        DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, DIR_BRK, IND_BRK, PRH_BRK,
186
        PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK,
187
        DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
188
        CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
189
        DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK },
190
    {   /* BA */
191
        DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, DIR_BRK, IND_BRK, PRH_BRK,
192
        PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
193
        DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
194
        CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
195
        DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK },
196
    {   /* BB */
197
        IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
198
        PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
199
        IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
200
        CMI_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
201
        IND_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK },
202
    {   /* B2 */
203
        DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
204
        PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
205
        DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, PRH_BRK, PRH_BRK,
206
        CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
207
        DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK },
208
    {   /* ZW */
209
        DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
210
        DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
211
        DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
212
        DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
213
        DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
214
    {   /* CM */
215
        IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
216
        PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
217
        DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
218
        CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
219
        DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK },
220
    {   /* WJ */
221
        IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
222
        PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
223
        IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
224
        CMI_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
225
        IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK },
226
    {   /* H2 */
227
        DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
228
        PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
229
        DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
230
        CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK,
231
        DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK },
232
    {   /* H3 */
233
        DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
234
        PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
235
        DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
236
        CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK,
237
        DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK },
238
    {   /* JL */
239
        DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
240
        PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
241
        DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
242
        CMI_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK,
243
        DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK },
244
    {   /* JV */
245
        DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
246
        PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
247
        DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
248
        CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK,
249
        DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK },
250
    {   /* JT */
251
        DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
252
        PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
253
        DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
254
        CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK,
255
        DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK },
256
    {   /* RI */
257
        DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
258
        PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
259
        DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
260
        CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
261
        IND_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK },
262
    {   /* EB */
263
        DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
264
        PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
265
        DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
266
        CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
267
        DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK },
268
    {   /* EM */
269
        DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
270
        PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
271
        DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
272
        CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
273
        DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK },
274
    {   /* ZWJ */
275
        IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
276
        PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
277
        DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
278
        CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
279
        DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK },
280
    {   /* CB */
281
        DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, DIR_BRK, PRH_BRK,
282
        PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
283
        DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
284
        CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
285
        DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK },
286
};
287

288
/**
289
 * Struct for the second-level index to the line breaking properties.
290
 */
291
struct LineBreakPropertiesIndex
292
{
293
    utf32_t end;                           /**< End codepoint */
294
    const struct LineBreakProperties *lbp; /**< Pointer to line breaking
295
                                                properties */
296
};
297

298
/**
299
 * Second-level index to the line breaking properties.
300
 */
301
static struct LineBreakPropertiesIndex lb_prop_index[LINEBREAK_INDEX_SIZE] =
302
{
303
    { 0xFFFFFFFF, lb_prop_default }
304
};
305

306
/**
 * Checks whether \a str ends with \a suffix, which has length
 * \a suffixLen.
 *
 * @param str        string whose ending is to be checked; may be
 *                   \c NULL, in which case the result is zero
 * @param suffix     string to check
 * @param suffixLen  length of \a suffix
 * @return           non-zero if true; zero otherwise
 */
static __inline int ends_with(const char *str, const char *suffix,
                              unsigned suffixLen)
{
    size_t len;

    if (str == NULL)
    {
        return 0;
    }

    /* Keep the full size_t result of strlen(): narrowing it to unsigned
     * would give a wrong length (and thus a wrong comparison window)
     * for strings longer than UINT_MAX */
    len = strlen(str);
    if (len < suffixLen)
    {
        return 0;
    }
    return memcmp(str + (len - suffixLen), suffix, suffixLen) == 0;
}

/**
 * Checks whether \a str ends with \a suffix.  The length of \a suffix
 * is taken with \c sizeof, so \a suffix must be a string literal (or a
 * character array), not a pointer.
 */
#define ENDS_WITH(str, suffix) ends_with((str), (suffix), sizeof(suffix) - 1)
336

337
/**
338
 * Initializes the second-level index to the line breaking properties.
339
 * If it is not called, the performance of #get_char_lb_class_lang (and
340
 * thus the main functionality) can be pretty bad, especially for big
341
 * codepoints like those of Chinese.
342
 */
343
void init_linebreak(void)
344
{
345
    size_t i;
346
    size_t iPropDefault;
347
    size_t len;
348
    size_t step;
349

350
    len = 0;
351
    while (lb_prop_default[len].prop != LBP_Undefined)
352
        ++len;
353
    step = len / LINEBREAK_INDEX_SIZE;
354
    iPropDefault = 0;
355
    for (i = 0; i < LINEBREAK_INDEX_SIZE; ++i)
356
    {
357
        lb_prop_index[i].lbp = lb_prop_default + iPropDefault;
358
        iPropDefault += step;
359
        lb_prop_index[i].end = lb_prop_default[iPropDefault].start - 1;
360
    }
361
    lb_prop_index[--i].end = 0xFFFFFFFF;
362
}
363

364
/**
365
 * Gets the language-specific line breaking properties.
366
 *
367
 * @param lang  language of the text
368
 * @return      pointer to the language-specific line breaking
369
 *              properties array if found; \c NULL otherwise
370
 */
371
static const struct LineBreakProperties *get_lb_prop_lang(const char *lang)
372
{
373
    const struct LineBreakPropertiesLang *lbplIter;
374
    if (lang != NULL)
375
    {
376
        for (lbplIter = lb_prop_lang_map; lbplIter->lang != NULL; ++lbplIter)
377
        {
378
            if (strncmp(lang, lbplIter->lang, lbplIter->namelen) == 0)
379
            {
380
                return lbplIter->lbp;
381
            }
382
        }
383
    }
384
    return NULL;
385
}
386

387
/**
388
 * Gets the line breaking class of a character from a line breaking
389
 * properties array.
390
 *
391
 * @param ch   character to check
392
 * @param lbp  pointer to the line breaking properties array
393
 * @return     the line breaking class if found; \c LBP_XX otherwise
394
 */
395
static enum LineBreakClass get_char_lb_class(
396
        utf32_t ch,
397
        const struct LineBreakProperties *lbp)
398
{
399
    while (lbp->prop != LBP_Undefined && ch >= lbp->start)
400
    {
401
        if (ch <= lbp->end)
402
            return lbp->prop;
403
        ++lbp;
404
    }
405
    return LBP_XX;
406
}
407

408
/**
409
 * Gets the line breaking class of a character from the default line
410
 * breaking properties array.
411
 *
412
 * @param ch  character to check
413
 * @return    the line breaking class if found; \c LBP_XX otherwise
414
 */
415
static enum LineBreakClass get_char_lb_class_default(
416
        utf32_t ch)
417
{
418
    size_t i = 0;
419
    while (ch > lb_prop_index[i].end)
420
        ++i;
421
    assert(i < LINEBREAK_INDEX_SIZE);
422
    return get_char_lb_class(ch, lb_prop_index[i].lbp);
423
}
424

425
/**
426
 * Gets the line breaking class of a character for a specific
427
 * language.  This function will check the language-specific data first,
428
 * and then the default data if there is no language-specific property
429
 * available for the character.
430
 *
431
 * @param ch       character to check
432
 * @param lbpLang  pointer to the language-specific line breaking
433
 *                 properties array
434
 * @return         the line breaking class if found; \c LBP_XX
435
 *                 otherwise
436
 */
437
static enum LineBreakClass get_char_lb_class_lang(
438
        utf32_t ch,
439
        const struct LineBreakProperties *lbpLang)
440
{
441
    enum LineBreakClass lbcResult;
442

443
    /* Find the language-specific line breaking class for a character */
444
    if (lbpLang)
445
    {
446
        lbcResult = get_char_lb_class(ch, lbpLang);
447
        if (lbcResult != LBP_XX)
448
            return lbcResult;
449
    }
450

451
    /* Find the generic language-specific line breaking class, if no
452
     * language context is provided, or language-specific data are not
453
     * available for the specific character in the specified language */
454
    return get_char_lb_class_default(ch);
455
}
456

457
/**
458
 * Resolves the line breaking class for certain ambiguous or complicated
459
 * characters.  They are treated in a simplistic way in this
460
 * implementation.
461
 *
462
 * @param lbc   line breaking class to resolve
463
 * @param lang  language of the text
464
 * @return      the resolved line breaking class
465
 */
466
static enum LineBreakClass resolve_lb_class(
467
        enum LineBreakClass lbc,
468
        const char *lang)
469
{
470
    switch (lbc)
471
    {
472
    case LBP_AI:
473
        if (lang != NULL &&
474
                (strncmp(lang, "zh", 2) == 0 || /* Chinese */
475
                 strncmp(lang, "ja", 2) == 0 || /* Japanese */
476
                 strncmp(lang, "ko", 2) == 0))  /* Korean */
477
        {
478
            return LBP_ID;
479
        }
480
        else
481
        {
482
            return LBP_AL;
483
        }
484
    case LBP_CJ:
485
        /* `Strict' and `normal' line breaking.  See
486
         * <url:http://www.unicode.org/reports/tr14/#CJ>
487
         * for details. */
488
        if (ENDS_WITH(lang, "-strict"))
489
        {
490
            return LBP_NS;
491
        }
492
        else
493
        {
494
            return LBP_ID;
495
        }
496
    case LBP_SA:
497
    case LBP_SG:
498
    case LBP_XX:
499
        return LBP_AL;
500
    default:
501
        return lbc;
502
    }
503
}
504

505
/**
506
 * Treats specially for the first character in a line.
507
 *
508
 * @param[in,out] lbpCtx  pointer to the line breaking context
509
 * @pre                   \a lbpCtx->lbcCur has a valid line break class
510
 * @post                  \a lbpCtx->lbcCur has the updated line break class
511
 */
512
static void treat_first_char(
513
        struct LineBreakContext *lbpCtx)
514
{
515
    switch (lbpCtx->lbcCur)
516
    {
517
    case LBP_LF:
518
    case LBP_NL:
519
        lbpCtx->lbcCur = LBP_BK;        /* Rule LB5 */
520
        break;
521
    case LBP_SP:
522
        lbpCtx->lbcCur = LBP_WJ;        /* Leading space treated as WJ */
523
        break;
524
    default:
525
        break;
526
    }
527
}
528

529
/**
530
 * Tries telling the line break opportunity by simple rules.
531
 *
532
 * @param[in,out] lbpCtx  pointer to the line breaking context
533
 * @pre                   \a lbpCtx->lbcCur has the current line break
534
 *                        class; and \a lbpCtx->lbcNew has the line
535
 *                        break class for the next character
536
 * @post                  \a lbpCtx->lbcCur has the updated line break
537
 *                        class
538
 * @return                break result, one of #LINEBREAK_MUSTBREAK,
539
 *                        #LINEBREAK_ALLOWBREAK, and #LINEBREAK_NOBREAK
540
 *                        if identified; or #LINEBREAK_UNDEFINED if
541
 *                        table lookup is needed
542
 */
543
static int get_lb_result_simple(
544
        struct LineBreakContext *lbpCtx)
545
{
546
    if (lbpCtx->lbcCur == LBP_BK
547
        || (lbpCtx->lbcCur == LBP_CR && lbpCtx->lbcNew != LBP_LF))
548
    {
549
        return LINEBREAK_MUSTBREAK;     /* Rules LB4 and LB5 */
550
    }
551

552
    switch (lbpCtx->lbcNew)
553
    {
554
    case LBP_SP:
555
        return LINEBREAK_NOBREAK;       /* Rule LB7; no change to lbcCur */
556
    case LBP_BK:
557
    case LBP_LF:
558
    case LBP_NL:
559
        lbpCtx->lbcCur = LBP_BK;        /* Mandatory break after */
560
        return LINEBREAK_NOBREAK;       /* Rule LB6 */
561
    case LBP_CR:
562
        lbpCtx->lbcCur = LBP_CR;
563
        return LINEBREAK_NOBREAK;       /* Rule LB6 */
564
    default:
565
        return LINEBREAK_UNDEFINED;     /* Table lookup is needed */
566
    }
567
}
568

569
/**
570
 * Tells the line break opportunity by table lookup.
571
 *
572
 * @param[in,out] lbpCtx  pointer to the line breaking context
573
 * @pre                   \a lbpCtx->lbcCur has the current line break
574
 *                        class; \a lbpCtx->lbcLast has the line break
575
 *                        class for the last character; and \a
576
 *                        lbcCur->lbcNew has the line break class for
577
 *                        the next character
578
 * @post                  \a lbpCtx->lbcCur has the updated line break
579
 *                        class
580
 * @return                break result, one of #LINEBREAK_MUSTBREAK,
581
 *                        #LINEBREAK_ALLOWBREAK, and #LINEBREAK_NOBREAK
582
 */
583
static int get_lb_result_lookup(
584
        struct LineBreakContext *lbpCtx)
585
{
586
    int brk = LINEBREAK_UNDEFINED;
587

588
    assert(lbpCtx->lbcCur <= LBP_CB);
589
    assert(lbpCtx->lbcNew <= LBP_CB);
590
    switch (baTable[lbpCtx->lbcCur - 1][lbpCtx->lbcNew - 1])
591
    {
592
    case DIR_BRK:
593
        brk = LINEBREAK_ALLOWBREAK;
594
        break;
595
    case IND_BRK:
596
        brk = (lbpCtx->lbcLast == LBP_SP)
597
            ? LINEBREAK_ALLOWBREAK
598
            : LINEBREAK_NOBREAK;
599
        break;
600
    case CMI_BRK:
601
        brk = LINEBREAK_ALLOWBREAK;
602
        if (lbpCtx->lbcLast != LBP_SP)
603
        {
604
            brk = LINEBREAK_NOBREAK;
605
            return brk;                 /* Do not update lbcCur */
606
        }
607
        break;
608
    case CMP_BRK:
609
        brk = LINEBREAK_NOBREAK;
610
        if (lbpCtx->lbcLast != LBP_SP)
611
            return brk;                 /* Do not update lbcCur */
612
        break;
613
    case PRH_BRK:
614
        brk = LINEBREAK_NOBREAK;
615
        break;
616
    }
617

618
    /* Special processing due to rule LB8a */
619
    if (lbpCtx->fLb8aZwj)
620
    {
621
        brk = LINEBREAK_NOBREAK;
622
    }
623

624
    /* Special processing due to rule LB21a */
625
    if (lbpCtx->fLb21aHebrew &&
626
        (lbpCtx->lbcCur == LBP_HY || lbpCtx->lbcCur == LBP_BA))
627
    {
628
        brk = LINEBREAK_NOBREAK;
629
        lbpCtx->fLb21aHebrew = false;
630
    }
631
    else
632
    {
633
        lbpCtx->fLb21aHebrew = (lbpCtx->lbcCur == LBP_HL);
634
    }
635

636
    /* Special processing due to rule LB30a */
637
    if (lbpCtx->lbcCur == LBP_RI)
638
    {
639
        lbpCtx->cLb30aRI++;
640
        if (lbpCtx->cLb30aRI == 2 && lbpCtx->lbcNew == LBP_RI)
641
        {
642
            brk = LINEBREAK_ALLOWBREAK;
643
            lbpCtx->cLb30aRI = 0;
644
        }
645
    }
646
    else
647
    {
648
        lbpCtx->cLb30aRI = 0;
649
    }
650

651
    lbpCtx->lbcCur = lbpCtx->lbcNew;
652
    return brk;
653
}
654

655
/**
656
 * Initializes line breaking context for a given language.
657
 *
658
 * @param[in,out] lbpCtx  pointer to the line breaking context
659
 * @param[in]     ch      the first character to process
660
 * @param[in]     lang    language of the input
661
 * @post                  the line breaking context is initialized
662
 */
663
void lb_init_break_context(
664
        struct LineBreakContext *lbpCtx,
665
        utf32_t ch,
666
        const char *lang)
667
{
668
    lbpCtx->lang = lang;
669
    lbpCtx->lbpLang = get_lb_prop_lang(lang);
670
    lbpCtx->lbcLast = LBP_Undefined;
671
    lbpCtx->lbcNew = LBP_Undefined;
672
    lbpCtx->lbcCur = resolve_lb_class(
673
                        get_char_lb_class_lang(ch, lbpCtx->lbpLang),
674
                        lbpCtx->lang);
675
    lbpCtx->fLb8aZwj =
676
        (get_char_lb_class_lang(ch, lbpCtx->lbpLang) == LBP_ZWJ);
677
    lbpCtx->fLb10LeadSpace =
678
        (get_char_lb_class_lang(ch, lbpCtx->lbpLang) == LBP_SP);
679
    lbpCtx->fLb21aHebrew = false;
680
    lbpCtx->cLb30aRI = 0;
681
    treat_first_char(lbpCtx);
682
}
683

684
/**
685
 * Updates LineBreakingContext for the next codepoint and returns
686
 * the detected break.
687
 *
688
 * @param[in,out] lbpCtx  pointer to the line breaking context
689
 * @param[in]     ch      Unicode codepoint
690
 * @return                break result, one of #LINEBREAK_MUSTBREAK,
691
 *                        #LINEBREAK_ALLOWBREAK, and #LINEBREAK_NOBREAK
692
 * @post                  the line breaking context is updated
693
 */
694
int lb_process_next_char(
695
        struct LineBreakContext *lbpCtx,
696
        utf32_t ch )
697
{
698
    int brk;
699

700
    lbpCtx->lbcLast = lbpCtx->lbcNew;
701
    lbpCtx->lbcNew = get_char_lb_class_lang(ch, lbpCtx->lbpLang);
702
    brk = get_lb_result_simple(lbpCtx);
703
    switch (brk)
704
    {
705
    case LINEBREAK_MUSTBREAK:
706
        lbpCtx->lbcCur = resolve_lb_class(lbpCtx->lbcNew, lbpCtx->lang);
707
        treat_first_char(lbpCtx);
708
        break;
709
    case LINEBREAK_UNDEFINED:
710
        lbpCtx->lbcNew = resolve_lb_class(lbpCtx->lbcNew, lbpCtx->lang);
711
        brk = get_lb_result_lookup(lbpCtx);
712
        break;
713
    default:
714
        break;
715
    }
716

717
    /* Special processing due to rule LB8a */
718
    if (lbpCtx->lbcNew == LBP_ZWJ)
719
    {
720
        lbpCtx->fLb8aZwj = true;
721
    }
722
    else
723
    {
724
        lbpCtx->fLb8aZwj = false;
725
    }
726

727
    /* Special processing due to rule LB10 */
728
    if (lbpCtx->fLb10LeadSpace)
729
    {
730
        if (lbpCtx->lbcNew == LBP_CM || lbpCtx->lbcNew == LBP_ZWJ)
731
            brk = LINEBREAK_ALLOWBREAK;
732
        lbpCtx->fLb10LeadSpace = false;
733
    }
734

735
    return brk;
736
}
737

738
/**
739
 * Sets the line breaking information for a generic input string.
740
 *
741
 * Currently, this implementation has customization for the following
742
 * ISO 639-1 language codes (for \a lang):
743
 *
744
 *  - de (German)
745
 *  - en (English)
746
 *  - es (Spanish)
747
 *  - fr (French)
748
 *  - ja (Japanese)
749
 *  - ko (Korean)
750
 *  - ru (Russian)
751
 *  - zh (Chinese)
752
 *
753
 * In addition, a suffix <code>"-strict"</code> may be added to indicate
754
 * strict (as versus normal) line-breaking behaviour.  See the <a
755
 * href="http://www.unicode.org/reports/tr14/#CJ">Conditional Japanese
756
 * Starter section of UAX #14</a> for more details.
757
 *
758
 * @param[in]  s             input string
759
 * @param[in]  len           length of the input
760
 * @param[in]  lang          language of the input
761
 * @param[out] brks          pointer to the output breaking data,
762
 *                           containing #LINEBREAK_MUSTBREAK,
763
 *                           #LINEBREAK_ALLOWBREAK, #LINEBREAK_NOBREAK,
764
 *                           or #LINEBREAK_INSIDEACHAR
765
 * @param[in] get_next_char  function to get the next UTF-32 character
766
 */
767
void set_linebreaks(
768
        const void *s,
769
        size_t len,
770
        const char *lang,
771
        char *brks,
772
        get_next_char_t get_next_char)
773
{
774
    utf32_t ch;
775
    struct LineBreakContext lbCtx;
776
    size_t posCur = 0;
777
    size_t posLast = 0;
778

779
    --posLast;  /* To be ++'d later */
780
    ch = get_next_char(s, len, &posCur);
781
    if (ch == EOS)
782
        return;
783
    lb_init_break_context(&lbCtx, ch, lang);
784

785
    /* Process a line till an explicit break or end of string */
786
    for (;;)
787
    {
788
        for (++posLast; posLast < posCur - 1; ++posLast)
789
        {
790
            brks[posLast] = LINEBREAK_INSIDEACHAR;
791
        }
792
        assert(posLast == posCur - 1);
793
        ch = get_next_char(s, len, &posCur);
794
        if (ch == EOS)
795
            break;
796
        brks[posLast] = lb_process_next_char(&lbCtx, ch);
797
    }
798

799
    assert(posLast == posCur - 1 && posCur <= len);
800
    /* Break after the last character */
801
    brks[posLast] = LINEBREAK_MUSTBREAK;
802
    /* When the input contains incomplete sequences */
803
    while (posCur < len)
804
    {
805
        brks[posCur++] = LINEBREAK_INSIDEACHAR;
806
    }
807
}
808

809
/**
810
 * Sets the line breaking information for a UTF-8 input string.
811
 *
812
 * @param[in]  s     input UTF-8 string
813
 * @param[in]  len   length of the input
814
 * @param[in]  lang  language of the input
815
 * @param[out] brks  pointer to the output breaking data, containing
816
 *                   #LINEBREAK_MUSTBREAK, #LINEBREAK_ALLOWBREAK,
817
 *                   #LINEBREAK_NOBREAK, or #LINEBREAK_INSIDEACHAR
818
 * @see #set_linebreaks for a note about \a lang.
819
 */
820
void set_linebreaks_utf8(
821
        const utf8_t *s,
822
        size_t len,
823
        const char *lang,
824
        char *brks)
825
{
826
    set_linebreaks(s, len, lang, brks,
827
                   (get_next_char_t)ub_get_next_char_utf8);
828
}
829

830
/**
831
 * Sets the line breaking information for a UTF-16 input string.
832
 *
833
 * @param[in]  s     input UTF-16 string
834
 * @param[in]  len   length of the input
835
 * @param[in]  lang  language of the input
836
 * @param[out] brks  pointer to the output breaking data, containing
837
 *                   #LINEBREAK_MUSTBREAK, #LINEBREAK_ALLOWBREAK,
838
 *                   #LINEBREAK_NOBREAK, or #LINEBREAK_INSIDEACHAR
839
 * @see #set_linebreaks for a note about \a lang.
840
 */
841
void set_linebreaks_utf16(
842
        const utf16_t *s,
843
        size_t len,
844
        const char *lang,
845
        char *brks)
846
{
847
    set_linebreaks(s, len, lang, brks,
848
                   (get_next_char_t)ub_get_next_char_utf16);
849
}
850

851
/**
852
 * Sets the line breaking information for a UTF-32 input string.
853
 *
854
 * @param[in]  s     input UTF-32 string
855
 * @param[in]  len   length of the input
856
 * @param[in]  lang  language of the input
857
 * @param[out] brks  pointer to the output breaking data, containing
858
 *                   #LINEBREAK_MUSTBREAK, #LINEBREAK_ALLOWBREAK,
859
 *                   #LINEBREAK_NOBREAK, or #LINEBREAK_INSIDEACHAR
860
 * @see #set_linebreaks for a note about \a lang.
861
 */
862
void set_linebreaks_utf32(
863
        const utf32_t *s,
864
        size_t len,
865
        const char *lang,
866
        char *brks)
867
{
868
    set_linebreaks(s, len, lang, brks,
869
                   (get_next_char_t)ub_get_next_char_utf32);
870
}
871

872
/**
873
 * Tells whether a line break can occur between two Unicode characters.
874
 * This is a wrapper function to expose a simple interface.  Generally
875
 * speaking, it is better to use #set_linebreaks_utf32 instead, since
876
 * complicated cases involving combining marks, spaces, etc. cannot be
877
 * correctly processed.
878
 *
879
 * @param char1  the first Unicode character
880
 * @param char2  the second Unicode character
881
 * @param lang   language of the input
882
 * @return       one of #LINEBREAK_MUSTBREAK, #LINEBREAK_ALLOWBREAK,
883
 *               #LINEBREAK_NOBREAK, or #LINEBREAK_INSIDEACHAR
884
 */
885
int is_line_breakable(
886
        utf32_t char1,
887
        utf32_t char2,
888
        const char *lang)
889
{
890
    utf32_t s[2];
891
    char brks[2];
892
    s[0] = char1;
893
    s[1] = char2;
894
    set_linebreaks_utf32(s, 2, lang, brks);
895
    return brks[0];
896
}
897

Использование cookies

Мы используем файлы cookie в соответствии с Политикой конфиденциальности и Политикой использования cookies.

Нажимая кнопку «Принимаю», Вы даете АО «СберТех» согласие на обработку Ваших персональных данных в целях совершенствования нашего веб-сайта и Сервиса GitVerse, а также повышения удобства их использования.

Запретить использование cookies Вы можете самостоятельно в настройках Вашего браузера.