/* vim: set expandtab tabstop=4 softtabstop=4 shiftwidth=4: */

/*
 * Line breaking in a Unicode sequence.  Designed to be used in a
 * generic text renderer.
 *
 * Copyright (C) 2008-2019 Wu Yongwei <wuyongwei at gmail dot com>
 * Copyright (C) 2013 Petr Filipsky <philodej at gmail dot com>
 *
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the author be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute
 * it freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must
 *    not claim that you wrote the original software.  If you use this
 *    software in a product, an acknowledgement in the product
 *    documentation would be appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must
 *    not be misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source
 *    distribution.
 *
 * The main reference is Unicode Standard Annex 14 (UAX #14):
 *      <URL:http://www.unicode.org/reports/tr14/>
 *
 * When this library was designed, this annex was at Revision 19, for
 * Unicode 5.0.0:
 *      <URL:http://www.unicode.org/reports/tr14/tr14-19.html>
 *
 * This library has been updated according to Revision 43, for
 * Unicode 12.0.0:
 *      <URL:http://www.unicode.org/reports/tr14/tr14-43.html>
 *
 * The Unicode Terms of Use are available at
 *      <URL:http://www.unicode.org/copyright.html>
 */

/**
 * @file    linebreak.c
 *
 * Implementation of the line breaking algorithm as described in Unicode
 * Standard Annex 14.
 *
 * @author  Wu Yongwei
 * @author  Petr Filipsky
 */

#include <assert.h>
#include <stddef.h>
#include <string.h>
#include "linebreak.h"
#include "linebreakdef.h"

/**
 * Special value used internally to indicate an undefined break result,
 * i.e. that the simple rules could not decide and a pair-table lookup
 * is needed.  Parenthesized so that the negative literal stays a single
 * operand wherever the macro is expanded.
 */
#define LINEBREAK_UNDEFINED (-1)

/**
 * Size of the second-level index to the line breaking properties
 * (number of entries in #lb_prop_index).
 */
#define LINEBREAK_INDEX_SIZE 40

/**
 * Break actions stored in the pair table #baTable.  Each value tells
 * the table-lookup code how to treat the boundary between the current
 * and the next line breaking class.
 */
enum BreakAction
{
    DIR_BRK,    /**< Direct break opportunity */
    IND_BRK,    /**< Indirect break opportunity */
    CMI_BRK,    /**< Indirect break opportunity for combining marks */
    CMP_BRK,    /**< Prohibited break for combining marks */
    PRH_BRK     /**< Prohibited break */
};

81/**
82* Break action pair table. This is a direct mapping of Table 2 of
83* Unicode Standard Annex 14, Revision 37, except for ZWJ (manually
84* adjusted after special processing as per LB8a of Revision 41) and CB
85* (manually added as per LB20).
86*/
87static enum BreakAction baTable[LBP_CB][LBP_CB] = {88{ /* OP */89PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK,90PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK,91PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK,92CMP_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK,93PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK },94{ /* CL */95DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, PRH_BRK, PRH_BRK,96PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,97DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,98CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,99DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK },100{ /* CP */101DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, PRH_BRK, PRH_BRK,102PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,103DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,104CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,105DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK },106{ /* QU */107PRH_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,108PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,109IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,110CMI_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,111IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK },112{ /* GL */113IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,114PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,115IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,116CMI_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,117IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK },118{ /* NS */119DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,120PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,121DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,122CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,123DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK },124{ /* EX */125DIR_BRK, 
PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,126PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,127DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,128CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,129DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK },130{ /* SY */131DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,132PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK, IND_BRK,133DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,134CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,135DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK },136{ /* IS */137DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,138PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, IND_BRK,139DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,140CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,141DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK },142{ /* PR */143IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,144PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, IND_BRK,145IND_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,146CMI_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,147DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK },148{ /* PO */149IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,150PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, IND_BRK,151DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,152CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,153DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK },154{ /* NU */155IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,156PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,157DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,158CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,159DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK },160{ /* AL */161IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,162PRH_BRK, 
PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,163DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,164CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,165DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK },166{ /* HL */167IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,168PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,169DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,170CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,171DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK },172{ /* ID */173DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,174PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,175DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,176CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,177DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK },178{ /* IN */179DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,180PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,181DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,182CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,183DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK },184{ /* HY */185DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, DIR_BRK, IND_BRK, PRH_BRK,186PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK,187DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,188CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,189DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK },190{ /* BA */191DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, DIR_BRK, IND_BRK, PRH_BRK,192PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,193DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,194CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,195DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK },196{ /* BB */197IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,198PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,199IND_BRK, 
IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,200CMI_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,201IND_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK },202{ /* B2 */203DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,204PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,205DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, PRH_BRK, PRH_BRK,206CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,207DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK },208{ /* ZW */209DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,210DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,211DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, PRH_BRK,212DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,213DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },214{ /* CM */215IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,216PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,217DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,218CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,219DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK },220{ /* WJ */221IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,222PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,223IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,224CMI_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,225IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK },226{ /* H2 */227DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,228PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,229DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,230CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK,231DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK },232{ /* H3 */233DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,234PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,235DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,236CMI_BRK, 
PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK,237DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK },238{ /* JL */239DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,240PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,241DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,242CMI_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK,243DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK },244{ /* JV */245DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,246PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,247DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,248CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK,249DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK },250{ /* JT */251DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,252PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,253DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,254CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK,255DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK },256{ /* RI */257DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,258PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,259DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,260CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,261IND_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK },262{ /* EB */263DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,264PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,265DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,266CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,267DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK },268{ /* EM */269DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,270PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,271DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,272CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,273DIR_BRK, 
DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK },274{ /* ZWJ */275IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,276PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,277DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,278CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,279DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK },280{ /* CB */281DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, DIR_BRK, PRH_BRK,282PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,283DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, PRH_BRK,284CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,285DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK },286};287
288/**
289* Struct for the second-level index to the line breaking properties.
290*/
291struct LineBreakPropertiesIndex292{
293utf32_t end; /**< End codepoint */294const struct LineBreakProperties *lbp; /**< Pointer to line breaking295properties */
296};297
298/**
299* Second-level index to the line breaking properties.
300*/
301static struct LineBreakPropertiesIndex lb_prop_index[LINEBREAK_INDEX_SIZE] =302{
303{ 0xFFFFFFFF, lb_prop_default }304};305
/**
 * Checks whether \a str ends with \a suffix, which has length
 * \a suffixLen.
 *
 * @param str        string whose ending is to be checked; may be
 *                   \c NULL, in which case the result is zero
 * @param suffix     string to look for at the end of \a str
 * @param suffixLen  length of \a suffix, not counting the terminator
 * @return           non-zero if \a str ends with \a suffix; zero
 *                   otherwise
 */
static __inline int ends_with(const char *str, const char *suffix,
                              unsigned suffixLen)
{
    size_t len;     /* size_t: strlen() must not be truncated */

    if (str == NULL)
    {
        return 0;
    }
    len = strlen(str);
    if (len < suffixLen)
    {
        return 0;
    }
    return memcmp(str + len - suffixLen, suffix, suffixLen) == 0;
}

/**
 * Convenience wrapper around #ends_with for a string-literal suffix:
 * the length is taken from \c sizeof, minus the terminating NUL.
 */
#define ENDS_WITH(str, suffix) ends_with((str), (suffix), sizeof(suffix) - 1)

337/**
338* Initializes the second-level index to the line breaking properties.
339* If it is not called, the performance of #get_char_lb_class_lang (and
340* thus the main functionality) can be pretty bad, especially for big
341* codepoints like those of Chinese.
342*/
343void init_linebreak(void)344{
345size_t i;346size_t iPropDefault;347size_t len;348size_t step;349
350len = 0;351while (lb_prop_default[len].prop != LBP_Undefined)352++len;353step = len / LINEBREAK_INDEX_SIZE;354iPropDefault = 0;355for (i = 0; i < LINEBREAK_INDEX_SIZE; ++i)356{357lb_prop_index[i].lbp = lb_prop_default + iPropDefault;358iPropDefault += step;359lb_prop_index[i].end = lb_prop_default[iPropDefault].start - 1;360}361lb_prop_index[--i].end = 0xFFFFFFFF;362}
363
364/**
365* Gets the language-specific line breaking properties.
366*
367* @param lang language of the text
368* @return pointer to the language-specific line breaking
369* properties array if found; \c NULL otherwise
370*/
371static const struct LineBreakProperties *get_lb_prop_lang(const char *lang)372{
373const struct LineBreakPropertiesLang *lbplIter;374if (lang != NULL)375{376for (lbplIter = lb_prop_lang_map; lbplIter->lang != NULL; ++lbplIter)377{378if (strncmp(lang, lbplIter->lang, lbplIter->namelen) == 0)379{380return lbplIter->lbp;381}382}383}384return NULL;385}
386
387/**
388* Gets the line breaking class of a character from a line breaking
389* properties array.
390*
391* @param ch character to check
392* @param lbp pointer to the line breaking properties array
393* @return the line breaking class if found; \c LBP_XX otherwise
394*/
395static enum LineBreakClass get_char_lb_class(396utf32_t ch,397const struct LineBreakProperties *lbp)398{
399while (lbp->prop != LBP_Undefined && ch >= lbp->start)400{401if (ch <= lbp->end)402return lbp->prop;403++lbp;404}405return LBP_XX;406}
407
408/**
409* Gets the line breaking class of a character from the default line
410* breaking properties array.
411*
412* @param ch character to check
413* @return the line breaking class if found; \c LBP_XX otherwise
414*/
415static enum LineBreakClass get_char_lb_class_default(416utf32_t ch)417{
418size_t i = 0;419while (ch > lb_prop_index[i].end)420++i;421assert(i < LINEBREAK_INDEX_SIZE);422return get_char_lb_class(ch, lb_prop_index[i].lbp);423}
424
425/**
426* Gets the line breaking class of a character for a specific
427* language. This function will check the language-specific data first,
428* and then the default data if there is no language-specific property
429* available for the character.
430*
431* @param ch character to check
432* @param lbpLang pointer to the language-specific line breaking
433* properties array
434* @return the line breaking class if found; \c LBP_XX
435* otherwise
436*/
437static enum LineBreakClass get_char_lb_class_lang(438utf32_t ch,439const struct LineBreakProperties *lbpLang)440{
441enum LineBreakClass lbcResult;442
443/* Find the language-specific line breaking class for a character */444if (lbpLang)445{446lbcResult = get_char_lb_class(ch, lbpLang);447if (lbcResult != LBP_XX)448return lbcResult;449}450
451/* Find the generic language-specific line breaking class, if no452* language context is provided, or language-specific data are not
453* available for the specific character in the specified language */
454return get_char_lb_class_default(ch);455}
456
457/**
458* Resolves the line breaking class for certain ambiguous or complicated
459* characters. They are treated in a simplistic way in this
460* implementation.
461*
462* @param lbc line breaking class to resolve
463* @param lang language of the text
464* @return the resolved line breaking class
465*/
466static enum LineBreakClass resolve_lb_class(467enum LineBreakClass lbc,468const char *lang)469{
470switch (lbc)471{472case LBP_AI:473if (lang != NULL &&474(strncmp(lang, "zh", 2) == 0 || /* Chinese */475strncmp(lang, "ja", 2) == 0 || /* Japanese */476strncmp(lang, "ko", 2) == 0)) /* Korean */477{478return LBP_ID;479}480else481{482return LBP_AL;483}484case LBP_CJ:485/* `Strict' and `normal' line breaking. See486* <url:http://www.unicode.org/reports/tr14/#CJ>
487* for details. */
488if (ENDS_WITH(lang, "-strict"))489{490return LBP_NS;491}492else493{494return LBP_ID;495}496case LBP_SA:497case LBP_SG:498case LBP_XX:499return LBP_AL;500default:501return lbc;502}503}
504
505/**
506* Treats specially for the first character in a line.
507*
508* @param[in,out] lbpCtx pointer to the line breaking context
509* @pre \a lbpCtx->lbcCur has a valid line break class
510* @post \a lbpCtx->lbcCur has the updated line break class
511*/
512static void treat_first_char(513struct LineBreakContext *lbpCtx)514{
515switch (lbpCtx->lbcCur)516{517case LBP_LF:518case LBP_NL:519lbpCtx->lbcCur = LBP_BK; /* Rule LB5 */520break;521case LBP_SP:522lbpCtx->lbcCur = LBP_WJ; /* Leading space treated as WJ */523break;524default:525break;526}527}
528
529/**
530* Tries telling the line break opportunity by simple rules.
531*
532* @param[in,out] lbpCtx pointer to the line breaking context
533* @pre \a lbpCtx->lbcCur has the current line break
534* class; and \a lbpCtx->lbcNew has the line
535* break class for the next character
536* @post \a lbpCtx->lbcCur has the updated line break
537* class
538* @return break result, one of #LINEBREAK_MUSTBREAK,
539* #LINEBREAK_ALLOWBREAK, and #LINEBREAK_NOBREAK
540* if identified; or #LINEBREAK_UNDEFINED if
541* table lookup is needed
542*/
543static int get_lb_result_simple(544struct LineBreakContext *lbpCtx)545{
546if (lbpCtx->lbcCur == LBP_BK547|| (lbpCtx->lbcCur == LBP_CR && lbpCtx->lbcNew != LBP_LF))548{549return LINEBREAK_MUSTBREAK; /* Rules LB4 and LB5 */550}551
552switch (lbpCtx->lbcNew)553{554case LBP_SP:555return LINEBREAK_NOBREAK; /* Rule LB7; no change to lbcCur */556case LBP_BK:557case LBP_LF:558case LBP_NL:559lbpCtx->lbcCur = LBP_BK; /* Mandatory break after */560return LINEBREAK_NOBREAK; /* Rule LB6 */561case LBP_CR:562lbpCtx->lbcCur = LBP_CR;563return LINEBREAK_NOBREAK; /* Rule LB6 */564default:565return LINEBREAK_UNDEFINED; /* Table lookup is needed */566}567}
568
569/**
570* Tells the line break opportunity by table lookup.
571*
572* @param[in,out] lbpCtx pointer to the line breaking context
573* @pre \a lbpCtx->lbcCur has the current line break
574* class; \a lbpCtx->lbcLast has the line break
575* class for the last character; and \a
576* lbcCur->lbcNew has the line break class for
577* the next character
578* @post \a lbpCtx->lbcCur has the updated line break
579* class
580* @return break result, one of #LINEBREAK_MUSTBREAK,
581* #LINEBREAK_ALLOWBREAK, and #LINEBREAK_NOBREAK
582*/
583static int get_lb_result_lookup(584struct LineBreakContext *lbpCtx)585{
586int brk = LINEBREAK_UNDEFINED;587
588assert(lbpCtx->lbcCur <= LBP_CB);589assert(lbpCtx->lbcNew <= LBP_CB);590switch (baTable[lbpCtx->lbcCur - 1][lbpCtx->lbcNew - 1])591{592case DIR_BRK:593brk = LINEBREAK_ALLOWBREAK;594break;595case IND_BRK:596brk = (lbpCtx->lbcLast == LBP_SP)597? LINEBREAK_ALLOWBREAK598: LINEBREAK_NOBREAK;599break;600case CMI_BRK:601brk = LINEBREAK_ALLOWBREAK;602if (lbpCtx->lbcLast != LBP_SP)603{604brk = LINEBREAK_NOBREAK;605return brk; /* Do not update lbcCur */606}607break;608case CMP_BRK:609brk = LINEBREAK_NOBREAK;610if (lbpCtx->lbcLast != LBP_SP)611return brk; /* Do not update lbcCur */612break;613case PRH_BRK:614brk = LINEBREAK_NOBREAK;615break;616}617
618/* Special processing due to rule LB8a */619if (lbpCtx->fLb8aZwj)620{621brk = LINEBREAK_NOBREAK;622}623
624/* Special processing due to rule LB21a */625if (lbpCtx->fLb21aHebrew &&626(lbpCtx->lbcCur == LBP_HY || lbpCtx->lbcCur == LBP_BA))627{628brk = LINEBREAK_NOBREAK;629lbpCtx->fLb21aHebrew = false;630}631else632{633lbpCtx->fLb21aHebrew = (lbpCtx->lbcCur == LBP_HL);634}635
636/* Special processing due to rule LB30a */637if (lbpCtx->lbcCur == LBP_RI)638{639lbpCtx->cLb30aRI++;640if (lbpCtx->cLb30aRI == 2 && lbpCtx->lbcNew == LBP_RI)641{642brk = LINEBREAK_ALLOWBREAK;643lbpCtx->cLb30aRI = 0;644}645}646else647{648lbpCtx->cLb30aRI = 0;649}650
651lbpCtx->lbcCur = lbpCtx->lbcNew;652return brk;653}
654
655/**
656* Initializes line breaking context for a given language.
657*
658* @param[in,out] lbpCtx pointer to the line breaking context
659* @param[in] ch the first character to process
660* @param[in] lang language of the input
661* @post the line breaking context is initialized
662*/
663void lb_init_break_context(664struct LineBreakContext *lbpCtx,665utf32_t ch,666const char *lang)667{
668lbpCtx->lang = lang;669lbpCtx->lbpLang = get_lb_prop_lang(lang);670lbpCtx->lbcLast = LBP_Undefined;671lbpCtx->lbcNew = LBP_Undefined;672lbpCtx->lbcCur = resolve_lb_class(673get_char_lb_class_lang(ch, lbpCtx->lbpLang),674lbpCtx->lang);675lbpCtx->fLb8aZwj =676(get_char_lb_class_lang(ch, lbpCtx->lbpLang) == LBP_ZWJ);677lbpCtx->fLb10LeadSpace =678(get_char_lb_class_lang(ch, lbpCtx->lbpLang) == LBP_SP);679lbpCtx->fLb21aHebrew = false;680lbpCtx->cLb30aRI = 0;681treat_first_char(lbpCtx);682}
683
684/**
685* Updates LineBreakingContext for the next codepoint and returns
686* the detected break.
687*
688* @param[in,out] lbpCtx pointer to the line breaking context
689* @param[in] ch Unicode codepoint
690* @return break result, one of #LINEBREAK_MUSTBREAK,
691* #LINEBREAK_ALLOWBREAK, and #LINEBREAK_NOBREAK
692* @post the line breaking context is updated
693*/
694int lb_process_next_char(695struct LineBreakContext *lbpCtx,696utf32_t ch )697{
698int brk;699
700lbpCtx->lbcLast = lbpCtx->lbcNew;701lbpCtx->lbcNew = get_char_lb_class_lang(ch, lbpCtx->lbpLang);702brk = get_lb_result_simple(lbpCtx);703switch (brk)704{705case LINEBREAK_MUSTBREAK:706lbpCtx->lbcCur = resolve_lb_class(lbpCtx->lbcNew, lbpCtx->lang);707treat_first_char(lbpCtx);708break;709case LINEBREAK_UNDEFINED:710lbpCtx->lbcNew = resolve_lb_class(lbpCtx->lbcNew, lbpCtx->lang);711brk = get_lb_result_lookup(lbpCtx);712break;713default:714break;715}716
717/* Special processing due to rule LB8a */718if (lbpCtx->lbcNew == LBP_ZWJ)719{720lbpCtx->fLb8aZwj = true;721}722else723{724lbpCtx->fLb8aZwj = false;725}726
727/* Special processing due to rule LB10 */728if (lbpCtx->fLb10LeadSpace)729{730if (lbpCtx->lbcNew == LBP_CM || lbpCtx->lbcNew == LBP_ZWJ)731brk = LINEBREAK_ALLOWBREAK;732lbpCtx->fLb10LeadSpace = false;733}734
735return brk;736}
737
738/**
739* Sets the line breaking information for a generic input string.
740*
741* Currently, this implementation has customization for the following
742* ISO 639-1 language codes (for \a lang):
743*
744* - de (German)
745* - en (English)
746* - es (Spanish)
747* - fr (French)
748* - ja (Japanese)
749* - ko (Korean)
750* - ru (Russian)
751* - zh (Chinese)
752*
753* In addition, a suffix <code>"-strict"</code> may be added to indicate
754* strict (as versus normal) line-breaking behaviour. See the <a
755* href="http://www.unicode.org/reports/tr14/#CJ">Conditional Japanese
756* Starter section of UAX #14</a> for more details.
757*
758* @param[in] s input string
759* @param[in] len length of the input
760* @param[in] lang language of the input
761* @param[out] brks pointer to the output breaking data,
762* containing #LINEBREAK_MUSTBREAK,
763* #LINEBREAK_ALLOWBREAK, #LINEBREAK_NOBREAK,
764* or #LINEBREAK_INSIDEACHAR
765* @param[in] get_next_char function to get the next UTF-32 character
766*/
767void set_linebreaks(768const void *s,769size_t len,770const char *lang,771char *brks,772get_next_char_t get_next_char)773{
774utf32_t ch;775struct LineBreakContext lbCtx;776size_t posCur = 0;777size_t posLast = 0;778
779--posLast; /* To be ++'d later */780ch = get_next_char(s, len, &posCur);781if (ch == EOS)782return;783lb_init_break_context(&lbCtx, ch, lang);784
785/* Process a line till an explicit break or end of string */786for (;;)787{788for (++posLast; posLast < posCur - 1; ++posLast)789{790brks[posLast] = LINEBREAK_INSIDEACHAR;791}792assert(posLast == posCur - 1);793ch = get_next_char(s, len, &posCur);794if (ch == EOS)795break;796brks[posLast] = lb_process_next_char(&lbCtx, ch);797}798
799assert(posLast == posCur - 1 && posCur <= len);800/* Break after the last character */801brks[posLast] = LINEBREAK_MUSTBREAK;802/* When the input contains incomplete sequences */803while (posCur < len)804{805brks[posCur++] = LINEBREAK_INSIDEACHAR;806}807}
808
809/**
810* Sets the line breaking information for a UTF-8 input string.
811*
812* @param[in] s input UTF-8 string
813* @param[in] len length of the input
814* @param[in] lang language of the input
815* @param[out] brks pointer to the output breaking data, containing
816* #LINEBREAK_MUSTBREAK, #LINEBREAK_ALLOWBREAK,
817* #LINEBREAK_NOBREAK, or #LINEBREAK_INSIDEACHAR
818* @see #set_linebreaks for a note about \a lang.
819*/
820void set_linebreaks_utf8(821const utf8_t *s,822size_t len,823const char *lang,824char *brks)825{
826set_linebreaks(s, len, lang, brks,827(get_next_char_t)ub_get_next_char_utf8);828}
829
830/**
831* Sets the line breaking information for a UTF-16 input string.
832*
833* @param[in] s input UTF-16 string
834* @param[in] len length of the input
835* @param[in] lang language of the input
836* @param[out] brks pointer to the output breaking data, containing
837* #LINEBREAK_MUSTBREAK, #LINEBREAK_ALLOWBREAK,
838* #LINEBREAK_NOBREAK, or #LINEBREAK_INSIDEACHAR
839* @see #set_linebreaks for a note about \a lang.
840*/
841void set_linebreaks_utf16(842const utf16_t *s,843size_t len,844const char *lang,845char *brks)846{
847set_linebreaks(s, len, lang, brks,848(get_next_char_t)ub_get_next_char_utf16);849}
850
851/**
852* Sets the line breaking information for a UTF-32 input string.
853*
854* @param[in] s input UTF-32 string
855* @param[in] len length of the input
856* @param[in] lang language of the input
857* @param[out] brks pointer to the output breaking data, containing
858* #LINEBREAK_MUSTBREAK, #LINEBREAK_ALLOWBREAK,
859* #LINEBREAK_NOBREAK, or #LINEBREAK_INSIDEACHAR
860* @see #set_linebreaks for a note about \a lang.
861*/
862void set_linebreaks_utf32(863const utf32_t *s,864size_t len,865const char *lang,866char *brks)867{
868set_linebreaks(s, len, lang, brks,869(get_next_char_t)ub_get_next_char_utf32);870}
871
872/**
873* Tells whether a line break can occur between two Unicode characters.
874* This is a wrapper function to expose a simple interface. Generally
875* speaking, it is better to use #set_linebreaks_utf32 instead, since
876* complicated cases involving combining marks, spaces, etc. cannot be
877* correctly processed.
878*
879* @param char1 the first Unicode character
880* @param char2 the second Unicode character
881* @param lang language of the input
882* @return one of #LINEBREAK_MUSTBREAK, #LINEBREAK_ALLOWBREAK,
883* #LINEBREAK_NOBREAK, or #LINEBREAK_INSIDEACHAR
884*/
885int is_line_breakable(886utf32_t char1,887utf32_t char2,888const char *lang)889{
890utf32_t s[2];891char brks[2];892s[0] = char1;893s[1] = char2;894set_linebreaks_utf32(s, 2, lang, brks);895return brks[0];896}
897