1 /* mtext-lbrk.c -- line break
2 Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010
3 National Institute of Advanced Industrial Science and Technology (AIST)
4 Registration Number H15PRO112
6 This file is part of the m17n library.
8 The m17n library is free software; you can redistribute it and/or
9 modify it under the terms of the GNU Lesser General Public License
10 as published by the Free Software Foundation; either version 2.1 of
11 the License, or (at your option) any later version.
13 The m17n library is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public
19 License along with the m17n library; if not, write to the Free
20 Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
23 #if !defined (FOR_DOXYGEN) || defined (DOXYGEN_INTERNAL_MODULE)
24 /*** @addtogroup m17nInternal
33 #include "m17n-misc.h"
41 LBC_QU, /* quotation */
43 LBC_NS, /* no-start */
44 LBC_EX, /* exclamation/interrogation */
45 LBC_SY, /* syntax (slash) */
46 LBC_IS, /* infix (numeric) separator */
50 LBC_AL, /* alphabetic */
51 LBC_ID, /* ideograph (atomic) */
52 LBC_IN, /* inseparable */
54 LBC_BA, /* break after */
55 LBC_BB, /* break before */
56 LBC_B2, /* break both */
57 LBC_ZW, /* zero-width (ZW) space */
58 LBC_CM, /* combining mark */
59 LBC_WJ, /* word joiner */
61 /* Used for the 4.1 pair table. */
62 LBC_H2, /* Hangul 2 Jamo Syllable */
63 LBC_H3, /* Hangul 3 Jamo Syllable */
64 LBC_JL, /* Jamo leading consonant */
65 LBC_JV, /* Jamo vowel */
66 LBC_JT, /* Jamo trailing consonant */
68 /* These are not handled in the pair tables. */
69 LBC_SA, /* south (east) asian */
71 LBC_PS, /* paragraph and line separators */
72 LBC_BK, /* hard break (newline) */
73 LBC_CR, /* carriage return */
74 LBC_LF, /* line feed */
75 LBC_NL, /* next line */
76 LBC_CB, /* contingent break opportunity */
77 LBC_SG, /* surrogate */
78 LBC_AI, /* ambiguous */
87 LBA_COMBINING_INDIRECT = '#', /* stored as '#' in the rows of lba_pair_table */
88 LBA_COMBINING_PROHIBITED = '@', /* stored as '@' in the rows of lba_pair_table */
93 /* The pair table of line break actions, indexed as lba_pair_table[class before the break][class after the break]; each character of a row is one action code. */
94 static char *lba_pair_table[] =
95 /* Columns, left to right (same order as the rows below):
96 OP CL QU GL NS EX SY IS PR PO NU AL ID
97 IN HY BA BB B2 ZW CM WJ H2 H3 JL JV JT */
98 { "^^^^^^^^^^^^^^^^^^^@^^^^^^", /* OP */
99 "_^%%^^^^_%____%%__^#^_____", /* CL */
100 "^^%%%^^^%%%%%%%%%%^#^%%%%%", /* QU */
101 "%^%%%^^^%%%%%%%%%%^#^%%%%%", /* GL */
102 "_^%%%^^^______%%__^#^_____", /* NS */
103 "_^%%%^^^______%%__^#^_____", /* EX */
104 "_^%%%^^^__%___%%__^#^_____", /* SY */
105 "_^%%%^^^__%%__%%__^#^_____", /* IS */
106 "%^%%%^^^__%%%_%%__^#^%%%%%", /* PR */
107 "_^%%%^^^______%%__^#^_____", /* PO */
108 "_^%%%^^^_%%%_%%%__^#^_____", /* NU */
109 "_^%%%^^^__%%_%%%__^#^_____", /* AL */
110 "_^%%%^^^_%___%%%__^#^_____", /* ID */
111 "_^%%%^^^_____%%%__^#^_____", /* IN */
112 "_^%%%^^^__%___%%__^#^_____", /* HY */
113 "_^%%%^^^______%%__^#^_____", /* BA */
114 "%^%%%^^^%%%%%%%%%%^#^%%%%%", /* BB */
115 "_^%%%^^^______%%_^^#^_____", /* B2 */
116 "__________________^_______", /* ZW */
117 "_^%%%^^^__%%_%%%__^#^_____", /* CM */
118 "%^%%%^^^%%%%%%%%%%^#^%%%%%", /* WJ */
119 "_^%%%^^^_%___%%%__^#^___%%", /* H2 */
120 "_^%%%^^^_%___%%%__^#^____%", /* H3 */
121 "_^%%%^^^_%___%%%__^#^%%%%_", /* JL */
122 "_^%%%^^^_%___%%%__^#^___%%", /* JV */
123 "_^%%%^^^_%___%%%__^#^____%" /* JT */
126 static MCharTable *lbc_table;
128 /* Set LBC to enum LineBreakClass of the character at POS of MT
129 (length is LEN) while converting LBC_XX to LBC_AL, LBC_AI to LBC_ID (when OPTION has MTEXT_LBO_AI_AS_ID) or LBC_AL (otherwise),
130 LBC_CB to LBC_B2, LBC_CR, LBC_LF, and LBC_NL to LBC_BK. The Hangul classes LBC_H2 through LBC_JT are also special-cased unless OPTION has MTEXT_LBO_KOREAN_SP. If POS is
131 out of range, set LBC to LBC_BK. */
133 #define GET_LBC(LBC, MT, LEN, POS, OPTION) \
135 if ((POS) < 0 || (POS) >= (LEN)) \
139 int c = mtext_ref_char ((MT), (POS)); \
140 (LBC) = (enum LineBreakClass) mchartable_lookup (lbc_table, c); \
141 if ((LBC) == LBC_NL) \
143 else if ((LBC) == LBC_AI) \
144 (LBC) = ((OPTION) & MTEXT_LBO_AI_AS_ID) ? LBC_ID : LBC_AL; \
145 else if (! ((OPTION) & MTEXT_LBO_KOREAN_SP) \
146 && (LBC) >= LBC_H2 && (LBC) <= LBC_JT) \
148 else if ((LBC) == LBC_CB) \
150 else if ((LBC) == LBC_XX) \
157 #endif /* !FOR_DOXYGEN || DOXYGEN_INTERNAL_MODULE */
162 /*** @addtogroup m17nMtext */
167 @brief Find a linebreak position of an M-text.
169 The mtext_line_break () function checks if position $POS is a
170 proper linebreak position of an M-text $MT according to the
171 algorithm of The Unicode Standard 4.0 UAX#14. If so, it returns
172 $POS. Otherwise, it returns a proper linebreak position before
175 If $OPTION is nonzero, it controls the algorithm by logical-or of
176 the members of #MTextLineBreakOption.
178 If $AFTER is not NULL, a proper linebreak position after $POS is
182 mtext_line_break (MText *mt, int pos, int option, int *after)
184 int break_before, break_after; /* break positions found at-or-before POS and after POS */
185 int len = mtext_len (mt);
186 enum LineBreakClass lbc; /* class of the character at POS */
187 enum LineBreakClass Blbc, Albc; /* B(efore) and A(fter) lbcs. */
188 int Bpos, Apos; /* B(efore) and A(fter) positions. */
189 enum LineBreakAction action; /* entry read from lba_pair_table */
193 /* The end of text is an explicit break position. */
201 MSymbol key = mchar_define_property ("linebreak", Minteger);
203 lbc_table = mchar_get_prop_table (key, NULL); /* table of the "linebreak" property, consulted by GET_LBC */
206 GET_LBC (lbc, mt, len, pos, option);
211 if (option & MTEXT_LBO_SP_CM)
213 GET_LBC (Albc, mt, len, Apos + 1, option);
214 Albc = (Albc == LBC_CM) ? LBC_ID : LBC_SP; /* a combining mark follows: treat as LBC_ID, otherwise keep LBC_SP */
216 while (Albc == LBC_SP)
219 GET_LBC (Albc, mt, len, Apos, option);
222 if ((option & MTEXT_LBO_SP_CM) && (Albc == LBC_CM))
225 GET_LBC (Albc, mt, len, Apos, option);
229 Apos++, Albc = LBC_CM;
234 else if (Albc == LBC_LF)
236 GET_LBC (Albc, mt, len, Apos - 1, option);
241 else if (Albc == LBC_SA)
242 Albc = mtext__word_segment (mt, Apos, &Apos, NULL) > 0 ? LBC_BB : LBC_AL; /* south (east) asian text: class is chosen from the word-segmentation result */
244 /* After exiting from the following loop, if Apos is positive, it is
245 the previous (including POS) break position. */
251 /* Now Bpos == Apos. */
254 GET_LBC (Blbc, mt, len, Bpos, option);
255 } while (Blbc == LBC_SP);
257 if (Blbc == LBC_BK || Blbc == LBC_LF || Blbc == LBC_CR)
259 /* Explicit break. */
263 indirect = Bpos + 1 < Apos; /* nonzero when at least one position lies strictly between Bpos and Apos */
269 GET_LBC (Blbc, mt, len, Bpos, option);
270 } while (Blbc == LBC_CM);
271 if ((option & MTEXT_LBO_SP_CM) && (Blbc == LBC_SP))
273 else if (Blbc == LBC_SP || Blbc == LBC_ZW
274 || Blbc == LBC_BK || Blbc == LBC_LF || Blbc == LBC_CR)
282 mtext__word_segment (mt, Bpos, &next, NULL);
288 action = lba_pair_table[Blbc][Albc]; /* row: class before, column: class after */
289 if (action == LBA_DIRECT)
291 else if (action == LBA_INDIRECT)
296 else if (action == LBA_COMBINING_INDIRECT)
303 Apos = next, Albc = LBC_BB;
305 Apos = Bpos, Albc = Blbc;
308 if (break_before > 0)
312 if (break_before == pos)
315 *after = break_before;
320 /* Now find a break position after POS. */
328 GET_LBC (Blbc, mt, len, Bpos, option);
329 } while (Blbc == LBC_CM);
330 if (Blbc == LBC_SP || Blbc == LBC_ZW
331 || Blbc == LBC_BK || Blbc == LBC_LF || Blbc == LBC_CR)
333 if ((Blbc == LBC_SP) && (option & MTEXT_LBO_SP_CM))
342 mtext__word_segment (mt, Bpos, NULL, &Bpos);
345 else if (Blbc == LBC_SP)
347 if (option & MTEXT_LBO_SP_CM)
349 GET_LBC (Blbc, mt, len, Bpos + 1, option);
351 Blbc = LBC_ID, Bpos++;
355 while (Blbc == LBC_SP)
358 GET_LBC (Blbc, mt, len, Bpos, option);
364 /* After exiting from the following loop, if Apos is positive, it is
365 the next break position. */
371 /* Now Bpos == Apos. */
372 if (Blbc == LBC_LF || Blbc == LBC_BK || Blbc == LBC_CR)
377 GET_LBC (Blbc, mt, len, Bpos + 1, option);
386 GET_LBC (Albc, mt, len, Apos, option);
387 } while (Albc == LBC_SP);
393 /* Explicit break at the end of text. */
396 indirect = Bpos + 1 < Apos; /* nonzero when at least one position lies strictly between Bpos and Apos */
399 Albc = mtext__word_segment (mt, Apos, NULL, &next) ? LBC_BB : LBC_AL;
401 action = lba_pair_table[Blbc][Albc]; /* row: class before, column: class after */
402 if (action == LBA_DIRECT)
403 /* Direct break at Apos. */
405 else if (action == LBA_INDIRECT)
410 else if (action == LBA_COMBINING_INDIRECT)
414 if (option & MTEXT_LBO_SP_CM)
420 Bpos = next, Blbc = LBC_AL;
430 *after = break_after;
432 return (break_before > 0 ? break_before : break_after); /* prefer the break at or before POS; otherwise fall back to the one after it */