1 # LINEBREAK.awk -- awk script to produce a compact linebreak property map
3 # National Institute of Advanced Industrial Science and Technology (AIST)
4 # Registration Number H15PRO112
6 # This file is part of the m17n database; a sub-part of the m17n
9 # The m17n library is free software; you can redistribute it and/or
10 # modify it under the terms of the GNU Lesser General Public License
11 # as published by the Free Software Foundation; either version 2.1 of
12 # the License, or (at your option) any later version.
14 # The m17n library is distributed in the hope that it will be useful,
15 # but WITHOUT ANY WARRANTY; without even the implied warranty of
16 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 # Lesser General Public License for more details.
19 # You should have received a copy of the GNU Lesser General Public
20 # License along with the m17n library; if not, write to the Free
21 # Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
24 function setLBC(name, code) {
26 printf "\n# %2d:%s", code, name;
28 printf " %2d:%s", code, name;
38 printf "# Code:LineBreakingClass";
39 # Assign a unique integer code to each line breaking class.
40 # The codes must be the same as "enum LineBreakingClass" of
41 # m17n-lib/src/linebreak.c
42 setLBC("OP", i++); # open
43 setLBC("CL", i++); # close
44 setLBC("QU", i++); # quotation
45 setLBC("GL", i++); # glue
46 setLBC("NS", i++); # no-start
47 setLBC("EX", i++); # exclamation/interrogation
48 setLBC("SY", i++); # Syntax (slash)
49 setLBC("IS", i++); # infix (numeric) separator
50 setLBC("PR", i++); # prefix
51 setLBC("PO", i++); # postfix
52 setLBC("NU", i++); # numeric
53 setLBC("AL", i++); # alphabetic
54 setLBC("ID", i++); # ideograph (atomic)
55 setLBC("IN", i++); # inseparable
56 setLBC("HY", i++); # hyphen
57 setLBC("BA", i++); # break after
58 setLBC("BB", i++); # break before
59 setLBC("B2", i++); # break both
60 setLBC("ZW", i++); # ZW space
61 setLBC("CM", i++); # combining mark
62 setLBC("WJ", i++); # word joiner
64 # For UAX#14 7.6 Korean Syllable Block Pair Table.
65 setLBC("H2", i++); # Hangul 2 Jamo Syllable
66 setLBC("H3", i++); # Hangul 3 Jamo Syllable
67 setLBC("JL", i++); # Jamo leading consonant
68 setLBC("JV", i++); # Jamo vowel
69 setLBC("JT", i++); # Jamo trailing consonant
71 # Not handled in the pair table.
72 setLBC("SA", i++); # south (east) asian
73 setLBC("SP", i++); # space
74 setLBC("PS", i++); # paragraph and line separators
75 setLBC("BK", i++); # hard break (newline)
76 setLBC("CR", i++); # carriage return
77 setLBC("LF", i++); # line feed
78 setLBC("NL", i++); # next line
79 setLBC("CB", i++); # contingent break opportunity
80 setLBC("SG", i++); # surrogate
81 setLBC("AI", i++); # ambiguous
82 setLBC("XX", i); # unknown
84 # The default is "XX": emit the range 0x0000-0x3FFFFF with the code of "XX".
85 printf "\n0x0000-0x3FFFFF %d\n", i;
95 printf "%s %d\n", from, prev_lbc;
97 printf "%s-%s %d\n", from, to, prev_lbc;
108 /^[0-9A-Za-z]*\.\.[0-9A-Za-z]*;/ {
113 printf "%s %d\n", from, prev_lbc;
115 printf "%s-%s %d\n", from, to, prev_lbc;
117 gsub("\\.\\.", "-0x");
118 printf "0x%s %d\n", $1, lbc;
129 printf "0x%s %d\n", from, prev_lbc;
131 printf "0x%s-0x%s %d\n", from, to, prev_lbc;