--- /dev/null
+# LINEBREAK.awk -- awk script to produce a compact linebreak property map
+# Copyright (C) 2005
+# National Institute of Advanced Industrial Science and Technology (AIST)
+# Registration Number H15PRO112
+
+# This file is part of the m17n database; a sub-part of the m17n
+# library.
+
+# The m17n library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public License
+# as published by the Free Software Foundation; either version 2.1 of
+# the License, or (at your option) any later version.
+
+# The m17n library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+
+# You should have received a copy of the GNU Lesser General Public
+# License along with the m17n library; if not, write to the Free
+# Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+# 02111-1307, USA.
+
+function setLBC(name, code) {
+ if (code % 10 == 0)
+ printf "\n# %2d:%s", code, name;
+ else
+ printf " %2d:%s", code, name;
+ to_lbc[name] = code;
+}
+
+BEGIN {
+ FS = "[; ]";
+ from = "";
+ to = "";
+ prev_lbc = -1;
+ i = 0;
+ printf "# Code:LineBreakingClass";
+ # Assign a uniq integer code to each line breaking class.
+ # The codes must be the same as "enum LineBreakingClass" of
+ # m17n-lib/src/linebreak.c
+ setLBC("OP", i++); # open
+ setLBC("CL", i++); # close
+ setLBC("QU", i++); # quotation
+ setLBC("GL", i++); # glue
+ setLBC("NS", i++); # no-start
+ setLBC("EX", i++); # exclamation/interrogation
+ setLBC("SY", i++); # Syntax (slash)
+ setLBC("IS", i++); # infix (numeric) separator
+ setLBC("PR", i++); # prefix
+ setLBC("PO", i++); # postfix
+ setLBC("NU", i++); # numeric
+ setLBC("AL", i++); # alphabetic
+ setLBC("ID", i++); # ideograph (atomic)
+ setLBC("IN", i++); # inseparable
+ setLBC("HY", i++); # hyphen
+ setLBC("BA", i++); # break after
+ setLBC("BB", i++); # break before
+ setLBC("B2", i++); # break both
+ setLBC("ZW", i++); # ZW space
+ setLBC("CM", i++); # combining mark
+ setLBC("WJ", i++); # word joiner
+
+ # For UAX#14 7.6 Korean Syllable Block Pair Table.
+ setLBC("H2", i++); # Hamgul 2 Jamo Syllable
+ setLBC("H3", i++); # Hangul 3 Jamo Syllable
+ setLBC("JL", i++); # Jamo leading consonant
+ setLBC("JV", i++); # Jamo vowel
+ setLBC("JT", i++); # Jamo trailing consonant
+
+ # Not handled in the pair table.
+ setLBC("SA", i++); # south (east) asian
+ setLBC("SP", i++); # space
+ setLBC("PS", i++); # paragraph and line separators
+ setLBC("BK", i++); # hard break (newline)
+ setLBC("CR", i++); # carriage return
+ setLBC("LF", i++); # line feed
+ setLBC("NL", i++); # next line
+ setLBC("CB", i++); # contingent break opportunity
+ setLBC("SG", i++); # surrogate
+ setLBC("AI", i++); # ambiguous
+ setLBC("XX", i); # unknown
+
+ # The default is "XX".
+ printf "\n0x0000-0x3FFFFF %d\n", i;
+}
+
+/^[0-9A-Za-z]*;/ {
+ lbc = to_lbc[$2];
+ if (prev_lbc != lbc)
+ {
+ if (prev_lbc != -1)
+ {
+ if (from == to)
+ printf "%s %d\n", from, prev_lbc;
+ else
+ printf "%s-%s %d\n", from, to, prev_lbc;
+ }
+ from = "0x" $1;
+ to = "0x" $1;
+ prev_lbc = lbc;
+ }
+ else
+ to = "0x" $1;
+ next;
+}
+
+/^[0-9A-Za-z]*\.\.[0-9A-Za-z]*;/ {
+ lbc = to_lbc[$2];
+ if (prev_lbc != -1)
+ {
+ if (from == to)
+ printf "%s %d\n", from, prev_lbc;
+ else
+ printf "%s-%s %d\n", from, to, prev_lbc;
+ }
+ gsub("\\.\\.", "-0x");
+ printf "0x%s %d\n", $1, lbc;
+ from = "";
+ to = "";
+ prev_lbc = -1;
+ next;
+}
+
+END {
+ if (prev_lbc != -1)
+ {
+ if (from == to)
+ printf "0x%s %d\n", from, prev_lbc;
+ else
+ printf "0x%s-0x%s %d\n", from, to, prev_lbc;
+ }
+}