New file.
authorhanda <handa>
Fri, 2 Sep 2005 07:35:29 +0000 (07:35 +0000)
committerhanda <handa>
Fri, 2 Sep 2005 07:35:29 +0000 (07:35 +0000)
LINEBREAK.awk [new file with mode: 0644]

diff --git a/LINEBREAK.awk b/LINEBREAK.awk
new file mode 100644 (file)
index 0000000..639555c
--- /dev/null
@@ -0,0 +1,133 @@
+# LINEBREAK.awk -- awk script to produce a compact linebreak property map
+# Copyright (C) 2005
+#   National Institute of Advanced Industrial Science and Technology (AIST)
+#   Registration Number H15PRO112
+
+# This file is part of the m17n database; a sub-part of the m17n
+# library.
+
+# The m17n library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public License
+# as published by the Free Software Foundation; either version 2.1 of
+# the License, or (at your option) any later version.
+
+# The m17n library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+
+# You should have received a copy of the GNU Lesser General Public
+# License along with the m17n library; if not, write to the Free
+# Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+# 02111-1307, USA.
+
+# Record CODE as the integer for line breaking class NAME in to_lbc[]
+# and append "CODE:NAME" to the legend comment being printed.  The
+# legend starts a new "# " comment line at every code divisible by 10;
+# any other entry continues the current line after two spaces.
+function setLBC(name, code) {
+  to_lbc[name] = code;
+  if (code % 10 != 0)
+    {
+      printf "  %2d:%s", code, name;
+      return;
+    }
+  printf "\n# %2d:%s", code, name;
+}
+
+BEGIN {
+  # The rules below read the code point (or "FIRST..LAST" range) from
+  # $1 and the class name from $2, so split records on ';' or ' '.
+  FS = "[; ]";
+
+  # State of the pending run of single code points sharing one class:
+  # FROM and TO are its first and last code points, PREV_LBC is its
+  # class code, or -1 while no run is pending.
+  prev_lbc = -1;
+  from = to = "";
+
+  printf "# Code:LineBreakingClass";
+
+  # Assign a unique integer code to each line breaking class, counting
+  # up from 0 in the order listed below.  The codes must be the same
+  # as "enum LineBreakingClass" of m17n-lib/src/linebreak.c, so never
+  # reorder this list.
+  #
+  #   OP open                      CL close
+  #   QU quotation                 GL glue
+  #   NS no-start                  EX exclamation/interrogation
+  #   SY syntax (slash)            IS infix (numeric) separator
+  #   PR prefix                    PO postfix
+  #   NU numeric                   AL alphabetic
+  #   ID ideograph (atomic)        IN inseparable
+  #   HY hyphen                    BA break after
+  #   BB break before              B2 break both
+  #   ZW ZW space                  CM combining mark
+  #   WJ word joiner
+  classes = "OP CL QU GL NS EX SY IS PR PO NU AL ID IN HY BA BB B2 ZW CM WJ";
+
+  # For UAX#14 7.6 Korean Syllable Block Pair Table.
+  #
+  #   H2 Hangul 2 Jamo syllable    H3 Hangul 3 Jamo syllable
+  #   JL Jamo leading consonant    JV Jamo vowel
+  #   JT Jamo trailing consonant
+  classes = classes " H2 H3 JL JV JT";
+
+  # Not handled in the pair table.
+  #
+  #   SA south (east) asian        SP space
+  #   PS paragraph and line separators
+  #   BK hard break (newline)      CR carriage return
+  #   LF line feed                 NL next line
+  #   CB contingent break opportunity
+  #   SG surrogate                 AI ambiguous
+  #   XX unknown
+  classes = classes " SA SP PS BK CR LF NL CB SG AI XX";
+
+  n_classes = split(classes, class_names, " ");
+  for (i = 0; i < n_classes; i++)
+    setLBC(class_names[i + 1], i);
+
+  # The default is "XX", the last class in the list; leave I holding
+  # its code and map the whole character range to it.
+  i = n_classes - 1;
+  printf "\n0x0000-0x3FFFFF %d\n", i;
+}
+
+# A record naming a single code point: "CODE;CLASS...".
+# Successive records with the same class are collected into one run
+# FROM-TO; the run is printed only once a record with a different
+# class (or a range record, or the end of input) terminates it.
+# NOTE(review): runs are merged purely on class equality; nothing here
+# checks that the code points are adjacent -- confirm the input never
+# skips code points between two records of the same class.
+/^[0-9A-Za-z]*;/ {
+  cp = "0x" $1;
+  lbc = to_lbc[$2];
+
+  # Same class as the pending run: just extend it.
+  if (prev_lbc == lbc)
+    {
+      to = cp;
+      next;
+    }
+
+  # Class changed: print the pending run, if there is one ...
+  if (prev_lbc != -1)
+    {
+      if (from == to)
+        printf "%s %d\n", from, prev_lbc;
+      else
+        printf "%s-%s %d\n", from, to, prev_lbc;
+    }
+
+  # ... and start a new run at this code point.
+  from = to = cp;
+  prev_lbc = lbc;
+  next;
+}
+
+# A record naming an explicit range: "FIRST..LAST;CLASS...".
+# Any pending run of single code points is printed first, then the
+# range itself is printed immediately as "0xFIRST-0xLAST CODE" and is
+# never merged with neighbouring records.
+/^[0-9A-Za-z]*\.\.[0-9A-Za-z]*;/ {
+  lbc = to_lbc[$2];
+
+  if (prev_lbc != -1 && from == to)
+    printf "%s %d\n", from, prev_lbc;
+  else if (prev_lbc != -1)
+    printf "%s-%s %d\n", from, to, prev_lbc;
+
+  # Turn "FIRST..LAST" into "FIRST-0xLAST"; the leading "0x" is
+  # supplied by the format below.
+  sub("\\.\\.", "-0x", $1);
+  printf "0x%s %d\n", $1, lbc;
+
+  # No run is pending after a range.
+  prev_lbc = -1;
+  from = to = "";
+  next;
+}
+
+# Print the run of single code points still pending at end of input.
+# FROM and TO already carry their "0x" prefix (it is added when a run
+# is started or extended), so they are printed as-is; prefixing them
+# again here would emit a malformed "0x0x..." entry.
+END {
+  if (prev_lbc != -1)
+    {
+      if (from == to)
+        printf "%s %d\n", from, prev_lbc;
+      else
+        printf "%s-%s %d\n", from, to, prev_lbc;
+    }
+}