From: handa Date: Fri, 2 Sep 2005 07:35:29 +0000 (+0000) Subject: New file. X-Git-Tag: REL-1-3-0~145 X-Git-Url: http://git.chise.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=c2a7b6de7ed53057e940a7097a92c14583e88c44;p=m17n%2Fm17n-db.git New file. --- diff --git a/LINEBREAK.awk b/LINEBREAK.awk new file mode 100644 index 0000000..639555c --- /dev/null +++ b/LINEBREAK.awk @@ -0,0 +1,133 @@ +# LINEBREAK.awk -- awk script to produce a compact linebreak property map +# Copyright (C) 2005 +# National Institute of Advanced Industrial Science and Technology (AIST) +# Registration Number H15PRO112 + +# This file is part of the m17n database; a sub-part of the m17n +# library. + +# The m17n library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public License +# as published by the Free Software Foundation; either version 2.1 of +# the License, or (at your option) any later version. + +# The m17n library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. + +# You should have received a copy of the GNU Lesser General Public +# License along with the m17n library; if not, write to the Free +# Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA +# 02111-1307, USA. + +function setLBC(name, code) { + if (code % 10 == 0) + printf "\n# %2d:%s", code, name; + else + printf " %2d:%s", code, name; + to_lbc[name] = code; +} + +BEGIN { + FS = "[; ]"; + from = ""; + to = ""; + prev_lbc = -1; + i = 0; + printf "# Code:LineBreakingClass"; + # Assign a uniq integer code to each line breaking class. + # The codes must be the same as "enum LineBreakingClass" of + # m17n-lib/src/linebreak.c + setLBC("OP", i++); # open + setLBC("CL", i++); # close + setLBC("QU", i++); # quotation + setLBC("GL", i++); # glue + setLBC("NS", i++); # no-start + setLBC("EX", i++); # exclamation/interrogation + setLBC("SY", i++); # Syntax (slash) + setLBC("IS", i++); # infix (numeric) separator + setLBC("PR", i++); # prefix + setLBC("PO", i++); # postfix + setLBC("NU", i++); # numeric + setLBC("AL", i++); # alphabetic + setLBC("ID", i++); # ideograph (atomic) + setLBC("IN", i++); # inseparable + setLBC("HY", i++); # hyphen + setLBC("BA", i++); # break after + setLBC("BB", i++); # break before + setLBC("B2", i++); # break both + setLBC("ZW", i++); # ZW space + setLBC("CM", i++); # combining mark + setLBC("WJ", i++); # word joiner + + # For UAX#14 7.6 Korean Syllable Block Pair Table. + setLBC("H2", i++); # Hamgul 2 Jamo Syllable + setLBC("H3", i++); # Hangul 3 Jamo Syllable + setLBC("JL", i++); # Jamo leading consonant + setLBC("JV", i++); # Jamo vowel + setLBC("JT", i++); # Jamo trailing consonant + + # Not handled in the pair table. + setLBC("SA", i++); # south (east) asian + setLBC("SP", i++); # space + setLBC("PS", i++); # paragraph and line separators + setLBC("BK", i++); # hard break (newline) + setLBC("CR", i++); # carriage return + setLBC("LF", i++); # line feed + setLBC("NL", i++); # next line + setLBC("CB", i++); # contingent break opportunity + setLBC("SG", i++); # surrogate + setLBC("AI", i++); # ambiguous + setLBC("XX", i); # unknown + + # The default is "XX". + printf "\n0x0000-0x3FFFFF %d\n", i; +} + +/^[0-9A-Za-z]*;/ { + lbc = to_lbc[$2]; + if (prev_lbc != lbc) + { + if (prev_lbc != -1) + { + if (from == to) + printf "%s %d\n", from, prev_lbc; + else + printf "%s-%s %d\n", from, to, prev_lbc; + } + from = "0x" $1; + to = "0x" $1; + prev_lbc = lbc; + } + else + to = "0x" $1; + next; +} + +/^[0-9A-Za-z]*\.\.[0-9A-Za-z]*;/ { + lbc = to_lbc[$2]; + if (prev_lbc != -1) + { + if (from == to) + printf "%s %d\n", from, prev_lbc; + else + printf "%s-%s %d\n", from, to, prev_lbc; + } + gsub("\\.\\.", "-0x"); + printf "0x%s %d\n", $1, lbc; + from = ""; + to = ""; + prev_lbc = -1; + next; +} + +END { + if (prev_lbc != -1) + { + if (from == to) + printf "0x%s %d\n", from, prev_lbc; + else + printf "0x%s-0x%s %d\n", from, to, prev_lbc; + } +}