From: handa Date: Fri, 2 Sep 2005 06:18:07 +0000 (+0000) Subject: New file. X-Git-Tag: REL-1-3-0~208 X-Git-Url: http://git.chise.org/gitweb/?a=commitdiff_plain;h=40f613a928df96f91312c618043533747093ad49;p=m17n%2Fm17n-lib.git New file. --- diff --git a/src/mtext-lbrk.c b/src/mtext-lbrk.c new file mode 100644 index 0000000..82b81c0 --- /dev/null +++ b/src/mtext-lbrk.c @@ -0,0 +1,426 @@ +/* mtext-lbrk.c -- line break + Copyright (C) 2005 + National Institute of Advanced Industrial Science and Technology (AIST) + Registration Number H15PRO112 + + This file is part of the m17n library. + + The m17n library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public License + as published by the Free Software Foundation; either version 2.1 of + the License, or (at your option) any later version. + + The m17n library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the m17n library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307, USA. 
*/ + +#if !defined (FOR_DOXYGEN) || defined (DOXYGEN_INTERNAL_MODULE) +/*** @addtogroup m17nInternal + @{ */ + +#include +#include +#include + +#include "config.h" +#include "m17n.h" +#include "m17n-misc.h" +#include "internal.h" +#include "mtext.h" + +enum LineBreakClass + { + LBC_OP, /* open */ + LBC_CL, /* close */ + LBC_QU, /* quotation */ + LBC_GL, /* glue */ + LBC_NS, /* no-start */ + LBC_EX, /* exclamation/interrogation */ + LBC_SY, /* Syntax (slash) */ + LBC_IS, /* infix (numeric) separator */ + LBC_PR, /* prefix */ + LBC_PO, /* postfix */ + LBC_NU, /* numeric */ + LBC_AL, /* alphabetic */ + LBC_ID, /* ideograph (atomic) */ + LBC_IN, /* inseparable */ + LBC_HY, /* hyphen */ + LBC_BA, /* break after */ + LBC_BB, /* break before */ + LBC_B2, /* break both */ + LBC_ZW, /* ZW space */ + LBC_CM, /* combining mark */ + LBC_WJ, /* word joiner */ + + /* used for 4.1 pair table */ + LBC_H2, /* Hangul 2 Jamo Syllable */ + LBC_H3, /* Hangul 3 Jamo Syllable */ + LBC_JL, /* Jamo leading consonant */ + LBC_JV, /* Jamo vowel */ + LBC_JT, /* Jamo trailing consonant */ + + /* These are not handled in the pair tables. */ + LBC_SA, /* south (east) asian */ + LBC_SP, /* space */ + LBC_PS, /* paragraph and line separators */ + LBC_BK, /* hard break (newline) */ + LBC_CR, /* carriage return */ + LBC_LF, /* line feed */ + LBC_NL, /* next line */ + LBC_CB, /* contingent break opportunity */ + LBC_SG, /* surrogate */ + LBC_AI, /* ambiguous */ + LBC_XX, /* unknown */ + LBC_MAX + }; + +enum LineBreakAction + { + LBA_DIRECT = '_', + LBA_INDIRECT = '%', + LBA_COMBINING_INDIRECT = '#', + LBA_COMBINING_PROHIBITED = '@', + LBA_PROHIBITED = '^', + LBA_MAX + }; + +/* The pair table of line break actions. 
*/ +static char *lba_pair_table[] = + /* OP GL SY PO ID BA ZW H2 JV + CL NS IS NU IN BB CM H3 JT + QU EX PR AL HY B2 WJ JL */ + {}; + +static MCharTable *lbc_table; + +/* Set LBC to enum LineBreakClass of the character at POS of MT + (length is LEN) while converting LBC_NL to LBC_BK, LBC_AI to LBC_ID (if OPTION has MTEXT_LBO_AI_AS_ID) or LBC_AL, LBC_H2 through LBC_JT to LBC_AL (unless OPTION has MTEXT_LBO_KOREAN_SP), + LBC_CB to LBC_B2, and LBC_XX to LBC_AL. LBC_CR and LBC_LF are left unconverted. If POS is + out of range, set LBC to LBC_BK. */ + +#define GET_LBC(LBC, MT, LEN, POS, OPTION) \ + do { \ + if ((POS) < 0 || (POS) >= (LEN)) \ + (LBC) = LBC_BK; \ + else \ + { \ + int c = mtext_ref_char ((MT), (POS)); \ + (LBC) = (enum LineBreakClass) mchartable_lookup (lbc_table, c); \ + if ((LBC) == LBC_NL) \ + (LBC) = LBC_BK; \ + else if ((LBC) == LBC_AI) \ + (LBC) = ((OPTION) & MTEXT_LBO_AI_AS_ID) ? LBC_ID : LBC_AL; \ + else if (! ((OPTION) & MTEXT_LBO_KOREAN_SP) \ + && (LBC) >= LBC_H2 && (LBC) <= LBC_JT) \ + (LBC) = LBC_AL; \ + else if ((LBC) == LBC_CB) \ + (LBC) = LBC_B2; \ + else if ((LBC) == LBC_XX) \ + (LBC) = LBC_AL; \ + } \ + } while (0) + + +/*** @} */ +#endif /* !FOR_DOXYGEN || DOXYGEN_INTERNAL_MODULE */ + + +/* External API */ + +/*** @addtogroup m17nMtext */ +/*** @{ */ +/*=*/ + +int +mtext_line_break (MText *mt, int pos, int option, int *after) +{ + int break_before, break_after; + int len = mtext_len (mt); + enum LineBreakClass lbc; + enum LineBreakClass Blbc, Albc; /* B(efore) and A(fter) lbcs. */ + int Bpos, Apos; /* B(efore) and A(fter) positions. */ + enum LineBreakAction action; + + if (pos >= len) + { + /* The end of text is an explicit break position. */ + if (after) + *after = pos; + return pos; + } + + if (! lbc_table) + { + MSymbol key = mchar_define_property ("linebreak", Minteger); + + lbc_table = mchar_get_prop_table (key, NULL); + } + + GET_LBC (lbc, mt, len, pos, option); + Apos = pos; + Albc = lbc; + if (Albc == LBC_SP) + { + if (option & MTEXT_LBO_SP_CM) + { + GET_LBC (Albc, mt, len, Apos + 1, option); + Albc = (Albc == LBC_CM) ? 
LBC_ID : LBC_SP; + } + while (Albc == LBC_SP) + { + Apos--; + GET_LBC (Albc, mt, len, Apos, option); + } + } + if ((option & MTEXT_LBO_SP_CM) && (Albc == LBC_CM)) + { + Apos--; + GET_LBC (Albc, mt, len, Apos, option); + if (Albc == LBC_SP) + Albc = LBC_ID; + else + Apos++, Albc = LBC_CM; + } + + if (Albc == LBC_CR) + Albc = LBC_BK; + else if (Albc == LBC_LF) + { + GET_LBC (Albc, mt, len, Apos - 1, option); + if (Albc == LBC_CR) + Apos--; + Albc = LBC_BK; + } + else if (Albc == LBC_SA) + Albc = mtext__word_segment (mt, Apos, &Apos, NULL) > 0 ? LBC_BB : LBC_AL; + Bpos = Apos; + /* After exiting from the following loop, if Apos is positive, it is + the previous (including POS) break position. */ + while (Apos > 0) + { + int indirect; + int next = -1; + + /* Now Bpos == Apos. */ + do { + Bpos--; + GET_LBC (Blbc, mt, len, Bpos, option); + } while (Blbc == LBC_SP); + + if (Blbc == LBC_BK || Blbc == LBC_LF || Blbc == LBC_CR) + { + /* Explicit break. */ + break; + } + + indirect = Bpos + 1 < Apos; + + if (Blbc == LBC_CM) + { + do { + Bpos--; + GET_LBC (Blbc, mt, len, Bpos, option); + } while (Blbc == LBC_CM); + if ((option & MTEXT_LBO_SP_CM) && (Blbc == LBC_SP)) + Blbc = LBC_ID; + else if (Blbc == LBC_SP || Blbc == LBC_ZW + || Blbc == LBC_BK || Blbc == LBC_LF || Blbc == LBC_CR) + { + Blbc = LBC_AL; + Bpos++; + } + } + if (Blbc == LBC_SA) + { + mtext__word_segment (mt, Bpos, &next, NULL); + Blbc = LBC_AL; + } + + if (Albc != LBC_BK) + { + action = lba_pair_table[Blbc][Albc]; + if (action == LBA_DIRECT) + break; + else if (action == LBA_INDIRECT) + { + if (indirect) + break; + } + else if (action == LBA_COMBINING_INDIRECT) + { + if (indirect) + break; + } + } + if (next >= 0) + Apos = next, Albc = LBC_BB; + else + Apos = Bpos, Albc = Blbc; + } + break_before = Apos; + if (break_before > 0) + { + if (! after) + return break_before; + if (break_before == pos) + { + if (after) + *after = break_before; + return break_before; + } + } + + /* Now find a break position after POS. 
*/ + break_after = 0; + Bpos = pos; + Blbc = lbc; + if (Blbc == LBC_CM) + { + do { + Bpos--; + GET_LBC (Blbc, mt, len, Bpos, option); + } while (Blbc == LBC_CM); + if (Blbc == LBC_SP || Blbc == LBC_ZW + || Blbc == LBC_BK || Blbc == LBC_LF || Blbc == LBC_CR) + { + if ((Blbc == LBC_SP) && (option & MTEXT_LBO_SP_CM)) + Blbc = LBC_ID; + else + Blbc = LBC_AL; + } + Bpos = pos; + } + if (Blbc == LBC_SA) + { + mtext__word_segment (mt, Bpos, NULL, &Bpos); + Blbc = LBC_AL; + } + else if (Blbc == LBC_SP) + { + if (option & MTEXT_LBO_SP_CM) + { + GET_LBC (Blbc, mt, len, Bpos + 1, option); + if (Blbc == LBC_CM) + Blbc = LBC_ID, Bpos++; + else + Blbc = LBC_SP; + } + while (Blbc == LBC_SP) + { + Bpos--; + GET_LBC (Blbc, mt, len, Bpos, option); + } + if (Bpos < 0) + Bpos = pos; + } + Apos = Bpos; + /* After exiting from the following loop, if Apos is positive, it is + the next break position. */ + while (1) + { + int indirect; + int next = -1; + + /* Now Bpos == Apos. */ + if (Blbc == LBC_LF || Blbc == LBC_BK || Blbc == LBC_CR) + { + Apos++; + if (Blbc == LBC_CR) + { + GET_LBC (Blbc, mt, len, Bpos + 1, option); + if (Blbc == LBC_LF) + Apos++; + } + break; + } + + do { + Apos++; + GET_LBC (Albc, mt, len, Apos, option); + } while (Albc == LBC_SP); + + if (Blbc == LBC_SP) + break; + + if (Apos == len) + /* Explicit break at the end of text. */ + break; + + indirect = Bpos + 1 < Apos; + + if (Albc == LBC_SA) + Albc = mtext__word_segment (mt, Apos, NULL, &next) ? LBC_BB : LBC_AL; + + action = lba_pair_table[Blbc][Albc]; + if (action == LBA_DIRECT) + /* Direct break at Apos. */ + break; + else if (action == LBA_INDIRECT) + { + if (indirect) + break; + } + else if (action == LBA_COMBINING_INDIRECT) + { + if (indirect) + { + if (option & MTEXT_LBO_SP_CM) + Apos--; + break; + } + } + if (next >= 0) + Bpos = next, Blbc = LBC_AL; + else + { + Bpos = Apos; + if (Albc != LBC_CM) + Blbc = Albc; + } + } + break_after = Apos; + if (after) + *after = break_after; + + return (break_before > 0 ? 
break_before : break_after); +} + +/*** @} */ + +/* + Local Variables: + coding: euc-japan + End: +*/ diff --git a/src/mtext-wseg.c b/src/mtext-wseg.c new file mode 100644 index 0000000..8e8cdd3 --- /dev/null +++ b/src/mtext-wseg.c @@ -0,0 +1,419 @@ +/* mtext-wseg.c -- word segmentation + Copyright (C) 2005 + National Institute of Advanced Industrial Science and Technology (AIST) + Registration Number H15PRO112 + + This file is part of the m17n library. + + The m17n library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public License + as published by the Free Software Foundation; either version 2.1 of + the License, or (at your option) any later version. + + The m17n library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the m17n library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307, USA. 
*/ + +#if !defined (FOR_DOXYGEN) || defined (DOXYGEN_INTERNAL_MODULE) +/*** @addtogroup m17nInternal + @{ */ + +#include +#include + +#include "config.h" +#include "m17n-core.h" +#include "m17n-misc.h" +#include "internal.h" +#include "textprop.h" +#include "character.h" + +typedef struct _MWordseg_Function MWordseg_Function; + +struct _MWordseg_Function +{ + int initialized; + int (*init) (void); + void (*fini) (void); + int (*wseg) (MText *mt, int pos, int *from, int *to, + MWordseg_Function *wordseg); + MWordseg_Function *next; +}; + +static MWordseg_Function *wordseg_function_list; + +static MCharTable *wordseg_function_table; + +static int +generic_wordseg (MText *mt, int pos, int *from, int *to, + MWordseg_Function *wordseg) +{ + int len = mtext_nchars (mt); + int c = mtext_ref_char (mt, pos); + MSymbol category = mchar_get_prop (c, Mcategory); + char cathead = msymbol_name (category)[0]; + int in_word = (cathead == 'L' || cathead == 'M' || cathead == 'N'); + int beg, end; + + for (beg = pos; beg > 0; beg--) + { + c = mtext_ref_char (mt, beg - 1); + category = mchar_get_prop (c, Mcategory); + cathead = msymbol_name (category)[0]; + if (in_word != (cathead == 'L' || cathead == 'M' || cathead == 'N')) + break; + if (mchartable_lookup (wordseg_function_table, c) != wordseg) + break; + } + for (end = pos; end < len; end++) + { + c = mtext_ref_char (mt, end); + category = mchar_get_prop (c, Mcategory); + cathead = msymbol_name (category)[0]; + if (in_word != (cathead == 'L' || cathead == 'M' || cathead == 'N')) + break; + if (mchartable_lookup (wordseg_function_table, c) != wordseg) + break; + } + if (from) + *from = beg; + if (to) + *to = end; + return in_word; +} + +#ifdef HAVE_THAI_WORDSEG + +#define THAI_BEG 0x0E01 +#define THAI_END 0x0E6F + +static MSymbol M_thai_wordseg; + +/* We have libthai, wordcut, or wordcut-old. Each of them provides + the following three functions. 
*/ + +static int thai_wordseg_init (void); +static void thai_wordseg_fini (void); +static MTextProperty *thai_wordseg_propertize (MText *mt, int pos, + int from, int to, + unsigned char *tis); + +#ifdef HAVE_LIBTHAI + +#include + +static int +thai_wordseg_init (void) +{ + return 0; +} + +static void +thai_wordseg_fini (void) +{ + return; +} + +static MTextProperty * +thai_wordseg_propertize (MText *mt, int pos, int from, int to, + unsigned char *tis) +{ + int len = to - from; + int *breaks = alloca ((sizeof (int)) * len); + int count = th_brk ((thchar_t *) tis, breaks, len); + MTextProperty *prop = NULL; + + if (count == 0) + { + prop = mtext_property (M_thai_wordseg, Mt, + MTEXTPROP_VOLATILE_WEAK | MTEXTPROP_NO_MERGE); + mtext_attach_property (mt, from, to, prop); + M17N_OBJECT_UNREF (prop); + } + else + { + int last, i; + MTextProperty *this; + + for (i = 0, last = from; i < count; i++) + { + this = mtext_property (M_thai_wordseg, Mt, + MTEXTPROP_VOLATILE_WEAK | MTEXTPROP_NO_MERGE); + mtext_attach_property (mt, last, from + breaks[i], this); + if (pos >= last && pos < from + breaks[i]) + prop = this; + M17N_OBJECT_UNREF (this); + last = from + breaks[i]; + } + if (last < to) + { + this = mtext_property (M_thai_wordseg, Mt, + MTEXTPROP_VOLATILE_WEAK | MTEXTPROP_NO_MERGE); + mtext_attach_property (mt, last, to, this); + if (pos >= last && pos < to) + prop = this; + M17N_OBJECT_UNREF (this); + } + } + + if (! prop) + mdebug_hook (); + return prop; +} + +#elif HAVE_WORDCUT + +#include + +static WcWordcut wordcut; +static WcWordVector *word_vector; + +static int +thai_wordseg_init (void) +{ + wc_wordcut_init (&wordcut); + return 0; +} + +static void +thai_wordseg_fini (void) +{ + if (word_vector) + wc_word_vector_delete (word_vector); + wc_wordcut_destroy (&wordcut); + return; +} + +static MTextProperty * +thai_wordseg_propertize (MText *mt, int pos, int from, int to, + unsigned char *tis) +{ + gulong i, count; + MTextProperty *prop = NULL; + + if (! 
word_vector) + word_vector = wc_word_vector_new (); + else + { + wc_word_vector_destroy (word_vector); + wc_word_vector_init (word_vector); + } + + wc_wordcut_cut (&wordcut, (gchar *) tis, (gint) (to - from), + word_vector); + count = wc_word_vector_get_count (word_vector); + for (i = 0; i < count; i++) + { + WcWord *word = wc_word_vector_get_word (word_vector, i); + + if (word->type != WC_WORDTYPE_DELETED) + { + MSymbol val = ((word->type == WC_WORDTYPE_DICTIONARY + || word->type == WC_WORDTYPE_WORDUNIT + || word->type == WC_WORDTYPE_JOINED) + ? Mt : Mnil); + MTextProperty *this + = mtext_property (M_thai_wordseg, val, + MTEXTPROP_VOLATILE_WEAK | MTEXTPROP_NO_MERGE); + + mtext_attach_property (mt, from, from + word->len, this); + if (pos >= from && pos < from + word->len) + prop = this; + M17N_OBJECT_UNREF (this); + from += word->len; + } + } + return prop; +} + +#else /* HAVE_WORDCUT_OLD */ + +#include + +static Wordcut wordcut; +static WordcutResult wordcut_result; +static int wordcut_result_used; + +static int +thai_wordseg_init (void) +{ + return (wordcut_init (&wordcut, WORDCUT_TDICT) == 0 ? 
0 : -1); +} + +static void +thai_wordseg_fini (void) +{ + if (wordcut_result_used) + { + wordcut_result_close (&wordcut_result); + wordcut_result_used = 0; + } + wordcut_close (&wordcut); + return; +} + +static MTextProperty * +thai_wordseg_propertize (MText *mt, int pos, int from, int to, + unsigned char *tis) +{ + int i, last; + MTextProperty *prop = NULL; + + wordcut_cut (&wordcut, (char *) tis, &wordcut_result); + wordcut_result_used = 1; + for (i = 0, last = from; i < wordcut_result.count; i++) + { + MTextProperty *this; + + if (last < from + wordcut_result.start[i]) + { + this = mtext_property (M_thai_wordseg, Mnil, + MTEXTPROP_VOLATILE_WEAK); + mtext_attach_property (mt, last, from + wordcut_result.start[i], + this); + if (pos >= last && pos < from + wordcut_result.start[i]) + prop = this; + M17N_OBJECT_UNREF (this); + } + + this = mtext_property (M_thai_wordseg, Mt, + MTEXTPROP_VOLATILE_WEAK | MTEXTPROP_NO_MERGE); + last = from + wordcut_result.start[i]; + mtext_attach_property (mt, last, last + wordcut_result.offset[i], this); + if (pos >= last && pos < last + wordcut_result.offset[i]) + prop = this; + m17n_object_unref (this); + last += wordcut_result.offset[i]; + } + return prop; +} + +#endif /* not HAVE_LIBTHAI, HAVE_WORDCUT nor HAVE_WORDCUT_OLD */ + +int +thai_wordseg (MText *mt, int pos, int *from, int *to, + MWordseg_Function *wordseg) +{ + MTextProperty *prop; + + /* It is assured that there's a Thai character at POS. */ + prop = mtext_get_property (mt, pos, M_thai_wordseg); + if (! prop) + { + /* TIS620 code sequence. */ + unsigned char *tis; + int len = mtext_nchars (mt); + int beg, end; + int c, i; + + for (beg = pos; beg > 0; beg--) + if ((c = mtext_ref_char (mt, beg - 1)) < THAI_BEG || c > THAI_END) + break; + for (end = pos + 1; end < len; end++) + if ((c = mtext_ref_char (mt, end)) < THAI_BEG || c > THAI_END) + break; + + /* One extra byte for the terminating 0. 
*/ + tis = alloca ((end - beg) + 1); + + for (i = beg; i < end; i++) + tis[i - beg] = 0xA1 + (mtext_ref_char (mt, i) - THAI_BEG); + tis[i - beg] = 0; + prop = thai_wordseg_propertize (mt, pos, beg, end, tis); + } + + if (from) + *from = MTEXTPROP_START (prop); + if (to) + *to = MTEXTPROP_END (prop); + return (MTEXTPROP_VAL (prop) == Mt); +} + +#endif /* HAVE_THAI_WORDSEG */ + + +/* Internal API */ + +void +mtext__wseg_fini () +{ + if (wordseg_function_list) + { + while (wordseg_function_list) + { + MWordseg_Function *next = wordseg_function_list->next; + + if (wordseg_function_list->initialized > 0 + && wordseg_function_list->fini) + wordseg_function_list->fini (); + free (wordseg_function_list); + wordseg_function_list = next; + } + M17N_OBJECT_UNREF (wordseg_function_table); + } +} + +/* Find word boundaries around POS of MT. Set *FROM to the word + boundary position at or previous to POS, and update *TO to the word + boundary position after POS. + + @return If word boundaries were found successfully, return 1 (if + the character at POS is a part of a word) or 0 (otherwise). If the + operation was not successful, return -1 without setting *FROM and + *TO. */ + +int +mtext__word_segment (MText *mt, int pos, int *from, int *to) +{ + int c = mtext_ref_char (mt, pos); + MWordseg_Function *wordseg; + + if (! 
wordseg_function_table) + { + wordseg_function_table = mchartable (Mnil, NULL); + + MSTRUCT_CALLOC (wordseg, MERROR_MTEXT); + wordseg->wseg = generic_wordseg; + wordseg->next = wordseg_function_list; + wordseg_function_list = wordseg; + mchartable_set_range (wordseg_function_table, 0, MCHAR_MAX, wordseg); + +#ifdef HAVE_THAI_WORDSEG + MSTRUCT_CALLOC (wordseg, MERROR_MTEXT); + wordseg->init = thai_wordseg_init; + wordseg->fini = thai_wordseg_fini; + wordseg->wseg = thai_wordseg; + wordseg->next = wordseg_function_list; + wordseg_function_list = wordseg; + mchartable_set_range (wordseg_function_table, THAI_BEG, THAI_END, + wordseg); + M_thai_wordseg = msymbol (" thai-wordseg"); +#endif + } + + wordseg = mchartable_lookup (wordseg_function_table, c); + if (wordseg && wordseg->initialized >= 0) + { + if (! wordseg->initialized) + { + if (wordseg->init + && wordseg->init () < 0) + { + wordseg->initialized = -1; + return -1; + } + wordseg->initialized = 1; + } + return wordseg->wseg (mt, pos, from, to, wordseg); + } + return -1; +} + +/*** @} */ +#endif /* !FOR_DOXYGEN || DOXYGEN_INTERNAL_MODULE */