From: handa
Date: Fri, 15 Apr 2005 08:11:03 +0000 (+0000)
Subject: New file.
X-Git-Tag: REL-1-3-0~337
X-Git-Url: http://git.chise.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=c7827d93f559c92a877d162a5290399a596adb32;p=m17n%2Fm17n-lib.git

New file.
---

diff --git a/src/word-thai.c b/src/word-thai.c
new file mode 100644
index 0000000..bc11b71
--- /dev/null
+++ b/src/word-thai.c
@@ -0,0 +1,362 @@
+/* word-thai.c -- Find a word segment in Thai text.
+   Copyright (C) 2005
+     National Institute of Advanced Industrial Science and Technology (AIST)
+     Registration Number H15PRO112
+
+   This file is part of the m17n library.
+
+   The m17n library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public License
+   as published by the Free Software Foundation; either version 2.1 of
+   the License, or (at your option) any later version.
+
+   The m17n library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the m17n library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307, USA.  */
+
+#include "config.h"
+#include "m17n-core.h"
+#include "m17n-misc.h"
+#include "internal.h"
+#include "textprop.h"
+#include "character.h"
+#include "mtext.h"
+
+static int init_wordseg_library (void);
+static void fini_wordseg_library (void);
+static MTextProperty *wordseg_propertize (MText *mt, int pos, int from, int to,
+                                          unsigned char *tis);
+
+/* Range of characters for which thai_wordseg is registered.  */
+#define THAI_BEG 0x0E01
+#define THAI_END 0x0E6F
+
+/* Last character that is converted to a TIS-620 byte.  It maps to
+   0xFB; from U+0E60 on, 0xA1 + (c - THAI_BEG) no longer fits in one
+   byte (U+0E60 itself would become 0, the string terminator).  */
+#define THAI_TIS_END 0x0E5B
+
+static int wordseg_library_initialized;
+static MSymbol Mthai_wordseg;
+
+/* Return the TIS-620 byte for the character C, or -1 if C is outside
+   the range THAI_BEG..THAI_TIS_END.  */
+
+static int
+thai_char_to_tis (int c)
+{
+  if (c < THAI_BEG || c > THAI_TIS_END)
+    return -1;
+  return 0xA1 + (c - THAI_BEG);
+}
+
+#if defined (HAVE_WORDCUT) || defined (HAVE_WORDCUT_OLD)
+
+/* Attach to the range FROM..TO of MT a word segment property whose
+   value is VAL.  Return the property if it was attached and POS is
+   in that range, else return NULL.  The reference obtained from
+   mtext_property is released here, so the caller must not unref the
+   returned property.  */
+
+static MTextProperty *
+attach_wordseg_property (MText *mt, int pos, int from, int to, MSymbol val)
+{
+  MTextProperty *found = NULL;
+  MTextProperty *prop
+    = mtext_property (Mthai_wordseg, val,
+                      MTEXTPROP_VOLATILE_WEAK | MTEXTPROP_NO_MERGE);
+
+  if (! prop)
+    return NULL;
+  if (mtext_attach_property (mt, from, to, prop) == 0
+      && pos >= from && pos < to)
+    found = prop;
+  M17N_OBJECT_UNREF (prop);
+  return found;
+}
+
+#endif /* HAVE_WORDCUT or HAVE_WORDCUT_OLD */
+
+#ifdef HAVE_WORDCUT
+
+/* NOTE(review): the header name was lost when this patch was
+   extracted; <wordcut/wcwordcut.h> is assumed -- confirm.  */
+#include <wordcut/wcwordcut.h>
+
+static WcWordcut wordcut;
+static WcWordVector *word_vector;
+
+/* Initialize the wordcut library.  Always return 0.  */
+
+static int
+init_wordseg_library (void)
+{
+  wc_wordcut_init (&wordcut);
+  return 0;
+}
+
+/* Release the word vector (if any) and the wordcut object.  */
+
+static void
+fini_wordseg_library (void)
+{
+  if (word_vector)
+    {
+      wc_word_vector_delete (word_vector);
+      /* Don't let a later wordseg_propertize reuse the deleted vector.  */
+      word_vector = NULL;
+    }
+  wc_wordcut_destroy (&wordcut);
+}
+
+/* Cut the TIS-620 string TIS, which corresponds to the range FROM..TO
+   of MT, into segments and attach a Mthai_wordseg property to each
+   of them.  The value is Mt for dictionary, word-unit and joined
+   words, and Mnil for the other segments.  Return the property that
+   covers POS, or NULL if no segment covers it.  */
+
+static MTextProperty *
+wordseg_propertize (MText *mt, int pos, int from, int to, unsigned char *tis)
+{
+  gulong i, count;
+  MTextProperty *prop = NULL;
+
+  if (! word_vector)
+    word_vector = wc_word_vector_new ();
+  else
+    {
+      wc_word_vector_destroy (word_vector);
+      wc_word_vector_init (word_vector);
+    }
+
+  wc_wordcut_cut (&wordcut, (gchar *) tis, (gint) (to - from),
+                  word_vector);
+  count = wc_word_vector_get_count (word_vector);
+  for (i = 0; i < count; i++)
+    {
+      WcWord *word = wc_word_vector_get_word (word_vector, i);
+      MTextProperty *this;
+      MSymbol val;
+
+      if (word->type == WC_WORDTYPE_DELETED)
+        continue;
+      val = ((word->type == WC_WORDTYPE_DICTIONARY
+              || word->type == WC_WORDTYPE_WORDUNIT
+              || word->type == WC_WORDTYPE_JOINED)
+             ? Mt : Mnil);
+      this = attach_wordseg_property (mt, pos, from, from + word->len, val);
+      if (this)
+        prop = this;
+      from += word->len;
+    }
+  return prop;
+}
+
+#elif HAVE_WORDCUT_OLD
+
+/* NOTE(review): the header name was lost when this patch was
+   extracted; <wordcut/wordcut.h> is assumed -- confirm.  */
+#include <wordcut/wordcut.h>
+
+static Wordcut wordcut;
+static WordcutResult wordcut_result;
+static int wordcut_result_used;
+
+/* Initialize the wordcut library with the dictionary WORDCUT_TDICT.
+   Return 0 on success, -1 on failure.  */
+
+static int
+init_wordseg_library (void)
+{
+  return (wordcut_init (&wordcut, WORDCUT_TDICT) == 0 ? 0 : -1);
+}
+
+/* Release the last cut result (if any) and the wordcut object.  */
+
+static void
+fini_wordseg_library (void)
+{
+  if (wordcut_result_used)
+    {
+      wordcut_result_close (&wordcut_result);
+      wordcut_result_used = 0;
+    }
+  wordcut_close (&wordcut);
+}
+
+/* Cut the TIS-620 string TIS, which corresponds to the range FROM..TO
+   of MT, into words.  Attach a Mthai_wordseg property of value Mt to
+   each word and one of value Mnil to each gap before a word.  Return
+   the property that covers POS, or NULL if none covers it.  */
+
+static MTextProperty *
+wordseg_propertize (MText *mt, int pos, int from, int to, unsigned char *tis)
+{
+  int i, last;
+  MTextProperty *prop = NULL;
+
+  /* Release the previous result before wordcut_result is filled again.
+     NOTE(review): assumes a used result must be closed before reuse,
+     as fini_wordseg_library does -- confirm.  */
+  if (wordcut_result_used)
+    {
+      wordcut_result_close (&wordcut_result);
+      wordcut_result_used = 0;
+    }
+  /* NOTE(review): this was wc_wordcut_cut, the name of the newer API;
+     wordcut_cut is assumed to be its old-API counterpart -- confirm.  */
+  wordcut_cut (&wordcut, (char *) tis, &wordcut_result);
+  wordcut_result_used = 1;
+  for (i = 0, last = from; i < wordcut_result.count; i++)
+    {
+      int word_beg = from + wordcut_result.start[i];
+      int word_end = word_beg + wordcut_result.offset[i];
+      MTextProperty *this;
+
+      if (last < word_beg)
+        {
+          this = attach_wordseg_property (mt, pos, last, word_beg, Mnil);
+          if (this)
+            prop = this;
+        }
+      this = attach_wordseg_property (mt, pos, word_beg, word_end, Mt);
+      if (this)
+        prop = this;
+      last = word_end;
+    }
+  return prop;
+}
+
+#else /* not HAVE_WORDCUT nor HAVE_WORDCUT_OLD */
+
+/* No word segmentation library is available: initialization always
+   fails, so thai_wordseg is never registered.  */
+
+static int
+init_wordseg_library (void)
+{
+  return -1;
+}
+
+static void
+fini_wordseg_library (void)
+{
+}
+
+/* Defined only so that the reference in thai_wordseg can be resolved.  */
+
+static MTextProperty *
+wordseg_propertize (MText *mt, int pos, int from, int to, unsigned char *tis)
+{
+  return NULL;
+}
+
+#endif /* not HAVE_WORDCUT nor HAVE_WORDCUT_OLD */
+
+/* Find the word segment around the character at POS in MT.  On
+   entry, *FROM and *TO limit the range that may be examined.  On
+   return, they are set to the range of the segment found, or both
+   to POS if there is no Thai text to segment at POS.  Return 1 if
+   the segment is a word, 0 otherwise.  */
+
+int
+thai_wordseg (MText *mt, int pos, int *from, int *to)
+{
+  MTextProperty *prop;
+
+  if (pos < 0 || pos >= mtext_nchars (mt))
+    goto no_segment;
+
+  /* A property attached by a previous call is reused as long as it
+     is still there.  */
+  prop = mtext_get_property (mt, pos, Mthai_wordseg);
+  if (! prop)
+    {
+      /* TIS620 code sequence.  */
+      unsigned char *tis;
+      int lower = *from, upper = *to;
+      int beg, end;
+      int code;
+
+      /* Without this check the loops below would index TIS with a
+         negative offset.  */
+      if (pos < lower || pos >= upper)
+        goto no_segment;
+
+      /* Extra 1-byte is for 0 terminating.  */
+      tis = alloca ((upper - lower) + 1);
+
+      /* Extend the run of convertible Thai characters around POS,
+         converting them to TIS620 on the way.  */
+      for (beg = pos; beg > lower; beg--)
+        {
+          if ((code = thai_char_to_tis (mtext_ref_char (mt, beg - 1))) < 0)
+            break;
+          tis[beg - 1 - lower] = code;
+        }
+      for (end = pos; end < upper; end++)
+        {
+          if ((code = thai_char_to_tis (mtext_ref_char (mt, end))) < 0)
+            break;
+          tis[end - lower] = code;
+        }
+
+      if (pos == end)
+        goto no_segment;
+
+      /* Make it terminate by 0.  */
+      tis[end - lower] = 0;
+      prop = wordseg_propertize (mt, pos, beg, end, tis + (beg - lower));
+      if (! prop)
+        goto no_segment;
+    }
+
+  /* PROP is only borrowed from MT here; it must not be unref'ed.  */
+  *from = MTEXTPROP_START (prop);
+  *to = MTEXTPROP_END (prop);
+  return (MTEXTPROP_VAL (prop) == Mt);
+
+ no_segment:
+  *from = *to = pos;
+  return 0;
+}
+
+
+/* Internal API */
+
+/* Initialize the word segmentation library (once) and register
+   thai_wordseg in wordseg_func_table for THAI_BEG..THAI_END.  Return
+   0 on success, -1 if the library can't be initialized.  */
+
+int
+mtext__word_thai_init (void)
+{
+  if (! wordseg_library_initialized)
+    {
+      if (init_wordseg_library () < 0)
+        return -1;
+      wordseg_library_initialized = 1;
+      Mthai_wordseg = msymbol (" wordcut-wordseg");
+    }
+  mchartable_set_range (wordseg_func_table, THAI_BEG, THAI_END,
+                        (void *) thai_wordseg);
+  return 0;
+}
+
+/* Finalize the word segmentation library if it was initialized.  */
+
+void
+mtext__word_thai_fini (void)
+{
+  if (wordseg_library_initialized)
+    {
+      fini_wordseg_library ();
+      wordseg_library_initialized = 0;
+    }
+}
diff --git a/src/word-thai.h b/src/word-thai.h
new file mode 100644
index 0000000..0d7fc23
--- /dev/null
+++ b/src/word-thai.h
@@ -0,0 +1,27 @@
+/* word-thai.h -- Header for Thai word segmentation.
+   Copyright (C) 2005
+     National Institute of Advanced Industrial Science and Technology (AIST)
+     Registration Number H15PRO112
+
+   This file is part of the m17n library.
+
+   The m17n library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public License
+   as published by the Free Software Foundation; either version 2.1 of
+   the License, or (at your option) any later version.
+
+   The m17n library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the m17n library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307, USA.  */
+
+/* Initialize Thai word segmentation.  Return 0 on success, -1 on failure.  */
+extern int mtext__word_thai_init (void);
+
+/* Finalize Thai word segmentation.  */
+extern void mtext__word_thai_fini (void);