1 /* word-thai.c -- Find a word segment in Thai text.
3 National Institute of Advanced Industrial Science and Technology (AIST)
4 Registration Number H15PRO112
6 This file is part of the m17n library.
8 The m17n library is free software; you can redistribute it and/or
9 modify it under the terms of the GNU Lesser General Public License
10 as published by the Free Software Foundation; either version 2.1 of
11 the License, or (at your option) any later version.
13 The m17n library is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public
19 License along with the m17n library; if not, write to the Free
20 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
02111-1307, USA.  */
26 #include "m17n-core.h"
27 #include "m17n-misc.h"
30 #include "character.h"
/* Forward declarations of the backend hooks.  Several alternative
   definitions appear further down, separated by preprocessor
   conditionals.  */
33 static int init_wordseg_library (void);
34 static void fini_wordseg_library (void);
/* NOTE(review): the parameter list of this prototype continues on a line
   that is not present in this view; the definitions below take a final
   "unsigned char *tis" argument.  */
35 static MTextProperty *wordseg_propertize (MText *mt, int pos, int from, int to,
/* Inclusive range of character codes handled by this file: thai_wordseg
   tests each character against it, and mtext__word_thai_init passes it
   to mchartable_set_range together with thai_wordseg.  */
38 #define THAI_BEG 0x0E01
39 #define THAI_END 0x0E6F
/* Set to 1 by mtext__word_thai_init after it has called
   init_wordseg_library, and reset to 0 by mtext__word_thai_fini.  */
41 static int wordseg_library_initialized;
/* Key of the text property that carries a segmentation result; assigned
   from msymbol (" wordcut-wordseg") in mtext__word_thai_init.  */
42 static MSymbol Mthai_wordseg;
46 #include <wordcut/wcwordcut.h>
/* Segmenter object handed to wc_wordcut_init, wc_wordcut_cut and
   wc_wordcut_destroy in the functions below.  */
48 static WcWordcut wordcut;
/* Word vector whose entries wordseg_propertize reads back after calling
   wc_wordcut_cut.  It is created with wc_word_vector_new in
   wordseg_propertize and deleted in fini_wordseg_library.  */
49 static WcWordVector *word_vector;
/* Backend set-up for the wcwordcut variant: initializes the file-scope
   wordcut object.
   NOTE(review): the return type, braces and return statement are on
   lines not present in this view.  The prototype near the top of the
   file declares the function as returning int, and
   mtext__word_thai_init tests that result for being negative.  */
52 init_wordseg_library (void)
54 wc_wordcut_init (&wordcut);
/* Backend tear-down for the wcwordcut variant: deletes the shared word
   vector and destroys the wordcut object.
   NOTE(review): a line between the signature and the delete call is
   not visible here; it may guard against word_vector never having been
   allocated -- confirm against the full file.  */
59 fini_wordseg_library (void)
62 wc_word_vector_delete (word_vector);
63 wc_wordcut_destroy (&wordcut);
/* wcwordcut variant.  Cut the (TO - FROM) bytes at TIS into words,
   attach one Mthai_wordseg text property to MT per word that is not of
   type WC_WORDTYPE_DELETED, and track in PROP the property whose range
   contains POS.
   NOTE(review): several lines of this function (local declarations,
   branch conditions, the end of the loop body and the return
   statement) are not present in this view, so the notes below describe
   only what the visible lines do.  */
67 static MTextProperty *
68 wordseg_propertize (MText *mt, int pos, int from, int to, unsigned char *tis)
71 MTextProperty *prop = NULL;
/* Either a fresh vector is allocated, or the existing one is destroyed
   and re-initialized.  The condition choosing between the two is on
   lines not shown.  */
74 word_vector = wc_word_vector_new ();
77 wc_word_vector_destroy (word_vector);
78 wc_word_vector_init (word_vector);
/* Run the cutter over the byte string.  The remaining argument(s) of
   this call are on a continuation line not shown.  */
81 wc_wordcut_cut (&wordcut, (gchar *) tis, (gint) (to - from),
83 count = wc_word_vector_get_count (word_vector);
84 for (i = 0; i < count; i++)
86 WcWord *word = wc_word_vector_get_word (word_vector, i);
/* Entries marked as deleted get no property.  */
88 if (word->type != WC_WORDTYPE_DELETED)
/* The property value depends on whether the word type is DICTIONARY,
   WORDUNIT or JOINED.  NOTE(review): the two alternative values and the
   declaration of "this" are on lines not shown; thai_wordseg later
   compares the value against Mt.  */
90 MSymbol val = ((word->type == WC_WORDTYPE_DICTIONARY
91 || word->type == WC_WORDTYPE_WORDUNIT
92 || word->type == WC_WORDTYPE_JOINED)
95 = mtext_property (Mthai_wordseg, val,
96 MTEXTPROP_VOLATILE_WEAK | MTEXTPROP_NO_MERGE);
/* Attach the property to the character range [from, from + word->len).  */
98 mtext_attach_property (mt, from, from + word->len, this);
/* Test whether POS falls inside this word; the statement executed in
   that case is on a line not shown (presumably it records "this" in
   prop -- confirm).  */
99 if (pos >= from && pos < from + word->len)
/* Drop the local reference obtained from mtext_property.  */
102 M17N_OBJECT_UNREF (this);
109 #elif HAVE_WORDCUT_OLD
111 #include <wordcut/wordcut.h>
/* Segmenter handle passed to wordcut_init, wordcut_cut and
   wordcut_close in the functions below.  */
113 static Wordcut wordcut;
/* Output buffer filled by wordcut_cut in wordseg_propertize.  */
114 static WordcutResult wordcut_result;
/* Set to 1 once wordcut_cut has been called with wordcut_result, so that
   fini_wordseg_library knows to call wordcut_result_close.  */
115 static int wordcut_result_used;
/* Backend set-up for the HAVE_WORDCUT_OLD variant: initialize the
   wordcut handle with WORDCUT_TDICT.  Returns 0 when wordcut_init
   returns 0, and -1 otherwise.  */
118 init_wordseg_library (void)
120 return (wordcut_init (&wordcut, WORDCUT_TDICT) == 0 ? 0 : -1);
/* Backend tear-down for the HAVE_WORDCUT_OLD variant.  When a result
   buffer has been used, close it and clear the flag; the wordcut handle
   is closed as well.
   NOTE(review): braces are not present in this view, so the exact
   extent of the conditional is inferred from statement order.  */
124 fini_wordseg_library (void)
126 if (wordcut_result_used)
128 wordcut_result_close (&wordcut_result);
129 wordcut_result_used = 0;
131 wordcut_close (&wordcut);
/* HAVE_WORDCUT_OLD variant.  Cut the byte string TIS with wordcut_cut
   and attach Mthai_wordseg text properties to MT, walking the result
   entries with LAST as the running character position (starting at
   FROM).
   NOTE(review): local declarations, braces, the statements guarded by
   the two "pos" tests and the return statement are on lines not present
   in this view.  */
135 static MTextProperty *
136 wordseg_propertize (MText *mt, int pos, int from, int to, unsigned char *tis)
139 MTextProperty *prop = NULL;
141 wordcut_cut (&wordcut, (char *) tis, &wordcut_result);
/* Remember that wordcut_result now needs closing at tear-down.  */
142 wordcut_result_used = 1;
143 for (i = 0, last = from; i < wordcut_result.count; i++)
/* A gap between the previous position and the start of entry I gets a
   property whose value is Mnil.  */
147 if (last < from + wordcut_result.start[i])
149 this = mtext_property (Mthai_wordseg, Mnil,
150 MTEXTPROP_VOLATILE_WEAK | MTEXTPROP_NO_MERGE);
151 mtext_attach_property (mt, last, from + wordcut_result.start[i],
153 if (pos >= last && pos < from + wordcut_result.start[i])
156 M17N_OBJECT_UNREF (this);
/* Now handle entry I itself, covering offset[i] characters from its
   start.
   NOTE(review): the calls below attach and then unref "prop", yet no
   visible line creates a property for the entry itself.  "prop" starts
   as NULL and doubles as the running result, so this looks suspicious
   -- confirm against the full file.  */
159 last = from + wordcut_result.start[i];
160 mtext_attach_property (mt, last, last + wordcut_result.offset[i], prop);
161 if (pos >= last && pos < last + wordcut_result.offset[i])
164 m17n_object_unref (prop);
165 last += wordcut_result.offset[i];
170 #else /* not HAVE_WORDCUT nor HAVE_WORDCUT_OLD */
/* Fallback definition for builds with neither HAVE_WORDCUT nor
   HAVE_WORDCUT_OLD.
   NOTE(review): the body is not present in this view;
   mtext__word_thai_init tests the returned value for being negative.  */
173 init_wordseg_library (void)
/* Fallback definition for builds with neither HAVE_WORDCUT nor
   HAVE_WORDCUT_OLD.
   NOTE(review): the body is not present in this view.  */
179 fini_wordseg_library (void)
184 #endif /* not HAVE_WORDCUT nor HAVE_WORDCUT_OLD */
/* Word-segmentation handler for Thai text; mtext__word_thai_init
   registers it in wordseg_func_table for THAI_BEG..THAI_END.  On the
   path visible here it converts the characters around POS in MT to
   single bytes, hands them to wordseg_propertize, and stores the start
   and end of the returned property in *FROM and *TO.
   NOTE(review): the return type, local declarations, braces, several
   branches and the return statement are on lines not present in this
   view.  in_word (whether the property value is Mt) is presumably the
   return value -- confirm.  */
187 thai_wordseg (MText *mt, int pos, int *from, int *to)
189 /* TIS620 code sequence. */
/* NOTE(review): the handling of a position at or past the end of the
   text is on lines not shown.  */
194 if (pos >= mtext_nchars (mt))
/* Fetch the segmentation property attached at POS, if any.  */
200 prop = mtext_get_property (mt, pos, Mthai_wordseg);
207 /* Extra 1-byte is for 0 terminating. */
/* One byte per character of the range [*from, *to), taken from the
   stack.  NOTE(review): where *from and *to receive these bounds is not
   visible -- confirm against the full file.  */
208 tis = alloca ((*to - *from) + 1);
/* Scan backward from POS, testing each character against the Thai range
   and storing its single-byte code (0xA1 for THAI_BEG, and so on) at
   the character's offset from *from.  The statement executed for a
   character outside the range is on a line not shown.  */
210 for (beg = pos; beg > *from; beg--)
212 if ((c = mtext_ref_char (mt, beg - 1)) < THAI_BEG || c > THAI_END)
214 tis[beg - 1 - *from] = 0xA1 + (c - THAI_BEG);
/* The same scan, forward from POS up to *to.  */
216 for (end = pos; end < *to; end++)
218 if ((c = mtext_ref_char (mt, end)) < THAI_BEG || c > THAI_END)
220 tis[end - *from] = 0xA1 + (c - THAI_BEG);
229 /* Make it terminate by 0. */
230 tis[end - *from] = 0;
/* Segment the bytes for characters [beg, end); the byte for character
   beg sits at offset (beg - *from) in the buffer.  */
231 prop = wordseg_propertize (mt, pos, beg, end, tis + (beg - *from));
/* Report the extent of the property and whether its value is Mt, then
   drop a reference to it.  */
234 *from = MTEXTPROP_START (prop);
235 *to = MTEXTPROP_END (prop);
236 in_word = MTEXTPROP_VAL (prop) == Mt;
237 M17N_OBJECT_UNREF (prop);
/* Module initializer.  If the backend has not been initialized yet,
   call init_wordseg_library, mark the backend as initialized and intern
   the property key symbol.  thai_wordseg is registered in
   wordseg_func_table for the range THAI_BEG..THAI_END.
   NOTE(review): the return type, braces, and the statement executed
   when init_wordseg_library returns a negative value are on lines not
   present in this view.  Whether the registration happens inside the
   "not yet initialized" branch cannot be told from here.  */
245 mtext__word_thai_init ()
247 if (! wordseg_library_initialized)
249 if (init_wordseg_library () < 0)
251 wordseg_library_initialized = 1;
/* The symbol name begins with a space character.  */
252 Mthai_wordseg = msymbol (" wordcut-wordseg");
254 mchartable_set_range (wordseg_func_table, THAI_BEG, THAI_END,
255 (void *) thai_wordseg);
/* Module finalizer.  When the backend was initialized, shut it down
   with fini_wordseg_library and clear the flag so that a later
   mtext__word_thai_init initializes it again.
   NOTE(review): the return type and braces are on lines not present in
   this view.  */
260 mtext__word_thai_fini ()
262 if (wordseg_library_initialized)
264 fini_wordseg_library ();
265 wordseg_library_initialized = 0;