1 /* word-thai.c -- Find a word segment in Thai text.
3 National Institute of Advanced Industrial Science and Technology (AIST)
4 Registration Number H15PRO112
6 This file is part of the m17n library.
8 The m17n library is free software; you can redistribute it and/or
9 modify it under the terms of the GNU Lesser General Public License
10 as published by the Free Software Foundation; either version 2.1 of
11 the License, or (at your option) any later version.
13 The m17n library is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public
19 License along with the m17n library; if not, write to the Free
20 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
28 #ifdef HAVE_THAI_WORDSEG
30 #include "m17n-core.h"
31 #include "m17n-misc.h"
34 #include "character.h"
37 static int init_wordseg_library (void);
38 static void fini_wordseg_library (void);
39 static MTextProperty *wordseg_propertize (MText *mt, int pos, int from, int to,
/* First and last character codes of the range handled by this file:
   thai_wordseg () compares characters against these bounds, and
   mtext__word_thai_init () registers thai_wordseg for the same
   range.  */
42 #define THAI_BEG 0x0E01
43 #define THAI_END 0x0E6F
/* Set to 1 by mtext__word_thai_init () and reset to 0 by
   mtext__word_thai_fini ().  */
45 static int wordseg_library_initialized;
/* Key of the text property that wordseg_propertize () attaches to
   each segment and that thai_wordseg () looks up at a position.  */
46 static MSymbol Mthai_wordseg;
50 #include <thai/thbrk.h>
53 init_wordseg_library (void)
59 fini_wordseg_library (void)
/* libthai variant.  Run th_brk () over the byte sequence TIS and
   attach Mthai_wordseg properties (value Mt) to the pieces of MT
   between FROM and TO.
   NOTE(review): presumably returns the property whose range contains
   POS -- confirm against the full function.  */
64 static MTextProperty *
65 wordseg_propertize (MText *mt, int pos, int from, int to, unsigned char *tis)
/* BREAKS receives the break positions reported by th_brk (); COUNT is
   the value th_brk () returns.  */
68 int *breaks = alloca ((sizeof (int)) * len);
69 int count = th_brk ((thchar_t *) tis, breaks, len);
70 MTextProperty *prop = NULL;
/* Attach one property spanning the whole FROM..TO range.
   NOTE(review): presumably the branch taken when th_brk () reports no
   break -- confirm.  */
74 prop = mtext_property (Mthai_wordseg, Mt,
75 MTEXTPROP_VOLATILE_WEAK | MTEXTPROP_NO_MERGE);
76 mtext_attach_property (mt, from, to, prop);
77 M17N_OBJECT_UNREF (prop);
/* Each breaks[i] is used as an offset relative to FROM; attach a
   property to every piece from LAST up to FROM + breaks[i].  */
84 for (i = 0, last = from; i < count; i++)
86 this = mtext_property (Mthai_wordseg, Mt,
87 MTEXTPROP_VOLATILE_WEAK | MTEXTPROP_NO_MERGE);
88 mtext_attach_property (mt, last, from + breaks[i], this);
/* True when POS lies inside the piece just attached.  */
89 if (pos >= last && pos < from + breaks[i])
91 M17N_OBJECT_UNREF (this);
92 last = from + breaks[i];
/* Remaining piece from the last break position up to TO.  */
96 this = mtext_property (Mthai_wordseg, Mt,
97 MTEXTPROP_VOLATILE_WEAK | MTEXTPROP_NO_MERGE);
98 mtext_attach_property (mt, last, to, this);
99 if (pos >= last && pos < to)
101 M17N_OBJECT_UNREF (this);
112 #include <wordcut/wcwordcut.h>
/* Handle passed to wc_wordcut_init (), wc_wordcut_cut () and
   wc_wordcut_destroy ().  */
114 static WcWordcut wordcut;
/* Result vector assigned in wordseg_propertize () and deleted in
   fini_wordseg_library ().  */
115 static WcWordVector *word_vector;
/* wordcut variant: initialize the file-level WORDCUT handle.  */
118 init_wordseg_library (void)
120 wc_wordcut_init (&wordcut);
/* wordcut variant: delete WORD_VECTOR and destroy the WORDCUT
   handle.  */
125 fini_wordseg_library (void)
128 wc_word_vector_delete (word_vector);
129 wc_wordcut_destroy (&wordcut);
/* wordcut variant.  Cut the byte sequence TIS (TO - FROM bytes) with
   wc_wordcut_cut () and attach an Mthai_wordseg property to MT for
   each resulting word that is not of type WC_WORDTYPE_DELETED.
   NOTE(review): presumably returns the property whose range contains
   POS -- confirm against the full function.  */
133 static MTextProperty *
134 wordseg_propertize (MText *mt, int pos, int from, int to, unsigned char *tis)
137 MTextProperty *prop = NULL;
/* NOTE(review): presumably WORD_VECTOR is allocated on first use and
   otherwise reset with destroy + init -- confirm.  */
140 word_vector = wc_word_vector_new ();
143 wc_word_vector_destroy (word_vector);
144 wc_word_vector_init (word_vector);
147 wc_wordcut_cut (&wordcut, (gchar *) tis, (gint) (to - from),
149 count = wc_word_vector_get_count (word_vector);
150 for (i = 0; i < count; i++)
152 WcWord *word = wc_word_vector_get_word (word_vector, i);
/* Words of type WC_WORDTYPE_DELETED get no property.  */
154 if (word->type != WC_WORDTYPE_DELETED)
/* The property value depends on whether the word type is DICTIONARY,
   WORDUNIT or JOINED.
   NOTE(review): presumably Mt for those types and Mnil otherwise --
   confirm.  */
156 MSymbol val = ((word->type == WC_WORDTYPE_DICTIONARY
157 || word->type == WC_WORDTYPE_WORDUNIT
158 || word->type == WC_WORDTYPE_JOINED)
161 = mtext_property (Mthai_wordseg, val,
162 MTEXTPROP_VOLATILE_WEAK | MTEXTPROP_NO_MERGE);
/* The word occupies word->len positions starting at FROM.  */
164 mtext_attach_property (mt, from, from + word->len, this);
165 if (pos >= from && pos < from + word->len)
167 M17N_OBJECT_UNREF (this);
174 #else /* HAVE_WORDCUT_OLD */
176 #include <wordcut/wordcut.h>
/* Handle passed to wordcut_init (), wordcut_cut () and
   wordcut_close ().  */
178 static Wordcut wordcut;
/* Output of the most recent wordcut_cut () call.  */
179 static WordcutResult wordcut_result;
/* Set to 1 after wordcut_cut () has filled WORDCUT_RESULT; tells
   fini_wordseg_library () that the result must be closed.  */
180 static int wordcut_result_used;
/* Old wordcut variant: initialize WORDCUT with WORDCUT_TDICT.
   Return 0 if wordcut_init () returns 0, otherwise -1.  */
183 init_wordseg_library (void)
185 return (wordcut_init (&wordcut, WORDCUT_TDICT) == 0 ? 0 : -1);
/* Old wordcut variant: close WORDCUT_RESULT if it has been used, then
   close the WORDCUT handle.  */
189 fini_wordseg_library (void)
191 if (wordcut_result_used)
193 wordcut_result_close (&wordcut_result);
194 wordcut_result_used = 0;
196 wordcut_close (&wordcut);
/* Old wordcut variant.  Cut the byte sequence TIS with wordcut_cut ()
   and attach Mthai_wordseg properties to MT: value Mt for each
   reported word, value Mnil for any gap that precedes a word.
   NOTE(review): presumably returns the property whose range contains
   POS -- confirm against the full function.  */
200 static MTextProperty *
201 wordseg_propertize (MText *mt, int pos, int from, int to, unsigned char *tis)
204 MTextProperty *prop = NULL;
206 wordcut_cut (&wordcut, (char *) tis, &wordcut_result);
/* Record that WORDCUT_RESULT now needs closing at shutdown.  */
207 wordcut_result_used = 1;
/* LAST tracks the end of the previously attached range.  */
208 for (i = 0, last = from; i < wordcut_result.count; i++)
/* Word I starts after LAST: cover the gap with an Mnil property.  */
212 if (last < from + wordcut_result.start[i])
214 this = mtext_property (Mthai_wordseg, Mnil,
215 MTEXTPROP_VOLATILE_WEAK | MTEXTPROP_NO_MERGE);
216 mtext_attach_property (mt, last, from + wordcut_result.start[i],
218 if (pos >= last && pos < from + wordcut_result.start[i])
220 M17N_OBJECT_UNREF (this);
/* The word itself: start[i] is used as an offset relative to FROM
   and offset[i] as the length of the word.  */
223 this = mtext_property (Mthai_wordseg, Mt,
224 MTEXTPROP_VOLATILE_WEAK | MTEXTPROP_NO_MERGE);
225 last = from + wordcut_result.start[i];
226 mtext_attach_property (mt, last, last + wordcut_result.offset[i], this);
227 if (pos >= last && pos < last + wordcut_result.offset[i])
/* NOTE(review): this path calls the function m17n_object_unref ()
   while every other path in this file uses the M17N_OBJECT_UNREF
   macro -- confirm whether the difference is intended.  */
229 m17n_object_unref (this);
230 last += wordcut_result.offset[i];
235 #endif /* not HAVE_LIBTHA, HAVE_WORDCUT nor HAVE_WORDCUT_OLD */
/* Find the word segment of MT around POS.  Looks up an existing
   Mthai_wordseg property at POS; the run of characters in
   THAI_BEG..THAI_END around POS, limited by *FROM and *TO, is
   converted to TIS-620 bytes and handed to wordseg_propertize ().
   On exit *FROM and *TO are the start and end of the property found,
   and IN_WORD is nonzero when its value is Mt.
   NOTE(review): presumably IN_WORD is the return value and the
   conversion only runs when no property exists yet -- confirm.  */
238 thai_wordseg (MText *mt, int pos, int *from, int *to)
240 /* TIS-620 byte sequence. */
/* POS is at or beyond the end of MT.  */
245 if (pos >= mtext_nchars (mt))
251 prop = mtext_get_property (mt, pos, Mthai_wordseg);
258 /* One extra byte holds the terminating 0. */
259 tis = alloca ((*to - *from) + 1);
/* Walk backward from POS, storing 0xA1 + (C - THAI_BEG) for each
   character; TIS is indexed relative to *FROM.
   NOTE(review): a character outside THAI_BEG..THAI_END presumably
   ends the scan -- confirm.  */
261 for (beg = pos; beg > *from; beg--)
263 if ((c = mtext_ref_char (mt, beg - 1)) < THAI_BEG || c > THAI_END)
265 tis[beg - 1 - *from] = 0xA1 + (c - THAI_BEG);
/* Same conversion walking forward from POS up to *TO.  */
267 for (end = pos; end < *to; end++)
269 if ((c = mtext_ref_char (mt, end)) < THAI_BEG || c > THAI_END)
271 tis[end - *from] = 0xA1 + (c - THAI_BEG);
280 /* Terminate the byte sequence with 0. */
281 tis[end - *from] = 0;
/* Pass only the converted run BEG..END, with TIS advanced to the
   byte that corresponds to BEG.  */
282 prop = wordseg_propertize (mt, pos, beg, end, tis + (beg - *from));
285 *from = MTEXTPROP_START (prop);
286 *to = MTEXTPROP_END (prop);
287 in_word = MTEXTPROP_VAL (prop) == Mt;
291 #endif /* HAVE_THAI_WORDSEG */
/* Set up Thai word segmentation.  When HAVE_THAI_WORDSEG is defined:
   initialize the segmentation library if that has not been done yet,
   create the Mthai_wordseg symbol, and register thai_wordseg in
   wordseg_func_table for THAI_BEG..THAI_END.  */
297 mtext__word_thai_init ()
299 #ifdef HAVE_THAI_WORDSEG
300 if (! wordseg_library_initialized)
/* A negative result means the library could not be initialized.
   NOTE(review): presumably handled by returning early -- confirm.  */
302 if (init_wordseg_library () < 0)
304 wordseg_library_initialized = 1;
/* The symbol name begins with a space and mentions wordcut, yet the
   symbol is also used by the libthai variant.
   NOTE(review): the leading space presumably marks an internal
   symbol -- confirm.  */
305 Mthai_wordseg = msymbol (" wordcut-wordseg");
307 mchartable_set_range (wordseg_func_table, THAI_BEG, THAI_END,
308 (void *) thai_wordseg);
/* Shut down Thai word segmentation.  When HAVE_THAI_WORDSEG is
   defined and the library was initialized, finalize it and clear
   wordseg_library_initialized.  */
314 mtext__word_thai_fini ()
316 #ifdef HAVE_THAI_WORDSEG
317 if (wordseg_library_initialized)
319 fini_wordseg_library ();
320 wordseg_library_initialized = 0;