1 /* word-thai.c -- Find a word segment in Thai text.
3 National Institute of Advanced Industrial Science and Technology (AIST)
4 Registration Number H15PRO112
6 This file is part of the m17n library.
8 The m17n library is free software; you can redistribute it and/or
9 modify it under the terms of the GNU Lesser General Public License
10 as published by the Free Software Foundation; either version 2.1 of
11 the License, or (at your option) any later version.
13 The m17n library is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public
19 License along with the m17n library; if not, write to the Free
20 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
23 #if !defined (FOR_DOXYGEN) || defined (DOXYGEN_INTERNAL_MODULE)
24 /*** @addtogroup m17nInternal
32 #ifdef HAVE_THAI_WORDSEG
34 #include "m17n-core.h"
35 #include "m17n-misc.h"
38 #include "character.h"
41 static int init_wordseg_library (void);
42 static void fini_wordseg_library (void);
43 static MTextProperty *wordseg_propertize (MText *mt, int pos, int from, int to,
/* Lower and upper bounds of the code-point range treated as Thai in
   this file: thai_wordseg tests each character against them, and
   mtext__word_thai_init passes them to mchartable_set_range.  */
46 #define THAI_BEG 0x0E01
47 #define THAI_END 0x0E6F
/* Set to 1 in mtext__word_thai_init and back to 0 in
   mtext__word_thai_fini.  */
49 static int wordseg_library_initialized;
/* Key of the text property attached to each segment; assigned from
   msymbol (" wordcut-wordseg") in mtext__word_thai_init.  */
50 static MSymbol Mthai_wordseg;
54 #include <thai/thbrk.h>
57 init_wordseg_library (void)
63 fini_wordseg_library (void)
/* Segment the range [FROM, TO) of MT with th_brk and attach
   Mthai_wordseg text properties (value Mt) to the resulting ranges.
   TIS is the byte string handed to th_brk; POS is the character
   position the caller is interested in, and each attached range is
   tested against it.
   NOTE(review): the statements that store the matching property in
   PROP and return it are not visible here -- confirm before relying
   on the return value.  */
68 static MTextProperty *
69 wordseg_propertize (MText *mt, int pos, int from, int to, unsigned char *tis)
/* BREAKS is the output array passed to th_brk; COUNT is th_brk's
   return value and bounds the loop over BREAKS below.  */
72 int *breaks = alloca ((sizeof (int)) * len);
73 int count = th_brk ((thchar_t *) tis, breaks, len);
74 MTextProperty *prop = NULL;
/* A single property spanning the whole of [FROM, TO).
   NOTE(review): the condition selecting this path is not visible.  */
78 prop = mtext_property (Mthai_wordseg, Mt,
79 MTEXTPROP_VOLATILE_WEAK | MTEXTPROP_NO_MERGE);
80 mtext_attach_property (mt, from, to, prop);
81 M17N_OBJECT_UNREF (prop);
/* One property per entry of BREAKS: each covers
   [LAST, FROM + breaks[i]), i.e. the break values are used as offsets
   from FROM, and LAST then advances to the end of that range.  */
88 for (i = 0, last = from; i < count; i++)
90 this = mtext_property (Mthai_wordseg, Mt,
91 MTEXTPROP_VOLATILE_WEAK | MTEXTPROP_NO_MERGE);
92 mtext_attach_property (mt, last, from + breaks[i], this);
93 if (pos >= last && pos < from + breaks[i])
95 M17N_OBJECT_UNREF (this);
96 last = from + breaks[i];
/* Remaining tail [LAST, TO) after the last break.  */
100 this = mtext_property (Mthai_wordseg, Mt,
101 MTEXTPROP_VOLATILE_WEAK | MTEXTPROP_NO_MERGE);
102 mtext_attach_property (mt, last, to, this);
103 if (pos >= last && pos < to)
105 M17N_OBJECT_UNREF (this);
116 #include <wordcut/wcwordcut.h>
/* Segmenter handle: set up by wc_wordcut_init in init_wordseg_library
   and torn down by wc_wordcut_destroy in fini_wordseg_library.  */
118 static WcWordcut wordcut;
/* Result vector used by wordseg_propertize; created with
   wc_word_vector_new and deleted in fini_wordseg_library.  */
119 static WcWordVector *word_vector;
/* Initialize the file-scope WORDCUT handle with wc_wordcut_init.
   NOTE(review): the return statement is not visible here; callers
   treat a negative return value as failure.  */
122 init_wordseg_library (void)
124 wc_wordcut_init (&wordcut);
/* Delete WORD_VECTOR and destroy the WORDCUT handle.
   NOTE(review): WORD_VECTOR is only assigned inside
   wordseg_propertize; confirm the delete is guarded for the case
   where no segmentation ever ran.  */
129 fini_wordseg_library (void)
132 wc_word_vector_delete (word_vector);
133 wc_wordcut_destroy (&wordcut);
/* Segment the range of MT starting at FROM with wc_wordcut_cut and
   attach one Mthai_wordseg text property per word that is not of type
   WC_WORDTYPE_DELETED.  TIS is the byte string passed to the
   segmenter together with the length TO - FROM; POS is the character
   position the caller is interested in, and each attached range is
   tested against it.
   NOTE(review): the statements that store the matching property in
   PROP, advance FROM and return are not visible here.  */
137 static MTextProperty *
138 wordseg_propertize (MText *mt, int pos, int from, int to, unsigned char *tis)
141 MTextProperty *prop = NULL;
/* WORD_VECTOR is either freshly created, or destroyed and
   re-initialized for reuse.
   NOTE(review): the test choosing between the two is not visible.  */
144 word_vector = wc_word_vector_new ();
147 wc_word_vector_destroy (word_vector);
148 wc_word_vector_init (word_vector);
/* Run the segmenter over TIS.  */
151 wc_wordcut_cut (&wordcut, (gchar *) tis, (gint) (to - from),
153 count = wc_word_vector_get_count (word_vector);
154 for (i = 0; i < count; i++)
156 WcWord *word = wc_word_vector_get_word (word_vector, i);
158 if (word->type != WC_WORDTYPE_DELETED)
/* VAL depends on whether the word type is one of DICTIONARY,
   WORDUNIT or JOINED; the two candidate values are on a line not
   shown here.  */
160 MSymbol val = ((word->type == WC_WORDTYPE_DICTIONARY
161 || word->type == WC_WORDTYPE_WORDUNIT
162 || word->type == WC_WORDTYPE_JOINED)
165 = mtext_property (Mthai_wordseg, val,
166 MTEXTPROP_VOLATILE_WEAK | MTEXTPROP_NO_MERGE);
/* The property covers WORD->len characters starting at FROM.  */
168 mtext_attach_property (mt, from, from + word->len, this);
169 if (pos >= from && pos < from + word->len)
171 M17N_OBJECT_UNREF (this);
178 #else /* HAVE_WORDCUT_OLD */
180 #include <wordcut/wordcut.h>
/* Segmenter handle: opened by wordcut_init in init_wordseg_library and
   closed by wordcut_close in fini_wordseg_library.  */
182 static Wordcut wordcut;
/* Result buffer filled by wordcut_cut in wordseg_propertize.  */
183 static WordcutResult wordcut_result;
/* Set to 1 once wordcut_cut has filled WORDCUT_RESULT, so that
   fini_wordseg_library knows to call wordcut_result_close.  */
184 static int wordcut_result_used;
/* Open the segmenter with wordcut_init and the WORDCUT_TDICT argument.
   Returns 0 when wordcut_init returns 0, and -1 otherwise.  */
187 init_wordseg_library (void)
189 return (wordcut_init (&wordcut, WORDCUT_TDICT) == 0 ? 0 : -1);
/* Close WORDCUT_RESULT if it has been used, clear the
   wordcut_result_used flag, then close the segmenter handle.  */
193 fini_wordseg_library (void)
195 if (wordcut_result_used)
197 wordcut_result_close (&wordcut_result);
198 wordcut_result_used = 0;
200 wordcut_close (&wordcut);
/* Segment TIS with wordcut_cut and attach Mthai_wordseg text
   properties to MT, with positions computed relative to FROM.  Each
   result entry I yields a property with value Mt over
   [FROM + start[i], FROM + start[i] + offset[i]); any gap between the
   end of the previous entry and the start of this one gets a property
   with value Mnil.  POS is the character position the caller is
   interested in, and each attached range is tested against it.
   NOTE(review): the statements that store the matching property in
   PROP and return it are not visible here.  */
204 static MTextProperty *
205 wordseg_propertize (MText *mt, int pos, int from, int to, unsigned char *tis)
208 MTextProperty *prop = NULL;
/* The flag tells fini_wordseg_library that WORDCUT_RESULT must be
   closed.  */
210 wordcut_cut (&wordcut, (char *) tis, &wordcut_result);
211 wordcut_result_used = 1;
212 for (i = 0, last = from; i < wordcut_result.count; i++)
/* Gap before this entry: [LAST, FROM + start[i]) with value Mnil.  */
216 if (last < from + wordcut_result.start[i])
218 this = mtext_property (Mthai_wordseg, Mnil,
219 MTEXTPROP_VOLATILE_WEAK | MTEXTPROP_NO_MERGE);
220 mtext_attach_property (mt, last, from + wordcut_result.start[i],
222 if (pos >= last && pos < from + wordcut_result.start[i])
224 M17N_OBJECT_UNREF (this);
/* The entry itself: value Mt over offset[i] characters from its
   start.  */
227 this = mtext_property (Mthai_wordseg, Mt,
228 MTEXTPROP_VOLATILE_WEAK | MTEXTPROP_NO_MERGE);
229 last = from + wordcut_result.start[i];
230 mtext_attach_property (mt, last, last + wordcut_result.offset[i], this);
231 if (pos >= last && pos < last + wordcut_result.offset[i])
/* NOTE(review): this site calls the function m17n_object_unref while
   every other site in this file uses the M17N_OBJECT_UNREF macro --
   confirm whether the difference is intentional.  */
233 m17n_object_unref (this);
234 last += wordcut_result.offset[i];
239 #endif /* not HAVE_LIBTHA, HAVE_WORDCUT nor HAVE_WORDCUT_OLD */
/* Word-segmentation function for Thai text; mtext__word_thai_init
   registers it in wordseg_func_table for THAI_BEG..THAI_END.  MT is
   the text and POS a character position in it.  *FROM and *TO are
   read as the bounds of the scan around POS, and on the path visible
   here they are overwritten with the start and end of the
   Mthai_wordseg property obtained for POS.
   NOTE(review): the return statement is not visible; IN_WORD is
   computed as "the property value is Mt" -- presumably that is the
   result, confirm.  */
242 thai_wordseg (MText *mt, int pos, int *from, int *to)
244 /* TIS-620 byte sequence built from the characters around POS.  */
249 if (pos >= mtext_nchars (mt))
/* Fetch the Mthai_wordseg property at POS.  */
255 prop = mtext_get_property (mt, pos, Mthai_wordseg);
262 /* One extra byte for the terminating 0.  */
263 tis = alloca ((*to - *from) + 1);
/* Walk backwards from POS towards *FROM, testing each character
   against the Thai range and storing it as a byte in TIS, indexed
   relative to *FROM (THAI_BEG maps to 0xA1).  */
265 for (beg = pos; beg > *from; beg--)
267 if ((c = mtext_ref_char (mt, beg - 1)) < THAI_BEG || c > THAI_END)
269 tis[beg - 1 - *from] = 0xA1 + (c - THAI_BEG);
/* Same conversion walking forwards from POS towards *TO.  */
271 for (end = pos; end < *to; end++)
273 if ((c = mtext_ref_char (mt, end)) < THAI_BEG || c > THAI_END)
275 tis[end - *from] = 0xA1 + (c - THAI_BEG);
284 /* Terminate the byte sequence with 0 at END.  */
285 tis[end - *from] = 0;
/* Segment [BEG, END); the byte string passed starts at the byte for
   BEG.  */
286 prop = wordseg_propertize (mt, pos, beg, end, tis + (beg - *from));
/* Report the bounds of the property, and whether its value is Mt.  */
289 *from = MTEXTPROP_START (prop);
290 *to = MTEXTPROP_END (prop);
291 in_word = MTEXTPROP_VAL (prop) == Mt;
295 #endif /* HAVE_THAI_WORDSEG */
/* Set up Thai word segmentation.  When HAVE_THAI_WORDSEG is defined,
   this tests wordseg_library_initialized, calls init_wordseg_library,
   sets the flag, assigns the Mthai_wordseg key and registers
   thai_wordseg for THAI_BEG..THAI_END in wordseg_func_table.
   NOTE(review): what happens when init_wordseg_library returns a
   negative value, and the return value of this function, are on
   lines not shown here.  */
301 mtext__word_thai_init ()
303 #ifdef HAVE_THAI_WORDSEG
304 if (! wordseg_library_initialized)
306 if (init_wordseg_library () < 0)
308 wordseg_library_initialized = 1;
/* The symbol name starts with a space.  */
309 Mthai_wordseg = msymbol (" wordcut-wordseg");
311 mchartable_set_range (wordseg_func_table, THAI_BEG, THAI_END,
312 (void *) thai_wordseg);
/* Tear down Thai word segmentation.  When HAVE_THAI_WORDSEG is defined
   and the library was initialized, call fini_wordseg_library and clear
   the wordseg_library_initialized flag.  */
318 mtext__word_thai_fini ()
320 #ifdef HAVE_THAI_WORDSEG
321 if (wordseg_library_initialized)
323 fini_wordseg_library ();
324 wordseg_library_initialized = 0;
330 #endif /* !FOR_DOXYGEN || DOXYGEN_INTERNAL_MODULE */