1 /* word-thai.c -- Find a word segment in Thai text.
3 National Institute of Advanced Industrial Science and Technology (AIST)
4 Registration Number H15PRO112
6 This file is part of the m17n library.
8 The m17n library is free software; you can redistribute it and/or
9 modify it under the terms of the GNU Lesser General Public License
10 as published by the Free Software Foundation; either version 2.1 of
11 the License, or (at your option) any later version.
13 The m17n library is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public
19 License along with the m17n library; if not, write to the Free
20 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
23 #if !defined (FOR_DOXYGEN) || defined (DOXYGEN_INTERNAL_MODULE)
24 /*** @addtogroup m17nInternal
32 #ifdef HAVE_THAI_WORDSEG
34 #include "m17n-core.h"
35 #include "m17n-misc.h"
38 #include "character.h"
/* Forward declarations of the per-library hooks.  Several alternative
   definitions of each appear below, one set per word segmentation
   library header that is included.  */
41 static int init_wordseg_library (void);
42 static void fini_wordseg_library (void);
43 static MTextProperty *wordseg_propertize (MText *mt, int pos, int from, int to,
/* Inclusive range of character codes treated as Thai: thai_wordseg
   stops scanning at any character outside it, and
   mtext__word_thai_init registers thai_wordseg for exactly this
   range.  */
46 #define THAI_BEG 0x0E01
47 #define THAI_END 0x0E6F
/* Set to 1 by mtext__word_thai_init and back to 0 by
   mtext__word_thai_fini.  */
49 static int wordseg_library_initialized;
/* Key of the text property attached to each segment; assigned in
   mtext__word_thai_init.  */
50 static MSymbol Mthai_wordseg;
54 #include <thai/thbrk.h>
/* init_wordseg_library as it appears after the <thai/thbrk.h>
   include.  Only the name line is visible in this listing; the
   caller, mtext__word_thai_init, tests the return value with "< 0".  */
57 init_wordseg_library (void)
/* fini_wordseg_library as it appears after the <thai/thbrk.h>
   include.  Only the name line is visible in this listing; it is
   called from mtext__word_thai_fini.  */
63 fini_wordseg_library (void)
/* Segment the byte sequence TIS with th_brk and attach Mthai_wordseg
   text properties to MT for the resulting pieces between FROM and TO.
   POS is the position the caller is interested in; each attached
   range is tested against it.  PROP starts out as NULL.
   NOTE(review): the statements executed when a range contains POS,
   and the final return, are not visible in this listing; presumably
   PROP is set to the property covering POS and returned -- confirm.  */
68 static MTextProperty *
69 wordseg_propertize (MText *mt, int pos, int from, int to, unsigned char *tis)
/* BREAKS is stack-allocated with room for LEN ints (LEN is defined on
   a line not visible here).  th_brk fills it, and the loop below reads
   the first COUNT entries.  */
72 int *breaks = alloca ((sizeof (int)) * len);
73 int count = th_brk ((thchar_t *) tis, breaks, len);
74 MTextProperty *prop = NULL;
/* Attach a single property with value Mt over the whole FROM..TO
   range, then drop the local reference.  The condition guarding this
   path is not visible in this listing.  */
78 prop = mtext_property (Mthai_wordseg, Mt,
79 MTEXTPROP_VOLATILE_WEAK | MTEXTPROP_NO_MERGE);
80 mtext_attach_property (mt, from, to, prop);
81 M17N_OBJECT_UNREF (prop);
/* One property per piece.  Each BREAKS entry is used as an offset
   relative to FROM, and LAST tracks the start of the next piece.  */
88 for (i = 0, last = from; i < count; i++)
90 this = mtext_property (Mthai_wordseg, Mt,
91 MTEXTPROP_VOLATILE_WEAK | MTEXTPROP_NO_MERGE);
92 mtext_attach_property (mt, last, from + breaks[i], this);
/* Does this piece contain POS?  */
93 if (pos >= last && pos < from + breaks[i])
95 M17N_OBJECT_UNREF (this);
96 last = from + breaks[i];
/* The remaining piece, from the last break up to TO.  */
100 this = mtext_property (Mthai_wordseg, Mt,
101 MTEXTPROP_VOLATILE_WEAK | MTEXTPROP_NO_MERGE);
102 mtext_attach_property (mt, last, to, this);
103 if (pos >= last && pos < to)
105 M17N_OBJECT_UNREF (this);
116 #include <wordcut/wcwordcut.h>
/* State for the <wordcut/wcwordcut.h> variant.  WORDCUT is passed to
   wc_wordcut_init, wc_wordcut_cut and wc_wordcut_destroy; WORD_VECTOR
   is the vector from which wordseg_propertize reads the words.  */
118 static WcWordcut wordcut;
119 static WcWordVector *word_vector;
/* init_wordseg_library for the <wordcut/wcwordcut.h> variant:
   initialize the static WORDCUT object.  The return statement is not
   visible in this listing.  */
122 init_wordseg_library (void)
124 wc_wordcut_init (&wordcut);
/* fini_wordseg_library for the <wordcut/wcwordcut.h> variant: delete
   WORD_VECTOR, then destroy WORDCUT.
   NOTE(review): WORD_VECTOR is only assigned inside
   wordseg_propertize, and any guard around the delete call is not
   visible in this listing -- confirm it is not deleted while still
   unset.  */
129 fini_wordseg_library (void)
132 wc_word_vector_delete (word_vector);
133 wc_wordcut_destroy (&wordcut);
/* Segment the byte sequence TIS with wc_wordcut_cut and attach an
   Mthai_wordseg text property to MT for each word that is not of type
   WC_WORDTYPE_DELETED.  POS is the position the caller is interested
   in; each attached range is tested against it.  PROP starts out as
   NULL.
   NOTE(review): the statements executed when a range contains POS,
   the update of FROM between words, and the final return are not
   visible in this listing.  */
137 static MTextProperty *
138 wordseg_propertize (MText *mt, int pos, int from, int to, unsigned char *tis)
141 MTextProperty *prop = NULL;
/* WORD_VECTOR is either newly created, or destroyed and
   re-initialized for reuse.  The condition choosing between the two
   is not visible in this listing.  */
144 word_vector = wc_word_vector_new ();
147 wc_word_vector_destroy (word_vector);
148 wc_word_vector_init (word_vector);
/* The length handed to the cutter is the number of characters in the
   range, TO - FROM.  */
151 wc_wordcut_cut (&wordcut, (gchar *) tis, (gint) (to - from),
153 count = wc_word_vector_get_count (word_vector);
154 for (i = 0; i < count; i++)
156 WcWord *word = wc_word_vector_get_word (word_vector, i);
158 if (word->type != WC_WORDTYPE_DELETED)
/* The property value depends on whether the word type is DICTIONARY,
   WORDUNIT or JOINED; the two alternative values are on a line not
   visible in this listing.  */
160 MSymbol val = ((word->type == WC_WORDTYPE_DICTIONARY
161 || word->type == WC_WORDTYPE_WORDUNIT
162 || word->type == WC_WORDTYPE_JOINED)
165 = mtext_property (Mthai_wordseg, val,
166 MTEXTPROP_VOLATILE_WEAK | MTEXTPROP_NO_MERGE);
/* The word covers WORD->len characters starting at FROM.  */
168 mtext_attach_property (mt, from, from + word->len, this);
169 if (pos >= from && pos < from + word->len)
171 M17N_OBJECT_UNREF (this);
178 #else /* HAVE_WORDCUT_OLD */
180 #include <wordcut/wordcut.h>
/* State for the <wordcut/wordcut.h> variant.  WORDCUT_RESULT_USED is
   set to 1 by wordseg_propertize after it calls wordcut_cut, and
   fini_wordseg_library closes WORDCUT_RESULT only when it is set.  */
182 static Wordcut wordcut;
183 static WordcutResult wordcut_result;
184 static int wordcut_result_used;
/* init_wordseg_library for the <wordcut/wordcut.h> variant.  Return 0
   if wordcut_init (called with WORDCUT_TDICT) returns 0, otherwise
   -1.  */
187 init_wordseg_library (void)
189 return (wordcut_init (&wordcut, WORDCUT_TDICT) == 0 ? 0 : -1);
/* fini_wordseg_library for the <wordcut/wordcut.h> variant.  Close
   WORDCUT_RESULT if wordseg_propertize has marked it as used, clear
   the flag, and close WORDCUT.  */
193 fini_wordseg_library (void)
195 if (wordcut_result_used)
197 wordcut_result_close (&wordcut_result);
198 wordcut_result_used = 0;
200 wordcut_close (&wordcut);
/* Segment the byte sequence TIS with wordcut_cut and attach
   Mthai_wordseg text properties to MT.  Each word reported in
   WORDCUT_RESULT gets a property with value Mt; any gap between the
   end of the previous word and the start of the next gets a property
   with value Mnil.  POS is the position the caller is interested in;
   each attached range is tested against it.  PROP starts out as NULL.
   wordcut_cut is given no length, so TIS has to be terminated by the
   caller.
   NOTE(review): the statements executed when a range contains POS and
   the final return are not visible in this listing.  */
204 static MTextProperty *
205 wordseg_propertize (MText *mt, int pos, int from, int to, unsigned char *tis)
208 MTextProperty *prop = NULL;
210 wordcut_cut (&wordcut, (char *) tis, &wordcut_result);
/* Tell fini_wordseg_library that WORDCUT_RESULT needs closing.  */
211 wordcut_result_used = 1;
/* LAST is the position just after the previously handled word.  */
212 for (i = 0, last = from; i < wordcut_result.count; i++)
/* A gap precedes word I.  Its property is created with
   MTEXTPROP_VOLATILE_WEAK only, without MTEXTPROP_NO_MERGE.  */
216 if (last < from + wordcut_result.start[i])
218 this = mtext_property (Mthai_wordseg, Mnil, MTEXTPROP_VOLATILE_WEAK);
219 mtext_attach_property (mt, last, from + wordcut_result.start[i],
221 if (pos >= last && pos < from + wordcut_result.start[i])
223 M17N_OBJECT_UNREF (this);
/* Word I itself: start[i] is used as its offset from FROM and
   offset[i] as its length.  */
226 this = mtext_property (Mthai_wordseg, Mt,
227 MTEXTPROP_VOLATILE_WEAK | MTEXTPROP_NO_MERGE);
228 last = from + wordcut_result.start[i];
229 mtext_attach_property (mt, last, last + wordcut_result.offset[i], this);
230 if (pos >= last && pos < last + wordcut_result.offset[i])
/* NOTE(review): every other call site in this file uses the
   M17N_OBJECT_UNREF macro; confirm the function form is intended
   here.  */
232 m17n_object_unref (this);
233 last += wordcut_result.offset[i];
238 #endif /* not HAVE_LIBTHAI, HAVE_WORDCUT nor HAVE_WORDCUT_OLD */
/* Word segmentation function registered for the THAI_BEG..THAI_END
   range by mtext__word_thai_init.  Look up the Mthai_wordseg property
   at POS in MT; where none can be used, scan the surrounding run of
   Thai characters, convert it to bytes and let wordseg_propertize
   attach the properties.  Return nonzero if the value of the property
   finally chosen is Mt.
   NOTE(review): the statements that store results through FROM and
   TO, and several conditions, are not visible in this listing.  */
241 thai_wordseg (MText *mt, int pos, int *from, int *to)
243 int len = mtext_nchars (mt);
244 /* TIS620 code sequence. */
250 /* It is assured that there's a Thai character at POS. */
251 prop = mtext_get_property (mt, pos, Mthai_wordseg);
/* Take the boundaries from the property found at POS and look at the
   characters just outside them: the one at BEG - 1 and the one at END
   are tested for lying outside the Thai range.  The first halves of
   both conditions are not visible in this listing.  */
254 beg = MTEXTPROP_START (prop);
256 && ((c = mtext_ref_char (mt, beg - 1)) < THAI_BEG || c > THAI_END))
258 end = MTEXTPROP_END (prop);
260 && ((c = mtext_ref_char (mt, end)) < THAI_BEG || c > THAI_END))
/* Scan backward and forward from POS, testing each neighbouring
   character for lying outside the Thai range, to delimit the run
   BEG..END around POS.  */
267 for (beg = pos; beg > 0; beg--)
268 if ((c = mtext_ref_char (mt, beg - 1)) < THAI_BEG || c > THAI_END)
270 for (end = pos + 1; end < len; end++)
271 if ((c = mtext_ref_char (mt, end)) < THAI_BEG || c > THAI_END)
/* NOTE(review): the buffer is stack-allocated and its size grows with
   the length of the Thai run.  */
274 /* One extra byte for the terminating zero. */
275 tis = alloca ((end - beg) + 1);
/* Convert each character to one byte: THAI_BEG maps to 0xA1 and the
   following character codes map to the following byte values.  */
277 for (i = beg; i < end; i++)
278 tis[i - beg] = 0xA1 + (mtext_ref_char (mt, i) - THAI_BEG);
280 prop = wordseg_propertize (mt, pos, beg, end, tis);
/* Keep the start of the returned property only if it lies strictly
   after the start of the scanned run or is position 0; otherwise
   store -1.  The end is handled the same way against the end of the
   run and LEN.  */
281 i = MTEXTPROP_START (prop);
282 beg = (i > beg || i == 0) ? i : -1;
283 i = MTEXTPROP_END (prop);
284 end = (i < end || i == len) ? i : -1;
291 return (MTEXTPROP_VAL (prop) == Mt);
294 #endif /* HAVE_THAI_WORDSEG */
/* When HAVE_THAI_WORDSEG is defined and the library has not been
   initialized yet: call init_wordseg_library and test its result for
   being negative, set the initialized flag, create the Mthai_wordseg
   symbol and register thai_wordseg in wordseg_func_table for the
   THAI_BEG..THAI_END range.
   NOTE(review): the statement taken when init_wordseg_library fails
   and the function's return statements are not visible in this
   listing.  */
300 mtext__word_thai_init ()
302 #ifdef HAVE_THAI_WORDSEG
303 if (! wordseg_library_initialized)
305 if (init_wordseg_library () < 0)
307 wordseg_library_initialized = 1;
/* The symbol name starts with a space character.  */
308 Mthai_wordseg = msymbol (" wordcut-wordseg");
310 mchartable_set_range (wordseg_func_table, THAI_BEG, THAI_END,
311 (void *) thai_wordseg);
/* When HAVE_THAI_WORDSEG is defined and the library has been
   initialized: call fini_wordseg_library and clear the initialized
   flag, so that a later mtext__word_thai_init initializes it again.  */
317 mtext__word_thai_fini ()
319 #ifdef HAVE_THAI_WORDSEG
320 if (wordseg_library_initialized)
322 fini_wordseg_library ();
323 wordseg_library_initialized = 0;
329 #endif /* !FOR_DOXYGEN || DOXYGEN_INTERNAL_MODULE */