1 /* word-thai.c -- Find a word segment in Thai text.
3 National Institute of Advanced Industrial Science and Technology (AIST)
4 Registration Number H15PRO112
6 This file is part of the m17n library.
8 The m17n library is free software; you can redistribute it and/or
9 modify it under the terms of the GNU Lesser General Public License
10 as published by the Free Software Foundation; either version 2.1 of
11 the License, or (at your option) any later version.
13 The m17n library is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public
19 License along with the m17n library; if not, write to the Free
20 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
27 #include "m17n-core.h"
28 #include "m17n-misc.h"
31 #include "character.h"
/* Forward declarations of the backend-specific helpers.  Each one is
   defined once per word-cutting backend in the conditional sections
   further down this file.  */
34 static int init_wordseg_library (void);
35 static void fini_wordseg_library (void);
36 static MTextProperty *wordseg_propertize (MText *mt, int pos, int from, int to,
/* Inclusive range of character codes handled by this file.  A character
   C is treated as Thai when THAI_BEG <= C <= THAI_END, and the same
   range is registered in wordseg_func_table.  */
39 #define THAI_BEG 0x0E01
40 #define THAI_END 0x0E6F
/* Set to 1 once init_wordseg_library () has succeeded, and reset to 0
   by mtext__word_thai_fini ().  */
42 static int wordseg_library_initialized;
/* Key of the text property that records a segmentation result.  It is
   interned in mtext__word_thai_init ().  */
43 static MSymbol Mthai_wordseg;
47 #include <wordcut/wcwordcut.h>
/* State of the wordcut backend: the cutter object handed to the
   wc_wordcut_* functions, and the word vector that receives the result
   of each cut.  */
49 static WcWordcut wordcut;
50 static WcWordVector *word_vector;
/* Initialize the wordcut backend by initializing the file-level cutter
   object.  The caller treats a negative return value as failure.
   NOTE(review): the return statement is not visible in this view.  */
53 init_wordseg_library (void)
55 wc_wordcut_init (&wordcut);
/* Release the wordcut backend: delete the shared word vector and
   destroy the cutter object.
   NOTE(review): no guard on word_vector is visible in this view --
   confirm it cannot be NULL here when no text was ever segmented.  */
60 fini_wordseg_library (void)
63 wc_word_vector_delete (word_vector);
64 wc_wordcut_destroy (&wordcut);
/* Cut the byte sequence TIS with the wordcut library and attach one
   Mthai_wordseg text property to MT for each resulting word.

   MT   -- the text that receives the properties.
   POS  -- the character position the caller is interested in.
   FROM -- start position in MT of the text that TIS represents.
   TO   -- end position; TO - FROM is passed to the cutter as the length.
   TIS  -- the byte sequence to cut (the caller builds it as a TIS620
           code sequence).

   Returns an MTextProperty pointer; PROP starts out as NULL.
   NOTE(review): several lines of this function are not visible in this
   view (the condition selecting between creating and re-initializing
   the word vector, the result of the conditional expression, the
   assignment to PROP, the advance of FROM, and the return).  */
68 static MTextProperty *
69 wordseg_propertize (MText *mt, int pos, int from, int to, unsigned char *tis)
72 MTextProperty *prop = NULL;
/* Prepare the shared word vector: either create it, or destroy and
   re-initialize the existing one.  */
75 word_vector = wc_word_vector_new ();
78 wc_word_vector_destroy (word_vector);
79 wc_word_vector_init (word_vector);
82 wc_wordcut_cut (&wordcut, (gchar *) tis, (gint) (to - from),
84 count = wc_word_vector_get_count (word_vector);
85 for (i = 0; i < count; i++)
87 WcWord *word = wc_word_vector_get_word (word_vector, i);
/* Words marked as deleted get no property.  */
89 if (word->type != WC_WORDTYPE_DELETED)
/* The property value depends on whether the word type is one of
   DICTIONARY, WORDUNIT, or JOINED.  */
91 MSymbol val = ((word->type == WC_WORDTYPE_DICTIONARY
92 || word->type == WC_WORDTYPE_WORDUNIT
93 || word->type == WC_WORDTYPE_JOINED)
96 = mtext_property (Mthai_wordseg, val,
97 MTEXTPROP_VOLATILE_WEAK | MTEXTPROP_NO_MERGE);
/* The property covers the word->len characters starting at FROM.  */
99 mtext_attach_property (mt, from, from + word->len, this);
/* Does this word contain POS?  */
100 if (pos >= from && pos < from + word->len)
103 M17N_OBJECT_UNREF (this);
110 #elif HAVE_WORDCUT_OLD
112 #include <wordcut/wordcut.h>
/* State of the old wordcut backend: the cutter object, the result
   structure filled by wordcut_cut (), and a flag that is set to 1
   after the first cut so that fini_wordseg_library () knows the
   result must be closed.  */
114 static Wordcut wordcut;
115 static WordcutResult wordcut_result;
116 static int wordcut_result_used;
/* Initialize the old wordcut backend with the WORDCUT_TDICT
   dictionary.  Returns 0 when wordcut_init () returns 0, and -1
   otherwise.  */
119 init_wordseg_library (void)
121 return (wordcut_init (&wordcut, WORDCUT_TDICT) == 0 ? 0 : -1);
/* Release the old wordcut backend.  If a cut result has been produced,
   it is closed and the "used" flag is cleared; the cutter object is
   closed as well.  */
125 fini_wordseg_library (void)
127 if (wordcut_result_used)
129 wordcut_result_close (&wordcut_result);
130 wordcut_result_used = 0;
132 wordcut_close (&wordcut);
/* Cut the byte sequence TIS with the old wordcut library and attach
   Mthai_wordseg text properties to MT.

   MT   -- the text that receives the properties.
   POS  -- the character position the caller is interested in.
   FROM -- start position in MT of the text that TIS represents; the
           start offsets in the result are relative to it.
   TO   -- end position (not referenced by the lines visible here).
   TIS  -- the 0-terminated byte sequence to cut.

   Each word found gets a property whose value is Mt.  Any gap between
   the end of the previous word and the start of the next one gets a
   property whose value is Mnil.

   Returns an MTextProperty pointer; PROP starts out as NULL.
   NOTE(review): the assignments to PROP and the return statement are
   not visible in this view.  */
136 static MTextProperty *
137 wordseg_propertize (MText *mt, int pos, int from, int to, unsigned char *tis)
140 MTextProperty *prop = NULL;
142 wordcut_cut (&wordcut, (char *) tis, &wordcut_result);
/* Remember that wordcut_result now holds data that must be closed.  */
143 wordcut_result_used = 1;
/* LAST is the position just after the most recently handled word.  */
144 for (i = 0, last = from; i < wordcut_result.count; i++)
/* There is a gap before word I: cover it with an Mnil property.  */
148 if (last < from + wordcut_result.start[i])
150 this = mtext_property (Mthai_wordseg, Mnil,
151 MTEXTPROP_VOLATILE_WEAK | MTEXTPROP_NO_MERGE);
152 mtext_attach_property (mt, last, from + wordcut_result.start[i],
154 if (pos >= last && pos < from + wordcut_result.start[i])
157 M17N_OBJECT_UNREF (this);
/* Cover word I itself with an Mt property; offset[i] is used as the
   length of the word.  */
160 this = mtext_property (Mthai_wordseg, Mt,
161 MTEXTPROP_VOLATILE_WEAK | MTEXTPROP_NO_MERGE);
162 last = from + wordcut_result.start[i];
163 mtext_attach_property (mt, last, last + wordcut_result.offset[i], this);
164 if (pos >= last && pos < last + wordcut_result.offset[i])
/* NOTE(review): this calls the function m17n_object_unref, while the
   gap case above uses the macro M17N_OBJECT_UNREF -- confirm the
   difference is intended.  */
167 m17n_object_unref (this);
168 last += wordcut_result.offset[i];
173 #else /* not HAVE_WORDCUT nor HAVE_WORDCUT_OLD */
/* Fallback used when neither wordcut library is available.
   NOTE(review): the body is not visible in this view.  */
176 init_wordseg_library (void)
/* Fallback used when neither wordcut library is available.
   NOTE(review): the body is not visible in this view.  */
182 fini_wordseg_library (void)
/* Fallback used when neither wordcut library is available.  Takes the
   same arguments as the library-backed versions.
   NOTE(review): the body is not visible in this view.  */
187 static MTextProperty *
188 wordseg_propertize (MText *mt, int pos, int from, int to, unsigned char *tis)
194 #endif /* not HAVE_WORDCUT nor HAVE_WORDCUT_OLD */
/* Word-segment function registered in wordseg_func_table for the
   characters THAI_BEG..THAI_END.

   MT   -- the text to examine.
   POS  -- the character position of interest.
   FROM, TO -- in/out.  On entry *FROM and *TO bound the region that is
           scanned; they are later overwritten with the start and end
           of the text property found for POS.

   The function looks up an existing Mthai_wordseg property at POS.
   It also collects the run of Thai characters around POS as a
   0-terminated TIS620 byte sequence and passes it to
   wordseg_propertize ().  IN_WORD is then set according to whether the
   property value is Mt.
   NOTE(review): several lines are not visible in this view (the
   declarations, the handling of POS >= mtext_nchars (MT), the branch
   taken when a property already exists, what happens when a non-Thai
   character is met, and the return statement).  */
197 thai_wordseg (MText *mt, int pos, int *from, int *to)
199 /* TIS620 code sequence. */
204 if (pos >= mtext_nchars (mt))
210 prop = mtext_get_property (mt, pos, Mthai_wordseg);
217 /* One extra byte is for the terminating 0. */
/* NOTE(review): the buffer is allocated on the stack and sized by the
   caller-supplied range -- confirm that the range is bounded.  */
218 tis = alloca ((*to - *from) + 1);
/* Scan backward from POS, converting each Thai character to its byte
   code (THAI_BEG maps to 0xA1).  TIS is indexed relative to *FROM.  */
220 for (beg = pos; beg > *from; beg--)
222 if ((c = mtext_ref_char (mt, beg - 1)) < THAI_BEG || c > THAI_END)
224 tis[beg - 1 - *from] = 0xA1 + (c - THAI_BEG);
/* Scan forward from POS in the same way.  */
226 for (end = pos; end < *to; end++)
228 if ((c = mtext_ref_char (mt, end)) < THAI_BEG || c > THAI_END)
230 tis[end - *from] = 0xA1 + (c - THAI_BEG);
239 /* Terminate the sequence with 0. */
240 tis[end - *from] = 0;
/* Segment the run BEG..END; the bytes for it start at TIS + (BEG - *FROM).  */
241 prop = wordseg_propertize (mt, pos, beg, end, tis + (beg - *from));
/* Report the extent of the property, and whether it marks a word.  */
244 *from = MTEXTPROP_START (prop);
245 *to = MTEXTPROP_END (prop);
246 in_word = MTEXTPROP_VAL (prop) == Mt;
247 M17N_OBJECT_UNREF (prop);
/* Set up Thai word segmentation.  If the backend is not yet
   initialized, init_wordseg_library () is called, the initialized flag
   is set, and the Mthai_wordseg property key is interned.
   thai_wordseg () is registered in wordseg_func_table for the
   characters THAI_BEG..THAI_END.
   NOTE(review): the handling of a failed init_wordseg_library () and
   the return statement are not visible in this view.  */
255 mtext__word_thai_init ()
257 if (! wordseg_library_initialized)
/* A negative value from the backend means initialization failed.  */
259 if (init_wordseg_library () < 0)
261 wordseg_library_initialized = 1;
262 Mthai_wordseg = msymbol (" wordcut-wordseg");
264 mchartable_set_range (wordseg_func_table, THAI_BEG, THAI_END,
265 (void *) thai_wordseg);
/* Shut down Thai word segmentation.  If the backend was initialized,
   it is finalized and the initialized flag is reset, so that a later
   mtext__word_thai_init () initializes it again.  */
270 mtext__word_thai_fini ()
272 if (wordseg_library_initialized)
274 fini_wordseg_library ();
275 wordseg_library_initialized = 0;