1 /* word-thai.c -- Find a word segment in Thai text.
3 National Institute of Advanced Industrial Science and Technology (AIST)
4 Registration Number H15PRO112
6 This file is part of the m17n library.
8 The m17n library is free software; you can redistribute it and/or
9 modify it under the terms of the GNU Lesser General Public License
10 as published by the Free Software Foundation; either version 2.1 of
11 the License, or (at your option) any later version.
13 The m17n library is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public
19 License along with the m17n library; if not, write to the Free
20 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
27 #include "m17n-core.h"
28 #include "m17n-misc.h"
31 #include "character.h"
/* Backend hooks for the word-segmentation library.  They are declared
   here and defined once in each conditional-compilation branch of
   this file.  */
34 static int init_wordseg_library (void);
35 static void fini_wordseg_library (void);
36 static MTextProperty *wordseg_propertize (MText *mt, int pos, int from, int to,
/* First and last code points of the range this file treats as Thai;
   both ends are inclusive in the range tests that use them.  */
39 #define THAI_BEG 0x0E01
40 #define THAI_END 0x0E6F
/* Set to 1 by mtext__word_thai_init () and back to 0 by
   mtext__word_thai_fini ().  */
42 static int wordseg_library_initialized;
/* Text property key under which segmentation results are attached to,
   and looked up from, an M-text.  */
43 static MSymbol Mthai_wordseg;
47 #include <wordcut/wcwordcut.h>
/* Wordcut engine state, set up by wc_wordcut_init () and released by
   wc_wordcut_destroy ().  */
49 static WcWordcut wordcut;
/* Word vector that wordseg_propertize () reads segments from; created
   with wc_word_vector_new () and deleted in fini_wordseg_library ().  */
50 static WcWordVector *word_vector;
/* Initialize the wordcut engine stored in the file-level `wordcut'.  */
53 init_wordseg_library (void)
55 wc_wordcut_init (&wordcut);
/* Release the resources of this backend: delete the word vector, then
   destroy the wordcut engine.  */
60 fini_wordseg_library (void)
63 wc_word_vector_delete (word_vector);
64 wc_wordcut_destroy (&wordcut);
/* Segment a run of text with the wordcut library and attach one
   Mthai_wordseg text property to MT for each resulting word that is
   not of type WC_WORDTYPE_DELETED.

   MT is the M-text receiving the properties.  POS is the position
   tested against each attached range.  FROM is the start position
   used when attaching a property, and TO - FROM is the length handed
   to wc_wordcut_cut ().  TIS is the byte buffer that is cut.

   NOTE(review): presumably the property whose range contains POS is
   what gets returned -- confirm against the full function body.  */
68 static MTextProperty *
69 wordseg_propertize (MText *mt, int pos, int from, int to, unsigned char *tis)
/* Result slot; starts out NULL.  */
72 MTextProperty *prop = NULL;
/* Allocate a fresh word vector.  */
75 word_vector = wc_word_vector_new ();
/* Reset a word vector in place by destroying its contents and
   initializing it again.  NOTE(review): presumably only one of the
   allocate/reset paths runs per call -- confirm.  */
78 wc_word_vector_destroy (word_vector);
79 wc_word_vector_init (word_vector);
/* Cut the TO - FROM bytes at TIS into words.  */
82 wc_wordcut_cut (&wordcut, (gchar *) tis, (gint) (to - from),
/* Visit every word recorded in the word vector.  */
84 count = wc_word_vector_get_count (word_vector);
85 for (i = 0; i < count; i++)
87 WcWord *word = wc_word_vector_get_word (word_vector, i);
/* Words of type WC_WORDTYPE_DELETED get no property.  */
89 if (word->type != WC_WORDTYPE_DELETED)
/* The property value depends on whether the word is a dictionary,
   word-unit or joined word as opposed to any other type.  The
   property itself is created with the MTEXTPROP_VOLATILE_WEAK and
   MTEXTPROP_NO_MERGE flags.  */
91 MSymbol val = ((word->type == WC_WORDTYPE_DICTIONARY
92 || word->type == WC_WORDTYPE_WORDUNIT
93 || word->type == WC_WORDTYPE_JOINED)
96 = mtext_property (Mthai_wordseg, val,
97 MTEXTPROP_VOLATILE_WEAK | MTEXTPROP_NO_MERGE);
/* Attach the property to the range [from, from + word->len).  */
99 mtext_attach_property (mt, from, from + word->len, this);
/* Check whether POS lies inside the range just attached.  */
100 if (pos >= from && pos < from + word->len)
/* Unreference the property object.  */
102 M17N_OBJECT_UNREF (this);
109 #elif HAVE_WORDCUT_OLD
111 #include <wordcut/wordcut.h>
/* Engine state of the old wordcut API; opened by wordcut_init () and
   closed by wordcut_close ().  */
113 static Wordcut wordcut;
/* Output of the most recent wordcut_cut () call.  */
114 static WordcutResult wordcut_result;
/* Set to 1 after wordcut_cut () has filled wordcut_result, so that
   fini_wordseg_library () knows to close it; cleared there.  */
115 static int wordcut_result_used;
/* Initialize the old wordcut engine with the WORDCUT_TDICT dictionary.
   Return 0 if wordcut_init () returns 0, otherwise -1.  */
118 init_wordseg_library (void)
120 return (wordcut_init (&wordcut, WORDCUT_TDICT) == 0 ? 0 : -1);
/* Shut down the old wordcut backend: close the pending result, if one
   was produced, and then close the engine.  */
124 fini_wordseg_library (void)
/* wordcut_result only needs closing after wordcut_cut () has filled
   it.  */
126 if (wordcut_result_used)
128 wordcut_result_close (&wordcut_result);
129 wordcut_result_used = 0;
131 wordcut_close (&wordcut);
/* Segment text with the old wordcut API and attach Mthai_wordseg
   properties to MT: value Mt for each word reported in
   wordcut_result, and value Mnil for any gap between the end of the
   previous range and the start of the next word.

   MT is the M-text receiving the properties.  POS is the position
   tested against each attached range.  FROM is the M-text position
   that the start offsets in wordcut_result are added to.  TIS is the
   byte string handed to wordcut_cut ().

   NOTE(review): presumably the property whose range contains POS is
   what gets returned -- confirm against the full function body.  */
135 static MTextProperty *
136 wordseg_propertize (MText *mt, int pos, int from, int to, unsigned char *tis)
/* Result slot; starts out NULL.  */
139 MTextProperty *prop = NULL;
/* Run the cutter on TIS; the result is stored in wordcut_result.  */
141 wordcut_cut (&wordcut, (char *) tis, &wordcut_result);
/* Record that wordcut_result now holds data to be closed later.  */
142 wordcut_result_used = 1;
/* LAST tracks the end of the range attached so far, starting at
   FROM.  */
143 for (i = 0, last = from; i < wordcut_result.count; i++)
/* If word I starts after LAST, cover the gap [last, from + start[i])
   with a property whose value is Mnil, and check whether POS lies in
   that gap.  */
147 if (last < from + wordcut_result.start[i])
149 this = mtext_property (Mthai_wordseg, Mnil,
150 MTEXTPROP_VOLATILE_WEAK | MTEXTPROP_NO_MERGE);
151 mtext_attach_property (mt, last, from + wordcut_result.start[i],
153 if (pos >= last && pos < from + wordcut_result.start[i])
155 M17N_OBJECT_UNREF (this);
/* Cover word I itself, [from + start[i], from + start[i] + offset[i]),
   with a property whose value is Mt.  */
158 this = mtext_property (Mthai_wordseg, Mt,
159 MTEXTPROP_VOLATILE_WEAK | MTEXTPROP_NO_MERGE);
160 last = from + wordcut_result.start[i];
161 mtext_attach_property (mt, last, last + wordcut_result.offset[i], this);
/* Check whether POS lies inside the word just attached.  */
162 if (pos >= last && pos < last + wordcut_result.offset[i])
/* NOTE(review): this site calls the function m17n_object_unref ()
   while the other unreference sites in this file use the
   M17N_OBJECT_UNREF macro -- confirm the difference is intended.  */
164 m17n_object_unref (this);
/* Advance LAST to the end of word I.  */
165 last += wordcut_result.offset[i];
170 #else /* not HAVE_WORDCUT nor HAVE_WORDCUT_OLD */
/* Definitions of the three backend hooks for builds that have neither
   HAVE_WORDCUT nor HAVE_WORDCUT_OLD.  */
173 init_wordseg_library (void)
179 fini_wordseg_library (void)
184 static MTextProperty *
185 wordseg_propertize (MText *mt, int pos, int from, int to, unsigned char *tis)
191 #endif /* not HAVE_WORDCUT nor HAVE_WORDCUT_OLD */
/* Word-segment function for Thai text; mtext__word_thai_init ()
   registers it in wordseg_func_table for THAI_BEG..THAI_END.

   MT is the text and POS the position of interest.  On entry *FROM
   and *TO bound the region that is scanned around POS; the function
   stores the start and end of a segmentation property back into
   *FROM and *TO.  Characters in the Thai range are converted to
   single bytes, and that byte string is passed to
   wordseg_propertize ().  */
194 thai_wordseg (MText *mt, int pos, int *from, int *to)
196 /* Byte sequence in TIS620 encoding.  */
/* Check POS against the number of characters in MT.  */
201 if (pos >= mtext_nchars (mt))
/* Look for a segmentation property already attached at POS.  */
207 prop = mtext_get_property (mt, pos, Mthai_wordseg);
214 /* One extra byte is reserved for the terminating 0.  */
/* NOTE(review): the buffer is stack-allocated and its size is
   (*to - *from) + 1, i.e. determined by the caller -- confirm the
   region is always small enough for alloca ().  */
215 tis = alloca ((*to - *from) + 1);
/* Scan backward from POS toward *FROM, testing each preceding
   character against the Thai range and storing its byte at the
   character's offset from *FROM.  */
217 for (beg = pos; beg > *from; beg--)
219 if ((c = mtext_ref_char (mt, beg - 1)) < THAI_BEG || c > THAI_END)
/* NOTE(review): 0xA1 + (c - THAI_BEG) is 0x100 at c == 0x0E60 and
   0x10F at THAI_END, which does not fit in 8 bits; if TIS holds
   unsigned char (the type wordseg_propertize () takes) the stored
   byte wraps, giving 0 for 0x0E60 -- confirm the upper part of the
   range is handled as intended.  The same applies to the forward
   scan.  */
221 tis[beg - 1 - *from] = 0xA1 + (c - THAI_BEG);
/* Scan forward from POS toward *TO in the same way.  */
223 for (end = pos; end < *to; end++)
225 if ((c = mtext_ref_char (mt, end)) < THAI_BEG || c > THAI_END)
227 tis[end - *from] = 0xA1 + (c - THAI_BEG);
236 /* Terminate the byte string with 0.  */
237 tis[end - *from] = 0;
/* Segment the characters in [beg, end); the byte for position BEG is
   at offset BEG - *FROM in TIS.  */
238 prop = wordseg_propertize (mt, pos, beg, end, tis + (beg - *from));
/* Report the range of the property, and whether its value is Mt.  */
241 *from = MTEXTPROP_START (prop);
242 *to = MTEXTPROP_END (prop);
243 in_word = MTEXTPROP_VAL (prop) == Mt;
/* Set up Thai word segmentation: initialize the backend library,
   intern the property key symbol, and register thai_wordseg for the
   THAI_BEG..THAI_END range of wordseg_func_table.  */
251 mtext__word_thai_init ()
/* Backend initialization is attempted only while the flag is still
   clear.  */
253 if (! wordseg_library_initialized)
/* Test for a negative result from the backend initializer (the old
   wordcut backend returns -1 when wordcut_init () fails).  */
255 if (init_wordseg_library () < 0)
257 wordseg_library_initialized = 1;
/* Note that the symbol name begins with a space character.  */
258 Mthai_wordseg = msymbol (" wordcut-wordseg");
260 mchartable_set_range (wordseg_func_table, THAI_BEG, THAI_END,
261 (void *) thai_wordseg);
/* Tear down Thai word segmentation: if the backend library was
   initialized, finalize it and clear the flag.  */
266 mtext__word_thai_fini ()
268 if (wordseg_library_initialized)
270 fini_wordseg_library ();
271 wordseg_library_initialized = 0;