1 /* mtext-wseg.c -- word segmentation
2 Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010
3 National Institute of Advanced Industrial Science and Technology (AIST)
4 Registration Number H15PRO112
6 This file is part of the m17n library.
8 The m17n library is free software; you can redistribute it and/or
9 modify it under the terms of the GNU Lesser General Public License
10 as published by the Free Software Foundation; either version 2.1 of
11 the License, or (at your option) any later version.
13 The m17n library is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public
19 License along with the m17n library; if not, write to the Free
20 Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
23 #if !defined (FOR_DOXYGEN) || defined (DOXYGEN_INTERNAL_MODULE)
24 /*** @addtogroup m17nInternal
31 #include "m17n-core.h"
32 #include "m17n-misc.h"
35 #include "character.h"
/* Descriptor for one word-segmentation backend.  This listing shows
   only the WSEG and NEXT members; other code in this file also reads
   and writes INIT, FINI and INITIALIZED members of the same struct.  */
37 typedef struct _MWordseg_Function MWordseg_Function;
39 struct _MWordseg_Function
/* Segmentation entry point.  Receives the text, a position in it, two
   output slots, and the descriptor through which it was reached.  */
44 int (*wseg) (MText *mt, int pos, int *from, int *to,
45 MWordseg_Function *wordseg);
/* Link to the next descriptor on wordseg_function_list.  */
46 MWordseg_Function *next;
/* Head of the singly linked list of every descriptor allocated by
   mtext__word_segment; new descriptors are pushed at the front.  */
49 static MWordseg_Function *wordseg_function_list;
/* Char-table mapping a character to the descriptor that handles it.
   Created on the first call of mtext__word_segment.  */
51 static MCharTable *wordseg_function_table;
/* Fallback segmenter, installed for the whole character range by
   mtext__word_segment.

   The character at POS is classified by the first letter of its
   Mcategory property name: 'L', 'M' or 'N' means "inside a word".
   The two loops then walk outwards from POS, testing each neighbour
   for (a) a different classification, or (b) being mapped in
   wordseg_function_table to a descriptor other than WORDSEG.

   NOTE(review): the statements controlled by those tests, the stores
   through FROM and TO, and the return are not visible in this listing;
   presumably either test ends the scan -- confirm against the full
   file.  */
54 generic_wordseg (MText *mt, int pos, int *from, int *to,
55 MWordseg_Function *wordseg)
57 int len = mtext_nchars (mt);
58 int c = mtext_ref_char (mt, pos);
59 MSymbol category = mchar_get_prop (c, Mcategory);
60 char cathead = msymbol_name (category)[0];
61 int in_word = (cathead == 'L' || cathead == 'M' || cathead == 'N');
/* Backward scan: each step examines the character just before BEG.  */
64 for (beg = pos; beg > 0; beg--)
66 c = mtext_ref_char (mt, beg - 1);
67 category = mchar_get_prop (c, Mcategory);
68 cathead = msymbol_name (category)[0];
69 if (in_word != (cathead == 'L' || cathead == 'M' || cathead == 'N'))
71 if (mchartable_lookup (wordseg_function_table, c) != wordseg)
/* Forward scan: each step examines the character at END.  */
74 for (end = pos; end < len; end++)
76 c = mtext_ref_char (mt, end);
77 category = mchar_get_prop (c, Mcategory);
78 cathead = msymbol_name (category)[0];
79 if (in_word != (cathead == 'L' || cathead == 'M' || cathead == 'N'))
81 if (mchartable_lookup (wordseg_function_table, c) != wordseg)
91 #ifdef HAVE_THAI_WORDSEG
/* Inclusive code point range that mtext__word_segment routes to the
   Thai segmenter and that thai_wordseg treats as Thai text.  */
93 #define THAI_BEG 0x0E01
94 #define THAI_END 0x0E6F
/* Key of the text property that the propertize functions attach and
   that thai_wordseg looks up.  Assigned in mtext__word_segment.  */
96 static MSymbol M_thai_wordseg;
98 /* We have libthai, wordcut, or wordcut-old. Each of them provides
99 the following three functions. */
101 static int thai_wordseg_init (void);
102 static void thai_wordseg_fini (void);
103 static MTextProperty *thai_wordseg_propertize (MText *mt, int pos,
109 #include <thai/thbrk.h>
/* Init hook of the variant that follows the <thai/thbrk.h> include.
   NOTE(review): the body is not visible in this listing.  */
112 thai_wordseg_init (void)
/* Fini hook of the variant that follows the <thai/thbrk.h> include.
   NOTE(review): the body is not visible in this listing.  */
118 thai_wordseg_fini (void)
/* th_brk-based propertize: asks th_brk for break positions in TIS and
   attaches one M_thai_wordseg property (value Mt) to each resulting
   piece of MT between FROM and TO.  The values in BREAKS are used as
   offsets relative to FROM.

   NOTE(review): the rest of the parameter list, the definition of LEN
   and the return statement are not visible in this listing;
   presumably PROP ends up as the property covering POS -- confirm.  */
123 static MTextProperty *
124 thai_wordseg_propertize (MText *mt, int pos, int from, int to,
128 int *breaks = alloca ((sizeof (int)) * len);
129 int count = th_brk ((thchar_t *) tis, breaks, len);
130 MTextProperty *prop = NULL;
/* One property over the whole FROM..TO range.  NOTE(review): the
   guarding condition is not visible here.  */
134 prop = mtext_property (M_thai_wordseg, Mt,
135 MTEXTPROP_VOLATILE_WEAK | MTEXTPROP_NO_MERGE);
136 mtext_attach_property (mt, from, to, prop);
137 M17N_OBJECT_UNREF (prop);
/* One property per piece that ends at a reported break.  */
144 for (i = 0, last = from; i < count; i++)
146 this = mtext_property (M_thai_wordseg, Mt,
147 MTEXTPROP_VOLATILE_WEAK | MTEXTPROP_NO_MERGE);
148 mtext_attach_property (mt, last, from + breaks[i], this);
149 if (pos >= last && pos < from + breaks[i])
151 M17N_OBJECT_UNREF (this);
152 last = from + breaks[i];
/* Trailing piece from the last break up to TO.  */
156 this = mtext_property (M_thai_wordseg, Mt,
157 MTEXTPROP_VOLATILE_WEAK | MTEXTPROP_NO_MERGE);
158 mtext_attach_property (mt, last, to, this);
159 if (pos >= last && pos < to)
161 M17N_OBJECT_UNREF (this);
172 #include <wordcut/wcwordcut.h>
/* File-scope state of the <wordcut/wcwordcut.h> variant: the segmenter
   handle, and the result vector that thai_wordseg_propertize fills and
   thai_wordseg_fini deletes.  */
174 static WcWordcut wordcut;
175 static WcWordVector *word_vector;
/* Init hook: initialises the file-scope WORDCUT handle.
   NOTE(review): the return statement is not visible in this listing.  */
178 thai_wordseg_init (void)
180 wc_wordcut_init (&wordcut);
/* Fini hook: deletes WORD_VECTOR and destroys the WORDCUT handle.
   NOTE(review): any guard around the delete (WORD_VECTOR starts out
   NULL as a zero-initialised static) is not visible in this listing.  */
185 thai_wordseg_fini (void)
188 wc_word_vector_delete (word_vector);
189 wc_wordcut_destroy (&wordcut);
/* wc_wordcut_cut-based propertize: cuts TIS (TO - FROM bytes) into
   WORD_VECTOR and attaches an M_thai_wordseg property for every word
   whose type is not WC_WORDTYPE_DELETED.

   NOTE(review): the rest of the parameter list, the advance of FROM
   between words and the return statement are not visible in this
   listing.  */
193 static MTextProperty *
194 thai_wordseg_propertize (MText *mt, int pos, int from, int to,
198 MTextProperty *prop = NULL;
/* WORD_VECTOR is either created here or destroyed and re-initialised
   for reuse.  NOTE(review): the selecting condition is not visible.  */
201 word_vector = wc_word_vector_new ();
204 wc_word_vector_destroy (word_vector);
205 wc_word_vector_init (word_vector);
208 wc_wordcut_cut (&wordcut, (gchar *) tis, (gint) (to - from),
210 count = wc_word_vector_get_count (word_vector);
211 for (i = 0; i < count; i++)
213 WcWord *word = wc_word_vector_get_word (word_vector, i);
215 if (word->type != WC_WORDTYPE_DELETED)
/* The property value depends on whether the word type is DICTIONARY,
   WORDUNIT or JOINED; the two candidate values are not visible in
   this listing (presumably Mt and Mnil -- confirm).  */
217 MSymbol val = ((word->type == WC_WORDTYPE_DICTIONARY
218 || word->type == WC_WORDTYPE_WORDUNIT
219 || word->type == WC_WORDTYPE_JOINED)
222 = mtext_property (M_thai_wordseg, val,
223 MTEXTPROP_VOLATILE_WEAK | MTEXTPROP_NO_MERGE);
225 mtext_attach_property (mt, from, from + word->len, this);
226 if (pos >= from && pos < from + word->len)
228 M17N_OBJECT_UNREF (this);
235 #else /* HAVE_WORDCUT_OLD */
237 #include <wordcut/wordcut.h>
/* File-scope state of the <wordcut/wordcut.h> variant.
   WORDCUT_RESULT_USED is set once wordcut_cut has filled
   WORDCUT_RESULT, so that thai_wordseg_fini knows to close it.  */
239 static Wordcut wordcut;
240 static WordcutResult wordcut_result;
241 static int wordcut_result_used;
/* Init hook: initialises WORDCUT with WORDCUT_TDICT.  Returns 0 when
   wordcut_init returns 0, and -1 otherwise.  */
244 thai_wordseg_init (void)
246 return (wordcut_init (&wordcut, WORDCUT_TDICT) == 0 ? 0 : -1);
/* Fini hook: closes WORDCUT_RESULT if it has been used (and clears the
   flag), then closes the WORDCUT handle.  */
250 thai_wordseg_fini (void)
252 if (wordcut_result_used)
254 wordcut_result_close (&wordcut_result);
255 wordcut_result_used = 0;
257 wordcut_close (&wordcut);
/* wordcut_cut-based propertize: cuts TIS into WORDCUT_RESULT and walks
   the reported words.  START[i] is used as an offset relative to FROM
   and OFFSET[i] as the length of word I.  A gap between the end of the
   previous word and the start of the next one gets a property with
   value Mnil; each word gets a property with value Mt.

   NOTE(review): the rest of the parameter list, any handling of text
   after the last word, and the return statement are not visible in
   this listing.  */
261 static MTextProperty *
262 thai_wordseg_propertize (MText *mt, int pos, int from, int to,
266 MTextProperty *prop = NULL;
268 wordcut_cut (&wordcut, (char *) tis, &wordcut_result);
269 wordcut_result_used = 1;
270 for (i = 0, last = from; i < wordcut_result.count; i++)
/* Gap before word I.  Unlike the word properties, this one is created
   without MTEXTPROP_NO_MERGE.  */
274 if (last < from + wordcut_result.start[i])
276 this = mtext_property (M_thai_wordseg, Mnil,
277 MTEXTPROP_VOLATILE_WEAK);
278 mtext_attach_property (mt, last, from + wordcut_result.start[i],
280 if (pos >= last && pos < from + wordcut_result.start[i])
282 M17N_OBJECT_UNREF (this);
/* Word I itself.  */
285 this = mtext_property (M_thai_wordseg, Mt,
286 MTEXTPROP_VOLATILE_WEAK | MTEXTPROP_NO_MERGE);
287 last = from + wordcut_result.start[i];
288 mtext_attach_property (mt, last, last + wordcut_result.offset[i], this);
289 if (pos >= last && pos < last + wordcut_result.offset[i])
/* NOTE(review): this calls the function m17n_object_unref, whereas
   every other release in this file uses the M17N_OBJECT_UNREF macro
   -- confirm the difference is intentional.  */
291 m17n_object_unref (this);
292 last += wordcut_result.offset[i];
297 #endif /* not HAVE_LIBTHAI, HAVE_WORDCUT nor HAVE_WORDCUT_OLD */
/* Thai segmenter, installed for THAI_BEG..THAI_END by
   mtext__word_segment.

   Looks up the M_thai_wordseg property at POS.  The segmentation path
   scans outwards from POS over characters inside THAI_BEG..THAI_END,
   converts the span BEG..END to TIS-620 codes and hands it to
   thai_wordseg_propertize.  *FROM and *TO are taken from the start and
   end of the property, and the return value is nonzero exactly when
   the property value is Mt.

   NOTE(review): the condition choosing between the existing property
   and re-segmentation, the loop exits, and any guards around the
   stores through FROM and TO are not visible in this listing.  */
300 thai_wordseg (MText *mt, int pos, int *from, int *to,
301 MWordseg_Function *wordseg)
305 /* It is assured that there's a Thai character at POS. */
306 prop = mtext_get_property (mt, pos, M_thai_wordseg);
309 /* TIS620 code sequence. */
311 int len = mtext_nchars (mt);
315 for (beg = pos; beg > 0; beg--)
316 if ((c = mtext_ref_char (mt, beg - 1)) < THAI_BEG || c > THAI_END)
318 for (end = pos + 1; end < len; end++)
319 if ((c = mtext_ref_char (mt, end)) < THAI_BEG || c > THAI_END)
322 /* Extra 1-byte for 0 terminating. */
323 tis = alloca ((end - beg) + 1);
/* NOTE(review): 0xA1 + (c - THAI_BEG) exceeds 0xFF for c >= 0x0E60,
   yet THAI_END is 0x0E6F.  If TIS is a byte buffer, as its allocation
   suggests, 0x0E60 would be stored as 0 and look like the terminator
   -- confirm whether THAI_END should be lower.  */
325 for (i = beg; i < end; i++)
326 tis[i - beg] = 0xA1 + (mtext_ref_char (mt, i) - THAI_BEG);
328 prop = thai_wordseg_propertize (mt, pos, beg, end, tis);
332 *from = MTEXTPROP_START (prop);
334 *to = MTEXTPROP_END (prop);
335 return (MTEXTPROP_VAL (prop) == Mt);
338 #endif /* HAVE_THAI_WORDSEG */
/* Tear-down of the segmenter registry (the enclosing function header
   is not visible in this listing).  For every descriptor on
   wordseg_function_list: call its FINI hook if INITIALIZED is positive
   and a hook is set, then free the descriptor.  Afterwards release
   wordseg_function_table.  */
346 if (wordseg_function_list)
348 while (wordseg_function_list)
/* Save the link before the node is freed.  */
350 MWordseg_Function *next = wordseg_function_list->next;
352 if (wordseg_function_list->initialized > 0
353 && wordseg_function_list->fini)
354 wordseg_function_list->fini ();
355 free (wordseg_function_list);
356 wordseg_function_list = next;
358 M17N_OBJECT_UNREF (wordseg_function_table);
362 /* Find word boundaries around POS of MT. Set *FROM to the word
363 boundary position at or previous to POS, and update *TO to the word
364 boundary position after POS.
367 If word boundaries were found successfully, return 1 (if
368 the character at POS is a part of a word) or 0 (otherwise). If the
369 operation was not successful, return -1 without setting *FROM and
/* Dispatcher: builds the segmenter registry on the first call, then
   forwards to the segmenter registered for the character at POS.

   INITIALIZED of a descriptor is set to 1 once the descriptor is
   ready, and to -1 when its INIT hook returned a negative value;
   descriptors with a negative INITIALIZED are never called.

   NOTE(review): the return taken when no usable descriptor exists is
   not visible in this listing.  */
373 mtext__word_segment (MText *mt, int pos, int *from, int *to)
375 int c = mtext_ref_char (mt, pos);
376 MWordseg_Function *wordseg;
/* First call: create the table and register generic_wordseg for the
   whole character range.  */
378 if (! wordseg_function_table)
380 wordseg_function_table = mchartable (Mnil, NULL);
382 MSTRUCT_CALLOC (wordseg, MERROR_MTEXT);
383 wordseg->wseg = generic_wordseg;
384 wordseg->next = wordseg_function_list;
385 wordseg_function_list = wordseg;
386 mchartable_set_range (wordseg_function_table, 0, MCHAR_MAX, wordseg);
/* Then register the Thai segmenter for THAI_BEG..THAI_END, after the
   full-range default above.  */
388 #ifdef HAVE_THAI_WORDSEG
389 MSTRUCT_CALLOC (wordseg, MERROR_MTEXT);
390 wordseg->init = thai_wordseg_init;
391 wordseg->fini = thai_wordseg_fini;
392 wordseg->wseg = thai_wordseg;
393 wordseg->next = wordseg_function_list;
394 wordseg_function_list = wordseg;
395 mchartable_set_range (wordseg_function_table, THAI_BEG, THAI_END,
/* The leading space is part of the symbol name.  */
397 M_thai_wordseg = msymbol (" thai-wordseg");
/* Dispatch on the character at POS, running the INIT hook the first
   time a descriptor is used.  */
401 wordseg = mchartable_lookup (wordseg_function_table, c);
402 if (wordseg && wordseg->initialized >= 0)
404 if (! wordseg->initialized)
407 && wordseg->init () < 0)
409 wordseg->initialized = -1;
412 wordseg->initialized = 1;
414 return wordseg->wseg (mt, pos, from, to, wordseg);
420 #endif /* !FOR_DOXYGEN || DOXYGEN_INTERNAL_MODULE */