+++ /dev/null
-! -*- coding: euc-jp; -*-
-*.fontSet: -etl-fixed-medium-r-normal--24-*-*-*-*,-*-*-medium-r-normal--24-*-*-*-*
-*.international: True
-*.File.label: ¥Õ¥¡¥¤¥ë
-*.Cursor.label: ¥«¡¼¥½¥ë
-*.Bidi.label: ½ñ»úÊý¸þ
-*.LineBreak.label: ¹Ôʬ³ä
-*.InputMethod.label: ÆþÎϥ᥽¥Ã¥É
-*.Face.label: ¥Õ¥§¡¼¥¹
-*.Lang.label: ¸À¸ì
-*.Size.label: ¥µ¥¤¥º
-*.Family.label: ¥Õ¥¡¥ß¥ê¡¼
-*.Style.label: ¥¹¥¿¥¤¥ë
-*.Color.label: ¿§
-*.Misc.label: ¤½¤Î¾
-*.Pop Face.label: ¥Ý¥Ã¥×
-*Abkhazian.label:¥¢¥Ö¥Ï¥º¸ì
-*Afar.label:¥¢¥Õ¥¡¥ë¸ì
-*Afrikaans.label:¥¢¥Õ¥ê¥«¡¼¥ó¥¹¸ì
-*Albanian.label:¥¢¥ë¥Ð¥Ë¥¢¸ì
-*Amharic.label:¥¢¥à¥Ï¥é¸ì
-*Arabic.label:¥¢¥é¥Ó¥¢¸ì
-*Armenian.label:¥¢¥ë¥á¥Ë¥¢¸ì
-*Assamese.label:¥¢¥Ã¥µ¥à¸ì
-*Aymara.label:¥¢¥¤¥Þ¥é¸ì
-*Azerbaijani.label:¥¢¥¼¥ë¥Ð¥¤¥¸¥ã¥ó¸ì
-*Bashkir.label:¥Ð¥·¥å¥¡¼¥ë¸ì
-*Basque.label:¥Ð¥¹¥¯¸ì
-*Bengali.label:¥Ù¥ó¥¬¥ë¸ì
-*Bhutani.label:¥Ö¡¼¥¿¥ó¸ì
-*Bihari.label:¥Ó¥Ï¡¼¥ë¸ì
-*Bislama.label:¥Ó¥¹¥é¥Þ¸ì
-*Breton.label:¥Ö¥ë¥È¥ó¸ì
-*Bulgarian.label:¥Ö¥ë¥¬¥ê¥¢¸ì
-*Burmese.label:¥Ó¥ë¥Þ¸ì
-*Byelorussian.label:Çò¥í¥·¥¢¸ì
-*Cambodian.label:¥«¥ó¥Ü¥¸¥¢¸ì
-*Catalan.label:¥«¥¿¥í¥Ë¥¢¸ì
-*Chinese.label:Ãæ¹ñ¸ì
-*Corsican.label:¥³¥ë¥·¥«¸ì
-*Croatian.label:¥¯¥í¥¢¥Á¥¢¸ì
-*Czech.label:¥Á¥§¥³¸ì
-*Dhivehi.label:¥Ç¥£¥Ù¥Ò¸ì
-*Danish.label:¥Ç¥ó¥Þ¡¼¥¯¸ì
-*Dutch.label:¥ª¥é¥ó¥À¸ì
-*English.label:±Ñ¸ì
-*Esperanto.label:¥¨¥¹¥Ú¥é¥ó¥È
-*Estonian.label:¥¨¥¹¥È¥Ë¥¢¸ì
-*Faeroese.label:¥Õ¥§¡¼¥í¡¼¸ì
-*Farsi.label:¥Ú¥ë¥·¥¢¸ì
-*Fiji.label:¥Õ¥£¥¸¡¼¸ì
-*Finnish.label:¥Õ¥£¥ó¥é¥ó¥É¸ì
-*French.label:¥Õ¥é¥ó¥¹¸ì
-*Frisian.label:¥Õ¥ê¥¸¥¢¸ì
-*Galician.label:¥¬¥ê¥·¥¢¸ì
-*Gaelic(Scottish).label:¥²¡¼¥ë¸ì¡Ê¥¹¥³¥Ã¥È¥é¥ó¥É¡Ë
-*Gaelic(Manx).label:¥²¡¼¥ë¸ì¡Ê¥Þ¥óÅç¡Ë
-*Georgian.label:¥°¥ë¥¸¥¢¸ì
-*German.label:¥É¥¤¥Ä¸ì
-*Greek.label:¥®¥ê¥·¥¢¸ì
-*Greenlandic.label:¥°¥ê¡¼¥ó¥é¥ó¥É¸ì
-*Guarani.label:¥ï¥é¥Ë¡¼¸ì
-*Gujarati.label:¥°¥¸¥ã¥é¡¼¥È¸ì
-*Hausa.label:¥Ï¥¦¥µ¸ì
-*Hebrew.label:¥Ø¥Ö¥é¥¤¸ì
-*Hindi.label:¥Ò¥ó¥Ç¥£¡¼¸ì
-*Hungarian.label:¥Ï¥ó¥¬¥ê¡¼¸ì
-*Icelandic.label:¥¢¥¤¥¹¥é¥ó¥É¸ì
-*Indonesian.label:¥¤¥ó¥É¥Í¥·¥¢¸ì
-*Inuktitut.label:¥¤¥Ì¥¯¥Æ¥£¥È¥Ã¥È¸ì
-*Inupiak.label:¥¤¥Ì¥Ô¥¢¥Ã¥¯¸ì
-*Irish.label:¥¢¥¤¥ë¥é¥ó¥É¸ì
-*Italian.label:¥¤¥¿¥ê¥¢¸ì
-*Japanese.label:ÆüËܸì
-*Javanese.label:¥¸¥ã¥ï¸ì
-*Kannada.label:¥«¥ó¥Ê¥À¸ì
-*Kashmiri.label:¥«¥·¥å¥ß¡¼¥ë¸ì
-*Kazakh.label:¥«¥¶¥Õ¸ì
-*Kinyarwanda.label:¥ë¥ï¥ó¥À¸ì
-*Kirghiz.label:¥¥ë¥®¥¹¸ì
-*Kirundi.label:¥ë¥ó¥Ç¥£¸ì
-*Korean.label:Ä«Á¯¸ì
-*Kurdish.label:¥¯¥ë¥É¸ì
-*Laothian.label:¥é¥ª¸ì
-*Latin.label:¥é¥Æ¥ó¸ì
-*Latvian.label:¥é¥È¥ô¥£¥¢¸ì
-*Lingala.label:¥ê¥ó¥¬¥é¸ì
-*Lithuanian.label:¥ê¥È¥¢¥Ë¥¢¸ì
-*Macedonian.label:¥Þ¥±¥É¥Ë¥¢¸ì
-*Malagasy.label:¥Þ¥é¥¬¥·¸ì
-*Malay.label:¥à¥é¥æ¸ì
-*Malayalam.label:¥Þ¥é¥ä¡¼¥é¥à¸ì
-*Maltese.label:¥Þ¥ë¥¿¸ì
-*Maori.label:¥Þ¥ª¥ê¸ì
-*Marathi.label:¥Þ¥é¡¼¥Æ¥£¡¼¸ì
-*Moldavian.label:¥â¥ë¥À¥Ó¥¢¸ì
-*Mongolian.label:¥â¥ó¥´¥ë¸ì
-*Nauru.label:¥Ê¥¦¥ë¸ì
-*Nepali.label:¥Í¥Ñ¡¼¥ë¸ì
-*Norwegian.label:¥Î¥ë¥¦¥§¡¼¸ì
-*Occitan.label:¥×¥í¥ô¥¡¥ó¥¹¸ì
-*Oriya.label:¥ª¥ê¥ä¡¼¸ì
-*Oromo.label:¥¬¥Ã¥é¸ì
-*Pashto.label:¥Ñ¥·¥å¥È¡¼¸ì
-*Polish.label:¥Ý¡¼¥é¥ó¥É¸ì
-*Portuguese.label:¥Ý¥ë¥È¥¬¥ë¸ì
-*Punjabi.label:¥Ñ¥ó¥¸¥ã¡¼¥Ö¸ì
-*Quechua.label:¥±¥Á¥å¥¢¸ì
-*Rhaeto-Romance.label:¥ì¥È¡¦¥í¥Þ¥ó¥¹¸ì
-*Romanian.label:¥ë¡¼¥Þ¥Ë¥¢¸ì
-*Russian.label:¥í¥·¥¢¸ì
-*Samoan.label:¥µ¥â¥¢¸ì
-*Sangro.label:¥µ¥ó¥´¸ì
-*Sanskrit.label:¥µ¥ó¥¹¥¯¥ê¥Ã¥È
-*Serbian.label:¥»¥ë¥Ó¥¢¸ì
-*Serbo-Croatian.label:¥»¥ë¥Ó¥¢¡¦¥¯¥í¥¢¥Á¥¢¸ì
-*Sesotho.label:¥½¥È¸ì
-*Setswana.label:¥Ä¥ï¥Ê¸ì
-*Shona.label:¥·¥ç¥Ê¸ì
-*Sindhi.label:¥·¥ó¥É¸ì
-*Sinhalese.label:¥·¥ó¥Ï¥é¸ì
-*Siswati.label:¥¹¥ï¥Æ¥£¸ì
-*Slovak.label:¥¹¥í¥Ð¥¥¢¸ì
-*Slovenian.label:¥¹¥í¥Ù¥Ë¥¢¸ì
-*Somali.label:¥½¥Þ¥ê¸ì
-*Spanish.label:¥¹¥Ú¥¤¥ó¸ì
-*Sundanese.label:¥¹¥ó¥À¸ì
-*Swahili.label:¥¹¥ï¥Ò¥ê¸ì
-*Swedish.label:¥¹¥¦¥§¡¼¥Ç¥ó¸ì
-*Tagalog.label:¥¿¥¬¥í¥°¸ì
-*Tajik.label:¥¿¥¸¥¯¸ì
-*Tamil.label:¥¿¥ß¡¼¥ë¸ì
-*Tatar.label:¥¿¥¿¡¼¥ë¸ì
-*Telugu.label:¥Æ¥ë¥°¸ì
-*Thai.label:¥¿¥¤¸ì
-*Tibetan.label:¥Á¥Ù¥Ã¥È¸ì
-*Tigrinya.label:¥Æ¥£¥°¥ê¥Ë¥¢¸ì
-*Tonga.label:¥È¥ó¥¬¸ì
-*Tsonga.label:¥Ä¥©¥ó¥¬¸ì
-*Turkish.label:¥È¥ë¥³¸ì
-*Turkmen.label:¥È¥ë¥¯¥á¥ó¸ì
-*Twi.label:¥Á¥å¥¤¸ì
-*Uighur.label:¥¦¥¤¥°¥ë¸ì
-*Ukrainian.label:¥¦¥¯¥é¥¤¥Ê¸ì
-*Urdu.label:¥¦¥ë¥É¥¥¡¼¸ì
-*Uzbek.label:¥¦¥º¥Ù¥¯¸ì
-*Vietnamese.label:¥Ù¥È¥Ê¥à¸ì
-*Volapuk.label:¥ô¥©¥é¥Ô¥å¥¯
-*Welsh.label:¥¦¥§¡¼¥ë¥º¸ì
-*Wolof.label:¥¦¥©¥í¥Õ¸ì
-*Xhosa.label:¥³¥µ¸ì
-*Yiddish.label:¥¤¥Ç¥£¥Ã¥·¥å¸ì
-*Yoruba.label:¥è¥ë¥Ð¸ì
-*Zulu.label:¥º¡¼¥ë¡¼¸ì
+++ /dev/null
-/* word-thai.c -- Find a word segment in Thai text.
- Copyright (C) 2005
- National Institute of Advanced Industrial Science and Technology (AIST)
- Registration Number H15PRO112
-
- This file is part of the m17n library.
-
- The m17n library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public License
- as published by the Free Software Foundation; either version 2.1 of
- the License, or (at your option) any later version.
-
- The m17n library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the m17n library; if not, write to the Free
- Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
- 02111-1307, USA. */
-
-#if !defined (FOR_DOXYGEN) || defined (DOXYGEN_INTERNAL_MODULE)
-/*** @addtogroup m17nInternal
- @{ */
-
-#include <stdio.h>
-#include <stdlib.h>
-
-#include "config.h"
-
-#ifdef HAVE_THAI_WORDSEG
-
-#include "m17n-core.h"
-#include "m17n-misc.h"
-#include "internal.h"
-#include "textprop.h"
-#include "character.h"
-#include "mtext.h"
-
-static int init_wordseg_library (void); /* Backend setup; negative on failure. */
-static void fini_wordseg_library (void); /* Backend teardown. */
-static MTextProperty *wordseg_propertize (MText *mt, int pos, int from, int to,
- unsigned char *tis); /* Attach segment properties to [FROM, TO); return the one at POS. */
-
-#define THAI_BEG 0x0E01 /* Lowest code point this file treats as Thai. */
-#define THAI_END 0x0E6F /* Highest code point this file treats as Thai (inclusive). */
-
-static int wordseg_library_initialized; /* Set to 1 by mtext__word_thai_init, back to 0 by mtext__word_thai_fini. */
-static MSymbol Mthai_wordseg; /* Key of the text property attached to each segment. */
-
-#ifdef HAVE_LIBTHAI
-
-#include <thai/thbrk.h>
-
-/* Set up the libthai backend.  Nothing needs preparing in this
-   configuration, so success (0) is always reported.  */
-static int
-init_wordseg_library (void)
-{
-  int status = 0;
-
-  return status;
-}
-
-/* Tear down the libthai backend.  This configuration keeps no state
-   of its own, so the body is intentionally empty.  */
-static void
-fini_wordseg_library (void)
-{
-}
-
-/* Segment the range [FROM, TO) of MT with th_brk () and attach one
-   Mthai_wordseg property (value Mt) to each resulting segment.  TIS
-   is the byte string handed to th_brk () for that range.  Break
-   offsets reported by th_brk () are taken relative to FROM.
-
-   Return the property whose segment contains POS.  If no attached
-   segment contains POS, call mdebug_hook () and return NULL.  */
-static MTextProperty *
-wordseg_propertize (MText *mt, int pos, int from, int to, unsigned char *tis)
-{
-  const int flags = MTEXTPROP_VOLATILE_WEAK | MTEXTPROP_NO_MERGE;
-  int nchars = to - from;
-  int *offsets = alloca ((sizeof (int)) * nchars);
-  int nbreaks = th_brk ((thchar_t *) tis, offsets, nchars);
-  MTextProperty *found = NULL;
-
-  if (nbreaks == 0)
-    {
-      /* No break position at all: the whole range is one segment.  */
-      found = mtext_property (Mthai_wordseg, Mt, flags);
-      mtext_attach_property (mt, from, to, found);
-      M17N_OBJECT_UNREF (found);
-    }
-  else
-    {
-      MTextProperty *seg;
-      int start = from;
-      int k = 0;
-
-      while (k < nbreaks)
-        {
-          int stop = from + offsets[k];
-
-          seg = mtext_property (Mthai_wordseg, Mt, flags);
-          mtext_attach_property (mt, start, stop, seg);
-          if (start <= pos && pos < stop)
-            found = seg;
-          /* MT holds the property from here on; drop our reference.  */
-          M17N_OBJECT_UNREF (seg);
-          start = stop;
-          k++;
-        }
-
-      /* Whatever follows the last break position is a segment too.  */
-      if (start < to)
-        {
-          seg = mtext_property (Mthai_wordseg, Mt, flags);
-          mtext_attach_property (mt, start, to, seg);
-          if (start <= pos && pos < to)
-            found = seg;
-          M17N_OBJECT_UNREF (seg);
-        }
-    }
-
-  if (found == NULL)
-    mdebug_hook ();
-  return found;
-}
-
-#elif HAVE_WORDCUT
-
-#include <wordcut/wcwordcut.h>
-
-static WcWordcut wordcut; /* Set up by wc_wordcut_init (), released by wc_wordcut_destroy (). */
-static WcWordVector *word_vector; /* Result vector reused across wordseg_propertize () calls; NULL until the first call. */
-
-/* Set up the wordcut backend by initializing the file-level WORDCUT
-   object.  Always returns 0.  */
-static int
-init_wordseg_library (void)
-{
-  int status = 0;
-
-  wc_wordcut_init (&wordcut);
-  return status;
-}
-
-/* Tear down the wordcut backend: delete the cached result vector, if
-   one was ever created, and destroy the WORDCUT object.  */
-static void
-fini_wordseg_library (void)
-{
-  if (word_vector)
-    {
-      wc_word_vector_delete (word_vector);
-      /* Forget the deleted vector.  The library may be initialized
-         again after this, and wordseg_propertize () reuses any
-         non-NULL WORD_VECTOR; a stale pointer would be passed to
-         wc_word_vector_destroy ().  */
-      word_vector = NULL;
-    }
-  wc_wordcut_destroy (&wordcut);
-}
-
-/* Cut the byte string TIS (length TO - FROM) with wordcut and attach
-   one Mthai_wordseg property to each word that is not of type
-   WC_WORDTYPE_DELETED, walking forward from FROM by each word's
-   length.  The property value is Mt for dictionary, word-unit and
-   joined words and Mnil for every other type.
-
-   Return the property whose range contains POS, or NULL when no
-   attached range contains it.  */
-static MTextProperty *
-wordseg_propertize (MText *mt, int pos, int from, int to, unsigned char *tis)
-{
-  gulong nwords, k;
-  MTextProperty *found = NULL;
-  int start = from;
-
-  /* Reuse the cached vector when there is one, emptying it first.  */
-  if (word_vector)
-    {
-      wc_word_vector_destroy (word_vector);
-      wc_word_vector_init (word_vector);
-    }
-  else
-    word_vector = wc_word_vector_new ();
-
-  wc_wordcut_cut (&wordcut, (gchar *) tis, (gint) (to - from),
-                  word_vector);
-  nwords = wc_word_vector_get_count (word_vector);
-
-  for (k = 0; k < nwords; k++)
-    {
-      WcWord *word = wc_word_vector_get_word (word_vector, k);
-      MSymbol val = Mnil;
-      MTextProperty *seg;
-
-      /* Deleted entries get no property and do not advance START.  */
-      if (word->type == WC_WORDTYPE_DELETED)
-        continue;
-
-      if (word->type == WC_WORDTYPE_DICTIONARY
-          || word->type == WC_WORDTYPE_WORDUNIT
-          || word->type == WC_WORDTYPE_JOINED)
-        val = Mt;
-
-      seg = mtext_property (Mthai_wordseg, val,
-                            MTEXTPROP_VOLATILE_WEAK | MTEXTPROP_NO_MERGE);
-      mtext_attach_property (mt, start, start + word->len, seg);
-      if (pos >= start && pos < start + word->len)
-        found = seg;
-      /* MT holds the property from here on; drop our reference.  */
-      M17N_OBJECT_UNREF (seg);
-      start += word->len;
-    }
-  return found;
-}
-
-#else /* HAVE_WORDCUT_OLD */
-
-#include <wordcut/wordcut.h>
-
-static Wordcut wordcut; /* Opened by wordcut_init (), closed by wordcut_close (). */
-static WordcutResult wordcut_result; /* Output of the most recent wordcut_cut () call. */
-static int wordcut_result_used; /* Nonzero once wordcut_result has been filled and still needs wordcut_result_close (). */
-
-/* Set up the old wordcut backend with the WORDCUT_TDICT dictionary.
-   Return 0 when wordcut_init () returns 0, and -1 otherwise.  */
-static int
-init_wordseg_library (void)
-{
-  if (wordcut_init (&wordcut, WORDCUT_TDICT) != 0)
-    return -1;
-  return 0;
-}
-
-/* Tear down the old wordcut backend: close the cached result if one
-   has been produced, then close the WORDCUT object.  */
-static void
-fini_wordseg_library (void)
-{
-  if (wordcut_result_used != 0)
-    {
-      wordcut_result_used = 0;
-      wordcut_result_close (&wordcut_result);
-    }
-  wordcut_close (&wordcut);
-}
-
-/* Cut the 0-terminated byte string TIS with wordcut and attach
-   Mthai_wordseg properties to the range [FROM, TO) of MT.  Each word
-   reported by wordcut gets a property with value Mt; text that lies
-   between two words, before the first word or after the last word
-   gets a property with value Mnil.  Word positions in the result are
-   taken relative to FROM.
-
-   Return the property whose range contains POS, or NULL when no
-   attached range contains it.
-
-   NOTE(review): wordcut_result is passed to wordcut_cut () again on
-   every call without an intervening wordcut_result_close (); confirm
-   against the wordcut API that this does not leak the previous
-   result.  */
-static MTextProperty *
-wordseg_propertize (MText *mt, int pos, int from, int to, unsigned char *tis)
-{
-  int i, last;
-  MTextProperty *prop = NULL;
-  MTextProperty *this;
-
-  wordcut_cut (&wordcut, (char *) tis, &wordcut_result);
-  wordcut_result_used = 1;
-  for (i = 0, last = from; i < wordcut_result.count; i++)
-    {
-      /* Text skipped by wordcut in front of this word.  */
-      if (last < from + wordcut_result.start[i])
-        {
-          this = mtext_property (Mthai_wordseg, Mnil, MTEXTPROP_VOLATILE_WEAK);
-          mtext_attach_property (mt, last, from + wordcut_result.start[i],
-                                 this);
-          if (pos >= last && pos < from + wordcut_result.start[i])
-            prop = this;
-          M17N_OBJECT_UNREF (this);
-        }
-
-      /* The word itself.  */
-      this = mtext_property (Mthai_wordseg, Mt,
-                             MTEXTPROP_VOLATILE_WEAK | MTEXTPROP_NO_MERGE);
-      last = from + wordcut_result.start[i];
-      mtext_attach_property (mt, last, last + wordcut_result.offset[i], this);
-      if (pos >= last && pos < last + wordcut_result.offset[i])
-        prop = this;
-      M17N_OBJECT_UNREF (this);
-      last += wordcut_result.offset[i];
-    }
-
-  /* Text left over after the last word.  Without this, a POS in that
-     tail would be covered by no property and NULL would be returned
-     to a caller that dereferences the result.  */
-  if (last < to)
-    {
-      this = mtext_property (Mthai_wordseg, Mnil, MTEXTPROP_VOLATILE_WEAK);
-      mtext_attach_property (mt, last, to, this);
-      if (pos >= last && pos < to)
-        prop = this;
-      M17N_OBJECT_UNREF (this);
-    }
-  return prop;
-}
-
-#endif /* not HAVE_LIBTHAI, HAVE_WORDCUT nor HAVE_WORDCUT_OLD */
-
-/* Find the segment of MT that contains the character at POS, which
-   the caller guarantees to be a Thai character.
-
-   If MT already carries an Mthai_wordseg property at POS, that
-   property's range is used.  Otherwise the maximal run of characters
-   in [THAI_BEG, THAI_END] around POS is converted to a byte string,
-   handed to wordseg_propertize (), and the property returned for POS
-   is used.
-
-   If FROM is not NULL, *FROM is set to the start of the segment; if
-   TO is not NULL, *TO is set to its end.  A boundary that coincides
-   with the edge of the Thai run is reported as -1 unless it is also
-   the edge of MT (presumably so that the caller decides what that
-   boundary means -- confirm against the caller).
-
-   Return nonzero if the value of the segment's property is Mt, and
-   zero otherwise.  When the backend yields no property for POS, the
-   whole Thai run is treated as the segment and zero is returned.  */
-int
-thai_wordseg (MText *mt, int pos, int *from, int *to)
-{
-  int len = mtext_nchars (mt);
-  MTextProperty *prop;
-  int beg, end;
-  int c;
-
-  /* It is assured that there's a Thai character at POS. */
-  prop = mtext_get_property (mt, pos, Mthai_wordseg);
-  if (prop)
-    {
-      beg = MTEXTPROP_START (prop);
-      if (beg > 0
-          && ((c = mtext_ref_char (mt, beg - 1)) < THAI_BEG || c > THAI_END))
-        beg = -1;
-      end = MTEXTPROP_END (prop);
-      if (end < len
-          && ((c = mtext_ref_char (mt, end)) < THAI_BEG || c > THAI_END))
-        end = -1;
-    }
-  else
-    {
-      /* TIS620 code sequence. */
-      unsigned char *tis;
-      int seg_beg, seg_end;
-      int i;
-
-      /* Extend [BEG, END) to the whole Thai run around POS.  */
-      for (beg = pos; beg > 0; beg--)
-        if ((c = mtext_ref_char (mt, beg - 1)) < THAI_BEG || c > THAI_END)
-          break;
-      for (end = pos + 1; end < len; end++)
-        if ((c = mtext_ref_char (mt, end)) < THAI_BEG || c > THAI_END)
-          break;
-
-      /* Extra 1-byte for 0 terminating. */
-      tis = alloca ((end - beg) + 1);
-
-      /* NOTE(review): 0xA1 + (c - THAI_BEG) exceeds 0xFF for c in
-         U+0E60..U+0E6F and is truncated by the store; U+0E60 becomes
-         0 and would terminate the string early.  */
-      for (i = beg; i < end; i++)
-        tis[i - beg] = 0xA1 + (mtext_ref_char (mt, i) - THAI_BEG);
-      tis[i - beg] = 0;
-      prop = wordseg_propertize (mt, pos, beg, end, tis);
-      if (prop)
-        {
-          seg_beg = MTEXTPROP_START (prop);
-          seg_end = MTEXTPROP_END (prop);
-        }
-      else
-        {
-          /* The backend attached nothing that covers POS.  Fall back
-             to the whole run instead of dereferencing NULL.  */
-          seg_beg = beg;
-          seg_end = end;
-        }
-      beg = (seg_beg > beg || seg_beg == 0) ? seg_beg : -1;
-      end = (seg_end < end || seg_end == len) ? seg_end : -1;
-    }
-
-  if (from)
-    *from = beg;
-  if (to)
-    *to = end;
-  return (prop && MTEXTPROP_VAL (prop) == Mt);
-}
-
-#endif /* HAVE_THAI_WORDSEG */
-
-\f
-/* Internal API */
-
-/* Initialize Thai word segmentation.  When HAVE_THAI_WORDSEG is
-   defined, set up the backend on the first call, then register
-   thai_wordseg in wordseg_func_table for the code points THAI_BEG
-   through THAI_END.  Return -1 if the backend fails to initialize,
-   and 0 otherwise (including when HAVE_THAI_WORDSEG is not
-   defined).  */
-int
-mtext__word_thai_init ()
-{
-#ifdef HAVE_THAI_WORDSEG
-  if (wordseg_library_initialized == 0)
-    {
-      int status = init_wordseg_library ();
-
-      if (status < 0)
-        return -1;
-      Mthai_wordseg = msymbol (" wordcut-wordseg");
-      wordseg_library_initialized = 1;
-    }
-  mchartable_set_range (wordseg_func_table, THAI_BEG, THAI_END,
-                        (void *) thai_wordseg);
-#endif
-  return 0;
-}
-
-/* Finalize Thai word segmentation.  When HAVE_THAI_WORDSEG is
-   defined and the backend is currently initialized, tear it down and
-   clear the initialized flag; otherwise do nothing.  */
-void
-mtext__word_thai_fini ()
-{
-#ifdef HAVE_THAI_WORDSEG
-  if (wordseg_library_initialized == 0)
-    return;
-  wordseg_library_initialized = 0;
-  fini_wordseg_library ();
-#endif
-}
-
-/*** @} */
-#endif /* !FOR_DOXYGEN || DOXYGEN_INTERNAL_MODULE */