New file.

author handa <handa>

Fri, 2 Sep 2005 06:18:07 +0000 (06:18 +0000)

committer handa <handa>

Fri, 2 Sep 2005 06:18:07 +0000 (06:18 +0000)
author handa <handa>
Fri, 2 Sep 2005 06:18:07 +0000 (06:18 +0000)
committer handa <handa>
Fri, 2 Sep 2005 06:18:07 +0000 (06:18 +0000)
diff --git a/src/mtext-lbrk.c b/src/mtext-lbrk.c

new file mode 100644 (file)

index 0000000..82b81c0
--- /dev/null
+++ b/src/mtext-lbrk.c
@@ -0,0 +1,426 @@
+/* mtext-lbrk.c -- line break
+   Copyright (C) 2005
+     National Institute of Advanced Industrial Science and Technology (AIST)
+     Registration Number H15PRO112
+
+   This file is part of the m17n library.
+
+   The m17n library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public License
+   as published by the Free Software Foundation; either version 2.1 of
+   the License, or (at your option) any later version.
+
+   The m17n library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the m17n library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307, USA.  */
+
+#if !defined (FOR_DOXYGEN) || defined (DOXYGEN_INTERNAL_MODULE)
+/*** @addtogroup m17nInternal
+     @{ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "config.h"
+#include "m17n.h"
+#include "m17n-misc.h"
+#include "internal.h"
+#include "mtext.h"
+
+/* Line break classes, as stored in the "linebreak" character property
+   table.  The order matters: the classes from LBC_OP through LBC_JT
+   are used as row and column indices of lba_pair_table.  */
+enum LineBreakClass
+  {
+    LBC_OP, /* open */
+    LBC_CL, /* close */
+    LBC_QU, /* quotation */
+    LBC_GL, /* glue */
+    LBC_NS, /* no-start */
+    LBC_EX, /* exclamation/interrogation */
+    LBC_SY, /* syntax (slash) */
+    LBC_IS, /* infix (numeric) separator */
+    LBC_PR, /* prefix */
+    LBC_PO, /* postfix */
+    LBC_NU, /* numeric */
+    LBC_AL, /* alphabetic */
+    LBC_ID, /* ideograph (atomic) */
+    LBC_IN, /* inseparable */
+    LBC_HY, /* hyphen */
+    LBC_BA, /* break after */
+    LBC_BB, /* break before */
+    LBC_B2, /* break both */
+    LBC_ZW, /* ZW space */
+    LBC_CM, /* combining mark */
+    LBC_WJ, /* word joiner */
+
+    /* used for 4.1 pair table */
+    LBC_H2, /* Hangul 2 Jamo Syllable */
+    LBC_H3, /* Hangul 3 Jamo Syllable */
+    LBC_JL, /* Jamo leading consonant */
+    LBC_JV, /* Jamo vowel */
+    LBC_JT, /* Jamo trailing consonant */
+
+    /* These are not handled in the pair tables. */
+    LBC_SA, /* south (east) asian */
+    LBC_SP, /* space */
+    LBC_PS, /* paragraph and line separators */
+    LBC_BK, /* hard break (newline) */
+    LBC_CR, /* carriage return */
+    LBC_LF, /* line feed */
+    LBC_NL, /* next line */
+    LBC_CB, /* contingent break opportunity */
+    LBC_SG, /* surrogate */
+    LBC_AI, /* ambiguous */
+    LBC_XX, /* unknown */
+    LBC_MAX
+  };
+
+/* Actions stored in lba_pair_table.  Each value is the character used
+   for it in the table strings.  */
+enum LineBreakAction
+  {
+    LBA_DIRECT =               '_', /* break between the pair */
+    LBA_INDIRECT =             '%', /* break only if spaces intervene */
+    LBA_COMBINING_INDIRECT =   '#', /* handled like LBA_INDIRECT, except
+                                       that the forward scan backs up one
+                                       position under MTEXT_LBO_SP_CM */
+    LBA_COMBINING_PROHIBITED = '@', /* no break */
+    LBA_PROHIBITED =           '^', /* no break */
+    LBA_MAX                         /* implicitly '^' + 1 */
+  };
+
+/* The pair table of line break actions.  It is indexed as
+   lba_pair_table[B][A], where B is the class of the character before
+   a candidate break position and A the class of the character after
+   it.  Each element is one of the enum LineBreakAction characters.
+   Only the 26 classes LBC_OP ... LBC_JT have a row and a column.  */
+static char *lba_pair_table[] =
+  /* Columns, left to right:
+     OP CL QU GL NS EX SY IS PR PO NU AL ID
+     IN HY BA BB B2 ZW CM WJ H2 H3 JL JV JT  */
+  { "^^^^^^^^^^^^^^^^^^^@^^^^^^", /* OP */
+    "_^%%^^^^_%____%%__^#^_____", /* CL */
+    "^^%%%^^^%%%%%%%%%%^#^%%%%%", /* QU */
+    "%^%%%^^^%%%%%%%%%%^#^%%%%%", /* GL */
+    "_^%%%^^^______%%__^#^_____", /* NS */
+    "_^%%%^^^______%%__^#^_____", /* EX */
+    "_^%%%^^^__%___%%__^#^_____", /* SY */
+    "_^%%%^^^__%%__%%__^#^_____", /* IS */
+    "%^%%%^^^__%%%_%%__^#^%%%%%", /* PR */
+    "_^%%%^^^______%%__^#^_____", /* PO */
+    "_^%%%^^^_%%%_%%%__^#^_____", /* NU */
+    "_^%%%^^^__%%_%%%__^#^_____", /* AL */
+    "_^%%%^^^_%___%%%__^#^_____", /* ID */
+    "_^%%%^^^_____%%%__^#^_____", /* IN */
+    "_^%%%^^^__%___%%__^#^_____", /* HY */
+    "_^%%%^^^______%%__^#^_____", /* BA */
+    "%^%%%^^^%%%%%%%%%%^#^%%%%%", /* BB */
+    "_^%%%^^^______%%_^^#^_____", /* B2 */
+    "__________________^_______", /* ZW */
+    "_^%%%^^^__%%_%%%__^#^_____", /* CM */
+    "%^%%%^^^%%%%%%%%%%^#^%%%%%", /* WJ */
+    "_^%%%^^^_%___%%%__^#^___%%", /* H2 */
+    "_^%%%^^^_%___%%%__^#^____%", /* H3 */
+    "_^%%%^^^_%___%%%__^#^%%%%_", /* JL */
+    "_^%%%^^^_%___%%%__^#^___%%", /* JV */
+    "_^%%%^^^_%___%%%__^#^____%"  /* JT */
+  };
+
+/* Character property table for "linebreak"; set up on the first call
+   of mtext_line_break.  */
+static MCharTable *lbc_table;
+
+/* Set LBC to the enum LineBreakClass of the character at POS of MT
+   (length is LEN), as found in lbc_table, with these conversions:
+     LBC_NL -> LBC_BK
+     LBC_AI -> LBC_ID if OPTION has MTEXT_LBO_AI_AS_ID, else LBC_AL
+     LBC_H2 ... LBC_JT -> LBC_AL unless OPTION has MTEXT_LBO_KOREAN_SP
+     LBC_CB -> LBC_B2
+     LBC_XX -> LBC_AL
+   LBC_CR and LBC_LF are returned unchanged; the caller handles them.
+   If POS is out of range, set LBC to LBC_BK.
+
+   LBC, POS and OPTION are evaluated more than once, and the
+   expansion declares a local variable named `c', so the arguments
+   must be free of side effects and must not refer to a variable
+   `c'.  */
+
+#define GET_LBC(LBC, MT, LEN, POS, OPTION)                             \
+  do {                                                                 \
+    if ((POS) < 0 || (POS) >= (LEN))                                   \
+      (LBC) = LBC_BK;                                                  \
+    else                                                               \
+      {                                                                        \
+       int c = mtext_ref_char ((MT), (POS));                           \
+       (LBC) = (enum LineBreakClass) mchartable_lookup (lbc_table, c); \
+       if ((LBC) == LBC_NL)                                            \
+         (LBC) = LBC_BK;                                               \
+       else if ((LBC) == LBC_AI)                                       \
+         (LBC) = ((OPTION) & MTEXT_LBO_AI_AS_ID) ? LBC_ID : LBC_AL;    \
+       else if (! ((OPTION) & MTEXT_LBO_KOREAN_SP)                     \
+                && (LBC) >= LBC_H2 && (LBC) <= LBC_JT)                 \
+         (LBC) = LBC_AL;                                               \
+       else if ((LBC) == LBC_CB)                                       \
+         (LBC) = LBC_B2;                                               \
+       else if ((LBC) == LBC_XX)                                       \
+         (LBC) = LBC_AL;                                               \
+      }                                                                        \
+  } while (0)
+
+
+/*** @} */
+#endif /* !FOR_DOXYGEN || DOXYGEN_INTERNAL_MODULE */
+
+\f
+/* External API */
+
+/*** @addtogroup m17nMtext */
+/*** @{ */
+/*=*/
+
+/* Find a line break position around POS of MT.
+
+   The text is first scanned backward from POS.  If that finds a break
+   position greater than zero, it is the "before" candidate.  Unless
+   AFTER is NULL or the candidate is POS itself, the text is then
+   scanned forward to find the next break position after POS.
+
+   OPTION is a bit mask of MTEXT_LBO_* flags.  MTEXT_LBO_SP_CM makes a
+   space followed by a combining mark behave as LBC_ID;
+   MTEXT_LBO_AI_AS_ID and MTEXT_LBO_KOREAN_SP are interpreted by
+   GET_LBC.
+
+   If AFTER is not NULL, *AFTER is set to the break position found by
+   the forward scan, or to POS when POS is at or beyond the end of the
+   text or is itself the backward break position.
+
+   The return value is POS when POS is at or beyond the end of the
+   text; otherwise the backward break position if it is positive, or
+   else the forward break position.
+
+   Classes that have no row or column in lba_pair_table (LBC_SA and
+   later) never index the table; a pair involving such a class is
+   treated as LBA_PROHIBITED.  For LBC_BK, LBC_CR and LBC_LF this
+   means no break before the newline; the break after it is taken on
+   the next iteration of the forward scan.  */
+int
+mtext_line_break (MText *mt, int pos, int option, int *after)
+{
+  int break_before, break_after;
+  int len = mtext_len (mt);
+  enum LineBreakClass lbc;
+  enum LineBreakClass Blbc, Albc; /* B(efore) and A(fter) lbcs.  */
+  int Bpos, Apos;                /* B(efore) and A(fter) positions.  */
+  enum LineBreakAction action;
+
+  if (pos >= len)
+    {
+      /* The end of text is an explicit break position.  */
+      if (after)
+        *after = pos;
+      return pos;
+    }
+
+  if (! lbc_table)
+    {
+      MSymbol key = mchar_define_property ("linebreak", Minteger);
+
+      lbc_table = mchar_get_prop_table (key, NULL);
+    }
+
+  GET_LBC (lbc, mt, len, pos, option);
+  Apos = pos;
+  Albc = lbc;
+  if (Albc == LBC_SP)
+    {
+      if (option & MTEXT_LBO_SP_CM)
+        {
+          GET_LBC (Albc, mt, len, Apos + 1, option);
+          Albc = (Albc == LBC_CM) ? LBC_ID : LBC_SP;
+        }
+      /* Skip back over spaces.  GET_LBC yields LBC_BK for a negative
+         position, so this terminates.  */
+      while (Albc == LBC_SP)
+        {
+          Apos--;
+          GET_LBC (Albc, mt, len, Apos, option);
+        }
+    }
+  if ((option & MTEXT_LBO_SP_CM) && (Albc == LBC_CM))
+    {
+      Apos--;
+      GET_LBC (Albc, mt, len, Apos, option);
+      if (Albc == LBC_SP)
+        Albc = LBC_ID;
+      else
+        Apos++, Albc = LBC_CM;
+    }
+
+  if (Albc == LBC_CR)
+    Albc = LBC_BK;
+  else if (Albc == LBC_LF)
+    {
+      /* Treat CR LF as a single hard break starting at the CR.  */
+      GET_LBC (Albc, mt, len, Apos - 1, option);
+      if (Albc == LBC_CR)
+        Apos--;
+      Albc = LBC_BK;
+    }
+  else if (Albc == LBC_SA)
+    Albc = mtext__word_segment (mt, Apos, &Apos, NULL) > 0 ? LBC_BB : LBC_AL;
+  Bpos = Apos;
+  /* After exiting from the following loop, if Apos is positive, it is
+     the previous (including POS) break position.  */
+  while (Apos > 0)
+    {
+      int indirect;
+      int next = -1;
+
+      /* Now Bpos == Apos.  */
+      do {
+        Bpos--;
+        GET_LBC (Blbc, mt, len, Bpos, option);
+      } while (Blbc == LBC_SP);
+
+      if (Blbc == LBC_BK || Blbc == LBC_LF || Blbc == LBC_CR)
+        {
+          /* Explicit break.  */
+          break;
+        }
+
+      /* Nonzero if at least one space separates the pair.  */
+      indirect = Bpos + 1 < Apos;
+
+      if (Blbc == LBC_CM)
+        {
+          do {
+            Bpos--;
+            GET_LBC (Blbc, mt, len, Bpos, option);
+          } while (Blbc == LBC_CM);
+          if ((option & MTEXT_LBO_SP_CM) && (Blbc == LBC_SP))
+            Blbc = LBC_ID;
+          else if (Blbc == LBC_SP || Blbc == LBC_ZW
+                   || Blbc == LBC_BK || Blbc == LBC_LF || Blbc == LBC_CR)
+            {
+              Blbc = LBC_AL;
+              Bpos++;
+            }
+        }
+      if (Blbc == LBC_SA)
+        {
+          mtext__word_segment (mt, Bpos, &next, NULL);
+          Blbc = LBC_AL;
+        }
+
+      if (Albc != LBC_BK)
+        {
+          /* Never index the pair table with a class it has no entry
+             for.  */
+          if (Blbc >= LBC_SA || Albc >= LBC_SA)
+            action = LBA_PROHIBITED;
+          else
+            action = lba_pair_table[Blbc][Albc];
+          if (action == LBA_DIRECT)
+            break;
+          else if (action == LBA_INDIRECT)
+            {
+              if (indirect)
+                break;
+            }
+          else if (action == LBA_COMBINING_INDIRECT)
+            {
+              if (indirect)
+                break;
+            }
+        }
+      if (next >= 0)
+        Apos = next, Albc = LBC_BB;
+      else
+        Apos = Bpos, Albc = Blbc;
+    }
+  break_before = Apos;
+  if (break_before > 0)
+    {
+      if (! after)
+        return break_before;
+      if (break_before == pos)
+        {
+          *after = break_before;
+          return break_before;
+        }
+    }
+
+  /* Now find a break position after POS.  */
+  Bpos = pos;
+  Blbc = lbc;
+  if (Blbc == LBC_CM)
+    {
+      do {
+        Bpos--;
+        GET_LBC (Blbc, mt, len, Bpos, option);
+      } while (Blbc == LBC_CM);
+      if (Blbc == LBC_SP || Blbc == LBC_ZW
+          || Blbc == LBC_BK || Blbc == LBC_LF || Blbc == LBC_CR)
+        {
+          if ((Blbc == LBC_SP) && (option & MTEXT_LBO_SP_CM))
+            Blbc = LBC_ID;
+          else
+            Blbc = LBC_AL;
+        }
+      Bpos = pos;
+    }
+  if (Blbc == LBC_SA)
+    {
+      mtext__word_segment (mt, Bpos, NULL, &Bpos);
+      Blbc = LBC_AL;
+    }
+  else if (Blbc == LBC_SP)
+    {
+      if (option & MTEXT_LBO_SP_CM)
+        {
+          GET_LBC (Blbc, mt, len, Bpos + 1, option);
+          if (Blbc == LBC_CM)
+            Blbc = LBC_ID, Bpos++;
+          else
+            Blbc = LBC_SP;
+        }
+      while (Blbc == LBC_SP)
+        {
+          Bpos--;
+          GET_LBC (Blbc, mt, len, Bpos, option);
+        }
+      if (Bpos < 0)
+        Bpos = pos;
+    }
+  Apos = Bpos;
+  /* After exiting from the following loop, if Apos is positive, it is
+     the next break position.  */
+  while (1)
+    {
+      int indirect;
+      int next = -1;
+
+      /* Now Bpos == Apos.  */
+      if (Blbc == LBC_LF || Blbc == LBC_BK || Blbc == LBC_CR)
+        {
+          /* Break right after the newline; CR LF counts as one.  */
+          Apos++;
+          if (Blbc == LBC_CR)
+            {
+              GET_LBC (Blbc, mt, len, Bpos + 1, option);
+              if (Blbc == LBC_LF)
+                Apos++;
+            }
+          break;
+        }
+
+      do {
+        Apos++;
+        GET_LBC (Albc, mt, len, Apos, option);
+      } while (Albc == LBC_SP);
+
+      if (Blbc == LBC_SP)
+        break;
+
+      if (Apos == len)
+        /* Explicit break at the end of text.  */
+        break;
+
+      /* Nonzero if at least one space separates the pair.  */
+      indirect = Bpos + 1 < Apos;
+
+      /* mtext__word_segment returns -1 on failure; only a positive
+         result means that Apos is inside a word.  */
+      if (Albc == LBC_SA)
+        Albc = (mtext__word_segment (mt, Apos, NULL, &next) > 0
+                ? LBC_BB : LBC_AL);
+
+      /* Never index the pair table with a class it has no entry for.
+         This covers a newline (LBC_BK, LBC_CR, LBC_LF) at Apos.  */
+      if (Blbc >= LBC_SA || Albc >= LBC_SA)
+        action = LBA_PROHIBITED;
+      else
+        action = lba_pair_table[Blbc][Albc];
+      if (action == LBA_DIRECT)
+        /* Direct break at Apos.  */
+        break;
+      else if (action == LBA_INDIRECT)
+        {
+          if (indirect)
+            break;
+        }
+      else if (action == LBA_COMBINING_INDIRECT)
+        {
+          if (indirect)
+            {
+              if (option & MTEXT_LBO_SP_CM)
+                Apos--;
+              break;
+            }
+        }
+      if (next >= 0)
+        Bpos = next, Blbc = LBC_AL;
+      else
+        {
+          Bpos = Apos;
+          /* A combining mark inherits the class of its base.  */
+          if (Albc != LBC_CM)
+            Blbc = Albc;
+        }
+    }
+  break_after = Apos;
+  if (after)
+    *after = break_after;
+
+  return (break_before > 0 ? break_before : break_after);
+}
+
+/*** @} */ 
+
+/*
+  Local Variables:
+  coding: euc-japan
+  End:
+*/
diff --git a/src/mtext-wseg.c b/src/mtext-wseg.c

new file mode 100644 (file)

index 0000000..8e8cdd3
--- /dev/null
+++ b/src/mtext-wseg.c
@@ -0,0 +1,419 @@
+/* mtext-wseg.c -- word segmentation
+   Copyright (C) 2005
+     National Institute of Advanced Industrial Science and Technology (AIST)
+     Registration Number H15PRO112
+
+   This file is part of the m17n library.
+
+   The m17n library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public License
+   as published by the Free Software Foundation; either version 2.1 of
+   the License, or (at your option) any later version.
+
+   The m17n library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the m17n library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307, USA.  */
+
+#if !defined (FOR_DOXYGEN) || defined (DOXYGEN_INTERNAL_MODULE)
+/*** @addtogroup m17nInternal
+     @{ */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "config.h"
+#include "m17n-core.h"
+#include "m17n-misc.h"
+#include "internal.h"
+#include "textprop.h"
+#include "character.h"
+
+typedef struct _MWordseg_Function MWordseg_Function;
+
+/* One word segmentation backend.  */
+struct _MWordseg_Function
+{
+  /* 0: init not tried yet, 1: ready, -1: init failed.  */
+  int initialized;
+  /* Optional set-up function; a negative return value means failure.  */
+  int (*init) (void);
+  /* Optional clean-up function; called by mtext__wseg_fini only when
+     `initialized' is positive.  */
+  void (*fini) (void);
+  /* Find the segment around POS of MT and store its bounds in *FROM
+     and *TO (each may be NULL).  WORDSEG is this structure itself.  */
+  int (*wseg) (MText *mt, int pos, int *from, int *to,
+              MWordseg_Function *wordseg);
+  /* Next element of wordseg_function_list.  */
+  MWordseg_Function *next;
+};
+
+/* All registered backends, most recently registered first.  */
+static MWordseg_Function *wordseg_function_list;
+
+/* Char-table mapping a character to the MWordseg_Function that
+   handles it.  */
+static MCharTable *wordseg_function_table;
+
+/* Fallback segmenter.  A character counts as a word character when
+   the name of its general category starts with `L', `M' or `N'.  The
+   segment around POS is the maximal run of characters that agree
+   with the character at POS on that test and that are mapped to
+   WORDSEG in wordseg_function_table.  Store the run's bounds in
+   *FROM and *TO (when not NULL) and return nonzero if the character
+   at POS is a word character.  */
+static int
+generic_wordseg (MText *mt, int pos, int *from, int *to,
+                MWordseg_Function *wordseg)
+{
+  int nchars = mtext_nchars (mt);
+  int ch = mtext_ref_char (mt, pos);
+  MSymbol cat = mchar_get_prop (ch, Mcategory);
+  char head = msymbol_name (cat)[0];
+  int in_word = (head == 'L' || head == 'M' || head == 'N');
+  int start = pos;
+  int stop = pos;
+
+  /* Extend backward while the preceding character is of the same kind
+     and belongs to this segmenter.  */
+  while (start > 0)
+    {
+      ch = mtext_ref_char (mt, start - 1);
+      cat = mchar_get_prop (ch, Mcategory);
+      head = msymbol_name (cat)[0];
+      if ((head == 'L' || head == 'M' || head == 'N') != in_word
+          || mchartable_lookup (wordseg_function_table, ch) != wordseg)
+        break;
+      start--;
+    }
+
+  /* Extend forward in the same way, starting from POS itself.  */
+  while (stop < nchars)
+    {
+      ch = mtext_ref_char (mt, stop);
+      cat = mchar_get_prop (ch, Mcategory);
+      head = msymbol_name (cat)[0];
+      if ((head == 'L' || head == 'M' || head == 'N') != in_word
+          || mchartable_lookup (wordseg_function_table, ch) != wordseg)
+        break;
+      stop++;
+    }
+
+  if (from)
+    *from = start;
+  if (to)
+    *to = stop;
+  return in_word;
+}
+
+#ifdef HAVE_THAI_WORDSEG
+
+/* First and last code points that mtext__word_segment routes to the
+   Thai segmenter.
+   NOTE(review): thai_wordseg converts a code point C to the byte
+   0xA1 + (C - THAI_BEG); for C >= 0x0E60 that value does not fit in
+   an unsigned char -- confirm whether THAI_END is intended.  */
+#define THAI_BEG 0x0E01
+#define THAI_END 0x0E6F
+
+/* Key of the text property that records Thai segments.  */
+static MSymbol M_thai_wordseg;
+
+/* We have libthai, wordcut, or wordcut-old.  Each of them provides
+   the following three functions.  */
+
+static int thai_wordseg_init (void);
+static void thai_wordseg_fini (void);
+static MTextProperty *thai_wordseg_propertize (MText *mt, int pos,
+                                              int from, int to,
+                                              unsigned char *tis);
+
+#ifdef HAVE_LIBTHAI
+
+#include <thai/thbrk.h>
+
+/* With libthai there is nothing to set up; always succeed.  */
+static int
+thai_wordseg_init (void)
+{
+  return 0;
+}
+
+/* With libthai there is nothing to release.  */
+static void
+thai_wordseg_fini (void)
+{
+}
+
+/* libthai version.  TIS is the 0-terminated byte string made from the
+   characters FROM ... TO of MT.  Ask th_brk for break offsets in TIS
+   and attach one M_thai_wordseg property (value Mt) to each
+   resulting segment of MT; when th_brk reports no break, a single
+   property covers FROM ... TO.  Return the property attached to the
+   segment containing POS.  If no segment contains POS, call
+   mdebug_hook and return NULL.  */
+static MTextProperty *
+thai_wordseg_propertize (MText *mt, int pos, int from, int to,
+                        unsigned char *tis)
+{
+  int nbytes = to - from;
+  int *brk = alloca ((sizeof (int)) * nbytes);
+  int nbrk = th_brk ((thchar_t *) tis, brk, nbytes);
+  MTextProperty *found = NULL;
+
+  if (nbrk == 0)
+    {
+      found = mtext_property (M_thai_wordseg, Mt,
+                              MTEXTPROP_VOLATILE_WEAK | MTEXTPROP_NO_MERGE);
+      mtext_attach_property (mt, from, to, found);
+      M17N_OBJECT_UNREF (found);
+    }
+  else
+    {
+      MTextProperty *seg;
+      int seg_beg = from;
+      int seg_end;
+      int k;
+
+      for (k = 0; k < nbrk; k++)
+        {
+          seg_end = from + brk[k];
+          seg = mtext_property (M_thai_wordseg, Mt,
+                                MTEXTPROP_VOLATILE_WEAK | MTEXTPROP_NO_MERGE);
+          mtext_attach_property (mt, seg_beg, seg_end, seg);
+          if (seg_beg <= pos && pos < seg_end)
+            found = seg;
+          M17N_OBJECT_UNREF (seg);
+          seg_beg = seg_end;
+        }
+
+      /* The text after the last reported break is one more segment.  */
+      if (seg_beg < to)
+        {
+          seg = mtext_property (M_thai_wordseg, Mt,
+                                MTEXTPROP_VOLATILE_WEAK | MTEXTPROP_NO_MERGE);
+          mtext_attach_property (mt, seg_beg, to, seg);
+          if (seg_beg <= pos && pos < to)
+            found = seg;
+          M17N_OBJECT_UNREF (seg);
+        }
+    }
+
+  if (! found)
+    mdebug_hook ();
+  return found;
+}
+
+#elif HAVE_WORDCUT
+
+#include <wordcut/wcwordcut.h>
+
+/* State of the wordcut library.  */
+static WcWordcut wordcut;
+/* Result vector reused across calls of thai_wordseg_propertize; NULL
+   until the first call and again after thai_wordseg_fini.  */
+static WcWordVector *word_vector;
+
+/* Initialize the wordcut state; always reports success.  */
+static int
+thai_wordseg_init (void)
+{
+  wc_wordcut_init (&wordcut);
+  return 0;
+}
+
+/* Release the result vector (if any) and the wordcut state.  */
+static void
+thai_wordseg_fini (void)
+{
+  if (word_vector)
+    {
+      wc_word_vector_delete (word_vector);
+      /* Forget the deleted vector so that a later re-initialization
+         allocates a fresh one instead of reusing a dangling
+         pointer.  */
+      word_vector = NULL;
+    }
+  wc_wordcut_destroy (&wordcut);
+}
+
+/* wordcut version.  Cut TIS (TO - FROM bytes, made from the
+   characters FROM ... TO of MT) into words and attach one
+   M_thai_wordseg property per word that is not of type
+   WC_WORDTYPE_DELETED.  Return the property of the word containing
+   POS, or NULL if no attached word contains POS.  */
+static MTextProperty *
+thai_wordseg_propertize (MText *mt, int pos, int from, int to,
+                        unsigned char *tis)
+{
+  gulong i, count;
+  MTextProperty *prop = NULL;
+
+  /* Allocate the result vector on first use; later just reset it.  */
+  if (! word_vector)
+    word_vector = wc_word_vector_new ();
+  else
+    {
+      wc_word_vector_destroy (word_vector);
+      wc_word_vector_init (word_vector);
+    }
+
+  wc_wordcut_cut (&wordcut, (gchar *) tis, (gint) (to - from),
+                 word_vector);
+  count = wc_word_vector_get_count (word_vector);
+  for (i = 0; i < count; i++)
+    {
+      WcWord *word = wc_word_vector_get_word (word_vector, i);
+
+      /* Deleted words get no property and do not advance FROM.  */
+      if (word->type != WC_WORDTYPE_DELETED)
+       {
+         /* The property value is Mt for dictionary, word-unit and
+            joined words, and Mnil for any other type.  */
+         MSymbol val = ((word->type == WC_WORDTYPE_DICTIONARY
+                         || word->type == WC_WORDTYPE_WORDUNIT
+                         || word->type == WC_WORDTYPE_JOINED)
+                        ? Mt : Mnil);
+         MTextProperty *this
+           = mtext_property (M_thai_wordseg, val,
+                             MTEXTPROP_VOLATILE_WEAK | MTEXTPROP_NO_MERGE);
+
+         mtext_attach_property (mt, from, from + word->len, this);
+         if (pos >= from && pos < from + word->len)
+           prop = this;
+         M17N_OBJECT_UNREF (this);
+         /* From here on FROM is the start of the next word.  */
+         from += word->len;
+       }
+    }
+  return prop;
+}
+
+#else  /* HAVE_WORDCUT_OLD */
+
+#include <wordcut/wordcut.h>
+
+/* State of the old wordcut library.  */
+static Wordcut wordcut;
+/* Result of the latest wordcut_cut call.  */
+static WordcutResult wordcut_result;
+/* Nonzero once wordcut_result has been filled and needs closing.  */
+static int wordcut_result_used;
+
+/* Load the dictionary WORDCUT_TDICT.  Return 0 on success, -1 on
+   failure.  */
+static int
+thai_wordseg_init (void)
+{
+  if (wordcut_init (&wordcut, WORDCUT_TDICT) == 0)
+    return 0;
+  return -1;
+}
+
+/* Close the pending result (if any) and the wordcut state.  */
+static void
+thai_wordseg_fini (void)
+{
+  if (wordcut_result_used)
+    {
+      wordcut_result_close (&wordcut_result);
+      wordcut_result_used = 0;
+    }
+  wordcut_close (&wordcut);
+}
+
+/* wordcut-old version.  Cut the 0-terminated string TIS (made from
+   the characters FROM ... TO of MT) into words.  Each reported word
+   gets an M_thai_wordseg property with value Mt; text lying between
+   the previous word and the next reported one gets a property with
+   value Mnil.  Text after the last reported word gets no property.
+   Return the property of the range containing POS, or NULL if POS is
+   in no range that received one.  */
+static MTextProperty *
+thai_wordseg_propertize (MText *mt, int pos, int from, int to,
+                        unsigned char *tis)
+{
+  int i, last;
+  MTextProperty *prop = NULL;
+
+  wordcut_cut (&wordcut, (char *) tis, &wordcut_result);
+  /* Tell thai_wordseg_fini that the result must be closed.  */
+  wordcut_result_used = 1;
+  for (i = 0, last = from; i < wordcut_result.count; i++)
+    {
+      MTextProperty *this;
+
+      /* Gap before this word.  Unlike the word itself, it is attached
+         without MTEXTPROP_NO_MERGE.  */
+      if (last < from + wordcut_result.start[i])
+       {
+         this = mtext_property (M_thai_wordseg, Mnil,
+                                MTEXTPROP_VOLATILE_WEAK);
+         mtext_attach_property (mt, last, from + wordcut_result.start[i],
+                                this);
+         if (pos >= last && pos < from + wordcut_result.start[i])
+           prop = this;
+         M17N_OBJECT_UNREF (this);
+       }
+
+      /* The word itself: start[i] is its offset in TIS and offset[i]
+         is used as its length.  */
+      this = mtext_property (M_thai_wordseg, Mt,
+                            MTEXTPROP_VOLATILE_WEAK | MTEXTPROP_NO_MERGE);
+      last = from + wordcut_result.start[i];
+      mtext_attach_property (mt, last, last + wordcut_result.offset[i], this);
+      if (pos >= last && pos < last + wordcut_result.offset[i])
+       prop = this;
+      /* NOTE(review): this calls the function m17n_object_unref where
+         the rest of the file uses the macro M17N_OBJECT_UNREF --
+         confirm that this is intended.  */
+      m17n_object_unref (this);
+      last += wordcut_result.offset[i];
+    }
+  return prop;
+}
+
+#endif  /* HAVE_LIBTHAI, HAVE_WORDCUT, or neither (HAVE_WORDCUT_OLD) */
+
+/* Thai segmenter.  Find the segment of MT containing POS, which must
+   be the position of a Thai character.
+
+   Segments are cached as M_thai_wordseg text properties.  If there
+   is none at POS yet, the maximal run of characters in THAI_BEG
+   ... THAI_END around POS is converted to a 0-terminated byte string
+   and passed to thai_wordseg_propertize, which attaches the
+   properties.
+
+   Store the segment's bounds in *FROM and *TO (when not NULL).
+   Return 1 if the value of the property is Mt, 0 if it is not, and
+   -1, without setting *FROM and *TO, if no property covers POS.
+   WORDSEG is not used.  */
+int
+thai_wordseg (MText *mt, int pos, int *from, int *to,
+             MWordseg_Function *wordseg)
+{
+  MTextProperty *prop;
+
+  /* It is assured that there's a Thai character at POS.  */
+  prop = mtext_get_property (mt, pos, M_thai_wordseg);
+  if (! prop)
+    {
+      /* TIS620 code sequence.  */
+      unsigned char *tis;
+      int len = mtext_nchars (mt);
+      int beg, end;
+      int c, i;
+
+      for (beg = pos; beg > 0; beg--)
+        if ((c = mtext_ref_char (mt, beg - 1)) < THAI_BEG || c > THAI_END)
+          break;
+      for (end = pos + 1; end < len; end++)
+        if ((c = mtext_ref_char (mt, end)) < THAI_BEG || c > THAI_END)
+          break;
+
+      /* Extra 1-byte for 0 terminating.
+         NOTE(review): the run length is not bounded, so this alloca
+         grows with the length of the Thai run.  */
+      tis = alloca ((end - beg) + 1);
+
+      /* NOTE(review): for code points from 0x0E60 up the value below
+         does not fit in an unsigned char and is truncated on
+         assignment -- confirm that such code points cannot occur.  */
+      for (i = beg; i < end; i++)
+        tis[i - beg] = 0xA1 + (mtext_ref_char (mt, i) - THAI_BEG);
+      tis[i - beg] = 0;
+      prop = thai_wordseg_propertize (mt, pos, beg, end, tis);
+      if (! prop)
+        /* The backend attached no segment covering POS; report
+           failure instead of dereferencing a null property.  */
+        return -1;
+    }
+
+  if (from)
+    *from = MTEXTPROP_START (prop);
+  if (to)
+    *to = MTEXTPROP_END (prop);
+  return (MTEXTPROP_VAL (prop) == Mt);
+}
+
+#endif /* HAVE_THAI_WORDSEG */
+
+\f
+/* Internal API */
+
+/* Release everything set up by mtext__word_segment: call the `fini'
+   function of each backend whose `initialized' is positive, free
+   the backend list, and unreference wordseg_function_table.  Do
+   nothing if no backend is registered.  */
+void
+mtext__wseg_fini (void)
+{
+  if (! wordseg_function_list)
+    return;
+
+  while (wordseg_function_list)
+    {
+      MWordseg_Function *next = wordseg_function_list->next;
+
+      if (wordseg_function_list->initialized > 0
+          && wordseg_function_list->fini)
+        wordseg_function_list->fini ();
+      free (wordseg_function_list);
+      wordseg_function_list = next;
+    }
+  M17N_OBJECT_UNREF (wordseg_function_table);
+}
+
+/* Find the word boundaries around POS of MT.  If FROM is not NULL,
+   set *FROM to the boundary at or before POS; if TO is not NULL, set
+   *TO to the boundary after POS.
+
+   The first call creates the table that maps each character to a
+   segmentation backend.  A backend's `init' function, if any, is run
+   the first time the backend is needed; if it fails, the backend is
+   marked unusable.
+
+   @return If the boundaries were found, return 1 (the character at
+   POS is part of a word) or 0 (it is not).  If no usable backend
+   handles the character at POS, return -1 without setting *FROM and
+   *TO.  */
+
+int
+mtext__word_segment (MText *mt, int pos, int *from, int *to)
+{
+  int c = mtext_ref_char (mt, pos);
+  MWordseg_Function *handler;
+
+  if (! wordseg_function_table)
+    {
+      MWordseg_Function *entry;
+
+      wordseg_function_table = mchartable (Mnil, NULL);
+
+      /* The generic backend covers every character by default.  */
+      MSTRUCT_CALLOC (entry, MERROR_MTEXT);
+      entry->wseg = generic_wordseg;
+      entry->next = wordseg_function_list;
+      wordseg_function_list = entry;
+      mchartable_set_range (wordseg_function_table, 0, MCHAR_MAX, entry);
+
+#ifdef HAVE_THAI_WORDSEG
+      /* The Thai backend takes over the Thai range.  */
+      MSTRUCT_CALLOC (entry, MERROR_MTEXT);
+      entry->init = thai_wordseg_init;
+      entry->fini = thai_wordseg_fini;
+      entry->wseg = thai_wordseg;
+      entry->next = wordseg_function_list;
+      wordseg_function_list = entry;
+      mchartable_set_range (wordseg_function_table, THAI_BEG, THAI_END,
+                            entry);
+      M_thai_wordseg = msymbol ("  thai-wordseg");
+#endif
+    }
+
+  handler = mchartable_lookup (wordseg_function_table, c);
+  if (! handler || handler->initialized < 0)
+    return -1;
+
+  if (handler->initialized == 0)
+    {
+      if (handler->init && handler->init () < 0)
+        {
+          /* Remember the failure so that init is not retried.  */
+          handler->initialized = -1;
+          return -1;
+        }
+      handler->initialized = 1;
+    }
+  return handler->wseg (mt, pos, from, to, handler);
+}
+
+/*** @} */
+#endif /* !FOR_DOXYGEN || DOXYGEN_INTERNAL_MODULE */
author	handa <handa>
	Fri, 2 Sep 2005 06:18:07 +0000 (06:18 +0000)
committer	handa <handa>
	Fri, 2 Sep 2005 06:18:07 +0000 (06:18 +0000)
src/mtext-lbrk.c	[new file with mode: 0644]	patch \| blob
src/mtext-wseg.c	[new file with mode: 0644]	patch \| blob