X-Git-Url: http://git.chise.org/gitweb/?a=blobdiff_plain;f=src%2Fsearch.c;h=b5ac56d6856b4f99234fc009e1c2fe58d4e5e7ee;hb=37bdb8647edead5a0e669c6ec27841d470a9e5e6;hp=7738cde619ad32c396e448921dd834418067787f;hpb=1d9bc86590766427e2431876a50d78206a99edd5;p=chise%2Fxemacs-chise.git

diff --git a/src/search.c b/src/search.c
index 7738cde..b5ac56d 100644
--- a/src/search.c
+++ b/src/search.c
@@ -1,6 +1,7 @@
 /* String search routines for XEmacs.
    Copyright (C) 1985, 1986, 1987, 1992-1995 Free Software Foundation, Inc.
    Copyright (C) 1995 Sun Microsystems, Inc.
+   Copyright (C) 1999,2000,2001 MORIOKA Tomohiko
 
 This file is part of XEmacs.
 
@@ -38,13 +39,18 @@ Boston, MA 02111-1307, USA.  */
 
 #include <sys/types.h>
 #include "regex.h"
+#include "casetab.h"
+#include "chartab.h"
 
+#define TRANSLATE(table, pos)	\
+ (!NILP (table) ? TRT_TABLE_OF (table, (Emchar) pos) : pos)
 
 #define REGEXP_CACHE_SIZE 20
 
 /* If the regexp is non-nil, then the buffer contains the compiled form
    of that regexp, suitable for searching.  */
-struct regexp_cache {
+struct regexp_cache
+{
   struct regexp_cache *next;
   Lisp_Object regexp;
   struct re_pattern_buffer buf;
@@ -104,9 +110,16 @@ Lisp_Object Vskip_chars_range_table;
 
 static void set_search_regs (struct buffer *buf, Bufpos beg, Charcount len);
 static void save_search_regs (void);
+static Bufpos simple_search (struct buffer *buf, Bufbyte *base_pat,
+			     Bytecount len, Bytind pos, Bytind lim,
+			     EMACS_INT n, Lisp_Object trt);
+static Bufpos boyer_moore (struct buffer *buf, Bufbyte *base_pat,
+			   Bytecount len, Bytind pos, Bytind lim,
+			   EMACS_INT n, Lisp_Object trt,
+			   Lisp_Object inverse_trt, int charset_base);
 static Bufpos search_buffer (struct buffer *buf, Lisp_Object str,
 			     Bufpos bufpos, Bufpos buflim, EMACS_INT n, int RE,
-			     unsigned char *trt, unsigned char *inverse_trt,
+			     Lisp_Object trt, Lisp_Object inverse_trt,
 			     int posix);
 
 static void
@@ -128,7 +141,7 @@ matcher_overflow (void)
 
 static int
 compile_pattern_1 (struct regexp_cache *cp, Lisp_Object pattern,
-		   char *translate, struct re_registers *regp, int posix,
+		   Lisp_Object translate, struct re_registers *regp, int posix,
 		   Error_behavior errb)
 {
   const char *val;
@@ -167,7 +180,7 @@ compile_pattern_1 (struct regexp_cache *cp, Lisp_Object pattern,
 
 struct re_pattern_buffer *
 compile_pattern (Lisp_Object pattern, struct re_registers *regp,
-		 char *translate, int posix, Error_behavior errb)
+		 Lisp_Object translate, int posix, Error_behavior errb)
 {
   struct regexp_cache *cp, **cpp;
 
@@ -175,7 +188,7 @@ compile_pattern (Lisp_Object pattern, struct re_registers *regp,
     {
       cp = *cpp;
       if (!NILP (Fstring_equal (cp->regexp, pattern))
-	  && cp->buf.translate == translate
+	  && EQ (cp->buf.translate, translate)
 	  && cp->posix == posix)
 	break;
 
@@ -210,8 +223,9 @@ Lisp_Object Qsearch_failed;
 static Lisp_Object
 signal_failure (Lisp_Object arg)
 {
-  Fsignal (Qsearch_failed, list1 (arg));
-  return Qnil;
+  for (;;)
+    Fsignal (Qsearch_failed, list1 (arg));
+  return Qnil; /* Not reached. */
 }
 
 /* Convert the search registers from Bytinds to Bufpos's.  Needs to be
@@ -286,8 +300,7 @@ looking_at_1 (Lisp_Object string, struct buffer *buf, int posix)
   CHECK_STRING (string);
   bufp = compile_pattern (string, &search_regs,
 			  (!NILP (buf->case_fold_search)
-			   ? (char *) MIRROR_DOWNCASE_TABLE_AS_STRING (buf)
-			   : 0),
+			   ? XCASE_TABLE_DOWNCASE (buf->case_table) : Qnil),
 			  posix, ERROR_ME);
 
   QUIT;
@@ -386,8 +399,8 @@ string_match_1 (Lisp_Object regexp, Lisp_Object string, Lisp_Object start,
 
   bufp = compile_pattern (regexp, &search_regs,
 			  (!NILP (buf->case_fold_search)
-			   ? (char *) MIRROR_DOWNCASE_TABLE_AS_STRING (buf)
-			   : 0), 0, ERROR_ME);
+			   ? XCASE_TABLE_DOWNCASE (buf->case_table) : Qnil),
+			  0, ERROR_ME);
   QUIT;
   {
     Bytecount bis = charcount_to_bytecount (XSTRING_DATA (string), s);
@@ -456,10 +469,8 @@ fast_string_match (Lisp_Object regexp,  const Bufbyte *nonreloc,
 
   bufp = compile_pattern (regexp, 0,
 			  (case_fold_search
-			   ? (char *)
-			   /* #### evil current-buffer dependency */
-			   MIRROR_DOWNCASE_TABLE_AS_STRING (current_buffer)
-			   : 0),
+			   ? XCASE_TABLE_DOWNCASE (current_buffer->case_table)
+			   : Qnil),
 			  0, errb);
   if (!bufp)
     return -1; /* will only do this when errb != ERROR_ME */
@@ -774,7 +785,11 @@ skip_chars (struct buffer *buf, int forwardp, int syntaxp,
   unsigned char fastmap[0400];
   int negate = 0;
   REGISTER int i;
+#ifdef UTF2000
+  Lisp_Char_Table *syntax_table = XCHAR_TABLE (buf->syntax_table);
+#else
   Lisp_Char_Table *syntax_table = XCHAR_TABLE (buf->mirror_syntax_table);
+#endif
   Bufpos limit;
 
   if (NILP (lim))
@@ -1025,11 +1040,11 @@ search_command (Lisp_Object string, Lisp_Object limit, Lisp_Object noerror,
 
   np = search_buffer (buf, string, BUF_PT (buf), lim, n, RE,
 		      (!NILP (buf->case_fold_search)
-		       ? MIRROR_CANON_TABLE_AS_STRING (buf)
-                       : 0),
+		       ? XCASE_TABLE_CANON (buf->case_table)
+                       : Qnil),
 		      (!NILP (buf->case_fold_search)
-		       ? MIRROR_EQV_TABLE_AS_STRING (buf)
-                       : 0), posix);
+		       ? XCASE_TABLE_EQV (buf->case_table)
+                       : Qnil), posix);
 
   if (np <= 0)
     {
@@ -1107,25 +1122,14 @@ trivial_regexp_p (Lisp_Object regexp)
 
    POSIX is nonzero if we want full backtracking (POSIX style)
    for this pattern.  0 means backtrack only enough to get a valid match.  */
-
 static Bufpos
 search_buffer (struct buffer *buf, Lisp_Object string, Bufpos bufpos,
-	       Bufpos buflim, EMACS_INT n, int RE, unsigned char *trt,
-	       unsigned char *inverse_trt, int posix)
+	       Bufpos buflim, EMACS_INT n, int RE, Lisp_Object trt,
+	       Lisp_Object inverse_trt, int posix)
 {
   /* This function has been Mule-ized, except for the trt table handling. */
   Bytecount len = XSTRING_LENGTH (string);
   Bufbyte *base_pat = XSTRING_DATA (string);
-  REGISTER EMACS_INT *BM_tab;
-  EMACS_INT *BM_tab_base;
-  REGISTER int direction = ((n > 0) ? 1 : -1);
-  REGISTER Bytecount dirlen;
-  EMACS_INT infinity;
-  Bytind limit;
-  EMACS_INT k;
-  Bytecount stride_for_teases = 0;
-  REGISTER Bufbyte *pat = 0;
-  REGISTER Bufbyte *cursor, *p_limit, *ptr2;
   REGISTER EMACS_INT i, j;
   Bytind p1, p2;
   Bytecount s1, s2;
@@ -1151,7 +1155,7 @@ search_buffer (struct buffer *buf, Lisp_Object string, Bufpos bufpos,
     {
       struct re_pattern_buffer *bufp;
 
-      bufp = compile_pattern (string, &search_regs, (char *) trt, posix,
+      bufp = compile_pattern (string, &search_regs, trt, posix,
 			      ERROR_ME);
 
       /* Get pointers and sizes of the two strings
@@ -1242,318 +1246,665 @@ search_buffer (struct buffer *buf, Lisp_Object string, Bufpos bufpos,
       return bufpos;
     }
   else				/* non-RE case */
-    /* #### Someone really really really needs to comment the workings
-       of this junk somewhat better.
-
-       BTW "BM" stands for Boyer-Moore, which is one of the standard
-       string-searching algorithms.  It's the best string-searching
-       algorithm out there, provided that:
-
-       a) You're not fazed by algorithm complexity. (Rabin-Karp, which
-          uses hashing, is much much easier to code but not as fast.)
-       b) You can freely move backwards in the string that you're
-          searching through.
-
-       As the comment below tries to explain (but garbles in typical
-       programmer-ese), the idea is that you don't have to do a
-       string match at every successive position in the text.  For
-       example, let's say the pattern is "a very long string".  We
-       compare the last character in the string (`g') with the
-       corresponding character in the text.  If it mismatches, and
-       it is, say, `z', then we can skip forward by the entire
-       length of the pattern because `z' does not occur anywhere
-       in the pattern.  If the mismatching character does occur
-       in the pattern, we can usually still skip forward by more
-       than one: e.g. if it is `l', then we can skip forward
-       by the length of the substring "ong string" -- i.e. the
-       largest end section of the pattern that does not contain
-       the mismatched character.  So what we do is compute, for
-       each possible character, the distance we can skip forward
-       (the "stride") and use it in the string matching.  This
-       is what the BM_tab holds. */
     {
-#ifdef C_ALLOCA
-      EMACS_INT BM_tab_space[0400];
-      BM_tab = &BM_tab_space[0];
+      int charset_base = -1;
+      int boyer_moore_ok = 1;
+      Bufbyte *pat = 0;
+      Bufbyte *patbuf = alloca_array (Bufbyte, len * MAX_EMCHAR_LEN);
+      pat = patbuf;
+#ifdef MULE
+      while (len > 0)
+	{
+	  Bufbyte tmp_str[MAX_EMCHAR_LEN];
+	  Emchar c, translated, inverse;
+	  Bytecount orig_bytelen, new_bytelen, inv_bytelen;
+
+	  /* If we got here and the RE flag is set, it's because
+	     we're dealing with a regexp known to be trivial, so the
+	     backslash just quotes the next character.  */
+	  if (RE && *base_pat == '\\')
+	    {
+	      len--;
+	      base_pat++;
+	    }
+	  c = charptr_emchar (base_pat);
+	  translated = TRANSLATE (trt, c);
+	  inverse = TRANSLATE (inverse_trt, c);
+
+	  orig_bytelen = charcount_to_bytecount (base_pat, 1);
+	  inv_bytelen = set_charptr_emchar (tmp_str, inverse);
+	  new_bytelen = set_charptr_emchar (tmp_str, translated);
+
+
+	  if (new_bytelen != orig_bytelen || inv_bytelen != orig_bytelen)
+	    boyer_moore_ok = 0;
+	  if (translated != c || inverse != c)
+	    {
+	      /* Keep track of which character set row
+		 contains the characters that need translation.  */
+#ifdef UTF2000
+	      int charset_base_code = c >> 6;
 #else
-      BM_tab = alloca_array (EMACS_INT, 256);
+	      int charset_base_code = c & ~CHAR_FIELD3_MASK;
 #endif
+	      if (charset_base == -1)
+		charset_base = charset_base_code;
+	      else if (charset_base != charset_base_code)
+		/* If two different rows appear, needing translation,
+		   then we cannot use boyer_moore search.  */
+		boyer_moore_ok = 0;
+	    }
+	  memcpy (pat, tmp_str, new_bytelen);
+	  pat += new_bytelen;
+	  base_pat += orig_bytelen;
+	  len -= orig_bytelen;
+	}
+#else /* not MULE */
+      while (--len >= 0)
+	{
+	  /* If we got here and the RE flag is set, it's because
+	     we're dealing with a regexp known to be trivial, so the
+	     backslash just quotes the next character.  */
+	  if (RE && *base_pat == '\\')
+	    {
+	      len--;
+	      base_pat++;
+	    }
+	  *pat++ = TRANSLATE (trt, *base_pat++);
+	}
+#endif /* MULE */
+      len = pat - patbuf;
+      pat = base_pat = patbuf;
+      if (boyer_moore_ok)
+	return boyer_moore (buf, base_pat, len, pos, lim, n,
+			    trt, inverse_trt, charset_base);
+      else
+	return simple_search (buf, base_pat, len, pos, lim, n, trt);
+    }
+}
+
+/* Do a simple string search N times for the string PAT,
+   whose length is LEN/LEN_BYTE,
+   from buffer position POS/POS_BYTE until LIM/LIM_BYTE.
+   TRT is the translation table.
+
+   Return the character position where the match is found.
+   Otherwise, if M matches remained to be found, return -M.
+
+   This kind of search works regardless of what is in PAT and
+   regardless of what is in TRT.  It is used in cases where
+   boyer_moore cannot work.  */
+
+static Bufpos
+simple_search (struct buffer *buf, Bufbyte *base_pat, Bytecount len_byte,
+	       Bytind idx, Bytind lim, EMACS_INT n, Lisp_Object trt)
+{
+  int forward = n > 0;
+  Bytecount buf_len = 0; /* Shut up compiler. */
+
+  if (lim > idx)
+    while (n > 0)
+      {
+	while (1)
+	  {
+	    Bytecount this_len = len_byte;
+	    Bytind this_idx = idx;
+	    Bufbyte *p = base_pat;
+	    if (idx >= lim)
+	      goto stop;
+
+	    while (this_len > 0)
+	      {
+		Emchar pat_ch, buf_ch;
+		Bytecount pat_len;
+
+		pat_ch = charptr_emchar (p);
+		buf_ch = BI_BUF_FETCH_CHAR (buf, this_idx);
+
+		buf_ch = TRANSLATE (trt, buf_ch);
+
+		if (buf_ch != pat_ch)
+		  break;
+
+		pat_len = charcount_to_bytecount (p, 1);
+		p += pat_len;
+		this_len -= pat_len;
+		INC_BYTIND (buf, this_idx);
+	      }
+	    if (this_len == 0)
+	      {
+		buf_len = this_idx - idx;
+		idx = this_idx;
+		break;
+	      }
+	    INC_BYTIND (buf, idx);
+	  }
+	n--;
+      }
+  else
+    while (n < 0)
       {
-	Bufbyte *patbuf = alloca_array (Bufbyte, len);
-	pat = patbuf;
-	while (--len >= 0)
+	while (1)
 	  {
-	    /* If we got here and the RE flag is set, it's because we're
-	       dealing with a regexp known to be trivial, so the backslash
-	       just quotes the next character.  */
-	    if (RE && *base_pat == '\\')
+	    Bytecount this_len = len_byte;
+	    Bytind this_idx = idx;
+	    Bufbyte *p;
+	    if (idx <= lim)
+	      goto stop;
+	    p = base_pat + len_byte;
+
+	    while (this_len > 0)
 	      {
-		len--;
-		base_pat++;
+		Emchar pat_ch, buf_ch;
+
+		DEC_CHARPTR (p);
+		DEC_BYTIND (buf, this_idx);
+		pat_ch = charptr_emchar (p);
+		buf_ch = BI_BUF_FETCH_CHAR (buf, this_idx);
+
+		buf_ch = TRANSLATE (trt, buf_ch);
+
+		if (buf_ch != pat_ch)
+		  break;
+
+		this_len -= charcount_to_bytecount (p, 1);
 	      }
-	    *pat++ = (trt ? trt[*base_pat++] : *base_pat++);
+	    if (this_len == 0)
+	      {
+		buf_len = idx - this_idx;
+		idx = this_idx;
+		break;
+	      }
+	    DEC_BYTIND (buf, idx);
 	  }
-	len = pat - patbuf;
-	pat = base_pat = patbuf;
+	n++;
       }
-      /* The general approach is that we are going to maintain that we know */
-      /* the first (closest to the present position, in whatever direction */
-      /* we're searching) character that could possibly be the last */
-      /* (furthest from present position) character of a valid match.  We */
-      /* advance the state of our knowledge by looking at that character */
-      /* and seeing whether it indeed matches the last character of the */
-      /* pattern.  If it does, we take a closer look.  If it does not, we */
-      /* move our pointer (to putative last characters) as far as is */
-      /* logically possible.  This amount of movement, which I call a */
-      /* stride, will be the length of the pattern if the actual character */
-      /* appears nowhere in the pattern, otherwise it will be the distance */
-      /* from the last occurrence of that character to the end of the */
-      /* pattern. */
-      /* As a coding trick, an enormous stride is coded into the table for */
-      /* characters that match the last character.  This allows use of only */
-      /* a single test, a test for having gone past the end of the */
-      /* permissible match region, to test for both possible matches (when */
-      /* the stride goes past the end immediately) and failure to */
-      /* match (where you get nudged past the end one stride at a time). */
-
-      /* Here we make a "mickey mouse" BM table.  The stride of the search */
-      /* is determined only by the last character of the putative match. */
-      /* If that character does not match, we will stride the proper */
-      /* distance to propose a match that superimposes it on the last */
-      /* instance of a character that matches it (per trt), or misses */
-      /* it entirely if there is none. */
-
-      dirlen = len * direction;
-      infinity = dirlen - (lim + pos + len + len) * direction;
-      if (direction < 0)
-	pat = (base_pat += len - 1);
-      BM_tab_base = BM_tab;
-      BM_tab += 0400;
-      j = dirlen;		/* to get it in a register */
-      /* A character that does not appear in the pattern induces a */
-      /* stride equal to the pattern length. */
-      while (BM_tab_base != BM_tab)
+ stop:
+  if (n == 0)
+    {
+      Bufpos beg, end, retval;
+      if (forward)
 	{
-	  *--BM_tab = j;
-	  *--BM_tab = j;
-	  *--BM_tab = j;
-	  *--BM_tab = j;
+	  beg = bytind_to_bufpos (buf, idx - buf_len);
+	  retval = end = bytind_to_bufpos (buf, idx);
 	}
-      i = 0;
-      while (i != infinity)
+      else
 	{
-	  j = pat[i]; i += direction;
-	  if (i == dirlen) i = infinity;
-	  if (trt != 0)
-	    {
-	      k = (j = trt[j]);
-	      if (i == infinity)
-		stride_for_teases = BM_tab[j];
-	      BM_tab[j] = dirlen - i;
-	      /* A translation table is accompanied by its inverse -- see */
-	      /* comment following downcase_table for details */
+	  retval = beg = bytind_to_bufpos (buf, idx);
+	  end = bytind_to_bufpos (buf, idx + buf_len);
+	}
+      set_search_regs (buf, beg, end - beg);
+
+      return retval;
+    }
+  else if (n > 0)
+    return -n;
+  else
+    return n;
+}
 
-	      while ((j = inverse_trt[j]) != k)
-		BM_tab[j] = dirlen - i;
+/* Do Boyer-Moore search N times for the string PAT,
+   whose length is LEN/LEN_BYTE,
+   from buffer position POS/POS_BYTE until LIM/LIM_BYTE.
+   DIRECTION says which direction we search in.
+   TRT and INVERSE_TRT are translation tables.
+
+   This kind of search works if all the characters in PAT that have
+   nontrivial translation are the same aside from the last byte.  This
+   makes it possible to translate just the last byte of a character,
+   and do so after just a simple test of the context.
+
+   If that criterion is not satisfied, do not call this function.  */
+	    
+static Bufpos
+boyer_moore (struct buffer *buf, Bufbyte *base_pat, Bytecount len,
+	     Bytind pos, Bytind lim, EMACS_INT n, Lisp_Object trt,
+	     Lisp_Object inverse_trt, int charset_base)
+{
+  /* #### Someone really really really needs to comment the workings
+     of this junk somewhat better.
+
+     BTW "BM" stands for Boyer-Moore, which is one of the standard
+     string-searching algorithms.  It's the best string-searching
+     algorithm out there, provided that:
+
+     a) You're not fazed by algorithm complexity. (Rabin-Karp, which
+     uses hashing, is much much easier to code but not as fast.)
+     b) You can freely move backwards in the string that you're
+     searching through.
+
+     As the comment below tries to explain (but garbles in typical
+     programmer-ese), the idea is that you don't have to do a
+     string match at every successive position in the text.  For
+     example, let's say the pattern is "a very long string".  We
+     compare the last character in the string (`g') with the
+     corresponding character in the text.  If it mismatches, and
+     it is, say, `z', then we can skip forward by the entire
+     length of the pattern because `z' does not occur anywhere
+     in the pattern.  If the mismatching character does occur
+     in the pattern, we can usually still skip forward by more
+     than one: e.g. if it is `l', then we can skip forward
+     by the length of the substring "ong string" -- i.e. the
+     largest end section of the pattern that does not contain
+     the mismatched character.  So what we do is compute, for
+     each possible character, the distance we can skip forward
+     (the "stride") and use it in the string matching.  This
+     is what the BM_tab holds. */
+  REGISTER EMACS_INT *BM_tab;
+  EMACS_INT *BM_tab_base;
+  REGISTER Bytecount dirlen;
+  EMACS_INT infinity;
+  Bytind limit;
+  Bytecount stride_for_teases = 0;
+  REGISTER EMACS_INT i, j;
+  Bufbyte *pat, *pat_end;
+  REGISTER Bufbyte *cursor, *p_limit, *ptr2;
+  Bufbyte simple_translate[0400];
+  REGISTER int direction = ((n > 0) ? 1 : -1);
+#ifdef MULE
+  Bufbyte translate_prev_byte = 0;
+  Bufbyte translate_anteprev_byte = 0;
+#endif
+#ifdef C_ALLOCA
+  EMACS_INT BM_tab_space[0400];
+  BM_tab = &BM_tab_space[0];
+#else
+  BM_tab = alloca_array (EMACS_INT, 256);
+#endif
+  
+  /* The general approach is that we are going to maintain that we
+     know the first (closest to the present position, in whatever
+     direction we're searching) character that could possibly be
+     the last (furthest from present position) character of a
+     valid match.  We advance the state of our knowledge by
+     looking at that character and seeing whether it indeed
+     matches the last character of the pattern.  If it does, we
+     take a closer look.  If it does not, we move our pointer (to
+     putative last characters) as far as is logically possible.
+     This amount of movement, which I call a stride, will be the
+     length of the pattern if the actual character appears nowhere
+     in the pattern, otherwise it will be the distance from the
+     last occurrence of that character to the end of the pattern.
+     As a coding trick, an enormous stride is coded into the table
+     for characters that match the last character.  This allows
+     use of only a single test, a test for having gone past the
+     end of the permissible match region, to test for both
+     possible matches (when the stride goes past the end
+     immediately) and failure to match (where you get nudged past
+     the end one stride at a time).
+
+     Here we make a "mickey mouse" BM table.  The stride of the
+     search is determined only by the last character of the
+     putative match.  If that character does not match, we will
+     stride the proper distance to propose a match that
+     superimposes it on the last instance of a character that
+     matches it (per trt), or misses it entirely if there is
+     none. */
+
+  dirlen = len * direction;
+  infinity = dirlen - (lim + pos + len + len) * direction;
+  /* Record position after the end of the pattern.  */
+  pat_end = base_pat + len;
+  if (direction < 0)
+    base_pat = pat_end - 1;
+  BM_tab_base = BM_tab;
+  BM_tab += 0400;
+  j = dirlen;		/* to get it in a register */
+  /* A character that does not appear in the pattern induces a
+     stride equal to the pattern length. */
+  while (BM_tab_base != BM_tab)
+    {
+      *--BM_tab = j;
+      *--BM_tab = j;
+      *--BM_tab = j;
+      *--BM_tab = j;
+    }
+  /* We use this for translation, instead of TRT itself.  We
+     fill this in to handle the characters that actually occur
+     in the pattern.  Others don't matter anyway!  */
+  xzero (simple_translate);
+  for (i = 0; i < 0400; i++)
+    simple_translate[i] = i;
+  i = 0;
+  while (i != infinity)
+    {
+      Bufbyte *ptr = base_pat + i;
+      i += direction;
+      if (i == dirlen)
+	i = infinity;
+      if (!NILP (trt))
+	{
+#ifdef MULE
+	  Emchar ch, untranslated;
+	  int this_translated = 1;
+
+	  /* Is *PTR the last byte of a character?  */
+	  if (pat_end - ptr == 1 || BUFBYTE_FIRST_BYTE_P (ptr[1]))
+	    {
+	      Bufbyte *charstart = ptr;
+	      while (!BUFBYTE_FIRST_BYTE_P (*charstart))
+		charstart--;
+	      untranslated = charptr_emchar (charstart);
+#ifdef UTF2000
+	      if (charset_base == (untranslated >> 6))
+#else
+	      if (charset_base == (untranslated & ~CHAR_FIELD3_MASK))
+#endif
+		{
+		  ch = TRANSLATE (trt, untranslated);
+		  if (!BUFBYTE_FIRST_BYTE_P (*ptr))
+		    {
+		      translate_prev_byte = ptr[-1];
+		      if (!BUFBYTE_FIRST_BYTE_P (translate_prev_byte))
+			translate_anteprev_byte = ptr[-2];
+		    }
+		}
+	      else
+		{
+		  this_translated = 0;
+		  ch = *ptr;
+		}
 	    }
 	  else
 	    {
-	      if (i == infinity)
-		stride_for_teases = BM_tab[j];
+	      ch = *ptr;
+	      this_translated = 0;
+	    }
+	  if (ch > 0400)
+	    j = ((unsigned char) ch | 0200);
+	  else
+	    j = (unsigned char) ch;
+	      
+	  if (i == infinity)
+	    stride_for_teases = BM_tab[j];
+	  BM_tab[j] = dirlen - i;
+	  /* A translation table is accompanied by its inverse --
+	     see comment following downcase_table for details */
+	  if (this_translated)
+	    {
+	      Emchar starting_ch = ch;
+	      EMACS_INT starting_j = j;
+	      while (1)
+		{
+		  ch = TRANSLATE (inverse_trt, ch);
+		  if (ch > 0400)
+		    j = ((unsigned char) ch | 0200);
+		  else
+		    j = (unsigned char) ch;
+
+		  /* For all the characters that map into CH,
+		     set up simple_translate to map the last byte
+		     into STARTING_J.  */
+		  simple_translate[j] = starting_j;
+		  if (ch == starting_ch)
+		    break;
+		  BM_tab[j] = dirlen - i;
+		}
+	    }
+#else
+	  EMACS_INT k;
+	  j = *ptr;
+	  k = (j = TRANSLATE (trt, j));
+	  if (i == infinity)
+	    stride_for_teases = BM_tab[j];
+	  BM_tab[j] = dirlen - i;
+	  /* A translation table is accompanied by its inverse --
+	     see comment following downcase_table for details */
+
+	  while ((j = TRANSLATE (inverse_trt, j)) != k)
+	    {
+	      simple_translate[j] = k;
 	      BM_tab[j] = dirlen - i;
 	    }
-	  /* stride_for_teases tells how much to stride if we get a */
-	  /* match on the far character but are subsequently */
-	  /* disappointed, by recording what the stride would have been */
-	  /* for that character if the last character had been */
-	  /* different. */
+#endif
 	}
-      infinity = dirlen - infinity;
-      pos += dirlen - ((direction > 0) ? direction : 0);
-      /* loop invariant - pos points at where last char (first char if reverse)
-	 of pattern would align in a possible match.  */
-      while (n != 0)
+      else
 	{
-	  /* It's been reported that some (broken) compiler thinks that
-	     Boolean expressions in an arithmetic context are unsigned.
-	     Using an explicit ?1:0 prevents this.  */
-	  if ((lim - pos - ((direction > 0) ? 1 : 0)) * direction < 0)
-	    return n * (0 - direction);
-	  /* First we do the part we can by pointers (maybe nothing) */
-	  QUIT;
-	  pat = base_pat;
-	  limit = pos - dirlen + direction;
-	  /* XEmacs change: definitions of CEILING_OF and FLOOR_OF
-	     have changed.  See buffer.h. */
-	  limit = ((direction > 0)
-		   ? BI_BUF_CEILING_OF (buf, limit) - 1
-		   : BI_BUF_FLOOR_OF (buf, limit + 1));
-	  /* LIMIT is now the last (not beyond-last!) value
-	     POS can take on without hitting edge of buffer or the gap.  */
-	  limit = ((direction > 0)
-		   ? min (lim - 1, min (limit, pos + 20000))
-		   : max (lim, max (limit, pos - 20000)));
-	  if ((limit - pos) * direction > 20)
+	  j = *ptr;
+
+	  if (i == infinity)
+	    stride_for_teases = BM_tab[j];
+	  BM_tab[j] = dirlen - i;
+	}
+      /* stride_for_teases tells how much to stride if we get a
+	 match on the far character but are subsequently
+	 disappointed, by recording what the stride would have been
+	 for that character if the last character had been
+	 different. */
+    }
+  infinity = dirlen - infinity;
+  pos += dirlen - ((direction > 0) ? direction : 0);
+  /* loop invariant - pos points at where last char (first char if
+     reverse) of pattern would align in a possible match.  */
+  while (n != 0)
+    {
+      Bytind tail_end;
+      Bufbyte *tail_end_ptr;
+      /* It's been reported that some (broken) compiler thinks
+	 that Boolean expressions in an arithmetic context are
+	 unsigned.  Using an explicit ?1:0 prevents this.  */
+      if ((lim - pos - ((direction > 0) ? 1 : 0)) * direction < 0)
+	return n * (0 - direction);
+      /* First we do the part we can by pointers (maybe
+	 nothing) */
+      QUIT;
+      pat = base_pat;
+      limit = pos - dirlen + direction;
+      /* XEmacs change: definitions of CEILING_OF and FLOOR_OF
+	 have changed.  See buffer.h. */
+      limit = ((direction > 0)
+	       ? BI_BUF_CEILING_OF (buf, limit) - 1
+	       : BI_BUF_FLOOR_OF (buf, limit + 1));
+      /* LIMIT is now the last (not beyond-last!) value POS can
+	 take on without hitting edge of buffer or the gap.  */
+      limit = ((direction > 0)
+	       ? min (lim - 1, min (limit, pos + 20000))
+	       : max (lim, max (limit, pos - 20000)));
+      tail_end = BI_BUF_CEILING_OF (buf, pos);
+      tail_end_ptr = BI_BUF_BYTE_ADDRESS (buf, tail_end);
+
+      if ((limit - pos) * direction > 20)
+	{
+	  p_limit = BI_BUF_BYTE_ADDRESS (buf, limit);
+	  ptr2 = (cursor = BI_BUF_BYTE_ADDRESS (buf, pos));
+	  /* In this loop, pos + cursor - ptr2 is the surrogate
+	     for pos */
+	  while (1)	/* use one cursor setting as long as i can */
 	    {
-	      p_limit = BI_BUF_BYTE_ADDRESS (buf, limit);
-	      ptr2 = (cursor = BI_BUF_BYTE_ADDRESS (buf, pos));
-	      /* In this loop, pos + cursor - ptr2 is the surrogate for pos */
-	      while (1)		/* use one cursor setting as long as i can */
+	      if (direction > 0) /* worth duplicating */
 		{
-		  if (direction > 0) /* worth duplicating */
-		    {
-		      /* Use signed comparison if appropriate
-			 to make cursor+infinity sure to be > p_limit.
-			 Assuming that the buffer lies in a range of addresses
-			 that are all "positive" (as ints) or all "negative",
-			 either kind of comparison will work as long
-			 as we don't step by infinity.  So pick the kind
-			 that works when we do step by infinity.  */
-		      if ((EMACS_INT) (p_limit + infinity) >
-			  (EMACS_INT) p_limit)
-			while ((EMACS_INT) cursor <=
-			       (EMACS_INT) p_limit)
-			  cursor += BM_tab[*cursor];
-		      else
-			while ((EMACS_UINT) cursor <=
-			       (EMACS_UINT) p_limit)
-			  cursor += BM_tab[*cursor];
-		    }
+		  /* Use signed comparison if appropriate to make
+		     cursor+infinity sure to be > p_limit.
+		     Assuming that the buffer lies in a range of
+		     addresses that are all "positive" (as ints)
+		     or all "negative", either kind of comparison
+		     will work as long as we don't step by
+		     infinity.  So pick the kind that works when
+		     we do step by infinity.  */
+		  if ((EMACS_INT) (p_limit + infinity) >
+		      (EMACS_INT) p_limit)
+		    while ((EMACS_INT) cursor <=
+			   (EMACS_INT) p_limit)
+		      cursor += BM_tab[*cursor];
 		  else
-		    {
-		      if ((EMACS_INT) (p_limit + infinity) <
-			  (EMACS_INT) p_limit)
-			while ((EMACS_INT) cursor >=
-			       (EMACS_INT) p_limit)
-			  cursor += BM_tab[*cursor];
-		      else
-			while ((EMACS_UINT) cursor >=
-			       (EMACS_UINT) p_limit)
-			  cursor += BM_tab[*cursor];
-		    }
- /* If you are here, cursor is beyond the end of the searched region. */
- /* This can happen if you match on the far character of the pattern, */
- /* because the "stride" of that character is infinity, a number able */
- /* to throw you well beyond the end of the search.  It can also */
- /* happen if you fail to match within the permitted region and would */
- /* otherwise try a character beyond that region */
-		  if ((cursor - p_limit) * direction <= len)
-		    break;	/* a small overrun is genuine */
-		  cursor -= infinity; /* large overrun = hit */
-		  i = dirlen - direction;
-		  if (trt != 0)
-		    {
-		      while ((i -= direction) + direction != 0)
-			if (pat[i] != trt[*(cursor -= direction)])
-			  break;
-		    }
+		    while ((EMACS_UINT) cursor <=
+			   (EMACS_UINT) p_limit)
+		      cursor += BM_tab[*cursor];
+		}
+	      else
+		{
+		  if ((EMACS_INT) (p_limit + infinity) <
+		      (EMACS_INT) p_limit)
+		    while ((EMACS_INT) cursor >=
+			   (EMACS_INT) p_limit)
+		      cursor += BM_tab[*cursor];
 		  else
+		    while ((EMACS_UINT) cursor >=
+			   (EMACS_UINT) p_limit)
+		      cursor += BM_tab[*cursor];
+		}
+	      /* If you are here, cursor is beyond the end of the
+		 searched region.  This can happen if you match on
+		 the far character of the pattern, because the
+		 "stride" of that character is infinity, a number
+		 able to throw you well beyond the end of the
+		 search.  It can also happen if you fail to match
+		 within the permitted region and would otherwise
+		 try a character beyond that region */
+	      if ((cursor - p_limit) * direction <= len)
+		break;	/* a small overrun is genuine */
+	      cursor -= infinity; /* large overrun = hit */
+	      i = dirlen - direction;
+	      if (!NILP (trt))
+		{
+		  while ((i -= direction) + direction != 0)
 		    {
-		      while ((i -= direction) + direction != 0)
-			if (pat[i] != *(cursor -= direction))
-			  break;
-		    }
-		  cursor += dirlen - i - direction;	/* fix cursor */
-		  if (i + direction == 0)
-		    {
+#ifdef MULE
+		      Emchar ch;
 		      cursor -= direction;
+		      /* Translate only the last byte of a character.  */
+		      if ((cursor == tail_end_ptr
+			   || BUFBYTE_FIRST_BYTE_P (cursor[1]))
+			  && (BUFBYTE_FIRST_BYTE_P (cursor[0])
+			      || (translate_prev_byte == cursor[-1]
+				  && (BUFBYTE_FIRST_BYTE_P (translate_prev_byte)
+				      || translate_anteprev_byte == cursor[-2]))))
+			ch = simple_translate[*cursor];
+		      else
+			ch = *cursor;
+		      if (pat[i] != ch)
+			break;
+#else
+		      if (pat[i] != TRANSLATE (trt, *(cursor -= direction)))
+			break;
+#endif
+		    }
+		}
+	      else
+		{
+		  while ((i -= direction) + direction != 0)
+		    if (pat[i] != *(cursor -= direction))
+		      break;
+		}
+	      cursor += dirlen - i - direction;	/* fix cursor */
+	      if (i + direction == 0)
+		{
+		  cursor -= direction;
 
-		      {
-			Bytind bytstart = (pos + cursor - ptr2 +
-					   ((direction > 0)
-					    ? 1 - len : 0));
-			Bufpos bufstart = bytind_to_bufpos (buf, bytstart);
-			Bufpos bufend = bytind_to_bufpos (buf, bytstart + len);
+		  {
+		    Bytind bytstart = (pos + cursor - ptr2 +
+				       ((direction > 0)
+					? 1 - len : 0));
+		    Bufpos bufstart = bytind_to_bufpos (buf, bytstart);
+		    Bufpos bufend = bytind_to_bufpos (buf, bytstart + len);
 
-			set_search_regs (buf, bufstart, bufend - bufstart);
-		      }
+		    set_search_regs (buf, bufstart, bufend - bufstart);
+		  }
 
-		      if ((n -= direction) != 0)
-			cursor += dirlen; /* to resume search */
-		      else
-			return ((direction > 0)
-				? search_regs.end[0] : search_regs.start[0]);
-		    }
+		  if ((n -= direction) != 0)
+		    cursor += dirlen; /* to resume search */
 		  else
-		    cursor += stride_for_teases; /* <sigh> we lose -  */
+		    return ((direction > 0)
+			    ? search_regs.end[0] : search_regs.start[0]);
 		}
-	      pos += cursor - ptr2;
+	      else
+		cursor += stride_for_teases; /* <sigh> we lose -  */
 	    }
-	  else
-	    /* Now we'll pick up a clump that has to be done the hard */
-	    /* way because it covers a discontinuity */
+	  pos += cursor - ptr2;
+	}
+      else
+	/* Now we'll pick up a clump that has to be done the hard
+	   way because it covers a discontinuity */
+	{
+	  /* XEmacs change: definitions of CEILING_OF and FLOOR_OF
+	     have changed.  See buffer.h. */
+	  limit = ((direction > 0)
+		   ? BI_BUF_CEILING_OF (buf, pos - dirlen + 1) - 1
+		   : BI_BUF_FLOOR_OF (buf, pos - dirlen));
+	  limit = ((direction > 0)
+		   ? min (limit + len, lim - 1)
+		   : max (limit - len, lim));
+	  /* LIMIT is now the last value POS can have
+	     and still be valid for a possible match.  */
+	  while (1)
 	    {
-	      /* XEmacs change: definitions of CEILING_OF and FLOOR_OF
-		 have changed.  See buffer.h. */
-	      limit = ((direction > 0)
-		       ? BI_BUF_CEILING_OF (buf, pos - dirlen + 1) - 1
-		       : BI_BUF_FLOOR_OF (buf, pos - dirlen));
-	      limit = ((direction > 0)
-		       ? min (limit + len, lim - 1)
-		       : max (limit - len, lim));
-	      /* LIMIT is now the last value POS can have
-		 and still be valid for a possible match.  */
-	      while (1)
+	      /* This loop can be coded for space rather than
+		 speed because it will usually run only once.
+		 (the reach is at most len + 21, and typically
+		 does not exceed len) */
+	      while ((limit - pos) * direction >= 0)
+		/* *not* BI_BUF_FETCH_CHAR.  We are working here
+		   with bytes, not characters. */
+		pos += BM_tab[*BI_BUF_BYTE_ADDRESS (buf, pos)];
+	      /* now run the same tests to distinguish going off
+		 the end, a match or a phony match. */
+	      if ((pos - limit) * direction <= len)
+		break;	/* ran off the end */
+	      /* Found what might be a match.
+		 Set POS back to last (first if reverse) char pos.  */
+	      pos -= infinity;
+	      i = dirlen - direction;
+	      while ((i -= direction) + direction != 0)
 		{
-		  /* This loop can be coded for space rather than */
-		  /* speed because it will usually run only once. */
-		  /* (the reach is at most len + 21, and typically */
-		  /* does not exceed len) */
-		  while ((limit - pos) * direction >= 0)
-		    /* *not* BI_BUF_FETCH_CHAR.  We are working here
-		       with bytes, not characters. */
-		    pos += BM_tab[*BI_BUF_BYTE_ADDRESS (buf, pos)];
-		  /* now run the same tests to distinguish going off the */
-		  /* end, a match or a phony match. */
-		  if ((pos - limit) * direction <= len)
-		    break;	/* ran off the end */
-		  /* Found what might be a match.
-		     Set POS back to last (first if reverse) char pos.  */
-		  pos -= infinity;
-		  i = dirlen - direction;
-		  while ((i -= direction) + direction != 0)
-		    {
-		      pos -= direction;
-		      if (pat[i] != (((Bufbyte *) trt)
-				     /* #### Does not handle TRT right */
-				     ? trt[*BI_BUF_BYTE_ADDRESS (buf, pos)]
-				     : *BI_BUF_BYTE_ADDRESS (buf, pos)))
-			break;
-		    }
-		  /* Above loop has moved POS part or all the way
-		     back to the first char pos (last char pos if reverse).
-		     Set it once again at the last (first if reverse) char.  */
-		  pos += dirlen - i- direction;
-		  if (i + direction == 0)
-		    {
-		      pos -= direction;
+#ifdef MULE
+		  Emchar ch;
+		  Bufbyte *ptr;
+#endif
+		  pos -= direction;
+#ifdef MULE
+		  ptr = BI_BUF_BYTE_ADDRESS (buf, pos);
+		  if ((ptr == tail_end_ptr
+		       || BUFBYTE_FIRST_BYTE_P (ptr[1]))
+		      && (BUFBYTE_FIRST_BYTE_P (ptr[0])
+			  || (translate_prev_byte == ptr[-1]
+			      && (BUFBYTE_FIRST_BYTE_P (translate_prev_byte)
+				  || translate_anteprev_byte == ptr[-2]))))
+		    ch = simple_translate[*ptr];
+		  else
+		    ch = *ptr;
+		  if (pat[i] != ch)
+		    break;
+		      
+#else
+		  if (pat[i] != TRANSLATE (trt,
+					   *BI_BUF_BYTE_ADDRESS (buf, pos)))
+		    break;
+#endif
+		}
+	      /* Above loop has moved POS part or all the way back
+		 to the first char pos (last char pos if reverse).
+		 Set it once again at the last (first if reverse)
+		 char.  */
+	      pos += dirlen - i- direction;
+	      if (i + direction == 0)
+		{
+		  pos -= direction;
 
-		      {
-			Bytind bytstart = (pos +
-					   ((direction > 0)
-					    ? 1 - len : 0));
-			Bufpos bufstart = bytind_to_bufpos (buf, bytstart);
-			Bufpos bufend = bytind_to_bufpos (buf, bytstart + len);
+		  {
+		    Bytind bytstart = (pos +
+				       ((direction > 0)
+					? 1 - len : 0));
+		    Bufpos bufstart = bytind_to_bufpos (buf, bytstart);
+		    Bufpos bufend = bytind_to_bufpos (buf, bytstart + len);
 
-			set_search_regs (buf, bufstart, bufend - bufstart);
-		      }
+		    set_search_regs (buf, bufstart, bufend - bufstart);
+		  }
 
-		      if ((n -= direction) != 0)
-			pos += dirlen; /* to resume search */
-		      else
-			return ((direction > 0)
-				? search_regs.end[0] : search_regs.start[0]);
-		    }
+		  if ((n -= direction) != 0)
+		    pos += dirlen; /* to resume search */
 		  else
-		    pos += stride_for_teases;
+		    return ((direction > 0)
+			    ? search_regs.end[0] : search_regs.start[0]);
 		}
-	      }
-	  /* We have done one clump.  Can we continue? */
-	  if ((lim - pos) * direction < 0)
-	    return (0 - n) * direction;
+	      else
+		pos += stride_for_teases;
+	    }
 	}
-      return bytind_to_bufpos (buf, pos);
+      /* We have done one clump.  Can we continue? */
+      if ((lim - pos) * direction < 0)
+	return (0 - n) * direction;
     }
+  return bytind_to_bufpos (buf, pos);
 }
 
 /* Record beginning BEG and end BEG + LEN
@@ -1588,7 +1939,11 @@ wordify (Lisp_Object buffer, Lisp_Object string)
   Charcount i, len;
   EMACS_INT punct_count = 0, word_count = 0;
   struct buffer *buf = decode_buffer (buffer, 0);
+#ifdef UTF2000
+  Lisp_Char_Table *syntax_table = XCHAR_TABLE (buf->syntax_table);
+#else
   Lisp_Char_Table *syntax_table = XCHAR_TABLE (buf->mirror_syntax_table);
+#endif
 
   CHECK_STRING (string);
   len = XSTRING_CHAR_LENGTH (string);
@@ -1944,7 +2299,11 @@ and you do not need to specify it.
       buf = XBUFFER (buffer);
     }
 
+#ifdef UTF2000
+  syntax_table = XCHAR_TABLE (buf->syntax_table);
+#else
   syntax_table = XCHAR_TABLE (buf->mirror_syntax_table);
+#endif
 
   case_action = nochange;	/* We tried an initialization */
 				/* but some C compilers blew it */