X-Git-Url: http://git.chise.org/gitweb/?a=blobdiff_plain;ds=sidebyside;f=src%2Fregex.c;h=eb98c0bcab3053f4de6e34620869c58d117c961c;hb=48edee2a7f07225272c94d9bc300cb16f29c0ef8;hp=b2a7a64b3d50f3cc632e42f026ccc751f105120e;hpb=8ae91923b1c6a495348a86739ef5dafb55993b56;p=chise%2Fxemacs-chise.git.1 diff --git a/src/regex.c b/src/regex.c index b2a7a64..eb98c0b 100644 --- a/src/regex.c +++ b/src/regex.c @@ -6,6 +6,7 @@ Copyright (C) 1993, 1994, 1995 Free Software Foundation, Inc. Copyright (C) 1995 Sun Microsystems, Inc. Copyright (C) 1995 Ben Wing. + Copyright (C) 1999,2000,2001 MORIOKA Tomohiko This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -131,13 +132,19 @@ char *malloc (); char *realloc (); #endif -/* Other types */ +/* Types normally included via lisp.h */ #include /* for ptrdiff_t */ -#define charptr_emchar(str) ((Emchar) (str)[0]) +#ifdef REGEX_MALLOC +#ifndef DECLARE_NOTHING +#define DECLARE_NOTHING struct nosuchstruct +#endif +#endif typedef int Emchar; +#define charptr_emchar(str) ((Emchar) (str)[0]) + #define INC_CHARPTR(p) ((p)++) #define DEC_CHARPTR(p) ((p)--) @@ -1129,7 +1136,7 @@ static const char *re_error_msgid[] = exactly that if always used MAX_FAILURE_SPACE each time we failed. This is a variable only so users of regex can assign to it; we never change it ourselves. */ -#if defined (MATCH_MAY_ALLOCATE) +#if defined (MATCH_MAY_ALLOCATE) || defined (REGEX_MALLOC) /* 4400 was enough to cause a crash on Alpha OSF/1, whose default stack limit is 2mb. */ int re_max_failures = 20000; @@ -1585,13 +1592,6 @@ static unsigned char reg_unset_dummy; when we use a character as a subscript we must make it unsigned. */ #define TRANSLATE(d) (TRANSLATE_P (translate) ? RE_TRANSLATE (d) : (d)) -#ifdef MULE - -#define TRANSLATE_EXTENDED_UNSAFE(emch) \ - (TRANSLATE_P (translate) && emch < 0x80 ? RE_TRANSLATE (emch) : (emch)) - -#endif - /* Macros for outputting the compiled pattern into `buffer'. */ /* If the buffer isn't allocated when it comes in, use this. */ @@ -3356,8 +3356,12 @@ compile_extended_range (re_char **p_ptr, re_char *pend, ranges entirely within the first 256 chars. */ if ((range_start >= 0x100 || range_end >= 0x100) - && CHAR_LEADING_BYTE (range_start) != - CHAR_LEADING_BYTE (range_end)) +#ifdef UTF2000 + && CHAR_CHARSET_ID (range_start) != CHAR_CHARSET_ID (range_end) +#else + && CHAR_LEADING_BYTE (range_start) != CHAR_LEADING_BYTE (range_end) +#endif + ) return REG_ERANGESPAN; /* As advertised, translations only work over the 0 - 0x7F range. @@ -3643,25 +3647,36 @@ re_compile_fastmap (struct re_pattern_buffer *bufp) goto done; #ifdef emacs -#if 0 /* Removed during syntax-table properties patch -- 2000/12/07 mct */ +#if 0 /* Removed during syntax-table properties patch -- 2000/12/07 mct */ case syntaxspec: k = *p++; #endif matchsyntax: #ifdef MULE +#ifdef UTF2000 + for (j = 0; j < 0x80; j++) + if (SYNTAX_UNSAFE + (XCHAR_TABLE + (regex_emacs_buffer->syntax_table), j) == + (enum syntaxcode) k) + fastmap[j] = 1; +#else for (j = 0; j < 0x80; j++) if (SYNTAX_UNSAFE (XCHAR_TABLE (regex_emacs_buffer->mirror_syntax_table), j) == (enum syntaxcode) k) fastmap[j] = 1; +#endif for (j = 0x80; j < 0xA0; j++) { +#ifndef UTF2000 if (LEADING_BYTE_PREFIX_P(j)) /* too complicated to calculate this right */ fastmap[j] = 1; else { +#endif int multi_p; Lisp_Object cset; @@ -3673,7 +3688,9 @@ re_compile_fastmap (struct re_pattern_buffer *bufp) == Sword || multi_p) fastmap[j] = 1; } +#ifndef UTF2000 } +#endif } #else /* not MULE */ for (j = 0; j < (1 << BYTEWIDTH); j++) @@ -3686,25 +3703,36 @@ re_compile_fastmap (struct re_pattern_buffer *bufp) break; -#if 0 /* Removed during syntax-table properties patch -- 2000/12/07 mct */ +#if 0 /* Removed during syntax-table properties patch -- 2000/12/07 mct */ case notsyntaxspec: k = *p++; #endif matchnotsyntax: #ifdef MULE +#ifdef UTF2000 + for (j = 0; j < 0x80; j++) + if (SYNTAX_UNSAFE + (XCHAR_TABLE + (regex_emacs_buffer->syntax_table), j) != + (enum syntaxcode) k) + fastmap[j] = 1; +#else for (j = 0; j < 0x80; j++) if (SYNTAX_UNSAFE (XCHAR_TABLE (regex_emacs_buffer->mirror_syntax_table), j) != (enum syntaxcode) k) fastmap[j] = 1; +#endif for (j = 0x80; j < 0xA0; j++) { +#ifndef UTF2000 if (LEADING_BYTE_PREFIX_P(j)) /* too complicated to calculate this right */ fastmap[j] = 1; else { +#endif int multi_p; Lisp_Object cset; @@ -3716,7 +3744,9 @@ re_compile_fastmap (struct re_pattern_buffer *bufp) != Sword || multi_p) fastmap[j] = 1; } +#ifndef UTF2000 } +#endif } #else /* not MULE */ for (j = 0; j < (1 << BYTEWIDTH); j++) @@ -3746,7 +3776,7 @@ re_compile_fastmap (struct re_pattern_buffer *bufp) case at_dot: case after_dot: continue; -#endif /* not emacs */ +#endif /* emacs */ case no_op: @@ -4108,10 +4138,12 @@ re_search_2 (struct re_pattern_buffer *bufp, const char *str1, { #ifdef MULE Emchar buf_ch; + Bufbyte str[MAX_EMCHAR_LEN]; buf_ch = charptr_emchar (d); buf_ch = RE_TRANSLATE (buf_ch); - if (buf_ch >= 0200 || fastmap[(unsigned char) buf_ch]) + set_charptr_emchar (str, buf_ch); + if (buf_ch >= 0200 || fastmap[(unsigned char) *str]) break; #else if (fastmap[(unsigned char)RE_TRANSLATE (*d)]) @@ -4236,9 +4268,15 @@ re_search_2 (struct re_pattern_buffer *bufp, const char *str1, #define POS_AFTER_GAP_UNSAFE(d) ((d) == end1 ? string2 : (d)) /* Test if CH is a word-constituent character. (XEmacs change) */ +#ifdef UTF2000 +#define WORDCHAR_P_UNSAFE(ch) \ + (SYNTAX_UNSAFE (XCHAR_TABLE (regex_emacs_buffer->syntax_table), \ + ch) == Sword) +#else #define WORDCHAR_P_UNSAFE(ch) \ (SYNTAX_UNSAFE (XCHAR_TABLE (regex_emacs_buffer->mirror_syntax_table), \ ch) == Sword) +#endif /* Free everything we malloc. */ #ifdef MATCH_MAY_ALLOCATE @@ -4710,16 +4748,24 @@ re_match_2_internal (struct re_pattern_buffer *bufp, re_char *string1, = (regoff_t) POINTER_TO_OFFSET (regend[mcnt]); } } - - /* If the regs structure we return has more elements than - were in the pattern, set the extra elements to -1. If - we (re)allocated the registers, this is the case, - because we always allocate enough to have at least one - -1 at the end. */ - for (mcnt = num_regs; mcnt < regs->num_regs; mcnt++) - regs->start[mcnt] = regs->end[mcnt] = -1; } /* regs && !bufp->no_sub */ + /* If we have regs and the regs structure has more elements than + were in the pattern, set the extra elements to -1. If we + (re)allocated the registers, this is the case, because we + always allocate enough to have at least one -1 at the end. + + We do this even when no_sub is set because some applications + (XEmacs) reuse register structures which may contain stale + information, and permit attempts to access those registers. + + It would be possible to require the caller to do this, but we'd + have to change the API for this function to reflect that, and + audit all callers. */ + if (regs && regs->num_regs > 0) + for (mcnt = num_regs; mcnt < regs->num_regs; mcnt++) + regs->start[mcnt] = regs->end[mcnt] = -1; + DEBUG_PRINT4 ("%u failure points pushed, %u popped (%u remain).\n", nfailure_points_pushed, nfailure_points_popped, nfailure_points_pushed - nfailure_points_popped); @@ -4851,7 +4897,7 @@ re_match_2_internal (struct re_pattern_buffer *bufp, re_char *string1, REGEX_PREFETCH (); c = charptr_emchar ((const Bufbyte *) d); - c = TRANSLATE_EXTENDED_UNSAFE (c); /* The character to match. */ + c = TRANSLATE (c); /* The character to match. */ if (EQ (Qt, unified_range_table_lookup (p, c, Qnil))) not_p = !not_p; @@ -5532,40 +5578,64 @@ re_match_2_internal (struct re_pattern_buffer *bufp, re_char *string1, matchwordbound: { /* XEmacs change */ - int result; - if (AT_STRINGS_BEG (d) || AT_STRINGS_END (d)) - result = 1; - else - { - re_char *d_before = POS_BEFORE_GAP_UNSAFE (d); - re_char *d_after = POS_AFTER_GAP_UNSAFE (d); - - /* emch1 is the character before d, syn1 is the syntax of emch1, - emch2 is the character at d, and syn2 is the syntax of emch2. */ - Emchar emch1, emch2; - int syn1, syn2; + /* Straightforward and (I hope) correct implementation. + Probably should be optimized by arranging to compute + pos only once. */ + /* emch1 is the character before d, syn1 is the syntax of + emch1, emch2 is the character at d, and syn2 is the + syntax of emch2. */ + Emchar emch1, emch2; + int syn1, syn2; + re_char *d_before, *d_after; + int result, + at_beg = AT_STRINGS_BEG (d), + at_end = AT_STRINGS_END (d); #ifdef emacs - int pos_before; + int xpos; #endif - DEC_CHARPTR (d_before); - emch1 = charptr_emchar (d_before); - emch2 = charptr_emchar (d_after); - + if (at_beg && at_end) + { + result = 0; + } + else + { + if (!at_beg) + { + d_before = POS_BEFORE_GAP_UNSAFE (d); + DEC_CHARPTR (d_before); + emch1 = charptr_emchar (d_before); #ifdef emacs - pos_before = SYNTAX_CACHE_BYTE_TO_CHAR (PTR_TO_OFFSET (d)) - 1; - UPDATE_SYNTAX_CACHE (pos_before); + xpos = SYNTAX_CACHE_BYTE_TO_CHAR (PTR_TO_OFFSET (d)) - 1; + UPDATE_SYNTAX_CACHE (xpos); #endif - syn1 = SYNTAX_FROM_CACHE (XCHAR_TABLE (regex_emacs_buffer->mirror_syntax_table), - emch1); + syn1 = SYNTAX_FROM_CACHE + (XCHAR_TABLE (regex_emacs_buffer + ->mirror_syntax_table), + emch1); + } + if (!at_end) + { + d_after = POS_AFTER_GAP_UNSAFE (d); + emch2 = charptr_emchar (d_after); #ifdef emacs - UPDATE_SYNTAX_CACHE_FORWARD (pos_before + 1); + xpos = SYNTAX_CACHE_BYTE_TO_CHAR (PTR_TO_OFFSET (d)); + UPDATE_SYNTAX_CACHE_FORWARD (xpos + 1); #endif - syn2 = SYNTAX_FROM_CACHE (XCHAR_TABLE (regex_emacs_buffer->mirror_syntax_table), - emch2); + syn2 = SYNTAX_FROM_CACHE + (XCHAR_TABLE (regex_emacs_buffer + ->mirror_syntax_table), + emch2); + } - result = ((syn1 == Sword) != (syn2 == Sword)); + if (at_beg) + result = (syn2 == Sword); + else if (at_end) + result = (syn1 == Sword); + else + result = ((syn1 == Sword) != (syn2 == Sword)); } + if (result == should_succeed) break; goto fail; @@ -5705,8 +5775,13 @@ re_match_2_internal (struct re_pattern_buffer *bufp, re_char *string1, #endif emch = charptr_emchar ((const Bufbyte *) d); - matches = (SYNTAX_FROM_CACHE (regex_emacs_buffer->mirror_syntax_table, +#ifdef UTF2000 + matches = (SYNTAX_FROM_CACHE (XCHAR_TABLE (regex_emacs_buffer->syntax_table), emch) == (enum syntaxcode) mcnt); +#else + matches = (SYNTAX_FROM_CACHE (XCHAR_TABLE (regex_emacs_buffer->mirror_syntax_table), + emch) == (enum syntaxcode) mcnt); +#endif INC_CHARPTR (d); if (matches != should_succeed) goto fail;