1 /* Extended regular expression matching and search library,
2 version 0.12, extended for XEmacs.
3 (Implements POSIX draft P10003.2/D11.2, except for
4 internationalization features.)
6 Copyright (C) 1993, 1994, 1995 Free Software Foundation, Inc.
7 Copyright (C) 1995 Sun Microsystems, Inc.
8 Copyright (C) 1995 Ben Wing.
9 Copyright (C) 1999,2000,2001 MORIOKA Tomohiko
11 This program is free software; you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation; either version 2, or (at your option)
16 This program is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
21 You should have received a copy of the GNU General Public License
22 along with this program; see the file COPYING. If not, write to
23 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24 Boston, MA 02111-1307, USA. */
26 /* Synched up with: FSF 19.29. */
28 /* Changes made for XEmacs:
30 (1) the REGEX_BEGLINE_CHECK code from the XEmacs v18 regex routines
31 was added. This causes a huge speedup in font-locking.
32 (2) Rel-alloc is disabled when the MMAP version of rel-alloc is
33 being used, because it's too slow -- all those calls to mmap()
34 add humongous overhead.
35 (3) Lots and lots of changes for Mule. They are bracketed by
36 `#ifdef MULE' or with comments that have `XEmacs' in them.
43 #ifndef REGISTER /* Rigidly enforced as of 20.3 */
52 /* Converts the pointer to the char to BEG-based offset from the start. */
53 #define PTR_TO_OFFSET(d) (MATCHING_IN_FIRST_STRING \
54 ? (d) - string1 : (d) - (string2 - size1))
56 #define PTR_TO_OFFSET(d) 0
59 /* We assume non-Mule if emacs isn't defined. */
64 /* We need this for `regex.h', and perhaps for the Emacs include files. */
65 #include <sys/types.h>
67 /* This is for other GNU distributions with internationalized messages. */
68 #if defined (I18N3) && (defined (HAVE_LIBINTL_H) || defined (_LIBC))
71 # define gettext(msgid) (msgid)
74 /* XEmacs: define this to add in a speedup for patterns anchored at
75 the beginning of a line. Keep the ifdefs so that it's easier to
76 tell where/why this code has diverged from v19. */
77 #define REGEX_BEGLINE_CHECK
79 /* XEmacs: the current mmap-based ralloc handles small blocks very
80 poorly, so we disable it here. */
82 #if (defined (REL_ALLOC) && defined (HAVE_MMAP)) || defined(DOUG_LEA_MALLOC)
86 /* The `emacs' switch turns on certain matching commands
87 that make sense only in Emacs. */
94 #if (defined (DEBUG_XEMACS) && !defined (DEBUG))
100 Lisp_Object Vthe_lisp_rangetab;
103 complex_vars_of_regex (void)
105 Vthe_lisp_rangetab = Fmake_range_table ();
106 staticpro (&Vthe_lisp_rangetab);
112 complex_vars_of_regex (void)
118 #define RE_TRANSLATE(ch) TRT_TABLE_OF (translate, (Emchar) ch)
119 #define TRANSLATE_P(tr) (!NILP (tr))
121 #else /* not emacs */
125 /* If we are not linking with Emacs proper,
126 we can't use the relocating allocator
127 even if config.h says that we can. */
130 #if defined (STDC_HEADERS) || defined (_LIBC)
137 /* Types normally included via lisp.h */
138 #include <stddef.h> /* for ptrdiff_t */
141 #ifndef DECLARE_NOTHING
142 #define DECLARE_NOTHING struct nosuchstruct
148 #define charptr_emchar(str) ((Emchar) (str)[0])
150 #define INC_CHARPTR(p) ((p)++)
151 #define DEC_CHARPTR(p) ((p)--)
155 /* Define the syntax stuff for \<, \>, etc. */
157 /* This must be nonzero for the wordchar and notwordchar pattern
158 commands in re_match_2. */
165 extern char *re_syntax_table;
167 #else /* not SYNTAX_TABLE */
169 /* How many characters in the character set. */
170 #define CHAR_SET_SIZE 256
172 static char re_syntax_table[CHAR_SET_SIZE];
175 init_syntax_once (void)
181 const char *word_syntax_chars =
182 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789_";
184 memset (re_syntax_table, 0, sizeof (re_syntax_table));
186 while (*word_syntax_chars)
187 re_syntax_table[(unsigned int)(*word_syntax_chars++)] = Sword;
193 #endif /* SYNTAX_TABLE */
195 #define SYNTAX_UNSAFE(ignored, c) re_syntax_table[c]
196 #undef SYNTAX_FROM_CACHE
197 #define SYNTAX_FROM_CACHE SYNTAX_UNSAFE
199 #define RE_TRANSLATE(c) translate[(unsigned char) (c)]
200 #define TRANSLATE_P(tr) tr
204 /* Under XEmacs, this is needed because we don't define it elsewhere. */
205 #ifdef SWITCH_ENUM_BUG
206 #define SWITCH_ENUM_CAST(x) ((int)(x))
208 #define SWITCH_ENUM_CAST(x) (x)
212 /* Get the interface, including the syntax bits. */
215 /* isalpha etc. are used for the character classes. */
218 /* Jim Meyering writes:
220 "... Some ctype macros are valid only for character codes that
221 isascii says are ASCII (SGI's IRIX-4.0.5 is one such system --when
222 using /bin/cc or gcc but without giving an ansi option). So, all
223 ctype uses should be through macros like ISPRINT... If
224 STDC_HEADERS is defined, then autoconf has verified that the ctype
225 macros don't need to be guarded with references to isascii. ...
226 Defining isascii to 1 should let any compiler worth its salt
227 eliminate the && through constant folding." */
229 #if defined (STDC_HEADERS) || (!defined (isascii) && !defined (HAVE_ISASCII))
230 #define ISASCII_1(c) 1
232 #define ISASCII_1(c) isascii(c)
236 /* The IS*() macros can be passed any character, including an extended
237 one. We need to make sure there are no crashes, which would occur
238 otherwise due to out-of-bounds array references. */
239 #define ISASCII(c) (((EMACS_UINT) (c)) < 0x100 && ISASCII_1 (c))
241 #define ISASCII(c) ISASCII_1 (c)
245 #define ISBLANK(c) (ISASCII (c) && isblank (c))
247 #define ISBLANK(c) ((c) == ' ' || (c) == '\t')
250 #define ISGRAPH(c) (ISASCII (c) && isgraph (c))
252 #define ISGRAPH(c) (ISASCII (c) && isprint (c) && !isspace (c))
255 #define ISPRINT(c) (ISASCII (c) && isprint (c))
256 #define ISDIGIT(c) (ISASCII (c) && isdigit (c))
257 #define ISALNUM(c) (ISASCII (c) && isalnum (c))
258 #define ISALPHA(c) (ISASCII (c) && isalpha (c))
259 #define ISCNTRL(c) (ISASCII (c) && iscntrl (c))
260 #define ISLOWER(c) (ISASCII (c) && islower (c))
261 #define ISPUNCT(c) (ISASCII (c) && ispunct (c))
262 #define ISSPACE(c) (ISASCII (c) && isspace (c))
263 #define ISUPPER(c) (ISASCII (c) && isupper (c))
264 #define ISXDIGIT(c) (ISASCII (c) && isxdigit (c))
267 #define NULL (void *)0
270 /* We remove any previous definition of `SIGN_EXTEND_CHAR',
271 since ours (we hope) works properly with all combinations of
272 machines, compilers, `char' and `unsigned char' argument types.
273 (Per Bothner suggested the basic approach.) */
274 #undef SIGN_EXTEND_CHAR
276 #define SIGN_EXTEND_CHAR(c) ((signed char) (c))
277 #else /* not __STDC__ */
278 /* As in Harbison and Steele. */
279 #define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128)
282 /* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we
283 use `alloca' instead of `malloc'. This is because using malloc in
284 re_search* or re_match* could cause memory leaks when C-g is used in
285 Emacs; also, malloc is slower and causes storage fragmentation. On
286 the other hand, malloc is more portable, and easier to debug.
288 Because we sometimes use alloca, some routines have to be macros,
289 not functions -- `alloca'-allocated space disappears at the end of the
290 function it is called in. */
294 #define REGEX_ALLOCATE malloc
295 #define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize)
296 #define REGEX_FREE free
298 #else /* not REGEX_MALLOC */
300 /* Emacs already defines alloca, sometimes. */
303 /* Make alloca work the best possible way. */
305 #define alloca __builtin_alloca
306 #else /* not __GNUC__ */
309 #else /* not __GNUC__ or HAVE_ALLOCA_H */
310 #ifndef _AIX /* Already did AIX, up at the top. */
312 #endif /* not _AIX */
313 #endif /* HAVE_ALLOCA_H */
314 #endif /* __GNUC__ */
316 #endif /* not alloca */
318 #define REGEX_ALLOCATE alloca
320 /* Assumes a `char *destination' variable. */
321 #define REGEX_REALLOCATE(source, osize, nsize) \
322 (destination = (char *) alloca (nsize), \
323 memmove (destination, source, osize), \
326 /* No need to do anything to free, after alloca. */
327 #define REGEX_FREE(arg) ((void)0) /* Do nothing! But inhibit gcc warning. */
329 #endif /* REGEX_MALLOC */
331 /* Define how to allocate the failure stack. */
334 #define REGEX_ALLOCATE_STACK(size) \
335 r_alloc ((char **) &failure_stack_ptr, (size))
336 #define REGEX_REALLOCATE_STACK(source, osize, nsize) \
337 r_re_alloc ((char **) &failure_stack_ptr, (nsize))
338 #define REGEX_FREE_STACK(ptr) \
339 r_alloc_free ((void **) &failure_stack_ptr)
341 #else /* not REL_ALLOC */
345 #define REGEX_ALLOCATE_STACK malloc
346 #define REGEX_REALLOCATE_STACK(source, osize, nsize) realloc (source, nsize)
347 #define REGEX_FREE_STACK free
349 #else /* not REGEX_MALLOC */
351 #define REGEX_ALLOCATE_STACK alloca
353 #define REGEX_REALLOCATE_STACK(source, osize, nsize) \
354 REGEX_REALLOCATE (source, osize, nsize)
355 /* No need to explicitly free anything. */
356 #define REGEX_FREE_STACK(arg)
358 #endif /* REGEX_MALLOC */
359 #endif /* REL_ALLOC */
362 /* True if `size1' is non-NULL and PTR is pointing anywhere inside
363 `string1' or just past its end. This works if PTR is NULL, which is
365 #define FIRST_STRING_P(ptr) \
366 (size1 && string1 <= (ptr) && (ptr) <= string1 + size1)
368 /* (Re)Allocate N items of type T using malloc, or fail. */
369 #define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t)))
370 #define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, (n) * sizeof (t)))
371 #define RETALLOC_IF(addr, n, t) \
372 if (addr) RETALLOC((addr), (n), t); else (addr) = TALLOC ((n), t)
373 #define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t)))
375 #define BYTEWIDTH 8 /* In bits. */
377 #define STREQ(s1, s2) (strcmp (s1, s2) == 0)
381 #define MAX(a, b) ((a) > (b) ? (a) : (b))
382 #define MIN(a, b) ((a) < (b) ? (a) : (b))
384 /* Type of source-pattern and string chars. */
385 typedef const unsigned char re_char;
387 typedef char re_bool;
392 /* These are the command codes that appear in compiled regular
393 expressions. Some opcodes are followed by argument bytes. A
394 command code can specify any interpretation whatsoever for its
395 arguments. Zero bytes may appear in the compiled regular expression. */
401 /* Succeed right away--no more backtracking. */
404 /* Followed by one byte giving n, then by n literal bytes. */
407 /* Matches any (more or less) character. */
410 /* Matches any one char belonging to specified set. First
411 following byte is number of bitmap bytes. Then come bytes
412 for a bitmap saying which chars are in. Bits in each byte
413 are ordered low-bit-first. A character is in the set if its
414 bit is 1. A character too large to have a bit in the map is
415 automatically not in the set. */
418 /* Same parameters as charset, but match any character that is
419 not one of those specified. */
422 /* Start remembering the text that is matched, for storing in a
423 register. Followed by one byte with the register number, in
424 the range 1 to the pattern buffer's re_ngroups
425 field. Then followed by one byte with the number of groups
426 inner to this one. (This last has to be part of the
427 start_memory only because we need it in the on_failure_jump
431 /* Stop remembering the text that is matched and store it in a
432 memory register. Followed by one byte with the register
433 number, in the range 1 to `re_ngroups' in the
434 pattern buffer, and one byte with the number of inner groups,
435 just like `start_memory'. (We need the number of inner
436 groups here because we don't have any easy way of finding the
437 corresponding start_memory when we're at a stop_memory.) */
440 /* Match a duplicate of something remembered. Followed by one
441 byte containing the register number. */
444 /* Fail unless at beginning of line. */
447 /* Fail unless at end of line. */
450 /* Succeeds if at beginning of buffer (if emacs) or at beginning
451 of string to be matched (if not). */
454 /* Analogously, for end of buffer/string. */
457 /* Followed by two byte relative address to which to jump. */
460 /* Same as jump, but marks the end of an alternative. */
463 /* Followed by two-byte relative address of place to resume at
464 in case of failure. */
467 /* Like on_failure_jump, but pushes a placeholder instead of the
468 current string position when executed. */
469 on_failure_keep_string_jump,
471 /* Throw away latest failure point and then jump to following
472 two-byte relative address. */
475 /* Change to pop_failure_jump if know won't have to backtrack to
476 match; otherwise change to jump. This is used to jump
477 back to the beginning of a repeat. If what follows this jump
478 clearly won't match what the repeat does, such that we can be
479 sure that there is no use backtracking out of repetitions
480 already matched, then we change it to a pop_failure_jump.
481 Followed by two-byte address. */
484 /* Jump to following two-byte address, and push a dummy failure
485 point. This failure point will be thrown away if an attempt
486 is made to use it for a failure. A `+' construct makes this
487 before the first repeat. Also used as an intermediary kind
488 of jump when compiling an alternative. */
491 /* Push a dummy failure point and continue. Used at the end of
495 /* Followed by two-byte relative address and two-byte number n.
496 After matching N times, jump to the address upon failure. */
499 /* Followed by two-byte relative address, and two-byte number n.
500 Jump to the address N times, then fail. */
503 /* Set the following two-byte relative address to the
504 subsequent two-byte number. The address *includes* the two
508 wordchar, /* Matches any word-constituent character. */
509 notwordchar, /* Matches any char that is not a word-constituent. */
511 wordbeg, /* Succeeds if at word beginning. */
512 wordend, /* Succeeds if at word end. */
514 wordbound, /* Succeeds if at a word boundary. */
515 notwordbound /* Succeeds if not at a word boundary. */
518 ,before_dot, /* Succeeds if before point. */
519 at_dot, /* Succeeds if at point. */
520 after_dot, /* Succeeds if after point. */
522 /* Matches any character whose syntax is specified. Followed by
523 a byte which contains a syntax code, e.g., Sword. */
526 /* Matches any character whose syntax is not that specified. */
532 /* need extra stuff to be able to properly work with XEmacs/Mule
533 characters (which may take up more than one byte) */
535 ,charset_mule, /* Matches any character belonging to specified set.
536 The set is stored in "unified range-table
537 format"; see rangetab.c. Unlike the `charset'
538 opcode, this can handle arbitrary characters. */
540 charset_mule_not /* Same parameters as charset_mule, but match any
541 character that is not one of those specified. */
543 /* 97/2/17 jhod: The following two were merged back in from the Mule
544 2.3 code to enable some language specific processing */
545 ,categoryspec, /* Matches entries in the character category tables */
546 notcategoryspec /* The opposite of the above */
551 /* Common operations on the compiled pattern. */
553 /* Store NUMBER in two contiguous bytes starting at DESTINATION. */
555 #define STORE_NUMBER(destination, number) \
557 (destination)[0] = (number) & 0377; \
558 (destination)[1] = (number) >> 8; \
561 /* Same as STORE_NUMBER, except increment DESTINATION to
562 the byte after where the number is stored. Therefore, DESTINATION
563 must be an lvalue. */
565 #define STORE_NUMBER_AND_INCR(destination, number) \
567 STORE_NUMBER (destination, number); \
568 (destination) += 2; \
571 /* Put into DESTINATION a number stored in two contiguous bytes starting
574 #define EXTRACT_NUMBER(destination, source) \
576 (destination) = *(source) & 0377; \
577 (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) << 8; \
582 extract_number (int *dest, re_char *source)
584 int temp = SIGN_EXTEND_CHAR (*(source + 1));
585 *dest = *source & 0377;
589 #ifndef EXTRACT_MACROS /* To debug the macros. */
590 #undef EXTRACT_NUMBER
591 #define EXTRACT_NUMBER(dest, src) extract_number (&dest, src)
592 #endif /* not EXTRACT_MACROS */
596 /* Same as EXTRACT_NUMBER, except increment SOURCE to after the number.
597 SOURCE must be an lvalue. */
599 #define EXTRACT_NUMBER_AND_INCR(destination, source) \
601 EXTRACT_NUMBER (destination, source); \
607 extract_number_and_incr (int *destination, unsigned char **source)
609 extract_number (destination, *source);
613 #ifndef EXTRACT_MACROS
614 #undef EXTRACT_NUMBER_AND_INCR
615 #define EXTRACT_NUMBER_AND_INCR(dest, src) \
616 extract_number_and_incr (&dest, &src)
617 #endif /* not EXTRACT_MACROS */
621 /* If DEBUG is defined, Regex prints many voluminous messages about what
622 it is doing (if the variable `debug' is nonzero). If linked with the
623 main program in `iregex.c', you can enter patterns and strings
624 interactively. And if linked with the main program in `main.c' and
625 the other test files, you can run the already-written tests. */
629 /* We use standard I/O for debugging. */
633 /* XEmacs provides its own version of assert() */
634 /* It is useful to test things that ``must'' be true when debugging. */
638 static int debug = 0;
640 #define DEBUG_STATEMENT(e) e
641 #define DEBUG_PRINT1(x) if (debug) printf (x)
642 #define DEBUG_PRINT2(x1, x2) if (debug) printf (x1, x2)
643 #define DEBUG_PRINT3(x1, x2, x3) if (debug) printf (x1, x2, x3)
644 #define DEBUG_PRINT4(x1, x2, x3, x4) if (debug) printf (x1, x2, x3, x4)
645 #define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \
646 if (debug) print_partial_compiled_pattern (s, e)
647 #define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \
648 if (debug) print_double_string (w, s1, sz1, s2, sz2)
651 /* Print the fastmap in human-readable form. */
654 print_fastmap (char *fastmap)
656 unsigned was_a_range = 0;
659 while (i < (1 << BYTEWIDTH))
665 while (i < (1 << BYTEWIDTH) && fastmap[i])
681 /* Print a compiled pattern string in human-readable form, starting at
682 the START pointer into it and ending just before the pointer END. */
685 print_partial_compiled_pattern (re_char *start, re_char *end)
688 unsigned char *p = (unsigned char *) start;
697 /* Loop over pattern commands. */
700 printf ("%ld:\t", (long)(p - start));
702 switch ((re_opcode_t) *p++)
710 printf ("/exactn/%d", mcnt);
721 printf ("/start_memory/%d/%d", mcnt, *p++);
726 printf ("/stop_memory/%d/%d", mcnt, *p++);
730 printf ("/duplicate/%d", *p++);
740 REGISTER int c, last = -100;
741 REGISTER int in_range = 0;
743 printf ("/charset [%s",
744 (re_opcode_t) *(p - 1) == charset_not ? "^" : "");
746 assert (p + *p < pend);
748 for (c = 0; c < 256; c++)
749 if (((unsigned char) (c / 8) < *p)
750 && (p[1 + (c/8)] & (1 << (c % 8))))
752 /* Are we starting a range? */
753 if (last + 1 == c && ! in_range)
758 /* Have we broken a range? */
759 else if (last + 1 != c && in_range)
782 case charset_mule_not:
786 printf ("/charset_mule [%s",
787 (re_opcode_t) *(p - 1) == charset_mule_not ? "^" : "");
788 nentries = unified_range_table_nentries (p);
789 for (i = 0; i < nentries; i++)
791 EMACS_INT first, last;
792 Lisp_Object dummy_val;
794 unified_range_table_get_range (p, i, &first, &last,
799 printf ("(0x%lx)", (long)first);
806 printf ("(0x%lx)", (long)last);
810 p += unified_range_table_bytes_used (p);
823 case on_failure_jump:
824 extract_number_and_incr (&mcnt, &p);
825 printf ("/on_failure_jump to %ld", (long)(p + mcnt - start));
828 case on_failure_keep_string_jump:
829 extract_number_and_incr (&mcnt, &p);
830 printf ("/on_failure_keep_string_jump to %ld", (long)(p + mcnt - start));
833 case dummy_failure_jump:
834 extract_number_and_incr (&mcnt, &p);
835 printf ("/dummy_failure_jump to %ld", (long)(p + mcnt - start));
838 case push_dummy_failure:
839 printf ("/push_dummy_failure");
843 extract_number_and_incr (&mcnt, &p);
844 printf ("/maybe_pop_jump to %ld", (long)(p + mcnt - start));
847 case pop_failure_jump:
848 extract_number_and_incr (&mcnt, &p);
849 printf ("/pop_failure_jump to %ld", (long)(p + mcnt - start));
853 extract_number_and_incr (&mcnt, &p);
854 printf ("/jump_past_alt to %ld", (long)(p + mcnt - start));
858 extract_number_and_incr (&mcnt, &p);
859 printf ("/jump to %ld", (long)(p + mcnt - start));
863 extract_number_and_incr (&mcnt, &p);
864 extract_number_and_incr (&mcnt2, &p);
865 printf ("/succeed_n to %ld, %d times", (long)(p + mcnt - start), mcnt2);
869 extract_number_and_incr (&mcnt, &p);
870 extract_number_and_incr (&mcnt2, &p);
871 printf ("/jump_n to %ld, %d times", (long)(p + mcnt - start), mcnt2);
875 extract_number_and_incr (&mcnt, &p);
876 extract_number_and_incr (&mcnt2, &p);
877 printf ("/set_number_at location %ld to %d", (long)(p + mcnt - start), mcnt2);
881 printf ("/wordbound");
885 printf ("/notwordbound");
897 printf ("/before_dot");
905 printf ("/after_dot");
909 printf ("/syntaxspec");
911 printf ("/%d", mcnt);
915 printf ("/notsyntaxspec");
917 printf ("/%d", mcnt);
921 /* 97/2/17 jhod Mule category patch */
923 printf ("/categoryspec");
925 printf ("/%d", mcnt);
928 case notcategoryspec:
929 printf ("/notcategoryspec");
931 printf ("/%d", mcnt);
933 /* end of category patch */
938 printf ("/wordchar");
942 printf ("/notwordchar");
954 printf ("?%d", *(p-1));
960 printf ("%ld:\tend of pattern.\n", (long)(p - start));
965 print_compiled_pattern (struct re_pattern_buffer *bufp)
967 re_char *buffer = bufp->buffer;
969 print_partial_compiled_pattern (buffer, buffer + bufp->used);
970 printf ("%ld bytes used/%ld bytes allocated.\n", bufp->used,
973 if (bufp->fastmap_accurate && bufp->fastmap)
975 printf ("fastmap: ");
976 print_fastmap (bufp->fastmap);
979 printf ("re_nsub: %ld\t", (long)bufp->re_nsub);
980 printf ("re_ngroups: %ld\t", (long)bufp->re_ngroups);
981 printf ("regs_alloc: %d\t", bufp->regs_allocated);
982 printf ("can_be_null: %d\t", bufp->can_be_null);
983 printf ("newline_anchor: %d\n", bufp->newline_anchor);
984 printf ("no_sub: %d\t", bufp->no_sub);
985 printf ("not_bol: %d\t", bufp->not_bol);
986 printf ("not_eol: %d\t", bufp->not_eol);
987 printf ("syntax: %d\n", bufp->syntax);
988 /* Perhaps we should print the translate table? */
989 /* and maybe the category table? */
991 if (bufp->external_to_internal_register)
995 printf ("external_to_internal_register:\n");
996 for (i = 0; i <= bufp->re_nsub; i++)
1000 printf ("%d -> %d", i, bufp->external_to_internal_register[i]);
1008 print_double_string (re_char *where, re_char *string1, int size1,
1009 re_char *string2, int size2)
1015 Element_count this_char;
1017 if (FIRST_STRING_P (where))
1019 for (this_char = where - string1; this_char < size1; this_char++)
1020 putchar (string1[this_char]);
1025 for (this_char = where - string2; this_char < size2; this_char++)
1026 putchar (string2[this_char]);
1030 #else /* not DEBUG */
1035 #define DEBUG_STATEMENT(e)
1036 #define DEBUG_PRINT1(x)
1037 #define DEBUG_PRINT2(x1, x2)
1038 #define DEBUG_PRINT3(x1, x2, x3)
1039 #define DEBUG_PRINT4(x1, x2, x3, x4)
1040 #define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)
1041 #define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)
1045 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
1046 also be assigned to arbitrarily: each pattern buffer stores its own
1047 syntax, so it can be changed between regex compilations. */
1048 /* This has no initializer because initialized variables in Emacs
1049 become read-only after dumping. */
1050 reg_syntax_t re_syntax_options;
1053 /* Specify the precise syntax of regexps for compilation. This provides
1054 for compatibility for various utilities which historically have
1055 different, incompatible syntaxes.
1057 The argument SYNTAX is a bit mask comprised of the various bits
1058 defined in regex.h. We return the old syntax. */
1061 re_set_syntax (reg_syntax_t syntax)
1063 reg_syntax_t ret = re_syntax_options;
1065 re_syntax_options = syntax;
1069 /* This table gives an error message for each of the error codes listed
1070 in regex.h. Obviously the order here has to be same as there.
1071 POSIX doesn't require that we do anything for REG_NOERROR,
1072 but why not be nice? */
1074 static const char *re_error_msgid[] =
1076 "Success", /* REG_NOERROR */
1077 "No match", /* REG_NOMATCH */
1078 "Invalid regular expression", /* REG_BADPAT */
1079 "Invalid collation character", /* REG_ECOLLATE */
1080 "Invalid character class name", /* REG_ECTYPE */
1081 "Trailing backslash", /* REG_EESCAPE */
1082 "Invalid back reference", /* REG_ESUBREG */
1083 "Unmatched [ or [^", /* REG_EBRACK */
1084 "Unmatched ( or \\(", /* REG_EPAREN */
1085 "Unmatched \\{", /* REG_EBRACE */
1086 "Invalid content of \\{\\}", /* REG_BADBR */
1087 "Invalid range end", /* REG_ERANGE */
1088 "Memory exhausted", /* REG_ESPACE */
1089 "Invalid preceding regular expression", /* REG_BADRPT */
1090 "Premature end of regular expression", /* REG_EEND */
1091 "Regular expression too big", /* REG_ESIZE */
1092 "Unmatched ) or \\)", /* REG_ERPAREN */
1094 "Invalid syntax designator", /* REG_ESYNTAX */
1097 "Ranges may not span charsets", /* REG_ERANGESPAN */
1098 "Invalid category designator", /* REG_ECATEGORY */
1102 /* Avoiding alloca during matching, to placate r_alloc. */
1104 /* Define MATCH_MAY_ALLOCATE unless we need to make sure that the
1105 searching and matching functions should not call alloca. On some
1106 systems, alloca is implemented in terms of malloc, and if we're
1107 using the relocating allocator routines, then malloc could cause a
1108 relocation, which might (if the strings being searched are in the
1109 ralloc heap) shift the data out from underneath the regexp
1112 Here's another reason to avoid allocation: Emacs
1113 processes input from X in a signal handler; processing X input may
1114 call malloc; if input arrives while a matching routine is calling
1115 malloc, then we're scrod. But Emacs can't just block input while
1116 calling matching routines; then we don't notice interrupts when
1117 they come in. So, Emacs blocks input around all regexp calls
1118 except the matching calls, which it leaves unprotected, in the
1119 faith that they will not malloc. */
1121 /* Normally, this is fine. */
1122 #define MATCH_MAY_ALLOCATE
1124 /* When using GNU C, we are not REALLY using the C alloca, no matter
1125 what config.h may say. So don't take precautions for it. */
1130 /* The match routines may not allocate if (1) they would do it with malloc
1131 and (2) it's not safe for them to use malloc.
1132 Note that if REL_ALLOC is defined, matching would not use malloc for the
1133 failure stack, but we would still use it for the register vectors;
1134 so REL_ALLOC should not affect this. */
1135 #if (defined (C_ALLOCA) || defined (REGEX_MALLOC)) && defined (emacs)
1136 #undef MATCH_MAY_ALLOCATE
1140 /* Failure stack declarations and macros; both re_compile_fastmap and
1141 re_match_2 use a failure stack. These have to be macros because of
1142 REGEX_ALLOCATE_STACK. */
1145 /* Number of failure points for which to initially allocate space
1146 when matching. If this number is exceeded, we allocate more
1147 space, so it is not a hard limit. */
1148 #ifndef INIT_FAILURE_ALLOC
1149 #define INIT_FAILURE_ALLOC 20
1152 /* Roughly the maximum number of failure points on the stack. Would be
1153 exactly that if always used MAX_FAILURE_SPACE each time we failed.
1154 This is a variable only so users of regex can assign to it; we never
1155 change it ourselves. */
1156 #if defined (MATCH_MAY_ALLOCATE) || defined (REGEX_MALLOC)
1157 /* 4400 was enough to cause a crash on Alpha OSF/1,
1158 whose default stack limit is 2mb. */
1159 int re_max_failures = 40000;
1161 int re_max_failures = 4000;
1164 union fail_stack_elt
1170 typedef union fail_stack_elt fail_stack_elt_t;
1174 fail_stack_elt_t *stack;
1176 Element_count avail; /* Offset of next open position. */
1179 #define FAIL_STACK_EMPTY() (fail_stack.avail == 0)
1180 #define FAIL_STACK_PTR_EMPTY() (fail_stack_ptr->avail == 0)
1181 #define FAIL_STACK_FULL() (fail_stack.avail == fail_stack.size)
1184 /* Define macros to initialize and free the failure stack.
1185 Do `return -2' if the alloc fails. */
1187 #ifdef MATCH_MAY_ALLOCATE
1188 #define INIT_FAIL_STACK() \
1190 fail_stack.stack = (fail_stack_elt_t *) \
1191 REGEX_ALLOCATE_STACK (INIT_FAILURE_ALLOC * sizeof (fail_stack_elt_t)); \
1193 if (fail_stack.stack == NULL) \
1196 fail_stack.size = INIT_FAILURE_ALLOC; \
1197 fail_stack.avail = 0; \
1200 #define RESET_FAIL_STACK() REGEX_FREE_STACK (fail_stack.stack)
1202 #define INIT_FAIL_STACK() \
1204 fail_stack.avail = 0; \
1207 #define RESET_FAIL_STACK()
1211 /* Double the size of FAIL_STACK, up to approximately `re_max_failures' items.
1213 Return 1 if succeeds, and 0 if either ran out of memory
1214 allocating space for it or it was already too large.
1216 REGEX_REALLOCATE_STACK requires `destination' be declared. */
1218 #define DOUBLE_FAIL_STACK(fail_stack) \
1219 ((int) (fail_stack).size > re_max_failures * MAX_FAILURE_ITEMS \
1221 : ((fail_stack).stack = (fail_stack_elt_t *) \
1222 REGEX_REALLOCATE_STACK ((fail_stack).stack, \
1223 (fail_stack).size * sizeof (fail_stack_elt_t), \
1224 ((fail_stack).size << 1) * sizeof (fail_stack_elt_t)), \
1226 (fail_stack).stack == NULL \
1228 : ((fail_stack).size <<= 1, \
1232 /* Push pointer POINTER on FAIL_STACK.
1233 Return 1 if was able to do so and 0 if ran out of memory allocating
1235 #define PUSH_PATTERN_OP(POINTER, FAIL_STACK) \
1236 ((FAIL_STACK_FULL () \
1237 && !DOUBLE_FAIL_STACK (FAIL_STACK)) \
1239 : ((FAIL_STACK).stack[(FAIL_STACK).avail++].pointer = POINTER, \
1242 /* Push a pointer value onto the failure stack.
1243 Assumes the variable `fail_stack'. Probably should only
1244 be called from within `PUSH_FAILURE_POINT'. */
1245 #define PUSH_FAILURE_POINTER(item) \
1246 fail_stack.stack[fail_stack.avail++].pointer = (unsigned char *) (item)
1248 /* This pushes an integer-valued item onto the failure stack.
1249 Assumes the variable `fail_stack'. Probably should only
1250 be called from within `PUSH_FAILURE_POINT'. */
1251 #define PUSH_FAILURE_INT(item) \
1252 fail_stack.stack[fail_stack.avail++].integer = (item)
1254 /* Push a fail_stack_elt_t value onto the failure stack.
1255 Assumes the variable `fail_stack'. Probably should only
1256 be called from within `PUSH_FAILURE_POINT'. */
1257 #define PUSH_FAILURE_ELT(item) \
1258 fail_stack.stack[fail_stack.avail++] = (item)
1260 /* These three POP... operations complement the three PUSH... operations.
1261 All assume that `fail_stack' is nonempty. */
1262 #define POP_FAILURE_POINTER() fail_stack.stack[--fail_stack.avail].pointer
1263 #define POP_FAILURE_INT() fail_stack.stack[--fail_stack.avail].integer
1264 #define POP_FAILURE_ELT() fail_stack.stack[--fail_stack.avail]
1266 /* Used to omit pushing failure point id's when we're not debugging. */
1268 #define DEBUG_PUSH PUSH_FAILURE_INT
1269 #define DEBUG_POP(item_addr) *(item_addr) = POP_FAILURE_INT ()
1271 #define DEBUG_PUSH(item)
1272 #define DEBUG_POP(item_addr)
1276 /* Push the information about the state we will need
1277 if we ever fail back to it.
1279 Requires variables fail_stack, regstart, regend, reg_info, and
1280 num_regs be declared. DOUBLE_FAIL_STACK requires `destination' be
1283 Does `return FAILURE_CODE' if runs out of memory. */
1285 #if !defined (REGEX_MALLOC) && !defined (REL_ALLOC)
1286 #define DECLARE_DESTINATION char *destination
1288 #define DECLARE_DESTINATION DECLARE_NOTHING
1291 #define PUSH_FAILURE_POINT(pattern_place, string_place, failure_code) \
1293 DECLARE_DESTINATION; \
1294 /* Must be int, so when we don't save any registers, the arithmetic \
1295 of 0 + -1 isn't done as unsigned. */ \
1298 DEBUG_STATEMENT (failure_id++); \
1299 DEBUG_STATEMENT (nfailure_points_pushed++); \
1300 DEBUG_PRINT2 ("\nPUSH_FAILURE_POINT #%u:\n", failure_id); \
1301 DEBUG_PRINT2 (" Before push, next avail: %lu\n", \
1302 (unsigned long) (fail_stack).avail); \
1303 DEBUG_PRINT2 (" size: %lu\n", \
1304 (unsigned long) (fail_stack).size); \
1306 DEBUG_PRINT2 (" slots needed: %d\n", NUM_FAILURE_ITEMS); \
1307 DEBUG_PRINT2 (" available: %ld\n", \
1308 (long) REMAINING_AVAIL_SLOTS); \
1310 /* Ensure we have enough space allocated for what we will push. */ \
1311 while (REMAINING_AVAIL_SLOTS < NUM_FAILURE_ITEMS) \
1313 if (!DOUBLE_FAIL_STACK (fail_stack)) \
1314 return failure_code; \
1316 DEBUG_PRINT2 ("\n Doubled stack; size now: %lu\n", \
1317 (unsigned long) (fail_stack).size); \
1318 DEBUG_PRINT2 (" slots available: %ld\n", \
1319 (long) REMAINING_AVAIL_SLOTS); \
1322 /* Push the info, starting with the registers. */ \
1323 DEBUG_PRINT1 ("\n"); \
1325 for (this_reg = lowest_active_reg; this_reg <= highest_active_reg; \
1328 DEBUG_PRINT2 (" Pushing reg: %d\n", this_reg); \
1329 DEBUG_STATEMENT (num_regs_pushed++); \
1331 DEBUG_PRINT2 (" start: 0x%lx\n", (long) regstart[this_reg]); \
1332 PUSH_FAILURE_POINTER (regstart[this_reg]); \
1334 DEBUG_PRINT2 (" end: 0x%lx\n", (long) regend[this_reg]); \
1335 PUSH_FAILURE_POINTER (regend[this_reg]); \
1337 DEBUG_PRINT2 (" info: 0x%lx\n ", \
1338 * (long *) (®_info[this_reg])); \
1339 DEBUG_PRINT2 (" match_null=%d", \
1340 REG_MATCH_NULL_STRING_P (reg_info[this_reg])); \
1341 DEBUG_PRINT2 (" active=%d", IS_ACTIVE (reg_info[this_reg])); \
1342 DEBUG_PRINT2 (" matched_something=%d", \
1343 MATCHED_SOMETHING (reg_info[this_reg])); \
1344 DEBUG_PRINT2 (" ever_matched_something=%d", \
1345 EVER_MATCHED_SOMETHING (reg_info[this_reg])); \
1346 DEBUG_PRINT1 ("\n"); \
1347 PUSH_FAILURE_ELT (reg_info[this_reg].word); \
1350 DEBUG_PRINT2 (" Pushing low active reg: %d\n", lowest_active_reg); \
1351 PUSH_FAILURE_INT (lowest_active_reg); \
1353 DEBUG_PRINT2 (" Pushing high active reg: %d\n", highest_active_reg); \
1354 PUSH_FAILURE_INT (highest_active_reg); \
1356 DEBUG_PRINT2 (" Pushing pattern 0x%lx: \n", (long) pattern_place); \
1357 DEBUG_PRINT_COMPILED_PATTERN (bufp, pattern_place, pend); \
1358 PUSH_FAILURE_POINTER (pattern_place); \
1360 DEBUG_PRINT2 (" Pushing string 0x%lx: `", (long) string_place); \
1361 DEBUG_PRINT_DOUBLE_STRING (string_place, string1, size1, string2, \
1363 DEBUG_PRINT1 ("'\n"); \
1364 PUSH_FAILURE_POINTER (string_place); \
1366 DEBUG_PRINT2 (" Pushing failure id: %u\n", failure_id); \
1367 DEBUG_PUSH (failure_id); \
1370 /* This is the number of items that are pushed and popped on the stack
1371 for each register. */
1372 #define NUM_REG_ITEMS 3
1374 /* Individual items aside from the registers. */
1376 #define NUM_NONREG_ITEMS 5 /* Includes failure point id. */
1378 #define NUM_NONREG_ITEMS 4
1381 /* We push at most this many items on the stack. */
1382 /* We used to use (num_regs - 1), which is the number of registers
1383 this regexp will save; but that was changed to 5
1384 to avoid stack overflow for a regexp with lots of parens. */
1385 #define MAX_FAILURE_ITEMS (5 * NUM_REG_ITEMS + NUM_NONREG_ITEMS)
1387 /* We actually push this many items. */
1388 #define NUM_FAILURE_ITEMS \
1389 ((highest_active_reg - lowest_active_reg + 1) * NUM_REG_ITEMS \
1392 /* How many items can still be added to the stack without overflowing it. */
1393 #define REMAINING_AVAIL_SLOTS ((int) ((fail_stack).size - (fail_stack).avail))
1396 /* Pops what PUSH_FAIL_STACK pushes.
1398 We restore into the parameters, all of which should be lvalues:
1399 STR -- the saved data position.
1400 PAT -- the saved pattern position.
1401 LOW_REG, HIGH_REG -- the highest and lowest active registers.
1402 REGSTART, REGEND -- arrays of string positions.
1403 REG_INFO -- array of information about each subexpression.
1405 Also assumes the variables `fail_stack' and (if debugging), `bufp',
1406 `pend', `string1', `size1', `string2', and `size2'. */
1408 #define POP_FAILURE_POINT(str, pat, low_reg, high_reg, \
1409 regstart, regend, reg_info) \
1411 DEBUG_STATEMENT (fail_stack_elt_t ffailure_id;) \
1413 const unsigned char *string_temp; \
1415 assert (!FAIL_STACK_EMPTY ()); \
1417 /* Remove failure points and point to how many regs pushed. */ \
1418 DEBUG_PRINT1 ("POP_FAILURE_POINT:\n"); \
1419 DEBUG_PRINT2 (" Before pop, next avail: %lu\n", \
1420 (unsigned long) fail_stack.avail); \
1421 DEBUG_PRINT2 (" size: %lu\n", \
1422 (unsigned long) fail_stack.size); \
1424 assert (fail_stack.avail >= NUM_NONREG_ITEMS); \
1426 DEBUG_POP (&ffailure_id.integer); \
1427 DEBUG_PRINT2 (" Popping failure id: %u\n", \
1428 * (unsigned int *) &ffailure_id); \
1430 /* If the saved string location is NULL, it came from an \
1431 on_failure_keep_string_jump opcode, and we want to throw away the \
1432 saved NULL, thus retaining our current position in the string. */ \
1433 string_temp = POP_FAILURE_POINTER (); \
1434 if (string_temp != NULL) \
1435 str = string_temp; \
1437 DEBUG_PRINT2 (" Popping string 0x%lx: `", (long) str); \
1438 DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2); \
1439 DEBUG_PRINT1 ("'\n"); \
1441 pat = (unsigned char *) POP_FAILURE_POINTER (); \
1442 DEBUG_PRINT2 (" Popping pattern 0x%lx: ", (long) pat); \
1443 DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend); \
1445 /* Restore register info. */ \
1446 high_reg = (unsigned) POP_FAILURE_INT (); \
1447 DEBUG_PRINT2 (" Popping high active reg: %d\n", high_reg); \
1449 low_reg = (unsigned) POP_FAILURE_INT (); \
1450 DEBUG_PRINT2 (" Popping low active reg: %d\n", low_reg); \
1452 for (this_reg = high_reg; this_reg >= low_reg; this_reg--) \
1454 DEBUG_PRINT2 (" Popping reg: %d\n", this_reg); \
1456 reg_info[this_reg].word = POP_FAILURE_ELT (); \
1457 DEBUG_PRINT2 (" info: 0x%lx\n", \
1458 * (long *) ®_info[this_reg]); \
1460 regend[this_reg] = POP_FAILURE_POINTER (); \
1461 DEBUG_PRINT2 (" end: 0x%lx\n", (long) regend[this_reg]); \
1463 regstart[this_reg] = POP_FAILURE_POINTER (); \
1464 DEBUG_PRINT2 (" start: 0x%lx\n", (long) regstart[this_reg]); \
1467 set_regs_matched_done = 0; \
1468 DEBUG_STATEMENT (nfailure_points_popped++); \
1469 } while (0) /* POP_FAILURE_POINT */
1473 /* Structure for per-register (a.k.a. per-group) information.
1474 Other register information, such as the
1475 starting and ending positions (which are addresses), and the list of
1476 inner groups (which is a bits list) are maintained in separate
1479 We are making a (strictly speaking) nonportable assumption here: that
1480 the compiler will pack our bit fields into something that fits into
1481 the type of `word', i.e., is something that fits into one item on the
1486 fail_stack_elt_t word;
1489 /* This field is one if this group can match the empty string,
1490 zero if not. If not yet determined, `MATCH_NULL_UNSET_VALUE'. */
1491 #define MATCH_NULL_UNSET_VALUE 3
1492 unsigned match_null_string_p : 2;
1493 unsigned is_active : 1;
1494 unsigned matched_something : 1;
1495 unsigned ever_matched_something : 1;
1497 } register_info_type;
1499 #define REG_MATCH_NULL_STRING_P(R) ((R).bits.match_null_string_p)
1500 #define IS_ACTIVE(R) ((R).bits.is_active)
1501 #define MATCHED_SOMETHING(R) ((R).bits.matched_something)
1502 #define EVER_MATCHED_SOMETHING(R) ((R).bits.ever_matched_something)
1505 /* Call this when have matched a real character; it sets `matched' flags
1506 for the subexpressions which we are currently inside. Also records
1507 that those subexprs have matched. */
1508 #define SET_REGS_MATCHED() \
1511 if (!set_regs_matched_done) \
1514 set_regs_matched_done = 1; \
1515 for (r = lowest_active_reg; r <= highest_active_reg; r++) \
1517 MATCHED_SOMETHING (reg_info[r]) \
1518 = EVER_MATCHED_SOMETHING (reg_info[r]) \
1525 /* Registers are set to a sentinel when they haven't yet matched. */
1526 static unsigned char reg_unset_dummy;
1527 #define REG_UNSET_VALUE (®_unset_dummy)
1528 #define REG_UNSET(e) ((e) == REG_UNSET_VALUE)
1530 /* Subroutine declarations and macros for regex_compile. */
1532 /* Fetch the next character in the uncompiled pattern---translating it
1533 if necessary. Also cast from a signed character in the constant
1534 string passed to us by the user to an unsigned char that we can use
1535 as an array index (in, e.g., `translate'). */
1536 #define PATFETCH(c) \
1539 c = TRANSLATE (c); \
1542 /* Fetch the next character in the uncompiled pattern, with no
1544 #define PATFETCH_RAW(c) \
1545 do {if (p == pend) return REG_EEND; \
1546 assert (p < pend); \
1547 c = charptr_emchar (p); \
1551 /* Go backwards one character in the pattern. */
1552 #define PATUNFETCH DEC_CHARPTR (p)
1556 #define PATFETCH_EXTENDED(emch) \
1557 do {if (p == pend) return REG_EEND; \
1558 assert (p < pend); \
1559 emch = charptr_emchar ((const Bufbyte *) p); \
1561 if (TRANSLATE_P (translate) && emch < 0x80) \
1562 emch = (Emchar) (unsigned char) RE_TRANSLATE (emch); \
1565 #define PATFETCH_RAW_EXTENDED(emch) \
1566 do {if (p == pend) return REG_EEND; \
1567 assert (p < pend); \
1568 emch = charptr_emchar ((const Bufbyte *) p); \
1572 #define PATUNFETCH_EXTENDED DEC_CHARPTR (p)
1574 #define PATFETCH_EITHER(emch) \
1576 if (has_extended_chars) \
1577 PATFETCH_EXTENDED (emch); \
1582 #define PATFETCH_RAW_EITHER(emch) \
1584 if (has_extended_chars) \
1585 PATFETCH_RAW_EXTENDED (emch); \
1587 PATFETCH_RAW (emch); \
1590 #define PATUNFETCH_EITHER \
1592 if (has_extended_chars) \
1593 PATUNFETCH_EXTENDED (emch); \
1595 PATUNFETCH (emch); \
1598 #else /* not MULE */
1600 #define PATFETCH_EITHER(emch) PATFETCH (emch)
1601 #define PATFETCH_RAW_EITHER(emch) PATFETCH_RAW (emch)
1602 #define PATUNFETCH_EITHER PATUNFETCH
1606 /* If `translate' is non-null, return translate[D], else just D. We
1607 cast the subscript to translate because some data is declared as
1608 `char *', to avoid warnings when a string constant is passed. But
1609 when we use a character as a subscript we must make it unsigned. */
1610 #define TRANSLATE(d) (TRANSLATE_P (translate) ? RE_TRANSLATE (d) : (d))
1612 /* Macros for outputting the compiled pattern into `buffer'. */
1614 /* If the buffer isn't allocated when it comes in, use this. */
1615 #define INIT_BUF_SIZE 32
1617 /* Make sure we have at least N more bytes of space in buffer. */
1618 #define GET_BUFFER_SPACE(n) \
1619 while (buf_end - bufp->buffer + (n) > (ptrdiff_t) bufp->allocated) \
1622 /* Make sure we have one more byte of buffer space and then add C to it. */
1623 #define BUF_PUSH(c) \
1625 GET_BUFFER_SPACE (1); \
1626 *buf_end++ = (unsigned char) (c); \
1630 /* Ensure we have two more bytes of buffer space and then append C1 and C2. */
1631 #define BUF_PUSH_2(c1, c2) \
1633 GET_BUFFER_SPACE (2); \
1634 *buf_end++ = (unsigned char) (c1); \
1635 *buf_end++ = (unsigned char) (c2); \
1639 /* As with BUF_PUSH_2, except for three bytes. */
1640 #define BUF_PUSH_3(c1, c2, c3) \
1642 GET_BUFFER_SPACE (3); \
1643 *buf_end++ = (unsigned char) (c1); \
1644 *buf_end++ = (unsigned char) (c2); \
1645 *buf_end++ = (unsigned char) (c3); \
1649 /* Store a jump with opcode OP at LOC to location TO. We store a
1650 relative address offset by the three bytes the jump itself occupies. */
1651 #define STORE_JUMP(op, loc, to) \
1652 store_op1 (op, loc, (to) - (loc) - 3)
1654 /* Likewise, for a two-argument jump. */
1655 #define STORE_JUMP2(op, loc, to, arg) \
1656 store_op2 (op, loc, (to) - (loc) - 3, arg)
1658 /* Like `STORE_JUMP', but for inserting. Assume `buf_end' is the
1660 #define INSERT_JUMP(op, loc, to) \
1661 insert_op1 (op, loc, (to) - (loc) - 3, buf_end)
1663 /* Like `STORE_JUMP2', but for inserting. Assume `buf_end' is the
1665 #define INSERT_JUMP2(op, loc, to, arg) \
1666 insert_op2 (op, loc, (to) - (loc) - 3, arg, buf_end)
1669 /* This is not an arbitrary limit: the arguments which represent offsets
1670 into the pattern are two bytes long. So if 2^16 bytes turns out to
1671 be too small, many things would have to change. */
1672 #define MAX_BUF_SIZE (1L << 16)
1675 /* Extend the buffer by twice its current size via realloc and
1676 reset the pointers that pointed into the old block to point to the
1677 correct places in the new one. If extending the buffer results in it
1678 being larger than MAX_BUF_SIZE, then flag memory exhausted. */
1679 #define EXTEND_BUFFER() \
1681 re_char *old_buffer = bufp->buffer; \
1682 if (bufp->allocated == MAX_BUF_SIZE) \
1684 bufp->allocated <<= 1; \
1685 if (bufp->allocated > MAX_BUF_SIZE) \
1686 bufp->allocated = MAX_BUF_SIZE; \
1687 bufp->buffer = (unsigned char *) realloc (bufp->buffer, bufp->allocated);\
1688 if (bufp->buffer == NULL) \
1689 return REG_ESPACE; \
1690 /* If the buffer moved, move all the pointers into it. */ \
1691 if (old_buffer != bufp->buffer) \
1693 buf_end = (buf_end - old_buffer) + bufp->buffer; \
1694 begalt = (begalt - old_buffer) + bufp->buffer; \
1695 if (fixup_alt_jump) \
1696 fixup_alt_jump = (fixup_alt_jump - old_buffer) + bufp->buffer;\
1698 laststart = (laststart - old_buffer) + bufp->buffer; \
1699 if (pending_exact) \
1700 pending_exact = (pending_exact - old_buffer) + bufp->buffer; \
1705 /* Since we have one byte reserved for the register number argument to
1706 {start,stop}_memory, the maximum number of groups we can report
1707 things about is what fits in that byte. */
1708 #define MAX_REGNUM 255
1710 /* But patterns can have more than `MAX_REGNUM' registers. We just
1711 ignore the excess. */
1712 typedef unsigned regnum_t;
1714 #define INIT_REG_TRANSLATE_SIZE 5
1716 /* Macros for the compile stack. */
1718 /* Since offsets can go either forwards or backwards, this type needs to
1719 be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1. */
1720 typedef int pattern_offset_t;
1724 pattern_offset_t begalt_offset;
1725 pattern_offset_t fixup_alt_jump;
1726 pattern_offset_t inner_group_offset;
1727 pattern_offset_t laststart_offset;
1729 } compile_stack_elt_t;
1734 compile_stack_elt_t *stack;
1736 unsigned avail; /* Offset of next open position. */
1737 } compile_stack_type;
1740 #define INIT_COMPILE_STACK_SIZE 32
1742 #define COMPILE_STACK_EMPTY (compile_stack.avail == 0)
1743 #define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size)
1745 /* The next available element. */
1746 #define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])
1749 /* Set the bit for character C in a bit vector. */
1750 #define SET_LIST_BIT(c) \
1751 (buf_end[((unsigned char) (c)) / BYTEWIDTH] \
1752 |= 1 << (((unsigned char) c) % BYTEWIDTH))
1756 /* Set the "bit" for character C in a range table. */
1757 #define SET_RANGETAB_BIT(c) put_range_table (rtab, c, c, Qt)
1759 /* Set the "bit" for character c in the appropriate table. */
1760 #define SET_EITHER_BIT(c) \
1762 if (has_extended_chars) \
1763 SET_RANGETAB_BIT (c); \
1768 #else /* not MULE */
1770 #define SET_EITHER_BIT(c) SET_LIST_BIT (c)
1775 /* Get the next unsigned number in the uncompiled pattern. */
1776 #define GET_UNSIGNED_NUMBER(num) \
1780 while (ISDIGIT (c)) \
1784 num = num * 10 + c - '0'; \
1792 #define CHAR_CLASS_MAX_LENGTH 6 /* Namely, `xdigit'. */
1794 #define IS_CHAR_CLASS(string) \
1795 (STREQ (string, "alpha") || STREQ (string, "upper") \
1796 || STREQ (string, "lower") || STREQ (string, "digit") \
1797 || STREQ (string, "alnum") || STREQ (string, "xdigit") \
1798 || STREQ (string, "space") || STREQ (string, "print") \
1799 || STREQ (string, "punct") || STREQ (string, "graph") \
1800 || STREQ (string, "cntrl") || STREQ (string, "blank"))
1802 static void store_op1 (re_opcode_t op, unsigned char *loc, int arg);
1803 static void store_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2);
1804 static void insert_op1 (re_opcode_t op, unsigned char *loc, int arg,
1805 unsigned char *end);
1806 static void insert_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2,
1807 unsigned char *end);
1808 static re_bool at_begline_loc_p (re_char *pattern, re_char *p,
1809 reg_syntax_t syntax);
1810 static re_bool at_endline_loc_p (re_char *p, re_char *pend, int syntax);
1811 static re_bool group_in_compile_stack (compile_stack_type compile_stack,
1813 static reg_errcode_t compile_range (re_char **p_ptr, re_char *pend,
1814 RE_TRANSLATE_TYPE translate,
1815 reg_syntax_t syntax,
1818 static reg_errcode_t compile_extended_range (re_char **p_ptr,
1820 RE_TRANSLATE_TYPE translate,
1821 reg_syntax_t syntax,
1824 static re_bool group_match_null_string_p (unsigned char **p,
1826 register_info_type *reg_info);
1827 static re_bool alt_match_null_string_p (unsigned char *p, unsigned char *end,
1828 register_info_type *reg_info);
1829 static re_bool common_op_match_null_string_p (unsigned char **p,
1831 register_info_type *reg_info);
1832 static int bcmp_translate (const unsigned char *s1, const unsigned char *s2,
1833 REGISTER int len, RE_TRANSLATE_TYPE translate);
1834 static int re_match_2_internal (struct re_pattern_buffer *bufp,
1835 re_char *string1, int size1,
1836 re_char *string2, int size2, int pos,
1837 struct re_registers *regs, int stop);
1839 #ifndef MATCH_MAY_ALLOCATE
1841 /* If we cannot allocate large objects within re_match_2_internal,
1842 we make the fail stack and register vectors global.
1843 The fail stack, we grow to the maximum size when a regexp
1845 The register vectors, we adjust in size each time we
1846 compile a regexp, according to the number of registers it needs. */
1848 static fail_stack_type fail_stack;
1850 /* Size with which the following vectors are currently allocated.
1851 That is so we can make them bigger as needed,
1852 but never make them smaller. */
1853 static int regs_allocated_size;
1855 static re_char ** regstart, ** regend;
1856 static re_char ** old_regstart, ** old_regend;
1857 static re_char **best_regstart, **best_regend;
1858 static register_info_type *reg_info;
1859 static re_char **reg_dummy;
1860 static register_info_type *reg_info_dummy;
1862 /* Make the register vectors big enough for NUM_REGS registers,
1863 but don't make them smaller. */
1866 regex_grow_registers (int num_regs)
1868 if (num_regs > regs_allocated_size)
1870 RETALLOC_IF (regstart, num_regs, re_char *);
1871 RETALLOC_IF (regend, num_regs, re_char *);
1872 RETALLOC_IF (old_regstart, num_regs, re_char *);
1873 RETALLOC_IF (old_regend, num_regs, re_char *);
1874 RETALLOC_IF (best_regstart, num_regs, re_char *);
1875 RETALLOC_IF (best_regend, num_regs, re_char *);
1876 RETALLOC_IF (reg_info, num_regs, register_info_type);
1877 RETALLOC_IF (reg_dummy, num_regs, re_char *);
1878 RETALLOC_IF (reg_info_dummy, num_regs, register_info_type);
1880 regs_allocated_size = num_regs;
1884 #endif /* not MATCH_MAY_ALLOCATE */
1886 /* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX.
1887 Returns one of error codes defined in `regex.h', or zero for success.
1889 Assumes the `allocated' (and perhaps `buffer') and `translate'
1890 fields are set in BUFP on entry.
1892 If it succeeds, results are put in BUFP (if it returns an error, the
1893 contents of BUFP are undefined):
1894 `buffer' is the compiled pattern;
1895 `syntax' is set to SYNTAX;
1896 `used' is set to the length of the compiled pattern;
1897 `fastmap_accurate' is zero;
1898 `re_ngroups' is the number of groups/subexpressions (including shy
1900 `re_nsub' is the number of non-shy groups in PATTERN;
1901 `not_bol' and `not_eol' are zero;
1903 The `fastmap' and `newline_anchor' fields are neither
1904 examined nor set. */
1906 /* Return, freeing storage we allocated. */
1907 #define FREE_STACK_RETURN(value) \
1908 return (free (compile_stack.stack), value)
1910 static reg_errcode_t
1911 regex_compile (re_char *pattern, int size, reg_syntax_t syntax,
1912 struct re_pattern_buffer *bufp)
1914 /* We fetch characters from PATTERN here. We declare these as int
1915 (or possibly long) so that chars above 127 can be used as
1916 array indices. The macros that fetch a character from the pattern
1917 make sure to coerce to unsigned char before assigning, so we won't
1918 get bitten by negative numbers here. */
1919 /* XEmacs change: used to be unsigned char. */
1920 REGISTER EMACS_INT c, c1;
1922 /* A random temporary spot in PATTERN. */
1925 /* Points to the end of the buffer, where we should append. */
1926 REGISTER unsigned char *buf_end;
1928 /* Keeps track of unclosed groups. */
1929 compile_stack_type compile_stack;
1931 /* Points to the current (ending) position in the pattern. */
1932 re_char *p = pattern;
1933 re_char *pend = pattern + size;
1935 /* How to translate the characters in the pattern. */
1936 RE_TRANSLATE_TYPE translate = bufp->translate;
1938 /* Address of the count-byte of the most recently inserted `exactn'
1939 command. This makes it possible to tell if a new exact-match
1940 character can be added to that command or if the character requires
1941 a new `exactn' command. */
1942 unsigned char *pending_exact = 0;
1944 /* Address of start of the most recently finished expression.
1945 This tells, e.g., postfix * where to find the start of its
1946 operand. Reset at the beginning of groups and alternatives. */
1947 unsigned char *laststart = 0;
1949 /* Address of beginning of regexp, or inside of last group. */
1950 unsigned char *begalt;
1952 /* Place in the uncompiled pattern (i.e., the {) to
1953 which to go back if the interval is invalid. */
1954 re_char *beg_interval;
1956 /* Address of the place where a forward jump should go to the end of
1957 the containing expression. Each alternative of an `or' -- except the
1958 last -- ends with a forward jump of this sort. */
1959 unsigned char *fixup_alt_jump = 0;
1961 /* Counts open-groups as they are encountered. Remembered for the
1962 matching close-group on the compile stack, so the same register
1963 number is put in the stop_memory as the start_memory. */
1964 regnum_t regnum = 0;
1967 DEBUG_PRINT1 ("\nCompiling pattern: ");
1972 for (debug_count = 0; debug_count < size; debug_count++)
1973 putchar (pattern[debug_count]);
1978 /* Initialize the compile stack. */
1979 compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t);
1980 if (compile_stack.stack == NULL)
1983 compile_stack.size = INIT_COMPILE_STACK_SIZE;
1984 compile_stack.avail = 0;
1986 /* Initialize the pattern buffer. */
1987 bufp->syntax = syntax;
1988 bufp->fastmap_accurate = 0;
1989 bufp->not_bol = bufp->not_eol = 0;
1991 /* Set `used' to zero, so that if we return an error, the pattern
1992 printer (for debugging) will think there's no pattern. We reset it
1996 /* Always count groups, whether or not bufp->no_sub is set. */
1998 bufp->re_ngroups = 0;
2000 /* Allocate index translation array if needed. */
2001 if (bufp->external_to_internal_register == 0)
2003 bufp->external_to_internal_register_size = INIT_REG_TRANSLATE_SIZE;
2004 RETALLOC (bufp->external_to_internal_register,
2005 bufp->external_to_internal_register_size,
2009 /* Initialize translations to impossible value to aid debugging. */
2013 bufp->external_to_internal_register[0] = 0;
2014 for (i = 1; i < bufp->external_to_internal_register_size; i++)
2015 bufp->external_to_internal_register[i] = (int) 0xDEADBEEF;
2018 #if !defined (emacs) && !defined (SYNTAX_TABLE)
2019 /* Initialize the syntax table. */
2020 init_syntax_once ();
2023 if (bufp->allocated == 0)
2026 { /* If zero allocated, but buffer is non-null, try to realloc
2027 enough space. This loses if buffer's address is bogus, but
2028 that is the user's responsibility. */
2029 RETALLOC (bufp->buffer, INIT_BUF_SIZE, unsigned char);
2032 { /* Caller did not allocate a buffer. Do it for them. */
2033 bufp->buffer = TALLOC (INIT_BUF_SIZE, unsigned char);
2035 if (!bufp->buffer) FREE_STACK_RETURN (REG_ESPACE);
2037 bufp->allocated = INIT_BUF_SIZE;
2040 begalt = buf_end = bufp->buffer;
2042 /* Loop through the uncompiled pattern until we're at the end. */
2051 if ( /* If at start of pattern, it's an operator. */
2053 /* If context independent, it's an operator. */
2054 || syntax & RE_CONTEXT_INDEP_ANCHORS
2055 /* Otherwise, depends on what's come before. */
2056 || at_begline_loc_p (pattern, p, syntax))
2066 if ( /* If at end of pattern, it's an operator. */
2068 /* If context independent, it's an operator. */
2069 || syntax & RE_CONTEXT_INDEP_ANCHORS
2070 /* Otherwise, depends on what's next. */
2071 || at_endline_loc_p (p, pend, syntax))
2081 if ((syntax & RE_BK_PLUS_QM)
2082 || (syntax & RE_LIMITED_OPS))
2086 /* If there is no previous pattern... */
2089 if (syntax & RE_CONTEXT_INVALID_OPS)
2090 FREE_STACK_RETURN (REG_BADRPT);
2091 else if (!(syntax & RE_CONTEXT_INDEP_OPS))
2096 /* true means zero/many matches are allowed. */
2097 re_bool zero_times_ok = c != '+';
2098 re_bool many_times_ok = c != '?';
2100 /* true means match shortest string possible. */
2101 re_bool minimal = false;
2103 /* If there is a sequence of repetition chars, collapse it
2104 down to just one (the right one). We can't combine
2105 interval operators with these because of, e.g., `a{2}*',
2106 which should only match an even number of `a's. */
2111 if (c == '*' || (!(syntax & RE_BK_PLUS_QM)
2112 && (c == '+' || c == '?')))
2115 else if (syntax & RE_BK_PLUS_QM && c == '\\')
2117 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
2120 if (!(c1 == '+' || c1 == '?'))
2135 /* If we get here, we found another repeat character. */
2136 if (!(syntax & RE_NO_MINIMAL_MATCHING))
2138 /* "*?" and "+?" and "??" are okay (and mean match
2139 minimally), but other sequences (such as "*??" and
2140 "+++") are rejected (reserved for future use). */
2141 if (minimal || c != '?')
2142 FREE_STACK_RETURN (REG_BADRPT);
2147 zero_times_ok |= c != '+';
2148 many_times_ok |= c != '?';
2152 /* Star, etc. applied to an empty pattern is equivalent
2153 to an empty pattern. */
2157 /* Now we know whether zero matches is allowed
2158 and whether two or more matches is allowed
2159 and whether we want minimal or maximal matching. */
2165 0: /on_failure_jump to 6
2170 GET_BUFFER_SPACE (6);
2171 INSERT_JUMP (jump, laststart, buf_end + 3);
2173 INSERT_JUMP (on_failure_jump, laststart, laststart + 6);
2176 else if (zero_times_ok)
2181 6: /on_failure_jump to 3
2184 GET_BUFFER_SPACE (6);
2185 INSERT_JUMP (jump, laststart, buf_end + 3);
2187 STORE_JUMP (on_failure_jump, buf_end, laststart + 3);
2194 3: /on_failure_jump to 0
2197 GET_BUFFER_SPACE (3);
2198 STORE_JUMP (on_failure_jump, buf_end, laststart);
2204 /* Are we optimizing this jump? */
2205 re_bool keep_string_p = false;
2208 { /* More than one repetition is allowed, so put in
2209 at the end a backward relative jump from
2210 `buf_end' to before the next jump we're going
2211 to put in below (which jumps from laststart to
2214 But if we are at the `*' in the exact sequence `.*\n',
2215 insert an unconditional jump backwards to the .,
2216 instead of the beginning of the loop. This way we only
2217 push a failure point once, instead of every time
2218 through the loop. */
2219 assert (p - 1 > pattern);
2221 /* Allocate the space for the jump. */
2222 GET_BUFFER_SPACE (3);
2224 /* We know we are not at the first character of the
2225 pattern, because laststart was nonzero. And we've
2226 already incremented `p', by the way, to be the
2227 character after the `*'. Do we have to do something
2228 analogous here for null bytes, because of
2232 && p < pend && *p == '\n'
2233 && !(syntax & RE_DOT_NEWLINE))
2234 { /* We have .*\n. */
2235 STORE_JUMP (jump, buf_end, laststart);
2236 keep_string_p = true;
2239 /* Anything else. */
2240 STORE_JUMP (maybe_pop_jump, buf_end, laststart - 3);
2242 /* We've added more stuff to the buffer. */
2246 /* On failure, jump from laststart to buf_end + 3,
2247 which will be the end of the buffer after this jump
2249 GET_BUFFER_SPACE (3);
2250 INSERT_JUMP (keep_string_p ? on_failure_keep_string_jump
2252 laststart, buf_end + 3);
2257 /* At least one repetition is required, so insert a
2258 `dummy_failure_jump' before the initial
2259 `on_failure_jump' instruction of the loop. This
2260 effects a skip over that instruction the first time
2261 we hit that loop. */
2262 GET_BUFFER_SPACE (3);
2263 INSERT_JUMP (dummy_failure_jump, laststart, laststart + 6);
2273 laststart = buf_end;
2280 /* XEmacs change: this whole section */
2281 re_bool had_char_class = false;
2283 re_bool has_extended_chars = false;
2284 REGISTER Lisp_Object rtab = Qnil;
2287 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2289 /* Ensure that we have enough space to push a charset: the
2290 opcode, the length count, and the bitset; 34 bytes in all. */
2291 GET_BUFFER_SPACE (34);
2293 laststart = buf_end;
2295 /* We test `*p == '^' twice, instead of using an if
2296 statement, so we only need one BUF_PUSH. */
2297 BUF_PUSH (*p == '^' ? charset_not : charset);
2301 /* Remember the first position in the bracket expression. */
2304 /* Push the number of bytes in the bitmap. */
2305 BUF_PUSH ((1 << BYTEWIDTH) / BYTEWIDTH);
2307 /* Clear the whole map. */
2308 memset (buf_end, 0, (1 << BYTEWIDTH) / BYTEWIDTH);
2310 /* charset_not matches newline according to a syntax bit. */
2311 if ((re_opcode_t) buf_end[-2] == charset_not
2312 && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
2313 SET_LIST_BIT ('\n');
2316 start_over_with_extended:
2317 if (has_extended_chars)
2319 /* There are extended chars here, which means we need to start
2320 over and shift to unified range-table format. */
2321 if (buf_end[-2] == charset)
2322 buf_end[-2] = charset_mule;
2324 buf_end[-2] = charset_mule_not;
2326 p = p1; /* go back to the beginning of the charset, after
2328 rtab = Vthe_lisp_rangetab;
2329 Fclear_range_table (rtab);
2331 /* charset_not matches newline according to a syntax bit. */
2332 if ((re_opcode_t) buf_end[-1] == charset_mule_not
2333 && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
2334 SET_EITHER_BIT ('\n');
2338 /* Read in characters and ranges, setting map bits. */
2341 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2346 if (c >= 0x80 && !has_extended_chars)
2348 has_extended_chars = 1;
2349 /* Frumble-bumble, we've found some extended chars.
2350 Need to start over, process everything using
2351 the general extended-char mechanism, and need
2352 to use charset_mule and charset_mule_not instead
2353 of charset and charset_not. */
2354 goto start_over_with_extended;
2357 /* \ might escape characters inside [...] and [^...]. */
2358 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\')
2360 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
2364 if (c1 >= 0x80 && !has_extended_chars)
2366 has_extended_chars = 1;
2367 goto start_over_with_extended;
2370 SET_EITHER_BIT (c1);
2374 /* Could be the end of the bracket expression. If it's
2375 not (i.e., when the bracket expression is `[]' so
2376 far), the ']' character bit gets set way below. */
2377 if (c == ']' && p != p1 + 1)
2380 /* Look ahead to see if it's a range when the last thing
2381 was a character class. */
2382 if (had_char_class && c == '-' && *p != ']')
2383 FREE_STACK_RETURN (REG_ERANGE);
2385 /* Look ahead to see if it's a range when the last thing
2386 was a character: if this is a hyphen not at the
2387 beginning or the end of a list, then it's the range
2390 && !(p - 2 >= pattern && p[-2] == '[')
2391 && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^')
2397 if (* (unsigned char *) p >= 0x80 && !has_extended_chars)
2399 has_extended_chars = 1;
2400 goto start_over_with_extended;
2402 if (has_extended_chars)
2403 ret = compile_extended_range (&p, pend, translate,
2407 ret = compile_range (&p, pend, translate, syntax, buf_end);
2408 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret);
2411 else if (p[0] == '-' && p[1] != ']')
2412 { /* This handles ranges made up of characters only. */
2415 /* Move past the `-'. */
2419 if (* (unsigned char *) p >= 0x80 && !has_extended_chars)
2421 has_extended_chars = 1;
2422 goto start_over_with_extended;
2424 if (has_extended_chars)
2425 ret = compile_extended_range (&p, pend, translate,
2429 ret = compile_range (&p, pend, translate, syntax, buf_end);
2430 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret);
2433 /* See if we're at the beginning of a possible character
2436 else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == ':')
2437 { /* Leave room for the null. */
2438 char str[CHAR_CLASS_MAX_LENGTH + 1];
2443 /* If pattern is `[[:'. */
2444 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2448 /* #### This code is unused.
2449 Correctness is not checked after TRT
2452 if (c == ':' || c == ']' || p == pend
2453 || c1 == CHAR_CLASS_MAX_LENGTH)
2455 str[c1++] = (char) c;
2459 /* If isn't a word bracketed by `[:' and `:]':
2460 undo the ending character, the letters, and leave
2461 the leading `:' and `[' (but set bits for them). */
2462 if (c == ':' && *p == ']')
2465 re_bool is_alnum = STREQ (str, "alnum");
2466 re_bool is_alpha = STREQ (str, "alpha");
2467 re_bool is_blank = STREQ (str, "blank");
2468 re_bool is_cntrl = STREQ (str, "cntrl");
2469 re_bool is_digit = STREQ (str, "digit");
2470 re_bool is_graph = STREQ (str, "graph");
2471 re_bool is_lower = STREQ (str, "lower");
2472 re_bool is_print = STREQ (str, "print");
2473 re_bool is_punct = STREQ (str, "punct");
2474 re_bool is_space = STREQ (str, "space");
2475 re_bool is_upper = STREQ (str, "upper");
2476 re_bool is_xdigit = STREQ (str, "xdigit");
2478 if (!IS_CHAR_CLASS (str))
2479 FREE_STACK_RETURN (REG_ECTYPE);
2481 /* Throw away the ] at the end of the character
2485 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2487 for (ch = 0; ch < 1 << BYTEWIDTH; ch++)
2489 /* This was split into 3 if's to
2490 avoid an arbitrary limit in some compiler. */
2491 if ( (is_alnum && ISALNUM (ch))
2492 || (is_alpha && ISALPHA (ch))
2493 || (is_blank && ISBLANK (ch))
2494 || (is_cntrl && ISCNTRL (ch)))
2495 SET_EITHER_BIT (ch);
2496 if ( (is_digit && ISDIGIT (ch))
2497 || (is_graph && ISGRAPH (ch))
2498 || (is_lower && ISLOWER (ch))
2499 || (is_print && ISPRINT (ch)))
2500 SET_EITHER_BIT (ch);
2501 if ( (is_punct && ISPUNCT (ch))
2502 || (is_space && ISSPACE (ch))
2503 || (is_upper && ISUPPER (ch))
2504 || (is_xdigit && ISXDIGIT (ch)))
2505 SET_EITHER_BIT (ch);
2507 had_char_class = true;
2514 SET_EITHER_BIT ('[');
2515 SET_EITHER_BIT (':');
2516 had_char_class = false;
2521 had_char_class = false;
2527 if (has_extended_chars)
2529 /* We have a range table, not a bit vector. */
2531 unified_range_table_bytes_needed (rtab);
2532 GET_BUFFER_SPACE (bytes_needed);
2533 unified_range_table_copy_data (rtab, buf_end);
2534 buf_end += unified_range_table_bytes_used (buf_end);
2538 /* Discard any (non)matching list bytes that are all 0 at the
2539 end of the map. Decrease the map-length byte too. */
2540 while ((int) buf_end[-1] > 0 && buf_end[buf_end[-1] - 1] == 0)
2542 buf_end += buf_end[-1];
2548 if (syntax & RE_NO_BK_PARENS)
2555 if (syntax & RE_NO_BK_PARENS)
2562 if (syntax & RE_NEWLINE_ALT)
2569 if (syntax & RE_NO_BK_VBAR)
2576 if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES)
2577 goto handle_interval;
2583 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
2585 /* Do not translate the character after the \, so that we can
2586 distinguish, e.g., \B from \b, even if we normally would
2587 translate, e.g., B to b. */
2593 if (syntax & RE_NO_BK_PARENS)
2594 goto normal_backslash;
2601 if (!(syntax & RE_NO_SHY_GROUPS)
2609 case ':': /* shy groups */
2613 /* All others are reserved for future constructs. */
2615 FREE_STACK_RETURN (REG_BADPAT);
2622 /* Record the translation from capturing group index to
2623 register number, reallocating table as needed. */
2626 while (bufp->external_to_internal_register_size <=
2631 bufp->external_to_internal_register_size;
2632 bufp->external_to_internal_register_size += 5;
2633 RETALLOC (bufp->external_to_internal_register,
2634 bufp->external_to_internal_register_size,
2638 i < bufp->external_to_internal_register_size; i++)
2639 bufp->external_to_internal_register[i] =
2643 bufp->external_to_internal_register[bufp->re_nsub] =
2647 if (COMPILE_STACK_FULL)
2649 RETALLOC (compile_stack.stack, compile_stack.size << 1,
2650 compile_stack_elt_t);
2651 if (compile_stack.stack == NULL) return REG_ESPACE;
2653 compile_stack.size <<= 1;
2656 /* These are the values to restore when we hit end of this
2657 group. They are all relative offsets, so that if the
2658 whole pattern moves because of realloc, they will still
2660 COMPILE_STACK_TOP.begalt_offset = begalt - bufp->buffer;
2661 COMPILE_STACK_TOP.fixup_alt_jump
2662 = fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0;
2663 COMPILE_STACK_TOP.laststart_offset = buf_end - bufp->buffer;
2664 COMPILE_STACK_TOP.regnum = r;
2666 /* We will eventually replace the 0 with the number of
2667 groups inner to this one. But do not push a
2668 start_memory for groups beyond the last one we can
2669 represent in the compiled pattern.
2670 #### bad bad bad. this will fail in lots of ways, if we
2671 ever have to backtrack for these groups.
2673 if (r <= MAX_REGNUM)
2675 COMPILE_STACK_TOP.inner_group_offset
2676 = buf_end - bufp->buffer + 2;
2677 BUF_PUSH_3 (start_memory, r, 0);
2680 compile_stack.avail++;
2685 /* If we've reached MAX_REGNUM groups, then this open
2686 won't actually generate any code, so we'll have to
2687 clear pending_exact explicitly. */
2694 if (syntax & RE_NO_BK_PARENS) goto normal_backslash;
2696 if (COMPILE_STACK_EMPTY) {
2697 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
2698 goto normal_backslash;
2700 FREE_STACK_RETURN (REG_ERPAREN);
2705 { /* Push a dummy failure point at the end of the
2706 alternative for a possible future
2707 `pop_failure_jump' to pop. See comments at
2708 `push_dummy_failure' in `re_match_2'. */
2709 BUF_PUSH (push_dummy_failure);
2711 /* We allocated space for this jump when we assigned
2712 to `fixup_alt_jump', in the `handle_alt' case below. */
2713 STORE_JUMP (jump_past_alt, fixup_alt_jump, buf_end - 1);
2716 /* See similar code for backslashed left paren above. */
2717 if (COMPILE_STACK_EMPTY) {
2718 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
2721 FREE_STACK_RETURN (REG_ERPAREN);
2724 /* Since we just checked for an empty stack above, this
2725 ``can't happen''. */
2726 assert (compile_stack.avail != 0);
2728 /* We don't just want to restore into `regnum', because
2729 later groups should continue to be numbered higher,
2730 as in `(ab)c(de)' -- the second group is #2. */
2731 regnum_t this_group_regnum;
2733 compile_stack.avail--;
2734 begalt = bufp->buffer + COMPILE_STACK_TOP.begalt_offset;
2736 = COMPILE_STACK_TOP.fixup_alt_jump
2737 ? bufp->buffer + COMPILE_STACK_TOP.fixup_alt_jump - 1
2739 laststart = bufp->buffer + COMPILE_STACK_TOP.laststart_offset;
2740 this_group_regnum = COMPILE_STACK_TOP.regnum;
2741 /* If we've reached MAX_REGNUM groups, then this open
2742 won't actually generate any code, so we'll have to
2743 clear pending_exact explicitly. */
2746 /* We're at the end of the group, so now we know how many
2747 groups were inside this one. */
2748 if (this_group_regnum <= MAX_REGNUM)
2750 unsigned char *inner_group_loc
2751 = bufp->buffer + COMPILE_STACK_TOP.inner_group_offset;
2753 *inner_group_loc = regnum - this_group_regnum;
2754 BUF_PUSH_3 (stop_memory, this_group_regnum,
2755 regnum - this_group_regnum);
2761 case '|': /* `\|'. */
2762 if (syntax & RE_LIMITED_OPS || syntax & RE_NO_BK_VBAR)
2763 goto normal_backslash;
2765 if (syntax & RE_LIMITED_OPS)
2768 /* Insert before the previous alternative a jump which
2769 jumps to this alternative if the former fails. */
2770 GET_BUFFER_SPACE (3);
2771 INSERT_JUMP (on_failure_jump, begalt, buf_end + 6);
2775 /* The alternative before this one has a jump after it
2776 which gets executed if it gets matched. Adjust that
2777 jump so it will jump to this alternative's analogous
2778 jump (put in below, which in turn will jump to the next
2779 (if any) alternative's such jump, etc.). The last such
2780 jump jumps to the correct final destination. A picture:
2786 If we are at `b', then fixup_alt_jump right now points to a
2787 three-byte space after `a'. We'll put in the jump, set
2788 fixup_alt_jump to right after `b', and leave behind three
2789 bytes which we'll fill in when we get to after `c'. */
2792 STORE_JUMP (jump_past_alt, fixup_alt_jump, buf_end);
2794 /* Mark and leave space for a jump after this alternative,
2795 to be filled in later either by next alternative or
2796 when know we're at the end of a series of alternatives. */
2797 fixup_alt_jump = buf_end;
2798 GET_BUFFER_SPACE (3);
2807 /* If \{ is a literal. */
2808 if (!(syntax & RE_INTERVALS)
2809 /* If we're at `\{' and it's not the open-interval
2811 || ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
2812 || (p - 2 == pattern && p == pend))
2813 goto normal_backslash;
2817 /* If got here, then the syntax allows intervals. */
2819 /* At least (most) this many matches must be made. */
2820 int lower_bound = -1, upper_bound = -1;
2822 beg_interval = p - 1;
2826 if (syntax & RE_NO_BK_BRACES)
2827 goto unfetch_interval;
2829 FREE_STACK_RETURN (REG_EBRACE);
2832 GET_UNSIGNED_NUMBER (lower_bound);
2836 GET_UNSIGNED_NUMBER (upper_bound);
2837 if (upper_bound < 0) upper_bound = RE_DUP_MAX;
2840 /* Interval such as `{1}' => match exactly once. */
2841 upper_bound = lower_bound;
2843 if (lower_bound < 0 || upper_bound > RE_DUP_MAX
2844 || lower_bound > upper_bound)
2846 if (syntax & RE_NO_BK_BRACES)
2847 goto unfetch_interval;
2849 FREE_STACK_RETURN (REG_BADBR);
2852 if (!(syntax & RE_NO_BK_BRACES))
2854 if (c != '\\') FREE_STACK_RETURN (REG_EBRACE);
2861 if (syntax & RE_NO_BK_BRACES)
2862 goto unfetch_interval;
2864 FREE_STACK_RETURN (REG_BADBR);
2867 /* We just parsed a valid interval. */
2869 /* If it's invalid to have no preceding re. */
2872 if (syntax & RE_CONTEXT_INVALID_OPS)
2873 FREE_STACK_RETURN (REG_BADRPT);
2874 else if (syntax & RE_CONTEXT_INDEP_OPS)
2875 laststart = buf_end;
2877 goto unfetch_interval;
2880 /* If the upper bound is zero, don't want to succeed at
2881 all; jump from `laststart' to `b + 3', which will be
2882 the end of the buffer after we insert the jump. */
2883 if (upper_bound == 0)
2885 GET_BUFFER_SPACE (3);
2886 INSERT_JUMP (jump, laststart, buf_end + 3);
2890 /* Otherwise, we have a nontrivial interval. When
2891 we're all done, the pattern will look like:
2892 set_number_at <jump count> <upper bound>
2893 set_number_at <succeed_n count> <lower bound>
2894 succeed_n <after jump addr> <succeed_n count>
2896 jump_n <succeed_n addr> <jump count>
2897 (The upper bound and `jump_n' are omitted if
2898 `upper_bound' is 1, though.) */
2900 { /* If the upper bound is > 1, we need to insert
2901 more at the end of the loop. */
2902 Memory_count nbytes = 10 + (upper_bound > 1) * 10;
2904 GET_BUFFER_SPACE (nbytes);
2906 /* Initialize lower bound of the `succeed_n', even
2907 though it will be set during matching by its
2908 attendant `set_number_at' (inserted next),
2909 because `re_compile_fastmap' needs to know.
2910 Jump to the `jump_n' we might insert below. */
2911 INSERT_JUMP2 (succeed_n, laststart,
2912 buf_end + 5 + (upper_bound > 1) * 5,
2916 /* Code to initialize the lower bound. Insert
2917 before the `succeed_n'. The `5' is the last two
2918 bytes of this `set_number_at', plus 3 bytes of
2919 the following `succeed_n'. */
2920 insert_op2 (set_number_at, laststart, 5, lower_bound, buf_end);
2923 if (upper_bound > 1)
2924 { /* More than one repetition is allowed, so
2925 append a backward jump to the `succeed_n'
2926 that starts this interval.
2928 When we've reached this during matching,
2929 we'll have matched the interval once, so
2930 jump back only `upper_bound - 1' times. */
2931 STORE_JUMP2 (jump_n, buf_end, laststart + 5,
2935 /* The location we want to set is the second
2936 parameter of the `jump_n'; that is `b-2' as
2937 an absolute address. `laststart' will be
2938 the `set_number_at' we're about to insert;
2939 `laststart+3' the number to set, the source
2940 for the relative address. But we are
2941 inserting into the middle of the pattern --
2942 so everything is getting moved up by 5.
2943 Conclusion: (b - 2) - (laststart + 3) + 5,
2944 i.e., b - laststart.
2946 We insert this at the beginning of the loop
2947 so that if we fail during matching, we'll
2948 reinitialize the bounds. */
2949 insert_op2 (set_number_at, laststart,
2950 buf_end - laststart,
2951 upper_bound - 1, buf_end);
2956 beg_interval = NULL;
2961 /* If an invalid interval, match the characters as literals. */
2962 assert (beg_interval);
2964 beg_interval = NULL;
2966 /* normal_char and normal_backslash need `c'. */
2969 if (!(syntax & RE_NO_BK_BRACES))
2971 if (p > pattern && p[-1] == '\\')
2972 goto normal_backslash;
2977 /* There is no way to specify the before_dot and after_dot
2978 operators. rms says this is ok. --karl */
2984 laststart = buf_end;
2986 /* XEmacs addition */
2987 if (c >= 0x80 || syntax_spec_code[c] == 0377)
2988 FREE_STACK_RETURN (REG_ESYNTAX);
2989 BUF_PUSH_2 (syntaxspec, syntax_spec_code[c]);
2993 laststart = buf_end;
2995 /* XEmacs addition */
2996 if (c >= 0x80 || syntax_spec_code[c] == 0377)
2997 FREE_STACK_RETURN (REG_ESYNTAX);
2998 BUF_PUSH_2 (notsyntaxspec, syntax_spec_code[c]);
3002 /* 97.2.17 jhod merged in to XEmacs from mule-2.3 */
3004 laststart = buf_end;
3006 if (c < 32 || c > 127)
3007 FREE_STACK_RETURN (REG_ECATEGORY);
3008 BUF_PUSH_2 (categoryspec, c);
3012 laststart = buf_end;
3014 if (c < 32 || c > 127)
3015 FREE_STACK_RETURN (REG_ECATEGORY);
3016 BUF_PUSH_2 (notcategoryspec, c);
3018 /* end of category patch */
3024 laststart = buf_end;
3025 BUF_PUSH (wordchar);
3030 laststart = buf_end;
3031 BUF_PUSH (notwordchar);
3044 BUF_PUSH (wordbound);
3048 BUF_PUSH (notwordbound);
3059 case '1': case '2': case '3': case '4': case '5':
3060 case '6': case '7': case '8': case '9':
3064 if (syntax & RE_NO_BK_REFS)
3067 /* External register indexing. */
3070 if (reg > bufp->re_nsub)
3071 FREE_STACK_RETURN (REG_ESUBREG);
3073 /* Convert external to internal as soon as possible. */
3074 reg = bufp->external_to_internal_register[reg];
3076 /* Can't back reference to a subexpression if inside it. */
3077 if (group_in_compile_stack (compile_stack, reg))
3080 laststart = buf_end;
3081 BUF_PUSH_2 (duplicate, reg);
3088 if (syntax & RE_BK_PLUS_QM)
3091 goto normal_backslash;
3095 /* You might think it would be useful for \ to mean
3096 not to translate; but if we don't translate it,
3097 it will never match anything. */
3105 /* Expects the character in `c'. */
3106 /* `p' points to the location after where `c' came from. */
3109 /* XEmacs: modifications here for Mule. */
3110 /* `q' points to the beginning of the next char. */
3113 /* If no exactn currently being built. */
3116 /* If last exactn not at current position. */
3117 || pending_exact + *pending_exact + 1 != buf_end
3119 /* We have only one byte following the exactn for the count. */
3120 || ((unsigned int) (*pending_exact + (q - p)) >=
3121 ((unsigned int) (1 << BYTEWIDTH) - 1))
3123 /* If followed by a repetition operator. */
3124 || *q == '*' || *q == '^'
3125 || ((syntax & RE_BK_PLUS_QM)
3126 ? *q == '\\' && (q[1] == '+' || q[1] == '?')
3127 : (*q == '+' || *q == '?'))
3128 || ((syntax & RE_INTERVALS)
3129 && ((syntax & RE_NO_BK_BRACES)
3131 : (q[0] == '\\' && q[1] == '{'))))
3133 /* Start building a new exactn. */
3135 laststart = buf_end;
3137 BUF_PUSH_2 (exactn, 0);
3138 pending_exact = buf_end - 1;
3147 Bufbyte tmp_buf[MAX_EMCHAR_LEN];
3150 bt_count = set_charptr_emchar (tmp_buf, c);
3152 for (i = 0; i < bt_count; i++)
3154 BUF_PUSH (tmp_buf[i]);
3162 } /* while p != pend */
3165 /* Through the pattern now. */
3168 STORE_JUMP (jump_past_alt, fixup_alt_jump, buf_end);
3170 if (!COMPILE_STACK_EMPTY)
3171 FREE_STACK_RETURN (REG_EPAREN);
3173 /* If we don't want backtracking, force success
3174 the first time we reach the end of the compiled pattern. */
3175 if (syntax & RE_NO_POSIX_BACKTRACKING)
3178 free (compile_stack.stack);
3180 /* We have succeeded; set the length of the buffer. */
3181 bufp->used = buf_end - bufp->buffer;
3186 DEBUG_PRINT1 ("\nCompiled pattern: \n");
3187 print_compiled_pattern (bufp);
3191 #ifndef MATCH_MAY_ALLOCATE
3192 /* Initialize the failure stack to the largest possible stack. This
3193 isn't necessary unless we're trying to avoid calling alloca in
3194 the search and match routines. */
3196 int num_regs = bufp->re_ngroups + 1;
3198 /* Since DOUBLE_FAIL_STACK refuses to double only if the current size
3199 is strictly greater than re_max_failures, the largest possible stack
3200 is 2 * re_max_failures failure points. */
3201 if (fail_stack.size < (2 * re_max_failures * MAX_FAILURE_ITEMS))
3203 fail_stack.size = (2 * re_max_failures * MAX_FAILURE_ITEMS);
3206 if (! fail_stack.stack)
3208 = (fail_stack_elt_t *) xmalloc (fail_stack.size
3209 * sizeof (fail_stack_elt_t));
3212 = (fail_stack_elt_t *) xrealloc (fail_stack.stack,
3214 * sizeof (fail_stack_elt_t)));
3215 #else /* not emacs */
3216 if (! fail_stack.stack)
3218 = (fail_stack_elt_t *) malloc (fail_stack.size
3219 * sizeof (fail_stack_elt_t));
3222 = (fail_stack_elt_t *) realloc (fail_stack.stack,
3224 * sizeof (fail_stack_elt_t)));
3228 regex_grow_registers (num_regs);
3230 #endif /* not MATCH_MAY_ALLOCATE */
3233 } /* regex_compile */
3235 /* Subroutines for `regex_compile'. */
3237 /* Store OP at LOC followed by two-byte integer parameter ARG. */
3240 store_op1 (re_opcode_t op, unsigned char *loc, int arg)
3242 *loc = (unsigned char) op;
3243 STORE_NUMBER (loc + 1, arg);
3247 /* Like `store_op1', but for two two-byte parameters ARG1 and ARG2. */
3250 store_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2)
3252 *loc = (unsigned char) op;
3253 STORE_NUMBER (loc + 1, arg1);
3254 STORE_NUMBER (loc + 3, arg2);
3258 /* Copy the bytes from LOC to END to open up three bytes of space at LOC
3259 for OP followed by two-byte integer parameter ARG. */
3262 insert_op1 (re_opcode_t op, unsigned char *loc, int arg, unsigned char *end)
3264 REGISTER unsigned char *pfrom = end;
3265 REGISTER unsigned char *pto = end + 3;
3267 while (pfrom != loc)
3270 store_op1 (op, loc, arg);
3274 /* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2. */
3277 insert_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2,
3280 REGISTER unsigned char *pfrom = end;
3281 REGISTER unsigned char *pto = end + 5;
3283 while (pfrom != loc)
3286 store_op2 (op, loc, arg1, arg2);
3290 /* P points to just after a ^ in PATTERN. Return true if that ^ comes
3291 after an alternative or a begin-subexpression. We assume there is at
3292 least one character before the ^. */
3295 at_begline_loc_p (re_char *pattern, re_char *p, reg_syntax_t syntax)
3297 re_char *prev = p - 2;
3298 re_bool prev_prev_backslash = prev > pattern && prev[-1] == '\\';
3301 /* After a subexpression? */
3302 (*prev == '(' && (syntax & RE_NO_BK_PARENS || prev_prev_backslash))
3303 /* After an alternative? */
3304 || (*prev == '|' && (syntax & RE_NO_BK_VBAR || prev_prev_backslash));
3308 /* The dual of at_begline_loc_p. This one is for $. We assume there is
3309 at least one character after the $, i.e., `P < PEND'. */
3312 at_endline_loc_p (re_char *p, re_char *pend, int syntax)
3315 re_bool next_backslash = *next == '\\';
3316 re_char *next_next = p + 1 < pend ? p + 1 : 0;
3319 /* Before a subexpression? */
3320 (syntax & RE_NO_BK_PARENS ? *next == ')'
3321 : next_backslash && next_next && *next_next == ')')
3322 /* Before an alternative? */
3323 || (syntax & RE_NO_BK_VBAR ? *next == '|'
3324 : next_backslash && next_next && *next_next == '|');
3328 /* Returns true if REGNUM is in one of COMPILE_STACK's elements and
3329 false if it's not. */
3332 group_in_compile_stack (compile_stack_type compile_stack, regnum_t regnum)
3336 for (this_element = compile_stack.avail - 1;
3339 if (compile_stack.stack[this_element].regnum == regnum)
3346 /* Read the ending character of a range (in a bracket expression) from the
3347 uncompiled pattern *P_PTR (which ends at PEND). We assume the
3348 starting character is in `P[-2]'. (`P[-1]' is the character `-'.)
3349 Then we set the translation of all bits between the starting and
3350 ending characters (inclusive) in the compiled pattern B.
3352 Return an error code.
3354 We use these short variable names so we can use the same macros as
3355 `regex_compile' itself. */
3357 static reg_errcode_t
3358 compile_range (re_char **p_ptr, re_char *pend, RE_TRANSLATE_TYPE translate,
3359 reg_syntax_t syntax, unsigned char *buf_end)
3361 Element_count this_char;
3363 re_char *p = *p_ptr;
3364 int range_start, range_end;
3369 /* Even though the pattern is a signed `char *', we need to fetch
3370 with unsigned char *'s; if the high bit of the pattern character
3371 is set, the range endpoints will be negative if we fetch using a
3374 We also want to fetch the endpoints without translating them; the
3375 appropriate translation is done in the bit-setting loop below. */
3376 /* The SVR4 compiler on the 3B2 had trouble with unsigned const char *. */
3377 range_start = ((const unsigned char *) p)[-2];
3378 range_end = ((const unsigned char *) p)[0];
3380 /* Have to increment the pointer into the pattern string, so the
3381 caller isn't still at the ending character. */
3384 /* If the start is after the end, the range is empty. */
3385 if (range_start > range_end)
3386 return syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR;
3388 /* Here we see why `this_char' has to be larger than an `unsigned
3389 char' -- the range is inclusive, so if `range_end' == 0xff
3390 (assuming 8-bit characters), we would otherwise go into an infinite
3391 loop, since all characters <= 0xff. */
3392 for (this_char = range_start; this_char <= range_end; this_char++)
3394 SET_LIST_BIT (TRANSLATE (this_char));
3402 static reg_errcode_t
3403 compile_extended_range (re_char **p_ptr, re_char *pend,
3404 RE_TRANSLATE_TYPE translate,
3405 reg_syntax_t syntax, Lisp_Object rtab)
3407 Emchar this_char, range_start, range_end;
3413 p = (const Bufbyte *) *p_ptr;
3414 range_end = charptr_emchar (p);
3415 p--; /* back to '-' */
3416 DEC_CHARPTR (p); /* back to start of range */
3417 /* We also want to fetch the endpoints without translating them; the
3418 appropriate translation is done in the bit-setting loop below. */
3419 range_start = charptr_emchar (p);
3420 INC_CHARPTR (*p_ptr);
3422 /* If the start is after the end, the range is empty. */
3423 if (range_start > range_end)
3424 return syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR;
3426 /* Can't have ranges spanning different charsets, except maybe for
3427 ranges entirely within the first 256 chars. */
3429 if ((range_start >= 0x100 || range_end >= 0x100)
3431 && CHAR_CHARSET_ID (range_start) != CHAR_CHARSET_ID (range_end)
3433 && CHAR_LEADING_BYTE (range_start) != CHAR_LEADING_BYTE (range_end)
3436 return REG_ERANGESPAN;
3438 /* As advertised, translations only work over the 0 - 0x7F range.
3439 Making this kind of stuff work generally is much harder.
3440 Iterating over the whole range like this would be way efficient
3441 if the range encompasses 10,000 chars or something. You'd have
3442 to do something like this:
3446 map over translation table in [range_start, range_end] of
3447 (put the mapped range in a;
3448 put the translation in b)
3449 invert the range in a and truncate to [range_start, range_end]
3450 compute the union of a, b
3451 union the result into rtab
3453 for (this_char = range_start;
3454 this_char <= range_end && this_char < 0x80; this_char++)
3456 SET_RANGETAB_BIT (TRANSLATE (this_char));
3459 if (this_char <= range_end)
3460 put_range_table (rtab, this_char, range_end, Qt);
3467 /* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in
3468 BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible
3469 characters can start a string that matches the pattern. This fastmap
3470 is used by re_search to skip quickly over impossible starting points.
3472 The caller must supply the address of a (1 << BYTEWIDTH)-byte data
3473 area as BUFP->fastmap.
3475 We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in
3478 Returns 0 if we succeed, -2 if an internal error. */
3481 re_compile_fastmap (struct re_pattern_buffer *bufp)
3484 #ifdef MATCH_MAY_ALLOCATE
3485 fail_stack_type fail_stack;
3487 DECLARE_DESTINATION;
3488 /* We don't push any register information onto the failure stack. */
3490 REGISTER char *fastmap = bufp->fastmap;
3491 unsigned char *pattern = bufp->buffer;
3492 unsigned long size = bufp->used;
3493 unsigned char *p = pattern;
3494 REGISTER unsigned char *pend = pattern + size;
3497 /* This holds the pointer to the failure stack, when
3498 it is allocated relocatably. */
3499 fail_stack_elt_t *failure_stack_ptr;
3502 /* Assume that each path through the pattern can be null until
3503 proven otherwise. We set this false at the bottom of switch
3504 statement, to which we get only if a particular path doesn't
3505 match the empty string. */
3506 re_bool path_can_be_null = true;
3508 /* We aren't doing a `succeed_n' to begin with. */
3509 re_bool succeed_n_p = false;
3511 assert (fastmap != NULL && p != NULL);
3514 memset (fastmap, 0, 1 << BYTEWIDTH); /* Assume nothing's valid. */
3515 bufp->fastmap_accurate = 1; /* It will be when we're done. */
3516 bufp->can_be_null = 0;
3520 if (p == pend || *p == succeed)
3522 /* We have reached the (effective) end of pattern. */
3523 if (!FAIL_STACK_EMPTY ())
3525 bufp->can_be_null |= path_can_be_null;
3527 /* Reset for next path. */
3528 path_can_be_null = true;
3530 p = (unsigned char *) fail_stack.stack[--fail_stack.avail].pointer;
3538 /* We should never be about to go beyond the end of the pattern. */
3541 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
3544 /* I guess the idea here is to simply not bother with a fastmap
3545 if a backreference is used, since it's too hard to figure out
3546 the fastmap for the corresponding group. Setting
3547 `can_be_null' stops `re_search_2' from using the fastmap, so
3548 that is all we do. */
3550 bufp->can_be_null = 1;
3554 /* Following are the cases which match a character. These end
3563 /* XEmacs: Under Mule, these bit vectors will
3564 only contain values for characters below 0x80. */
3565 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
3566 if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))
3572 /* Chars beyond end of map must be allowed. */
3574 for (j = *p * BYTEWIDTH; j < 0x80; j++)
3576 /* And all extended characters must be allowed, too. */
3577 for (j = 0x80; j < 0xA0; j++)
3579 #else /* not MULE */
3580 for (j = *p * BYTEWIDTH; j < (1 << BYTEWIDTH); j++)
3584 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
3585 if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))))
3595 nentries = unified_range_table_nentries (p);
3596 for (i = 0; i < nentries; i++)
3598 EMACS_INT first, last;
3599 Lisp_Object dummy_val;
3601 Bufbyte strr[MAX_EMCHAR_LEN];
3603 unified_range_table_get_range (p, i, &first, &last,
3605 for (jj = first; jj <= last && jj < 0x80; jj++)
3607 /* Ranges below 0x100 can span charsets, but there
3608 are only two (Control-1 and Latin-1), and
3609 either first or last has to be in them. */
3610 set_charptr_emchar (strr, first);
3614 set_charptr_emchar (strr, last);
3621 case charset_mule_not:
3626 nentries = unified_range_table_nentries (p);
3627 for (i = 0; i < nentries; i++)
3629 EMACS_INT first, last;
3630 Lisp_Object dummy_val;
3632 int smallest_prev = 0;
3634 unified_range_table_get_range (p, i, &first, &last,
3636 for (jj = smallest_prev; jj < first && jj < 0x80; jj++)
3638 smallest_prev = last + 1;
3639 if (smallest_prev >= 0x80)
3642 /* Calculating which leading bytes are actually allowed
3643 here is rather difficult, so we just punt and allow
3645 for (i = 0x80; i < 0xA0; i++)
3657 for (j = 0; j < (1 << BYTEWIDTH); j++)
3660 (regex_emacs_buffer->mirror_syntax_table), j) == Sword)
3669 goto matchnotsyntax;
3671 for (j = 0; j < (1 << BYTEWIDTH); j++)
3674 (regex_emacs_buffer->mirror_syntax_table), j) != Sword)
3682 int fastmap_newline = fastmap['\n'];
3684 /* `.' matches anything ... */
3686 /* "anything" only includes bytes that can be the
3687 first byte of a character. */
3688 for (j = 0; j < 0xA0; j++)
3691 for (j = 0; j < (1 << BYTEWIDTH); j++)
3695 /* ... except perhaps newline. */
3696 if (!(bufp->syntax & RE_DOT_NEWLINE))
3697 fastmap['\n'] = fastmap_newline;
3699 /* Return if we have already set `can_be_null'; if we have,
3700 then the fastmap is irrelevant. Something's wrong here. */
3701 else if (bufp->can_be_null)
3704 /* Otherwise, have to check alternative paths. */
3715 /* This match depends on text properties. These end with
3716 aborting optimizations. */
3717 bufp->can_be_null = 1;
3721 #if 0 /* Removed during syntax-table properties patch -- 2000/12/07 mct */
3728 for (j = 0; j < 0x80; j++)
3731 (regex_emacs_buffer->syntax_table), j) ==
3732 (enum syntaxcode) k)
3735 for (j = 0; j < 0x80; j++)
3738 (regex_emacs_buffer->mirror_syntax_table), j) ==
3739 (enum syntaxcode) k)
3742 for (j = 0x80; j < 0xA0; j++)
3745 if (LEADING_BYTE_PREFIX_P(j))
3746 /* too complicated to calculate this right */
3754 cset = CHARSET_BY_LEADING_BYTE (j);
3755 if (CHARSETP (cset))
3757 if (charset_syntax (regex_emacs_buffer, cset,
3759 == Sword || multi_p)
3766 #else /* not MULE */
3767 for (j = 0; j < (1 << BYTEWIDTH); j++)
3770 (regex_emacs_buffer->mirror_syntax_table), j) ==
3771 (enum syntaxcode) k)
3777 #if 0 /* Removed during syntax-table properties patch -- 2000/12/07 mct */
3784 for (j = 0; j < 0x80; j++)
3787 (regex_emacs_buffer->syntax_table), j) !=
3788 (enum syntaxcode) k)
3791 for (j = 0; j < 0x80; j++)
3794 (regex_emacs_buffer->mirror_syntax_table), j) !=
3795 (enum syntaxcode) k)
3798 for (j = 0x80; j < 0xA0; j++)
3801 if (LEADING_BYTE_PREFIX_P(j))
3802 /* too complicated to calculate this right */
3810 cset = CHARSET_BY_LEADING_BYTE (j);
3811 if (CHARSETP (cset))
3813 if (charset_syntax (regex_emacs_buffer, cset,
3815 != Sword || multi_p)
3822 #else /* not MULE */
3823 for (j = 0; j < (1 << BYTEWIDTH); j++)
3826 (regex_emacs_buffer->mirror_syntax_table), j) !=
3827 (enum syntaxcode) k)
3834 /* 97/2/17 jhod category patch */
3836 case notcategoryspec:
3837 bufp->can_be_null = 1;
3839 /* end if category patch */
3842 /* All cases after this match the empty string. These end with
3864 case push_dummy_failure:
3869 case pop_failure_jump:
3870 case maybe_pop_jump:
3873 case dummy_failure_jump:
3874 EXTRACT_NUMBER_AND_INCR (j, p);
3879 /* Jump backward implies we just went through the body of a
3880 loop and matched nothing. Opcode jumped to should be
3881 `on_failure_jump' or `succeed_n'. Just treat it like an
3882 ordinary jump. For a * loop, it has pushed its failure
3883 point already; if so, discard that as redundant. */
3884 if ((re_opcode_t) *p != on_failure_jump
3885 && (re_opcode_t) *p != succeed_n)
3889 EXTRACT_NUMBER_AND_INCR (j, p);
3892 /* If what's on the stack is where we are now, pop it. */
3893 if (!FAIL_STACK_EMPTY ()
3894 && fail_stack.stack[fail_stack.avail - 1].pointer == p)
3900 case on_failure_jump:
3901 case on_failure_keep_string_jump:
3902 handle_on_failure_jump:
3903 EXTRACT_NUMBER_AND_INCR (j, p);
3905 /* For some patterns, e.g., `(a?)?', `p+j' here points to the
3906 end of the pattern. We don't want to push such a point,
3907 since when we restore it above, entering the switch will
3908 increment `p' past the end of the pattern. We don't need
3909 to push such a point since we obviously won't find any more
3910 fastmap entries beyond `pend'. Such a pattern can match
3911 the null string, though. */
3914 if (!PUSH_PATTERN_OP (p + j, fail_stack))
3916 RESET_FAIL_STACK ();
3921 bufp->can_be_null = 1;
3925 EXTRACT_NUMBER_AND_INCR (k, p); /* Skip the n. */
3926 succeed_n_p = false;
3933 /* Get to the number of times to succeed. */
3936 /* Increment p past the n for when k != 0. */
3937 EXTRACT_NUMBER_AND_INCR (k, p);
3941 succeed_n_p = true; /* Spaghetti code alert. */
3942 goto handle_on_failure_jump;
3959 ABORT (); /* We have listed all the cases. */
3962 /* Getting here means we have found the possible starting
3963 characters for one path of the pattern -- and that the empty
3964 string does not match. We need not follow this path further.
3965 Instead, look at the next alternative (remembered on the
3966 stack), or quit if no more. The test at the top of the loop
3967 does these things. */
3968 path_can_be_null = false;
3972 /* Set `can_be_null' for the last path (also the first path, if the
3973 pattern is empty). */
3974 bufp->can_be_null |= path_can_be_null;
3977 RESET_FAIL_STACK ();
3979 } /* re_compile_fastmap */
3981 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
3982 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
3983 this memory for recording register information. STARTS and ENDS
3984 must be allocated using the malloc library routine, and must each
3985 be at least NUM_REGS * sizeof (regoff_t) bytes long.
3987 If NUM_REGS == 0, then subsequent matches should allocate their own
3990 Unless this function is called, the first search or match using
3991 PATTERN_BUFFER will allocate its own register data, without
3992 freeing the old data. */
3995 re_set_registers (struct re_pattern_buffer *bufp, struct re_registers *regs,
3996 unsigned num_regs, regoff_t *starts, regoff_t *ends)
4000 bufp->regs_allocated = REGS_REALLOCATE;
4001 regs->num_regs = num_regs;
4002 regs->start = starts;
4007 bufp->regs_allocated = REGS_UNALLOCATED;
4009 regs->start = regs->end = (regoff_t *) 0;
4013 /* Searching routines. */
4015 /* Like re_search_2, below, but only one string is specified, and
4016 doesn't let you say where to stop matching. */
4019 re_search (struct re_pattern_buffer *bufp, const char *string, int size,
4020 int startpos, int range, struct re_registers *regs)
4022 return re_search_2 (bufp, NULL, 0, string, size, startpos, range,
4027 /* Snarfed from src/lisp.h, needed for compiling [ce]tags. */
4028 # define bytecount_to_charcount(ptr, len) (len)
4029 # define charcount_to_bytecount(ptr, len) (len)
4030 typedef int Charcount;
4033 /* Using the compiled pattern in BUFP->buffer, first tries to match the
4034 virtual concatenation of STRING1 and STRING2, starting first at index
4035 STARTPOS, then at STARTPOS + 1, and so on.
4037 With MULE, STARTPOS is a byte position, not a char position. And the
4038 search will increment STARTPOS by the width of the current leading
4041 STRING1 and STRING2 have length SIZE1 and SIZE2, respectively.
4043 RANGE is how far to scan while trying to match. RANGE = 0 means try
4044 only at STARTPOS; in general, the last start tried is STARTPOS +
4047 With MULE, RANGE is a byte position, not a char position. The last
4048 start tried is the character starting <= STARTPOS + RANGE.
4050 In REGS, return the indices of the virtual concatenation of STRING1
4051 and STRING2 that matched the entire BUFP->buffer and its contained
4054 Do not consider matching one past the index STOP in the virtual
4055 concatenation of STRING1 and STRING2.
4057 We return either the position in the strings at which the match was
4058 found, -1 if no match, or -2 if error (such as failure
4062 re_search_2 (struct re_pattern_buffer *bufp, const char *str1,
4063 int size1, const char *str2, int size2, int startpos,
4064 int range, struct re_registers *regs, int stop)
4067 re_char *string1 = (re_char *) str1;
4068 re_char *string2 = (re_char *) str2;
4069 REGISTER char *fastmap = bufp->fastmap;
4070 REGISTER RE_TRANSLATE_TYPE translate = bufp->translate;
4071 int total_size = size1 + size2;
4072 int endpos = startpos + range;
4073 #ifdef REGEX_BEGLINE_CHECK
4074 int anchored_at_begline = 0;
4079 /* Check for out-of-range STARTPOS. */
4080 if (startpos < 0 || startpos > total_size)
4083 /* Fix up RANGE if it might eventually take us outside
4084 the virtual concatenation of STRING1 and STRING2. */
4086 range = 0 - startpos;
4087 else if (endpos > total_size)
4088 range = total_size - startpos;
4090 /* If the search isn't to be a backwards one, don't waste time in a
4091 search for a pattern that must be anchored. */
4092 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == begbuf && range > 0)
4098 d = ((const unsigned char *)
4099 (startpos >= size1 ? string2 - size1 : string1) + startpos);
4100 range = charcount_to_bytecount (d, 1);
4105 /* In a forward search for something that starts with \=.
4106 don't keep searching past point. */
4107 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == at_dot && range > 0)
4109 range = BUF_PT (regex_emacs_buffer) - BUF_BEGV (regex_emacs_buffer)
4116 /* Update the fastmap now if not correct already. */
4117 if (fastmap && !bufp->fastmap_accurate)
4118 if (re_compile_fastmap (bufp) == -2)
4121 #ifdef REGEX_BEGLINE_CHECK
4123 unsigned long i = 0;
4125 while (i < bufp->used)
4127 if (bufp->buffer[i] == start_memory ||
4128 bufp->buffer[i] == stop_memory)
4133 anchored_at_begline = i < bufp->used && bufp->buffer[i] == begline;
4138 SETUP_SYNTAX_CACHE_FOR_OBJECT (regex_match_object,
4140 SYNTAX_CACHE_OBJECT_BYTE_TO_CHAR (regex_match_object,
4146 /* Loop through the string, looking for a place to start matching. */
4149 #ifdef REGEX_BEGLINE_CHECK
4150 /* If the regex is anchored at the beginning of a line (i.e. with a ^),
4151 then we can speed things up by skipping to the next beginning-of-
4153 if (anchored_at_begline && startpos > 0 && startpos != size1 &&
4156 /* whose stupid idea was it anyway to make this
4157 function take two strings to match?? */
4161 if (startpos < size1 && startpos + range >= size1)
4162 lim = range - (size1 - startpos);
4164 d = ((const unsigned char *)
4165 (startpos >= size1 ? string2 - size1 : string1) + startpos);
4166 DEC_CHARPTR(d); /* Ok, since startpos != size1. */
4167 d_size = charcount_to_bytecount (d, 1);
4169 if (TRANSLATE_P (translate))
4170 while (range > lim && *d != '\n')
4172 d += d_size; /* Speedier INC_CHARPTR(d) */
4173 d_size = charcount_to_bytecount (d, 1);
4177 while (range > lim && *d != '\n')
4179 d += d_size; /* Speedier INC_CHARPTR(d) */
4180 d_size = charcount_to_bytecount (d, 1);
4184 startpos += irange - range;
4186 #endif /* REGEX_BEGLINE_CHECK */
4188 /* If a fastmap is supplied, skip quickly over characters that
4189 cannot be the start of a match. If the pattern can match the
4190 null string, however, we don't need to skip characters; we want
4191 the first null string. */
4192 if (fastmap && startpos < total_size && !bufp->can_be_null)
4194 if (range > 0) /* Searching forwards. */
4199 if (startpos < size1 && startpos + range >= size1)
4200 lim = range - (size1 - startpos);
4202 d = ((const unsigned char *)
4203 (startpos >= size1 ? string2 - size1 : string1) + startpos);
4205 /* Written out as an if-else to avoid testing `translate'
4207 if (TRANSLATE_P (translate))
4212 Bufbyte str[MAX_EMCHAR_LEN];
4214 buf_ch = charptr_emchar (d);
4215 buf_ch = RE_TRANSLATE (buf_ch);
4216 set_charptr_emchar (str, buf_ch);
4217 if (buf_ch >= 0200 || fastmap[(unsigned char) *str])
4220 if (fastmap[(unsigned char)RE_TRANSLATE (*d)])
4223 d_size = charcount_to_bytecount (d, 1);
4225 d += d_size; /* Speedier INC_CHARPTR(d) */
4228 while (range > lim && !fastmap[*d])
4230 d_size = charcount_to_bytecount (d, 1);
4232 d += d_size; /* Speedier INC_CHARPTR(d) */
4235 startpos += irange - range;
4237 else /* Searching backwards. */
4239 Emchar c = (size1 == 0 || startpos >= size1
4240 ? charptr_emchar (string2 + startpos - size1)
4241 : charptr_emchar (string1 + startpos));
4244 if (!(c >= 0200 || fastmap[(unsigned char) c]))
4247 if (!fastmap[(unsigned char) c])
4253 /* If can't match the null string, and that's all we have left, fail. */
4254 if (range >= 0 && startpos == total_size && fastmap
4255 && !bufp->can_be_null)
4258 #ifdef emacs /* XEmacs added, w/removal of immediate_quit */
4259 if (!no_quit_in_re_search)
4262 val = re_match_2_internal (bufp, string1, size1, string2, size2,
4263 startpos, regs, stop);
4264 #ifndef REGEX_MALLOC
4281 d = ((const unsigned char *)
4282 (startpos >= size1 ? string2 - size1 : string1) + startpos);
4283 d_size = charcount_to_bytecount (d, 1);
4289 /* Note startpos > size1 not >=. If we are on the
4290 string1/string2 boundary, we want to backup into string1. */
4291 d = ((const unsigned char *)
4292 (startpos > size1 ? string2 - size1 : string1) + startpos);
4294 d_size = charcount_to_bytecount (d, 1);
4302 /* Declarations and macros for re_match_2. */
4304 /* This converts PTR, a pointer into one of the search strings `string1'
4305 and `string2' into an offset from the beginning of that string. */
4306 #define POINTER_TO_OFFSET(ptr) \
4307 (FIRST_STRING_P (ptr) \
4308 ? ((regoff_t) ((ptr) - string1)) \
4309 : ((regoff_t) ((ptr) - string2 + size1)))
4311 /* Macros for dealing with the split strings in re_match_2. */
4313 #define MATCHING_IN_FIRST_STRING (dend == end_match_1)
4315 /* Call before fetching a character with *d. This switches over to
4316 string2 if necessary. */
4317 #define REGEX_PREFETCH() \
4320 /* End of string2 => fail. */ \
4321 if (dend == end_match_2) \
4323 /* End of string1 => advance to string2. */ \
4325 dend = end_match_2; \
4329 /* Test if at very beginning or at very end of the virtual concatenation
4330 of `string1' and `string2'. If only one string, it's `string2'. */
4331 #define AT_STRINGS_BEG(d) ((d) == (size1 ? string1 : string2) || !size2)
4332 #define AT_STRINGS_END(d) ((d) == end2)
4335 If the given position straddles the string gap, return the equivalent
4336 position that is before or after the gap, respectively; otherwise,
4337 return the same position. */
4338 #define POS_BEFORE_GAP_UNSAFE(d) ((d) == string2 ? end1 : (d))
4339 #define POS_AFTER_GAP_UNSAFE(d) ((d) == end1 ? string2 : (d))
4341 /* Test if CH is a word-constituent character. (XEmacs change) */
4343 #define WORDCHAR_P_UNSAFE(ch) \
4344 (SYNTAX_UNSAFE (XCHAR_TABLE (regex_emacs_buffer->syntax_table), \
4347 #define WORDCHAR_P_UNSAFE(ch) \
4348 (SYNTAX_UNSAFE (XCHAR_TABLE (regex_emacs_buffer->mirror_syntax_table), \
4352 /* Free everything we malloc. */
4353 #ifdef MATCH_MAY_ALLOCATE
4354 #define FREE_VAR(var) if (var) REGEX_FREE (var); var = NULL
4355 #define FREE_VARIABLES() \
4357 REGEX_FREE_STACK (fail_stack.stack); \
4358 FREE_VAR (regstart); \
4359 FREE_VAR (regend); \
4360 FREE_VAR (old_regstart); \
4361 FREE_VAR (old_regend); \
4362 FREE_VAR (best_regstart); \
4363 FREE_VAR (best_regend); \
4364 FREE_VAR (reg_info); \
4365 FREE_VAR (reg_dummy); \
4366 FREE_VAR (reg_info_dummy); \
4368 #else /* not MATCH_MAY_ALLOCATE */
4369 #define FREE_VARIABLES() ((void)0) /* Do nothing! But inhibit gcc warning. */
4370 #endif /* MATCH_MAY_ALLOCATE */
4372 /* These values must meet several constraints. They must not be valid
4373 register values; since we have a limit of 255 registers (because
4374 we use only one byte in the pattern for the register number), we can
4375 use numbers larger than 255. They must differ by 1, because of
4376 NUM_FAILURE_ITEMS above. And the value for the lowest register must
4377 be larger than the value for the highest register, so we do not try
4378 to actually save any registers when none are active. */
4379 #define NO_HIGHEST_ACTIVE_REG (1 << BYTEWIDTH)
4380 #define NO_LOWEST_ACTIVE_REG (NO_HIGHEST_ACTIVE_REG + 1)
4382 /* Matching routines. */
4384 #ifndef emacs /* Emacs never uses this. */
4385 /* re_match is like re_match_2 except it takes only a single string. */
4388 re_match (struct re_pattern_buffer *bufp, const char *string, int size,
4389 int pos, struct re_registers *regs)
4391 int result = re_match_2_internal (bufp, NULL, 0, (re_char *) string, size,
4396 #endif /* not emacs */
4399 /* re_match_2 matches the compiled pattern in BUFP against the
4400 (virtual) concatenation of STRING1 and STRING2 (of length SIZE1 and
4401 SIZE2, respectively). We start matching at POS, and stop matching
4404 If REGS is non-null and the `no_sub' field of BUFP is nonzero, we
4405 store offsets for the substring each group matched in REGS. See the
4406 documentation for exactly how many groups we fill.
4408 We return -1 if no match, -2 if an internal error (such as the
4409 failure stack overflowing). Otherwise, we return the length of the
4410 matched substring. */
4413 re_match_2 (struct re_pattern_buffer *bufp, const char *string1,
4414 int size1, const char *string2, int size2, int pos,
4415 struct re_registers *regs, int stop)
4420 SETUP_SYNTAX_CACHE_FOR_OBJECT (regex_match_object,
4422 SYNTAX_CACHE_OBJECT_BYTE_TO_CHAR (regex_match_object,
4428 result = re_match_2_internal (bufp, (re_char *) string1, size1,
4429 (re_char *) string2, size2,
4436 /* This is a separate function so that we can force an alloca cleanup
4439 re_match_2_internal (struct re_pattern_buffer *bufp, re_char *string1,
4440 int size1, re_char *string2, int size2, int pos,
4441 struct re_registers *regs, int stop)
4443 /* General temporaries. */
4446 int should_succeed; /* XEmacs change */
4448 /* Just past the end of the corresponding string. */
4449 re_char *end1, *end2;
4451 /* Pointers into string1 and string2, just past the last characters in
4452 each to consider matching. */
4453 re_char *end_match_1, *end_match_2;
4455 /* Where we are in the data, and the end of the current string. */
4458 /* Where we are in the pattern, and the end of the pattern. */
4459 unsigned char *p = bufp->buffer;
4460 REGISTER unsigned char *pend = p + bufp->used;
4462 /* Mark the opcode just after a start_memory, so we can test for an
4463 empty subpattern when we get to the stop_memory. */
4464 re_char *just_past_start_mem = 0;
4466 /* We use this to map every character in the string. */
4467 RE_TRANSLATE_TYPE translate = bufp->translate;
4469 /* Failure point stack. Each place that can handle a failure further
4470 down the line pushes a failure point on this stack. It consists of
4471 restart, regend, and reg_info for all registers corresponding to
4472 the subexpressions we're currently inside, plus the number of such
4473 registers, and, finally, two char *'s. The first char * is where
4474 to resume scanning the pattern; the second one is where to resume
4475 scanning the strings. If the latter is zero, the failure point is
4476 a ``dummy''; if a failure happens and the failure point is a dummy,
4477 it gets discarded and the next one is tried. */
4478 #ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
4479 fail_stack_type fail_stack;
4482 static unsigned failure_id;
4483 int nfailure_points_pushed = 0, nfailure_points_popped = 0;
4487 /* This holds the pointer to the failure stack, when
4488 it is allocated relocatably. */
4489 fail_stack_elt_t *failure_stack_ptr;
4492 /* We fill all the registers internally, independent of what we
4493 return, for use in backreferences. The number here includes
4494 an element for register zero. */
4495 int num_regs = bufp->re_ngroups + 1;
4497 /* The currently active registers. */
4498 int lowest_active_reg = NO_LOWEST_ACTIVE_REG;
4499 int highest_active_reg = NO_HIGHEST_ACTIVE_REG;
4501 /* Information on the contents of registers. These are pointers into
4502 the input strings; they record just what was matched (on this
4503 attempt) by a subexpression part of the pattern, that is, the
4504 regnum-th regstart pointer points to where in the pattern we began
4505 matching and the regnum-th regend points to right after where we
4506 stopped matching the regnum-th subexpression. (The zeroth register
4507 keeps track of what the whole pattern matches.) */
4508 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
4509 re_char **regstart, **regend;
4512 /* If a group that's operated upon by a repetition operator fails to
4513 match anything, then the register for its start will need to be
4514 restored because it will have been set to wherever in the string we
4515 are when we last see its open-group operator. Similarly for a
4517 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
4518 re_char **old_regstart, **old_regend;
4521 /* The is_active field of reg_info helps us keep track of which (possibly
4522 nested) subexpressions we are currently in. The matched_something
4523 field of reg_info[reg_num] helps us tell whether or not we have
4524 matched any of the pattern so far this time through the reg_num-th
4525 subexpression. These two fields get reset each time through any
4526 loop their register is in. */
4527 #ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
4528 register_info_type *reg_info;
4531 /* The following record the register info as found in the above
4532 variables when we find a match better than any we've seen before.
4533 This happens as we backtrack through the failure points, which in
4534 turn happens only if we have not yet matched the entire string. */
4535 unsigned best_regs_set = false;
4536 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
4537 re_char **best_regstart, **best_regend;
4540 /* Logically, this is `best_regend[0]'. But we don't want to have to
4541 allocate space for that if we're not allocating space for anything
4542 else (see below). Also, we never need info about register 0 for
4543 any of the other register vectors, and it seems rather a kludge to
4544 treat `best_regend' differently than the rest. So we keep track of
4545 the end of the best match so far in a separate variable. We
4546 initialize this to NULL so that when we backtrack the first time
4547 and need to test it, it's not garbage. */
4548 re_char *match_end = NULL;
4550 /* This helps SET_REGS_MATCHED avoid doing redundant work. */
4551 int set_regs_matched_done = 0;
4553 /* Used when we pop values we don't care about. */
4554 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
4555 re_char **reg_dummy;
4556 register_info_type *reg_info_dummy;
4560 /* Counts the total number of registers pushed. */
4561 unsigned num_regs_pushed = 0;
4564 /* 1 if this match ends in the same string (string1 or string2)
4565 as the best previous match. */
4568 /* 1 if this match is the best seen so far. */
4569 re_bool best_match_p;
4571 DEBUG_PRINT1 ("\n\nEntering re_match_2.\n");
4575 #ifdef MATCH_MAY_ALLOCATE
4576 /* Do not bother to initialize all the register variables if there are
4577 no groups in the pattern, as it takes a fair amount of time. If
4578 there are groups, we include space for register 0 (the whole
4579 pattern), even though we never use it, since it simplifies the
4580 array indexing. We should fix this. */
4581 if (bufp->re_ngroups)
4583 regstart = REGEX_TALLOC (num_regs, re_char *);
4584 regend = REGEX_TALLOC (num_regs, re_char *);
4585 old_regstart = REGEX_TALLOC (num_regs, re_char *);
4586 old_regend = REGEX_TALLOC (num_regs, re_char *);
4587 best_regstart = REGEX_TALLOC (num_regs, re_char *);
4588 best_regend = REGEX_TALLOC (num_regs, re_char *);
4589 reg_info = REGEX_TALLOC (num_regs, register_info_type);
4590 reg_dummy = REGEX_TALLOC (num_regs, re_char *);
4591 reg_info_dummy = REGEX_TALLOC (num_regs, register_info_type);
4593 if (!(regstart && regend && old_regstart && old_regend && reg_info
4594 && best_regstart && best_regend && reg_dummy && reg_info_dummy))
4602 /* We must initialize all our variables to NULL, so that
4603 `FREE_VARIABLES' doesn't try to free them. */
4604 regstart = regend = old_regstart = old_regend = best_regstart
4605 = best_regend = reg_dummy = NULL;
4606 reg_info = reg_info_dummy = (register_info_type *) NULL;
4608 #endif /* MATCH_MAY_ALLOCATE */
4610 /* The starting position is bogus. */
4611 if (pos < 0 || pos > size1 + size2)
4617 /* Initialize subexpression text positions to -1 to mark ones that no
4618 start_memory/stop_memory has been seen for. Also initialize the
4619 register information struct. */
4620 for (mcnt = 1; mcnt < num_regs; mcnt++)
4622 regstart[mcnt] = regend[mcnt]
4623 = old_regstart[mcnt] = old_regend[mcnt] = REG_UNSET_VALUE;
4625 REG_MATCH_NULL_STRING_P (reg_info[mcnt]) = MATCH_NULL_UNSET_VALUE;
4626 IS_ACTIVE (reg_info[mcnt]) = 0;
4627 MATCHED_SOMETHING (reg_info[mcnt]) = 0;
4628 EVER_MATCHED_SOMETHING (reg_info[mcnt]) = 0;
4630 /* We move `string1' into `string2' if the latter's empty -- but not if
4631 `string1' is null. */
4632 if (size2 == 0 && string1 != NULL)
4639 end1 = string1 + size1;
4640 end2 = string2 + size2;
4642 /* Compute where to stop matching, within the two strings. */
4645 end_match_1 = string1 + stop;
4646 end_match_2 = string2;
4651 end_match_2 = string2 + stop - size1;
4654 /* `p' scans through the pattern as `d' scans through the data.
4655 `dend' is the end of the input string that `d' points within. `d'
4656 is advanced into the following input string whenever necessary, but
4657 this happens before fetching; therefore, at the beginning of the
4658 loop, `d' can be pointing at the end of a string, but it cannot
4660 if (size1 > 0 && pos <= size1)
4667 d = string2 + pos - size1;
4671 DEBUG_PRINT1 ("The compiled pattern is: \n");
4672 DEBUG_PRINT_COMPILED_PATTERN (bufp, p, pend);
4673 DEBUG_PRINT1 ("The string to match is: `");
4674 DEBUG_PRINT_DOUBLE_STRING (d, string1, size1, string2, size2);
4675 DEBUG_PRINT1 ("'\n");
4677 /* This loops over pattern commands. It exits by returning from the
4678 function if the match is complete, or it drops through if the match
4679 fails at this starting point in the input data. */
4682 DEBUG_PRINT2 ("\n0x%lx: ", (long) p);
4683 #ifdef emacs /* XEmacs added, w/removal of immediate_quit */
4684 if (!no_quit_in_re_search)
4689 { /* End of pattern means we might have succeeded. */
4690 DEBUG_PRINT1 ("end of pattern ... ");
4692 /* If we haven't matched the entire string, and we want the
4693 longest match, try backtracking. */
4694 if (d != end_match_2)
4696 same_str_p = (FIRST_STRING_P (match_end)
4697 == MATCHING_IN_FIRST_STRING);
4699 /* AIX compiler got confused when this was combined
4700 with the previous declaration. */
4702 best_match_p = d > match_end;
4704 best_match_p = !MATCHING_IN_FIRST_STRING;
4706 DEBUG_PRINT1 ("backtracking.\n");
4708 if (!FAIL_STACK_EMPTY ())
4709 { /* More failure points to try. */
4711 /* If exceeds best match so far, save it. */
4712 if (!best_regs_set || best_match_p)
4714 best_regs_set = true;
4717 DEBUG_PRINT1 ("\nSAVING match as best so far.\n");
4719 for (mcnt = 1; mcnt < num_regs; mcnt++)
4721 best_regstart[mcnt] = regstart[mcnt];
4722 best_regend[mcnt] = regend[mcnt];
4728 /* If no failure points, don't restore garbage. And if
4729 last match is real best match, don't restore second
4731 else if (best_regs_set && !best_match_p)
4734 /* Restore best match. It may happen that `dend ==
4735 end_match_1' while the restored d is in string2.
4736 For example, the pattern `x.*y.*z' against the
4737 strings `x-' and `y-z-', if the two strings are
4738 not consecutive in memory. */
4739 DEBUG_PRINT1 ("Restoring best registers.\n");
4742 dend = ((d >= string1 && d <= end1)
4743 ? end_match_1 : end_match_2);
4745 for (mcnt = 1; mcnt < num_regs; mcnt++)
4747 regstart[mcnt] = best_regstart[mcnt];
4748 regend[mcnt] = best_regend[mcnt];
4751 } /* d != end_match_2 */
4754 DEBUG_PRINT1 ("Accepting match.\n");
4757 /* If caller wants register contents data back, fill REGS. */
4758 int num_nonshy_regs = bufp->re_nsub + 1;
4759 if (regs && !bufp->no_sub)
4761 /* Have the register data arrays been allocated? */
4762 if (bufp->regs_allocated == REGS_UNALLOCATED)
4763 { /* No. So allocate them with malloc. We need one
4764 extra element beyond `num_regs' for the `-1' marker
4766 regs->num_regs = MAX (RE_NREGS, num_nonshy_regs + 1);
4767 regs->start = TALLOC (regs->num_regs, regoff_t);
4768 regs->end = TALLOC (regs->num_regs, regoff_t);
4769 if (regs->start == NULL || regs->end == NULL)
4774 bufp->regs_allocated = REGS_REALLOCATE;
4776 else if (bufp->regs_allocated == REGS_REALLOCATE)
4777 { /* Yes. If we need more elements than were already
4778 allocated, reallocate them. If we need fewer, just
4780 if (regs->num_regs < num_nonshy_regs + 1)
4782 regs->num_regs = num_nonshy_regs + 1;
4783 RETALLOC (regs->start, regs->num_regs, regoff_t);
4784 RETALLOC (regs->end, regs->num_regs, regoff_t);
4785 if (regs->start == NULL || regs->end == NULL)
4794 /* The braces fend off a "empty body in an else-statement"
4795 warning under GCC when assert expands to nothing. */
4796 assert (bufp->regs_allocated == REGS_FIXED);
4799 /* Convert the pointer data in `regstart' and `regend' to
4800 indices. Register zero has to be set differently,
4801 since we haven't kept track of any info for it. */
4802 if (regs->num_regs > 0)
4804 regs->start[0] = pos;
4805 regs->end[0] = (MATCHING_IN_FIRST_STRING
4806 ? ((regoff_t) (d - string1))
4807 : ((regoff_t) (d - string2 + size1)));
4810 /* Map over the NUM_NONSHY_REGS non-shy internal registers.
4811 Copy each into the corresponding external register.
4812 N.B. MCNT indexes external registers. */
4814 mcnt < MIN (num_nonshy_regs, regs->num_regs);
4817 int ireg = bufp->external_to_internal_register[mcnt];
4819 if (REG_UNSET (regstart[ireg]) || REG_UNSET (regend[ireg]))
4820 regs->start[mcnt] = regs->end[mcnt] = -1;
4824 = (regoff_t) POINTER_TO_OFFSET (regstart[ireg]);
4826 = (regoff_t) POINTER_TO_OFFSET (regend[ireg]);
4829 } /* regs && !bufp->no_sub */
4831 /* If we have regs and the regs structure has more elements than
4832 were in the pattern, set the extra elements to -1. If we
4833 (re)allocated the registers, this is the case, because we
4834 always allocate enough to have at least one -1 at the end.
4836 We do this even when no_sub is set because some applications
4837 (XEmacs) reuse register structures which may contain stale
4838 information, and permit attempts to access those registers.
4840 It would be possible to require the caller to do this, but we'd
4841 have to change the API for this function to reflect that, and
4842 audit all callers. */
4843 if (regs && regs->num_regs > 0)
4844 for (mcnt = num_nonshy_regs; mcnt < regs->num_regs; mcnt++)
4845 regs->start[mcnt] = regs->end[mcnt] = -1;
4848 DEBUG_PRINT4 ("%u failure points pushed, %u popped (%u remain).\n",
4849 nfailure_points_pushed, nfailure_points_popped,
4850 nfailure_points_pushed - nfailure_points_popped);
4851 DEBUG_PRINT2 ("%u registers pushed.\n", num_regs_pushed);
4853 mcnt = d - pos - (MATCHING_IN_FIRST_STRING
4857 DEBUG_PRINT2 ("Returning %d from re_match_2.\n", mcnt);
4863 /* Otherwise match next pattern command. */
4864 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
4866 /* Ignore these. Used to ignore the n of succeed_n's which
4867 currently have n == 0. */
4869 DEBUG_PRINT1 ("EXECUTING no_op.\n");
4873 DEBUG_PRINT1 ("EXECUTING succeed.\n");
4876 /* Match the next n pattern characters exactly. The following
4877 byte in the pattern defines n, and the n bytes after that
4878 are the characters to match. */
4881 DEBUG_PRINT2 ("EXECUTING exactn %d.\n", mcnt);
4883 /* This is written out as an if-else so we don't waste time
4884 testing `translate' inside the loop. */
4885 if (TRANSLATE_P (translate))
4890 Emchar pat_ch, buf_ch;
4894 pat_ch = charptr_emchar (p);
4895 buf_ch = charptr_emchar (d);
4896 if (RE_TRANSLATE (buf_ch) != pat_ch)
4899 pat_len = charcount_to_bytecount (p, 1);
4904 #else /* not MULE */
4906 if ((unsigned char) RE_TRANSLATE (*d++) != *p++)
4918 if (*d++ != *p++) goto fail;
4922 SET_REGS_MATCHED ();
4926 /* Match any character except possibly a newline or a null. */
4928 DEBUG_PRINT1 ("EXECUTING anychar.\n");
4932 if ((!(bufp->syntax & RE_DOT_NEWLINE) && TRANSLATE (*d) == '\n')
4933 || (bufp->syntax & RE_DOT_NOT_NULL && TRANSLATE (*d) == '\000'))
4936 SET_REGS_MATCHED ();
4937 DEBUG_PRINT2 (" Matched `%d'.\n", *d);
4938 INC_CHARPTR (d); /* XEmacs change */
4945 REGISTER unsigned char c;
4946 re_bool not_p = (re_opcode_t) *(p - 1) == charset_not;
4948 DEBUG_PRINT2 ("EXECUTING charset%s.\n", not_p ? "_not" : "");
4951 c = TRANSLATE (*d); /* The character to match. */
4953 /* Cast to `unsigned' instead of `unsigned char' in case the
4954 bit list is a full 32 bytes long. */
4955 if (c < (unsigned) (*p * BYTEWIDTH)
4956 && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
4961 if (!not_p) goto fail;
4963 SET_REGS_MATCHED ();
4964 INC_CHARPTR (d); /* XEmacs change */
4970 case charset_mule_not:
4973 re_bool not_p = (re_opcode_t) *(p - 1) == charset_mule_not;
4975 DEBUG_PRINT2 ("EXECUTING charset_mule%s.\n", not_p ? "_not" : "");
4978 c = charptr_emchar ((const Bufbyte *) d);
4979 c = TRANSLATE (c); /* The character to match. */
4981 if (EQ (Qt, unified_range_table_lookup (p, c, Qnil)))
4984 p += unified_range_table_bytes_used (p);
4986 if (!not_p) goto fail;
4988 SET_REGS_MATCHED ();
4995 /* The beginning of a group is represented by start_memory.
4996 The arguments are the register number in the next byte, and the
4997 number of groups inner to this one in the next. The text
4998 matched within the group is recorded (in the internal
4999 registers data structure) under the register number. */
5001 DEBUG_PRINT3 ("EXECUTING start_memory %d (%d):\n", *p, p[1]);
5003 /* Find out if this group can match the empty string. */
5004 p1 = p; /* To send to group_match_null_string_p. */
5006 if (REG_MATCH_NULL_STRING_P (reg_info[*p]) == MATCH_NULL_UNSET_VALUE)
5007 REG_MATCH_NULL_STRING_P (reg_info[*p])
5008 = group_match_null_string_p (&p1, pend, reg_info);
5010 /* Save the position in the string where we were the last time
5011 we were at this open-group operator in case the group is
5012 operated upon by a repetition operator, e.g., with `(a*)*b'
5013 against `ab'; then we want to ignore where we are now in
5014 the string in case this attempt to match fails. */
5015 old_regstart[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p])
5016 ? REG_UNSET (regstart[*p]) ? d : regstart[*p]
5018 DEBUG_PRINT2 (" old_regstart: %d\n",
5019 POINTER_TO_OFFSET (old_regstart[*p]));
5022 DEBUG_PRINT2 (" regstart: %d\n", POINTER_TO_OFFSET (regstart[*p]));
5024 IS_ACTIVE (reg_info[*p]) = 1;
5025 MATCHED_SOMETHING (reg_info[*p]) = 0;
5027 /* Clear this whenever we change the register activity status. */
5028 set_regs_matched_done = 0;
5030 /* This is the new highest active register. */
5031 highest_active_reg = *p;
5033 /* If nothing was active before, this is the new lowest active
5035 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG)
5036 lowest_active_reg = *p;
5038 /* Move past the register number and inner group count. */
5040 just_past_start_mem = p;
5045 /* The stop_memory opcode represents the end of a group. Its
5046 arguments are the same as start_memory's: the register
5047 number, and the number of inner groups. */
5049 DEBUG_PRINT3 ("EXECUTING stop_memory %d (%d):\n", *p, p[1]);
5051 /* We need to save the string position the last time we were at
5052 this close-group operator in case the group is operated
5053 upon by a repetition operator, e.g., with `((a*)*(b*)*)*'
5054 against `aba'; then we want to ignore where we are now in
5055 the string in case this attempt to match fails. */
5056 old_regend[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p])
5057 ? REG_UNSET (regend[*p]) ? d : regend[*p]
5059 DEBUG_PRINT2 (" old_regend: %d\n",
5060 POINTER_TO_OFFSET (old_regend[*p]));
5063 DEBUG_PRINT2 (" regend: %d\n", POINTER_TO_OFFSET (regend[*p]));
5065 /* This register isn't active anymore. */
5066 IS_ACTIVE (reg_info[*p]) = 0;
5068 /* Clear this whenever we change the register activity status. */
5069 set_regs_matched_done = 0;
5071 /* If this was the only register active, nothing is active
5073 if (lowest_active_reg == highest_active_reg)
5075 lowest_active_reg = NO_LOWEST_ACTIVE_REG;
5076 highest_active_reg = NO_HIGHEST_ACTIVE_REG;
5079 { /* We must scan for the new highest active register, since
5080 it isn't necessarily one less than now: consider
5081 (a(b)c(d(e)f)g). When group 3 ends, after the f), the
5082 new highest active register is 1. */
5083 unsigned char r = *p - 1;
5084 while (r > 0 && !IS_ACTIVE (reg_info[r]))
5087 /* If we end up at register zero, that means that we saved
5088 the registers as the result of an `on_failure_jump', not
5089 a `start_memory', and we jumped to past the innermost
5090 `stop_memory'. For example, in ((.)*) we save
5091 registers 1 and 2 as a result of the *, but when we pop
5092 back to the second ), we are at the stop_memory 1.
5093 Thus, nothing is active. */
5096 lowest_active_reg = NO_LOWEST_ACTIVE_REG;
5097 highest_active_reg = NO_HIGHEST_ACTIVE_REG;
5101 highest_active_reg = r;
5103 /* 98/9/21 jhod: We've also gotta set lowest_active_reg, don't we? */
5105 while (r < highest_active_reg && !IS_ACTIVE(reg_info[r]))
5107 lowest_active_reg = r;
5111 /* If just failed to match something this time around with a
5112 group that's operated on by a repetition operator, try to
5113 force exit from the ``loop'', and restore the register
5114 information for this group that we had before trying this
5116 if ((!MATCHED_SOMETHING (reg_info[*p])
5117 || just_past_start_mem == p - 1)
5120 re_bool is_a_jump_n = false;
5124 switch ((re_opcode_t) *p1++)
5128 case pop_failure_jump:
5129 case maybe_pop_jump:
5131 case dummy_failure_jump:
5132 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
5142 /* If the next operation is a jump backwards in the pattern
5143 to an on_failure_jump right before the start_memory
5144 corresponding to this stop_memory, exit from the loop
5145 by forcing a failure after pushing on the stack the
5146 on_failure_jump's jump in the pattern, and d. */
5147 if (mcnt < 0 && (re_opcode_t) *p1 == on_failure_jump
5148 && (re_opcode_t) p1[3] == start_memory && p1[4] == *p)
5150 /* If this group ever matched anything, then restore
5151 what its registers were before trying this last
5152 failed match, e.g., with `(a*)*b' against `ab' for
5153 regstart[1], and, e.g., with `((a*)*(b*)*)*'
5154 against `aba' for regend[3].
5156 Also restore the registers for inner groups for,
5157 e.g., `((a*)(b*))*' against `aba' (register 3 would
5158 otherwise get trashed). */
5160 if (EVER_MATCHED_SOMETHING (reg_info[*p]))
5164 EVER_MATCHED_SOMETHING (reg_info[*p]) = 0;
5166 /* Restore this and inner groups' (if any) registers. */
5167 for (r = *p; r < *p + *(p + 1); r++)
5169 regstart[r] = old_regstart[r];
5171 /* xx why this test? */
5172 if (old_regend[r] >= regstart[r])
5173 regend[r] = old_regend[r];
5177 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
5178 PUSH_FAILURE_POINT (p1 + mcnt, d, -2);
5184 /* Move past the register number and the inner group count. */
5189 /* \<digit> has been turned into a `duplicate' command which is
5190 followed by the numeric value of <digit> as the register number.
5191 (Already passed through external-to-internal-register mapping,
5192 so it refers to the actual group number, not the non-shy-only
5193 numbering used in the external world.) */
5196 REGISTER re_char *d2, *dend2;
5197 /* Get which register to match against. */
5199 DEBUG_PRINT2 ("EXECUTING duplicate %d.\n", regno);
5201 /* Can't back reference a group which we've never matched. */
5202 if (REG_UNSET (regstart[regno]) || REG_UNSET (regend[regno]))
5205 /* Where in input to try to start matching. */
5206 d2 = regstart[regno];
5208 /* Where to stop matching; if both the place to start and
5209 the place to stop matching are in the same string, then
5210 set to the place to stop, otherwise, for now have to use
5211 the end of the first string. */
5213 dend2 = ((FIRST_STRING_P (regstart[regno])
5214 == FIRST_STRING_P (regend[regno]))
5215 ? regend[regno] : end_match_1);
5218 /* If necessary, advance to next segment in register
5222 if (dend2 == end_match_2) break;
5223 if (dend2 == regend[regno]) break;
5225 /* End of string1 => advance to string2. */
5227 dend2 = regend[regno];
5229 /* At end of register contents => success */
5230 if (d2 == dend2) break;
5232 /* If necessary, advance to next segment in data. */
5235 /* How many characters left in this segment to match. */
5238 /* Want how many consecutive characters we can match in
5239 one shot, so, if necessary, adjust the count. */
5240 if (mcnt > dend2 - d2)
5243 /* Compare that many; failure if mismatch, else move
5245 if (TRANSLATE_P (translate)
5246 ? bcmp_translate ((unsigned char *) d,
5247 (unsigned char *) d2, mcnt, translate)
5248 : memcmp (d, d2, mcnt))
5250 d += mcnt, d2 += mcnt;
5252 /* Do this because we've match some characters. */
5253 SET_REGS_MATCHED ();
5259 /* begline matches the empty string at the beginning of the string
5260 (unless `not_bol' is set in `bufp'), and, if
5261 `newline_anchor' is set, after newlines. */
5263 DEBUG_PRINT1 ("EXECUTING begline.\n");
5265 if (AT_STRINGS_BEG (d))
5267 if (!bufp->not_bol) break;
5269 else if (d[-1] == '\n' && bufp->newline_anchor)
5273 /* In all other cases, we fail. */
5277 /* endline is the dual of begline. */
5279 DEBUG_PRINT1 ("EXECUTING endline.\n");
5281 if (AT_STRINGS_END (d))
5283 if (!bufp->not_eol) break;
5286 /* We have to ``prefetch'' the next character. */
5287 else if ((d == end1 ? *string2 : *d) == '\n'
5288 && bufp->newline_anchor)
5295 /* Match at the very beginning of the data. */
5297 DEBUG_PRINT1 ("EXECUTING begbuf.\n");
5298 if (AT_STRINGS_BEG (d))
5303 /* Match at the very end of the data. */
5305 DEBUG_PRINT1 ("EXECUTING endbuf.\n");
5306 if (AT_STRINGS_END (d))
5311 /* on_failure_keep_string_jump is used to optimize `.*\n'. It
5312 pushes NULL as the value for the string on the stack. Then
5313 `pop_failure_point' will keep the current value for the
5314 string, instead of restoring it. To see why, consider
5315 matching `foo\nbar' against `.*\n'. The .* matches the foo;
5316 then the . fails against the \n. But the next thing we want
5317 to do is match the \n against the \n; if we restored the
5318 string value, we would be back at the foo.
5320 Because this is used only in specific cases, we don't need to
5321 check all the things that `on_failure_jump' does, to make
5322 sure the right things get saved on the stack. Hence we don't
5323 share its code. The only reason to push anything on the
5324 stack at all is that otherwise we would have to change
5325 `anychar's code to do something besides goto fail in this
5326 case; that seems worse than this. */
5327 case on_failure_keep_string_jump:
5328 DEBUG_PRINT1 ("EXECUTING on_failure_keep_string_jump");
5330 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5331 DEBUG_PRINT3 (" %d (to 0x%lx):\n", mcnt, (long) (p + mcnt));
5333 PUSH_FAILURE_POINT (p + mcnt, (unsigned char *) 0, -2);
5337 /* Uses of on_failure_jump:
5339 Each alternative starts with an on_failure_jump that points
5340 to the beginning of the next alternative. Each alternative
5341 except the last ends with a jump that in effect jumps past
5342 the rest of the alternatives. (They really jump to the
5343 ending jump of the following alternative, because tensioning
5344 these jumps is a hassle.)
5346 Repeats start with an on_failure_jump that points past both
5347 the repetition text and either the following jump or
5348 pop_failure_jump back to this on_failure_jump. */
5349 case on_failure_jump:
5351 DEBUG_PRINT1 ("EXECUTING on_failure_jump");
5353 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5354 DEBUG_PRINT3 (" %d (to 0x%lx)", mcnt, (long) (p + mcnt));
5356 /* If this on_failure_jump comes right before a group (i.e.,
5357 the original * applied to a group), save the information
5358 for that group and all inner ones, so that if we fail back
5359 to this point, the group's information will be correct.
5360 For example, in \(a*\)*\1, we need the preceding group,
5361 and in \(\(a*\)b*\)\2, we need the inner group. */
5363 /* We can't use `p' to check ahead because we push
5364 a failure point to `p + mcnt' after we do this. */
5367 /* We need to skip no_op's before we look for the
5368 start_memory in case this on_failure_jump is happening as
5369 the result of a completed succeed_n, as in \(a\)\{1,3\}b\1
5371 while (p1 < pend && (re_opcode_t) *p1 == no_op)
5374 if (p1 < pend && (re_opcode_t) *p1 == start_memory)
5376 /* We have a new highest active register now. This will
5377 get reset at the start_memory we are about to get to,
5378 but we will have saved all the registers relevant to
5379 this repetition op, as described above. */
5380 highest_active_reg = *(p1 + 1) + *(p1 + 2);
5381 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG)
5382 lowest_active_reg = *(p1 + 1);
5385 DEBUG_PRINT1 (":\n");
5386 PUSH_FAILURE_POINT (p + mcnt, d, -2);
5390 /* A smart repeat ends with `maybe_pop_jump'.
5391 We change it to either `pop_failure_jump' or `jump'. */
5392 case maybe_pop_jump:
5393 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5394 DEBUG_PRINT2 ("EXECUTING maybe_pop_jump %d.\n", mcnt);
5396 REGISTER unsigned char *p2 = p;
5398 /* Compare the beginning of the repeat with what in the
5399 pattern follows its end. If we can establish that there
5400 is nothing that they would both match, i.e., that we
5401 would have to backtrack because of (as in, e.g., `a*a')
5402 then we can change to pop_failure_jump, because we'll
5403 never have to backtrack.
5405 This is not true in the case of alternatives: in
5406 `(a|ab)*' we do need to backtrack to the `ab' alternative
5407 (e.g., if the string was `ab'). But instead of trying to
5408 detect that here, the alternative has put on a dummy
5409 failure point which is what we will end up popping. */
5411 /* Skip over open/close-group commands.
5412 If what follows this loop is a ...+ construct,
5413 look at what begins its body, since we will have to
5414 match at least one of that. */
5418 && ((re_opcode_t) *p2 == stop_memory
5419 || (re_opcode_t) *p2 == start_memory))
5421 else if (p2 + 6 < pend
5422 && (re_opcode_t) *p2 == dummy_failure_jump)
5429 /* p1[0] ... p1[2] are the `on_failure_jump' corresponding
5430 to the `maybe_finalize_jump' of this case. Examine what
5433 /* If we're at the end of the pattern, we can change. */
5436 /* Consider what happens when matching ":\(.*\)"
5437 against ":/". I don't really understand this code
5439 p[-3] = (unsigned char) pop_failure_jump;
5441 (" End of pattern: change to `pop_failure_jump'.\n");
5444 else if ((re_opcode_t) *p2 == exactn
5445 || (bufp->newline_anchor && (re_opcode_t) *p2 == endline))
5447 REGISTER unsigned char c
5448 = *p2 == (unsigned char) endline ? '\n' : p2[2];
5450 if ((re_opcode_t) p1[3] == exactn && p1[5] != c)
5452 p[-3] = (unsigned char) pop_failure_jump;
5453 DEBUG_PRINT3 (" %c != %c => pop_failure_jump.\n",
5457 else if ((re_opcode_t) p1[3] == charset
5458 || (re_opcode_t) p1[3] == charset_not)
5460 int not_p = (re_opcode_t) p1[3] == charset_not;
5462 if (c < (unsigned char) (p1[4] * BYTEWIDTH)
5463 && p1[5 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
5466 /* `not_p' is equal to 1 if c would match, which means
5467 that we can't change to pop_failure_jump. */
5470 p[-3] = (unsigned char) pop_failure_jump;
5471 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
5475 else if ((re_opcode_t) *p2 == charset)
5478 REGISTER unsigned char c
5479 = *p2 == (unsigned char) endline ? '\n' : p2[2];
5482 if ((re_opcode_t) p1[3] == exactn
5483 && ! ((int) p2[1] * BYTEWIDTH > (int) p1[5]
5484 && (p2[2 + p1[5] / BYTEWIDTH]
5485 & (1 << (p1[5] % BYTEWIDTH)))))
5487 p[-3] = (unsigned char) pop_failure_jump;
5488 DEBUG_PRINT3 (" %c != %c => pop_failure_jump.\n",
5492 else if ((re_opcode_t) p1[3] == charset_not)
5495 /* We win if the charset_not inside the loop
5496 lists every character listed in the charset after. */
5497 for (idx = 0; idx < (int) p2[1]; idx++)
5498 if (! (p2[2 + idx] == 0
5499 || (idx < (int) p1[4]
5500 && ((p2[2 + idx] & ~ p1[5 + idx]) == 0))))
5505 p[-3] = (unsigned char) pop_failure_jump;
5506 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
5509 else if ((re_opcode_t) p1[3] == charset)
5512 /* We win if the charset inside the loop
5513 has no overlap with the one after the loop. */
5515 idx < (int) p2[1] && idx < (int) p1[4];
5517 if ((p2[2 + idx] & p1[5 + idx]) != 0)
5520 if (idx == p2[1] || idx == p1[4])
5522 p[-3] = (unsigned char) pop_failure_jump;
5523 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
5528 p -= 2; /* Point at relative address again. */
5529 if ((re_opcode_t) p[-1] != pop_failure_jump)
5531 p[-1] = (unsigned char) jump;
5532 DEBUG_PRINT1 (" Match => jump.\n");
5533 goto unconditional_jump;
5535 /* Note fall through. */
5538 /* The end of a simple repeat has a pop_failure_jump back to
5539 its matching on_failure_jump, where the latter will push a
5540 failure point. The pop_failure_jump takes off failure
5541 points put on by this pop_failure_jump's matching
5542 on_failure_jump; we got through the pattern to here from the
5543 matching on_failure_jump, so didn't fail. */
5544 case pop_failure_jump:
5546 /* We need to pass separate storage for the lowest and
5547 highest registers, even though we don't care about the
5548 actual values. Otherwise, we will restore only one
5549 register from the stack, since lowest will == highest in
5550 `pop_failure_point'. */
5551 int dummy_low_reg, dummy_high_reg;
5552 unsigned char *pdummy;
5553 re_char *sdummy = NULL;
5555 DEBUG_PRINT1 ("EXECUTING pop_failure_jump.\n");
5556 POP_FAILURE_POINT (sdummy, pdummy,
5557 dummy_low_reg, dummy_high_reg,
5558 reg_dummy, reg_dummy, reg_info_dummy);
5560 /* Note fall through. */
5563 /* Unconditionally jump (without popping any failure points). */
5566 EXTRACT_NUMBER_AND_INCR (mcnt, p); /* Get the amount to jump. */
5567 DEBUG_PRINT2 ("EXECUTING jump %d ", mcnt);
5568 p += mcnt; /* Do the jump. */
5569 DEBUG_PRINT2 ("(to 0x%lx).\n", (long) p);
5573 /* We need this opcode so we can detect where alternatives end
5574 in `group_match_null_string_p' et al. */
5576 DEBUG_PRINT1 ("EXECUTING jump_past_alt.\n");
5577 goto unconditional_jump;
5580 /* Normally, the on_failure_jump pushes a failure point, which
5581 then gets popped at pop_failure_jump. We will end up at
5582 pop_failure_jump, also, and with a pattern of, say, `a+', we
5583 are skipping over the on_failure_jump, so we have to push
5584 something meaningless for pop_failure_jump to pop. */
5585 case dummy_failure_jump:
5586 DEBUG_PRINT1 ("EXECUTING dummy_failure_jump.\n");
5587 /* It doesn't matter what we push for the string here. What
5588 the code at `fail' tests is the value for the pattern. */
5589 PUSH_FAILURE_POINT ((unsigned char *) 0, (unsigned char *) 0, -2);
5590 goto unconditional_jump;
5593 /* At the end of an alternative, we need to push a dummy failure
5594 point in case we are followed by a `pop_failure_jump', because
5595 we don't want the failure point for the alternative to be
5596 popped. For example, matching `(a|ab)*' against `aab'
5597 requires that we match the `ab' alternative. */
5598 case push_dummy_failure:
5599 DEBUG_PRINT1 ("EXECUTING push_dummy_failure.\n");
5600 /* See comments just above at `dummy_failure_jump' about the
5602 PUSH_FAILURE_POINT ((unsigned char *) 0, (unsigned char *) 0, -2);
5605 /* Have to succeed matching what follows at least n times.
5606 After that, handle like `on_failure_jump'. */
5608 EXTRACT_NUMBER (mcnt, p + 2);
5609 DEBUG_PRINT2 ("EXECUTING succeed_n %d.\n", mcnt);
5612 /* Originally, this is how many times we HAVE to succeed. */
5617 STORE_NUMBER_AND_INCR (p, mcnt);
5618 DEBUG_PRINT3 (" Setting 0x%lx to %d.\n", (long) p, mcnt);
5622 DEBUG_PRINT2 (" Setting two bytes from 0x%lx to no_op.\n",
5624 p[2] = (unsigned char) no_op;
5625 p[3] = (unsigned char) no_op;
5631 EXTRACT_NUMBER (mcnt, p + 2);
5632 DEBUG_PRINT2 ("EXECUTING jump_n %d.\n", mcnt);
5634 /* Originally, this is how many times we CAN jump. */
5638 STORE_NUMBER (p + 2, mcnt);
5639 goto unconditional_jump;
5641 /* If don't have to jump any more, skip over the rest of command. */
5648 DEBUG_PRINT1 ("EXECUTING set_number_at.\n");
5650 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5652 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5653 DEBUG_PRINT3 (" Setting 0x%lx to %d.\n", (long) p1, mcnt);
5654 STORE_NUMBER (p1, mcnt);
5659 DEBUG_PRINT1 ("EXECUTING wordbound.\n");
5664 /* Straightforward and (I hope) correct implementation.
5665 Probably should be optimized by arranging to compute
5667 /* emch1 is the character before d, syn1 is the syntax of
5668 emch1, emch2 is the character at d, and syn2 is the
5670 Emchar emch1, emch2;
5671 /* GCC isn't smart enough to see these are initialized if used. */
5672 int syn1 = 0, syn2 = 0;
5673 re_char *d_before, *d_after;
5675 at_beg = AT_STRINGS_BEG (d),
5676 at_end = AT_STRINGS_END (d);
5681 if (at_beg && at_end)
5689 d_before = POS_BEFORE_GAP_UNSAFE (d);
5690 DEC_CHARPTR (d_before);
5691 emch1 = charptr_emchar (d_before);
5693 xpos = SYNTAX_CACHE_BYTE_TO_CHAR (PTR_TO_OFFSET (d)) - 1;
5694 UPDATE_SYNTAX_CACHE (xpos);
5696 syn1 = SYNTAX_FROM_CACHE
5697 (XCHAR_TABLE (regex_emacs_buffer
5698 ->mirror_syntax_table),
5703 d_after = POS_AFTER_GAP_UNSAFE (d);
5704 emch2 = charptr_emchar (d_after);
5706 xpos = SYNTAX_CACHE_BYTE_TO_CHAR (PTR_TO_OFFSET (d));
5707 UPDATE_SYNTAX_CACHE_FORWARD (xpos + 1);
5709 syn2 = SYNTAX_FROM_CACHE
5710 (XCHAR_TABLE (regex_emacs_buffer
5711 ->mirror_syntax_table),
5716 result = (syn2 == Sword);
5718 result = (syn1 == Sword);
5720 result = ((syn1 == Sword) != (syn2 == Sword));
5723 if (result == should_succeed)
5729 DEBUG_PRINT1 ("EXECUTING notwordbound.\n");
5731 goto matchwordbound;
5734 DEBUG_PRINT1 ("EXECUTING wordbeg.\n");
5735 if (AT_STRINGS_END (d))
5738 /* XEmacs: this originally read:
5740 if (WORDCHAR_P (d) && (AT_STRINGS_BEG (d) || !WORDCHAR_P (d - 1)))
5744 re_char *dtmp = POS_AFTER_GAP_UNSAFE (d);
5745 Emchar emch = charptr_emchar (dtmp);
5747 int charpos = SYNTAX_CACHE_BYTE_TO_CHAR (PTR_TO_OFFSET (d));
5748 UPDATE_SYNTAX_CACHE (charpos);
5750 if (SYNTAX_FROM_CACHE (XCHAR_TABLE (regex_emacs_buffer->mirror_syntax_table),
5753 if (AT_STRINGS_BEG (d))
5755 dtmp = POS_BEFORE_GAP_UNSAFE (d);
5757 emch = charptr_emchar (dtmp);
5759 UPDATE_SYNTAX_CACHE_BACKWARD (charpos - 1);
5761 if (SYNTAX_FROM_CACHE (XCHAR_TABLE (regex_emacs_buffer->mirror_syntax_table),
5768 DEBUG_PRINT1 ("EXECUTING wordend.\n");
5769 if (AT_STRINGS_BEG (d))
5772 /* XEmacs: this originally read:
5774 if (!AT_STRINGS_BEG (d) && WORDCHAR_P (d - 1)
5775 && (!WORDCHAR_P (d) || AT_STRINGS_END (d)))
5778 The or condition is incorrect (reversed).
5783 int charpos = SYNTAX_CACHE_BYTE_TO_CHAR (PTR_TO_OFFSET (d)) - 1;
5784 UPDATE_SYNTAX_CACHE (charpos);
5786 dtmp = POS_BEFORE_GAP_UNSAFE (d);
5788 emch = charptr_emchar (dtmp);
5789 if (SYNTAX_FROM_CACHE (XCHAR_TABLE (regex_emacs_buffer->mirror_syntax_table),
5792 if (AT_STRINGS_END (d))
5794 dtmp = POS_AFTER_GAP_UNSAFE (d);
5795 emch = charptr_emchar (dtmp);
5797 UPDATE_SYNTAX_CACHE_FORWARD (charpos + 1);
5799 if (SYNTAX_FROM_CACHE (XCHAR_TABLE (regex_emacs_buffer->mirror_syntax_table),
5807 DEBUG_PRINT1 ("EXECUTING before_dot.\n");
5808 if (! (NILP (regex_match_object) || BUFFERP (regex_match_object))
5809 || (BUF_PTR_BYTE_POS (regex_emacs_buffer, (unsigned char *) d)
5810 >= BUF_PT (regex_emacs_buffer)))
5815 DEBUG_PRINT1 ("EXECUTING at_dot.\n");
5816 if (! (NILP (regex_match_object) || BUFFERP (regex_match_object))
5817 || (BUF_PTR_BYTE_POS (regex_emacs_buffer, (unsigned char *) d)
5818 != BUF_PT (regex_emacs_buffer)))
5823 DEBUG_PRINT1 ("EXECUTING after_dot.\n");
5824 if (! (NILP (regex_match_object) || BUFFERP (regex_match_object))
5825 || (BUF_PTR_BYTE_POS (regex_emacs_buffer, (unsigned char *) d)
5826 <= BUF_PT (regex_emacs_buffer)))
5829 #if 0 /* not emacs19 */
5831 DEBUG_PRINT1 ("EXECUTING at_dot.\n");
5832 if (BUF_PTR_BYTE_POS (regex_emacs_buffer, (unsigned char *) d) + 1
5833 != BUF_PT (regex_emacs_buffer))
5836 #endif /* not emacs19 */
5839 DEBUG_PRINT2 ("EXECUTING syntaxspec %d.\n", mcnt);
5844 DEBUG_PRINT1 ("EXECUTING Emacs wordchar.\n");
5856 int charpos = SYNTAX_CACHE_BYTE_TO_CHAR (PTR_TO_OFFSET (d));
5857 UPDATE_SYNTAX_CACHE (charpos);
5861 emch = charptr_emchar ((const Bufbyte *) d);
5863 matches = (SYNTAX_FROM_CACHE (XCHAR_TABLE (regex_emacs_buffer->syntax_table),
5864 emch) == (enum syntaxcode) mcnt);
5866 matches = (SYNTAX_FROM_CACHE (XCHAR_TABLE (regex_emacs_buffer->mirror_syntax_table),
5867 emch) == (enum syntaxcode) mcnt);
5870 if (matches != should_succeed)
5872 SET_REGS_MATCHED ();
5877 DEBUG_PRINT2 ("EXECUTING notsyntaxspec %d.\n", mcnt);
5879 goto matchnotsyntax;
5882 DEBUG_PRINT1 ("EXECUTING Emacs notwordchar.\n");
5886 goto matchornotsyntax;
5889 /* 97/2/17 jhod Mule category code patch */
5898 emch = charptr_emchar ((const Bufbyte *) d);
5900 if (check_category_char(emch, regex_emacs_buffer->category_table,
5901 mcnt, should_succeed))
5903 SET_REGS_MATCHED ();
5907 case notcategoryspec:
5909 goto matchornotcategory;
5910 /* end of category patch */
5912 #else /* not emacs */
5914 DEBUG_PRINT1 ("EXECUTING non-Emacs wordchar.\n");
5916 if (!WORDCHAR_P_UNSAFE ((int) (*d)))
5918 SET_REGS_MATCHED ();
5923 DEBUG_PRINT1 ("EXECUTING non-Emacs notwordchar.\n");
5925 if (!WORDCHAR_P_UNSAFE ((int) (*d)))
5927 SET_REGS_MATCHED ();
5935 continue; /* Successfully executed one pattern command; keep going. */
5938 /* We goto here if a matching operation fails. */
5940 if (!FAIL_STACK_EMPTY ())
5941 { /* A restart point is known. Restore to that state. */
5942 DEBUG_PRINT1 ("\nFAIL:\n");
5943 POP_FAILURE_POINT (d, p,
5944 lowest_active_reg, highest_active_reg,
5945 regstart, regend, reg_info);
5947 /* If this failure point is a dummy, try the next one. */
5951 /* If we failed to the end of the pattern, don't examine *p. */
5955 re_bool is_a_jump_n = false;
5957 /* If failed to a backwards jump that's part of a repetition
5958 loop, need to pop this failure point and use the next one. */
5959 switch ((re_opcode_t) *p)
5963 case maybe_pop_jump:
5964 case pop_failure_jump:
5967 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
5970 if ((is_a_jump_n && (re_opcode_t) *p1 == succeed_n)
5972 && (re_opcode_t) *p1 == on_failure_jump))
5980 if (d >= string1 && d <= end1)
5984 break; /* Matching at this starting point really fails. */
5988 goto restore_best_regs;
5992 return -1; /* Failure to match. */
5995 /* Subroutine definitions for re_match_2. */
5998 /* We are passed P pointing to a register number after a start_memory.
6000 Return true if the pattern up to the corresponding stop_memory can
6001 match the empty string, and false otherwise.
6003 If we find the matching stop_memory, sets P to point to one past its number.
6004 Otherwise, sets P to an undefined byte less than or equal to END.
6006 We don't handle duplicates properly (yet). */
6009 group_match_null_string_p (unsigned char **p, unsigned char *end,
6010 register_info_type *register_info)
6013 /* Point to after the args to the start_memory. */
6014 unsigned char *p1 = *p + 2;
6018 /* Skip over opcodes that can match nothing, and return true or
6019 false, as appropriate, when we get to one that can't, or to the
6020 matching stop_memory. */
6022 switch ((re_opcode_t) *p1)
6024 /* Could be either a loop or a series of alternatives. */
6025 case on_failure_jump:
6027 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
6029 /* If the next operation is not a jump backwards in the
6034 /* Go through the on_failure_jumps of the alternatives,
6035 seeing if any of the alternatives cannot match nothing.
6036 The last alternative starts with only a jump,
6037 whereas the rest start with on_failure_jump and end
6038 with a jump, e.g., here is the pattern for `a|b|c':
6040 /on_failure_jump/0/6/exactn/1/a/jump_past_alt/0/6
6041 /on_failure_jump/0/6/exactn/1/b/jump_past_alt/0/3
6044 So, we have to first go through the first (n-1)
6045 alternatives and then deal with the last one separately. */
6048 /* Deal with the first (n-1) alternatives, which start
6049 with an on_failure_jump (see above) that jumps to right
6050 past a jump_past_alt. */
6052 while ((re_opcode_t) p1[mcnt-3] == jump_past_alt)
6054 /* `mcnt' holds how many bytes long the alternative
6055 is, including the ending `jump_past_alt' and
6058 if (!alt_match_null_string_p (p1, p1 + mcnt - 3,
6062 /* Move to right after this alternative, including the
6066 /* Break if it's the beginning of an n-th alternative
6067 that doesn't begin with an on_failure_jump. */
6068 if ((re_opcode_t) *p1 != on_failure_jump)
6071 /* Still have to check that it's not an n-th
6072 alternative that starts with an on_failure_jump. */
6074 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
6075 if ((re_opcode_t) p1[mcnt-3] != jump_past_alt)
6077 /* Get to the beginning of the n-th alternative. */
6083 /* Deal with the last alternative: go back and get number
6084 of the `jump_past_alt' just before it. `mcnt' contains
6085 the length of the alternative. */
6086 EXTRACT_NUMBER (mcnt, p1 - 2);
6088 if (!alt_match_null_string_p (p1, p1 + mcnt, register_info))
6091 p1 += mcnt; /* Get past the n-th alternative. */
6097 assert (p1[1] == **p);
6103 if (!common_op_match_null_string_p (&p1, end, register_info))
6106 } /* while p1 < end */
6109 } /* group_match_null_string_p */
6112 /* Similar to group_match_null_string_p, but doesn't deal with alternatives:
6113 It expects P to be the first byte of a single alternative and END one
6114 byte past the last. The alternative can contain groups. */
6117 alt_match_null_string_p (unsigned char *p, unsigned char *end,
6118 register_info_type *register_info)
6121 unsigned char *p1 = p;
6125 /* Skip over opcodes that can match nothing, and break when we get
6126 to one that can't. */
6128 switch ((re_opcode_t) *p1)
6131 case on_failure_jump:
6133 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
6138 if (!common_op_match_null_string_p (&p1, end, register_info))
6141 } /* while p1 < end */
6144 } /* alt_match_null_string_p */
6147 /* Deals with the ops common to group_match_null_string_p and
6148 alt_match_null_string_p.
6150 Sets P to one after the op and its arguments, if any. */
6153 common_op_match_null_string_p (unsigned char **p, unsigned char *end,
6154 register_info_type *register_info)
6159 unsigned char *p1 = *p;
6161 switch ((re_opcode_t) *p1++)
6181 assert (reg_no > 0 && reg_no <= MAX_REGNUM);
6182 ret = group_match_null_string_p (&p1, end, register_info);
6184 /* Have to set this here in case we're checking a group which
6185 contains a group and a back reference to it. */
6187 if (REG_MATCH_NULL_STRING_P (register_info[reg_no]) ==
6188 MATCH_NULL_UNSET_VALUE)
6189 REG_MATCH_NULL_STRING_P (register_info[reg_no]) = ret;
6195 /* If this is an optimized succeed_n for zero times, make the jump. */
6197 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
6205 /* Get to the number of times to succeed. */
6207 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
6212 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
6220 if (!REG_MATCH_NULL_STRING_P (register_info[*p1]))
6228 /* All other opcodes mean we cannot match the empty string. */
6234 } /* common_op_match_null_string_p */
6237 /* Return zero if TRANSLATE[S1] and TRANSLATE[S2] are identical for LEN
6238 bytes; nonzero otherwise. */
6241 bcmp_translate (re_char *s1, re_char *s2,
6242 REGISTER int len, RE_TRANSLATE_TYPE translate)
6244 REGISTER const unsigned char *p1 = s1, *p2 = s2;
6246 const unsigned char *p1_end = s1 + len;
6247 const unsigned char *p2_end = s2 + len;
6249 while (p1 != p1_end && p2 != p2_end)
6251 Emchar p1_ch, p2_ch;
6253 p1_ch = charptr_emchar (p1);
6254 p2_ch = charptr_emchar (p2);
6256 if (RE_TRANSLATE (p1_ch)
6257 != RE_TRANSLATE (p2_ch))
6262 #else /* not MULE */
6265 if (RE_TRANSLATE (*p1++) != RE_TRANSLATE (*p2++)) return 1;
6272 /* Entry points for GNU code. */
6274 /* re_compile_pattern is the GNU regular expression compiler: it
6275 compiles PATTERN (of length SIZE) and puts the result in BUFP.
6276 Returns 0 if the pattern was valid, otherwise an error string.
6278 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
6279 are set in BUFP on entry.
6281 We call regex_compile to do the actual compilation. */
6284 re_compile_pattern (const char *pattern, int length,
6285 struct re_pattern_buffer *bufp)
6289 /* GNU code is written to assume at least RE_NREGS registers will be set
6290 (and at least one extra will be -1). */
6291 bufp->regs_allocated = REGS_UNALLOCATED;
6293 /* And GNU code determines whether or not to get register information
6294 by passing null for the REGS argument to re_match, etc., not by
6298 /* Match anchors at newline. */
6299 bufp->newline_anchor = 1;
6301 ret = regex_compile ((unsigned char *) pattern, length, re_syntax_options, bufp);
6305 return gettext (re_error_msgid[(int) ret]);
6308 /* Entry points compatible with 4.2 BSD regex library. We don't define
6309 them unless specifically requested. */
6311 #ifdef _REGEX_RE_COMP
6313 /* BSD has one and only one pattern buffer. */
6314 static struct re_pattern_buffer re_comp_buf;
6317 re_comp (const char *s)
6323 if (!re_comp_buf.buffer)
6324 return gettext ("No previous regular expression");
6328 if (!re_comp_buf.buffer)
6330 re_comp_buf.buffer = (unsigned char *) malloc (200);
6331 if (re_comp_buf.buffer == NULL)
6332 return gettext (re_error_msgid[(int) REG_ESPACE]);
6333 re_comp_buf.allocated = 200;
6335 re_comp_buf.fastmap = (char *) malloc (1 << BYTEWIDTH);
6336 if (re_comp_buf.fastmap == NULL)
6337 return gettext (re_error_msgid[(int) REG_ESPACE]);
6340 /* Since `re_exec' always passes NULL for the `regs' argument, we
6341 don't need to initialize the pattern buffer fields which affect it. */
6343 /* Match anchors at newlines. */
6344 re_comp_buf.newline_anchor = 1;
6346 ret = regex_compile ((unsigned char *)s, strlen (s), re_syntax_options, &re_comp_buf);
6351 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
6352 return (char *) gettext (re_error_msgid[(int) ret]);
6357 re_exec (const char *s)
6359 const int len = strlen (s);
6361 0 <= re_search (&re_comp_buf, s, len, 0, len, (struct re_registers *) 0);
6363 #endif /* _REGEX_RE_COMP */
6365 /* POSIX.2 functions. Don't define these for Emacs. */
6369 /* regcomp takes a regular expression as a string and compiles it.
6371 PREG is a regex_t *. We do not expect any fields to be initialized,
6372 since POSIX says we shouldn't. Thus, we set
6374 `buffer' to the compiled pattern;
6375 `used' to the length of the compiled pattern;
6376 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
6377 REG_EXTENDED bit in CFLAGS is set; otherwise, to
6378 RE_SYNTAX_POSIX_BASIC;
6379 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
6380 `fastmap' and `fastmap_accurate' to zero;
6381 `re_nsub' to the number of subexpressions in PATTERN.
6382 (non-shy of course. POSIX probably doesn't know about
6383 shy ones, and in any case they should be invisible.)
6385 PATTERN is the address of the pattern string.
6387 CFLAGS is a series of bits which affect compilation.
6389 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
6390 use POSIX basic syntax.
6392 If REG_NEWLINE is set, then . and [^...] don't match newline.
6393 Also, regexec will try a match beginning after every newline.
6395 If REG_ICASE is set, then we considers upper- and lowercase
6396 versions of letters to be equivalent when matching.
6398 If REG_NOSUB is set, then when PREG is passed to regexec, that
6399 routine will report only success or failure, and nothing about the
6402 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
6403 the return codes and their meanings.) */
6406 regcomp (regex_t *preg, const char *pattern, int cflags)
6410 = (cflags & REG_EXTENDED) ?
6411 RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC;
6413 /* regex_compile will allocate the space for the compiled pattern. */
6415 preg->allocated = 0;
6418 /* Don't bother to use a fastmap when searching. This simplifies the
6419 REG_NEWLINE case: if we used a fastmap, we'd have to put all the
6420 characters after newlines into the fastmap. This way, we just try
6424 if (cflags & REG_ICASE)
6428 preg->translate = (char *) malloc (CHAR_SET_SIZE);
6429 if (preg->translate == NULL)
6430 return (int) REG_ESPACE;
6432 /* Map uppercase characters to corresponding lowercase ones. */
6433 for (i = 0; i < CHAR_SET_SIZE; i++)
6434 preg->translate[i] = ISUPPER (i) ? tolower (i) : i;
6437 preg->translate = NULL;
6439 /* If REG_NEWLINE is set, newlines are treated differently. */
6440 if (cflags & REG_NEWLINE)
6441 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
6442 syntax &= ~RE_DOT_NEWLINE;
6443 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
6444 /* It also changes the matching behavior. */
6445 preg->newline_anchor = 1;
6448 preg->newline_anchor = 0;
6450 preg->no_sub = !!(cflags & REG_NOSUB);
6452 /* POSIX says a null character in the pattern terminates it, so we
6453 can use strlen here in compiling the pattern. */
6454 ret = regex_compile ((unsigned char *) pattern, strlen (pattern), syntax, preg);
6456 /* POSIX doesn't distinguish between an unmatched open-group and an
6457 unmatched close-group: both are REG_EPAREN. */
6458 if (ret == REG_ERPAREN) ret = REG_EPAREN;
6464 /* regexec searches for a given pattern, specified by PREG, in the
6467 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
6468 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
6469 least NMATCH elements, and we set them to the offsets of the
6470 corresponding matched substrings.
6472 EFLAGS specifies `execution flags' which affect matching: if
6473 REG_NOTBOL is set, then ^ does not match at the beginning of the
6474 string; if REG_NOTEOL is set, then $ does not match at the end.
6476 We return 0 if we find a match and REG_NOMATCH if not. */
6479 regexec (const regex_t *preg, const char *string, Element_count nmatch,
6480 regmatch_t pmatch[], int eflags)
6483 struct re_registers regs;
6484 regex_t private_preg;
6485 int len = strlen (string);
6486 re_bool want_reg_info = !preg->no_sub && nmatch > 0;
6488 private_preg = *preg;
6490 private_preg.not_bol = !!(eflags & REG_NOTBOL);
6491 private_preg.not_eol = !!(eflags & REG_NOTEOL);
6493 /* The user has told us exactly how many registers to return
6494 information about, via `nmatch'. We have to pass that on to the
6495 matching routines. */
6496 private_preg.regs_allocated = REGS_FIXED;
6500 regs.num_regs = nmatch;
6501 regs.start = TALLOC (nmatch, regoff_t);
6502 regs.end = TALLOC (nmatch, regoff_t);
6503 if (regs.start == NULL || regs.end == NULL)
6504 return (int) REG_NOMATCH;
6507 /* Perform the searching operation. */
6508 ret = re_search (&private_preg, string, len,
6509 /* start: */ 0, /* range: */ len,
6510 want_reg_info ? ®s : (struct re_registers *) 0);
6512 /* Copy the register information to the POSIX structure. */
6519 for (r = 0; r < nmatch; r++)
6521 pmatch[r].rm_so = regs.start[r];
6522 pmatch[r].rm_eo = regs.end[r];
6526 /* If we needed the temporary register info, free the space now. */
6531 /* We want zero return to mean success, unlike `re_search'. */
6532 return ret >= 0 ? (int) REG_NOERROR : (int) REG_NOMATCH;
6536 /* Returns a message corresponding to an error code, ERRCODE, returned
6537 from either regcomp or regexec. We don't use PREG here. */
6540 regerror (int errcode, const regex_t *preg, char *errbuf,
6541 Memory_count errbuf_size)
6544 Memory_count msg_size;
6547 || (size_t) errcode >= (sizeof (re_error_msgid)
6548 / sizeof (re_error_msgid[0])))
6549 /* Only error codes returned by the rest of the code should be passed
6550 to this routine. If we are given anything else, or if other regex
6551 code generates an invalid error code, then the program has a bug.
6552 Dump core so we can fix it. */
6555 msg = gettext (re_error_msgid[errcode]);
6557 msg_size = strlen (msg) + 1; /* Includes the null. */
6559 if (errbuf_size != 0)
6561 if (msg_size > errbuf_size)
6563 strncpy (errbuf, msg, errbuf_size - 1);
6564 errbuf[errbuf_size - 1] = 0;
6567 strcpy (errbuf, msg);
6574 /* Free dynamically allocated space used by PREG. */
6577 regfree (regex_t *preg)
6579 if (preg->buffer != NULL)
6580 free (preg->buffer);
6581 preg->buffer = NULL;
6583 preg->allocated = 0;
6586 if (preg->fastmap != NULL)
6587 free (preg->fastmap);
6588 preg->fastmap = NULL;
6589 preg->fastmap_accurate = 0;
6591 if (preg->translate != NULL)
6592 free (preg->translate);
6593 preg->translate = NULL;
6596 #endif /* not emacs */
6600 make-backup-files: t
6602 trim-versions-without-asking: nil