1 /* Extended regular expression matching and search library,
2 version 0.12, extended for XEmacs.
3 (Implements POSIX draft P10003.2/D11.2, except for
4 internationalization features.)
6 Copyright (C) 1993, 1994, 1995 Free Software Foundation, Inc.
7 Copyright (C) 1995 Sun Microsystems, Inc.
8 Copyright (C) 1995 Ben Wing.
9 Copyright (C) 1999,2000,2001 MORIOKA Tomohiko
11 This program is free software; you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation; either version 2, or (at your option)
16 This program is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
21 You should have received a copy of the GNU General Public License
22 along with this program; see the file COPYING. If not, write to
23 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24 Boston, MA 02111-1307, USA. */
26 /* Synched up with: FSF 19.29. */
28 /* Changes made for XEmacs:
30 (1) the REGEX_BEGLINE_CHECK code from the XEmacs v18 regex routines
31 was added. This causes a huge speedup in font-locking.
32 (2) Rel-alloc is disabled when the MMAP version of rel-alloc is
33 being used, because it's too slow -- all those calls to mmap()
34 add humongous overhead.
35 (3) Lots and lots of changes for Mule. They are bracketed by
36 `#ifdef MULE' or with comments that have `XEmacs' in them.
43 #ifndef REGISTER /* Rigidly enforced as of 20.3 */
52 /* Converts the pointer to the char to BEG-based offset from the start. */
53 #define PTR_TO_OFFSET(d) (MATCHING_IN_FIRST_STRING \
54 ? (d) - string1 : (d) - (string2 - size1))
56 #define PTR_TO_OFFSET(d) 0
59 /* We assume non-Mule if emacs isn't defined. */
64 /* We need this for `regex.h', and perhaps for the Emacs include files. */
65 #include <sys/types.h>
67 /* This is for other GNU distributions with internationalized messages. */
68 #if defined (I18N3) && (defined (HAVE_LIBINTL_H) || defined (_LIBC))
71 # define gettext(msgid) (msgid)
74 /* XEmacs: define this to add in a speedup for patterns anchored at
75 the beginning of a line. Keep the ifdefs so that it's easier to
76 tell where/why this code has diverged from v19. */
77 #define REGEX_BEGLINE_CHECK
79 /* XEmacs: the current mmap-based ralloc handles small blocks very
80 poorly, so we disable it here. */
82 #if (defined (REL_ALLOC) && defined (HAVE_MMAP)) || defined(DOUG_LEA_MALLOC)
86 /* The `emacs' switch turns on certain matching commands
87 that make sense only in Emacs. */
94 #if (defined (DEBUG_XEMACS) && !defined (DEBUG))
100 Lisp_Object Vthe_lisp_rangetab;
103 complex_vars_of_regex (void)
105 Vthe_lisp_rangetab = Fmake_range_table ();
106 staticpro (&Vthe_lisp_rangetab);
112 complex_vars_of_regex (void)
118 #define RE_TRANSLATE(ch) TRT_TABLE_OF (translate, (Emchar) ch)
119 #define TRANSLATE_P(tr) (!NILP (tr))
121 #else /* not emacs */
123 /* If we are not linking with Emacs proper,
124 we can't use the relocating allocator
125 even if config.h says that we can. */
128 #if defined (STDC_HEADERS) || defined (_LIBC)
135 /* Types normally included via lisp.h */
136 #include <stddef.h> /* for ptrdiff_t */
139 #ifndef DECLARE_NOTHING
140 #define DECLARE_NOTHING struct nosuchstruct
146 #define charptr_emchar(str) ((Emchar) (str)[0])
148 #define INC_CHARPTR(p) ((p)++)
149 #define DEC_CHARPTR(p) ((p)--)
153 /* Define the syntax stuff for \<, \>, etc. */
155 /* This must be nonzero for the wordchar and notwordchar pattern
156 commands in re_match_2. */
163 extern char *re_syntax_table;
165 #else /* not SYNTAX_TABLE */
167 /* How many characters in the character set. */
168 #define CHAR_SET_SIZE 256
170 static char re_syntax_table[CHAR_SET_SIZE];
173 init_syntax_once (void)
179 const char *word_syntax_chars =
180 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789_";
182 memset (re_syntax_table, 0, sizeof (re_syntax_table));
184 while (*word_syntax_chars)
185 re_syntax_table[(unsigned int)(*word_syntax_chars++)] = Sword;
191 #endif /* SYNTAX_TABLE */
193 #define SYNTAX_UNSAFE(ignored, c) re_syntax_table[c]
194 #undef SYNTAX_FROM_CACHE
195 #define SYNTAX_FROM_CACHE SYNTAX_UNSAFE
197 #define RE_TRANSLATE(c) translate[(unsigned char) (c)]
198 #define TRANSLATE_P(tr) tr
202 /* Under XEmacs, this is needed because we don't define it elsewhere. */
203 #ifdef SWITCH_ENUM_BUG
204 #define SWITCH_ENUM_CAST(x) ((int)(x))
206 #define SWITCH_ENUM_CAST(x) (x)
210 /* Get the interface, including the syntax bits. */
213 /* isalpha etc. are used for the character classes. */
216 /* Jim Meyering writes:
218 "... Some ctype macros are valid only for character codes that
219 isascii says are ASCII (SGI's IRIX-4.0.5 is one such system --when
220 using /bin/cc or gcc but without giving an ansi option). So, all
221 ctype uses should be through macros like ISPRINT... If
222 STDC_HEADERS is defined, then autoconf has verified that the ctype
223 macros don't need to be guarded with references to isascii. ...
224 Defining isascii to 1 should let any compiler worth its salt
225 eliminate the && through constant folding." */
227 #if defined (STDC_HEADERS) || (!defined (isascii) && !defined (HAVE_ISASCII))
228 #define ISASCII_1(c) 1
230 #define ISASCII_1(c) isascii(c)
234 /* The IS*() macros can be passed any character, including an extended
235 one. We need to make sure there are no crashes, which would occur
236 otherwise due to out-of-bounds array references. */
237 #define ISASCII(c) (((EMACS_UINT) (c)) < 0x100 && ISASCII_1 (c))
239 #define ISASCII(c) ISASCII_1 (c)
243 #define ISBLANK(c) (ISASCII (c) && isblank (c))
245 #define ISBLANK(c) ((c) == ' ' || (c) == '\t')
248 #define ISGRAPH(c) (ISASCII (c) && isgraph (c))
250 #define ISGRAPH(c) (ISASCII (c) && isprint (c) && !isspace (c))
253 #define ISPRINT(c) (ISASCII (c) && isprint (c))
254 #define ISDIGIT(c) (ISASCII (c) && isdigit (c))
255 #define ISALNUM(c) (ISASCII (c) && isalnum (c))
256 #define ISALPHA(c) (ISASCII (c) && isalpha (c))
257 #define ISCNTRL(c) (ISASCII (c) && iscntrl (c))
258 #define ISLOWER(c) (ISASCII (c) && islower (c))
259 #define ISPUNCT(c) (ISASCII (c) && ispunct (c))
260 #define ISSPACE(c) (ISASCII (c) && isspace (c))
261 #define ISUPPER(c) (ISASCII (c) && isupper (c))
262 #define ISXDIGIT(c) (ISASCII (c) && isxdigit (c))
265 #define NULL (void *)0
268 /* We remove any previous definition of `SIGN_EXTEND_CHAR',
269 since ours (we hope) works properly with all combinations of
270 machines, compilers, `char' and `unsigned char' argument types.
271 (Per Bothner suggested the basic approach.) */
272 #undef SIGN_EXTEND_CHAR
274 #define SIGN_EXTEND_CHAR(c) ((signed char) (c))
275 #else /* not __STDC__ */
276 /* As in Harbison and Steele. */
277 #define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128)
280 /* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we
281 use `alloca' instead of `malloc'. This is because using malloc in
282 re_search* or re_match* could cause memory leaks when C-g is used in
283 Emacs; also, malloc is slower and causes storage fragmentation. On
284 the other hand, malloc is more portable, and easier to debug.
286 Because we sometimes use alloca, some routines have to be macros,
287 not functions -- `alloca'-allocated space disappears at the end of the
288 function it is called in. */
292 #define REGEX_ALLOCATE malloc
293 #define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize)
294 #define REGEX_FREE free
296 #else /* not REGEX_MALLOC */
298 /* Emacs already defines alloca, sometimes. */
301 /* Make alloca work the best possible way. */
303 #define alloca __builtin_alloca
304 #else /* not __GNUC__ */
307 #else /* not __GNUC__ or HAVE_ALLOCA_H */
308 #ifndef _AIX /* Already did AIX, up at the top. */
310 #endif /* not _AIX */
311 #endif /* HAVE_ALLOCA_H */
312 #endif /* __GNUC__ */
314 #endif /* not alloca */
316 #define REGEX_ALLOCATE alloca
318 /* Assumes a `char *destination' variable. */
319 #define REGEX_REALLOCATE(source, osize, nsize) \
320 (destination = (char *) alloca (nsize), \
321 memmove (destination, source, osize), \
324 /* No need to do anything to free, after alloca. */
325 #define REGEX_FREE(arg) ((void)0) /* Do nothing! But inhibit gcc warning. */
327 #endif /* REGEX_MALLOC */
329 /* Define how to allocate the failure stack. */
332 #define REGEX_ALLOCATE_STACK(size) \
333 r_alloc ((char **) &failure_stack_ptr, (size))
334 #define REGEX_REALLOCATE_STACK(source, osize, nsize) \
335 r_re_alloc ((char **) &failure_stack_ptr, (nsize))
336 #define REGEX_FREE_STACK(ptr) \
337 r_alloc_free ((void **) &failure_stack_ptr)
339 #else /* not REL_ALLOC */
343 #define REGEX_ALLOCATE_STACK malloc
344 #define REGEX_REALLOCATE_STACK(source, osize, nsize) realloc (source, nsize)
345 #define REGEX_FREE_STACK free
347 #else /* not REGEX_MALLOC */
349 #define REGEX_ALLOCATE_STACK alloca
351 #define REGEX_REALLOCATE_STACK(source, osize, nsize) \
352 REGEX_REALLOCATE (source, osize, nsize)
353 /* No need to explicitly free anything. */
354 #define REGEX_FREE_STACK(arg)
356 #endif /* REGEX_MALLOC */
357 #endif /* REL_ALLOC */
360 /* True if `size1' is non-NULL and PTR is pointing anywhere inside
361 `string1' or just past its end. This works if PTR is NULL, which is
363 #define FIRST_STRING_P(ptr) \
364 (size1 && string1 <= (ptr) && (ptr) <= string1 + size1)
366 /* (Re)Allocate N items of type T using malloc, or fail. */
367 #define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t)))
368 #define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, (n) * sizeof (t)))
369 #define RETALLOC_IF(addr, n, t) \
370 if (addr) RETALLOC((addr), (n), t); else (addr) = TALLOC ((n), t)
371 #define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t)))
373 #define BYTEWIDTH 8 /* In bits. */
375 #define STREQ(s1, s2) (strcmp (s1, s2) == 0)
379 #define MAX(a, b) ((a) > (b) ? (a) : (b))
380 #define MIN(a, b) ((a) < (b) ? (a) : (b))
382 /* Type of source-pattern and string chars. */
383 typedef const unsigned char re_char;
385 typedef char re_bool;
390 /* These are the command codes that appear in compiled regular
391 expressions. Some opcodes are followed by argument bytes. A
392 command code can specify any interpretation whatsoever for its
393 arguments. Zero bytes may appear in the compiled regular expression. */
399 /* Succeed right away--no more backtracking. */
402 /* Followed by one byte giving n, then by n literal bytes. */
405 /* Matches any (more or less) character. */
408 /* Matches any one char belonging to specified set. First
409 following byte is number of bitmap bytes. Then come bytes
410 for a bitmap saying which chars are in. Bits in each byte
411 are ordered low-bit-first. A character is in the set if its
412 bit is 1. A character too large to have a bit in the map is
413 automatically not in the set. */
416 /* Same parameters as charset, but match any character that is
417 not one of those specified. */
420 /* Start remembering the text that is matched, for storing in a
421 register. Followed by one byte with the register number, in
422 the range 0 to one less than the pattern buffer's re_nsub
423 field. Then followed by one byte with the number of groups
424 inner to this one. (This last has to be part of the
425 start_memory only because we need it in the on_failure_jump
429 /* Stop remembering the text that is matched and store it in a
430 memory register. Followed by one byte with the register
431 number, in the range 0 to one less than `re_nsub' in the
432 pattern buffer, and one byte with the number of inner groups,
433 just like `start_memory'. (We need the number of inner
434 groups here because we don't have any easy way of finding the
435 corresponding start_memory when we're at a stop_memory.) */
438 /* Match a duplicate of something remembered. Followed by one
439 byte containing the register number. */
442 /* Fail unless at beginning of line. */
445 /* Fail unless at end of line. */
448 /* Succeeds if at beginning of buffer (if emacs) or at beginning
449 of string to be matched (if not). */
452 /* Analogously, for end of buffer/string. */
455 /* Followed by two byte relative address to which to jump. */
458 /* Same as jump, but marks the end of an alternative. */
461 /* Followed by two-byte relative address of place to resume at
462 in case of failure. */
465 /* Like on_failure_jump, but pushes a placeholder instead of the
466 current string position when executed. */
467 on_failure_keep_string_jump,
469 /* Throw away latest failure point and then jump to following
470 two-byte relative address. */
473 /* Change to pop_failure_jump if know won't have to backtrack to
474 match; otherwise change to jump. This is used to jump
475 back to the beginning of a repeat. If what follows this jump
476 clearly won't match what the repeat does, such that we can be
477 sure that there is no use backtracking out of repetitions
478 already matched, then we change it to a pop_failure_jump.
479 Followed by two-byte address. */
482 /* Jump to following two-byte address, and push a dummy failure
483 point. This failure point will be thrown away if an attempt
484 is made to use it for a failure. A `+' construct makes this
485 before the first repeat. Also used as an intermediary kind
486 of jump when compiling an alternative. */
489 /* Push a dummy failure point and continue. Used at the end of
493 /* Followed by two-byte relative address and two-byte number n.
494 After matching N times, jump to the address upon failure. */
497 /* Followed by two-byte relative address, and two-byte number n.
498 Jump to the address N times, then fail. */
501 /* Set the following two-byte relative address to the
502 subsequent two-byte number. The address *includes* the two
506 wordchar, /* Matches any word-constituent character. */
507 notwordchar, /* Matches any char that is not a word-constituent. */
509 wordbeg, /* Succeeds if at word beginning. */
510 wordend, /* Succeeds if at word end. */
512 wordbound, /* Succeeds if at a word boundary. */
513 notwordbound /* Succeeds if not at a word boundary. */
516 ,before_dot, /* Succeeds if before point. */
517 at_dot, /* Succeeds if at point. */
518 after_dot, /* Succeeds if after point. */
520 /* Matches any character whose syntax is specified. Followed by
521 a byte which contains a syntax code, e.g., Sword. */
524 /* Matches any character whose syntax is not that specified. */
530 /* need extra stuff to be able to properly work with XEmacs/Mule
531 characters (which may take up more than one byte) */
533 ,charset_mule, /* Matches any character belonging to specified set.
534 The set is stored in "unified range-table
535 format"; see rangetab.c. Unlike the `charset'
536 opcode, this can handle arbitrary characters. */
538 charset_mule_not /* Same parameters as charset_mule, but match any
539 character that is not one of those specified. */
541 /* 97/2/17 jhod: The following two were merged back in from the Mule
542 2.3 code to enable some language specific processing */
543 ,categoryspec, /* Matches entries in the character category tables */
544 notcategoryspec /* The opposite of the above */
549 /* Common operations on the compiled pattern. */
551 /* Store NUMBER in two contiguous bytes starting at DESTINATION. */
553 #define STORE_NUMBER(destination, number) \
555 (destination)[0] = (number) & 0377; \
556 (destination)[1] = (number) >> 8; \
559 /* Same as STORE_NUMBER, except increment DESTINATION to
560 the byte after where the number is stored. Therefore, DESTINATION
561 must be an lvalue. */
563 #define STORE_NUMBER_AND_INCR(destination, number) \
565 STORE_NUMBER (destination, number); \
566 (destination) += 2; \
569 /* Put into DESTINATION a number stored in two contiguous bytes starting
572 #define EXTRACT_NUMBER(destination, source) \
574 (destination) = *(source) & 0377; \
575 (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) << 8; \
580 extract_number (int *dest, re_char *source)
582 int temp = SIGN_EXTEND_CHAR (*(source + 1));
583 *dest = *source & 0377;
587 #ifndef EXTRACT_MACROS /* To debug the macros. */
588 #undef EXTRACT_NUMBER
589 #define EXTRACT_NUMBER(dest, src) extract_number (&dest, src)
590 #endif /* not EXTRACT_MACROS */
594 /* Same as EXTRACT_NUMBER, except increment SOURCE to after the number.
595 SOURCE must be an lvalue. */
597 #define EXTRACT_NUMBER_AND_INCR(destination, source) \
599 EXTRACT_NUMBER (destination, source); \
605 extract_number_and_incr (int *destination, unsigned char **source)
607 extract_number (destination, *source);
611 #ifndef EXTRACT_MACROS
612 #undef EXTRACT_NUMBER_AND_INCR
613 #define EXTRACT_NUMBER_AND_INCR(dest, src) \
614 extract_number_and_incr (&dest, &src)
615 #endif /* not EXTRACT_MACROS */
619 /* If DEBUG is defined, Regex prints many voluminous messages about what
620 it is doing (if the variable `debug' is nonzero). If linked with the
621 main program in `iregex.c', you can enter patterns and strings
622 interactively. And if linked with the main program in `main.c' and
623 the other test files, you can run the already-written tests. */
627 /* We use standard I/O for debugging. */
631 /* XEmacs provides its own version of assert() */
632 /* It is useful to test things that ``must'' be true when debugging. */
636 static int debug = 0;
638 #define DEBUG_STATEMENT(e) e
639 #define DEBUG_PRINT1(x) if (debug) printf (x)
640 #define DEBUG_PRINT2(x1, x2) if (debug) printf (x1, x2)
641 #define DEBUG_PRINT3(x1, x2, x3) if (debug) printf (x1, x2, x3)
642 #define DEBUG_PRINT4(x1, x2, x3, x4) if (debug) printf (x1, x2, x3, x4)
643 #define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \
644 if (debug) print_partial_compiled_pattern (s, e)
645 #define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \
646 if (debug) print_double_string (w, s1, sz1, s2, sz2)
649 /* Print the fastmap in human-readable form. */
652 print_fastmap (char *fastmap)
654 unsigned was_a_range = 0;
657 while (i < (1 << BYTEWIDTH))
663 while (i < (1 << BYTEWIDTH) && fastmap[i])
679 /* Print a compiled pattern string in human-readable form, starting at
680 the START pointer into it and ending just before the pointer END. */
683 print_partial_compiled_pattern (re_char *start, re_char *end)
686 unsigned char *p = (unsigned char *) start;
695 /* Loop over pattern commands. */
698 printf ("%ld:\t", (long)(p - start));
700 switch ((re_opcode_t) *p++)
708 printf ("/exactn/%d", mcnt);
719 printf ("/start_memory/%d/%d", mcnt, *p++);
724 printf ("/stop_memory/%d/%d", mcnt, *p++);
728 printf ("/duplicate/%d", *p++);
738 REGISTER int c, last = -100;
739 REGISTER int in_range = 0;
741 printf ("/charset [%s",
742 (re_opcode_t) *(p - 1) == charset_not ? "^" : "");
744 assert (p + *p < pend);
746 for (c = 0; c < 256; c++)
747 if (((unsigned char) (c / 8) < *p)
748 && (p[1 + (c/8)] & (1 << (c % 8))))
750 /* Are we starting a range? */
751 if (last + 1 == c && ! in_range)
756 /* Have we broken a range? */
757 else if (last + 1 != c && in_range)
780 case charset_mule_not:
784 printf ("/charset_mule [%s",
785 (re_opcode_t) *(p - 1) == charset_mule_not ? "^" : "");
786 nentries = unified_range_table_nentries (p);
787 for (i = 0; i < nentries; i++)
789 EMACS_INT first, last;
790 Lisp_Object dummy_val;
792 unified_range_table_get_range (p, i, &first, &last,
797 printf ("(0x%lx)", (long)first);
804 printf ("(0x%lx)", (long)last);
808 p += unified_range_table_bytes_used (p);
821 case on_failure_jump:
822 extract_number_and_incr (&mcnt, &p);
823 printf ("/on_failure_jump to %ld", (long)(p + mcnt - start));
826 case on_failure_keep_string_jump:
827 extract_number_and_incr (&mcnt, &p);
828 printf ("/on_failure_keep_string_jump to %ld", (long)(p + mcnt - start));
831 case dummy_failure_jump:
832 extract_number_and_incr (&mcnt, &p);
833 printf ("/dummy_failure_jump to %ld", (long)(p + mcnt - start));
836 case push_dummy_failure:
837 printf ("/push_dummy_failure");
841 extract_number_and_incr (&mcnt, &p);
842 printf ("/maybe_pop_jump to %ld", (long)(p + mcnt - start));
845 case pop_failure_jump:
846 extract_number_and_incr (&mcnt, &p);
847 printf ("/pop_failure_jump to %ld", (long)(p + mcnt - start));
851 extract_number_and_incr (&mcnt, &p);
852 printf ("/jump_past_alt to %ld", (long)(p + mcnt - start));
856 extract_number_and_incr (&mcnt, &p);
857 printf ("/jump to %ld", (long)(p + mcnt - start));
861 extract_number_and_incr (&mcnt, &p);
862 extract_number_and_incr (&mcnt2, &p);
863 printf ("/succeed_n to %ld, %d times", (long)(p + mcnt - start), mcnt2);
867 extract_number_and_incr (&mcnt, &p);
868 extract_number_and_incr (&mcnt2, &p);
869 printf ("/jump_n to %ld, %d times", (long)(p + mcnt - start), mcnt2);
873 extract_number_and_incr (&mcnt, &p);
874 extract_number_and_incr (&mcnt2, &p);
875 printf ("/set_number_at location %ld to %d", (long)(p + mcnt - start), mcnt2);
879 printf ("/wordbound");
883 printf ("/notwordbound");
895 printf ("/before_dot");
903 printf ("/after_dot");
907 printf ("/syntaxspec");
909 printf ("/%d", mcnt);
913 printf ("/notsyntaxspec");
915 printf ("/%d", mcnt);
919 /* 97/2/17 jhod Mule category patch */
921 printf ("/categoryspec");
923 printf ("/%d", mcnt);
926 case notcategoryspec:
927 printf ("/notcategoryspec");
929 printf ("/%d", mcnt);
931 /* end of category patch */
936 printf ("/wordchar");
940 printf ("/notwordchar");
952 printf ("?%d", *(p-1));
958 printf ("%ld:\tend of pattern.\n", (long)(p - start));
963 print_compiled_pattern (struct re_pattern_buffer *bufp)
965 re_char *buffer = bufp->buffer;
967 print_partial_compiled_pattern (buffer, buffer + bufp->used);
968 printf ("%ld bytes used/%ld bytes allocated.\n", bufp->used,
971 if (bufp->fastmap_accurate && bufp->fastmap)
973 printf ("fastmap: ");
974 print_fastmap (bufp->fastmap);
977 printf ("re_nsub: %ld\t", (long)bufp->re_nsub);
978 printf ("regs_alloc: %d\t", bufp->regs_allocated);
979 printf ("can_be_null: %d\t", bufp->can_be_null);
980 printf ("newline_anchor: %d\n", bufp->newline_anchor);
981 printf ("no_sub: %d\t", bufp->no_sub);
982 printf ("not_bol: %d\t", bufp->not_bol);
983 printf ("not_eol: %d\t", bufp->not_eol);
984 printf ("syntax: %d\n", bufp->syntax);
985 /* Perhaps we should print the translate table? */
986 /* and maybe the category table? */
991 print_double_string (re_char *where, re_char *string1, int size1,
992 re_char *string2, int size2)
998 Element_count this_char;
1000 if (FIRST_STRING_P (where))
1002 for (this_char = where - string1; this_char < size1; this_char++)
1003 putchar (string1[this_char]);
1008 for (this_char = where - string2; this_char < size2; this_char++)
1009 putchar (string2[this_char]);
1013 #else /* not DEBUG */
1018 #define DEBUG_STATEMENT(e)
1019 #define DEBUG_PRINT1(x)
1020 #define DEBUG_PRINT2(x1, x2)
1021 #define DEBUG_PRINT3(x1, x2, x3)
1022 #define DEBUG_PRINT4(x1, x2, x3, x4)
1023 #define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)
1024 #define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)
1028 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
1029 also be assigned to arbitrarily: each pattern buffer stores its own
1030 syntax, so it can be changed between regex compilations. */
1031 /* This has no initializer because initialized variables in Emacs
1032 become read-only after dumping. */
1033 reg_syntax_t re_syntax_options;
1036 /* Specify the precise syntax of regexps for compilation. This provides
1037 for compatibility for various utilities which historically have
1038 different, incompatible syntaxes.
1040 The argument SYNTAX is a bit mask comprised of the various bits
1041 defined in regex.h. We return the old syntax. */
1044 re_set_syntax (reg_syntax_t syntax)
1046 reg_syntax_t ret = re_syntax_options;
1048 re_syntax_options = syntax;
1052 /* This table gives an error message for each of the error codes listed
1053 in regex.h. Obviously the order here has to be same as there.
1054 POSIX doesn't require that we do anything for REG_NOERROR,
1055 but why not be nice? */
1057 static const char *re_error_msgid[] =
1059 "Success", /* REG_NOERROR */
1060 "No match", /* REG_NOMATCH */
1061 "Invalid regular expression", /* REG_BADPAT */
1062 "Invalid collation character", /* REG_ECOLLATE */
1063 "Invalid character class name", /* REG_ECTYPE */
1064 "Trailing backslash", /* REG_EESCAPE */
1065 "Invalid back reference", /* REG_ESUBREG */
1066 "Unmatched [ or [^", /* REG_EBRACK */
1067 "Unmatched ( or \\(", /* REG_EPAREN */
1068 "Unmatched \\{", /* REG_EBRACE */
1069 "Invalid content of \\{\\}", /* REG_BADBR */
1070 "Invalid range end", /* REG_ERANGE */
1071 "Memory exhausted", /* REG_ESPACE */
1072 "Invalid preceding regular expression", /* REG_BADRPT */
1073 "Premature end of regular expression", /* REG_EEND */
1074 "Regular expression too big", /* REG_ESIZE */
1075 "Unmatched ) or \\)", /* REG_ERPAREN */
1077 "Invalid syntax designator", /* REG_ESYNTAX */
1080 "Ranges may not span charsets", /* REG_ERANGESPAN */
1081 "Invalid category designator", /* REG_ECATEGORY */
1085 /* Avoiding alloca during matching, to placate r_alloc. */
1087 /* Define MATCH_MAY_ALLOCATE unless we need to make sure that the
1088 searching and matching functions should not call alloca. On some
1089 systems, alloca is implemented in terms of malloc, and if we're
1090 using the relocating allocator routines, then malloc could cause a
1091 relocation, which might (if the strings being searched are in the
1092 ralloc heap) shift the data out from underneath the regexp
1095 Here's another reason to avoid allocation: Emacs
1096 processes input from X in a signal handler; processing X input may
1097 call malloc; if input arrives while a matching routine is calling
1098 malloc, then we're scrod. But Emacs can't just block input while
1099 calling matching routines; then we don't notice interrupts when
1100 they come in. So, Emacs blocks input around all regexp calls
1101 except the matching calls, which it leaves unprotected, in the
1102 faith that they will not malloc. */
1104 /* Normally, this is fine. */
1105 #define MATCH_MAY_ALLOCATE
1107 /* When using GNU C, we are not REALLY using the C alloca, no matter
1108 what config.h may say. So don't take precautions for it. */
1113 /* The match routines may not allocate if (1) they would do it with malloc
1114 and (2) it's not safe for them to use malloc.
1115 Note that if REL_ALLOC is defined, matching would not use malloc for the
1116 failure stack, but we would still use it for the register vectors;
1117 so REL_ALLOC should not affect this. */
1118 #if (defined (C_ALLOCA) || defined (REGEX_MALLOC)) && defined (emacs)
1119 #undef MATCH_MAY_ALLOCATE
1123 /* Failure stack declarations and macros; both re_compile_fastmap and
1124 re_match_2 use a failure stack. These have to be macros because of
1125 REGEX_ALLOCATE_STACK. */
1128 /* Number of failure points for which to initially allocate space
1129 when matching. If this number is exceeded, we allocate more
1130 space, so it is not a hard limit. */
1131 #ifndef INIT_FAILURE_ALLOC
1132 #define INIT_FAILURE_ALLOC 5
1135 /* Roughly the maximum number of failure points on the stack. Would be
1136 exactly that if always used MAX_FAILURE_SPACE each time we failed.
1137 This is a variable only so users of regex can assign to it; we never
1138 change it ourselves. */
1139 #if defined (MATCH_MAY_ALLOCATE) || defined (REGEX_MALLOC)
1140 /* 4400 was enough to cause a crash on Alpha OSF/1,
1141 whose default stack limit is 2mb. */
1142 int re_max_failures = 20000;
1144 int re_max_failures = 2000;
1147 union fail_stack_elt
1153 typedef union fail_stack_elt fail_stack_elt_t;
1157 fail_stack_elt_t *stack;
1159 Element_count avail; /* Offset of next open position. */
1162 #define FAIL_STACK_EMPTY() (fail_stack.avail == 0)
1163 #define FAIL_STACK_PTR_EMPTY() (fail_stack_ptr->avail == 0)
1164 #define FAIL_STACK_FULL() (fail_stack.avail == fail_stack.size)
1167 /* Define macros to initialize and free the failure stack.
1168 Do `return -2' if the alloc fails. */
1170 #ifdef MATCH_MAY_ALLOCATE
1171 #define INIT_FAIL_STACK() \
1173 fail_stack.stack = (fail_stack_elt_t *) \
1174 REGEX_ALLOCATE_STACK (INIT_FAILURE_ALLOC * sizeof (fail_stack_elt_t)); \
1176 if (fail_stack.stack == NULL) \
1179 fail_stack.size = INIT_FAILURE_ALLOC; \
1180 fail_stack.avail = 0; \
1183 #define RESET_FAIL_STACK() REGEX_FREE_STACK (fail_stack.stack)
1185 #define INIT_FAIL_STACK() \
1187 fail_stack.avail = 0; \
1190 #define RESET_FAIL_STACK()
1194 /* Double the size of FAIL_STACK, up to approximately `re_max_failures' items.
1196 Return 1 if succeeds, and 0 if either ran out of memory
1197 allocating space for it or it was already too large.
1199 REGEX_REALLOCATE_STACK requires `destination' be declared. */
1201 #define DOUBLE_FAIL_STACK(fail_stack) \
1202 ((int) (fail_stack).size > re_max_failures * MAX_FAILURE_ITEMS \
1204 : ((fail_stack).stack = (fail_stack_elt_t *) \
1205 REGEX_REALLOCATE_STACK ((fail_stack).stack, \
1206 (fail_stack).size * sizeof (fail_stack_elt_t), \
1207 ((fail_stack).size << 1) * sizeof (fail_stack_elt_t)), \
1209 (fail_stack).stack == NULL \
1211 : ((fail_stack).size <<= 1, \
1215 /* Push pointer POINTER on FAIL_STACK.
1216 Return 1 if was able to do so and 0 if ran out of memory allocating
1218 #define PUSH_PATTERN_OP(POINTER, FAIL_STACK) \
1219 ((FAIL_STACK_FULL () \
1220 && !DOUBLE_FAIL_STACK (FAIL_STACK)) \
1222 : ((FAIL_STACK).stack[(FAIL_STACK).avail++].pointer = POINTER, \
1225 /* Push a pointer value onto the failure stack.
1226 Assumes the variable `fail_stack'. Probably should only
1227 be called from within `PUSH_FAILURE_POINT'. */
1228 #define PUSH_FAILURE_POINTER(item) \
1229 fail_stack.stack[fail_stack.avail++].pointer = (unsigned char *) (item)
1231 /* This pushes an integer-valued item onto the failure stack.
1232 Assumes the variable `fail_stack'. Probably should only
1233 be called from within `PUSH_FAILURE_POINT'. */
1234 #define PUSH_FAILURE_INT(item) \
1235 fail_stack.stack[fail_stack.avail++].integer = (item)
1237 /* Push a fail_stack_elt_t value onto the failure stack.
1238 Assumes the variable `fail_stack'. Probably should only
1239 be called from within `PUSH_FAILURE_POINT'. */
1240 #define PUSH_FAILURE_ELT(item) \
1241 fail_stack.stack[fail_stack.avail++] = (item)
1243 /* These three POP... operations complement the three PUSH... operations.
1244 All assume that `fail_stack' is nonempty. */
1245 #define POP_FAILURE_POINTER() fail_stack.stack[--fail_stack.avail].pointer
1246 #define POP_FAILURE_INT() fail_stack.stack[--fail_stack.avail].integer
1247 #define POP_FAILURE_ELT() fail_stack.stack[--fail_stack.avail]
1249 /* Used to omit pushing failure point id's when we're not debugging. */
1251 #define DEBUG_PUSH PUSH_FAILURE_INT
1252 #define DEBUG_POP(item_addr) *(item_addr) = POP_FAILURE_INT ()
1254 #define DEBUG_PUSH(item)
1255 #define DEBUG_POP(item_addr)
1259 /* Push the information about the state we will need
1260 if we ever fail back to it.
1262 Requires variables fail_stack, regstart, regend, reg_info, and
1263 num_regs be declared. DOUBLE_FAIL_STACK requires `destination' be
1266 Does `return FAILURE_CODE' if runs out of memory. */
1268 #if !defined (REGEX_MALLOC) && !defined (REL_ALLOC)
1269 #define DECLARE_DESTINATION char *destination
1271 #define DECLARE_DESTINATION DECLARE_NOTHING
1274 #define PUSH_FAILURE_POINT(pattern_place, string_place, failure_code) \
1276 DECLARE_DESTINATION; \
1277 /* Must be int, so when we don't save any registers, the arithmetic \
1278 of 0 + -1 isn't done as unsigned. */ \
1281 DEBUG_STATEMENT (failure_id++); \
1282 DEBUG_STATEMENT (nfailure_points_pushed++); \
1283 DEBUG_PRINT2 ("\nPUSH_FAILURE_POINT #%u:\n", failure_id); \
1284 DEBUG_PRINT2 (" Before push, next avail: %lu\n", \
1285 (unsigned long) (fail_stack).avail); \
1286 DEBUG_PRINT2 (" size: %lu\n", \
1287 (unsigned long) (fail_stack).size); \
1289 DEBUG_PRINT2 (" slots needed: %d\n", NUM_FAILURE_ITEMS); \
1290 DEBUG_PRINT2 (" available: %ld\n", \
1291 (long) REMAINING_AVAIL_SLOTS); \
1293 /* Ensure we have enough space allocated for what we will push. */ \
1294 while (REMAINING_AVAIL_SLOTS < NUM_FAILURE_ITEMS) \
1296 if (!DOUBLE_FAIL_STACK (fail_stack)) \
1297 return failure_code; \
1299 DEBUG_PRINT2 ("\n Doubled stack; size now: %lu\n", \
1300 (unsigned long) (fail_stack).size); \
1301 DEBUG_PRINT2 (" slots available: %ld\n", \
1302 (long) REMAINING_AVAIL_SLOTS); \
1305 /* Push the info, starting with the registers. */ \
1306 DEBUG_PRINT1 ("\n"); \
1308 for (this_reg = lowest_active_reg; this_reg <= highest_active_reg; \
1311 DEBUG_PRINT2 (" Pushing reg: %d\n", this_reg); \
1312 DEBUG_STATEMENT (num_regs_pushed++); \
1314 DEBUG_PRINT2 (" start: 0x%lx\n", (long) regstart[this_reg]); \
1315 PUSH_FAILURE_POINTER (regstart[this_reg]); \
1317 DEBUG_PRINT2 (" end: 0x%lx\n", (long) regend[this_reg]); \
1318 PUSH_FAILURE_POINTER (regend[this_reg]); \
1320 DEBUG_PRINT2 (" info: 0x%lx\n ", \
1321 * (long *) (®_info[this_reg])); \
1322 DEBUG_PRINT2 (" match_null=%d", \
1323 REG_MATCH_NULL_STRING_P (reg_info[this_reg])); \
1324 DEBUG_PRINT2 (" active=%d", IS_ACTIVE (reg_info[this_reg])); \
1325 DEBUG_PRINT2 (" matched_something=%d", \
1326 MATCHED_SOMETHING (reg_info[this_reg])); \
1327 DEBUG_PRINT2 (" ever_matched_something=%d", \
1328 EVER_MATCHED_SOMETHING (reg_info[this_reg])); \
1329 DEBUG_PRINT1 ("\n"); \
1330 PUSH_FAILURE_ELT (reg_info[this_reg].word); \
1333 DEBUG_PRINT2 (" Pushing low active reg: %d\n", lowest_active_reg); \
1334 PUSH_FAILURE_INT (lowest_active_reg); \
1336 DEBUG_PRINT2 (" Pushing high active reg: %d\n", highest_active_reg); \
1337 PUSH_FAILURE_INT (highest_active_reg); \
1339 DEBUG_PRINT2 (" Pushing pattern 0x%lx: \n", (long) pattern_place); \
1340 DEBUG_PRINT_COMPILED_PATTERN (bufp, pattern_place, pend); \
1341 PUSH_FAILURE_POINTER (pattern_place); \
1343 DEBUG_PRINT2 (" Pushing string 0x%lx: `", (long) string_place); \
1344 DEBUG_PRINT_DOUBLE_STRING (string_place, string1, size1, string2, \
1346 DEBUG_PRINT1 ("'\n"); \
1347 PUSH_FAILURE_POINTER (string_place); \
1349 DEBUG_PRINT2 (" Pushing failure id: %u\n", failure_id); \
1350 DEBUG_PUSH (failure_id); \
1353 /* This is the number of items that are pushed and popped on the stack
1354 for each register. */
1355 #define NUM_REG_ITEMS 3
1357 /* Individual items aside from the registers. */
1359 #define NUM_NONREG_ITEMS 5 /* Includes failure point id. */
1361 #define NUM_NONREG_ITEMS 4
1364 /* We push at most this many items on the stack. */
1365 /* We used to use (num_regs - 1), which is the number of registers
1366 this regexp will save; but that was changed to 5
1367 to avoid stack overflow for a regexp with lots of parens. */
1368 #define MAX_FAILURE_ITEMS (5 * NUM_REG_ITEMS + NUM_NONREG_ITEMS)
1370 /* We actually push this many items. */
1371 #define NUM_FAILURE_ITEMS \
1372 ((highest_active_reg - lowest_active_reg + 1) * NUM_REG_ITEMS \
1375 /* How many items can still be added to the stack without overflowing it. */
1376 #define REMAINING_AVAIL_SLOTS ((int) ((fail_stack).size - (fail_stack).avail))
1379 /* Pops what PUSH_FAIL_STACK pushes.
1381 We restore into the parameters, all of which should be lvalues:
1382 STR -- the saved data position.
1383 PAT -- the saved pattern position.
1384 LOW_REG, HIGH_REG -- the highest and lowest active registers.
1385 REGSTART, REGEND -- arrays of string positions.
1386 REG_INFO -- array of information about each subexpression.
1388 Also assumes the variables `fail_stack' and (if debugging), `bufp',
1389 `pend', `string1', `size1', `string2', and `size2'. */
1391 #define POP_FAILURE_POINT(str, pat, low_reg, high_reg, \
1392 regstart, regend, reg_info) \
1394 DEBUG_STATEMENT (fail_stack_elt_t ffailure_id;) \
1396 const unsigned char *string_temp; \
1398 assert (!FAIL_STACK_EMPTY ()); \
1400 /* Remove failure points and point to how many regs pushed. */ \
1401 DEBUG_PRINT1 ("POP_FAILURE_POINT:\n"); \
1402 DEBUG_PRINT2 (" Before pop, next avail: %lu\n", \
1403 (unsigned long) fail_stack.avail); \
1404 DEBUG_PRINT2 (" size: %lu\n", \
1405 (unsigned long) fail_stack.size); \
1407 assert (fail_stack.avail >= NUM_NONREG_ITEMS); \
1409 DEBUG_POP (&ffailure_id.integer); \
1410 DEBUG_PRINT2 (" Popping failure id: %u\n", \
1411 * (unsigned int *) &ffailure_id); \
1413 /* If the saved string location is NULL, it came from an \
1414 on_failure_keep_string_jump opcode, and we want to throw away the \
1415 saved NULL, thus retaining our current position in the string. */ \
1416 string_temp = POP_FAILURE_POINTER (); \
1417 if (string_temp != NULL) \
1418 str = string_temp; \
1420 DEBUG_PRINT2 (" Popping string 0x%lx: `", (long) str); \
1421 DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2); \
1422 DEBUG_PRINT1 ("'\n"); \
1424 pat = (unsigned char *) POP_FAILURE_POINTER (); \
1425 DEBUG_PRINT2 (" Popping pattern 0x%lx: ", (long) pat); \
1426 DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend); \
1428 /* Restore register info. */ \
1429 high_reg = (unsigned) POP_FAILURE_INT (); \
1430 DEBUG_PRINT2 (" Popping high active reg: %d\n", high_reg); \
1432 low_reg = (unsigned) POP_FAILURE_INT (); \
1433 DEBUG_PRINT2 (" Popping low active reg: %d\n", low_reg); \
1435 for (this_reg = high_reg; this_reg >= low_reg; this_reg--) \
1437 DEBUG_PRINT2 (" Popping reg: %d\n", this_reg); \
1439 reg_info[this_reg].word = POP_FAILURE_ELT (); \
1440 DEBUG_PRINT2 (" info: 0x%lx\n", \
1441 * (long *) ®_info[this_reg]); \
1443 regend[this_reg] = POP_FAILURE_POINTER (); \
1444 DEBUG_PRINT2 (" end: 0x%lx\n", (long) regend[this_reg]); \
1446 regstart[this_reg] = POP_FAILURE_POINTER (); \
1447 DEBUG_PRINT2 (" start: 0x%lx\n", (long) regstart[this_reg]); \
1450 set_regs_matched_done = 0; \
1451 DEBUG_STATEMENT (nfailure_points_popped++); \
1452 } while (0) /* POP_FAILURE_POINT */
1456 /* Structure for per-register (a.k.a. per-group) information.
1457 Other register information, such as the
1458 starting and ending positions (which are addresses), and the list of
1459 inner groups (which is a bits list) are maintained in separate
1462 We are making a (strictly speaking) nonportable assumption here: that
1463 the compiler will pack our bit fields into something that fits into
1464 the type of `word', i.e., is something that fits into one item on the
1469 fail_stack_elt_t word;
1472 /* This field is one if this group can match the empty string,
1473 zero if not. If not yet determined, `MATCH_NULL_UNSET_VALUE'. */
1474 #define MATCH_NULL_UNSET_VALUE 3
1475 unsigned match_null_string_p : 2;
1476 unsigned is_active : 1;
1477 unsigned matched_something : 1;
1478 unsigned ever_matched_something : 1;
1480 } register_info_type;
1482 #define REG_MATCH_NULL_STRING_P(R) ((R).bits.match_null_string_p)
1483 #define IS_ACTIVE(R) ((R).bits.is_active)
1484 #define MATCHED_SOMETHING(R) ((R).bits.matched_something)
1485 #define EVER_MATCHED_SOMETHING(R) ((R).bits.ever_matched_something)
1488 /* Call this when have matched a real character; it sets `matched' flags
1489 for the subexpressions which we are currently inside. Also records
1490 that those subexprs have matched. */
1491 #define SET_REGS_MATCHED() \
1494 if (!set_regs_matched_done) \
1497 set_regs_matched_done = 1; \
1498 for (r = lowest_active_reg; r <= highest_active_reg; r++) \
1500 MATCHED_SOMETHING (reg_info[r]) \
1501 = EVER_MATCHED_SOMETHING (reg_info[r]) \
1508 /* Registers are set to a sentinel when they haven't yet matched. */
1509 static unsigned char reg_unset_dummy;
1510 #define REG_UNSET_VALUE (®_unset_dummy)
1511 #define REG_UNSET(e) ((e) == REG_UNSET_VALUE)
1513 /* Subroutine declarations and macros for regex_compile. */
1515 /* Fetch the next character in the uncompiled pattern---translating it
1516 if necessary. Also cast from a signed character in the constant
1517 string passed to us by the user to an unsigned char that we can use
1518 as an array index (in, e.g., `translate'). */
1519 #define PATFETCH(c) \
1522 c = TRANSLATE (c); \
1525 /* Fetch the next character in the uncompiled pattern, with no
1527 #define PATFETCH_RAW(c) \
1528 do {if (p == pend) return REG_EEND; \
1529 assert (p < pend); \
1530 c = charptr_emchar (p); \
1534 /* Go backwards one character in the pattern. */
1535 #define PATUNFETCH DEC_CHARPTR (p)
1539 #define PATFETCH_EXTENDED(emch) \
1540 do {if (p == pend) return REG_EEND; \
1541 assert (p < pend); \
1542 emch = charptr_emchar ((const Bufbyte *) p); \
1544 if (TRANSLATE_P (translate) && emch < 0x80) \
1545 emch = (Emchar) (unsigned char) RE_TRANSLATE (emch); \
1548 #define PATFETCH_RAW_EXTENDED(emch) \
1549 do {if (p == pend) return REG_EEND; \
1550 assert (p < pend); \
1551 emch = charptr_emchar ((const Bufbyte *) p); \
1555 #define PATUNFETCH_EXTENDED DEC_CHARPTR (p)
1557 #define PATFETCH_EITHER(emch) \
1559 if (has_extended_chars) \
1560 PATFETCH_EXTENDED (emch); \
1565 #define PATFETCH_RAW_EITHER(emch) \
1567 if (has_extended_chars) \
1568 PATFETCH_RAW_EXTENDED (emch); \
1570 PATFETCH_RAW (emch); \
1573 #define PATUNFETCH_EITHER \
1575 if (has_extended_chars) \
1576 PATUNFETCH_EXTENDED (emch); \
1578 PATUNFETCH (emch); \
1581 #else /* not MULE */
1583 #define PATFETCH_EITHER(emch) PATFETCH (emch)
1584 #define PATFETCH_RAW_EITHER(emch) PATFETCH_RAW (emch)
1585 #define PATUNFETCH_EITHER PATUNFETCH
1589 /* If `translate' is non-null, return translate[D], else just D. We
1590 cast the subscript to translate because some data is declared as
1591 `char *', to avoid warnings when a string constant is passed. But
1592 when we use a character as a subscript we must make it unsigned. */
1593 #define TRANSLATE(d) (TRANSLATE_P (translate) ? RE_TRANSLATE (d) : (d))
1595 /* Macros for outputting the compiled pattern into `buffer'. */
1597 /* If the buffer isn't allocated when it comes in, use this. */
1598 #define INIT_BUF_SIZE 32
1600 /* Make sure we have at least N more bytes of space in buffer. */
1601 #define GET_BUFFER_SPACE(n) \
1602 while (buf_end - bufp->buffer + (n) > (ptrdiff_t) bufp->allocated) \
1605 /* Make sure we have one more byte of buffer space and then add C to it. */
1606 #define BUF_PUSH(c) \
1608 GET_BUFFER_SPACE (1); \
1609 *buf_end++ = (unsigned char) (c); \
1613 /* Ensure we have two more bytes of buffer space and then append C1 and C2. */
1614 #define BUF_PUSH_2(c1, c2) \
1616 GET_BUFFER_SPACE (2); \
1617 *buf_end++ = (unsigned char) (c1); \
1618 *buf_end++ = (unsigned char) (c2); \
1622 /* As with BUF_PUSH_2, except for three bytes. */
1623 #define BUF_PUSH_3(c1, c2, c3) \
1625 GET_BUFFER_SPACE (3); \
1626 *buf_end++ = (unsigned char) (c1); \
1627 *buf_end++ = (unsigned char) (c2); \
1628 *buf_end++ = (unsigned char) (c3); \
1632 /* Store a jump with opcode OP at LOC to location TO. We store a
1633 relative address offset by the three bytes the jump itself occupies. */
1634 #define STORE_JUMP(op, loc, to) \
1635 store_op1 (op, loc, (to) - (loc) - 3)
1637 /* Likewise, for a two-argument jump. */
1638 #define STORE_JUMP2(op, loc, to, arg) \
1639 store_op2 (op, loc, (to) - (loc) - 3, arg)
1641 /* Like `STORE_JUMP', but for inserting. Assume `buf_end' is the
1643 #define INSERT_JUMP(op, loc, to) \
1644 insert_op1 (op, loc, (to) - (loc) - 3, buf_end)
1646 /* Like `STORE_JUMP2', but for inserting. Assume `buf_end' is the
1648 #define INSERT_JUMP2(op, loc, to, arg) \
1649 insert_op2 (op, loc, (to) - (loc) - 3, arg, buf_end)
1652 /* This is not an arbitrary limit: the arguments which represent offsets
1653 into the pattern are two bytes long. So if 2^16 bytes turns out to
1654 be too small, many things would have to change. */
1655 #define MAX_BUF_SIZE (1L << 16)
1658 /* Extend the buffer by twice its current size via realloc and
1659 reset the pointers that pointed into the old block to point to the
1660 correct places in the new one. If extending the buffer results in it
1661 being larger than MAX_BUF_SIZE, then flag memory exhausted. */
1662 #define EXTEND_BUFFER() \
1664 re_char *old_buffer = bufp->buffer; \
1665 if (bufp->allocated == MAX_BUF_SIZE) \
1667 bufp->allocated <<= 1; \
1668 if (bufp->allocated > MAX_BUF_SIZE) \
1669 bufp->allocated = MAX_BUF_SIZE; \
1670 bufp->buffer = (unsigned char *) realloc (bufp->buffer, bufp->allocated);\
1671 if (bufp->buffer == NULL) \
1672 return REG_ESPACE; \
1673 /* If the buffer moved, move all the pointers into it. */ \
1674 if (old_buffer != bufp->buffer) \
1676 buf_end = (buf_end - old_buffer) + bufp->buffer; \
1677 begalt = (begalt - old_buffer) + bufp->buffer; \
1678 if (fixup_alt_jump) \
1679 fixup_alt_jump = (fixup_alt_jump - old_buffer) + bufp->buffer;\
1681 laststart = (laststart - old_buffer) + bufp->buffer; \
1682 if (pending_exact) \
1683 pending_exact = (pending_exact - old_buffer) + bufp->buffer; \
1688 /* Since we have one byte reserved for the register number argument to
1689 {start,stop}_memory, the maximum number of groups we can report
1690 things about is what fits in that byte. */
1691 #define MAX_REGNUM 255
1693 /* But patterns can have more than `MAX_REGNUM' registers. We just
1694 ignore the excess. */
1695 typedef unsigned regnum_t;
1698 /* Macros for the compile stack. */
1700 /* Since offsets can go either forwards or backwards, this type needs to
1701 be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1. */
1702 typedef int pattern_offset_t;
1706 pattern_offset_t begalt_offset;
1707 pattern_offset_t fixup_alt_jump;
1708 pattern_offset_t inner_group_offset;
1709 pattern_offset_t laststart_offset;
1711 } compile_stack_elt_t;
1716 compile_stack_elt_t *stack;
1718 unsigned avail; /* Offset of next open position. */
1719 } compile_stack_type;
1722 #define INIT_COMPILE_STACK_SIZE 32
1724 #define COMPILE_STACK_EMPTY (compile_stack.avail == 0)
1725 #define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size)
1727 /* The next available element. */
1728 #define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])
1731 /* Set the bit for character C in a bit vector. */
1732 #define SET_LIST_BIT(c) \
1733 (buf_end[((unsigned char) (c)) / BYTEWIDTH] \
1734 |= 1 << (((unsigned char) c) % BYTEWIDTH))
1738 /* Set the "bit" for character C in a range table. */
1739 #define SET_RANGETAB_BIT(c) put_range_table (rtab, c, c, Qt)
1741 /* Set the "bit" for character c in the appropriate table. */
1742 #define SET_EITHER_BIT(c) \
1744 if (has_extended_chars) \
1745 SET_RANGETAB_BIT (c); \
1750 #else /* not MULE */
1752 #define SET_EITHER_BIT(c) SET_LIST_BIT (c)
1757 /* Get the next unsigned number in the uncompiled pattern. */
1758 #define GET_UNSIGNED_NUMBER(num) \
1762 while (ISDIGIT (c)) \
1766 num = num * 10 + c - '0'; \
1774 #define CHAR_CLASS_MAX_LENGTH 6 /* Namely, `xdigit'. */
1776 #define IS_CHAR_CLASS(string) \
1777 (STREQ (string, "alpha") || STREQ (string, "upper") \
1778 || STREQ (string, "lower") || STREQ (string, "digit") \
1779 || STREQ (string, "alnum") || STREQ (string, "xdigit") \
1780 || STREQ (string, "space") || STREQ (string, "print") \
1781 || STREQ (string, "punct") || STREQ (string, "graph") \
1782 || STREQ (string, "cntrl") || STREQ (string, "blank"))
1784 static void store_op1 (re_opcode_t op, unsigned char *loc, int arg);
1785 static void store_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2);
1786 static void insert_op1 (re_opcode_t op, unsigned char *loc, int arg,
1787 unsigned char *end);
1788 static void insert_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2,
1789 unsigned char *end);
1790 static re_bool at_begline_loc_p (re_char *pattern, re_char *p,
1791 reg_syntax_t syntax);
1792 static re_bool at_endline_loc_p (re_char *p, re_char *pend, int syntax);
1793 static re_bool group_in_compile_stack (compile_stack_type compile_stack,
1795 static reg_errcode_t compile_range (re_char **p_ptr, re_char *pend,
1796 RE_TRANSLATE_TYPE translate,
1797 reg_syntax_t syntax,
1800 static reg_errcode_t compile_extended_range (re_char **p_ptr,
1802 RE_TRANSLATE_TYPE translate,
1803 reg_syntax_t syntax,
1806 static re_bool group_match_null_string_p (unsigned char **p,
1808 register_info_type *reg_info);
1809 static re_bool alt_match_null_string_p (unsigned char *p, unsigned char *end,
1810 register_info_type *reg_info);
1811 static re_bool common_op_match_null_string_p (unsigned char **p,
1813 register_info_type *reg_info);
1814 static int bcmp_translate (const unsigned char *s1, const unsigned char *s2,
1815 REGISTER int len, RE_TRANSLATE_TYPE translate);
1816 static int re_match_2_internal (struct re_pattern_buffer *bufp,
1817 re_char *string1, int size1,
1818 re_char *string2, int size2, int pos,
1819 struct re_registers *regs, int stop);
1821 #ifndef MATCH_MAY_ALLOCATE
1823 /* If we cannot allocate large objects within re_match_2_internal,
1824 we make the fail stack and register vectors global.
1825 The fail stack, we grow to the maximum size when a regexp
1827 The register vectors, we adjust in size each time we
1828 compile a regexp, according to the number of registers it needs. */
1830 static fail_stack_type fail_stack;
1832 /* Size with which the following vectors are currently allocated.
1833 That is so we can make them bigger as needed,
1834 but never make them smaller. */
1835 static int regs_allocated_size;
1837 static re_char ** regstart, ** regend;
1838 static re_char ** old_regstart, ** old_regend;
1839 static re_char **best_regstart, **best_regend;
1840 static register_info_type *reg_info;
1841 static re_char **reg_dummy;
1842 static register_info_type *reg_info_dummy;
1844 /* Make the register vectors big enough for NUM_REGS registers,
1845 but don't make them smaller. */
1848 regex_grow_registers (int num_regs)
1850 if (num_regs > regs_allocated_size)
1852 RETALLOC_IF (regstart, num_regs, re_char *);
1853 RETALLOC_IF (regend, num_regs, re_char *);
1854 RETALLOC_IF (old_regstart, num_regs, re_char *);
1855 RETALLOC_IF (old_regend, num_regs, re_char *);
1856 RETALLOC_IF (best_regstart, num_regs, re_char *);
1857 RETALLOC_IF (best_regend, num_regs, re_char *);
1858 RETALLOC_IF (reg_info, num_regs, register_info_type);
1859 RETALLOC_IF (reg_dummy, num_regs, re_char *);
1860 RETALLOC_IF (reg_info_dummy, num_regs, register_info_type);
1862 regs_allocated_size = num_regs;
1866 #endif /* not MATCH_MAY_ALLOCATE */
1868 /* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX.
1869 Returns one of error codes defined in `regex.h', or zero for success.
1871 Assumes the `allocated' (and perhaps `buffer') and `translate'
1872 fields are set in BUFP on entry.
1874 If it succeeds, results are put in BUFP (if it returns an error, the
1875 contents of BUFP are undefined):
1876 `buffer' is the compiled pattern;
1877 `syntax' is set to SYNTAX;
1878 `used' is set to the length of the compiled pattern;
1879 `fastmap_accurate' is zero;
1880 `re_nsub' is the number of subexpressions in PATTERN;
1881 `not_bol' and `not_eol' are zero;
1883 The `fastmap' and `newline_anchor' fields are neither
1884 examined nor set. */
1886 /* Return, freeing storage we allocated. */
1887 #define FREE_STACK_RETURN(value) \
1888 return (free (compile_stack.stack), value)
1890 static reg_errcode_t
1891 regex_compile (re_char *pattern, int size, reg_syntax_t syntax,
1892 struct re_pattern_buffer *bufp)
1894 /* We fetch characters from PATTERN here. We declare these as int
1895 (or possibly long) so that chars above 127 can be used as
1896 array indices. The macros that fetch a character from the pattern
1897 make sure to coerce to unsigned char before assigning, so we won't
1898 get bitten by negative numbers here. */
1899 /* XEmacs change: used to be unsigned char. */
1900 REGISTER EMACS_INT c, c1;
1902 /* A random temporary spot in PATTERN. */
1905 /* Points to the end of the buffer, where we should append. */
1906 REGISTER unsigned char *buf_end;
1908 /* Keeps track of unclosed groups. */
1909 compile_stack_type compile_stack;
1911 /* Points to the current (ending) position in the pattern. */
1912 re_char *p = pattern;
1913 re_char *pend = pattern + size;
1915 /* How to translate the characters in the pattern. */
1916 RE_TRANSLATE_TYPE translate = bufp->translate;
1918 /* Address of the count-byte of the most recently inserted `exactn'
1919 command. This makes it possible to tell if a new exact-match
1920 character can be added to that command or if the character requires
1921 a new `exactn' command. */
1922 unsigned char *pending_exact = 0;
1924 /* Address of start of the most recently finished expression.
1925 This tells, e.g., postfix * where to find the start of its
1926 operand. Reset at the beginning of groups and alternatives. */
1927 unsigned char *laststart = 0;
1929 /* Address of beginning of regexp, or inside of last group. */
1930 unsigned char *begalt;
1932 /* Place in the uncompiled pattern (i.e., the {) to
1933 which to go back if the interval is invalid. */
1934 re_char *beg_interval;
1936 /* Address of the place where a forward jump should go to the end of
1937 the containing expression. Each alternative of an `or' -- except the
1938 last -- ends with a forward jump of this sort. */
1939 unsigned char *fixup_alt_jump = 0;
1941 /* Counts open-groups as they are encountered. Remembered for the
1942 matching close-group on the compile stack, so the same register
1943 number is put in the stop_memory as the start_memory. */
1944 regnum_t regnum = 0;
1947 DEBUG_PRINT1 ("\nCompiling pattern: ");
1952 for (debug_count = 0; debug_count < size; debug_count++)
1953 putchar (pattern[debug_count]);
1958 /* Initialize the compile stack. */
1959 compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t);
1960 if (compile_stack.stack == NULL)
1963 compile_stack.size = INIT_COMPILE_STACK_SIZE;
1964 compile_stack.avail = 0;
1966 /* Initialize the pattern buffer. */
1967 bufp->syntax = syntax;
1968 bufp->fastmap_accurate = 0;
1969 bufp->not_bol = bufp->not_eol = 0;
1971 /* Set `used' to zero, so that if we return an error, the pattern
1972 printer (for debugging) will think there's no pattern. We reset it
1976 /* Always count groups, whether or not bufp->no_sub is set. */
1979 #if !defined (emacs) && !defined (SYNTAX_TABLE)
1980 /* Initialize the syntax table. */
1981 init_syntax_once ();
1984 if (bufp->allocated == 0)
1987 { /* If zero allocated, but buffer is non-null, try to realloc
1988 enough space. This loses if buffer's address is bogus, but
1989 that is the user's responsibility. */
1990 RETALLOC (bufp->buffer, INIT_BUF_SIZE, unsigned char);
1993 { /* Caller did not allocate a buffer. Do it for them. */
1994 bufp->buffer = TALLOC (INIT_BUF_SIZE, unsigned char);
1996 if (!bufp->buffer) FREE_STACK_RETURN (REG_ESPACE);
1998 bufp->allocated = INIT_BUF_SIZE;
2001 begalt = buf_end = bufp->buffer;
2003 /* Loop through the uncompiled pattern until we're at the end. */
2012 if ( /* If at start of pattern, it's an operator. */
2014 /* If context independent, it's an operator. */
2015 || syntax & RE_CONTEXT_INDEP_ANCHORS
2016 /* Otherwise, depends on what's come before. */
2017 || at_begline_loc_p (pattern, p, syntax))
2027 if ( /* If at end of pattern, it's an operator. */
2029 /* If context independent, it's an operator. */
2030 || syntax & RE_CONTEXT_INDEP_ANCHORS
2031 /* Otherwise, depends on what's next. */
2032 || at_endline_loc_p (p, pend, syntax))
2042 if ((syntax & RE_BK_PLUS_QM)
2043 || (syntax & RE_LIMITED_OPS))
2047 /* If there is no previous pattern... */
2050 if (syntax & RE_CONTEXT_INVALID_OPS)
2051 FREE_STACK_RETURN (REG_BADRPT);
2052 else if (!(syntax & RE_CONTEXT_INDEP_OPS))
2057 /* true means zero/many matches are allowed. */
2058 re_bool zero_times_ok = c != '+';
2059 re_bool many_times_ok = c != '?';
2061 /* true means match shortest string possible. */
2062 re_bool minimal = false;
2064 /* If there is a sequence of repetition chars, collapse it
2065 down to just one (the right one). We can't combine
2066 interval operators with these because of, e.g., `a{2}*',
2067 which should only match an even number of `a's. */
2072 if (c == '*' || (!(syntax & RE_BK_PLUS_QM)
2073 && (c == '+' || c == '?')))
2076 else if (syntax & RE_BK_PLUS_QM && c == '\\')
2078 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
2081 if (!(c1 == '+' || c1 == '?'))
2096 /* If we get here, we found another repeat character. */
2097 if (!(syntax & RE_NO_MINIMAL_MATCHING))
2099 /* "*?" and "+?" and "??" are okay (and mean match
2100 minimally), but other sequences (such as "*??" and
2101 "+++") are rejected (reserved for future use). */
2102 if (minimal || c != '?')
2103 FREE_STACK_RETURN (REG_BADRPT);
2108 zero_times_ok |= c != '+';
2109 many_times_ok |= c != '?';
2113 /* Star, etc. applied to an empty pattern is equivalent
2114 to an empty pattern. */
2118 /* Now we know whether zero matches is allowed
2119 and whether two or more matches is allowed
2120 and whether we want minimal or maximal matching. */
2126 0: /on_failure_jump to 6
2131 GET_BUFFER_SPACE (6);
2132 INSERT_JUMP (jump, laststart, buf_end + 3);
2134 INSERT_JUMP (on_failure_jump, laststart, laststart + 6);
2137 else if (zero_times_ok)
2142 6: /on_failure_jump to 3
2145 GET_BUFFER_SPACE (6);
2146 INSERT_JUMP (jump, laststart, buf_end + 3);
2148 STORE_JUMP (on_failure_jump, buf_end, laststart + 3);
2155 3: /on_failure_jump to 0
2158 GET_BUFFER_SPACE (3);
2159 STORE_JUMP (on_failure_jump, buf_end, laststart);
2165 /* Are we optimizing this jump? */
2166 re_bool keep_string_p = false;
2169 { /* More than one repetition is allowed, so put in
2170 at the end a backward relative jump from
2171 `buf_end' to before the next jump we're going
2172 to put in below (which jumps from laststart to
2175 But if we are at the `*' in the exact sequence `.*\n',
2176 insert an unconditional jump backwards to the .,
2177 instead of the beginning of the loop. This way we only
2178 push a failure point once, instead of every time
2179 through the loop. */
2180 assert (p - 1 > pattern);
2182 /* Allocate the space for the jump. */
2183 GET_BUFFER_SPACE (3);
2185 /* We know we are not at the first character of the
2186 pattern, because laststart was nonzero. And we've
2187 already incremented `p', by the way, to be the
2188 character after the `*'. Do we have to do something
2189 analogous here for null bytes, because of
2193 && p < pend && *p == '\n'
2194 && !(syntax & RE_DOT_NEWLINE))
2195 { /* We have .*\n. */
2196 STORE_JUMP (jump, buf_end, laststart);
2197 keep_string_p = true;
2200 /* Anything else. */
2201 STORE_JUMP (maybe_pop_jump, buf_end, laststart - 3);
2203 /* We've added more stuff to the buffer. */
2207 /* On failure, jump from laststart to buf_end + 3,
2208 which will be the end of the buffer after this jump
2210 GET_BUFFER_SPACE (3);
2211 INSERT_JUMP (keep_string_p ? on_failure_keep_string_jump
2213 laststart, buf_end + 3);
2218 /* At least one repetition is required, so insert a
2219 `dummy_failure_jump' before the initial
2220 `on_failure_jump' instruction of the loop. This
2221 effects a skip over that instruction the first time
2222 we hit that loop. */
2223 GET_BUFFER_SPACE (3);
2224 INSERT_JUMP (dummy_failure_jump, laststart, laststart + 6);
2234 laststart = buf_end;
2241 /* XEmacs change: this whole section */
2242 re_bool had_char_class = false;
2244 re_bool has_extended_chars = false;
2245 REGISTER Lisp_Object rtab = Qnil;
2248 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2250 /* Ensure that we have enough space to push a charset: the
2251 opcode, the length count, and the bitset; 34 bytes in all. */
2252 GET_BUFFER_SPACE (34);
2254 laststart = buf_end;
2256 /* We test `*p == '^' twice, instead of using an if
2257 statement, so we only need one BUF_PUSH. */
2258 BUF_PUSH (*p == '^' ? charset_not : charset);
2262 /* Remember the first position in the bracket expression. */
2265 /* Push the number of bytes in the bitmap. */
2266 BUF_PUSH ((1 << BYTEWIDTH) / BYTEWIDTH);
2268 /* Clear the whole map. */
2269 memset (buf_end, 0, (1 << BYTEWIDTH) / BYTEWIDTH);
2271 /* charset_not matches newline according to a syntax bit. */
2272 if ((re_opcode_t) buf_end[-2] == charset_not
2273 && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
2274 SET_LIST_BIT ('\n');
2277 start_over_with_extended:
2278 if (has_extended_chars)
2280 /* There are extended chars here, which means we need to start
2281 over and shift to unified range-table format. */
2282 if (buf_end[-2] == charset)
2283 buf_end[-2] = charset_mule;
2285 buf_end[-2] = charset_mule_not;
2287 p = p1; /* go back to the beginning of the charset, after
2289 rtab = Vthe_lisp_rangetab;
2290 Fclear_range_table (rtab);
2292 /* charset_not matches newline according to a syntax bit. */
2293 if ((re_opcode_t) buf_end[-1] == charset_mule_not
2294 && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
2295 SET_EITHER_BIT ('\n');
2299 /* Read in characters and ranges, setting map bits. */
2302 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2307 if (c >= 0x80 && !has_extended_chars)
2309 has_extended_chars = 1;
2310 /* Frumble-bumble, we've found some extended chars.
2311 Need to start over, process everything using
2312 the general extended-char mechanism, and need
2313 to use charset_mule and charset_mule_not instead
2314 of charset and charset_not. */
2315 goto start_over_with_extended;
2318 /* \ might escape characters inside [...] and [^...]. */
2319 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\')
2321 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
2325 if (c1 >= 0x80 && !has_extended_chars)
2327 has_extended_chars = 1;
2328 goto start_over_with_extended;
2331 SET_EITHER_BIT (c1);
2335 /* Could be the end of the bracket expression. If it's
2336 not (i.e., when the bracket expression is `[]' so
2337 far), the ']' character bit gets set way below. */
2338 if (c == ']' && p != p1 + 1)
2341 /* Look ahead to see if it's a range when the last thing
2342 was a character class. */
2343 if (had_char_class && c == '-' && *p != ']')
2344 FREE_STACK_RETURN (REG_ERANGE);
2346 /* Look ahead to see if it's a range when the last thing
2347 was a character: if this is a hyphen not at the
2348 beginning or the end of a list, then it's the range
2351 && !(p - 2 >= pattern && p[-2] == '[')
2352 && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^')
2358 if (* (unsigned char *) p >= 0x80 && !has_extended_chars)
2360 has_extended_chars = 1;
2361 goto start_over_with_extended;
2363 if (has_extended_chars)
2364 ret = compile_extended_range (&p, pend, translate,
2368 ret = compile_range (&p, pend, translate, syntax, buf_end);
2369 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret);
2372 else if (p[0] == '-' && p[1] != ']')
2373 { /* This handles ranges made up of characters only. */
2376 /* Move past the `-'. */
2380 if (* (unsigned char *) p >= 0x80 && !has_extended_chars)
2382 has_extended_chars = 1;
2383 goto start_over_with_extended;
2385 if (has_extended_chars)
2386 ret = compile_extended_range (&p, pend, translate,
2390 ret = compile_range (&p, pend, translate, syntax, buf_end);
2391 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret);
2394 /* See if we're at the beginning of a possible character
2397 else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == ':')
2398 { /* Leave room for the null. */
2399 char str[CHAR_CLASS_MAX_LENGTH + 1];
2404 /* If pattern is `[[:'. */
2405 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2409 /* #### This code is unused.
2410 Correctness is not checked after TRT
2413 if (c == ':' || c == ']' || p == pend
2414 || c1 == CHAR_CLASS_MAX_LENGTH)
2416 str[c1++] = (char) c;
2420 /* If isn't a word bracketed by `[:' and `:]':
2421 undo the ending character, the letters, and leave
2422 the leading `:' and `[' (but set bits for them). */
2423 if (c == ':' && *p == ']')
2426 re_bool is_alnum = STREQ (str, "alnum");
2427 re_bool is_alpha = STREQ (str, "alpha");
2428 re_bool is_blank = STREQ (str, "blank");
2429 re_bool is_cntrl = STREQ (str, "cntrl");
2430 re_bool is_digit = STREQ (str, "digit");
2431 re_bool is_graph = STREQ (str, "graph");
2432 re_bool is_lower = STREQ (str, "lower");
2433 re_bool is_print = STREQ (str, "print");
2434 re_bool is_punct = STREQ (str, "punct");
2435 re_bool is_space = STREQ (str, "space");
2436 re_bool is_upper = STREQ (str, "upper");
2437 re_bool is_xdigit = STREQ (str, "xdigit");
2439 if (!IS_CHAR_CLASS (str))
2440 FREE_STACK_RETURN (REG_ECTYPE);
2442 /* Throw away the ] at the end of the character
2446 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2448 for (ch = 0; ch < 1 << BYTEWIDTH; ch++)
2450 /* This was split into 3 if's to
2451 avoid an arbitrary limit in some compiler. */
2452 if ( (is_alnum && ISALNUM (ch))
2453 || (is_alpha && ISALPHA (ch))
2454 || (is_blank && ISBLANK (ch))
2455 || (is_cntrl && ISCNTRL (ch)))
2456 SET_EITHER_BIT (ch);
2457 if ( (is_digit && ISDIGIT (ch))
2458 || (is_graph && ISGRAPH (ch))
2459 || (is_lower && ISLOWER (ch))
2460 || (is_print && ISPRINT (ch)))
2461 SET_EITHER_BIT (ch);
2462 if ( (is_punct && ISPUNCT (ch))
2463 || (is_space && ISSPACE (ch))
2464 || (is_upper && ISUPPER (ch))
2465 || (is_xdigit && ISXDIGIT (ch)))
2466 SET_EITHER_BIT (ch);
2468 had_char_class = true;
2475 SET_EITHER_BIT ('[');
2476 SET_EITHER_BIT (':');
2477 had_char_class = false;
2482 had_char_class = false;
2488 if (has_extended_chars)
2490 /* We have a range table, not a bit vector. */
2492 unified_range_table_bytes_needed (rtab);
2493 GET_BUFFER_SPACE (bytes_needed);
2494 unified_range_table_copy_data (rtab, buf_end);
2495 buf_end += unified_range_table_bytes_used (buf_end);
2499 /* Discard any (non)matching list bytes that are all 0 at the
2500 end of the map. Decrease the map-length byte too. */
2501 while ((int) buf_end[-1] > 0 && buf_end[buf_end[-1] - 1] == 0)
2503 buf_end += buf_end[-1];
2509 if (syntax & RE_NO_BK_PARENS)
2516 if (syntax & RE_NO_BK_PARENS)
2523 if (syntax & RE_NEWLINE_ALT)
2530 if (syntax & RE_NO_BK_VBAR)
2537 if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES)
2538 goto handle_interval;
2544 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
2546 /* Do not translate the character after the \, so that we can
2547 distinguish, e.g., \B from \b, even if we normally would
2548 translate, e.g., B to b. */
2554 if (syntax & RE_NO_BK_PARENS)
2555 goto normal_backslash;
2561 if (!(syntax & RE_NO_SHY_GROUPS)
2569 case ':': /* shy groups */
2573 /* All others are reserved for future constructs. */
2575 FREE_STACK_RETURN (REG_BADPAT);
2584 if (COMPILE_STACK_FULL)
2586 RETALLOC (compile_stack.stack, compile_stack.size << 1,
2587 compile_stack_elt_t);
2588 if (compile_stack.stack == NULL) return REG_ESPACE;
2590 compile_stack.size <<= 1;
2593 /* These are the values to restore when we hit end of this
2594 group. They are all relative offsets, so that if the
2595 whole pattern moves because of realloc, they will still
2597 COMPILE_STACK_TOP.begalt_offset = begalt - bufp->buffer;
2598 COMPILE_STACK_TOP.fixup_alt_jump
2599 = fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0;
2600 COMPILE_STACK_TOP.laststart_offset = buf_end - bufp->buffer;
2601 COMPILE_STACK_TOP.regnum = r;
2603 /* We will eventually replace the 0 with the number of
2604 groups inner to this one. But do not push a
2605 start_memory for groups beyond the last one we can
2606 represent in the compiled pattern. */
2607 if (r <= MAX_REGNUM)
2609 COMPILE_STACK_TOP.inner_group_offset
2610 = buf_end - bufp->buffer + 2;
2611 BUF_PUSH_3 (start_memory, r, 0);
2614 compile_stack.avail++;
2619 /* If we've reached MAX_REGNUM groups, then this open
2620 won't actually generate any code, so we'll have to
2621 clear pending_exact explicitly. */
2628 if (syntax & RE_NO_BK_PARENS) goto normal_backslash;
2630 if (COMPILE_STACK_EMPTY) {
2631 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
2632 goto normal_backslash;
2634 FREE_STACK_RETURN (REG_ERPAREN);
2639 { /* Push a dummy failure point at the end of the
2640 alternative for a possible future
2641 `pop_failure_jump' to pop. See comments at
2642 `push_dummy_failure' in `re_match_2'. */
2643 BUF_PUSH (push_dummy_failure);
2645 /* We allocated space for this jump when we assigned
2646 to `fixup_alt_jump', in the `handle_alt' case below. */
2647 STORE_JUMP (jump_past_alt, fixup_alt_jump, buf_end - 1);
2650 /* See similar code for backslashed left paren above. */
2651 if (COMPILE_STACK_EMPTY) {
2652 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
2655 FREE_STACK_RETURN (REG_ERPAREN);
2658 /* Since we just checked for an empty stack above, this
2659 ``can't happen''. */
2660 assert (compile_stack.avail != 0);
2662 /* We don't just want to restore into `regnum', because
2663 later groups should continue to be numbered higher,
2664 as in `(ab)c(de)' -- the second group is #2. */
2665 regnum_t this_group_regnum;
2667 compile_stack.avail--;
2668 begalt = bufp->buffer + COMPILE_STACK_TOP.begalt_offset;
2670 = COMPILE_STACK_TOP.fixup_alt_jump
2671 ? bufp->buffer + COMPILE_STACK_TOP.fixup_alt_jump - 1
2673 laststart = bufp->buffer + COMPILE_STACK_TOP.laststart_offset;
2674 this_group_regnum = COMPILE_STACK_TOP.regnum;
2675 /* If we've reached MAX_REGNUM groups, then this open
2676 won't actually generate any code, so we'll have to
2677 clear pending_exact explicitly. */
2680 /* We're at the end of the group, so now we know how many
2681 groups were inside this one. */
2682 if (this_group_regnum <= MAX_REGNUM)
2684 unsigned char *inner_group_loc
2685 = bufp->buffer + COMPILE_STACK_TOP.inner_group_offset;
2687 *inner_group_loc = regnum - this_group_regnum;
2688 BUF_PUSH_3 (stop_memory, this_group_regnum,
2689 regnum - this_group_regnum);
2695 case '|': /* `\|'. */
2696 if (syntax & RE_LIMITED_OPS || syntax & RE_NO_BK_VBAR)
2697 goto normal_backslash;
2699 if (syntax & RE_LIMITED_OPS)
2702 /* Insert before the previous alternative a jump which
2703 jumps to this alternative if the former fails. */
2704 GET_BUFFER_SPACE (3);
2705 INSERT_JUMP (on_failure_jump, begalt, buf_end + 6);
2709 /* The alternative before this one has a jump after it
2710 which gets executed if it gets matched. Adjust that
2711 jump so it will jump to this alternative's analogous
2712 jump (put in below, which in turn will jump to the next
2713 (if any) alternative's such jump, etc.). The last such
2714 jump jumps to the correct final destination. A picture:
2720 If we are at `b', then fixup_alt_jump right now points to a
2721 three-byte space after `a'. We'll put in the jump, set
2722 fixup_alt_jump to right after `b', and leave behind three
2723 bytes which we'll fill in when we get to after `c'. */
2726 STORE_JUMP (jump_past_alt, fixup_alt_jump, buf_end);
2728 /* Mark and leave space for a jump after this alternative,
2729 to be filled in later either by next alternative or
2730 when know we're at the end of a series of alternatives. */
2731 fixup_alt_jump = buf_end;
2732 GET_BUFFER_SPACE (3);
2741 /* If \{ is a literal. */
2742 if (!(syntax & RE_INTERVALS)
2743 /* If we're at `\{' and it's not the open-interval
2745 || ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
2746 || (p - 2 == pattern && p == pend))
2747 goto normal_backslash;
2751 /* If got here, then the syntax allows intervals. */
2753 /* At least (most) this many matches must be made. */
2754 int lower_bound = -1, upper_bound = -1;
2756 beg_interval = p - 1;
2760 if (syntax & RE_NO_BK_BRACES)
2761 goto unfetch_interval;
2763 FREE_STACK_RETURN (REG_EBRACE);
2766 GET_UNSIGNED_NUMBER (lower_bound);
2770 GET_UNSIGNED_NUMBER (upper_bound);
2771 if (upper_bound < 0) upper_bound = RE_DUP_MAX;
2774 /* Interval such as `{1}' => match exactly once. */
2775 upper_bound = lower_bound;
2777 if (lower_bound < 0 || upper_bound > RE_DUP_MAX
2778 || lower_bound > upper_bound)
2780 if (syntax & RE_NO_BK_BRACES)
2781 goto unfetch_interval;
2783 FREE_STACK_RETURN (REG_BADBR);
2786 if (!(syntax & RE_NO_BK_BRACES))
2788 if (c != '\\') FREE_STACK_RETURN (REG_EBRACE);
2795 if (syntax & RE_NO_BK_BRACES)
2796 goto unfetch_interval;
2798 FREE_STACK_RETURN (REG_BADBR);
2801 /* We just parsed a valid interval. */
2803 /* If it's invalid to have no preceding re. */
2806 if (syntax & RE_CONTEXT_INVALID_OPS)
2807 FREE_STACK_RETURN (REG_BADRPT);
2808 else if (syntax & RE_CONTEXT_INDEP_OPS)
2809 laststart = buf_end;
2811 goto unfetch_interval;
2814 /* If the upper bound is zero, don't want to succeed at
2815 all; jump from `laststart' to `b + 3', which will be
2816 the end of the buffer after we insert the jump. */
2817 if (upper_bound == 0)
2819 GET_BUFFER_SPACE (3);
2820 INSERT_JUMP (jump, laststart, buf_end + 3);
2824 /* Otherwise, we have a nontrivial interval. When
2825 we're all done, the pattern will look like:
2826 set_number_at <jump count> <upper bound>
2827 set_number_at <succeed_n count> <lower bound>
2828 succeed_n <after jump addr> <succeed_n count>
2830 jump_n <succeed_n addr> <jump count>
2831 (The upper bound and `jump_n' are omitted if
2832 `upper_bound' is 1, though.) */
2834 { /* If the upper bound is > 1, we need to insert
2835 more at the end of the loop. */
2836 Memory_count nbytes = 10 + (upper_bound > 1) * 10;
2838 GET_BUFFER_SPACE (nbytes);
2840 /* Initialize lower bound of the `succeed_n', even
2841 though it will be set during matching by its
2842 attendant `set_number_at' (inserted next),
2843 because `re_compile_fastmap' needs to know.
2844 Jump to the `jump_n' we might insert below. */
2845 INSERT_JUMP2 (succeed_n, laststart,
2846 buf_end + 5 + (upper_bound > 1) * 5,
2850 /* Code to initialize the lower bound. Insert
2851 before the `succeed_n'. The `5' is the last two
2852 bytes of this `set_number_at', plus 3 bytes of
2853 the following `succeed_n'. */
2854 insert_op2 (set_number_at, laststart, 5, lower_bound, buf_end);
2857 if (upper_bound > 1)
2858 { /* More than one repetition is allowed, so
2859 append a backward jump to the `succeed_n'
2860 that starts this interval.
2862 When we've reached this during matching,
2863 we'll have matched the interval once, so
2864 jump back only `upper_bound - 1' times. */
2865 STORE_JUMP2 (jump_n, buf_end, laststart + 5,
2869 /* The location we want to set is the second
2870 parameter of the `jump_n'; that is `b-2' as
2871 an absolute address. `laststart' will be
2872 the `set_number_at' we're about to insert;
2873 `laststart+3' the number to set, the source
2874 for the relative address. But we are
2875 inserting into the middle of the pattern --
2876 so everything is getting moved up by 5.
2877 Conclusion: (b - 2) - (laststart + 3) + 5,
2878 i.e., b - laststart.
2880 We insert this at the beginning of the loop
2881 so that if we fail during matching, we'll
2882 reinitialize the bounds. */
2883 insert_op2 (set_number_at, laststart,
2884 buf_end - laststart,
2885 upper_bound - 1, buf_end);
2890 beg_interval = NULL;
2895 /* If an invalid interval, match the characters as literals. */
2896 assert (beg_interval);
2898 beg_interval = NULL;
2900 /* normal_char and normal_backslash need `c'. */
2903 if (!(syntax & RE_NO_BK_BRACES))
2905 if (p > pattern && p[-1] == '\\')
2906 goto normal_backslash;
2911 /* There is no way to specify the before_dot and after_dot
2912 operators. rms says this is ok. --karl */
2918 laststart = buf_end;
2920 /* XEmacs addition */
2921 if (c >= 0x80 || syntax_spec_code[c] == 0377)
2922 FREE_STACK_RETURN (REG_ESYNTAX);
2923 BUF_PUSH_2 (syntaxspec, syntax_spec_code[c]);
2927 laststart = buf_end;
2929 /* XEmacs addition */
2930 if (c >= 0x80 || syntax_spec_code[c] == 0377)
2931 FREE_STACK_RETURN (REG_ESYNTAX);
2932 BUF_PUSH_2 (notsyntaxspec, syntax_spec_code[c]);
2936 /* 97.2.17 jhod merged in to XEmacs from mule-2.3 */
2938 laststart = buf_end;
2940 if (c < 32 || c > 127)
2941 FREE_STACK_RETURN (REG_ECATEGORY);
2942 BUF_PUSH_2 (categoryspec, c);
2946 laststart = buf_end;
2948 if (c < 32 || c > 127)
2949 FREE_STACK_RETURN (REG_ECATEGORY);
2950 BUF_PUSH_2 (notcategoryspec, c);
2952 /* end of category patch */
2958 laststart = buf_end;
2959 BUF_PUSH (wordchar);
2964 laststart = buf_end;
2965 BUF_PUSH (notwordchar);
2978 BUF_PUSH (wordbound);
2982 BUF_PUSH (notwordbound);
2993 case '1': case '2': case '3': case '4': case '5':
2994 case '6': case '7': case '8': case '9':
2997 if (syntax & RE_NO_BK_REFS)
3003 FREE_STACK_RETURN (REG_ESUBREG);
3005 /* Can't back reference to a subexpression if inside of it. */
3006 if (group_in_compile_stack (compile_stack, reg))
3009 laststart = buf_end;
3010 BUF_PUSH_2 (duplicate, reg);
3017 if (syntax & RE_BK_PLUS_QM)
3020 goto normal_backslash;
3024 /* You might think it would be useful for \ to mean
3025 not to translate; but if we don't translate it,
3026 it will never match anything. */
3034 /* Expects the character in `c'. */
3035 /* `p' points to the location after where `c' came from. */
3038 /* XEmacs: modifications here for Mule. */
3039 /* `q' points to the beginning of the next char. */
3042 /* If no exactn currently being built. */
3045 /* If last exactn not at current position. */
3046 || pending_exact + *pending_exact + 1 != buf_end
3048 /* We have only one byte following the exactn for the count. */
3049 || ((unsigned int) (*pending_exact + (q - p)) >=
3050 ((unsigned int) (1 << BYTEWIDTH) - 1))
3052 /* If followed by a repetition operator. */
3053 || *q == '*' || *q == '^'
3054 || ((syntax & RE_BK_PLUS_QM)
3055 ? *q == '\\' && (q[1] == '+' || q[1] == '?')
3056 : (*q == '+' || *q == '?'))
3057 || ((syntax & RE_INTERVALS)
3058 && ((syntax & RE_NO_BK_BRACES)
3060 : (q[0] == '\\' && q[1] == '{'))))
3062 /* Start building a new exactn. */
3064 laststart = buf_end;
3066 BUF_PUSH_2 (exactn, 0);
3067 pending_exact = buf_end - 1;
3076 Bufbyte tmp_buf[MAX_EMCHAR_LEN];
3079 bt_count = set_charptr_emchar (tmp_buf, c);
3081 for (i = 0; i < bt_count; i++)
3083 BUF_PUSH (tmp_buf[i]);
3091 } /* while p != pend */
3094 /* Through the pattern now. */
3097 STORE_JUMP (jump_past_alt, fixup_alt_jump, buf_end);
3099 if (!COMPILE_STACK_EMPTY)
3100 FREE_STACK_RETURN (REG_EPAREN);
3102 /* If we don't want backtracking, force success
3103 the first time we reach the end of the compiled pattern. */
3104 if (syntax & RE_NO_POSIX_BACKTRACKING)
3107 free (compile_stack.stack);
3109 /* We have succeeded; set the length of the buffer. */
3110 bufp->used = buf_end - bufp->buffer;
3115 DEBUG_PRINT1 ("\nCompiled pattern: \n");
3116 print_compiled_pattern (bufp);
3120 #ifndef MATCH_MAY_ALLOCATE
3121 /* Initialize the failure stack to the largest possible stack. This
3122 isn't necessary unless we're trying to avoid calling alloca in
3123 the search and match routines. */
3125 int num_regs = bufp->re_nsub + 1;
3127 /* Since DOUBLE_FAIL_STACK refuses to double only if the current size
3128 is strictly greater than re_max_failures, the largest possible stack
3129 is 2 * re_max_failures failure points. */
3130 if (fail_stack.size < (2 * re_max_failures * MAX_FAILURE_ITEMS))
3132 fail_stack.size = (2 * re_max_failures * MAX_FAILURE_ITEMS);
3135 if (! fail_stack.stack)
3137 = (fail_stack_elt_t *) xmalloc (fail_stack.size
3138 * sizeof (fail_stack_elt_t));
3141 = (fail_stack_elt_t *) xrealloc (fail_stack.stack,
3143 * sizeof (fail_stack_elt_t)));
3144 #else /* not emacs */
3145 if (! fail_stack.stack)
3147 = (fail_stack_elt_t *) malloc (fail_stack.size
3148 * sizeof (fail_stack_elt_t));
3151 = (fail_stack_elt_t *) realloc (fail_stack.stack,
3153 * sizeof (fail_stack_elt_t)));
3157 regex_grow_registers (num_regs);
3159 #endif /* not MATCH_MAY_ALLOCATE */
3162 } /* regex_compile */
3164 /* Subroutines for `regex_compile'. */
3166 /* Store OP at LOC followed by two-byte integer parameter ARG. */
3169 store_op1 (re_opcode_t op, unsigned char *loc, int arg)
3171 *loc = (unsigned char) op;
3172 STORE_NUMBER (loc + 1, arg);
3176 /* Like `store_op1', but for two two-byte parameters ARG1 and ARG2. */
3179 store_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2)
3181 *loc = (unsigned char) op;
3182 STORE_NUMBER (loc + 1, arg1);
3183 STORE_NUMBER (loc + 3, arg2);
3187 /* Copy the bytes from LOC to END to open up three bytes of space at LOC
3188 for OP followed by two-byte integer parameter ARG. */
3191 insert_op1 (re_opcode_t op, unsigned char *loc, int arg, unsigned char *end)
3193 REGISTER unsigned char *pfrom = end;
3194 REGISTER unsigned char *pto = end + 3;
3196 while (pfrom != loc)
3199 store_op1 (op, loc, arg);
3203 /* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2. */
3206 insert_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2,
3209 REGISTER unsigned char *pfrom = end;
3210 REGISTER unsigned char *pto = end + 5;
3212 while (pfrom != loc)
3215 store_op2 (op, loc, arg1, arg2);
3219 /* P points to just after a ^ in PATTERN. Return true if that ^ comes
3220 after an alternative or a begin-subexpression. We assume there is at
3221 least one character before the ^. */
3224 at_begline_loc_p (re_char *pattern, re_char *p, reg_syntax_t syntax)
3226 re_char *prev = p - 2;
3227 re_bool prev_prev_backslash = prev > pattern && prev[-1] == '\\';
3230 /* After a subexpression? */
3231 (*prev == '(' && (syntax & RE_NO_BK_PARENS || prev_prev_backslash))
3232 /* After an alternative? */
3233 || (*prev == '|' && (syntax & RE_NO_BK_VBAR || prev_prev_backslash));
3237 /* The dual of at_begline_loc_p. This one is for $. We assume there is
3238 at least one character after the $, i.e., `P < PEND'. */
3241 at_endline_loc_p (re_char *p, re_char *pend, int syntax)
3244 re_bool next_backslash = *next == '\\';
3245 re_char *next_next = p + 1 < pend ? p + 1 : 0;
3248 /* Before a subexpression? */
3249 (syntax & RE_NO_BK_PARENS ? *next == ')'
3250 : next_backslash && next_next && *next_next == ')')
3251 /* Before an alternative? */
3252 || (syntax & RE_NO_BK_VBAR ? *next == '|'
3253 : next_backslash && next_next && *next_next == '|');
3257 /* Returns true if REGNUM is in one of COMPILE_STACK's elements and
3258 false if it's not. */
3261 group_in_compile_stack (compile_stack_type compile_stack, regnum_t regnum)
3265 for (this_element = compile_stack.avail - 1;
3268 if (compile_stack.stack[this_element].regnum == regnum)
3275 /* Read the ending character of a range (in a bracket expression) from the
3276 uncompiled pattern *P_PTR (which ends at PEND). We assume the
3277 starting character is in `P[-2]'. (`P[-1]' is the character `-'.)
3278 Then we set the translation of all bits between the starting and
3279 ending characters (inclusive) in the compiled pattern B.
3281 Return an error code.
3283 We use these short variable names so we can use the same macros as
3284 `regex_compile' itself. */
3286 static reg_errcode_t
3287 compile_range (re_char **p_ptr, re_char *pend, RE_TRANSLATE_TYPE translate,
3288 reg_syntax_t syntax, unsigned char *buf_end)
3290 Element_count this_char;
3292 re_char *p = *p_ptr;
3293 int range_start, range_end;
3298 /* Even though the pattern is a signed `char *', we need to fetch
3299 with unsigned char *'s; if the high bit of the pattern character
3300 is set, the range endpoints will be negative if we fetch using a
3303 We also want to fetch the endpoints without translating them; the
3304 appropriate translation is done in the bit-setting loop below. */
3305 /* The SVR4 compiler on the 3B2 had trouble with unsigned const char *. */
3306 range_start = ((const unsigned char *) p)[-2];
3307 range_end = ((const unsigned char *) p)[0];
3309 /* Have to increment the pointer into the pattern string, so the
3310 caller isn't still at the ending character. */
3313 /* If the start is after the end, the range is empty. */
3314 if (range_start > range_end)
3315 return syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR;
3317 /* Here we see why `this_char' has to be larger than an `unsigned
3318 char' -- the range is inclusive, so if `range_end' == 0xff
3319 (assuming 8-bit characters), we would otherwise go into an infinite
3320 loop, since all characters <= 0xff. */
3321 for (this_char = range_start; this_char <= range_end; this_char++)
3323 SET_LIST_BIT (TRANSLATE (this_char));
3331 static reg_errcode_t
3332 compile_extended_range (re_char **p_ptr, re_char *pend,
3333 RE_TRANSLATE_TYPE translate,
3334 reg_syntax_t syntax, Lisp_Object rtab)
3336 Emchar this_char, range_start, range_end;
3342 p = (const Bufbyte *) *p_ptr;
3343 range_end = charptr_emchar (p);
3344 p--; /* back to '-' */
3345 DEC_CHARPTR (p); /* back to start of range */
3346 /* We also want to fetch the endpoints without translating them; the
3347 appropriate translation is done in the bit-setting loop below. */
3348 range_start = charptr_emchar (p);
3349 INC_CHARPTR (*p_ptr);
3351 /* If the start is after the end, the range is empty. */
3352 if (range_start > range_end)
3353 return syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR;
3355 /* Can't have ranges spanning different charsets, except maybe for
3356 ranges entirely within the first 256 chars. */
3358 if ((range_start >= 0x100 || range_end >= 0x100)
3360 && CHAR_CHARSET_ID (range_start) != CHAR_CHARSET_ID (range_end)
3362 && CHAR_LEADING_BYTE (range_start) != CHAR_LEADING_BYTE (range_end)
3365 return REG_ERANGESPAN;
3367 /* As advertised, translations only work over the 0 - 0x7F range.
3368 Making this kind of stuff work generally is much harder.
3369 Iterating over the whole range like this would be way efficient
3370 if the range encompasses 10,000 chars or something. You'd have
3371 to do something like this:
3375 map over translation table in [range_start, range_end] of
3376 (put the mapped range in a;
3377 put the translation in b)
3378 invert the range in a and truncate to [range_start, range_end]
3379 compute the union of a, b
3380 union the result into rtab
3382 for (this_char = range_start;
3383 this_char <= range_end && this_char < 0x80; this_char++)
3385 SET_RANGETAB_BIT (TRANSLATE (this_char));
3388 if (this_char <= range_end)
3389 put_range_table (rtab, this_char, range_end, Qt);
3396 /* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in
3397 BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible
3398 characters can start a string that matches the pattern. This fastmap
3399 is used by re_search to skip quickly over impossible starting points.
3401 The caller must supply the address of a (1 << BYTEWIDTH)-byte data
3402 area as BUFP->fastmap.
3404 We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in
3407 Returns 0 if we succeed, -2 if an internal error. */
3410 re_compile_fastmap (struct re_pattern_buffer *bufp)
3413 #ifdef MATCH_MAY_ALLOCATE
3414 fail_stack_type fail_stack;
3416 DECLARE_DESTINATION;
3417 /* We don't push any register information onto the failure stack. */
3419 REGISTER char *fastmap = bufp->fastmap;
3420 unsigned char *pattern = bufp->buffer;
3421 unsigned long size = bufp->used;
3422 unsigned char *p = pattern;
3423 REGISTER unsigned char *pend = pattern + size;
3426 /* This holds the pointer to the failure stack, when
3427 it is allocated relocatably. */
3428 fail_stack_elt_t *failure_stack_ptr;
3431 /* Assume that each path through the pattern can be null until
3432 proven otherwise. We set this false at the bottom of switch
3433 statement, to which we get only if a particular path doesn't
3434 match the empty string. */
3435 re_bool path_can_be_null = true;
3437 /* We aren't doing a `succeed_n' to begin with. */
3438 re_bool succeed_n_p = false;
3440 assert (fastmap != NULL && p != NULL);
3443 memset (fastmap, 0, 1 << BYTEWIDTH); /* Assume nothing's valid. */
3444 bufp->fastmap_accurate = 1; /* It will be when we're done. */
3445 bufp->can_be_null = 0;
3449 if (p == pend || *p == succeed)
3451 /* We have reached the (effective) end of pattern. */
3452 if (!FAIL_STACK_EMPTY ())
3454 bufp->can_be_null |= path_can_be_null;
3456 /* Reset for next path. */
3457 path_can_be_null = true;
3459 p = (unsigned char *) fail_stack.stack[--fail_stack.avail].pointer;
3467 /* We should never be about to go beyond the end of the pattern. */
3470 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
3473 /* I guess the idea here is to simply not bother with a fastmap
3474 if a backreference is used, since it's too hard to figure out
3475 the fastmap for the corresponding group. Setting
3476 `can_be_null' stops `re_search_2' from using the fastmap, so
3477 that is all we do. */
3479 bufp->can_be_null = 1;
3483 /* Following are the cases which match a character. These end
3492 /* XEmacs: Under Mule, these bit vectors will
3493 only contain values for characters below 0x80. */
3494 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
3495 if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))
3501 /* Chars beyond end of map must be allowed. */
3503 for (j = *p * BYTEWIDTH; j < 0x80; j++)
3505 /* And all extended characters must be allowed, too. */
3506 for (j = 0x80; j < 0xA0; j++)
3508 #else /* not MULE */
3509 for (j = *p * BYTEWIDTH; j < (1 << BYTEWIDTH); j++)
3513 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
3514 if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))))
3524 nentries = unified_range_table_nentries (p);
3525 for (i = 0; i < nentries; i++)
3527 EMACS_INT first, last;
3528 Lisp_Object dummy_val;
3530 Bufbyte strr[MAX_EMCHAR_LEN];
3532 unified_range_table_get_range (p, i, &first, &last,
3534 for (jj = first; jj <= last && jj < 0x80; jj++)
3536 /* Ranges below 0x100 can span charsets, but there
3537 are only two (Control-1 and Latin-1), and
3538 either first or last has to be in them. */
3539 set_charptr_emchar (strr, first);
3543 set_charptr_emchar (strr, last);
3550 case charset_mule_not:
3555 nentries = unified_range_table_nentries (p);
3556 for (i = 0; i < nentries; i++)
3558 EMACS_INT first, last;
3559 Lisp_Object dummy_val;
3561 int smallest_prev = 0;
3563 unified_range_table_get_range (p, i, &first, &last,
3565 for (jj = smallest_prev; jj < first && jj < 0x80; jj++)
3567 smallest_prev = last + 1;
3568 if (smallest_prev >= 0x80)
3571 /* Calculating which leading bytes are actually allowed
3572 here is rather difficult, so we just punt and allow
3574 for (i = 0x80; i < 0xA0; i++)
3586 for (j = 0; j < (1 << BYTEWIDTH); j++)
3589 (regex_emacs_buffer->mirror_syntax_table), j) == Sword)
3598 goto matchnotsyntax;
3600 for (j = 0; j < (1 << BYTEWIDTH); j++)
3603 (regex_emacs_buffer->mirror_syntax_table), j) != Sword)
3611 int fastmap_newline = fastmap['\n'];
3613 /* `.' matches anything ... */
3615 /* "anything" only includes bytes that can be the
3616 first byte of a character. */
3617 for (j = 0; j < 0xA0; j++)
3620 for (j = 0; j < (1 << BYTEWIDTH); j++)
3624 /* ... except perhaps newline. */
3625 if (!(bufp->syntax & RE_DOT_NEWLINE))
3626 fastmap['\n'] = fastmap_newline;
3628 /* Return if we have already set `can_be_null'; if we have,
3629 then the fastmap is irrelevant. Something's wrong here. */
3630 else if (bufp->can_be_null)
3633 /* Otherwise, have to check alternative paths. */
3644 /* This match depends on text properties. These end with
3645 aborting optimizations. */
3646 bufp->can_be_null = 1;
3650 #if 0 /* Removed during syntax-table properties patch -- 2000/12/07 mct */
3657 for (j = 0; j < 0x80; j++)
3660 (regex_emacs_buffer->syntax_table), j) ==
3661 (enum syntaxcode) k)
3664 for (j = 0; j < 0x80; j++)
3667 (regex_emacs_buffer->mirror_syntax_table), j) ==
3668 (enum syntaxcode) k)
3671 for (j = 0x80; j < 0xA0; j++)
3674 if (LEADING_BYTE_PREFIX_P(j))
3675 /* too complicated to calculate this right */
3683 cset = CHARSET_BY_LEADING_BYTE (j);
3684 if (CHARSETP (cset))
3686 if (charset_syntax (regex_emacs_buffer, cset,
3688 == Sword || multi_p)
3695 #else /* not MULE */
3696 for (j = 0; j < (1 << BYTEWIDTH); j++)
3699 (regex_emacs_buffer->mirror_syntax_table), j) ==
3700 (enum syntaxcode) k)
3706 #if 0 /* Removed during syntax-table properties patch -- 2000/12/07 mct */
3713 for (j = 0; j < 0x80; j++)
3716 (regex_emacs_buffer->syntax_table), j) !=
3717 (enum syntaxcode) k)
3720 for (j = 0; j < 0x80; j++)
3723 (regex_emacs_buffer->mirror_syntax_table), j) !=
3724 (enum syntaxcode) k)
3727 for (j = 0x80; j < 0xA0; j++)
3730 if (LEADING_BYTE_PREFIX_P(j))
3731 /* too complicated to calculate this right */
3739 cset = CHARSET_BY_LEADING_BYTE (j);
3740 if (CHARSETP (cset))
3742 if (charset_syntax (regex_emacs_buffer, cset,
3744 != Sword || multi_p)
3751 #else /* not MULE */
3752 for (j = 0; j < (1 << BYTEWIDTH); j++)
3755 (regex_emacs_buffer->mirror_syntax_table), j) !=
3756 (enum syntaxcode) k)
3763 /* 97/2/17 jhod category patch */
3765 case notcategoryspec:
3766 bufp->can_be_null = 1;
3768 /* end if category patch */
3771 /* All cases after this match the empty string. These end with
3793 case push_dummy_failure:
3798 case pop_failure_jump:
3799 case maybe_pop_jump:
3802 case dummy_failure_jump:
3803 EXTRACT_NUMBER_AND_INCR (j, p);
3808 /* Jump backward implies we just went through the body of a
3809 loop and matched nothing. Opcode jumped to should be
3810 `on_failure_jump' or `succeed_n'. Just treat it like an
3811 ordinary jump. For a * loop, it has pushed its failure
3812 point already; if so, discard that as redundant. */
3813 if ((re_opcode_t) *p != on_failure_jump
3814 && (re_opcode_t) *p != succeed_n)
3818 EXTRACT_NUMBER_AND_INCR (j, p);
3821 /* If what's on the stack is where we are now, pop it. */
3822 if (!FAIL_STACK_EMPTY ()
3823 && fail_stack.stack[fail_stack.avail - 1].pointer == p)
3829 case on_failure_jump:
3830 case on_failure_keep_string_jump:
3831 handle_on_failure_jump:
3832 EXTRACT_NUMBER_AND_INCR (j, p);
3834 /* For some patterns, e.g., `(a?)?', `p+j' here points to the
3835 end of the pattern. We don't want to push such a point,
3836 since when we restore it above, entering the switch will
3837 increment `p' past the end of the pattern. We don't need
3838 to push such a point since we obviously won't find any more
3839 fastmap entries beyond `pend'. Such a pattern can match
3840 the null string, though. */
3843 if (!PUSH_PATTERN_OP (p + j, fail_stack))
3845 RESET_FAIL_STACK ();
3850 bufp->can_be_null = 1;
3854 EXTRACT_NUMBER_AND_INCR (k, p); /* Skip the n. */
3855 succeed_n_p = false;
3862 /* Get to the number of times to succeed. */
3865 /* Increment p past the n for when k != 0. */
3866 EXTRACT_NUMBER_AND_INCR (k, p);
3870 succeed_n_p = true; /* Spaghetti code alert. */
3871 goto handle_on_failure_jump;
3888 abort (); /* We have listed all the cases. */
3891 /* Getting here means we have found the possible starting
3892 characters for one path of the pattern -- and that the empty
3893 string does not match. We need not follow this path further.
3894 Instead, look at the next alternative (remembered on the
3895 stack), or quit if no more. The test at the top of the loop
3896 does these things. */
3897 path_can_be_null = false;
3901 /* Set `can_be_null' for the last path (also the first path, if the
3902 pattern is empty). */
3903 bufp->can_be_null |= path_can_be_null;
3906 RESET_FAIL_STACK ();
3908 } /* re_compile_fastmap */
3910 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
3911 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
3912 this memory for recording register information. STARTS and ENDS
3913 must be allocated using the malloc library routine, and must each
3914 be at least NUM_REGS * sizeof (regoff_t) bytes long.
3916 If NUM_REGS == 0, then subsequent matches should allocate their own
3919 Unless this function is called, the first search or match using
3920 PATTERN_BUFFER will allocate its own register data, without
3921 freeing the old data. */
3924 re_set_registers (struct re_pattern_buffer *bufp, struct re_registers *regs,
3925 unsigned num_regs, regoff_t *starts, regoff_t *ends)
3929 bufp->regs_allocated = REGS_REALLOCATE;
3930 regs->num_regs = num_regs;
3931 regs->start = starts;
3936 bufp->regs_allocated = REGS_UNALLOCATED;
3938 regs->start = regs->end = (regoff_t *) 0;
3942 /* Searching routines. */
3944 /* Like re_search_2, below, but only one string is specified, and
3945 doesn't let you say where to stop matching. */
3948 re_search (struct re_pattern_buffer *bufp, const char *string, int size,
3949 int startpos, int range, struct re_registers *regs)
3951 return re_search_2 (bufp, NULL, 0, string, size, startpos, range,
3956 /* Snarfed from src/lisp.h, needed for compiling [ce]tags. */
3957 # define bytecount_to_charcount(ptr, len) (len)
3958 # define charcount_to_bytecount(ptr, len) (len)
3959 typedef int Charcount;
3962 /* Using the compiled pattern in BUFP->buffer, first tries to match the
3963 virtual concatenation of STRING1 and STRING2, starting first at index
3964 STARTPOS, then at STARTPOS + 1, and so on.
3966 With MULE, STARTPOS is a byte position, not a char position. And the
3967 search will increment STARTPOS by the width of the current leading
3970 STRING1 and STRING2 have length SIZE1 and SIZE2, respectively.
3972 RANGE is how far to scan while trying to match. RANGE = 0 means try
3973 only at STARTPOS; in general, the last start tried is STARTPOS +
3976 With MULE, RANGE is a byte position, not a char position. The last
3977 start tried is the character starting <= STARTPOS + RANGE.
3979 In REGS, return the indices of the virtual concatenation of STRING1
3980 and STRING2 that matched the entire BUFP->buffer and its contained
3983 Do not consider matching one past the index STOP in the virtual
3984 concatenation of STRING1 and STRING2.
3986 We return either the position in the strings at which the match was
3987 found, -1 if no match, or -2 if error (such as failure
3991 re_search_2 (struct re_pattern_buffer *bufp, const char *str1,
3992 int size1, const char *str2, int size2, int startpos,
3993 int range, struct re_registers *regs, int stop)
3996 re_char *string1 = (re_char *) str1;
3997 re_char *string2 = (re_char *) str2;
3998 REGISTER char *fastmap = bufp->fastmap;
3999 REGISTER RE_TRANSLATE_TYPE translate = bufp->translate;
4000 int total_size = size1 + size2;
4001 int endpos = startpos + range;
4002 #ifdef REGEX_BEGLINE_CHECK
4003 int anchored_at_begline = 0;
4008 /* Check for out-of-range STARTPOS. */
4009 if (startpos < 0 || startpos > total_size)
4012 /* Fix up RANGE if it might eventually take us outside
4013 the virtual concatenation of STRING1 and STRING2. */
4015 range = 0 - startpos;
4016 else if (endpos > total_size)
4017 range = total_size - startpos;
4019 /* If the search isn't to be a backwards one, don't waste time in a
4020 search for a pattern that must be anchored. */
4021 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == begbuf && range > 0)
4027 d = ((const unsigned char *)
4028 (startpos >= size1 ? string2 - size1 : string1) + startpos);
4029 range = charcount_to_bytecount (d, 1);
4034 /* In a forward search for something that starts with \=.
4035 don't keep searching past point. */
4036 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == at_dot && range > 0)
4038 range = BUF_PT (regex_emacs_buffer) - BUF_BEGV (regex_emacs_buffer)
4045 /* Update the fastmap now if not correct already. */
4046 if (fastmap && !bufp->fastmap_accurate)
4047 if (re_compile_fastmap (bufp) == -2)
4050 #ifdef REGEX_BEGLINE_CHECK
4052 unsigned long i = 0;
4054 while (i < bufp->used)
4056 if (bufp->buffer[i] == start_memory ||
4057 bufp->buffer[i] == stop_memory)
4062 anchored_at_begline = i < bufp->used && bufp->buffer[i] == begline;
4067 SETUP_SYNTAX_CACHE_FOR_OBJECT (regex_match_object,
4069 SYNTAX_CACHE_OBJECT_BYTE_TO_CHAR (regex_match_object,
4075 /* Loop through the string, looking for a place to start matching. */
4078 #ifdef REGEX_BEGLINE_CHECK
4079 /* If the regex is anchored at the beginning of a line (i.e. with a ^),
4080 then we can speed things up by skipping to the next beginning-of-
4082 if (anchored_at_begline && startpos > 0 && startpos != size1 &&
4085 /* whose stupid idea was it anyway to make this
4086 function take two strings to match?? */
4090 if (startpos < size1 && startpos + range >= size1)
4091 lim = range - (size1 - startpos);
4093 d = ((const unsigned char *)
4094 (startpos >= size1 ? string2 - size1 : string1) + startpos);
4095 DEC_CHARPTR(d); /* Ok, since startpos != size1. */
4096 d_size = charcount_to_bytecount (d, 1);
4098 if (TRANSLATE_P (translate))
4099 while (range > lim && *d != '\n')
4101 d += d_size; /* Speedier INC_CHARPTR(d) */
4102 d_size = charcount_to_bytecount (d, 1);
4106 while (range > lim && *d != '\n')
4108 d += d_size; /* Speedier INC_CHARPTR(d) */
4109 d_size = charcount_to_bytecount (d, 1);
4113 startpos += irange - range;
4115 #endif /* REGEX_BEGLINE_CHECK */
4117 /* If a fastmap is supplied, skip quickly over characters that
4118 cannot be the start of a match. If the pattern can match the
4119 null string, however, we don't need to skip characters; we want
4120 the first null string. */
4121 if (fastmap && startpos < total_size && !bufp->can_be_null)
4123 if (range > 0) /* Searching forwards. */
4128 if (startpos < size1 && startpos + range >= size1)
4129 lim = range - (size1 - startpos);
4131 d = ((const unsigned char *)
4132 (startpos >= size1 ? string2 - size1 : string1) + startpos);
4134 /* Written out as an if-else to avoid testing `translate'
4136 if (TRANSLATE_P (translate))
4141 Bufbyte str[MAX_EMCHAR_LEN];
4143 buf_ch = charptr_emchar (d);
4144 buf_ch = RE_TRANSLATE (buf_ch);
4145 set_charptr_emchar (str, buf_ch);
4146 if (buf_ch >= 0200 || fastmap[(unsigned char) *str])
4149 if (fastmap[(unsigned char)RE_TRANSLATE (*d)])
4152 d_size = charcount_to_bytecount (d, 1);
4154 d += d_size; /* Speedier INC_CHARPTR(d) */
4157 while (range > lim && !fastmap[*d])
4159 d_size = charcount_to_bytecount (d, 1);
4161 d += d_size; /* Speedier INC_CHARPTR(d) */
4164 startpos += irange - range;
4166 else /* Searching backwards. */
4168 Emchar c = (size1 == 0 || startpos >= size1
4169 ? charptr_emchar (string2 + startpos - size1)
4170 : charptr_emchar (string1 + startpos));
4173 if (!(c >= 0200 || fastmap[(unsigned char) c]))
4176 if (!fastmap[(unsigned char) c])
4182 /* If can't match the null string, and that's all we have left, fail. */
4183 if (range >= 0 && startpos == total_size && fastmap
4184 && !bufp->can_be_null)
4187 #ifdef emacs /* XEmacs added, w/removal of immediate_quit */
4188 if (!no_quit_in_re_search)
4191 val = re_match_2_internal (bufp, string1, size1, string2, size2,
4192 startpos, regs, stop);
4193 #ifndef REGEX_MALLOC
4210 d = ((const unsigned char *)
4211 (startpos >= size1 ? string2 - size1 : string1) + startpos);
4212 d_size = charcount_to_bytecount (d, 1);
4218 /* Note startpos > size1 not >=. If we are on the
4219 string1/string2 boundary, we want to backup into string1. */
4220 d = ((const unsigned char *)
4221 (startpos > size1 ? string2 - size1 : string1) + startpos);
4223 d_size = charcount_to_bytecount (d, 1);
4231 /* Declarations and macros for re_match_2. */
4233 /* This converts PTR, a pointer into one of the search strings `string1'
4234 and `string2' into an offset from the beginning of that string. */
4235 #define POINTER_TO_OFFSET(ptr) \
4236 (FIRST_STRING_P (ptr) \
4237 ? ((regoff_t) ((ptr) - string1)) \
4238 : ((regoff_t) ((ptr) - string2 + size1)))
4240 /* Macros for dealing with the split strings in re_match_2. */
4242 #define MATCHING_IN_FIRST_STRING (dend == end_match_1)
4244 /* Call before fetching a character with *d. This switches over to
4245 string2 if necessary. */
4246 #define REGEX_PREFETCH() \
4249 /* End of string2 => fail. */ \
4250 if (dend == end_match_2) \
4252 /* End of string1 => advance to string2. */ \
4254 dend = end_match_2; \
4258 /* Test if at very beginning or at very end of the virtual concatenation
4259 of `string1' and `string2'. If only one string, it's `string2'. */
4260 #define AT_STRINGS_BEG(d) ((d) == (size1 ? string1 : string2) || !size2)
4261 #define AT_STRINGS_END(d) ((d) == end2)
4264 If the given position straddles the string gap, return the equivalent
4265 position that is before or after the gap, respectively; otherwise,
4266 return the same position. */
4267 #define POS_BEFORE_GAP_UNSAFE(d) ((d) == string2 ? end1 : (d))
4268 #define POS_AFTER_GAP_UNSAFE(d) ((d) == end1 ? string2 : (d))
4270 /* Test if CH is a word-constituent character. (XEmacs change) */
4272 #define WORDCHAR_P_UNSAFE(ch) \
4273 (SYNTAX_UNSAFE (XCHAR_TABLE (regex_emacs_buffer->syntax_table), \
4276 #define WORDCHAR_P_UNSAFE(ch) \
4277 (SYNTAX_UNSAFE (XCHAR_TABLE (regex_emacs_buffer->mirror_syntax_table), \
4281 /* Free everything we malloc. */
4282 #ifdef MATCH_MAY_ALLOCATE
4283 #define FREE_VAR(var) if (var) REGEX_FREE (var); var = NULL
4284 #define FREE_VARIABLES() \
4286 REGEX_FREE_STACK (fail_stack.stack); \
4287 FREE_VAR (regstart); \
4288 FREE_VAR (regend); \
4289 FREE_VAR (old_regstart); \
4290 FREE_VAR (old_regend); \
4291 FREE_VAR (best_regstart); \
4292 FREE_VAR (best_regend); \
4293 FREE_VAR (reg_info); \
4294 FREE_VAR (reg_dummy); \
4295 FREE_VAR (reg_info_dummy); \
4297 #else /* not MATCH_MAY_ALLOCATE */
4298 #define FREE_VARIABLES() ((void)0) /* Do nothing! But inhibit gcc warning. */
4299 #endif /* MATCH_MAY_ALLOCATE */
4301 /* These values must meet several constraints. They must not be valid
4302 register values; since we have a limit of 255 registers (because
4303 we use only one byte in the pattern for the register number), we can
4304 use numbers larger than 255. They must differ by 1, because of
4305 NUM_FAILURE_ITEMS above. And the value for the lowest register must
4306 be larger than the value for the highest register, so we do not try
4307 to actually save any registers when none are active. */
4308 #define NO_HIGHEST_ACTIVE_REG (1 << BYTEWIDTH)
4309 #define NO_LOWEST_ACTIVE_REG (NO_HIGHEST_ACTIVE_REG + 1)
4311 /* Matching routines. */
4313 #ifndef emacs /* Emacs never uses this. */
4314 /* re_match is like re_match_2 except it takes only a single string. */
4317 re_match (struct re_pattern_buffer *bufp, const char *string, int size,
4318 int pos, struct re_registers *regs)
4320 int result = re_match_2_internal (bufp, NULL, 0, (re_char *) string, size,
4325 #endif /* not emacs */
4328 /* re_match_2 matches the compiled pattern in BUFP against the
4329 (virtual) concatenation of STRING1 and STRING2 (of length SIZE1 and
4330 SIZE2, respectively). We start matching at POS, and stop matching
4333 If REGS is non-null and the `no_sub' field of BUFP is nonzero, we
4334 store offsets for the substring each group matched in REGS. See the
4335 documentation for exactly how many groups we fill.
4337 We return -1 if no match, -2 if an internal error (such as the
4338 failure stack overflowing). Otherwise, we return the length of the
4339 matched substring. */
4342 re_match_2 (struct re_pattern_buffer *bufp, const char *string1,
4343 int size1, const char *string2, int size2, int pos,
4344 struct re_registers *regs, int stop)
4349 SETUP_SYNTAX_CACHE_FOR_OBJECT (regex_match_object,
4351 SYNTAX_CACHE_OBJECT_BYTE_TO_CHAR (regex_match_object,
4357 result = re_match_2_internal (bufp, (re_char *) string1, size1,
4358 (re_char *) string2, size2,
4365 /* This is a separate function so that we can force an alloca cleanup
4368 re_match_2_internal (struct re_pattern_buffer *bufp, re_char *string1,
4369 int size1, re_char *string2, int size2, int pos,
4370 struct re_registers *regs, int stop)
4372 /* General temporaries. */
4375 int should_succeed; /* XEmacs change */
4377 /* Just past the end of the corresponding string. */
4378 re_char *end1, *end2;
4380 /* Pointers into string1 and string2, just past the last characters in
4381 each to consider matching. */
4382 re_char *end_match_1, *end_match_2;
4384 /* Where we are in the data, and the end of the current string. */
4387 /* Where we are in the pattern, and the end of the pattern. */
4388 unsigned char *p = bufp->buffer;
4389 REGISTER unsigned char *pend = p + bufp->used;
4391 /* Mark the opcode just after a start_memory, so we can test for an
4392 empty subpattern when we get to the stop_memory. */
4393 re_char *just_past_start_mem = 0;
4395 /* We use this to map every character in the string. */
4396 RE_TRANSLATE_TYPE translate = bufp->translate;
4398 /* Failure point stack. Each place that can handle a failure further
4399 down the line pushes a failure point on this stack. It consists of
4400 restart, regend, and reg_info for all registers corresponding to
4401 the subexpressions we're currently inside, plus the number of such
4402 registers, and, finally, two char *'s. The first char * is where
4403 to resume scanning the pattern; the second one is where to resume
4404 scanning the strings. If the latter is zero, the failure point is
4405 a ``dummy''; if a failure happens and the failure point is a dummy,
4406 it gets discarded and the next one is tried. */
4407 #ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
4408 fail_stack_type fail_stack;
4411 static unsigned failure_id;
4412 int nfailure_points_pushed = 0, nfailure_points_popped = 0;
4416 /* This holds the pointer to the failure stack, when
4417 it is allocated relocatably. */
4418 fail_stack_elt_t *failure_stack_ptr;
4421 /* We fill all the registers internally, independent of what we
4422 return, for use in backreferences. The number here includes
4423 an element for register zero. */
4424 int num_regs = bufp->re_nsub + 1;
4426 /* The currently active registers. */
4427 int lowest_active_reg = NO_LOWEST_ACTIVE_REG;
4428 int highest_active_reg = NO_HIGHEST_ACTIVE_REG;
4430 /* Information on the contents of registers. These are pointers into
4431 the input strings; they record just what was matched (on this
4432 attempt) by a subexpression part of the pattern, that is, the
4433 regnum-th regstart pointer points to where in the pattern we began
4434 matching and the regnum-th regend points to right after where we
4435 stopped matching the regnum-th subexpression. (The zeroth register
4436 keeps track of what the whole pattern matches.) */
4437 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
4438 re_char **regstart, **regend;
4441 /* If a group that's operated upon by a repetition operator fails to
4442 match anything, then the register for its start will need to be
4443 restored because it will have been set to wherever in the string we
4444 are when we last see its open-group operator. Similarly for a
4446 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
4447 re_char **old_regstart, **old_regend;
4450 /* The is_active field of reg_info helps us keep track of which (possibly
4451 nested) subexpressions we are currently in. The matched_something
4452 field of reg_info[reg_num] helps us tell whether or not we have
4453 matched any of the pattern so far this time through the reg_num-th
4454 subexpression. These two fields get reset each time through any
4455 loop their register is in. */
4456 #ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
4457 register_info_type *reg_info;
4460 /* The following record the register info as found in the above
4461 variables when we find a match better than any we've seen before.
4462 This happens as we backtrack through the failure points, which in
4463 turn happens only if we have not yet matched the entire string. */
4464 unsigned best_regs_set = false;
4465 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
4466 re_char **best_regstart, **best_regend;
4469 /* Logically, this is `best_regend[0]'. But we don't want to have to
4470 allocate space for that if we're not allocating space for anything
4471 else (see below). Also, we never need info about register 0 for
4472 any of the other register vectors, and it seems rather a kludge to
4473 treat `best_regend' differently than the rest. So we keep track of
4474 the end of the best match so far in a separate variable. We
4475 initialize this to NULL so that when we backtrack the first time
4476 and need to test it, it's not garbage. */
4477 re_char *match_end = NULL;
4479 /* This helps SET_REGS_MATCHED avoid doing redundant work. */
4480 int set_regs_matched_done = 0;
4482 /* Used when we pop values we don't care about. */
4483 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
4484 re_char **reg_dummy;
4485 register_info_type *reg_info_dummy;
4489 /* Counts the total number of registers pushed. */
4490 unsigned num_regs_pushed = 0;
4493 /* 1 if this match ends in the same string (string1 or string2)
4494 as the best previous match. */
4497 /* 1 if this match is the best seen so far. */
4498 re_bool best_match_p;
4500 DEBUG_PRINT1 ("\n\nEntering re_match_2.\n");
4504 #ifdef MATCH_MAY_ALLOCATE
4505 /* Do not bother to initialize all the register variables if there are
4506 no groups in the pattern, as it takes a fair amount of time. If
4507 there are groups, we include space for register 0 (the whole
4508 pattern), even though we never use it, since it simplifies the
4509 array indexing. We should fix this. */
4512 regstart = REGEX_TALLOC (num_regs, re_char *);
4513 regend = REGEX_TALLOC (num_regs, re_char *);
4514 old_regstart = REGEX_TALLOC (num_regs, re_char *);
4515 old_regend = REGEX_TALLOC (num_regs, re_char *);
4516 best_regstart = REGEX_TALLOC (num_regs, re_char *);
4517 best_regend = REGEX_TALLOC (num_regs, re_char *);
4518 reg_info = REGEX_TALLOC (num_regs, register_info_type);
4519 reg_dummy = REGEX_TALLOC (num_regs, re_char *);
4520 reg_info_dummy = REGEX_TALLOC (num_regs, register_info_type);
4522 if (!(regstart && regend && old_regstart && old_regend && reg_info
4523 && best_regstart && best_regend && reg_dummy && reg_info_dummy))
4531 /* We must initialize all our variables to NULL, so that
4532 `FREE_VARIABLES' doesn't try to free them. */
4533 regstart = regend = old_regstart = old_regend = best_regstart
4534 = best_regend = reg_dummy = NULL;
4535 reg_info = reg_info_dummy = (register_info_type *) NULL;
4537 #endif /* MATCH_MAY_ALLOCATE */
4539 /* The starting position is bogus. */
4540 if (pos < 0 || pos > size1 + size2)
4546 /* Initialize subexpression text positions to -1 to mark ones that no
4547 start_memory/stop_memory has been seen for. Also initialize the
4548 register information struct. */
4549 for (mcnt = 1; mcnt < num_regs; mcnt++)
4551 regstart[mcnt] = regend[mcnt]
4552 = old_regstart[mcnt] = old_regend[mcnt] = REG_UNSET_VALUE;
4554 REG_MATCH_NULL_STRING_P (reg_info[mcnt]) = MATCH_NULL_UNSET_VALUE;
4555 IS_ACTIVE (reg_info[mcnt]) = 0;
4556 MATCHED_SOMETHING (reg_info[mcnt]) = 0;
4557 EVER_MATCHED_SOMETHING (reg_info[mcnt]) = 0;
4559 /* We move `string1' into `string2' if the latter's empty -- but not if
4560 `string1' is null. */
4561 if (size2 == 0 && string1 != NULL)
4568 end1 = string1 + size1;
4569 end2 = string2 + size2;
4571 /* Compute where to stop matching, within the two strings. */
4574 end_match_1 = string1 + stop;
4575 end_match_2 = string2;
4580 end_match_2 = string2 + stop - size1;
4583 /* `p' scans through the pattern as `d' scans through the data.
4584 `dend' is the end of the input string that `d' points within. `d'
4585 is advanced into the following input string whenever necessary, but
4586 this happens before fetching; therefore, at the beginning of the
4587 loop, `d' can be pointing at the end of a string, but it cannot
4589 if (size1 > 0 && pos <= size1)
4596 d = string2 + pos - size1;
4600 DEBUG_PRINT1 ("The compiled pattern is: \n");
4601 DEBUG_PRINT_COMPILED_PATTERN (bufp, p, pend);
4602 DEBUG_PRINT1 ("The string to match is: `");
4603 DEBUG_PRINT_DOUBLE_STRING (d, string1, size1, string2, size2);
4604 DEBUG_PRINT1 ("'\n");
4606 /* This loops over pattern commands. It exits by returning from the
4607 function if the match is complete, or it drops through if the match
4608 fails at this starting point in the input data. */
4611 DEBUG_PRINT2 ("\n0x%lx: ", (long) p);
4612 #ifdef emacs /* XEmacs added, w/removal of immediate_quit */
4613 if (!no_quit_in_re_search)
4618 { /* End of pattern means we might have succeeded. */
4619 DEBUG_PRINT1 ("end of pattern ... ");
4621 /* If we haven't matched the entire string, and we want the
4622 longest match, try backtracking. */
4623 if (d != end_match_2)
4625 same_str_p = (FIRST_STRING_P (match_end)
4626 == MATCHING_IN_FIRST_STRING);
4628 /* AIX compiler got confused when this was combined
4629 with the previous declaration. */
4631 best_match_p = d > match_end;
4633 best_match_p = !MATCHING_IN_FIRST_STRING;
4635 DEBUG_PRINT1 ("backtracking.\n");
4637 if (!FAIL_STACK_EMPTY ())
4638 { /* More failure points to try. */
4640 /* If exceeds best match so far, save it. */
4641 if (!best_regs_set || best_match_p)
4643 best_regs_set = true;
4646 DEBUG_PRINT1 ("\nSAVING match as best so far.\n");
4648 for (mcnt = 1; mcnt < num_regs; mcnt++)
4650 best_regstart[mcnt] = regstart[mcnt];
4651 best_regend[mcnt] = regend[mcnt];
4657 /* If no failure points, don't restore garbage. And if
4658 last match is real best match, don't restore second
4660 else if (best_regs_set && !best_match_p)
4663 /* Restore best match. It may happen that `dend ==
4664 end_match_1' while the restored d is in string2.
4665 For example, the pattern `x.*y.*z' against the
4666 strings `x-' and `y-z-', if the two strings are
4667 not consecutive in memory. */
4668 DEBUG_PRINT1 ("Restoring best registers.\n");
4671 dend = ((d >= string1 && d <= end1)
4672 ? end_match_1 : end_match_2);
4674 for (mcnt = 1; mcnt < num_regs; mcnt++)
4676 regstart[mcnt] = best_regstart[mcnt];
4677 regend[mcnt] = best_regend[mcnt];
4680 } /* d != end_match_2 */
4683 DEBUG_PRINT1 ("Accepting match.\n");
4685 /* If caller wants register contents data back, do it. */
4686 if (regs && !bufp->no_sub)
4688 /* Have the register data arrays been allocated? */
4689 if (bufp->regs_allocated == REGS_UNALLOCATED)
4690 { /* No. So allocate them with malloc. We need one
4691 extra element beyond `num_regs' for the `-1' marker
4693 regs->num_regs = MAX (RE_NREGS, num_regs + 1);
4694 regs->start = TALLOC (regs->num_regs, regoff_t);
4695 regs->end = TALLOC (regs->num_regs, regoff_t);
4696 if (regs->start == NULL || regs->end == NULL)
4701 bufp->regs_allocated = REGS_REALLOCATE;
4703 else if (bufp->regs_allocated == REGS_REALLOCATE)
4704 { /* Yes. If we need more elements than were already
4705 allocated, reallocate them. If we need fewer, just
4707 if (regs->num_regs < num_regs + 1)
4709 regs->num_regs = num_regs + 1;
4710 RETALLOC (regs->start, regs->num_regs, regoff_t);
4711 RETALLOC (regs->end, regs->num_regs, regoff_t);
4712 if (regs->start == NULL || regs->end == NULL)
4721 /* These braces fend off a "empty body in an else-statement"
4722 warning under GCC when assert expands to nothing. */
4723 assert (bufp->regs_allocated == REGS_FIXED);
4726 /* Convert the pointer data in `regstart' and `regend' to
4727 indices. Register zero has to be set differently,
4728 since we haven't kept track of any info for it. */
4729 if (regs->num_regs > 0)
4731 regs->start[0] = pos;
4732 regs->end[0] = (MATCHING_IN_FIRST_STRING
4733 ? ((regoff_t) (d - string1))
4734 : ((regoff_t) (d - string2 + size1)));
4737 /* Go through the first `min (num_regs, regs->num_regs)'
4738 registers, since that is all we initialized. */
4739 for (mcnt = 1; mcnt < MIN (num_regs, regs->num_regs); mcnt++)
4741 if (REG_UNSET (regstart[mcnt]) || REG_UNSET (regend[mcnt]))
4742 regs->start[mcnt] = regs->end[mcnt] = -1;
4746 = (regoff_t) POINTER_TO_OFFSET (regstart[mcnt]);
4748 = (regoff_t) POINTER_TO_OFFSET (regend[mcnt]);
4751 } /* regs && !bufp->no_sub */
4753 /* If we have regs and the regs structure has more elements than
4754 were in the pattern, set the extra elements to -1. If we
4755 (re)allocated the registers, this is the case, because we
4756 always allocate enough to have at least one -1 at the end.
4758 We do this even when no_sub is set because some applications
4759 (XEmacs) reuse register structures which may contain stale
4760 information, and permit attempts to access those registers.
4762 It would be possible to require the caller to do this, but we'd
4763 have to change the API for this function to reflect that, and
4764 audit all callers. */
4765 if (regs && regs->num_regs > 0)
4766 for (mcnt = num_regs; mcnt < regs->num_regs; mcnt++)
4767 regs->start[mcnt] = regs->end[mcnt] = -1;
4769 DEBUG_PRINT4 ("%u failure points pushed, %u popped (%u remain).\n",
4770 nfailure_points_pushed, nfailure_points_popped,
4771 nfailure_points_pushed - nfailure_points_popped);
4772 DEBUG_PRINT2 ("%u registers pushed.\n", num_regs_pushed);
4774 mcnt = d - pos - (MATCHING_IN_FIRST_STRING
4778 DEBUG_PRINT2 ("Returning %d from re_match_2.\n", mcnt);
4784 /* Otherwise match next pattern command. */
4785 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
4787 /* Ignore these. Used to ignore the n of succeed_n's which
4788 currently have n == 0. */
4790 DEBUG_PRINT1 ("EXECUTING no_op.\n");
4794 DEBUG_PRINT1 ("EXECUTING succeed.\n");
4797 /* Match the next n pattern characters exactly. The following
4798 byte in the pattern defines n, and the n bytes after that
4799 are the characters to match. */
4802 DEBUG_PRINT2 ("EXECUTING exactn %d.\n", mcnt);
4804 /* This is written out as an if-else so we don't waste time
4805 testing `translate' inside the loop. */
4806 if (TRANSLATE_P (translate))
4811 Emchar pat_ch, buf_ch;
4815 pat_ch = charptr_emchar (p);
4816 buf_ch = charptr_emchar (d);
4817 if (RE_TRANSLATE (buf_ch) != pat_ch)
4820 pat_len = charcount_to_bytecount (p, 1);
4825 #else /* not MULE */
4827 if ((unsigned char) RE_TRANSLATE (*d++) != *p++)
4839 if (*d++ != *p++) goto fail;
4843 SET_REGS_MATCHED ();
4847 /* Match any character except possibly a newline or a null. */
4849 DEBUG_PRINT1 ("EXECUTING anychar.\n");
4853 if ((!(bufp->syntax & RE_DOT_NEWLINE) && TRANSLATE (*d) == '\n')
4854 || (bufp->syntax & RE_DOT_NOT_NULL && TRANSLATE (*d) == '\000'))
4857 SET_REGS_MATCHED ();
4858 DEBUG_PRINT2 (" Matched `%d'.\n", *d);
4859 INC_CHARPTR (d); /* XEmacs change */
4866 REGISTER unsigned char c;
4867 re_bool not_p = (re_opcode_t) *(p - 1) == charset_not;
4869 DEBUG_PRINT2 ("EXECUTING charset%s.\n", not_p ? "_not" : "");
4872 c = TRANSLATE (*d); /* The character to match. */
4874 /* Cast to `unsigned' instead of `unsigned char' in case the
4875 bit list is a full 32 bytes long. */
4876 if (c < (unsigned) (*p * BYTEWIDTH)
4877 && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
4882 if (!not_p) goto fail;
4884 SET_REGS_MATCHED ();
4885 INC_CHARPTR (d); /* XEmacs change */
4891 case charset_mule_not:
4894 re_bool not_p = (re_opcode_t) *(p - 1) == charset_mule_not;
4896 DEBUG_PRINT2 ("EXECUTING charset_mule%s.\n", not_p ? "_not" : "");
4899 c = charptr_emchar ((const Bufbyte *) d);
4900 c = TRANSLATE (c); /* The character to match. */
4902 if (EQ (Qt, unified_range_table_lookup (p, c, Qnil)))
4905 p += unified_range_table_bytes_used (p);
4907 if (!not_p) goto fail;
4909 SET_REGS_MATCHED ();
4916 /* The beginning of a group is represented by start_memory.
4917 The arguments are the register number in the next byte, and the
4918 number of groups inner to this one in the next. The text
4919 matched within the group is recorded (in the internal
4920 registers data structure) under the register number. */
4922 DEBUG_PRINT3 ("EXECUTING start_memory %d (%d):\n", *p, p[1]);
4924 /* Find out if this group can match the empty string. */
4925 p1 = p; /* To send to group_match_null_string_p. */
4927 if (REG_MATCH_NULL_STRING_P (reg_info[*p]) == MATCH_NULL_UNSET_VALUE)
4928 REG_MATCH_NULL_STRING_P (reg_info[*p])
4929 = group_match_null_string_p (&p1, pend, reg_info);
4931 /* Save the position in the string where we were the last time
4932 we were at this open-group operator in case the group is
4933 operated upon by a repetition operator, e.g., with `(a*)*b'
4934 against `ab'; then we want to ignore where we are now in
4935 the string in case this attempt to match fails. */
4936 old_regstart[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p])
4937 ? REG_UNSET (regstart[*p]) ? d : regstart[*p]
4939 DEBUG_PRINT2 (" old_regstart: %d\n",
4940 POINTER_TO_OFFSET (old_regstart[*p]));
4943 DEBUG_PRINT2 (" regstart: %d\n", POINTER_TO_OFFSET (regstart[*p]));
4945 IS_ACTIVE (reg_info[*p]) = 1;
4946 MATCHED_SOMETHING (reg_info[*p]) = 0;
4948 /* Clear this whenever we change the register activity status. */
4949 set_regs_matched_done = 0;
4951 /* This is the new highest active register. */
4952 highest_active_reg = *p;
4954 /* If nothing was active before, this is the new lowest active
4956 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG)
4957 lowest_active_reg = *p;
4959 /* Move past the register number and inner group count. */
4961 just_past_start_mem = p;
4966 /* The stop_memory opcode represents the end of a group. Its
4967 arguments are the same as start_memory's: the register
4968 number, and the number of inner groups. */
4970 DEBUG_PRINT3 ("EXECUTING stop_memory %d (%d):\n", *p, p[1]);
4972 /* We need to save the string position the last time we were at
4973 this close-group operator in case the group is operated
4974 upon by a repetition operator, e.g., with `((a*)*(b*)*)*'
4975 against `aba'; then we want to ignore where we are now in
4976 the string in case this attempt to match fails. */
4977 old_regend[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p])
4978 ? REG_UNSET (regend[*p]) ? d : regend[*p]
4980 DEBUG_PRINT2 (" old_regend: %d\n",
4981 POINTER_TO_OFFSET (old_regend[*p]));
4984 DEBUG_PRINT2 (" regend: %d\n", POINTER_TO_OFFSET (regend[*p]));
4986 /* This register isn't active anymore. */
4987 IS_ACTIVE (reg_info[*p]) = 0;
4989 /* Clear this whenever we change the register activity status. */
4990 set_regs_matched_done = 0;
4992 /* If this was the only register active, nothing is active
4994 if (lowest_active_reg == highest_active_reg)
4996 lowest_active_reg = NO_LOWEST_ACTIVE_REG;
4997 highest_active_reg = NO_HIGHEST_ACTIVE_REG;
5000 { /* We must scan for the new highest active register, since
5001 it isn't necessarily one less than now: consider
5002 (a(b)c(d(e)f)g). When group 3 ends, after the f), the
5003 new highest active register is 1. */
5004 unsigned char r = *p - 1;
5005 while (r > 0 && !IS_ACTIVE (reg_info[r]))
5008 /* If we end up at register zero, that means that we saved
5009 the registers as the result of an `on_failure_jump', not
5010 a `start_memory', and we jumped to past the innermost
5011 `stop_memory'. For example, in ((.)*) we save
5012 registers 1 and 2 as a result of the *, but when we pop
5013 back to the second ), we are at the stop_memory 1.
5014 Thus, nothing is active. */
5017 lowest_active_reg = NO_LOWEST_ACTIVE_REG;
5018 highest_active_reg = NO_HIGHEST_ACTIVE_REG;
5022 highest_active_reg = r;
5024 /* 98/9/21 jhod: We've also gotta set lowest_active_reg, don't we? */
5026 while (r < highest_active_reg && !IS_ACTIVE(reg_info[r]))
5028 lowest_active_reg = r;
5032 /* If just failed to match something this time around with a
5033 group that's operated on by a repetition operator, try to
5034 force exit from the ``loop'', and restore the register
5035 information for this group that we had before trying this
5037 if ((!MATCHED_SOMETHING (reg_info[*p])
5038 || just_past_start_mem == p - 1)
5041 re_bool is_a_jump_n = false;
5045 switch ((re_opcode_t) *p1++)
5049 case pop_failure_jump:
5050 case maybe_pop_jump:
5052 case dummy_failure_jump:
5053 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
5063 /* If the next operation is a jump backwards in the pattern
5064 to an on_failure_jump right before the start_memory
5065 corresponding to this stop_memory, exit from the loop
5066 by forcing a failure after pushing on the stack the
5067 on_failure_jump's jump in the pattern, and d. */
5068 if (mcnt < 0 && (re_opcode_t) *p1 == on_failure_jump
5069 && (re_opcode_t) p1[3] == start_memory && p1[4] == *p)
5071 /* If this group ever matched anything, then restore
5072 what its registers were before trying this last
5073 failed match, e.g., with `(a*)*b' against `ab' for
5074 regstart[1], and, e.g., with `((a*)*(b*)*)*'
5075 against `aba' for regend[3].
5077 Also restore the registers for inner groups for,
5078 e.g., `((a*)(b*))*' against `aba' (register 3 would
5079 otherwise get trashed). */
5081 if (EVER_MATCHED_SOMETHING (reg_info[*p]))
5085 EVER_MATCHED_SOMETHING (reg_info[*p]) = 0;
5087 /* Restore this and inner groups' (if any) registers. */
5088 for (r = *p; r < *p + *(p + 1); r++)
5090 regstart[r] = old_regstart[r];
5092 /* xx why this test? */
5093 if (old_regend[r] >= regstart[r])
5094 regend[r] = old_regend[r];
5098 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
5099 PUSH_FAILURE_POINT (p1 + mcnt, d, -2);
5105 /* Move past the register number and the inner group count. */
5110 /* \<digit> has been turned into a `duplicate' command which is
5111 followed by the numeric value of <digit> as the register number. */
5114 REGISTER re_char *d2, *dend2;
5115 int regno = *p++; /* Get which register to match against. */
5116 DEBUG_PRINT2 ("EXECUTING duplicate %d.\n", regno);
5118 /* Can't back reference a group which we've never matched. */
5119 if (REG_UNSET (regstart[regno]) || REG_UNSET (regend[regno]))
5122 /* Where in input to try to start matching. */
5123 d2 = regstart[regno];
5125 /* Where to stop matching; if both the place to start and
5126 the place to stop matching are in the same string, then
5127 set to the place to stop, otherwise, for now have to use
5128 the end of the first string. */
5130 dend2 = ((FIRST_STRING_P (regstart[regno])
5131 == FIRST_STRING_P (regend[regno]))
5132 ? regend[regno] : end_match_1);
5135 /* If necessary, advance to next segment in register
5139 if (dend2 == end_match_2) break;
5140 if (dend2 == regend[regno]) break;
5142 /* End of string1 => advance to string2. */
5144 dend2 = regend[regno];
5146 /* At end of register contents => success */
5147 if (d2 == dend2) break;
5149 /* If necessary, advance to next segment in data. */
5152 /* How many characters left in this segment to match. */
5155 /* Want how many consecutive characters we can match in
5156 one shot, so, if necessary, adjust the count. */
5157 if (mcnt > dend2 - d2)
5160 /* Compare that many; failure if mismatch, else move
5162 if (TRANSLATE_P (translate)
5163 ? bcmp_translate ((unsigned char *) d,
5164 (unsigned char *) d2, mcnt, translate)
5165 : memcmp (d, d2, mcnt))
5167 d += mcnt, d2 += mcnt;
5169 /* Do this because we've match some characters. */
5170 SET_REGS_MATCHED ();
5176 /* begline matches the empty string at the beginning of the string
5177 (unless `not_bol' is set in `bufp'), and, if
5178 `newline_anchor' is set, after newlines. */
5180 DEBUG_PRINT1 ("EXECUTING begline.\n");
5182 if (AT_STRINGS_BEG (d))
5184 if (!bufp->not_bol) break;
5186 else if (d[-1] == '\n' && bufp->newline_anchor)
5190 /* In all other cases, we fail. */
5194 /* endline is the dual of begline. */
5196 DEBUG_PRINT1 ("EXECUTING endline.\n");
5198 if (AT_STRINGS_END (d))
5200 if (!bufp->not_eol) break;
5203 /* We have to ``prefetch'' the next character. */
5204 else if ((d == end1 ? *string2 : *d) == '\n'
5205 && bufp->newline_anchor)
5212 /* Match at the very beginning of the data. */
5214 DEBUG_PRINT1 ("EXECUTING begbuf.\n");
5215 if (AT_STRINGS_BEG (d))
5220 /* Match at the very end of the data. */
5222 DEBUG_PRINT1 ("EXECUTING endbuf.\n");
5223 if (AT_STRINGS_END (d))
5228 /* on_failure_keep_string_jump is used to optimize `.*\n'. It
5229 pushes NULL as the value for the string on the stack. Then
5230 `pop_failure_point' will keep the current value for the
5231 string, instead of restoring it. To see why, consider
5232 matching `foo\nbar' against `.*\n'. The .* matches the foo;
5233 then the . fails against the \n. But the next thing we want
5234 to do is match the \n against the \n; if we restored the
5235 string value, we would be back at the foo.
5237 Because this is used only in specific cases, we don't need to
5238 check all the things that `on_failure_jump' does, to make
5239 sure the right things get saved on the stack. Hence we don't
5240 share its code. The only reason to push anything on the
5241 stack at all is that otherwise we would have to change
5242 `anychar's code to do something besides goto fail in this
5243 case; that seems worse than this. */
5244 case on_failure_keep_string_jump:
5245 DEBUG_PRINT1 ("EXECUTING on_failure_keep_string_jump");
5247 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5248 DEBUG_PRINT3 (" %d (to 0x%lx):\n", mcnt, (long) (p + mcnt));
5250 PUSH_FAILURE_POINT (p + mcnt, (unsigned char *) 0, -2);
5254 /* Uses of on_failure_jump:
5256 Each alternative starts with an on_failure_jump that points
5257 to the beginning of the next alternative. Each alternative
5258 except the last ends with a jump that in effect jumps past
5259 the rest of the alternatives. (They really jump to the
5260 ending jump of the following alternative, because tensioning
5261 these jumps is a hassle.)
5263 Repeats start with an on_failure_jump that points past both
5264 the repetition text and either the following jump or
5265 pop_failure_jump back to this on_failure_jump. */
5266 case on_failure_jump:
5268 DEBUG_PRINT1 ("EXECUTING on_failure_jump");
5270 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5271 DEBUG_PRINT3 (" %d (to 0x%lx)", mcnt, (long) (p + mcnt));
5273 /* If this on_failure_jump comes right before a group (i.e.,
5274 the original * applied to a group), save the information
5275 for that group and all inner ones, so that if we fail back
5276 to this point, the group's information will be correct.
5277 For example, in \(a*\)*\1, we need the preceding group,
5278 and in \(\(a*\)b*\)\2, we need the inner group. */
5280 /* We can't use `p' to check ahead because we push
5281 a failure point to `p + mcnt' after we do this. */
5284 /* We need to skip no_op's before we look for the
5285 start_memory in case this on_failure_jump is happening as
5286 the result of a completed succeed_n, as in \(a\)\{1,3\}b\1
5288 while (p1 < pend && (re_opcode_t) *p1 == no_op)
5291 if (p1 < pend && (re_opcode_t) *p1 == start_memory)
5293 /* We have a new highest active register now. This will
5294 get reset at the start_memory we are about to get to,
5295 but we will have saved all the registers relevant to
5296 this repetition op, as described above. */
5297 highest_active_reg = *(p1 + 1) + *(p1 + 2);
5298 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG)
5299 lowest_active_reg = *(p1 + 1);
5302 DEBUG_PRINT1 (":\n");
5303 PUSH_FAILURE_POINT (p + mcnt, d, -2);
5307 /* A smart repeat ends with `maybe_pop_jump'.
5308 We change it to either `pop_failure_jump' or `jump'. */
5309 case maybe_pop_jump:
5310 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5311 DEBUG_PRINT2 ("EXECUTING maybe_pop_jump %d.\n", mcnt);
5313 REGISTER unsigned char *p2 = p;
5315 /* Compare the beginning of the repeat with what in the
5316 pattern follows its end. If we can establish that there
5317 is nothing that they would both match, i.e., that we
5318 would have to backtrack because of (as in, e.g., `a*a')
5319 then we can change to pop_failure_jump, because we'll
5320 never have to backtrack.
5322 This is not true in the case of alternatives: in
5323 `(a|ab)*' we do need to backtrack to the `ab' alternative
5324 (e.g., if the string was `ab'). But instead of trying to
5325 detect that here, the alternative has put on a dummy
5326 failure point which is what we will end up popping. */
5328 /* Skip over open/close-group commands.
5329 If what follows this loop is a ...+ construct,
5330 look at what begins its body, since we will have to
5331 match at least one of that. */
5335 && ((re_opcode_t) *p2 == stop_memory
5336 || (re_opcode_t) *p2 == start_memory))
5338 else if (p2 + 6 < pend
5339 && (re_opcode_t) *p2 == dummy_failure_jump)
5346 /* p1[0] ... p1[2] are the `on_failure_jump' corresponding
5347 to the `maybe_finalize_jump' of this case. Examine what
5350 /* If we're at the end of the pattern, we can change. */
5353 /* Consider what happens when matching ":\(.*\)"
5354 against ":/". I don't really understand this code
5356 p[-3] = (unsigned char) pop_failure_jump;
5358 (" End of pattern: change to `pop_failure_jump'.\n");
5361 else if ((re_opcode_t) *p2 == exactn
5362 || (bufp->newline_anchor && (re_opcode_t) *p2 == endline))
5364 REGISTER unsigned char c
5365 = *p2 == (unsigned char) endline ? '\n' : p2[2];
5367 if ((re_opcode_t) p1[3] == exactn && p1[5] != c)
5369 p[-3] = (unsigned char) pop_failure_jump;
5370 DEBUG_PRINT3 (" %c != %c => pop_failure_jump.\n",
5374 else if ((re_opcode_t) p1[3] == charset
5375 || (re_opcode_t) p1[3] == charset_not)
5377 int not_p = (re_opcode_t) p1[3] == charset_not;
5379 if (c < (unsigned char) (p1[4] * BYTEWIDTH)
5380 && p1[5 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
5383 /* `not_p' is equal to 1 if c would match, which means
5384 that we can't change to pop_failure_jump. */
5387 p[-3] = (unsigned char) pop_failure_jump;
5388 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
5392 else if ((re_opcode_t) *p2 == charset)
5395 REGISTER unsigned char c
5396 = *p2 == (unsigned char) endline ? '\n' : p2[2];
5399 if ((re_opcode_t) p1[3] == exactn
5400 && ! ((int) p2[1] * BYTEWIDTH > (int) p1[5]
5401 && (p2[2 + p1[5] / BYTEWIDTH]
5402 & (1 << (p1[5] % BYTEWIDTH)))))
5404 p[-3] = (unsigned char) pop_failure_jump;
5405 DEBUG_PRINT3 (" %c != %c => pop_failure_jump.\n",
5409 else if ((re_opcode_t) p1[3] == charset_not)
5412 /* We win if the charset_not inside the loop
5413 lists every character listed in the charset after. */
5414 for (idx = 0; idx < (int) p2[1]; idx++)
5415 if (! (p2[2 + idx] == 0
5416 || (idx < (int) p1[4]
5417 && ((p2[2 + idx] & ~ p1[5 + idx]) == 0))))
5422 p[-3] = (unsigned char) pop_failure_jump;
5423 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
5426 else if ((re_opcode_t) p1[3] == charset)
5429 /* We win if the charset inside the loop
5430 has no overlap with the one after the loop. */
5432 idx < (int) p2[1] && idx < (int) p1[4];
5434 if ((p2[2 + idx] & p1[5 + idx]) != 0)
5437 if (idx == p2[1] || idx == p1[4])
5439 p[-3] = (unsigned char) pop_failure_jump;
5440 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
5445 p -= 2; /* Point at relative address again. */
5446 if ((re_opcode_t) p[-1] != pop_failure_jump)
5448 p[-1] = (unsigned char) jump;
5449 DEBUG_PRINT1 (" Match => jump.\n");
5450 goto unconditional_jump;
5452 /* Note fall through. */
5455 /* The end of a simple repeat has a pop_failure_jump back to
5456 its matching on_failure_jump, where the latter will push a
5457 failure point. The pop_failure_jump takes off failure
5458 points put on by this pop_failure_jump's matching
5459 on_failure_jump; we got through the pattern to here from the
5460 matching on_failure_jump, so didn't fail. */
5461 case pop_failure_jump:
5463 /* We need to pass separate storage for the lowest and
5464 highest registers, even though we don't care about the
5465 actual values. Otherwise, we will restore only one
5466 register from the stack, since lowest will == highest in
5467 `pop_failure_point'. */
5468 int dummy_low_reg, dummy_high_reg;
5469 unsigned char *pdummy;
5470 re_char *sdummy = NULL;
5472 DEBUG_PRINT1 ("EXECUTING pop_failure_jump.\n");
5473 POP_FAILURE_POINT (sdummy, pdummy,
5474 dummy_low_reg, dummy_high_reg,
5475 reg_dummy, reg_dummy, reg_info_dummy);
5477 /* Note fall through. */
5480 /* Unconditionally jump (without popping any failure points). */
5483 EXTRACT_NUMBER_AND_INCR (mcnt, p); /* Get the amount to jump. */
5484 DEBUG_PRINT2 ("EXECUTING jump %d ", mcnt);
5485 p += mcnt; /* Do the jump. */
5486 DEBUG_PRINT2 ("(to 0x%lx).\n", (long) p);
5490 /* We need this opcode so we can detect where alternatives end
5491 in `group_match_null_string_p' et al. */
5493 DEBUG_PRINT1 ("EXECUTING jump_past_alt.\n");
5494 goto unconditional_jump;
5497 /* Normally, the on_failure_jump pushes a failure point, which
5498 then gets popped at pop_failure_jump. We will end up at
5499 pop_failure_jump, also, and with a pattern of, say, `a+', we
5500 are skipping over the on_failure_jump, so we have to push
5501 something meaningless for pop_failure_jump to pop. */
5502 case dummy_failure_jump:
5503 DEBUG_PRINT1 ("EXECUTING dummy_failure_jump.\n");
5504 /* It doesn't matter what we push for the string here. What
5505 the code at `fail' tests is the value for the pattern. */
5506 PUSH_FAILURE_POINT ((unsigned char *) 0, (unsigned char *) 0, -2);
5507 goto unconditional_jump;
5510 /* At the end of an alternative, we need to push a dummy failure
5511 point in case we are followed by a `pop_failure_jump', because
5512 we don't want the failure point for the alternative to be
5513 popped. For example, matching `(a|ab)*' against `aab'
5514 requires that we match the `ab' alternative. */
5515 case push_dummy_failure:
5516 DEBUG_PRINT1 ("EXECUTING push_dummy_failure.\n");
5517 /* See comments just above at `dummy_failure_jump' about the
5519 PUSH_FAILURE_POINT ((unsigned char *) 0, (unsigned char *) 0, -2);
5522 /* Have to succeed matching what follows at least n times.
5523 After that, handle like `on_failure_jump'. */
5525 EXTRACT_NUMBER (mcnt, p + 2);
5526 DEBUG_PRINT2 ("EXECUTING succeed_n %d.\n", mcnt);
5529 /* Originally, this is how many times we HAVE to succeed. */
5534 STORE_NUMBER_AND_INCR (p, mcnt);
5535 DEBUG_PRINT3 (" Setting 0x%lx to %d.\n", (long) p, mcnt);
5539 DEBUG_PRINT2 (" Setting two bytes from 0x%lx to no_op.\n",
5541 p[2] = (unsigned char) no_op;
5542 p[3] = (unsigned char) no_op;
5548 EXTRACT_NUMBER (mcnt, p + 2);
5549 DEBUG_PRINT2 ("EXECUTING jump_n %d.\n", mcnt);
5551 /* Originally, this is how many times we CAN jump. */
5555 STORE_NUMBER (p + 2, mcnt);
5556 goto unconditional_jump;
5558 /* If don't have to jump any more, skip over the rest of command. */
5565 DEBUG_PRINT1 ("EXECUTING set_number_at.\n");
5567 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5569 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5570 DEBUG_PRINT3 (" Setting 0x%lx to %d.\n", (long) p1, mcnt);
5571 STORE_NUMBER (p1, mcnt);
5576 DEBUG_PRINT1 ("EXECUTING wordbound.\n");
5581 /* Straightforward and (I hope) correct implementation.
5582 Probably should be optimized by arranging to compute
5584 /* emch1 is the character before d, syn1 is the syntax of
5585 emch1, emch2 is the character at d, and syn2 is the
5587 Emchar emch1, emch2;
5589 re_char *d_before, *d_after;
5591 at_beg = AT_STRINGS_BEG (d),
5592 at_end = AT_STRINGS_END (d);
5597 if (at_beg && at_end)
5605 d_before = POS_BEFORE_GAP_UNSAFE (d);
5606 DEC_CHARPTR (d_before);
5607 emch1 = charptr_emchar (d_before);
5609 xpos = SYNTAX_CACHE_BYTE_TO_CHAR (PTR_TO_OFFSET (d)) - 1;
5610 UPDATE_SYNTAX_CACHE (xpos);
5612 syn1 = SYNTAX_FROM_CACHE
5613 (XCHAR_TABLE (regex_emacs_buffer
5614 ->mirror_syntax_table),
5619 d_after = POS_AFTER_GAP_UNSAFE (d);
5620 emch2 = charptr_emchar (d_after);
5622 xpos = SYNTAX_CACHE_BYTE_TO_CHAR (PTR_TO_OFFSET (d));
5623 UPDATE_SYNTAX_CACHE_FORWARD (xpos + 1);
5625 syn2 = SYNTAX_FROM_CACHE
5626 (XCHAR_TABLE (regex_emacs_buffer
5627 ->mirror_syntax_table),
5632 result = (syn2 == Sword);
5634 result = (syn1 == Sword);
5636 result = ((syn1 == Sword) != (syn2 == Sword));
5639 if (result == should_succeed)
5645 DEBUG_PRINT1 ("EXECUTING notwordbound.\n");
5647 goto matchwordbound;
5650 DEBUG_PRINT1 ("EXECUTING wordbeg.\n");
5651 if (AT_STRINGS_END (d))
5654 /* XEmacs: this originally read:
5656 if (WORDCHAR_P (d) && (AT_STRINGS_BEG (d) || !WORDCHAR_P (d - 1)))
5660 re_char *dtmp = POS_AFTER_GAP_UNSAFE (d);
5661 Emchar emch = charptr_emchar (dtmp);
5663 int charpos = SYNTAX_CACHE_BYTE_TO_CHAR (PTR_TO_OFFSET (d));
5664 UPDATE_SYNTAX_CACHE (charpos);
5666 if (SYNTAX_FROM_CACHE (XCHAR_TABLE (regex_emacs_buffer->mirror_syntax_table),
5669 if (AT_STRINGS_BEG (d))
5671 dtmp = POS_BEFORE_GAP_UNSAFE (d);
5673 emch = charptr_emchar (dtmp);
5675 UPDATE_SYNTAX_CACHE_BACKWARD (charpos - 1);
5677 if (SYNTAX_FROM_CACHE (XCHAR_TABLE (regex_emacs_buffer->mirror_syntax_table),
5684 DEBUG_PRINT1 ("EXECUTING wordend.\n");
5685 if (AT_STRINGS_BEG (d))
5688 /* XEmacs: this originally read:
5690 if (!AT_STRINGS_BEG (d) && WORDCHAR_P (d - 1)
5691 && (!WORDCHAR_P (d) || AT_STRINGS_END (d)))
5694 The or condition is incorrect (reversed).
5699 int charpos = SYNTAX_CACHE_BYTE_TO_CHAR (PTR_TO_OFFSET (d)) - 1;
5700 UPDATE_SYNTAX_CACHE (charpos);
5702 dtmp = POS_BEFORE_GAP_UNSAFE (d);
5704 emch = charptr_emchar (dtmp);
5705 if (SYNTAX_FROM_CACHE (XCHAR_TABLE (regex_emacs_buffer->mirror_syntax_table),
5708 if (AT_STRINGS_END (d))
5710 dtmp = POS_AFTER_GAP_UNSAFE (d);
5711 emch = charptr_emchar (dtmp);
5713 UPDATE_SYNTAX_CACHE_FORWARD (charpos + 1);
5715 if (SYNTAX_FROM_CACHE (XCHAR_TABLE (regex_emacs_buffer->mirror_syntax_table),
5723 DEBUG_PRINT1 ("EXECUTING before_dot.\n");
5724 if (! (NILP (regex_match_object) || BUFFERP (regex_match_object))
5725 || (BUF_PTR_BYTE_POS (regex_emacs_buffer, (unsigned char *) d)
5726 >= BUF_PT (regex_emacs_buffer)))
5731 DEBUG_PRINT1 ("EXECUTING at_dot.\n");
5732 if (! (NILP (regex_match_object) || BUFFERP (regex_match_object))
5733 || (BUF_PTR_BYTE_POS (regex_emacs_buffer, (unsigned char *) d)
5734 != BUF_PT (regex_emacs_buffer)))
5739 DEBUG_PRINT1 ("EXECUTING after_dot.\n");
5740 if (! (NILP (regex_match_object) || BUFFERP (regex_match_object))
5741 || (BUF_PTR_BYTE_POS (regex_emacs_buffer, (unsigned char *) d)
5742 <= BUF_PT (regex_emacs_buffer)))
5745 #if 0 /* not emacs19 */
5747 DEBUG_PRINT1 ("EXECUTING at_dot.\n");
5748 if (BUF_PTR_BYTE_POS (regex_emacs_buffer, (unsigned char *) d) + 1
5749 != BUF_PT (regex_emacs_buffer))
5752 #endif /* not emacs19 */
5755 DEBUG_PRINT2 ("EXECUTING syntaxspec %d.\n", mcnt);
5760 DEBUG_PRINT1 ("EXECUTING Emacs wordchar.\n");
5772 int charpos = SYNTAX_CACHE_BYTE_TO_CHAR (PTR_TO_OFFSET (d));
5773 UPDATE_SYNTAX_CACHE (charpos);
5777 emch = charptr_emchar ((const Bufbyte *) d);
5779 matches = (SYNTAX_FROM_CACHE (XCHAR_TABLE (regex_emacs_buffer->syntax_table),
5780 emch) == (enum syntaxcode) mcnt);
5782 matches = (SYNTAX_FROM_CACHE (XCHAR_TABLE (regex_emacs_buffer->mirror_syntax_table),
5783 emch) == (enum syntaxcode) mcnt);
5786 if (matches != should_succeed)
5788 SET_REGS_MATCHED ();
5793 DEBUG_PRINT2 ("EXECUTING notsyntaxspec %d.\n", mcnt);
5795 goto matchnotsyntax;
5798 DEBUG_PRINT1 ("EXECUTING Emacs notwordchar.\n");
5802 goto matchornotsyntax;
5805 /* 97/2/17 jhod Mule category code patch */
5814 emch = charptr_emchar ((const Bufbyte *) d);
5816 if (check_category_char(emch, regex_emacs_buffer->category_table,
5817 mcnt, should_succeed))
5819 SET_REGS_MATCHED ();
5823 case notcategoryspec:
5825 goto matchornotcategory;
5826 /* end of category patch */
5828 #else /* not emacs */
5830 DEBUG_PRINT1 ("EXECUTING non-Emacs wordchar.\n");
5832 if (!WORDCHAR_P_UNSAFE ((int) (*d)))
5834 SET_REGS_MATCHED ();
5839 DEBUG_PRINT1 ("EXECUTING non-Emacs notwordchar.\n");
5841 if (!WORDCHAR_P_UNSAFE ((int) (*d)))
5843 SET_REGS_MATCHED ();
5851 continue; /* Successfully executed one pattern command; keep going. */
5854 /* We goto here if a matching operation fails. */
5856 if (!FAIL_STACK_EMPTY ())
5857 { /* A restart point is known. Restore to that state. */
5858 DEBUG_PRINT1 ("\nFAIL:\n");
5859 POP_FAILURE_POINT (d, p,
5860 lowest_active_reg, highest_active_reg,
5861 regstart, regend, reg_info);
5863 /* If this failure point is a dummy, try the next one. */
5867 /* If we failed to the end of the pattern, don't examine *p. */
5871 re_bool is_a_jump_n = false;
5873 /* If failed to a backwards jump that's part of a repetition
5874 loop, need to pop this failure point and use the next one. */
5875 switch ((re_opcode_t) *p)
5879 case maybe_pop_jump:
5880 case pop_failure_jump:
5883 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
5886 if ((is_a_jump_n && (re_opcode_t) *p1 == succeed_n)
5888 && (re_opcode_t) *p1 == on_failure_jump))
5896 if (d >= string1 && d <= end1)
5900 break; /* Matching at this starting point really fails. */
5904 goto restore_best_regs;
5908 return -1; /* Failure to match. */
5911 /* Subroutine definitions for re_match_2. */
5914 /* We are passed P pointing to a register number after a start_memory.
5916 Return true if the pattern up to the corresponding stop_memory can
5917 match the empty string, and false otherwise.
5919 If we find the matching stop_memory, sets P to point to one past its number.
5920 Otherwise, sets P to an undefined byte less than or equal to END.
5922 We don't handle duplicates properly (yet). */
5925 group_match_null_string_p (unsigned char **p, unsigned char *end,
5926 register_info_type *reg_info)
5929 /* Point to after the args to the start_memory. */
5930 unsigned char *p1 = *p + 2;
5934 /* Skip over opcodes that can match nothing, and return true or
5935 false, as appropriate, when we get to one that can't, or to the
5936 matching stop_memory. */
5938 switch ((re_opcode_t) *p1)
5940 /* Could be either a loop or a series of alternatives. */
5941 case on_failure_jump:
5943 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
5945 /* If the next operation is not a jump backwards in the
5950 /* Go through the on_failure_jumps of the alternatives,
5951 seeing if any of the alternatives cannot match nothing.
5952 The last alternative starts with only a jump,
5953 whereas the rest start with on_failure_jump and end
5954 with a jump, e.g., here is the pattern for `a|b|c':
5956 /on_failure_jump/0/6/exactn/1/a/jump_past_alt/0/6
5957 /on_failure_jump/0/6/exactn/1/b/jump_past_alt/0/3
5960 So, we have to first go through the first (n-1)
5961 alternatives and then deal with the last one separately. */
5964 /* Deal with the first (n-1) alternatives, which start
5965 with an on_failure_jump (see above) that jumps to right
5966 past a jump_past_alt. */
5968 while ((re_opcode_t) p1[mcnt-3] == jump_past_alt)
5970 /* `mcnt' holds how many bytes long the alternative
5971 is, including the ending `jump_past_alt' and
5974 if (!alt_match_null_string_p (p1, p1 + mcnt - 3,
5978 /* Move to right after this alternative, including the
5982 /* Break if it's the beginning of an n-th alternative
5983 that doesn't begin with an on_failure_jump. */
5984 if ((re_opcode_t) *p1 != on_failure_jump)
5987 /* Still have to check that it's not an n-th
5988 alternative that starts with an on_failure_jump. */
5990 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
5991 if ((re_opcode_t) p1[mcnt-3] != jump_past_alt)
5993 /* Get to the beginning of the n-th alternative. */
5999 /* Deal with the last alternative: go back and get number
6000 of the `jump_past_alt' just before it. `mcnt' contains
6001 the length of the alternative. */
6002 EXTRACT_NUMBER (mcnt, p1 - 2);
6004 if (!alt_match_null_string_p (p1, p1 + mcnt, reg_info))
6007 p1 += mcnt; /* Get past the n-th alternative. */
6013 assert (p1[1] == **p);
6019 if (!common_op_match_null_string_p (&p1, end, reg_info))
6022 } /* while p1 < end */
6025 } /* group_match_null_string_p */
6028 /* Similar to group_match_null_string_p, but doesn't deal with alternatives:
6029 It expects P to be the first byte of a single alternative and END one
6030 byte past the last. The alternative can contain groups. */
6033 alt_match_null_string_p (unsigned char *p, unsigned char *end,
6034 register_info_type *reg_info)
6037 unsigned char *p1 = p;
6041 /* Skip over opcodes that can match nothing, and break when we get
6042 to one that can't. */
6044 switch ((re_opcode_t) *p1)
6047 case on_failure_jump:
6049 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
6054 if (!common_op_match_null_string_p (&p1, end, reg_info))
6057 } /* while p1 < end */
6060 } /* alt_match_null_string_p */
6063 /* Deals with the ops common to group_match_null_string_p and
6064 alt_match_null_string_p.
6066 Sets P to one after the op and its arguments, if any. */
6069 common_op_match_null_string_p (unsigned char **p, unsigned char *end,
6070 register_info_type *reg_info)
6075 unsigned char *p1 = *p;
6077 switch ((re_opcode_t) *p1++)
6097 assert (reg_no > 0 && reg_no <= MAX_REGNUM);
6098 ret = group_match_null_string_p (&p1, end, reg_info);
6100 /* Have to set this here in case we're checking a group which
6101 contains a group and a back reference to it. */
6103 if (REG_MATCH_NULL_STRING_P (reg_info[reg_no]) == MATCH_NULL_UNSET_VALUE)
6104 REG_MATCH_NULL_STRING_P (reg_info[reg_no]) = ret;
6110 /* If this is an optimized succeed_n for zero times, make the jump. */
6112 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
6120 /* Get to the number of times to succeed. */
6122 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
6127 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
6135 if (!REG_MATCH_NULL_STRING_P (reg_info[*p1]))
6143 /* All other opcodes mean we cannot match the empty string. */
6149 } /* common_op_match_null_string_p */
6152 /* Return zero if TRANSLATE[S1] and TRANSLATE[S2] are identical for LEN
6153 bytes; nonzero otherwise. */
6156 bcmp_translate (re_char *s1, re_char *s2,
6157 REGISTER int len, RE_TRANSLATE_TYPE translate)
6159 REGISTER const unsigned char *p1 = s1, *p2 = s2;
6161 const unsigned char *p1_end = s1 + len;
6162 const unsigned char *p2_end = s2 + len;
6164 while (p1 != p1_end && p2 != p2_end)
6166 Emchar p1_ch, p2_ch;
6168 p1_ch = charptr_emchar (p1);
6169 p2_ch = charptr_emchar (p2);
6171 if (RE_TRANSLATE (p1_ch)
6172 != RE_TRANSLATE (p2_ch))
6177 #else /* not MULE */
6180 if (RE_TRANSLATE (*p1++) != RE_TRANSLATE (*p2++)) return 1;
6187 /* Entry points for GNU code. */
6189 /* re_compile_pattern is the GNU regular expression compiler: it
6190 compiles PATTERN (of length SIZE) and puts the result in BUFP.
6191 Returns 0 if the pattern was valid, otherwise an error string.
6193 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
6194 are set in BUFP on entry.
6196 We call regex_compile to do the actual compilation. */
6199 re_compile_pattern (const char *pattern, int length,
6200 struct re_pattern_buffer *bufp)
6204 /* GNU code is written to assume at least RE_NREGS registers will be set
6205 (and at least one extra will be -1). */
6206 bufp->regs_allocated = REGS_UNALLOCATED;
6208 /* And GNU code determines whether or not to get register information
6209 by passing null for the REGS argument to re_match, etc., not by
6213 /* Match anchors at newline. */
6214 bufp->newline_anchor = 1;
6216 ret = regex_compile ((unsigned char *) pattern, length, re_syntax_options, bufp);
6220 return gettext (re_error_msgid[(int) ret]);
6223 /* Entry points compatible with 4.2 BSD regex library. We don't define
6224 them unless specifically requested. */
6226 #ifdef _REGEX_RE_COMP
6228 /* BSD has one and only one pattern buffer. */
6229 static struct re_pattern_buffer re_comp_buf;
6232 re_comp (const char *s)
6238 if (!re_comp_buf.buffer)
6239 return gettext ("No previous regular expression");
6243 if (!re_comp_buf.buffer)
6245 re_comp_buf.buffer = (unsigned char *) malloc (200);
6246 if (re_comp_buf.buffer == NULL)
6247 return gettext (re_error_msgid[(int) REG_ESPACE]);
6248 re_comp_buf.allocated = 200;
6250 re_comp_buf.fastmap = (char *) malloc (1 << BYTEWIDTH);
6251 if (re_comp_buf.fastmap == NULL)
6252 return gettext (re_error_msgid[(int) REG_ESPACE]);
6255 /* Since `re_exec' always passes NULL for the `regs' argument, we
6256 don't need to initialize the pattern buffer fields which affect it. */
6258 /* Match anchors at newlines. */
6259 re_comp_buf.newline_anchor = 1;
6261 ret = regex_compile ((unsigned char *)s, strlen (s), re_syntax_options, &re_comp_buf);
6266 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
6267 return (char *) gettext (re_error_msgid[(int) ret]);
6272 re_exec (const char *s)
6274 const int len = strlen (s);
6276 0 <= re_search (&re_comp_buf, s, len, 0, len, (struct re_registers *) 0);
6278 #endif /* _REGEX_RE_COMP */
6280 /* POSIX.2 functions. Don't define these for Emacs. */
6284 /* regcomp takes a regular expression as a string and compiles it.
6286 PREG is a regex_t *. We do not expect any fields to be initialized,
6287 since POSIX says we shouldn't. Thus, we set
6289 `buffer' to the compiled pattern;
6290 `used' to the length of the compiled pattern;
6291 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
6292 REG_EXTENDED bit in CFLAGS is set; otherwise, to
6293 RE_SYNTAX_POSIX_BASIC;
6294 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
6295 `fastmap' and `fastmap_accurate' to zero;
6296 `re_nsub' to the number of subexpressions in PATTERN.
6298 PATTERN is the address of the pattern string.
6300 CFLAGS is a series of bits which affect compilation.
6302 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
6303 use POSIX basic syntax.
6305 If REG_NEWLINE is set, then . and [^...] don't match newline.
6306 Also, regexec will try a match beginning after every newline.
6308 If REG_ICASE is set, then we considers upper- and lowercase
6309 versions of letters to be equivalent when matching.
6311 If REG_NOSUB is set, then when PREG is passed to regexec, that
6312 routine will report only success or failure, and nothing about the
6315 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
6316 the return codes and their meanings.) */
6319 regcomp (regex_t *preg, const char *pattern, int cflags)
6323 = (cflags & REG_EXTENDED) ?
6324 RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC;
6326 /* regex_compile will allocate the space for the compiled pattern. */
6328 preg->allocated = 0;
6331 /* Don't bother to use a fastmap when searching. This simplifies the
6332 REG_NEWLINE case: if we used a fastmap, we'd have to put all the
6333 characters after newlines into the fastmap. This way, we just try
6337 if (cflags & REG_ICASE)
6341 preg->translate = (char *) malloc (CHAR_SET_SIZE);
6342 if (preg->translate == NULL)
6343 return (int) REG_ESPACE;
6345 /* Map uppercase characters to corresponding lowercase ones. */
6346 for (i = 0; i < CHAR_SET_SIZE; i++)
6347 preg->translate[i] = ISUPPER (i) ? tolower (i) : i;
6350 preg->translate = NULL;
6352 /* If REG_NEWLINE is set, newlines are treated differently. */
6353 if (cflags & REG_NEWLINE)
6354 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
6355 syntax &= ~RE_DOT_NEWLINE;
6356 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
6357 /* It also changes the matching behavior. */
6358 preg->newline_anchor = 1;
6361 preg->newline_anchor = 0;
6363 preg->no_sub = !!(cflags & REG_NOSUB);
6365 /* POSIX says a null character in the pattern terminates it, so we
6366 can use strlen here in compiling the pattern. */
6367 ret = regex_compile ((unsigned char *) pattern, strlen (pattern), syntax, preg);
6369 /* POSIX doesn't distinguish between an unmatched open-group and an
6370 unmatched close-group: both are REG_EPAREN. */
6371 if (ret == REG_ERPAREN) ret = REG_EPAREN;
6377 /* regexec searches for a given pattern, specified by PREG, in the
6380 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
6381 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
6382 least NMATCH elements, and we set them to the offsets of the
6383 corresponding matched substrings.
6385 EFLAGS specifies `execution flags' which affect matching: if
6386 REG_NOTBOL is set, then ^ does not match at the beginning of the
6387 string; if REG_NOTEOL is set, then $ does not match at the end.
6389 We return 0 if we find a match and REG_NOMATCH if not. */
6392 regexec (const regex_t *preg, const char *string, Element_count nmatch,
6393 regmatch_t pmatch[], int eflags)
6396 struct re_registers regs;
6397 regex_t private_preg;
6398 int len = strlen (string);
6399 re_bool want_reg_info = !preg->no_sub && nmatch > 0;
6401 private_preg = *preg;
6403 private_preg.not_bol = !!(eflags & REG_NOTBOL);
6404 private_preg.not_eol = !!(eflags & REG_NOTEOL);
6406 /* The user has told us exactly how many registers to return
6407 information about, via `nmatch'. We have to pass that on to the
6408 matching routines. */
6409 private_preg.regs_allocated = REGS_FIXED;
6413 regs.num_regs = nmatch;
6414 regs.start = TALLOC (nmatch, regoff_t);
6415 regs.end = TALLOC (nmatch, regoff_t);
6416 if (regs.start == NULL || regs.end == NULL)
6417 return (int) REG_NOMATCH;
6420 /* Perform the searching operation. */
6421 ret = re_search (&private_preg, string, len,
6422 /* start: */ 0, /* range: */ len,
6423 want_reg_info ? ®s : (struct re_registers *) 0);
6425 /* Copy the register information to the POSIX structure. */
6432 for (r = 0; r < nmatch; r++)
6434 pmatch[r].rm_so = regs.start[r];
6435 pmatch[r].rm_eo = regs.end[r];
6439 /* If we needed the temporary register info, free the space now. */
6444 /* We want zero return to mean success, unlike `re_search'. */
6445 return ret >= 0 ? (int) REG_NOERROR : (int) REG_NOMATCH;
6449 /* Returns a message corresponding to an error code, ERRCODE, returned
6450 from either regcomp or regexec. We don't use PREG here. */
6453 regerror (int errcode, const regex_t *preg, char *errbuf,
6454 Memory_count errbuf_size)
6457 Memory_count msg_size;
6460 || (size_t) errcode >= (sizeof (re_error_msgid)
6461 / sizeof (re_error_msgid[0])))
6462 /* Only error codes returned by the rest of the code should be passed
6463 to this routine. If we are given anything else, or if other regex
6464 code generates an invalid error code, then the program has a bug.
6465 Dump core so we can fix it. */
6468 msg = gettext (re_error_msgid[errcode]);
6470 msg_size = strlen (msg) + 1; /* Includes the null. */
6472 if (errbuf_size != 0)
6474 if (msg_size > errbuf_size)
6476 strncpy (errbuf, msg, errbuf_size - 1);
6477 errbuf[errbuf_size - 1] = 0;
6480 strcpy (errbuf, msg);
6487 /* Free dynamically allocated space used by PREG. */
6490 regfree (regex_t *preg)
6492 if (preg->buffer != NULL)
6493 free (preg->buffer);
6494 preg->buffer = NULL;
6496 preg->allocated = 0;
6499 if (preg->fastmap != NULL)
6500 free (preg->fastmap);
6501 preg->fastmap = NULL;
6502 preg->fastmap_accurate = 0;
6504 if (preg->translate != NULL)
6505 free (preg->translate);
6506 preg->translate = NULL;
6509 #endif /* not emacs */
6513 make-backup-files: t
6515 trim-versions-without-asking: nil