1 /* Extended regular expression matching and search library,
2 version 0.12, extended for XEmacs.
3 (Implements POSIX draft P10003.2/D11.2, except for
4 internationalization features.)
6 Copyright (C) 1993, 1994, 1995 Free Software Foundation, Inc.
7 Copyright (C) 1995 Sun Microsystems, Inc.
8 Copyright (C) 1995 Ben Wing.
10 This program is free software; you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 2, or (at your option)
15 This program is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License for more details.
20 You should have received a copy of the GNU General Public License
21 along with this program; see the file COPYING. If not, write to
22 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 Boston, MA 02111-1307, USA. */
25 /* Synched up with: FSF 19.29. */
27 /* Changes made for XEmacs:
29 (1) the REGEX_BEGLINE_CHECK code from the XEmacs v18 regex routines
30 was added. This causes a huge speedup in font-locking.
31 (2) Rel-alloc is disabled when the MMAP version of rel-alloc is
32 being used, because it's too slow -- all those calls to mmap()
33 add humongous overhead.
34 (3) Lots and lots of changes for Mule. They are bracketed by
35 `#ifdef MULE' or with comments that have `XEmacs' in them.
42 #ifndef REGISTER /* Rigidly enforced as of 20.3 */
51 /* Converts the pointer to the char to BEG-based offset from the start. */
52 #define PTR_TO_OFFSET(d) (MATCHING_IN_FIRST_STRING \
53 ? (d) - string1 : (d) - (string2 - size1))
55 #define PTR_TO_OFFSET(d) 0
58 /* We assume non-Mule if emacs isn't defined. */
63 /* We need this for `regex.h', and perhaps for the Emacs include files. */
64 #include <sys/types.h>
66 /* This is for other GNU distributions with internationalized messages. */
67 #if defined (I18N3) && (defined (HAVE_LIBINTL_H) || defined (_LIBC))
70 # define gettext(msgid) (msgid)
73 /* XEmacs: define this to add in a speedup for patterns anchored at
74 the beginning of a line. Keep the ifdefs so that it's easier to
75 tell where/why this code has diverged from v19. */
76 #define REGEX_BEGLINE_CHECK
78 /* XEmacs: the current mmap-based ralloc handles small blocks very
79 poorly, so we disable it here. */
81 #if (defined (REL_ALLOC) && defined (HAVE_MMAP)) || defined(DOUG_LEA_MALLOC)
85 /* The `emacs' switch turns on certain matching commands
86 that make sense only in Emacs. */
93 #if (defined (DEBUG_XEMACS) && !defined (DEBUG))
99 Lisp_Object Vthe_lisp_rangetab;
102 complex_vars_of_regex (void)
104 Vthe_lisp_rangetab = Fmake_range_table ();
105 staticpro (&Vthe_lisp_rangetab);
111 complex_vars_of_regex (void)
117 #define RE_TRANSLATE(ch) TRT_TABLE_OF (translate, (Emchar) ch)
118 #define TRANSLATE_P(tr) (!NILP (tr))
120 #else /* not emacs */
122 /* If we are not linking with Emacs proper,
123 we can't use the relocating allocator
124 even if config.h says that we can. */
127 #if defined (STDC_HEADERS) || defined (_LIBC)
135 #include <stddef.h> /* for ptrdiff_t */
137 #define charptr_emchar(str) ((Emchar) (str)[0])
141 #define INC_CHARPTR(p) ((p)++)
142 #define DEC_CHARPTR(p) ((p)--)
146 /* Define the syntax stuff for \<, \>, etc. */
148 /* This must be nonzero for the wordchar and notwordchar pattern
149 commands in re_match_2. */
156 extern char *re_syntax_table;
158 #else /* not SYNTAX_TABLE */
160 /* How many characters in the character set. */
161 #define CHAR_SET_SIZE 256
163 static char re_syntax_table[CHAR_SET_SIZE];
166 init_syntax_once (void)
172 const char *word_syntax_chars =
173 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789_";
175 memset (re_syntax_table, 0, sizeof (re_syntax_table));
177 while (*word_syntax_chars)
178 re_syntax_table[(unsigned int)(*word_syntax_chars++)] = Sword;
184 #endif /* SYNTAX_TABLE */
186 #define SYNTAX_UNSAFE(ignored, c) re_syntax_table[c]
187 #undef SYNTAX_FROM_CACHE
188 #define SYNTAX_FROM_CACHE SYNTAX_UNSAFE
190 #define RE_TRANSLATE(c) translate[(unsigned char) (c)]
191 #define TRANSLATE_P(tr) tr
195 /* Under XEmacs, this is needed because we don't define it elsewhere. */
196 #ifdef SWITCH_ENUM_BUG
197 #define SWITCH_ENUM_CAST(x) ((int)(x))
199 #define SWITCH_ENUM_CAST(x) (x)
203 /* Get the interface, including the syntax bits. */
206 /* isalpha etc. are used for the character classes. */
209 /* Jim Meyering writes:
211 "... Some ctype macros are valid only for character codes that
212 isascii says are ASCII (SGI's IRIX-4.0.5 is one such system --when
213 using /bin/cc or gcc but without giving an ansi option). So, all
214 ctype uses should be through macros like ISPRINT... If
215 STDC_HEADERS is defined, then autoconf has verified that the ctype
216 macros don't need to be guarded with references to isascii. ...
217 Defining isascii to 1 should let any compiler worth its salt
218 eliminate the && through constant folding." */
220 #if defined (STDC_HEADERS) || (!defined (isascii) && !defined (HAVE_ISASCII))
221 #define ISASCII_1(c) 1
223 #define ISASCII_1(c) isascii(c)
227 /* The IS*() macros can be passed any character, including an extended
228 one. We need to make sure there are no crashes, which would occur
229 otherwise due to out-of-bounds array references. */
230 #define ISASCII(c) (((EMACS_UINT) (c)) < 0x100 && ISASCII_1 (c))
232 #define ISASCII(c) ISASCII_1 (c)
236 #define ISBLANK(c) (ISASCII (c) && isblank (c))
238 #define ISBLANK(c) ((c) == ' ' || (c) == '\t')
241 #define ISGRAPH(c) (ISASCII (c) && isgraph (c))
243 #define ISGRAPH(c) (ISASCII (c) && isprint (c) && !isspace (c))
246 #define ISPRINT(c) (ISASCII (c) && isprint (c))
247 #define ISDIGIT(c) (ISASCII (c) && isdigit (c))
248 #define ISALNUM(c) (ISASCII (c) && isalnum (c))
249 #define ISALPHA(c) (ISASCII (c) && isalpha (c))
250 #define ISCNTRL(c) (ISASCII (c) && iscntrl (c))
251 #define ISLOWER(c) (ISASCII (c) && islower (c))
252 #define ISPUNCT(c) (ISASCII (c) && ispunct (c))
253 #define ISSPACE(c) (ISASCII (c) && isspace (c))
254 #define ISUPPER(c) (ISASCII (c) && isupper (c))
255 #define ISXDIGIT(c) (ISASCII (c) && isxdigit (c))
258 #define NULL (void *)0
261 /* We remove any previous definition of `SIGN_EXTEND_CHAR',
262 since ours (we hope) works properly with all combinations of
263 machines, compilers, `char' and `unsigned char' argument types.
264 (Per Bothner suggested the basic approach.) */
265 #undef SIGN_EXTEND_CHAR
267 #define SIGN_EXTEND_CHAR(c) ((signed char) (c))
268 #else /* not __STDC__ */
269 /* As in Harbison and Steele. */
270 #define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128)
273 /* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we
274 use `alloca' instead of `malloc'. This is because using malloc in
275 re_search* or re_match* could cause memory leaks when C-g is used in
276 Emacs; also, malloc is slower and causes storage fragmentation. On
277 the other hand, malloc is more portable, and easier to debug.
279 Because we sometimes use alloca, some routines have to be macros,
280 not functions -- `alloca'-allocated space disappears at the end of the
281 function it is called in. */
285 #define REGEX_ALLOCATE malloc
286 #define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize)
287 #define REGEX_FREE free
289 #else /* not REGEX_MALLOC */
291 /* Emacs already defines alloca, sometimes. */
294 /* Make alloca work the best possible way. */
296 #define alloca __builtin_alloca
297 #else /* not __GNUC__ */
300 #else /* not __GNUC__ or HAVE_ALLOCA_H */
301 #ifndef _AIX /* Already did AIX, up at the top. */
303 #endif /* not _AIX */
304 #endif /* HAVE_ALLOCA_H */
305 #endif /* __GNUC__ */
307 #endif /* not alloca */
309 #define REGEX_ALLOCATE alloca
311 /* Assumes a `char *destination' variable. */
312 #define REGEX_REALLOCATE(source, osize, nsize) \
313 (destination = (char *) alloca (nsize), \
314 memmove (destination, source, osize), \
317 /* No need to do anything to free, after alloca. */
318 #define REGEX_FREE(arg) ((void)0) /* Do nothing! But inhibit gcc warning. */
320 #endif /* REGEX_MALLOC */
322 /* Define how to allocate the failure stack. */
325 #define REGEX_ALLOCATE_STACK(size) \
326 r_alloc ((char **) &failure_stack_ptr, (size))
327 #define REGEX_REALLOCATE_STACK(source, osize, nsize) \
328 r_re_alloc ((char **) &failure_stack_ptr, (nsize))
329 #define REGEX_FREE_STACK(ptr) \
330 r_alloc_free ((void **) &failure_stack_ptr)
332 #else /* not REL_ALLOC */
336 #define REGEX_ALLOCATE_STACK malloc
337 #define REGEX_REALLOCATE_STACK(source, osize, nsize) realloc (source, nsize)
338 #define REGEX_FREE_STACK free
340 #else /* not REGEX_MALLOC */
342 #define REGEX_ALLOCATE_STACK alloca
344 #define REGEX_REALLOCATE_STACK(source, osize, nsize) \
345 REGEX_REALLOCATE (source, osize, nsize)
346 /* No need to explicitly free anything. */
347 #define REGEX_FREE_STACK(arg)
349 #endif /* REGEX_MALLOC */
350 #endif /* REL_ALLOC */
353 /* True if `size1' is non-NULL and PTR is pointing anywhere inside
354 `string1' or just past its end. This works if PTR is NULL, which is
356 #define FIRST_STRING_P(ptr) \
357 (size1 && string1 <= (ptr) && (ptr) <= string1 + size1)
359 /* (Re)Allocate N items of type T using malloc, or fail. */
360 #define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t)))
361 #define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, (n) * sizeof (t)))
362 #define RETALLOC_IF(addr, n, t) \
363 if (addr) RETALLOC((addr), (n), t); else (addr) = TALLOC ((n), t)
364 #define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t)))
366 #define BYTEWIDTH 8 /* In bits. */
368 #define STREQ(s1, s2) (strcmp (s1, s2) == 0)
372 #define MAX(a, b) ((a) > (b) ? (a) : (b))
373 #define MIN(a, b) ((a) < (b) ? (a) : (b))
375 /* Type of source-pattern and string chars. */
376 typedef const unsigned char re_char;
378 typedef char re_bool;
383 /* These are the command codes that appear in compiled regular
384 expressions. Some opcodes are followed by argument bytes. A
385 command code can specify any interpretation whatsoever for its
386 arguments. Zero bytes may appear in the compiled regular expression. */
392 /* Succeed right away--no more backtracking. */
395 /* Followed by one byte giving n, then by n literal bytes. */
398 /* Matches any (more or less) character. */
401 /* Matches any one char belonging to specified set. First
402 following byte is number of bitmap bytes. Then come bytes
403 for a bitmap saying which chars are in. Bits in each byte
404 are ordered low-bit-first. A character is in the set if its
405 bit is 1. A character too large to have a bit in the map is
406 automatically not in the set. */
409 /* Same parameters as charset, but match any character that is
410 not one of those specified. */
413 /* Start remembering the text that is matched, for storing in a
414 register. Followed by one byte with the register number, in
415 the range 0 to one less than the pattern buffer's re_nsub
416 field. Then followed by one byte with the number of groups
417 inner to this one. (This last has to be part of the
418 start_memory only because we need it in the on_failure_jump
422 /* Stop remembering the text that is matched and store it in a
423 memory register. Followed by one byte with the register
424 number, in the range 0 to one less than `re_nsub' in the
425 pattern buffer, and one byte with the number of inner groups,
426 just like `start_memory'. (We need the number of inner
427 groups here because we don't have any easy way of finding the
428 corresponding start_memory when we're at a stop_memory.) */
431 /* Match a duplicate of something remembered. Followed by one
432 byte containing the register number. */
435 /* Fail unless at beginning of line. */
438 /* Fail unless at end of line. */
441 /* Succeeds if at beginning of buffer (if emacs) or at beginning
442 of string to be matched (if not). */
445 /* Analogously, for end of buffer/string. */
448 /* Followed by two byte relative address to which to jump. */
451 /* Same as jump, but marks the end of an alternative. */
454 /* Followed by two-byte relative address of place to resume at
455 in case of failure. */
458 /* Like on_failure_jump, but pushes a placeholder instead of the
459 current string position when executed. */
460 on_failure_keep_string_jump,
462 /* Throw away latest failure point and then jump to following
463 two-byte relative address. */
466 /* Change to pop_failure_jump if know won't have to backtrack to
467 match; otherwise change to jump. This is used to jump
468 back to the beginning of a repeat. If what follows this jump
469 clearly won't match what the repeat does, such that we can be
470 sure that there is no use backtracking out of repetitions
471 already matched, then we change it to a pop_failure_jump.
472 Followed by two-byte address. */
475 /* Jump to following two-byte address, and push a dummy failure
476 point. This failure point will be thrown away if an attempt
477 is made to use it for a failure. A `+' construct makes this
478 before the first repeat. Also used as an intermediary kind
479 of jump when compiling an alternative. */
482 /* Push a dummy failure point and continue. Used at the end of
486 /* Followed by two-byte relative address and two-byte number n.
487 After matching N times, jump to the address upon failure. */
490 /* Followed by two-byte relative address, and two-byte number n.
491 Jump to the address N times, then fail. */
494 /* Set the following two-byte relative address to the
495 subsequent two-byte number. The address *includes* the two
499 wordchar, /* Matches any word-constituent character. */
500 notwordchar, /* Matches any char that is not a word-constituent. */
502 wordbeg, /* Succeeds if at word beginning. */
503 wordend, /* Succeeds if at word end. */
505 wordbound, /* Succeeds if at a word boundary. */
506 notwordbound /* Succeeds if not at a word boundary. */
509 ,before_dot, /* Succeeds if before point. */
510 at_dot, /* Succeeds if at point. */
511 after_dot, /* Succeeds if after point. */
513 /* Matches any character whose syntax is specified. Followed by
514 a byte which contains a syntax code, e.g., Sword. */
517 /* Matches any character whose syntax is not that specified. */
523 /* need extra stuff to be able to properly work with XEmacs/Mule
524 characters (which may take up more than one byte) */
526 ,charset_mule, /* Matches any character belonging to specified set.
527 The set is stored in "unified range-table
528 format"; see rangetab.c. Unlike the `charset'
529 opcode, this can handle arbitrary characters. */
531 charset_mule_not /* Same parameters as charset_mule, but match any
532 character that is not one of those specified. */
534 /* 97/2/17 jhod: The following two were merged back in from the Mule
535 2.3 code to enable some language specific processing */
536 ,categoryspec, /* Matches entries in the character category tables */
537 notcategoryspec /* The opposite of the above */
542 /* Common operations on the compiled pattern. */
544 /* Store NUMBER in two contiguous bytes starting at DESTINATION. */
546 #define STORE_NUMBER(destination, number) \
548 (destination)[0] = (number) & 0377; \
549 (destination)[1] = (number) >> 8; \
552 /* Same as STORE_NUMBER, except increment DESTINATION to
553 the byte after where the number is stored. Therefore, DESTINATION
554 must be an lvalue. */
556 #define STORE_NUMBER_AND_INCR(destination, number) \
558 STORE_NUMBER (destination, number); \
559 (destination) += 2; \
562 /* Put into DESTINATION a number stored in two contiguous bytes starting
565 #define EXTRACT_NUMBER(destination, source) \
567 (destination) = *(source) & 0377; \
568 (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) << 8; \
573 extract_number (int *dest, re_char *source)
575 int temp = SIGN_EXTEND_CHAR (*(source + 1));
576 *dest = *source & 0377;
580 #ifndef EXTRACT_MACROS /* To debug the macros. */
581 #undef EXTRACT_NUMBER
582 #define EXTRACT_NUMBER(dest, src) extract_number (&dest, src)
583 #endif /* not EXTRACT_MACROS */
587 /* Same as EXTRACT_NUMBER, except increment SOURCE to after the number.
588 SOURCE must be an lvalue. */
590 #define EXTRACT_NUMBER_AND_INCR(destination, source) \
592 EXTRACT_NUMBER (destination, source); \
598 extract_number_and_incr (int *destination, unsigned char **source)
600 extract_number (destination, *source);
604 #ifndef EXTRACT_MACROS
605 #undef EXTRACT_NUMBER_AND_INCR
606 #define EXTRACT_NUMBER_AND_INCR(dest, src) \
607 extract_number_and_incr (&dest, &src)
608 #endif /* not EXTRACT_MACROS */
612 /* If DEBUG is defined, Regex prints many voluminous messages about what
613 it is doing (if the variable `debug' is nonzero). If linked with the
614 main program in `iregex.c', you can enter patterns and strings
615 interactively. And if linked with the main program in `main.c' and
616 the other test files, you can run the already-written tests. */
620 /* We use standard I/O for debugging. */
624 /* XEmacs provides its own version of assert() */
625 /* It is useful to test things that ``must'' be true when debugging. */
629 static int debug = 0;
631 #define DEBUG_STATEMENT(e) e
632 #define DEBUG_PRINT1(x) if (debug) printf (x)
633 #define DEBUG_PRINT2(x1, x2) if (debug) printf (x1, x2)
634 #define DEBUG_PRINT3(x1, x2, x3) if (debug) printf (x1, x2, x3)
635 #define DEBUG_PRINT4(x1, x2, x3, x4) if (debug) printf (x1, x2, x3, x4)
636 #define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \
637 if (debug) print_partial_compiled_pattern (s, e)
638 #define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \
639 if (debug) print_double_string (w, s1, sz1, s2, sz2)
642 /* Print the fastmap in human-readable form. */
645 print_fastmap (char *fastmap)
647 unsigned was_a_range = 0;
650 while (i < (1 << BYTEWIDTH))
656 while (i < (1 << BYTEWIDTH) && fastmap[i])
672 /* Print a compiled pattern string in human-readable form, starting at
673 the START pointer into it and ending just before the pointer END. */
676 print_partial_compiled_pattern (re_char *start, re_char *end)
679 unsigned char *p = (unsigned char *) start;
688 /* Loop over pattern commands. */
691 printf ("%ld:\t", (long)(p - start));
693 switch ((re_opcode_t) *p++)
701 printf ("/exactn/%d", mcnt);
712 printf ("/start_memory/%d/%d", mcnt, *p++);
717 printf ("/stop_memory/%d/%d", mcnt, *p++);
721 printf ("/duplicate/%d", *p++);
731 REGISTER int c, last = -100;
732 REGISTER int in_range = 0;
734 printf ("/charset [%s",
735 (re_opcode_t) *(p - 1) == charset_not ? "^" : "");
737 assert (p + *p < pend);
739 for (c = 0; c < 256; c++)
740 if (((unsigned char) (c / 8) < *p)
741 && (p[1 + (c/8)] & (1 << (c % 8))))
743 /* Are we starting a range? */
744 if (last + 1 == c && ! in_range)
749 /* Have we broken a range? */
750 else if (last + 1 != c && in_range)
773 case charset_mule_not:
777 printf ("/charset_mule [%s",
778 (re_opcode_t) *(p - 1) == charset_mule_not ? "^" : "");
779 nentries = unified_range_table_nentries (p);
780 for (i = 0; i < nentries; i++)
782 EMACS_INT first, last;
783 Lisp_Object dummy_val;
785 unified_range_table_get_range (p, i, &first, &last,
790 printf ("(0x%lx)", (long)first);
797 printf ("(0x%lx)", (long)last);
801 p += unified_range_table_bytes_used (p);
814 case on_failure_jump:
815 extract_number_and_incr (&mcnt, &p);
816 printf ("/on_failure_jump to %ld", (long)(p + mcnt - start));
819 case on_failure_keep_string_jump:
820 extract_number_and_incr (&mcnt, &p);
821 printf ("/on_failure_keep_string_jump to %ld", (long)(p + mcnt - start));
824 case dummy_failure_jump:
825 extract_number_and_incr (&mcnt, &p);
826 printf ("/dummy_failure_jump to %ld", (long)(p + mcnt - start));
829 case push_dummy_failure:
830 printf ("/push_dummy_failure");
834 extract_number_and_incr (&mcnt, &p);
835 printf ("/maybe_pop_jump to %ld", (long)(p + mcnt - start));
838 case pop_failure_jump:
839 extract_number_and_incr (&mcnt, &p);
840 printf ("/pop_failure_jump to %ld", (long)(p + mcnt - start));
844 extract_number_and_incr (&mcnt, &p);
845 printf ("/jump_past_alt to %ld", (long)(p + mcnt - start));
849 extract_number_and_incr (&mcnt, &p);
850 printf ("/jump to %ld", (long)(p + mcnt - start));
854 extract_number_and_incr (&mcnt, &p);
855 extract_number_and_incr (&mcnt2, &p);
856 printf ("/succeed_n to %ld, %d times", (long)(p + mcnt - start), mcnt2);
860 extract_number_and_incr (&mcnt, &p);
861 extract_number_and_incr (&mcnt2, &p);
862 printf ("/jump_n to %ld, %d times", (long)(p + mcnt - start), mcnt2);
866 extract_number_and_incr (&mcnt, &p);
867 extract_number_and_incr (&mcnt2, &p);
868 printf ("/set_number_at location %ld to %d", (long)(p + mcnt - start), mcnt2);
872 printf ("/wordbound");
876 printf ("/notwordbound");
888 printf ("/before_dot");
896 printf ("/after_dot");
900 printf ("/syntaxspec");
902 printf ("/%d", mcnt);
906 printf ("/notsyntaxspec");
908 printf ("/%d", mcnt);
912 /* 97/2/17 jhod Mule category patch */
914 printf ("/categoryspec");
916 printf ("/%d", mcnt);
919 case notcategoryspec:
920 printf ("/notcategoryspec");
922 printf ("/%d", mcnt);
924 /* end of category patch */
929 printf ("/wordchar");
933 printf ("/notwordchar");
945 printf ("?%d", *(p-1));
951 printf ("%ld:\tend of pattern.\n", (long)(p - start));
956 print_compiled_pattern (struct re_pattern_buffer *bufp)
958 re_char *buffer = bufp->buffer;
960 print_partial_compiled_pattern (buffer, buffer + bufp->used);
961 printf ("%ld bytes used/%ld bytes allocated.\n", bufp->used,
964 if (bufp->fastmap_accurate && bufp->fastmap)
966 printf ("fastmap: ");
967 print_fastmap (bufp->fastmap);
970 printf ("re_nsub: %ld\t", (long)bufp->re_nsub);
971 printf ("regs_alloc: %d\t", bufp->regs_allocated);
972 printf ("can_be_null: %d\t", bufp->can_be_null);
973 printf ("newline_anchor: %d\n", bufp->newline_anchor);
974 printf ("no_sub: %d\t", bufp->no_sub);
975 printf ("not_bol: %d\t", bufp->not_bol);
976 printf ("not_eol: %d\t", bufp->not_eol);
977 printf ("syntax: %d\n", bufp->syntax);
978 /* Perhaps we should print the translate table? */
979 /* and maybe the category table? */
984 print_double_string (re_char *where, re_char *string1, int size1,
985 re_char *string2, int size2)
991 Element_count this_char;
993 if (FIRST_STRING_P (where))
995 for (this_char = where - string1; this_char < size1; this_char++)
996 putchar (string1[this_char]);
1001 for (this_char = where - string2; this_char < size2; this_char++)
1002 putchar (string2[this_char]);
1006 #else /* not DEBUG */
1011 #define DEBUG_STATEMENT(e)
1012 #define DEBUG_PRINT1(x)
1013 #define DEBUG_PRINT2(x1, x2)
1014 #define DEBUG_PRINT3(x1, x2, x3)
1015 #define DEBUG_PRINT4(x1, x2, x3, x4)
1016 #define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)
1017 #define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)
1021 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
1022 also be assigned to arbitrarily: each pattern buffer stores its own
1023 syntax, so it can be changed between regex compilations. */
1024 /* This has no initializer because initialized variables in Emacs
1025 become read-only after dumping. */
1026 reg_syntax_t re_syntax_options;
1029 /* Specify the precise syntax of regexps for compilation. This provides
1030 for compatibility for various utilities which historically have
1031 different, incompatible syntaxes.
1033 The argument SYNTAX is a bit mask comprised of the various bits
1034 defined in regex.h. We return the old syntax. */
1037 re_set_syntax (reg_syntax_t syntax)
1039 reg_syntax_t ret = re_syntax_options;
1041 re_syntax_options = syntax;
1045 /* This table gives an error message for each of the error codes listed
1046 in regex.h. Obviously the order here has to be same as there.
1047 POSIX doesn't require that we do anything for REG_NOERROR,
1048 but why not be nice? */
1050 static const char *re_error_msgid[] =
1052 "Success", /* REG_NOERROR */
1053 "No match", /* REG_NOMATCH */
1054 "Invalid regular expression", /* REG_BADPAT */
1055 "Invalid collation character", /* REG_ECOLLATE */
1056 "Invalid character class name", /* REG_ECTYPE */
1057 "Trailing backslash", /* REG_EESCAPE */
1058 "Invalid back reference", /* REG_ESUBREG */
1059 "Unmatched [ or [^", /* REG_EBRACK */
1060 "Unmatched ( or \\(", /* REG_EPAREN */
1061 "Unmatched \\{", /* REG_EBRACE */
1062 "Invalid content of \\{\\}", /* REG_BADBR */
1063 "Invalid range end", /* REG_ERANGE */
1064 "Memory exhausted", /* REG_ESPACE */
1065 "Invalid preceding regular expression", /* REG_BADRPT */
1066 "Premature end of regular expression", /* REG_EEND */
1067 "Regular expression too big", /* REG_ESIZE */
1068 "Unmatched ) or \\)", /* REG_ERPAREN */
1070 "Invalid syntax designator", /* REG_ESYNTAX */
1073 "Ranges may not span charsets", /* REG_ERANGESPAN */
1074 "Invalid category designator", /* REG_ECATEGORY */
1078 /* Avoiding alloca during matching, to placate r_alloc. */
1080 /* Define MATCH_MAY_ALLOCATE unless we need to make sure that the
1081 searching and matching functions should not call alloca. On some
1082 systems, alloca is implemented in terms of malloc, and if we're
1083 using the relocating allocator routines, then malloc could cause a
1084 relocation, which might (if the strings being searched are in the
1085 ralloc heap) shift the data out from underneath the regexp
1088 Here's another reason to avoid allocation: Emacs
1089 processes input from X in a signal handler; processing X input may
1090 call malloc; if input arrives while a matching routine is calling
1091 malloc, then we're scrod. But Emacs can't just block input while
1092 calling matching routines; then we don't notice interrupts when
1093 they come in. So, Emacs blocks input around all regexp calls
1094 except the matching calls, which it leaves unprotected, in the
1095 faith that they will not malloc. */
1097 /* Normally, this is fine. */
1098 #define MATCH_MAY_ALLOCATE
1100 /* When using GNU C, we are not REALLY using the C alloca, no matter
1101 what config.h may say. So don't take precautions for it. */
1106 /* The match routines may not allocate if (1) they would do it with malloc
1107 and (2) it's not safe for them to use malloc.
1108 Note that if REL_ALLOC is defined, matching would not use malloc for the
1109 failure stack, but we would still use it for the register vectors;
1110 so REL_ALLOC should not affect this. */
1111 #if (defined (C_ALLOCA) || defined (REGEX_MALLOC)) && defined (emacs)
1112 #undef MATCH_MAY_ALLOCATE
1116 /* Failure stack declarations and macros; both re_compile_fastmap and
1117 re_match_2 use a failure stack. These have to be macros because of
1118 REGEX_ALLOCATE_STACK. */
1121 /* Number of failure points for which to initially allocate space
1122 when matching. If this number is exceeded, we allocate more
1123 space, so it is not a hard limit. */
1124 #ifndef INIT_FAILURE_ALLOC
1125 #define INIT_FAILURE_ALLOC 5
1128 /* Roughly the maximum number of failure points on the stack. Would be
1129 exactly that if always used MAX_FAILURE_SPACE each time we failed.
1130 This is a variable only so users of regex can assign to it; we never
1131 change it ourselves. */
1132 #if defined (MATCH_MAY_ALLOCATE)
1133 /* 4400 was enough to cause a crash on Alpha OSF/1,
1134 whose default stack limit is 2mb. */
1135 int re_max_failures = 20000;
1137 int re_max_failures = 2000;
1140 union fail_stack_elt
1146 typedef union fail_stack_elt fail_stack_elt_t;
1150 fail_stack_elt_t *stack;
1152 Element_count avail; /* Offset of next open position. */
1155 #define FAIL_STACK_EMPTY() (fail_stack.avail == 0)
1156 #define FAIL_STACK_PTR_EMPTY() (fail_stack_ptr->avail == 0)
1157 #define FAIL_STACK_FULL() (fail_stack.avail == fail_stack.size)
1160 /* Define macros to initialize and free the failure stack.
1161 Do `return -2' if the alloc fails. */
1163 #ifdef MATCH_MAY_ALLOCATE
1164 #define INIT_FAIL_STACK() \
1166 fail_stack.stack = (fail_stack_elt_t *) \
1167 REGEX_ALLOCATE_STACK (INIT_FAILURE_ALLOC * sizeof (fail_stack_elt_t)); \
1169 if (fail_stack.stack == NULL) \
1172 fail_stack.size = INIT_FAILURE_ALLOC; \
1173 fail_stack.avail = 0; \
1176 #define RESET_FAIL_STACK() REGEX_FREE_STACK (fail_stack.stack)
1178 #define INIT_FAIL_STACK() \
1180 fail_stack.avail = 0; \
1183 #define RESET_FAIL_STACK()
1187 /* Double the size of FAIL_STACK, up to approximately `re_max_failures' items.
1189 Return 1 if succeeds, and 0 if either ran out of memory
1190 allocating space for it or it was already too large.
1192 REGEX_REALLOCATE_STACK requires `destination' be declared. */
1194 #define DOUBLE_FAIL_STACK(fail_stack) \
1195 ((int) (fail_stack).size > re_max_failures * MAX_FAILURE_ITEMS \
1197 : ((fail_stack).stack = (fail_stack_elt_t *) \
1198 REGEX_REALLOCATE_STACK ((fail_stack).stack, \
1199 (fail_stack).size * sizeof (fail_stack_elt_t), \
1200 ((fail_stack).size << 1) * sizeof (fail_stack_elt_t)), \
1202 (fail_stack).stack == NULL \
1204 : ((fail_stack).size <<= 1, \
1208 /* Push pointer POINTER on FAIL_STACK.
1209 Return 1 if was able to do so and 0 if ran out of memory allocating
1211 #define PUSH_PATTERN_OP(POINTER, FAIL_STACK) \
1212 ((FAIL_STACK_FULL () \
1213 && !DOUBLE_FAIL_STACK (FAIL_STACK)) \
1215 : ((FAIL_STACK).stack[(FAIL_STACK).avail++].pointer = POINTER, \
1218 /* Push a pointer value onto the failure stack.
1219 Assumes the variable `fail_stack'. Probably should only
1220 be called from within `PUSH_FAILURE_POINT'. */
1221 #define PUSH_FAILURE_POINTER(item) \
1222 fail_stack.stack[fail_stack.avail++].pointer = (unsigned char *) (item)
1224 /* This pushes an integer-valued item onto the failure stack.
1225 Assumes the variable `fail_stack'. Probably should only
1226 be called from within `PUSH_FAILURE_POINT'. */
1227 #define PUSH_FAILURE_INT(item) \
1228 fail_stack.stack[fail_stack.avail++].integer = (item)
1230 /* Push a fail_stack_elt_t value onto the failure stack.
1231 Assumes the variable `fail_stack'. Probably should only
1232 be called from within `PUSH_FAILURE_POINT'. */
1233 #define PUSH_FAILURE_ELT(item) \
1234 fail_stack.stack[fail_stack.avail++] = (item)
1236 /* These three POP... operations complement the three PUSH... operations.
1237 All assume that `fail_stack' is nonempty. */
1238 #define POP_FAILURE_POINTER() fail_stack.stack[--fail_stack.avail].pointer
1239 #define POP_FAILURE_INT() fail_stack.stack[--fail_stack.avail].integer
1240 #define POP_FAILURE_ELT() fail_stack.stack[--fail_stack.avail]
1242 /* Used to omit pushing failure point id's when we're not debugging. */
1244 #define DEBUG_PUSH PUSH_FAILURE_INT
1245 #define DEBUG_POP(item_addr) *(item_addr) = POP_FAILURE_INT ()
1247 #define DEBUG_PUSH(item)
1248 #define DEBUG_POP(item_addr)
1252 /* Push the information about the state we will need
1253 if we ever fail back to it.
1255 Requires variables fail_stack, regstart, regend, reg_info, and
1256 num_regs be declared. DOUBLE_FAIL_STACK requires `destination' be
1259 Does `return FAILURE_CODE' if runs out of memory. */
1261 #if !defined (REGEX_MALLOC) && !defined (REL_ALLOC)
1262 #define DECLARE_DESTINATION char *destination
1264 #define DECLARE_DESTINATION DECLARE_NOTHING
1267 #define PUSH_FAILURE_POINT(pattern_place, string_place, failure_code) \
1269 DECLARE_DESTINATION; \
1270 /* Must be int, so when we don't save any registers, the arithmetic \
1271 of 0 + -1 isn't done as unsigned. */ \
1274 DEBUG_STATEMENT (failure_id++); \
1275 DEBUG_STATEMENT (nfailure_points_pushed++); \
1276 DEBUG_PRINT2 ("\nPUSH_FAILURE_POINT #%u:\n", failure_id); \
1277 DEBUG_PRINT2 (" Before push, next avail: %lu\n", \
1278 (unsigned long) (fail_stack).avail); \
1279 DEBUG_PRINT2 (" size: %lu\n", \
1280 (unsigned long) (fail_stack).size); \
1282 DEBUG_PRINT2 (" slots needed: %d\n", NUM_FAILURE_ITEMS); \
1283 DEBUG_PRINT2 (" available: %ld\n", \
1284 (long) REMAINING_AVAIL_SLOTS); \
1286 /* Ensure we have enough space allocated for what we will push. */ \
1287 while (REMAINING_AVAIL_SLOTS < NUM_FAILURE_ITEMS) \
1289 if (!DOUBLE_FAIL_STACK (fail_stack)) \
1290 return failure_code; \
1292 DEBUG_PRINT2 ("\n Doubled stack; size now: %lu\n", \
1293 (unsigned long) (fail_stack).size); \
1294 DEBUG_PRINT2 (" slots available: %ld\n", \
1295 (long) REMAINING_AVAIL_SLOTS); \
1298 /* Push the info, starting with the registers. */ \
1299 DEBUG_PRINT1 ("\n"); \
1301 for (this_reg = lowest_active_reg; this_reg <= highest_active_reg; \
1304 DEBUG_PRINT2 (" Pushing reg: %d\n", this_reg); \
1305 DEBUG_STATEMENT (num_regs_pushed++); \
1307 DEBUG_PRINT2 (" start: 0x%lx\n", (long) regstart[this_reg]); \
1308 PUSH_FAILURE_POINTER (regstart[this_reg]); \
1310 DEBUG_PRINT2 (" end: 0x%lx\n", (long) regend[this_reg]); \
1311 PUSH_FAILURE_POINTER (regend[this_reg]); \
1313 DEBUG_PRINT2 (" info: 0x%lx\n ", \
1314 * (long *) (®_info[this_reg])); \
1315 DEBUG_PRINT2 (" match_null=%d", \
1316 REG_MATCH_NULL_STRING_P (reg_info[this_reg])); \
1317 DEBUG_PRINT2 (" active=%d", IS_ACTIVE (reg_info[this_reg])); \
1318 DEBUG_PRINT2 (" matched_something=%d", \
1319 MATCHED_SOMETHING (reg_info[this_reg])); \
1320 DEBUG_PRINT2 (" ever_matched_something=%d", \
1321 EVER_MATCHED_SOMETHING (reg_info[this_reg])); \
1322 DEBUG_PRINT1 ("\n"); \
1323 PUSH_FAILURE_ELT (reg_info[this_reg].word); \
1326 DEBUG_PRINT2 (" Pushing low active reg: %d\n", lowest_active_reg); \
1327 PUSH_FAILURE_INT (lowest_active_reg); \
1329 DEBUG_PRINT2 (" Pushing high active reg: %d\n", highest_active_reg); \
1330 PUSH_FAILURE_INT (highest_active_reg); \
1332 DEBUG_PRINT2 (" Pushing pattern 0x%lx: \n", (long) pattern_place); \
1333 DEBUG_PRINT_COMPILED_PATTERN (bufp, pattern_place, pend); \
1334 PUSH_FAILURE_POINTER (pattern_place); \
1336 DEBUG_PRINT2 (" Pushing string 0x%lx: `", (long) string_place); \
1337 DEBUG_PRINT_DOUBLE_STRING (string_place, string1, size1, string2, \
1339 DEBUG_PRINT1 ("'\n"); \
1340 PUSH_FAILURE_POINTER (string_place); \
1342 DEBUG_PRINT2 (" Pushing failure id: %u\n", failure_id); \
1343 DEBUG_PUSH (failure_id); \
1346 /* This is the number of items that are pushed and popped on the stack
1347 for each register. */
1348 #define NUM_REG_ITEMS 3
1350 /* Individual items aside from the registers. */
1352 #define NUM_NONREG_ITEMS 5 /* Includes failure point id. */
1354 #define NUM_NONREG_ITEMS 4
1357 /* We push at most this many items on the stack. */
1358 /* We used to use (num_regs - 1), which is the number of registers
1359 this regexp will save; but that was changed to 5
1360 to avoid stack overflow for a regexp with lots of parens. */
1361 #define MAX_FAILURE_ITEMS (5 * NUM_REG_ITEMS + NUM_NONREG_ITEMS)
1363 /* We actually push this many items. */
1364 #define NUM_FAILURE_ITEMS \
1365 ((highest_active_reg - lowest_active_reg + 1) * NUM_REG_ITEMS \
1368 /* How many items can still be added to the stack without overflowing it. */
1369 #define REMAINING_AVAIL_SLOTS ((int) ((fail_stack).size - (fail_stack).avail))
1372 /* Pops what PUSH_FAIL_STACK pushes.
1374 We restore into the parameters, all of which should be lvalues:
1375 STR -- the saved data position.
1376 PAT -- the saved pattern position.
1377 LOW_REG, HIGH_REG -- the highest and lowest active registers.
1378 REGSTART, REGEND -- arrays of string positions.
1379 REG_INFO -- array of information about each subexpression.
1381 Also assumes the variables `fail_stack' and (if debugging), `bufp',
1382 `pend', `string1', `size1', `string2', and `size2'. */
1384 #define POP_FAILURE_POINT(str, pat, low_reg, high_reg, \
1385 regstart, regend, reg_info) \
1387 DEBUG_STATEMENT (fail_stack_elt_t ffailure_id;) \
1389 const unsigned char *string_temp; \
1391 assert (!FAIL_STACK_EMPTY ()); \
1393 /* Remove failure points and point to how many regs pushed. */ \
1394 DEBUG_PRINT1 ("POP_FAILURE_POINT:\n"); \
1395 DEBUG_PRINT2 (" Before pop, next avail: %lu\n", \
1396 (unsigned long) fail_stack.avail); \
1397 DEBUG_PRINT2 (" size: %lu\n", \
1398 (unsigned long) fail_stack.size); \
1400 assert (fail_stack.avail >= NUM_NONREG_ITEMS); \
1402 DEBUG_POP (&ffailure_id.integer); \
1403 DEBUG_PRINT2 (" Popping failure id: %u\n", \
1404 * (unsigned int *) &ffailure_id); \
1406 /* If the saved string location is NULL, it came from an \
1407 on_failure_keep_string_jump opcode, and we want to throw away the \
1408 saved NULL, thus retaining our current position in the string. */ \
1409 string_temp = POP_FAILURE_POINTER (); \
1410 if (string_temp != NULL) \
1411 str = string_temp; \
1413 DEBUG_PRINT2 (" Popping string 0x%lx: `", (long) str); \
1414 DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2); \
1415 DEBUG_PRINT1 ("'\n"); \
1417 pat = (unsigned char *) POP_FAILURE_POINTER (); \
1418 DEBUG_PRINT2 (" Popping pattern 0x%lx: ", (long) pat); \
1419 DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend); \
1421 /* Restore register info. */ \
1422 high_reg = (unsigned) POP_FAILURE_INT (); \
1423 DEBUG_PRINT2 (" Popping high active reg: %d\n", high_reg); \
1425 low_reg = (unsigned) POP_FAILURE_INT (); \
1426 DEBUG_PRINT2 (" Popping low active reg: %d\n", low_reg); \
1428 for (this_reg = high_reg; this_reg >= low_reg; this_reg--) \
1430 DEBUG_PRINT2 (" Popping reg: %d\n", this_reg); \
1432 reg_info[this_reg].word = POP_FAILURE_ELT (); \
1433 DEBUG_PRINT2 (" info: 0x%lx\n", \
1434 * (long *) ®_info[this_reg]); \
1436 regend[this_reg] = POP_FAILURE_POINTER (); \
1437 DEBUG_PRINT2 (" end: 0x%lx\n", (long) regend[this_reg]); \
1439 regstart[this_reg] = POP_FAILURE_POINTER (); \
1440 DEBUG_PRINT2 (" start: 0x%lx\n", (long) regstart[this_reg]); \
1443 set_regs_matched_done = 0; \
1444 DEBUG_STATEMENT (nfailure_points_popped++); \
1445 } while (0) /* POP_FAILURE_POINT */
1449 /* Structure for per-register (a.k.a. per-group) information.
1450 Other register information, such as the
1451 starting and ending positions (which are addresses), and the list of
1452 inner groups (which is a bits list) are maintained in separate
1455 We are making a (strictly speaking) nonportable assumption here: that
1456 the compiler will pack our bit fields into something that fits into
1457 the type of `word', i.e., is something that fits into one item on the
1462 fail_stack_elt_t word;
1465 /* This field is one if this group can match the empty string,
1466 zero if not. If not yet determined, `MATCH_NULL_UNSET_VALUE'. */
1467 #define MATCH_NULL_UNSET_VALUE 3
1468 unsigned match_null_string_p : 2;
1469 unsigned is_active : 1;
1470 unsigned matched_something : 1;
1471 unsigned ever_matched_something : 1;
1473 } register_info_type;
1475 #define REG_MATCH_NULL_STRING_P(R) ((R).bits.match_null_string_p)
1476 #define IS_ACTIVE(R) ((R).bits.is_active)
1477 #define MATCHED_SOMETHING(R) ((R).bits.matched_something)
1478 #define EVER_MATCHED_SOMETHING(R) ((R).bits.ever_matched_something)
1481 /* Call this when have matched a real character; it sets `matched' flags
1482 for the subexpressions which we are currently inside. Also records
1483 that those subexprs have matched. */
1484 #define SET_REGS_MATCHED() \
1487 if (!set_regs_matched_done) \
1490 set_regs_matched_done = 1; \
1491 for (r = lowest_active_reg; r <= highest_active_reg; r++) \
1493 MATCHED_SOMETHING (reg_info[r]) \
1494 = EVER_MATCHED_SOMETHING (reg_info[r]) \
1501 /* Registers are set to a sentinel when they haven't yet matched. */
1502 static unsigned char reg_unset_dummy;
1503 #define REG_UNSET_VALUE (®_unset_dummy)
1504 #define REG_UNSET(e) ((e) == REG_UNSET_VALUE)
1506 /* Subroutine declarations and macros for regex_compile. */
1508 /* Fetch the next character in the uncompiled pattern---translating it
1509 if necessary. Also cast from a signed character in the constant
1510 string passed to us by the user to an unsigned char that we can use
1511 as an array index (in, e.g., `translate'). */
1512 #define PATFETCH(c) \
1515 c = TRANSLATE (c); \
1518 /* Fetch the next character in the uncompiled pattern, with no
1520 #define PATFETCH_RAW(c) \
1521 do {if (p == pend) return REG_EEND; \
1522 assert (p < pend); \
1523 c = charptr_emchar (p); \
1527 /* Go backwards one character in the pattern. */
1528 #define PATUNFETCH DEC_CHARPTR (p)
1532 #define PATFETCH_EXTENDED(emch) \
1533 do {if (p == pend) return REG_EEND; \
1534 assert (p < pend); \
1535 emch = charptr_emchar ((const Bufbyte *) p); \
1537 if (TRANSLATE_P (translate) && emch < 0x80) \
1538 emch = (Emchar) (unsigned char) RE_TRANSLATE (emch); \
1541 #define PATFETCH_RAW_EXTENDED(emch) \
1542 do {if (p == pend) return REG_EEND; \
1543 assert (p < pend); \
1544 emch = charptr_emchar ((const Bufbyte *) p); \
1548 #define PATUNFETCH_EXTENDED DEC_CHARPTR (p)
1550 #define PATFETCH_EITHER(emch) \
1552 if (has_extended_chars) \
1553 PATFETCH_EXTENDED (emch); \
1558 #define PATFETCH_RAW_EITHER(emch) \
1560 if (has_extended_chars) \
1561 PATFETCH_RAW_EXTENDED (emch); \
1563 PATFETCH_RAW (emch); \
1566 #define PATUNFETCH_EITHER \
1568 if (has_extended_chars) \
1569 PATUNFETCH_EXTENDED (emch); \
1571 PATUNFETCH (emch); \
1574 #else /* not MULE */
1576 #define PATFETCH_EITHER(emch) PATFETCH (emch)
1577 #define PATFETCH_RAW_EITHER(emch) PATFETCH_RAW (emch)
1578 #define PATUNFETCH_EITHER PATUNFETCH
1582 /* If `translate' is non-null, return translate[D], else just D. We
1583 cast the subscript to translate because some data is declared as
1584 `char *', to avoid warnings when a string constant is passed. But
1585 when we use a character as a subscript we must make it unsigned. */
1586 #define TRANSLATE(d) (TRANSLATE_P (translate) ? RE_TRANSLATE (d) : (d))
1590 #define TRANSLATE_EXTENDED_UNSAFE(emch) \
1591 (TRANSLATE_P (translate) && emch < 0x80 ? RE_TRANSLATE (emch) : (emch))
1595 /* Macros for outputting the compiled pattern into `buffer'. */
1597 /* If the buffer isn't allocated when it comes in, use this. */
1598 #define INIT_BUF_SIZE 32
1600 /* Make sure we have at least N more bytes of space in buffer. */
1601 #define GET_BUFFER_SPACE(n) \
1602 while (buf_end - bufp->buffer + (n) > (ptrdiff_t) bufp->allocated) \
1605 /* Make sure we have one more byte of buffer space and then add C to it. */
1606 #define BUF_PUSH(c) \
1608 GET_BUFFER_SPACE (1); \
1609 *buf_end++ = (unsigned char) (c); \
1613 /* Ensure we have two more bytes of buffer space and then append C1 and C2. */
1614 #define BUF_PUSH_2(c1, c2) \
1616 GET_BUFFER_SPACE (2); \
1617 *buf_end++ = (unsigned char) (c1); \
1618 *buf_end++ = (unsigned char) (c2); \
1622 /* As with BUF_PUSH_2, except for three bytes. */
1623 #define BUF_PUSH_3(c1, c2, c3) \
1625 GET_BUFFER_SPACE (3); \
1626 *buf_end++ = (unsigned char) (c1); \
1627 *buf_end++ = (unsigned char) (c2); \
1628 *buf_end++ = (unsigned char) (c3); \
1632 /* Store a jump with opcode OP at LOC to location TO. We store a
1633 relative address offset by the three bytes the jump itself occupies. */
1634 #define STORE_JUMP(op, loc, to) \
1635 store_op1 (op, loc, (to) - (loc) - 3)
1637 /* Likewise, for a two-argument jump. */
1638 #define STORE_JUMP2(op, loc, to, arg) \
1639 store_op2 (op, loc, (to) - (loc) - 3, arg)
1641 /* Like `STORE_JUMP', but for inserting. Assume `buf_end' is the
1643 #define INSERT_JUMP(op, loc, to) \
1644 insert_op1 (op, loc, (to) - (loc) - 3, buf_end)
1646 /* Like `STORE_JUMP2', but for inserting. Assume `buf_end' is the
1648 #define INSERT_JUMP2(op, loc, to, arg) \
1649 insert_op2 (op, loc, (to) - (loc) - 3, arg, buf_end)
1652 /* This is not an arbitrary limit: the arguments which represent offsets
1653 into the pattern are two bytes long. So if 2^16 bytes turns out to
1654 be too small, many things would have to change. */
1655 #define MAX_BUF_SIZE (1L << 16)
1658 /* Extend the buffer by twice its current size via realloc and
1659 reset the pointers that pointed into the old block to point to the
1660 correct places in the new one. If extending the buffer results in it
1661 being larger than MAX_BUF_SIZE, then flag memory exhausted. */
1662 #define EXTEND_BUFFER() \
1664 re_char *old_buffer = bufp->buffer; \
1665 if (bufp->allocated == MAX_BUF_SIZE) \
1667 bufp->allocated <<= 1; \
1668 if (bufp->allocated > MAX_BUF_SIZE) \
1669 bufp->allocated = MAX_BUF_SIZE; \
1670 bufp->buffer = (unsigned char *) realloc (bufp->buffer, bufp->allocated);\
1671 if (bufp->buffer == NULL) \
1672 return REG_ESPACE; \
1673 /* If the buffer moved, move all the pointers into it. */ \
1674 if (old_buffer != bufp->buffer) \
1676 buf_end = (buf_end - old_buffer) + bufp->buffer; \
1677 begalt = (begalt - old_buffer) + bufp->buffer; \
1678 if (fixup_alt_jump) \
1679 fixup_alt_jump = (fixup_alt_jump - old_buffer) + bufp->buffer;\
1681 laststart = (laststart - old_buffer) + bufp->buffer; \
1682 if (pending_exact) \
1683 pending_exact = (pending_exact - old_buffer) + bufp->buffer; \
1688 /* Since we have one byte reserved for the register number argument to
1689 {start,stop}_memory, the maximum number of groups we can report
1690 things about is what fits in that byte. */
1691 #define MAX_REGNUM 255
1693 /* But patterns can have more than `MAX_REGNUM' registers. We just
1694 ignore the excess. */
1695 typedef unsigned regnum_t;
1698 /* Macros for the compile stack. */
1700 /* Since offsets can go either forwards or backwards, this type needs to
1701 be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1. */
1702 typedef int pattern_offset_t;
1706 pattern_offset_t begalt_offset;
1707 pattern_offset_t fixup_alt_jump;
1708 pattern_offset_t inner_group_offset;
1709 pattern_offset_t laststart_offset;
1711 } compile_stack_elt_t;
1716 compile_stack_elt_t *stack;
1718 unsigned avail; /* Offset of next open position. */
1719 } compile_stack_type;
1722 #define INIT_COMPILE_STACK_SIZE 32
1724 #define COMPILE_STACK_EMPTY (compile_stack.avail == 0)
1725 #define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size)
1727 /* The next available element. */
1728 #define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])
1731 /* Set the bit for character C in a bit vector. */
1732 #define SET_LIST_BIT(c) \
1733 (buf_end[((unsigned char) (c)) / BYTEWIDTH] \
1734 |= 1 << (((unsigned char) c) % BYTEWIDTH))
1738 /* Set the "bit" for character C in a range table. */
1739 #define SET_RANGETAB_BIT(c) put_range_table (rtab, c, c, Qt)
1741 /* Set the "bit" for character c in the appropriate table. */
1742 #define SET_EITHER_BIT(c) \
1744 if (has_extended_chars) \
1745 SET_RANGETAB_BIT (c); \
1750 #else /* not MULE */
1752 #define SET_EITHER_BIT(c) SET_LIST_BIT (c)
1757 /* Get the next unsigned number in the uncompiled pattern. */
1758 #define GET_UNSIGNED_NUMBER(num) \
1762 while (ISDIGIT (c)) \
1766 num = num * 10 + c - '0'; \
1774 #define CHAR_CLASS_MAX_LENGTH 6 /* Namely, `xdigit'. */
1776 #define IS_CHAR_CLASS(string) \
1777 (STREQ (string, "alpha") || STREQ (string, "upper") \
1778 || STREQ (string, "lower") || STREQ (string, "digit") \
1779 || STREQ (string, "alnum") || STREQ (string, "xdigit") \
1780 || STREQ (string, "space") || STREQ (string, "print") \
1781 || STREQ (string, "punct") || STREQ (string, "graph") \
1782 || STREQ (string, "cntrl") || STREQ (string, "blank"))
1784 static void store_op1 (re_opcode_t op, unsigned char *loc, int arg);
1785 static void store_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2);
1786 static void insert_op1 (re_opcode_t op, unsigned char *loc, int arg,
1787 unsigned char *end);
1788 static void insert_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2,
1789 unsigned char *end);
1790 static re_bool at_begline_loc_p (re_char *pattern, re_char *p,
1791 reg_syntax_t syntax);
1792 static re_bool at_endline_loc_p (re_char *p, re_char *pend, int syntax);
1793 static re_bool group_in_compile_stack (compile_stack_type compile_stack,
1795 static reg_errcode_t compile_range (re_char **p_ptr, re_char *pend,
1796 RE_TRANSLATE_TYPE translate,
1797 reg_syntax_t syntax,
1800 static reg_errcode_t compile_extended_range (re_char **p_ptr,
1802 RE_TRANSLATE_TYPE translate,
1803 reg_syntax_t syntax,
1806 static re_bool group_match_null_string_p (unsigned char **p,
1808 register_info_type *reg_info);
1809 static re_bool alt_match_null_string_p (unsigned char *p, unsigned char *end,
1810 register_info_type *reg_info);
1811 static re_bool common_op_match_null_string_p (unsigned char **p,
1813 register_info_type *reg_info);
1814 static int bcmp_translate (const unsigned char *s1, const unsigned char *s2,
1815 REGISTER int len, RE_TRANSLATE_TYPE translate);
1816 static int re_match_2_internal (struct re_pattern_buffer *bufp,
1817 re_char *string1, int size1,
1818 re_char *string2, int size2, int pos,
1819 struct re_registers *regs, int stop);
1821 #ifndef MATCH_MAY_ALLOCATE
1823 /* If we cannot allocate large objects within re_match_2_internal,
1824 we make the fail stack and register vectors global.
1825 The fail stack, we grow to the maximum size when a regexp
1827 The register vectors, we adjust in size each time we
1828 compile a regexp, according to the number of registers it needs. */
1830 static fail_stack_type fail_stack;
1832 /* Size with which the following vectors are currently allocated.
1833 That is so we can make them bigger as needed,
1834 but never make them smaller. */
1835 static int regs_allocated_size;
1837 static re_char ** regstart, ** regend;
1838 static re_char ** old_regstart, ** old_regend;
1839 static re_char **best_regstart, **best_regend;
1840 static register_info_type *reg_info;
1841 static re_char **reg_dummy;
1842 static register_info_type *reg_info_dummy;
1844 /* Make the register vectors big enough for NUM_REGS registers,
1845 but don't make them smaller. */
1848 regex_grow_registers (int num_regs)
1850 if (num_regs > regs_allocated_size)
1852 RETALLOC_IF (regstart, num_regs, re_char *);
1853 RETALLOC_IF (regend, num_regs, re_char *);
1854 RETALLOC_IF (old_regstart, num_regs, re_char *);
1855 RETALLOC_IF (old_regend, num_regs, re_char *);
1856 RETALLOC_IF (best_regstart, num_regs, re_char *);
1857 RETALLOC_IF (best_regend, num_regs, re_char *);
1858 RETALLOC_IF (reg_info, num_regs, register_info_type);
1859 RETALLOC_IF (reg_dummy, num_regs, re_char *);
1860 RETALLOC_IF (reg_info_dummy, num_regs, register_info_type);
1862 regs_allocated_size = num_regs;
1866 #endif /* not MATCH_MAY_ALLOCATE */
1868 /* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX.
1869 Returns one of error codes defined in `regex.h', or zero for success.
1871 Assumes the `allocated' (and perhaps `buffer') and `translate'
1872 fields are set in BUFP on entry.
1874 If it succeeds, results are put in BUFP (if it returns an error, the
1875 contents of BUFP are undefined):
1876 `buffer' is the compiled pattern;
1877 `syntax' is set to SYNTAX;
1878 `used' is set to the length of the compiled pattern;
1879 `fastmap_accurate' is zero;
1880 `re_nsub' is the number of subexpressions in PATTERN;
1881 `not_bol' and `not_eol' are zero;
1883 The `fastmap' and `newline_anchor' fields are neither
1884 examined nor set. */
1886 /* Return, freeing storage we allocated. */
1887 #define FREE_STACK_RETURN(value) \
1888 return (free (compile_stack.stack), value)
1890 static reg_errcode_t
1891 regex_compile (re_char *pattern, int size, reg_syntax_t syntax,
1892 struct re_pattern_buffer *bufp)
1894 /* We fetch characters from PATTERN here. We declare these as int
1895 (or possibly long) so that chars above 127 can be used as
1896 array indices. The macros that fetch a character from the pattern
1897 make sure to coerce to unsigned char before assigning, so we won't
1898 get bitten by negative numbers here. */
1899 /* XEmacs change: used to be unsigned char. */
1900 REGISTER EMACS_INT c, c1;
1902 /* A random temporary spot in PATTERN. */
1905 /* Points to the end of the buffer, where we should append. */
1906 REGISTER unsigned char *buf_end;
1908 /* Keeps track of unclosed groups. */
1909 compile_stack_type compile_stack;
1911 /* Points to the current (ending) position in the pattern. */
1912 re_char *p = pattern;
1913 re_char *pend = pattern + size;
1915 /* How to translate the characters in the pattern. */
1916 RE_TRANSLATE_TYPE translate = bufp->translate;
1918 /* Address of the count-byte of the most recently inserted `exactn'
1919 command. This makes it possible to tell if a new exact-match
1920 character can be added to that command or if the character requires
1921 a new `exactn' command. */
1922 unsigned char *pending_exact = 0;
1924 /* Address of start of the most recently finished expression.
1925 This tells, e.g., postfix * where to find the start of its
1926 operand. Reset at the beginning of groups and alternatives. */
1927 unsigned char *laststart = 0;
1929 /* Address of beginning of regexp, or inside of last group. */
1930 unsigned char *begalt;
1932 /* Place in the uncompiled pattern (i.e., the {) to
1933 which to go back if the interval is invalid. */
1934 re_char *beg_interval;
1936 /* Address of the place where a forward jump should go to the end of
1937 the containing expression. Each alternative of an `or' -- except the
1938 last -- ends with a forward jump of this sort. */
1939 unsigned char *fixup_alt_jump = 0;
1941 /* Counts open-groups as they are encountered. Remembered for the
1942 matching close-group on the compile stack, so the same register
1943 number is put in the stop_memory as the start_memory. */
1944 regnum_t regnum = 0;
1947 DEBUG_PRINT1 ("\nCompiling pattern: ");
1952 for (debug_count = 0; debug_count < size; debug_count++)
1953 putchar (pattern[debug_count]);
1958 /* Initialize the compile stack. */
1959 compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t);
1960 if (compile_stack.stack == NULL)
1963 compile_stack.size = INIT_COMPILE_STACK_SIZE;
1964 compile_stack.avail = 0;
1966 /* Initialize the pattern buffer. */
1967 bufp->syntax = syntax;
1968 bufp->fastmap_accurate = 0;
1969 bufp->not_bol = bufp->not_eol = 0;
1971 /* Set `used' to zero, so that if we return an error, the pattern
1972 printer (for debugging) will think there's no pattern. We reset it
1976 /* Always count groups, whether or not bufp->no_sub is set. */
1979 #if !defined (emacs) && !defined (SYNTAX_TABLE)
1980 /* Initialize the syntax table. */
1981 init_syntax_once ();
1984 if (bufp->allocated == 0)
1987 { /* If zero allocated, but buffer is non-null, try to realloc
1988 enough space. This loses if buffer's address is bogus, but
1989 that is the user's responsibility. */
1990 RETALLOC (bufp->buffer, INIT_BUF_SIZE, unsigned char);
1993 { /* Caller did not allocate a buffer. Do it for them. */
1994 bufp->buffer = TALLOC (INIT_BUF_SIZE, unsigned char);
1996 if (!bufp->buffer) FREE_STACK_RETURN (REG_ESPACE);
1998 bufp->allocated = INIT_BUF_SIZE;
2001 begalt = buf_end = bufp->buffer;
2003 /* Loop through the uncompiled pattern until we're at the end. */
2012 if ( /* If at start of pattern, it's an operator. */
2014 /* If context independent, it's an operator. */
2015 || syntax & RE_CONTEXT_INDEP_ANCHORS
2016 /* Otherwise, depends on what's come before. */
2017 || at_begline_loc_p (pattern, p, syntax))
2027 if ( /* If at end of pattern, it's an operator. */
2029 /* If context independent, it's an operator. */
2030 || syntax & RE_CONTEXT_INDEP_ANCHORS
2031 /* Otherwise, depends on what's next. */
2032 || at_endline_loc_p (p, pend, syntax))
2042 if ((syntax & RE_BK_PLUS_QM)
2043 || (syntax & RE_LIMITED_OPS))
2047 /* If there is no previous pattern... */
2050 if (syntax & RE_CONTEXT_INVALID_OPS)
2051 FREE_STACK_RETURN (REG_BADRPT);
2052 else if (!(syntax & RE_CONTEXT_INDEP_OPS))
2057 /* true means zero/many matches are allowed. */
2058 re_bool zero_times_ok = c != '+';
2059 re_bool many_times_ok = c != '?';
2061 /* true means match shortest string possible. */
2062 re_bool minimal = false;
2064 /* If there is a sequence of repetition chars, collapse it
2065 down to just one (the right one). We can't combine
2066 interval operators with these because of, e.g., `a{2}*',
2067 which should only match an even number of `a's. */
2072 if (c == '*' || (!(syntax & RE_BK_PLUS_QM)
2073 && (c == '+' || c == '?')))
2076 else if (syntax & RE_BK_PLUS_QM && c == '\\')
2078 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
2081 if (!(c1 == '+' || c1 == '?'))
2096 /* If we get here, we found another repeat character. */
2097 if (!(syntax & RE_NO_MINIMAL_MATCHING))
2099 /* "*?" and "+?" and "??" are okay (and mean match
2100 minimally), but other sequences (such as "*??" and
2101 "+++") are rejected (reserved for future use). */
2102 if (minimal || c != '?')
2103 FREE_STACK_RETURN (REG_BADRPT);
2108 zero_times_ok |= c != '+';
2109 many_times_ok |= c != '?';
2113 /* Star, etc. applied to an empty pattern is equivalent
2114 to an empty pattern. */
2118 /* Now we know whether zero matches is allowed
2119 and whether two or more matches is allowed
2120 and whether we want minimal or maximal matching. */
2126 0: /on_failure_jump to 6
2131 GET_BUFFER_SPACE (6);
2132 INSERT_JUMP (jump, laststart, buf_end + 3);
2134 INSERT_JUMP (on_failure_jump, laststart, laststart + 6);
2137 else if (zero_times_ok)
2142 6: /on_failure_jump to 3
2145 GET_BUFFER_SPACE (6);
2146 INSERT_JUMP (jump, laststart, buf_end + 3);
2148 STORE_JUMP (on_failure_jump, buf_end, laststart + 3);
2155 3: /on_failure_jump to 0
2158 GET_BUFFER_SPACE (3);
2159 STORE_JUMP (on_failure_jump, buf_end, laststart);
2165 /* Are we optimizing this jump? */
2166 re_bool keep_string_p = false;
2169 { /* More than one repetition is allowed, so put in
2170 at the end a backward relative jump from
2171 `buf_end' to before the next jump we're going
2172 to put in below (which jumps from laststart to
2175 But if we are at the `*' in the exact sequence `.*\n',
2176 insert an unconditional jump backwards to the .,
2177 instead of the beginning of the loop. This way we only
2178 push a failure point once, instead of every time
2179 through the loop. */
2180 assert (p - 1 > pattern);
2182 /* Allocate the space for the jump. */
2183 GET_BUFFER_SPACE (3);
2185 /* We know we are not at the first character of the
2186 pattern, because laststart was nonzero. And we've
2187 already incremented `p', by the way, to be the
2188 character after the `*'. Do we have to do something
2189 analogous here for null bytes, because of
2193 && p < pend && *p == '\n'
2194 && !(syntax & RE_DOT_NEWLINE))
2195 { /* We have .*\n. */
2196 STORE_JUMP (jump, buf_end, laststart);
2197 keep_string_p = true;
2200 /* Anything else. */
2201 STORE_JUMP (maybe_pop_jump, buf_end, laststart - 3);
2203 /* We've added more stuff to the buffer. */
2207 /* On failure, jump from laststart to buf_end + 3,
2208 which will be the end of the buffer after this jump
2210 GET_BUFFER_SPACE (3);
2211 INSERT_JUMP (keep_string_p ? on_failure_keep_string_jump
2213 laststart, buf_end + 3);
2218 /* At least one repetition is required, so insert a
2219 `dummy_failure_jump' before the initial
2220 `on_failure_jump' instruction of the loop. This
2221 effects a skip over that instruction the first time
2222 we hit that loop. */
2223 GET_BUFFER_SPACE (3);
2224 INSERT_JUMP (dummy_failure_jump, laststart, laststart + 6);
2234 laststart = buf_end;
2241 /* XEmacs change: this whole section */
2242 re_bool had_char_class = false;
2244 re_bool has_extended_chars = false;
2245 REGISTER Lisp_Object rtab = Qnil;
2248 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2250 /* Ensure that we have enough space to push a charset: the
2251 opcode, the length count, and the bitset; 34 bytes in all. */
2252 GET_BUFFER_SPACE (34);
2254 laststart = buf_end;
2256 /* We test `*p == '^' twice, instead of using an if
2257 statement, so we only need one BUF_PUSH. */
2258 BUF_PUSH (*p == '^' ? charset_not : charset);
2262 /* Remember the first position in the bracket expression. */
2265 /* Push the number of bytes in the bitmap. */
2266 BUF_PUSH ((1 << BYTEWIDTH) / BYTEWIDTH);
2268 /* Clear the whole map. */
2269 memset (buf_end, 0, (1 << BYTEWIDTH) / BYTEWIDTH);
2271 /* charset_not matches newline according to a syntax bit. */
2272 if ((re_opcode_t) buf_end[-2] == charset_not
2273 && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
2274 SET_LIST_BIT ('\n');
2277 start_over_with_extended:
2278 if (has_extended_chars)
2280 /* There are extended chars here, which means we need to start
2281 over and shift to unified range-table format. */
2282 if (buf_end[-2] == charset)
2283 buf_end[-2] = charset_mule;
2285 buf_end[-2] = charset_mule_not;
2287 p = p1; /* go back to the beginning of the charset, after
2289 rtab = Vthe_lisp_rangetab;
2290 Fclear_range_table (rtab);
2292 /* charset_not matches newline according to a syntax bit. */
2293 if ((re_opcode_t) buf_end[-1] == charset_mule_not
2294 && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
2295 SET_EITHER_BIT ('\n');
2299 /* Read in characters and ranges, setting map bits. */
2302 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2307 if (c >= 0x80 && !has_extended_chars)
2309 has_extended_chars = 1;
2310 /* Frumble-bumble, we've found some extended chars.
2311 Need to start over, process everything using
2312 the general extended-char mechanism, and need
2313 to use charset_mule and charset_mule_not instead
2314 of charset and charset_not. */
2315 goto start_over_with_extended;
2318 /* \ might escape characters inside [...] and [^...]. */
2319 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\')
2321 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
2325 if (c1 >= 0x80 && !has_extended_chars)
2327 has_extended_chars = 1;
2328 goto start_over_with_extended;
2331 SET_EITHER_BIT (c1);
2335 /* Could be the end of the bracket expression. If it's
2336 not (i.e., when the bracket expression is `[]' so
2337 far), the ']' character bit gets set way below. */
2338 if (c == ']' && p != p1 + 1)
2341 /* Look ahead to see if it's a range when the last thing
2342 was a character class. */
2343 if (had_char_class && c == '-' && *p != ']')
2344 FREE_STACK_RETURN (REG_ERANGE);
2346 /* Look ahead to see if it's a range when the last thing
2347 was a character: if this is a hyphen not at the
2348 beginning or the end of a list, then it's the range
2351 && !(p - 2 >= pattern && p[-2] == '[')
2352 && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^')
2358 if (* (unsigned char *) p >= 0x80 && !has_extended_chars)
2360 has_extended_chars = 1;
2361 goto start_over_with_extended;
2363 if (has_extended_chars)
2364 ret = compile_extended_range (&p, pend, translate,
2368 ret = compile_range (&p, pend, translate, syntax, buf_end);
2369 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret);
2372 else if (p[0] == '-' && p[1] != ']')
2373 { /* This handles ranges made up of characters only. */
2376 /* Move past the `-'. */
2380 if (* (unsigned char *) p >= 0x80 && !has_extended_chars)
2382 has_extended_chars = 1;
2383 goto start_over_with_extended;
2385 if (has_extended_chars)
2386 ret = compile_extended_range (&p, pend, translate,
2390 ret = compile_range (&p, pend, translate, syntax, buf_end);
2391 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret);
2394 /* See if we're at the beginning of a possible character
2397 else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == ':')
2398 { /* Leave room for the null. */
2399 char str[CHAR_CLASS_MAX_LENGTH + 1];
2404 /* If pattern is `[[:'. */
2405 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2409 /* #### This code is unused.
2410 Correctness is not checked after TRT
2413 if (c == ':' || c == ']' || p == pend
2414 || c1 == CHAR_CLASS_MAX_LENGTH)
2416 str[c1++] = (char) c;
2420 /* If isn't a word bracketed by `[:' and `:]':
2421 undo the ending character, the letters, and leave
2422 the leading `:' and `[' (but set bits for them). */
2423 if (c == ':' && *p == ']')
2426 re_bool is_alnum = STREQ (str, "alnum");
2427 re_bool is_alpha = STREQ (str, "alpha");
2428 re_bool is_blank = STREQ (str, "blank");
2429 re_bool is_cntrl = STREQ (str, "cntrl");
2430 re_bool is_digit = STREQ (str, "digit");
2431 re_bool is_graph = STREQ (str, "graph");
2432 re_bool is_lower = STREQ (str, "lower");
2433 re_bool is_print = STREQ (str, "print");
2434 re_bool is_punct = STREQ (str, "punct");
2435 re_bool is_space = STREQ (str, "space");
2436 re_bool is_upper = STREQ (str, "upper");
2437 re_bool is_xdigit = STREQ (str, "xdigit");
2439 if (!IS_CHAR_CLASS (str))
2440 FREE_STACK_RETURN (REG_ECTYPE);
2442 /* Throw away the ] at the end of the character
2446 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2448 for (ch = 0; ch < 1 << BYTEWIDTH; ch++)
2450 /* This was split into 3 if's to
2451 avoid an arbitrary limit in some compiler. */
2452 if ( (is_alnum && ISALNUM (ch))
2453 || (is_alpha && ISALPHA (ch))
2454 || (is_blank && ISBLANK (ch))
2455 || (is_cntrl && ISCNTRL (ch)))
2456 SET_EITHER_BIT (ch);
2457 if ( (is_digit && ISDIGIT (ch))
2458 || (is_graph && ISGRAPH (ch))
2459 || (is_lower && ISLOWER (ch))
2460 || (is_print && ISPRINT (ch)))
2461 SET_EITHER_BIT (ch);
2462 if ( (is_punct && ISPUNCT (ch))
2463 || (is_space && ISSPACE (ch))
2464 || (is_upper && ISUPPER (ch))
2465 || (is_xdigit && ISXDIGIT (ch)))
2466 SET_EITHER_BIT (ch);
2468 had_char_class = true;
2475 SET_EITHER_BIT ('[');
2476 SET_EITHER_BIT (':');
2477 had_char_class = false;
2482 had_char_class = false;
2488 if (has_extended_chars)
2490 /* We have a range table, not a bit vector. */
2492 unified_range_table_bytes_needed (rtab);
2493 GET_BUFFER_SPACE (bytes_needed);
2494 unified_range_table_copy_data (rtab, buf_end);
2495 buf_end += unified_range_table_bytes_used (buf_end);
2499 /* Discard any (non)matching list bytes that are all 0 at the
2500 end of the map. Decrease the map-length byte too. */
2501 while ((int) buf_end[-1] > 0 && buf_end[buf_end[-1] - 1] == 0)
2503 buf_end += buf_end[-1];
2509 if (syntax & RE_NO_BK_PARENS)
2516 if (syntax & RE_NO_BK_PARENS)
2523 if (syntax & RE_NEWLINE_ALT)
2530 if (syntax & RE_NO_BK_VBAR)
2537 if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES)
2538 goto handle_interval;
2544 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
2546 /* Do not translate the character after the \, so that we can
2547 distinguish, e.g., \B from \b, even if we normally would
2548 translate, e.g., B to b. */
2554 if (syntax & RE_NO_BK_PARENS)
2555 goto normal_backslash;
2561 if (!(syntax & RE_NO_SHY_GROUPS)
2569 case ':': /* shy groups */
2573 /* All others are reserved for future constructs. */
2575 FREE_STACK_RETURN (REG_BADPAT);
2584 if (COMPILE_STACK_FULL)
2586 RETALLOC (compile_stack.stack, compile_stack.size << 1,
2587 compile_stack_elt_t);
2588 if (compile_stack.stack == NULL) return REG_ESPACE;
2590 compile_stack.size <<= 1;
2593 /* These are the values to restore when we hit end of this
2594 group. They are all relative offsets, so that if the
2595 whole pattern moves because of realloc, they will still
2597 COMPILE_STACK_TOP.begalt_offset = begalt - bufp->buffer;
2598 COMPILE_STACK_TOP.fixup_alt_jump
2599 = fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0;
2600 COMPILE_STACK_TOP.laststart_offset = buf_end - bufp->buffer;
2601 COMPILE_STACK_TOP.regnum = r;
2603 /* We will eventually replace the 0 with the number of
2604 groups inner to this one. But do not push a
2605 start_memory for groups beyond the last one we can
2606 represent in the compiled pattern. */
2607 if (r <= MAX_REGNUM)
2609 COMPILE_STACK_TOP.inner_group_offset
2610 = buf_end - bufp->buffer + 2;
2611 BUF_PUSH_3 (start_memory, r, 0);
2614 compile_stack.avail++;
2619 /* If we've reached MAX_REGNUM groups, then this open
2620 won't actually generate any code, so we'll have to
2621 clear pending_exact explicitly. */
2628 if (syntax & RE_NO_BK_PARENS) goto normal_backslash;
2630 if (COMPILE_STACK_EMPTY) {
2631 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
2632 goto normal_backslash;
2634 FREE_STACK_RETURN (REG_ERPAREN);
2639 { /* Push a dummy failure point at the end of the
2640 alternative for a possible future
2641 `pop_failure_jump' to pop. See comments at
2642 `push_dummy_failure' in `re_match_2'. */
2643 BUF_PUSH (push_dummy_failure);
2645 /* We allocated space for this jump when we assigned
2646 to `fixup_alt_jump', in the `handle_alt' case below. */
2647 STORE_JUMP (jump_past_alt, fixup_alt_jump, buf_end - 1);
2650 /* See similar code for backslashed left paren above. */
2651 if (COMPILE_STACK_EMPTY) {
2652 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
2655 FREE_STACK_RETURN (REG_ERPAREN);
2658 /* Since we just checked for an empty stack above, this
2659 ``can't happen''. */
2660 assert (compile_stack.avail != 0);
2662 /* We don't just want to restore into `regnum', because
2663 later groups should continue to be numbered higher,
2664 as in `(ab)c(de)' -- the second group is #2. */
2665 regnum_t this_group_regnum;
2667 compile_stack.avail--;
2668 begalt = bufp->buffer + COMPILE_STACK_TOP.begalt_offset;
2670 = COMPILE_STACK_TOP.fixup_alt_jump
2671 ? bufp->buffer + COMPILE_STACK_TOP.fixup_alt_jump - 1
2673 laststart = bufp->buffer + COMPILE_STACK_TOP.laststart_offset;
2674 this_group_regnum = COMPILE_STACK_TOP.regnum;
2675 /* If we've reached MAX_REGNUM groups, then this open
2676 won't actually generate any code, so we'll have to
2677 clear pending_exact explicitly. */
2680 /* We're at the end of the group, so now we know how many
2681 groups were inside this one. */
2682 if (this_group_regnum <= MAX_REGNUM)
2684 unsigned char *inner_group_loc
2685 = bufp->buffer + COMPILE_STACK_TOP.inner_group_offset;
2687 *inner_group_loc = regnum - this_group_regnum;
2688 BUF_PUSH_3 (stop_memory, this_group_regnum,
2689 regnum - this_group_regnum);
2695 case '|': /* `\|'. */
2696 if (syntax & RE_LIMITED_OPS || syntax & RE_NO_BK_VBAR)
2697 goto normal_backslash;
2699 if (syntax & RE_LIMITED_OPS)
2702 /* Insert before the previous alternative a jump which
2703 jumps to this alternative if the former fails. */
2704 GET_BUFFER_SPACE (3);
2705 INSERT_JUMP (on_failure_jump, begalt, buf_end + 6);
2709 /* The alternative before this one has a jump after it
2710 which gets executed if it gets matched. Adjust that
2711 jump so it will jump to this alternative's analogous
2712 jump (put in below, which in turn will jump to the next
2713 (if any) alternative's such jump, etc.). The last such
2714 jump jumps to the correct final destination. A picture:
2720 If we are at `b', then fixup_alt_jump right now points to a
2721 three-byte space after `a'. We'll put in the jump, set
2722 fixup_alt_jump to right after `b', and leave behind three
2723 bytes which we'll fill in when we get to after `c'. */
2726 STORE_JUMP (jump_past_alt, fixup_alt_jump, buf_end);
2728 /* Mark and leave space for a jump after this alternative,
2729 to be filled in later either by next alternative or
2730 when know we're at the end of a series of alternatives. */
2731 fixup_alt_jump = buf_end;
2732 GET_BUFFER_SPACE (3);
2741 /* If \{ is a literal. */
2742 if (!(syntax & RE_INTERVALS)
2743 /* If we're at `\{' and it's not the open-interval
2745 || ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
2746 || (p - 2 == pattern && p == pend))
2747 goto normal_backslash;
2751 /* If got here, then the syntax allows intervals. */
2753 /* At least (most) this many matches must be made. */
2754 int lower_bound = -1, upper_bound = -1;
2756 beg_interval = p - 1;
2760 if (syntax & RE_NO_BK_BRACES)
2761 goto unfetch_interval;
2763 FREE_STACK_RETURN (REG_EBRACE);
2766 GET_UNSIGNED_NUMBER (lower_bound);
2770 GET_UNSIGNED_NUMBER (upper_bound);
2771 if (upper_bound < 0) upper_bound = RE_DUP_MAX;
2774 /* Interval such as `{1}' => match exactly once. */
2775 upper_bound = lower_bound;
2777 if (lower_bound < 0 || upper_bound > RE_DUP_MAX
2778 || lower_bound > upper_bound)
2780 if (syntax & RE_NO_BK_BRACES)
2781 goto unfetch_interval;
2783 FREE_STACK_RETURN (REG_BADBR);
2786 if (!(syntax & RE_NO_BK_BRACES))
2788 if (c != '\\') FREE_STACK_RETURN (REG_EBRACE);
2795 if (syntax & RE_NO_BK_BRACES)
2796 goto unfetch_interval;
2798 FREE_STACK_RETURN (REG_BADBR);
2801 /* We just parsed a valid interval. */
2803 /* If it's invalid to have no preceding re. */
2806 if (syntax & RE_CONTEXT_INVALID_OPS)
2807 FREE_STACK_RETURN (REG_BADRPT);
2808 else if (syntax & RE_CONTEXT_INDEP_OPS)
2809 laststart = buf_end;
2811 goto unfetch_interval;
2814 /* If the upper bound is zero, don't want to succeed at
2815 all; jump from `laststart' to `b + 3', which will be
2816 the end of the buffer after we insert the jump. */
2817 if (upper_bound == 0)
2819 GET_BUFFER_SPACE (3);
2820 INSERT_JUMP (jump, laststart, buf_end + 3);
2824 /* Otherwise, we have a nontrivial interval. When
2825 we're all done, the pattern will look like:
2826 set_number_at <jump count> <upper bound>
2827 set_number_at <succeed_n count> <lower bound>
2828 succeed_n <after jump addr> <succeed_n count>
2830 jump_n <succeed_n addr> <jump count>
2831 (The upper bound and `jump_n' are omitted if
2832 `upper_bound' is 1, though.) */
2834 { /* If the upper bound is > 1, we need to insert
2835 more at the end of the loop. */
2836 Memory_count nbytes = 10 + (upper_bound > 1) * 10;
2838 GET_BUFFER_SPACE (nbytes);
2840 /* Initialize lower bound of the `succeed_n', even
2841 though it will be set during matching by its
2842 attendant `set_number_at' (inserted next),
2843 because `re_compile_fastmap' needs to know.
2844 Jump to the `jump_n' we might insert below. */
2845 INSERT_JUMP2 (succeed_n, laststart,
2846 buf_end + 5 + (upper_bound > 1) * 5,
2850 /* Code to initialize the lower bound. Insert
2851 before the `succeed_n'. The `5' is the last two
2852 bytes of this `set_number_at', plus 3 bytes of
2853 the following `succeed_n'. */
2854 insert_op2 (set_number_at, laststart, 5, lower_bound, buf_end);
2857 if (upper_bound > 1)
2858 { /* More than one repetition is allowed, so
2859 append a backward jump to the `succeed_n'
2860 that starts this interval.
2862 When we've reached this during matching,
2863 we'll have matched the interval once, so
2864 jump back only `upper_bound - 1' times. */
2865 STORE_JUMP2 (jump_n, buf_end, laststart + 5,
2869 /* The location we want to set is the second
2870 parameter of the `jump_n'; that is `b-2' as
2871 an absolute address. `laststart' will be
2872 the `set_number_at' we're about to insert;
2873 `laststart+3' the number to set, the source
2874 for the relative address. But we are
2875 inserting into the middle of the pattern --
2876 so everything is getting moved up by 5.
2877 Conclusion: (b - 2) - (laststart + 3) + 5,
2878 i.e., b - laststart.
2880 We insert this at the beginning of the loop
2881 so that if we fail during matching, we'll
2882 reinitialize the bounds. */
2883 insert_op2 (set_number_at, laststart,
2884 buf_end - laststart,
2885 upper_bound - 1, buf_end);
2890 beg_interval = NULL;
2895 /* If an invalid interval, match the characters as literals. */
2896 assert (beg_interval);
2898 beg_interval = NULL;
2900 /* normal_char and normal_backslash need `c'. */
2903 if (!(syntax & RE_NO_BK_BRACES))
2905 if (p > pattern && p[-1] == '\\')
2906 goto normal_backslash;
2911 /* There is no way to specify the before_dot and after_dot
2912 operators. rms says this is ok. --karl */
2918 laststart = buf_end;
2920 /* XEmacs addition */
2921 if (c >= 0x80 || syntax_spec_code[c] == 0377)
2922 FREE_STACK_RETURN (REG_ESYNTAX);
2923 BUF_PUSH_2 (syntaxspec, syntax_spec_code[c]);
2927 laststart = buf_end;
2929 /* XEmacs addition */
2930 if (c >= 0x80 || syntax_spec_code[c] == 0377)
2931 FREE_STACK_RETURN (REG_ESYNTAX);
2932 BUF_PUSH_2 (notsyntaxspec, syntax_spec_code[c]);
2936 /* 97.2.17 jhod merged in to XEmacs from mule-2.3 */
2938 laststart = buf_end;
2940 if (c < 32 || c > 127)
2941 FREE_STACK_RETURN (REG_ECATEGORY);
2942 BUF_PUSH_2 (categoryspec, c);
2946 laststart = buf_end;
2948 if (c < 32 || c > 127)
2949 FREE_STACK_RETURN (REG_ECATEGORY);
2950 BUF_PUSH_2 (notcategoryspec, c);
2952 /* end of category patch */
2958 laststart = buf_end;
2959 BUF_PUSH (wordchar);
2964 laststart = buf_end;
2965 BUF_PUSH (notwordchar);
2978 BUF_PUSH (wordbound);
2982 BUF_PUSH (notwordbound);
2993 case '1': case '2': case '3': case '4': case '5':
2994 case '6': case '7': case '8': case '9':
2997 if (syntax & RE_NO_BK_REFS)
3003 FREE_STACK_RETURN (REG_ESUBREG);
3005 /* Can't back reference to a subexpression if inside of it. */
3006 if (group_in_compile_stack (compile_stack, reg))
3009 laststart = buf_end;
3010 BUF_PUSH_2 (duplicate, reg);
3017 if (syntax & RE_BK_PLUS_QM)
3020 goto normal_backslash;
3024 /* You might think it would be useful for \ to mean
3025 not to translate; but if we don't translate it,
3026 it will never match anything. */
3034 /* Expects the character in `c'. */
3035 /* `p' points to the location after where `c' came from. */
3038 /* XEmacs: modifications here for Mule. */
3039 /* `q' points to the beginning of the next char. */
3042 /* If no exactn currently being built. */
3045 /* If last exactn not at current position. */
3046 || pending_exact + *pending_exact + 1 != buf_end
3048 /* We have only one byte following the exactn for the count. */
3049 || ((unsigned int) (*pending_exact + (q - p)) >=
3050 ((unsigned int) (1 << BYTEWIDTH) - 1))
3052 /* If followed by a repetition operator. */
3053 || *q == '*' || *q == '^'
3054 || ((syntax & RE_BK_PLUS_QM)
3055 ? *q == '\\' && (q[1] == '+' || q[1] == '?')
3056 : (*q == '+' || *q == '?'))
3057 || ((syntax & RE_INTERVALS)
3058 && ((syntax & RE_NO_BK_BRACES)
3060 : (q[0] == '\\' && q[1] == '{'))))
3062 /* Start building a new exactn. */
3064 laststart = buf_end;
3066 BUF_PUSH_2 (exactn, 0);
3067 pending_exact = buf_end - 1;
3076 Bufbyte tmp_buf[MAX_EMCHAR_LEN];
3079 bt_count = set_charptr_emchar (tmp_buf, c);
3081 for (i = 0; i < bt_count; i++)
3083 BUF_PUSH (tmp_buf[i]);
3091 } /* while p != pend */
3094 /* Through the pattern now. */
3097 STORE_JUMP (jump_past_alt, fixup_alt_jump, buf_end);
3099 if (!COMPILE_STACK_EMPTY)
3100 FREE_STACK_RETURN (REG_EPAREN);
3102 /* If we don't want backtracking, force success
3103 the first time we reach the end of the compiled pattern. */
3104 if (syntax & RE_NO_POSIX_BACKTRACKING)
3107 free (compile_stack.stack);
3109 /* We have succeeded; set the length of the buffer. */
3110 bufp->used = buf_end - bufp->buffer;
3115 DEBUG_PRINT1 ("\nCompiled pattern: \n");
3116 print_compiled_pattern (bufp);
3120 #ifndef MATCH_MAY_ALLOCATE
3121 /* Initialize the failure stack to the largest possible stack. This
3122 isn't necessary unless we're trying to avoid calling alloca in
3123 the search and match routines. */
3125 int num_regs = bufp->re_nsub + 1;
3127 /* Since DOUBLE_FAIL_STACK refuses to double only if the current size
3128 is strictly greater than re_max_failures, the largest possible stack
3129 is 2 * re_max_failures failure points. */
3130 if (fail_stack.size < (2 * re_max_failures * MAX_FAILURE_ITEMS))
3132 fail_stack.size = (2 * re_max_failures * MAX_FAILURE_ITEMS);
3135 if (! fail_stack.stack)
3137 = (fail_stack_elt_t *) xmalloc (fail_stack.size
3138 * sizeof (fail_stack_elt_t));
3141 = (fail_stack_elt_t *) xrealloc (fail_stack.stack,
3143 * sizeof (fail_stack_elt_t)));
3144 #else /* not emacs */
3145 if (! fail_stack.stack)
3147 = (fail_stack_elt_t *) malloc (fail_stack.size
3148 * sizeof (fail_stack_elt_t));
3151 = (fail_stack_elt_t *) realloc (fail_stack.stack,
3153 * sizeof (fail_stack_elt_t)));
3157 regex_grow_registers (num_regs);
3159 #endif /* not MATCH_MAY_ALLOCATE */
3162 } /* regex_compile */
3164 /* Subroutines for `regex_compile'. */
3166 /* Store OP at LOC followed by two-byte integer parameter ARG. */
3169 store_op1 (re_opcode_t op, unsigned char *loc, int arg)
3171 *loc = (unsigned char) op;
3172 STORE_NUMBER (loc + 1, arg);
3176 /* Like `store_op1', but for two two-byte parameters ARG1 and ARG2. */
3179 store_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2)
3181 *loc = (unsigned char) op;
3182 STORE_NUMBER (loc + 1, arg1);
3183 STORE_NUMBER (loc + 3, arg2);
3187 /* Copy the bytes from LOC to END to open up three bytes of space at LOC
3188 for OP followed by two-byte integer parameter ARG. */
3191 insert_op1 (re_opcode_t op, unsigned char *loc, int arg, unsigned char *end)
3193 REGISTER unsigned char *pfrom = end;
3194 REGISTER unsigned char *pto = end + 3;
3196 while (pfrom != loc)
3199 store_op1 (op, loc, arg);
3203 /* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2. */
3206 insert_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2,
3209 REGISTER unsigned char *pfrom = end;
3210 REGISTER unsigned char *pto = end + 5;
3212 while (pfrom != loc)
3215 store_op2 (op, loc, arg1, arg2);
3219 /* P points to just after a ^ in PATTERN. Return true if that ^ comes
3220 after an alternative or a begin-subexpression. We assume there is at
3221 least one character before the ^. */
3224 at_begline_loc_p (re_char *pattern, re_char *p, reg_syntax_t syntax)
3226 re_char *prev = p - 2;
3227 re_bool prev_prev_backslash = prev > pattern && prev[-1] == '\\';
3230 /* After a subexpression? */
3231 (*prev == '(' && (syntax & RE_NO_BK_PARENS || prev_prev_backslash))
3232 /* After an alternative? */
3233 || (*prev == '|' && (syntax & RE_NO_BK_VBAR || prev_prev_backslash));
3237 /* The dual of at_begline_loc_p. This one is for $. We assume there is
3238 at least one character after the $, i.e., `P < PEND'. */
3241 at_endline_loc_p (re_char *p, re_char *pend, int syntax)
3244 re_bool next_backslash = *next == '\\';
3245 re_char *next_next = p + 1 < pend ? p + 1 : 0;
3248 /* Before a subexpression? */
3249 (syntax & RE_NO_BK_PARENS ? *next == ')'
3250 : next_backslash && next_next && *next_next == ')')
3251 /* Before an alternative? */
3252 || (syntax & RE_NO_BK_VBAR ? *next == '|'
3253 : next_backslash && next_next && *next_next == '|');
3257 /* Returns true if REGNUM is in one of COMPILE_STACK's elements and
3258 false if it's not. */
3261 group_in_compile_stack (compile_stack_type compile_stack, regnum_t regnum)
3265 for (this_element = compile_stack.avail - 1;
3268 if (compile_stack.stack[this_element].regnum == regnum)
3275 /* Read the ending character of a range (in a bracket expression) from the
3276 uncompiled pattern *P_PTR (which ends at PEND). We assume the
3277 starting character is in `P[-2]'. (`P[-1]' is the character `-'.)
3278 Then we set the translation of all bits between the starting and
3279 ending characters (inclusive) in the compiled pattern B.
3281 Return an error code.
3283 We use these short variable names so we can use the same macros as
3284 `regex_compile' itself. */
3286 static reg_errcode_t
3287 compile_range (re_char **p_ptr, re_char *pend, RE_TRANSLATE_TYPE translate,
3288 reg_syntax_t syntax, unsigned char *buf_end)
3290 Element_count this_char;
3292 re_char *p = *p_ptr;
3293 int range_start, range_end;
3298 /* Even though the pattern is a signed `char *', we need to fetch
3299 with unsigned char *'s; if the high bit of the pattern character
3300 is set, the range endpoints will be negative if we fetch using a
3303 We also want to fetch the endpoints without translating them; the
3304 appropriate translation is done in the bit-setting loop below. */
3305 /* The SVR4 compiler on the 3B2 had trouble with unsigned const char *. */
3306 range_start = ((const unsigned char *) p)[-2];
3307 range_end = ((const unsigned char *) p)[0];
3309 /* Have to increment the pointer into the pattern string, so the
3310 caller isn't still at the ending character. */
3313 /* If the start is after the end, the range is empty. */
3314 if (range_start > range_end)
3315 return syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR;
3317 /* Here we see why `this_char' has to be larger than an `unsigned
3318 char' -- the range is inclusive, so if `range_end' == 0xff
3319 (assuming 8-bit characters), we would otherwise go into an infinite
3320 loop, since all characters <= 0xff. */
3321 for (this_char = range_start; this_char <= range_end; this_char++)
3323 SET_LIST_BIT (TRANSLATE (this_char));
3331 static reg_errcode_t
3332 compile_extended_range (re_char **p_ptr, re_char *pend,
3333 RE_TRANSLATE_TYPE translate,
3334 reg_syntax_t syntax, Lisp_Object rtab)
3336 Emchar this_char, range_start, range_end;
3342 p = (const Bufbyte *) *p_ptr;
3343 range_end = charptr_emchar (p);
3344 p--; /* back to '-' */
3345 DEC_CHARPTR (p); /* back to start of range */
3346 /* We also want to fetch the endpoints without translating them; the
3347 appropriate translation is done in the bit-setting loop below. */
3348 range_start = charptr_emchar (p);
3349 INC_CHARPTR (*p_ptr);
3351 /* If the start is after the end, the range is empty. */
3352 if (range_start > range_end)
3353 return syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR;
3355 /* Can't have ranges spanning different charsets, except maybe for
3356 ranges entirely within the first 256 chars. */
3358 if ((range_start >= 0x100 || range_end >= 0x100)
3359 && CHAR_LEADING_BYTE (range_start) !=
3360 CHAR_LEADING_BYTE (range_end))
3361 return REG_ERANGESPAN;
3363 /* As advertised, translations only work over the 0 - 0x7F range.
3364 Making this kind of stuff work generally is much harder.
3365 Iterating over the whole range like this would be way efficient
3366 if the range encompasses 10,000 chars or something. You'd have
3367 to do something like this:
3371 map over translation table in [range_start, range_end] of
3372 (put the mapped range in a;
3373 put the translation in b)
3374 invert the range in a and truncate to [range_start, range_end]
3375 compute the union of a, b
3376 union the result into rtab
3378 for (this_char = range_start;
3379 this_char <= range_end && this_char < 0x80; this_char++)
3381 SET_RANGETAB_BIT (TRANSLATE (this_char));
3384 if (this_char <= range_end)
3385 put_range_table (rtab, this_char, range_end, Qt);
3392 /* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in
3393 BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible
3394 characters can start a string that matches the pattern. This fastmap
3395 is used by re_search to skip quickly over impossible starting points.
3397 The caller must supply the address of a (1 << BYTEWIDTH)-byte data
3398 area as BUFP->fastmap.
3400 We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in
3403 Returns 0 if we succeed, -2 if an internal error. */
3406 re_compile_fastmap (struct re_pattern_buffer *bufp)
3409 #ifdef MATCH_MAY_ALLOCATE
3410 fail_stack_type fail_stack;
3412 DECLARE_DESTINATION;
3413 /* We don't push any register information onto the failure stack. */
3415 REGISTER char *fastmap = bufp->fastmap;
3416 unsigned char *pattern = bufp->buffer;
3417 unsigned long size = bufp->used;
3418 unsigned char *p = pattern;
3419 REGISTER unsigned char *pend = pattern + size;
3422 /* This holds the pointer to the failure stack, when
3423 it is allocated relocatably. */
3424 fail_stack_elt_t *failure_stack_ptr;
3427 /* Assume that each path through the pattern can be null until
3428 proven otherwise. We set this false at the bottom of switch
3429 statement, to which we get only if a particular path doesn't
3430 match the empty string. */
3431 re_bool path_can_be_null = true;
3433 /* We aren't doing a `succeed_n' to begin with. */
3434 re_bool succeed_n_p = false;
3436 assert (fastmap != NULL && p != NULL);
3439 memset (fastmap, 0, 1 << BYTEWIDTH); /* Assume nothing's valid. */
3440 bufp->fastmap_accurate = 1; /* It will be when we're done. */
3441 bufp->can_be_null = 0;
3445 if (p == pend || *p == succeed)
3447 /* We have reached the (effective) end of pattern. */
3448 if (!FAIL_STACK_EMPTY ())
3450 bufp->can_be_null |= path_can_be_null;
3452 /* Reset for next path. */
3453 path_can_be_null = true;
3455 p = (unsigned char *) fail_stack.stack[--fail_stack.avail].pointer;
3463 /* We should never be about to go beyond the end of the pattern. */
3466 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
3469 /* I guess the idea here is to simply not bother with a fastmap
3470 if a backreference is used, since it's too hard to figure out
3471 the fastmap for the corresponding group. Setting
3472 `can_be_null' stops `re_search_2' from using the fastmap, so
3473 that is all we do. */
3475 bufp->can_be_null = 1;
3479 /* Following are the cases which match a character. These end
3488 /* XEmacs: Under Mule, these bit vectors will
3489 only contain values for characters below 0x80. */
3490 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
3491 if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))
3497 /* Chars beyond end of map must be allowed. */
3499 for (j = *p * BYTEWIDTH; j < 0x80; j++)
3501 /* And all extended characters must be allowed, too. */
3502 for (j = 0x80; j < 0xA0; j++)
3504 #else /* not MULE */
3505 for (j = *p * BYTEWIDTH; j < (1 << BYTEWIDTH); j++)
3509 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
3510 if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))))
3520 nentries = unified_range_table_nentries (p);
3521 for (i = 0; i < nentries; i++)
3523 EMACS_INT first, last;
3524 Lisp_Object dummy_val;
3526 Bufbyte strr[MAX_EMCHAR_LEN];
3528 unified_range_table_get_range (p, i, &first, &last,
3530 for (jj = first; jj <= last && jj < 0x80; jj++)
3532 /* Ranges below 0x100 can span charsets, but there
3533 are only two (Control-1 and Latin-1), and
3534 either first or last has to be in them. */
3535 set_charptr_emchar (strr, first);
3539 set_charptr_emchar (strr, last);
3546 case charset_mule_not:
3551 nentries = unified_range_table_nentries (p);
3552 for (i = 0; i < nentries; i++)
3554 EMACS_INT first, last;
3555 Lisp_Object dummy_val;
3557 int smallest_prev = 0;
3559 unified_range_table_get_range (p, i, &first, &last,
3561 for (jj = smallest_prev; jj < first && jj < 0x80; jj++)
3563 smallest_prev = last + 1;
3564 if (smallest_prev >= 0x80)
3567 /* Calculating which leading bytes are actually allowed
3568 here is rather difficult, so we just punt and allow
3570 for (i = 0x80; i < 0xA0; i++)
3582 for (j = 0; j < (1 << BYTEWIDTH); j++)
3585 (regex_emacs_buffer->mirror_syntax_table), j) == Sword)
3594 goto matchnotsyntax;
3596 for (j = 0; j < (1 << BYTEWIDTH); j++)
3599 (regex_emacs_buffer->mirror_syntax_table), j) != Sword)
3607 int fastmap_newline = fastmap['\n'];
3609 /* `.' matches anything ... */
3611 /* "anything" only includes bytes that can be the
3612 first byte of a character. */
3613 for (j = 0; j < 0xA0; j++)
3616 for (j = 0; j < (1 << BYTEWIDTH); j++)
3620 /* ... except perhaps newline. */
3621 if (!(bufp->syntax & RE_DOT_NEWLINE))
3622 fastmap['\n'] = fastmap_newline;
3624 /* Return if we have already set `can_be_null'; if we have,
3625 then the fastmap is irrelevant. Something's wrong here. */
3626 else if (bufp->can_be_null)
3629 /* Otherwise, have to check alternative paths. */
3640 /* This match depends on text properties. These end with
3641 aborting optimizations. */
3642 bufp->can_be_null = 1;
3646 #if 0 /* Removed during syntax-table properties patch -- 2000/12/07 mct */
3652 for (j = 0; j < 0x80; j++)
3655 (regex_emacs_buffer->mirror_syntax_table), j) ==
3656 (enum syntaxcode) k)
3658 for (j = 0x80; j < 0xA0; j++)
3660 if (LEADING_BYTE_PREFIX_P(j))
3661 /* too complicated to calculate this right */
3668 cset = CHARSET_BY_LEADING_BYTE (j);
3669 if (CHARSETP (cset))
3671 if (charset_syntax (regex_emacs_buffer, cset,
3673 == Sword || multi_p)
3678 #else /* not MULE */
3679 for (j = 0; j < (1 << BYTEWIDTH); j++)
3682 (regex_emacs_buffer->mirror_syntax_table), j) ==
3683 (enum syntaxcode) k)
3689 #if 0 /* Removed during syntax-table properties patch -- 2000/12/07 mct */
3695 for (j = 0; j < 0x80; j++)
3698 (regex_emacs_buffer->mirror_syntax_table), j) !=
3699 (enum syntaxcode) k)
3701 for (j = 0x80; j < 0xA0; j++)
3703 if (LEADING_BYTE_PREFIX_P(j))
3704 /* too complicated to calculate this right */
3711 cset = CHARSET_BY_LEADING_BYTE (j);
3712 if (CHARSETP (cset))
3714 if (charset_syntax (regex_emacs_buffer, cset,
3716 != Sword || multi_p)
3721 #else /* not MULE */
3722 for (j = 0; j < (1 << BYTEWIDTH); j++)
3725 (regex_emacs_buffer->mirror_syntax_table), j) !=
3726 (enum syntaxcode) k)
3733 /* 97/2/17 jhod category patch */
3735 case notcategoryspec:
3736 bufp->can_be_null = 1;
3738 /* end if category patch */
3741 /* All cases after this match the empty string. These end with
3749 #endif /* not emacs */
3763 case push_dummy_failure:
3768 case pop_failure_jump:
3769 case maybe_pop_jump:
3772 case dummy_failure_jump:
3773 EXTRACT_NUMBER_AND_INCR (j, p);
3778 /* Jump backward implies we just went through the body of a
3779 loop and matched nothing. Opcode jumped to should be
3780 `on_failure_jump' or `succeed_n'. Just treat it like an
3781 ordinary jump. For a * loop, it has pushed its failure
3782 point already; if so, discard that as redundant. */
3783 if ((re_opcode_t) *p != on_failure_jump
3784 && (re_opcode_t) *p != succeed_n)
3788 EXTRACT_NUMBER_AND_INCR (j, p);
3791 /* If what's on the stack is where we are now, pop it. */
3792 if (!FAIL_STACK_EMPTY ()
3793 && fail_stack.stack[fail_stack.avail - 1].pointer == p)
3799 case on_failure_jump:
3800 case on_failure_keep_string_jump:
3801 handle_on_failure_jump:
3802 EXTRACT_NUMBER_AND_INCR (j, p);
3804 /* For some patterns, e.g., `(a?)?', `p+j' here points to the
3805 end of the pattern. We don't want to push such a point,
3806 since when we restore it above, entering the switch will
3807 increment `p' past the end of the pattern. We don't need
3808 to push such a point since we obviously won't find any more
3809 fastmap entries beyond `pend'. Such a pattern can match
3810 the null string, though. */
3813 if (!PUSH_PATTERN_OP (p + j, fail_stack))
3815 RESET_FAIL_STACK ();
3820 bufp->can_be_null = 1;
3824 EXTRACT_NUMBER_AND_INCR (k, p); /* Skip the n. */
3825 succeed_n_p = false;
3832 /* Get to the number of times to succeed. */
3835 /* Increment p past the n for when k != 0. */
3836 EXTRACT_NUMBER_AND_INCR (k, p);
3840 succeed_n_p = true; /* Spaghetti code alert. */
3841 goto handle_on_failure_jump;
3858 abort (); /* We have listed all the cases. */
3861 /* Getting here means we have found the possible starting
3862 characters for one path of the pattern -- and that the empty
3863 string does not match. We need not follow this path further.
3864 Instead, look at the next alternative (remembered on the
3865 stack), or quit if no more. The test at the top of the loop
3866 does these things. */
3867 path_can_be_null = false;
3871 /* Set `can_be_null' for the last path (also the first path, if the
3872 pattern is empty). */
3873 bufp->can_be_null |= path_can_be_null;
3876 RESET_FAIL_STACK ();
3878 } /* re_compile_fastmap */
3880 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
3881 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
3882 this memory for recording register information. STARTS and ENDS
3883 must be allocated using the malloc library routine, and must each
3884 be at least NUM_REGS * sizeof (regoff_t) bytes long.
3886 If NUM_REGS == 0, then subsequent matches should allocate their own
3889 Unless this function is called, the first search or match using
3890 PATTERN_BUFFER will allocate its own register data, without
3891 freeing the old data. */
3894 re_set_registers (struct re_pattern_buffer *bufp, struct re_registers *regs,
3895 unsigned num_regs, regoff_t *starts, regoff_t *ends)
3899 bufp->regs_allocated = REGS_REALLOCATE;
3900 regs->num_regs = num_regs;
3901 regs->start = starts;
3906 bufp->regs_allocated = REGS_UNALLOCATED;
3908 regs->start = regs->end = (regoff_t *) 0;
3912 /* Searching routines. */
3914 /* Like re_search_2, below, but only one string is specified, and
3915 doesn't let you say where to stop matching. */
3918 re_search (struct re_pattern_buffer *bufp, const char *string, int size,
3919 int startpos, int range, struct re_registers *regs)
3921 return re_search_2 (bufp, NULL, 0, string, size, startpos, range,
3926 /* Snarfed from src/lisp.h, needed for compiling [ce]tags. */
3927 # define bytecount_to_charcount(ptr, len) (len)
3928 # define charcount_to_bytecount(ptr, len) (len)
3929 typedef int Charcount;
3932 /* Using the compiled pattern in BUFP->buffer, first tries to match the
3933 virtual concatenation of STRING1 and STRING2, starting first at index
3934 STARTPOS, then at STARTPOS + 1, and so on.
3936 With MULE, STARTPOS is a byte position, not a char position. And the
3937 search will increment STARTPOS by the width of the current leading
3940 STRING1 and STRING2 have length SIZE1 and SIZE2, respectively.
3942 RANGE is how far to scan while trying to match. RANGE = 0 means try
3943 only at STARTPOS; in general, the last start tried is STARTPOS +
3946 With MULE, RANGE is a byte position, not a char position. The last
3947 start tried is the character starting <= STARTPOS + RANGE.
3949 In REGS, return the indices of the virtual concatenation of STRING1
3950 and STRING2 that matched the entire BUFP->buffer and its contained
3953 Do not consider matching one past the index STOP in the virtual
3954 concatenation of STRING1 and STRING2.
3956 We return either the position in the strings at which the match was
3957 found, -1 if no match, or -2 if error (such as failure
3961 re_search_2 (struct re_pattern_buffer *bufp, const char *str1,
3962 int size1, const char *str2, int size2, int startpos,
3963 int range, struct re_registers *regs, int stop)
3966 re_char *string1 = (re_char *) str1;
3967 re_char *string2 = (re_char *) str2;
3968 REGISTER char *fastmap = bufp->fastmap;
3969 REGISTER RE_TRANSLATE_TYPE translate = bufp->translate;
3970 int total_size = size1 + size2;
3971 int endpos = startpos + range;
3972 #ifdef REGEX_BEGLINE_CHECK
3973 int anchored_at_begline = 0;
3978 /* Check for out-of-range STARTPOS. */
3979 if (startpos < 0 || startpos > total_size)
3982 /* Fix up RANGE if it might eventually take us outside
3983 the virtual concatenation of STRING1 and STRING2. */
3985 range = 0 - startpos;
3986 else if (endpos > total_size)
3987 range = total_size - startpos;
3989 /* If the search isn't to be a backwards one, don't waste time in a
3990 search for a pattern that must be anchored. */
3991 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == begbuf && range > 0)
3997 d = ((const unsigned char *)
3998 (startpos >= size1 ? string2 - size1 : string1) + startpos);
3999 range = charcount_to_bytecount (d, 1);
4004 /* In a forward search for something that starts with \=.
4005 don't keep searching past point. */
4006 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == at_dot && range > 0)
4008 range = BUF_PT (regex_emacs_buffer) - BUF_BEGV (regex_emacs_buffer)
4015 /* Update the fastmap now if not correct already. */
4016 if (fastmap && !bufp->fastmap_accurate)
4017 if (re_compile_fastmap (bufp) == -2)
4020 #ifdef REGEX_BEGLINE_CHECK
4022 unsigned long i = 0;
4024 while (i < bufp->used)
4026 if (bufp->buffer[i] == start_memory ||
4027 bufp->buffer[i] == stop_memory)
4032 anchored_at_begline = i < bufp->used && bufp->buffer[i] == begline;
4037 SETUP_SYNTAX_CACHE_FOR_OBJECT (regex_match_object,
4039 SYNTAX_CACHE_OBJECT_BYTE_TO_CHAR (regex_match_object,
4045 /* Loop through the string, looking for a place to start matching. */
4048 #ifdef REGEX_BEGLINE_CHECK
4049 /* If the regex is anchored at the beginning of a line (i.e. with a ^),
4050 then we can speed things up by skipping to the next beginning-of-
4052 if (anchored_at_begline && startpos > 0 && startpos != size1 &&
4055 /* whose stupid idea was it anyway to make this
4056 function take two strings to match?? */
4060 if (startpos < size1 && startpos + range >= size1)
4061 lim = range - (size1 - startpos);
4063 d = ((const unsigned char *)
4064 (startpos >= size1 ? string2 - size1 : string1) + startpos);
4065 DEC_CHARPTR(d); /* Ok, since startpos != size1. */
4066 d_size = charcount_to_bytecount (d, 1);
4068 if (TRANSLATE_P (translate))
4069 while (range > lim && *d != '\n')
4071 d += d_size; /* Speedier INC_CHARPTR(d) */
4072 d_size = charcount_to_bytecount (d, 1);
4076 while (range > lim && *d != '\n')
4078 d += d_size; /* Speedier INC_CHARPTR(d) */
4079 d_size = charcount_to_bytecount (d, 1);
4083 startpos += irange - range;
4085 #endif /* REGEX_BEGLINE_CHECK */
4087 /* If a fastmap is supplied, skip quickly over characters that
4088 cannot be the start of a match. If the pattern can match the
4089 null string, however, we don't need to skip characters; we want
4090 the first null string. */
4091 if (fastmap && startpos < total_size && !bufp->can_be_null)
4093 if (range > 0) /* Searching forwards. */
4098 if (startpos < size1 && startpos + range >= size1)
4099 lim = range - (size1 - startpos);
4101 d = ((const unsigned char *)
4102 (startpos >= size1 ? string2 - size1 : string1) + startpos);
4104 /* Written out as an if-else to avoid testing `translate'
4106 if (TRANSLATE_P (translate))
4112 buf_ch = charptr_emchar (d);
4113 buf_ch = RE_TRANSLATE (buf_ch);
4114 if (buf_ch >= 0200 || fastmap[(unsigned char) buf_ch])
4117 if (fastmap[(unsigned char)RE_TRANSLATE (*d)])
4120 d_size = charcount_to_bytecount (d, 1);
4122 d += d_size; /* Speedier INC_CHARPTR(d) */
4125 while (range > lim && !fastmap[*d])
4127 d_size = charcount_to_bytecount (d, 1);
4129 d += d_size; /* Speedier INC_CHARPTR(d) */
4132 startpos += irange - range;
4134 else /* Searching backwards. */
4136 Emchar c = (size1 == 0 || startpos >= size1
4137 ? charptr_emchar (string2 + startpos - size1)
4138 : charptr_emchar (string1 + startpos));
4141 if (!(c >= 0200 || fastmap[(unsigned char) c]))
4144 if (!fastmap[(unsigned char) c])
4150 /* If can't match the null string, and that's all we have left, fail. */
4151 if (range >= 0 && startpos == total_size && fastmap
4152 && !bufp->can_be_null)
4155 #ifdef emacs /* XEmacs added, w/removal of immediate_quit */
4156 if (!no_quit_in_re_search)
4159 val = re_match_2_internal (bufp, string1, size1, string2, size2,
4160 startpos, regs, stop);
4161 #ifndef REGEX_MALLOC
4178 d = ((const unsigned char *)
4179 (startpos >= size1 ? string2 - size1 : string1) + startpos);
4180 d_size = charcount_to_bytecount (d, 1);
4186 /* Note startpos > size1 not >=. If we are on the
4187 string1/string2 boundary, we want to backup into string1. */
4188 d = ((const unsigned char *)
4189 (startpos > size1 ? string2 - size1 : string1) + startpos);
4191 d_size = charcount_to_bytecount (d, 1);
4199 /* Declarations and macros for re_match_2. */
4201 /* This converts PTR, a pointer into one of the search strings `string1'
4202 and `string2' into an offset from the beginning of that string. */
4203 #define POINTER_TO_OFFSET(ptr) \
4204 (FIRST_STRING_P (ptr) \
4205 ? ((regoff_t) ((ptr) - string1)) \
4206 : ((regoff_t) ((ptr) - string2 + size1)))
4208 /* Macros for dealing with the split strings in re_match_2. */
4210 #define MATCHING_IN_FIRST_STRING (dend == end_match_1)
4212 /* Call before fetching a character with *d. This switches over to
4213 string2 if necessary. */
4214 #define REGEX_PREFETCH() \
4217 /* End of string2 => fail. */ \
4218 if (dend == end_match_2) \
4220 /* End of string1 => advance to string2. */ \
4222 dend = end_match_2; \
4226 /* Test if at very beginning or at very end of the virtual concatenation
4227 of `string1' and `string2'. If only one string, it's `string2'. */
4228 #define AT_STRINGS_BEG(d) ((d) == (size1 ? string1 : string2) || !size2)
4229 #define AT_STRINGS_END(d) ((d) == end2)
4232 If the given position straddles the string gap, return the equivalent
4233 position that is before or after the gap, respectively; otherwise,
4234 return the same position. */
4235 #define POS_BEFORE_GAP_UNSAFE(d) ((d) == string2 ? end1 : (d))
4236 #define POS_AFTER_GAP_UNSAFE(d) ((d) == end1 ? string2 : (d))
4238 /* Test if CH is a word-constituent character. (XEmacs change) */
4239 #define WORDCHAR_P_UNSAFE(ch) \
4240 (SYNTAX_UNSAFE (XCHAR_TABLE (regex_emacs_buffer->mirror_syntax_table), \
4243 /* Free everything we malloc. */
4244 #ifdef MATCH_MAY_ALLOCATE
4245 #define FREE_VAR(var) if (var) REGEX_FREE (var); var = NULL
4246 #define FREE_VARIABLES() \
4248 REGEX_FREE_STACK (fail_stack.stack); \
4249 FREE_VAR (regstart); \
4250 FREE_VAR (regend); \
4251 FREE_VAR (old_regstart); \
4252 FREE_VAR (old_regend); \
4253 FREE_VAR (best_regstart); \
4254 FREE_VAR (best_regend); \
4255 FREE_VAR (reg_info); \
4256 FREE_VAR (reg_dummy); \
4257 FREE_VAR (reg_info_dummy); \
4259 #else /* not MATCH_MAY_ALLOCATE */
4260 #define FREE_VARIABLES() ((void)0) /* Do nothing! But inhibit gcc warning. */
4261 #endif /* MATCH_MAY_ALLOCATE */
4263 /* These values must meet several constraints. They must not be valid
4264 register values; since we have a limit of 255 registers (because
4265 we use only one byte in the pattern for the register number), we can
4266 use numbers larger than 255. They must differ by 1, because of
4267 NUM_FAILURE_ITEMS above. And the value for the lowest register must
4268 be larger than the value for the highest register, so we do not try
4269 to actually save any registers when none are active. */
4270 #define NO_HIGHEST_ACTIVE_REG (1 << BYTEWIDTH)
4271 #define NO_LOWEST_ACTIVE_REG (NO_HIGHEST_ACTIVE_REG + 1)
4273 /* Matching routines. */
4275 #ifndef emacs /* Emacs never uses this. */
4276 /* re_match is like re_match_2 except it takes only a single string. */
4279 re_match (struct re_pattern_buffer *bufp, const char *string, int size,
4280 int pos, struct re_registers *regs)
4282 int result = re_match_2_internal (bufp, NULL, 0, (re_char *) string, size,
4287 #endif /* not emacs */
4290 /* re_match_2 matches the compiled pattern in BUFP against the
4291 (virtual) concatenation of STRING1 and STRING2 (of length SIZE1 and
4292 SIZE2, respectively). We start matching at POS, and stop matching
4295 If REGS is non-null and the `no_sub' field of BUFP is nonzero, we
4296 store offsets for the substring each group matched in REGS. See the
4297 documentation for exactly how many groups we fill.
4299 We return -1 if no match, -2 if an internal error (such as the
4300 failure stack overflowing). Otherwise, we return the length of the
4301 matched substring. */
4304 re_match_2 (struct re_pattern_buffer *bufp, const char *string1,
4305 int size1, const char *string2, int size2, int pos,
4306 struct re_registers *regs, int stop)
4311 SETUP_SYNTAX_CACHE_FOR_OBJECT (regex_match_object,
4313 SYNTAX_CACHE_OBJECT_BYTE_TO_CHAR (regex_match_object,
4319 result = re_match_2_internal (bufp, (re_char *) string1, size1,
4320 (re_char *) string2, size2,
4327 /* This is a separate function so that we can force an alloca cleanup
4330 re_match_2_internal (struct re_pattern_buffer *bufp, re_char *string1,
4331 int size1, re_char *string2, int size2, int pos,
4332 struct re_registers *regs, int stop)
4334 /* General temporaries. */
4337 int should_succeed; /* XEmacs change */
4339 /* Just past the end of the corresponding string. */
4340 re_char *end1, *end2;
4342 /* Pointers into string1 and string2, just past the last characters in
4343 each to consider matching. */
4344 re_char *end_match_1, *end_match_2;
4346 /* Where we are in the data, and the end of the current string. */
4349 /* Where we are in the pattern, and the end of the pattern. */
4350 unsigned char *p = bufp->buffer;
4351 REGISTER unsigned char *pend = p + bufp->used;
4353 /* Mark the opcode just after a start_memory, so we can test for an
4354 empty subpattern when we get to the stop_memory. */
4355 re_char *just_past_start_mem = 0;
4357 /* We use this to map every character in the string. */
4358 RE_TRANSLATE_TYPE translate = bufp->translate;
4360 /* Failure point stack. Each place that can handle a failure further
4361 down the line pushes a failure point on this stack. It consists of
4362 restart, regend, and reg_info for all registers corresponding to
4363 the subexpressions we're currently inside, plus the number of such
4364 registers, and, finally, two char *'s. The first char * is where
4365 to resume scanning the pattern; the second one is where to resume
4366 scanning the strings. If the latter is zero, the failure point is
4367 a ``dummy''; if a failure happens and the failure point is a dummy,
4368 it gets discarded and the next one is tried. */
4369 #ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
4370 fail_stack_type fail_stack;
4373 static unsigned failure_id;
4374 int nfailure_points_pushed = 0, nfailure_points_popped = 0;
4378 /* This holds the pointer to the failure stack, when
4379 it is allocated relocatably. */
4380 fail_stack_elt_t *failure_stack_ptr;
4383 /* We fill all the registers internally, independent of what we
4384 return, for use in backreferences. The number here includes
4385 an element for register zero. */
4386 int num_regs = bufp->re_nsub + 1;
4388 /* The currently active registers. */
4389 int lowest_active_reg = NO_LOWEST_ACTIVE_REG;
4390 int highest_active_reg = NO_HIGHEST_ACTIVE_REG;
4392 /* Information on the contents of registers. These are pointers into
4393 the input strings; they record just what was matched (on this
4394 attempt) by a subexpression part of the pattern, that is, the
4395 regnum-th regstart pointer points to where in the pattern we began
4396 matching and the regnum-th regend points to right after where we
4397 stopped matching the regnum-th subexpression. (The zeroth register
4398 keeps track of what the whole pattern matches.) */
4399 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
4400 re_char **regstart, **regend;
4403 /* If a group that's operated upon by a repetition operator fails to
4404 match anything, then the register for its start will need to be
4405 restored because it will have been set to wherever in the string we
4406 are when we last see its open-group operator. Similarly for a
4408 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
4409 re_char **old_regstart, **old_regend;
4412 /* The is_active field of reg_info helps us keep track of which (possibly
4413 nested) subexpressions we are currently in. The matched_something
4414 field of reg_info[reg_num] helps us tell whether or not we have
4415 matched any of the pattern so far this time through the reg_num-th
4416 subexpression. These two fields get reset each time through any
4417 loop their register is in. */
4418 #ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
4419 register_info_type *reg_info;
4422 /* The following record the register info as found in the above
4423 variables when we find a match better than any we've seen before.
4424 This happens as we backtrack through the failure points, which in
4425 turn happens only if we have not yet matched the entire string. */
4426 unsigned best_regs_set = false;
4427 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
4428 re_char **best_regstart, **best_regend;
4431 /* Logically, this is `best_regend[0]'. But we don't want to have to
4432 allocate space for that if we're not allocating space for anything
4433 else (see below). Also, we never need info about register 0 for
4434 any of the other register vectors, and it seems rather a kludge to
4435 treat `best_regend' differently than the rest. So we keep track of
4436 the end of the best match so far in a separate variable. We
4437 initialize this to NULL so that when we backtrack the first time
4438 and need to test it, it's not garbage. */
4439 re_char *match_end = NULL;
4441 /* This helps SET_REGS_MATCHED avoid doing redundant work. */
4442 int set_regs_matched_done = 0;
4444 /* Used when we pop values we don't care about. */
4445 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
4446 re_char **reg_dummy;
4447 register_info_type *reg_info_dummy;
4451 /* Counts the total number of registers pushed. */
4452 unsigned num_regs_pushed = 0;
4455 /* 1 if this match ends in the same string (string1 or string2)
4456 as the best previous match. */
4459 /* 1 if this match is the best seen so far. */
4460 re_bool best_match_p;
4462 DEBUG_PRINT1 ("\n\nEntering re_match_2.\n");
4466 #ifdef MATCH_MAY_ALLOCATE
4467 /* Do not bother to initialize all the register variables if there are
4468 no groups in the pattern, as it takes a fair amount of time. If
4469 there are groups, we include space for register 0 (the whole
4470 pattern), even though we never use it, since it simplifies the
4471 array indexing. We should fix this. */
4474 regstart = REGEX_TALLOC (num_regs, re_char *);
4475 regend = REGEX_TALLOC (num_regs, re_char *);
4476 old_regstart = REGEX_TALLOC (num_regs, re_char *);
4477 old_regend = REGEX_TALLOC (num_regs, re_char *);
4478 best_regstart = REGEX_TALLOC (num_regs, re_char *);
4479 best_regend = REGEX_TALLOC (num_regs, re_char *);
4480 reg_info = REGEX_TALLOC (num_regs, register_info_type);
4481 reg_dummy = REGEX_TALLOC (num_regs, re_char *);
4482 reg_info_dummy = REGEX_TALLOC (num_regs, register_info_type);
4484 if (!(regstart && regend && old_regstart && old_regend && reg_info
4485 && best_regstart && best_regend && reg_dummy && reg_info_dummy))
4493 /* We must initialize all our variables to NULL, so that
4494 `FREE_VARIABLES' doesn't try to free them. */
4495 regstart = regend = old_regstart = old_regend = best_regstart
4496 = best_regend = reg_dummy = NULL;
4497 reg_info = reg_info_dummy = (register_info_type *) NULL;
4499 #endif /* MATCH_MAY_ALLOCATE */
4501 /* The starting position is bogus. */
4502 if (pos < 0 || pos > size1 + size2)
4508 /* Initialize subexpression text positions to -1 to mark ones that no
4509 start_memory/stop_memory has been seen for. Also initialize the
4510 register information struct. */
4511 for (mcnt = 1; mcnt < num_regs; mcnt++)
4513 regstart[mcnt] = regend[mcnt]
4514 = old_regstart[mcnt] = old_regend[mcnt] = REG_UNSET_VALUE;
4516 REG_MATCH_NULL_STRING_P (reg_info[mcnt]) = MATCH_NULL_UNSET_VALUE;
4517 IS_ACTIVE (reg_info[mcnt]) = 0;
4518 MATCHED_SOMETHING (reg_info[mcnt]) = 0;
4519 EVER_MATCHED_SOMETHING (reg_info[mcnt]) = 0;
4521 /* We move `string1' into `string2' if the latter's empty -- but not if
4522 `string1' is null. */
4523 if (size2 == 0 && string1 != NULL)
4530 end1 = string1 + size1;
4531 end2 = string2 + size2;
4533 /* Compute where to stop matching, within the two strings. */
4536 end_match_1 = string1 + stop;
4537 end_match_2 = string2;
4542 end_match_2 = string2 + stop - size1;
4545 /* `p' scans through the pattern as `d' scans through the data.
4546 `dend' is the end of the input string that `d' points within. `d'
4547 is advanced into the following input string whenever necessary, but
4548 this happens before fetching; therefore, at the beginning of the
4549 loop, `d' can be pointing at the end of a string, but it cannot
4551 if (size1 > 0 && pos <= size1)
4558 d = string2 + pos - size1;
4562 DEBUG_PRINT1 ("The compiled pattern is: \n");
4563 DEBUG_PRINT_COMPILED_PATTERN (bufp, p, pend);
4564 DEBUG_PRINT1 ("The string to match is: `");
4565 DEBUG_PRINT_DOUBLE_STRING (d, string1, size1, string2, size2);
4566 DEBUG_PRINT1 ("'\n");
4568 /* This loops over pattern commands. It exits by returning from the
4569 function if the match is complete, or it drops through if the match
4570 fails at this starting point in the input data. */
4573 DEBUG_PRINT2 ("\n0x%lx: ", (long) p);
4574 #ifdef emacs /* XEmacs added, w/removal of immediate_quit */
4575 if (!no_quit_in_re_search)
4580 { /* End of pattern means we might have succeeded. */
4581 DEBUG_PRINT1 ("end of pattern ... ");
4583 /* If we haven't matched the entire string, and we want the
4584 longest match, try backtracking. */
4585 if (d != end_match_2)
4587 same_str_p = (FIRST_STRING_P (match_end)
4588 == MATCHING_IN_FIRST_STRING);
4590 /* AIX compiler got confused when this was combined
4591 with the previous declaration. */
4593 best_match_p = d > match_end;
4595 best_match_p = !MATCHING_IN_FIRST_STRING;
4597 DEBUG_PRINT1 ("backtracking.\n");
4599 if (!FAIL_STACK_EMPTY ())
4600 { /* More failure points to try. */
4602 /* If exceeds best match so far, save it. */
4603 if (!best_regs_set || best_match_p)
4605 best_regs_set = true;
4608 DEBUG_PRINT1 ("\nSAVING match as best so far.\n");
4610 for (mcnt = 1; mcnt < num_regs; mcnt++)
4612 best_regstart[mcnt] = regstart[mcnt];
4613 best_regend[mcnt] = regend[mcnt];
4619 /* If no failure points, don't restore garbage. And if
4620 last match is real best match, don't restore second
4622 else if (best_regs_set && !best_match_p)
4625 /* Restore best match. It may happen that `dend ==
4626 end_match_1' while the restored d is in string2.
4627 For example, the pattern `x.*y.*z' against the
4628 strings `x-' and `y-z-', if the two strings are
4629 not consecutive in memory. */
4630 DEBUG_PRINT1 ("Restoring best registers.\n");
4633 dend = ((d >= string1 && d <= end1)
4634 ? end_match_1 : end_match_2);
4636 for (mcnt = 1; mcnt < num_regs; mcnt++)
4638 regstart[mcnt] = best_regstart[mcnt];
4639 regend[mcnt] = best_regend[mcnt];
4642 } /* d != end_match_2 */
4645 DEBUG_PRINT1 ("Accepting match.\n");
4647 /* If caller wants register contents data back, do it. */
4648 if (regs && !bufp->no_sub)
4650 /* Have the register data arrays been allocated? */
4651 if (bufp->regs_allocated == REGS_UNALLOCATED)
4652 { /* No. So allocate them with malloc. We need one
4653 extra element beyond `num_regs' for the `-1' marker
4655 regs->num_regs = MAX (RE_NREGS, num_regs + 1);
4656 regs->start = TALLOC (regs->num_regs, regoff_t);
4657 regs->end = TALLOC (regs->num_regs, regoff_t);
4658 if (regs->start == NULL || regs->end == NULL)
4663 bufp->regs_allocated = REGS_REALLOCATE;
4665 else if (bufp->regs_allocated == REGS_REALLOCATE)
4666 { /* Yes. If we need more elements than were already
4667 allocated, reallocate them. If we need fewer, just
4669 if (regs->num_regs < num_regs + 1)
4671 regs->num_regs = num_regs + 1;
4672 RETALLOC (regs->start, regs->num_regs, regoff_t);
4673 RETALLOC (regs->end, regs->num_regs, regoff_t);
4674 if (regs->start == NULL || regs->end == NULL)
4683 /* These braces fend off a "empty body in an else-statement"
4684 warning under GCC when assert expands to nothing. */
4685 assert (bufp->regs_allocated == REGS_FIXED);
4688 /* Convert the pointer data in `regstart' and `regend' to
4689 indices. Register zero has to be set differently,
4690 since we haven't kept track of any info for it. */
4691 if (regs->num_regs > 0)
4693 regs->start[0] = pos;
4694 regs->end[0] = (MATCHING_IN_FIRST_STRING
4695 ? ((regoff_t) (d - string1))
4696 : ((regoff_t) (d - string2 + size1)));
4699 /* Go through the first `min (num_regs, regs->num_regs)'
4700 registers, since that is all we initialized. */
4701 for (mcnt = 1; mcnt < MIN (num_regs, regs->num_regs); mcnt++)
4703 if (REG_UNSET (regstart[mcnt]) || REG_UNSET (regend[mcnt]))
4704 regs->start[mcnt] = regs->end[mcnt] = -1;
4708 = (regoff_t) POINTER_TO_OFFSET (regstart[mcnt]);
4710 = (regoff_t) POINTER_TO_OFFSET (regend[mcnt]);
4714 /* If the regs structure we return has more elements than
4715 were in the pattern, set the extra elements to -1. If
4716 we (re)allocated the registers, this is the case,
4717 because we always allocate enough to have at least one
4719 for (mcnt = num_regs; mcnt < regs->num_regs; mcnt++)
4720 regs->start[mcnt] = regs->end[mcnt] = -1;
4721 } /* regs && !bufp->no_sub */
4723 DEBUG_PRINT4 ("%u failure points pushed, %u popped (%u remain).\n",
4724 nfailure_points_pushed, nfailure_points_popped,
4725 nfailure_points_pushed - nfailure_points_popped);
4726 DEBUG_PRINT2 ("%u registers pushed.\n", num_regs_pushed);
4728 mcnt = d - pos - (MATCHING_IN_FIRST_STRING
4732 DEBUG_PRINT2 ("Returning %d from re_match_2.\n", mcnt);
4738 /* Otherwise match next pattern command. */
4739 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
4741 /* Ignore these. Used to ignore the n of succeed_n's which
4742 currently have n == 0. */
4744 DEBUG_PRINT1 ("EXECUTING no_op.\n");
4748 DEBUG_PRINT1 ("EXECUTING succeed.\n");
4751 /* Match the next n pattern characters exactly. The following
4752 byte in the pattern defines n, and the n bytes after that
4753 are the characters to match. */
4756 DEBUG_PRINT2 ("EXECUTING exactn %d.\n", mcnt);
4758 /* This is written out as an if-else so we don't waste time
4759 testing `translate' inside the loop. */
4760 if (TRANSLATE_P (translate))
4765 Emchar pat_ch, buf_ch;
4769 pat_ch = charptr_emchar (p);
4770 buf_ch = charptr_emchar (d);
4771 if (RE_TRANSLATE (buf_ch) != pat_ch)
4774 pat_len = charcount_to_bytecount (p, 1);
4779 #else /* not MULE */
4781 if ((unsigned char) RE_TRANSLATE (*d++) != *p++)
4793 if (*d++ != *p++) goto fail;
4797 SET_REGS_MATCHED ();
4801 /* Match any character except possibly a newline or a null. */
4803 DEBUG_PRINT1 ("EXECUTING anychar.\n");
4807 if ((!(bufp->syntax & RE_DOT_NEWLINE) && TRANSLATE (*d) == '\n')
4808 || (bufp->syntax & RE_DOT_NOT_NULL && TRANSLATE (*d) == '\000'))
4811 SET_REGS_MATCHED ();
4812 DEBUG_PRINT2 (" Matched `%d'.\n", *d);
4813 INC_CHARPTR (d); /* XEmacs change */
4820 REGISTER unsigned char c;
4821 re_bool not_p = (re_opcode_t) *(p - 1) == charset_not;
4823 DEBUG_PRINT2 ("EXECUTING charset%s.\n", not_p ? "_not" : "");
4826 c = TRANSLATE (*d); /* The character to match. */
4828 /* Cast to `unsigned' instead of `unsigned char' in case the
4829 bit list is a full 32 bytes long. */
4830 if (c < (unsigned) (*p * BYTEWIDTH)
4831 && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
4836 if (!not_p) goto fail;
4838 SET_REGS_MATCHED ();
4839 INC_CHARPTR (d); /* XEmacs change */
4845 case charset_mule_not:
4848 re_bool not_p = (re_opcode_t) *(p - 1) == charset_mule_not;
4850 DEBUG_PRINT2 ("EXECUTING charset_mule%s.\n", not_p ? "_not" : "");
4853 c = charptr_emchar ((const Bufbyte *) d);
4854 c = TRANSLATE_EXTENDED_UNSAFE (c); /* The character to match. */
4856 if (EQ (Qt, unified_range_table_lookup (p, c, Qnil)))
4859 p += unified_range_table_bytes_used (p);
4861 if (!not_p) goto fail;
4863 SET_REGS_MATCHED ();
4870 /* The beginning of a group is represented by start_memory.
4871 The arguments are the register number in the next byte, and the
4872 number of groups inner to this one in the next. The text
4873 matched within the group is recorded (in the internal
4874 registers data structure) under the register number. */
4876 DEBUG_PRINT3 ("EXECUTING start_memory %d (%d):\n", *p, p[1]);
4878 /* Find out if this group can match the empty string. */
4879 p1 = p; /* To send to group_match_null_string_p. */
4881 if (REG_MATCH_NULL_STRING_P (reg_info[*p]) == MATCH_NULL_UNSET_VALUE)
4882 REG_MATCH_NULL_STRING_P (reg_info[*p])
4883 = group_match_null_string_p (&p1, pend, reg_info);
4885 /* Save the position in the string where we were the last time
4886 we were at this open-group operator in case the group is
4887 operated upon by a repetition operator, e.g., with `(a*)*b'
4888 against `ab'; then we want to ignore where we are now in
4889 the string in case this attempt to match fails. */
4890 old_regstart[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p])
4891 ? REG_UNSET (regstart[*p]) ? d : regstart[*p]
4893 DEBUG_PRINT2 (" old_regstart: %d\n",
4894 POINTER_TO_OFFSET (old_regstart[*p]));
4897 DEBUG_PRINT2 (" regstart: %d\n", POINTER_TO_OFFSET (regstart[*p]));
4899 IS_ACTIVE (reg_info[*p]) = 1;
4900 MATCHED_SOMETHING (reg_info[*p]) = 0;
4902 /* Clear this whenever we change the register activity status. */
4903 set_regs_matched_done = 0;
4905 /* This is the new highest active register. */
4906 highest_active_reg = *p;
4908 /* If nothing was active before, this is the new lowest active
4910 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG)
4911 lowest_active_reg = *p;
4913 /* Move past the register number and inner group count. */
4915 just_past_start_mem = p;
4920 /* The stop_memory opcode represents the end of a group. Its
4921 arguments are the same as start_memory's: the register
4922 number, and the number of inner groups. */
4924 DEBUG_PRINT3 ("EXECUTING stop_memory %d (%d):\n", *p, p[1]);
4926 /* We need to save the string position the last time we were at
4927 this close-group operator in case the group is operated
4928 upon by a repetition operator, e.g., with `((a*)*(b*)*)*'
4929 against `aba'; then we want to ignore where we are now in
4930 the string in case this attempt to match fails. */
4931 old_regend[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p])
4932 ? REG_UNSET (regend[*p]) ? d : regend[*p]
4934 DEBUG_PRINT2 (" old_regend: %d\n",
4935 POINTER_TO_OFFSET (old_regend[*p]));
4938 DEBUG_PRINT2 (" regend: %d\n", POINTER_TO_OFFSET (regend[*p]));
4940 /* This register isn't active anymore. */
4941 IS_ACTIVE (reg_info[*p]) = 0;
4943 /* Clear this whenever we change the register activity status. */
4944 set_regs_matched_done = 0;
4946 /* If this was the only register active, nothing is active
4948 if (lowest_active_reg == highest_active_reg)
4950 lowest_active_reg = NO_LOWEST_ACTIVE_REG;
4951 highest_active_reg = NO_HIGHEST_ACTIVE_REG;
4954 { /* We must scan for the new highest active register, since
4955 it isn't necessarily one less than now: consider
4956 (a(b)c(d(e)f)g). When group 3 ends, after the f), the
4957 new highest active register is 1. */
4958 unsigned char r = *p - 1;
4959 while (r > 0 && !IS_ACTIVE (reg_info[r]))
4962 /* If we end up at register zero, that means that we saved
4963 the registers as the result of an `on_failure_jump', not
4964 a `start_memory', and we jumped to past the innermost
4965 `stop_memory'. For example, in ((.)*) we save
4966 registers 1 and 2 as a result of the *, but when we pop
4967 back to the second ), we are at the stop_memory 1.
4968 Thus, nothing is active. */
4971 lowest_active_reg = NO_LOWEST_ACTIVE_REG;
4972 highest_active_reg = NO_HIGHEST_ACTIVE_REG;
4976 highest_active_reg = r;
4978 /* 98/9/21 jhod: We've also gotta set lowest_active_reg, don't we? */
4980 while (r < highest_active_reg && !IS_ACTIVE(reg_info[r]))
4982 lowest_active_reg = r;
4986 /* If just failed to match something this time around with a
4987 group that's operated on by a repetition operator, try to
4988 force exit from the ``loop'', and restore the register
4989 information for this group that we had before trying this
4991 if ((!MATCHED_SOMETHING (reg_info[*p])
4992 || just_past_start_mem == p - 1)
4995 re_bool is_a_jump_n = false;
4999 switch ((re_opcode_t) *p1++)
5003 case pop_failure_jump:
5004 case maybe_pop_jump:
5006 case dummy_failure_jump:
5007 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
5017 /* If the next operation is a jump backwards in the pattern
5018 to an on_failure_jump right before the start_memory
5019 corresponding to this stop_memory, exit from the loop
5020 by forcing a failure after pushing on the stack the
5021 on_failure_jump's jump in the pattern, and d. */
5022 if (mcnt < 0 && (re_opcode_t) *p1 == on_failure_jump
5023 && (re_opcode_t) p1[3] == start_memory && p1[4] == *p)
5025 /* If this group ever matched anything, then restore
5026 what its registers were before trying this last
5027 failed match, e.g., with `(a*)*b' against `ab' for
5028 regstart[1], and, e.g., with `((a*)*(b*)*)*'
5029 against `aba' for regend[3].
5031 Also restore the registers for inner groups for,
5032 e.g., `((a*)(b*))*' against `aba' (register 3 would
5033 otherwise get trashed). */
5035 if (EVER_MATCHED_SOMETHING (reg_info[*p]))
5039 EVER_MATCHED_SOMETHING (reg_info[*p]) = 0;
5041 /* Restore this and inner groups' (if any) registers. */
5042 for (r = *p; r < *p + *(p + 1); r++)
5044 regstart[r] = old_regstart[r];
5046 /* xx why this test? */
5047 if (old_regend[r] >= regstart[r])
5048 regend[r] = old_regend[r];
5052 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
5053 PUSH_FAILURE_POINT (p1 + mcnt, d, -2);
5059 /* Move past the register number and the inner group count. */
5064 /* \<digit> has been turned into a `duplicate' command which is
5065 followed by the numeric value of <digit> as the register number. */
5068 REGISTER re_char *d2, *dend2;
5069 int regno = *p++; /* Get which register to match against. */
5070 DEBUG_PRINT2 ("EXECUTING duplicate %d.\n", regno);
5072 /* Can't back reference a group which we've never matched. */
5073 if (REG_UNSET (regstart[regno]) || REG_UNSET (regend[regno]))
5076 /* Where in input to try to start matching. */
5077 d2 = regstart[regno];
5079 /* Where to stop matching; if both the place to start and
5080 the place to stop matching are in the same string, then
5081 set to the place to stop, otherwise, for now have to use
5082 the end of the first string. */
5084 dend2 = ((FIRST_STRING_P (regstart[regno])
5085 == FIRST_STRING_P (regend[regno]))
5086 ? regend[regno] : end_match_1);
5089 /* If necessary, advance to next segment in register
5093 if (dend2 == end_match_2) break;
5094 if (dend2 == regend[regno]) break;
5096 /* End of string1 => advance to string2. */
5098 dend2 = regend[regno];
5100 /* At end of register contents => success */
5101 if (d2 == dend2) break;
5103 /* If necessary, advance to next segment in data. */
5106 /* How many characters left in this segment to match. */
5109 /* Want how many consecutive characters we can match in
5110 one shot, so, if necessary, adjust the count. */
5111 if (mcnt > dend2 - d2)
5114 /* Compare that many; failure if mismatch, else move
5116 if (TRANSLATE_P (translate)
5117 ? bcmp_translate ((unsigned char *) d,
5118 (unsigned char *) d2, mcnt, translate)
5119 : memcmp (d, d2, mcnt))
5121 d += mcnt, d2 += mcnt;
5123 /* Do this because we've match some characters. */
5124 SET_REGS_MATCHED ();
5130 /* begline matches the empty string at the beginning of the string
5131 (unless `not_bol' is set in `bufp'), and, if
5132 `newline_anchor' is set, after newlines. */
5134 DEBUG_PRINT1 ("EXECUTING begline.\n");
5136 if (AT_STRINGS_BEG (d))
5138 if (!bufp->not_bol) break;
5140 else if (d[-1] == '\n' && bufp->newline_anchor)
5144 /* In all other cases, we fail. */
5148 /* endline is the dual of begline. */
5150 DEBUG_PRINT1 ("EXECUTING endline.\n");
5152 if (AT_STRINGS_END (d))
5154 if (!bufp->not_eol) break;
5157 /* We have to ``prefetch'' the next character. */
5158 else if ((d == end1 ? *string2 : *d) == '\n'
5159 && bufp->newline_anchor)
5166 /* Match at the very beginning of the data. */
5168 DEBUG_PRINT1 ("EXECUTING begbuf.\n");
5169 if (AT_STRINGS_BEG (d))
5174 /* Match at the very end of the data. */
5176 DEBUG_PRINT1 ("EXECUTING endbuf.\n");
5177 if (AT_STRINGS_END (d))
5182 /* on_failure_keep_string_jump is used to optimize `.*\n'. It
5183 pushes NULL as the value for the string on the stack. Then
5184 `pop_failure_point' will keep the current value for the
5185 string, instead of restoring it. To see why, consider
5186 matching `foo\nbar' against `.*\n'. The .* matches the foo;
5187 then the . fails against the \n. But the next thing we want
5188 to do is match the \n against the \n; if we restored the
5189 string value, we would be back at the foo.
5191 Because this is used only in specific cases, we don't need to
5192 check all the things that `on_failure_jump' does, to make
5193 sure the right things get saved on the stack. Hence we don't
5194 share its code. The only reason to push anything on the
5195 stack at all is that otherwise we would have to change
5196 `anychar's code to do something besides goto fail in this
5197 case; that seems worse than this. */
5198 case on_failure_keep_string_jump:
5199 DEBUG_PRINT1 ("EXECUTING on_failure_keep_string_jump");
5201 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5202 DEBUG_PRINT3 (" %d (to 0x%lx):\n", mcnt, (long) (p + mcnt));
5204 PUSH_FAILURE_POINT (p + mcnt, (unsigned char *) 0, -2);
5208 /* Uses of on_failure_jump:
5210 Each alternative starts with an on_failure_jump that points
5211 to the beginning of the next alternative. Each alternative
5212 except the last ends with a jump that in effect jumps past
5213 the rest of the alternatives. (They really jump to the
5214 ending jump of the following alternative, because tensioning
5215 these jumps is a hassle.)
5217 Repeats start with an on_failure_jump that points past both
5218 the repetition text and either the following jump or
5219 pop_failure_jump back to this on_failure_jump. */
5220 case on_failure_jump:
5222 DEBUG_PRINT1 ("EXECUTING on_failure_jump");
5224 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5225 DEBUG_PRINT3 (" %d (to 0x%lx)", mcnt, (long) (p + mcnt));
5227 /* If this on_failure_jump comes right before a group (i.e.,
5228 the original * applied to a group), save the information
5229 for that group and all inner ones, so that if we fail back
5230 to this point, the group's information will be correct.
5231 For example, in \(a*\)*\1, we need the preceding group,
5232 and in \(\(a*\)b*\)\2, we need the inner group. */
5234 /* We can't use `p' to check ahead because we push
5235 a failure point to `p + mcnt' after we do this. */
5238 /* We need to skip no_op's before we look for the
5239 start_memory in case this on_failure_jump is happening as
5240 the result of a completed succeed_n, as in \(a\)\{1,3\}b\1
5242 while (p1 < pend && (re_opcode_t) *p1 == no_op)
5245 if (p1 < pend && (re_opcode_t) *p1 == start_memory)
5247 /* We have a new highest active register now. This will
5248 get reset at the start_memory we are about to get to,
5249 but we will have saved all the registers relevant to
5250 this repetition op, as described above. */
5251 highest_active_reg = *(p1 + 1) + *(p1 + 2);
5252 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG)
5253 lowest_active_reg = *(p1 + 1);
5256 DEBUG_PRINT1 (":\n");
5257 PUSH_FAILURE_POINT (p + mcnt, d, -2);
5261 /* A smart repeat ends with `maybe_pop_jump'.
5262 We change it to either `pop_failure_jump' or `jump'. */
5263 case maybe_pop_jump:
5264 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5265 DEBUG_PRINT2 ("EXECUTING maybe_pop_jump %d.\n", mcnt);
5267 REGISTER unsigned char *p2 = p;
5269 /* Compare the beginning of the repeat with what in the
5270 pattern follows its end. If we can establish that there
5271 is nothing that they would both match, i.e., that we
5272 would have to backtrack because of (as in, e.g., `a*a')
5273 then we can change to pop_failure_jump, because we'll
5274 never have to backtrack.
5276 This is not true in the case of alternatives: in
5277 `(a|ab)*' we do need to backtrack to the `ab' alternative
5278 (e.g., if the string was `ab'). But instead of trying to
5279 detect that here, the alternative has put on a dummy
5280 failure point which is what we will end up popping. */
5282 /* Skip over open/close-group commands.
5283 If what follows this loop is a ...+ construct,
5284 look at what begins its body, since we will have to
5285 match at least one of that. */
5289 && ((re_opcode_t) *p2 == stop_memory
5290 || (re_opcode_t) *p2 == start_memory))
5292 else if (p2 + 6 < pend
5293 && (re_opcode_t) *p2 == dummy_failure_jump)
5300 /* p1[0] ... p1[2] are the `on_failure_jump' corresponding
5301 to the `maybe_finalize_jump' of this case. Examine what
5304 /* If we're at the end of the pattern, we can change. */
5307 /* Consider what happens when matching ":\(.*\)"
5308 against ":/". I don't really understand this code
5310 p[-3] = (unsigned char) pop_failure_jump;
5312 (" End of pattern: change to `pop_failure_jump'.\n");
5315 else if ((re_opcode_t) *p2 == exactn
5316 || (bufp->newline_anchor && (re_opcode_t) *p2 == endline))
5318 REGISTER unsigned char c
5319 = *p2 == (unsigned char) endline ? '\n' : p2[2];
5321 if ((re_opcode_t) p1[3] == exactn && p1[5] != c)
5323 p[-3] = (unsigned char) pop_failure_jump;
5324 DEBUG_PRINT3 (" %c != %c => pop_failure_jump.\n",
5328 else if ((re_opcode_t) p1[3] == charset
5329 || (re_opcode_t) p1[3] == charset_not)
5331 int not_p = (re_opcode_t) p1[3] == charset_not;
5333 if (c < (unsigned char) (p1[4] * BYTEWIDTH)
5334 && p1[5 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
5337 /* `not_p' is equal to 1 if c would match, which means
5338 that we can't change to pop_failure_jump. */
5341 p[-3] = (unsigned char) pop_failure_jump;
5342 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
5346 else if ((re_opcode_t) *p2 == charset)
5349 REGISTER unsigned char c
5350 = *p2 == (unsigned char) endline ? '\n' : p2[2];
5353 if ((re_opcode_t) p1[3] == exactn
5354 && ! ((int) p2[1] * BYTEWIDTH > (int) p1[5]
5355 && (p2[2 + p1[5] / BYTEWIDTH]
5356 & (1 << (p1[5] % BYTEWIDTH)))))
5358 p[-3] = (unsigned char) pop_failure_jump;
5359 DEBUG_PRINT3 (" %c != %c => pop_failure_jump.\n",
5363 else if ((re_opcode_t) p1[3] == charset_not)
5366 /* We win if the charset_not inside the loop
5367 lists every character listed in the charset after. */
5368 for (idx = 0; idx < (int) p2[1]; idx++)
5369 if (! (p2[2 + idx] == 0
5370 || (idx < (int) p1[4]
5371 && ((p2[2 + idx] & ~ p1[5 + idx]) == 0))))
5376 p[-3] = (unsigned char) pop_failure_jump;
5377 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
5380 else if ((re_opcode_t) p1[3] == charset)
5383 /* We win if the charset inside the loop
5384 has no overlap with the one after the loop. */
5386 idx < (int) p2[1] && idx < (int) p1[4];
5388 if ((p2[2 + idx] & p1[5 + idx]) != 0)
5391 if (idx == p2[1] || idx == p1[4])
5393 p[-3] = (unsigned char) pop_failure_jump;
5394 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
5399 p -= 2; /* Point at relative address again. */
5400 if ((re_opcode_t) p[-1] != pop_failure_jump)
5402 p[-1] = (unsigned char) jump;
5403 DEBUG_PRINT1 (" Match => jump.\n");
5404 goto unconditional_jump;
5406 /* Note fall through. */
5409 /* The end of a simple repeat has a pop_failure_jump back to
5410 its matching on_failure_jump, where the latter will push a
5411 failure point. The pop_failure_jump takes off failure
5412 points put on by this pop_failure_jump's matching
5413 on_failure_jump; we got through the pattern to here from the
5414 matching on_failure_jump, so didn't fail. */
5415 case pop_failure_jump:
5417 /* We need to pass separate storage for the lowest and
5418 highest registers, even though we don't care about the
5419 actual values. Otherwise, we will restore only one
5420 register from the stack, since lowest will == highest in
5421 `pop_failure_point'. */
5422 int dummy_low_reg, dummy_high_reg;
5423 unsigned char *pdummy;
5424 re_char *sdummy = NULL;
5426 DEBUG_PRINT1 ("EXECUTING pop_failure_jump.\n");
5427 POP_FAILURE_POINT (sdummy, pdummy,
5428 dummy_low_reg, dummy_high_reg,
5429 reg_dummy, reg_dummy, reg_info_dummy);
5431 /* Note fall through. */
5434 /* Unconditionally jump (without popping any failure points). */
5437 EXTRACT_NUMBER_AND_INCR (mcnt, p); /* Get the amount to jump. */
5438 DEBUG_PRINT2 ("EXECUTING jump %d ", mcnt);
5439 p += mcnt; /* Do the jump. */
5440 DEBUG_PRINT2 ("(to 0x%lx).\n", (long) p);
5444 /* We need this opcode so we can detect where alternatives end
5445 in `group_match_null_string_p' et al. */
5447 DEBUG_PRINT1 ("EXECUTING jump_past_alt.\n");
5448 goto unconditional_jump;
5451 /* Normally, the on_failure_jump pushes a failure point, which
5452 then gets popped at pop_failure_jump. We will end up at
5453 pop_failure_jump, also, and with a pattern of, say, `a+', we
5454 are skipping over the on_failure_jump, so we have to push
5455 something meaningless for pop_failure_jump to pop. */
5456 case dummy_failure_jump:
5457 DEBUG_PRINT1 ("EXECUTING dummy_failure_jump.\n");
5458 /* It doesn't matter what we push for the string here. What
5459 the code at `fail' tests is the value for the pattern. */
5460 PUSH_FAILURE_POINT ((unsigned char *) 0, (unsigned char *) 0, -2);
5461 goto unconditional_jump;
5464 /* At the end of an alternative, we need to push a dummy failure
5465 point in case we are followed by a `pop_failure_jump', because
5466 we don't want the failure point for the alternative to be
5467 popped. For example, matching `(a|ab)*' against `aab'
5468 requires that we match the `ab' alternative. */
5469 case push_dummy_failure:
5470 DEBUG_PRINT1 ("EXECUTING push_dummy_failure.\n");
5471 /* See comments just above at `dummy_failure_jump' about the
5473 PUSH_FAILURE_POINT ((unsigned char *) 0, (unsigned char *) 0, -2);
5476 /* Have to succeed matching what follows at least n times.
5477 After that, handle like `on_failure_jump'. */
5479 EXTRACT_NUMBER (mcnt, p + 2);
5480 DEBUG_PRINT2 ("EXECUTING succeed_n %d.\n", mcnt);
5483 /* Originally, this is how many times we HAVE to succeed. */
5488 STORE_NUMBER_AND_INCR (p, mcnt);
5489 DEBUG_PRINT3 (" Setting 0x%lx to %d.\n", (long) p, mcnt);
5493 DEBUG_PRINT2 (" Setting two bytes from 0x%lx to no_op.\n",
5495 p[2] = (unsigned char) no_op;
5496 p[3] = (unsigned char) no_op;
5502 EXTRACT_NUMBER (mcnt, p + 2);
5503 DEBUG_PRINT2 ("EXECUTING jump_n %d.\n", mcnt);
5505 /* Originally, this is how many times we CAN jump. */
5509 STORE_NUMBER (p + 2, mcnt);
5510 goto unconditional_jump;
5512 /* If don't have to jump any more, skip over the rest of command. */
5519 DEBUG_PRINT1 ("EXECUTING set_number_at.\n");
5521 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5523 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5524 DEBUG_PRINT3 (" Setting 0x%lx to %d.\n", (long) p1, mcnt);
5525 STORE_NUMBER (p1, mcnt);
5530 DEBUG_PRINT1 ("EXECUTING wordbound.\n");
5536 if (AT_STRINGS_BEG (d) || AT_STRINGS_END (d))
5540 re_char *d_before = POS_BEFORE_GAP_UNSAFE (d);
5541 re_char *d_after = POS_AFTER_GAP_UNSAFE (d);
5543 /* emch1 is the character before d, syn1 is the syntax of emch1,
5544 emch2 is the character at d, and syn2 is the syntax of emch2. */
5545 Emchar emch1, emch2;
5551 DEC_CHARPTR (d_before);
5552 emch1 = charptr_emchar (d_before);
5553 emch2 = charptr_emchar (d_after);
5556 pos_before = SYNTAX_CACHE_BYTE_TO_CHAR (PTR_TO_OFFSET (d)) - 1;
5557 UPDATE_SYNTAX_CACHE (pos_before);
5559 syn1 = SYNTAX_FROM_CACHE (XCHAR_TABLE (regex_emacs_buffer->mirror_syntax_table),
5562 UPDATE_SYNTAX_CACHE_FORWARD (pos_before + 1);
5564 syn2 = SYNTAX_FROM_CACHE (XCHAR_TABLE (regex_emacs_buffer->mirror_syntax_table),
5567 result = ((syn1 == Sword) != (syn2 == Sword));
5569 if (result == should_succeed)
5575 DEBUG_PRINT1 ("EXECUTING notwordbound.\n");
5577 goto matchwordbound;
5580 DEBUG_PRINT1 ("EXECUTING wordbeg.\n");
5581 if (AT_STRINGS_END (d))
5584 /* XEmacs: this originally read:
5586 if (WORDCHAR_P (d) && (AT_STRINGS_BEG (d) || !WORDCHAR_P (d - 1)))
5590 re_char *dtmp = POS_AFTER_GAP_UNSAFE (d);
5591 Emchar emch = charptr_emchar (dtmp);
5593 int charpos = SYNTAX_CACHE_BYTE_TO_CHAR (PTR_TO_OFFSET (d));
5594 UPDATE_SYNTAX_CACHE (charpos);
5596 if (SYNTAX_FROM_CACHE (XCHAR_TABLE (regex_emacs_buffer->mirror_syntax_table),
5599 if (AT_STRINGS_BEG (d))
5601 dtmp = POS_BEFORE_GAP_UNSAFE (d);
5603 emch = charptr_emchar (dtmp);
5605 UPDATE_SYNTAX_CACHE_BACKWARD (charpos - 1);
5607 if (SYNTAX_FROM_CACHE (XCHAR_TABLE (regex_emacs_buffer->mirror_syntax_table),
5614 DEBUG_PRINT1 ("EXECUTING wordend.\n");
5615 if (AT_STRINGS_BEG (d))
5618 /* XEmacs: this originally read:
5620 if (!AT_STRINGS_BEG (d) && WORDCHAR_P (d - 1)
5621 && (!WORDCHAR_P (d) || AT_STRINGS_END (d)))
5624 The or condition is incorrect (reversed).
5629 int charpos = SYNTAX_CACHE_BYTE_TO_CHAR (PTR_TO_OFFSET (d)) - 1;
5630 UPDATE_SYNTAX_CACHE (charpos);
5632 dtmp = POS_BEFORE_GAP_UNSAFE (d);
5634 emch = charptr_emchar (dtmp);
5635 if (SYNTAX_FROM_CACHE (XCHAR_TABLE (regex_emacs_buffer->mirror_syntax_table),
5638 if (AT_STRINGS_END (d))
5640 dtmp = POS_AFTER_GAP_UNSAFE (d);
5641 emch = charptr_emchar (dtmp);
5643 UPDATE_SYNTAX_CACHE_FORWARD (charpos + 1);
5645 if (SYNTAX_FROM_CACHE (XCHAR_TABLE (regex_emacs_buffer->mirror_syntax_table),
5653 DEBUG_PRINT1 ("EXECUTING before_dot.\n");
5654 if (! (NILP (regex_match_object) || BUFFERP (regex_match_object))
5655 || (BUF_PTR_BYTE_POS (regex_emacs_buffer, (unsigned char *) d)
5656 >= BUF_PT (regex_emacs_buffer)))
5661 DEBUG_PRINT1 ("EXECUTING at_dot.\n");
5662 if (! (NILP (regex_match_object) || BUFFERP (regex_match_object))
5663 || (BUF_PTR_BYTE_POS (regex_emacs_buffer, (unsigned char *) d)
5664 != BUF_PT (regex_emacs_buffer)))
5669 DEBUG_PRINT1 ("EXECUTING after_dot.\n");
5670 if (! (NILP (regex_match_object) || BUFFERP (regex_match_object))
5671 || (BUF_PTR_BYTE_POS (regex_emacs_buffer, (unsigned char *) d)
5672 <= BUF_PT (regex_emacs_buffer)))
5675 #if 0 /* not emacs19 */
5677 DEBUG_PRINT1 ("EXECUTING at_dot.\n");
5678 if (BUF_PTR_BYTE_POS (regex_emacs_buffer, (unsigned char *) d) + 1
5679 != BUF_PT (regex_emacs_buffer))
5682 #endif /* not emacs19 */
5685 DEBUG_PRINT2 ("EXECUTING syntaxspec %d.\n", mcnt);
5690 DEBUG_PRINT1 ("EXECUTING Emacs wordchar.\n");
5702 int charpos = SYNTAX_CACHE_BYTE_TO_CHAR (PTR_TO_OFFSET (d));
5703 UPDATE_SYNTAX_CACHE (charpos);
5707 emch = charptr_emchar ((const Bufbyte *) d);
5708 matches = (SYNTAX_FROM_CACHE (regex_emacs_buffer->mirror_syntax_table,
5709 emch) == (enum syntaxcode) mcnt);
5711 if (matches != should_succeed)
5713 SET_REGS_MATCHED ();
5718 DEBUG_PRINT2 ("EXECUTING notsyntaxspec %d.\n", mcnt);
5720 goto matchnotsyntax;
5723 DEBUG_PRINT1 ("EXECUTING Emacs notwordchar.\n");
5727 goto matchornotsyntax;
5730 /* 97/2/17 jhod Mule category code patch */
5739 emch = charptr_emchar ((const Bufbyte *) d);
5741 if (check_category_char(emch, regex_emacs_buffer->category_table,
5742 mcnt, should_succeed))
5744 SET_REGS_MATCHED ();
5748 case notcategoryspec:
5750 goto matchornotcategory;
5751 /* end of category patch */
5753 #else /* not emacs */
5755 DEBUG_PRINT1 ("EXECUTING non-Emacs wordchar.\n");
5757 if (!WORDCHAR_P_UNSAFE ((int) (*d)))
5759 SET_REGS_MATCHED ();
5764 DEBUG_PRINT1 ("EXECUTING non-Emacs notwordchar.\n");
5766 if (!WORDCHAR_P_UNSAFE ((int) (*d)))
5768 SET_REGS_MATCHED ();
5776 continue; /* Successfully executed one pattern command; keep going. */
5779 /* We goto here if a matching operation fails. */
5781 if (!FAIL_STACK_EMPTY ())
5782 { /* A restart point is known. Restore to that state. */
5783 DEBUG_PRINT1 ("\nFAIL:\n");
5784 POP_FAILURE_POINT (d, p,
5785 lowest_active_reg, highest_active_reg,
5786 regstart, regend, reg_info);
5788 /* If this failure point is a dummy, try the next one. */
5792 /* If we failed to the end of the pattern, don't examine *p. */
5796 re_bool is_a_jump_n = false;
5798 /* If failed to a backwards jump that's part of a repetition
5799 loop, need to pop this failure point and use the next one. */
5800 switch ((re_opcode_t) *p)
5804 case maybe_pop_jump:
5805 case pop_failure_jump:
5808 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
5811 if ((is_a_jump_n && (re_opcode_t) *p1 == succeed_n)
5813 && (re_opcode_t) *p1 == on_failure_jump))
5821 if (d >= string1 && d <= end1)
5825 break; /* Matching at this starting point really fails. */
5829 goto restore_best_regs;
5833 return -1; /* Failure to match. */
5836 /* Subroutine definitions for re_match_2. */
5839 /* We are passed P pointing to a register number after a start_memory.
5841 Return true if the pattern up to the corresponding stop_memory can
5842 match the empty string, and false otherwise.
5844 If we find the matching stop_memory, sets P to point to one past its number.
5845 Otherwise, sets P to an undefined byte less than or equal to END.
5847 We don't handle duplicates properly (yet). */
5850 group_match_null_string_p (unsigned char **p, unsigned char *end,
5851 register_info_type *reg_info)
5854 /* Point to after the args to the start_memory. */
5855 unsigned char *p1 = *p + 2;
5859 /* Skip over opcodes that can match nothing, and return true or
5860 false, as appropriate, when we get to one that can't, or to the
5861 matching stop_memory. */
5863 switch ((re_opcode_t) *p1)
5865 /* Could be either a loop or a series of alternatives. */
5866 case on_failure_jump:
5868 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
5870 /* If the next operation is not a jump backwards in the
5875 /* Go through the on_failure_jumps of the alternatives,
5876 seeing if any of the alternatives cannot match nothing.
5877 The last alternative starts with only a jump,
5878 whereas the rest start with on_failure_jump and end
5879 with a jump, e.g., here is the pattern for `a|b|c':
5881 /on_failure_jump/0/6/exactn/1/a/jump_past_alt/0/6
5882 /on_failure_jump/0/6/exactn/1/b/jump_past_alt/0/3
5885 So, we have to first go through the first (n-1)
5886 alternatives and then deal with the last one separately. */
5889 /* Deal with the first (n-1) alternatives, which start
5890 with an on_failure_jump (see above) that jumps to right
5891 past a jump_past_alt. */
5893 while ((re_opcode_t) p1[mcnt-3] == jump_past_alt)
5895 /* `mcnt' holds how many bytes long the alternative
5896 is, including the ending `jump_past_alt' and
5899 if (!alt_match_null_string_p (p1, p1 + mcnt - 3,
5903 /* Move to right after this alternative, including the
5907 /* Break if it's the beginning of an n-th alternative
5908 that doesn't begin with an on_failure_jump. */
5909 if ((re_opcode_t) *p1 != on_failure_jump)
5912 /* Still have to check that it's not an n-th
5913 alternative that starts with an on_failure_jump. */
5915 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
5916 if ((re_opcode_t) p1[mcnt-3] != jump_past_alt)
5918 /* Get to the beginning of the n-th alternative. */
5924 /* Deal with the last alternative: go back and get number
5925 of the `jump_past_alt' just before it. `mcnt' contains
5926 the length of the alternative. */
5927 EXTRACT_NUMBER (mcnt, p1 - 2);
5929 if (!alt_match_null_string_p (p1, p1 + mcnt, reg_info))
5932 p1 += mcnt; /* Get past the n-th alternative. */
5938 assert (p1[1] == **p);
5944 if (!common_op_match_null_string_p (&p1, end, reg_info))
5947 } /* while p1 < end */
5950 } /* group_match_null_string_p */
5953 /* Similar to group_match_null_string_p, but doesn't deal with alternatives:
5954 It expects P to be the first byte of a single alternative and END one
5955 byte past the last. The alternative can contain groups. */
5958 alt_match_null_string_p (unsigned char *p, unsigned char *end,
5959 register_info_type *reg_info)
5962 unsigned char *p1 = p;
5966 /* Skip over opcodes that can match nothing, and break when we get
5967 to one that can't. */
5969 switch ((re_opcode_t) *p1)
5972 case on_failure_jump:
5974 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
5979 if (!common_op_match_null_string_p (&p1, end, reg_info))
5982 } /* while p1 < end */
5985 } /* alt_match_null_string_p */
5988 /* Deals with the ops common to group_match_null_string_p and
5989 alt_match_null_string_p.
5991 Sets P to one after the op and its arguments, if any. */
5994 common_op_match_null_string_p (unsigned char **p, unsigned char *end,
5995 register_info_type *reg_info)
6000 unsigned char *p1 = *p;
6002 switch ((re_opcode_t) *p1++)
6022 assert (reg_no > 0 && reg_no <= MAX_REGNUM);
6023 ret = group_match_null_string_p (&p1, end, reg_info);
6025 /* Have to set this here in case we're checking a group which
6026 contains a group and a back reference to it. */
6028 if (REG_MATCH_NULL_STRING_P (reg_info[reg_no]) == MATCH_NULL_UNSET_VALUE)
6029 REG_MATCH_NULL_STRING_P (reg_info[reg_no]) = ret;
6035 /* If this is an optimized succeed_n for zero times, make the jump. */
6037 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
6045 /* Get to the number of times to succeed. */
6047 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
6052 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
6060 if (!REG_MATCH_NULL_STRING_P (reg_info[*p1]))
6068 /* All other opcodes mean we cannot match the empty string. */
6074 } /* common_op_match_null_string_p */
6077 /* Return zero if TRANSLATE[S1] and TRANSLATE[S2] are identical for LEN
6078 bytes; nonzero otherwise. */
6081 bcmp_translate (re_char *s1, re_char *s2,
6082 REGISTER int len, RE_TRANSLATE_TYPE translate)
6084 REGISTER const unsigned char *p1 = s1, *p2 = s2;
6086 const unsigned char *p1_end = s1 + len;
6087 const unsigned char *p2_end = s2 + len;
6089 while (p1 != p1_end && p2 != p2_end)
6091 Emchar p1_ch, p2_ch;
6093 p1_ch = charptr_emchar (p1);
6094 p2_ch = charptr_emchar (p2);
6096 if (RE_TRANSLATE (p1_ch)
6097 != RE_TRANSLATE (p2_ch))
6102 #else /* not MULE */
6105 if (RE_TRANSLATE (*p1++) != RE_TRANSLATE (*p2++)) return 1;
6112 /* Entry points for GNU code. */
6114 /* re_compile_pattern is the GNU regular expression compiler: it
6115 compiles PATTERN (of length SIZE) and puts the result in BUFP.
6116 Returns 0 if the pattern was valid, otherwise an error string.
6118 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
6119 are set in BUFP on entry.
6121 We call regex_compile to do the actual compilation. */
6124 re_compile_pattern (const char *pattern, int length,
6125 struct re_pattern_buffer *bufp)
6129 /* GNU code is written to assume at least RE_NREGS registers will be set
6130 (and at least one extra will be -1). */
6131 bufp->regs_allocated = REGS_UNALLOCATED;
6133 /* And GNU code determines whether or not to get register information
6134 by passing null for the REGS argument to re_match, etc., not by
6138 /* Match anchors at newline. */
6139 bufp->newline_anchor = 1;
6141 ret = regex_compile ((unsigned char *) pattern, length, re_syntax_options, bufp);
6145 return gettext (re_error_msgid[(int) ret]);
6148 /* Entry points compatible with 4.2 BSD regex library. We don't define
6149 them unless specifically requested. */
6151 #ifdef _REGEX_RE_COMP
6153 /* BSD has one and only one pattern buffer. */
6154 static struct re_pattern_buffer re_comp_buf;
6157 re_comp (const char *s)
6163 if (!re_comp_buf.buffer)
6164 return gettext ("No previous regular expression");
6168 if (!re_comp_buf.buffer)
6170 re_comp_buf.buffer = (unsigned char *) malloc (200);
6171 if (re_comp_buf.buffer == NULL)
6172 return gettext (re_error_msgid[(int) REG_ESPACE]);
6173 re_comp_buf.allocated = 200;
6175 re_comp_buf.fastmap = (char *) malloc (1 << BYTEWIDTH);
6176 if (re_comp_buf.fastmap == NULL)
6177 return gettext (re_error_msgid[(int) REG_ESPACE]);
6180 /* Since `re_exec' always passes NULL for the `regs' argument, we
6181 don't need to initialize the pattern buffer fields which affect it. */
6183 /* Match anchors at newlines. */
6184 re_comp_buf.newline_anchor = 1;
6186 ret = regex_compile ((unsigned char *)s, strlen (s), re_syntax_options, &re_comp_buf);
6191 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
6192 return (char *) gettext (re_error_msgid[(int) ret]);
6197 re_exec (const char *s)
6199 const int len = strlen (s);
6201 0 <= re_search (&re_comp_buf, s, len, 0, len, (struct re_registers *) 0);
6203 #endif /* _REGEX_RE_COMP */
6205 /* POSIX.2 functions. Don't define these for Emacs. */
6209 /* regcomp takes a regular expression as a string and compiles it.
6211 PREG is a regex_t *. We do not expect any fields to be initialized,
6212 since POSIX says we shouldn't. Thus, we set
6214 `buffer' to the compiled pattern;
6215 `used' to the length of the compiled pattern;
6216 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
6217 REG_EXTENDED bit in CFLAGS is set; otherwise, to
6218 RE_SYNTAX_POSIX_BASIC;
6219 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
6220 `fastmap' and `fastmap_accurate' to zero;
6221 `re_nsub' to the number of subexpressions in PATTERN.
6223 PATTERN is the address of the pattern string.
6225 CFLAGS is a series of bits which affect compilation.
6227 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
6228 use POSIX basic syntax.
6230 If REG_NEWLINE is set, then . and [^...] don't match newline.
6231 Also, regexec will try a match beginning after every newline.
6233 If REG_ICASE is set, then we considers upper- and lowercase
6234 versions of letters to be equivalent when matching.
6236 If REG_NOSUB is set, then when PREG is passed to regexec, that
6237 routine will report only success or failure, and nothing about the
6240 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
6241 the return codes and their meanings.) */
6244 regcomp (regex_t *preg, const char *pattern, int cflags)
6248 = (cflags & REG_EXTENDED) ?
6249 RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC;
6251 /* regex_compile will allocate the space for the compiled pattern. */
6253 preg->allocated = 0;
6256 /* Don't bother to use a fastmap when searching. This simplifies the
6257 REG_NEWLINE case: if we used a fastmap, we'd have to put all the
6258 characters after newlines into the fastmap. This way, we just try
6262 if (cflags & REG_ICASE)
6266 preg->translate = (char *) malloc (CHAR_SET_SIZE);
6267 if (preg->translate == NULL)
6268 return (int) REG_ESPACE;
6270 /* Map uppercase characters to corresponding lowercase ones. */
6271 for (i = 0; i < CHAR_SET_SIZE; i++)
6272 preg->translate[i] = ISUPPER (i) ? tolower (i) : i;
6275 preg->translate = NULL;
6277 /* If REG_NEWLINE is set, newlines are treated differently. */
6278 if (cflags & REG_NEWLINE)
6279 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
6280 syntax &= ~RE_DOT_NEWLINE;
6281 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
6282 /* It also changes the matching behavior. */
6283 preg->newline_anchor = 1;
6286 preg->newline_anchor = 0;
6288 preg->no_sub = !!(cflags & REG_NOSUB);
6290 /* POSIX says a null character in the pattern terminates it, so we
6291 can use strlen here in compiling the pattern. */
6292 ret = regex_compile ((unsigned char *) pattern, strlen (pattern), syntax, preg);
6294 /* POSIX doesn't distinguish between an unmatched open-group and an
6295 unmatched close-group: both are REG_EPAREN. */
6296 if (ret == REG_ERPAREN) ret = REG_EPAREN;
6302 /* regexec searches for a given pattern, specified by PREG, in the
6305 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
6306 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
6307 least NMATCH elements, and we set them to the offsets of the
6308 corresponding matched substrings.
6310 EFLAGS specifies `execution flags' which affect matching: if
6311 REG_NOTBOL is set, then ^ does not match at the beginning of the
6312 string; if REG_NOTEOL is set, then $ does not match at the end.
6314 We return 0 if we find a match and REG_NOMATCH if not. */
6317 regexec (const regex_t *preg, const char *string, Element_count nmatch,
6318 regmatch_t pmatch[], int eflags)
6321 struct re_registers regs;
6322 regex_t private_preg;
6323 int len = strlen (string);
6324 re_bool want_reg_info = !preg->no_sub && nmatch > 0;
6326 private_preg = *preg;
6328 private_preg.not_bol = !!(eflags & REG_NOTBOL);
6329 private_preg.not_eol = !!(eflags & REG_NOTEOL);
6331 /* The user has told us exactly how many registers to return
6332 information about, via `nmatch'. We have to pass that on to the
6333 matching routines. */
6334 private_preg.regs_allocated = REGS_FIXED;
6338 regs.num_regs = nmatch;
6339 regs.start = TALLOC (nmatch, regoff_t);
6340 regs.end = TALLOC (nmatch, regoff_t);
6341 if (regs.start == NULL || regs.end == NULL)
6342 return (int) REG_NOMATCH;
6345 /* Perform the searching operation. */
6346 ret = re_search (&private_preg, string, len,
6347 /* start: */ 0, /* range: */ len,
6348 want_reg_info ? ®s : (struct re_registers *) 0);
6350 /* Copy the register information to the POSIX structure. */
6357 for (r = 0; r < nmatch; r++)
6359 pmatch[r].rm_so = regs.start[r];
6360 pmatch[r].rm_eo = regs.end[r];
6364 /* If we needed the temporary register info, free the space now. */
6369 /* We want zero return to mean success, unlike `re_search'. */
6370 return ret >= 0 ? (int) REG_NOERROR : (int) REG_NOMATCH;
6374 /* Returns a message corresponding to an error code, ERRCODE, returned
6375 from either regcomp or regexec. We don't use PREG here. */
6378 regerror (int errcode, const regex_t *preg, char *errbuf,
6379 Memory_count errbuf_size)
6382 Memory_count msg_size;
6385 || (size_t) errcode >= (sizeof (re_error_msgid)
6386 / sizeof (re_error_msgid[0])))
6387 /* Only error codes returned by the rest of the code should be passed
6388 to this routine. If we are given anything else, or if other regex
6389 code generates an invalid error code, then the program has a bug.
6390 Dump core so we can fix it. */
6393 msg = gettext (re_error_msgid[errcode]);
6395 msg_size = strlen (msg) + 1; /* Includes the null. */
6397 if (errbuf_size != 0)
6399 if (msg_size > errbuf_size)
6401 strncpy (errbuf, msg, errbuf_size - 1);
6402 errbuf[errbuf_size - 1] = 0;
6405 strcpy (errbuf, msg);
6412 /* Free dynamically allocated space used by PREG. */
6415 regfree (regex_t *preg)
6417 if (preg->buffer != NULL)
6418 free (preg->buffer);
6419 preg->buffer = NULL;
6421 preg->allocated = 0;
6424 if (preg->fastmap != NULL)
6425 free (preg->fastmap);
6426 preg->fastmap = NULL;
6427 preg->fastmap_accurate = 0;
6429 if (preg->translate != NULL)
6430 free (preg->translate);
6431 preg->translate = NULL;
6434 #endif /* not emacs */
6438 make-backup-files: t
6440 trim-versions-without-asking: nil