1 /* Extended regular expression matching and search library,
2 version 0.12, extended for XEmacs.
3 (Implements POSIX draft P10003.2/D11.2, except for
4 internationalization features.)
6 Copyright (C) 1993, 1994, 1995 Free Software Foundation, Inc.
7 Copyright (C) 1995 Sun Microsystems, Inc.
8 Copyright (C) 1995 Ben Wing.
9 Copyright (C) 1999,2000,2001 MORIOKA Tomohiko
11 This program is free software; you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation; either version 2, or (at your option)
16 This program is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
21 You should have received a copy of the GNU General Public License
22 along with this program; see the file COPYING. If not, write to
23 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24 Boston, MA 02111-1307, USA. */
26 /* Synched up with: FSF 19.29. */
28 /* Changes made for XEmacs:
30 (1) the REGEX_BEGLINE_CHECK code from the XEmacs v18 regex routines
31 was added. This causes a huge speedup in font-locking.
32 (2) Rel-alloc is disabled when the MMAP version of rel-alloc is
33 being used, because it's too slow -- all those calls to mmap()
34 add humongous overhead.
35 (3) Lots and lots of changes for Mule. They are bracketed by
36 `#ifdef MULE' or with comments that have `XEmacs' in them.
43 #ifndef REGISTER /* Rigidly enforced as of 20.3 */
52 /* Converts the pointer to the char to BEG-based offset from the start. */
53 #define PTR_TO_OFFSET(d) (MATCHING_IN_FIRST_STRING \
54 ? (d) - string1 : (d) - (string2 - size1))
56 #define PTR_TO_OFFSET(d) 0
59 /* We assume non-Mule if emacs isn't defined. */
64 /* We need this for `regex.h', and perhaps for the Emacs include files. */
65 #include <sys/types.h>
67 /* This is for other GNU distributions with internationalized messages. */
68 #if defined (I18N3) && (defined (HAVE_LIBINTL_H) || defined (_LIBC))
71 # define gettext(msgid) (msgid)
74 /* XEmacs: define this to add in a speedup for patterns anchored at
75 the beginning of a line. Keep the ifdefs so that it's easier to
76 tell where/why this code has diverged from v19. */
77 #define REGEX_BEGLINE_CHECK
79 /* XEmacs: the current mmap-based ralloc handles small blocks very
80 poorly, so we disable it here. */
82 #if (defined (REL_ALLOC) && defined (HAVE_MMAP)) || defined(DOUG_LEA_MALLOC)
86 /* The `emacs' switch turns on certain matching commands
87 that make sense only in Emacs. */
94 #if (defined (DEBUG_XEMACS) && !defined (DEBUG))
100 Lisp_Object Vthe_lisp_rangetab;
103 complex_vars_of_regex (void)
105 Vthe_lisp_rangetab = Fmake_range_table ();
106 staticpro (&Vthe_lisp_rangetab);
112 complex_vars_of_regex (void)
118 #define RE_TRANSLATE(ch) TRT_TABLE_OF (translate, (Emchar) ch)
119 #define TRANSLATE_P(tr) (!NILP (tr))
121 #else /* not emacs */
123 /* If we are not linking with Emacs proper,
124 we can't use the relocating allocator
125 even if config.h says that we can. */
128 #if defined (STDC_HEADERS) || defined (_LIBC)
135 /* Types normally included via lisp.h */
136 #include <stddef.h> /* for ptrdiff_t */
139 #ifndef DECLARE_NOTHING
140 #define DECLARE_NOTHING struct nosuchstruct
146 #define charptr_emchar(str) ((Emchar) (str)[0])
148 #define INC_CHARPTR(p) ((p)++)
149 #define DEC_CHARPTR(p) ((p)--)
153 /* Define the syntax stuff for \<, \>, etc. */
155 /* This must be nonzero for the wordchar and notwordchar pattern
156 commands in re_match_2. */
163 extern char *re_syntax_table;
165 #else /* not SYNTAX_TABLE */
167 /* How many characters in the character set. */
168 #define CHAR_SET_SIZE 256
170 static char re_syntax_table[CHAR_SET_SIZE];
173 init_syntax_once (void)
179 const char *word_syntax_chars =
180 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789_";
182 memset (re_syntax_table, 0, sizeof (re_syntax_table));
184 while (*word_syntax_chars)
185 re_syntax_table[(unsigned int)(*word_syntax_chars++)] = Sword;
191 #endif /* SYNTAX_TABLE */
193 #define SYNTAX_UNSAFE(ignored, c) re_syntax_table[c]
194 #undef SYNTAX_FROM_CACHE
195 #define SYNTAX_FROM_CACHE SYNTAX_UNSAFE
197 #define RE_TRANSLATE(c) translate[(unsigned char) (c)]
198 #define TRANSLATE_P(tr) tr
202 /* Under XEmacs, this is needed because we don't define it elsewhere. */
203 #ifdef SWITCH_ENUM_BUG
204 #define SWITCH_ENUM_CAST(x) ((int)(x))
206 #define SWITCH_ENUM_CAST(x) (x)
210 /* Get the interface, including the syntax bits. */
213 /* isalpha etc. are used for the character classes. */
216 /* Jim Meyering writes:
218 "... Some ctype macros are valid only for character codes that
219 isascii says are ASCII (SGI's IRIX-4.0.5 is one such system --when
220 using /bin/cc or gcc but without giving an ansi option). So, all
221 ctype uses should be through macros like ISPRINT... If
222 STDC_HEADERS is defined, then autoconf has verified that the ctype
223 macros don't need to be guarded with references to isascii. ...
224 Defining isascii to 1 should let any compiler worth its salt
225 eliminate the && through constant folding." */
227 #if defined (STDC_HEADERS) || (!defined (isascii) && !defined (HAVE_ISASCII))
228 #define ISASCII_1(c) 1
230 #define ISASCII_1(c) isascii(c)
234 /* The IS*() macros can be passed any character, including an extended
235 one. We need to make sure there are no crashes, which would occur
236 otherwise due to out-of-bounds array references. */
237 #define ISASCII(c) (((EMACS_UINT) (c)) < 0x100 && ISASCII_1 (c))
239 #define ISASCII(c) ISASCII_1 (c)
243 #define ISBLANK(c) (ISASCII (c) && isblank (c))
245 #define ISBLANK(c) ((c) == ' ' || (c) == '\t')
248 #define ISGRAPH(c) (ISASCII (c) && isgraph (c))
250 #define ISGRAPH(c) (ISASCII (c) && isprint (c) && !isspace (c))
253 #define ISPRINT(c) (ISASCII (c) && isprint (c))
254 #define ISDIGIT(c) (ISASCII (c) && isdigit (c))
255 #define ISALNUM(c) (ISASCII (c) && isalnum (c))
256 #define ISALPHA(c) (ISASCII (c) && isalpha (c))
257 #define ISCNTRL(c) (ISASCII (c) && iscntrl (c))
258 #define ISLOWER(c) (ISASCII (c) && islower (c))
259 #define ISPUNCT(c) (ISASCII (c) && ispunct (c))
260 #define ISSPACE(c) (ISASCII (c) && isspace (c))
261 #define ISUPPER(c) (ISASCII (c) && isupper (c))
262 #define ISXDIGIT(c) (ISASCII (c) && isxdigit (c))
265 #define NULL (void *)0
268 /* We remove any previous definition of `SIGN_EXTEND_CHAR',
269 since ours (we hope) works properly with all combinations of
270 machines, compilers, `char' and `unsigned char' argument types.
271 (Per Bothner suggested the basic approach.) */
272 #undef SIGN_EXTEND_CHAR
274 #define SIGN_EXTEND_CHAR(c) ((signed char) (c))
275 #else /* not __STDC__ */
276 /* As in Harbison and Steele. */
277 #define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128)
280 /* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we
281 use `alloca' instead of `malloc'. This is because using malloc in
282 re_search* or re_match* could cause memory leaks when C-g is used in
283 Emacs; also, malloc is slower and causes storage fragmentation. On
284 the other hand, malloc is more portable, and easier to debug.
286 Because we sometimes use alloca, some routines have to be macros,
287 not functions -- `alloca'-allocated space disappears at the end of the
288 function it is called in. */
292 #define REGEX_ALLOCATE malloc
293 #define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize)
294 #define REGEX_FREE free
296 #else /* not REGEX_MALLOC */
298 /* Emacs already defines alloca, sometimes. */
301 /* Make alloca work the best possible way. */
303 #define alloca __builtin_alloca
304 #else /* not __GNUC__ */
307 #else /* not __GNUC__ or HAVE_ALLOCA_H */
308 #ifndef _AIX /* Already did AIX, up at the top. */
310 #endif /* not _AIX */
311 #endif /* HAVE_ALLOCA_H */
312 #endif /* __GNUC__ */
314 #endif /* not alloca */
316 #define REGEX_ALLOCATE alloca
318 /* Assumes a `char *destination' variable. */
319 #define REGEX_REALLOCATE(source, osize, nsize) \
320 (destination = (char *) alloca (nsize), \
321 memmove (destination, source, osize), \
324 /* No need to do anything to free, after alloca. */
325 #define REGEX_FREE(arg) ((void)0) /* Do nothing! But inhibit gcc warning. */
327 #endif /* REGEX_MALLOC */
329 /* Define how to allocate the failure stack. */
332 #define REGEX_ALLOCATE_STACK(size) \
333 r_alloc ((char **) &failure_stack_ptr, (size))
334 #define REGEX_REALLOCATE_STACK(source, osize, nsize) \
335 r_re_alloc ((char **) &failure_stack_ptr, (nsize))
336 #define REGEX_FREE_STACK(ptr) \
337 r_alloc_free ((void **) &failure_stack_ptr)
339 #else /* not REL_ALLOC */
343 #define REGEX_ALLOCATE_STACK malloc
344 #define REGEX_REALLOCATE_STACK(source, osize, nsize) realloc (source, nsize)
345 #define REGEX_FREE_STACK free
347 #else /* not REGEX_MALLOC */
349 #define REGEX_ALLOCATE_STACK alloca
351 #define REGEX_REALLOCATE_STACK(source, osize, nsize) \
352 REGEX_REALLOCATE (source, osize, nsize)
353 /* No need to explicitly free anything. */
354 #define REGEX_FREE_STACK(arg)
356 #endif /* REGEX_MALLOC */
357 #endif /* REL_ALLOC */
360 /* True if `size1' is non-NULL and PTR is pointing anywhere inside
361 `string1' or just past its end. This works if PTR is NULL, which is
363 #define FIRST_STRING_P(ptr) \
364 (size1 && string1 <= (ptr) && (ptr) <= string1 + size1)
366 /* (Re)Allocate N items of type T using malloc, or fail. */
367 #define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t)))
368 #define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, (n) * sizeof (t)))
369 #define RETALLOC_IF(addr, n, t) \
370 if (addr) RETALLOC((addr), (n), t); else (addr) = TALLOC ((n), t)
371 #define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t)))
373 #define BYTEWIDTH 8 /* In bits. */
375 #define STREQ(s1, s2) (strcmp (s1, s2) == 0)
379 #define MAX(a, b) ((a) > (b) ? (a) : (b))
380 #define MIN(a, b) ((a) < (b) ? (a) : (b))
382 /* Type of source-pattern and string chars. */
383 typedef const unsigned char re_char;
385 typedef char re_bool;
390 /* These are the command codes that appear in compiled regular
391 expressions. Some opcodes are followed by argument bytes. A
392 command code can specify any interpretation whatsoever for its
393 arguments. Zero bytes may appear in the compiled regular expression. */
399 /* Succeed right away--no more backtracking. */
402 /* Followed by one byte giving n, then by n literal bytes. */
405 /* Matches any (more or less) character. */
408 /* Matches any one char belonging to specified set. First
409 following byte is number of bitmap bytes. Then come bytes
410 for a bitmap saying which chars are in. Bits in each byte
411 are ordered low-bit-first. A character is in the set if its
412 bit is 1. A character too large to have a bit in the map is
413 automatically not in the set. */
416 /* Same parameters as charset, but match any character that is
417 not one of those specified. */
420 /* Start remembering the text that is matched, for storing in a
421 register. Followed by one byte with the register number, in
422 the range 0 to one less than the pattern buffer's re_nsub
423 field. Then followed by one byte with the number of groups
424 inner to this one. (This last has to be part of the
425 start_memory only because we need it in the on_failure_jump
429 /* Stop remembering the text that is matched and store it in a
430 memory register. Followed by one byte with the register
431 number, in the range 0 to one less than `re_nsub' in the
432 pattern buffer, and one byte with the number of inner groups,
433 just like `start_memory'. (We need the number of inner
434 groups here because we don't have any easy way of finding the
435 corresponding start_memory when we're at a stop_memory.) */
438 /* Match a duplicate of something remembered. Followed by one
439 byte containing the register number. */
442 /* Fail unless at beginning of line. */
445 /* Fail unless at end of line. */
448 /* Succeeds if at beginning of buffer (if emacs) or at beginning
449 of string to be matched (if not). */
452 /* Analogously, for end of buffer/string. */
455 /* Followed by two byte relative address to which to jump. */
458 /* Same as jump, but marks the end of an alternative. */
461 /* Followed by two-byte relative address of place to resume at
462 in case of failure. */
465 /* Like on_failure_jump, but pushes a placeholder instead of the
466 current string position when executed. */
467 on_failure_keep_string_jump,
469 /* Throw away latest failure point and then jump to following
470 two-byte relative address. */
473 /* Change to pop_failure_jump if know won't have to backtrack to
474 match; otherwise change to jump. This is used to jump
475 back to the beginning of a repeat. If what follows this jump
476 clearly won't match what the repeat does, such that we can be
477 sure that there is no use backtracking out of repetitions
478 already matched, then we change it to a pop_failure_jump.
479 Followed by two-byte address. */
482 /* Jump to following two-byte address, and push a dummy failure
483 point. This failure point will be thrown away if an attempt
484 is made to use it for a failure. A `+' construct makes this
485 before the first repeat. Also used as an intermediary kind
486 of jump when compiling an alternative. */
489 /* Push a dummy failure point and continue. Used at the end of
493 /* Followed by two-byte relative address and two-byte number n.
494 After matching N times, jump to the address upon failure. */
497 /* Followed by two-byte relative address, and two-byte number n.
498 Jump to the address N times, then fail. */
501 /* Set the following two-byte relative address to the
502 subsequent two-byte number. The address *includes* the two
506 wordchar, /* Matches any word-constituent character. */
507 notwordchar, /* Matches any char that is not a word-constituent. */
509 wordbeg, /* Succeeds if at word beginning. */
510 wordend, /* Succeeds if at word end. */
512 wordbound, /* Succeeds if at a word boundary. */
513 notwordbound /* Succeeds if not at a word boundary. */
516 ,before_dot, /* Succeeds if before point. */
517 at_dot, /* Succeeds if at point. */
518 after_dot, /* Succeeds if after point. */
520 /* Matches any character whose syntax is specified. Followed by
521 a byte which contains a syntax code, e.g., Sword. */
524 /* Matches any character whose syntax is not that specified. */
530 /* need extra stuff to be able to properly work with XEmacs/Mule
531 characters (which may take up more than one byte) */
533 ,charset_mule, /* Matches any character belonging to specified set.
534 The set is stored in "unified range-table
535 format"; see rangetab.c. Unlike the `charset'
536 opcode, this can handle arbitrary characters. */
538 charset_mule_not /* Same parameters as charset_mule, but match any
539 character that is not one of those specified. */
541 /* 97/2/17 jhod: The following two were merged back in from the Mule
542 2.3 code to enable some language specific processing */
543 ,categoryspec, /* Matches entries in the character category tables */
544 notcategoryspec /* The opposite of the above */
549 /* Common operations on the compiled pattern. */
551 /* Store NUMBER in two contiguous bytes starting at DESTINATION. */
553 #define STORE_NUMBER(destination, number) \
555 (destination)[0] = (number) & 0377; \
556 (destination)[1] = (number) >> 8; \
559 /* Same as STORE_NUMBER, except increment DESTINATION to
560 the byte after where the number is stored. Therefore, DESTINATION
561 must be an lvalue. */
563 #define STORE_NUMBER_AND_INCR(destination, number) \
565 STORE_NUMBER (destination, number); \
566 (destination) += 2; \
569 /* Put into DESTINATION a number stored in two contiguous bytes starting
572 #define EXTRACT_NUMBER(destination, source) \
574 (destination) = *(source) & 0377; \
575 (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) << 8; \
580 extract_number (int *dest, re_char *source)
582 int temp = SIGN_EXTEND_CHAR (*(source + 1));
583 *dest = *source & 0377;
587 #ifndef EXTRACT_MACROS /* To debug the macros. */
588 #undef EXTRACT_NUMBER
589 #define EXTRACT_NUMBER(dest, src) extract_number (&dest, src)
590 #endif /* not EXTRACT_MACROS */
594 /* Same as EXTRACT_NUMBER, except increment SOURCE to after the number.
595 SOURCE must be an lvalue. */
597 #define EXTRACT_NUMBER_AND_INCR(destination, source) \
599 EXTRACT_NUMBER (destination, source); \
605 extract_number_and_incr (int *destination, unsigned char **source)
607 extract_number (destination, *source);
611 #ifndef EXTRACT_MACROS
612 #undef EXTRACT_NUMBER_AND_INCR
613 #define EXTRACT_NUMBER_AND_INCR(dest, src) \
614 extract_number_and_incr (&dest, &src)
615 #endif /* not EXTRACT_MACROS */
619 /* If DEBUG is defined, Regex prints many voluminous messages about what
620 it is doing (if the variable `debug' is nonzero). If linked with the
621 main program in `iregex.c', you can enter patterns and strings
622 interactively. And if linked with the main program in `main.c' and
623 the other test files, you can run the already-written tests. */
627 /* We use standard I/O for debugging. */
631 /* XEmacs provides its own version of assert() */
632 /* It is useful to test things that ``must'' be true when debugging. */
636 static int debug = 0;
638 #define DEBUG_STATEMENT(e) e
639 #define DEBUG_PRINT1(x) if (debug) printf (x)
640 #define DEBUG_PRINT2(x1, x2) if (debug) printf (x1, x2)
641 #define DEBUG_PRINT3(x1, x2, x3) if (debug) printf (x1, x2, x3)
642 #define DEBUG_PRINT4(x1, x2, x3, x4) if (debug) printf (x1, x2, x3, x4)
643 #define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \
644 if (debug) print_partial_compiled_pattern (s, e)
645 #define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \
646 if (debug) print_double_string (w, s1, sz1, s2, sz2)
649 /* Print the fastmap in human-readable form. */
652 print_fastmap (char *fastmap)
654 unsigned was_a_range = 0;
657 while (i < (1 << BYTEWIDTH))
663 while (i < (1 << BYTEWIDTH) && fastmap[i])
679 /* Print a compiled pattern string in human-readable form, starting at
680 the START pointer into it and ending just before the pointer END. */
683 print_partial_compiled_pattern (re_char *start, re_char *end)
686 unsigned char *p = (unsigned char *) start;
695 /* Loop over pattern commands. */
698 printf ("%ld:\t", (long)(p - start));
700 switch ((re_opcode_t) *p++)
708 printf ("/exactn/%d", mcnt);
719 printf ("/start_memory/%d/%d", mcnt, *p++);
724 printf ("/stop_memory/%d/%d", mcnt, *p++);
728 printf ("/duplicate/%d", *p++);
738 REGISTER int c, last = -100;
739 REGISTER int in_range = 0;
741 printf ("/charset [%s",
742 (re_opcode_t) *(p - 1) == charset_not ? "^" : "");
744 assert (p + *p < pend);
746 for (c = 0; c < 256; c++)
747 if (((unsigned char) (c / 8) < *p)
748 && (p[1 + (c/8)] & (1 << (c % 8))))
750 /* Are we starting a range? */
751 if (last + 1 == c && ! in_range)
756 /* Have we broken a range? */
757 else if (last + 1 != c && in_range)
780 case charset_mule_not:
784 printf ("/charset_mule [%s",
785 (re_opcode_t) *(p - 1) == charset_mule_not ? "^" : "");
786 nentries = unified_range_table_nentries (p);
787 for (i = 0; i < nentries; i++)
789 EMACS_INT first, last;
790 Lisp_Object dummy_val;
792 unified_range_table_get_range (p, i, &first, &last,
797 printf ("(0x%lx)", (long)first);
804 printf ("(0x%lx)", (long)last);
808 p += unified_range_table_bytes_used (p);
821 case on_failure_jump:
822 extract_number_and_incr (&mcnt, &p);
823 printf ("/on_failure_jump to %ld", (long)(p + mcnt - start));
826 case on_failure_keep_string_jump:
827 extract_number_and_incr (&mcnt, &p);
828 printf ("/on_failure_keep_string_jump to %ld", (long)(p + mcnt - start));
831 case dummy_failure_jump:
832 extract_number_and_incr (&mcnt, &p);
833 printf ("/dummy_failure_jump to %ld", (long)(p + mcnt - start));
836 case push_dummy_failure:
837 printf ("/push_dummy_failure");
841 extract_number_and_incr (&mcnt, &p);
842 printf ("/maybe_pop_jump to %ld", (long)(p + mcnt - start));
845 case pop_failure_jump:
846 extract_number_and_incr (&mcnt, &p);
847 printf ("/pop_failure_jump to %ld", (long)(p + mcnt - start));
851 extract_number_and_incr (&mcnt, &p);
852 printf ("/jump_past_alt to %ld", (long)(p + mcnt - start));
856 extract_number_and_incr (&mcnt, &p);
857 printf ("/jump to %ld", (long)(p + mcnt - start));
861 extract_number_and_incr (&mcnt, &p);
862 extract_number_and_incr (&mcnt2, &p);
863 printf ("/succeed_n to %ld, %d times", (long)(p + mcnt - start), mcnt2);
867 extract_number_and_incr (&mcnt, &p);
868 extract_number_and_incr (&mcnt2, &p);
869 printf ("/jump_n to %ld, %d times", (long)(p + mcnt - start), mcnt2);
873 extract_number_and_incr (&mcnt, &p);
874 extract_number_and_incr (&mcnt2, &p);
875 printf ("/set_number_at location %ld to %d", (long)(p + mcnt - start), mcnt2);
879 printf ("/wordbound");
883 printf ("/notwordbound");
895 printf ("/before_dot");
903 printf ("/after_dot");
907 printf ("/syntaxspec");
909 printf ("/%d", mcnt);
913 printf ("/notsyntaxspec");
915 printf ("/%d", mcnt);
919 /* 97/2/17 jhod Mule category patch */
921 printf ("/categoryspec");
923 printf ("/%d", mcnt);
926 case notcategoryspec:
927 printf ("/notcategoryspec");
929 printf ("/%d", mcnt);
931 /* end of category patch */
936 printf ("/wordchar");
940 printf ("/notwordchar");
952 printf ("?%d", *(p-1));
958 printf ("%ld:\tend of pattern.\n", (long)(p - start));
963 print_compiled_pattern (struct re_pattern_buffer *bufp)
965 re_char *buffer = bufp->buffer;
967 print_partial_compiled_pattern (buffer, buffer + bufp->used);
968 printf ("%ld bytes used/%ld bytes allocated.\n", bufp->used,
971 if (bufp->fastmap_accurate && bufp->fastmap)
973 printf ("fastmap: ");
974 print_fastmap (bufp->fastmap);
977 printf ("re_nsub: %ld\t", (long)bufp->re_nsub);
978 printf ("regs_alloc: %d\t", bufp->regs_allocated);
979 printf ("can_be_null: %d\t", bufp->can_be_null);
980 printf ("newline_anchor: %d\n", bufp->newline_anchor);
981 printf ("no_sub: %d\t", bufp->no_sub);
982 printf ("not_bol: %d\t", bufp->not_bol);
983 printf ("not_eol: %d\t", bufp->not_eol);
984 printf ("syntax: %d\n", bufp->syntax);
985 /* Perhaps we should print the translate table? */
986 /* and maybe the category table? */
991 print_double_string (re_char *where, re_char *string1, int size1,
992 re_char *string2, int size2)
998 Element_count this_char;
1000 if (FIRST_STRING_P (where))
1002 for (this_char = where - string1; this_char < size1; this_char++)
1003 putchar (string1[this_char]);
1008 for (this_char = where - string2; this_char < size2; this_char++)
1009 putchar (string2[this_char]);
1013 #else /* not DEBUG */
1018 #define DEBUG_STATEMENT(e)
1019 #define DEBUG_PRINT1(x)
1020 #define DEBUG_PRINT2(x1, x2)
1021 #define DEBUG_PRINT3(x1, x2, x3)
1022 #define DEBUG_PRINT4(x1, x2, x3, x4)
1023 #define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)
1024 #define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)
1028 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
1029 also be assigned to arbitrarily: each pattern buffer stores its own
1030 syntax, so it can be changed between regex compilations. */
1031 /* This has no initializer because initialized variables in Emacs
1032 become read-only after dumping. */
1033 reg_syntax_t re_syntax_options;
1036 /* Specify the precise syntax of regexps for compilation. This provides
1037 for compatibility for various utilities which historically have
1038 different, incompatible syntaxes.
1040 The argument SYNTAX is a bit mask comprised of the various bits
1041 defined in regex.h. We return the old syntax. */
1044 re_set_syntax (reg_syntax_t syntax)
1046 reg_syntax_t ret = re_syntax_options;
1048 re_syntax_options = syntax;
1052 /* This table gives an error message for each of the error codes listed
1053 in regex.h. Obviously the order here has to be same as there.
1054 POSIX doesn't require that we do anything for REG_NOERROR,
1055 but why not be nice? */
1057 static const char *re_error_msgid[] =
1059 "Success", /* REG_NOERROR */
1060 "No match", /* REG_NOMATCH */
1061 "Invalid regular expression", /* REG_BADPAT */
1062 "Invalid collation character", /* REG_ECOLLATE */
1063 "Invalid character class name", /* REG_ECTYPE */
1064 "Trailing backslash", /* REG_EESCAPE */
1065 "Invalid back reference", /* REG_ESUBREG */
1066 "Unmatched [ or [^", /* REG_EBRACK */
1067 "Unmatched ( or \\(", /* REG_EPAREN */
1068 "Unmatched \\{", /* REG_EBRACE */
1069 "Invalid content of \\{\\}", /* REG_BADBR */
1070 "Invalid range end", /* REG_ERANGE */
1071 "Memory exhausted", /* REG_ESPACE */
1072 "Invalid preceding regular expression", /* REG_BADRPT */
1073 "Premature end of regular expression", /* REG_EEND */
1074 "Regular expression too big", /* REG_ESIZE */
1075 "Unmatched ) or \\)", /* REG_ERPAREN */
1077 "Invalid syntax designator", /* REG_ESYNTAX */
1080 "Ranges may not span charsets", /* REG_ERANGESPAN */
1081 "Invalid category designator", /* REG_ECATEGORY */
1085 /* Avoiding alloca during matching, to placate r_alloc. */
1087 /* Define MATCH_MAY_ALLOCATE unless we need to make sure that the
1088 searching and matching functions should not call alloca. On some
1089 systems, alloca is implemented in terms of malloc, and if we're
1090 using the relocating allocator routines, then malloc could cause a
1091 relocation, which might (if the strings being searched are in the
1092 ralloc heap) shift the data out from underneath the regexp
1095 Here's another reason to avoid allocation: Emacs
1096 processes input from X in a signal handler; processing X input may
1097 call malloc; if input arrives while a matching routine is calling
1098 malloc, then we're scrod. But Emacs can't just block input while
1099 calling matching routines; then we don't notice interrupts when
1100 they come in. So, Emacs blocks input around all regexp calls
1101 except the matching calls, which it leaves unprotected, in the
1102 faith that they will not malloc. */
1104 /* Normally, this is fine. */
1105 #define MATCH_MAY_ALLOCATE
1107 /* When using GNU C, we are not REALLY using the C alloca, no matter
1108 what config.h may say. So don't take precautions for it. */
1113 /* The match routines may not allocate if (1) they would do it with malloc
1114 and (2) it's not safe for them to use malloc.
1115 Note that if REL_ALLOC is defined, matching would not use malloc for the
1116 failure stack, but we would still use it for the register vectors;
1117 so REL_ALLOC should not affect this. */
1118 #if (defined (C_ALLOCA) || defined (REGEX_MALLOC)) && defined (emacs)
1119 #undef MATCH_MAY_ALLOCATE
1123 /* Failure stack declarations and macros; both re_compile_fastmap and
1124 re_match_2 use a failure stack. These have to be macros because of
1125 REGEX_ALLOCATE_STACK. */
1128 /* Number of failure points for which to initially allocate space
1129 when matching. If this number is exceeded, we allocate more
1130 space, so it is not a hard limit. */
1131 #ifndef INIT_FAILURE_ALLOC
1132 #define INIT_FAILURE_ALLOC 5
1135 /* Roughly the maximum number of failure points on the stack. Would be
1136 exactly that if always used MAX_FAILURE_SPACE each time we failed.
1137 This is a variable only so users of regex can assign to it; we never
1138 change it ourselves. */
1139 #if defined (MATCH_MAY_ALLOCATE)
1140 /* 4400 was enough to cause a crash on Alpha OSF/1,
1141 whose default stack limit is 2mb. */
1142 int re_max_failures = 20000;
1144 int re_max_failures = 2000;
1147 union fail_stack_elt
1153 typedef union fail_stack_elt fail_stack_elt_t;
1157 fail_stack_elt_t *stack;
1159 Element_count avail; /* Offset of next open position. */
1162 #define FAIL_STACK_EMPTY() (fail_stack.avail == 0)
1163 #define FAIL_STACK_PTR_EMPTY() (fail_stack_ptr->avail == 0)
1164 #define FAIL_STACK_FULL() (fail_stack.avail == fail_stack.size)
1167 /* Define macros to initialize and free the failure stack.
1168 Do `return -2' if the alloc fails. */
1170 #ifdef MATCH_MAY_ALLOCATE
1171 #define INIT_FAIL_STACK() \
1173 fail_stack.stack = (fail_stack_elt_t *) \
1174 REGEX_ALLOCATE_STACK (INIT_FAILURE_ALLOC * sizeof (fail_stack_elt_t)); \
1176 if (fail_stack.stack == NULL) \
1179 fail_stack.size = INIT_FAILURE_ALLOC; \
1180 fail_stack.avail = 0; \
1183 #define RESET_FAIL_STACK() REGEX_FREE_STACK (fail_stack.stack)
1185 #define INIT_FAIL_STACK() \
1187 fail_stack.avail = 0; \
1190 #define RESET_FAIL_STACK()
1194 /* Double the size of FAIL_STACK, up to approximately `re_max_failures' items.
1196 Return 1 if succeeds, and 0 if either ran out of memory
1197 allocating space for it or it was already too large.
1199 REGEX_REALLOCATE_STACK requires `destination' be declared. */
1201 #define DOUBLE_FAIL_STACK(fail_stack) \
1202 ((int) (fail_stack).size > re_max_failures * MAX_FAILURE_ITEMS \
1204 : ((fail_stack).stack = (fail_stack_elt_t *) \
1205 REGEX_REALLOCATE_STACK ((fail_stack).stack, \
1206 (fail_stack).size * sizeof (fail_stack_elt_t), \
1207 ((fail_stack).size << 1) * sizeof (fail_stack_elt_t)), \
1209 (fail_stack).stack == NULL \
1211 : ((fail_stack).size <<= 1, \
1215 /* Push pointer POINTER on FAIL_STACK.
1216 Return 1 if was able to do so and 0 if ran out of memory allocating
1218 #define PUSH_PATTERN_OP(POINTER, FAIL_STACK) \
1219 ((FAIL_STACK_FULL () \
1220 && !DOUBLE_FAIL_STACK (FAIL_STACK)) \
1222 : ((FAIL_STACK).stack[(FAIL_STACK).avail++].pointer = POINTER, \
1225 /* Push a pointer value onto the failure stack.
1226 Assumes the variable `fail_stack'. Probably should only
1227 be called from within `PUSH_FAILURE_POINT'. */
1228 #define PUSH_FAILURE_POINTER(item) \
1229 fail_stack.stack[fail_stack.avail++].pointer = (unsigned char *) (item)
1231 /* This pushes an integer-valued item onto the failure stack.
1232 Assumes the variable `fail_stack'. Probably should only
1233 be called from within `PUSH_FAILURE_POINT'. */
1234 #define PUSH_FAILURE_INT(item) \
1235 fail_stack.stack[fail_stack.avail++].integer = (item)
1237 /* Push a fail_stack_elt_t value onto the failure stack.
1238 Assumes the variable `fail_stack'. Probably should only
1239 be called from within `PUSH_FAILURE_POINT'. */
1240 #define PUSH_FAILURE_ELT(item) \
1241 fail_stack.stack[fail_stack.avail++] = (item)
1243 /* These three POP... operations complement the three PUSH... operations.
1244 All assume that `fail_stack' is nonempty. */
1245 #define POP_FAILURE_POINTER() fail_stack.stack[--fail_stack.avail].pointer
1246 #define POP_FAILURE_INT() fail_stack.stack[--fail_stack.avail].integer
1247 #define POP_FAILURE_ELT() fail_stack.stack[--fail_stack.avail]
1249 /* Used to omit pushing failure point id's when we're not debugging. */
1251 #define DEBUG_PUSH PUSH_FAILURE_INT
1252 #define DEBUG_POP(item_addr) *(item_addr) = POP_FAILURE_INT ()
1254 #define DEBUG_PUSH(item)
1255 #define DEBUG_POP(item_addr)
1259 /* Push the information about the state we will need
1260 if we ever fail back to it.
1262 Requires variables fail_stack, regstart, regend, reg_info, and
1263 num_regs be declared. DOUBLE_FAIL_STACK requires `destination' be
1266 Does `return FAILURE_CODE' if runs out of memory. */
1268 #if !defined (REGEX_MALLOC) && !defined (REL_ALLOC)
1269 #define DECLARE_DESTINATION char *destination
1271 #define DECLARE_DESTINATION DECLARE_NOTHING
1274 #define PUSH_FAILURE_POINT(pattern_place, string_place, failure_code) \
1276 DECLARE_DESTINATION; \
1277 /* Must be int, so when we don't save any registers, the arithmetic \
1278 of 0 + -1 isn't done as unsigned. */ \
1281 DEBUG_STATEMENT (failure_id++); \
1282 DEBUG_STATEMENT (nfailure_points_pushed++); \
1283 DEBUG_PRINT2 ("\nPUSH_FAILURE_POINT #%u:\n", failure_id); \
1284 DEBUG_PRINT2 (" Before push, next avail: %lu\n", \
1285 (unsigned long) (fail_stack).avail); \
1286 DEBUG_PRINT2 (" size: %lu\n", \
1287 (unsigned long) (fail_stack).size); \
1289 DEBUG_PRINT2 (" slots needed: %d\n", NUM_FAILURE_ITEMS); \
1290 DEBUG_PRINT2 (" available: %ld\n", \
1291 (long) REMAINING_AVAIL_SLOTS); \
1293 /* Ensure we have enough space allocated for what we will push. */ \
1294 while (REMAINING_AVAIL_SLOTS < NUM_FAILURE_ITEMS) \
1296 if (!DOUBLE_FAIL_STACK (fail_stack)) \
1297 return failure_code; \
1299 DEBUG_PRINT2 ("\n Doubled stack; size now: %lu\n", \
1300 (unsigned long) (fail_stack).size); \
1301 DEBUG_PRINT2 (" slots available: %ld\n", \
1302 (long) REMAINING_AVAIL_SLOTS); \
1305 /* Push the info, starting with the registers. */ \
1306 DEBUG_PRINT1 ("\n"); \
1308 for (this_reg = lowest_active_reg; this_reg <= highest_active_reg; \
1311 DEBUG_PRINT2 (" Pushing reg: %d\n", this_reg); \
1312 DEBUG_STATEMENT (num_regs_pushed++); \
1314 DEBUG_PRINT2 (" start: 0x%lx\n", (long) regstart[this_reg]); \
1315 PUSH_FAILURE_POINTER (regstart[this_reg]); \
1317 DEBUG_PRINT2 (" end: 0x%lx\n", (long) regend[this_reg]); \
1318 PUSH_FAILURE_POINTER (regend[this_reg]); \
1320 DEBUG_PRINT2 (" info: 0x%lx\n ", \
1321 * (long *) (®_info[this_reg])); \
1322 DEBUG_PRINT2 (" match_null=%d", \
1323 REG_MATCH_NULL_STRING_P (reg_info[this_reg])); \
1324 DEBUG_PRINT2 (" active=%d", IS_ACTIVE (reg_info[this_reg])); \
1325 DEBUG_PRINT2 (" matched_something=%d", \
1326 MATCHED_SOMETHING (reg_info[this_reg])); \
1327 DEBUG_PRINT2 (" ever_matched_something=%d", \
1328 EVER_MATCHED_SOMETHING (reg_info[this_reg])); \
1329 DEBUG_PRINT1 ("\n"); \
1330 PUSH_FAILURE_ELT (reg_info[this_reg].word); \
1333 DEBUG_PRINT2 (" Pushing low active reg: %d\n", lowest_active_reg); \
1334 PUSH_FAILURE_INT (lowest_active_reg); \
1336 DEBUG_PRINT2 (" Pushing high active reg: %d\n", highest_active_reg); \
1337 PUSH_FAILURE_INT (highest_active_reg); \
1339 DEBUG_PRINT2 (" Pushing pattern 0x%lx: \n", (long) pattern_place); \
1340 DEBUG_PRINT_COMPILED_PATTERN (bufp, pattern_place, pend); \
1341 PUSH_FAILURE_POINTER (pattern_place); \
1343 DEBUG_PRINT2 (" Pushing string 0x%lx: `", (long) string_place); \
1344 DEBUG_PRINT_DOUBLE_STRING (string_place, string1, size1, string2, \
1346 DEBUG_PRINT1 ("'\n"); \
1347 PUSH_FAILURE_POINTER (string_place); \
1349 DEBUG_PRINT2 (" Pushing failure id: %u\n", failure_id); \
1350 DEBUG_PUSH (failure_id); \
1353 /* This is the number of items that are pushed and popped on the stack
1354 for each register. */
1355 #define NUM_REG_ITEMS 3
1357 /* Individual items aside from the registers. */
1359 #define NUM_NONREG_ITEMS 5 /* Includes failure point id. */
1361 #define NUM_NONREG_ITEMS 4
1364 /* We push at most this many items on the stack. */
1365 /* We used to use (num_regs - 1), which is the number of registers
1366 this regexp will save; but that was changed to 5
1367 to avoid stack overflow for a regexp with lots of parens. */
1368 #define MAX_FAILURE_ITEMS (5 * NUM_REG_ITEMS + NUM_NONREG_ITEMS)
1370 /* We actually push this many items. */
1371 #define NUM_FAILURE_ITEMS \
1372 ((highest_active_reg - lowest_active_reg + 1) * NUM_REG_ITEMS \
1375 /* How many items can still be added to the stack without overflowing it. */
1376 #define REMAINING_AVAIL_SLOTS ((int) ((fail_stack).size - (fail_stack).avail))
1379 /* Pops what PUSH_FAIL_STACK pushes.
1381 We restore into the parameters, all of which should be lvalues:
1382 STR -- the saved data position.
1383 PAT -- the saved pattern position.
1384 LOW_REG, HIGH_REG -- the highest and lowest active registers.
1385 REGSTART, REGEND -- arrays of string positions.
1386 REG_INFO -- array of information about each subexpression.
1388 Also assumes the variables `fail_stack' and (if debugging), `bufp',
1389 `pend', `string1', `size1', `string2', and `size2'. */
1391 #define POP_FAILURE_POINT(str, pat, low_reg, high_reg, \
1392 regstart, regend, reg_info) \
1394 DEBUG_STATEMENT (fail_stack_elt_t ffailure_id;) \
1396 const unsigned char *string_temp; \
1398 assert (!FAIL_STACK_EMPTY ()); \
1400 /* Remove failure points and point to how many regs pushed. */ \
1401 DEBUG_PRINT1 ("POP_FAILURE_POINT:\n"); \
1402 DEBUG_PRINT2 (" Before pop, next avail: %lu\n", \
1403 (unsigned long) fail_stack.avail); \
1404 DEBUG_PRINT2 (" size: %lu\n", \
1405 (unsigned long) fail_stack.size); \
1407 assert (fail_stack.avail >= NUM_NONREG_ITEMS); \
1409 DEBUG_POP (&ffailure_id.integer); \
1410 DEBUG_PRINT2 (" Popping failure id: %u\n", \
1411 * (unsigned int *) &ffailure_id); \
1413 /* If the saved string location is NULL, it came from an \
1414 on_failure_keep_string_jump opcode, and we want to throw away the \
1415 saved NULL, thus retaining our current position in the string. */ \
1416 string_temp = POP_FAILURE_POINTER (); \
1417 if (string_temp != NULL) \
1418 str = string_temp; \
1420 DEBUG_PRINT2 (" Popping string 0x%lx: `", (long) str); \
1421 DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2); \
1422 DEBUG_PRINT1 ("'\n"); \
1424 pat = (unsigned char *) POP_FAILURE_POINTER (); \
1425 DEBUG_PRINT2 (" Popping pattern 0x%lx: ", (long) pat); \
1426 DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend); \
1428 /* Restore register info. */ \
1429 high_reg = (unsigned) POP_FAILURE_INT (); \
1430 DEBUG_PRINT2 (" Popping high active reg: %d\n", high_reg); \
1432 low_reg = (unsigned) POP_FAILURE_INT (); \
1433 DEBUG_PRINT2 (" Popping low active reg: %d\n", low_reg); \
1435 for (this_reg = high_reg; this_reg >= low_reg; this_reg--) \
1437 DEBUG_PRINT2 (" Popping reg: %d\n", this_reg); \
1439 reg_info[this_reg].word = POP_FAILURE_ELT (); \
1440 DEBUG_PRINT2 (" info: 0x%lx\n", \
1441 * (long *) ®_info[this_reg]); \
1443 regend[this_reg] = POP_FAILURE_POINTER (); \
1444 DEBUG_PRINT2 (" end: 0x%lx\n", (long) regend[this_reg]); \
1446 regstart[this_reg] = POP_FAILURE_POINTER (); \
1447 DEBUG_PRINT2 (" start: 0x%lx\n", (long) regstart[this_reg]); \
1450 set_regs_matched_done = 0; \
1451 DEBUG_STATEMENT (nfailure_points_popped++); \
1452 } while (0) /* POP_FAILURE_POINT */
1456 /* Structure for per-register (a.k.a. per-group) information.
1457 Other register information, such as the
1458 starting and ending positions (which are addresses), and the list of
1459 inner groups (which is a bits list) are maintained in separate
1462 We are making a (strictly speaking) nonportable assumption here: that
1463 the compiler will pack our bit fields into something that fits into
1464 the type of `word', i.e., is something that fits into one item on the
1469 fail_stack_elt_t word;
1472 /* This field is one if this group can match the empty string,
1473 zero if not. If not yet determined, `MATCH_NULL_UNSET_VALUE'. */
1474 #define MATCH_NULL_UNSET_VALUE 3
1475 unsigned match_null_string_p : 2;
1476 unsigned is_active : 1;
1477 unsigned matched_something : 1;
1478 unsigned ever_matched_something : 1;
1480 } register_info_type;
1482 #define REG_MATCH_NULL_STRING_P(R) ((R).bits.match_null_string_p)
1483 #define IS_ACTIVE(R) ((R).bits.is_active)
1484 #define MATCHED_SOMETHING(R) ((R).bits.matched_something)
1485 #define EVER_MATCHED_SOMETHING(R) ((R).bits.ever_matched_something)
1488 /* Call this when have matched a real character; it sets `matched' flags
1489 for the subexpressions which we are currently inside. Also records
1490 that those subexprs have matched. */
1491 #define SET_REGS_MATCHED() \
1494 if (!set_regs_matched_done) \
1497 set_regs_matched_done = 1; \
1498 for (r = lowest_active_reg; r <= highest_active_reg; r++) \
1500 MATCHED_SOMETHING (reg_info[r]) \
1501 = EVER_MATCHED_SOMETHING (reg_info[r]) \
1508 /* Registers are set to a sentinel when they haven't yet matched. */
1509 static unsigned char reg_unset_dummy;
1510 #define REG_UNSET_VALUE (®_unset_dummy)
1511 #define REG_UNSET(e) ((e) == REG_UNSET_VALUE)
1513 /* Subroutine declarations and macros for regex_compile. */
1515 /* Fetch the next character in the uncompiled pattern---translating it
1516 if necessary. Also cast from a signed character in the constant
1517 string passed to us by the user to an unsigned char that we can use
1518 as an array index (in, e.g., `translate'). */
1519 #define PATFETCH(c) \
1522 c = TRANSLATE (c); \
1525 /* Fetch the next character in the uncompiled pattern, with no
1527 #define PATFETCH_RAW(c) \
1528 do {if (p == pend) return REG_EEND; \
1529 assert (p < pend); \
1530 c = charptr_emchar (p); \
1534 /* Go backwards one character in the pattern. */
1535 #define PATUNFETCH DEC_CHARPTR (p)
1539 #define PATFETCH_EXTENDED(emch) \
1540 do {if (p == pend) return REG_EEND; \
1541 assert (p < pend); \
1542 emch = charptr_emchar ((const Bufbyte *) p); \
1544 if (TRANSLATE_P (translate) && emch < 0x80) \
1545 emch = (Emchar) (unsigned char) RE_TRANSLATE (emch); \
1548 #define PATFETCH_RAW_EXTENDED(emch) \
1549 do {if (p == pend) return REG_EEND; \
1550 assert (p < pend); \
1551 emch = charptr_emchar ((const Bufbyte *) p); \
1555 #define PATUNFETCH_EXTENDED DEC_CHARPTR (p)
1557 #define PATFETCH_EITHER(emch) \
1559 if (has_extended_chars) \
1560 PATFETCH_EXTENDED (emch); \
1565 #define PATFETCH_RAW_EITHER(emch) \
1567 if (has_extended_chars) \
1568 PATFETCH_RAW_EXTENDED (emch); \
1570 PATFETCH_RAW (emch); \
1573 #define PATUNFETCH_EITHER \
1575 if (has_extended_chars) \
1576 PATUNFETCH_EXTENDED (emch); \
1578 PATUNFETCH (emch); \
1581 #else /* not MULE */
1583 #define PATFETCH_EITHER(emch) PATFETCH (emch)
1584 #define PATFETCH_RAW_EITHER(emch) PATFETCH_RAW (emch)
1585 #define PATUNFETCH_EITHER PATUNFETCH
1589 /* If `translate' is non-null, return translate[D], else just D. We
1590 cast the subscript to translate because some data is declared as
1591 `char *', to avoid warnings when a string constant is passed. But
1592 when we use a character as a subscript we must make it unsigned. */
1593 #define TRANSLATE(d) (TRANSLATE_P (translate) ? RE_TRANSLATE (d) : (d))
1597 #define TRANSLATE_EXTENDED_UNSAFE(emch) \
1598 (TRANSLATE_P (translate) && emch < 0x80 ? RE_TRANSLATE (emch) : (emch))
1602 /* Macros for outputting the compiled pattern into `buffer'. */
1604 /* If the buffer isn't allocated when it comes in, use this. */
1605 #define INIT_BUF_SIZE 32
1607 /* Make sure we have at least N more bytes of space in buffer. */
1608 #define GET_BUFFER_SPACE(n) \
1609 while (buf_end - bufp->buffer + (n) > (ptrdiff_t) bufp->allocated) \
1612 /* Make sure we have one more byte of buffer space and then add C to it. */
1613 #define BUF_PUSH(c) \
1615 GET_BUFFER_SPACE (1); \
1616 *buf_end++ = (unsigned char) (c); \
1620 /* Ensure we have two more bytes of buffer space and then append C1 and C2. */
1621 #define BUF_PUSH_2(c1, c2) \
1623 GET_BUFFER_SPACE (2); \
1624 *buf_end++ = (unsigned char) (c1); \
1625 *buf_end++ = (unsigned char) (c2); \
1629 /* As with BUF_PUSH_2, except for three bytes. */
1630 #define BUF_PUSH_3(c1, c2, c3) \
1632 GET_BUFFER_SPACE (3); \
1633 *buf_end++ = (unsigned char) (c1); \
1634 *buf_end++ = (unsigned char) (c2); \
1635 *buf_end++ = (unsigned char) (c3); \
1639 /* Store a jump with opcode OP at LOC to location TO. We store a
1640 relative address offset by the three bytes the jump itself occupies. */
1641 #define STORE_JUMP(op, loc, to) \
1642 store_op1 (op, loc, (to) - (loc) - 3)
1644 /* Likewise, for a two-argument jump. */
1645 #define STORE_JUMP2(op, loc, to, arg) \
1646 store_op2 (op, loc, (to) - (loc) - 3, arg)
1648 /* Like `STORE_JUMP', but for inserting. Assume `buf_end' is the
1650 #define INSERT_JUMP(op, loc, to) \
1651 insert_op1 (op, loc, (to) - (loc) - 3, buf_end)
1653 /* Like `STORE_JUMP2', but for inserting. Assume `buf_end' is the
1655 #define INSERT_JUMP2(op, loc, to, arg) \
1656 insert_op2 (op, loc, (to) - (loc) - 3, arg, buf_end)
1659 /* This is not an arbitrary limit: the arguments which represent offsets
1660 into the pattern are two bytes long. So if 2^16 bytes turns out to
1661 be too small, many things would have to change. */
1662 #define MAX_BUF_SIZE (1L << 16)
1665 /* Extend the buffer by twice its current size via realloc and
1666 reset the pointers that pointed into the old block to point to the
1667 correct places in the new one. If extending the buffer results in it
1668 being larger than MAX_BUF_SIZE, then flag memory exhausted. */
1669 #define EXTEND_BUFFER() \
1671 re_char *old_buffer = bufp->buffer; \
1672 if (bufp->allocated == MAX_BUF_SIZE) \
1674 bufp->allocated <<= 1; \
1675 if (bufp->allocated > MAX_BUF_SIZE) \
1676 bufp->allocated = MAX_BUF_SIZE; \
1677 bufp->buffer = (unsigned char *) realloc (bufp->buffer, bufp->allocated);\
1678 if (bufp->buffer == NULL) \
1679 return REG_ESPACE; \
1680 /* If the buffer moved, move all the pointers into it. */ \
1681 if (old_buffer != bufp->buffer) \
1683 buf_end = (buf_end - old_buffer) + bufp->buffer; \
1684 begalt = (begalt - old_buffer) + bufp->buffer; \
1685 if (fixup_alt_jump) \
1686 fixup_alt_jump = (fixup_alt_jump - old_buffer) + bufp->buffer;\
1688 laststart = (laststart - old_buffer) + bufp->buffer; \
1689 if (pending_exact) \
1690 pending_exact = (pending_exact - old_buffer) + bufp->buffer; \
1695 /* Since we have one byte reserved for the register number argument to
1696 {start,stop}_memory, the maximum number of groups we can report
1697 things about is what fits in that byte. */
1698 #define MAX_REGNUM 255
1700 /* But patterns can have more than `MAX_REGNUM' registers. We just
1701 ignore the excess. */
1702 typedef unsigned regnum_t;
1705 /* Macros for the compile stack. */
1707 /* Since offsets can go either forwards or backwards, this type needs to
1708 be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1. */
1709 typedef int pattern_offset_t;
1713 pattern_offset_t begalt_offset;
1714 pattern_offset_t fixup_alt_jump;
1715 pattern_offset_t inner_group_offset;
1716 pattern_offset_t laststart_offset;
1718 } compile_stack_elt_t;
1723 compile_stack_elt_t *stack;
1725 unsigned avail; /* Offset of next open position. */
1726 } compile_stack_type;
1729 #define INIT_COMPILE_STACK_SIZE 32
1731 #define COMPILE_STACK_EMPTY (compile_stack.avail == 0)
1732 #define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size)
1734 /* The next available element. */
1735 #define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])
1738 /* Set the bit for character C in a bit vector. */
1739 #define SET_LIST_BIT(c) \
1740 (buf_end[((unsigned char) (c)) / BYTEWIDTH] \
1741 |= 1 << (((unsigned char) c) % BYTEWIDTH))
1745 /* Set the "bit" for character C in a range table. */
1746 #define SET_RANGETAB_BIT(c) put_range_table (rtab, c, c, Qt)
1748 /* Set the "bit" for character c in the appropriate table. */
1749 #define SET_EITHER_BIT(c) \
1751 if (has_extended_chars) \
1752 SET_RANGETAB_BIT (c); \
1757 #else /* not MULE */
1759 #define SET_EITHER_BIT(c) SET_LIST_BIT (c)
1764 /* Get the next unsigned number in the uncompiled pattern. */
1765 #define GET_UNSIGNED_NUMBER(num) \
1769 while (ISDIGIT (c)) \
1773 num = num * 10 + c - '0'; \
1781 #define CHAR_CLASS_MAX_LENGTH 6 /* Namely, `xdigit'. */
1783 #define IS_CHAR_CLASS(string) \
1784 (STREQ (string, "alpha") || STREQ (string, "upper") \
1785 || STREQ (string, "lower") || STREQ (string, "digit") \
1786 || STREQ (string, "alnum") || STREQ (string, "xdigit") \
1787 || STREQ (string, "space") || STREQ (string, "print") \
1788 || STREQ (string, "punct") || STREQ (string, "graph") \
1789 || STREQ (string, "cntrl") || STREQ (string, "blank"))
1791 static void store_op1 (re_opcode_t op, unsigned char *loc, int arg);
1792 static void store_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2);
1793 static void insert_op1 (re_opcode_t op, unsigned char *loc, int arg,
1794 unsigned char *end);
1795 static void insert_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2,
1796 unsigned char *end);
1797 static re_bool at_begline_loc_p (re_char *pattern, re_char *p,
1798 reg_syntax_t syntax);
1799 static re_bool at_endline_loc_p (re_char *p, re_char *pend, int syntax);
1800 static re_bool group_in_compile_stack (compile_stack_type compile_stack,
1802 static reg_errcode_t compile_range (re_char **p_ptr, re_char *pend,
1803 RE_TRANSLATE_TYPE translate,
1804 reg_syntax_t syntax,
1807 static reg_errcode_t compile_extended_range (re_char **p_ptr,
1809 RE_TRANSLATE_TYPE translate,
1810 reg_syntax_t syntax,
1813 static re_bool group_match_null_string_p (unsigned char **p,
1815 register_info_type *reg_info);
1816 static re_bool alt_match_null_string_p (unsigned char *p, unsigned char *end,
1817 register_info_type *reg_info);
1818 static re_bool common_op_match_null_string_p (unsigned char **p,
1820 register_info_type *reg_info);
1821 static int bcmp_translate (const unsigned char *s1, const unsigned char *s2,
1822 REGISTER int len, RE_TRANSLATE_TYPE translate);
1823 static int re_match_2_internal (struct re_pattern_buffer *bufp,
1824 re_char *string1, int size1,
1825 re_char *string2, int size2, int pos,
1826 struct re_registers *regs, int stop);
1828 #ifndef MATCH_MAY_ALLOCATE
1830 /* If we cannot allocate large objects within re_match_2_internal,
1831 we make the fail stack and register vectors global.
1832 The fail stack, we grow to the maximum size when a regexp
1834 The register vectors, we adjust in size each time we
1835 compile a regexp, according to the number of registers it needs. */
1837 static fail_stack_type fail_stack;
1839 /* Size with which the following vectors are currently allocated.
1840 That is so we can make them bigger as needed,
1841 but never make them smaller. */
1842 static int regs_allocated_size;
1844 static re_char ** regstart, ** regend;
1845 static re_char ** old_regstart, ** old_regend;
1846 static re_char **best_regstart, **best_regend;
1847 static register_info_type *reg_info;
1848 static re_char **reg_dummy;
1849 static register_info_type *reg_info_dummy;
1851 /* Make the register vectors big enough for NUM_REGS registers,
1852 but don't make them smaller. */
1855 regex_grow_registers (int num_regs)
1857 if (num_regs > regs_allocated_size)
1859 RETALLOC_IF (regstart, num_regs, re_char *);
1860 RETALLOC_IF (regend, num_regs, re_char *);
1861 RETALLOC_IF (old_regstart, num_regs, re_char *);
1862 RETALLOC_IF (old_regend, num_regs, re_char *);
1863 RETALLOC_IF (best_regstart, num_regs, re_char *);
1864 RETALLOC_IF (best_regend, num_regs, re_char *);
1865 RETALLOC_IF (reg_info, num_regs, register_info_type);
1866 RETALLOC_IF (reg_dummy, num_regs, re_char *);
1867 RETALLOC_IF (reg_info_dummy, num_regs, register_info_type);
1869 regs_allocated_size = num_regs;
1873 #endif /* not MATCH_MAY_ALLOCATE */
1875 /* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX.
1876 Returns one of error codes defined in `regex.h', or zero for success.
1878 Assumes the `allocated' (and perhaps `buffer') and `translate'
1879 fields are set in BUFP on entry.
1881 If it succeeds, results are put in BUFP (if it returns an error, the
1882 contents of BUFP are undefined):
1883 `buffer' is the compiled pattern;
1884 `syntax' is set to SYNTAX;
1885 `used' is set to the length of the compiled pattern;
1886 `fastmap_accurate' is zero;
1887 `re_nsub' is the number of subexpressions in PATTERN;
1888 `not_bol' and `not_eol' are zero;
1890 The `fastmap' and `newline_anchor' fields are neither
1891 examined nor set. */
1893 /* Return, freeing storage we allocated. */
1894 #define FREE_STACK_RETURN(value) \
1895 return (free (compile_stack.stack), value)
1897 static reg_errcode_t
1898 regex_compile (re_char *pattern, int size, reg_syntax_t syntax,
1899 struct re_pattern_buffer *bufp)
1901 /* We fetch characters from PATTERN here. We declare these as int
1902 (or possibly long) so that chars above 127 can be used as
1903 array indices. The macros that fetch a character from the pattern
1904 make sure to coerce to unsigned char before assigning, so we won't
1905 get bitten by negative numbers here. */
1906 /* XEmacs change: used to be unsigned char. */
1907 REGISTER EMACS_INT c, c1;
1909 /* A random temporary spot in PATTERN. */
1912 /* Points to the end of the buffer, where we should append. */
1913 REGISTER unsigned char *buf_end;
1915 /* Keeps track of unclosed groups. */
1916 compile_stack_type compile_stack;
1918 /* Points to the current (ending) position in the pattern. */
1919 re_char *p = pattern;
1920 re_char *pend = pattern + size;
1922 /* How to translate the characters in the pattern. */
1923 RE_TRANSLATE_TYPE translate = bufp->translate;
1925 /* Address of the count-byte of the most recently inserted `exactn'
1926 command. This makes it possible to tell if a new exact-match
1927 character can be added to that command or if the character requires
1928 a new `exactn' command. */
1929 unsigned char *pending_exact = 0;
1931 /* Address of start of the most recently finished expression.
1932 This tells, e.g., postfix * where to find the start of its
1933 operand. Reset at the beginning of groups and alternatives. */
1934 unsigned char *laststart = 0;
1936 /* Address of beginning of regexp, or inside of last group. */
1937 unsigned char *begalt;
1939 /* Place in the uncompiled pattern (i.e., the {) to
1940 which to go back if the interval is invalid. */
1941 re_char *beg_interval;
1943 /* Address of the place where a forward jump should go to the end of
1944 the containing expression. Each alternative of an `or' -- except the
1945 last -- ends with a forward jump of this sort. */
1946 unsigned char *fixup_alt_jump = 0;
1948 /* Counts open-groups as they are encountered. Remembered for the
1949 matching close-group on the compile stack, so the same register
1950 number is put in the stop_memory as the start_memory. */
1951 regnum_t regnum = 0;
1954 DEBUG_PRINT1 ("\nCompiling pattern: ");
1959 for (debug_count = 0; debug_count < size; debug_count++)
1960 putchar (pattern[debug_count]);
1965 /* Initialize the compile stack. */
1966 compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t);
1967 if (compile_stack.stack == NULL)
1970 compile_stack.size = INIT_COMPILE_STACK_SIZE;
1971 compile_stack.avail = 0;
1973 /* Initialize the pattern buffer. */
1974 bufp->syntax = syntax;
1975 bufp->fastmap_accurate = 0;
1976 bufp->not_bol = bufp->not_eol = 0;
1978 /* Set `used' to zero, so that if we return an error, the pattern
1979 printer (for debugging) will think there's no pattern. We reset it
1983 /* Always count groups, whether or not bufp->no_sub is set. */
1986 #if !defined (emacs) && !defined (SYNTAX_TABLE)
1987 /* Initialize the syntax table. */
1988 init_syntax_once ();
1991 if (bufp->allocated == 0)
1994 { /* If zero allocated, but buffer is non-null, try to realloc
1995 enough space. This loses if buffer's address is bogus, but
1996 that is the user's responsibility. */
1997 RETALLOC (bufp->buffer, INIT_BUF_SIZE, unsigned char);
2000 { /* Caller did not allocate a buffer. Do it for them. */
2001 bufp->buffer = TALLOC (INIT_BUF_SIZE, unsigned char);
2003 if (!bufp->buffer) FREE_STACK_RETURN (REG_ESPACE);
2005 bufp->allocated = INIT_BUF_SIZE;
2008 begalt = buf_end = bufp->buffer;
2010 /* Loop through the uncompiled pattern until we're at the end. */
2019 if ( /* If at start of pattern, it's an operator. */
2021 /* If context independent, it's an operator. */
2022 || syntax & RE_CONTEXT_INDEP_ANCHORS
2023 /* Otherwise, depends on what's come before. */
2024 || at_begline_loc_p (pattern, p, syntax))
2034 if ( /* If at end of pattern, it's an operator. */
2036 /* If context independent, it's an operator. */
2037 || syntax & RE_CONTEXT_INDEP_ANCHORS
2038 /* Otherwise, depends on what's next. */
2039 || at_endline_loc_p (p, pend, syntax))
2049 if ((syntax & RE_BK_PLUS_QM)
2050 || (syntax & RE_LIMITED_OPS))
2054 /* If there is no previous pattern... */
2057 if (syntax & RE_CONTEXT_INVALID_OPS)
2058 FREE_STACK_RETURN (REG_BADRPT);
2059 else if (!(syntax & RE_CONTEXT_INDEP_OPS))
2064 /* true means zero/many matches are allowed. */
2065 re_bool zero_times_ok = c != '+';
2066 re_bool many_times_ok = c != '?';
2068 /* true means match shortest string possible. */
2069 re_bool minimal = false;
2071 /* If there is a sequence of repetition chars, collapse it
2072 down to just one (the right one). We can't combine
2073 interval operators with these because of, e.g., `a{2}*',
2074 which should only match an even number of `a's. */
2079 if (c == '*' || (!(syntax & RE_BK_PLUS_QM)
2080 && (c == '+' || c == '?')))
2083 else if (syntax & RE_BK_PLUS_QM && c == '\\')
2085 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
2088 if (!(c1 == '+' || c1 == '?'))
2103 /* If we get here, we found another repeat character. */
2104 if (!(syntax & RE_NO_MINIMAL_MATCHING))
2106 /* "*?" and "+?" and "??" are okay (and mean match
2107 minimally), but other sequences (such as "*??" and
2108 "+++") are rejected (reserved for future use). */
2109 if (minimal || c != '?')
2110 FREE_STACK_RETURN (REG_BADRPT);
2115 zero_times_ok |= c != '+';
2116 many_times_ok |= c != '?';
2120 /* Star, etc. applied to an empty pattern is equivalent
2121 to an empty pattern. */
2125 /* Now we know whether zero matches is allowed
2126 and whether two or more matches is allowed
2127 and whether we want minimal or maximal matching. */
2133 0: /on_failure_jump to 6
2138 GET_BUFFER_SPACE (6);
2139 INSERT_JUMP (jump, laststart, buf_end + 3);
2141 INSERT_JUMP (on_failure_jump, laststart, laststart + 6);
2144 else if (zero_times_ok)
2149 6: /on_failure_jump to 3
2152 GET_BUFFER_SPACE (6);
2153 INSERT_JUMP (jump, laststart, buf_end + 3);
2155 STORE_JUMP (on_failure_jump, buf_end, laststart + 3);
2162 3: /on_failure_jump to 0
2165 GET_BUFFER_SPACE (3);
2166 STORE_JUMP (on_failure_jump, buf_end, laststart);
2172 /* Are we optimizing this jump? */
2173 re_bool keep_string_p = false;
2176 { /* More than one repetition is allowed, so put in
2177 at the end a backward relative jump from
2178 `buf_end' to before the next jump we're going
2179 to put in below (which jumps from laststart to
2182 But if we are at the `*' in the exact sequence `.*\n',
2183 insert an unconditional jump backwards to the .,
2184 instead of the beginning of the loop. This way we only
2185 push a failure point once, instead of every time
2186 through the loop. */
2187 assert (p - 1 > pattern);
2189 /* Allocate the space for the jump. */
2190 GET_BUFFER_SPACE (3);
2192 /* We know we are not at the first character of the
2193 pattern, because laststart was nonzero. And we've
2194 already incremented `p', by the way, to be the
2195 character after the `*'. Do we have to do something
2196 analogous here for null bytes, because of
2200 && p < pend && *p == '\n'
2201 && !(syntax & RE_DOT_NEWLINE))
2202 { /* We have .*\n. */
2203 STORE_JUMP (jump, buf_end, laststart);
2204 keep_string_p = true;
2207 /* Anything else. */
2208 STORE_JUMP (maybe_pop_jump, buf_end, laststart - 3);
2210 /* We've added more stuff to the buffer. */
2214 /* On failure, jump from laststart to buf_end + 3,
2215 which will be the end of the buffer after this jump
2217 GET_BUFFER_SPACE (3);
2218 INSERT_JUMP (keep_string_p ? on_failure_keep_string_jump
2220 laststart, buf_end + 3);
2225 /* At least one repetition is required, so insert a
2226 `dummy_failure_jump' before the initial
2227 `on_failure_jump' instruction of the loop. This
2228 effects a skip over that instruction the first time
2229 we hit that loop. */
2230 GET_BUFFER_SPACE (3);
2231 INSERT_JUMP (dummy_failure_jump, laststart, laststart + 6);
2241 laststart = buf_end;
2248 /* XEmacs change: this whole section */
2249 re_bool had_char_class = false;
2251 re_bool has_extended_chars = false;
2252 REGISTER Lisp_Object rtab = Qnil;
2255 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2257 /* Ensure that we have enough space to push a charset: the
2258 opcode, the length count, and the bitset; 34 bytes in all. */
2259 GET_BUFFER_SPACE (34);
2261 laststart = buf_end;
2263 /* We test `*p == '^' twice, instead of using an if
2264 statement, so we only need one BUF_PUSH. */
2265 BUF_PUSH (*p == '^' ? charset_not : charset);
2269 /* Remember the first position in the bracket expression. */
2272 /* Push the number of bytes in the bitmap. */
2273 BUF_PUSH ((1 << BYTEWIDTH) / BYTEWIDTH);
2275 /* Clear the whole map. */
2276 memset (buf_end, 0, (1 << BYTEWIDTH) / BYTEWIDTH);
2278 /* charset_not matches newline according to a syntax bit. */
2279 if ((re_opcode_t) buf_end[-2] == charset_not
2280 && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
2281 SET_LIST_BIT ('\n');
2284 start_over_with_extended:
2285 if (has_extended_chars)
2287 /* There are extended chars here, which means we need to start
2288 over and shift to unified range-table format. */
2289 if (buf_end[-2] == charset)
2290 buf_end[-2] = charset_mule;
2292 buf_end[-2] = charset_mule_not;
2294 p = p1; /* go back to the beginning of the charset, after
2296 rtab = Vthe_lisp_rangetab;
2297 Fclear_range_table (rtab);
2299 /* charset_not matches newline according to a syntax bit. */
2300 if ((re_opcode_t) buf_end[-1] == charset_mule_not
2301 && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
2302 SET_EITHER_BIT ('\n');
2306 /* Read in characters and ranges, setting map bits. */
2309 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2314 if (c >= 0x80 && !has_extended_chars)
2316 has_extended_chars = 1;
2317 /* Frumble-bumble, we've found some extended chars.
2318 Need to start over, process everything using
2319 the general extended-char mechanism, and need
2320 to use charset_mule and charset_mule_not instead
2321 of charset and charset_not. */
2322 goto start_over_with_extended;
2325 /* \ might escape characters inside [...] and [^...]. */
2326 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\')
2328 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
2332 if (c1 >= 0x80 && !has_extended_chars)
2334 has_extended_chars = 1;
2335 goto start_over_with_extended;
2338 SET_EITHER_BIT (c1);
2342 /* Could be the end of the bracket expression. If it's
2343 not (i.e., when the bracket expression is `[]' so
2344 far), the ']' character bit gets set way below. */
2345 if (c == ']' && p != p1 + 1)
2348 /* Look ahead to see if it's a range when the last thing
2349 was a character class. */
2350 if (had_char_class && c == '-' && *p != ']')
2351 FREE_STACK_RETURN (REG_ERANGE);
2353 /* Look ahead to see if it's a range when the last thing
2354 was a character: if this is a hyphen not at the
2355 beginning or the end of a list, then it's the range
2358 && !(p - 2 >= pattern && p[-2] == '[')
2359 && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^')
2365 if (* (unsigned char *) p >= 0x80 && !has_extended_chars)
2367 has_extended_chars = 1;
2368 goto start_over_with_extended;
2370 if (has_extended_chars)
2371 ret = compile_extended_range (&p, pend, translate,
2375 ret = compile_range (&p, pend, translate, syntax, buf_end);
2376 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret);
2379 else if (p[0] == '-' && p[1] != ']')
2380 { /* This handles ranges made up of characters only. */
2383 /* Move past the `-'. */
2387 if (* (unsigned char *) p >= 0x80 && !has_extended_chars)
2389 has_extended_chars = 1;
2390 goto start_over_with_extended;
2392 if (has_extended_chars)
2393 ret = compile_extended_range (&p, pend, translate,
2397 ret = compile_range (&p, pend, translate, syntax, buf_end);
2398 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret);
2401 /* See if we're at the beginning of a possible character
2404 else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == ':')
2405 { /* Leave room for the null. */
2406 char str[CHAR_CLASS_MAX_LENGTH + 1];
2411 /* If pattern is `[[:'. */
2412 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2416 /* #### This code is unused.
2417 Correctness is not checked after TRT
2420 if (c == ':' || c == ']' || p == pend
2421 || c1 == CHAR_CLASS_MAX_LENGTH)
2423 str[c1++] = (char) c;
2427 /* If isn't a word bracketed by `[:' and `:]':
2428 undo the ending character, the letters, and leave
2429 the leading `:' and `[' (but set bits for them). */
2430 if (c == ':' && *p == ']')
2433 re_bool is_alnum = STREQ (str, "alnum");
2434 re_bool is_alpha = STREQ (str, "alpha");
2435 re_bool is_blank = STREQ (str, "blank");
2436 re_bool is_cntrl = STREQ (str, "cntrl");
2437 re_bool is_digit = STREQ (str, "digit");
2438 re_bool is_graph = STREQ (str, "graph");
2439 re_bool is_lower = STREQ (str, "lower");
2440 re_bool is_print = STREQ (str, "print");
2441 re_bool is_punct = STREQ (str, "punct");
2442 re_bool is_space = STREQ (str, "space");
2443 re_bool is_upper = STREQ (str, "upper");
2444 re_bool is_xdigit = STREQ (str, "xdigit");
2446 if (!IS_CHAR_CLASS (str))
2447 FREE_STACK_RETURN (REG_ECTYPE);
2449 /* Throw away the ] at the end of the character
2453 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2455 for (ch = 0; ch < 1 << BYTEWIDTH; ch++)
2457 /* This was split into 3 if's to
2458 avoid an arbitrary limit in some compiler. */
2459 if ( (is_alnum && ISALNUM (ch))
2460 || (is_alpha && ISALPHA (ch))
2461 || (is_blank && ISBLANK (ch))
2462 || (is_cntrl && ISCNTRL (ch)))
2463 SET_EITHER_BIT (ch);
2464 if ( (is_digit && ISDIGIT (ch))
2465 || (is_graph && ISGRAPH (ch))
2466 || (is_lower && ISLOWER (ch))
2467 || (is_print && ISPRINT (ch)))
2468 SET_EITHER_BIT (ch);
2469 if ( (is_punct && ISPUNCT (ch))
2470 || (is_space && ISSPACE (ch))
2471 || (is_upper && ISUPPER (ch))
2472 || (is_xdigit && ISXDIGIT (ch)))
2473 SET_EITHER_BIT (ch);
2475 had_char_class = true;
2482 SET_EITHER_BIT ('[');
2483 SET_EITHER_BIT (':');
2484 had_char_class = false;
2489 had_char_class = false;
2495 if (has_extended_chars)
2497 /* We have a range table, not a bit vector. */
2499 unified_range_table_bytes_needed (rtab);
2500 GET_BUFFER_SPACE (bytes_needed);
2501 unified_range_table_copy_data (rtab, buf_end);
2502 buf_end += unified_range_table_bytes_used (buf_end);
2506 /* Discard any (non)matching list bytes that are all 0 at the
2507 end of the map. Decrease the map-length byte too. */
2508 while ((int) buf_end[-1] > 0 && buf_end[buf_end[-1] - 1] == 0)
2510 buf_end += buf_end[-1];
2516 if (syntax & RE_NO_BK_PARENS)
2523 if (syntax & RE_NO_BK_PARENS)
2530 if (syntax & RE_NEWLINE_ALT)
2537 if (syntax & RE_NO_BK_VBAR)
2544 if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES)
2545 goto handle_interval;
2551 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
2553 /* Do not translate the character after the \, so that we can
2554 distinguish, e.g., \B from \b, even if we normally would
2555 translate, e.g., B to b. */
2561 if (syntax & RE_NO_BK_PARENS)
2562 goto normal_backslash;
2568 if (!(syntax & RE_NO_SHY_GROUPS)
2576 case ':': /* shy groups */
2580 /* All others are reserved for future constructs. */
2582 FREE_STACK_RETURN (REG_BADPAT);
2591 if (COMPILE_STACK_FULL)
2593 RETALLOC (compile_stack.stack, compile_stack.size << 1,
2594 compile_stack_elt_t);
2595 if (compile_stack.stack == NULL) return REG_ESPACE;
2597 compile_stack.size <<= 1;
2600 /* These are the values to restore when we hit end of this
2601 group. They are all relative offsets, so that if the
2602 whole pattern moves because of realloc, they will still
2604 COMPILE_STACK_TOP.begalt_offset = begalt - bufp->buffer;
2605 COMPILE_STACK_TOP.fixup_alt_jump
2606 = fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0;
2607 COMPILE_STACK_TOP.laststart_offset = buf_end - bufp->buffer;
2608 COMPILE_STACK_TOP.regnum = r;
2610 /* We will eventually replace the 0 with the number of
2611 groups inner to this one. But do not push a
2612 start_memory for groups beyond the last one we can
2613 represent in the compiled pattern. */
2614 if (r <= MAX_REGNUM)
2616 COMPILE_STACK_TOP.inner_group_offset
2617 = buf_end - bufp->buffer + 2;
2618 BUF_PUSH_3 (start_memory, r, 0);
2621 compile_stack.avail++;
2626 /* If we've reached MAX_REGNUM groups, then this open
2627 won't actually generate any code, so we'll have to
2628 clear pending_exact explicitly. */
2635 if (syntax & RE_NO_BK_PARENS) goto normal_backslash;
2637 if (COMPILE_STACK_EMPTY) {
2638 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
2639 goto normal_backslash;
2641 FREE_STACK_RETURN (REG_ERPAREN);
2646 { /* Push a dummy failure point at the end of the
2647 alternative for a possible future
2648 `pop_failure_jump' to pop. See comments at
2649 `push_dummy_failure' in `re_match_2'. */
2650 BUF_PUSH (push_dummy_failure);
2652 /* We allocated space for this jump when we assigned
2653 to `fixup_alt_jump', in the `handle_alt' case below. */
2654 STORE_JUMP (jump_past_alt, fixup_alt_jump, buf_end - 1);
2657 /* See similar code for backslashed left paren above. */
2658 if (COMPILE_STACK_EMPTY) {
2659 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
2662 FREE_STACK_RETURN (REG_ERPAREN);
2665 /* Since we just checked for an empty stack above, this
2666 ``can't happen''. */
2667 assert (compile_stack.avail != 0);
2669 /* We don't just want to restore into `regnum', because
2670 later groups should continue to be numbered higher,
2671 as in `(ab)c(de)' -- the second group is #2. */
2672 regnum_t this_group_regnum;
2674 compile_stack.avail--;
2675 begalt = bufp->buffer + COMPILE_STACK_TOP.begalt_offset;
2677 = COMPILE_STACK_TOP.fixup_alt_jump
2678 ? bufp->buffer + COMPILE_STACK_TOP.fixup_alt_jump - 1
2680 laststart = bufp->buffer + COMPILE_STACK_TOP.laststart_offset;
2681 this_group_regnum = COMPILE_STACK_TOP.regnum;
2682 /* If we've reached MAX_REGNUM groups, then this open
2683 won't actually generate any code, so we'll have to
2684 clear pending_exact explicitly. */
2687 /* We're at the end of the group, so now we know how many
2688 groups were inside this one. */
2689 if (this_group_regnum <= MAX_REGNUM)
2691 unsigned char *inner_group_loc
2692 = bufp->buffer + COMPILE_STACK_TOP.inner_group_offset;
2694 *inner_group_loc = regnum - this_group_regnum;
2695 BUF_PUSH_3 (stop_memory, this_group_regnum,
2696 regnum - this_group_regnum);
2702 case '|': /* `\|'. */
2703 if (syntax & RE_LIMITED_OPS || syntax & RE_NO_BK_VBAR)
2704 goto normal_backslash;
2706 if (syntax & RE_LIMITED_OPS)
2709 /* Insert before the previous alternative a jump which
2710 jumps to this alternative if the former fails. */
2711 GET_BUFFER_SPACE (3);
2712 INSERT_JUMP (on_failure_jump, begalt, buf_end + 6);
2716 /* The alternative before this one has a jump after it
2717 which gets executed if it gets matched. Adjust that
2718 jump so it will jump to this alternative's analogous
2719 jump (put in below, which in turn will jump to the next
2720 (if any) alternative's such jump, etc.). The last such
2721 jump jumps to the correct final destination. A picture:
2727 If we are at `b', then fixup_alt_jump right now points to a
2728 three-byte space after `a'. We'll put in the jump, set
2729 fixup_alt_jump to right after `b', and leave behind three
2730 bytes which we'll fill in when we get to after `c'. */
2733 STORE_JUMP (jump_past_alt, fixup_alt_jump, buf_end);
2735 /* Mark and leave space for a jump after this alternative,
2736 to be filled in later either by next alternative or
2737 when know we're at the end of a series of alternatives. */
2738 fixup_alt_jump = buf_end;
2739 GET_BUFFER_SPACE (3);
2748 /* If \{ is a literal. */
2749 if (!(syntax & RE_INTERVALS)
2750 /* If we're at `\{' and it's not the open-interval
2752 || ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
2753 || (p - 2 == pattern && p == pend))
2754 goto normal_backslash;
2758 /* If got here, then the syntax allows intervals. */
2760 /* At least (most) this many matches must be made. */
2761 int lower_bound = -1, upper_bound = -1;
2763 beg_interval = p - 1;
2767 if (syntax & RE_NO_BK_BRACES)
2768 goto unfetch_interval;
2770 FREE_STACK_RETURN (REG_EBRACE);
2773 GET_UNSIGNED_NUMBER (lower_bound);
2777 GET_UNSIGNED_NUMBER (upper_bound);
2778 if (upper_bound < 0) upper_bound = RE_DUP_MAX;
2781 /* Interval such as `{1}' => match exactly once. */
2782 upper_bound = lower_bound;
2784 if (lower_bound < 0 || upper_bound > RE_DUP_MAX
2785 || lower_bound > upper_bound)
2787 if (syntax & RE_NO_BK_BRACES)
2788 goto unfetch_interval;
2790 FREE_STACK_RETURN (REG_BADBR);
2793 if (!(syntax & RE_NO_BK_BRACES))
2795 if (c != '\\') FREE_STACK_RETURN (REG_EBRACE);
2802 if (syntax & RE_NO_BK_BRACES)
2803 goto unfetch_interval;
2805 FREE_STACK_RETURN (REG_BADBR);
2808 /* We just parsed a valid interval. */
2810 /* If it's invalid to have no preceding re. */
2813 if (syntax & RE_CONTEXT_INVALID_OPS)
2814 FREE_STACK_RETURN (REG_BADRPT);
2815 else if (syntax & RE_CONTEXT_INDEP_OPS)
2816 laststart = buf_end;
2818 goto unfetch_interval;
2821 /* If the upper bound is zero, don't want to succeed at
2822 all; jump from `laststart' to `b + 3', which will be
2823 the end of the buffer after we insert the jump. */
2824 if (upper_bound == 0)
2826 GET_BUFFER_SPACE (3);
2827 INSERT_JUMP (jump, laststart, buf_end + 3);
2831 /* Otherwise, we have a nontrivial interval. When
2832 we're all done, the pattern will look like:
2833 set_number_at <jump count> <upper bound>
2834 set_number_at <succeed_n count> <lower bound>
2835 succeed_n <after jump addr> <succeed_n count>
2837 jump_n <succeed_n addr> <jump count>
2838 (The upper bound and `jump_n' are omitted if
2839 `upper_bound' is 1, though.) */
2841 { /* If the upper bound is > 1, we need to insert
2842 more at the end of the loop. */
2843 Memory_count nbytes = 10 + (upper_bound > 1) * 10;
2845 GET_BUFFER_SPACE (nbytes);
2847 /* Initialize lower bound of the `succeed_n', even
2848 though it will be set during matching by its
2849 attendant `set_number_at' (inserted next),
2850 because `re_compile_fastmap' needs to know.
2851 Jump to the `jump_n' we might insert below. */
2852 INSERT_JUMP2 (succeed_n, laststart,
2853 buf_end + 5 + (upper_bound > 1) * 5,
2857 /* Code to initialize the lower bound. Insert
2858 before the `succeed_n'. The `5' is the last two
2859 bytes of this `set_number_at', plus 3 bytes of
2860 the following `succeed_n'. */
2861 insert_op2 (set_number_at, laststart, 5, lower_bound, buf_end);
2864 if (upper_bound > 1)
2865 { /* More than one repetition is allowed, so
2866 append a backward jump to the `succeed_n'
2867 that starts this interval.
2869 When we've reached this during matching,
2870 we'll have matched the interval once, so
2871 jump back only `upper_bound - 1' times. */
2872 STORE_JUMP2 (jump_n, buf_end, laststart + 5,
2876 /* The location we want to set is the second
2877 parameter of the `jump_n'; that is `b-2' as
2878 an absolute address. `laststart' will be
2879 the `set_number_at' we're about to insert;
2880 `laststart+3' the number to set, the source
2881 for the relative address. But we are
2882 inserting into the middle of the pattern --
2883 so everything is getting moved up by 5.
2884 Conclusion: (b - 2) - (laststart + 3) + 5,
2885 i.e., b - laststart.
2887 We insert this at the beginning of the loop
2888 so that if we fail during matching, we'll
2889 reinitialize the bounds. */
2890 insert_op2 (set_number_at, laststart,
2891 buf_end - laststart,
2892 upper_bound - 1, buf_end);
2897 beg_interval = NULL;
2902 /* If an invalid interval, match the characters as literals. */
2903 assert (beg_interval);
2905 beg_interval = NULL;
2907 /* normal_char and normal_backslash need `c'. */
2910 if (!(syntax & RE_NO_BK_BRACES))
2912 if (p > pattern && p[-1] == '\\')
2913 goto normal_backslash;
2918 /* There is no way to specify the before_dot and after_dot
2919 operators. rms says this is ok. --karl */
2925 laststart = buf_end;
2927 /* XEmacs addition */
2928 if (c >= 0x80 || syntax_spec_code[c] == 0377)
2929 FREE_STACK_RETURN (REG_ESYNTAX);
2930 BUF_PUSH_2 (syntaxspec, syntax_spec_code[c]);
2934 laststart = buf_end;
2936 /* XEmacs addition */
2937 if (c >= 0x80 || syntax_spec_code[c] == 0377)
2938 FREE_STACK_RETURN (REG_ESYNTAX);
2939 BUF_PUSH_2 (notsyntaxspec, syntax_spec_code[c]);
2943 /* 97.2.17 jhod merged in to XEmacs from mule-2.3 */
2945 laststart = buf_end;
2947 if (c < 32 || c > 127)
2948 FREE_STACK_RETURN (REG_ECATEGORY);
2949 BUF_PUSH_2 (categoryspec, c);
2953 laststart = buf_end;
2955 if (c < 32 || c > 127)
2956 FREE_STACK_RETURN (REG_ECATEGORY);
2957 BUF_PUSH_2 (notcategoryspec, c);
2959 /* end of category patch */
2965 laststart = buf_end;
2966 BUF_PUSH (wordchar);
2971 laststart = buf_end;
2972 BUF_PUSH (notwordchar);
2985 BUF_PUSH (wordbound);
2989 BUF_PUSH (notwordbound);
3000 case '1': case '2': case '3': case '4': case '5':
3001 case '6': case '7': case '8': case '9':
3004 if (syntax & RE_NO_BK_REFS)
3010 FREE_STACK_RETURN (REG_ESUBREG);
3012 /* Can't back reference to a subexpression if inside of it. */
3013 if (group_in_compile_stack (compile_stack, reg))
3016 laststart = buf_end;
3017 BUF_PUSH_2 (duplicate, reg);
3024 if (syntax & RE_BK_PLUS_QM)
3027 goto normal_backslash;
3031 /* You might think it would be useful for \ to mean
3032 not to translate; but if we don't translate it,
3033 it will never match anything. */
3041 /* Expects the character in `c'. */
3042 /* `p' points to the location after where `c' came from. */
3045 /* XEmacs: modifications here for Mule. */
3046 /* `q' points to the beginning of the next char. */
3049 /* If no exactn currently being built. */
3052 /* If last exactn not at current position. */
3053 || pending_exact + *pending_exact + 1 != buf_end
3055 /* We have only one byte following the exactn for the count. */
3056 || ((unsigned int) (*pending_exact + (q - p)) >=
3057 ((unsigned int) (1 << BYTEWIDTH) - 1))
3059 /* If followed by a repetition operator. */
3060 || *q == '*' || *q == '^'
3061 || ((syntax & RE_BK_PLUS_QM)
3062 ? *q == '\\' && (q[1] == '+' || q[1] == '?')
3063 : (*q == '+' || *q == '?'))
3064 || ((syntax & RE_INTERVALS)
3065 && ((syntax & RE_NO_BK_BRACES)
3067 : (q[0] == '\\' && q[1] == '{'))))
3069 /* Start building a new exactn. */
3071 laststart = buf_end;
3073 BUF_PUSH_2 (exactn, 0);
3074 pending_exact = buf_end - 1;
3083 Bufbyte tmp_buf[MAX_EMCHAR_LEN];
3086 bt_count = set_charptr_emchar (tmp_buf, c);
3088 for (i = 0; i < bt_count; i++)
3090 BUF_PUSH (tmp_buf[i]);
3098 } /* while p != pend */
3101 /* Through the pattern now. */
3104 STORE_JUMP (jump_past_alt, fixup_alt_jump, buf_end);
3106 if (!COMPILE_STACK_EMPTY)
3107 FREE_STACK_RETURN (REG_EPAREN);
3109 /* If we don't want backtracking, force success
3110 the first time we reach the end of the compiled pattern. */
3111 if (syntax & RE_NO_POSIX_BACKTRACKING)
3114 free (compile_stack.stack);
3116 /* We have succeeded; set the length of the buffer. */
3117 bufp->used = buf_end - bufp->buffer;
3122 DEBUG_PRINT1 ("\nCompiled pattern: \n");
3123 print_compiled_pattern (bufp);
3127 #ifndef MATCH_MAY_ALLOCATE
3128 /* Initialize the failure stack to the largest possible stack. This
3129 isn't necessary unless we're trying to avoid calling alloca in
3130 the search and match routines. */
3132 int num_regs = bufp->re_nsub + 1;
3134 /* Since DOUBLE_FAIL_STACK refuses to double only if the current size
3135 is strictly greater than re_max_failures, the largest possible stack
3136 is 2 * re_max_failures failure points. */
3137 if (fail_stack.size < (2 * re_max_failures * MAX_FAILURE_ITEMS))
3139 fail_stack.size = (2 * re_max_failures * MAX_FAILURE_ITEMS);
3142 if (! fail_stack.stack)
3144 = (fail_stack_elt_t *) xmalloc (fail_stack.size
3145 * sizeof (fail_stack_elt_t));
3148 = (fail_stack_elt_t *) xrealloc (fail_stack.stack,
3150 * sizeof (fail_stack_elt_t)));
3151 #else /* not emacs */
3152 if (! fail_stack.stack)
3154 = (fail_stack_elt_t *) malloc (fail_stack.size
3155 * sizeof (fail_stack_elt_t));
3158 = (fail_stack_elt_t *) realloc (fail_stack.stack,
3160 * sizeof (fail_stack_elt_t)));
3164 regex_grow_registers (num_regs);
3166 #endif /* not MATCH_MAY_ALLOCATE */
3169 } /* regex_compile */
3171 /* Subroutines for `regex_compile'. */
3173 /* Store OP at LOC followed by two-byte integer parameter ARG. */
3176 store_op1 (re_opcode_t op, unsigned char *loc, int arg)
3178 *loc = (unsigned char) op;
3179 STORE_NUMBER (loc + 1, arg);
3183 /* Like `store_op1', but for two two-byte parameters ARG1 and ARG2. */
3186 store_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2)
3188 *loc = (unsigned char) op;
3189 STORE_NUMBER (loc + 1, arg1);
3190 STORE_NUMBER (loc + 3, arg2);
3194 /* Copy the bytes from LOC to END to open up three bytes of space at LOC
3195 for OP followed by two-byte integer parameter ARG. */
3198 insert_op1 (re_opcode_t op, unsigned char *loc, int arg, unsigned char *end)
3200 REGISTER unsigned char *pfrom = end;
3201 REGISTER unsigned char *pto = end + 3;
3203 while (pfrom != loc)
3206 store_op1 (op, loc, arg);
3210 /* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2. */
3213 insert_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2,
3216 REGISTER unsigned char *pfrom = end;
3217 REGISTER unsigned char *pto = end + 5;
3219 while (pfrom != loc)
3222 store_op2 (op, loc, arg1, arg2);
3226 /* P points to just after a ^ in PATTERN. Return true if that ^ comes
3227 after an alternative or a begin-subexpression. We assume there is at
3228 least one character before the ^. */
3231 at_begline_loc_p (re_char *pattern, re_char *p, reg_syntax_t syntax)
3233 re_char *prev = p - 2;
3234 re_bool prev_prev_backslash = prev > pattern && prev[-1] == '\\';
3237 /* After a subexpression? */
3238 (*prev == '(' && (syntax & RE_NO_BK_PARENS || prev_prev_backslash))
3239 /* After an alternative? */
3240 || (*prev == '|' && (syntax & RE_NO_BK_VBAR || prev_prev_backslash));
3244 /* The dual of at_begline_loc_p. This one is for $. We assume there is
3245 at least one character after the $, i.e., `P < PEND'. */
3248 at_endline_loc_p (re_char *p, re_char *pend, int syntax)
3251 re_bool next_backslash = *next == '\\';
3252 re_char *next_next = p + 1 < pend ? p + 1 : 0;
3255 /* Before a subexpression? */
3256 (syntax & RE_NO_BK_PARENS ? *next == ')'
3257 : next_backslash && next_next && *next_next == ')')
3258 /* Before an alternative? */
3259 || (syntax & RE_NO_BK_VBAR ? *next == '|'
3260 : next_backslash && next_next && *next_next == '|');
3264 /* Returns true if REGNUM is in one of COMPILE_STACK's elements and
3265 false if it's not. */
3268 group_in_compile_stack (compile_stack_type compile_stack, regnum_t regnum)
3272 for (this_element = compile_stack.avail - 1;
3275 if (compile_stack.stack[this_element].regnum == regnum)
3282 /* Read the ending character of a range (in a bracket expression) from the
3283 uncompiled pattern *P_PTR (which ends at PEND). We assume the
3284 starting character is in `P[-2]'. (`P[-1]' is the character `-'.)
3285 Then we set the translation of all bits between the starting and
3286 ending characters (inclusive) in the compiled pattern B.
3288 Return an error code.
3290 We use these short variable names so we can use the same macros as
3291 `regex_compile' itself. */
3293 static reg_errcode_t
3294 compile_range (re_char **p_ptr, re_char *pend, RE_TRANSLATE_TYPE translate,
3295 reg_syntax_t syntax, unsigned char *buf_end)
3297 Element_count this_char;
3299 re_char *p = *p_ptr;
3300 int range_start, range_end;
3305 /* Even though the pattern is a signed `char *', we need to fetch
3306 with unsigned char *'s; if the high bit of the pattern character
3307 is set, the range endpoints will be negative if we fetch using a
3310 We also want to fetch the endpoints without translating them; the
3311 appropriate translation is done in the bit-setting loop below. */
3312 /* The SVR4 compiler on the 3B2 had trouble with unsigned const char *. */
3313 range_start = ((const unsigned char *) p)[-2];
3314 range_end = ((const unsigned char *) p)[0];
3316 /* Have to increment the pointer into the pattern string, so the
3317 caller isn't still at the ending character. */
3320 /* If the start is after the end, the range is empty. */
3321 if (range_start > range_end)
3322 return syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR;
3324 /* Here we see why `this_char' has to be larger than an `unsigned
3325 char' -- the range is inclusive, so if `range_end' == 0xff
3326 (assuming 8-bit characters), we would otherwise go into an infinite
3327 loop, since all characters <= 0xff. */
3328 for (this_char = range_start; this_char <= range_end; this_char++)
3330 SET_LIST_BIT (TRANSLATE (this_char));
3338 static reg_errcode_t
3339 compile_extended_range (re_char **p_ptr, re_char *pend,
3340 RE_TRANSLATE_TYPE translate,
3341 reg_syntax_t syntax, Lisp_Object rtab)
3343 Emchar this_char, range_start, range_end;
3349 p = (const Bufbyte *) *p_ptr;
3350 range_end = charptr_emchar (p);
3351 p--; /* back to '-' */
3352 DEC_CHARPTR (p); /* back to start of range */
3353 /* We also want to fetch the endpoints without translating them; the
3354 appropriate translation is done in the bit-setting loop below. */
3355 range_start = charptr_emchar (p);
3356 INC_CHARPTR (*p_ptr);
3358 /* If the start is after the end, the range is empty. */
3359 if (range_start > range_end)
3360 return syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR;
3362 /* Can't have ranges spanning different charsets, except maybe for
3363 ranges entirely within the first 256 chars. */
3365 if ((range_start >= 0x100 || range_end >= 0x100)
3367 && CHAR_CHARSET_ID (range_start) != CHAR_CHARSET_ID (range_end)
3369 && CHAR_LEADING_BYTE (range_start) != CHAR_LEADING_BYTE (range_end)
3372 return REG_ERANGESPAN;
3374 /* As advertised, translations only work over the 0 - 0x7F range.
3375 Making this kind of stuff work generally is much harder.
3376 Iterating over the whole range like this would be way efficient
3377 if the range encompasses 10,000 chars or something. You'd have
3378 to do something like this:
3382 map over translation table in [range_start, range_end] of
3383 (put the mapped range in a;
3384 put the translation in b)
3385 invert the range in a and truncate to [range_start, range_end]
3386 compute the union of a, b
3387 union the result into rtab
3389 for (this_char = range_start;
3390 this_char <= range_end && this_char < 0x80; this_char++)
3392 SET_RANGETAB_BIT (TRANSLATE (this_char));
3395 if (this_char <= range_end)
3396 put_range_table (rtab, this_char, range_end, Qt);
3403 /* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in
3404 BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible
3405 characters can start a string that matches the pattern. This fastmap
3406 is used by re_search to skip quickly over impossible starting points.
3408 The caller must supply the address of a (1 << BYTEWIDTH)-byte data
3409 area as BUFP->fastmap.
3411 We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in
3414 Returns 0 if we succeed, -2 if an internal error. */
3417 re_compile_fastmap (struct re_pattern_buffer *bufp)
3420 #ifdef MATCH_MAY_ALLOCATE
3421 fail_stack_type fail_stack;
3423 DECLARE_DESTINATION;
3424 /* We don't push any register information onto the failure stack. */
3426 REGISTER char *fastmap = bufp->fastmap;
3427 unsigned char *pattern = bufp->buffer;
3428 unsigned long size = bufp->used;
3429 unsigned char *p = pattern;
3430 REGISTER unsigned char *pend = pattern + size;
3433 /* This holds the pointer to the failure stack, when
3434 it is allocated relocatably. */
3435 fail_stack_elt_t *failure_stack_ptr;
3438 /* Assume that each path through the pattern can be null until
3439 proven otherwise. We set this false at the bottom of switch
3440 statement, to which we get only if a particular path doesn't
3441 match the empty string. */
3442 re_bool path_can_be_null = true;
3444 /* We aren't doing a `succeed_n' to begin with. */
3445 re_bool succeed_n_p = false;
3447 assert (fastmap != NULL && p != NULL);
3450 memset (fastmap, 0, 1 << BYTEWIDTH); /* Assume nothing's valid. */
3451 bufp->fastmap_accurate = 1; /* It will be when we're done. */
3452 bufp->can_be_null = 0;
3456 if (p == pend || *p == succeed)
3458 /* We have reached the (effective) end of pattern. */
3459 if (!FAIL_STACK_EMPTY ())
3461 bufp->can_be_null |= path_can_be_null;
3463 /* Reset for next path. */
3464 path_can_be_null = true;
3466 p = (unsigned char *) fail_stack.stack[--fail_stack.avail].pointer;
3474 /* We should never be about to go beyond the end of the pattern. */
3477 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
3480 /* I guess the idea here is to simply not bother with a fastmap
3481 if a backreference is used, since it's too hard to figure out
3482 the fastmap for the corresponding group. Setting
3483 `can_be_null' stops `re_search_2' from using the fastmap, so
3484 that is all we do. */
3486 bufp->can_be_null = 1;
3490 /* Following are the cases which match a character. These end
3499 /* XEmacs: Under Mule, these bit vectors will
3500 only contain values for characters below 0x80. */
3501 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
3502 if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))
3508 /* Chars beyond end of map must be allowed. */
3510 for (j = *p * BYTEWIDTH; j < 0x80; j++)
3512 /* And all extended characters must be allowed, too. */
3513 for (j = 0x80; j < 0xA0; j++)
3515 #else /* not MULE */
3516 for (j = *p * BYTEWIDTH; j < (1 << BYTEWIDTH); j++)
3520 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
3521 if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))))
3531 nentries = unified_range_table_nentries (p);
3532 for (i = 0; i < nentries; i++)
3534 EMACS_INT first, last;
3535 Lisp_Object dummy_val;
3537 Bufbyte strr[MAX_EMCHAR_LEN];
3539 unified_range_table_get_range (p, i, &first, &last,
3541 for (jj = first; jj <= last && jj < 0x80; jj++)
3543 /* Ranges below 0x100 can span charsets, but there
3544 are only two (Control-1 and Latin-1), and
3545 either first or last has to be in them. */
3546 set_charptr_emchar (strr, first);
3550 set_charptr_emchar (strr, last);
3557 case charset_mule_not:
3562 nentries = unified_range_table_nentries (p);
3563 for (i = 0; i < nentries; i++)
3565 EMACS_INT first, last;
3566 Lisp_Object dummy_val;
3568 int smallest_prev = 0;
3570 unified_range_table_get_range (p, i, &first, &last,
3572 for (jj = smallest_prev; jj < first && jj < 0x80; jj++)
3574 smallest_prev = last + 1;
3575 if (smallest_prev >= 0x80)
3578 /* Calculating which leading bytes are actually allowed
3579 here is rather difficult, so we just punt and allow
3581 for (i = 0x80; i < 0xA0; i++)
3593 for (j = 0; j < (1 << BYTEWIDTH); j++)
3596 (regex_emacs_buffer->mirror_syntax_table), j) == Sword)
3605 goto matchnotsyntax;
3607 for (j = 0; j < (1 << BYTEWIDTH); j++)
3610 (regex_emacs_buffer->mirror_syntax_table), j) != Sword)
3618 int fastmap_newline = fastmap['\n'];
3620 /* `.' matches anything ... */
3622 /* "anything" only includes bytes that can be the
3623 first byte of a character. */
3624 for (j = 0; j < 0xA0; j++)
3627 for (j = 0; j < (1 << BYTEWIDTH); j++)
3631 /* ... except perhaps newline. */
3632 if (!(bufp->syntax & RE_DOT_NEWLINE))
3633 fastmap['\n'] = fastmap_newline;
3635 /* Return if we have already set `can_be_null'; if we have,
3636 then the fastmap is irrelevant. Something's wrong here. */
3637 else if (bufp->can_be_null)
3640 /* Otherwise, have to check alternative paths. */
3651 /* This match depends on text properties. These end with
3652 aborting optimizations. */
3653 bufp->can_be_null = 1;
3657 #if 0 /* Removed during syntax-table properties patch -- 2000/12/07 mct */
3664 for (j = 0; j < 0x80; j++)
3667 (regex_emacs_buffer->syntax_table), j) ==
3668 (enum syntaxcode) k)
3671 for (j = 0; j < 0x80; j++)
3674 (regex_emacs_buffer->mirror_syntax_table), j) ==
3675 (enum syntaxcode) k)
3678 for (j = 0x80; j < 0xA0; j++)
3681 if (LEADING_BYTE_PREFIX_P(j))
3682 /* too complicated to calculate this right */
3690 cset = CHARSET_BY_LEADING_BYTE (j);
3691 if (CHARSETP (cset))
3693 if (charset_syntax (regex_emacs_buffer, cset,
3695 == Sword || multi_p)
3702 #else /* not MULE */
3703 for (j = 0; j < (1 << BYTEWIDTH); j++)
3706 (regex_emacs_buffer->mirror_syntax_table), j) ==
3707 (enum syntaxcode) k)
3713 #if 0 /* Removed during syntax-table properties patch -- 2000/12/07 mct */
3720 for (j = 0; j < 0x80; j++)
3723 (regex_emacs_buffer->syntax_table), j) !=
3724 (enum syntaxcode) k)
3727 for (j = 0; j < 0x80; j++)
3730 (regex_emacs_buffer->mirror_syntax_table), j) !=
3731 (enum syntaxcode) k)
3734 for (j = 0x80; j < 0xA0; j++)
3737 if (LEADING_BYTE_PREFIX_P(j))
3738 /* too complicated to calculate this right */
3746 cset = CHARSET_BY_LEADING_BYTE (j);
3747 if (CHARSETP (cset))
3749 if (charset_syntax (regex_emacs_buffer, cset,
3751 != Sword || multi_p)
3758 #else /* not MULE */
3759 for (j = 0; j < (1 << BYTEWIDTH); j++)
3762 (regex_emacs_buffer->mirror_syntax_table), j) !=
3763 (enum syntaxcode) k)
3770 /* 97/2/17 jhod category patch */
3772 case notcategoryspec:
3773 bufp->can_be_null = 1;
3775 /* end if category patch */
3778 /* All cases after this match the empty string. These end with
3800 case push_dummy_failure:
3805 case pop_failure_jump:
3806 case maybe_pop_jump:
3809 case dummy_failure_jump:
3810 EXTRACT_NUMBER_AND_INCR (j, p);
3815 /* Jump backward implies we just went through the body of a
3816 loop and matched nothing. Opcode jumped to should be
3817 `on_failure_jump' or `succeed_n'. Just treat it like an
3818 ordinary jump. For a * loop, it has pushed its failure
3819 point already; if so, discard that as redundant. */
3820 if ((re_opcode_t) *p != on_failure_jump
3821 && (re_opcode_t) *p != succeed_n)
3825 EXTRACT_NUMBER_AND_INCR (j, p);
3828 /* If what's on the stack is where we are now, pop it. */
3829 if (!FAIL_STACK_EMPTY ()
3830 && fail_stack.stack[fail_stack.avail - 1].pointer == p)
3836 case on_failure_jump:
3837 case on_failure_keep_string_jump:
3838 handle_on_failure_jump:
3839 EXTRACT_NUMBER_AND_INCR (j, p);
3841 /* For some patterns, e.g., `(a?)?', `p+j' here points to the
3842 end of the pattern. We don't want to push such a point,
3843 since when we restore it above, entering the switch will
3844 increment `p' past the end of the pattern. We don't need
3845 to push such a point since we obviously won't find any more
3846 fastmap entries beyond `pend'. Such a pattern can match
3847 the null string, though. */
3850 if (!PUSH_PATTERN_OP (p + j, fail_stack))
3852 RESET_FAIL_STACK ();
3857 bufp->can_be_null = 1;
3861 EXTRACT_NUMBER_AND_INCR (k, p); /* Skip the n. */
3862 succeed_n_p = false;
3869 /* Get to the number of times to succeed. */
3872 /* Increment p past the n for when k != 0. */
3873 EXTRACT_NUMBER_AND_INCR (k, p);
3877 succeed_n_p = true; /* Spaghetti code alert. */
3878 goto handle_on_failure_jump;
3895 abort (); /* We have listed all the cases. */
3898 /* Getting here means we have found the possible starting
3899 characters for one path of the pattern -- and that the empty
3900 string does not match. We need not follow this path further.
3901 Instead, look at the next alternative (remembered on the
3902 stack), or quit if no more. The test at the top of the loop
3903 does these things. */
3904 path_can_be_null = false;
3908 /* Set `can_be_null' for the last path (also the first path, if the
3909 pattern is empty). */
3910 bufp->can_be_null |= path_can_be_null;
3913 RESET_FAIL_STACK ();
3915 } /* re_compile_fastmap */
3917 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
3918 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
3919 this memory for recording register information. STARTS and ENDS
3920 must be allocated using the malloc library routine, and must each
3921 be at least NUM_REGS * sizeof (regoff_t) bytes long.
3923 If NUM_REGS == 0, then subsequent matches should allocate their own
3926 Unless this function is called, the first search or match using
3927 PATTERN_BUFFER will allocate its own register data, without
3928 freeing the old data. */
3931 re_set_registers (struct re_pattern_buffer *bufp, struct re_registers *regs,
3932 unsigned num_regs, regoff_t *starts, regoff_t *ends)
3936 bufp->regs_allocated = REGS_REALLOCATE;
3937 regs->num_regs = num_regs;
3938 regs->start = starts;
3943 bufp->regs_allocated = REGS_UNALLOCATED;
3945 regs->start = regs->end = (regoff_t *) 0;
3949 /* Searching routines. */
3951 /* Like re_search_2, below, but only one string is specified, and
3952 doesn't let you say where to stop matching. */
3955 re_search (struct re_pattern_buffer *bufp, const char *string, int size,
3956 int startpos, int range, struct re_registers *regs)
3958 return re_search_2 (bufp, NULL, 0, string, size, startpos, range,
3963 /* Snarfed from src/lisp.h, needed for compiling [ce]tags. */
3964 # define bytecount_to_charcount(ptr, len) (len)
3965 # define charcount_to_bytecount(ptr, len) (len)
3966 typedef int Charcount;
3969 /* Using the compiled pattern in BUFP->buffer, first tries to match the
3970 virtual concatenation of STRING1 and STRING2, starting first at index
3971 STARTPOS, then at STARTPOS + 1, and so on.
3973 With MULE, STARTPOS is a byte position, not a char position. And the
3974 search will increment STARTPOS by the width of the current leading
3977 STRING1 and STRING2 have length SIZE1 and SIZE2, respectively.
3979 RANGE is how far to scan while trying to match. RANGE = 0 means try
3980 only at STARTPOS; in general, the last start tried is STARTPOS +
3983 With MULE, RANGE is a byte position, not a char position. The last
3984 start tried is the character starting <= STARTPOS + RANGE.
3986 In REGS, return the indices of the virtual concatenation of STRING1
3987 and STRING2 that matched the entire BUFP->buffer and its contained
3990 Do not consider matching one past the index STOP in the virtual
3991 concatenation of STRING1 and STRING2.
3993 We return either the position in the strings at which the match was
3994 found, -1 if no match, or -2 if error (such as failure
3998 re_search_2 (struct re_pattern_buffer *bufp, const char *str1,
3999 int size1, const char *str2, int size2, int startpos,
4000 int range, struct re_registers *regs, int stop)
4003 re_char *string1 = (re_char *) str1;
4004 re_char *string2 = (re_char *) str2;
4005 REGISTER char *fastmap = bufp->fastmap;
4006 REGISTER RE_TRANSLATE_TYPE translate = bufp->translate;
4007 int total_size = size1 + size2;
4008 int endpos = startpos + range;
4009 #ifdef REGEX_BEGLINE_CHECK
4010 int anchored_at_begline = 0;
4015 /* Check for out-of-range STARTPOS. */
4016 if (startpos < 0 || startpos > total_size)
4019 /* Fix up RANGE if it might eventually take us outside
4020 the virtual concatenation of STRING1 and STRING2. */
4022 range = 0 - startpos;
4023 else if (endpos > total_size)
4024 range = total_size - startpos;
4026 /* If the search isn't to be a backwards one, don't waste time in a
4027 search for a pattern that must be anchored. */
4028 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == begbuf && range > 0)
4034 d = ((const unsigned char *)
4035 (startpos >= size1 ? string2 - size1 : string1) + startpos);
4036 range = charcount_to_bytecount (d, 1);
4041 /* In a forward search for something that starts with \=.
4042 don't keep searching past point. */
4043 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == at_dot && range > 0)
4045 range = BUF_PT (regex_emacs_buffer) - BUF_BEGV (regex_emacs_buffer)
4052 /* Update the fastmap now if not correct already. */
4053 if (fastmap && !bufp->fastmap_accurate)
4054 if (re_compile_fastmap (bufp) == -2)
4057 #ifdef REGEX_BEGLINE_CHECK
4059 unsigned long i = 0;
4061 while (i < bufp->used)
4063 if (bufp->buffer[i] == start_memory ||
4064 bufp->buffer[i] == stop_memory)
4069 anchored_at_begline = i < bufp->used && bufp->buffer[i] == begline;
4074 SETUP_SYNTAX_CACHE_FOR_OBJECT (regex_match_object,
4076 SYNTAX_CACHE_OBJECT_BYTE_TO_CHAR (regex_match_object,
4082 /* Loop through the string, looking for a place to start matching. */
4085 #ifdef REGEX_BEGLINE_CHECK
4086 /* If the regex is anchored at the beginning of a line (i.e. with a ^),
4087 then we can speed things up by skipping to the next beginning-of-
4089 if (anchored_at_begline && startpos > 0 && startpos != size1 &&
4092 /* whose stupid idea was it anyway to make this
4093 function take two strings to match?? */
4097 if (startpos < size1 && startpos + range >= size1)
4098 lim = range - (size1 - startpos);
4100 d = ((const unsigned char *)
4101 (startpos >= size1 ? string2 - size1 : string1) + startpos);
4102 DEC_CHARPTR(d); /* Ok, since startpos != size1. */
4103 d_size = charcount_to_bytecount (d, 1);
4105 if (TRANSLATE_P (translate))
4106 while (range > lim && *d != '\n')
4108 d += d_size; /* Speedier INC_CHARPTR(d) */
4109 d_size = charcount_to_bytecount (d, 1);
4113 while (range > lim && *d != '\n')
4115 d += d_size; /* Speedier INC_CHARPTR(d) */
4116 d_size = charcount_to_bytecount (d, 1);
4120 startpos += irange - range;
4122 #endif /* REGEX_BEGLINE_CHECK */
4124 /* If a fastmap is supplied, skip quickly over characters that
4125 cannot be the start of a match. If the pattern can match the
4126 null string, however, we don't need to skip characters; we want
4127 the first null string. */
4128 if (fastmap && startpos < total_size && !bufp->can_be_null)
4130 if (range > 0) /* Searching forwards. */
4135 if (startpos < size1 && startpos + range >= size1)
4136 lim = range - (size1 - startpos);
4138 d = ((const unsigned char *)
4139 (startpos >= size1 ? string2 - size1 : string1) + startpos);
4141 /* Written out as an if-else to avoid testing `translate'
4143 if (TRANSLATE_P (translate))
4149 buf_ch = charptr_emchar (d);
4150 buf_ch = RE_TRANSLATE (buf_ch);
4151 if (buf_ch >= 0200 || fastmap[(unsigned char) buf_ch])
4154 if (fastmap[(unsigned char)RE_TRANSLATE (*d)])
4157 d_size = charcount_to_bytecount (d, 1);
4159 d += d_size; /* Speedier INC_CHARPTR(d) */
4162 while (range > lim && !fastmap[*d])
4164 d_size = charcount_to_bytecount (d, 1);
4166 d += d_size; /* Speedier INC_CHARPTR(d) */
4169 startpos += irange - range;
4171 else /* Searching backwards. */
4173 Emchar c = (size1 == 0 || startpos >= size1
4174 ? charptr_emchar (string2 + startpos - size1)
4175 : charptr_emchar (string1 + startpos));
4178 if (!(c >= 0200 || fastmap[(unsigned char) c]))
4181 if (!fastmap[(unsigned char) c])
4187 /* If can't match the null string, and that's all we have left, fail. */
4188 if (range >= 0 && startpos == total_size && fastmap
4189 && !bufp->can_be_null)
4192 #ifdef emacs /* XEmacs added, w/removal of immediate_quit */
4193 if (!no_quit_in_re_search)
4196 val = re_match_2_internal (bufp, string1, size1, string2, size2,
4197 startpos, regs, stop);
4198 #ifndef REGEX_MALLOC
4215 d = ((const unsigned char *)
4216 (startpos >= size1 ? string2 - size1 : string1) + startpos);
4217 d_size = charcount_to_bytecount (d, 1);
4223 /* Note startpos > size1 not >=. If we are on the
4224 string1/string2 boundary, we want to backup into string1. */
4225 d = ((const unsigned char *)
4226 (startpos > size1 ? string2 - size1 : string1) + startpos);
4228 d_size = charcount_to_bytecount (d, 1);
4236 /* Declarations and macros for re_match_2. */
4238 /* This converts PTR, a pointer into one of the search strings `string1'
4239 and `string2' into an offset from the beginning of that string. */
4240 #define POINTER_TO_OFFSET(ptr) \
4241 (FIRST_STRING_P (ptr) \
4242 ? ((regoff_t) ((ptr) - string1)) \
4243 : ((regoff_t) ((ptr) - string2 + size1)))
4245 /* Macros for dealing with the split strings in re_match_2. */
4247 #define MATCHING_IN_FIRST_STRING (dend == end_match_1)
4249 /* Call before fetching a character with *d. This switches over to
4250 string2 if necessary. */
4251 #define REGEX_PREFETCH() \
4254 /* End of string2 => fail. */ \
4255 if (dend == end_match_2) \
4257 /* End of string1 => advance to string2. */ \
4259 dend = end_match_2; \
4263 /* Test if at very beginning or at very end of the virtual concatenation
4264 of `string1' and `string2'. If only one string, it's `string2'. */
4265 #define AT_STRINGS_BEG(d) ((d) == (size1 ? string1 : string2) || !size2)
4266 #define AT_STRINGS_END(d) ((d) == end2)
4269 If the given position straddles the string gap, return the equivalent
4270 position that is before or after the gap, respectively; otherwise,
4271 return the same position. */
4272 #define POS_BEFORE_GAP_UNSAFE(d) ((d) == string2 ? end1 : (d))
4273 #define POS_AFTER_GAP_UNSAFE(d) ((d) == end1 ? string2 : (d))
4275 /* Test if CH is a word-constituent character. (XEmacs change) */
4277 #define WORDCHAR_P_UNSAFE(ch) \
4278 (SYNTAX_UNSAFE (XCHAR_TABLE (regex_emacs_buffer->syntax_table), \
4281 #define WORDCHAR_P_UNSAFE(ch) \
4282 (SYNTAX_UNSAFE (XCHAR_TABLE (regex_emacs_buffer->mirror_syntax_table), \
4286 /* Free everything we malloc. */
4287 #ifdef MATCH_MAY_ALLOCATE
4288 #define FREE_VAR(var) if (var) REGEX_FREE (var); var = NULL
4289 #define FREE_VARIABLES() \
4291 REGEX_FREE_STACK (fail_stack.stack); \
4292 FREE_VAR (regstart); \
4293 FREE_VAR (regend); \
4294 FREE_VAR (old_regstart); \
4295 FREE_VAR (old_regend); \
4296 FREE_VAR (best_regstart); \
4297 FREE_VAR (best_regend); \
4298 FREE_VAR (reg_info); \
4299 FREE_VAR (reg_dummy); \
4300 FREE_VAR (reg_info_dummy); \
4302 #else /* not MATCH_MAY_ALLOCATE */
4303 #define FREE_VARIABLES() ((void)0) /* Do nothing! But inhibit gcc warning. */
4304 #endif /* MATCH_MAY_ALLOCATE */
4306 /* These values must meet several constraints. They must not be valid
4307 register values; since we have a limit of 255 registers (because
4308 we use only one byte in the pattern for the register number), we can
4309 use numbers larger than 255. They must differ by 1, because of
4310 NUM_FAILURE_ITEMS above. And the value for the lowest register must
4311 be larger than the value for the highest register, so we do not try
4312 to actually save any registers when none are active. */
4313 #define NO_HIGHEST_ACTIVE_REG (1 << BYTEWIDTH)
4314 #define NO_LOWEST_ACTIVE_REG (NO_HIGHEST_ACTIVE_REG + 1)
4316 /* Matching routines. */
4318 #ifndef emacs /* Emacs never uses this. */
4319 /* re_match is like re_match_2 except it takes only a single string. */
4322 re_match (struct re_pattern_buffer *bufp, const char *string, int size,
4323 int pos, struct re_registers *regs)
4325 int result = re_match_2_internal (bufp, NULL, 0, (re_char *) string, size,
4330 #endif /* not emacs */
4333 /* re_match_2 matches the compiled pattern in BUFP against the
4334 (virtual) concatenation of STRING1 and STRING2 (of length SIZE1 and
4335 SIZE2, respectively). We start matching at POS, and stop matching
4338 If REGS is non-null and the `no_sub' field of BUFP is nonzero, we
4339 store offsets for the substring each group matched in REGS. See the
4340 documentation for exactly how many groups we fill.
4342 We return -1 if no match, -2 if an internal error (such as the
4343 failure stack overflowing). Otherwise, we return the length of the
4344 matched substring. */
4347 re_match_2 (struct re_pattern_buffer *bufp, const char *string1,
4348 int size1, const char *string2, int size2, int pos,
4349 struct re_registers *regs, int stop)
4354 SETUP_SYNTAX_CACHE_FOR_OBJECT (regex_match_object,
4356 SYNTAX_CACHE_OBJECT_BYTE_TO_CHAR (regex_match_object,
4362 result = re_match_2_internal (bufp, (re_char *) string1, size1,
4363 (re_char *) string2, size2,
4370 /* This is a separate function so that we can force an alloca cleanup
4373 re_match_2_internal (struct re_pattern_buffer *bufp, re_char *string1,
4374 int size1, re_char *string2, int size2, int pos,
4375 struct re_registers *regs, int stop)
4377 /* General temporaries. */
4380 int should_succeed; /* XEmacs change */
4382 /* Just past the end of the corresponding string. */
4383 re_char *end1, *end2;
4385 /* Pointers into string1 and string2, just past the last characters in
4386 each to consider matching. */
4387 re_char *end_match_1, *end_match_2;
4389 /* Where we are in the data, and the end of the current string. */
4392 /* Where we are in the pattern, and the end of the pattern. */
4393 unsigned char *p = bufp->buffer;
4394 REGISTER unsigned char *pend = p + bufp->used;
4396 /* Mark the opcode just after a start_memory, so we can test for an
4397 empty subpattern when we get to the stop_memory. */
4398 re_char *just_past_start_mem = 0;
4400 /* We use this to map every character in the string. */
4401 RE_TRANSLATE_TYPE translate = bufp->translate;
4403 /* Failure point stack. Each place that can handle a failure further
4404 down the line pushes a failure point on this stack. It consists of
4405 restart, regend, and reg_info for all registers corresponding to
4406 the subexpressions we're currently inside, plus the number of such
4407 registers, and, finally, two char *'s. The first char * is where
4408 to resume scanning the pattern; the second one is where to resume
4409 scanning the strings. If the latter is zero, the failure point is
4410 a ``dummy''; if a failure happens and the failure point is a dummy,
4411 it gets discarded and the next one is tried. */
4412 #ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
4413 fail_stack_type fail_stack;
4416 static unsigned failure_id;
4417 int nfailure_points_pushed = 0, nfailure_points_popped = 0;
4421 /* This holds the pointer to the failure stack, when
4422 it is allocated relocatably. */
4423 fail_stack_elt_t *failure_stack_ptr;
4426 /* We fill all the registers internally, independent of what we
4427 return, for use in backreferences. The number here includes
4428 an element for register zero. */
4429 int num_regs = bufp->re_nsub + 1;
4431 /* The currently active registers. */
4432 int lowest_active_reg = NO_LOWEST_ACTIVE_REG;
4433 int highest_active_reg = NO_HIGHEST_ACTIVE_REG;
4435 /* Information on the contents of registers. These are pointers into
4436 the input strings; they record just what was matched (on this
4437 attempt) by a subexpression part of the pattern, that is, the
4438 regnum-th regstart pointer points to where in the pattern we began
4439 matching and the regnum-th regend points to right after where we
4440 stopped matching the regnum-th subexpression. (The zeroth register
4441 keeps track of what the whole pattern matches.) */
4442 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
4443 re_char **regstart, **regend;
4446 /* If a group that's operated upon by a repetition operator fails to
4447 match anything, then the register for its start will need to be
4448 restored because it will have been set to wherever in the string we
4449 are when we last see its open-group operator. Similarly for a
4451 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
4452 re_char **old_regstart, **old_regend;
4455 /* The is_active field of reg_info helps us keep track of which (possibly
4456 nested) subexpressions we are currently in. The matched_something
4457 field of reg_info[reg_num] helps us tell whether or not we have
4458 matched any of the pattern so far this time through the reg_num-th
4459 subexpression. These two fields get reset each time through any
4460 loop their register is in. */
4461 #ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
4462 register_info_type *reg_info;
4465 /* The following record the register info as found in the above
4466 variables when we find a match better than any we've seen before.
4467 This happens as we backtrack through the failure points, which in
4468 turn happens only if we have not yet matched the entire string. */
4469 unsigned best_regs_set = false;
4470 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
4471 re_char **best_regstart, **best_regend;
4474 /* Logically, this is `best_regend[0]'. But we don't want to have to
4475 allocate space for that if we're not allocating space for anything
4476 else (see below). Also, we never need info about register 0 for
4477 any of the other register vectors, and it seems rather a kludge to
4478 treat `best_regend' differently than the rest. So we keep track of
4479 the end of the best match so far in a separate variable. We
4480 initialize this to NULL so that when we backtrack the first time
4481 and need to test it, it's not garbage. */
4482 re_char *match_end = NULL;
4484 /* This helps SET_REGS_MATCHED avoid doing redundant work. */
4485 int set_regs_matched_done = 0;
4487 /* Used when we pop values we don't care about. */
4488 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
4489 re_char **reg_dummy;
4490 register_info_type *reg_info_dummy;
4494 /* Counts the total number of registers pushed. */
4495 unsigned num_regs_pushed = 0;
4498 /* 1 if this match ends in the same string (string1 or string2)
4499 as the best previous match. */
4502 /* 1 if this match is the best seen so far. */
4503 re_bool best_match_p;
4505 DEBUG_PRINT1 ("\n\nEntering re_match_2.\n");
4509 #ifdef MATCH_MAY_ALLOCATE
4510 /* Do not bother to initialize all the register variables if there are
4511 no groups in the pattern, as it takes a fair amount of time. If
4512 there are groups, we include space for register 0 (the whole
4513 pattern), even though we never use it, since it simplifies the
4514 array indexing. We should fix this. */
4517 regstart = REGEX_TALLOC (num_regs, re_char *);
4518 regend = REGEX_TALLOC (num_regs, re_char *);
4519 old_regstart = REGEX_TALLOC (num_regs, re_char *);
4520 old_regend = REGEX_TALLOC (num_regs, re_char *);
4521 best_regstart = REGEX_TALLOC (num_regs, re_char *);
4522 best_regend = REGEX_TALLOC (num_regs, re_char *);
4523 reg_info = REGEX_TALLOC (num_regs, register_info_type);
4524 reg_dummy = REGEX_TALLOC (num_regs, re_char *);
4525 reg_info_dummy = REGEX_TALLOC (num_regs, register_info_type);
4527 if (!(regstart && regend && old_regstart && old_regend && reg_info
4528 && best_regstart && best_regend && reg_dummy && reg_info_dummy))
4536 /* We must initialize all our variables to NULL, so that
4537 `FREE_VARIABLES' doesn't try to free them. */
4538 regstart = regend = old_regstart = old_regend = best_regstart
4539 = best_regend = reg_dummy = NULL;
4540 reg_info = reg_info_dummy = (register_info_type *) NULL;
4542 #endif /* MATCH_MAY_ALLOCATE */
4544 /* The starting position is bogus. */
4545 if (pos < 0 || pos > size1 + size2)
4551 /* Initialize subexpression text positions to -1 to mark ones that no
4552 start_memory/stop_memory has been seen for. Also initialize the
4553 register information struct. */
4554 for (mcnt = 1; mcnt < num_regs; mcnt++)
4556 regstart[mcnt] = regend[mcnt]
4557 = old_regstart[mcnt] = old_regend[mcnt] = REG_UNSET_VALUE;
4559 REG_MATCH_NULL_STRING_P (reg_info[mcnt]) = MATCH_NULL_UNSET_VALUE;
4560 IS_ACTIVE (reg_info[mcnt]) = 0;
4561 MATCHED_SOMETHING (reg_info[mcnt]) = 0;
4562 EVER_MATCHED_SOMETHING (reg_info[mcnt]) = 0;
4564 /* We move `string1' into `string2' if the latter's empty -- but not if
4565 `string1' is null. */
4566 if (size2 == 0 && string1 != NULL)
4573 end1 = string1 + size1;
4574 end2 = string2 + size2;
4576 /* Compute where to stop matching, within the two strings. */
4579 end_match_1 = string1 + stop;
4580 end_match_2 = string2;
4585 end_match_2 = string2 + stop - size1;
4588 /* `p' scans through the pattern as `d' scans through the data.
4589 `dend' is the end of the input string that `d' points within. `d'
4590 is advanced into the following input string whenever necessary, but
4591 this happens before fetching; therefore, at the beginning of the
4592 loop, `d' can be pointing at the end of a string, but it cannot
4594 if (size1 > 0 && pos <= size1)
4601 d = string2 + pos - size1;
4605 DEBUG_PRINT1 ("The compiled pattern is: \n");
4606 DEBUG_PRINT_COMPILED_PATTERN (bufp, p, pend);
4607 DEBUG_PRINT1 ("The string to match is: `");
4608 DEBUG_PRINT_DOUBLE_STRING (d, string1, size1, string2, size2);
4609 DEBUG_PRINT1 ("'\n");
4611 /* This loops over pattern commands. It exits by returning from the
4612 function if the match is complete, or it drops through if the match
4613 fails at this starting point in the input data. */
4616 DEBUG_PRINT2 ("\n0x%lx: ", (long) p);
4617 #ifdef emacs /* XEmacs added, w/removal of immediate_quit */
4618 if (!no_quit_in_re_search)
4623 { /* End of pattern means we might have succeeded. */
4624 DEBUG_PRINT1 ("end of pattern ... ");
4626 /* If we haven't matched the entire string, and we want the
4627 longest match, try backtracking. */
4628 if (d != end_match_2)
4630 same_str_p = (FIRST_STRING_P (match_end)
4631 == MATCHING_IN_FIRST_STRING);
4633 /* AIX compiler got confused when this was combined
4634 with the previous declaration. */
4636 best_match_p = d > match_end;
4638 best_match_p = !MATCHING_IN_FIRST_STRING;
4640 DEBUG_PRINT1 ("backtracking.\n");
4642 if (!FAIL_STACK_EMPTY ())
4643 { /* More failure points to try. */
4645 /* If exceeds best match so far, save it. */
4646 if (!best_regs_set || best_match_p)
4648 best_regs_set = true;
4651 DEBUG_PRINT1 ("\nSAVING match as best so far.\n");
4653 for (mcnt = 1; mcnt < num_regs; mcnt++)
4655 best_regstart[mcnt] = regstart[mcnt];
4656 best_regend[mcnt] = regend[mcnt];
4662 /* If no failure points, don't restore garbage. And if
4663 last match is real best match, don't restore second
4665 else if (best_regs_set && !best_match_p)
4668 /* Restore best match. It may happen that `dend ==
4669 end_match_1' while the restored d is in string2.
4670 For example, the pattern `x.*y.*z' against the
4671 strings `x-' and `y-z-', if the two strings are
4672 not consecutive in memory. */
4673 DEBUG_PRINT1 ("Restoring best registers.\n");
4676 dend = ((d >= string1 && d <= end1)
4677 ? end_match_1 : end_match_2);
4679 for (mcnt = 1; mcnt < num_regs; mcnt++)
4681 regstart[mcnt] = best_regstart[mcnt];
4682 regend[mcnt] = best_regend[mcnt];
4685 } /* d != end_match_2 */
4688 DEBUG_PRINT1 ("Accepting match.\n");
4690 /* If caller wants register contents data back, do it. */
4691 if (regs && !bufp->no_sub)
4693 /* Have the register data arrays been allocated? */
4694 if (bufp->regs_allocated == REGS_UNALLOCATED)
4695 { /* No. So allocate them with malloc. We need one
4696 extra element beyond `num_regs' for the `-1' marker
4698 regs->num_regs = MAX (RE_NREGS, num_regs + 1);
4699 regs->start = TALLOC (regs->num_regs, regoff_t);
4700 regs->end = TALLOC (regs->num_regs, regoff_t);
4701 if (regs->start == NULL || regs->end == NULL)
4706 bufp->regs_allocated = REGS_REALLOCATE;
4708 else if (bufp->regs_allocated == REGS_REALLOCATE)
4709 { /* Yes. If we need more elements than were already
4710 allocated, reallocate them. If we need fewer, just
4712 if (regs->num_regs < num_regs + 1)
4714 regs->num_regs = num_regs + 1;
4715 RETALLOC (regs->start, regs->num_regs, regoff_t);
4716 RETALLOC (regs->end, regs->num_regs, regoff_t);
4717 if (regs->start == NULL || regs->end == NULL)
4726 /* These braces fend off a "empty body in an else-statement"
4727 warning under GCC when assert expands to nothing. */
4728 assert (bufp->regs_allocated == REGS_FIXED);
4731 /* Convert the pointer data in `regstart' and `regend' to
4732 indices. Register zero has to be set differently,
4733 since we haven't kept track of any info for it. */
4734 if (regs->num_regs > 0)
4736 regs->start[0] = pos;
4737 regs->end[0] = (MATCHING_IN_FIRST_STRING
4738 ? ((regoff_t) (d - string1))
4739 : ((regoff_t) (d - string2 + size1)));
4742 /* Go through the first `min (num_regs, regs->num_regs)'
4743 registers, since that is all we initialized. */
4744 for (mcnt = 1; mcnt < MIN (num_regs, regs->num_regs); mcnt++)
4746 if (REG_UNSET (regstart[mcnt]) || REG_UNSET (regend[mcnt]))
4747 regs->start[mcnt] = regs->end[mcnt] = -1;
4751 = (regoff_t) POINTER_TO_OFFSET (regstart[mcnt]);
4753 = (regoff_t) POINTER_TO_OFFSET (regend[mcnt]);
4756 } /* regs && !bufp->no_sub */
4758 /* If we have regs and the regs structure has more elements than
4759 were in the pattern, set the extra elements to -1. If we
4760 (re)allocated the registers, this is the case, because we
4761 always allocate enough to have at least one -1 at the end.
4763 We do this even when no_sub is set because some applications
4764 (XEmacs) reuse register structures which may contain stale
4765 information, and permit attempts to access those registers.
4767 It would be possible to require the caller to do this, but we'd
4768 have to change the API for this function to reflect that, and
4769 audit all callers. */
4770 if (regs && regs->num_regs > 0)
4771 for (mcnt = num_regs; mcnt < regs->num_regs; mcnt++)
4772 regs->start[mcnt] = regs->end[mcnt] = -1;
4774 DEBUG_PRINT4 ("%u failure points pushed, %u popped (%u remain).\n",
4775 nfailure_points_pushed, nfailure_points_popped,
4776 nfailure_points_pushed - nfailure_points_popped);
4777 DEBUG_PRINT2 ("%u registers pushed.\n", num_regs_pushed);
4779 mcnt = d - pos - (MATCHING_IN_FIRST_STRING
4783 DEBUG_PRINT2 ("Returning %d from re_match_2.\n", mcnt);
4789 /* Otherwise match next pattern command. */
4790 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
4792 /* Ignore these. Used to ignore the n of succeed_n's which
4793 currently have n == 0. */
4795 DEBUG_PRINT1 ("EXECUTING no_op.\n");
4799 DEBUG_PRINT1 ("EXECUTING succeed.\n");
4802 /* Match the next n pattern characters exactly. The following
4803 byte in the pattern defines n, and the n bytes after that
4804 are the characters to match. */
4807 DEBUG_PRINT2 ("EXECUTING exactn %d.\n", mcnt);
4809 /* This is written out as an if-else so we don't waste time
4810 testing `translate' inside the loop. */
4811 if (TRANSLATE_P (translate))
4816 Emchar pat_ch, buf_ch;
4820 pat_ch = charptr_emchar (p);
4821 buf_ch = charptr_emchar (d);
4822 if (RE_TRANSLATE (buf_ch) != pat_ch)
4825 pat_len = charcount_to_bytecount (p, 1);
4830 #else /* not MULE */
4832 if ((unsigned char) RE_TRANSLATE (*d++) != *p++)
4844 if (*d++ != *p++) goto fail;
4848 SET_REGS_MATCHED ();
4852 /* Match any character except possibly a newline or a null. */
4854 DEBUG_PRINT1 ("EXECUTING anychar.\n");
4858 if ((!(bufp->syntax & RE_DOT_NEWLINE) && TRANSLATE (*d) == '\n')
4859 || (bufp->syntax & RE_DOT_NOT_NULL && TRANSLATE (*d) == '\000'))
4862 SET_REGS_MATCHED ();
4863 DEBUG_PRINT2 (" Matched `%d'.\n", *d);
4864 INC_CHARPTR (d); /* XEmacs change */
4871 REGISTER unsigned char c;
4872 re_bool not_p = (re_opcode_t) *(p - 1) == charset_not;
4874 DEBUG_PRINT2 ("EXECUTING charset%s.\n", not_p ? "_not" : "");
4877 c = TRANSLATE (*d); /* The character to match. */
4879 /* Cast to `unsigned' instead of `unsigned char' in case the
4880 bit list is a full 32 bytes long. */
4881 if (c < (unsigned) (*p * BYTEWIDTH)
4882 && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
4887 if (!not_p) goto fail;
4889 SET_REGS_MATCHED ();
4890 INC_CHARPTR (d); /* XEmacs change */
4896 case charset_mule_not:
4899 re_bool not_p = (re_opcode_t) *(p - 1) == charset_mule_not;
4901 DEBUG_PRINT2 ("EXECUTING charset_mule%s.\n", not_p ? "_not" : "");
4904 c = charptr_emchar ((const Bufbyte *) d);
4905 c = TRANSLATE_EXTENDED_UNSAFE (c); /* The character to match. */
4907 if (EQ (Qt, unified_range_table_lookup (p, c, Qnil)))
4910 p += unified_range_table_bytes_used (p);
4912 if (!not_p) goto fail;
4914 SET_REGS_MATCHED ();
4921 /* The beginning of a group is represented by start_memory.
4922 The arguments are the register number in the next byte, and the
4923 number of groups inner to this one in the next. The text
4924 matched within the group is recorded (in the internal
4925 registers data structure) under the register number. */
4927 DEBUG_PRINT3 ("EXECUTING start_memory %d (%d):\n", *p, p[1]);
4929 /* Find out if this group can match the empty string. */
4930 p1 = p; /* To send to group_match_null_string_p. */
4932 if (REG_MATCH_NULL_STRING_P (reg_info[*p]) == MATCH_NULL_UNSET_VALUE)
4933 REG_MATCH_NULL_STRING_P (reg_info[*p])
4934 = group_match_null_string_p (&p1, pend, reg_info);
4936 /* Save the position in the string where we were the last time
4937 we were at this open-group operator in case the group is
4938 operated upon by a repetition operator, e.g., with `(a*)*b'
4939 against `ab'; then we want to ignore where we are now in
4940 the string in case this attempt to match fails. */
4941 old_regstart[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p])
4942 ? REG_UNSET (regstart[*p]) ? d : regstart[*p]
4944 DEBUG_PRINT2 (" old_regstart: %d\n",
4945 POINTER_TO_OFFSET (old_regstart[*p]));
4948 DEBUG_PRINT2 (" regstart: %d\n", POINTER_TO_OFFSET (regstart[*p]));
4950 IS_ACTIVE (reg_info[*p]) = 1;
4951 MATCHED_SOMETHING (reg_info[*p]) = 0;
4953 /* Clear this whenever we change the register activity status. */
4954 set_regs_matched_done = 0;
4956 /* This is the new highest active register. */
4957 highest_active_reg = *p;
4959 /* If nothing was active before, this is the new lowest active
4961 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG)
4962 lowest_active_reg = *p;
4964 /* Move past the register number and inner group count. */
4966 just_past_start_mem = p;
4971 /* The stop_memory opcode represents the end of a group. Its
4972 arguments are the same as start_memory's: the register
4973 number, and the number of inner groups. */
4975 DEBUG_PRINT3 ("EXECUTING stop_memory %d (%d):\n", *p, p[1]);
4977 /* We need to save the string position the last time we were at
4978 this close-group operator in case the group is operated
4979 upon by a repetition operator, e.g., with `((a*)*(b*)*)*'
4980 against `aba'; then we want to ignore where we are now in
4981 the string in case this attempt to match fails. */
4982 old_regend[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p])
4983 ? REG_UNSET (regend[*p]) ? d : regend[*p]
4985 DEBUG_PRINT2 (" old_regend: %d\n",
4986 POINTER_TO_OFFSET (old_regend[*p]));
4989 DEBUG_PRINT2 (" regend: %d\n", POINTER_TO_OFFSET (regend[*p]));
4991 /* This register isn't active anymore. */
4992 IS_ACTIVE (reg_info[*p]) = 0;
4994 /* Clear this whenever we change the register activity status. */
4995 set_regs_matched_done = 0;
4997 /* If this was the only register active, nothing is active
4999 if (lowest_active_reg == highest_active_reg)
5001 lowest_active_reg = NO_LOWEST_ACTIVE_REG;
5002 highest_active_reg = NO_HIGHEST_ACTIVE_REG;
5005 { /* We must scan for the new highest active register, since
5006 it isn't necessarily one less than now: consider
5007 (a(b)c(d(e)f)g). When group 3 ends, after the f), the
5008 new highest active register is 1. */
5009 unsigned char r = *p - 1;
5010 while (r > 0 && !IS_ACTIVE (reg_info[r]))
5013 /* If we end up at register zero, that means that we saved
5014 the registers as the result of an `on_failure_jump', not
5015 a `start_memory', and we jumped to past the innermost
5016 `stop_memory'. For example, in ((.)*) we save
5017 registers 1 and 2 as a result of the *, but when we pop
5018 back to the second ), we are at the stop_memory 1.
5019 Thus, nothing is active. */
5022 lowest_active_reg = NO_LOWEST_ACTIVE_REG;
5023 highest_active_reg = NO_HIGHEST_ACTIVE_REG;
5027 highest_active_reg = r;
5029 /* 98/9/21 jhod: We've also gotta set lowest_active_reg, don't we? */
5031 while (r < highest_active_reg && !IS_ACTIVE(reg_info[r]))
5033 lowest_active_reg = r;
5037 /* If just failed to match something this time around with a
5038 group that's operated on by a repetition operator, try to
5039 force exit from the ``loop'', and restore the register
5040 information for this group that we had before trying this
5042 if ((!MATCHED_SOMETHING (reg_info[*p])
5043 || just_past_start_mem == p - 1)
5046 re_bool is_a_jump_n = false;
5050 switch ((re_opcode_t) *p1++)
5054 case pop_failure_jump:
5055 case maybe_pop_jump:
5057 case dummy_failure_jump:
5058 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
5068 /* If the next operation is a jump backwards in the pattern
5069 to an on_failure_jump right before the start_memory
5070 corresponding to this stop_memory, exit from the loop
5071 by forcing a failure after pushing on the stack the
5072 on_failure_jump's jump in the pattern, and d. */
5073 if (mcnt < 0 && (re_opcode_t) *p1 == on_failure_jump
5074 && (re_opcode_t) p1[3] == start_memory && p1[4] == *p)
5076 /* If this group ever matched anything, then restore
5077 what its registers were before trying this last
5078 failed match, e.g., with `(a*)*b' against `ab' for
5079 regstart[1], and, e.g., with `((a*)*(b*)*)*'
5080 against `aba' for regend[3].
5082 Also restore the registers for inner groups for,
5083 e.g., `((a*)(b*))*' against `aba' (register 3 would
5084 otherwise get trashed). */
5086 if (EVER_MATCHED_SOMETHING (reg_info[*p]))
5090 EVER_MATCHED_SOMETHING (reg_info[*p]) = 0;
5092 /* Restore this and inner groups' (if any) registers. */
5093 for (r = *p; r < *p + *(p + 1); r++)
5095 regstart[r] = old_regstart[r];
5097 /* xx why this test? */
5098 if (old_regend[r] >= regstart[r])
5099 regend[r] = old_regend[r];
5103 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
5104 PUSH_FAILURE_POINT (p1 + mcnt, d, -2);
5110 /* Move past the register number and the inner group count. */
5115 /* \<digit> has been turned into a `duplicate' command which is
5116 followed by the numeric value of <digit> as the register number. */
5119 REGISTER re_char *d2, *dend2;
5120 int regno = *p++; /* Get which register to match against. */
5121 DEBUG_PRINT2 ("EXECUTING duplicate %d.\n", regno);
5123 /* Can't back reference a group which we've never matched. */
5124 if (REG_UNSET (regstart[regno]) || REG_UNSET (regend[regno]))
5127 /* Where in input to try to start matching. */
5128 d2 = regstart[regno];
5130 /* Where to stop matching; if both the place to start and
5131 the place to stop matching are in the same string, then
5132 set to the place to stop, otherwise, for now have to use
5133 the end of the first string. */
5135 dend2 = ((FIRST_STRING_P (regstart[regno])
5136 == FIRST_STRING_P (regend[regno]))
5137 ? regend[regno] : end_match_1);
5140 /* If necessary, advance to next segment in register
5144 if (dend2 == end_match_2) break;
5145 if (dend2 == regend[regno]) break;
5147 /* End of string1 => advance to string2. */
5149 dend2 = regend[regno];
5151 /* At end of register contents => success */
5152 if (d2 == dend2) break;
5154 /* If necessary, advance to next segment in data. */
5157 /* How many characters left in this segment to match. */
5160 /* Want how many consecutive characters we can match in
5161 one shot, so, if necessary, adjust the count. */
5162 if (mcnt > dend2 - d2)
5165 /* Compare that many; failure if mismatch, else move
5167 if (TRANSLATE_P (translate)
5168 ? bcmp_translate ((unsigned char *) d,
5169 (unsigned char *) d2, mcnt, translate)
5170 : memcmp (d, d2, mcnt))
5172 d += mcnt, d2 += mcnt;
5174 /* Do this because we've match some characters. */
5175 SET_REGS_MATCHED ();
5181 /* begline matches the empty string at the beginning of the string
5182 (unless `not_bol' is set in `bufp'), and, if
5183 `newline_anchor' is set, after newlines. */
5185 DEBUG_PRINT1 ("EXECUTING begline.\n");
5187 if (AT_STRINGS_BEG (d))
5189 if (!bufp->not_bol) break;
5191 else if (d[-1] == '\n' && bufp->newline_anchor)
5195 /* In all other cases, we fail. */
5199 /* endline is the dual of begline. */
5201 DEBUG_PRINT1 ("EXECUTING endline.\n");
5203 if (AT_STRINGS_END (d))
5205 if (!bufp->not_eol) break;
5208 /* We have to ``prefetch'' the next character. */
5209 else if ((d == end1 ? *string2 : *d) == '\n'
5210 && bufp->newline_anchor)
5217 /* Match at the very beginning of the data. */
5219 DEBUG_PRINT1 ("EXECUTING begbuf.\n");
5220 if (AT_STRINGS_BEG (d))
5225 /* Match at the very end of the data. */
5227 DEBUG_PRINT1 ("EXECUTING endbuf.\n");
5228 if (AT_STRINGS_END (d))
5233 /* on_failure_keep_string_jump is used to optimize `.*\n'. It
5234 pushes NULL as the value for the string on the stack. Then
5235 `pop_failure_point' will keep the current value for the
5236 string, instead of restoring it. To see why, consider
5237 matching `foo\nbar' against `.*\n'. The .* matches the foo;
5238 then the . fails against the \n. But the next thing we want
5239 to do is match the \n against the \n; if we restored the
5240 string value, we would be back at the foo.
5242 Because this is used only in specific cases, we don't need to
5243 check all the things that `on_failure_jump' does, to make
5244 sure the right things get saved on the stack. Hence we don't
5245 share its code. The only reason to push anything on the
5246 stack at all is that otherwise we would have to change
5247 `anychar's code to do something besides goto fail in this
5248 case; that seems worse than this. */
5249 case on_failure_keep_string_jump:
5250 DEBUG_PRINT1 ("EXECUTING on_failure_keep_string_jump");
5252 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5253 DEBUG_PRINT3 (" %d (to 0x%lx):\n", mcnt, (long) (p + mcnt));
5255 PUSH_FAILURE_POINT (p + mcnt, (unsigned char *) 0, -2);
5259 /* Uses of on_failure_jump:
5261 Each alternative starts with an on_failure_jump that points
5262 to the beginning of the next alternative. Each alternative
5263 except the last ends with a jump that in effect jumps past
5264 the rest of the alternatives. (They really jump to the
5265 ending jump of the following alternative, because tensioning
5266 these jumps is a hassle.)
5268 Repeats start with an on_failure_jump that points past both
5269 the repetition text and either the following jump or
5270 pop_failure_jump back to this on_failure_jump. */
5271 case on_failure_jump:
5273 DEBUG_PRINT1 ("EXECUTING on_failure_jump");
5275 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5276 DEBUG_PRINT3 (" %d (to 0x%lx)", mcnt, (long) (p + mcnt));
5278 /* If this on_failure_jump comes right before a group (i.e.,
5279 the original * applied to a group), save the information
5280 for that group and all inner ones, so that if we fail back
5281 to this point, the group's information will be correct.
5282 For example, in \(a*\)*\1, we need the preceding group,
5283 and in \(\(a*\)b*\)\2, we need the inner group. */
5285 /* We can't use `p' to check ahead because we push
5286 a failure point to `p + mcnt' after we do this. */
5289 /* We need to skip no_op's before we look for the
5290 start_memory in case this on_failure_jump is happening as
5291 the result of a completed succeed_n, as in \(a\)\{1,3\}b\1
5293 while (p1 < pend && (re_opcode_t) *p1 == no_op)
5296 if (p1 < pend && (re_opcode_t) *p1 == start_memory)
5298 /* We have a new highest active register now. This will
5299 get reset at the start_memory we are about to get to,
5300 but we will have saved all the registers relevant to
5301 this repetition op, as described above. */
5302 highest_active_reg = *(p1 + 1) + *(p1 + 2);
5303 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG)
5304 lowest_active_reg = *(p1 + 1);
5307 DEBUG_PRINT1 (":\n");
5308 PUSH_FAILURE_POINT (p + mcnt, d, -2);
5312 /* A smart repeat ends with `maybe_pop_jump'.
5313 We change it to either `pop_failure_jump' or `jump'. */
5314 case maybe_pop_jump:
5315 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5316 DEBUG_PRINT2 ("EXECUTING maybe_pop_jump %d.\n", mcnt);
5318 REGISTER unsigned char *p2 = p;
5320 /* Compare the beginning of the repeat with what in the
5321 pattern follows its end. If we can establish that there
5322 is nothing that they would both match, i.e., that we
5323 would have to backtrack because of (as in, e.g., `a*a')
5324 then we can change to pop_failure_jump, because we'll
5325 never have to backtrack.
5327 This is not true in the case of alternatives: in
5328 `(a|ab)*' we do need to backtrack to the `ab' alternative
5329 (e.g., if the string was `ab'). But instead of trying to
5330 detect that here, the alternative has put on a dummy
5331 failure point which is what we will end up popping. */
5333 /* Skip over open/close-group commands.
5334 If what follows this loop is a ...+ construct,
5335 look at what begins its body, since we will have to
5336 match at least one of that. */
5340 && ((re_opcode_t) *p2 == stop_memory
5341 || (re_opcode_t) *p2 == start_memory))
5343 else if (p2 + 6 < pend
5344 && (re_opcode_t) *p2 == dummy_failure_jump)
5351 /* p1[0] ... p1[2] are the `on_failure_jump' corresponding
5352 to the `maybe_finalize_jump' of this case. Examine what
5355 /* If we're at the end of the pattern, we can change. */
5358 /* Consider what happens when matching ":\(.*\)"
5359 against ":/". I don't really understand this code
5361 p[-3] = (unsigned char) pop_failure_jump;
5363 (" End of pattern: change to `pop_failure_jump'.\n");
5366 else if ((re_opcode_t) *p2 == exactn
5367 || (bufp->newline_anchor && (re_opcode_t) *p2 == endline))
5369 REGISTER unsigned char c
5370 = *p2 == (unsigned char) endline ? '\n' : p2[2];
5372 if ((re_opcode_t) p1[3] == exactn && p1[5] != c)
5374 p[-3] = (unsigned char) pop_failure_jump;
5375 DEBUG_PRINT3 (" %c != %c => pop_failure_jump.\n",
5379 else if ((re_opcode_t) p1[3] == charset
5380 || (re_opcode_t) p1[3] == charset_not)
5382 int not_p = (re_opcode_t) p1[3] == charset_not;
5384 if (c < (unsigned char) (p1[4] * BYTEWIDTH)
5385 && p1[5 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
5388 /* `not_p' is equal to 1 if c would match, which means
5389 that we can't change to pop_failure_jump. */
5392 p[-3] = (unsigned char) pop_failure_jump;
5393 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
5397 else if ((re_opcode_t) *p2 == charset)
5400 REGISTER unsigned char c
5401 = *p2 == (unsigned char) endline ? '\n' : p2[2];
5404 if ((re_opcode_t) p1[3] == exactn
5405 && ! ((int) p2[1] * BYTEWIDTH > (int) p1[5]
5406 && (p2[2 + p1[5] / BYTEWIDTH]
5407 & (1 << (p1[5] % BYTEWIDTH)))))
5409 p[-3] = (unsigned char) pop_failure_jump;
5410 DEBUG_PRINT3 (" %c != %c => pop_failure_jump.\n",
5414 else if ((re_opcode_t) p1[3] == charset_not)
5417 /* We win if the charset_not inside the loop
5418 lists every character listed in the charset after. */
5419 for (idx = 0; idx < (int) p2[1]; idx++)
5420 if (! (p2[2 + idx] == 0
5421 || (idx < (int) p1[4]
5422 && ((p2[2 + idx] & ~ p1[5 + idx]) == 0))))
5427 p[-3] = (unsigned char) pop_failure_jump;
5428 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
5431 else if ((re_opcode_t) p1[3] == charset)
5434 /* We win if the charset inside the loop
5435 has no overlap with the one after the loop. */
5437 idx < (int) p2[1] && idx < (int) p1[4];
5439 if ((p2[2 + idx] & p1[5 + idx]) != 0)
5442 if (idx == p2[1] || idx == p1[4])
5444 p[-3] = (unsigned char) pop_failure_jump;
5445 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
5450 p -= 2; /* Point at relative address again. */
5451 if ((re_opcode_t) p[-1] != pop_failure_jump)
5453 p[-1] = (unsigned char) jump;
5454 DEBUG_PRINT1 (" Match => jump.\n");
5455 goto unconditional_jump;
5457 /* Note fall through. */
5460 /* The end of a simple repeat has a pop_failure_jump back to
5461 its matching on_failure_jump, where the latter will push a
5462 failure point. The pop_failure_jump takes off failure
5463 points put on by this pop_failure_jump's matching
5464 on_failure_jump; we got through the pattern to here from the
5465 matching on_failure_jump, so didn't fail. */
5466 case pop_failure_jump:
5468 /* We need to pass separate storage for the lowest and
5469 highest registers, even though we don't care about the
5470 actual values. Otherwise, we will restore only one
5471 register from the stack, since lowest will == highest in
5472 `pop_failure_point'. */
5473 int dummy_low_reg, dummy_high_reg;
5474 unsigned char *pdummy;
5475 re_char *sdummy = NULL;
5477 DEBUG_PRINT1 ("EXECUTING pop_failure_jump.\n");
5478 POP_FAILURE_POINT (sdummy, pdummy,
5479 dummy_low_reg, dummy_high_reg,
5480 reg_dummy, reg_dummy, reg_info_dummy);
5482 /* Note fall through. */
5485 /* Unconditionally jump (without popping any failure points). */
5488 EXTRACT_NUMBER_AND_INCR (mcnt, p); /* Get the amount to jump. */
5489 DEBUG_PRINT2 ("EXECUTING jump %d ", mcnt);
5490 p += mcnt; /* Do the jump. */
5491 DEBUG_PRINT2 ("(to 0x%lx).\n", (long) p);
5495 /* We need this opcode so we can detect where alternatives end
5496 in `group_match_null_string_p' et al. */
5498 DEBUG_PRINT1 ("EXECUTING jump_past_alt.\n");
5499 goto unconditional_jump;
5502 /* Normally, the on_failure_jump pushes a failure point, which
5503 then gets popped at pop_failure_jump. We will end up at
5504 pop_failure_jump, also, and with a pattern of, say, `a+', we
5505 are skipping over the on_failure_jump, so we have to push
5506 something meaningless for pop_failure_jump to pop. */
5507 case dummy_failure_jump:
5508 DEBUG_PRINT1 ("EXECUTING dummy_failure_jump.\n");
5509 /* It doesn't matter what we push for the string here. What
5510 the code at `fail' tests is the value for the pattern. */
5511 PUSH_FAILURE_POINT ((unsigned char *) 0, (unsigned char *) 0, -2);
5512 goto unconditional_jump;
5515 /* At the end of an alternative, we need to push a dummy failure
5516 point in case we are followed by a `pop_failure_jump', because
5517 we don't want the failure point for the alternative to be
5518 popped. For example, matching `(a|ab)*' against `aab'
5519 requires that we match the `ab' alternative. */
5520 case push_dummy_failure:
5521 DEBUG_PRINT1 ("EXECUTING push_dummy_failure.\n");
5522 /* See comments just above at `dummy_failure_jump' about the
5524 PUSH_FAILURE_POINT ((unsigned char *) 0, (unsigned char *) 0, -2);
5527 /* Have to succeed matching what follows at least n times.
5528 After that, handle like `on_failure_jump'. */
5530 EXTRACT_NUMBER (mcnt, p + 2);
5531 DEBUG_PRINT2 ("EXECUTING succeed_n %d.\n", mcnt);
5534 /* Originally, this is how many times we HAVE to succeed. */
5539 STORE_NUMBER_AND_INCR (p, mcnt);
5540 DEBUG_PRINT3 (" Setting 0x%lx to %d.\n", (long) p, mcnt);
5544 DEBUG_PRINT2 (" Setting two bytes from 0x%lx to no_op.\n",
5546 p[2] = (unsigned char) no_op;
5547 p[3] = (unsigned char) no_op;
5553 EXTRACT_NUMBER (mcnt, p + 2);
5554 DEBUG_PRINT2 ("EXECUTING jump_n %d.\n", mcnt);
5556 /* Originally, this is how many times we CAN jump. */
5560 STORE_NUMBER (p + 2, mcnt);
5561 goto unconditional_jump;
5563 /* If don't have to jump any more, skip over the rest of command. */
5570 DEBUG_PRINT1 ("EXECUTING set_number_at.\n");
5572 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5574 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5575 DEBUG_PRINT3 (" Setting 0x%lx to %d.\n", (long) p1, mcnt);
5576 STORE_NUMBER (p1, mcnt);
5581 DEBUG_PRINT1 ("EXECUTING wordbound.\n");
5587 if (AT_STRINGS_BEG (d) || AT_STRINGS_END (d))
5591 re_char *d_before = POS_BEFORE_GAP_UNSAFE (d);
5592 re_char *d_after = POS_AFTER_GAP_UNSAFE (d);
5594 /* emch1 is the character before d, syn1 is the syntax of emch1,
5595 emch2 is the character at d, and syn2 is the syntax of emch2. */
5596 Emchar emch1, emch2;
5602 DEC_CHARPTR (d_before);
5603 emch1 = charptr_emchar (d_before);
5604 emch2 = charptr_emchar (d_after);
5607 pos_before = SYNTAX_CACHE_BYTE_TO_CHAR (PTR_TO_OFFSET (d)) - 1;
5608 UPDATE_SYNTAX_CACHE (pos_before);
5610 syn1 = SYNTAX_FROM_CACHE (XCHAR_TABLE (regex_emacs_buffer->mirror_syntax_table),
5613 UPDATE_SYNTAX_CACHE_FORWARD (pos_before + 1);
5615 syn2 = SYNTAX_FROM_CACHE (XCHAR_TABLE (regex_emacs_buffer->mirror_syntax_table),
5618 result = ((syn1 == Sword) != (syn2 == Sword));
5620 if (result == should_succeed)
5626 DEBUG_PRINT1 ("EXECUTING notwordbound.\n");
5628 goto matchwordbound;
5631 DEBUG_PRINT1 ("EXECUTING wordbeg.\n");
5632 if (AT_STRINGS_END (d))
5635 /* XEmacs: this originally read:
5637 if (WORDCHAR_P (d) && (AT_STRINGS_BEG (d) || !WORDCHAR_P (d - 1)))
5641 re_char *dtmp = POS_AFTER_GAP_UNSAFE (d);
5642 Emchar emch = charptr_emchar (dtmp);
5644 int charpos = SYNTAX_CACHE_BYTE_TO_CHAR (PTR_TO_OFFSET (d));
5645 UPDATE_SYNTAX_CACHE (charpos);
5647 if (SYNTAX_FROM_CACHE (XCHAR_TABLE (regex_emacs_buffer->mirror_syntax_table),
5650 if (AT_STRINGS_BEG (d))
5652 dtmp = POS_BEFORE_GAP_UNSAFE (d);
5654 emch = charptr_emchar (dtmp);
5656 UPDATE_SYNTAX_CACHE_BACKWARD (charpos - 1);
5658 if (SYNTAX_FROM_CACHE (XCHAR_TABLE (regex_emacs_buffer->mirror_syntax_table),
5665 DEBUG_PRINT1 ("EXECUTING wordend.\n");
5666 if (AT_STRINGS_BEG (d))
5669 /* XEmacs: this originally read:
5671 if (!AT_STRINGS_BEG (d) && WORDCHAR_P (d - 1)
5672 && (!WORDCHAR_P (d) || AT_STRINGS_END (d)))
5675 The or condition is incorrect (reversed).
5680 int charpos = SYNTAX_CACHE_BYTE_TO_CHAR (PTR_TO_OFFSET (d)) - 1;
5681 UPDATE_SYNTAX_CACHE (charpos);
5683 dtmp = POS_BEFORE_GAP_UNSAFE (d);
5685 emch = charptr_emchar (dtmp);
5686 if (SYNTAX_FROM_CACHE (XCHAR_TABLE (regex_emacs_buffer->mirror_syntax_table),
5689 if (AT_STRINGS_END (d))
5691 dtmp = POS_AFTER_GAP_UNSAFE (d);
5692 emch = charptr_emchar (dtmp);
5694 UPDATE_SYNTAX_CACHE_FORWARD (charpos + 1);
5696 if (SYNTAX_FROM_CACHE (XCHAR_TABLE (regex_emacs_buffer->mirror_syntax_table),
5704 DEBUG_PRINT1 ("EXECUTING before_dot.\n");
5705 if (! (NILP (regex_match_object) || BUFFERP (regex_match_object))
5706 || (BUF_PTR_BYTE_POS (regex_emacs_buffer, (unsigned char *) d)
5707 >= BUF_PT (regex_emacs_buffer)))
5712 DEBUG_PRINT1 ("EXECUTING at_dot.\n");
5713 if (! (NILP (regex_match_object) || BUFFERP (regex_match_object))
5714 || (BUF_PTR_BYTE_POS (regex_emacs_buffer, (unsigned char *) d)
5715 != BUF_PT (regex_emacs_buffer)))
5720 DEBUG_PRINT1 ("EXECUTING after_dot.\n");
5721 if (! (NILP (regex_match_object) || BUFFERP (regex_match_object))
5722 || (BUF_PTR_BYTE_POS (regex_emacs_buffer, (unsigned char *) d)
5723 <= BUF_PT (regex_emacs_buffer)))
5726 #if 0 /* not emacs19 */
5728 DEBUG_PRINT1 ("EXECUTING at_dot.\n");
5729 if (BUF_PTR_BYTE_POS (regex_emacs_buffer, (unsigned char *) d) + 1
5730 != BUF_PT (regex_emacs_buffer))
5733 #endif /* not emacs19 */
5736 DEBUG_PRINT2 ("EXECUTING syntaxspec %d.\n", mcnt);
5741 DEBUG_PRINT1 ("EXECUTING Emacs wordchar.\n");
5753 int charpos = SYNTAX_CACHE_BYTE_TO_CHAR (PTR_TO_OFFSET (d));
5754 UPDATE_SYNTAX_CACHE (charpos);
5758 emch = charptr_emchar ((const Bufbyte *) d);
5760 matches = (SYNTAX_FROM_CACHE (XCHAR_TABLE (regex_emacs_buffer->syntax_table),
5761 emch) == (enum syntaxcode) mcnt);
5763 matches = (SYNTAX_FROM_CACHE (XCHAR_TABLE (regex_emacs_buffer->mirror_syntax_table),
5764 emch) == (enum syntaxcode) mcnt);
5767 if (matches != should_succeed)
5769 SET_REGS_MATCHED ();
5774 DEBUG_PRINT2 ("EXECUTING notsyntaxspec %d.\n", mcnt);
5776 goto matchnotsyntax;
5779 DEBUG_PRINT1 ("EXECUTING Emacs notwordchar.\n");
5783 goto matchornotsyntax;
5786 /* 97/2/17 jhod Mule category code patch */
5795 emch = charptr_emchar ((const Bufbyte *) d);
5797 if (check_category_char(emch, regex_emacs_buffer->category_table,
5798 mcnt, should_succeed))
5800 SET_REGS_MATCHED ();
5804 case notcategoryspec:
5806 goto matchornotcategory;
5807 /* end of category patch */
5809 #else /* not emacs */
5811 DEBUG_PRINT1 ("EXECUTING non-Emacs wordchar.\n");
5813 if (!WORDCHAR_P_UNSAFE ((int) (*d)))
5815 SET_REGS_MATCHED ();
5820 DEBUG_PRINT1 ("EXECUTING non-Emacs notwordchar.\n");
5822 if (!WORDCHAR_P_UNSAFE ((int) (*d)))
5824 SET_REGS_MATCHED ();
5832 continue; /* Successfully executed one pattern command; keep going. */
5835 /* We goto here if a matching operation fails. */
5837 if (!FAIL_STACK_EMPTY ())
5838 { /* A restart point is known. Restore to that state. */
5839 DEBUG_PRINT1 ("\nFAIL:\n");
5840 POP_FAILURE_POINT (d, p,
5841 lowest_active_reg, highest_active_reg,
5842 regstart, regend, reg_info);
5844 /* If this failure point is a dummy, try the next one. */
5848 /* If we failed to the end of the pattern, don't examine *p. */
5852 re_bool is_a_jump_n = false;
5854 /* If failed to a backwards jump that's part of a repetition
5855 loop, need to pop this failure point and use the next one. */
5856 switch ((re_opcode_t) *p)
5860 case maybe_pop_jump:
5861 case pop_failure_jump:
5864 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
5867 if ((is_a_jump_n && (re_opcode_t) *p1 == succeed_n)
5869 && (re_opcode_t) *p1 == on_failure_jump))
5877 if (d >= string1 && d <= end1)
5881 break; /* Matching at this starting point really fails. */
5885 goto restore_best_regs;
5889 return -1; /* Failure to match. */
5892 /* Subroutine definitions for re_match_2. */
5895 /* We are passed P pointing to a register number after a start_memory.
5897 Return true if the pattern up to the corresponding stop_memory can
5898 match the empty string, and false otherwise.
5900 If we find the matching stop_memory, sets P to point to one past its number.
5901 Otherwise, sets P to an undefined byte less than or equal to END.
5903 We don't handle duplicates properly (yet). */
5906 group_match_null_string_p (unsigned char **p, unsigned char *end,
5907 register_info_type *reg_info)
5910 /* Point to after the args to the start_memory. */
5911 unsigned char *p1 = *p + 2;
5915 /* Skip over opcodes that can match nothing, and return true or
5916 false, as appropriate, when we get to one that can't, or to the
5917 matching stop_memory. */
5919 switch ((re_opcode_t) *p1)
5921 /* Could be either a loop or a series of alternatives. */
5922 case on_failure_jump:
5924 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
5926 /* If the next operation is not a jump backwards in the
5931 /* Go through the on_failure_jumps of the alternatives,
5932 seeing if any of the alternatives cannot match nothing.
5933 The last alternative starts with only a jump,
5934 whereas the rest start with on_failure_jump and end
5935 with a jump, e.g., here is the pattern for `a|b|c':
5937 /on_failure_jump/0/6/exactn/1/a/jump_past_alt/0/6
5938 /on_failure_jump/0/6/exactn/1/b/jump_past_alt/0/3
5941 So, we have to first go through the first (n-1)
5942 alternatives and then deal with the last one separately. */
5945 /* Deal with the first (n-1) alternatives, which start
5946 with an on_failure_jump (see above) that jumps to right
5947 past a jump_past_alt. */
5949 while ((re_opcode_t) p1[mcnt-3] == jump_past_alt)
5951 /* `mcnt' holds how many bytes long the alternative
5952 is, including the ending `jump_past_alt' and
5955 if (!alt_match_null_string_p (p1, p1 + mcnt - 3,
5959 /* Move to right after this alternative, including the
5963 /* Break if it's the beginning of an n-th alternative
5964 that doesn't begin with an on_failure_jump. */
5965 if ((re_opcode_t) *p1 != on_failure_jump)
5968 /* Still have to check that it's not an n-th
5969 alternative that starts with an on_failure_jump. */
5971 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
5972 if ((re_opcode_t) p1[mcnt-3] != jump_past_alt)
5974 /* Get to the beginning of the n-th alternative. */
5980 /* Deal with the last alternative: go back and get number
5981 of the `jump_past_alt' just before it. `mcnt' contains
5982 the length of the alternative. */
5983 EXTRACT_NUMBER (mcnt, p1 - 2);
5985 if (!alt_match_null_string_p (p1, p1 + mcnt, reg_info))
5988 p1 += mcnt; /* Get past the n-th alternative. */
5994 assert (p1[1] == **p);
6000 if (!common_op_match_null_string_p (&p1, end, reg_info))
6003 } /* while p1 < end */
6006 } /* group_match_null_string_p */
6009 /* Similar to group_match_null_string_p, but doesn't deal with alternatives:
6010 It expects P to be the first byte of a single alternative and END one
6011 byte past the last. The alternative can contain groups. */
6014 alt_match_null_string_p (unsigned char *p, unsigned char *end,
6015 register_info_type *reg_info)
6018 unsigned char *p1 = p;
6022 /* Skip over opcodes that can match nothing, and break when we get
6023 to one that can't. */
6025 switch ((re_opcode_t) *p1)
6028 case on_failure_jump:
6030 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
6035 if (!common_op_match_null_string_p (&p1, end, reg_info))
6038 } /* while p1 < end */
6041 } /* alt_match_null_string_p */
6044 /* Deals with the ops common to group_match_null_string_p and
6045 alt_match_null_string_p.
6047 Sets P to one after the op and its arguments, if any. */
6050 common_op_match_null_string_p (unsigned char **p, unsigned char *end,
6051 register_info_type *reg_info)
6056 unsigned char *p1 = *p;
6058 switch ((re_opcode_t) *p1++)
6078 assert (reg_no > 0 && reg_no <= MAX_REGNUM);
6079 ret = group_match_null_string_p (&p1, end, reg_info);
6081 /* Have to set this here in case we're checking a group which
6082 contains a group and a back reference to it. */
6084 if (REG_MATCH_NULL_STRING_P (reg_info[reg_no]) == MATCH_NULL_UNSET_VALUE)
6085 REG_MATCH_NULL_STRING_P (reg_info[reg_no]) = ret;
6091 /* If this is an optimized succeed_n for zero times, make the jump. */
6093 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
6101 /* Get to the number of times to succeed. */
6103 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
6108 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
6116 if (!REG_MATCH_NULL_STRING_P (reg_info[*p1]))
6124 /* All other opcodes mean we cannot match the empty string. */
6130 } /* common_op_match_null_string_p */
6133 /* Return zero if TRANSLATE[S1] and TRANSLATE[S2] are identical for LEN
6134 bytes; nonzero otherwise. */
6137 bcmp_translate (re_char *s1, re_char *s2,
6138 REGISTER int len, RE_TRANSLATE_TYPE translate)
6140 REGISTER const unsigned char *p1 = s1, *p2 = s2;
6142 const unsigned char *p1_end = s1 + len;
6143 const unsigned char *p2_end = s2 + len;
6145 while (p1 != p1_end && p2 != p2_end)
6147 Emchar p1_ch, p2_ch;
6149 p1_ch = charptr_emchar (p1);
6150 p2_ch = charptr_emchar (p2);
6152 if (RE_TRANSLATE (p1_ch)
6153 != RE_TRANSLATE (p2_ch))
6158 #else /* not MULE */
6161 if (RE_TRANSLATE (*p1++) != RE_TRANSLATE (*p2++)) return 1;
6168 /* Entry points for GNU code. */
6170 /* re_compile_pattern is the GNU regular expression compiler: it
6171 compiles PATTERN (of length SIZE) and puts the result in BUFP.
6172 Returns 0 if the pattern was valid, otherwise an error string.
6174 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
6175 are set in BUFP on entry.
6177 We call regex_compile to do the actual compilation. */
6180 re_compile_pattern (const char *pattern, int length,
6181 struct re_pattern_buffer *bufp)
6185 /* GNU code is written to assume at least RE_NREGS registers will be set
6186 (and at least one extra will be -1). */
6187 bufp->regs_allocated = REGS_UNALLOCATED;
6189 /* And GNU code determines whether or not to get register information
6190 by passing null for the REGS argument to re_match, etc., not by
6194 /* Match anchors at newline. */
6195 bufp->newline_anchor = 1;
6197 ret = regex_compile ((unsigned char *) pattern, length, re_syntax_options, bufp);
6201 return gettext (re_error_msgid[(int) ret]);
6204 /* Entry points compatible with 4.2 BSD regex library. We don't define
6205 them unless specifically requested. */
6207 #ifdef _REGEX_RE_COMP
6209 /* BSD has one and only one pattern buffer. */
6210 static struct re_pattern_buffer re_comp_buf;
6213 re_comp (const char *s)
6219 if (!re_comp_buf.buffer)
6220 return gettext ("No previous regular expression");
6224 if (!re_comp_buf.buffer)
6226 re_comp_buf.buffer = (unsigned char *) malloc (200);
6227 if (re_comp_buf.buffer == NULL)
6228 return gettext (re_error_msgid[(int) REG_ESPACE]);
6229 re_comp_buf.allocated = 200;
6231 re_comp_buf.fastmap = (char *) malloc (1 << BYTEWIDTH);
6232 if (re_comp_buf.fastmap == NULL)
6233 return gettext (re_error_msgid[(int) REG_ESPACE]);
6236 /* Since `re_exec' always passes NULL for the `regs' argument, we
6237 don't need to initialize the pattern buffer fields which affect it. */
6239 /* Match anchors at newlines. */
6240 re_comp_buf.newline_anchor = 1;
6242 ret = regex_compile ((unsigned char *)s, strlen (s), re_syntax_options, &re_comp_buf);
6247 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
6248 return (char *) gettext (re_error_msgid[(int) ret]);
6253 re_exec (const char *s)
6255 const int len = strlen (s);
6257 0 <= re_search (&re_comp_buf, s, len, 0, len, (struct re_registers *) 0);
6259 #endif /* _REGEX_RE_COMP */
6261 /* POSIX.2 functions. Don't define these for Emacs. */
6265 /* regcomp takes a regular expression as a string and compiles it.
6267 PREG is a regex_t *. We do not expect any fields to be initialized,
6268 since POSIX says we shouldn't. Thus, we set
6270 `buffer' to the compiled pattern;
6271 `used' to the length of the compiled pattern;
6272 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
6273 REG_EXTENDED bit in CFLAGS is set; otherwise, to
6274 RE_SYNTAX_POSIX_BASIC;
6275 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
6276 `fastmap' and `fastmap_accurate' to zero;
6277 `re_nsub' to the number of subexpressions in PATTERN.
6279 PATTERN is the address of the pattern string.
6281 CFLAGS is a series of bits which affect compilation.
6283 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
6284 use POSIX basic syntax.
6286 If REG_NEWLINE is set, then . and [^...] don't match newline.
6287 Also, regexec will try a match beginning after every newline.
6289 If REG_ICASE is set, then we considers upper- and lowercase
6290 versions of letters to be equivalent when matching.
6292 If REG_NOSUB is set, then when PREG is passed to regexec, that
6293 routine will report only success or failure, and nothing about the
6296 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
6297 the return codes and their meanings.) */
6300 regcomp (regex_t *preg, const char *pattern, int cflags)
6304 = (cflags & REG_EXTENDED) ?
6305 RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC;
6307 /* regex_compile will allocate the space for the compiled pattern. */
6309 preg->allocated = 0;
6312 /* Don't bother to use a fastmap when searching. This simplifies the
6313 REG_NEWLINE case: if we used a fastmap, we'd have to put all the
6314 characters after newlines into the fastmap. This way, we just try
6318 if (cflags & REG_ICASE)
6322 preg->translate = (char *) malloc (CHAR_SET_SIZE);
6323 if (preg->translate == NULL)
6324 return (int) REG_ESPACE;
6326 /* Map uppercase characters to corresponding lowercase ones. */
6327 for (i = 0; i < CHAR_SET_SIZE; i++)
6328 preg->translate[i] = ISUPPER (i) ? tolower (i) : i;
6331 preg->translate = NULL;
6333 /* If REG_NEWLINE is set, newlines are treated differently. */
6334 if (cflags & REG_NEWLINE)
6335 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
6336 syntax &= ~RE_DOT_NEWLINE;
6337 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
6338 /* It also changes the matching behavior. */
6339 preg->newline_anchor = 1;
6342 preg->newline_anchor = 0;
6344 preg->no_sub = !!(cflags & REG_NOSUB);
6346 /* POSIX says a null character in the pattern terminates it, so we
6347 can use strlen here in compiling the pattern. */
6348 ret = regex_compile ((unsigned char *) pattern, strlen (pattern), syntax, preg);
6350 /* POSIX doesn't distinguish between an unmatched open-group and an
6351 unmatched close-group: both are REG_EPAREN. */
6352 if (ret == REG_ERPAREN) ret = REG_EPAREN;
6358 /* regexec searches for a given pattern, specified by PREG, in the
6361 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
6362 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
6363 least NMATCH elements, and we set them to the offsets of the
6364 corresponding matched substrings.
6366 EFLAGS specifies `execution flags' which affect matching: if
6367 REG_NOTBOL is set, then ^ does not match at the beginning of the
6368 string; if REG_NOTEOL is set, then $ does not match at the end.
6370 We return 0 if we find a match and REG_NOMATCH if not. */
6373 regexec (const regex_t *preg, const char *string, Element_count nmatch,
6374 regmatch_t pmatch[], int eflags)
6377 struct re_registers regs;
6378 regex_t private_preg;
6379 int len = strlen (string);
6380 re_bool want_reg_info = !preg->no_sub && nmatch > 0;
6382 private_preg = *preg;
6384 private_preg.not_bol = !!(eflags & REG_NOTBOL);
6385 private_preg.not_eol = !!(eflags & REG_NOTEOL);
6387 /* The user has told us exactly how many registers to return
6388 information about, via `nmatch'. We have to pass that on to the
6389 matching routines. */
6390 private_preg.regs_allocated = REGS_FIXED;
6394 regs.num_regs = nmatch;
6395 regs.start = TALLOC (nmatch, regoff_t);
6396 regs.end = TALLOC (nmatch, regoff_t);
6397 if (regs.start == NULL || regs.end == NULL)
6398 return (int) REG_NOMATCH;
6401 /* Perform the searching operation. */
6402 ret = re_search (&private_preg, string, len,
6403 /* start: */ 0, /* range: */ len,
6404 want_reg_info ? ®s : (struct re_registers *) 0);
6406 /* Copy the register information to the POSIX structure. */
6413 for (r = 0; r < nmatch; r++)
6415 pmatch[r].rm_so = regs.start[r];
6416 pmatch[r].rm_eo = regs.end[r];
6420 /* If we needed the temporary register info, free the space now. */
6425 /* We want zero return to mean success, unlike `re_search'. */
6426 return ret >= 0 ? (int) REG_NOERROR : (int) REG_NOMATCH;
6430 /* Returns a message corresponding to an error code, ERRCODE, returned
6431 from either regcomp or regexec. We don't use PREG here. */
6434 regerror (int errcode, const regex_t *preg, char *errbuf,
6435 Memory_count errbuf_size)
6438 Memory_count msg_size;
6441 || (size_t) errcode >= (sizeof (re_error_msgid)
6442 / sizeof (re_error_msgid[0])))
6443 /* Only error codes returned by the rest of the code should be passed
6444 to this routine. If we are given anything else, or if other regex
6445 code generates an invalid error code, then the program has a bug.
6446 Dump core so we can fix it. */
6449 msg = gettext (re_error_msgid[errcode]);
6451 msg_size = strlen (msg) + 1; /* Includes the null. */
6453 if (errbuf_size != 0)
6455 if (msg_size > errbuf_size)
6457 strncpy (errbuf, msg, errbuf_size - 1);
6458 errbuf[errbuf_size - 1] = 0;
6461 strcpy (errbuf, msg);
6468 /* Free dynamically allocated space used by PREG. */
6471 regfree (regex_t *preg)
6473 if (preg->buffer != NULL)
6474 free (preg->buffer);
6475 preg->buffer = NULL;
6477 preg->allocated = 0;
6480 if (preg->fastmap != NULL)
6481 free (preg->fastmap);
6482 preg->fastmap = NULL;
6483 preg->fastmap_accurate = 0;
6485 if (preg->translate != NULL)
6486 free (preg->translate);
6487 preg->translate = NULL;
6490 #endif /* not emacs */
6494 make-backup-files: t
6496 trim-versions-without-asking: nil