1 /* Extended regular expression matching and search library,
2 version 0.12, extended for XEmacs.
3 (Implements POSIX draft P10003.2/D11.2, except for
4 internationalization features.)
6 Copyright (C) 1993, 1994, 1995 Free Software Foundation, Inc.
7 Copyright (C) 1995 Sun Microsystems, Inc.
8 Copyright (C) 1995 Ben Wing.
9 Copyright (C) 1999,2000,2001 MORIOKA Tomohiko
11 This program is free software; you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation; either version 2, or (at your option)
16 This program is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
21 You should have received a copy of the GNU General Public License
22 along with this program; see the file COPYING. If not, write to
23 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24 Boston, MA 02111-1307, USA. */
26 /* Synched up with: FSF 19.29. */
28 /* Changes made for XEmacs:
30 (1) the REGEX_BEGLINE_CHECK code from the XEmacs v18 regex routines
31 was added. This causes a huge speedup in font-locking.
32 (2) Rel-alloc is disabled when the MMAP version of rel-alloc is
33 being used, because it's too slow -- all those calls to mmap()
34 add humongous overhead.
35 (3) Lots and lots of changes for Mule. They are bracketed by
36 `#ifdef MULE' or with comments that have `XEmacs' in them.
43 #ifndef REGISTER /* Rigidly enforced as of 20.3 */
52 /* Converts the pointer to the char to BEG-based offset from the start. */
53 #define PTR_TO_OFFSET(d) (MATCHING_IN_FIRST_STRING \
54 ? (d) - string1 : (d) - (string2 - size1))
56 #define PTR_TO_OFFSET(d) 0
59 /* We assume non-Mule if emacs isn't defined. */
64 /* We need this for `regex.h', and perhaps for the Emacs include files. */
65 #include <sys/types.h>
67 /* This is for other GNU distributions with internationalized messages. */
68 #if defined (I18N3) && (defined (HAVE_LIBINTL_H) || defined (_LIBC))
71 # define gettext(msgid) (msgid)
74 /* XEmacs: define this to add in a speedup for patterns anchored at
75 the beginning of a line. Keep the ifdefs so that it's easier to
76 tell where/why this code has diverged from v19. */
77 #define REGEX_BEGLINE_CHECK
79 /* XEmacs: the current mmap-based ralloc handles small blocks very
80 poorly, so we disable it here. */
82 #if (defined (REL_ALLOC) && defined (HAVE_MMAP)) || defined(DOUG_LEA_MALLOC)
86 /* The `emacs' switch turns on certain matching commands
87 that make sense only in Emacs. */
94 #if (defined (DEBUG_XEMACS) && !defined (DEBUG))
100 Lisp_Object Vthe_lisp_rangetab;
103 complex_vars_of_regex (void)
105 Vthe_lisp_rangetab = Fmake_range_table ();
106 staticpro (&Vthe_lisp_rangetab);
112 complex_vars_of_regex (void)
118 #define RE_TRANSLATE(ch) TRT_TABLE_OF (translate, (Emchar) ch)
119 #define TRANSLATE_P(tr) (!NILP (tr))
121 #else /* not emacs */
123 /* If we are not linking with Emacs proper,
124 we can't use the relocating allocator
125 even if config.h says that we can. */
128 #if defined (STDC_HEADERS) || defined (_LIBC)
135 #define charptr_emchar(str) ((Emchar) (str)[0])
137 #if (LONGBITS > INTBITS)
138 # define EMACS_INT long
140 # define EMACS_INT int
145 #define INC_CHARPTR(p) ((p)++)
146 #define DEC_CHARPTR(p) ((p)--)
150 /* Define the syntax stuff for \<, \>, etc. */
152 /* This must be nonzero for the wordchar and notwordchar pattern
153 commands in re_match_2. */
160 extern char *re_syntax_table;
162 #else /* not SYNTAX_TABLE */
164 /* How many characters in the character set. */
165 #define CHAR_SET_SIZE 256
167 static char re_syntax_table[CHAR_SET_SIZE];
170 init_syntax_once (void)
176 const char *word_syntax_chars =
177 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789_";
179 memset (re_syntax_table, 0, sizeof (re_syntax_table));
181 while (*word_syntax_chars)
182 re_syntax_table[(unsigned int)(*word_syntax_chars++)] = Sword;
188 #endif /* SYNTAX_TABLE */
190 #define SYNTAX_UNSAFE(ignored, c) re_syntax_table[c]
191 #undef SYNTAX_FROM_CACHE
192 #define SYNTAX_FROM_CACHE SYNTAX_UNSAFE
194 #define RE_TRANSLATE(c) translate[(unsigned char) (c)]
195 #define TRANSLATE_P(tr) tr
199 /* Under XEmacs, this is needed because we don't define it elsewhere. */
200 #ifdef SWITCH_ENUM_BUG
201 #define SWITCH_ENUM_CAST(x) ((int)(x))
203 #define SWITCH_ENUM_CAST(x) (x)
207 /* Get the interface, including the syntax bits. */
210 /* isalpha etc. are used for the character classes. */
213 /* Jim Meyering writes:
215 "... Some ctype macros are valid only for character codes that
216 isascii says are ASCII (SGI's IRIX-4.0.5 is one such system --when
217 using /bin/cc or gcc but without giving an ansi option). So, all
218 ctype uses should be through macros like ISPRINT... If
219 STDC_HEADERS is defined, then autoconf has verified that the ctype
220 macros don't need to be guarded with references to isascii. ...
221 Defining isascii to 1 should let any compiler worth its salt
222 eliminate the && through constant folding." */
224 #if defined (STDC_HEADERS) || (!defined (isascii) && !defined (HAVE_ISASCII))
225 #define ISASCII_1(c) 1
227 #define ISASCII_1(c) isascii(c)
231 /* The IS*() macros can be passed any character, including an extended
232 one. We need to make sure there are no crashes, which would occur
233 otherwise due to out-of-bounds array references. */
234 #define ISASCII(c) (((EMACS_UINT) (c)) < 0x100 && ISASCII_1 (c))
236 #define ISASCII(c) ISASCII_1 (c)
240 #define ISBLANK(c) (ISASCII (c) && isblank (c))
242 #define ISBLANK(c) ((c) == ' ' || (c) == '\t')
245 #define ISGRAPH(c) (ISASCII (c) && isgraph (c))
247 #define ISGRAPH(c) (ISASCII (c) && isprint (c) && !isspace (c))
250 #define ISPRINT(c) (ISASCII (c) && isprint (c))
251 #define ISDIGIT(c) (ISASCII (c) && isdigit (c))
252 #define ISALNUM(c) (ISASCII (c) && isalnum (c))
253 #define ISALPHA(c) (ISASCII (c) && isalpha (c))
254 #define ISCNTRL(c) (ISASCII (c) && iscntrl (c))
255 #define ISLOWER(c) (ISASCII (c) && islower (c))
256 #define ISPUNCT(c) (ISASCII (c) && ispunct (c))
257 #define ISSPACE(c) (ISASCII (c) && isspace (c))
258 #define ISUPPER(c) (ISASCII (c) && isupper (c))
259 #define ISXDIGIT(c) (ISASCII (c) && isxdigit (c))
262 #define NULL (void *)0
265 /* We remove any previous definition of `SIGN_EXTEND_CHAR',
266 since ours (we hope) works properly with all combinations of
267 machines, compilers, `char' and `unsigned char' argument types.
268 (Per Bothner suggested the basic approach.) */
269 #undef SIGN_EXTEND_CHAR
271 #define SIGN_EXTEND_CHAR(c) ((signed char) (c))
272 #else /* not __STDC__ */
273 /* As in Harbison and Steele. */
274 #define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128)
277 /* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we
278 use `alloca' instead of `malloc'. This is because using malloc in
279 re_search* or re_match* could cause memory leaks when C-g is used in
280 Emacs; also, malloc is slower and causes storage fragmentation. On
281 the other hand, malloc is more portable, and easier to debug.
283 Because we sometimes use alloca, some routines have to be macros,
284 not functions -- `alloca'-allocated space disappears at the end of the
285 function it is called in. */
289 #define REGEX_ALLOCATE malloc
290 #define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize)
291 #define REGEX_FREE free
293 #else /* not REGEX_MALLOC */
295 /* Emacs already defines alloca, sometimes. */
298 /* Make alloca work the best possible way. */
300 #define alloca __builtin_alloca
301 #else /* not __GNUC__ */
304 #else /* not __GNUC__ or HAVE_ALLOCA_H */
305 #ifndef _AIX /* Already did AIX, up at the top. */
307 #endif /* not _AIX */
308 #endif /* HAVE_ALLOCA_H */
309 #endif /* __GNUC__ */
311 #endif /* not alloca */
313 #define REGEX_ALLOCATE alloca
315 /* Assumes a `char *destination' variable. */
316 #define REGEX_REALLOCATE(source, osize, nsize) \
317 (destination = (char *) alloca (nsize), \
318 memmove (destination, source, osize), \
321 /* No need to do anything to free, after alloca. */
322 #define REGEX_FREE(arg) ((void)0) /* Do nothing! But inhibit gcc warning. */
324 #endif /* REGEX_MALLOC */
326 /* Define how to allocate the failure stack. */
329 #define REGEX_ALLOCATE_STACK(size) \
330 r_alloc ((char **) &failure_stack_ptr, (size))
331 #define REGEX_REALLOCATE_STACK(source, osize, nsize) \
332 r_re_alloc ((char **) &failure_stack_ptr, (nsize))
333 #define REGEX_FREE_STACK(ptr) \
334 r_alloc_free ((void **) &failure_stack_ptr)
336 #else /* not REL_ALLOC */
340 #define REGEX_ALLOCATE_STACK malloc
341 #define REGEX_REALLOCATE_STACK(source, osize, nsize) realloc (source, nsize)
342 #define REGEX_FREE_STACK free
344 #else /* not REGEX_MALLOC */
346 #define REGEX_ALLOCATE_STACK alloca
348 #define REGEX_REALLOCATE_STACK(source, osize, nsize) \
349 REGEX_REALLOCATE (source, osize, nsize)
350 /* No need to explicitly free anything. */
351 #define REGEX_FREE_STACK(arg)
353 #endif /* REGEX_MALLOC */
354 #endif /* REL_ALLOC */
357 /* True if `size1' is non-NULL and PTR is pointing anywhere inside
358 `string1' or just past its end. This works if PTR is NULL, which is
360 #define FIRST_STRING_P(ptr) \
361 (size1 && string1 <= (ptr) && (ptr) <= string1 + size1)
363 /* (Re)Allocate N items of type T using malloc, or fail. */
364 #define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t)))
365 #define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, (n) * sizeof (t)))
366 #define RETALLOC_IF(addr, n, t) \
367 if (addr) RETALLOC((addr), (n), t); else (addr) = TALLOC ((n), t)
368 #define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t)))
370 #define BYTEWIDTH 8 /* In bits. */
372 #define STREQ(s1, s2) (strcmp (s1, s2) == 0)
376 #define MAX(a, b) ((a) > (b) ? (a) : (b))
377 #define MIN(a, b) ((a) < (b) ? (a) : (b))
379 /* Type of source-pattern and string chars. */
380 typedef const unsigned char re_char;
382 typedef char re_bool;
387 /* These are the command codes that appear in compiled regular
388 expressions. Some opcodes are followed by argument bytes. A
389 command code can specify any interpretation whatsoever for its
390 arguments. Zero bytes may appear in the compiled regular expression. */
396 /* Succeed right away--no more backtracking. */
399 /* Followed by one byte giving n, then by n literal bytes. */
402 /* Matches any (more or less) character. */
405 /* Matches any one char belonging to specified set. First
406 following byte is number of bitmap bytes. Then come bytes
407 for a bitmap saying which chars are in. Bits in each byte
408 are ordered low-bit-first. A character is in the set if its
409 bit is 1. A character too large to have a bit in the map is
410 automatically not in the set. */
413 /* Same parameters as charset, but match any character that is
414 not one of those specified. */
417 /* Start remembering the text that is matched, for storing in a
418 register. Followed by one byte with the register number, in
419 the range 0 to one less than the pattern buffer's re_nsub
420 field. Then followed by one byte with the number of groups
421 inner to this one. (This last has to be part of the
422 start_memory only because we need it in the on_failure_jump
426 /* Stop remembering the text that is matched and store it in a
427 memory register. Followed by one byte with the register
428 number, in the range 0 to one less than `re_nsub' in the
429 pattern buffer, and one byte with the number of inner groups,
430 just like `start_memory'. (We need the number of inner
431 groups here because we don't have any easy way of finding the
432 corresponding start_memory when we're at a stop_memory.) */
435 /* Match a duplicate of something remembered. Followed by one
436 byte containing the register number. */
439 /* Fail unless at beginning of line. */
442 /* Fail unless at end of line. */
445 /* Succeeds if at beginning of buffer (if emacs) or at beginning
446 of string to be matched (if not). */
449 /* Analogously, for end of buffer/string. */
452 /* Followed by two byte relative address to which to jump. */
455 /* Same as jump, but marks the end of an alternative. */
458 /* Followed by two-byte relative address of place to resume at
459 in case of failure. */
462 /* Like on_failure_jump, but pushes a placeholder instead of the
463 current string position when executed. */
464 on_failure_keep_string_jump,
466 /* Throw away latest failure point and then jump to following
467 two-byte relative address. */
470 /* Change to pop_failure_jump if know won't have to backtrack to
471 match; otherwise change to jump. This is used to jump
472 back to the beginning of a repeat. If what follows this jump
473 clearly won't match what the repeat does, such that we can be
474 sure that there is no use backtracking out of repetitions
475 already matched, then we change it to a pop_failure_jump.
476 Followed by two-byte address. */
479 /* Jump to following two-byte address, and push a dummy failure
480 point. This failure point will be thrown away if an attempt
481 is made to use it for a failure. A `+' construct makes this
482 before the first repeat. Also used as an intermediary kind
483 of jump when compiling an alternative. */
486 /* Push a dummy failure point and continue. Used at the end of
490 /* Followed by two-byte relative address and two-byte number n.
491 After matching N times, jump to the address upon failure. */
494 /* Followed by two-byte relative address, and two-byte number n.
495 Jump to the address N times, then fail. */
498 /* Set the following two-byte relative address to the
499 subsequent two-byte number. The address *includes* the two
503 wordchar, /* Matches any word-constituent character. */
504 notwordchar, /* Matches any char that is not a word-constituent. */
506 wordbeg, /* Succeeds if at word beginning. */
507 wordend, /* Succeeds if at word end. */
509 wordbound, /* Succeeds if at a word boundary. */
510 notwordbound /* Succeeds if not at a word boundary. */
513 ,before_dot, /* Succeeds if before point. */
514 at_dot, /* Succeeds if at point. */
515 after_dot, /* Succeeds if after point. */
517 /* Matches any character whose syntax is specified. Followed by
518 a byte which contains a syntax code, e.g., Sword. */
521 /* Matches any character whose syntax is not that specified. */
527 /* need extra stuff to be able to properly work with XEmacs/Mule
528 characters (which may take up more than one byte) */
530 ,charset_mule, /* Matches any character belonging to specified set.
531 The set is stored in "unified range-table
532 format"; see rangetab.c. Unlike the `charset'
533 opcode, this can handle arbitrary characters. */
535 charset_mule_not /* Same parameters as charset_mule, but match any
536 character that is not one of those specified. */
538 /* 97/2/17 jhod: The following two were merged back in from the Mule
539 2.3 code to enable some language specific processing */
540 ,categoryspec, /* Matches entries in the character category tables */
541 notcategoryspec /* The opposite of the above */
546 /* Common operations on the compiled pattern. */
548 /* Store NUMBER in two contiguous bytes starting at DESTINATION. */
550 #define STORE_NUMBER(destination, number) \
552 (destination)[0] = (number) & 0377; \
553 (destination)[1] = (number) >> 8; \
556 /* Same as STORE_NUMBER, except increment DESTINATION to
557 the byte after where the number is stored. Therefore, DESTINATION
558 must be an lvalue. */
560 #define STORE_NUMBER_AND_INCR(destination, number) \
562 STORE_NUMBER (destination, number); \
563 (destination) += 2; \
566 /* Put into DESTINATION a number stored in two contiguous bytes starting
569 #define EXTRACT_NUMBER(destination, source) \
571 (destination) = *(source) & 0377; \
572 (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) << 8; \
577 extract_number (int *dest, re_char *source)
579 int temp = SIGN_EXTEND_CHAR (*(source + 1));
580 *dest = *source & 0377;
584 #ifndef EXTRACT_MACROS /* To debug the macros. */
585 #undef EXTRACT_NUMBER
586 #define EXTRACT_NUMBER(dest, src) extract_number (&dest, src)
587 #endif /* not EXTRACT_MACROS */
591 /* Same as EXTRACT_NUMBER, except increment SOURCE to after the number.
592 SOURCE must be an lvalue. */
594 #define EXTRACT_NUMBER_AND_INCR(destination, source) \
596 EXTRACT_NUMBER (destination, source); \
602 extract_number_and_incr (int *destination, unsigned char **source)
604 extract_number (destination, *source);
608 #ifndef EXTRACT_MACROS
609 #undef EXTRACT_NUMBER_AND_INCR
610 #define EXTRACT_NUMBER_AND_INCR(dest, src) \
611 extract_number_and_incr (&dest, &src)
612 #endif /* not EXTRACT_MACROS */
616 /* If DEBUG is defined, Regex prints many voluminous messages about what
617 it is doing (if the variable `debug' is nonzero). If linked with the
618 main program in `iregex.c', you can enter patterns and strings
619 interactively. And if linked with the main program in `main.c' and
620 the other test files, you can run the already-written tests. */
624 /* We use standard I/O for debugging. */
628 /* XEmacs provides its own version of assert() */
629 /* It is useful to test things that ``must'' be true when debugging. */
633 static int debug = 0;
635 #define DEBUG_STATEMENT(e) e
636 #define DEBUG_PRINT1(x) if (debug) printf (x)
637 #define DEBUG_PRINT2(x1, x2) if (debug) printf (x1, x2)
638 #define DEBUG_PRINT3(x1, x2, x3) if (debug) printf (x1, x2, x3)
639 #define DEBUG_PRINT4(x1, x2, x3, x4) if (debug) printf (x1, x2, x3, x4)
640 #define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \
641 if (debug) print_partial_compiled_pattern (s, e)
642 #define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \
643 if (debug) print_double_string (w, s1, sz1, s2, sz2)
646 /* Print the fastmap in human-readable form. */
649 print_fastmap (char *fastmap)
651 unsigned was_a_range = 0;
654 while (i < (1 << BYTEWIDTH))
660 while (i < (1 << BYTEWIDTH) && fastmap[i])
676 /* Print a compiled pattern string in human-readable form, starting at
677 the START pointer into it and ending just before the pointer END. */
680 print_partial_compiled_pattern (re_char *start, re_char *end)
683 unsigned char *p = (unsigned char *) start;
692 /* Loop over pattern commands. */
695 printf ("%ld:\t", (long)(p - start));
697 switch ((re_opcode_t) *p++)
705 printf ("/exactn/%d", mcnt);
716 printf ("/start_memory/%d/%d", mcnt, *p++);
721 printf ("/stop_memory/%d/%d", mcnt, *p++);
725 printf ("/duplicate/%d", *p++);
735 REGISTER int c, last = -100;
736 REGISTER int in_range = 0;
738 printf ("/charset [%s",
739 (re_opcode_t) *(p - 1) == charset_not ? "^" : "");
741 assert (p + *p < pend);
743 for (c = 0; c < 256; c++)
744 if (((unsigned char) (c / 8) < *p)
745 && (p[1 + (c/8)] & (1 << (c % 8))))
747 /* Are we starting a range? */
748 if (last + 1 == c && ! in_range)
753 /* Have we broken a range? */
754 else if (last + 1 != c && in_range)
777 case charset_mule_not:
781 printf ("/charset_mule [%s",
782 (re_opcode_t) *(p - 1) == charset_mule_not ? "^" : "");
783 nentries = unified_range_table_nentries (p);
784 for (i = 0; i < nentries; i++)
786 EMACS_INT first, last;
787 Lisp_Object dummy_val;
789 unified_range_table_get_range (p, i, &first, &last,
794 printf ("(0x%lx)", (long)first);
801 printf ("(0x%lx)", (long)last);
805 p += unified_range_table_bytes_used (p);
818 case on_failure_jump:
819 extract_number_and_incr (&mcnt, &p);
820 printf ("/on_failure_jump to %ld", (long)(p + mcnt - start));
823 case on_failure_keep_string_jump:
824 extract_number_and_incr (&mcnt, &p);
825 printf ("/on_failure_keep_string_jump to %ld", (long)(p + mcnt - start));
828 case dummy_failure_jump:
829 extract_number_and_incr (&mcnt, &p);
830 printf ("/dummy_failure_jump to %ld", (long)(p + mcnt - start));
833 case push_dummy_failure:
834 printf ("/push_dummy_failure");
838 extract_number_and_incr (&mcnt, &p);
839 printf ("/maybe_pop_jump to %ld", (long)(p + mcnt - start));
842 case pop_failure_jump:
843 extract_number_and_incr (&mcnt, &p);
844 printf ("/pop_failure_jump to %ld", (long)(p + mcnt - start));
848 extract_number_and_incr (&mcnt, &p);
849 printf ("/jump_past_alt to %ld", (long)(p + mcnt - start));
853 extract_number_and_incr (&mcnt, &p);
854 printf ("/jump to %ld", (long)(p + mcnt - start));
858 extract_number_and_incr (&mcnt, &p);
859 extract_number_and_incr (&mcnt2, &p);
860 printf ("/succeed_n to %ld, %d times", (long)(p + mcnt - start), mcnt2);
864 extract_number_and_incr (&mcnt, &p);
865 extract_number_and_incr (&mcnt2, &p);
866 printf ("/jump_n to %ld, %d times", (long)(p + mcnt - start), mcnt2);
870 extract_number_and_incr (&mcnt, &p);
871 extract_number_and_incr (&mcnt2, &p);
872 printf ("/set_number_at location %ld to %d", (long)(p + mcnt - start), mcnt2);
876 printf ("/wordbound");
880 printf ("/notwordbound");
892 printf ("/before_dot");
900 printf ("/after_dot");
904 printf ("/syntaxspec");
906 printf ("/%d", mcnt);
910 printf ("/notsyntaxspec");
912 printf ("/%d", mcnt);
916 /* 97/2/17 jhod Mule category patch */
918 printf ("/categoryspec");
920 printf ("/%d", mcnt);
923 case notcategoryspec:
924 printf ("/notcategoryspec");
926 printf ("/%d", mcnt);
928 /* end of category patch */
933 printf ("/wordchar");
937 printf ("/notwordchar");
949 printf ("?%d", *(p-1));
955 printf ("%ld:\tend of pattern.\n", (long)(p - start));
960 print_compiled_pattern (struct re_pattern_buffer *bufp)
962 re_char *buffer = bufp->buffer;
964 print_partial_compiled_pattern (buffer, buffer + bufp->used);
965 printf ("%ld bytes used/%ld bytes allocated.\n", bufp->used,
968 if (bufp->fastmap_accurate && bufp->fastmap)
970 printf ("fastmap: ");
971 print_fastmap (bufp->fastmap);
974 printf ("re_nsub: %ld\t", (long)bufp->re_nsub);
975 printf ("regs_alloc: %d\t", bufp->regs_allocated);
976 printf ("can_be_null: %d\t", bufp->can_be_null);
977 printf ("newline_anchor: %d\n", bufp->newline_anchor);
978 printf ("no_sub: %d\t", bufp->no_sub);
979 printf ("not_bol: %d\t", bufp->not_bol);
980 printf ("not_eol: %d\t", bufp->not_eol);
981 printf ("syntax: %d\n", bufp->syntax);
982 /* Perhaps we should print the translate table? */
983 /* and maybe the category table? */
988 print_double_string (re_char *where, re_char *string1, int size1,
989 re_char *string2, int size2)
995 unsigned int this_char;
997 if (FIRST_STRING_P (where))
999 for (this_char = where - string1; this_char < size1; this_char++)
1000 putchar (string1[this_char]);
1005 for (this_char = where - string2; this_char < size2; this_char++)
1006 putchar (string2[this_char]);
1010 #else /* not DEBUG */
1015 #define DEBUG_STATEMENT(e)
1016 #define DEBUG_PRINT1(x)
1017 #define DEBUG_PRINT2(x1, x2)
1018 #define DEBUG_PRINT3(x1, x2, x3)
1019 #define DEBUG_PRINT4(x1, x2, x3, x4)
1020 #define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)
1021 #define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)
1025 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
1026 also be assigned to arbitrarily: each pattern buffer stores its own
1027 syntax, so it can be changed between regex compilations. */
1028 /* This has no initializer because initialized variables in Emacs
1029 become read-only after dumping. */
1030 reg_syntax_t re_syntax_options;
1033 /* Specify the precise syntax of regexps for compilation. This provides
1034 for compatibility for various utilities which historically have
1035 different, incompatible syntaxes.
1037 The argument SYNTAX is a bit mask comprised of the various bits
1038 defined in regex.h. We return the old syntax. */
1041 re_set_syntax (reg_syntax_t syntax)
1043 reg_syntax_t ret = re_syntax_options;
1045 re_syntax_options = syntax;
1049 /* This table gives an error message for each of the error codes listed
1050 in regex.h. Obviously the order here has to be same as there.
1051 POSIX doesn't require that we do anything for REG_NOERROR,
1052 but why not be nice? */
1054 static const char *re_error_msgid[] =
1056 "Success", /* REG_NOERROR */
1057 "No match", /* REG_NOMATCH */
1058 "Invalid regular expression", /* REG_BADPAT */
1059 "Invalid collation character", /* REG_ECOLLATE */
1060 "Invalid character class name", /* REG_ECTYPE */
1061 "Trailing backslash", /* REG_EESCAPE */
1062 "Invalid back reference", /* REG_ESUBREG */
1063 "Unmatched [ or [^", /* REG_EBRACK */
1064 "Unmatched ( or \\(", /* REG_EPAREN */
1065 "Unmatched \\{", /* REG_EBRACE */
1066 "Invalid content of \\{\\}", /* REG_BADBR */
1067 "Invalid range end", /* REG_ERANGE */
1068 "Memory exhausted", /* REG_ESPACE */
1069 "Invalid preceding regular expression", /* REG_BADRPT */
1070 "Premature end of regular expression", /* REG_EEND */
1071 "Regular expression too big", /* REG_ESIZE */
1072 "Unmatched ) or \\)", /* REG_ERPAREN */
1074 "Invalid syntax designator", /* REG_ESYNTAX */
1077 "Ranges may not span charsets", /* REG_ERANGESPAN */
1078 "Invalid category designator", /* REG_ECATEGORY */
1082 /* Avoiding alloca during matching, to placate r_alloc. */
1084 /* Define MATCH_MAY_ALLOCATE unless we need to make sure that the
1085 searching and matching functions should not call alloca. On some
1086 systems, alloca is implemented in terms of malloc, and if we're
1087 using the relocating allocator routines, then malloc could cause a
1088 relocation, which might (if the strings being searched are in the
1089 ralloc heap) shift the data out from underneath the regexp
1092 Here's another reason to avoid allocation: Emacs
1093 processes input from X in a signal handler; processing X input may
1094 call malloc; if input arrives while a matching routine is calling
1095 malloc, then we're scrod. But Emacs can't just block input while
1096 calling matching routines; then we don't notice interrupts when
1097 they come in. So, Emacs blocks input around all regexp calls
1098 except the matching calls, which it leaves unprotected, in the
1099 faith that they will not malloc. */
1101 /* Normally, this is fine. */
1102 #define MATCH_MAY_ALLOCATE
1104 /* When using GNU C, we are not REALLY using the C alloca, no matter
1105 what config.h may say. So don't take precautions for it. */
1110 /* The match routines may not allocate if (1) they would do it with malloc
1111 and (2) it's not safe for them to use malloc.
1112 Note that if REL_ALLOC is defined, matching would not use malloc for the
1113 failure stack, but we would still use it for the register vectors;
1114 so REL_ALLOC should not affect this. */
1115 #if (defined (C_ALLOCA) || defined (REGEX_MALLOC)) && defined (emacs)
1116 #undef MATCH_MAY_ALLOCATE
1120 /* Failure stack declarations and macros; both re_compile_fastmap and
1121 re_match_2 use a failure stack. These have to be macros because of
1122 REGEX_ALLOCATE_STACK. */
1125 /* Number of failure points for which to initially allocate space
1126 when matching. If this number is exceeded, we allocate more
1127 space, so it is not a hard limit. */
1128 #ifndef INIT_FAILURE_ALLOC
1129 #define INIT_FAILURE_ALLOC 5
1132 /* Roughly the maximum number of failure points on the stack. Would be
1133 exactly that if always used MAX_FAILURE_SPACE each time we failed.
1134 This is a variable only so users of regex can assign to it; we never
1135 change it ourselves. */
1136 #if defined (MATCH_MAY_ALLOCATE)
1137 /* 4400 was enough to cause a crash on Alpha OSF/1,
1138 whose default stack limit is 2mb. */
1139 int re_max_failures = 20000;
1141 int re_max_failures = 2000;
1144 union fail_stack_elt
1150 typedef union fail_stack_elt fail_stack_elt_t;
1154 fail_stack_elt_t *stack;
1156 size_t avail; /* Offset of next open position. */
1159 #define FAIL_STACK_EMPTY() (fail_stack.avail == 0)
1160 #define FAIL_STACK_PTR_EMPTY() (fail_stack_ptr->avail == 0)
1161 #define FAIL_STACK_FULL() (fail_stack.avail == fail_stack.size)
1164 /* Define macros to initialize and free the failure stack.
1165 Do `return -2' if the alloc fails. */
1167 #ifdef MATCH_MAY_ALLOCATE
1168 #define INIT_FAIL_STACK() \
1170 fail_stack.stack = (fail_stack_elt_t *) \
1171 REGEX_ALLOCATE_STACK (INIT_FAILURE_ALLOC * sizeof (fail_stack_elt_t)); \
1173 if (fail_stack.stack == NULL) \
1176 fail_stack.size = INIT_FAILURE_ALLOC; \
1177 fail_stack.avail = 0; \
1180 #define RESET_FAIL_STACK() REGEX_FREE_STACK (fail_stack.stack)
1182 #define INIT_FAIL_STACK() \
1184 fail_stack.avail = 0; \
1187 #define RESET_FAIL_STACK()
1191 /* Double the size of FAIL_STACK, up to approximately `re_max_failures' items.
1193 Return 1 if succeeds, and 0 if either ran out of memory
1194 allocating space for it or it was already too large.
1196 REGEX_REALLOCATE_STACK requires `destination' be declared. */
1198 #define DOUBLE_FAIL_STACK(fail_stack) \
1199 ((fail_stack).size > re_max_failures * MAX_FAILURE_ITEMS \
1201 : ((fail_stack).stack = (fail_stack_elt_t *) \
1202 REGEX_REALLOCATE_STACK ((fail_stack).stack, \
1203 (fail_stack).size * sizeof (fail_stack_elt_t), \
1204 ((fail_stack).size << 1) * sizeof (fail_stack_elt_t)), \
1206 (fail_stack).stack == NULL \
1208 : ((fail_stack).size <<= 1, \
1212 /* Push pointer POINTER on FAIL_STACK.
1213 Return 1 if was able to do so and 0 if ran out of memory allocating
1215 #define PUSH_PATTERN_OP(POINTER, FAIL_STACK) \
1216 ((FAIL_STACK_FULL () \
1217 && !DOUBLE_FAIL_STACK (FAIL_STACK)) \
1219 : ((FAIL_STACK).stack[(FAIL_STACK).avail++].pointer = POINTER, \
1222 /* Push a pointer value onto the failure stack.
1223 Assumes the variable `fail_stack'. Probably should only
1224 be called from within `PUSH_FAILURE_POINT'. */
1225 #define PUSH_FAILURE_POINTER(item) \
1226 fail_stack.stack[fail_stack.avail++].pointer = (unsigned char *) (item)
1228 /* This pushes an integer-valued item onto the failure stack.
1229 Assumes the variable `fail_stack'. Probably should only
1230 be called from within `PUSH_FAILURE_POINT'. */
1231 #define PUSH_FAILURE_INT(item) \
1232 fail_stack.stack[fail_stack.avail++].integer = (item)
1234 /* Push a fail_stack_elt_t value onto the failure stack.
1235 Assumes the variable `fail_stack'. Probably should only
1236 be called from within `PUSH_FAILURE_POINT'. */
1237 #define PUSH_FAILURE_ELT(item) \
1238 fail_stack.stack[fail_stack.avail++] = (item)
1240 /* These three POP... operations complement the three PUSH... operations.
1241 All assume that `fail_stack' is nonempty. */
1242 #define POP_FAILURE_POINTER() fail_stack.stack[--fail_stack.avail].pointer
1243 #define POP_FAILURE_INT() fail_stack.stack[--fail_stack.avail].integer
1244 #define POP_FAILURE_ELT() fail_stack.stack[--fail_stack.avail]
1246 /* Used to omit pushing failure point id's when we're not debugging. */
1248 #define DEBUG_PUSH PUSH_FAILURE_INT
1249 #define DEBUG_POP(item_addr) *(item_addr) = POP_FAILURE_INT ()
1251 #define DEBUG_PUSH(item)
1252 #define DEBUG_POP(item_addr)
1256 /* Push the information about the state we will need
1257 if we ever fail back to it.
1259 Requires variables fail_stack, regstart, regend, reg_info, and
1260 num_regs be declared. DOUBLE_FAIL_STACK requires `destination' be
1263 Does `return FAILURE_CODE' if runs out of memory. */
1265 #if !defined (REGEX_MALLOC) && !defined (REL_ALLOC)
1266 #define DECLARE_DESTINATION char *destination
1268 #define DECLARE_DESTINATION DECLARE_NOTHING
1271 #define PUSH_FAILURE_POINT(pattern_place, string_place, failure_code) \
1273 DECLARE_DESTINATION; \
1274 /* Must be int, so when we don't save any registers, the arithmetic \
1275 of 0 + -1 isn't done as unsigned. */ \
1278 DEBUG_STATEMENT (failure_id++); \
1279 DEBUG_STATEMENT (nfailure_points_pushed++); \
1280 DEBUG_PRINT2 ("\nPUSH_FAILURE_POINT #%u:\n", failure_id); \
1281 DEBUG_PRINT2 (" Before push, next avail: %lu\n", \
1282 (unsigned long) (fail_stack).avail); \
1283 DEBUG_PRINT2 (" size: %lu\n", \
1284 (unsigned long) (fail_stack).size); \
1286 DEBUG_PRINT2 (" slots needed: %d\n", NUM_FAILURE_ITEMS); \
1287 DEBUG_PRINT2 (" available: %ld\n", \
1288 (long) REMAINING_AVAIL_SLOTS); \
1290 /* Ensure we have enough space allocated for what we will push. */ \
1291 while (REMAINING_AVAIL_SLOTS < NUM_FAILURE_ITEMS) \
1293 if (!DOUBLE_FAIL_STACK (fail_stack)) \
1294 return failure_code; \
1296 DEBUG_PRINT2 ("\n Doubled stack; size now: %lu\n", \
1297 (unsigned long) (fail_stack).size); \
1298 DEBUG_PRINT2 (" slots available: %ld\n", \
1299 (long) REMAINING_AVAIL_SLOTS); \
1302 /* Push the info, starting with the registers. */ \
1303 DEBUG_PRINT1 ("\n"); \
1305 for (this_reg = lowest_active_reg; this_reg <= highest_active_reg; \
1308 DEBUG_PRINT2 (" Pushing reg: %d\n", this_reg); \
1309 DEBUG_STATEMENT (num_regs_pushed++); \
1311 DEBUG_PRINT2 (" start: 0x%lx\n", (long) regstart[this_reg]); \
1312 PUSH_FAILURE_POINTER (regstart[this_reg]); \
1314 DEBUG_PRINT2 (" end: 0x%lx\n", (long) regend[this_reg]); \
1315 PUSH_FAILURE_POINTER (regend[this_reg]); \
1317 DEBUG_PRINT2 (" info: 0x%lx\n ", \
1318 * (long *) (®_info[this_reg])); \
1319 DEBUG_PRINT2 (" match_null=%d", \
1320 REG_MATCH_NULL_STRING_P (reg_info[this_reg])); \
1321 DEBUG_PRINT2 (" active=%d", IS_ACTIVE (reg_info[this_reg])); \
1322 DEBUG_PRINT2 (" matched_something=%d", \
1323 MATCHED_SOMETHING (reg_info[this_reg])); \
1324 DEBUG_PRINT2 (" ever_matched_something=%d", \
1325 EVER_MATCHED_SOMETHING (reg_info[this_reg])); \
1326 DEBUG_PRINT1 ("\n"); \
1327 PUSH_FAILURE_ELT (reg_info[this_reg].word); \
1330 DEBUG_PRINT2 (" Pushing low active reg: %d\n", lowest_active_reg); \
1331 PUSH_FAILURE_INT (lowest_active_reg); \
1333 DEBUG_PRINT2 (" Pushing high active reg: %d\n", highest_active_reg); \
1334 PUSH_FAILURE_INT (highest_active_reg); \
1336 DEBUG_PRINT2 (" Pushing pattern 0x%lx: \n", (long) pattern_place); \
1337 DEBUG_PRINT_COMPILED_PATTERN (bufp, pattern_place, pend); \
1338 PUSH_FAILURE_POINTER (pattern_place); \
1340 DEBUG_PRINT2 (" Pushing string 0x%lx: `", (long) string_place); \
1341 DEBUG_PRINT_DOUBLE_STRING (string_place, string1, size1, string2, \
1343 DEBUG_PRINT1 ("'\n"); \
1344 PUSH_FAILURE_POINTER (string_place); \
1346 DEBUG_PRINT2 (" Pushing failure id: %u\n", failure_id); \
1347 DEBUG_PUSH (failure_id); \
1350 /* This is the number of items that are pushed and popped on the stack
1351 for each register. */
1352 #define NUM_REG_ITEMS 3
1354 /* Individual items aside from the registers. */
1356 #define NUM_NONREG_ITEMS 5 /* Includes failure point id. */
1358 #define NUM_NONREG_ITEMS 4
1361 /* We push at most this many items on the stack. */
1362 /* We used to use (num_regs - 1), which is the number of registers
1363 this regexp will save; but that was changed to 5
1364 to avoid stack overflow for a regexp with lots of parens. */
1365 #define MAX_FAILURE_ITEMS (5 * NUM_REG_ITEMS + NUM_NONREG_ITEMS)
1367 /* We actually push this many items. */
1368 #define NUM_FAILURE_ITEMS \
1369 ((highest_active_reg - lowest_active_reg + 1) * NUM_REG_ITEMS \
1372 /* How many items can still be added to the stack without overflowing it. */
1373 #define REMAINING_AVAIL_SLOTS ((fail_stack).size - (fail_stack).avail)
1376 /* Pops what PUSH_FAIL_STACK pushes.
1378 We restore into the parameters, all of which should be lvalues:
1379 STR -- the saved data position.
1380 PAT -- the saved pattern position.
1381 LOW_REG, HIGH_REG -- the highest and lowest active registers.
1382 REGSTART, REGEND -- arrays of string positions.
1383 REG_INFO -- array of information about each subexpression.
1385 Also assumes the variables `fail_stack' and (if debugging), `bufp',
1386 `pend', `string1', `size1', `string2', and `size2'. */
1388 #define POP_FAILURE_POINT(str, pat, low_reg, high_reg, \
1389 regstart, regend, reg_info) \
1391 DEBUG_STATEMENT (fail_stack_elt_t ffailure_id;) \
1393 const unsigned char *string_temp; \
1395 assert (!FAIL_STACK_EMPTY ()); \
1397 /* Remove failure points and point to how many regs pushed. */ \
1398 DEBUG_PRINT1 ("POP_FAILURE_POINT:\n"); \
1399 DEBUG_PRINT2 (" Before pop, next avail: %lu\n", \
1400 (unsigned long) fail_stack.avail); \
1401 DEBUG_PRINT2 (" size: %lu\n", \
1402 (unsigned long) fail_stack.size); \
1404 assert (fail_stack.avail >= NUM_NONREG_ITEMS); \
1406 DEBUG_POP (&ffailure_id.integer); \
1407 DEBUG_PRINT2 (" Popping failure id: %u\n", \
1408 * (unsigned int *) &ffailure_id); \
1410 /* If the saved string location is NULL, it came from an \
1411 on_failure_keep_string_jump opcode, and we want to throw away the \
1412 saved NULL, thus retaining our current position in the string. */ \
1413 string_temp = POP_FAILURE_POINTER (); \
1414 if (string_temp != NULL) \
1415 str = string_temp; \
1417 DEBUG_PRINT2 (" Popping string 0x%lx: `", (long) str); \
1418 DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2); \
1419 DEBUG_PRINT1 ("'\n"); \
1421 pat = (unsigned char *) POP_FAILURE_POINTER (); \
1422 DEBUG_PRINT2 (" Popping pattern 0x%lx: ", (long) pat); \
1423 DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend); \
1425 /* Restore register info. */ \
1426 high_reg = (unsigned) POP_FAILURE_INT (); \
1427 DEBUG_PRINT2 (" Popping high active reg: %d\n", high_reg); \
1429 low_reg = (unsigned) POP_FAILURE_INT (); \
1430 DEBUG_PRINT2 (" Popping low active reg: %d\n", low_reg); \
1432 for (this_reg = high_reg; this_reg >= low_reg; this_reg--) \
1434 DEBUG_PRINT2 (" Popping reg: %d\n", this_reg); \
1436 reg_info[this_reg].word = POP_FAILURE_ELT (); \
1437 DEBUG_PRINT2 (" info: 0x%lx\n", \
1438 * (long *) ®_info[this_reg]); \
1440 regend[this_reg] = POP_FAILURE_POINTER (); \
1441 DEBUG_PRINT2 (" end: 0x%lx\n", (long) regend[this_reg]); \
1443 regstart[this_reg] = POP_FAILURE_POINTER (); \
1444 DEBUG_PRINT2 (" start: 0x%lx\n", (long) regstart[this_reg]); \
1447 set_regs_matched_done = 0; \
1448 DEBUG_STATEMENT (nfailure_points_popped++); \
1449 } while (0) /* POP_FAILURE_POINT */
1453 /* Structure for per-register (a.k.a. per-group) information.
1454 Other register information, such as the
1455 starting and ending positions (which are addresses), and the list of
1456 inner groups (which is a bits list) are maintained in separate
1459 We are making a (strictly speaking) nonportable assumption here: that
1460 the compiler will pack our bit fields into something that fits into
1461 the type of `word', i.e., is something that fits into one item on the
1466 fail_stack_elt_t word;
1469 /* This field is one if this group can match the empty string,
1470 zero if not. If not yet determined, `MATCH_NULL_UNSET_VALUE'. */
1471 #define MATCH_NULL_UNSET_VALUE 3
1472 unsigned match_null_string_p : 2;
1473 unsigned is_active : 1;
1474 unsigned matched_something : 1;
1475 unsigned ever_matched_something : 1;
1477 } register_info_type;
1479 #define REG_MATCH_NULL_STRING_P(R) ((R).bits.match_null_string_p)
1480 #define IS_ACTIVE(R) ((R).bits.is_active)
1481 #define MATCHED_SOMETHING(R) ((R).bits.matched_something)
1482 #define EVER_MATCHED_SOMETHING(R) ((R).bits.ever_matched_something)
1485 /* Call this when have matched a real character; it sets `matched' flags
1486 for the subexpressions which we are currently inside. Also records
1487 that those subexprs have matched. */
1488 #define SET_REGS_MATCHED() \
1491 if (!set_regs_matched_done) \
1494 set_regs_matched_done = 1; \
1495 for (r = lowest_active_reg; r <= highest_active_reg; r++) \
1497 MATCHED_SOMETHING (reg_info[r]) \
1498 = EVER_MATCHED_SOMETHING (reg_info[r]) \
1505 /* Registers are set to a sentinel when they haven't yet matched. */
1506 static unsigned char reg_unset_dummy;
1507 #define REG_UNSET_VALUE (®_unset_dummy)
1508 #define REG_UNSET(e) ((e) == REG_UNSET_VALUE)
1510 /* Subroutine declarations and macros for regex_compile. */
1512 /* Fetch the next character in the uncompiled pattern---translating it
1513 if necessary. Also cast from a signed character in the constant
1514 string passed to us by the user to an unsigned char that we can use
1515 as an array index (in, e.g., `translate'). */
1516 #define PATFETCH(c) \
1519 c = TRANSLATE (c); \
1522 /* Fetch the next character in the uncompiled pattern, with no
1524 #define PATFETCH_RAW(c) \
1525 do {if (p == pend) return REG_EEND; \
1526 assert (p < pend); \
1527 c = charptr_emchar (p); \
1531 /* Go backwards one character in the pattern. */
1532 #define PATUNFETCH DEC_CHARPTR (p)
1536 #define PATFETCH_EXTENDED(emch) \
1537 do {if (p == pend) return REG_EEND; \
1538 assert (p < pend); \
1539 emch = charptr_emchar ((const Bufbyte *) p); \
1541 if (TRANSLATE_P (translate) && emch < 0x80) \
1542 emch = (Emchar) (unsigned char) RE_TRANSLATE (emch); \
1545 #define PATFETCH_RAW_EXTENDED(emch) \
1546 do {if (p == pend) return REG_EEND; \
1547 assert (p < pend); \
1548 emch = charptr_emchar ((const Bufbyte *) p); \
1552 #define PATUNFETCH_EXTENDED DEC_CHARPTR (p)
1554 #define PATFETCH_EITHER(emch) \
1556 if (has_extended_chars) \
1557 PATFETCH_EXTENDED (emch); \
1562 #define PATFETCH_RAW_EITHER(emch) \
1564 if (has_extended_chars) \
1565 PATFETCH_RAW_EXTENDED (emch); \
1567 PATFETCH_RAW (emch); \
1570 #define PATUNFETCH_EITHER \
1572 if (has_extended_chars) \
1573 PATUNFETCH_EXTENDED (emch); \
1575 PATUNFETCH (emch); \
1578 #else /* not MULE */
1580 #define PATFETCH_EITHER(emch) PATFETCH (emch)
1581 #define PATFETCH_RAW_EITHER(emch) PATFETCH_RAW (emch)
1582 #define PATUNFETCH_EITHER PATUNFETCH
1586 /* If `translate' is non-null, return translate[D], else just D. We
1587 cast the subscript to translate because some data is declared as
1588 `char *', to avoid warnings when a string constant is passed. But
1589 when we use a character as a subscript we must make it unsigned. */
1590 #define TRANSLATE(d) (TRANSLATE_P (translate) ? RE_TRANSLATE (d) : (d))
1594 #define TRANSLATE_EXTENDED_UNSAFE(emch) \
1595 (TRANSLATE_P (translate) && emch < 0x80 ? RE_TRANSLATE (emch) : (emch))
1599 /* Macros for outputting the compiled pattern into `buffer'. */
1601 /* If the buffer isn't allocated when it comes in, use this. */
1602 #define INIT_BUF_SIZE 32
1604 /* Make sure we have at least N more bytes of space in buffer. */
1605 #define GET_BUFFER_SPACE(n) \
1606 while (buf_end - bufp->buffer + (n) > bufp->allocated) \
1609 /* Make sure we have one more byte of buffer space and then add C to it. */
1610 #define BUF_PUSH(c) \
1612 GET_BUFFER_SPACE (1); \
1613 *buf_end++ = (unsigned char) (c); \
1617 /* Ensure we have two more bytes of buffer space and then append C1 and C2. */
1618 #define BUF_PUSH_2(c1, c2) \
1620 GET_BUFFER_SPACE (2); \
1621 *buf_end++ = (unsigned char) (c1); \
1622 *buf_end++ = (unsigned char) (c2); \
1626 /* As with BUF_PUSH_2, except for three bytes. */
1627 #define BUF_PUSH_3(c1, c2, c3) \
1629 GET_BUFFER_SPACE (3); \
1630 *buf_end++ = (unsigned char) (c1); \
1631 *buf_end++ = (unsigned char) (c2); \
1632 *buf_end++ = (unsigned char) (c3); \
1636 /* Store a jump with opcode OP at LOC to location TO. We store a
1637 relative address offset by the three bytes the jump itself occupies. */
1638 #define STORE_JUMP(op, loc, to) \
1639 store_op1 (op, loc, (to) - (loc) - 3)
1641 /* Likewise, for a two-argument jump. */
1642 #define STORE_JUMP2(op, loc, to, arg) \
1643 store_op2 (op, loc, (to) - (loc) - 3, arg)
1645 /* Like `STORE_JUMP', but for inserting. Assume `buf_end' is the
1647 #define INSERT_JUMP(op, loc, to) \
1648 insert_op1 (op, loc, (to) - (loc) - 3, buf_end)
1650 /* Like `STORE_JUMP2', but for inserting. Assume `buf_end' is the
1652 #define INSERT_JUMP2(op, loc, to, arg) \
1653 insert_op2 (op, loc, (to) - (loc) - 3, arg, buf_end)
1656 /* This is not an arbitrary limit: the arguments which represent offsets
1657 into the pattern are two bytes long. So if 2^16 bytes turns out to
1658 be too small, many things would have to change. */
1659 #define MAX_BUF_SIZE (1L << 16)
1662 /* Extend the buffer by twice its current size via realloc and
1663 reset the pointers that pointed into the old block to point to the
1664 correct places in the new one. If extending the buffer results in it
1665 being larger than MAX_BUF_SIZE, then flag memory exhausted. */
1666 #define EXTEND_BUFFER() \
1668 re_char *old_buffer = bufp->buffer; \
1669 if (bufp->allocated == MAX_BUF_SIZE) \
1671 bufp->allocated <<= 1; \
1672 if (bufp->allocated > MAX_BUF_SIZE) \
1673 bufp->allocated = MAX_BUF_SIZE; \
1674 bufp->buffer = (unsigned char *) realloc (bufp->buffer, bufp->allocated);\
1675 if (bufp->buffer == NULL) \
1676 return REG_ESPACE; \
1677 /* If the buffer moved, move all the pointers into it. */ \
1678 if (old_buffer != bufp->buffer) \
1680 buf_end = (buf_end - old_buffer) + bufp->buffer; \
1681 begalt = (begalt - old_buffer) + bufp->buffer; \
1682 if (fixup_alt_jump) \
1683 fixup_alt_jump = (fixup_alt_jump - old_buffer) + bufp->buffer;\
1685 laststart = (laststart - old_buffer) + bufp->buffer; \
1686 if (pending_exact) \
1687 pending_exact = (pending_exact - old_buffer) + bufp->buffer; \
1692 /* Since we have one byte reserved for the register number argument to
1693 {start,stop}_memory, the maximum number of groups we can report
1694 things about is what fits in that byte. */
1695 #define MAX_REGNUM 255
1697 /* But patterns can have more than `MAX_REGNUM' registers. We just
1698 ignore the excess. */
1699 typedef unsigned regnum_t;
1702 /* Macros for the compile stack. */
1704 /* Since offsets can go either forwards or backwards, this type needs to
1705 be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1. */
1706 typedef int pattern_offset_t;
1710 pattern_offset_t begalt_offset;
1711 pattern_offset_t fixup_alt_jump;
1712 pattern_offset_t inner_group_offset;
1713 pattern_offset_t laststart_offset;
1715 } compile_stack_elt_t;
1720 compile_stack_elt_t *stack;
1722 unsigned avail; /* Offset of next open position. */
1723 } compile_stack_type;
1726 #define INIT_COMPILE_STACK_SIZE 32
1728 #define COMPILE_STACK_EMPTY (compile_stack.avail == 0)
1729 #define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size)
1731 /* The next available element. */
1732 #define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])
1735 /* Set the bit for character C in a bit vector. */
1736 #define SET_LIST_BIT(c) \
1737 (buf_end[((unsigned char) (c)) / BYTEWIDTH] \
1738 |= 1 << (((unsigned char) c) % BYTEWIDTH))
1742 /* Set the "bit" for character C in a range table. */
1743 #define SET_RANGETAB_BIT(c) put_range_table (rtab, c, c, Qt)
1745 /* Set the "bit" for character c in the appropriate table. */
1746 #define SET_EITHER_BIT(c) \
1748 if (has_extended_chars) \
1749 SET_RANGETAB_BIT (c); \
1754 #else /* not MULE */
1756 #define SET_EITHER_BIT(c) SET_LIST_BIT (c)
1761 /* Get the next unsigned number in the uncompiled pattern. */
1762 #define GET_UNSIGNED_NUMBER(num) \
1766 while (ISDIGIT (c)) \
1770 num = num * 10 + c - '0'; \
1778 #define CHAR_CLASS_MAX_LENGTH 6 /* Namely, `xdigit'. */
1780 #define IS_CHAR_CLASS(string) \
1781 (STREQ (string, "alpha") || STREQ (string, "upper") \
1782 || STREQ (string, "lower") || STREQ (string, "digit") \
1783 || STREQ (string, "alnum") || STREQ (string, "xdigit") \
1784 || STREQ (string, "space") || STREQ (string, "print") \
1785 || STREQ (string, "punct") || STREQ (string, "graph") \
1786 || STREQ (string, "cntrl") || STREQ (string, "blank"))
1788 static void store_op1 (re_opcode_t op, unsigned char *loc, int arg);
1789 static void store_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2);
1790 static void insert_op1 (re_opcode_t op, unsigned char *loc, int arg,
1791 unsigned char *end);
1792 static void insert_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2,
1793 unsigned char *end);
1794 static re_bool at_begline_loc_p (re_char *pattern, re_char *p,
1795 reg_syntax_t syntax);
1796 static re_bool at_endline_loc_p (re_char *p, re_char *pend, int syntax);
1797 static re_bool group_in_compile_stack (compile_stack_type compile_stack,
1799 static reg_errcode_t compile_range (re_char **p_ptr, re_char *pend,
1800 RE_TRANSLATE_TYPE translate,
1801 reg_syntax_t syntax,
1804 static reg_errcode_t compile_extended_range (re_char **p_ptr,
1806 RE_TRANSLATE_TYPE translate,
1807 reg_syntax_t syntax,
1810 static re_bool group_match_null_string_p (unsigned char **p,
1812 register_info_type *reg_info);
1813 static re_bool alt_match_null_string_p (unsigned char *p, unsigned char *end,
1814 register_info_type *reg_info);
1815 static re_bool common_op_match_null_string_p (unsigned char **p,
1817 register_info_type *reg_info);
1818 static int bcmp_translate (const unsigned char *s1, const unsigned char *s2,
1819 REGISTER int len, RE_TRANSLATE_TYPE translate);
1820 static int re_match_2_internal (struct re_pattern_buffer *bufp,
1821 re_char *string1, int size1,
1822 re_char *string2, int size2, int pos,
1823 struct re_registers *regs, int stop);
1825 #ifndef MATCH_MAY_ALLOCATE
1827 /* If we cannot allocate large objects within re_match_2_internal,
1828 we make the fail stack and register vectors global.
1829 The fail stack, we grow to the maximum size when a regexp
1831 The register vectors, we adjust in size each time we
1832 compile a regexp, according to the number of registers it needs. */
1834 static fail_stack_type fail_stack;
1836 /* Size with which the following vectors are currently allocated.
1837 That is so we can make them bigger as needed,
1838 but never make them smaller. */
1839 static int regs_allocated_size;
1841 static re_char ** regstart, ** regend;
1842 static re_char ** old_regstart, ** old_regend;
1843 static re_char **best_regstart, **best_regend;
1844 static register_info_type *reg_info;
1845 static re_char **reg_dummy;
1846 static register_info_type *reg_info_dummy;
1848 /* Make the register vectors big enough for NUM_REGS registers,
1849 but don't make them smaller. */
1852 regex_grow_registers (int num_regs)
1854 if (num_regs > regs_allocated_size)
1856 RETALLOC_IF (regstart, num_regs, re_char *);
1857 RETALLOC_IF (regend, num_regs, re_char *);
1858 RETALLOC_IF (old_regstart, num_regs, re_char *);
1859 RETALLOC_IF (old_regend, num_regs, re_char *);
1860 RETALLOC_IF (best_regstart, num_regs, re_char *);
1861 RETALLOC_IF (best_regend, num_regs, re_char *);
1862 RETALLOC_IF (reg_info, num_regs, register_info_type);
1863 RETALLOC_IF (reg_dummy, num_regs, re_char *);
1864 RETALLOC_IF (reg_info_dummy, num_regs, register_info_type);
1866 regs_allocated_size = num_regs;
1870 #endif /* not MATCH_MAY_ALLOCATE */
1872 /* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX.
1873 Returns one of error codes defined in `regex.h', or zero for success.
1875 Assumes the `allocated' (and perhaps `buffer') and `translate'
1876 fields are set in BUFP on entry.
1878 If it succeeds, results are put in BUFP (if it returns an error, the
1879 contents of BUFP are undefined):
1880 `buffer' is the compiled pattern;
1881 `syntax' is set to SYNTAX;
1882 `used' is set to the length of the compiled pattern;
1883 `fastmap_accurate' is zero;
1884 `re_nsub' is the number of subexpressions in PATTERN;
1885 `not_bol' and `not_eol' are zero;
1887 The `fastmap' and `newline_anchor' fields are neither
1888 examined nor set. */
1890 /* Return, freeing storage we allocated. */
1891 #define FREE_STACK_RETURN(value) \
1892 return (free (compile_stack.stack), value)
1894 static reg_errcode_t
1895 regex_compile (re_char *pattern, int size, reg_syntax_t syntax,
1896 struct re_pattern_buffer *bufp)
1898 /* We fetch characters from PATTERN here. We declare these as int
1899 (or possibly long) so that chars above 127 can be used as
1900 array indices. The macros that fetch a character from the pattern
1901 make sure to coerce to unsigned char before assigning, so we won't
1902 get bitten by negative numbers here. */
1903 /* XEmacs change: used to be unsigned char. */
1904 REGISTER EMACS_INT c, c1;
1906 /* A random temporary spot in PATTERN. */
1909 /* Points to the end of the buffer, where we should append. */
1910 REGISTER unsigned char *buf_end;
1912 /* Keeps track of unclosed groups. */
1913 compile_stack_type compile_stack;
1915 /* Points to the current (ending) position in the pattern. */
1916 re_char *p = pattern;
1917 re_char *pend = pattern + size;
1919 /* How to translate the characters in the pattern. */
1920 RE_TRANSLATE_TYPE translate = bufp->translate;
1922 /* Address of the count-byte of the most recently inserted `exactn'
1923 command. This makes it possible to tell if a new exact-match
1924 character can be added to that command or if the character requires
1925 a new `exactn' command. */
1926 unsigned char *pending_exact = 0;
1928 /* Address of start of the most recently finished expression.
1929 This tells, e.g., postfix * where to find the start of its
1930 operand. Reset at the beginning of groups and alternatives. */
1931 unsigned char *laststart = 0;
1933 /* Address of beginning of regexp, or inside of last group. */
1934 unsigned char *begalt;
1936 /* Place in the uncompiled pattern (i.e., the {) to
1937 which to go back if the interval is invalid. */
1938 re_char *beg_interval;
1940 /* Address of the place where a forward jump should go to the end of
1941 the containing expression. Each alternative of an `or' -- except the
1942 last -- ends with a forward jump of this sort. */
1943 unsigned char *fixup_alt_jump = 0;
1945 /* Counts open-groups as they are encountered. Remembered for the
1946 matching close-group on the compile stack, so the same register
1947 number is put in the stop_memory as the start_memory. */
1948 regnum_t regnum = 0;
1951 DEBUG_PRINT1 ("\nCompiling pattern: ");
1954 unsigned debug_count;
1956 for (debug_count = 0; debug_count < size; debug_count++)
1957 putchar (pattern[debug_count]);
1962 /* Initialize the compile stack. */
1963 compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t);
1964 if (compile_stack.stack == NULL)
1967 compile_stack.size = INIT_COMPILE_STACK_SIZE;
1968 compile_stack.avail = 0;
1970 /* Initialize the pattern buffer. */
1971 bufp->syntax = syntax;
1972 bufp->fastmap_accurate = 0;
1973 bufp->not_bol = bufp->not_eol = 0;
1975 /* Set `used' to zero, so that if we return an error, the pattern
1976 printer (for debugging) will think there's no pattern. We reset it
1980 /* Always count groups, whether or not bufp->no_sub is set. */
1983 #if !defined (emacs) && !defined (SYNTAX_TABLE)
1984 /* Initialize the syntax table. */
1985 init_syntax_once ();
1988 if (bufp->allocated == 0)
1991 { /* If zero allocated, but buffer is non-null, try to realloc
1992 enough space. This loses if buffer's address is bogus, but
1993 that is the user's responsibility. */
1994 RETALLOC (bufp->buffer, INIT_BUF_SIZE, unsigned char);
1997 { /* Caller did not allocate a buffer. Do it for them. */
1998 bufp->buffer = TALLOC (INIT_BUF_SIZE, unsigned char);
2000 if (!bufp->buffer) FREE_STACK_RETURN (REG_ESPACE);
2002 bufp->allocated = INIT_BUF_SIZE;
2005 begalt = buf_end = bufp->buffer;
2007 /* Loop through the uncompiled pattern until we're at the end. */
2016 if ( /* If at start of pattern, it's an operator. */
2018 /* If context independent, it's an operator. */
2019 || syntax & RE_CONTEXT_INDEP_ANCHORS
2020 /* Otherwise, depends on what's come before. */
2021 || at_begline_loc_p (pattern, p, syntax))
2031 if ( /* If at end of pattern, it's an operator. */
2033 /* If context independent, it's an operator. */
2034 || syntax & RE_CONTEXT_INDEP_ANCHORS
2035 /* Otherwise, depends on what's next. */
2036 || at_endline_loc_p (p, pend, syntax))
2046 if ((syntax & RE_BK_PLUS_QM)
2047 || (syntax & RE_LIMITED_OPS))
2051 /* If there is no previous pattern... */
2054 if (syntax & RE_CONTEXT_INVALID_OPS)
2055 FREE_STACK_RETURN (REG_BADRPT);
2056 else if (!(syntax & RE_CONTEXT_INDEP_OPS))
2061 /* true means zero/many matches are allowed. */
2062 re_bool zero_times_ok = c != '+';
2063 re_bool many_times_ok = c != '?';
2065 /* true means match shortest string possible. */
2066 re_bool minimal = false;
2068 /* If there is a sequence of repetition chars, collapse it
2069 down to just one (the right one). We can't combine
2070 interval operators with these because of, e.g., `a{2}*',
2071 which should only match an even number of `a's. */
2076 if (c == '*' || (!(syntax & RE_BK_PLUS_QM)
2077 && (c == '+' || c == '?')))
2080 else if (syntax & RE_BK_PLUS_QM && c == '\\')
2082 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
2085 if (!(c1 == '+' || c1 == '?'))
2100 /* If we get here, we found another repeat character. */
2101 if (!(syntax & RE_NO_MINIMAL_MATCHING))
2103 /* "*?" and "+?" and "??" are okay (and mean match
2104 minimally), but other sequences (such as "*??" and
2105 "+++") are rejected (reserved for future use). */
2106 if (minimal || c != '?')
2107 FREE_STACK_RETURN (REG_BADRPT);
2112 zero_times_ok |= c != '+';
2113 many_times_ok |= c != '?';
2117 /* Star, etc. applied to an empty pattern is equivalent
2118 to an empty pattern. */
2122 /* Now we know whether zero matches is allowed
2123 and whether two or more matches is allowed
2124 and whether we want minimal or maximal matching. */
2130 0: /on_failure_jump to 6
2135 GET_BUFFER_SPACE (6);
2136 INSERT_JUMP (jump, laststart, buf_end + 3);
2138 INSERT_JUMP (on_failure_jump, laststart, laststart + 6);
2141 else if (zero_times_ok)
2146 6: /on_failure_jump to 3
2149 GET_BUFFER_SPACE (6);
2150 INSERT_JUMP (jump, laststart, buf_end + 3);
2152 STORE_JUMP (on_failure_jump, buf_end, laststart + 3);
2159 3: /on_failure_jump to 0
2162 GET_BUFFER_SPACE (3);
2163 STORE_JUMP (on_failure_jump, buf_end, laststart);
2169 /* Are we optimizing this jump? */
2170 re_bool keep_string_p = false;
2173 { /* More than one repetition is allowed, so put in
2174 at the end a backward relative jump from
2175 `buf_end' to before the next jump we're going
2176 to put in below (which jumps from laststart to
2179 But if we are at the `*' in the exact sequence `.*\n',
2180 insert an unconditional jump backwards to the .,
2181 instead of the beginning of the loop. This way we only
2182 push a failure point once, instead of every time
2183 through the loop. */
2184 assert (p - 1 > pattern);
2186 /* Allocate the space for the jump. */
2187 GET_BUFFER_SPACE (3);
2189 /* We know we are not at the first character of the
2190 pattern, because laststart was nonzero. And we've
2191 already incremented `p', by the way, to be the
2192 character after the `*'. Do we have to do something
2193 analogous here for null bytes, because of
2197 && p < pend && *p == '\n'
2198 && !(syntax & RE_DOT_NEWLINE))
2199 { /* We have .*\n. */
2200 STORE_JUMP (jump, buf_end, laststart);
2201 keep_string_p = true;
2204 /* Anything else. */
2205 STORE_JUMP (maybe_pop_jump, buf_end, laststart - 3);
2207 /* We've added more stuff to the buffer. */
2211 /* On failure, jump from laststart to buf_end + 3,
2212 which will be the end of the buffer after this jump
2214 GET_BUFFER_SPACE (3);
2215 INSERT_JUMP (keep_string_p ? on_failure_keep_string_jump
2217 laststart, buf_end + 3);
2222 /* At least one repetition is required, so insert a
2223 `dummy_failure_jump' before the initial
2224 `on_failure_jump' instruction of the loop. This
2225 effects a skip over that instruction the first time
2226 we hit that loop. */
2227 GET_BUFFER_SPACE (3);
2228 INSERT_JUMP (dummy_failure_jump, laststart, laststart + 6);
2238 laststart = buf_end;
2245 /* XEmacs change: this whole section */
2246 re_bool had_char_class = false;
2248 re_bool has_extended_chars = false;
2249 REGISTER Lisp_Object rtab = Qnil;
2252 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2254 /* Ensure that we have enough space to push a charset: the
2255 opcode, the length count, and the bitset; 34 bytes in all. */
2256 GET_BUFFER_SPACE (34);
2258 laststart = buf_end;
2260 /* We test `*p == '^' twice, instead of using an if
2261 statement, so we only need one BUF_PUSH. */
2262 BUF_PUSH (*p == '^' ? charset_not : charset);
2266 /* Remember the first position in the bracket expression. */
2269 /* Push the number of bytes in the bitmap. */
2270 BUF_PUSH ((1 << BYTEWIDTH) / BYTEWIDTH);
2272 /* Clear the whole map. */
2273 memset (buf_end, 0, (1 << BYTEWIDTH) / BYTEWIDTH);
2275 /* charset_not matches newline according to a syntax bit. */
2276 if ((re_opcode_t) buf_end[-2] == charset_not
2277 && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
2278 SET_LIST_BIT ('\n');
2281 start_over_with_extended:
2282 if (has_extended_chars)
2284 /* There are extended chars here, which means we need to start
2285 over and shift to unified range-table format. */
2286 if (buf_end[-2] == charset)
2287 buf_end[-2] = charset_mule;
2289 buf_end[-2] = charset_mule_not;
2291 p = p1; /* go back to the beginning of the charset, after
2293 rtab = Vthe_lisp_rangetab;
2294 Fclear_range_table (rtab);
2296 /* charset_not matches newline according to a syntax bit. */
2297 if ((re_opcode_t) buf_end[-1] == charset_mule_not
2298 && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
2299 SET_EITHER_BIT ('\n');
2303 /* Read in characters and ranges, setting map bits. */
2306 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2311 if (c >= 0x80 && !has_extended_chars)
2313 has_extended_chars = 1;
2314 /* Frumble-bumble, we've found some extended chars.
2315 Need to start over, process everything using
2316 the general extended-char mechanism, and need
2317 to use charset_mule and charset_mule_not instead
2318 of charset and charset_not. */
2319 goto start_over_with_extended;
2322 /* \ might escape characters inside [...] and [^...]. */
2323 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\')
2325 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
2329 if (c1 >= 0x80 && !has_extended_chars)
2331 has_extended_chars = 1;
2332 goto start_over_with_extended;
2335 SET_EITHER_BIT (c1);
2339 /* Could be the end of the bracket expression. If it's
2340 not (i.e., when the bracket expression is `[]' so
2341 far), the ']' character bit gets set way below. */
2342 if (c == ']' && p != p1 + 1)
2345 /* Look ahead to see if it's a range when the last thing
2346 was a character class. */
2347 if (had_char_class && c == '-' && *p != ']')
2348 FREE_STACK_RETURN (REG_ERANGE);
2350 /* Look ahead to see if it's a range when the last thing
2351 was a character: if this is a hyphen not at the
2352 beginning or the end of a list, then it's the range
2355 && !(p - 2 >= pattern && p[-2] == '[')
2356 && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^')
2362 if (* (unsigned char *) p >= 0x80 && !has_extended_chars)
2364 has_extended_chars = 1;
2365 goto start_over_with_extended;
2367 if (has_extended_chars)
2368 ret = compile_extended_range (&p, pend, translate,
2372 ret = compile_range (&p, pend, translate, syntax, buf_end);
2373 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret);
2376 else if (p[0] == '-' && p[1] != ']')
2377 { /* This handles ranges made up of characters only. */
2380 /* Move past the `-'. */
2384 if (* (unsigned char *) p >= 0x80 && !has_extended_chars)
2386 has_extended_chars = 1;
2387 goto start_over_with_extended;
2389 if (has_extended_chars)
2390 ret = compile_extended_range (&p, pend, translate,
2394 ret = compile_range (&p, pend, translate, syntax, buf_end);
2395 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret);
2398 /* See if we're at the beginning of a possible character
2401 else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == ':')
2402 { /* Leave room for the null. */
2403 char str[CHAR_CLASS_MAX_LENGTH + 1];
2408 /* If pattern is `[[:'. */
2409 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2413 /* #### This code is unused.
2414 Correctness is not checked after TRT
2417 if (c == ':' || c == ']' || p == pend
2418 || c1 == CHAR_CLASS_MAX_LENGTH)
2420 str[c1++] = (char) c;
2424 /* If isn't a word bracketed by `[:' and `:]':
2425 undo the ending character, the letters, and leave
2426 the leading `:' and `[' (but set bits for them). */
2427 if (c == ':' && *p == ']')
2430 re_bool is_alnum = STREQ (str, "alnum");
2431 re_bool is_alpha = STREQ (str, "alpha");
2432 re_bool is_blank = STREQ (str, "blank");
2433 re_bool is_cntrl = STREQ (str, "cntrl");
2434 re_bool is_digit = STREQ (str, "digit");
2435 re_bool is_graph = STREQ (str, "graph");
2436 re_bool is_lower = STREQ (str, "lower");
2437 re_bool is_print = STREQ (str, "print");
2438 re_bool is_punct = STREQ (str, "punct");
2439 re_bool is_space = STREQ (str, "space");
2440 re_bool is_upper = STREQ (str, "upper");
2441 re_bool is_xdigit = STREQ (str, "xdigit");
2443 if (!IS_CHAR_CLASS (str))
2444 FREE_STACK_RETURN (REG_ECTYPE);
2446 /* Throw away the ] at the end of the character
2450 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2452 for (ch = 0; ch < 1 << BYTEWIDTH; ch++)
2454 /* This was split into 3 if's to
2455 avoid an arbitrary limit in some compiler. */
2456 if ( (is_alnum && ISALNUM (ch))
2457 || (is_alpha && ISALPHA (ch))
2458 || (is_blank && ISBLANK (ch))
2459 || (is_cntrl && ISCNTRL (ch)))
2460 SET_EITHER_BIT (ch);
2461 if ( (is_digit && ISDIGIT (ch))
2462 || (is_graph && ISGRAPH (ch))
2463 || (is_lower && ISLOWER (ch))
2464 || (is_print && ISPRINT (ch)))
2465 SET_EITHER_BIT (ch);
2466 if ( (is_punct && ISPUNCT (ch))
2467 || (is_space && ISSPACE (ch))
2468 || (is_upper && ISUPPER (ch))
2469 || (is_xdigit && ISXDIGIT (ch)))
2470 SET_EITHER_BIT (ch);
2472 had_char_class = true;
2479 SET_EITHER_BIT ('[');
2480 SET_EITHER_BIT (':');
2481 had_char_class = false;
2486 had_char_class = false;
2492 if (has_extended_chars)
2494 /* We have a range table, not a bit vector. */
2496 unified_range_table_bytes_needed (rtab);
2497 GET_BUFFER_SPACE (bytes_needed);
2498 unified_range_table_copy_data (rtab, buf_end);
2499 buf_end += unified_range_table_bytes_used (buf_end);
2503 /* Discard any (non)matching list bytes that are all 0 at the
2504 end of the map. Decrease the map-length byte too. */
2505 while ((int) buf_end[-1] > 0 && buf_end[buf_end[-1] - 1] == 0)
2507 buf_end += buf_end[-1];
2513 if (syntax & RE_NO_BK_PARENS)
2520 if (syntax & RE_NO_BK_PARENS)
2527 if (syntax & RE_NEWLINE_ALT)
2534 if (syntax & RE_NO_BK_VBAR)
2541 if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES)
2542 goto handle_interval;
2548 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
2550 /* Do not translate the character after the \, so that we can
2551 distinguish, e.g., \B from \b, even if we normally would
2552 translate, e.g., B to b. */
2558 if (syntax & RE_NO_BK_PARENS)
2559 goto normal_backslash;
2565 if (!(syntax & RE_NO_SHY_GROUPS)
2573 case ':': /* shy groups */
2577 /* All others are reserved for future constructs. */
2579 FREE_STACK_RETURN (REG_BADPAT);
2588 if (COMPILE_STACK_FULL)
2590 RETALLOC (compile_stack.stack, compile_stack.size << 1,
2591 compile_stack_elt_t);
2592 if (compile_stack.stack == NULL) return REG_ESPACE;
2594 compile_stack.size <<= 1;
2597 /* These are the values to restore when we hit end of this
2598 group. They are all relative offsets, so that if the
2599 whole pattern moves because of realloc, they will still
2601 COMPILE_STACK_TOP.begalt_offset = begalt - bufp->buffer;
2602 COMPILE_STACK_TOP.fixup_alt_jump
2603 = fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0;
2604 COMPILE_STACK_TOP.laststart_offset = buf_end - bufp->buffer;
2605 COMPILE_STACK_TOP.regnum = r;
2607 /* We will eventually replace the 0 with the number of
2608 groups inner to this one. But do not push a
2609 start_memory for groups beyond the last one we can
2610 represent in the compiled pattern. */
2611 if (r <= MAX_REGNUM)
2613 COMPILE_STACK_TOP.inner_group_offset
2614 = buf_end - bufp->buffer + 2;
2615 BUF_PUSH_3 (start_memory, r, 0);
2618 compile_stack.avail++;
2623 /* If we've reached MAX_REGNUM groups, then this open
2624 won't actually generate any code, so we'll have to
2625 clear pending_exact explicitly. */
2632 if (syntax & RE_NO_BK_PARENS) goto normal_backslash;
2634 if (COMPILE_STACK_EMPTY) {
2635 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
2636 goto normal_backslash;
2638 FREE_STACK_RETURN (REG_ERPAREN);
2643 { /* Push a dummy failure point at the end of the
2644 alternative for a possible future
2645 `pop_failure_jump' to pop. See comments at
2646 `push_dummy_failure' in `re_match_2'. */
2647 BUF_PUSH (push_dummy_failure);
2649 /* We allocated space for this jump when we assigned
2650 to `fixup_alt_jump', in the `handle_alt' case below. */
2651 STORE_JUMP (jump_past_alt, fixup_alt_jump, buf_end - 1);
2654 /* See similar code for backslashed left paren above. */
2655 if (COMPILE_STACK_EMPTY) {
2656 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
2659 FREE_STACK_RETURN (REG_ERPAREN);
2662 /* Since we just checked for an empty stack above, this
2663 ``can't happen''. */
2664 assert (compile_stack.avail != 0);
2666 /* We don't just want to restore into `regnum', because
2667 later groups should continue to be numbered higher,
2668 as in `(ab)c(de)' -- the second group is #2. */
2669 regnum_t this_group_regnum;
2671 compile_stack.avail--;
2672 begalt = bufp->buffer + COMPILE_STACK_TOP.begalt_offset;
2674 = COMPILE_STACK_TOP.fixup_alt_jump
2675 ? bufp->buffer + COMPILE_STACK_TOP.fixup_alt_jump - 1
2677 laststart = bufp->buffer + COMPILE_STACK_TOP.laststart_offset;
2678 this_group_regnum = COMPILE_STACK_TOP.regnum;
2679 /* If we've reached MAX_REGNUM groups, then this open
2680 won't actually generate any code, so we'll have to
2681 clear pending_exact explicitly. */
2684 /* We're at the end of the group, so now we know how many
2685 groups were inside this one. */
2686 if (this_group_regnum <= MAX_REGNUM)
2688 unsigned char *inner_group_loc
2689 = bufp->buffer + COMPILE_STACK_TOP.inner_group_offset;
2691 *inner_group_loc = regnum - this_group_regnum;
2692 BUF_PUSH_3 (stop_memory, this_group_regnum,
2693 regnum - this_group_regnum);
2699 case '|': /* `\|'. */
2700 if (syntax & RE_LIMITED_OPS || syntax & RE_NO_BK_VBAR)
2701 goto normal_backslash;
2703 if (syntax & RE_LIMITED_OPS)
2706 /* Insert before the previous alternative a jump which
2707 jumps to this alternative if the former fails. */
2708 GET_BUFFER_SPACE (3);
2709 INSERT_JUMP (on_failure_jump, begalt, buf_end + 6);
2713 /* The alternative before this one has a jump after it
2714 which gets executed if it gets matched. Adjust that
2715 jump so it will jump to this alternative's analogous
2716 jump (put in below, which in turn will jump to the next
2717 (if any) alternative's such jump, etc.). The last such
2718 jump jumps to the correct final destination. A picture:
2724 If we are at `b', then fixup_alt_jump right now points to a
2725 three-byte space after `a'. We'll put in the jump, set
2726 fixup_alt_jump to right after `b', and leave behind three
2727 bytes which we'll fill in when we get to after `c'. */
2730 STORE_JUMP (jump_past_alt, fixup_alt_jump, buf_end);
2732 /* Mark and leave space for a jump after this alternative,
2733 to be filled in later either by next alternative or
2734 when know we're at the end of a series of alternatives. */
2735 fixup_alt_jump = buf_end;
2736 GET_BUFFER_SPACE (3);
2745 /* If \{ is a literal. */
2746 if (!(syntax & RE_INTERVALS)
2747 /* If we're at `\{' and it's not the open-interval
2749 || ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
2750 || (p - 2 == pattern && p == pend))
2751 goto normal_backslash;
2755 /* If got here, then the syntax allows intervals. */
2757 /* At least (most) this many matches must be made. */
2758 int lower_bound = -1, upper_bound = -1;
2760 beg_interval = p - 1;
2764 if (syntax & RE_NO_BK_BRACES)
2765 goto unfetch_interval;
2767 FREE_STACK_RETURN (REG_EBRACE);
2770 GET_UNSIGNED_NUMBER (lower_bound);
2774 GET_UNSIGNED_NUMBER (upper_bound);
2775 if (upper_bound < 0) upper_bound = RE_DUP_MAX;
2778 /* Interval such as `{1}' => match exactly once. */
2779 upper_bound = lower_bound;
2781 if (lower_bound < 0 || upper_bound > RE_DUP_MAX
2782 || lower_bound > upper_bound)
2784 if (syntax & RE_NO_BK_BRACES)
2785 goto unfetch_interval;
2787 FREE_STACK_RETURN (REG_BADBR);
2790 if (!(syntax & RE_NO_BK_BRACES))
2792 if (c != '\\') FREE_STACK_RETURN (REG_EBRACE);
2799 if (syntax & RE_NO_BK_BRACES)
2800 goto unfetch_interval;
2802 FREE_STACK_RETURN (REG_BADBR);
2805 /* We just parsed a valid interval. */
2807 /* If it's invalid to have no preceding re. */
2810 if (syntax & RE_CONTEXT_INVALID_OPS)
2811 FREE_STACK_RETURN (REG_BADRPT);
2812 else if (syntax & RE_CONTEXT_INDEP_OPS)
2813 laststart = buf_end;
2815 goto unfetch_interval;
2818 /* If the upper bound is zero, don't want to succeed at
2819 all; jump from `laststart' to `b + 3', which will be
2820 the end of the buffer after we insert the jump. */
2821 if (upper_bound == 0)
2823 GET_BUFFER_SPACE (3);
2824 INSERT_JUMP (jump, laststart, buf_end + 3);
2828 /* Otherwise, we have a nontrivial interval. When
2829 we're all done, the pattern will look like:
2830 set_number_at <jump count> <upper bound>
2831 set_number_at <succeed_n count> <lower bound>
2832 succeed_n <after jump addr> <succeed_n count>
2834 jump_n <succeed_n addr> <jump count>
2835 (The upper bound and `jump_n' are omitted if
2836 `upper_bound' is 1, though.) */
2838 { /* If the upper bound is > 1, we need to insert
2839 more at the end of the loop. */
2840 unsigned nbytes = 10 + (upper_bound > 1) * 10;
2842 GET_BUFFER_SPACE (nbytes);
2844 /* Initialize lower bound of the `succeed_n', even
2845 though it will be set during matching by its
2846 attendant `set_number_at' (inserted next),
2847 because `re_compile_fastmap' needs to know.
2848 Jump to the `jump_n' we might insert below. */
2849 INSERT_JUMP2 (succeed_n, laststart,
2850 buf_end + 5 + (upper_bound > 1) * 5,
2854 /* Code to initialize the lower bound. Insert
2855 before the `succeed_n'. The `5' is the last two
2856 bytes of this `set_number_at', plus 3 bytes of
2857 the following `succeed_n'. */
2858 insert_op2 (set_number_at, laststart, 5, lower_bound, buf_end);
2861 if (upper_bound > 1)
2862 { /* More than one repetition is allowed, so
2863 append a backward jump to the `succeed_n'
2864 that starts this interval.
2866 When we've reached this during matching,
2867 we'll have matched the interval once, so
2868 jump back only `upper_bound - 1' times. */
2869 STORE_JUMP2 (jump_n, buf_end, laststart + 5,
2873 /* The location we want to set is the second
2874 parameter of the `jump_n'; that is `b-2' as
2875 an absolute address. `laststart' will be
2876 the `set_number_at' we're about to insert;
2877 `laststart+3' the number to set, the source
2878 for the relative address. But we are
2879 inserting into the middle of the pattern --
2880 so everything is getting moved up by 5.
2881 Conclusion: (b - 2) - (laststart + 3) + 5,
2882 i.e., b - laststart.
2884 We insert this at the beginning of the loop
2885 so that if we fail during matching, we'll
2886 reinitialize the bounds. */
2887 insert_op2 (set_number_at, laststart,
2888 buf_end - laststart,
2889 upper_bound - 1, buf_end);
2894 beg_interval = NULL;
2899 /* If an invalid interval, match the characters as literals. */
2900 assert (beg_interval);
2902 beg_interval = NULL;
2904 /* normal_char and normal_backslash need `c'. */
2907 if (!(syntax & RE_NO_BK_BRACES))
2909 if (p > pattern && p[-1] == '\\')
2910 goto normal_backslash;
2915 /* There is no way to specify the before_dot and after_dot
2916 operators. rms says this is ok. --karl */
2922 laststart = buf_end;
2924 /* XEmacs addition */
2925 if (c >= 0x80 || syntax_spec_code[c] == 0377)
2926 FREE_STACK_RETURN (REG_ESYNTAX);
2927 BUF_PUSH_2 (syntaxspec, syntax_spec_code[c]);
2931 laststart = buf_end;
2933 /* XEmacs addition */
2934 if (c >= 0x80 || syntax_spec_code[c] == 0377)
2935 FREE_STACK_RETURN (REG_ESYNTAX);
2936 BUF_PUSH_2 (notsyntaxspec, syntax_spec_code[c]);
2940 /* 97.2.17 jhod merged in to XEmacs from mule-2.3 */
2942 laststart = buf_end;
2944 if (c < 32 || c > 127)
2945 FREE_STACK_RETURN (REG_ECATEGORY);
2946 BUF_PUSH_2 (categoryspec, c);
2950 laststart = buf_end;
2952 if (c < 32 || c > 127)
2953 FREE_STACK_RETURN (REG_ECATEGORY);
2954 BUF_PUSH_2 (notcategoryspec, c);
2956 /* end of category patch */
2962 laststart = buf_end;
2963 BUF_PUSH (wordchar);
2968 laststart = buf_end;
2969 BUF_PUSH (notwordchar);
2982 BUF_PUSH (wordbound);
2986 BUF_PUSH (notwordbound);
2997 case '1': case '2': case '3': case '4': case '5':
2998 case '6': case '7': case '8': case '9':
3001 if (syntax & RE_NO_BK_REFS)
3007 FREE_STACK_RETURN (REG_ESUBREG);
3009 /* Can't back reference to a subexpression if inside of it. */
3010 if (group_in_compile_stack (compile_stack, reg))
3013 laststart = buf_end;
3014 BUF_PUSH_2 (duplicate, reg);
3021 if (syntax & RE_BK_PLUS_QM)
3024 goto normal_backslash;
3028 /* You might think it would be useful for \ to mean
3029 not to translate; but if we don't translate it,
3030 it will never match anything. */
3038 /* Expects the character in `c'. */
3039 /* `p' points to the location after where `c' came from. */
3042 /* XEmacs: modifications here for Mule. */
3043 /* `q' points to the beginning of the next char. */
3046 /* If no exactn currently being built. */
3049 /* If last exactn not at current position. */
3050 || pending_exact + *pending_exact + 1 != buf_end
3052 /* We have only one byte following the exactn for the count. */
3053 || ((unsigned int) (*pending_exact + (q - p)) >=
3054 ((unsigned int) (1 << BYTEWIDTH) - 1))
3056 /* If followed by a repetition operator. */
3057 || *q == '*' || *q == '^'
3058 || ((syntax & RE_BK_PLUS_QM)
3059 ? *q == '\\' && (q[1] == '+' || q[1] == '?')
3060 : (*q == '+' || *q == '?'))
3061 || ((syntax & RE_INTERVALS)
3062 && ((syntax & RE_NO_BK_BRACES)
3064 : (q[0] == '\\' && q[1] == '{'))))
3066 /* Start building a new exactn. */
3068 laststart = buf_end;
3070 BUF_PUSH_2 (exactn, 0);
3071 pending_exact = buf_end - 1;
3080 Bufbyte tmp_buf[MAX_EMCHAR_LEN];
3083 bt_count = set_charptr_emchar (tmp_buf, c);
3085 for (i = 0; i < bt_count; i++)
3087 BUF_PUSH (tmp_buf[i]);
3095 } /* while p != pend */
3098 /* Through the pattern now. */
3101 STORE_JUMP (jump_past_alt, fixup_alt_jump, buf_end);
3103 if (!COMPILE_STACK_EMPTY)
3104 FREE_STACK_RETURN (REG_EPAREN);
3106 /* If we don't want backtracking, force success
3107 the first time we reach the end of the compiled pattern. */
3108 if (syntax & RE_NO_POSIX_BACKTRACKING)
3111 free (compile_stack.stack);
3113 /* We have succeeded; set the length of the buffer. */
3114 bufp->used = buf_end - bufp->buffer;
3119 DEBUG_PRINT1 ("\nCompiled pattern: \n");
3120 print_compiled_pattern (bufp);
3124 #ifndef MATCH_MAY_ALLOCATE
3125 /* Initialize the failure stack to the largest possible stack. This
3126 isn't necessary unless we're trying to avoid calling alloca in
3127 the search and match routines. */
3129 int num_regs = bufp->re_nsub + 1;
3131 /* Since DOUBLE_FAIL_STACK refuses to double only if the current size
3132 is strictly greater than re_max_failures, the largest possible stack
3133 is 2 * re_max_failures failure points. */
3134 if (fail_stack.size < (2 * re_max_failures * MAX_FAILURE_ITEMS))
3136 fail_stack.size = (2 * re_max_failures * MAX_FAILURE_ITEMS);
3139 if (! fail_stack.stack)
3141 = (fail_stack_elt_t *) xmalloc (fail_stack.size
3142 * sizeof (fail_stack_elt_t));
3145 = (fail_stack_elt_t *) xrealloc (fail_stack.stack,
3147 * sizeof (fail_stack_elt_t)));
3148 #else /* not emacs */
3149 if (! fail_stack.stack)
3151 = (fail_stack_elt_t *) malloc (fail_stack.size
3152 * sizeof (fail_stack_elt_t));
3155 = (fail_stack_elt_t *) realloc (fail_stack.stack,
3157 * sizeof (fail_stack_elt_t)));
3161 regex_grow_registers (num_regs);
3163 #endif /* not MATCH_MAY_ALLOCATE */
3166 } /* regex_compile */
3168 /* Subroutines for `regex_compile'. */
3170 /* Store OP at LOC followed by two-byte integer parameter ARG. */
3173 store_op1 (re_opcode_t op, unsigned char *loc, int arg)
3175 *loc = (unsigned char) op;
3176 STORE_NUMBER (loc + 1, arg);
3180 /* Like `store_op1', but for two two-byte parameters ARG1 and ARG2. */
3183 store_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2)
3185 *loc = (unsigned char) op;
3186 STORE_NUMBER (loc + 1, arg1);
3187 STORE_NUMBER (loc + 3, arg2);
3191 /* Copy the bytes from LOC to END to open up three bytes of space at LOC
3192 for OP followed by two-byte integer parameter ARG. */
3195 insert_op1 (re_opcode_t op, unsigned char *loc, int arg, unsigned char *end)
3197 REGISTER unsigned char *pfrom = end;
3198 REGISTER unsigned char *pto = end + 3;
3200 while (pfrom != loc)
3203 store_op1 (op, loc, arg);
3207 /* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2. */
3210 insert_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2,
3213 REGISTER unsigned char *pfrom = end;
3214 REGISTER unsigned char *pto = end + 5;
3216 while (pfrom != loc)
3219 store_op2 (op, loc, arg1, arg2);
3223 /* P points to just after a ^ in PATTERN. Return true if that ^ comes
3224 after an alternative or a begin-subexpression. We assume there is at
3225 least one character before the ^. */
3228 at_begline_loc_p (re_char *pattern, re_char *p, reg_syntax_t syntax)
3230 re_char *prev = p - 2;
3231 re_bool prev_prev_backslash = prev > pattern && prev[-1] == '\\';
3234 /* After a subexpression? */
3235 (*prev == '(' && (syntax & RE_NO_BK_PARENS || prev_prev_backslash))
3236 /* After an alternative? */
3237 || (*prev == '|' && (syntax & RE_NO_BK_VBAR || prev_prev_backslash));
3241 /* The dual of at_begline_loc_p. This one is for $. We assume there is
3242 at least one character after the $, i.e., `P < PEND'. */
3245 at_endline_loc_p (re_char *p, re_char *pend, int syntax)
3248 re_bool next_backslash = *next == '\\';
3249 re_char *next_next = p + 1 < pend ? p + 1 : 0;
3252 /* Before a subexpression? */
3253 (syntax & RE_NO_BK_PARENS ? *next == ')'
3254 : next_backslash && next_next && *next_next == ')')
3255 /* Before an alternative? */
3256 || (syntax & RE_NO_BK_VBAR ? *next == '|'
3257 : next_backslash && next_next && *next_next == '|');
3261 /* Returns true if REGNUM is in one of COMPILE_STACK's elements and
3262 false if it's not. */
3265 group_in_compile_stack (compile_stack_type compile_stack, regnum_t regnum)
3269 for (this_element = compile_stack.avail - 1;
3272 if (compile_stack.stack[this_element].regnum == regnum)
3279 /* Read the ending character of a range (in a bracket expression) from the
3280 uncompiled pattern *P_PTR (which ends at PEND). We assume the
3281 starting character is in `P[-2]'. (`P[-1]' is the character `-'.)
3282 Then we set the translation of all bits between the starting and
3283 ending characters (inclusive) in the compiled pattern B.
3285 Return an error code.
3287 We use these short variable names so we can use the same macros as
3288 `regex_compile' itself. */
3290 static reg_errcode_t
3291 compile_range (re_char **p_ptr, re_char *pend, RE_TRANSLATE_TYPE translate,
3292 reg_syntax_t syntax, unsigned char *buf_end)
3296 re_char *p = *p_ptr;
3297 int range_start, range_end;
3302 /* Even though the pattern is a signed `char *', we need to fetch
3303 with unsigned char *'s; if the high bit of the pattern character
3304 is set, the range endpoints will be negative if we fetch using a
3307 We also want to fetch the endpoints without translating them; the
3308 appropriate translation is done in the bit-setting loop below. */
3309 /* The SVR4 compiler on the 3B2 had trouble with unsigned const char *. */
3310 range_start = ((const unsigned char *) p)[-2];
3311 range_end = ((const unsigned char *) p)[0];
3313 /* Have to increment the pointer into the pattern string, so the
3314 caller isn't still at the ending character. */
3317 /* If the start is after the end, the range is empty. */
3318 if (range_start > range_end)
3319 return syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR;
3321 /* Here we see why `this_char' has to be larger than an `unsigned
3322 char' -- the range is inclusive, so if `range_end' == 0xff
3323 (assuming 8-bit characters), we would otherwise go into an infinite
3324 loop, since all characters <= 0xff. */
3325 for (this_char = range_start; this_char <= range_end; this_char++)
3327 SET_LIST_BIT (TRANSLATE (this_char));
3335 static reg_errcode_t
3336 compile_extended_range (re_char **p_ptr, re_char *pend,
3337 RE_TRANSLATE_TYPE translate,
3338 reg_syntax_t syntax, Lisp_Object rtab)
3340 Emchar this_char, range_start, range_end;
3346 p = (const Bufbyte *) *p_ptr;
3347 range_end = charptr_emchar (p);
3348 p--; /* back to '-' */
3349 DEC_CHARPTR (p); /* back to start of range */
3350 /* We also want to fetch the endpoints without translating them; the
3351 appropriate translation is done in the bit-setting loop below. */
3352 range_start = charptr_emchar (p);
3353 INC_CHARPTR (*p_ptr);
3355 /* If the start is after the end, the range is empty. */
3356 if (range_start > range_end)
3357 return syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR;
3359 /* Can't have ranges spanning different charsets, except maybe for
3360 ranges entirely within the first 256 chars. */
3362 if ((range_start >= 0x100 || range_end >= 0x100)
3364 && CHAR_CHARSET_ID (range_start) != CHAR_CHARSET_ID (range_end)
3366 && CHAR_LEADING_BYTE (range_start) != CHAR_LEADING_BYTE (range_end)
3369 return REG_ERANGESPAN;
3371 /* As advertised, translations only work over the 0 - 0x7F range.
3372 Making this kind of stuff work generally is much harder.
3373 Iterating over the whole range like this would be way efficient
3374 if the range encompasses 10,000 chars or something. You'd have
3375 to do something like this:
3379 map over translation table in [range_start, range_end] of
3380 (put the mapped range in a;
3381 put the translation in b)
3382 invert the range in a and truncate to [range_start, range_end]
3383 compute the union of a, b
3384 union the result into rtab
3386 for (this_char = range_start;
3387 this_char <= range_end && this_char < 0x80; this_char++)
3389 SET_RANGETAB_BIT (TRANSLATE (this_char));
3392 if (this_char <= range_end)
3393 put_range_table (rtab, this_char, range_end, Qt);
3400 /* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in
3401 BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible
3402 characters can start a string that matches the pattern. This fastmap
3403 is used by re_search to skip quickly over impossible starting points.
3405 The caller must supply the address of a (1 << BYTEWIDTH)-byte data
3406 area as BUFP->fastmap.
3408 We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in
3411 Returns 0 if we succeed, -2 if an internal error. */
3414 re_compile_fastmap (struct re_pattern_buffer *bufp)
3417 #ifdef MATCH_MAY_ALLOCATE
3418 fail_stack_type fail_stack;
3420 DECLARE_DESTINATION;
3421 /* We don't push any register information onto the failure stack. */
3423 REGISTER char *fastmap = bufp->fastmap;
3424 unsigned char *pattern = bufp->buffer;
3425 unsigned long size = bufp->used;
3426 unsigned char *p = pattern;
3427 REGISTER unsigned char *pend = pattern + size;
3430 /* This holds the pointer to the failure stack, when
3431 it is allocated relocatably. */
3432 fail_stack_elt_t *failure_stack_ptr;
3435 /* Assume that each path through the pattern can be null until
3436 proven otherwise. We set this false at the bottom of switch
3437 statement, to which we get only if a particular path doesn't
3438 match the empty string. */
3439 re_bool path_can_be_null = true;
3441 /* We aren't doing a `succeed_n' to begin with. */
3442 re_bool succeed_n_p = false;
3444 assert (fastmap != NULL && p != NULL);
3447 memset (fastmap, 0, 1 << BYTEWIDTH); /* Assume nothing's valid. */
3448 bufp->fastmap_accurate = 1; /* It will be when we're done. */
3449 bufp->can_be_null = 0;
3453 if (p == pend || *p == succeed)
3455 /* We have reached the (effective) end of pattern. */
3456 if (!FAIL_STACK_EMPTY ())
3458 bufp->can_be_null |= path_can_be_null;
3460 /* Reset for next path. */
3461 path_can_be_null = true;
3463 p = (unsigned char *) fail_stack.stack[--fail_stack.avail].pointer;
3471 /* We should never be about to go beyond the end of the pattern. */
3474 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
3477 /* I guess the idea here is to simply not bother with a fastmap
3478 if a backreference is used, since it's too hard to figure out
3479 the fastmap for the corresponding group. Setting
3480 `can_be_null' stops `re_search_2' from using the fastmap, so
3481 that is all we do. */
3483 bufp->can_be_null = 1;
3487 /* Following are the cases which match a character. These end
3496 /* XEmacs: Under Mule, these bit vectors will
3497 only contain values for characters below 0x80. */
3498 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
3499 if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))
3505 /* Chars beyond end of map must be allowed. */
3507 for (j = *p * BYTEWIDTH; j < 0x80; j++)
3509 /* And all extended characters must be allowed, too. */
3510 for (j = 0x80; j < 0xA0; j++)
3512 #else /* not MULE */
3513 for (j = *p * BYTEWIDTH; j < (1 << BYTEWIDTH); j++)
3517 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
3518 if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))))
3528 nentries = unified_range_table_nentries (p);
3529 for (i = 0; i < nentries; i++)
3531 EMACS_INT first, last;
3532 Lisp_Object dummy_val;
3534 Bufbyte strr[MAX_EMCHAR_LEN];
3536 unified_range_table_get_range (p, i, &first, &last,
3538 for (jj = first; jj <= last && jj < 0x80; jj++)
3540 /* Ranges below 0x100 can span charsets, but there
3541 are only two (Control-1 and Latin-1), and
3542 either first or last has to be in them. */
3543 set_charptr_emchar (strr, first);
3547 set_charptr_emchar (strr, last);
3554 case charset_mule_not:
3559 nentries = unified_range_table_nentries (p);
3560 for (i = 0; i < nentries; i++)
3562 EMACS_INT first, last;
3563 Lisp_Object dummy_val;
3565 int smallest_prev = 0;
3567 unified_range_table_get_range (p, i, &first, &last,
3569 for (jj = smallest_prev; jj < first && jj < 0x80; jj++)
3571 smallest_prev = last + 1;
3572 if (smallest_prev >= 0x80)
3575 /* Calculating which leading bytes are actually allowed
3576 here is rather difficult, so we just punt and allow
3578 for (i = 0x80; i < 0xA0; i++)
3590 for (j = 0; j < (1 << BYTEWIDTH); j++)
3593 (regex_emacs_buffer->mirror_syntax_table), j) == Sword)
3602 goto matchnotsyntax;
3604 for (j = 0; j < (1 << BYTEWIDTH); j++)
3607 (regex_emacs_buffer->mirror_syntax_table), j) != Sword)
3615 int fastmap_newline = fastmap['\n'];
3617 /* `.' matches anything ... */
3619 /* "anything" only includes bytes that can be the
3620 first byte of a character. */
3621 for (j = 0; j < 0xA0; j++)
3624 for (j = 0; j < (1 << BYTEWIDTH); j++)
3628 /* ... except perhaps newline. */
3629 if (!(bufp->syntax & RE_DOT_NEWLINE))
3630 fastmap['\n'] = fastmap_newline;
3632 /* Return if we have already set `can_be_null'; if we have,
3633 then the fastmap is irrelevant. Something's wrong here. */
3634 else if (bufp->can_be_null)
3637 /* Otherwise, have to check alternative paths. */
3648 /* This match depends on text properties. These end with
3649 aborting optimizations. */
3650 bufp->can_be_null = 1;
3654 #if 0 /* Removed during syntax-table properties patch -- 2000/12/07 mct */
3661 for (j = 0; j < 0x80; j++)
3664 (regex_emacs_buffer->syntax_table), j) ==
3665 (enum syntaxcode) k)
3668 for (j = 0; j < 0x80; j++)
3671 (regex_emacs_buffer->mirror_syntax_table), j) ==
3672 (enum syntaxcode) k)
3675 for (j = 0x80; j < 0xA0; j++)
3678 if (LEADING_BYTE_PREFIX_P(j))
3679 /* too complicated to calculate this right */
3687 cset = CHARSET_BY_LEADING_BYTE (j);
3688 if (CHARSETP (cset))
3690 if (charset_syntax (regex_emacs_buffer, cset,
3692 == Sword || multi_p)
3699 #else /* not MULE */
3700 for (j = 0; j < (1 << BYTEWIDTH); j++)
3703 (regex_emacs_buffer->mirror_syntax_table), j) ==
3704 (enum syntaxcode) k)
3710 #if 0 /* Removed during syntax-table properties patch -- 2000/12/07 mct */
3717 for (j = 0; j < 0x80; j++)
3720 (regex_emacs_buffer->syntax_table), j) !=
3721 (enum syntaxcode) k)
3724 for (j = 0; j < 0x80; j++)
3727 (regex_emacs_buffer->mirror_syntax_table), j) !=
3728 (enum syntaxcode) k)
3731 for (j = 0x80; j < 0xA0; j++)
3734 if (LEADING_BYTE_PREFIX_P(j))
3735 /* too complicated to calculate this right */
3743 cset = CHARSET_BY_LEADING_BYTE (j);
3744 if (CHARSETP (cset))
3746 if (charset_syntax (regex_emacs_buffer, cset,
3748 != Sword || multi_p)
3755 #else /* not MULE */
3756 for (j = 0; j < (1 << BYTEWIDTH); j++)
3759 (regex_emacs_buffer->mirror_syntax_table), j) !=
3760 (enum syntaxcode) k)
3767 /* 97/2/17 jhod category patch */
3769 case notcategoryspec:
3770 bufp->can_be_null = 1;
3772 /* end if category patch */
3775 /* All cases after this match the empty string. These end with
3783 #endif /* not emacs */
3797 case push_dummy_failure:
3802 case pop_failure_jump:
3803 case maybe_pop_jump:
3806 case dummy_failure_jump:
3807 EXTRACT_NUMBER_AND_INCR (j, p);
3812 /* Jump backward implies we just went through the body of a
3813 loop and matched nothing. Opcode jumped to should be
3814 `on_failure_jump' or `succeed_n'. Just treat it like an
3815 ordinary jump. For a * loop, it has pushed its failure
3816 point already; if so, discard that as redundant. */
3817 if ((re_opcode_t) *p != on_failure_jump
3818 && (re_opcode_t) *p != succeed_n)
3822 EXTRACT_NUMBER_AND_INCR (j, p);
3825 /* If what's on the stack is where we are now, pop it. */
3826 if (!FAIL_STACK_EMPTY ()
3827 && fail_stack.stack[fail_stack.avail - 1].pointer == p)
3833 case on_failure_jump:
3834 case on_failure_keep_string_jump:
3835 handle_on_failure_jump:
3836 EXTRACT_NUMBER_AND_INCR (j, p);
3838 /* For some patterns, e.g., `(a?)?', `p+j' here points to the
3839 end of the pattern. We don't want to push such a point,
3840 since when we restore it above, entering the switch will
3841 increment `p' past the end of the pattern. We don't need
3842 to push such a point since we obviously won't find any more
3843 fastmap entries beyond `pend'. Such a pattern can match
3844 the null string, though. */
3847 if (!PUSH_PATTERN_OP (p + j, fail_stack))
3849 RESET_FAIL_STACK ();
3854 bufp->can_be_null = 1;
3858 EXTRACT_NUMBER_AND_INCR (k, p); /* Skip the n. */
3859 succeed_n_p = false;
3866 /* Get to the number of times to succeed. */
3869 /* Increment p past the n for when k != 0. */
3870 EXTRACT_NUMBER_AND_INCR (k, p);
3874 succeed_n_p = true; /* Spaghetti code alert. */
3875 goto handle_on_failure_jump;
3892 abort (); /* We have listed all the cases. */
3895 /* Getting here means we have found the possible starting
3896 characters for one path of the pattern -- and that the empty
3897 string does not match. We need not follow this path further.
3898 Instead, look at the next alternative (remembered on the
3899 stack), or quit if no more. The test at the top of the loop
3900 does these things. */
3901 path_can_be_null = false;
3905 /* Set `can_be_null' for the last path (also the first path, if the
3906 pattern is empty). */
3907 bufp->can_be_null |= path_can_be_null;
3910 RESET_FAIL_STACK ();
3912 } /* re_compile_fastmap */
3914 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
3915 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
3916 this memory for recording register information. STARTS and ENDS
3917 must be allocated using the malloc library routine, and must each
3918 be at least NUM_REGS * sizeof (regoff_t) bytes long.
3920 If NUM_REGS == 0, then subsequent matches should allocate their own
3923 Unless this function is called, the first search or match using
3924 PATTERN_BUFFER will allocate its own register data, without
3925 freeing the old data. */
3928 re_set_registers (struct re_pattern_buffer *bufp, struct re_registers *regs,
3929 unsigned num_regs, regoff_t *starts, regoff_t *ends)
3933 bufp->regs_allocated = REGS_REALLOCATE;
3934 regs->num_regs = num_regs;
3935 regs->start = starts;
3940 bufp->regs_allocated = REGS_UNALLOCATED;
3942 regs->start = regs->end = (regoff_t *) 0;
3946 /* Searching routines. */
3948 /* Like re_search_2, below, but only one string is specified, and
3949 doesn't let you say where to stop matching. */
3952 re_search (struct re_pattern_buffer *bufp, const char *string, int size,
3953 int startpos, int range, struct re_registers *regs)
3955 return re_search_2 (bufp, NULL, 0, string, size, startpos, range,
3960 /* Snarfed from src/lisp.h, needed for compiling [ce]tags. */
3961 # define bytecount_to_charcount(ptr, len) (len)
3962 # define charcount_to_bytecount(ptr, len) (len)
3963 typedef int Charcount;
3966 /* Using the compiled pattern in BUFP->buffer, first tries to match the
3967 virtual concatenation of STRING1 and STRING2, starting first at index
3968 STARTPOS, then at STARTPOS + 1, and so on.
3970 With MULE, STARTPOS is a byte position, not a char position. And the
3971 search will increment STARTPOS by the width of the current leading
3974 STRING1 and STRING2 have length SIZE1 and SIZE2, respectively.
3976 RANGE is how far to scan while trying to match. RANGE = 0 means try
3977 only at STARTPOS; in general, the last start tried is STARTPOS +
3980 With MULE, RANGE is a byte position, not a char position. The last
3981 start tried is the character starting <= STARTPOS + RANGE.
3983 In REGS, return the indices of the virtual concatenation of STRING1
3984 and STRING2 that matched the entire BUFP->buffer and its contained
3987 Do not consider matching one past the index STOP in the virtual
3988 concatenation of STRING1 and STRING2.
3990 We return either the position in the strings at which the match was
3991 found, -1 if no match, or -2 if error (such as failure
3995 re_search_2 (struct re_pattern_buffer *bufp, const char *str1,
3996 int size1, const char *str2, int size2, int startpos,
3997 int range, struct re_registers *regs, int stop)
4000 re_char *string1 = (re_char *) str1;
4001 re_char *string2 = (re_char *) str2;
4002 REGISTER char *fastmap = bufp->fastmap;
4003 REGISTER RE_TRANSLATE_TYPE translate = bufp->translate;
4004 int total_size = size1 + size2;
4005 int endpos = startpos + range;
4006 #ifdef REGEX_BEGLINE_CHECK
4007 int anchored_at_begline = 0;
4012 /* Check for out-of-range STARTPOS. */
4013 if (startpos < 0 || startpos > total_size)
4016 /* Fix up RANGE if it might eventually take us outside
4017 the virtual concatenation of STRING1 and STRING2. */
4019 range = 0 - startpos;
4020 else if (endpos > total_size)
4021 range = total_size - startpos;
4023 /* If the search isn't to be a backwards one, don't waste time in a
4024 search for a pattern that must be anchored. */
4025 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == begbuf && range > 0)
4031 d = ((const unsigned char *)
4032 (startpos >= size1 ? string2 - size1 : string1) + startpos);
4033 range = charcount_to_bytecount (d, 1);
4038 /* In a forward search for something that starts with \=.
4039 don't keep searching past point. */
4040 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == at_dot && range > 0)
4042 range = BUF_PT (regex_emacs_buffer) - BUF_BEGV (regex_emacs_buffer)
4049 /* Update the fastmap now if not correct already. */
4050 if (fastmap && !bufp->fastmap_accurate)
4051 if (re_compile_fastmap (bufp) == -2)
4054 #ifdef REGEX_BEGLINE_CHECK
4058 while (i < bufp->used)
4060 if (bufp->buffer[i] == start_memory ||
4061 bufp->buffer[i] == stop_memory)
4066 anchored_at_begline = i < bufp->used && bufp->buffer[i] == begline;
4071 SETUP_SYNTAX_CACHE_FOR_OBJECT (regex_match_object,
4073 SYNTAX_CACHE_OBJECT_BYTE_TO_CHAR (regex_match_object,
4079 /* Loop through the string, looking for a place to start matching. */
4082 #ifdef REGEX_BEGLINE_CHECK
4083 /* If the regex is anchored at the beginning of a line (i.e. with a ^),
4084 then we can speed things up by skipping to the next beginning-of-
4086 if (anchored_at_begline && startpos > 0 && startpos != size1 &&
4089 /* whose stupid idea was it anyway to make this
4090 function take two strings to match?? */
4094 if (startpos < size1 && startpos + range >= size1)
4095 lim = range - (size1 - startpos);
4097 d = ((const unsigned char *)
4098 (startpos >= size1 ? string2 - size1 : string1) + startpos);
4099 DEC_CHARPTR(d); /* Ok, since startpos != size1. */
4100 d_size = charcount_to_bytecount (d, 1);
4102 if (TRANSLATE_P (translate))
4103 while (range > lim && *d != '\n')
4105 d += d_size; /* Speedier INC_CHARPTR(d) */
4106 d_size = charcount_to_bytecount (d, 1);
4110 while (range > lim && *d != '\n')
4112 d += d_size; /* Speedier INC_CHARPTR(d) */
4113 d_size = charcount_to_bytecount (d, 1);
4117 startpos += irange - range;
4119 #endif /* REGEX_BEGLINE_CHECK */
4121 /* If a fastmap is supplied, skip quickly over characters that
4122 cannot be the start of a match. If the pattern can match the
4123 null string, however, we don't need to skip characters; we want
4124 the first null string. */
4125 if (fastmap && startpos < total_size && !bufp->can_be_null)
4127 if (range > 0) /* Searching forwards. */
4132 if (startpos < size1 && startpos + range >= size1)
4133 lim = range - (size1 - startpos);
4135 d = ((const unsigned char *)
4136 (startpos >= size1 ? string2 - size1 : string1) + startpos);
4138 /* Written out as an if-else to avoid testing `translate'
4140 if (TRANSLATE_P (translate))
4146 buf_ch = charptr_emchar (d);
4147 buf_ch = RE_TRANSLATE (buf_ch);
4148 if (buf_ch >= 0200 || fastmap[(unsigned char) buf_ch])
4151 if (fastmap[(unsigned char)RE_TRANSLATE (*d)])
4154 d_size = charcount_to_bytecount (d, 1);
4156 d += d_size; /* Speedier INC_CHARPTR(d) */
4159 while (range > lim && !fastmap[*d])
4161 d_size = charcount_to_bytecount (d, 1);
4163 d += d_size; /* Speedier INC_CHARPTR(d) */
4166 startpos += irange - range;
4168 else /* Searching backwards. */
4170 Emchar c = (size1 == 0 || startpos >= size1
4171 ? charptr_emchar (string2 + startpos - size1)
4172 : charptr_emchar (string1 + startpos));
4175 if (!(c >= 0200 || fastmap[(unsigned char) c]))
4178 if (!fastmap[(unsigned char) c])
4184 /* If can't match the null string, and that's all we have left, fail. */
4185 if (range >= 0 && startpos == total_size && fastmap
4186 && !bufp->can_be_null)
4189 #ifdef emacs /* XEmacs added, w/removal of immediate_quit */
4190 if (!no_quit_in_re_search)
4193 val = re_match_2_internal (bufp, string1, size1, string2, size2,
4194 startpos, regs, stop);
4195 #ifndef REGEX_MALLOC
4212 d = ((const unsigned char *)
4213 (startpos >= size1 ? string2 - size1 : string1) + startpos);
4214 d_size = charcount_to_bytecount (d, 1);
4220 /* Note startpos > size1 not >=. If we are on the
4221 string1/string2 boundary, we want to backup into string1. */
4222 d = ((const unsigned char *)
4223 (startpos > size1 ? string2 - size1 : string1) + startpos);
4225 d_size = charcount_to_bytecount (d, 1);
4233 /* Declarations and macros for re_match_2. */
4235 /* This converts PTR, a pointer into one of the search strings `string1'
4236 and `string2' into an offset from the beginning of that string. */
4237 #define POINTER_TO_OFFSET(ptr) \
4238 (FIRST_STRING_P (ptr) \
4239 ? ((regoff_t) ((ptr) - string1)) \
4240 : ((regoff_t) ((ptr) - string2 + size1)))
4242 /* Macros for dealing with the split strings in re_match_2. */
4244 #define MATCHING_IN_FIRST_STRING (dend == end_match_1)
4246 /* Call before fetching a character with *d. This switches over to
4247 string2 if necessary. */
4248 #define REGEX_PREFETCH() \
4251 /* End of string2 => fail. */ \
4252 if (dend == end_match_2) \
4254 /* End of string1 => advance to string2. */ \
4256 dend = end_match_2; \
4260 /* Test if at very beginning or at very end of the virtual concatenation
4261 of `string1' and `string2'. If only one string, it's `string2'. */
4262 #define AT_STRINGS_BEG(d) ((d) == (size1 ? string1 : string2) || !size2)
4263 #define AT_STRINGS_END(d) ((d) == end2)
4266 If the given position straddles the string gap, return the equivalent
4267 position that is before or after the gap, respectively; otherwise,
4268 return the same position. */
4269 #define POS_BEFORE_GAP_UNSAFE(d) ((d) == string2 ? end1 : (d))
4270 #define POS_AFTER_GAP_UNSAFE(d) ((d) == end1 ? string2 : (d))
4272 /* Test if CH is a word-constituent character. (XEmacs change) */
4274 #define WORDCHAR_P_UNSAFE(ch) \
4275 (SYNTAX_UNSAFE (XCHAR_TABLE (regex_emacs_buffer->syntax_table), \
4278 #define WORDCHAR_P_UNSAFE(ch) \
4279 (SYNTAX_UNSAFE (XCHAR_TABLE (regex_emacs_buffer->mirror_syntax_table), \
4283 /* Free everything we malloc. */
4284 #ifdef MATCH_MAY_ALLOCATE
4285 #define FREE_VAR(var) if (var) REGEX_FREE (var); var = NULL
4286 #define FREE_VARIABLES() \
4288 REGEX_FREE_STACK (fail_stack.stack); \
4289 FREE_VAR (regstart); \
4290 FREE_VAR (regend); \
4291 FREE_VAR (old_regstart); \
4292 FREE_VAR (old_regend); \
4293 FREE_VAR (best_regstart); \
4294 FREE_VAR (best_regend); \
4295 FREE_VAR (reg_info); \
4296 FREE_VAR (reg_dummy); \
4297 FREE_VAR (reg_info_dummy); \
4299 #else /* not MATCH_MAY_ALLOCATE */
4300 #define FREE_VARIABLES() ((void)0) /* Do nothing! But inhibit gcc warning. */
4301 #endif /* MATCH_MAY_ALLOCATE */
4303 /* These values must meet several constraints. They must not be valid
4304 register values; since we have a limit of 255 registers (because
4305 we use only one byte in the pattern for the register number), we can
4306 use numbers larger than 255. They must differ by 1, because of
4307 NUM_FAILURE_ITEMS above. And the value for the lowest register must
4308 be larger than the value for the highest register, so we do not try
4309 to actually save any registers when none are active. */
4310 #define NO_HIGHEST_ACTIVE_REG (1 << BYTEWIDTH)
4311 #define NO_LOWEST_ACTIVE_REG (NO_HIGHEST_ACTIVE_REG + 1)
4313 /* Matching routines. */
4315 #ifndef emacs /* Emacs never uses this. */
4316 /* re_match is like re_match_2 except it takes only a single string. */
4319 re_match (struct re_pattern_buffer *bufp, const char *string, int size,
4320 int pos, struct re_registers *regs)
4322 int result = re_match_2_internal (bufp, NULL, 0, (re_char *) string, size,
4327 #endif /* not emacs */
4330 /* re_match_2 matches the compiled pattern in BUFP against the
4331 (virtual) concatenation of STRING1 and STRING2 (of length SIZE1 and
4332 SIZE2, respectively). We start matching at POS, and stop matching
4335 If REGS is non-null and the `no_sub' field of BUFP is nonzero, we
4336 store offsets for the substring each group matched in REGS. See the
4337 documentation for exactly how many groups we fill.
4339 We return -1 if no match, -2 if an internal error (such as the
4340 failure stack overflowing). Otherwise, we return the length of the
4341 matched substring. */
4344 re_match_2 (struct re_pattern_buffer *bufp, const char *string1,
4345 int size1, const char *string2, int size2, int pos,
4346 struct re_registers *regs, int stop)
4351 SETUP_SYNTAX_CACHE_FOR_OBJECT (regex_match_object,
4353 SYNTAX_CACHE_OBJECT_BYTE_TO_CHAR (regex_match_object,
4359 result = re_match_2_internal (bufp, (re_char *) string1, size1,
4360 (re_char *) string2, size2,
4367 /* This is a separate function so that we can force an alloca cleanup
4370 re_match_2_internal (struct re_pattern_buffer *bufp, re_char *string1,
4371 int size1, re_char *string2, int size2, int pos,
4372 struct re_registers *regs, int stop)
4374 /* General temporaries. */
4377 int should_succeed; /* XEmacs change */
4379 /* Just past the end of the corresponding string. */
4380 re_char *end1, *end2;
4382 /* Pointers into string1 and string2, just past the last characters in
4383 each to consider matching. */
4384 re_char *end_match_1, *end_match_2;
4386 /* Where we are in the data, and the end of the current string. */
4389 /* Where we are in the pattern, and the end of the pattern. */
4390 unsigned char *p = bufp->buffer;
4391 REGISTER unsigned char *pend = p + bufp->used;
4393 /* Mark the opcode just after a start_memory, so we can test for an
4394 empty subpattern when we get to the stop_memory. */
4395 re_char *just_past_start_mem = 0;
4397 /* We use this to map every character in the string. */
4398 RE_TRANSLATE_TYPE translate = bufp->translate;
4400 /* Failure point stack. Each place that can handle a failure further
4401 down the line pushes a failure point on this stack. It consists of
4402 restart, regend, and reg_info for all registers corresponding to
4403 the subexpressions we're currently inside, plus the number of such
4404 registers, and, finally, two char *'s. The first char * is where
4405 to resume scanning the pattern; the second one is where to resume
4406 scanning the strings. If the latter is zero, the failure point is
4407 a ``dummy''; if a failure happens and the failure point is a dummy,
4408 it gets discarded and the next one is tried. */
4409 #ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
4410 fail_stack_type fail_stack;
4413 static unsigned failure_id;
4414 unsigned nfailure_points_pushed = 0, nfailure_points_popped = 0;
4418 /* This holds the pointer to the failure stack, when
4419 it is allocated relocatably. */
4420 fail_stack_elt_t *failure_stack_ptr;
4423 /* We fill all the registers internally, independent of what we
4424 return, for use in backreferences. The number here includes
4425 an element for register zero. */
4426 unsigned num_regs = bufp->re_nsub + 1;
4428 /* The currently active registers. */
4429 unsigned lowest_active_reg = NO_LOWEST_ACTIVE_REG;
4430 unsigned highest_active_reg = NO_HIGHEST_ACTIVE_REG;
4432 /* Information on the contents of registers. These are pointers into
4433 the input strings; they record just what was matched (on this
4434 attempt) by a subexpression part of the pattern, that is, the
4435 regnum-th regstart pointer points to where in the pattern we began
4436 matching and the regnum-th regend points to right after where we
4437 stopped matching the regnum-th subexpression. (The zeroth register
4438 keeps track of what the whole pattern matches.) */
4439 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
4440 re_char **regstart, **regend;
4443 /* If a group that's operated upon by a repetition operator fails to
4444 match anything, then the register for its start will need to be
4445 restored because it will have been set to wherever in the string we
4446 are when we last see its open-group operator. Similarly for a
4448 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
4449 re_char **old_regstart, **old_regend;
4452 /* The is_active field of reg_info helps us keep track of which (possibly
4453 nested) subexpressions we are currently in. The matched_something
4454 field of reg_info[reg_num] helps us tell whether or not we have
4455 matched any of the pattern so far this time through the reg_num-th
4456 subexpression. These two fields get reset each time through any
4457 loop their register is in. */
4458 #ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
4459 register_info_type *reg_info;
4462 /* The following record the register info as found in the above
4463 variables when we find a match better than any we've seen before.
4464 This happens as we backtrack through the failure points, which in
4465 turn happens only if we have not yet matched the entire string. */
4466 unsigned best_regs_set = false;
4467 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
4468 re_char **best_regstart, **best_regend;
4471 /* Logically, this is `best_regend[0]'. But we don't want to have to
4472 allocate space for that if we're not allocating space for anything
4473 else (see below). Also, we never need info about register 0 for
4474 any of the other register vectors, and it seems rather a kludge to
4475 treat `best_regend' differently than the rest. So we keep track of
4476 the end of the best match so far in a separate variable. We
4477 initialize this to NULL so that when we backtrack the first time
4478 and need to test it, it's not garbage. */
4479 re_char *match_end = NULL;
4481 /* This helps SET_REGS_MATCHED avoid doing redundant work. */
4482 int set_regs_matched_done = 0;
4484 /* Used when we pop values we don't care about. */
4485 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
4486 re_char **reg_dummy;
4487 register_info_type *reg_info_dummy;
4491 /* Counts the total number of registers pushed. */
4492 unsigned num_regs_pushed = 0;
4495 /* 1 if this match ends in the same string (string1 or string2)
4496 as the best previous match. */
4499 /* 1 if this match is the best seen so far. */
4500 re_bool best_match_p;
4502 DEBUG_PRINT1 ("\n\nEntering re_match_2.\n");
4506 #ifdef MATCH_MAY_ALLOCATE
4507 /* Do not bother to initialize all the register variables if there are
4508 no groups in the pattern, as it takes a fair amount of time. If
4509 there are groups, we include space for register 0 (the whole
4510 pattern), even though we never use it, since it simplifies the
4511 array indexing. We should fix this. */
4514 regstart = REGEX_TALLOC (num_regs, re_char *);
4515 regend = REGEX_TALLOC (num_regs, re_char *);
4516 old_regstart = REGEX_TALLOC (num_regs, re_char *);
4517 old_regend = REGEX_TALLOC (num_regs, re_char *);
4518 best_regstart = REGEX_TALLOC (num_regs, re_char *);
4519 best_regend = REGEX_TALLOC (num_regs, re_char *);
4520 reg_info = REGEX_TALLOC (num_regs, register_info_type);
4521 reg_dummy = REGEX_TALLOC (num_regs, re_char *);
4522 reg_info_dummy = REGEX_TALLOC (num_regs, register_info_type);
4524 if (!(regstart && regend && old_regstart && old_regend && reg_info
4525 && best_regstart && best_regend && reg_dummy && reg_info_dummy))
4533 /* We must initialize all our variables to NULL, so that
4534 `FREE_VARIABLES' doesn't try to free them. */
4535 regstart = regend = old_regstart = old_regend = best_regstart
4536 = best_regend = reg_dummy = NULL;
4537 reg_info = reg_info_dummy = (register_info_type *) NULL;
4539 #endif /* MATCH_MAY_ALLOCATE */
4541 /* The starting position is bogus. */
4542 if (pos < 0 || pos > size1 + size2)
4548 /* Initialize subexpression text positions to -1 to mark ones that no
4549 start_memory/stop_memory has been seen for. Also initialize the
4550 register information struct. */
4551 for (mcnt = 1; mcnt < num_regs; mcnt++)
4553 regstart[mcnt] = regend[mcnt]
4554 = old_regstart[mcnt] = old_regend[mcnt] = REG_UNSET_VALUE;
4556 REG_MATCH_NULL_STRING_P (reg_info[mcnt]) = MATCH_NULL_UNSET_VALUE;
4557 IS_ACTIVE (reg_info[mcnt]) = 0;
4558 MATCHED_SOMETHING (reg_info[mcnt]) = 0;
4559 EVER_MATCHED_SOMETHING (reg_info[mcnt]) = 0;
4561 /* We move `string1' into `string2' if the latter's empty -- but not if
4562 `string1' is null. */
4563 if (size2 == 0 && string1 != NULL)
4570 end1 = string1 + size1;
4571 end2 = string2 + size2;
4573 /* Compute where to stop matching, within the two strings. */
4576 end_match_1 = string1 + stop;
4577 end_match_2 = string2;
4582 end_match_2 = string2 + stop - size1;
4585 /* `p' scans through the pattern as `d' scans through the data.
4586 `dend' is the end of the input string that `d' points within. `d'
4587 is advanced into the following input string whenever necessary, but
4588 this happens before fetching; therefore, at the beginning of the
4589 loop, `d' can be pointing at the end of a string, but it cannot
4591 if (size1 > 0 && pos <= size1)
4598 d = string2 + pos - size1;
4602 DEBUG_PRINT1 ("The compiled pattern is: \n");
4603 DEBUG_PRINT_COMPILED_PATTERN (bufp, p, pend);
4604 DEBUG_PRINT1 ("The string to match is: `");
4605 DEBUG_PRINT_DOUBLE_STRING (d, string1, size1, string2, size2);
4606 DEBUG_PRINT1 ("'\n");
4608 /* This loops over pattern commands. It exits by returning from the
4609 function if the match is complete, or it drops through if the match
4610 fails at this starting point in the input data. */
4613 DEBUG_PRINT2 ("\n0x%lx: ", (long) p);
4614 #ifdef emacs /* XEmacs added, w/removal of immediate_quit */
4615 if (!no_quit_in_re_search)
4620 { /* End of pattern means we might have succeeded. */
4621 DEBUG_PRINT1 ("end of pattern ... ");
4623 /* If we haven't matched the entire string, and we want the
4624 longest match, try backtracking. */
4625 if (d != end_match_2)
4627 same_str_p = (FIRST_STRING_P (match_end)
4628 == MATCHING_IN_FIRST_STRING);
4630 /* AIX compiler got confused when this was combined
4631 with the previous declaration. */
4633 best_match_p = d > match_end;
4635 best_match_p = !MATCHING_IN_FIRST_STRING;
4637 DEBUG_PRINT1 ("backtracking.\n");
4639 if (!FAIL_STACK_EMPTY ())
4640 { /* More failure points to try. */
4642 /* If exceeds best match so far, save it. */
4643 if (!best_regs_set || best_match_p)
4645 best_regs_set = true;
4648 DEBUG_PRINT1 ("\nSAVING match as best so far.\n");
4650 for (mcnt = 1; mcnt < num_regs; mcnt++)
4652 best_regstart[mcnt] = regstart[mcnt];
4653 best_regend[mcnt] = regend[mcnt];
4659 /* If no failure points, don't restore garbage. And if
4660 last match is real best match, don't restore second
4662 else if (best_regs_set && !best_match_p)
4665 /* Restore best match. It may happen that `dend ==
4666 end_match_1' while the restored d is in string2.
4667 For example, the pattern `x.*y.*z' against the
4668 strings `x-' and `y-z-', if the two strings are
4669 not consecutive in memory. */
4670 DEBUG_PRINT1 ("Restoring best registers.\n");
4673 dend = ((d >= string1 && d <= end1)
4674 ? end_match_1 : end_match_2);
4676 for (mcnt = 1; mcnt < num_regs; mcnt++)
4678 regstart[mcnt] = best_regstart[mcnt];
4679 regend[mcnt] = best_regend[mcnt];
4682 } /* d != end_match_2 */
4685 DEBUG_PRINT1 ("Accepting match.\n");
4687 /* If caller wants register contents data back, do it. */
4688 if (regs && !bufp->no_sub)
4690 /* Have the register data arrays been allocated? */
4691 if (bufp->regs_allocated == REGS_UNALLOCATED)
4692 { /* No. So allocate them with malloc. We need one
4693 extra element beyond `num_regs' for the `-1' marker
4695 regs->num_regs = MAX (RE_NREGS, num_regs + 1);
4696 regs->start = TALLOC (regs->num_regs, regoff_t);
4697 regs->end = TALLOC (regs->num_regs, regoff_t);
4698 if (regs->start == NULL || regs->end == NULL)
4703 bufp->regs_allocated = REGS_REALLOCATE;
4705 else if (bufp->regs_allocated == REGS_REALLOCATE)
4706 { /* Yes. If we need more elements than were already
4707 allocated, reallocate them. If we need fewer, just
4709 if (regs->num_regs < num_regs + 1)
4711 regs->num_regs = num_regs + 1;
4712 RETALLOC (regs->start, regs->num_regs, regoff_t);
4713 RETALLOC (regs->end, regs->num_regs, regoff_t);
4714 if (regs->start == NULL || regs->end == NULL)
4723 /* These braces fend off a "empty body in an else-statement"
4724 warning under GCC when assert expands to nothing. */
4725 assert (bufp->regs_allocated == REGS_FIXED);
4728 /* Convert the pointer data in `regstart' and `regend' to
4729 indices. Register zero has to be set differently,
4730 since we haven't kept track of any info for it. */
4731 if (regs->num_regs > 0)
4733 regs->start[0] = pos;
4734 regs->end[0] = (MATCHING_IN_FIRST_STRING
4735 ? ((regoff_t) (d - string1))
4736 : ((regoff_t) (d - string2 + size1)));
4739 /* Go through the first `min (num_regs, regs->num_regs)'
4740 registers, since that is all we initialized. */
4741 for (mcnt = 1; mcnt < MIN (num_regs, regs->num_regs); mcnt++)
4743 if (REG_UNSET (regstart[mcnt]) || REG_UNSET (regend[mcnt]))
4744 regs->start[mcnt] = regs->end[mcnt] = -1;
4748 = (regoff_t) POINTER_TO_OFFSET (regstart[mcnt]);
4750 = (regoff_t) POINTER_TO_OFFSET (regend[mcnt]);
4754 /* If the regs structure we return has more elements than
4755 were in the pattern, set the extra elements to -1. If
4756 we (re)allocated the registers, this is the case,
4757 because we always allocate enough to have at least one
4759 for (mcnt = num_regs; mcnt < regs->num_regs; mcnt++)
4760 regs->start[mcnt] = regs->end[mcnt] = -1;
4761 } /* regs && !bufp->no_sub */
4763 DEBUG_PRINT4 ("%u failure points pushed, %u popped (%u remain).\n",
4764 nfailure_points_pushed, nfailure_points_popped,
4765 nfailure_points_pushed - nfailure_points_popped);
4766 DEBUG_PRINT2 ("%u registers pushed.\n", num_regs_pushed);
4768 mcnt = d - pos - (MATCHING_IN_FIRST_STRING
4772 DEBUG_PRINT2 ("Returning %d from re_match_2.\n", mcnt);
4778 /* Otherwise match next pattern command. */
4779 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
4781 /* Ignore these. Used to ignore the n of succeed_n's which
4782 currently have n == 0. */
4784 DEBUG_PRINT1 ("EXECUTING no_op.\n");
4788 DEBUG_PRINT1 ("EXECUTING succeed.\n");
4791 /* Match the next n pattern characters exactly. The following
4792 byte in the pattern defines n, and the n bytes after that
4793 are the characters to match. */
4796 DEBUG_PRINT2 ("EXECUTING exactn %d.\n", mcnt);
4798 /* This is written out as an if-else so we don't waste time
4799 testing `translate' inside the loop. */
4800 if (TRANSLATE_P (translate))
4805 Emchar pat_ch, buf_ch;
4809 pat_ch = charptr_emchar (p);
4810 buf_ch = charptr_emchar (d);
4811 if (RE_TRANSLATE (buf_ch) != pat_ch)
4814 pat_len = charcount_to_bytecount (p, 1);
4819 #else /* not MULE */
4821 if ((unsigned char) RE_TRANSLATE (*d++) != *p++)
4833 if (*d++ != *p++) goto fail;
4837 SET_REGS_MATCHED ();
4841 /* Match any character except possibly a newline or a null. */
4843 DEBUG_PRINT1 ("EXECUTING anychar.\n");
4847 if ((!(bufp->syntax & RE_DOT_NEWLINE) && TRANSLATE (*d) == '\n')
4848 || (bufp->syntax & RE_DOT_NOT_NULL && TRANSLATE (*d) == '\000'))
4851 SET_REGS_MATCHED ();
4852 DEBUG_PRINT2 (" Matched `%d'.\n", *d);
4853 INC_CHARPTR (d); /* XEmacs change */
4860 REGISTER unsigned char c;
4861 re_bool not_p = (re_opcode_t) *(p - 1) == charset_not;
4863 DEBUG_PRINT2 ("EXECUTING charset%s.\n", not_p ? "_not" : "");
4866 c = TRANSLATE (*d); /* The character to match. */
4868 /* Cast to `unsigned' instead of `unsigned char' in case the
4869 bit list is a full 32 bytes long. */
4870 if (c < (unsigned) (*p * BYTEWIDTH)
4871 && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
4876 if (!not_p) goto fail;
4878 SET_REGS_MATCHED ();
4879 INC_CHARPTR (d); /* XEmacs change */
4885 case charset_mule_not:
4888 re_bool not_p = (re_opcode_t) *(p - 1) == charset_mule_not;
4890 DEBUG_PRINT2 ("EXECUTING charset_mule%s.\n", not_p ? "_not" : "");
4893 c = charptr_emchar ((const Bufbyte *) d);
4894 c = TRANSLATE_EXTENDED_UNSAFE (c); /* The character to match. */
4896 if (EQ (Qt, unified_range_table_lookup (p, c, Qnil)))
4899 p += unified_range_table_bytes_used (p);
4901 if (!not_p) goto fail;
4903 SET_REGS_MATCHED ();
4910 /* The beginning of a group is represented by start_memory.
4911 The arguments are the register number in the next byte, and the
4912 number of groups inner to this one in the next. The text
4913 matched within the group is recorded (in the internal
4914 registers data structure) under the register number. */
4916 DEBUG_PRINT3 ("EXECUTING start_memory %d (%d):\n", *p, p[1]);
4918 /* Find out if this group can match the empty string. */
4919 p1 = p; /* To send to group_match_null_string_p. */
4921 if (REG_MATCH_NULL_STRING_P (reg_info[*p]) == MATCH_NULL_UNSET_VALUE)
4922 REG_MATCH_NULL_STRING_P (reg_info[*p])
4923 = group_match_null_string_p (&p1, pend, reg_info);
4925 /* Save the position in the string where we were the last time
4926 we were at this open-group operator in case the group is
4927 operated upon by a repetition operator, e.g., with `(a*)*b'
4928 against `ab'; then we want to ignore where we are now in
4929 the string in case this attempt to match fails. */
4930 old_regstart[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p])
4931 ? REG_UNSET (regstart[*p]) ? d : regstart[*p]
4933 DEBUG_PRINT2 (" old_regstart: %d\n",
4934 POINTER_TO_OFFSET (old_regstart[*p]));
4937 DEBUG_PRINT2 (" regstart: %d\n", POINTER_TO_OFFSET (regstart[*p]));
4939 IS_ACTIVE (reg_info[*p]) = 1;
4940 MATCHED_SOMETHING (reg_info[*p]) = 0;
4942 /* Clear this whenever we change the register activity status. */
4943 set_regs_matched_done = 0;
4945 /* This is the new highest active register. */
4946 highest_active_reg = *p;
4948 /* If nothing was active before, this is the new lowest active
4950 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG)
4951 lowest_active_reg = *p;
4953 /* Move past the register number and inner group count. */
4955 just_past_start_mem = p;
4960 /* The stop_memory opcode represents the end of a group. Its
4961 arguments are the same as start_memory's: the register
4962 number, and the number of inner groups. */
4964 DEBUG_PRINT3 ("EXECUTING stop_memory %d (%d):\n", *p, p[1]);
4966 /* We need to save the string position the last time we were at
4967 this close-group operator in case the group is operated
4968 upon by a repetition operator, e.g., with `((a*)*(b*)*)*'
4969 against `aba'; then we want to ignore where we are now in
4970 the string in case this attempt to match fails. */
4971 old_regend[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p])
4972 ? REG_UNSET (regend[*p]) ? d : regend[*p]
4974 DEBUG_PRINT2 (" old_regend: %d\n",
4975 POINTER_TO_OFFSET (old_regend[*p]));
4978 DEBUG_PRINT2 (" regend: %d\n", POINTER_TO_OFFSET (regend[*p]));
4980 /* This register isn't active anymore. */
4981 IS_ACTIVE (reg_info[*p]) = 0;
4983 /* Clear this whenever we change the register activity status. */
4984 set_regs_matched_done = 0;
4986 /* If this was the only register active, nothing is active
4988 if (lowest_active_reg == highest_active_reg)
4990 lowest_active_reg = NO_LOWEST_ACTIVE_REG;
4991 highest_active_reg = NO_HIGHEST_ACTIVE_REG;
4994 { /* We must scan for the new highest active register, since
4995 it isn't necessarily one less than now: consider
4996 (a(b)c(d(e)f)g). When group 3 ends, after the f), the
4997 new highest active register is 1. */
4998 unsigned char r = *p - 1;
4999 while (r > 0 && !IS_ACTIVE (reg_info[r]))
5002 /* If we end up at register zero, that means that we saved
5003 the registers as the result of an `on_failure_jump', not
5004 a `start_memory', and we jumped to past the innermost
5005 `stop_memory'. For example, in ((.)*) we save
5006 registers 1 and 2 as a result of the *, but when we pop
5007 back to the second ), we are at the stop_memory 1.
5008 Thus, nothing is active. */
5011 lowest_active_reg = NO_LOWEST_ACTIVE_REG;
5012 highest_active_reg = NO_HIGHEST_ACTIVE_REG;
5016 highest_active_reg = r;
5018 /* 98/9/21 jhod: We've also gotta set lowest_active_reg, don't we? */
5020 while (r < highest_active_reg && !IS_ACTIVE(reg_info[r]))
5022 lowest_active_reg = r;
5026 /* If just failed to match something this time around with a
5027 group that's operated on by a repetition operator, try to
5028 force exit from the ``loop'', and restore the register
5029 information for this group that we had before trying this
5031 if ((!MATCHED_SOMETHING (reg_info[*p])
5032 || just_past_start_mem == p - 1)
5035 re_bool is_a_jump_n = false;
5039 switch ((re_opcode_t) *p1++)
5043 case pop_failure_jump:
5044 case maybe_pop_jump:
5046 case dummy_failure_jump:
5047 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
5057 /* If the next operation is a jump backwards in the pattern
5058 to an on_failure_jump right before the start_memory
5059 corresponding to this stop_memory, exit from the loop
5060 by forcing a failure after pushing on the stack the
5061 on_failure_jump's jump in the pattern, and d. */
5062 if (mcnt < 0 && (re_opcode_t) *p1 == on_failure_jump
5063 && (re_opcode_t) p1[3] == start_memory && p1[4] == *p)
5065 /* If this group ever matched anything, then restore
5066 what its registers were before trying this last
5067 failed match, e.g., with `(a*)*b' against `ab' for
5068 regstart[1], and, e.g., with `((a*)*(b*)*)*'
5069 against `aba' for regend[3].
5071 Also restore the registers for inner groups for,
5072 e.g., `((a*)(b*))*' against `aba' (register 3 would
5073 otherwise get trashed). */
5075 if (EVER_MATCHED_SOMETHING (reg_info[*p]))
5079 EVER_MATCHED_SOMETHING (reg_info[*p]) = 0;
5081 /* Restore this and inner groups' (if any) registers. */
5082 for (r = *p; r < *p + *(p + 1); r++)
5084 regstart[r] = old_regstart[r];
5086 /* xx why this test? */
5087 if (old_regend[r] >= regstart[r])
5088 regend[r] = old_regend[r];
5092 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
5093 PUSH_FAILURE_POINT (p1 + mcnt, d, -2);
5099 /* Move past the register number and the inner group count. */
5104 /* \<digit> has been turned into a `duplicate' command which is
5105 followed by the numeric value of <digit> as the register number. */
5108 REGISTER re_char *d2, *dend2;
5109 int regno = *p++; /* Get which register to match against. */
5110 DEBUG_PRINT2 ("EXECUTING duplicate %d.\n", regno);
5112 /* Can't back reference a group which we've never matched. */
5113 if (REG_UNSET (regstart[regno]) || REG_UNSET (regend[regno]))
5116 /* Where in input to try to start matching. */
5117 d2 = regstart[regno];
5119 /* Where to stop matching; if both the place to start and
5120 the place to stop matching are in the same string, then
5121 set to the place to stop, otherwise, for now have to use
5122 the end of the first string. */
5124 dend2 = ((FIRST_STRING_P (regstart[regno])
5125 == FIRST_STRING_P (regend[regno]))
5126 ? regend[regno] : end_match_1);
5129 /* If necessary, advance to next segment in register
5133 if (dend2 == end_match_2) break;
5134 if (dend2 == regend[regno]) break;
5136 /* End of string1 => advance to string2. */
5138 dend2 = regend[regno];
5140 /* At end of register contents => success */
5141 if (d2 == dend2) break;
5143 /* If necessary, advance to next segment in data. */
5146 /* How many characters left in this segment to match. */
5149 /* Want how many consecutive characters we can match in
5150 one shot, so, if necessary, adjust the count. */
5151 if (mcnt > dend2 - d2)
5154 /* Compare that many; failure if mismatch, else move
5156 if (TRANSLATE_P (translate)
5157 ? bcmp_translate ((unsigned char *) d,
5158 (unsigned char *) d2, mcnt, translate)
5159 : memcmp (d, d2, mcnt))
5161 d += mcnt, d2 += mcnt;
5163 /* Do this because we've match some characters. */
5164 SET_REGS_MATCHED ();
5170 /* begline matches the empty string at the beginning of the string
5171 (unless `not_bol' is set in `bufp'), and, if
5172 `newline_anchor' is set, after newlines. */
5174 DEBUG_PRINT1 ("EXECUTING begline.\n");
5176 if (AT_STRINGS_BEG (d))
5178 if (!bufp->not_bol) break;
5180 else if (d[-1] == '\n' && bufp->newline_anchor)
5184 /* In all other cases, we fail. */
5188 /* endline is the dual of begline. */
5190 DEBUG_PRINT1 ("EXECUTING endline.\n");
5192 if (AT_STRINGS_END (d))
5194 if (!bufp->not_eol) break;
5197 /* We have to ``prefetch'' the next character. */
5198 else if ((d == end1 ? *string2 : *d) == '\n'
5199 && bufp->newline_anchor)
5206 /* Match at the very beginning of the data. */
5208 DEBUG_PRINT1 ("EXECUTING begbuf.\n");
5209 if (AT_STRINGS_BEG (d))
5214 /* Match at the very end of the data. */
5216 DEBUG_PRINT1 ("EXECUTING endbuf.\n");
5217 if (AT_STRINGS_END (d))
5222 /* on_failure_keep_string_jump is used to optimize `.*\n'. It
5223 pushes NULL as the value for the string on the stack. Then
5224 `pop_failure_point' will keep the current value for the
5225 string, instead of restoring it. To see why, consider
5226 matching `foo\nbar' against `.*\n'. The .* matches the foo;
5227 then the . fails against the \n. But the next thing we want
5228 to do is match the \n against the \n; if we restored the
5229 string value, we would be back at the foo.
5231 Because this is used only in specific cases, we don't need to
5232 check all the things that `on_failure_jump' does, to make
5233 sure the right things get saved on the stack. Hence we don't
5234 share its code. The only reason to push anything on the
5235 stack at all is that otherwise we would have to change
5236 `anychar's code to do something besides goto fail in this
5237 case; that seems worse than this. */
5238 case on_failure_keep_string_jump:
5239 DEBUG_PRINT1 ("EXECUTING on_failure_keep_string_jump");
5241 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5242 DEBUG_PRINT3 (" %d (to 0x%lx):\n", mcnt, (long) (p + mcnt));
5244 PUSH_FAILURE_POINT (p + mcnt, (unsigned char *) 0, -2);
5248 /* Uses of on_failure_jump:
5250 Each alternative starts with an on_failure_jump that points
5251 to the beginning of the next alternative. Each alternative
5252 except the last ends with a jump that in effect jumps past
5253 the rest of the alternatives. (They really jump to the
5254 ending jump of the following alternative, because tensioning
5255 these jumps is a hassle.)
5257 Repeats start with an on_failure_jump that points past both
5258 the repetition text and either the following jump or
5259 pop_failure_jump back to this on_failure_jump. */
5260 case on_failure_jump:
5262 DEBUG_PRINT1 ("EXECUTING on_failure_jump");
5264 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5265 DEBUG_PRINT3 (" %d (to 0x%lx)", mcnt, (long) (p + mcnt));
5267 /* If this on_failure_jump comes right before a group (i.e.,
5268 the original * applied to a group), save the information
5269 for that group and all inner ones, so that if we fail back
5270 to this point, the group's information will be correct.
5271 For example, in \(a*\)*\1, we need the preceding group,
5272 and in \(\(a*\)b*\)\2, we need the inner group. */
5274 /* We can't use `p' to check ahead because we push
5275 a failure point to `p + mcnt' after we do this. */
5278 /* We need to skip no_op's before we look for the
5279 start_memory in case this on_failure_jump is happening as
5280 the result of a completed succeed_n, as in \(a\)\{1,3\}b\1
5282 while (p1 < pend && (re_opcode_t) *p1 == no_op)
5285 if (p1 < pend && (re_opcode_t) *p1 == start_memory)
5287 /* We have a new highest active register now. This will
5288 get reset at the start_memory we are about to get to,
5289 but we will have saved all the registers relevant to
5290 this repetition op, as described above. */
5291 highest_active_reg = *(p1 + 1) + *(p1 + 2);
5292 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG)
5293 lowest_active_reg = *(p1 + 1);
5296 DEBUG_PRINT1 (":\n");
5297 PUSH_FAILURE_POINT (p + mcnt, d, -2);
5301 /* A smart repeat ends with `maybe_pop_jump'.
5302 We change it to either `pop_failure_jump' or `jump'. */
5303 case maybe_pop_jump:
5304 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5305 DEBUG_PRINT2 ("EXECUTING maybe_pop_jump %d.\n", mcnt);
5307 REGISTER unsigned char *p2 = p;
5309 /* Compare the beginning of the repeat with what in the
5310 pattern follows its end. If we can establish that there
5311 is nothing that they would both match, i.e., that we
5312 would have to backtrack because of (as in, e.g., `a*a')
5313 then we can change to pop_failure_jump, because we'll
5314 never have to backtrack.
5316 This is not true in the case of alternatives: in
5317 `(a|ab)*' we do need to backtrack to the `ab' alternative
5318 (e.g., if the string was `ab'). But instead of trying to
5319 detect that here, the alternative has put on a dummy
5320 failure point which is what we will end up popping. */
5322 /* Skip over open/close-group commands.
5323 If what follows this loop is a ...+ construct,
5324 look at what begins its body, since we will have to
5325 match at least one of that. */
5329 && ((re_opcode_t) *p2 == stop_memory
5330 || (re_opcode_t) *p2 == start_memory))
5332 else if (p2 + 6 < pend
5333 && (re_opcode_t) *p2 == dummy_failure_jump)
5340 /* p1[0] ... p1[2] are the `on_failure_jump' corresponding
5341 to the `maybe_finalize_jump' of this case. Examine what
5344 /* If we're at the end of the pattern, we can change. */
5347 /* Consider what happens when matching ":\(.*\)"
5348 against ":/". I don't really understand this code
5350 p[-3] = (unsigned char) pop_failure_jump;
5352 (" End of pattern: change to `pop_failure_jump'.\n");
5355 else if ((re_opcode_t) *p2 == exactn
5356 || (bufp->newline_anchor && (re_opcode_t) *p2 == endline))
5358 REGISTER unsigned char c
5359 = *p2 == (unsigned char) endline ? '\n' : p2[2];
5361 if ((re_opcode_t) p1[3] == exactn && p1[5] != c)
5363 p[-3] = (unsigned char) pop_failure_jump;
5364 DEBUG_PRINT3 (" %c != %c => pop_failure_jump.\n",
5368 else if ((re_opcode_t) p1[3] == charset
5369 || (re_opcode_t) p1[3] == charset_not)
5371 int not_p = (re_opcode_t) p1[3] == charset_not;
5373 if (c < (unsigned char) (p1[4] * BYTEWIDTH)
5374 && p1[5 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
5377 /* `not_p' is equal to 1 if c would match, which means
5378 that we can't change to pop_failure_jump. */
5381 p[-3] = (unsigned char) pop_failure_jump;
5382 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
5386 else if ((re_opcode_t) *p2 == charset)
5389 REGISTER unsigned char c
5390 = *p2 == (unsigned char) endline ? '\n' : p2[2];
5393 if ((re_opcode_t) p1[3] == exactn
5394 && ! ((int) p2[1] * BYTEWIDTH > (int) p1[5]
5395 && (p2[2 + p1[5] / BYTEWIDTH]
5396 & (1 << (p1[5] % BYTEWIDTH)))))
5398 p[-3] = (unsigned char) pop_failure_jump;
5399 DEBUG_PRINT3 (" %c != %c => pop_failure_jump.\n",
5403 else if ((re_opcode_t) p1[3] == charset_not)
5406 /* We win if the charset_not inside the loop
5407 lists every character listed in the charset after. */
5408 for (idx = 0; idx < (int) p2[1]; idx++)
5409 if (! (p2[2 + idx] == 0
5410 || (idx < (int) p1[4]
5411 && ((p2[2 + idx] & ~ p1[5 + idx]) == 0))))
5416 p[-3] = (unsigned char) pop_failure_jump;
5417 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
5420 else if ((re_opcode_t) p1[3] == charset)
5423 /* We win if the charset inside the loop
5424 has no overlap with the one after the loop. */
5426 idx < (int) p2[1] && idx < (int) p1[4];
5428 if ((p2[2 + idx] & p1[5 + idx]) != 0)
5431 if (idx == p2[1] || idx == p1[4])
5433 p[-3] = (unsigned char) pop_failure_jump;
5434 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
5439 p -= 2; /* Point at relative address again. */
5440 if ((re_opcode_t) p[-1] != pop_failure_jump)
5442 p[-1] = (unsigned char) jump;
5443 DEBUG_PRINT1 (" Match => jump.\n");
5444 goto unconditional_jump;
5446 /* Note fall through. */
5449 /* The end of a simple repeat has a pop_failure_jump back to
5450 its matching on_failure_jump, where the latter will push a
5451 failure point. The pop_failure_jump takes off failure
5452 points put on by this pop_failure_jump's matching
5453 on_failure_jump; we got through the pattern to here from the
5454 matching on_failure_jump, so didn't fail. */
5455 case pop_failure_jump:
5457 /* We need to pass separate storage for the lowest and
5458 highest registers, even though we don't care about the
5459 actual values. Otherwise, we will restore only one
5460 register from the stack, since lowest will == highest in
5461 `pop_failure_point'. */
5462 unsigned dummy_low_reg, dummy_high_reg;
5463 unsigned char *pdummy;
5464 re_char *sdummy = NULL;
5466 DEBUG_PRINT1 ("EXECUTING pop_failure_jump.\n");
5467 POP_FAILURE_POINT (sdummy, pdummy,
5468 dummy_low_reg, dummy_high_reg,
5469 reg_dummy, reg_dummy, reg_info_dummy);
5471 /* Note fall through. */
5474 /* Unconditionally jump (without popping any failure points). */
5477 EXTRACT_NUMBER_AND_INCR (mcnt, p); /* Get the amount to jump. */
5478 DEBUG_PRINT2 ("EXECUTING jump %d ", mcnt);
5479 p += mcnt; /* Do the jump. */
5480 DEBUG_PRINT2 ("(to 0x%lx).\n", (long) p);
5484 /* We need this opcode so we can detect where alternatives end
5485 in `group_match_null_string_p' et al. */
5487 DEBUG_PRINT1 ("EXECUTING jump_past_alt.\n");
5488 goto unconditional_jump;
5491 /* Normally, the on_failure_jump pushes a failure point, which
5492 then gets popped at pop_failure_jump. We will end up at
5493 pop_failure_jump, also, and with a pattern of, say, `a+', we
5494 are skipping over the on_failure_jump, so we have to push
5495 something meaningless for pop_failure_jump to pop. */
5496 case dummy_failure_jump:
5497 DEBUG_PRINT1 ("EXECUTING dummy_failure_jump.\n");
5498 /* It doesn't matter what we push for the string here. What
5499 the code at `fail' tests is the value for the pattern. */
5500 PUSH_FAILURE_POINT ((unsigned char *) 0, (unsigned char *) 0, -2);
5501 goto unconditional_jump;
5504 /* At the end of an alternative, we need to push a dummy failure
5505 point in case we are followed by a `pop_failure_jump', because
5506 we don't want the failure point for the alternative to be
5507 popped. For example, matching `(a|ab)*' against `aab'
5508 requires that we match the `ab' alternative. */
5509 case push_dummy_failure:
5510 DEBUG_PRINT1 ("EXECUTING push_dummy_failure.\n");
5511 /* See comments just above at `dummy_failure_jump' about the
5513 PUSH_FAILURE_POINT ((unsigned char *) 0, (unsigned char *) 0, -2);
5516 /* Have to succeed matching what follows at least n times.
5517 After that, handle like `on_failure_jump'. */
5519 EXTRACT_NUMBER (mcnt, p + 2);
5520 DEBUG_PRINT2 ("EXECUTING succeed_n %d.\n", mcnt);
5523 /* Originally, this is how many times we HAVE to succeed. */
5528 STORE_NUMBER_AND_INCR (p, mcnt);
5529 DEBUG_PRINT3 (" Setting 0x%lx to %d.\n", (long) p, mcnt);
5533 DEBUG_PRINT2 (" Setting two bytes from 0x%lx to no_op.\n",
5535 p[2] = (unsigned char) no_op;
5536 p[3] = (unsigned char) no_op;
5542 EXTRACT_NUMBER (mcnt, p + 2);
5543 DEBUG_PRINT2 ("EXECUTING jump_n %d.\n", mcnt);
5545 /* Originally, this is how many times we CAN jump. */
5549 STORE_NUMBER (p + 2, mcnt);
5550 goto unconditional_jump;
5552 /* If don't have to jump any more, skip over the rest of command. */
5559 DEBUG_PRINT1 ("EXECUTING set_number_at.\n");
5561 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5563 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5564 DEBUG_PRINT3 (" Setting 0x%lx to %d.\n", (long) p1, mcnt);
5565 STORE_NUMBER (p1, mcnt);
5570 DEBUG_PRINT1 ("EXECUTING wordbound.\n");
5576 if (AT_STRINGS_BEG (d) || AT_STRINGS_END (d))
5580 re_char *d_before = POS_BEFORE_GAP_UNSAFE (d);
5581 re_char *d_after = POS_AFTER_GAP_UNSAFE (d);
5583 /* emch1 is the character before d, syn1 is the syntax of emch1,
5584 emch2 is the character at d, and syn2 is the syntax of emch2. */
5585 Emchar emch1, emch2;
5591 DEC_CHARPTR (d_before);
5592 emch1 = charptr_emchar (d_before);
5593 emch2 = charptr_emchar (d_after);
5596 pos_before = SYNTAX_CACHE_BYTE_TO_CHAR (PTR_TO_OFFSET (d)) - 1;
5597 UPDATE_SYNTAX_CACHE (pos_before);
5599 syn1 = SYNTAX_FROM_CACHE (XCHAR_TABLE (regex_emacs_buffer->mirror_syntax_table),
5602 UPDATE_SYNTAX_CACHE_FORWARD (pos_before + 1);
5604 syn2 = SYNTAX_FROM_CACHE (XCHAR_TABLE (regex_emacs_buffer->mirror_syntax_table),
5607 result = ((syn1 == Sword) != (syn2 == Sword));
5609 if (result == should_succeed)
5615 DEBUG_PRINT1 ("EXECUTING notwordbound.\n");
5617 goto matchwordbound;
5620 DEBUG_PRINT1 ("EXECUTING wordbeg.\n");
5621 if (AT_STRINGS_END (d))
5624 /* XEmacs: this originally read:
5626 if (WORDCHAR_P (d) && (AT_STRINGS_BEG (d) || !WORDCHAR_P (d - 1)))
5630 re_char *dtmp = POS_AFTER_GAP_UNSAFE (d);
5631 Emchar emch = charptr_emchar (dtmp);
5633 int charpos = SYNTAX_CACHE_BYTE_TO_CHAR (PTR_TO_OFFSET (d));
5634 UPDATE_SYNTAX_CACHE (charpos);
5636 if (SYNTAX_FROM_CACHE (XCHAR_TABLE (regex_emacs_buffer->mirror_syntax_table),
5639 if (AT_STRINGS_BEG (d))
5641 dtmp = POS_BEFORE_GAP_UNSAFE (d);
5643 emch = charptr_emchar (dtmp);
5645 UPDATE_SYNTAX_CACHE_BACKWARD (charpos - 1);
5647 if (SYNTAX_FROM_CACHE (XCHAR_TABLE (regex_emacs_buffer->mirror_syntax_table),
5654 DEBUG_PRINT1 ("EXECUTING wordend.\n");
5655 if (AT_STRINGS_BEG (d))
5658 /* XEmacs: this originally read:
5660 if (!AT_STRINGS_BEG (d) && WORDCHAR_P (d - 1)
5661 && (!WORDCHAR_P (d) || AT_STRINGS_END (d)))
5664 The or condition is incorrect (reversed).
5669 int charpos = SYNTAX_CACHE_BYTE_TO_CHAR (PTR_TO_OFFSET (d)) - 1;
5670 UPDATE_SYNTAX_CACHE (charpos);
5672 dtmp = POS_BEFORE_GAP_UNSAFE (d);
5674 emch = charptr_emchar (dtmp);
5675 if (SYNTAX_FROM_CACHE (XCHAR_TABLE (regex_emacs_buffer->mirror_syntax_table),
5678 if (AT_STRINGS_END (d))
5680 dtmp = POS_AFTER_GAP_UNSAFE (d);
5681 emch = charptr_emchar (dtmp);
5683 UPDATE_SYNTAX_CACHE_FORWARD (charpos + 1);
5685 if (SYNTAX_FROM_CACHE (XCHAR_TABLE (regex_emacs_buffer->mirror_syntax_table),
5693 DEBUG_PRINT1 ("EXECUTING before_dot.\n");
5694 if (! (NILP (regex_match_object) || BUFFERP (regex_match_object))
5695 || (BUF_PTR_BYTE_POS (regex_emacs_buffer, (unsigned char *) d)
5696 >= BUF_PT (regex_emacs_buffer)))
5701 DEBUG_PRINT1 ("EXECUTING at_dot.\n");
5702 if (! (NILP (regex_match_object) || BUFFERP (regex_match_object))
5703 || (BUF_PTR_BYTE_POS (regex_emacs_buffer, (unsigned char *) d)
5704 != BUF_PT (regex_emacs_buffer)))
5709 DEBUG_PRINT1 ("EXECUTING after_dot.\n");
5710 if (! (NILP (regex_match_object) || BUFFERP (regex_match_object))
5711 || (BUF_PTR_BYTE_POS (regex_emacs_buffer, (unsigned char *) d)
5712 <= BUF_PT (regex_emacs_buffer)))
5715 #if 0 /* not emacs19 */
5717 DEBUG_PRINT1 ("EXECUTING at_dot.\n");
5718 if (BUF_PTR_BYTE_POS (regex_emacs_buffer, (unsigned char *) d) + 1
5719 != BUF_PT (regex_emacs_buffer))
5722 #endif /* not emacs19 */
5725 DEBUG_PRINT2 ("EXECUTING syntaxspec %d.\n", mcnt);
5730 DEBUG_PRINT1 ("EXECUTING Emacs wordchar.\n");
5742 int charpos = SYNTAX_CACHE_BYTE_TO_CHAR (PTR_TO_OFFSET (d));
5743 UPDATE_SYNTAX_CACHE (charpos);
5747 emch = charptr_emchar ((const Bufbyte *) d);
5749 matches = (SYNTAX_FROM_CACHE (regex_emacs_buffer->syntax_table,
5750 emch) == (enum syntaxcode) mcnt);
5752 matches = (SYNTAX_FROM_CACHE (regex_emacs_buffer->mirror_syntax_table,
5753 emch) == (enum syntaxcode) mcnt);
5756 if (matches != should_succeed)
5758 SET_REGS_MATCHED ();
5763 DEBUG_PRINT2 ("EXECUTING notsyntaxspec %d.\n", mcnt);
5765 goto matchnotsyntax;
5768 DEBUG_PRINT1 ("EXECUTING Emacs notwordchar.\n");
5772 goto matchornotsyntax;
5775 /* 97/2/17 jhod Mule category code patch */
5784 emch = charptr_emchar ((const Bufbyte *) d);
5786 if (check_category_char(emch, regex_emacs_buffer->category_table,
5787 mcnt, should_succeed))
5789 SET_REGS_MATCHED ();
5793 case notcategoryspec:
5795 goto matchornotcategory;
5796 /* end of category patch */
5798 #else /* not emacs */
5800 DEBUG_PRINT1 ("EXECUTING non-Emacs wordchar.\n");
5802 if (!WORDCHAR_P_UNSAFE ((int) (*d)))
5804 SET_REGS_MATCHED ();
5809 DEBUG_PRINT1 ("EXECUTING non-Emacs notwordchar.\n");
5811 if (!WORDCHAR_P_UNSAFE ((int) (*d)))
5813 SET_REGS_MATCHED ();
5821 continue; /* Successfully executed one pattern command; keep going. */
5824 /* We goto here if a matching operation fails. */
5826 if (!FAIL_STACK_EMPTY ())
5827 { /* A restart point is known. Restore to that state. */
5828 DEBUG_PRINT1 ("\nFAIL:\n");
5829 POP_FAILURE_POINT (d, p,
5830 lowest_active_reg, highest_active_reg,
5831 regstart, regend, reg_info);
5833 /* If this failure point is a dummy, try the next one. */
5837 /* If we failed to the end of the pattern, don't examine *p. */
5841 re_bool is_a_jump_n = false;
5843 /* If failed to a backwards jump that's part of a repetition
5844 loop, need to pop this failure point and use the next one. */
5845 switch ((re_opcode_t) *p)
5849 case maybe_pop_jump:
5850 case pop_failure_jump:
5853 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
5856 if ((is_a_jump_n && (re_opcode_t) *p1 == succeed_n)
5858 && (re_opcode_t) *p1 == on_failure_jump))
5866 if (d >= string1 && d <= end1)
5870 break; /* Matching at this starting point really fails. */
5874 goto restore_best_regs;
5878 return -1; /* Failure to match. */
5881 /* Subroutine definitions for re_match_2. */
5884 /* We are passed P pointing to a register number after a start_memory.
5886 Return true if the pattern up to the corresponding stop_memory can
5887 match the empty string, and false otherwise.
5889 If we find the matching stop_memory, sets P to point to one past its number.
5890 Otherwise, sets P to an undefined byte less than or equal to END.
5892 We don't handle duplicates properly (yet). */
5895 group_match_null_string_p (unsigned char **p, unsigned char *end,
5896 register_info_type *reg_info)
5899 /* Point to after the args to the start_memory. */
5900 unsigned char *p1 = *p + 2;
5904 /* Skip over opcodes that can match nothing, and return true or
5905 false, as appropriate, when we get to one that can't, or to the
5906 matching stop_memory. */
5908 switch ((re_opcode_t) *p1)
5910 /* Could be either a loop or a series of alternatives. */
5911 case on_failure_jump:
5913 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
5915 /* If the next operation is not a jump backwards in the
5920 /* Go through the on_failure_jumps of the alternatives,
5921 seeing if any of the alternatives cannot match nothing.
5922 The last alternative starts with only a jump,
5923 whereas the rest start with on_failure_jump and end
5924 with a jump, e.g., here is the pattern for `a|b|c':
5926 /on_failure_jump/0/6/exactn/1/a/jump_past_alt/0/6
5927 /on_failure_jump/0/6/exactn/1/b/jump_past_alt/0/3
5930 So, we have to first go through the first (n-1)
5931 alternatives and then deal with the last one separately. */
5934 /* Deal with the first (n-1) alternatives, which start
5935 with an on_failure_jump (see above) that jumps to right
5936 past a jump_past_alt. */
5938 while ((re_opcode_t) p1[mcnt-3] == jump_past_alt)
5940 /* `mcnt' holds how many bytes long the alternative
5941 is, including the ending `jump_past_alt' and
5944 if (!alt_match_null_string_p (p1, p1 + mcnt - 3,
5948 /* Move to right after this alternative, including the
5952 /* Break if it's the beginning of an n-th alternative
5953 that doesn't begin with an on_failure_jump. */
5954 if ((re_opcode_t) *p1 != on_failure_jump)
5957 /* Still have to check that it's not an n-th
5958 alternative that starts with an on_failure_jump. */
5960 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
5961 if ((re_opcode_t) p1[mcnt-3] != jump_past_alt)
5963 /* Get to the beginning of the n-th alternative. */
5969 /* Deal with the last alternative: go back and get number
5970 of the `jump_past_alt' just before it. `mcnt' contains
5971 the length of the alternative. */
5972 EXTRACT_NUMBER (mcnt, p1 - 2);
5974 if (!alt_match_null_string_p (p1, p1 + mcnt, reg_info))
5977 p1 += mcnt; /* Get past the n-th alternative. */
5983 assert (p1[1] == **p);
5989 if (!common_op_match_null_string_p (&p1, end, reg_info))
5992 } /* while p1 < end */
5995 } /* group_match_null_string_p */
5998 /* Similar to group_match_null_string_p, but doesn't deal with alternatives:
5999 It expects P to be the first byte of a single alternative and END one
6000 byte past the last. The alternative can contain groups. */
6003 alt_match_null_string_p (unsigned char *p, unsigned char *end,
6004 register_info_type *reg_info)
6007 unsigned char *p1 = p;
6011 /* Skip over opcodes that can match nothing, and break when we get
6012 to one that can't. */
6014 switch ((re_opcode_t) *p1)
6017 case on_failure_jump:
6019 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
6024 if (!common_op_match_null_string_p (&p1, end, reg_info))
6027 } /* while p1 < end */
6030 } /* alt_match_null_string_p */
6033 /* Deals with the ops common to group_match_null_string_p and
6034 alt_match_null_string_p.
6036 Sets P to one after the op and its arguments, if any. */
6039 common_op_match_null_string_p (unsigned char **p, unsigned char *end,
6040 register_info_type *reg_info)
6045 unsigned char *p1 = *p;
6047 switch ((re_opcode_t) *p1++)
6067 assert (reg_no > 0 && reg_no <= MAX_REGNUM);
6068 ret = group_match_null_string_p (&p1, end, reg_info);
6070 /* Have to set this here in case we're checking a group which
6071 contains a group and a back reference to it. */
6073 if (REG_MATCH_NULL_STRING_P (reg_info[reg_no]) == MATCH_NULL_UNSET_VALUE)
6074 REG_MATCH_NULL_STRING_P (reg_info[reg_no]) = ret;
6080 /* If this is an optimized succeed_n for zero times, make the jump. */
6082 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
6090 /* Get to the number of times to succeed. */
6092 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
6097 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
6105 if (!REG_MATCH_NULL_STRING_P (reg_info[*p1]))
6113 /* All other opcodes mean we cannot match the empty string. */
6119 } /* common_op_match_null_string_p */
6122 /* Return zero if TRANSLATE[S1] and TRANSLATE[S2] are identical for LEN
6123 bytes; nonzero otherwise. */
6126 bcmp_translate (re_char *s1, re_char *s2,
6127 REGISTER int len, RE_TRANSLATE_TYPE translate)
6129 REGISTER const unsigned char *p1 = s1, *p2 = s2;
6131 const unsigned char *p1_end = s1 + len;
6132 const unsigned char *p2_end = s2 + len;
6134 while (p1 != p1_end && p2 != p2_end)
6136 Emchar p1_ch, p2_ch;
6138 p1_ch = charptr_emchar (p1);
6139 p2_ch = charptr_emchar (p2);
6141 if (RE_TRANSLATE (p1_ch)
6142 != RE_TRANSLATE (p2_ch))
6147 #else /* not MULE */
6150 if (RE_TRANSLATE (*p1++) != RE_TRANSLATE (*p2++)) return 1;
6157 /* Entry points for GNU code. */
6159 /* re_compile_pattern is the GNU regular expression compiler: it
6160 compiles PATTERN (of length SIZE) and puts the result in BUFP.
6161 Returns 0 if the pattern was valid, otherwise an error string.
6163 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
6164 are set in BUFP on entry.
6166 We call regex_compile to do the actual compilation. */
6169 re_compile_pattern (const char *pattern, int length,
6170 struct re_pattern_buffer *bufp)
6174 /* GNU code is written to assume at least RE_NREGS registers will be set
6175 (and at least one extra will be -1). */
6176 bufp->regs_allocated = REGS_UNALLOCATED;
6178 /* And GNU code determines whether or not to get register information
6179 by passing null for the REGS argument to re_match, etc., not by
6183 /* Match anchors at newline. */
6184 bufp->newline_anchor = 1;
6186 ret = regex_compile ((unsigned char *) pattern, length, re_syntax_options, bufp);
6190 return gettext (re_error_msgid[(int) ret]);
6193 /* Entry points compatible with 4.2 BSD regex library. We don't define
6194 them unless specifically requested. */
6196 #ifdef _REGEX_RE_COMP
6198 /* BSD has one and only one pattern buffer. */
6199 static struct re_pattern_buffer re_comp_buf;
6202 re_comp (const char *s)
6208 if (!re_comp_buf.buffer)
6209 return gettext ("No previous regular expression");
6213 if (!re_comp_buf.buffer)
6215 re_comp_buf.buffer = (unsigned char *) malloc (200);
6216 if (re_comp_buf.buffer == NULL)
6217 return gettext (re_error_msgid[(int) REG_ESPACE]);
6218 re_comp_buf.allocated = 200;
6220 re_comp_buf.fastmap = (char *) malloc (1 << BYTEWIDTH);
6221 if (re_comp_buf.fastmap == NULL)
6222 return gettext (re_error_msgid[(int) REG_ESPACE]);
6225 /* Since `re_exec' always passes NULL for the `regs' argument, we
6226 don't need to initialize the pattern buffer fields which affect it. */
6228 /* Match anchors at newlines. */
6229 re_comp_buf.newline_anchor = 1;
6231 ret = regex_compile ((unsigned char *)s, strlen (s), re_syntax_options, &re_comp_buf);
6236 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
6237 return (char *) gettext (re_error_msgid[(int) ret]);
6242 re_exec (const char *s)
6244 const int len = strlen (s);
6246 0 <= re_search (&re_comp_buf, s, len, 0, len, (struct re_registers *) 0);
6248 #endif /* _REGEX_RE_COMP */
6250 /* POSIX.2 functions. Don't define these for Emacs. */
6254 /* regcomp takes a regular expression as a string and compiles it.
6256 PREG is a regex_t *. We do not expect any fields to be initialized,
6257 since POSIX says we shouldn't. Thus, we set
6259 `buffer' to the compiled pattern;
6260 `used' to the length of the compiled pattern;
6261 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
6262 REG_EXTENDED bit in CFLAGS is set; otherwise, to
6263 RE_SYNTAX_POSIX_BASIC;
6264 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
6265 `fastmap' and `fastmap_accurate' to zero;
6266 `re_nsub' to the number of subexpressions in PATTERN.
6268 PATTERN is the address of the pattern string.
6270 CFLAGS is a series of bits which affect compilation.
6272 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
6273 use POSIX basic syntax.
6275 If REG_NEWLINE is set, then . and [^...] don't match newline.
6276 Also, regexec will try a match beginning after every newline.
6278 If REG_ICASE is set, then we considers upper- and lowercase
6279 versions of letters to be equivalent when matching.
6281 If REG_NOSUB is set, then when PREG is passed to regexec, that
6282 routine will report only success or failure, and nothing about the
6285 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
6286 the return codes and their meanings.) */
6289 regcomp (regex_t *preg, const char *pattern, int cflags)
6293 = (cflags & REG_EXTENDED) ?
6294 RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC;
6296 /* regex_compile will allocate the space for the compiled pattern. */
6298 preg->allocated = 0;
6301 /* Don't bother to use a fastmap when searching. This simplifies the
6302 REG_NEWLINE case: if we used a fastmap, we'd have to put all the
6303 characters after newlines into the fastmap. This way, we just try
6307 if (cflags & REG_ICASE)
6311 preg->translate = (char *) malloc (CHAR_SET_SIZE);
6312 if (preg->translate == NULL)
6313 return (int) REG_ESPACE;
6315 /* Map uppercase characters to corresponding lowercase ones. */
6316 for (i = 0; i < CHAR_SET_SIZE; i++)
6317 preg->translate[i] = ISUPPER (i) ? tolower (i) : i;
6320 preg->translate = NULL;
6322 /* If REG_NEWLINE is set, newlines are treated differently. */
6323 if (cflags & REG_NEWLINE)
6324 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
6325 syntax &= ~RE_DOT_NEWLINE;
6326 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
6327 /* It also changes the matching behavior. */
6328 preg->newline_anchor = 1;
6331 preg->newline_anchor = 0;
6333 preg->no_sub = !!(cflags & REG_NOSUB);
6335 /* POSIX says a null character in the pattern terminates it, so we
6336 can use strlen here in compiling the pattern. */
6337 ret = regex_compile ((unsigned char *) pattern, strlen (pattern), syntax, preg);
6339 /* POSIX doesn't distinguish between an unmatched open-group and an
6340 unmatched close-group: both are REG_EPAREN. */
6341 if (ret == REG_ERPAREN) ret = REG_EPAREN;
6347 /* regexec searches for a given pattern, specified by PREG, in the
6350 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
6351 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
6352 least NMATCH elements, and we set them to the offsets of the
6353 corresponding matched substrings.
6355 EFLAGS specifies `execution flags' which affect matching: if
6356 REG_NOTBOL is set, then ^ does not match at the beginning of the
6357 string; if REG_NOTEOL is set, then $ does not match at the end.
6359 We return 0 if we find a match and REG_NOMATCH if not. */
6362 regexec (const regex_t *preg, const char *string, size_t nmatch,
6363 regmatch_t pmatch[], int eflags)
6366 struct re_registers regs;
6367 regex_t private_preg;
6368 int len = strlen (string);
6369 re_bool want_reg_info = !preg->no_sub && nmatch > 0;
6371 private_preg = *preg;
6373 private_preg.not_bol = !!(eflags & REG_NOTBOL);
6374 private_preg.not_eol = !!(eflags & REG_NOTEOL);
6376 /* The user has told us exactly how many registers to return
6377 information about, via `nmatch'. We have to pass that on to the
6378 matching routines. */
6379 private_preg.regs_allocated = REGS_FIXED;
6383 regs.num_regs = nmatch;
6384 regs.start = TALLOC (nmatch, regoff_t);
6385 regs.end = TALLOC (nmatch, regoff_t);
6386 if (regs.start == NULL || regs.end == NULL)
6387 return (int) REG_NOMATCH;
6390 /* Perform the searching operation. */
6391 ret = re_search (&private_preg, string, len,
6392 /* start: */ 0, /* range: */ len,
6393 want_reg_info ? ®s : (struct re_registers *) 0);
6395 /* Copy the register information to the POSIX structure. */
6402 for (r = 0; r < nmatch; r++)
6404 pmatch[r].rm_so = regs.start[r];
6405 pmatch[r].rm_eo = regs.end[r];
6409 /* If we needed the temporary register info, free the space now. */
6414 /* We want zero return to mean success, unlike `re_search'. */
6415 return ret >= 0 ? (int) REG_NOERROR : (int) REG_NOMATCH;
6419 /* Returns a message corresponding to an error code, ERRCODE, returned
6420 from either regcomp or regexec. We don't use PREG here. */
6423 regerror (int errcode, const regex_t *preg, char *errbuf, size_t errbuf_size)
6429 || errcode >= (sizeof (re_error_msgid) / sizeof (re_error_msgid[0])))
6430 /* Only error codes returned by the rest of the code should be passed
6431 to this routine. If we are given anything else, or if other regex
6432 code generates an invalid error code, then the program has a bug.
6433 Dump core so we can fix it. */
6436 msg = gettext (re_error_msgid[errcode]);
6438 msg_size = strlen (msg) + 1; /* Includes the null. */
6440 if (errbuf_size != 0)
6442 if (msg_size > errbuf_size)
6444 strncpy (errbuf, msg, errbuf_size - 1);
6445 errbuf[errbuf_size - 1] = 0;
6448 strcpy (errbuf, msg);
6455 /* Free dynamically allocated space used by PREG. */
6458 regfree (regex_t *preg)
6460 if (preg->buffer != NULL)
6461 free (preg->buffer);
6462 preg->buffer = NULL;
6464 preg->allocated = 0;
6467 if (preg->fastmap != NULL)
6468 free (preg->fastmap);
6469 preg->fastmap = NULL;
6470 preg->fastmap_accurate = 0;
6472 if (preg->translate != NULL)
6473 free (preg->translate);
6474 preg->translate = NULL;
6477 #endif /* not emacs */
6481 make-backup-files: t
6483 trim-versions-without-asking: nil