1 /* Extended regular expression matching and search library,
2 version 0.12, extended for XEmacs.
3 (Implements POSIX draft P10003.2/D11.2, except for
4 internationalization features.)
6 Copyright (C) 1993, 1994, 1995 Free Software Foundation, Inc.
7 Copyright (C) 1995 Sun Microsystems, Inc.
8 Copyright (C) 1995 Ben Wing.
10 This program is free software; you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 2, or (at your option)
15 This program is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License for more details.
20 You should have received a copy of the GNU General Public License
21 along with this program; see the file COPYING. If not, write to
22 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 Boston, MA 02111-1307, USA. */
25 /* Synched up with: FSF 19.29. */
27 /* Changes made for XEmacs:
29 (1) the REGEX_BEGLINE_CHECK code from the XEmacs v18 regex routines
30 was added. This causes a huge speedup in font-locking.
31 (2) Rel-alloc is disabled when the MMAP version of rel-alloc is
32 being used, because it's too slow -- all those calls to mmap()
33 add humongous overhead.
34 (3) Lots and lots of changes for Mule. They are bracketed by
35 `#ifdef MULE' or with comments that have `XEmacs' in them.
42 #ifndef REGISTER /* Rigidly enforced as of 20.3 */
51 /* Converts the pointer to the char to BEG-based offset from the start. */
52 #define PTR_TO_OFFSET(d) (MATCHING_IN_FIRST_STRING \
53 ? (d) - string1 : (d) - (string2 - size1))
55 #define PTR_TO_OFFSET(d) 0
58 /* We assume non-Mule if emacs isn't defined. */
63 /* We need this for `regex.h', and perhaps for the Emacs include files. */
64 #include <sys/types.h>
66 /* This is for other GNU distributions with internationalized messages. */
67 #if defined (I18N3) && (defined (HAVE_LIBINTL_H) || defined (_LIBC))
70 # define gettext(msgid) (msgid)
73 /* XEmacs: define this to add in a speedup for patterns anchored at
74 the beginning of a line. Keep the ifdefs so that it's easier to
75 tell where/why this code has diverged from v19. */
76 #define REGEX_BEGLINE_CHECK
78 /* XEmacs: the current mmap-based ralloc handles small blocks very
79 poorly, so we disable it here. */
81 #if (defined (REL_ALLOC) && defined (HAVE_MMAP)) || defined(DOUG_LEA_MALLOC)
85 /* The `emacs' switch turns on certain matching commands
86 that make sense only in Emacs. */
93 #if (defined (DEBUG_XEMACS) && !defined (DEBUG))
99 Lisp_Object Vthe_lisp_rangetab;
102 complex_vars_of_regex (void)
104 Vthe_lisp_rangetab = Fmake_range_table ();
105 staticpro (&Vthe_lisp_rangetab);
111 complex_vars_of_regex (void)
117 #define RE_TRANSLATE(ch) TRT_TABLE_OF (translate, (Emchar) ch)
118 #define TRANSLATE_P(tr) (!NILP (tr))
120 #else /* not emacs */
122 /* If we are not linking with Emacs proper,
123 we can't use the relocating allocator
124 even if config.h says that we can. */
127 #if defined (STDC_HEADERS) || defined (_LIBC)
134 #define charptr_emchar(str) ((Emchar) (str)[0])
136 #if (LONGBITS > INTBITS)
137 # define EMACS_INT long
139 # define EMACS_INT int
144 #define INC_CHARPTR(p) ((p)++)
145 #define DEC_CHARPTR(p) ((p)--)
149 /* Define the syntax stuff for \<, \>, etc. */
151 /* This must be nonzero for the wordchar and notwordchar pattern
152 commands in re_match_2. */
159 extern char *re_syntax_table;
161 #else /* not SYNTAX_TABLE */
163 /* How many characters in the character set. */
164 #define CHAR_SET_SIZE 256
166 static char re_syntax_table[CHAR_SET_SIZE];
169 init_syntax_once (void)
175 const char *word_syntax_chars =
176 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789_";
178 memset (re_syntax_table, 0, sizeof (re_syntax_table));
180 while (*word_syntax_chars)
181 re_syntax_table[(unsigned int)(*word_syntax_chars++)] = Sword;
187 #endif /* SYNTAX_TABLE */
189 #define SYNTAX_UNSAFE(ignored, c) re_syntax_table[c]
190 #undef SYNTAX_FROM_CACHE
191 #define SYNTAX_FROM_CACHE SYNTAX_UNSAFE
193 #define RE_TRANSLATE(c) translate[(unsigned char) (c)]
194 #define TRANSLATE_P(tr) tr
198 /* Under XEmacs, this is needed because we don't define it elsewhere. */
199 #ifdef SWITCH_ENUM_BUG
200 #define SWITCH_ENUM_CAST(x) ((int)(x))
202 #define SWITCH_ENUM_CAST(x) (x)
206 /* Get the interface, including the syntax bits. */
209 /* isalpha etc. are used for the character classes. */
212 /* Jim Meyering writes:
214 "... Some ctype macros are valid only for character codes that
215 isascii says are ASCII (SGI's IRIX-4.0.5 is one such system --when
216 using /bin/cc or gcc but without giving an ansi option). So, all
217 ctype uses should be through macros like ISPRINT... If
218 STDC_HEADERS is defined, then autoconf has verified that the ctype
219 macros don't need to be guarded with references to isascii. ...
220 Defining isascii to 1 should let any compiler worth its salt
221 eliminate the && through constant folding." */
223 #if defined (STDC_HEADERS) || (!defined (isascii) && !defined (HAVE_ISASCII))
224 #define ISASCII_1(c) 1
226 #define ISASCII_1(c) isascii(c)
230 /* The IS*() macros can be passed any character, including an extended
231 one. We need to make sure there are no crashes, which would occur
232 otherwise due to out-of-bounds array references. */
233 #define ISASCII(c) (((EMACS_UINT) (c)) < 0x100 && ISASCII_1 (c))
235 #define ISASCII(c) ISASCII_1 (c)
239 #define ISBLANK(c) (ISASCII (c) && isblank (c))
241 #define ISBLANK(c) ((c) == ' ' || (c) == '\t')
244 #define ISGRAPH(c) (ISASCII (c) && isgraph (c))
246 #define ISGRAPH(c) (ISASCII (c) && isprint (c) && !isspace (c))
249 #define ISPRINT(c) (ISASCII (c) && isprint (c))
250 #define ISDIGIT(c) (ISASCII (c) && isdigit (c))
251 #define ISALNUM(c) (ISASCII (c) && isalnum (c))
252 #define ISALPHA(c) (ISASCII (c) && isalpha (c))
253 #define ISCNTRL(c) (ISASCII (c) && iscntrl (c))
254 #define ISLOWER(c) (ISASCII (c) && islower (c))
255 #define ISPUNCT(c) (ISASCII (c) && ispunct (c))
256 #define ISSPACE(c) (ISASCII (c) && isspace (c))
257 #define ISUPPER(c) (ISASCII (c) && isupper (c))
258 #define ISXDIGIT(c) (ISASCII (c) && isxdigit (c))
261 #define NULL (void *)0
264 /* We remove any previous definition of `SIGN_EXTEND_CHAR',
265 since ours (we hope) works properly with all combinations of
266 machines, compilers, `char' and `unsigned char' argument types.
267 (Per Bothner suggested the basic approach.) */
268 #undef SIGN_EXTEND_CHAR
270 #define SIGN_EXTEND_CHAR(c) ((signed char) (c))
271 #else /* not __STDC__ */
272 /* As in Harbison and Steele. */
273 #define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128)
276 /* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we
277 use `alloca' instead of `malloc'. This is because using malloc in
278 re_search* or re_match* could cause memory leaks when C-g is used in
279 Emacs; also, malloc is slower and causes storage fragmentation. On
280 the other hand, malloc is more portable, and easier to debug.
282 Because we sometimes use alloca, some routines have to be macros,
283 not functions -- `alloca'-allocated space disappears at the end of the
284 function it is called in. */
288 #define REGEX_ALLOCATE malloc
289 #define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize)
290 #define REGEX_FREE free
292 #else /* not REGEX_MALLOC */
294 /* Emacs already defines alloca, sometimes. */
297 /* Make alloca work the best possible way. */
299 #define alloca __builtin_alloca
300 #else /* not __GNUC__ */
303 #else /* not __GNUC__ or HAVE_ALLOCA_H */
304 #ifndef _AIX /* Already did AIX, up at the top. */
306 #endif /* not _AIX */
307 #endif /* HAVE_ALLOCA_H */
308 #endif /* __GNUC__ */
310 #endif /* not alloca */
312 #define REGEX_ALLOCATE alloca
314 /* Assumes a `char *destination' variable. */
315 #define REGEX_REALLOCATE(source, osize, nsize) \
316 (destination = (char *) alloca (nsize), \
317 memmove (destination, source, osize), \
320 /* No need to do anything to free, after alloca. */
321 #define REGEX_FREE(arg) ((void)0) /* Do nothing! But inhibit gcc warning. */
323 #endif /* REGEX_MALLOC */
325 /* Define how to allocate the failure stack. */
328 #define REGEX_ALLOCATE_STACK(size) \
329 r_alloc ((char **) &failure_stack_ptr, (size))
330 #define REGEX_REALLOCATE_STACK(source, osize, nsize) \
331 r_re_alloc ((char **) &failure_stack_ptr, (nsize))
332 #define REGEX_FREE_STACK(ptr) \
333 r_alloc_free ((void **) &failure_stack_ptr)
335 #else /* not REL_ALLOC */
339 #define REGEX_ALLOCATE_STACK malloc
340 #define REGEX_REALLOCATE_STACK(source, osize, nsize) realloc (source, nsize)
341 #define REGEX_FREE_STACK free
343 #else /* not REGEX_MALLOC */
345 #define REGEX_ALLOCATE_STACK alloca
347 #define REGEX_REALLOCATE_STACK(source, osize, nsize) \
348 REGEX_REALLOCATE (source, osize, nsize)
349 /* No need to explicitly free anything. */
350 #define REGEX_FREE_STACK(arg)
352 #endif /* REGEX_MALLOC */
353 #endif /* REL_ALLOC */
356 /* True if `size1' is non-NULL and PTR is pointing anywhere inside
357 `string1' or just past its end. This works if PTR is NULL, which is
359 #define FIRST_STRING_P(ptr) \
360 (size1 && string1 <= (ptr) && (ptr) <= string1 + size1)
362 /* (Re)Allocate N items of type T using malloc, or fail. */
363 #define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t)))
364 #define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, (n) * sizeof (t)))
365 #define RETALLOC_IF(addr, n, t) \
366 if (addr) RETALLOC((addr), (n), t); else (addr) = TALLOC ((n), t)
367 #define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t)))
369 #define BYTEWIDTH 8 /* In bits. */
371 #define STREQ(s1, s2) (strcmp (s1, s2) == 0)
375 #define MAX(a, b) ((a) > (b) ? (a) : (b))
376 #define MIN(a, b) ((a) < (b) ? (a) : (b))
378 /* Type of source-pattern and string chars. */
379 typedef const unsigned char re_char;
381 typedef char re_bool;
386 /* These are the command codes that appear in compiled regular
387 expressions. Some opcodes are followed by argument bytes. A
388 command code can specify any interpretation whatsoever for its
389 arguments. Zero bytes may appear in the compiled regular expression. */
395 /* Succeed right away--no more backtracking. */
398 /* Followed by one byte giving n, then by n literal bytes. */
401 /* Matches any (more or less) character. */
404 /* Matches any one char belonging to specified set. First
405 following byte is number of bitmap bytes. Then come bytes
406 for a bitmap saying which chars are in. Bits in each byte
407 are ordered low-bit-first. A character is in the set if its
408 bit is 1. A character too large to have a bit in the map is
409 automatically not in the set. */
412 /* Same parameters as charset, but match any character that is
413 not one of those specified. */
416 /* Start remembering the text that is matched, for storing in a
417 register. Followed by one byte with the register number, in
418 the range 0 to one less than the pattern buffer's re_nsub
419 field. Then followed by one byte with the number of groups
420 inner to this one. (This last has to be part of the
421 start_memory only because we need it in the on_failure_jump
425 /* Stop remembering the text that is matched and store it in a
426 memory register. Followed by one byte with the register
427 number, in the range 0 to one less than `re_nsub' in the
428 pattern buffer, and one byte with the number of inner groups,
429 just like `start_memory'. (We need the number of inner
430 groups here because we don't have any easy way of finding the
431 corresponding start_memory when we're at a stop_memory.) */
434 /* Match a duplicate of something remembered. Followed by one
435 byte containing the register number. */
438 /* Fail unless at beginning of line. */
441 /* Fail unless at end of line. */
444 /* Succeeds if at beginning of buffer (if emacs) or at beginning
445 of string to be matched (if not). */
448 /* Analogously, for end of buffer/string. */
451 /* Followed by two byte relative address to which to jump. */
454 /* Same as jump, but marks the end of an alternative. */
457 /* Followed by two-byte relative address of place to resume at
458 in case of failure. */
461 /* Like on_failure_jump, but pushes a placeholder instead of the
462 current string position when executed. */
463 on_failure_keep_string_jump,
465 /* Throw away latest failure point and then jump to following
466 two-byte relative address. */
469 /* Change to pop_failure_jump if know won't have to backtrack to
470 match; otherwise change to jump. This is used to jump
471 back to the beginning of a repeat. If what follows this jump
472 clearly won't match what the repeat does, such that we can be
473 sure that there is no use backtracking out of repetitions
474 already matched, then we change it to a pop_failure_jump.
475 Followed by two-byte address. */
478 /* Jump to following two-byte address, and push a dummy failure
479 point. This failure point will be thrown away if an attempt
480 is made to use it for a failure. A `+' construct makes this
481 before the first repeat. Also used as an intermediary kind
482 of jump when compiling an alternative. */
485 /* Push a dummy failure point and continue. Used at the end of
489 /* Followed by two-byte relative address and two-byte number n.
490 After matching N times, jump to the address upon failure. */
493 /* Followed by two-byte relative address, and two-byte number n.
494 Jump to the address N times, then fail. */
497 /* Set the following two-byte relative address to the
498 subsequent two-byte number. The address *includes* the two
502 wordchar, /* Matches any word-constituent character. */
503 notwordchar, /* Matches any char that is not a word-constituent. */
505 wordbeg, /* Succeeds if at word beginning. */
506 wordend, /* Succeeds if at word end. */
508 wordbound, /* Succeeds if at a word boundary. */
509 notwordbound /* Succeeds if not at a word boundary. */
512 ,before_dot, /* Succeeds if before point. */
513 at_dot, /* Succeeds if at point. */
514 after_dot, /* Succeeds if after point. */
516 /* Matches any character whose syntax is specified. Followed by
517 a byte which contains a syntax code, e.g., Sword. */
520 /* Matches any character whose syntax is not that specified. */
526 /* need extra stuff to be able to properly work with XEmacs/Mule
527 characters (which may take up more than one byte) */
529 ,charset_mule, /* Matches any character belonging to specified set.
530 The set is stored in "unified range-table
531 format"; see rangetab.c. Unlike the `charset'
532 opcode, this can handle arbitrary characters. */
534 charset_mule_not /* Same parameters as charset_mule, but match any
535 character that is not one of those specified. */
537 /* 97/2/17 jhod: The following two were merged back in from the Mule
538 2.3 code to enable some language specific processing */
539 ,categoryspec, /* Matches entries in the character category tables */
540 notcategoryspec /* The opposite of the above */
545 /* Common operations on the compiled pattern. */
547 /* Store NUMBER in two contiguous bytes starting at DESTINATION. */
549 #define STORE_NUMBER(destination, number) \
551 (destination)[0] = (number) & 0377; \
552 (destination)[1] = (number) >> 8; \
555 /* Same as STORE_NUMBER, except increment DESTINATION to
556 the byte after where the number is stored. Therefore, DESTINATION
557 must be an lvalue. */
559 #define STORE_NUMBER_AND_INCR(destination, number) \
561 STORE_NUMBER (destination, number); \
562 (destination) += 2; \
565 /* Put into DESTINATION a number stored in two contiguous bytes starting
568 #define EXTRACT_NUMBER(destination, source) \
570 (destination) = *(source) & 0377; \
571 (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) << 8; \
576 extract_number (int *dest, re_char *source)
578 int temp = SIGN_EXTEND_CHAR (*(source + 1));
579 *dest = *source & 0377;
583 #ifndef EXTRACT_MACROS /* To debug the macros. */
584 #undef EXTRACT_NUMBER
585 #define EXTRACT_NUMBER(dest, src) extract_number (&dest, src)
586 #endif /* not EXTRACT_MACROS */
590 /* Same as EXTRACT_NUMBER, except increment SOURCE to after the number.
591 SOURCE must be an lvalue. */
593 #define EXTRACT_NUMBER_AND_INCR(destination, source) \
595 EXTRACT_NUMBER (destination, source); \
601 extract_number_and_incr (int *destination, unsigned char **source)
603 extract_number (destination, *source);
607 #ifndef EXTRACT_MACROS
608 #undef EXTRACT_NUMBER_AND_INCR
609 #define EXTRACT_NUMBER_AND_INCR(dest, src) \
610 extract_number_and_incr (&dest, &src)
611 #endif /* not EXTRACT_MACROS */
615 /* If DEBUG is defined, Regex prints many voluminous messages about what
616 it is doing (if the variable `debug' is nonzero). If linked with the
617 main program in `iregex.c', you can enter patterns and strings
618 interactively. And if linked with the main program in `main.c' and
619 the other test files, you can run the already-written tests. */
623 /* We use standard I/O for debugging. */
627 /* XEmacs provides its own version of assert() */
628 /* It is useful to test things that ``must'' be true when debugging. */
632 static int debug = 0;
634 #define DEBUG_STATEMENT(e) e
635 #define DEBUG_PRINT1(x) if (debug) printf (x)
636 #define DEBUG_PRINT2(x1, x2) if (debug) printf (x1, x2)
637 #define DEBUG_PRINT3(x1, x2, x3) if (debug) printf (x1, x2, x3)
638 #define DEBUG_PRINT4(x1, x2, x3, x4) if (debug) printf (x1, x2, x3, x4)
639 #define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \
640 if (debug) print_partial_compiled_pattern (s, e)
641 #define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \
642 if (debug) print_double_string (w, s1, sz1, s2, sz2)
645 /* Print the fastmap in human-readable form. */
648 print_fastmap (char *fastmap)
650 unsigned was_a_range = 0;
653 while (i < (1 << BYTEWIDTH))
659 while (i < (1 << BYTEWIDTH) && fastmap[i])
675 /* Print a compiled pattern string in human-readable form, starting at
676 the START pointer into it and ending just before the pointer END. */
679 print_partial_compiled_pattern (re_char *start, re_char *end)
682 unsigned char *p = (unsigned char *) start;
691 /* Loop over pattern commands. */
694 printf ("%ld:\t", (long)(p - start));
696 switch ((re_opcode_t) *p++)
704 printf ("/exactn/%d", mcnt);
715 printf ("/start_memory/%d/%d", mcnt, *p++);
720 printf ("/stop_memory/%d/%d", mcnt, *p++);
724 printf ("/duplicate/%d", *p++);
734 REGISTER int c, last = -100;
735 REGISTER int in_range = 0;
737 printf ("/charset [%s",
738 (re_opcode_t) *(p - 1) == charset_not ? "^" : "");
740 assert (p + *p < pend);
742 for (c = 0; c < 256; c++)
743 if (((unsigned char) (c / 8) < *p)
744 && (p[1 + (c/8)] & (1 << (c % 8))))
746 /* Are we starting a range? */
747 if (last + 1 == c && ! in_range)
752 /* Have we broken a range? */
753 else if (last + 1 != c && in_range)
776 case charset_mule_not:
780 printf ("/charset_mule [%s",
781 (re_opcode_t) *(p - 1) == charset_mule_not ? "^" : "");
782 nentries = unified_range_table_nentries (p);
783 for (i = 0; i < nentries; i++)
785 EMACS_INT first, last;
786 Lisp_Object dummy_val;
788 unified_range_table_get_range (p, i, &first, &last,
793 printf ("(0x%lx)", (long)first);
800 printf ("(0x%lx)", (long)last);
804 p += unified_range_table_bytes_used (p);
817 case on_failure_jump:
818 extract_number_and_incr (&mcnt, &p);
819 printf ("/on_failure_jump to %ld", (long)(p + mcnt - start));
822 case on_failure_keep_string_jump:
823 extract_number_and_incr (&mcnt, &p);
824 printf ("/on_failure_keep_string_jump to %ld", (long)(p + mcnt - start));
827 case dummy_failure_jump:
828 extract_number_and_incr (&mcnt, &p);
829 printf ("/dummy_failure_jump to %ld", (long)(p + mcnt - start));
832 case push_dummy_failure:
833 printf ("/push_dummy_failure");
837 extract_number_and_incr (&mcnt, &p);
838 printf ("/maybe_pop_jump to %ld", (long)(p + mcnt - start));
841 case pop_failure_jump:
842 extract_number_and_incr (&mcnt, &p);
843 printf ("/pop_failure_jump to %ld", (long)(p + mcnt - start));
847 extract_number_and_incr (&mcnt, &p);
848 printf ("/jump_past_alt to %ld", (long)(p + mcnt - start));
852 extract_number_and_incr (&mcnt, &p);
853 printf ("/jump to %ld", (long)(p + mcnt - start));
857 extract_number_and_incr (&mcnt, &p);
858 extract_number_and_incr (&mcnt2, &p);
859 printf ("/succeed_n to %ld, %d times", (long)(p + mcnt - start), mcnt2);
863 extract_number_and_incr (&mcnt, &p);
864 extract_number_and_incr (&mcnt2, &p);
865 printf ("/jump_n to %ld, %d times", (long)(p + mcnt - start), mcnt2);
869 extract_number_and_incr (&mcnt, &p);
870 extract_number_and_incr (&mcnt2, &p);
871 printf ("/set_number_at location %ld to %d", (long)(p + mcnt - start), mcnt2);
875 printf ("/wordbound");
879 printf ("/notwordbound");
891 printf ("/before_dot");
899 printf ("/after_dot");
903 printf ("/syntaxspec");
905 printf ("/%d", mcnt);
909 printf ("/notsyntaxspec");
911 printf ("/%d", mcnt);
915 /* 97/2/17 jhod Mule category patch */
917 printf ("/categoryspec");
919 printf ("/%d", mcnt);
922 case notcategoryspec:
923 printf ("/notcategoryspec");
925 printf ("/%d", mcnt);
927 /* end of category patch */
932 printf ("/wordchar");
936 printf ("/notwordchar");
948 printf ("?%d", *(p-1));
954 printf ("%ld:\tend of pattern.\n", (long)(p - start));
959 print_compiled_pattern (struct re_pattern_buffer *bufp)
961 re_char *buffer = bufp->buffer;
963 print_partial_compiled_pattern (buffer, buffer + bufp->used);
964 printf ("%ld bytes used/%ld bytes allocated.\n", bufp->used,
967 if (bufp->fastmap_accurate && bufp->fastmap)
969 printf ("fastmap: ");
970 print_fastmap (bufp->fastmap);
973 printf ("re_nsub: %ld\t", (long)bufp->re_nsub);
974 printf ("regs_alloc: %d\t", bufp->regs_allocated);
975 printf ("can_be_null: %d\t", bufp->can_be_null);
976 printf ("newline_anchor: %d\n", bufp->newline_anchor);
977 printf ("no_sub: %d\t", bufp->no_sub);
978 printf ("not_bol: %d\t", bufp->not_bol);
979 printf ("not_eol: %d\t", bufp->not_eol);
980 printf ("syntax: %d\n", bufp->syntax);
981 /* Perhaps we should print the translate table? */
982 /* and maybe the category table? */
987 print_double_string (re_char *where, re_char *string1, int size1,
988 re_char *string2, int size2)
994 unsigned int this_char;
996 if (FIRST_STRING_P (where))
998 for (this_char = where - string1; this_char < size1; this_char++)
999 putchar (string1[this_char]);
1004 for (this_char = where - string2; this_char < size2; this_char++)
1005 putchar (string2[this_char]);
1009 #else /* not DEBUG */
1014 #define DEBUG_STATEMENT(e)
1015 #define DEBUG_PRINT1(x)
1016 #define DEBUG_PRINT2(x1, x2)
1017 #define DEBUG_PRINT3(x1, x2, x3)
1018 #define DEBUG_PRINT4(x1, x2, x3, x4)
1019 #define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)
1020 #define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)
1024 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
1025 also be assigned to arbitrarily: each pattern buffer stores its own
1026 syntax, so it can be changed between regex compilations. */
1027 /* This has no initializer because initialized variables in Emacs
1028 become read-only after dumping. */
1029 reg_syntax_t re_syntax_options;
1032 /* Specify the precise syntax of regexps for compilation. This provides
1033 for compatibility for various utilities which historically have
1034 different, incompatible syntaxes.
1036 The argument SYNTAX is a bit mask comprised of the various bits
1037 defined in regex.h. We return the old syntax. */
1040 re_set_syntax (reg_syntax_t syntax)
1042 reg_syntax_t ret = re_syntax_options;
1044 re_syntax_options = syntax;
1048 /* This table gives an error message for each of the error codes listed
1049 in regex.h. Obviously the order here has to be same as there.
1050 POSIX doesn't require that we do anything for REG_NOERROR,
1051 but why not be nice? */
1053 static const char *re_error_msgid[] =
1055 "Success", /* REG_NOERROR */
1056 "No match", /* REG_NOMATCH */
1057 "Invalid regular expression", /* REG_BADPAT */
1058 "Invalid collation character", /* REG_ECOLLATE */
1059 "Invalid character class name", /* REG_ECTYPE */
1060 "Trailing backslash", /* REG_EESCAPE */
1061 "Invalid back reference", /* REG_ESUBREG */
1062 "Unmatched [ or [^", /* REG_EBRACK */
1063 "Unmatched ( or \\(", /* REG_EPAREN */
1064 "Unmatched \\{", /* REG_EBRACE */
1065 "Invalid content of \\{\\}", /* REG_BADBR */
1066 "Invalid range end", /* REG_ERANGE */
1067 "Memory exhausted", /* REG_ESPACE */
1068 "Invalid preceding regular expression", /* REG_BADRPT */
1069 "Premature end of regular expression", /* REG_EEND */
1070 "Regular expression too big", /* REG_ESIZE */
1071 "Unmatched ) or \\)", /* REG_ERPAREN */
1073 "Invalid syntax designator", /* REG_ESYNTAX */
1076 "Ranges may not span charsets", /* REG_ERANGESPAN */
1077 "Invalid category designator", /* REG_ECATEGORY */
1081 /* Avoiding alloca during matching, to placate r_alloc. */
1083 /* Define MATCH_MAY_ALLOCATE unless we need to make sure that the
1084 searching and matching functions should not call alloca. On some
1085 systems, alloca is implemented in terms of malloc, and if we're
1086 using the relocating allocator routines, then malloc could cause a
1087 relocation, which might (if the strings being searched are in the
1088 ralloc heap) shift the data out from underneath the regexp
1091 Here's another reason to avoid allocation: Emacs
1092 processes input from X in a signal handler; processing X input may
1093 call malloc; if input arrives while a matching routine is calling
1094 malloc, then we're scrod. But Emacs can't just block input while
1095 calling matching routines; then we don't notice interrupts when
1096 they come in. So, Emacs blocks input around all regexp calls
1097 except the matching calls, which it leaves unprotected, in the
1098 faith that they will not malloc. */
1100 /* Normally, this is fine. */
1101 #define MATCH_MAY_ALLOCATE
1103 /* When using GNU C, we are not REALLY using the C alloca, no matter
1104 what config.h may say. So don't take precautions for it. */
1109 /* The match routines may not allocate if (1) they would do it with malloc
1110 and (2) it's not safe for them to use malloc.
1111 Note that if REL_ALLOC is defined, matching would not use malloc for the
1112 failure stack, but we would still use it for the register vectors;
1113 so REL_ALLOC should not affect this. */
1114 #if (defined (C_ALLOCA) || defined (REGEX_MALLOC)) && defined (emacs)
1115 #undef MATCH_MAY_ALLOCATE
1119 /* Failure stack declarations and macros; both re_compile_fastmap and
1120 re_match_2 use a failure stack. These have to be macros because of
1121 REGEX_ALLOCATE_STACK. */
1124 /* Number of failure points for which to initially allocate space
1125 when matching. If this number is exceeded, we allocate more
1126 space, so it is not a hard limit. */
1127 #ifndef INIT_FAILURE_ALLOC
1128 #define INIT_FAILURE_ALLOC 5
1131 /* Roughly the maximum number of failure points on the stack. Would be
1132 exactly that if always used MAX_FAILURE_SPACE each time we failed.
1133 This is a variable only so users of regex can assign to it; we never
1134 change it ourselves. */
1135 #if defined (MATCH_MAY_ALLOCATE)
1136 /* 4400 was enough to cause a crash on Alpha OSF/1,
1137 whose default stack limit is 2mb. */
1138 int re_max_failures = 20000;
1140 int re_max_failures = 2000;
1143 union fail_stack_elt
1149 typedef union fail_stack_elt fail_stack_elt_t;
1153 fail_stack_elt_t *stack;
1155 size_t avail; /* Offset of next open position. */
1158 #define FAIL_STACK_EMPTY() (fail_stack.avail == 0)
1159 #define FAIL_STACK_PTR_EMPTY() (fail_stack_ptr->avail == 0)
1160 #define FAIL_STACK_FULL() (fail_stack.avail == fail_stack.size)
1163 /* Define macros to initialize and free the failure stack.
1164 Do `return -2' if the alloc fails. */
1166 #ifdef MATCH_MAY_ALLOCATE
1167 #define INIT_FAIL_STACK() \
1169 fail_stack.stack = (fail_stack_elt_t *) \
1170 REGEX_ALLOCATE_STACK (INIT_FAILURE_ALLOC * sizeof (fail_stack_elt_t)); \
1172 if (fail_stack.stack == NULL) \
1175 fail_stack.size = INIT_FAILURE_ALLOC; \
1176 fail_stack.avail = 0; \
1179 #define RESET_FAIL_STACK() REGEX_FREE_STACK (fail_stack.stack)
1181 #define INIT_FAIL_STACK() \
1183 fail_stack.avail = 0; \
1186 #define RESET_FAIL_STACK()
1190 /* Double the size of FAIL_STACK, up to approximately `re_max_failures' items.
1192 Return 1 if succeeds, and 0 if either ran out of memory
1193 allocating space for it or it was already too large.
1195 REGEX_REALLOCATE_STACK requires `destination' be declared. */
1197 #define DOUBLE_FAIL_STACK(fail_stack) \
1198 ((fail_stack).size > re_max_failures * MAX_FAILURE_ITEMS \
1200 : ((fail_stack).stack = (fail_stack_elt_t *) \
1201 REGEX_REALLOCATE_STACK ((fail_stack).stack, \
1202 (fail_stack).size * sizeof (fail_stack_elt_t), \
1203 ((fail_stack).size << 1) * sizeof (fail_stack_elt_t)), \
1205 (fail_stack).stack == NULL \
1207 : ((fail_stack).size <<= 1, \
1211 /* Push pointer POINTER on FAIL_STACK.
1212 Return 1 if was able to do so and 0 if ran out of memory allocating
1214 #define PUSH_PATTERN_OP(POINTER, FAIL_STACK) \
1215 ((FAIL_STACK_FULL () \
1216 && !DOUBLE_FAIL_STACK (FAIL_STACK)) \
1218 : ((FAIL_STACK).stack[(FAIL_STACK).avail++].pointer = POINTER, \
1221 /* Push a pointer value onto the failure stack.
1222 Assumes the variable `fail_stack'. Probably should only
1223 be called from within `PUSH_FAILURE_POINT'. */
1224 #define PUSH_FAILURE_POINTER(item) \
1225 fail_stack.stack[fail_stack.avail++].pointer = (unsigned char *) (item)
1227 /* This pushes an integer-valued item onto the failure stack.
1228 Assumes the variable `fail_stack'. Probably should only
1229 be called from within `PUSH_FAILURE_POINT'. */
1230 #define PUSH_FAILURE_INT(item) \
1231 fail_stack.stack[fail_stack.avail++].integer = (item)
1233 /* Push a fail_stack_elt_t value onto the failure stack.
1234 Assumes the variable `fail_stack'. Probably should only
1235 be called from within `PUSH_FAILURE_POINT'. */
1236 #define PUSH_FAILURE_ELT(item) \
1237 fail_stack.stack[fail_stack.avail++] = (item)
1239 /* These three POP... operations complement the three PUSH... operations.
1240 All assume that `fail_stack' is nonempty. */
1241 #define POP_FAILURE_POINTER() fail_stack.stack[--fail_stack.avail].pointer
1242 #define POP_FAILURE_INT() fail_stack.stack[--fail_stack.avail].integer
1243 #define POP_FAILURE_ELT() fail_stack.stack[--fail_stack.avail]
1245 /* Used to omit pushing failure point id's when we're not debugging. */
1247 #define DEBUG_PUSH PUSH_FAILURE_INT
1248 #define DEBUG_POP(item_addr) *(item_addr) = POP_FAILURE_INT ()
1250 #define DEBUG_PUSH(item)
1251 #define DEBUG_POP(item_addr)
1255 /* Push the information about the state we will need
1256 if we ever fail back to it.
1258 Requires variables fail_stack, regstart, regend, reg_info, and
1259 num_regs be declared. DOUBLE_FAIL_STACK requires `destination' be
1262 Does `return FAILURE_CODE' if runs out of memory. */
1264 #if !defined (REGEX_MALLOC) && !defined (REL_ALLOC)
1265 #define DECLARE_DESTINATION char *destination
1267 #define DECLARE_DESTINATION DECLARE_NOTHING
1270 #define PUSH_FAILURE_POINT(pattern_place, string_place, failure_code) \
1272 DECLARE_DESTINATION; \
1273 /* Must be int, so when we don't save any registers, the arithmetic \
1274 of 0 + -1 isn't done as unsigned. */ \
1277 DEBUG_STATEMENT (failure_id++); \
1278 DEBUG_STATEMENT (nfailure_points_pushed++); \
1279 DEBUG_PRINT2 ("\nPUSH_FAILURE_POINT #%u:\n", failure_id); \
1280 DEBUG_PRINT2 (" Before push, next avail: %lu\n", \
1281 (unsigned long) (fail_stack).avail); \
1282 DEBUG_PRINT2 (" size: %lu\n", \
1283 (unsigned long) (fail_stack).size); \
1285 DEBUG_PRINT2 (" slots needed: %d\n", NUM_FAILURE_ITEMS); \
1286 DEBUG_PRINT2 (" available: %ld\n", \
1287 (long) REMAINING_AVAIL_SLOTS); \
1289 /* Ensure we have enough space allocated for what we will push. */ \
1290 while (REMAINING_AVAIL_SLOTS < NUM_FAILURE_ITEMS) \
1292 if (!DOUBLE_FAIL_STACK (fail_stack)) \
1293 return failure_code; \
1295 DEBUG_PRINT2 ("\n Doubled stack; size now: %lu\n", \
1296 (unsigned long) (fail_stack).size); \
1297 DEBUG_PRINT2 (" slots available: %ld\n", \
1298 (long) REMAINING_AVAIL_SLOTS); \
1301 /* Push the info, starting with the registers. */ \
1302 DEBUG_PRINT1 ("\n"); \
1304 for (this_reg = lowest_active_reg; this_reg <= highest_active_reg; \
1307 DEBUG_PRINT2 (" Pushing reg: %d\n", this_reg); \
1308 DEBUG_STATEMENT (num_regs_pushed++); \
1310 DEBUG_PRINT2 (" start: 0x%lx\n", (long) regstart[this_reg]); \
1311 PUSH_FAILURE_POINTER (regstart[this_reg]); \
1313 DEBUG_PRINT2 (" end: 0x%lx\n", (long) regend[this_reg]); \
1314 PUSH_FAILURE_POINTER (regend[this_reg]); \
1316 DEBUG_PRINT2 (" info: 0x%lx\n ", \
1317 * (long *) (®_info[this_reg])); \
1318 DEBUG_PRINT2 (" match_null=%d", \
1319 REG_MATCH_NULL_STRING_P (reg_info[this_reg])); \
1320 DEBUG_PRINT2 (" active=%d", IS_ACTIVE (reg_info[this_reg])); \
1321 DEBUG_PRINT2 (" matched_something=%d", \
1322 MATCHED_SOMETHING (reg_info[this_reg])); \
1323 DEBUG_PRINT2 (" ever_matched_something=%d", \
1324 EVER_MATCHED_SOMETHING (reg_info[this_reg])); \
1325 DEBUG_PRINT1 ("\n"); \
1326 PUSH_FAILURE_ELT (reg_info[this_reg].word); \
1329 DEBUG_PRINT2 (" Pushing low active reg: %d\n", lowest_active_reg); \
1330 PUSH_FAILURE_INT (lowest_active_reg); \
1332 DEBUG_PRINT2 (" Pushing high active reg: %d\n", highest_active_reg); \
1333 PUSH_FAILURE_INT (highest_active_reg); \
1335 DEBUG_PRINT2 (" Pushing pattern 0x%lx: \n", (long) pattern_place); \
1336 DEBUG_PRINT_COMPILED_PATTERN (bufp, pattern_place, pend); \
1337 PUSH_FAILURE_POINTER (pattern_place); \
1339 DEBUG_PRINT2 (" Pushing string 0x%lx: `", (long) string_place); \
1340 DEBUG_PRINT_DOUBLE_STRING (string_place, string1, size1, string2, \
1342 DEBUG_PRINT1 ("'\n"); \
1343 PUSH_FAILURE_POINTER (string_place); \
1345 DEBUG_PRINT2 (" Pushing failure id: %u\n", failure_id); \
1346 DEBUG_PUSH (failure_id); \
1349 /* This is the number of items that are pushed and popped on the stack
1350 for each register. */
1351 #define NUM_REG_ITEMS 3
1353 /* Individual items aside from the registers. */
1355 #define NUM_NONREG_ITEMS 5 /* Includes failure point id. */
1357 #define NUM_NONREG_ITEMS 4
1360 /* We push at most this many items on the stack. */
1361 /* We used to use (num_regs - 1), which is the number of registers
1362 this regexp will save; but that was changed to 5
1363 to avoid stack overflow for a regexp with lots of parens. */
1364 #define MAX_FAILURE_ITEMS (5 * NUM_REG_ITEMS + NUM_NONREG_ITEMS)
1366 /* We actually push this many items. */
1367 #define NUM_FAILURE_ITEMS \
1368 ((highest_active_reg - lowest_active_reg + 1) * NUM_REG_ITEMS \
1371 /* How many items can still be added to the stack without overflowing it. */
1372 #define REMAINING_AVAIL_SLOTS ((fail_stack).size - (fail_stack).avail)
1375 /* Pops what PUSH_FAIL_STACK pushes.
1377 We restore into the parameters, all of which should be lvalues:
1378 STR -- the saved data position.
1379 PAT -- the saved pattern position.
1380 LOW_REG, HIGH_REG -- the highest and lowest active registers.
1381 REGSTART, REGEND -- arrays of string positions.
1382 REG_INFO -- array of information about each subexpression.
1384 Also assumes the variables `fail_stack' and (if debugging), `bufp',
1385 `pend', `string1', `size1', `string2', and `size2'. */
1387 #define POP_FAILURE_POINT(str, pat, low_reg, high_reg, \
1388 regstart, regend, reg_info) \
1390 DEBUG_STATEMENT (fail_stack_elt_t ffailure_id;) \
1392 const unsigned char *string_temp; \
1394 assert (!FAIL_STACK_EMPTY ()); \
1396 /* Remove failure points and point to how many regs pushed. */ \
1397 DEBUG_PRINT1 ("POP_FAILURE_POINT:\n"); \
1398 DEBUG_PRINT2 (" Before pop, next avail: %lu\n", \
1399 (unsigned long) fail_stack.avail); \
1400 DEBUG_PRINT2 (" size: %lu\n", \
1401 (unsigned long) fail_stack.size); \
1403 assert (fail_stack.avail >= NUM_NONREG_ITEMS); \
1405 DEBUG_POP (&ffailure_id.integer); \
1406 DEBUG_PRINT2 (" Popping failure id: %u\n", \
1407 * (unsigned int *) &ffailure_id); \
1409 /* If the saved string location is NULL, it came from an \
1410 on_failure_keep_string_jump opcode, and we want to throw away the \
1411 saved NULL, thus retaining our current position in the string. */ \
1412 string_temp = POP_FAILURE_POINTER (); \
1413 if (string_temp != NULL) \
1414 str = string_temp; \
1416 DEBUG_PRINT2 (" Popping string 0x%lx: `", (long) str); \
1417 DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2); \
1418 DEBUG_PRINT1 ("'\n"); \
1420 pat = (unsigned char *) POP_FAILURE_POINTER (); \
1421 DEBUG_PRINT2 (" Popping pattern 0x%lx: ", (long) pat); \
1422 DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend); \
1424 /* Restore register info. */ \
1425 high_reg = (unsigned) POP_FAILURE_INT (); \
1426 DEBUG_PRINT2 (" Popping high active reg: %d\n", high_reg); \
1428 low_reg = (unsigned) POP_FAILURE_INT (); \
1429 DEBUG_PRINT2 (" Popping low active reg: %d\n", low_reg); \
1431 for (this_reg = high_reg; this_reg >= low_reg; this_reg--) \
1433 DEBUG_PRINT2 (" Popping reg: %d\n", this_reg); \
1435 reg_info[this_reg].word = POP_FAILURE_ELT (); \
1436 DEBUG_PRINT2 (" info: 0x%lx\n", \
1437 * (long *) ®_info[this_reg]); \
1439 regend[this_reg] = POP_FAILURE_POINTER (); \
1440 DEBUG_PRINT2 (" end: 0x%lx\n", (long) regend[this_reg]); \
1442 regstart[this_reg] = POP_FAILURE_POINTER (); \
1443 DEBUG_PRINT2 (" start: 0x%lx\n", (long) regstart[this_reg]); \
1446 set_regs_matched_done = 0; \
1447 DEBUG_STATEMENT (nfailure_points_popped++); \
1448 } while (0) /* POP_FAILURE_POINT */
1452 /* Structure for per-register (a.k.a. per-group) information.
1453 Other register information, such as the
1454 starting and ending positions (which are addresses), and the list of
1455 inner groups (which is a bits list) are maintained in separate
1458 We are making a (strictly speaking) nonportable assumption here: that
1459 the compiler will pack our bit fields into something that fits into
1460 the type of `word', i.e., is something that fits into one item on the
1465 fail_stack_elt_t word;
1468 /* This field is one if this group can match the empty string,
1469 zero if not. If not yet determined, `MATCH_NULL_UNSET_VALUE'. */
1470 #define MATCH_NULL_UNSET_VALUE 3
1471 unsigned match_null_string_p : 2;
1472 unsigned is_active : 1;
1473 unsigned matched_something : 1;
1474 unsigned ever_matched_something : 1;
1476 } register_info_type;
1478 #define REG_MATCH_NULL_STRING_P(R) ((R).bits.match_null_string_p)
1479 #define IS_ACTIVE(R) ((R).bits.is_active)
1480 #define MATCHED_SOMETHING(R) ((R).bits.matched_something)
1481 #define EVER_MATCHED_SOMETHING(R) ((R).bits.ever_matched_something)
1484 /* Call this when have matched a real character; it sets `matched' flags
1485 for the subexpressions which we are currently inside. Also records
1486 that those subexprs have matched. */
1487 #define SET_REGS_MATCHED() \
1490 if (!set_regs_matched_done) \
1493 set_regs_matched_done = 1; \
1494 for (r = lowest_active_reg; r <= highest_active_reg; r++) \
1496 MATCHED_SOMETHING (reg_info[r]) \
1497 = EVER_MATCHED_SOMETHING (reg_info[r]) \
1504 /* Registers are set to a sentinel when they haven't yet matched. */
1505 static unsigned char reg_unset_dummy;
1506 #define REG_UNSET_VALUE (®_unset_dummy)
1507 #define REG_UNSET(e) ((e) == REG_UNSET_VALUE)
1509 /* Subroutine declarations and macros for regex_compile. */
1511 /* Fetch the next character in the uncompiled pattern---translating it
1512 if necessary. Also cast from a signed character in the constant
1513 string passed to us by the user to an unsigned char that we can use
1514 as an array index (in, e.g., `translate'). */
1515 #define PATFETCH(c) \
1518 c = TRANSLATE (c); \
1521 /* Fetch the next character in the uncompiled pattern, with no
1523 #define PATFETCH_RAW(c) \
1524 do {if (p == pend) return REG_EEND; \
1525 assert (p < pend); \
1526 c = charptr_emchar (p); \
1530 /* Go backwards one character in the pattern. */
1531 #define PATUNFETCH DEC_CHARPTR (p)
1535 #define PATFETCH_EXTENDED(emch) \
1536 do {if (p == pend) return REG_EEND; \
1537 assert (p < pend); \
1538 emch = charptr_emchar ((const Bufbyte *) p); \
1540 if (TRANSLATE_P (translate) && emch < 0x80) \
1541 emch = (Emchar) (unsigned char) RE_TRANSLATE (emch); \
1544 #define PATFETCH_RAW_EXTENDED(emch) \
1545 do {if (p == pend) return REG_EEND; \
1546 assert (p < pend); \
1547 emch = charptr_emchar ((const Bufbyte *) p); \
1551 #define PATUNFETCH_EXTENDED DEC_CHARPTR (p)
1553 #define PATFETCH_EITHER(emch) \
1555 if (has_extended_chars) \
1556 PATFETCH_EXTENDED (emch); \
1561 #define PATFETCH_RAW_EITHER(emch) \
1563 if (has_extended_chars) \
1564 PATFETCH_RAW_EXTENDED (emch); \
1566 PATFETCH_RAW (emch); \
1569 #define PATUNFETCH_EITHER \
1571 if (has_extended_chars) \
1572 PATUNFETCH_EXTENDED (emch); \
1574 PATUNFETCH (emch); \
1577 #else /* not MULE */
1579 #define PATFETCH_EITHER(emch) PATFETCH (emch)
1580 #define PATFETCH_RAW_EITHER(emch) PATFETCH_RAW (emch)
1581 #define PATUNFETCH_EITHER PATUNFETCH
1585 /* If `translate' is non-null, return translate[D], else just D. We
1586 cast the subscript to translate because some data is declared as
1587 `char *', to avoid warnings when a string constant is passed. But
1588 when we use a character as a subscript we must make it unsigned. */
1589 #define TRANSLATE(d) (TRANSLATE_P (translate) ? RE_TRANSLATE (d) : (d))
1593 #define TRANSLATE_EXTENDED_UNSAFE(emch) \
1594 (TRANSLATE_P (translate) && emch < 0x80 ? RE_TRANSLATE (emch) : (emch))
1598 /* Macros for outputting the compiled pattern into `buffer'. */
1600 /* If the buffer isn't allocated when it comes in, use this. */
1601 #define INIT_BUF_SIZE 32
1603 /* Make sure we have at least N more bytes of space in buffer. */
1604 #define GET_BUFFER_SPACE(n) \
1605 while (buf_end - bufp->buffer + (n) > bufp->allocated) \
1608 /* Make sure we have one more byte of buffer space and then add C to it. */
1609 #define BUF_PUSH(c) \
1611 GET_BUFFER_SPACE (1); \
1612 *buf_end++ = (unsigned char) (c); \
1616 /* Ensure we have two more bytes of buffer space and then append C1 and C2. */
1617 #define BUF_PUSH_2(c1, c2) \
1619 GET_BUFFER_SPACE (2); \
1620 *buf_end++ = (unsigned char) (c1); \
1621 *buf_end++ = (unsigned char) (c2); \
1625 /* As with BUF_PUSH_2, except for three bytes. */
1626 #define BUF_PUSH_3(c1, c2, c3) \
1628 GET_BUFFER_SPACE (3); \
1629 *buf_end++ = (unsigned char) (c1); \
1630 *buf_end++ = (unsigned char) (c2); \
1631 *buf_end++ = (unsigned char) (c3); \
1635 /* Store a jump with opcode OP at LOC to location TO. We store a
1636 relative address offset by the three bytes the jump itself occupies. */
1637 #define STORE_JUMP(op, loc, to) \
1638 store_op1 (op, loc, (to) - (loc) - 3)
1640 /* Likewise, for a two-argument jump. */
1641 #define STORE_JUMP2(op, loc, to, arg) \
1642 store_op2 (op, loc, (to) - (loc) - 3, arg)
1644 /* Like `STORE_JUMP', but for inserting. Assume `buf_end' is the
1646 #define INSERT_JUMP(op, loc, to) \
1647 insert_op1 (op, loc, (to) - (loc) - 3, buf_end)
1649 /* Like `STORE_JUMP2', but for inserting. Assume `buf_end' is the
1651 #define INSERT_JUMP2(op, loc, to, arg) \
1652 insert_op2 (op, loc, (to) - (loc) - 3, arg, buf_end)
1655 /* This is not an arbitrary limit: the arguments which represent offsets
1656 into the pattern are two bytes long. So if 2^16 bytes turns out to
1657 be too small, many things would have to change. */
1658 #define MAX_BUF_SIZE (1L << 16)
1661 /* Extend the buffer by twice its current size via realloc and
1662 reset the pointers that pointed into the old block to point to the
1663 correct places in the new one. If extending the buffer results in it
1664 being larger than MAX_BUF_SIZE, then flag memory exhausted. */
1665 #define EXTEND_BUFFER() \
1667 re_char *old_buffer = bufp->buffer; \
1668 if (bufp->allocated == MAX_BUF_SIZE) \
1670 bufp->allocated <<= 1; \
1671 if (bufp->allocated > MAX_BUF_SIZE) \
1672 bufp->allocated = MAX_BUF_SIZE; \
1673 bufp->buffer = (unsigned char *) realloc (bufp->buffer, bufp->allocated);\
1674 if (bufp->buffer == NULL) \
1675 return REG_ESPACE; \
1676 /* If the buffer moved, move all the pointers into it. */ \
1677 if (old_buffer != bufp->buffer) \
1679 buf_end = (buf_end - old_buffer) + bufp->buffer; \
1680 begalt = (begalt - old_buffer) + bufp->buffer; \
1681 if (fixup_alt_jump) \
1682 fixup_alt_jump = (fixup_alt_jump - old_buffer) + bufp->buffer;\
1684 laststart = (laststart - old_buffer) + bufp->buffer; \
1685 if (pending_exact) \
1686 pending_exact = (pending_exact - old_buffer) + bufp->buffer; \
1691 /* Since we have one byte reserved for the register number argument to
1692 {start,stop}_memory, the maximum number of groups we can report
1693 things about is what fits in that byte. */
1694 #define MAX_REGNUM 255
1696 /* But patterns can have more than `MAX_REGNUM' registers. We just
1697 ignore the excess. */
1698 typedef unsigned regnum_t;
1701 /* Macros for the compile stack. */
1703 /* Since offsets can go either forwards or backwards, this type needs to
1704 be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1. */
1705 typedef int pattern_offset_t;
1709 pattern_offset_t begalt_offset;
1710 pattern_offset_t fixup_alt_jump;
1711 pattern_offset_t inner_group_offset;
1712 pattern_offset_t laststart_offset;
1714 } compile_stack_elt_t;
1719 compile_stack_elt_t *stack;
1721 unsigned avail; /* Offset of next open position. */
1722 } compile_stack_type;
1725 #define INIT_COMPILE_STACK_SIZE 32
1727 #define COMPILE_STACK_EMPTY (compile_stack.avail == 0)
1728 #define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size)
1730 /* The next available element. */
1731 #define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])
1734 /* Set the bit for character C in a bit vector. */
1735 #define SET_LIST_BIT(c) \
1736 (buf_end[((unsigned char) (c)) / BYTEWIDTH] \
1737 |= 1 << (((unsigned char) c) % BYTEWIDTH))
1741 /* Set the "bit" for character C in a range table. */
1742 #define SET_RANGETAB_BIT(c) put_range_table (rtab, c, c, Qt)
1744 /* Set the "bit" for character c in the appropriate table. */
1745 #define SET_EITHER_BIT(c) \
1747 if (has_extended_chars) \
1748 SET_RANGETAB_BIT (c); \
1753 #else /* not MULE */
1755 #define SET_EITHER_BIT(c) SET_LIST_BIT (c)
1760 /* Get the next unsigned number in the uncompiled pattern. */
1761 #define GET_UNSIGNED_NUMBER(num) \
1765 while (ISDIGIT (c)) \
1769 num = num * 10 + c - '0'; \
1777 #define CHAR_CLASS_MAX_LENGTH 6 /* Namely, `xdigit'. */
1779 #define IS_CHAR_CLASS(string) \
1780 (STREQ (string, "alpha") || STREQ (string, "upper") \
1781 || STREQ (string, "lower") || STREQ (string, "digit") \
1782 || STREQ (string, "alnum") || STREQ (string, "xdigit") \
1783 || STREQ (string, "space") || STREQ (string, "print") \
1784 || STREQ (string, "punct") || STREQ (string, "graph") \
1785 || STREQ (string, "cntrl") || STREQ (string, "blank"))
1787 static void store_op1 (re_opcode_t op, unsigned char *loc, int arg);
1788 static void store_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2);
1789 static void insert_op1 (re_opcode_t op, unsigned char *loc, int arg,
1790 unsigned char *end);
1791 static void insert_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2,
1792 unsigned char *end);
1793 static re_bool at_begline_loc_p (re_char *pattern, re_char *p,
1794 reg_syntax_t syntax);
1795 static re_bool at_endline_loc_p (re_char *p, re_char *pend, int syntax);
1796 static re_bool group_in_compile_stack (compile_stack_type compile_stack,
1798 static reg_errcode_t compile_range (re_char **p_ptr, re_char *pend,
1799 RE_TRANSLATE_TYPE translate,
1800 reg_syntax_t syntax,
1803 static reg_errcode_t compile_extended_range (re_char **p_ptr,
1805 RE_TRANSLATE_TYPE translate,
1806 reg_syntax_t syntax,
1809 static re_bool group_match_null_string_p (unsigned char **p,
1811 register_info_type *reg_info);
1812 static re_bool alt_match_null_string_p (unsigned char *p, unsigned char *end,
1813 register_info_type *reg_info);
1814 static re_bool common_op_match_null_string_p (unsigned char **p,
1816 register_info_type *reg_info);
1817 static int bcmp_translate (const unsigned char *s1, const unsigned char *s2,
1818 REGISTER int len, RE_TRANSLATE_TYPE translate);
1819 static int re_match_2_internal (struct re_pattern_buffer *bufp,
1820 re_char *string1, int size1,
1821 re_char *string2, int size2, int pos,
1822 struct re_registers *regs, int stop);
1824 #ifndef MATCH_MAY_ALLOCATE
1826 /* If we cannot allocate large objects within re_match_2_internal,
1827 we make the fail stack and register vectors global.
1828 The fail stack, we grow to the maximum size when a regexp
1830 The register vectors, we adjust in size each time we
1831 compile a regexp, according to the number of registers it needs. */
1833 static fail_stack_type fail_stack;
1835 /* Size with which the following vectors are currently allocated.
1836 That is so we can make them bigger as needed,
1837 but never make them smaller. */
1838 static int regs_allocated_size;
1840 static re_char ** regstart, ** regend;
1841 static re_char ** old_regstart, ** old_regend;
1842 static re_char **best_regstart, **best_regend;
1843 static register_info_type *reg_info;
1844 static re_char **reg_dummy;
1845 static register_info_type *reg_info_dummy;
1847 /* Make the register vectors big enough for NUM_REGS registers,
1848 but don't make them smaller. */
1851 regex_grow_registers (int num_regs)
1853 if (num_regs > regs_allocated_size)
1855 RETALLOC_IF (regstart, num_regs, re_char *);
1856 RETALLOC_IF (regend, num_regs, re_char *);
1857 RETALLOC_IF (old_regstart, num_regs, re_char *);
1858 RETALLOC_IF (old_regend, num_regs, re_char *);
1859 RETALLOC_IF (best_regstart, num_regs, re_char *);
1860 RETALLOC_IF (best_regend, num_regs, re_char *);
1861 RETALLOC_IF (reg_info, num_regs, register_info_type);
1862 RETALLOC_IF (reg_dummy, num_regs, re_char *);
1863 RETALLOC_IF (reg_info_dummy, num_regs, register_info_type);
1865 regs_allocated_size = num_regs;
1869 #endif /* not MATCH_MAY_ALLOCATE */
1871 /* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX.
1872 Returns one of error codes defined in `regex.h', or zero for success.
1874 Assumes the `allocated' (and perhaps `buffer') and `translate'
1875 fields are set in BUFP on entry.
1877 If it succeeds, results are put in BUFP (if it returns an error, the
1878 contents of BUFP are undefined):
1879 `buffer' is the compiled pattern;
1880 `syntax' is set to SYNTAX;
1881 `used' is set to the length of the compiled pattern;
1882 `fastmap_accurate' is zero;
1883 `re_nsub' is the number of subexpressions in PATTERN;
1884 `not_bol' and `not_eol' are zero;
1886 The `fastmap' and `newline_anchor' fields are neither
1887 examined nor set. */
1889 /* Return, freeing storage we allocated. */
1890 #define FREE_STACK_RETURN(value) \
1891 return (free (compile_stack.stack), value)
1893 static reg_errcode_t
1894 regex_compile (re_char *pattern, int size, reg_syntax_t syntax,
1895 struct re_pattern_buffer *bufp)
1897 /* We fetch characters from PATTERN here. We declare these as int
1898 (or possibly long) so that chars above 127 can be used as
1899 array indices. The macros that fetch a character from the pattern
1900 make sure to coerce to unsigned char before assigning, so we won't
1901 get bitten by negative numbers here. */
1902 /* XEmacs change: used to be unsigned char. */
1903 REGISTER EMACS_INT c, c1;
1905 /* A random temporary spot in PATTERN. */
1908 /* Points to the end of the buffer, where we should append. */
1909 REGISTER unsigned char *buf_end;
1911 /* Keeps track of unclosed groups. */
1912 compile_stack_type compile_stack;
1914 /* Points to the current (ending) position in the pattern. */
1915 re_char *p = pattern;
1916 re_char *pend = pattern + size;
1918 /* How to translate the characters in the pattern. */
1919 RE_TRANSLATE_TYPE translate = bufp->translate;
1921 /* Address of the count-byte of the most recently inserted `exactn'
1922 command. This makes it possible to tell if a new exact-match
1923 character can be added to that command or if the character requires
1924 a new `exactn' command. */
1925 unsigned char *pending_exact = 0;
1927 /* Address of start of the most recently finished expression.
1928 This tells, e.g., postfix * where to find the start of its
1929 operand. Reset at the beginning of groups and alternatives. */
1930 unsigned char *laststart = 0;
1932 /* Address of beginning of regexp, or inside of last group. */
1933 unsigned char *begalt;
1935 /* Place in the uncompiled pattern (i.e., the {) to
1936 which to go back if the interval is invalid. */
1937 re_char *beg_interval;
1939 /* Address of the place where a forward jump should go to the end of
1940 the containing expression. Each alternative of an `or' -- except the
1941 last -- ends with a forward jump of this sort. */
1942 unsigned char *fixup_alt_jump = 0;
1944 /* Counts open-groups as they are encountered. Remembered for the
1945 matching close-group on the compile stack, so the same register
1946 number is put in the stop_memory as the start_memory. */
1947 regnum_t regnum = 0;
1950 DEBUG_PRINT1 ("\nCompiling pattern: ");
1953 unsigned debug_count;
1955 for (debug_count = 0; debug_count < size; debug_count++)
1956 putchar (pattern[debug_count]);
1961 /* Initialize the compile stack. */
1962 compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t);
1963 if (compile_stack.stack == NULL)
1966 compile_stack.size = INIT_COMPILE_STACK_SIZE;
1967 compile_stack.avail = 0;
1969 /* Initialize the pattern buffer. */
1970 bufp->syntax = syntax;
1971 bufp->fastmap_accurate = 0;
1972 bufp->not_bol = bufp->not_eol = 0;
1974 /* Set `used' to zero, so that if we return an error, the pattern
1975 printer (for debugging) will think there's no pattern. We reset it
1979 /* Always count groups, whether or not bufp->no_sub is set. */
1982 #if !defined (emacs) && !defined (SYNTAX_TABLE)
1983 /* Initialize the syntax table. */
1984 init_syntax_once ();
1987 if (bufp->allocated == 0)
1990 { /* If zero allocated, but buffer is non-null, try to realloc
1991 enough space. This loses if buffer's address is bogus, but
1992 that is the user's responsibility. */
1993 RETALLOC (bufp->buffer, INIT_BUF_SIZE, unsigned char);
1996 { /* Caller did not allocate a buffer. Do it for them. */
1997 bufp->buffer = TALLOC (INIT_BUF_SIZE, unsigned char);
1999 if (!bufp->buffer) FREE_STACK_RETURN (REG_ESPACE);
2001 bufp->allocated = INIT_BUF_SIZE;
2004 begalt = buf_end = bufp->buffer;
2006 /* Loop through the uncompiled pattern until we're at the end. */
2015 if ( /* If at start of pattern, it's an operator. */
2017 /* If context independent, it's an operator. */
2018 || syntax & RE_CONTEXT_INDEP_ANCHORS
2019 /* Otherwise, depends on what's come before. */
2020 || at_begline_loc_p (pattern, p, syntax))
2030 if ( /* If at end of pattern, it's an operator. */
2032 /* If context independent, it's an operator. */
2033 || syntax & RE_CONTEXT_INDEP_ANCHORS
2034 /* Otherwise, depends on what's next. */
2035 || at_endline_loc_p (p, pend, syntax))
2045 if ((syntax & RE_BK_PLUS_QM)
2046 || (syntax & RE_LIMITED_OPS))
2050 /* If there is no previous pattern... */
2053 if (syntax & RE_CONTEXT_INVALID_OPS)
2054 FREE_STACK_RETURN (REG_BADRPT);
2055 else if (!(syntax & RE_CONTEXT_INDEP_OPS))
2060 /* true means zero/many matches are allowed. */
2061 re_bool zero_times_ok = c != '+';
2062 re_bool many_times_ok = c != '?';
2064 /* true means match shortest string possible. */
2065 re_bool minimal = false;
2067 /* If there is a sequence of repetition chars, collapse it
2068 down to just one (the right one). We can't combine
2069 interval operators with these because of, e.g., `a{2}*',
2070 which should only match an even number of `a's. */
2075 if (c == '*' || (!(syntax & RE_BK_PLUS_QM)
2076 && (c == '+' || c == '?')))
2079 else if (syntax & RE_BK_PLUS_QM && c == '\\')
2081 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
2084 if (!(c1 == '+' || c1 == '?'))
2099 /* If we get here, we found another repeat character. */
2100 if (!(syntax & RE_NO_MINIMAL_MATCHING))
2102 /* "*?" and "+?" and "??" are okay (and mean match
2103 minimally), but other sequences (such as "*??" and
2104 "+++") are rejected (reserved for future use). */
2105 if (minimal || c != '?')
2106 FREE_STACK_RETURN (REG_BADRPT);
2111 zero_times_ok |= c != '+';
2112 many_times_ok |= c != '?';
2116 /* Star, etc. applied to an empty pattern is equivalent
2117 to an empty pattern. */
2121 /* Now we know whether zero matches is allowed
2122 and whether two or more matches is allowed
2123 and whether we want minimal or maximal matching. */
2129 0: /on_failure_jump to 6
2134 GET_BUFFER_SPACE (6);
2135 INSERT_JUMP (jump, laststart, buf_end + 3);
2137 INSERT_JUMP (on_failure_jump, laststart, laststart + 6);
2140 else if (zero_times_ok)
2145 6: /on_failure_jump to 3
2148 GET_BUFFER_SPACE (6);
2149 INSERT_JUMP (jump, laststart, buf_end + 3);
2151 STORE_JUMP (on_failure_jump, buf_end, laststart + 3);
2158 3: /on_failure_jump to 0
2161 GET_BUFFER_SPACE (3);
2162 STORE_JUMP (on_failure_jump, buf_end, laststart);
2168 /* Are we optimizing this jump? */
2169 re_bool keep_string_p = false;
2172 { /* More than one repetition is allowed, so put in
2173 at the end a backward relative jump from
2174 `buf_end' to before the next jump we're going
2175 to put in below (which jumps from laststart to
2178 But if we are at the `*' in the exact sequence `.*\n',
2179 insert an unconditional jump backwards to the .,
2180 instead of the beginning of the loop. This way we only
2181 push a failure point once, instead of every time
2182 through the loop. */
2183 assert (p - 1 > pattern);
2185 /* Allocate the space for the jump. */
2186 GET_BUFFER_SPACE (3);
2188 /* We know we are not at the first character of the
2189 pattern, because laststart was nonzero. And we've
2190 already incremented `p', by the way, to be the
2191 character after the `*'. Do we have to do something
2192 analogous here for null bytes, because of
2196 && p < pend && *p == '\n'
2197 && !(syntax & RE_DOT_NEWLINE))
2198 { /* We have .*\n. */
2199 STORE_JUMP (jump, buf_end, laststart);
2200 keep_string_p = true;
2203 /* Anything else. */
2204 STORE_JUMP (maybe_pop_jump, buf_end, laststart - 3);
2206 /* We've added more stuff to the buffer. */
2210 /* On failure, jump from laststart to buf_end + 3,
2211 which will be the end of the buffer after this jump
2213 GET_BUFFER_SPACE (3);
2214 INSERT_JUMP (keep_string_p ? on_failure_keep_string_jump
2216 laststart, buf_end + 3);
2221 /* At least one repetition is required, so insert a
2222 `dummy_failure_jump' before the initial
2223 `on_failure_jump' instruction of the loop. This
2224 effects a skip over that instruction the first time
2225 we hit that loop. */
2226 GET_BUFFER_SPACE (3);
2227 INSERT_JUMP (dummy_failure_jump, laststart, laststart + 6);
2237 laststart = buf_end;
2244 /* XEmacs change: this whole section */
2245 re_bool had_char_class = false;
2247 re_bool has_extended_chars = false;
2248 REGISTER Lisp_Object rtab = Qnil;
2251 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2253 /* Ensure that we have enough space to push a charset: the
2254 opcode, the length count, and the bitset; 34 bytes in all. */
2255 GET_BUFFER_SPACE (34);
2257 laststart = buf_end;
2259 /* We test `*p == '^' twice, instead of using an if
2260 statement, so we only need one BUF_PUSH. */
2261 BUF_PUSH (*p == '^' ? charset_not : charset);
2265 /* Remember the first position in the bracket expression. */
2268 /* Push the number of bytes in the bitmap. */
2269 BUF_PUSH ((1 << BYTEWIDTH) / BYTEWIDTH);
2271 /* Clear the whole map. */
2272 memset (buf_end, 0, (1 << BYTEWIDTH) / BYTEWIDTH);
2274 /* charset_not matches newline according to a syntax bit. */
2275 if ((re_opcode_t) buf_end[-2] == charset_not
2276 && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
2277 SET_LIST_BIT ('\n');
2280 start_over_with_extended:
2281 if (has_extended_chars)
2283 /* There are extended chars here, which means we need to start
2284 over and shift to unified range-table format. */
2285 if (buf_end[-2] == charset)
2286 buf_end[-2] = charset_mule;
2288 buf_end[-2] = charset_mule_not;
2290 p = p1; /* go back to the beginning of the charset, after
2292 rtab = Vthe_lisp_rangetab;
2293 Fclear_range_table (rtab);
2295 /* charset_not matches newline according to a syntax bit. */
2296 if ((re_opcode_t) buf_end[-1] == charset_mule_not
2297 && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
2298 SET_EITHER_BIT ('\n');
2302 /* Read in characters and ranges, setting map bits. */
2305 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2310 if (c >= 0x80 && !has_extended_chars)
2312 has_extended_chars = 1;
2313 /* Frumble-bumble, we've found some extended chars.
2314 Need to start over, process everything using
2315 the general extended-char mechanism, and need
2316 to use charset_mule and charset_mule_not instead
2317 of charset and charset_not. */
2318 goto start_over_with_extended;
2321 /* \ might escape characters inside [...] and [^...]. */
2322 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\')
2324 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
2328 if (c1 >= 0x80 && !has_extended_chars)
2330 has_extended_chars = 1;
2331 goto start_over_with_extended;
2334 SET_EITHER_BIT (c1);
2338 /* Could be the end of the bracket expression. If it's
2339 not (i.e., when the bracket expression is `[]' so
2340 far), the ']' character bit gets set way below. */
2341 if (c == ']' && p != p1 + 1)
2344 /* Look ahead to see if it's a range when the last thing
2345 was a character class. */
2346 if (had_char_class && c == '-' && *p != ']')
2347 FREE_STACK_RETURN (REG_ERANGE);
2349 /* Look ahead to see if it's a range when the last thing
2350 was a character: if this is a hyphen not at the
2351 beginning or the end of a list, then it's the range
2354 && !(p - 2 >= pattern && p[-2] == '[')
2355 && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^')
2361 if (* (unsigned char *) p >= 0x80 && !has_extended_chars)
2363 has_extended_chars = 1;
2364 goto start_over_with_extended;
2366 if (has_extended_chars)
2367 ret = compile_extended_range (&p, pend, translate,
2371 ret = compile_range (&p, pend, translate, syntax, buf_end);
2372 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret);
2375 else if (p[0] == '-' && p[1] != ']')
2376 { /* This handles ranges made up of characters only. */
2379 /* Move past the `-'. */
2383 if (* (unsigned char *) p >= 0x80 && !has_extended_chars)
2385 has_extended_chars = 1;
2386 goto start_over_with_extended;
2388 if (has_extended_chars)
2389 ret = compile_extended_range (&p, pend, translate,
2393 ret = compile_range (&p, pend, translate, syntax, buf_end);
2394 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret);
2397 /* See if we're at the beginning of a possible character
2400 else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == ':')
2401 { /* Leave room for the null. */
2402 char str[CHAR_CLASS_MAX_LENGTH + 1];
2407 /* If pattern is `[[:'. */
2408 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2412 /* #### This code is unused.
2413 Correctness is not checked after TRT
2416 if (c == ':' || c == ']' || p == pend
2417 || c1 == CHAR_CLASS_MAX_LENGTH)
2419 str[c1++] = (char) c;
2423 /* If isn't a word bracketed by `[:' and `:]':
2424 undo the ending character, the letters, and leave
2425 the leading `:' and `[' (but set bits for them). */
2426 if (c == ':' && *p == ']')
2429 re_bool is_alnum = STREQ (str, "alnum");
2430 re_bool is_alpha = STREQ (str, "alpha");
2431 re_bool is_blank = STREQ (str, "blank");
2432 re_bool is_cntrl = STREQ (str, "cntrl");
2433 re_bool is_digit = STREQ (str, "digit");
2434 re_bool is_graph = STREQ (str, "graph");
2435 re_bool is_lower = STREQ (str, "lower");
2436 re_bool is_print = STREQ (str, "print");
2437 re_bool is_punct = STREQ (str, "punct");
2438 re_bool is_space = STREQ (str, "space");
2439 re_bool is_upper = STREQ (str, "upper");
2440 re_bool is_xdigit = STREQ (str, "xdigit");
2442 if (!IS_CHAR_CLASS (str))
2443 FREE_STACK_RETURN (REG_ECTYPE);
2445 /* Throw away the ] at the end of the character
2449 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2451 for (ch = 0; ch < 1 << BYTEWIDTH; ch++)
2453 /* This was split into 3 if's to
2454 avoid an arbitrary limit in some compiler. */
2455 if ( (is_alnum && ISALNUM (ch))
2456 || (is_alpha && ISALPHA (ch))
2457 || (is_blank && ISBLANK (ch))
2458 || (is_cntrl && ISCNTRL (ch)))
2459 SET_EITHER_BIT (ch);
2460 if ( (is_digit && ISDIGIT (ch))
2461 || (is_graph && ISGRAPH (ch))
2462 || (is_lower && ISLOWER (ch))
2463 || (is_print && ISPRINT (ch)))
2464 SET_EITHER_BIT (ch);
2465 if ( (is_punct && ISPUNCT (ch))
2466 || (is_space && ISSPACE (ch))
2467 || (is_upper && ISUPPER (ch))
2468 || (is_xdigit && ISXDIGIT (ch)))
2469 SET_EITHER_BIT (ch);
2471 had_char_class = true;
2478 SET_EITHER_BIT ('[');
2479 SET_EITHER_BIT (':');
2480 had_char_class = false;
2485 had_char_class = false;
2491 if (has_extended_chars)
2493 /* We have a range table, not a bit vector. */
2495 unified_range_table_bytes_needed (rtab);
2496 GET_BUFFER_SPACE (bytes_needed);
2497 unified_range_table_copy_data (rtab, buf_end);
2498 buf_end += unified_range_table_bytes_used (buf_end);
2502 /* Discard any (non)matching list bytes that are all 0 at the
2503 end of the map. Decrease the map-length byte too. */
2504 while ((int) buf_end[-1] > 0 && buf_end[buf_end[-1] - 1] == 0)
2506 buf_end += buf_end[-1];
2512 if (syntax & RE_NO_BK_PARENS)
2519 if (syntax & RE_NO_BK_PARENS)
2526 if (syntax & RE_NEWLINE_ALT)
2533 if (syntax & RE_NO_BK_VBAR)
2540 if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES)
2541 goto handle_interval;
2547 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
2549 /* Do not translate the character after the \, so that we can
2550 distinguish, e.g., \B from \b, even if we normally would
2551 translate, e.g., B to b. */
2557 if (syntax & RE_NO_BK_PARENS)
2558 goto normal_backslash;
2564 if (!(syntax & RE_NO_SHY_GROUPS)
2572 case ':': /* shy groups */
2576 /* All others are reserved for future constructs. */
2578 FREE_STACK_RETURN (REG_BADPAT);
2587 if (COMPILE_STACK_FULL)
2589 RETALLOC (compile_stack.stack, compile_stack.size << 1,
2590 compile_stack_elt_t);
2591 if (compile_stack.stack == NULL) return REG_ESPACE;
2593 compile_stack.size <<= 1;
2596 /* These are the values to restore when we hit end of this
2597 group. They are all relative offsets, so that if the
2598 whole pattern moves because of realloc, they will still
2600 COMPILE_STACK_TOP.begalt_offset = begalt - bufp->buffer;
2601 COMPILE_STACK_TOP.fixup_alt_jump
2602 = fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0;
2603 COMPILE_STACK_TOP.laststart_offset = buf_end - bufp->buffer;
2604 COMPILE_STACK_TOP.regnum = r;
2606 /* We will eventually replace the 0 with the number of
2607 groups inner to this one. But do not push a
2608 start_memory for groups beyond the last one we can
2609 represent in the compiled pattern. */
2610 if (r <= MAX_REGNUM)
2612 COMPILE_STACK_TOP.inner_group_offset
2613 = buf_end - bufp->buffer + 2;
2614 BUF_PUSH_3 (start_memory, r, 0);
2617 compile_stack.avail++;
2622 /* If we've reached MAX_REGNUM groups, then this open
2623 won't actually generate any code, so we'll have to
2624 clear pending_exact explicitly. */
2631 if (syntax & RE_NO_BK_PARENS) goto normal_backslash;
2633 if (COMPILE_STACK_EMPTY) {
2634 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
2635 goto normal_backslash;
2637 FREE_STACK_RETURN (REG_ERPAREN);
2642 { /* Push a dummy failure point at the end of the
2643 alternative for a possible future
2644 `pop_failure_jump' to pop. See comments at
2645 `push_dummy_failure' in `re_match_2'. */
2646 BUF_PUSH (push_dummy_failure);
2648 /* We allocated space for this jump when we assigned
2649 to `fixup_alt_jump', in the `handle_alt' case below. */
2650 STORE_JUMP (jump_past_alt, fixup_alt_jump, buf_end - 1);
2653 /* See similar code for backslashed left paren above. */
2654 if (COMPILE_STACK_EMPTY) {
2655 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
2658 FREE_STACK_RETURN (REG_ERPAREN);
2661 /* Since we just checked for an empty stack above, this
2662 ``can't happen''. */
2663 assert (compile_stack.avail != 0);
2665 /* We don't just want to restore into `regnum', because
2666 later groups should continue to be numbered higher,
2667 as in `(ab)c(de)' -- the second group is #2. */
2668 regnum_t this_group_regnum;
2670 compile_stack.avail--;
2671 begalt = bufp->buffer + COMPILE_STACK_TOP.begalt_offset;
2673 = COMPILE_STACK_TOP.fixup_alt_jump
2674 ? bufp->buffer + COMPILE_STACK_TOP.fixup_alt_jump - 1
2676 laststart = bufp->buffer + COMPILE_STACK_TOP.laststart_offset;
2677 this_group_regnum = COMPILE_STACK_TOP.regnum;
2678 /* If we've reached MAX_REGNUM groups, then this open
2679 won't actually generate any code, so we'll have to
2680 clear pending_exact explicitly. */
2683 /* We're at the end of the group, so now we know how many
2684 groups were inside this one. */
2685 if (this_group_regnum <= MAX_REGNUM)
2687 unsigned char *inner_group_loc
2688 = bufp->buffer + COMPILE_STACK_TOP.inner_group_offset;
2690 *inner_group_loc = regnum - this_group_regnum;
2691 BUF_PUSH_3 (stop_memory, this_group_regnum,
2692 regnum - this_group_regnum);
2698 case '|': /* `\|'. */
2699 if (syntax & RE_LIMITED_OPS || syntax & RE_NO_BK_VBAR)
2700 goto normal_backslash;
2702 if (syntax & RE_LIMITED_OPS)
2705 /* Insert before the previous alternative a jump which
2706 jumps to this alternative if the former fails. */
2707 GET_BUFFER_SPACE (3);
2708 INSERT_JUMP (on_failure_jump, begalt, buf_end + 6);
2712 /* The alternative before this one has a jump after it
2713 which gets executed if it gets matched. Adjust that
2714 jump so it will jump to this alternative's analogous
2715 jump (put in below, which in turn will jump to the next
2716 (if any) alternative's such jump, etc.). The last such
2717 jump jumps to the correct final destination. A picture:
2723 If we are at `b', then fixup_alt_jump right now points to a
2724 three-byte space after `a'. We'll put in the jump, set
2725 fixup_alt_jump to right after `b', and leave behind three
2726 bytes which we'll fill in when we get to after `c'. */
2729 STORE_JUMP (jump_past_alt, fixup_alt_jump, buf_end);
2731 /* Mark and leave space for a jump after this alternative,
2732 to be filled in later either by next alternative or
2733 when know we're at the end of a series of alternatives. */
2734 fixup_alt_jump = buf_end;
2735 GET_BUFFER_SPACE (3);
2744 /* If \{ is a literal. */
2745 if (!(syntax & RE_INTERVALS)
2746 /* If we're at `\{' and it's not the open-interval
2748 || ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
2749 || (p - 2 == pattern && p == pend))
2750 goto normal_backslash;
2754 /* If got here, then the syntax allows intervals. */
2756 /* At least (most) this many matches must be made. */
2757 int lower_bound = -1, upper_bound = -1;
2759 beg_interval = p - 1;
2763 if (syntax & RE_NO_BK_BRACES)
2764 goto unfetch_interval;
2766 FREE_STACK_RETURN (REG_EBRACE);
2769 GET_UNSIGNED_NUMBER (lower_bound);
2773 GET_UNSIGNED_NUMBER (upper_bound);
2774 if (upper_bound < 0) upper_bound = RE_DUP_MAX;
2777 /* Interval such as `{1}' => match exactly once. */
2778 upper_bound = lower_bound;
2780 if (lower_bound < 0 || upper_bound > RE_DUP_MAX
2781 || lower_bound > upper_bound)
2783 if (syntax & RE_NO_BK_BRACES)
2784 goto unfetch_interval;
2786 FREE_STACK_RETURN (REG_BADBR);
2789 if (!(syntax & RE_NO_BK_BRACES))
2791 if (c != '\\') FREE_STACK_RETURN (REG_EBRACE);
2798 if (syntax & RE_NO_BK_BRACES)
2799 goto unfetch_interval;
2801 FREE_STACK_RETURN (REG_BADBR);
2804 /* We just parsed a valid interval. */
2806 /* If it's invalid to have no preceding re. */
2809 if (syntax & RE_CONTEXT_INVALID_OPS)
2810 FREE_STACK_RETURN (REG_BADRPT);
2811 else if (syntax & RE_CONTEXT_INDEP_OPS)
2812 laststart = buf_end;
2814 goto unfetch_interval;
2817 /* If the upper bound is zero, don't want to succeed at
2818 all; jump from `laststart' to `b + 3', which will be
2819 the end of the buffer after we insert the jump. */
2820 if (upper_bound == 0)
2822 GET_BUFFER_SPACE (3);
2823 INSERT_JUMP (jump, laststart, buf_end + 3);
2827 /* Otherwise, we have a nontrivial interval. When
2828 we're all done, the pattern will look like:
2829 set_number_at <jump count> <upper bound>
2830 set_number_at <succeed_n count> <lower bound>
2831 succeed_n <after jump addr> <succeed_n count>
2833 jump_n <succeed_n addr> <jump count>
2834 (The upper bound and `jump_n' are omitted if
2835 `upper_bound' is 1, though.) */
2837 { /* If the upper bound is > 1, we need to insert
2838 more at the end of the loop. */
2839 unsigned nbytes = 10 + (upper_bound > 1) * 10;
2841 GET_BUFFER_SPACE (nbytes);
2843 /* Initialize lower bound of the `succeed_n', even
2844 though it will be set during matching by its
2845 attendant `set_number_at' (inserted next),
2846 because `re_compile_fastmap' needs to know.
2847 Jump to the `jump_n' we might insert below. */
2848 INSERT_JUMP2 (succeed_n, laststart,
2849 buf_end + 5 + (upper_bound > 1) * 5,
2853 /* Code to initialize the lower bound. Insert
2854 before the `succeed_n'. The `5' is the last two
2855 bytes of this `set_number_at', plus 3 bytes of
2856 the following `succeed_n'. */
2857 insert_op2 (set_number_at, laststart, 5, lower_bound, buf_end);
2860 if (upper_bound > 1)
2861 { /* More than one repetition is allowed, so
2862 append a backward jump to the `succeed_n'
2863 that starts this interval.
2865 When we've reached this during matching,
2866 we'll have matched the interval once, so
2867 jump back only `upper_bound - 1' times. */
2868 STORE_JUMP2 (jump_n, buf_end, laststart + 5,
2872 /* The location we want to set is the second
2873 parameter of the `jump_n'; that is `b-2' as
2874 an absolute address. `laststart' will be
2875 the `set_number_at' we're about to insert;
2876 `laststart+3' the number to set, the source
2877 for the relative address. But we are
2878 inserting into the middle of the pattern --
2879 so everything is getting moved up by 5.
2880 Conclusion: (b - 2) - (laststart + 3) + 5,
2881 i.e., b - laststart.
2883 We insert this at the beginning of the loop
2884 so that if we fail during matching, we'll
2885 reinitialize the bounds. */
2886 insert_op2 (set_number_at, laststart,
2887 buf_end - laststart,
2888 upper_bound - 1, buf_end);
2893 beg_interval = NULL;
2898 /* If an invalid interval, match the characters as literals. */
2899 assert (beg_interval);
2901 beg_interval = NULL;
2903 /* normal_char and normal_backslash need `c'. */
2906 if (!(syntax & RE_NO_BK_BRACES))
2908 if (p > pattern && p[-1] == '\\')
2909 goto normal_backslash;
2914 /* There is no way to specify the before_dot and after_dot
2915 operators. rms says this is ok. --karl */
2921 laststart = buf_end;
2923 /* XEmacs addition */
2924 if (c >= 0x80 || syntax_spec_code[c] == 0377)
2925 FREE_STACK_RETURN (REG_ESYNTAX);
2926 BUF_PUSH_2 (syntaxspec, syntax_spec_code[c]);
2930 laststart = buf_end;
2932 /* XEmacs addition */
2933 if (c >= 0x80 || syntax_spec_code[c] == 0377)
2934 FREE_STACK_RETURN (REG_ESYNTAX);
2935 BUF_PUSH_2 (notsyntaxspec, syntax_spec_code[c]);
2939 /* 97.2.17 jhod merged in to XEmacs from mule-2.3 */
2941 laststart = buf_end;
2943 if (c < 32 || c > 127)
2944 FREE_STACK_RETURN (REG_ECATEGORY);
2945 BUF_PUSH_2 (categoryspec, c);
2949 laststart = buf_end;
2951 if (c < 32 || c > 127)
2952 FREE_STACK_RETURN (REG_ECATEGORY);
2953 BUF_PUSH_2 (notcategoryspec, c);
2955 /* end of category patch */
2961 laststart = buf_end;
2962 BUF_PUSH (wordchar);
2967 laststart = buf_end;
2968 BUF_PUSH (notwordchar);
2981 BUF_PUSH (wordbound);
2985 BUF_PUSH (notwordbound);
2996 case '1': case '2': case '3': case '4': case '5':
2997 case '6': case '7': case '8': case '9':
3000 if (syntax & RE_NO_BK_REFS)
3006 FREE_STACK_RETURN (REG_ESUBREG);
3008 /* Can't back reference to a subexpression if inside of it. */
3009 if (group_in_compile_stack (compile_stack, reg))
3012 laststart = buf_end;
3013 BUF_PUSH_2 (duplicate, reg);
3020 if (syntax & RE_BK_PLUS_QM)
3023 goto normal_backslash;
3027 /* You might think it would be useful for \ to mean
3028 not to translate; but if we don't translate it,
3029 it will never match anything. */
3037 /* Expects the character in `c'. */
3038 /* `p' points to the location after where `c' came from. */
3041 /* XEmacs: modifications here for Mule. */
3042 /* `q' points to the beginning of the next char. */
3045 /* If no exactn currently being built. */
3048 /* If last exactn not at current position. */
3049 || pending_exact + *pending_exact + 1 != buf_end
3051 /* We have only one byte following the exactn for the count. */
3052 || ((unsigned int) (*pending_exact + (q - p)) >=
3053 ((unsigned int) (1 << BYTEWIDTH) - 1))
3055 /* If followed by a repetition operator. */
3056 || *q == '*' || *q == '^'
3057 || ((syntax & RE_BK_PLUS_QM)
3058 ? *q == '\\' && (q[1] == '+' || q[1] == '?')
3059 : (*q == '+' || *q == '?'))
3060 || ((syntax & RE_INTERVALS)
3061 && ((syntax & RE_NO_BK_BRACES)
3063 : (q[0] == '\\' && q[1] == '{'))))
3065 /* Start building a new exactn. */
3067 laststart = buf_end;
3069 BUF_PUSH_2 (exactn, 0);
3070 pending_exact = buf_end - 1;
3079 Bufbyte tmp_buf[MAX_EMCHAR_LEN];
3082 bt_count = set_charptr_emchar (tmp_buf, c);
3084 for (i = 0; i < bt_count; i++)
3086 BUF_PUSH (tmp_buf[i]);
3094 } /* while p != pend */
3097 /* Through the pattern now. */
3100 STORE_JUMP (jump_past_alt, fixup_alt_jump, buf_end);
3102 if (!COMPILE_STACK_EMPTY)
3103 FREE_STACK_RETURN (REG_EPAREN);
3105 /* If we don't want backtracking, force success
3106 the first time we reach the end of the compiled pattern. */
3107 if (syntax & RE_NO_POSIX_BACKTRACKING)
3110 free (compile_stack.stack);
3112 /* We have succeeded; set the length of the buffer. */
3113 bufp->used = buf_end - bufp->buffer;
3118 DEBUG_PRINT1 ("\nCompiled pattern: \n");
3119 print_compiled_pattern (bufp);
3123 #ifndef MATCH_MAY_ALLOCATE
3124 /* Initialize the failure stack to the largest possible stack. This
3125 isn't necessary unless we're trying to avoid calling alloca in
3126 the search and match routines. */
3128 int num_regs = bufp->re_nsub + 1;
3130 /* Since DOUBLE_FAIL_STACK refuses to double only if the current size
3131 is strictly greater than re_max_failures, the largest possible stack
3132 is 2 * re_max_failures failure points. */
3133 if (fail_stack.size < (2 * re_max_failures * MAX_FAILURE_ITEMS))
3135 fail_stack.size = (2 * re_max_failures * MAX_FAILURE_ITEMS);
3138 if (! fail_stack.stack)
3140 = (fail_stack_elt_t *) xmalloc (fail_stack.size
3141 * sizeof (fail_stack_elt_t));
3144 = (fail_stack_elt_t *) xrealloc (fail_stack.stack,
3146 * sizeof (fail_stack_elt_t)));
3147 #else /* not emacs */
3148 if (! fail_stack.stack)
3150 = (fail_stack_elt_t *) malloc (fail_stack.size
3151 * sizeof (fail_stack_elt_t));
3154 = (fail_stack_elt_t *) realloc (fail_stack.stack,
3156 * sizeof (fail_stack_elt_t)));
3160 regex_grow_registers (num_regs);
3162 #endif /* not MATCH_MAY_ALLOCATE */
3165 } /* regex_compile */
3167 /* Subroutines for `regex_compile'. */
3169 /* Store OP at LOC followed by two-byte integer parameter ARG. */
3172 store_op1 (re_opcode_t op, unsigned char *loc, int arg)
3174 *loc = (unsigned char) op;
3175 STORE_NUMBER (loc + 1, arg);
3179 /* Like `store_op1', but for two two-byte parameters ARG1 and ARG2. */
3182 store_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2)
3184 *loc = (unsigned char) op;
3185 STORE_NUMBER (loc + 1, arg1);
3186 STORE_NUMBER (loc + 3, arg2);
3190 /* Copy the bytes from LOC to END to open up three bytes of space at LOC
3191 for OP followed by two-byte integer parameter ARG. */
3194 insert_op1 (re_opcode_t op, unsigned char *loc, int arg, unsigned char *end)
3196 REGISTER unsigned char *pfrom = end;
3197 REGISTER unsigned char *pto = end + 3;
3199 while (pfrom != loc)
3202 store_op1 (op, loc, arg);
3206 /* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2. */
3209 insert_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2,
3212 REGISTER unsigned char *pfrom = end;
3213 REGISTER unsigned char *pto = end + 5;
3215 while (pfrom != loc)
3218 store_op2 (op, loc, arg1, arg2);
3222 /* P points to just after a ^ in PATTERN. Return true if that ^ comes
3223 after an alternative or a begin-subexpression. We assume there is at
3224 least one character before the ^. */
3227 at_begline_loc_p (re_char *pattern, re_char *p, reg_syntax_t syntax)
3229 re_char *prev = p - 2;
3230 re_bool prev_prev_backslash = prev > pattern && prev[-1] == '\\';
3233 /* After a subexpression? */
3234 (*prev == '(' && (syntax & RE_NO_BK_PARENS || prev_prev_backslash))
3235 /* After an alternative? */
3236 || (*prev == '|' && (syntax & RE_NO_BK_VBAR || prev_prev_backslash));
3240 /* The dual of at_begline_loc_p. This one is for $. We assume there is
3241 at least one character after the $, i.e., `P < PEND'. */
3244 at_endline_loc_p (re_char *p, re_char *pend, int syntax)
3247 re_bool next_backslash = *next == '\\';
3248 re_char *next_next = p + 1 < pend ? p + 1 : 0;
3251 /* Before a subexpression? */
3252 (syntax & RE_NO_BK_PARENS ? *next == ')'
3253 : next_backslash && next_next && *next_next == ')')
3254 /* Before an alternative? */
3255 || (syntax & RE_NO_BK_VBAR ? *next == '|'
3256 : next_backslash && next_next && *next_next == '|');
3260 /* Returns true if REGNUM is in one of COMPILE_STACK's elements and
3261 false if it's not. */
3264 group_in_compile_stack (compile_stack_type compile_stack, regnum_t regnum)
3268 for (this_element = compile_stack.avail - 1;
3271 if (compile_stack.stack[this_element].regnum == regnum)
3278 /* Read the ending character of a range (in a bracket expression) from the
3279 uncompiled pattern *P_PTR (which ends at PEND). We assume the
3280 starting character is in `P[-2]'. (`P[-1]' is the character `-'.)
3281 Then we set the translation of all bits between the starting and
3282 ending characters (inclusive) in the compiled pattern B.
3284 Return an error code.
3286 We use these short variable names so we can use the same macros as
3287 `regex_compile' itself. */
3289 static reg_errcode_t
3290 compile_range (re_char **p_ptr, re_char *pend, RE_TRANSLATE_TYPE translate,
3291 reg_syntax_t syntax, unsigned char *buf_end)
3295 re_char *p = *p_ptr;
3296 int range_start, range_end;
3301 /* Even though the pattern is a signed `char *', we need to fetch
3302 with unsigned char *'s; if the high bit of the pattern character
3303 is set, the range endpoints will be negative if we fetch using a
3306 We also want to fetch the endpoints without translating them; the
3307 appropriate translation is done in the bit-setting loop below. */
3308 /* The SVR4 compiler on the 3B2 had trouble with unsigned const char *. */
3309 range_start = ((const unsigned char *) p)[-2];
3310 range_end = ((const unsigned char *) p)[0];
3312 /* Have to increment the pointer into the pattern string, so the
3313 caller isn't still at the ending character. */
3316 /* If the start is after the end, the range is empty. */
3317 if (range_start > range_end)
3318 return syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR;
3320 /* Here we see why `this_char' has to be larger than an `unsigned
3321 char' -- the range is inclusive, so if `range_end' == 0xff
3322 (assuming 8-bit characters), we would otherwise go into an infinite
3323 loop, since all characters <= 0xff. */
3324 for (this_char = range_start; this_char <= range_end; this_char++)
3326 SET_LIST_BIT (TRANSLATE (this_char));
3334 static reg_errcode_t
3335 compile_extended_range (re_char **p_ptr, re_char *pend,
3336 RE_TRANSLATE_TYPE translate,
3337 reg_syntax_t syntax, Lisp_Object rtab)
3339 Emchar this_char, range_start, range_end;
3345 p = (const Bufbyte *) *p_ptr;
3346 range_end = charptr_emchar (p);
3347 p--; /* back to '-' */
3348 DEC_CHARPTR (p); /* back to start of range */
3349 /* We also want to fetch the endpoints without translating them; the
3350 appropriate translation is done in the bit-setting loop below. */
3351 range_start = charptr_emchar (p);
3352 INC_CHARPTR (*p_ptr);
3354 /* If the start is after the end, the range is empty. */
3355 if (range_start > range_end)
3356 return syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR;
3358 /* Can't have ranges spanning different charsets, except maybe for
3359 ranges entirely within the first 256 chars. */
3361 if ((range_start >= 0x100 || range_end >= 0x100)
3362 && CHAR_LEADING_BYTE (range_start) !=
3363 CHAR_LEADING_BYTE (range_end))
3364 return REG_ERANGESPAN;
3366 /* As advertised, translations only work over the 0 - 0x7F range.
3367 Making this kind of stuff work generally is much harder.
3368 Iterating over the whole range like this would be way efficient
3369 if the range encompasses 10,000 chars or something. You'd have
3370 to do something like this:
3374 map over translation table in [range_start, range_end] of
3375 (put the mapped range in a;
3376 put the translation in b)
3377 invert the range in a and truncate to [range_start, range_end]
3378 compute the union of a, b
3379 union the result into rtab
3381 for (this_char = range_start;
3382 this_char <= range_end && this_char < 0x80; this_char++)
3384 SET_RANGETAB_BIT (TRANSLATE (this_char));
3387 if (this_char <= range_end)
3388 put_range_table (rtab, this_char, range_end, Qt);
3395 /* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in
3396 BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible
3397 characters can start a string that matches the pattern. This fastmap
3398 is used by re_search to skip quickly over impossible starting points.
3400 The caller must supply the address of a (1 << BYTEWIDTH)-byte data
3401 area as BUFP->fastmap.
3403 We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in
3406 Returns 0 if we succeed, -2 if an internal error. */
3409 re_compile_fastmap (struct re_pattern_buffer *bufp)
3412 #ifdef MATCH_MAY_ALLOCATE
3413 fail_stack_type fail_stack;
3415 DECLARE_DESTINATION;
3416 /* We don't push any register information onto the failure stack. */
3418 REGISTER char *fastmap = bufp->fastmap;
3419 unsigned char *pattern = bufp->buffer;
3420 unsigned long size = bufp->used;
3421 unsigned char *p = pattern;
3422 REGISTER unsigned char *pend = pattern + size;
3425 /* This holds the pointer to the failure stack, when
3426 it is allocated relocatably. */
3427 fail_stack_elt_t *failure_stack_ptr;
3430 /* Assume that each path through the pattern can be null until
3431 proven otherwise. We set this false at the bottom of switch
3432 statement, to which we get only if a particular path doesn't
3433 match the empty string. */
3434 re_bool path_can_be_null = true;
3436 /* We aren't doing a `succeed_n' to begin with. */
3437 re_bool succeed_n_p = false;
3439 assert (fastmap != NULL && p != NULL);
3442 memset (fastmap, 0, 1 << BYTEWIDTH); /* Assume nothing's valid. */
3443 bufp->fastmap_accurate = 1; /* It will be when we're done. */
3444 bufp->can_be_null = 0;
3448 if (p == pend || *p == succeed)
3450 /* We have reached the (effective) end of pattern. */
3451 if (!FAIL_STACK_EMPTY ())
3453 bufp->can_be_null |= path_can_be_null;
3455 /* Reset for next path. */
3456 path_can_be_null = true;
3458 p = (unsigned char *) fail_stack.stack[--fail_stack.avail].pointer;
3466 /* We should never be about to go beyond the end of the pattern. */
3469 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
3472 /* I guess the idea here is to simply not bother with a fastmap
3473 if a backreference is used, since it's too hard to figure out
3474 the fastmap for the corresponding group. Setting
3475 `can_be_null' stops `re_search_2' from using the fastmap, so
3476 that is all we do. */
3478 bufp->can_be_null = 1;
3482 /* Following are the cases which match a character. These end
3491 /* XEmacs: Under Mule, these bit vectors will
3492 only contain values for characters below 0x80. */
3493 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
3494 if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))
3500 /* Chars beyond end of map must be allowed. */
3502 for (j = *p * BYTEWIDTH; j < 0x80; j++)
3504 /* And all extended characters must be allowed, too. */
3505 for (j = 0x80; j < 0xA0; j++)
3507 #else /* not MULE */
3508 for (j = *p * BYTEWIDTH; j < (1 << BYTEWIDTH); j++)
3512 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
3513 if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))))
3523 nentries = unified_range_table_nentries (p);
3524 for (i = 0; i < nentries; i++)
3526 EMACS_INT first, last;
3527 Lisp_Object dummy_val;
3529 Bufbyte strr[MAX_EMCHAR_LEN];
3531 unified_range_table_get_range (p, i, &first, &last,
3533 for (jj = first; jj <= last && jj < 0x80; jj++)
3535 /* Ranges below 0x100 can span charsets, but there
3536 are only two (Control-1 and Latin-1), and
3537 either first or last has to be in them. */
3538 set_charptr_emchar (strr, first);
3542 set_charptr_emchar (strr, last);
3549 case charset_mule_not:
3554 nentries = unified_range_table_nentries (p);
3555 for (i = 0; i < nentries; i++)
3557 EMACS_INT first, last;
3558 Lisp_Object dummy_val;
3560 int smallest_prev = 0;
3562 unified_range_table_get_range (p, i, &first, &last,
3564 for (jj = smallest_prev; jj < first && jj < 0x80; jj++)
3566 smallest_prev = last + 1;
3567 if (smallest_prev >= 0x80)
3570 /* Calculating which leading bytes are actually allowed
3571 here is rather difficult, so we just punt and allow
3573 for (i = 0x80; i < 0xA0; i++)
3585 for (j = 0; j < (1 << BYTEWIDTH); j++)
3588 (regex_emacs_buffer->mirror_syntax_table), j) == Sword)
3597 goto matchnotsyntax;
3599 for (j = 0; j < (1 << BYTEWIDTH); j++)
3602 (regex_emacs_buffer->mirror_syntax_table), j) != Sword)
3610 int fastmap_newline = fastmap['\n'];
3612 /* `.' matches anything ... */
3614 /* "anything" only includes bytes that can be the
3615 first byte of a character. */
3616 for (j = 0; j < 0xA0; j++)
3619 for (j = 0; j < (1 << BYTEWIDTH); j++)
3623 /* ... except perhaps newline. */
3624 if (!(bufp->syntax & RE_DOT_NEWLINE))
3625 fastmap['\n'] = fastmap_newline;
3627 /* Return if we have already set `can_be_null'; if we have,
3628 then the fastmap is irrelevant. Something's wrong here. */
3629 else if (bufp->can_be_null)
3632 /* Otherwise, have to check alternative paths. */
3643 /* This match depends on text properties. These end with
3644 aborting optimizations. */
3645 bufp->can_be_null = 1;
3649 #if 0 /* Removed during syntax-table properties patch -- 2000/12/07 mct */
3655 for (j = 0; j < 0x80; j++)
3658 (regex_emacs_buffer->mirror_syntax_table), j) ==
3659 (enum syntaxcode) k)
3661 for (j = 0x80; j < 0xA0; j++)
3663 if (LEADING_BYTE_PREFIX_P(j))
3664 /* too complicated to calculate this right */
3671 cset = CHARSET_BY_LEADING_BYTE (j);
3672 if (CHARSETP (cset))
3674 if (charset_syntax (regex_emacs_buffer, cset,
3676 == Sword || multi_p)
3681 #else /* not MULE */
3682 for (j = 0; j < (1 << BYTEWIDTH); j++)
3685 (regex_emacs_buffer->mirror_syntax_table), j) ==
3686 (enum syntaxcode) k)
3692 #if 0 /* Removed during syntax-table properties patch -- 2000/12/07 mct */
3698 for (j = 0; j < 0x80; j++)
3701 (regex_emacs_buffer->mirror_syntax_table), j) !=
3702 (enum syntaxcode) k)
3704 for (j = 0x80; j < 0xA0; j++)
3706 if (LEADING_BYTE_PREFIX_P(j))
3707 /* too complicated to calculate this right */
3714 cset = CHARSET_BY_LEADING_BYTE (j);
3715 if (CHARSETP (cset))
3717 if (charset_syntax (regex_emacs_buffer, cset,
3719 != Sword || multi_p)
3724 #else /* not MULE */
3725 for (j = 0; j < (1 << BYTEWIDTH); j++)
3728 (regex_emacs_buffer->mirror_syntax_table), j) !=
3729 (enum syntaxcode) k)
3736 /* 97/2/17 jhod category patch */
3738 case notcategoryspec:
3739 bufp->can_be_null = 1;
3741 /* end if category patch */
3744 /* All cases after this match the empty string. These end with
3752 #endif /* not emacs */
3766 case push_dummy_failure:
3771 case pop_failure_jump:
3772 case maybe_pop_jump:
3775 case dummy_failure_jump:
3776 EXTRACT_NUMBER_AND_INCR (j, p);
3781 /* Jump backward implies we just went through the body of a
3782 loop and matched nothing. Opcode jumped to should be
3783 `on_failure_jump' or `succeed_n'. Just treat it like an
3784 ordinary jump. For a * loop, it has pushed its failure
3785 point already; if so, discard that as redundant. */
3786 if ((re_opcode_t) *p != on_failure_jump
3787 && (re_opcode_t) *p != succeed_n)
3791 EXTRACT_NUMBER_AND_INCR (j, p);
3794 /* If what's on the stack is where we are now, pop it. */
3795 if (!FAIL_STACK_EMPTY ()
3796 && fail_stack.stack[fail_stack.avail - 1].pointer == p)
3802 case on_failure_jump:
3803 case on_failure_keep_string_jump:
3804 handle_on_failure_jump:
3805 EXTRACT_NUMBER_AND_INCR (j, p);
3807 /* For some patterns, e.g., `(a?)?', `p+j' here points to the
3808 end of the pattern. We don't want to push such a point,
3809 since when we restore it above, entering the switch will
3810 increment `p' past the end of the pattern. We don't need
3811 to push such a point since we obviously won't find any more
3812 fastmap entries beyond `pend'. Such a pattern can match
3813 the null string, though. */
3816 if (!PUSH_PATTERN_OP (p + j, fail_stack))
3818 RESET_FAIL_STACK ();
3823 bufp->can_be_null = 1;
3827 EXTRACT_NUMBER_AND_INCR (k, p); /* Skip the n. */
3828 succeed_n_p = false;
3835 /* Get to the number of times to succeed. */
3838 /* Increment p past the n for when k != 0. */
3839 EXTRACT_NUMBER_AND_INCR (k, p);
3843 succeed_n_p = true; /* Spaghetti code alert. */
3844 goto handle_on_failure_jump;
3861 abort (); /* We have listed all the cases. */
3864 /* Getting here means we have found the possible starting
3865 characters for one path of the pattern -- and that the empty
3866 string does not match. We need not follow this path further.
3867 Instead, look at the next alternative (remembered on the
3868 stack), or quit if no more. The test at the top of the loop
3869 does these things. */
3870 path_can_be_null = false;
3874 /* Set `can_be_null' for the last path (also the first path, if the
3875 pattern is empty). */
3876 bufp->can_be_null |= path_can_be_null;
3879 RESET_FAIL_STACK ();
3881 } /* re_compile_fastmap */
3883 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
3884 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
3885 this memory for recording register information. STARTS and ENDS
3886 must be allocated using the malloc library routine, and must each
3887 be at least NUM_REGS * sizeof (regoff_t) bytes long.
3889 If NUM_REGS == 0, then subsequent matches should allocate their own
3892 Unless this function is called, the first search or match using
3893 PATTERN_BUFFER will allocate its own register data, without
3894 freeing the old data. */
3897 re_set_registers (struct re_pattern_buffer *bufp, struct re_registers *regs,
3898 unsigned num_regs, regoff_t *starts, regoff_t *ends)
3902 bufp->regs_allocated = REGS_REALLOCATE;
3903 regs->num_regs = num_regs;
3904 regs->start = starts;
3909 bufp->regs_allocated = REGS_UNALLOCATED;
3911 regs->start = regs->end = (regoff_t *) 0;
3915 /* Searching routines. */
3917 /* Like re_search_2, below, but only one string is specified, and
3918 doesn't let you say where to stop matching. */
3921 re_search (struct re_pattern_buffer *bufp, const char *string, int size,
3922 int startpos, int range, struct re_registers *regs)
3924 return re_search_2 (bufp, NULL, 0, string, size, startpos, range,
3929 /* Snarfed from src/lisp.h, needed for compiling [ce]tags. */
3930 # define bytecount_to_charcount(ptr, len) (len)
3931 # define charcount_to_bytecount(ptr, len) (len)
3932 typedef int Charcount;
3935 /* Using the compiled pattern in BUFP->buffer, first tries to match the
3936 virtual concatenation of STRING1 and STRING2, starting first at index
3937 STARTPOS, then at STARTPOS + 1, and so on.
3939 With MULE, STARTPOS is a byte position, not a char position. And the
3940 search will increment STARTPOS by the width of the current leading
3943 STRING1 and STRING2 have length SIZE1 and SIZE2, respectively.
3945 RANGE is how far to scan while trying to match. RANGE = 0 means try
3946 only at STARTPOS; in general, the last start tried is STARTPOS +
3949 With MULE, RANGE is a byte position, not a char position. The last
3950 start tried is the character starting <= STARTPOS + RANGE.
3952 In REGS, return the indices of the virtual concatenation of STRING1
3953 and STRING2 that matched the entire BUFP->buffer and its contained
3956 Do not consider matching one past the index STOP in the virtual
3957 concatenation of STRING1 and STRING2.
3959 We return either the position in the strings at which the match was
3960 found, -1 if no match, or -2 if error (such as failure
3964 re_search_2 (struct re_pattern_buffer *bufp, const char *str1,
3965 int size1, const char *str2, int size2, int startpos,
3966 int range, struct re_registers *regs, int stop)
3969 re_char *string1 = (re_char *) str1;
3970 re_char *string2 = (re_char *) str2;
3971 REGISTER char *fastmap = bufp->fastmap;
3972 REGISTER RE_TRANSLATE_TYPE translate = bufp->translate;
3973 int total_size = size1 + size2;
3974 int endpos = startpos + range;
3975 #ifdef REGEX_BEGLINE_CHECK
3976 int anchored_at_begline = 0;
3981 /* Check for out-of-range STARTPOS. */
3982 if (startpos < 0 || startpos > total_size)
3985 /* Fix up RANGE if it might eventually take us outside
3986 the virtual concatenation of STRING1 and STRING2. */
3988 range = 0 - startpos;
3989 else if (endpos > total_size)
3990 range = total_size - startpos;
3992 /* If the search isn't to be a backwards one, don't waste time in a
3993 search for a pattern that must be anchored. */
3994 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == begbuf && range > 0)
4000 d = ((const unsigned char *)
4001 (startpos >= size1 ? string2 - size1 : string1) + startpos);
4002 range = charcount_to_bytecount (d, 1);
4007 /* In a forward search for something that starts with \=.
4008 don't keep searching past point. */
4009 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == at_dot && range > 0)
4011 range = BUF_PT (regex_emacs_buffer) - BUF_BEGV (regex_emacs_buffer)
4018 /* Update the fastmap now if not correct already. */
4019 if (fastmap && !bufp->fastmap_accurate)
4020 if (re_compile_fastmap (bufp) == -2)
4023 #ifdef REGEX_BEGLINE_CHECK
4027 while (i < bufp->used)
4029 if (bufp->buffer[i] == start_memory ||
4030 bufp->buffer[i] == stop_memory)
4035 anchored_at_begline = i < bufp->used && bufp->buffer[i] == begline;
4040 SETUP_SYNTAX_CACHE_FOR_OBJECT (regex_match_object,
4042 SYNTAX_CACHE_OBJECT_BYTE_TO_CHAR (regex_match_object,
4048 /* Loop through the string, looking for a place to start matching. */
4051 #ifdef REGEX_BEGLINE_CHECK
4052 /* If the regex is anchored at the beginning of a line (i.e. with a ^),
4053 then we can speed things up by skipping to the next beginning-of-
4055 if (anchored_at_begline && startpos > 0 && startpos != size1 &&
4058 /* whose stupid idea was it anyway to make this
4059 function take two strings to match?? */
4063 if (startpos < size1 && startpos + range >= size1)
4064 lim = range - (size1 - startpos);
4066 d = ((const unsigned char *)
4067 (startpos >= size1 ? string2 - size1 : string1) + startpos);
4068 DEC_CHARPTR(d); /* Ok, since startpos != size1. */
4069 d_size = charcount_to_bytecount (d, 1);
4071 if (TRANSLATE_P (translate))
4072 while (range > lim && *d != '\n')
4074 d += d_size; /* Speedier INC_CHARPTR(d) */
4075 d_size = charcount_to_bytecount (d, 1);
4079 while (range > lim && *d != '\n')
4081 d += d_size; /* Speedier INC_CHARPTR(d) */
4082 d_size = charcount_to_bytecount (d, 1);
4086 startpos += irange - range;
4088 #endif /* REGEX_BEGLINE_CHECK */
4090 /* If a fastmap is supplied, skip quickly over characters that
4091 cannot be the start of a match. If the pattern can match the
4092 null string, however, we don't need to skip characters; we want
4093 the first null string. */
4094 if (fastmap && startpos < total_size && !bufp->can_be_null)
4096 if (range > 0) /* Searching forwards. */
4101 if (startpos < size1 && startpos + range >= size1)
4102 lim = range - (size1 - startpos);
4104 d = ((const unsigned char *)
4105 (startpos >= size1 ? string2 - size1 : string1) + startpos);
4107 /* Written out as an if-else to avoid testing `translate'
4109 if (TRANSLATE_P (translate))
4115 buf_ch = charptr_emchar (d);
4116 buf_ch = RE_TRANSLATE (buf_ch);
4117 if (buf_ch >= 0200 || fastmap[(unsigned char) buf_ch])
4120 if (fastmap[(unsigned char)RE_TRANSLATE (*d)])
4123 d_size = charcount_to_bytecount (d, 1);
4125 d += d_size; /* Speedier INC_CHARPTR(d) */
4128 while (range > lim && !fastmap[*d])
4130 d_size = charcount_to_bytecount (d, 1);
4132 d += d_size; /* Speedier INC_CHARPTR(d) */
4135 startpos += irange - range;
4137 else /* Searching backwards. */
4139 Emchar c = (size1 == 0 || startpos >= size1
4140 ? charptr_emchar (string2 + startpos - size1)
4141 : charptr_emchar (string1 + startpos));
4144 if (!(c >= 0200 || fastmap[(unsigned char) c]))
4147 if (!fastmap[(unsigned char) c])
4153 /* If can't match the null string, and that's all we have left, fail. */
4154 if (range >= 0 && startpos == total_size && fastmap
4155 && !bufp->can_be_null)
4158 #ifdef emacs /* XEmacs added, w/removal of immediate_quit */
4159 if (!no_quit_in_re_search)
4162 val = re_match_2_internal (bufp, string1, size1, string2, size2,
4163 startpos, regs, stop);
4164 #ifndef REGEX_MALLOC
4181 d = ((const unsigned char *)
4182 (startpos >= size1 ? string2 - size1 : string1) + startpos);
4183 d_size = charcount_to_bytecount (d, 1);
4189 /* Note startpos > size1 not >=. If we are on the
4190 string1/string2 boundary, we want to backup into string1. */
4191 d = ((const unsigned char *)
4192 (startpos > size1 ? string2 - size1 : string1) + startpos);
4194 d_size = charcount_to_bytecount (d, 1);
4202 /* Declarations and macros for re_match_2. */
4204 /* This converts PTR, a pointer into one of the search strings `string1'
4205 and `string2' into an offset from the beginning of that string. */
4206 #define POINTER_TO_OFFSET(ptr) \
4207 (FIRST_STRING_P (ptr) \
4208 ? ((regoff_t) ((ptr) - string1)) \
4209 : ((regoff_t) ((ptr) - string2 + size1)))
4211 /* Macros for dealing with the split strings in re_match_2. */
4213 #define MATCHING_IN_FIRST_STRING (dend == end_match_1)
4215 /* Call before fetching a character with *d. This switches over to
4216 string2 if necessary. */
4217 #define REGEX_PREFETCH() \
4220 /* End of string2 => fail. */ \
4221 if (dend == end_match_2) \
4223 /* End of string1 => advance to string2. */ \
4225 dend = end_match_2; \
4229 /* Test if at very beginning or at very end of the virtual concatenation
4230 of `string1' and `string2'. If only one string, it's `string2'. */
4231 #define AT_STRINGS_BEG(d) ((d) == (size1 ? string1 : string2) || !size2)
4232 #define AT_STRINGS_END(d) ((d) == end2)
4235 If the given position straddles the string gap, return the equivalent
4236 position that is before or after the gap, respectively; otherwise,
4237 return the same position. */
4238 #define POS_BEFORE_GAP_UNSAFE(d) ((d) == string2 ? end1 : (d))
4239 #define POS_AFTER_GAP_UNSAFE(d) ((d) == end1 ? string2 : (d))
4241 /* Test if CH is a word-constituent character. (XEmacs change) */
4242 #define WORDCHAR_P_UNSAFE(ch) \
4243 (SYNTAX_UNSAFE (XCHAR_TABLE (regex_emacs_buffer->mirror_syntax_table), \
4246 /* Free everything we malloc. */
4247 #ifdef MATCH_MAY_ALLOCATE
4248 #define FREE_VAR(var) if (var) REGEX_FREE (var); var = NULL
4249 #define FREE_VARIABLES() \
4251 REGEX_FREE_STACK (fail_stack.stack); \
4252 FREE_VAR (regstart); \
4253 FREE_VAR (regend); \
4254 FREE_VAR (old_regstart); \
4255 FREE_VAR (old_regend); \
4256 FREE_VAR (best_regstart); \
4257 FREE_VAR (best_regend); \
4258 FREE_VAR (reg_info); \
4259 FREE_VAR (reg_dummy); \
4260 FREE_VAR (reg_info_dummy); \
4262 #else /* not MATCH_MAY_ALLOCATE */
4263 #define FREE_VARIABLES() ((void)0) /* Do nothing! But inhibit gcc warning. */
4264 #endif /* MATCH_MAY_ALLOCATE */
4266 /* These values must meet several constraints. They must not be valid
4267 register values; since we have a limit of 255 registers (because
4268 we use only one byte in the pattern for the register number), we can
4269 use numbers larger than 255. They must differ by 1, because of
4270 NUM_FAILURE_ITEMS above. And the value for the lowest register must
4271 be larger than the value for the highest register, so we do not try
4272 to actually save any registers when none are active. */
4273 #define NO_HIGHEST_ACTIVE_REG (1 << BYTEWIDTH)
4274 #define NO_LOWEST_ACTIVE_REG (NO_HIGHEST_ACTIVE_REG + 1)
4276 /* Matching routines. */
4278 #ifndef emacs /* Emacs never uses this. */
4279 /* re_match is like re_match_2 except it takes only a single string. */
4282 re_match (struct re_pattern_buffer *bufp, const char *string, int size,
4283 int pos, struct re_registers *regs)
4285 int result = re_match_2_internal (bufp, NULL, 0, (re_char *) string, size,
4290 #endif /* not emacs */
4293 /* re_match_2 matches the compiled pattern in BUFP against the
4294 (virtual) concatenation of STRING1 and STRING2 (of length SIZE1 and
4295 SIZE2, respectively). We start matching at POS, and stop matching
4298 If REGS is non-null and the `no_sub' field of BUFP is nonzero, we
4299 store offsets for the substring each group matched in REGS. See the
4300 documentation for exactly how many groups we fill.
4302 We return -1 if no match, -2 if an internal error (such as the
4303 failure stack overflowing). Otherwise, we return the length of the
4304 matched substring. */
4307 re_match_2 (struct re_pattern_buffer *bufp, const char *string1,
4308 int size1, const char *string2, int size2, int pos,
4309 struct re_registers *regs, int stop)
4314 SETUP_SYNTAX_CACHE_FOR_OBJECT (regex_match_object,
4316 SYNTAX_CACHE_OBJECT_BYTE_TO_CHAR (regex_match_object,
4322 result = re_match_2_internal (bufp, (re_char *) string1, size1,
4323 (re_char *) string2, size2,
4330 /* This is a separate function so that we can force an alloca cleanup
4333 re_match_2_internal (struct re_pattern_buffer *bufp, re_char *string1,
4334 int size1, re_char *string2, int size2, int pos,
4335 struct re_registers *regs, int stop)
4337 /* General temporaries. */
4340 int should_succeed; /* XEmacs change */
4342 /* Just past the end of the corresponding string. */
4343 re_char *end1, *end2;
4345 /* Pointers into string1 and string2, just past the last characters in
4346 each to consider matching. */
4347 re_char *end_match_1, *end_match_2;
4349 /* Where we are in the data, and the end of the current string. */
4352 /* Where we are in the pattern, and the end of the pattern. */
4353 unsigned char *p = bufp->buffer;
4354 REGISTER unsigned char *pend = p + bufp->used;
4356 /* Mark the opcode just after a start_memory, so we can test for an
4357 empty subpattern when we get to the stop_memory. */
4358 re_char *just_past_start_mem = 0;
4360 /* We use this to map every character in the string. */
4361 RE_TRANSLATE_TYPE translate = bufp->translate;
4363 /* Failure point stack. Each place that can handle a failure further
4364 down the line pushes a failure point on this stack. It consists of
4365 restart, regend, and reg_info for all registers corresponding to
4366 the subexpressions we're currently inside, plus the number of such
4367 registers, and, finally, two char *'s. The first char * is where
4368 to resume scanning the pattern; the second one is where to resume
4369 scanning the strings. If the latter is zero, the failure point is
4370 a ``dummy''; if a failure happens and the failure point is a dummy,
4371 it gets discarded and the next one is tried. */
4372 #ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
4373 fail_stack_type fail_stack;
4376 static unsigned failure_id;
4377 unsigned nfailure_points_pushed = 0, nfailure_points_popped = 0;
4381 /* This holds the pointer to the failure stack, when
4382 it is allocated relocatably. */
4383 fail_stack_elt_t *failure_stack_ptr;
4386 /* We fill all the registers internally, independent of what we
4387 return, for use in backreferences. The number here includes
4388 an element for register zero. */
4389 unsigned num_regs = bufp->re_nsub + 1;
4391 /* The currently active registers. */
4392 unsigned lowest_active_reg = NO_LOWEST_ACTIVE_REG;
4393 unsigned highest_active_reg = NO_HIGHEST_ACTIVE_REG;
4395 /* Information on the contents of registers. These are pointers into
4396 the input strings; they record just what was matched (on this
4397 attempt) by a subexpression part of the pattern, that is, the
4398 regnum-th regstart pointer points to where in the pattern we began
4399 matching and the regnum-th regend points to right after where we
4400 stopped matching the regnum-th subexpression. (The zeroth register
4401 keeps track of what the whole pattern matches.) */
4402 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
4403 re_char **regstart, **regend;
4406 /* If a group that's operated upon by a repetition operator fails to
4407 match anything, then the register for its start will need to be
4408 restored because it will have been set to wherever in the string we
4409 are when we last see its open-group operator. Similarly for a
4411 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
4412 re_char **old_regstart, **old_regend;
4415 /* The is_active field of reg_info helps us keep track of which (possibly
4416 nested) subexpressions we are currently in. The matched_something
4417 field of reg_info[reg_num] helps us tell whether or not we have
4418 matched any of the pattern so far this time through the reg_num-th
4419 subexpression. These two fields get reset each time through any
4420 loop their register is in. */
4421 #ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
4422 register_info_type *reg_info;
4425 /* The following record the register info as found in the above
4426 variables when we find a match better than any we've seen before.
4427 This happens as we backtrack through the failure points, which in
4428 turn happens only if we have not yet matched the entire string. */
4429 unsigned best_regs_set = false;
4430 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
4431 re_char **best_regstart, **best_regend;
4434 /* Logically, this is `best_regend[0]'. But we don't want to have to
4435 allocate space for that if we're not allocating space for anything
4436 else (see below). Also, we never need info about register 0 for
4437 any of the other register vectors, and it seems rather a kludge to
4438 treat `best_regend' differently than the rest. So we keep track of
4439 the end of the best match so far in a separate variable. We
4440 initialize this to NULL so that when we backtrack the first time
4441 and need to test it, it's not garbage. */
4442 re_char *match_end = NULL;
4444 /* This helps SET_REGS_MATCHED avoid doing redundant work. */
4445 int set_regs_matched_done = 0;
4447 /* Used when we pop values we don't care about. */
4448 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
4449 re_char **reg_dummy;
4450 register_info_type *reg_info_dummy;
4454 /* Counts the total number of registers pushed. */
4455 unsigned num_regs_pushed = 0;
4458 /* 1 if this match ends in the same string (string1 or string2)
4459 as the best previous match. */
4462 /* 1 if this match is the best seen so far. */
4463 re_bool best_match_p;
4465 DEBUG_PRINT1 ("\n\nEntering re_match_2.\n");
4469 #ifdef MATCH_MAY_ALLOCATE
4470 /* Do not bother to initialize all the register variables if there are
4471 no groups in the pattern, as it takes a fair amount of time. If
4472 there are groups, we include space for register 0 (the whole
4473 pattern), even though we never use it, since it simplifies the
4474 array indexing. We should fix this. */
4477 regstart = REGEX_TALLOC (num_regs, re_char *);
4478 regend = REGEX_TALLOC (num_regs, re_char *);
4479 old_regstart = REGEX_TALLOC (num_regs, re_char *);
4480 old_regend = REGEX_TALLOC (num_regs, re_char *);
4481 best_regstart = REGEX_TALLOC (num_regs, re_char *);
4482 best_regend = REGEX_TALLOC (num_regs, re_char *);
4483 reg_info = REGEX_TALLOC (num_regs, register_info_type);
4484 reg_dummy = REGEX_TALLOC (num_regs, re_char *);
4485 reg_info_dummy = REGEX_TALLOC (num_regs, register_info_type);
4487 if (!(regstart && regend && old_regstart && old_regend && reg_info
4488 && best_regstart && best_regend && reg_dummy && reg_info_dummy))
4496 /* We must initialize all our variables to NULL, so that
4497 `FREE_VARIABLES' doesn't try to free them. */
4498 regstart = regend = old_regstart = old_regend = best_regstart
4499 = best_regend = reg_dummy = NULL;
4500 reg_info = reg_info_dummy = (register_info_type *) NULL;
4502 #endif /* MATCH_MAY_ALLOCATE */
4504 /* The starting position is bogus. */
4505 if (pos < 0 || pos > size1 + size2)
4511 /* Initialize subexpression text positions to -1 to mark ones that no
4512 start_memory/stop_memory has been seen for. Also initialize the
4513 register information struct. */
4514 for (mcnt = 1; mcnt < num_regs; mcnt++)
4516 regstart[mcnt] = regend[mcnt]
4517 = old_regstart[mcnt] = old_regend[mcnt] = REG_UNSET_VALUE;
4519 REG_MATCH_NULL_STRING_P (reg_info[mcnt]) = MATCH_NULL_UNSET_VALUE;
4520 IS_ACTIVE (reg_info[mcnt]) = 0;
4521 MATCHED_SOMETHING (reg_info[mcnt]) = 0;
4522 EVER_MATCHED_SOMETHING (reg_info[mcnt]) = 0;
4524 /* We move `string1' into `string2' if the latter's empty -- but not if
4525 `string1' is null. */
4526 if (size2 == 0 && string1 != NULL)
4533 end1 = string1 + size1;
4534 end2 = string2 + size2;
4536 /* Compute where to stop matching, within the two strings. */
4539 end_match_1 = string1 + stop;
4540 end_match_2 = string2;
4545 end_match_2 = string2 + stop - size1;
4548 /* `p' scans through the pattern as `d' scans through the data.
4549 `dend' is the end of the input string that `d' points within. `d'
4550 is advanced into the following input string whenever necessary, but
4551 this happens before fetching; therefore, at the beginning of the
4552 loop, `d' can be pointing at the end of a string, but it cannot
4554 if (size1 > 0 && pos <= size1)
4561 d = string2 + pos - size1;
4565 DEBUG_PRINT1 ("The compiled pattern is: \n");
4566 DEBUG_PRINT_COMPILED_PATTERN (bufp, p, pend);
4567 DEBUG_PRINT1 ("The string to match is: `");
4568 DEBUG_PRINT_DOUBLE_STRING (d, string1, size1, string2, size2);
4569 DEBUG_PRINT1 ("'\n");
4571 /* This loops over pattern commands. It exits by returning from the
4572 function if the match is complete, or it drops through if the match
4573 fails at this starting point in the input data. */
4576 DEBUG_PRINT2 ("\n0x%lx: ", (long) p);
4577 #ifdef emacs /* XEmacs added, w/removal of immediate_quit */
4578 if (!no_quit_in_re_search)
4583 { /* End of pattern means we might have succeeded. */
4584 DEBUG_PRINT1 ("end of pattern ... ");
4586 /* If we haven't matched the entire string, and we want the
4587 longest match, try backtracking. */
4588 if (d != end_match_2)
4590 same_str_p = (FIRST_STRING_P (match_end)
4591 == MATCHING_IN_FIRST_STRING);
4593 /* AIX compiler got confused when this was combined
4594 with the previous declaration. */
4596 best_match_p = d > match_end;
4598 best_match_p = !MATCHING_IN_FIRST_STRING;
4600 DEBUG_PRINT1 ("backtracking.\n");
4602 if (!FAIL_STACK_EMPTY ())
4603 { /* More failure points to try. */
4605 /* If exceeds best match so far, save it. */
4606 if (!best_regs_set || best_match_p)
4608 best_regs_set = true;
4611 DEBUG_PRINT1 ("\nSAVING match as best so far.\n");
4613 for (mcnt = 1; mcnt < num_regs; mcnt++)
4615 best_regstart[mcnt] = regstart[mcnt];
4616 best_regend[mcnt] = regend[mcnt];
4622 /* If no failure points, don't restore garbage. And if
4623 last match is real best match, don't restore second
4625 else if (best_regs_set && !best_match_p)
4628 /* Restore best match. It may happen that `dend ==
4629 end_match_1' while the restored d is in string2.
4630 For example, the pattern `x.*y.*z' against the
4631 strings `x-' and `y-z-', if the two strings are
4632 not consecutive in memory. */
4633 DEBUG_PRINT1 ("Restoring best registers.\n");
4636 dend = ((d >= string1 && d <= end1)
4637 ? end_match_1 : end_match_2);
4639 for (mcnt = 1; mcnt < num_regs; mcnt++)
4641 regstart[mcnt] = best_regstart[mcnt];
4642 regend[mcnt] = best_regend[mcnt];
4645 } /* d != end_match_2 */
4648 DEBUG_PRINT1 ("Accepting match.\n");
4650 /* If caller wants register contents data back, do it. */
4651 if (regs && !bufp->no_sub)
4653 /* Have the register data arrays been allocated? */
4654 if (bufp->regs_allocated == REGS_UNALLOCATED)
4655 { /* No. So allocate them with malloc. We need one
4656 extra element beyond `num_regs' for the `-1' marker
4658 regs->num_regs = MAX (RE_NREGS, num_regs + 1);
4659 regs->start = TALLOC (regs->num_regs, regoff_t);
4660 regs->end = TALLOC (regs->num_regs, regoff_t);
4661 if (regs->start == NULL || regs->end == NULL)
4666 bufp->regs_allocated = REGS_REALLOCATE;
4668 else if (bufp->regs_allocated == REGS_REALLOCATE)
4669 { /* Yes. If we need more elements than were already
4670 allocated, reallocate them. If we need fewer, just
4672 if (regs->num_regs < num_regs + 1)
4674 regs->num_regs = num_regs + 1;
4675 RETALLOC (regs->start, regs->num_regs, regoff_t);
4676 RETALLOC (regs->end, regs->num_regs, regoff_t);
4677 if (regs->start == NULL || regs->end == NULL)
4686 /* These braces fend off a "empty body in an else-statement"
4687 warning under GCC when assert expands to nothing. */
4688 assert (bufp->regs_allocated == REGS_FIXED);
4691 /* Convert the pointer data in `regstart' and `regend' to
4692 indices. Register zero has to be set differently,
4693 since we haven't kept track of any info for it. */
4694 if (regs->num_regs > 0)
4696 regs->start[0] = pos;
4697 regs->end[0] = (MATCHING_IN_FIRST_STRING
4698 ? ((regoff_t) (d - string1))
4699 : ((regoff_t) (d - string2 + size1)));
4702 /* Go through the first `min (num_regs, regs->num_regs)'
4703 registers, since that is all we initialized. */
4704 for (mcnt = 1; mcnt < MIN (num_regs, regs->num_regs); mcnt++)
4706 if (REG_UNSET (regstart[mcnt]) || REG_UNSET (regend[mcnt]))
4707 regs->start[mcnt] = regs->end[mcnt] = -1;
4711 = (regoff_t) POINTER_TO_OFFSET (regstart[mcnt]);
4713 = (regoff_t) POINTER_TO_OFFSET (regend[mcnt]);
4717 /* If the regs structure we return has more elements than
4718 were in the pattern, set the extra elements to -1. If
4719 we (re)allocated the registers, this is the case,
4720 because we always allocate enough to have at least one
4722 for (mcnt = num_regs; mcnt < regs->num_regs; mcnt++)
4723 regs->start[mcnt] = regs->end[mcnt] = -1;
4724 } /* regs && !bufp->no_sub */
4726 DEBUG_PRINT4 ("%u failure points pushed, %u popped (%u remain).\n",
4727 nfailure_points_pushed, nfailure_points_popped,
4728 nfailure_points_pushed - nfailure_points_popped);
4729 DEBUG_PRINT2 ("%u registers pushed.\n", num_regs_pushed);
4731 mcnt = d - pos - (MATCHING_IN_FIRST_STRING
4735 DEBUG_PRINT2 ("Returning %d from re_match_2.\n", mcnt);
4741 /* Otherwise match next pattern command. */
4742 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
4744 /* Ignore these. Used to ignore the n of succeed_n's which
4745 currently have n == 0. */
4747 DEBUG_PRINT1 ("EXECUTING no_op.\n");
4751 DEBUG_PRINT1 ("EXECUTING succeed.\n");
4754 /* Match the next n pattern characters exactly. The following
4755 byte in the pattern defines n, and the n bytes after that
4756 are the characters to match. */
4759 DEBUG_PRINT2 ("EXECUTING exactn %d.\n", mcnt);
4761 /* This is written out as an if-else so we don't waste time
4762 testing `translate' inside the loop. */
4763 if (TRANSLATE_P (translate))
4768 Emchar pat_ch, buf_ch;
4772 pat_ch = charptr_emchar (p);
4773 buf_ch = charptr_emchar (d);
4774 if (RE_TRANSLATE (buf_ch) != pat_ch)
4777 pat_len = charcount_to_bytecount (p, 1);
4782 #else /* not MULE */
4784 if ((unsigned char) RE_TRANSLATE (*d++) != *p++)
4796 if (*d++ != *p++) goto fail;
4800 SET_REGS_MATCHED ();
4804 /* Match any character except possibly a newline or a null. */
4806 DEBUG_PRINT1 ("EXECUTING anychar.\n");
4810 if ((!(bufp->syntax & RE_DOT_NEWLINE) && TRANSLATE (*d) == '\n')
4811 || (bufp->syntax & RE_DOT_NOT_NULL && TRANSLATE (*d) == '\000'))
4814 SET_REGS_MATCHED ();
4815 DEBUG_PRINT2 (" Matched `%d'.\n", *d);
4816 INC_CHARPTR (d); /* XEmacs change */
4823 REGISTER unsigned char c;
4824 re_bool not_p = (re_opcode_t) *(p - 1) == charset_not;
4826 DEBUG_PRINT2 ("EXECUTING charset%s.\n", not_p ? "_not" : "");
4829 c = TRANSLATE (*d); /* The character to match. */
4831 /* Cast to `unsigned' instead of `unsigned char' in case the
4832 bit list is a full 32 bytes long. */
4833 if (c < (unsigned) (*p * BYTEWIDTH)
4834 && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
4839 if (!not_p) goto fail;
4841 SET_REGS_MATCHED ();
4842 INC_CHARPTR (d); /* XEmacs change */
4848 case charset_mule_not:
4851 re_bool not_p = (re_opcode_t) *(p - 1) == charset_mule_not;
4853 DEBUG_PRINT2 ("EXECUTING charset_mule%s.\n", not_p ? "_not" : "");
4856 c = charptr_emchar ((const Bufbyte *) d);
4857 c = TRANSLATE_EXTENDED_UNSAFE (c); /* The character to match. */
4859 if (EQ (Qt, unified_range_table_lookup (p, c, Qnil)))
4862 p += unified_range_table_bytes_used (p);
4864 if (!not_p) goto fail;
4866 SET_REGS_MATCHED ();
4873 /* The beginning of a group is represented by start_memory.
4874 The arguments are the register number in the next byte, and the
4875 number of groups inner to this one in the next. The text
4876 matched within the group is recorded (in the internal
4877 registers data structure) under the register number. */
4879 DEBUG_PRINT3 ("EXECUTING start_memory %d (%d):\n", *p, p[1]);
4881 /* Find out if this group can match the empty string. */
4882 p1 = p; /* To send to group_match_null_string_p. */
4884 if (REG_MATCH_NULL_STRING_P (reg_info[*p]) == MATCH_NULL_UNSET_VALUE)
4885 REG_MATCH_NULL_STRING_P (reg_info[*p])
4886 = group_match_null_string_p (&p1, pend, reg_info);
4888 /* Save the position in the string where we were the last time
4889 we were at this open-group operator in case the group is
4890 operated upon by a repetition operator, e.g., with `(a*)*b'
4891 against `ab'; then we want to ignore where we are now in
4892 the string in case this attempt to match fails. */
4893 old_regstart[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p])
4894 ? REG_UNSET (regstart[*p]) ? d : regstart[*p]
4896 DEBUG_PRINT2 (" old_regstart: %d\n",
4897 POINTER_TO_OFFSET (old_regstart[*p]));
4900 DEBUG_PRINT2 (" regstart: %d\n", POINTER_TO_OFFSET (regstart[*p]));
4902 IS_ACTIVE (reg_info[*p]) = 1;
4903 MATCHED_SOMETHING (reg_info[*p]) = 0;
4905 /* Clear this whenever we change the register activity status. */
4906 set_regs_matched_done = 0;
4908 /* This is the new highest active register. */
4909 highest_active_reg = *p;
4911 /* If nothing was active before, this is the new lowest active
4913 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG)
4914 lowest_active_reg = *p;
4916 /* Move past the register number and inner group count. */
4918 just_past_start_mem = p;
4923 /* The stop_memory opcode represents the end of a group. Its
4924 arguments are the same as start_memory's: the register
4925 number, and the number of inner groups. */
4927 DEBUG_PRINT3 ("EXECUTING stop_memory %d (%d):\n", *p, p[1]);
4929 /* We need to save the string position the last time we were at
4930 this close-group operator in case the group is operated
4931 upon by a repetition operator, e.g., with `((a*)*(b*)*)*'
4932 against `aba'; then we want to ignore where we are now in
4933 the string in case this attempt to match fails. */
4934 old_regend[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p])
4935 ? REG_UNSET (regend[*p]) ? d : regend[*p]
4937 DEBUG_PRINT2 (" old_regend: %d\n",
4938 POINTER_TO_OFFSET (old_regend[*p]));
4941 DEBUG_PRINT2 (" regend: %d\n", POINTER_TO_OFFSET (regend[*p]));
4943 /* This register isn't active anymore. */
4944 IS_ACTIVE (reg_info[*p]) = 0;
4946 /* Clear this whenever we change the register activity status. */
4947 set_regs_matched_done = 0;
4949 /* If this was the only register active, nothing is active
4951 if (lowest_active_reg == highest_active_reg)
4953 lowest_active_reg = NO_LOWEST_ACTIVE_REG;
4954 highest_active_reg = NO_HIGHEST_ACTIVE_REG;
4957 { /* We must scan for the new highest active register, since
4958 it isn't necessarily one less than now: consider
4959 (a(b)c(d(e)f)g). When group 3 ends, after the f), the
4960 new highest active register is 1. */
4961 unsigned char r = *p - 1;
4962 while (r > 0 && !IS_ACTIVE (reg_info[r]))
4965 /* If we end up at register zero, that means that we saved
4966 the registers as the result of an `on_failure_jump', not
4967 a `start_memory', and we jumped to past the innermost
4968 `stop_memory'. For example, in ((.)*) we save
4969 registers 1 and 2 as a result of the *, but when we pop
4970 back to the second ), we are at the stop_memory 1.
4971 Thus, nothing is active. */
4974 lowest_active_reg = NO_LOWEST_ACTIVE_REG;
4975 highest_active_reg = NO_HIGHEST_ACTIVE_REG;
4979 highest_active_reg = r;
4981 /* 98/9/21 jhod: We've also gotta set lowest_active_reg, don't we? */
4983 while (r < highest_active_reg && !IS_ACTIVE(reg_info[r]))
4985 lowest_active_reg = r;
4989 /* If just failed to match something this time around with a
4990 group that's operated on by a repetition operator, try to
4991 force exit from the ``loop'', and restore the register
4992 information for this group that we had before trying this
4994 if ((!MATCHED_SOMETHING (reg_info[*p])
4995 || just_past_start_mem == p - 1)
4998 re_bool is_a_jump_n = false;
5002 switch ((re_opcode_t) *p1++)
5006 case pop_failure_jump:
5007 case maybe_pop_jump:
5009 case dummy_failure_jump:
5010 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
5020 /* If the next operation is a jump backwards in the pattern
5021 to an on_failure_jump right before the start_memory
5022 corresponding to this stop_memory, exit from the loop
5023 by forcing a failure after pushing on the stack the
5024 on_failure_jump's jump in the pattern, and d. */
5025 if (mcnt < 0 && (re_opcode_t) *p1 == on_failure_jump
5026 && (re_opcode_t) p1[3] == start_memory && p1[4] == *p)
5028 /* If this group ever matched anything, then restore
5029 what its registers were before trying this last
5030 failed match, e.g., with `(a*)*b' against `ab' for
5031 regstart[1], and, e.g., with `((a*)*(b*)*)*'
5032 against `aba' for regend[3].
5034 Also restore the registers for inner groups for,
5035 e.g., `((a*)(b*))*' against `aba' (register 3 would
5036 otherwise get trashed). */
5038 if (EVER_MATCHED_SOMETHING (reg_info[*p]))
5042 EVER_MATCHED_SOMETHING (reg_info[*p]) = 0;
5044 /* Restore this and inner groups' (if any) registers. */
5045 for (r = *p; r < *p + *(p + 1); r++)
5047 regstart[r] = old_regstart[r];
5049 /* xx why this test? */
5050 if (old_regend[r] >= regstart[r])
5051 regend[r] = old_regend[r];
5055 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
5056 PUSH_FAILURE_POINT (p1 + mcnt, d, -2);
5062 /* Move past the register number and the inner group count. */
5067 /* \<digit> has been turned into a `duplicate' command which is
5068 followed by the numeric value of <digit> as the register number. */
5071 REGISTER re_char *d2, *dend2;
5072 int regno = *p++; /* Get which register to match against. */
5073 DEBUG_PRINT2 ("EXECUTING duplicate %d.\n", regno);
5075 /* Can't back reference a group which we've never matched. */
5076 if (REG_UNSET (regstart[regno]) || REG_UNSET (regend[regno]))
5079 /* Where in input to try to start matching. */
5080 d2 = regstart[regno];
5082 /* Where to stop matching; if both the place to start and
5083 the place to stop matching are in the same string, then
5084 set to the place to stop, otherwise, for now have to use
5085 the end of the first string. */
5087 dend2 = ((FIRST_STRING_P (regstart[regno])
5088 == FIRST_STRING_P (regend[regno]))
5089 ? regend[regno] : end_match_1);
5092 /* If necessary, advance to next segment in register
5096 if (dend2 == end_match_2) break;
5097 if (dend2 == regend[regno]) break;
5099 /* End of string1 => advance to string2. */
5101 dend2 = regend[regno];
5103 /* At end of register contents => success */
5104 if (d2 == dend2) break;
5106 /* If necessary, advance to next segment in data. */
5109 /* How many characters left in this segment to match. */
5112 /* Want how many consecutive characters we can match in
5113 one shot, so, if necessary, adjust the count. */
5114 if (mcnt > dend2 - d2)
5117 /* Compare that many; failure if mismatch, else move
5119 if (TRANSLATE_P (translate)
5120 ? bcmp_translate ((unsigned char *) d,
5121 (unsigned char *) d2, mcnt, translate)
5122 : memcmp (d, d2, mcnt))
5124 d += mcnt, d2 += mcnt;
5126 /* Do this because we've match some characters. */
5127 SET_REGS_MATCHED ();
5133 /* begline matches the empty string at the beginning of the string
5134 (unless `not_bol' is set in `bufp'), and, if
5135 `newline_anchor' is set, after newlines. */
5137 DEBUG_PRINT1 ("EXECUTING begline.\n");
5139 if (AT_STRINGS_BEG (d))
5141 if (!bufp->not_bol) break;
5143 else if (d[-1] == '\n' && bufp->newline_anchor)
5147 /* In all other cases, we fail. */
5151 /* endline is the dual of begline. */
5153 DEBUG_PRINT1 ("EXECUTING endline.\n");
5155 if (AT_STRINGS_END (d))
5157 if (!bufp->not_eol) break;
5160 /* We have to ``prefetch'' the next character. */
5161 else if ((d == end1 ? *string2 : *d) == '\n'
5162 && bufp->newline_anchor)
5169 /* Match at the very beginning of the data. */
5171 DEBUG_PRINT1 ("EXECUTING begbuf.\n");
5172 if (AT_STRINGS_BEG (d))
5177 /* Match at the very end of the data. */
5179 DEBUG_PRINT1 ("EXECUTING endbuf.\n");
5180 if (AT_STRINGS_END (d))
5185 /* on_failure_keep_string_jump is used to optimize `.*\n'. It
5186 pushes NULL as the value for the string on the stack. Then
5187 `pop_failure_point' will keep the current value for the
5188 string, instead of restoring it. To see why, consider
5189 matching `foo\nbar' against `.*\n'. The .* matches the foo;
5190 then the . fails against the \n. But the next thing we want
5191 to do is match the \n against the \n; if we restored the
5192 string value, we would be back at the foo.
5194 Because this is used only in specific cases, we don't need to
5195 check all the things that `on_failure_jump' does, to make
5196 sure the right things get saved on the stack. Hence we don't
5197 share its code. The only reason to push anything on the
5198 stack at all is that otherwise we would have to change
5199 `anychar's code to do something besides goto fail in this
5200 case; that seems worse than this. */
5201 case on_failure_keep_string_jump:
5202 DEBUG_PRINT1 ("EXECUTING on_failure_keep_string_jump");
5204 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5205 DEBUG_PRINT3 (" %d (to 0x%lx):\n", mcnt, (long) (p + mcnt));
5207 PUSH_FAILURE_POINT (p + mcnt, (unsigned char *) 0, -2);
5211 /* Uses of on_failure_jump:
5213 Each alternative starts with an on_failure_jump that points
5214 to the beginning of the next alternative. Each alternative
5215 except the last ends with a jump that in effect jumps past
5216 the rest of the alternatives. (They really jump to the
5217 ending jump of the following alternative, because tensioning
5218 these jumps is a hassle.)
5220 Repeats start with an on_failure_jump that points past both
5221 the repetition text and either the following jump or
5222 pop_failure_jump back to this on_failure_jump. */
5223 case on_failure_jump:
5225 DEBUG_PRINT1 ("EXECUTING on_failure_jump");
5227 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5228 DEBUG_PRINT3 (" %d (to 0x%lx)", mcnt, (long) (p + mcnt));
5230 /* If this on_failure_jump comes right before a group (i.e.,
5231 the original * applied to a group), save the information
5232 for that group and all inner ones, so that if we fail back
5233 to this point, the group's information will be correct.
5234 For example, in \(a*\)*\1, we need the preceding group,
5235 and in \(\(a*\)b*\)\2, we need the inner group. */
5237 /* We can't use `p' to check ahead because we push
5238 a failure point to `p + mcnt' after we do this. */
5241 /* We need to skip no_op's before we look for the
5242 start_memory in case this on_failure_jump is happening as
5243 the result of a completed succeed_n, as in \(a\)\{1,3\}b\1
5245 while (p1 < pend && (re_opcode_t) *p1 == no_op)
5248 if (p1 < pend && (re_opcode_t) *p1 == start_memory)
5250 /* We have a new highest active register now. This will
5251 get reset at the start_memory we are about to get to,
5252 but we will have saved all the registers relevant to
5253 this repetition op, as described above. */
5254 highest_active_reg = *(p1 + 1) + *(p1 + 2);
5255 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG)
5256 lowest_active_reg = *(p1 + 1);
5259 DEBUG_PRINT1 (":\n");
5260 PUSH_FAILURE_POINT (p + mcnt, d, -2);
5264 /* A smart repeat ends with `maybe_pop_jump'.
5265 We change it to either `pop_failure_jump' or `jump'. */
5266 case maybe_pop_jump:
5267 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5268 DEBUG_PRINT2 ("EXECUTING maybe_pop_jump %d.\n", mcnt);
5270 REGISTER unsigned char *p2 = p;
5272 /* Compare the beginning of the repeat with what in the
5273 pattern follows its end. If we can establish that there
5274 is nothing that they would both match, i.e., that we
5275 would have to backtrack because of (as in, e.g., `a*a')
5276 then we can change to pop_failure_jump, because we'll
5277 never have to backtrack.
5279 This is not true in the case of alternatives: in
5280 `(a|ab)*' we do need to backtrack to the `ab' alternative
5281 (e.g., if the string was `ab'). But instead of trying to
5282 detect that here, the alternative has put on a dummy
5283 failure point which is what we will end up popping. */
5285 /* Skip over open/close-group commands.
5286 If what follows this loop is a ...+ construct,
5287 look at what begins its body, since we will have to
5288 match at least one of that. */
5292 && ((re_opcode_t) *p2 == stop_memory
5293 || (re_opcode_t) *p2 == start_memory))
5295 else if (p2 + 6 < pend
5296 && (re_opcode_t) *p2 == dummy_failure_jump)
5303 /* p1[0] ... p1[2] are the `on_failure_jump' corresponding
5304 to the `maybe_finalize_jump' of this case. Examine what
5307 /* If we're at the end of the pattern, we can change. */
5310 /* Consider what happens when matching ":\(.*\)"
5311 against ":/". I don't really understand this code
5313 p[-3] = (unsigned char) pop_failure_jump;
5315 (" End of pattern: change to `pop_failure_jump'.\n");
5318 else if ((re_opcode_t) *p2 == exactn
5319 || (bufp->newline_anchor && (re_opcode_t) *p2 == endline))
5321 REGISTER unsigned char c
5322 = *p2 == (unsigned char) endline ? '\n' : p2[2];
5324 if ((re_opcode_t) p1[3] == exactn && p1[5] != c)
5326 p[-3] = (unsigned char) pop_failure_jump;
5327 DEBUG_PRINT3 (" %c != %c => pop_failure_jump.\n",
5331 else if ((re_opcode_t) p1[3] == charset
5332 || (re_opcode_t) p1[3] == charset_not)
5334 int not_p = (re_opcode_t) p1[3] == charset_not;
5336 if (c < (unsigned char) (p1[4] * BYTEWIDTH)
5337 && p1[5 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
5340 /* `not_p' is equal to 1 if c would match, which means
5341 that we can't change to pop_failure_jump. */
5344 p[-3] = (unsigned char) pop_failure_jump;
5345 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
5349 else if ((re_opcode_t) *p2 == charset)
5352 REGISTER unsigned char c
5353 = *p2 == (unsigned char) endline ? '\n' : p2[2];
5356 if ((re_opcode_t) p1[3] == exactn
5357 && ! ((int) p2[1] * BYTEWIDTH > (int) p1[5]
5358 && (p2[2 + p1[5] / BYTEWIDTH]
5359 & (1 << (p1[5] % BYTEWIDTH)))))
5361 p[-3] = (unsigned char) pop_failure_jump;
5362 DEBUG_PRINT3 (" %c != %c => pop_failure_jump.\n",
5366 else if ((re_opcode_t) p1[3] == charset_not)
5369 /* We win if the charset_not inside the loop
5370 lists every character listed in the charset after. */
5371 for (idx = 0; idx < (int) p2[1]; idx++)
5372 if (! (p2[2 + idx] == 0
5373 || (idx < (int) p1[4]
5374 && ((p2[2 + idx] & ~ p1[5 + idx]) == 0))))
5379 p[-3] = (unsigned char) pop_failure_jump;
5380 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
5383 else if ((re_opcode_t) p1[3] == charset)
5386 /* We win if the charset inside the loop
5387 has no overlap with the one after the loop. */
5389 idx < (int) p2[1] && idx < (int) p1[4];
5391 if ((p2[2 + idx] & p1[5 + idx]) != 0)
5394 if (idx == p2[1] || idx == p1[4])
5396 p[-3] = (unsigned char) pop_failure_jump;
5397 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
5402 p -= 2; /* Point at relative address again. */
5403 if ((re_opcode_t) p[-1] != pop_failure_jump)
5405 p[-1] = (unsigned char) jump;
5406 DEBUG_PRINT1 (" Match => jump.\n");
5407 goto unconditional_jump;
5409 /* Note fall through. */
5412 /* The end of a simple repeat has a pop_failure_jump back to
5413 its matching on_failure_jump, where the latter will push a
5414 failure point. The pop_failure_jump takes off failure
5415 points put on by this pop_failure_jump's matching
5416 on_failure_jump; we got through the pattern to here from the
5417 matching on_failure_jump, so didn't fail. */
5418 case pop_failure_jump:
5420 /* We need to pass separate storage for the lowest and
5421 highest registers, even though we don't care about the
5422 actual values. Otherwise, we will restore only one
5423 register from the stack, since lowest will == highest in
5424 `pop_failure_point'. */
5425 unsigned dummy_low_reg, dummy_high_reg;
5426 unsigned char *pdummy;
5427 re_char *sdummy = NULL;
5429 DEBUG_PRINT1 ("EXECUTING pop_failure_jump.\n");
5430 POP_FAILURE_POINT (sdummy, pdummy,
5431 dummy_low_reg, dummy_high_reg,
5432 reg_dummy, reg_dummy, reg_info_dummy);
5434 /* Note fall through. */
5437 /* Unconditionally jump (without popping any failure points). */
5440 EXTRACT_NUMBER_AND_INCR (mcnt, p); /* Get the amount to jump. */
5441 DEBUG_PRINT2 ("EXECUTING jump %d ", mcnt);
5442 p += mcnt; /* Do the jump. */
5443 DEBUG_PRINT2 ("(to 0x%lx).\n", (long) p);
5447 /* We need this opcode so we can detect where alternatives end
5448 in `group_match_null_string_p' et al. */
5450 DEBUG_PRINT1 ("EXECUTING jump_past_alt.\n");
5451 goto unconditional_jump;
5454 /* Normally, the on_failure_jump pushes a failure point, which
5455 then gets popped at pop_failure_jump. We will end up at
5456 pop_failure_jump, also, and with a pattern of, say, `a+', we
5457 are skipping over the on_failure_jump, so we have to push
5458 something meaningless for pop_failure_jump to pop. */
5459 case dummy_failure_jump:
5460 DEBUG_PRINT1 ("EXECUTING dummy_failure_jump.\n");
5461 /* It doesn't matter what we push for the string here. What
5462 the code at `fail' tests is the value for the pattern. */
5463 PUSH_FAILURE_POINT ((unsigned char *) 0, (unsigned char *) 0, -2);
5464 goto unconditional_jump;
5467 /* At the end of an alternative, we need to push a dummy failure
5468 point in case we are followed by a `pop_failure_jump', because
5469 we don't want the failure point for the alternative to be
5470 popped. For example, matching `(a|ab)*' against `aab'
5471 requires that we match the `ab' alternative. */
5472 case push_dummy_failure:
5473 DEBUG_PRINT1 ("EXECUTING push_dummy_failure.\n");
5474 /* See comments just above at `dummy_failure_jump' about the
5476 PUSH_FAILURE_POINT ((unsigned char *) 0, (unsigned char *) 0, -2);
5479 /* Have to succeed matching what follows at least n times.
5480 After that, handle like `on_failure_jump'. */
5482 EXTRACT_NUMBER (mcnt, p + 2);
5483 DEBUG_PRINT2 ("EXECUTING succeed_n %d.\n", mcnt);
5486 /* Originally, this is how many times we HAVE to succeed. */
5491 STORE_NUMBER_AND_INCR (p, mcnt);
5492 DEBUG_PRINT3 (" Setting 0x%lx to %d.\n", (long) p, mcnt);
5496 DEBUG_PRINT2 (" Setting two bytes from 0x%lx to no_op.\n",
5498 p[2] = (unsigned char) no_op;
5499 p[3] = (unsigned char) no_op;
5505 EXTRACT_NUMBER (mcnt, p + 2);
5506 DEBUG_PRINT2 ("EXECUTING jump_n %d.\n", mcnt);
5508 /* Originally, this is how many times we CAN jump. */
5512 STORE_NUMBER (p + 2, mcnt);
5513 goto unconditional_jump;
5515 /* If don't have to jump any more, skip over the rest of command. */
5522 DEBUG_PRINT1 ("EXECUTING set_number_at.\n");
5524 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5526 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5527 DEBUG_PRINT3 (" Setting 0x%lx to %d.\n", (long) p1, mcnt);
5528 STORE_NUMBER (p1, mcnt);
5533 DEBUG_PRINT1 ("EXECUTING wordbound.\n");
5539 if (AT_STRINGS_BEG (d) || AT_STRINGS_END (d))
5543 re_char *d_before = POS_BEFORE_GAP_UNSAFE (d);
5544 re_char *d_after = POS_AFTER_GAP_UNSAFE (d);
5546 /* emch1 is the character before d, syn1 is the syntax of emch1,
5547 emch2 is the character at d, and syn2 is the syntax of emch2. */
5548 Emchar emch1, emch2;
5554 DEC_CHARPTR (d_before);
5555 emch1 = charptr_emchar (d_before);
5556 emch2 = charptr_emchar (d_after);
5559 pos_before = SYNTAX_CACHE_BYTE_TO_CHAR (PTR_TO_OFFSET (d)) - 1;
5560 UPDATE_SYNTAX_CACHE (pos_before);
5562 syn1 = SYNTAX_FROM_CACHE (XCHAR_TABLE (regex_emacs_buffer->mirror_syntax_table),
5565 UPDATE_SYNTAX_CACHE_FORWARD (pos_before + 1);
5567 syn2 = SYNTAX_FROM_CACHE (XCHAR_TABLE (regex_emacs_buffer->mirror_syntax_table),
5570 result = ((syn1 == Sword) != (syn2 == Sword));
5572 if (result == should_succeed)
5578 DEBUG_PRINT1 ("EXECUTING notwordbound.\n");
5580 goto matchwordbound;
5583 DEBUG_PRINT1 ("EXECUTING wordbeg.\n");
5584 if (AT_STRINGS_END (d))
5587 /* XEmacs: this originally read:
5589 if (WORDCHAR_P (d) && (AT_STRINGS_BEG (d) || !WORDCHAR_P (d - 1)))
5593 re_char *dtmp = POS_AFTER_GAP_UNSAFE (d);
5594 Emchar emch = charptr_emchar (dtmp);
5596 int charpos = SYNTAX_CACHE_BYTE_TO_CHAR (PTR_TO_OFFSET (d));
5597 UPDATE_SYNTAX_CACHE (charpos);
5599 if (SYNTAX_FROM_CACHE (XCHAR_TABLE (regex_emacs_buffer->mirror_syntax_table),
5602 if (AT_STRINGS_BEG (d))
5604 dtmp = POS_BEFORE_GAP_UNSAFE (d);
5606 emch = charptr_emchar (dtmp);
5608 UPDATE_SYNTAX_CACHE_BACKWARD (charpos - 1);
5610 if (SYNTAX_FROM_CACHE (XCHAR_TABLE (regex_emacs_buffer->mirror_syntax_table),
5617 DEBUG_PRINT1 ("EXECUTING wordend.\n");
5618 if (AT_STRINGS_BEG (d))
5621 /* XEmacs: this originally read:
5623 if (!AT_STRINGS_BEG (d) && WORDCHAR_P (d - 1)
5624 && (!WORDCHAR_P (d) || AT_STRINGS_END (d)))
5627 The or condition is incorrect (reversed).
5632 int charpos = SYNTAX_CACHE_BYTE_TO_CHAR (PTR_TO_OFFSET (d)) - 1;
5633 UPDATE_SYNTAX_CACHE (charpos);
5635 dtmp = POS_BEFORE_GAP_UNSAFE (d);
5637 emch = charptr_emchar (dtmp);
5638 if (SYNTAX_FROM_CACHE (XCHAR_TABLE (regex_emacs_buffer->mirror_syntax_table),
5641 if (AT_STRINGS_END (d))
5643 dtmp = POS_AFTER_GAP_UNSAFE (d);
5644 emch = charptr_emchar (dtmp);
5646 UPDATE_SYNTAX_CACHE_FORWARD (charpos + 1);
5648 if (SYNTAX_FROM_CACHE (XCHAR_TABLE (regex_emacs_buffer->mirror_syntax_table),
5656 DEBUG_PRINT1 ("EXECUTING before_dot.\n");
5657 if (! (NILP (regex_match_object) || BUFFERP (regex_match_object))
5658 || (BUF_PTR_BYTE_POS (regex_emacs_buffer, (unsigned char *) d)
5659 >= BUF_PT (regex_emacs_buffer)))
5664 DEBUG_PRINT1 ("EXECUTING at_dot.\n");
5665 if (! (NILP (regex_match_object) || BUFFERP (regex_match_object))
5666 || (BUF_PTR_BYTE_POS (regex_emacs_buffer, (unsigned char *) d)
5667 != BUF_PT (regex_emacs_buffer)))
5672 DEBUG_PRINT1 ("EXECUTING after_dot.\n");
5673 if (! (NILP (regex_match_object) || BUFFERP (regex_match_object))
5674 || (BUF_PTR_BYTE_POS (regex_emacs_buffer, (unsigned char *) d)
5675 <= BUF_PT (regex_emacs_buffer)))
5678 #if 0 /* not emacs19 */
5680 DEBUG_PRINT1 ("EXECUTING at_dot.\n");
5681 if (BUF_PTR_BYTE_POS (regex_emacs_buffer, (unsigned char *) d) + 1
5682 != BUF_PT (regex_emacs_buffer))
5685 #endif /* not emacs19 */
5688 DEBUG_PRINT2 ("EXECUTING syntaxspec %d.\n", mcnt);
5693 DEBUG_PRINT1 ("EXECUTING Emacs wordchar.\n");
5705 int charpos = SYNTAX_CACHE_BYTE_TO_CHAR (PTR_TO_OFFSET (d));
5706 UPDATE_SYNTAX_CACHE (charpos);
5710 emch = charptr_emchar ((const Bufbyte *) d);
5711 matches = (SYNTAX_FROM_CACHE (regex_emacs_buffer->mirror_syntax_table,
5712 emch) == (enum syntaxcode) mcnt);
5714 if (matches != should_succeed)
5716 SET_REGS_MATCHED ();
5721 DEBUG_PRINT2 ("EXECUTING notsyntaxspec %d.\n", mcnt);
5723 goto matchnotsyntax;
5726 DEBUG_PRINT1 ("EXECUTING Emacs notwordchar.\n");
5730 goto matchornotsyntax;
5733 /* 97/2/17 jhod Mule category code patch */
5742 emch = charptr_emchar ((const Bufbyte *) d);
5744 if (check_category_char(emch, regex_emacs_buffer->category_table,
5745 mcnt, should_succeed))
5747 SET_REGS_MATCHED ();
5751 case notcategoryspec:
5753 goto matchornotcategory;
5754 /* end of category patch */
5756 #else /* not emacs */
5758 DEBUG_PRINT1 ("EXECUTING non-Emacs wordchar.\n");
5760 if (!WORDCHAR_P_UNSAFE ((int) (*d)))
5762 SET_REGS_MATCHED ();
5767 DEBUG_PRINT1 ("EXECUTING non-Emacs notwordchar.\n");
5769 if (!WORDCHAR_P_UNSAFE ((int) (*d)))
5771 SET_REGS_MATCHED ();
5779 continue; /* Successfully executed one pattern command; keep going. */
5782 /* We goto here if a matching operation fails. */
5784 if (!FAIL_STACK_EMPTY ())
5785 { /* A restart point is known. Restore to that state. */
5786 DEBUG_PRINT1 ("\nFAIL:\n");
5787 POP_FAILURE_POINT (d, p,
5788 lowest_active_reg, highest_active_reg,
5789 regstart, regend, reg_info);
5791 /* If this failure point is a dummy, try the next one. */
5795 /* If we failed to the end of the pattern, don't examine *p. */
5799 re_bool is_a_jump_n = false;
5801 /* If failed to a backwards jump that's part of a repetition
5802 loop, need to pop this failure point and use the next one. */
5803 switch ((re_opcode_t) *p)
5807 case maybe_pop_jump:
5808 case pop_failure_jump:
5811 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
5814 if ((is_a_jump_n && (re_opcode_t) *p1 == succeed_n)
5816 && (re_opcode_t) *p1 == on_failure_jump))
5824 if (d >= string1 && d <= end1)
5828 break; /* Matching at this starting point really fails. */
5832 goto restore_best_regs;
5836 return -1; /* Failure to match. */
5839 /* Subroutine definitions for re_match_2. */
5842 /* We are passed P pointing to a register number after a start_memory.
5844 Return true if the pattern up to the corresponding stop_memory can
5845 match the empty string, and false otherwise.
5847 If we find the matching stop_memory, sets P to point to one past its number.
5848 Otherwise, sets P to an undefined byte less than or equal to END.
5850 We don't handle duplicates properly (yet). */
5853 group_match_null_string_p (unsigned char **p, unsigned char *end,
5854 register_info_type *reg_info)
5857 /* Point to after the args to the start_memory. */
5858 unsigned char *p1 = *p + 2;
5862 /* Skip over opcodes that can match nothing, and return true or
5863 false, as appropriate, when we get to one that can't, or to the
5864 matching stop_memory. */
5866 switch ((re_opcode_t) *p1)
5868 /* Could be either a loop or a series of alternatives. */
5869 case on_failure_jump:
5871 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
5873 /* If the next operation is not a jump backwards in the
5878 /* Go through the on_failure_jumps of the alternatives,
5879 seeing if any of the alternatives cannot match nothing.
5880 The last alternative starts with only a jump,
5881 whereas the rest start with on_failure_jump and end
5882 with a jump, e.g., here is the pattern for `a|b|c':
5884 /on_failure_jump/0/6/exactn/1/a/jump_past_alt/0/6
5885 /on_failure_jump/0/6/exactn/1/b/jump_past_alt/0/3
5888 So, we have to first go through the first (n-1)
5889 alternatives and then deal with the last one separately. */
5892 /* Deal with the first (n-1) alternatives, which start
5893 with an on_failure_jump (see above) that jumps to right
5894 past a jump_past_alt. */
5896 while ((re_opcode_t) p1[mcnt-3] == jump_past_alt)
5898 /* `mcnt' holds how many bytes long the alternative
5899 is, including the ending `jump_past_alt' and
5902 if (!alt_match_null_string_p (p1, p1 + mcnt - 3,
5906 /* Move to right after this alternative, including the
5910 /* Break if it's the beginning of an n-th alternative
5911 that doesn't begin with an on_failure_jump. */
5912 if ((re_opcode_t) *p1 != on_failure_jump)
5915 /* Still have to check that it's not an n-th
5916 alternative that starts with an on_failure_jump. */
5918 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
5919 if ((re_opcode_t) p1[mcnt-3] != jump_past_alt)
5921 /* Get to the beginning of the n-th alternative. */
5927 /* Deal with the last alternative: go back and get number
5928 of the `jump_past_alt' just before it. `mcnt' contains
5929 the length of the alternative. */
5930 EXTRACT_NUMBER (mcnt, p1 - 2);
5932 if (!alt_match_null_string_p (p1, p1 + mcnt, reg_info))
5935 p1 += mcnt; /* Get past the n-th alternative. */
5941 assert (p1[1] == **p);
5947 if (!common_op_match_null_string_p (&p1, end, reg_info))
5950 } /* while p1 < end */
5953 } /* group_match_null_string_p */
5956 /* Similar to group_match_null_string_p, but doesn't deal with alternatives:
5957 It expects P to be the first byte of a single alternative and END one
5958 byte past the last. The alternative can contain groups. */
5961 alt_match_null_string_p (unsigned char *p, unsigned char *end,
5962 register_info_type *reg_info)
5965 unsigned char *p1 = p;
5969 /* Skip over opcodes that can match nothing, and break when we get
5970 to one that can't. */
5972 switch ((re_opcode_t) *p1)
5975 case on_failure_jump:
5977 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
5982 if (!common_op_match_null_string_p (&p1, end, reg_info))
5985 } /* while p1 < end */
5988 } /* alt_match_null_string_p */
5991 /* Deals with the ops common to group_match_null_string_p and
5992 alt_match_null_string_p.
5994 Sets P to one after the op and its arguments, if any. */
5997 common_op_match_null_string_p (unsigned char **p, unsigned char *end,
5998 register_info_type *reg_info)
6003 unsigned char *p1 = *p;
6005 switch ((re_opcode_t) *p1++)
6025 assert (reg_no > 0 && reg_no <= MAX_REGNUM);
6026 ret = group_match_null_string_p (&p1, end, reg_info);
6028 /* Have to set this here in case we're checking a group which
6029 contains a group and a back reference to it. */
6031 if (REG_MATCH_NULL_STRING_P (reg_info[reg_no]) == MATCH_NULL_UNSET_VALUE)
6032 REG_MATCH_NULL_STRING_P (reg_info[reg_no]) = ret;
6038 /* If this is an optimized succeed_n for zero times, make the jump. */
6040 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
6048 /* Get to the number of times to succeed. */
6050 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
6055 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
6063 if (!REG_MATCH_NULL_STRING_P (reg_info[*p1]))
6071 /* All other opcodes mean we cannot match the empty string. */
6077 } /* common_op_match_null_string_p */
6080 /* Return zero if TRANSLATE[S1] and TRANSLATE[S2] are identical for LEN
6081 bytes; nonzero otherwise. */
6084 bcmp_translate (re_char *s1, re_char *s2,
6085 REGISTER int len, RE_TRANSLATE_TYPE translate)
6087 REGISTER const unsigned char *p1 = s1, *p2 = s2;
6089 const unsigned char *p1_end = s1 + len;
6090 const unsigned char *p2_end = s2 + len;
6092 while (p1 != p1_end && p2 != p2_end)
6094 Emchar p1_ch, p2_ch;
6096 p1_ch = charptr_emchar (p1);
6097 p2_ch = charptr_emchar (p2);
6099 if (RE_TRANSLATE (p1_ch)
6100 != RE_TRANSLATE (p2_ch))
6105 #else /* not MULE */
6108 if (RE_TRANSLATE (*p1++) != RE_TRANSLATE (*p2++)) return 1;
6115 /* Entry points for GNU code. */
6117 /* re_compile_pattern is the GNU regular expression compiler: it
6118 compiles PATTERN (of length SIZE) and puts the result in BUFP.
6119 Returns 0 if the pattern was valid, otherwise an error string.
6121 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
6122 are set in BUFP on entry.
6124 We call regex_compile to do the actual compilation. */
6127 re_compile_pattern (const char *pattern, int length,
6128 struct re_pattern_buffer *bufp)
6132 /* GNU code is written to assume at least RE_NREGS registers will be set
6133 (and at least one extra will be -1). */
6134 bufp->regs_allocated = REGS_UNALLOCATED;
6136 /* And GNU code determines whether or not to get register information
6137 by passing null for the REGS argument to re_match, etc., not by
6141 /* Match anchors at newline. */
6142 bufp->newline_anchor = 1;
6144 ret = regex_compile ((unsigned char *) pattern, length, re_syntax_options, bufp);
6148 return gettext (re_error_msgid[(int) ret]);
6151 /* Entry points compatible with 4.2 BSD regex library. We don't define
6152 them unless specifically requested. */
6154 #ifdef _REGEX_RE_COMP
6156 /* BSD has one and only one pattern buffer. */
6157 static struct re_pattern_buffer re_comp_buf;
6160 re_comp (const char *s)
6166 if (!re_comp_buf.buffer)
6167 return gettext ("No previous regular expression");
6171 if (!re_comp_buf.buffer)
6173 re_comp_buf.buffer = (unsigned char *) malloc (200);
6174 if (re_comp_buf.buffer == NULL)
6175 return gettext (re_error_msgid[(int) REG_ESPACE]);
6176 re_comp_buf.allocated = 200;
6178 re_comp_buf.fastmap = (char *) malloc (1 << BYTEWIDTH);
6179 if (re_comp_buf.fastmap == NULL)
6180 return gettext (re_error_msgid[(int) REG_ESPACE]);
6183 /* Since `re_exec' always passes NULL for the `regs' argument, we
6184 don't need to initialize the pattern buffer fields which affect it. */
6186 /* Match anchors at newlines. */
6187 re_comp_buf.newline_anchor = 1;
6189 ret = regex_compile ((unsigned char *)s, strlen (s), re_syntax_options, &re_comp_buf);
6194 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
6195 return (char *) gettext (re_error_msgid[(int) ret]);
6200 re_exec (const char *s)
6202 const int len = strlen (s);
6204 0 <= re_search (&re_comp_buf, s, len, 0, len, (struct re_registers *) 0);
6206 #endif /* _REGEX_RE_COMP */
6208 /* POSIX.2 functions. Don't define these for Emacs. */
6212 /* regcomp takes a regular expression as a string and compiles it.
6214 PREG is a regex_t *. We do not expect any fields to be initialized,
6215 since POSIX says we shouldn't. Thus, we set
6217 `buffer' to the compiled pattern;
6218 `used' to the length of the compiled pattern;
6219 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
6220 REG_EXTENDED bit in CFLAGS is set; otherwise, to
6221 RE_SYNTAX_POSIX_BASIC;
6222 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
6223 `fastmap' and `fastmap_accurate' to zero;
6224 `re_nsub' to the number of subexpressions in PATTERN.
6226 PATTERN is the address of the pattern string.
6228 CFLAGS is a series of bits which affect compilation.
6230 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
6231 use POSIX basic syntax.
6233 If REG_NEWLINE is set, then . and [^...] don't match newline.
6234 Also, regexec will try a match beginning after every newline.
6236 If REG_ICASE is set, then we considers upper- and lowercase
6237 versions of letters to be equivalent when matching.
6239 If REG_NOSUB is set, then when PREG is passed to regexec, that
6240 routine will report only success or failure, and nothing about the
6243 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
6244 the return codes and their meanings.) */
6247 regcomp (regex_t *preg, const char *pattern, int cflags)
6251 = (cflags & REG_EXTENDED) ?
6252 RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC;
6254 /* regex_compile will allocate the space for the compiled pattern. */
6256 preg->allocated = 0;
6259 /* Don't bother to use a fastmap when searching. This simplifies the
6260 REG_NEWLINE case: if we used a fastmap, we'd have to put all the
6261 characters after newlines into the fastmap. This way, we just try
6265 if (cflags & REG_ICASE)
6269 preg->translate = (char *) malloc (CHAR_SET_SIZE);
6270 if (preg->translate == NULL)
6271 return (int) REG_ESPACE;
6273 /* Map uppercase characters to corresponding lowercase ones. */
6274 for (i = 0; i < CHAR_SET_SIZE; i++)
6275 preg->translate[i] = ISUPPER (i) ? tolower (i) : i;
6278 preg->translate = NULL;
6280 /* If REG_NEWLINE is set, newlines are treated differently. */
6281 if (cflags & REG_NEWLINE)
6282 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
6283 syntax &= ~RE_DOT_NEWLINE;
6284 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
6285 /* It also changes the matching behavior. */
6286 preg->newline_anchor = 1;
6289 preg->newline_anchor = 0;
6291 preg->no_sub = !!(cflags & REG_NOSUB);
6293 /* POSIX says a null character in the pattern terminates it, so we
6294 can use strlen here in compiling the pattern. */
6295 ret = regex_compile ((unsigned char *) pattern, strlen (pattern), syntax, preg);
6297 /* POSIX doesn't distinguish between an unmatched open-group and an
6298 unmatched close-group: both are REG_EPAREN. */
6299 if (ret == REG_ERPAREN) ret = REG_EPAREN;
6305 /* regexec searches for a given pattern, specified by PREG, in the
6308 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
6309 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
6310 least NMATCH elements, and we set them to the offsets of the
6311 corresponding matched substrings.
6313 EFLAGS specifies `execution flags' which affect matching: if
6314 REG_NOTBOL is set, then ^ does not match at the beginning of the
6315 string; if REG_NOTEOL is set, then $ does not match at the end.
6317 We return 0 if we find a match and REG_NOMATCH if not. */
6320 regexec (const regex_t *preg, const char *string, size_t nmatch,
6321 regmatch_t pmatch[], int eflags)
6324 struct re_registers regs;
6325 regex_t private_preg;
6326 int len = strlen (string);
6327 re_bool want_reg_info = !preg->no_sub && nmatch > 0;
6329 private_preg = *preg;
6331 private_preg.not_bol = !!(eflags & REG_NOTBOL);
6332 private_preg.not_eol = !!(eflags & REG_NOTEOL);
6334 /* The user has told us exactly how many registers to return
6335 information about, via `nmatch'. We have to pass that on to the
6336 matching routines. */
6337 private_preg.regs_allocated = REGS_FIXED;
6341 regs.num_regs = nmatch;
6342 regs.start = TALLOC (nmatch, regoff_t);
6343 regs.end = TALLOC (nmatch, regoff_t);
6344 if (regs.start == NULL || regs.end == NULL)
6345 return (int) REG_NOMATCH;
6348 /* Perform the searching operation. */
6349 ret = re_search (&private_preg, string, len,
6350 /* start: */ 0, /* range: */ len,
6351 want_reg_info ? ®s : (struct re_registers *) 0);
6353 /* Copy the register information to the POSIX structure. */
6360 for (r = 0; r < nmatch; r++)
6362 pmatch[r].rm_so = regs.start[r];
6363 pmatch[r].rm_eo = regs.end[r];
6367 /* If we needed the temporary register info, free the space now. */
6372 /* We want zero return to mean success, unlike `re_search'. */
6373 return ret >= 0 ? (int) REG_NOERROR : (int) REG_NOMATCH;
6377 /* Returns a message corresponding to an error code, ERRCODE, returned
6378 from either regcomp or regexec. We don't use PREG here. */
6381 regerror (int errcode, const regex_t *preg, char *errbuf, size_t errbuf_size)
6387 || errcode >= (sizeof (re_error_msgid) / sizeof (re_error_msgid[0])))
6388 /* Only error codes returned by the rest of the code should be passed
6389 to this routine. If we are given anything else, or if other regex
6390 code generates an invalid error code, then the program has a bug.
6391 Dump core so we can fix it. */
6394 msg = gettext (re_error_msgid[errcode]);
6396 msg_size = strlen (msg) + 1; /* Includes the null. */
6398 if (errbuf_size != 0)
6400 if (msg_size > errbuf_size)
6402 strncpy (errbuf, msg, errbuf_size - 1);
6403 errbuf[errbuf_size - 1] = 0;
6406 strcpy (errbuf, msg);
6413 /* Free dynamically allocated space used by PREG. */
6416 regfree (regex_t *preg)
6418 if (preg->buffer != NULL)
6419 free (preg->buffer);
6420 preg->buffer = NULL;
6422 preg->allocated = 0;
6425 if (preg->fastmap != NULL)
6426 free (preg->fastmap);
6427 preg->fastmap = NULL;
6428 preg->fastmap_accurate = 0;
6430 if (preg->translate != NULL)
6431 free (preg->translate);
6432 preg->translate = NULL;
6435 #endif /* not emacs */
6439 make-backup-files: t
6441 trim-versions-without-asking: nil