1 /* Extended regular expression matching and search library,
2 version 0.12, extended for XEmacs.
3 (Implements POSIX draft P10003.2/D11.2, except for
4 internationalization features.)
6 Copyright (C) 1993, 1994, 1995 Free Software Foundation, Inc.
7 Copyright (C) 1995 Sun Microsystems, Inc.
8 Copyright (C) 1995 Ben Wing.
10 This program is free software; you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 2, or (at your option)
15 This program is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License for more details.
20 You should have received a copy of the GNU General Public License
21 along with this program; see the file COPYING. If not, write to
22 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 Boston, MA 02111-1307, USA. */
25 /* Synched up with: FSF 19.29. */
27 /* Changes made for XEmacs:
29 (1) the REGEX_BEGLINE_CHECK code from the XEmacs v18 regex routines
30 was added. This causes a huge speedup in font-locking.
31 (2) Rel-alloc is disabled when the MMAP version of rel-alloc is
32 being used, because it's too slow -- all those calls to mmap()
33 add humongous overhead.
34 (3) Lots and lots of changes for Mule. They are bracketed by
35 `#ifdef MULE' or with comments that have `XEmacs' in them.
42 #ifndef REGISTER /* Rigidly enforced as of 20.3 */
51 /* Converts the pointer to the char to BEG-based offset from the start. */
52 #define PTR_TO_OFFSET(d) (MATCHING_IN_FIRST_STRING \
53 ? (d) - string1 : (d) - (string2 - size1))
55 #define PTR_TO_OFFSET(d) 0
58 /* We assume non-Mule if emacs isn't defined. */
63 /* We need this for `regex.h', and perhaps for the Emacs include files. */
64 #include <sys/types.h>
66 /* This is for other GNU distributions with internationalized messages. */
67 #if defined (I18N3) && (defined (HAVE_LIBINTL_H) || defined (_LIBC))
70 # define gettext(msgid) (msgid)
73 /* XEmacs: define this to add in a speedup for patterns anchored at
74 the beginning of a line. Keep the ifdefs so that it's easier to
75 tell where/why this code has diverged from v19. */
76 #define REGEX_BEGLINE_CHECK
78 /* XEmacs: the current mmap-based ralloc handles small blocks very
79 poorly, so we disable it here. */
81 #if (defined (REL_ALLOC) && defined (HAVE_MMAP)) || defined(DOUG_LEA_MALLOC)
85 /* The `emacs' switch turns on certain matching commands
86 that make sense only in Emacs. */
93 #if (defined (DEBUG_XEMACS) && !defined (DEBUG))
99 Lisp_Object Vthe_lisp_rangetab;
102 complex_vars_of_regex (void)
104 Vthe_lisp_rangetab = Fmake_range_table ();
105 staticpro (&Vthe_lisp_rangetab);
111 complex_vars_of_regex (void)
117 #define RE_TRANSLATE(ch) TRT_TABLE_OF (translate, (Emchar) ch)
118 #define TRANSLATE_P(tr) (!NILP (tr))
120 #else /* not emacs */
124 /* If we are not linking with Emacs proper,
125 we can't use the relocating allocator
126 even if config.h says that we can. */
129 #if defined (STDC_HEADERS) || defined (_LIBC)
136 /* Types normally included via lisp.h */
137 #include <stddef.h> /* for ptrdiff_t */
140 #ifndef DECLARE_NOTHING
141 #define DECLARE_NOTHING struct nosuchstruct
147 #define charptr_emchar(str) ((Emchar) (str)[0])
149 #define INC_CHARPTR(p) ((p)++)
150 #define DEC_CHARPTR(p) ((p)--)
154 /* Define the syntax stuff for \<, \>, etc. */
156 /* This must be nonzero for the wordchar and notwordchar pattern
157 commands in re_match_2. */
164 extern char *re_syntax_table;
166 #else /* not SYNTAX_TABLE */
168 /* How many characters in the character set. */
169 #define CHAR_SET_SIZE 256
171 static char re_syntax_table[CHAR_SET_SIZE];
174 init_syntax_once (void)
180 const char *word_syntax_chars =
181 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789_";
183 memset (re_syntax_table, 0, sizeof (re_syntax_table));
185 while (*word_syntax_chars)
186 re_syntax_table[(unsigned int)(*word_syntax_chars++)] = Sword;
192 #endif /* SYNTAX_TABLE */
194 #define SYNTAX_UNSAFE(ignored, c) re_syntax_table[c]
195 #undef SYNTAX_FROM_CACHE
196 #define SYNTAX_FROM_CACHE SYNTAX_UNSAFE
198 #define RE_TRANSLATE(c) translate[(unsigned char) (c)]
199 #define TRANSLATE_P(tr) tr
203 /* Under XEmacs, this is needed because we don't define it elsewhere. */
204 #ifdef SWITCH_ENUM_BUG
205 #define SWITCH_ENUM_CAST(x) ((int)(x))
207 #define SWITCH_ENUM_CAST(x) (x)
211 /* Get the interface, including the syntax bits. */
214 /* isalpha etc. are used for the character classes. */
217 /* Jim Meyering writes:
219 "... Some ctype macros are valid only for character codes that
220 isascii says are ASCII (SGI's IRIX-4.0.5 is one such system --when
221 using /bin/cc or gcc but without giving an ansi option). So, all
222 ctype uses should be through macros like ISPRINT... If
223 STDC_HEADERS is defined, then autoconf has verified that the ctype
224 macros don't need to be guarded with references to isascii. ...
225 Defining isascii to 1 should let any compiler worth its salt
226 eliminate the && through constant folding." */
228 #if defined (STDC_HEADERS) || (!defined (isascii) && !defined (HAVE_ISASCII))
229 #define ISASCII_1(c) 1
231 #define ISASCII_1(c) isascii(c)
235 /* The IS*() macros can be passed any character, including an extended
236 one. We need to make sure there are no crashes, which would occur
237 otherwise due to out-of-bounds array references. */
238 #define ISASCII(c) (((EMACS_UINT) (c)) < 0x100 && ISASCII_1 (c))
240 #define ISASCII(c) ISASCII_1 (c)
244 #define ISBLANK(c) (ISASCII (c) && isblank (c))
246 #define ISBLANK(c) ((c) == ' ' || (c) == '\t')
249 #define ISGRAPH(c) (ISASCII (c) && isgraph (c))
251 #define ISGRAPH(c) (ISASCII (c) && isprint (c) && !isspace (c))
254 #define ISPRINT(c) (ISASCII (c) && isprint (c))
255 #define ISDIGIT(c) (ISASCII (c) && isdigit (c))
256 #define ISALNUM(c) (ISASCII (c) && isalnum (c))
257 #define ISALPHA(c) (ISASCII (c) && isalpha (c))
258 #define ISCNTRL(c) (ISASCII (c) && iscntrl (c))
259 #define ISLOWER(c) (ISASCII (c) && islower (c))
260 #define ISPUNCT(c) (ISASCII (c) && ispunct (c))
261 #define ISSPACE(c) (ISASCII (c) && isspace (c))
262 #define ISUPPER(c) (ISASCII (c) && isupper (c))
263 #define ISXDIGIT(c) (ISASCII (c) && isxdigit (c))
266 #define NULL (void *)0
269 /* We remove any previous definition of `SIGN_EXTEND_CHAR',
270 since ours (we hope) works properly with all combinations of
271 machines, compilers, `char' and `unsigned char' argument types.
272 (Per Bothner suggested the basic approach.) */
273 #undef SIGN_EXTEND_CHAR
275 #define SIGN_EXTEND_CHAR(c) ((signed char) (c))
276 #else /* not __STDC__ */
277 /* As in Harbison and Steele. */
278 #define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128)
281 /* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we
282 use `alloca' instead of `malloc'. This is because using malloc in
283 re_search* or re_match* could cause memory leaks when C-g is used in
284 Emacs; also, malloc is slower and causes storage fragmentation. On
285 the other hand, malloc is more portable, and easier to debug.
287 Because we sometimes use alloca, some routines have to be macros,
288 not functions -- `alloca'-allocated space disappears at the end of the
289 function it is called in. */
293 #define REGEX_ALLOCATE malloc
294 #define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize)
295 #define REGEX_FREE free
297 #else /* not REGEX_MALLOC */
299 /* Emacs already defines alloca, sometimes. */
302 /* Make alloca work the best possible way. */
304 #define alloca __builtin_alloca
305 #else /* not __GNUC__ */
308 #else /* not __GNUC__ or HAVE_ALLOCA_H */
309 #ifndef _AIX /* Already did AIX, up at the top. */
311 #endif /* not _AIX */
312 #endif /* HAVE_ALLOCA_H */
313 #endif /* __GNUC__ */
315 #endif /* not alloca */
317 #define REGEX_ALLOCATE alloca
319 /* Assumes a `char *destination' variable. */
320 #define REGEX_REALLOCATE(source, osize, nsize) \
321 (destination = (char *) alloca (nsize), \
322 memmove (destination, source, osize), \
325 /* No need to do anything to free, after alloca. */
326 #define REGEX_FREE(arg) ((void)0) /* Do nothing! But inhibit gcc warning. */
328 #endif /* REGEX_MALLOC */
330 /* Define how to allocate the failure stack. */
333 #define REGEX_ALLOCATE_STACK(size) \
334 r_alloc ((char **) &failure_stack_ptr, (size))
335 #define REGEX_REALLOCATE_STACK(source, osize, nsize) \
336 r_re_alloc ((char **) &failure_stack_ptr, (nsize))
337 #define REGEX_FREE_STACK(ptr) \
338 r_alloc_free ((void **) &failure_stack_ptr)
340 #else /* not REL_ALLOC */
344 #define REGEX_ALLOCATE_STACK malloc
345 #define REGEX_REALLOCATE_STACK(source, osize, nsize) realloc (source, nsize)
346 #define REGEX_FREE_STACK free
348 #else /* not REGEX_MALLOC */
350 #define REGEX_ALLOCATE_STACK alloca
352 #define REGEX_REALLOCATE_STACK(source, osize, nsize) \
353 REGEX_REALLOCATE (source, osize, nsize)
354 /* No need to explicitly free anything. */
355 #define REGEX_FREE_STACK(arg)
357 #endif /* REGEX_MALLOC */
358 #endif /* REL_ALLOC */
361 /* True if `size1' is non-NULL and PTR is pointing anywhere inside
362 `string1' or just past its end. This works if PTR is NULL, which is
364 #define FIRST_STRING_P(ptr) \
365 (size1 && string1 <= (ptr) && (ptr) <= string1 + size1)
367 /* (Re)Allocate N items of type T using malloc, or fail. */
368 #define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t)))
369 #define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, (n) * sizeof (t)))
370 #define RETALLOC_IF(addr, n, t) \
371 if (addr) RETALLOC((addr), (n), t); else (addr) = TALLOC ((n), t)
372 #define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t)))
374 #define BYTEWIDTH 8 /* In bits. */
376 #define STREQ(s1, s2) (strcmp (s1, s2) == 0)
380 #define MAX(a, b) ((a) > (b) ? (a) : (b))
381 #define MIN(a, b) ((a) < (b) ? (a) : (b))
383 /* Type of source-pattern and string chars. */
384 typedef const unsigned char re_char;
386 typedef char re_bool;
391 /* These are the command codes that appear in compiled regular
392 expressions. Some opcodes are followed by argument bytes. A
393 command code can specify any interpretation whatsoever for its
394 arguments. Zero bytes may appear in the compiled regular expression. */
400 /* Succeed right away--no more backtracking. */
403 /* Followed by one byte giving n, then by n literal bytes. */
406 /* Matches any (more or less) character. */
409 /* Matches any one char belonging to specified set. First
410 following byte is number of bitmap bytes. Then come bytes
411 for a bitmap saying which chars are in. Bits in each byte
412 are ordered low-bit-first. A character is in the set if its
413 bit is 1. A character too large to have a bit in the map is
414 automatically not in the set. */
417 /* Same parameters as charset, but match any character that is
418 not one of those specified. */
421 /* Start remembering the text that is matched, for storing in a
422 register. Followed by one byte with the register number, in
423 the range 1 to the pattern buffer's re_ngroups
424 field. Then followed by one byte with the number of groups
425 inner to this one. (This last has to be part of the
426 start_memory only because we need it in the on_failure_jump
430 /* Stop remembering the text that is matched and store it in a
431 memory register. Followed by one byte with the register
432 number, in the range 1 to `re_ngroups' in the
433 pattern buffer, and one byte with the number of inner groups,
434 just like `start_memory'. (We need the number of inner
435 groups here because we don't have any easy way of finding the
436 corresponding start_memory when we're at a stop_memory.) */
439 /* Match a duplicate of something remembered. Followed by one
440 byte containing the register number. */
443 /* Fail unless at beginning of line. */
446 /* Fail unless at end of line. */
449 /* Succeeds if at beginning of buffer (if emacs) or at beginning
450 of string to be matched (if not). */
453 /* Analogously, for end of buffer/string. */
456 /* Followed by two byte relative address to which to jump. */
459 /* Same as jump, but marks the end of an alternative. */
462 /* Followed by two-byte relative address of place to resume at
463 in case of failure. */
466 /* Like on_failure_jump, but pushes a placeholder instead of the
467 current string position when executed. */
468 on_failure_keep_string_jump,
470 /* Throw away latest failure point and then jump to following
471 two-byte relative address. */
474 /* Change to pop_failure_jump if know won't have to backtrack to
475 match; otherwise change to jump. This is used to jump
476 back to the beginning of a repeat. If what follows this jump
477 clearly won't match what the repeat does, such that we can be
478 sure that there is no use backtracking out of repetitions
479 already matched, then we change it to a pop_failure_jump.
480 Followed by two-byte address. */
483 /* Jump to following two-byte address, and push a dummy failure
484 point. This failure point will be thrown away if an attempt
485 is made to use it for a failure. A `+' construct makes this
486 before the first repeat. Also used as an intermediary kind
487 of jump when compiling an alternative. */
490 /* Push a dummy failure point and continue. Used at the end of
494 /* Followed by two-byte relative address and two-byte number n.
495 After matching N times, jump to the address upon failure. */
498 /* Followed by two-byte relative address, and two-byte number n.
499 Jump to the address N times, then fail. */
502 /* Set the following two-byte relative address to the
503 subsequent two-byte number. The address *includes* the two
507 wordchar, /* Matches any word-constituent character. */
508 notwordchar, /* Matches any char that is not a word-constituent. */
510 wordbeg, /* Succeeds if at word beginning. */
511 wordend, /* Succeeds if at word end. */
513 wordbound, /* Succeeds if at a word boundary. */
514 notwordbound /* Succeeds if not at a word boundary. */
517 ,before_dot, /* Succeeds if before point. */
518 at_dot, /* Succeeds if at point. */
519 after_dot, /* Succeeds if after point. */
521 /* Matches any character whose syntax is specified. Followed by
522 a byte which contains a syntax code, e.g., Sword. */
525 /* Matches any character whose syntax is not that specified. */
531 /* need extra stuff to be able to properly work with XEmacs/Mule
532 characters (which may take up more than one byte) */
534 ,charset_mule, /* Matches any character belonging to specified set.
535 The set is stored in "unified range-table
536 format"; see rangetab.c. Unlike the `charset'
537 opcode, this can handle arbitrary characters. */
539 charset_mule_not /* Same parameters as charset_mule, but match any
540 character that is not one of those specified. */
542 /* 97/2/17 jhod: The following two were merged back in from the Mule
543 2.3 code to enable some language specific processing */
544 ,categoryspec, /* Matches entries in the character category tables */
545 notcategoryspec /* The opposite of the above */
550 /* Common operations on the compiled pattern. */
552 /* Store NUMBER in two contiguous bytes starting at DESTINATION. */
554 #define STORE_NUMBER(destination, number) \
556 (destination)[0] = (number) & 0377; \
557 (destination)[1] = (number) >> 8; \
560 /* Same as STORE_NUMBER, except increment DESTINATION to
561 the byte after where the number is stored. Therefore, DESTINATION
562 must be an lvalue. */
564 #define STORE_NUMBER_AND_INCR(destination, number) \
566 STORE_NUMBER (destination, number); \
567 (destination) += 2; \
570 /* Put into DESTINATION a number stored in two contiguous bytes starting
573 #define EXTRACT_NUMBER(destination, source) \
575 (destination) = *(source) & 0377; \
576 (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) << 8; \
581 extract_number (int *dest, re_char *source)
583 int temp = SIGN_EXTEND_CHAR (*(source + 1));
584 *dest = *source & 0377;
588 #ifndef EXTRACT_MACROS /* To debug the macros. */
589 #undef EXTRACT_NUMBER
590 #define EXTRACT_NUMBER(dest, src) extract_number (&dest, src)
591 #endif /* not EXTRACT_MACROS */
595 /* Same as EXTRACT_NUMBER, except increment SOURCE to after the number.
596 SOURCE must be an lvalue. */
598 #define EXTRACT_NUMBER_AND_INCR(destination, source) \
600 EXTRACT_NUMBER (destination, source); \
606 extract_number_and_incr (int *destination, unsigned char **source)
608 extract_number (destination, *source);
612 #ifndef EXTRACT_MACROS
613 #undef EXTRACT_NUMBER_AND_INCR
614 #define EXTRACT_NUMBER_AND_INCR(dest, src) \
615 extract_number_and_incr (&dest, &src)
616 #endif /* not EXTRACT_MACROS */
620 /* If DEBUG is defined, Regex prints many voluminous messages about what
621 it is doing (if the variable `debug' is nonzero). If linked with the
622 main program in `iregex.c', you can enter patterns and strings
623 interactively. And if linked with the main program in `main.c' and
624 the other test files, you can run the already-written tests. */
628 /* We use standard I/O for debugging. */
632 /* XEmacs provides its own version of assert() */
633 /* It is useful to test things that ``must'' be true when debugging. */
637 static int debug = 0;
639 #define DEBUG_STATEMENT(e) e
640 #define DEBUG_PRINT1(x) if (debug) printf (x)
641 #define DEBUG_PRINT2(x1, x2) if (debug) printf (x1, x2)
642 #define DEBUG_PRINT3(x1, x2, x3) if (debug) printf (x1, x2, x3)
643 #define DEBUG_PRINT4(x1, x2, x3, x4) if (debug) printf (x1, x2, x3, x4)
644 #define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \
645 if (debug) print_partial_compiled_pattern (s, e)
646 #define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \
647 if (debug) print_double_string (w, s1, sz1, s2, sz2)
650 /* Print the fastmap in human-readable form. */
653 print_fastmap (char *fastmap)
655 unsigned was_a_range = 0;
658 while (i < (1 << BYTEWIDTH))
664 while (i < (1 << BYTEWIDTH) && fastmap[i])
680 /* Print a compiled pattern string in human-readable form, starting at
681 the START pointer into it and ending just before the pointer END. */
684 print_partial_compiled_pattern (re_char *start, re_char *end)
687 unsigned char *p = (unsigned char *) start;
696 /* Loop over pattern commands. */
699 printf ("%ld:\t", (long)(p - start));
701 switch ((re_opcode_t) *p++)
709 printf ("/exactn/%d", mcnt);
720 printf ("/start_memory/%d/%d", mcnt, *p++);
725 printf ("/stop_memory/%d/%d", mcnt, *p++);
729 printf ("/duplicate/%d", *p++);
739 REGISTER int c, last = -100;
740 REGISTER int in_range = 0;
742 printf ("/charset [%s",
743 (re_opcode_t) *(p - 1) == charset_not ? "^" : "");
745 assert (p + *p < pend);
747 for (c = 0; c < 256; c++)
748 if (((unsigned char) (c / 8) < *p)
749 && (p[1 + (c/8)] & (1 << (c % 8))))
751 /* Are we starting a range? */
752 if (last + 1 == c && ! in_range)
757 /* Have we broken a range? */
758 else if (last + 1 != c && in_range)
781 case charset_mule_not:
785 printf ("/charset_mule [%s",
786 (re_opcode_t) *(p - 1) == charset_mule_not ? "^" : "");
787 nentries = unified_range_table_nentries (p);
788 for (i = 0; i < nentries; i++)
790 EMACS_INT first, last;
791 Lisp_Object dummy_val;
793 unified_range_table_get_range (p, i, &first, &last,
798 printf ("(0x%lx)", (long)first);
805 printf ("(0x%lx)", (long)last);
809 p += unified_range_table_bytes_used (p);
822 case on_failure_jump:
823 extract_number_and_incr (&mcnt, &p);
824 printf ("/on_failure_jump to %ld", (long)(p + mcnt - start));
827 case on_failure_keep_string_jump:
828 extract_number_and_incr (&mcnt, &p);
829 printf ("/on_failure_keep_string_jump to %ld", (long)(p + mcnt - start));
832 case dummy_failure_jump:
833 extract_number_and_incr (&mcnt, &p);
834 printf ("/dummy_failure_jump to %ld", (long)(p + mcnt - start));
837 case push_dummy_failure:
838 printf ("/push_dummy_failure");
842 extract_number_and_incr (&mcnt, &p);
843 printf ("/maybe_pop_jump to %ld", (long)(p + mcnt - start));
846 case pop_failure_jump:
847 extract_number_and_incr (&mcnt, &p);
848 printf ("/pop_failure_jump to %ld", (long)(p + mcnt - start));
852 extract_number_and_incr (&mcnt, &p);
853 printf ("/jump_past_alt to %ld", (long)(p + mcnt - start));
857 extract_number_and_incr (&mcnt, &p);
858 printf ("/jump to %ld", (long)(p + mcnt - start));
862 extract_number_and_incr (&mcnt, &p);
863 extract_number_and_incr (&mcnt2, &p);
864 printf ("/succeed_n to %ld, %d times", (long)(p + mcnt - start), mcnt2);
868 extract_number_and_incr (&mcnt, &p);
869 extract_number_and_incr (&mcnt2, &p);
870 printf ("/jump_n to %ld, %d times", (long)(p + mcnt - start), mcnt2);
874 extract_number_and_incr (&mcnt, &p);
875 extract_number_and_incr (&mcnt2, &p);
876 printf ("/set_number_at location %ld to %d", (long)(p + mcnt - start), mcnt2);
880 printf ("/wordbound");
884 printf ("/notwordbound");
896 printf ("/before_dot");
904 printf ("/after_dot");
908 printf ("/syntaxspec");
910 printf ("/%d", mcnt);
914 printf ("/notsyntaxspec");
916 printf ("/%d", mcnt);
920 /* 97/2/17 jhod Mule category patch */
922 printf ("/categoryspec");
924 printf ("/%d", mcnt);
927 case notcategoryspec:
928 printf ("/notcategoryspec");
930 printf ("/%d", mcnt);
932 /* end of category patch */
937 printf ("/wordchar");
941 printf ("/notwordchar");
953 printf ("?%d", *(p-1));
959 printf ("%ld:\tend of pattern.\n", (long)(p - start));
964 print_compiled_pattern (struct re_pattern_buffer *bufp)
966 re_char *buffer = bufp->buffer;
968 print_partial_compiled_pattern (buffer, buffer + bufp->used);
969 printf ("%ld bytes used/%ld bytes allocated.\n", bufp->used,
972 if (bufp->fastmap_accurate && bufp->fastmap)
974 printf ("fastmap: ");
975 print_fastmap (bufp->fastmap);
978 printf ("re_nsub: %ld\t", (long)bufp->re_nsub);
979 printf ("re_ngroups: %ld\t", (long)bufp->re_ngroups);
980 printf ("regs_alloc: %d\t", bufp->regs_allocated);
981 printf ("can_be_null: %d\t", bufp->can_be_null);
982 printf ("newline_anchor: %d\n", bufp->newline_anchor);
983 printf ("no_sub: %d\t", bufp->no_sub);
984 printf ("not_bol: %d\t", bufp->not_bol);
985 printf ("not_eol: %d\t", bufp->not_eol);
986 printf ("syntax: %d\n", bufp->syntax);
987 /* Perhaps we should print the translate table? */
988 /* and maybe the category table? */
990 if (bufp->external_to_internal_register)
994 printf ("external_to_internal_register:\n");
995 for (i = 0; i <= bufp->re_nsub; i++)
999 printf ("%d -> %d", i, bufp->external_to_internal_register[i]);
1007 print_double_string (re_char *where, re_char *string1, int size1,
1008 re_char *string2, int size2)
1014 Element_count this_char;
1016 if (FIRST_STRING_P (where))
1018 for (this_char = where - string1; this_char < size1; this_char++)
1019 putchar (string1[this_char]);
1024 for (this_char = where - string2; this_char < size2; this_char++)
1025 putchar (string2[this_char]);
1029 #else /* not DEBUG */
1034 #define DEBUG_STATEMENT(e)
1035 #define DEBUG_PRINT1(x)
1036 #define DEBUG_PRINT2(x1, x2)
1037 #define DEBUG_PRINT3(x1, x2, x3)
1038 #define DEBUG_PRINT4(x1, x2, x3, x4)
1039 #define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)
1040 #define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)
1044 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
1045 also be assigned to arbitrarily: each pattern buffer stores its own
1046 syntax, so it can be changed between regex compilations. */
1047 /* This has no initializer because initialized variables in Emacs
1048 become read-only after dumping. */
1049 reg_syntax_t re_syntax_options;
1052 /* Specify the precise syntax of regexps for compilation. This provides
1053 for compatibility for various utilities which historically have
1054 different, incompatible syntaxes.
1056 The argument SYNTAX is a bit mask comprised of the various bits
1057 defined in regex.h. We return the old syntax. */
1060 re_set_syntax (reg_syntax_t syntax)
1062 reg_syntax_t ret = re_syntax_options;
1064 re_syntax_options = syntax;
1068 /* This table gives an error message for each of the error codes listed
1069 in regex.h. Obviously the order here has to be same as there.
1070 POSIX doesn't require that we do anything for REG_NOERROR,
1071 but why not be nice? */
1073 static const char *re_error_msgid[] =
1075 "Success", /* REG_NOERROR */
1076 "No match", /* REG_NOMATCH */
1077 "Invalid regular expression", /* REG_BADPAT */
1078 "Invalid collation character", /* REG_ECOLLATE */
1079 "Invalid character class name", /* REG_ECTYPE */
1080 "Trailing backslash", /* REG_EESCAPE */
1081 "Invalid back reference", /* REG_ESUBREG */
1082 "Unmatched [ or [^", /* REG_EBRACK */
1083 "Unmatched ( or \\(", /* REG_EPAREN */
1084 "Unmatched \\{", /* REG_EBRACE */
1085 "Invalid content of \\{\\}", /* REG_BADBR */
1086 "Invalid range end", /* REG_ERANGE */
1087 "Memory exhausted", /* REG_ESPACE */
1088 "Invalid preceding regular expression", /* REG_BADRPT */
1089 "Premature end of regular expression", /* REG_EEND */
1090 "Regular expression too big", /* REG_ESIZE */
1091 "Unmatched ) or \\)", /* REG_ERPAREN */
1093 "Invalid syntax designator", /* REG_ESYNTAX */
1096 "Ranges may not span charsets", /* REG_ERANGESPAN */
1097 "Invalid category designator", /* REG_ECATEGORY */
1101 /* Avoiding alloca during matching, to placate r_alloc. */
1103 /* Define MATCH_MAY_ALLOCATE unless we need to make sure that the
1104 searching and matching functions should not call alloca. On some
1105 systems, alloca is implemented in terms of malloc, and if we're
1106 using the relocating allocator routines, then malloc could cause a
1107 relocation, which might (if the strings being searched are in the
1108 ralloc heap) shift the data out from underneath the regexp
1111 Here's another reason to avoid allocation: Emacs
1112 processes input from X in a signal handler; processing X input may
1113 call malloc; if input arrives while a matching routine is calling
1114 malloc, then we're scrod. But Emacs can't just block input while
1115 calling matching routines; then we don't notice interrupts when
1116 they come in. So, Emacs blocks input around all regexp calls
1117 except the matching calls, which it leaves unprotected, in the
1118 faith that they will not malloc. */
1120 /* Normally, this is fine. */
1121 #define MATCH_MAY_ALLOCATE
1123 /* When using GNU C, we are not REALLY using the C alloca, no matter
1124 what config.h may say. So don't take precautions for it. */
1129 /* The match routines may not allocate if (1) they would do it with malloc
1130 and (2) it's not safe for them to use malloc.
1131 Note that if REL_ALLOC is defined, matching would not use malloc for the
1132 failure stack, but we would still use it for the register vectors;
1133 so REL_ALLOC should not affect this. */
1134 #if (defined (C_ALLOCA) || defined (REGEX_MALLOC)) && defined (emacs)
1135 #undef MATCH_MAY_ALLOCATE
1139 /* Failure stack declarations and macros; both re_compile_fastmap and
1140 re_match_2 use a failure stack. These have to be macros because of
1141 REGEX_ALLOCATE_STACK. */
1144 /* Number of failure points for which to initially allocate space
1145 when matching. If this number is exceeded, we allocate more
1146 space, so it is not a hard limit. */
1147 #ifndef INIT_FAILURE_ALLOC
1148 #define INIT_FAILURE_ALLOC 5
1151 /* Roughly the maximum number of failure points on the stack. Would be
1152 exactly that if always used MAX_FAILURE_SPACE each time we failed.
1153 This is a variable only so users of regex can assign to it; we never
1154 change it ourselves. */
1155 #if defined (MATCH_MAY_ALLOCATE) || defined (REGEX_MALLOC)
1156 /* 4400 was enough to cause a crash on Alpha OSF/1,
1157 whose default stack limit is 2mb. */
1158 int re_max_failures = 20000;
1160 int re_max_failures = 2000;
1163 union fail_stack_elt
1169 typedef union fail_stack_elt fail_stack_elt_t;
1173 fail_stack_elt_t *stack;
1175 Element_count avail; /* Offset of next open position. */
1178 #define FAIL_STACK_EMPTY() (fail_stack.avail == 0)
1179 #define FAIL_STACK_PTR_EMPTY() (fail_stack_ptr->avail == 0)
1180 #define FAIL_STACK_FULL() (fail_stack.avail == fail_stack.size)
1183 /* Define macros to initialize and free the failure stack.
1184 Do `return -2' if the alloc fails. */
1186 #ifdef MATCH_MAY_ALLOCATE
1187 #define INIT_FAIL_STACK() \
1189 fail_stack.stack = (fail_stack_elt_t *) \
1190 REGEX_ALLOCATE_STACK (INIT_FAILURE_ALLOC * sizeof (fail_stack_elt_t)); \
1192 if (fail_stack.stack == NULL) \
1195 fail_stack.size = INIT_FAILURE_ALLOC; \
1196 fail_stack.avail = 0; \
1199 #define RESET_FAIL_STACK() REGEX_FREE_STACK (fail_stack.stack)
1201 #define INIT_FAIL_STACK() \
1203 fail_stack.avail = 0; \
1206 #define RESET_FAIL_STACK()
1210 /* Double the size of FAIL_STACK, up to approximately `re_max_failures' items.
1212 Return 1 if succeeds, and 0 if either ran out of memory
1213 allocating space for it or it was already too large.
1215 REGEX_REALLOCATE_STACK requires `destination' be declared. */
1217 #define DOUBLE_FAIL_STACK(fail_stack) \
1218 ((int) (fail_stack).size > re_max_failures * MAX_FAILURE_ITEMS \
1220 : ((fail_stack).stack = (fail_stack_elt_t *) \
1221 REGEX_REALLOCATE_STACK ((fail_stack).stack, \
1222 (fail_stack).size * sizeof (fail_stack_elt_t), \
1223 ((fail_stack).size << 1) * sizeof (fail_stack_elt_t)), \
1225 (fail_stack).stack == NULL \
1227 : ((fail_stack).size <<= 1, \
1231 /* Push pointer POINTER on FAIL_STACK.
1232 Return 1 if was able to do so and 0 if ran out of memory allocating
1234 #define PUSH_PATTERN_OP(POINTER, FAIL_STACK) \
1235 ((FAIL_STACK_FULL () \
1236 && !DOUBLE_FAIL_STACK (FAIL_STACK)) \
1238 : ((FAIL_STACK).stack[(FAIL_STACK).avail++].pointer = POINTER, \
1241 /* Push a pointer value onto the failure stack.
1242 Assumes the variable `fail_stack'. Probably should only
1243 be called from within `PUSH_FAILURE_POINT'. */
1244 #define PUSH_FAILURE_POINTER(item) \
1245 fail_stack.stack[fail_stack.avail++].pointer = (unsigned char *) (item)
1247 /* This pushes an integer-valued item onto the failure stack.
1248 Assumes the variable `fail_stack'. Probably should only
1249 be called from within `PUSH_FAILURE_POINT'. */
1250 #define PUSH_FAILURE_INT(item) \
1251 fail_stack.stack[fail_stack.avail++].integer = (item)
1253 /* Push a fail_stack_elt_t value onto the failure stack.
1254 Assumes the variable `fail_stack'. Probably should only
1255 be called from within `PUSH_FAILURE_POINT'. */
1256 #define PUSH_FAILURE_ELT(item) \
1257 fail_stack.stack[fail_stack.avail++] = (item)
1259 /* These three POP... operations complement the three PUSH... operations.
1260 All assume that `fail_stack' is nonempty. */
1261 #define POP_FAILURE_POINTER() fail_stack.stack[--fail_stack.avail].pointer
1262 #define POP_FAILURE_INT() fail_stack.stack[--fail_stack.avail].integer
1263 #define POP_FAILURE_ELT() fail_stack.stack[--fail_stack.avail]
1265 /* Used to omit pushing failure point id's when we're not debugging. */
1267 #define DEBUG_PUSH PUSH_FAILURE_INT
1268 #define DEBUG_POP(item_addr) *(item_addr) = POP_FAILURE_INT ()
1270 #define DEBUG_PUSH(item)
1271 #define DEBUG_POP(item_addr)
1275 /* Push the information about the state we will need
1276 if we ever fail back to it.
1278 Requires variables fail_stack, regstart, regend, reg_info, and
1279 num_regs be declared. DOUBLE_FAIL_STACK requires `destination' be
1282 Does `return FAILURE_CODE' if runs out of memory. */
1284 #if !defined (REGEX_MALLOC) && !defined (REL_ALLOC)
1285 #define DECLARE_DESTINATION char *destination
1287 #define DECLARE_DESTINATION DECLARE_NOTHING
1290 #define PUSH_FAILURE_POINT(pattern_place, string_place, failure_code) \
1292 DECLARE_DESTINATION; \
1293 /* Must be int, so when we don't save any registers, the arithmetic \
1294 of 0 + -1 isn't done as unsigned. */ \
1297 DEBUG_STATEMENT (failure_id++); \
1298 DEBUG_STATEMENT (nfailure_points_pushed++); \
1299 DEBUG_PRINT2 ("\nPUSH_FAILURE_POINT #%u:\n", failure_id); \
1300 DEBUG_PRINT2 (" Before push, next avail: %lu\n", \
1301 (unsigned long) (fail_stack).avail); \
1302 DEBUG_PRINT2 (" size: %lu\n", \
1303 (unsigned long) (fail_stack).size); \
1305 DEBUG_PRINT2 (" slots needed: %d\n", NUM_FAILURE_ITEMS); \
1306 DEBUG_PRINT2 (" available: %ld\n", \
1307 (long) REMAINING_AVAIL_SLOTS); \
1309 /* Ensure we have enough space allocated for what we will push. */ \
1310 while (REMAINING_AVAIL_SLOTS < NUM_FAILURE_ITEMS) \
1312 if (!DOUBLE_FAIL_STACK (fail_stack)) \
1313 return failure_code; \
1315 DEBUG_PRINT2 ("\n Doubled stack; size now: %lu\n", \
1316 (unsigned long) (fail_stack).size); \
1317 DEBUG_PRINT2 (" slots available: %ld\n", \
1318 (long) REMAINING_AVAIL_SLOTS); \
1321 /* Push the info, starting with the registers. */ \
1322 DEBUG_PRINT1 ("\n"); \
1324 for (this_reg = lowest_active_reg; this_reg <= highest_active_reg; \
1327 DEBUG_PRINT2 (" Pushing reg: %d\n", this_reg); \
1328 DEBUG_STATEMENT (num_regs_pushed++); \
1330 DEBUG_PRINT2 (" start: 0x%lx\n", (long) regstart[this_reg]); \
1331 PUSH_FAILURE_POINTER (regstart[this_reg]); \
1333 DEBUG_PRINT2 (" end: 0x%lx\n", (long) regend[this_reg]); \
1334 PUSH_FAILURE_POINTER (regend[this_reg]); \
1336 DEBUG_PRINT2 (" info: 0x%lx\n ", \
1337 * (long *) (®_info[this_reg])); \
1338 DEBUG_PRINT2 (" match_null=%d", \
1339 REG_MATCH_NULL_STRING_P (reg_info[this_reg])); \
1340 DEBUG_PRINT2 (" active=%d", IS_ACTIVE (reg_info[this_reg])); \
1341 DEBUG_PRINT2 (" matched_something=%d", \
1342 MATCHED_SOMETHING (reg_info[this_reg])); \
1343 DEBUG_PRINT2 (" ever_matched_something=%d", \
1344 EVER_MATCHED_SOMETHING (reg_info[this_reg])); \
1345 DEBUG_PRINT1 ("\n"); \
1346 PUSH_FAILURE_ELT (reg_info[this_reg].word); \
1349 DEBUG_PRINT2 (" Pushing low active reg: %d\n", lowest_active_reg); \
1350 PUSH_FAILURE_INT (lowest_active_reg); \
1352 DEBUG_PRINT2 (" Pushing high active reg: %d\n", highest_active_reg); \
1353 PUSH_FAILURE_INT (highest_active_reg); \
1355 DEBUG_PRINT2 (" Pushing pattern 0x%lx: \n", (long) pattern_place); \
1356 DEBUG_PRINT_COMPILED_PATTERN (bufp, pattern_place, pend); \
1357 PUSH_FAILURE_POINTER (pattern_place); \
1359 DEBUG_PRINT2 (" Pushing string 0x%lx: `", (long) string_place); \
1360 DEBUG_PRINT_DOUBLE_STRING (string_place, string1, size1, string2, \
1362 DEBUG_PRINT1 ("'\n"); \
1363 PUSH_FAILURE_POINTER (string_place); \
1365 DEBUG_PRINT2 (" Pushing failure id: %u\n", failure_id); \
1366 DEBUG_PUSH (failure_id); \
1369 /* This is the number of items that are pushed and popped on the stack
1370 for each register. */
1371 #define NUM_REG_ITEMS 3
1373 /* Individual items aside from the registers. */
1375 #define NUM_NONREG_ITEMS 5 /* Includes failure point id. */
1377 #define NUM_NONREG_ITEMS 4
1380 /* We push at most this many items on the stack. */
1381 /* We used to use (num_regs - 1), which is the number of registers
1382 this regexp will save; but that was changed to 5
1383 to avoid stack overflow for a regexp with lots of parens. */
1384 #define MAX_FAILURE_ITEMS (5 * NUM_REG_ITEMS + NUM_NONREG_ITEMS)
1386 /* We actually push this many items. */
1387 #define NUM_FAILURE_ITEMS \
1388 ((highest_active_reg - lowest_active_reg + 1) * NUM_REG_ITEMS \
1391 /* How many items can still be added to the stack without overflowing it. */
1392 #define REMAINING_AVAIL_SLOTS ((int) ((fail_stack).size - (fail_stack).avail))
1395 /* Pops what PUSH_FAIL_STACK pushes.
1397 We restore into the parameters, all of which should be lvalues:
1398 STR -- the saved data position.
1399 PAT -- the saved pattern position.
1400 LOW_REG, HIGH_REG -- the highest and lowest active registers.
1401 REGSTART, REGEND -- arrays of string positions.
1402 REG_INFO -- array of information about each subexpression.
1404 Also assumes the variables `fail_stack' and (if debugging), `bufp',
1405 `pend', `string1', `size1', `string2', and `size2'. */
1407 #define POP_FAILURE_POINT(str, pat, low_reg, high_reg, \
1408 regstart, regend, reg_info) \
1410 DEBUG_STATEMENT (fail_stack_elt_t ffailure_id;) \
1412 const unsigned char *string_temp; \
1414 assert (!FAIL_STACK_EMPTY ()); \
1416 /* Remove failure points and point to how many regs pushed. */ \
1417 DEBUG_PRINT1 ("POP_FAILURE_POINT:\n"); \
1418 DEBUG_PRINT2 (" Before pop, next avail: %lu\n", \
1419 (unsigned long) fail_stack.avail); \
1420 DEBUG_PRINT2 (" size: %lu\n", \
1421 (unsigned long) fail_stack.size); \
1423 assert (fail_stack.avail >= NUM_NONREG_ITEMS); \
1425 DEBUG_POP (&ffailure_id.integer); \
1426 DEBUG_PRINT2 (" Popping failure id: %u\n", \
1427 * (unsigned int *) &ffailure_id); \
1429 /* If the saved string location is NULL, it came from an \
1430 on_failure_keep_string_jump opcode, and we want to throw away the \
1431 saved NULL, thus retaining our current position in the string. */ \
1432 string_temp = POP_FAILURE_POINTER (); \
1433 if (string_temp != NULL) \
1434 str = string_temp; \
1436 DEBUG_PRINT2 (" Popping string 0x%lx: `", (long) str); \
1437 DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2); \
1438 DEBUG_PRINT1 ("'\n"); \
1440 pat = (unsigned char *) POP_FAILURE_POINTER (); \
1441 DEBUG_PRINT2 (" Popping pattern 0x%lx: ", (long) pat); \
1442 DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend); \
1444 /* Restore register info. */ \
1445 high_reg = (unsigned) POP_FAILURE_INT (); \
1446 DEBUG_PRINT2 (" Popping high active reg: %d\n", high_reg); \
1448 low_reg = (unsigned) POP_FAILURE_INT (); \
1449 DEBUG_PRINT2 (" Popping low active reg: %d\n", low_reg); \
1451 for (this_reg = high_reg; this_reg >= low_reg; this_reg--) \
1453 DEBUG_PRINT2 (" Popping reg: %d\n", this_reg); \
1455 reg_info[this_reg].word = POP_FAILURE_ELT (); \
1456 DEBUG_PRINT2 (" info: 0x%lx\n", \
1457 * (long *) ®_info[this_reg]); \
1459 regend[this_reg] = POP_FAILURE_POINTER (); \
1460 DEBUG_PRINT2 (" end: 0x%lx\n", (long) regend[this_reg]); \
1462 regstart[this_reg] = POP_FAILURE_POINTER (); \
1463 DEBUG_PRINT2 (" start: 0x%lx\n", (long) regstart[this_reg]); \
1466 set_regs_matched_done = 0; \
1467 DEBUG_STATEMENT (nfailure_points_popped++); \
1468 } while (0) /* POP_FAILURE_POINT */
1472 /* Structure for per-register (a.k.a. per-group) information.
1473 Other register information, such as the
1474 starting and ending positions (which are addresses), and the list of
1475 inner groups (which is a bits list) are maintained in separate
1478 We are making a (strictly speaking) nonportable assumption here: that
1479 the compiler will pack our bit fields into something that fits into
1480 the type of `word', i.e., is something that fits into one item on the
1485 fail_stack_elt_t word;
1488 /* This field is one if this group can match the empty string,
1489 zero if not. If not yet determined, `MATCH_NULL_UNSET_VALUE'. */
1490 #define MATCH_NULL_UNSET_VALUE 3
1491 unsigned match_null_string_p : 2;
1492 unsigned is_active : 1;
1493 unsigned matched_something : 1;
1494 unsigned ever_matched_something : 1;
1496 } register_info_type;
1498 #define REG_MATCH_NULL_STRING_P(R) ((R).bits.match_null_string_p)
1499 #define IS_ACTIVE(R) ((R).bits.is_active)
1500 #define MATCHED_SOMETHING(R) ((R).bits.matched_something)
1501 #define EVER_MATCHED_SOMETHING(R) ((R).bits.ever_matched_something)
1504 /* Call this when have matched a real character; it sets `matched' flags
1505 for the subexpressions which we are currently inside. Also records
1506 that those subexprs have matched. */
1507 #define SET_REGS_MATCHED() \
1510 if (!set_regs_matched_done) \
1513 set_regs_matched_done = 1; \
1514 for (r = lowest_active_reg; r <= highest_active_reg; r++) \
1516 MATCHED_SOMETHING (reg_info[r]) \
1517 = EVER_MATCHED_SOMETHING (reg_info[r]) \
1524 /* Registers are set to a sentinel when they haven't yet matched. */
1525 static unsigned char reg_unset_dummy;
1526 #define REG_UNSET_VALUE (®_unset_dummy)
1527 #define REG_UNSET(e) ((e) == REG_UNSET_VALUE)
1529 /* Subroutine declarations and macros for regex_compile. */
1531 /* Fetch the next character in the uncompiled pattern---translating it
1532 if necessary. Also cast from a signed character in the constant
1533 string passed to us by the user to an unsigned char that we can use
1534 as an array index (in, e.g., `translate'). */
1535 #define PATFETCH(c) \
1538 c = TRANSLATE (c); \
1541 /* Fetch the next character in the uncompiled pattern, with no
1543 #define PATFETCH_RAW(c) \
1544 do {if (p == pend) return REG_EEND; \
1545 assert (p < pend); \
1546 c = charptr_emchar (p); \
1550 /* Go backwards one character in the pattern. */
1551 #define PATUNFETCH DEC_CHARPTR (p)
1555 #define PATFETCH_EXTENDED(emch) \
1556 do {if (p == pend) return REG_EEND; \
1557 assert (p < pend); \
1558 emch = charptr_emchar ((const Bufbyte *) p); \
1560 if (TRANSLATE_P (translate) && emch < 0x80) \
1561 emch = (Emchar) (unsigned char) RE_TRANSLATE (emch); \
1564 #define PATFETCH_RAW_EXTENDED(emch) \
1565 do {if (p == pend) return REG_EEND; \
1566 assert (p < pend); \
1567 emch = charptr_emchar ((const Bufbyte *) p); \
1571 #define PATUNFETCH_EXTENDED DEC_CHARPTR (p)
1573 #define PATFETCH_EITHER(emch) \
1575 if (has_extended_chars) \
1576 PATFETCH_EXTENDED (emch); \
1581 #define PATFETCH_RAW_EITHER(emch) \
1583 if (has_extended_chars) \
1584 PATFETCH_RAW_EXTENDED (emch); \
1586 PATFETCH_RAW (emch); \
1589 #define PATUNFETCH_EITHER \
1591 if (has_extended_chars) \
1592 PATUNFETCH_EXTENDED (emch); \
1594 PATUNFETCH (emch); \
1597 #else /* not MULE */
1599 #define PATFETCH_EITHER(emch) PATFETCH (emch)
1600 #define PATFETCH_RAW_EITHER(emch) PATFETCH_RAW (emch)
1601 #define PATUNFETCH_EITHER PATUNFETCH
1605 /* If `translate' is non-null, return translate[D], else just D. We
1606 cast the subscript to translate because some data is declared as
1607 `char *', to avoid warnings when a string constant is passed. But
1608 when we use a character as a subscript we must make it unsigned. */
1609 #define TRANSLATE(d) (TRANSLATE_P (translate) ? RE_TRANSLATE (d) : (d))
1611 /* Macros for outputting the compiled pattern into `buffer'. */
1613 /* If the buffer isn't allocated when it comes in, use this. */
1614 #define INIT_BUF_SIZE 32
1616 /* Make sure we have at least N more bytes of space in buffer. */
1617 #define GET_BUFFER_SPACE(n) \
1618 while (buf_end - bufp->buffer + (n) > (ptrdiff_t) bufp->allocated) \
1621 /* Make sure we have one more byte of buffer space and then add C to it. */
1622 #define BUF_PUSH(c) \
1624 GET_BUFFER_SPACE (1); \
1625 *buf_end++ = (unsigned char) (c); \
1629 /* Ensure we have two more bytes of buffer space and then append C1 and C2. */
1630 #define BUF_PUSH_2(c1, c2) \
1632 GET_BUFFER_SPACE (2); \
1633 *buf_end++ = (unsigned char) (c1); \
1634 *buf_end++ = (unsigned char) (c2); \
1638 /* As with BUF_PUSH_2, except for three bytes. */
1639 #define BUF_PUSH_3(c1, c2, c3) \
1641 GET_BUFFER_SPACE (3); \
1642 *buf_end++ = (unsigned char) (c1); \
1643 *buf_end++ = (unsigned char) (c2); \
1644 *buf_end++ = (unsigned char) (c3); \
1648 /* Store a jump with opcode OP at LOC to location TO. We store a
1649 relative address offset by the three bytes the jump itself occupies. */
1650 #define STORE_JUMP(op, loc, to) \
1651 store_op1 (op, loc, (to) - (loc) - 3)
1653 /* Likewise, for a two-argument jump. */
1654 #define STORE_JUMP2(op, loc, to, arg) \
1655 store_op2 (op, loc, (to) - (loc) - 3, arg)
1657 /* Like `STORE_JUMP', but for inserting. Assume `buf_end' is the
1659 #define INSERT_JUMP(op, loc, to) \
1660 insert_op1 (op, loc, (to) - (loc) - 3, buf_end)
1662 /* Like `STORE_JUMP2', but for inserting. Assume `buf_end' is the
1664 #define INSERT_JUMP2(op, loc, to, arg) \
1665 insert_op2 (op, loc, (to) - (loc) - 3, arg, buf_end)
1668 /* This is not an arbitrary limit: the arguments which represent offsets
1669 into the pattern are two bytes long. So if 2^16 bytes turns out to
1670 be too small, many things would have to change. */
1671 #define MAX_BUF_SIZE (1L << 16)
1674 /* Extend the buffer by twice its current size via realloc and
1675 reset the pointers that pointed into the old block to point to the
1676 correct places in the new one. If extending the buffer results in it
1677 being larger than MAX_BUF_SIZE, then flag memory exhausted. */
1678 #define EXTEND_BUFFER() \
1680 re_char *old_buffer = bufp->buffer; \
1681 if (bufp->allocated == MAX_BUF_SIZE) \
1683 bufp->allocated <<= 1; \
1684 if (bufp->allocated > MAX_BUF_SIZE) \
1685 bufp->allocated = MAX_BUF_SIZE; \
1686 bufp->buffer = (unsigned char *) realloc (bufp->buffer, bufp->allocated);\
1687 if (bufp->buffer == NULL) \
1688 return REG_ESPACE; \
1689 /* If the buffer moved, move all the pointers into it. */ \
1690 if (old_buffer != bufp->buffer) \
1692 buf_end = (buf_end - old_buffer) + bufp->buffer; \
1693 begalt = (begalt - old_buffer) + bufp->buffer; \
1694 if (fixup_alt_jump) \
1695 fixup_alt_jump = (fixup_alt_jump - old_buffer) + bufp->buffer;\
1697 laststart = (laststart - old_buffer) + bufp->buffer; \
1698 if (pending_exact) \
1699 pending_exact = (pending_exact - old_buffer) + bufp->buffer; \
1704 /* Since we have one byte reserved for the register number argument to
1705 {start,stop}_memory, the maximum number of groups we can report
1706 things about is what fits in that byte. */
1707 #define MAX_REGNUM 255
1709 /* But patterns can have more than `MAX_REGNUM' registers. We just
1710 ignore the excess. */
1711 typedef unsigned regnum_t;
1713 #define INIT_REG_TRANSLATE_SIZE 5
1715 /* Macros for the compile stack. */
1717 /* Since offsets can go either forwards or backwards, this type needs to
1718 be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1. */
1719 typedef int pattern_offset_t;
1723 pattern_offset_t begalt_offset;
1724 pattern_offset_t fixup_alt_jump;
1725 pattern_offset_t inner_group_offset;
1726 pattern_offset_t laststart_offset;
1728 } compile_stack_elt_t;
1733 compile_stack_elt_t *stack;
1735 unsigned avail; /* Offset of next open position. */
1736 } compile_stack_type;
1739 #define INIT_COMPILE_STACK_SIZE 32
1741 #define COMPILE_STACK_EMPTY (compile_stack.avail == 0)
1742 #define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size)
1744 /* The next available element. */
1745 #define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])
1748 /* Set the bit for character C in a bit vector. */
1749 #define SET_LIST_BIT(c) \
1750 (buf_end[((unsigned char) (c)) / BYTEWIDTH] \
1751 |= 1 << (((unsigned char) c) % BYTEWIDTH))
1755 /* Set the "bit" for character C in a range table. */
1756 #define SET_RANGETAB_BIT(c) put_range_table (rtab, c, c, Qt)
1758 /* Set the "bit" for character c in the appropriate table. */
1759 #define SET_EITHER_BIT(c) \
1761 if (has_extended_chars) \
1762 SET_RANGETAB_BIT (c); \
1767 #else /* not MULE */
1769 #define SET_EITHER_BIT(c) SET_LIST_BIT (c)
1774 /* Get the next unsigned number in the uncompiled pattern. */
1775 #define GET_UNSIGNED_NUMBER(num) \
1779 while (ISDIGIT (c)) \
1783 num = num * 10 + c - '0'; \
1791 #define CHAR_CLASS_MAX_LENGTH 6 /* Namely, `xdigit'. */
1793 #define IS_CHAR_CLASS(string) \
1794 (STREQ (string, "alpha") || STREQ (string, "upper") \
1795 || STREQ (string, "lower") || STREQ (string, "digit") \
1796 || STREQ (string, "alnum") || STREQ (string, "xdigit") \
1797 || STREQ (string, "space") || STREQ (string, "print") \
1798 || STREQ (string, "punct") || STREQ (string, "graph") \
1799 || STREQ (string, "cntrl") || STREQ (string, "blank"))
1801 static void store_op1 (re_opcode_t op, unsigned char *loc, int arg);
1802 static void store_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2);
1803 static void insert_op1 (re_opcode_t op, unsigned char *loc, int arg,
1804 unsigned char *end);
1805 static void insert_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2,
1806 unsigned char *end);
1807 static re_bool at_begline_loc_p (re_char *pattern, re_char *p,
1808 reg_syntax_t syntax);
1809 static re_bool at_endline_loc_p (re_char *p, re_char *pend, int syntax);
1810 static re_bool group_in_compile_stack (compile_stack_type compile_stack,
1812 static reg_errcode_t compile_range (re_char **p_ptr, re_char *pend,
1813 RE_TRANSLATE_TYPE translate,
1814 reg_syntax_t syntax,
1817 static reg_errcode_t compile_extended_range (re_char **p_ptr,
1819 RE_TRANSLATE_TYPE translate,
1820 reg_syntax_t syntax,
1823 static re_bool group_match_null_string_p (unsigned char **p,
1825 register_info_type *reg_info);
1826 static re_bool alt_match_null_string_p (unsigned char *p, unsigned char *end,
1827 register_info_type *reg_info);
1828 static re_bool common_op_match_null_string_p (unsigned char **p,
1830 register_info_type *reg_info);
1831 static int bcmp_translate (const unsigned char *s1, const unsigned char *s2,
1832 REGISTER int len, RE_TRANSLATE_TYPE translate);
1833 static int re_match_2_internal (struct re_pattern_buffer *bufp,
1834 re_char *string1, int size1,
1835 re_char *string2, int size2, int pos,
1836 struct re_registers *regs, int stop);
1838 #ifndef MATCH_MAY_ALLOCATE
1840 /* If we cannot allocate large objects within re_match_2_internal,
1841 we make the fail stack and register vectors global.
1842 The fail stack, we grow to the maximum size when a regexp
1844 The register vectors, we adjust in size each time we
1845 compile a regexp, according to the number of registers it needs. */
1847 static fail_stack_type fail_stack;
1849 /* Size with which the following vectors are currently allocated.
1850 That is so we can make them bigger as needed,
1851 but never make them smaller. */
1852 static int regs_allocated_size;
1854 static re_char ** regstart, ** regend;
1855 static re_char ** old_regstart, ** old_regend;
1856 static re_char **best_regstart, **best_regend;
1857 static register_info_type *reg_info;
1858 static re_char **reg_dummy;
1859 static register_info_type *reg_info_dummy;
1861 /* Make the register vectors big enough for NUM_REGS registers,
1862 but don't make them smaller. */
1865 regex_grow_registers (int num_regs)
1867 if (num_regs > regs_allocated_size)
1869 RETALLOC_IF (regstart, num_regs, re_char *);
1870 RETALLOC_IF (regend, num_regs, re_char *);
1871 RETALLOC_IF (old_regstart, num_regs, re_char *);
1872 RETALLOC_IF (old_regend, num_regs, re_char *);
1873 RETALLOC_IF (best_regstart, num_regs, re_char *);
1874 RETALLOC_IF (best_regend, num_regs, re_char *);
1875 RETALLOC_IF (reg_info, num_regs, register_info_type);
1876 RETALLOC_IF (reg_dummy, num_regs, re_char *);
1877 RETALLOC_IF (reg_info_dummy, num_regs, register_info_type);
1879 regs_allocated_size = num_regs;
1883 #endif /* not MATCH_MAY_ALLOCATE */
1885 /* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX.
1886 Returns one of error codes defined in `regex.h', or zero for success.
1888 Assumes the `allocated' (and perhaps `buffer') and `translate'
1889 fields are set in BUFP on entry.
1891 If it succeeds, results are put in BUFP (if it returns an error, the
1892 contents of BUFP are undefined):
1893 `buffer' is the compiled pattern;
1894 `syntax' is set to SYNTAX;
1895 `used' is set to the length of the compiled pattern;
1896 `fastmap_accurate' is zero;
1897 `re_ngroups' is the number of groups/subexpressions (including shy
1899 `re_nsub' is the number of non-shy groups in PATTERN;
1900 `not_bol' and `not_eol' are zero;
1902 The `fastmap' and `newline_anchor' fields are neither
1903 examined nor set. */
1905 /* Return, freeing storage we allocated. */
1906 #define FREE_STACK_RETURN(value) \
1907 return (free (compile_stack.stack), value)
1909 static reg_errcode_t
1910 regex_compile (re_char *pattern, int size, reg_syntax_t syntax,
1911 struct re_pattern_buffer *bufp)
1913 /* We fetch characters from PATTERN here. We declare these as int
1914 (or possibly long) so that chars above 127 can be used as
1915 array indices. The macros that fetch a character from the pattern
1916 make sure to coerce to unsigned char before assigning, so we won't
1917 get bitten by negative numbers here. */
1918 /* XEmacs change: used to be unsigned char. */
1919 REGISTER EMACS_INT c, c1;
1921 /* A random temporary spot in PATTERN. */
1924 /* Points to the end of the buffer, where we should append. */
1925 REGISTER unsigned char *buf_end;
1927 /* Keeps track of unclosed groups. */
1928 compile_stack_type compile_stack;
1930 /* Points to the current (ending) position in the pattern. */
1931 re_char *p = pattern;
1932 re_char *pend = pattern + size;
1934 /* How to translate the characters in the pattern. */
1935 RE_TRANSLATE_TYPE translate = bufp->translate;
1937 /* Address of the count-byte of the most recently inserted `exactn'
1938 command. This makes it possible to tell if a new exact-match
1939 character can be added to that command or if the character requires
1940 a new `exactn' command. */
1941 unsigned char *pending_exact = 0;
1943 /* Address of start of the most recently finished expression.
1944 This tells, e.g., postfix * where to find the start of its
1945 operand. Reset at the beginning of groups and alternatives. */
1946 unsigned char *laststart = 0;
1948 /* Address of beginning of regexp, or inside of last group. */
1949 unsigned char *begalt;
1951 /* Place in the uncompiled pattern (i.e., the {) to
1952 which to go back if the interval is invalid. */
1953 re_char *beg_interval;
1955 /* Address of the place where a forward jump should go to the end of
1956 the containing expression. Each alternative of an `or' -- except the
1957 last -- ends with a forward jump of this sort. */
1958 unsigned char *fixup_alt_jump = 0;
1960 /* Counts open-groups as they are encountered. Remembered for the
1961 matching close-group on the compile stack, so the same register
1962 number is put in the stop_memory as the start_memory. */
1963 regnum_t regnum = 0;
1966 DEBUG_PRINT1 ("\nCompiling pattern: ");
1971 for (debug_count = 0; debug_count < size; debug_count++)
1972 putchar (pattern[debug_count]);
1977 /* Initialize the compile stack. */
1978 compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t);
1979 if (compile_stack.stack == NULL)
1982 compile_stack.size = INIT_COMPILE_STACK_SIZE;
1983 compile_stack.avail = 0;
1985 /* Initialize the pattern buffer. */
1986 bufp->syntax = syntax;
1987 bufp->fastmap_accurate = 0;
1988 bufp->not_bol = bufp->not_eol = 0;
1990 /* Set `used' to zero, so that if we return an error, the pattern
1991 printer (for debugging) will think there's no pattern. We reset it
1995 /* Always count groups, whether or not bufp->no_sub is set. */
1997 bufp->re_ngroups = 0;
1999 /* Allocate index translation array if needed. */
2000 if (bufp->external_to_internal_register == 0)
2002 bufp->external_to_internal_register_size = INIT_REG_TRANSLATE_SIZE;
2003 RETALLOC (bufp->external_to_internal_register,
2004 bufp->external_to_internal_register_size,
2008 /* Initialize translations to impossible value to aid debugging. */
2012 bufp->external_to_internal_register[0] = 0;
2013 for (i = 1; i < bufp->external_to_internal_register_size; i++)
2014 bufp->external_to_internal_register[i] = (int) 0xDEADBEEF;
2017 #if !defined (emacs) && !defined (SYNTAX_TABLE)
2018 /* Initialize the syntax table. */
2019 init_syntax_once ();
2022 if (bufp->allocated == 0)
2025 { /* If zero allocated, but buffer is non-null, try to realloc
2026 enough space. This loses if buffer's address is bogus, but
2027 that is the user's responsibility. */
2028 RETALLOC (bufp->buffer, INIT_BUF_SIZE, unsigned char);
2031 { /* Caller did not allocate a buffer. Do it for them. */
2032 bufp->buffer = TALLOC (INIT_BUF_SIZE, unsigned char);
2034 if (!bufp->buffer) FREE_STACK_RETURN (REG_ESPACE);
2036 bufp->allocated = INIT_BUF_SIZE;
2039 begalt = buf_end = bufp->buffer;
2041 /* Loop through the uncompiled pattern until we're at the end. */
2050 if ( /* If at start of pattern, it's an operator. */
2052 /* If context independent, it's an operator. */
2053 || syntax & RE_CONTEXT_INDEP_ANCHORS
2054 /* Otherwise, depends on what's come before. */
2055 || at_begline_loc_p (pattern, p, syntax))
2065 if ( /* If at end of pattern, it's an operator. */
2067 /* If context independent, it's an operator. */
2068 || syntax & RE_CONTEXT_INDEP_ANCHORS
2069 /* Otherwise, depends on what's next. */
2070 || at_endline_loc_p (p, pend, syntax))
2080 if ((syntax & RE_BK_PLUS_QM)
2081 || (syntax & RE_LIMITED_OPS))
2085 /* If there is no previous pattern... */
2088 if (syntax & RE_CONTEXT_INVALID_OPS)
2089 FREE_STACK_RETURN (REG_BADRPT);
2090 else if (!(syntax & RE_CONTEXT_INDEP_OPS))
2095 /* true means zero/many matches are allowed. */
2096 re_bool zero_times_ok = c != '+';
2097 re_bool many_times_ok = c != '?';
2099 /* true means match shortest string possible. */
2100 re_bool minimal = false;
2102 /* If there is a sequence of repetition chars, collapse it
2103 down to just one (the right one). We can't combine
2104 interval operators with these because of, e.g., `a{2}*',
2105 which should only match an even number of `a's. */
2110 if (c == '*' || (!(syntax & RE_BK_PLUS_QM)
2111 && (c == '+' || c == '?')))
2114 else if (syntax & RE_BK_PLUS_QM && c == '\\')
2116 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
2119 if (!(c1 == '+' || c1 == '?'))
2134 /* If we get here, we found another repeat character. */
2135 if (!(syntax & RE_NO_MINIMAL_MATCHING))
2137 /* "*?" and "+?" and "??" are okay (and mean match
2138 minimally), but other sequences (such as "*??" and
2139 "+++") are rejected (reserved for future use). */
2140 if (minimal || c != '?')
2141 FREE_STACK_RETURN (REG_BADRPT);
2146 zero_times_ok |= c != '+';
2147 many_times_ok |= c != '?';
2151 /* Star, etc. applied to an empty pattern is equivalent
2152 to an empty pattern. */
2156 /* Now we know whether zero matches is allowed
2157 and whether two or more matches is allowed
2158 and whether we want minimal or maximal matching. */
2164 0: /on_failure_jump to 6
2169 GET_BUFFER_SPACE (6);
2170 INSERT_JUMP (jump, laststart, buf_end + 3);
2172 INSERT_JUMP (on_failure_jump, laststart, laststart + 6);
2175 else if (zero_times_ok)
2180 6: /on_failure_jump to 3
2183 GET_BUFFER_SPACE (6);
2184 INSERT_JUMP (jump, laststart, buf_end + 3);
2186 STORE_JUMP (on_failure_jump, buf_end, laststart + 3);
2193 3: /on_failure_jump to 0
2196 GET_BUFFER_SPACE (3);
2197 STORE_JUMP (on_failure_jump, buf_end, laststart);
2203 /* Are we optimizing this jump? */
2204 re_bool keep_string_p = false;
2207 { /* More than one repetition is allowed, so put in
2208 at the end a backward relative jump from
2209 `buf_end' to before the next jump we're going
2210 to put in below (which jumps from laststart to
2213 But if we are at the `*' in the exact sequence `.*\n',
2214 insert an unconditional jump backwards to the .,
2215 instead of the beginning of the loop. This way we only
2216 push a failure point once, instead of every time
2217 through the loop. */
2218 assert (p - 1 > pattern);
2220 /* Allocate the space for the jump. */
2221 GET_BUFFER_SPACE (3);
2223 /* We know we are not at the first character of the
2224 pattern, because laststart was nonzero. And we've
2225 already incremented `p', by the way, to be the
2226 character after the `*'. Do we have to do something
2227 analogous here for null bytes, because of
2231 && p < pend && *p == '\n'
2232 && !(syntax & RE_DOT_NEWLINE))
2233 { /* We have .*\n. */
2234 STORE_JUMP (jump, buf_end, laststart);
2235 keep_string_p = true;
2238 /* Anything else. */
2239 STORE_JUMP (maybe_pop_jump, buf_end, laststart - 3);
2241 /* We've added more stuff to the buffer. */
2245 /* On failure, jump from laststart to buf_end + 3,
2246 which will be the end of the buffer after this jump
2248 GET_BUFFER_SPACE (3);
2249 INSERT_JUMP (keep_string_p ? on_failure_keep_string_jump
2251 laststart, buf_end + 3);
2256 /* At least one repetition is required, so insert a
2257 `dummy_failure_jump' before the initial
2258 `on_failure_jump' instruction of the loop. This
2259 effects a skip over that instruction the first time
2260 we hit that loop. */
2261 GET_BUFFER_SPACE (3);
2262 INSERT_JUMP (dummy_failure_jump, laststart, laststart + 6);
2272 laststart = buf_end;
2279 /* XEmacs change: this whole section */
2280 re_bool had_char_class = false;
2282 re_bool has_extended_chars = false;
2283 REGISTER Lisp_Object rtab = Qnil;
2286 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2288 /* Ensure that we have enough space to push a charset: the
2289 opcode, the length count, and the bitset; 34 bytes in all. */
2290 GET_BUFFER_SPACE (34);
2292 laststart = buf_end;
2294 /* We test `*p == '^' twice, instead of using an if
2295 statement, so we only need one BUF_PUSH. */
2296 BUF_PUSH (*p == '^' ? charset_not : charset);
2300 /* Remember the first position in the bracket expression. */
2303 /* Push the number of bytes in the bitmap. */
2304 BUF_PUSH ((1 << BYTEWIDTH) / BYTEWIDTH);
2306 /* Clear the whole map. */
2307 memset (buf_end, 0, (1 << BYTEWIDTH) / BYTEWIDTH);
2309 /* charset_not matches newline according to a syntax bit. */
2310 if ((re_opcode_t) buf_end[-2] == charset_not
2311 && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
2312 SET_LIST_BIT ('\n');
2315 start_over_with_extended:
2316 if (has_extended_chars)
2318 /* There are extended chars here, which means we need to start
2319 over and shift to unified range-table format. */
2320 if (buf_end[-2] == charset)
2321 buf_end[-2] = charset_mule;
2323 buf_end[-2] = charset_mule_not;
2325 p = p1; /* go back to the beginning of the charset, after
2327 rtab = Vthe_lisp_rangetab;
2328 Fclear_range_table (rtab);
2330 /* charset_not matches newline according to a syntax bit. */
2331 if ((re_opcode_t) buf_end[-1] == charset_mule_not
2332 && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
2333 SET_EITHER_BIT ('\n');
2337 /* Read in characters and ranges, setting map bits. */
2340 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2345 if (c >= 0x80 && !has_extended_chars)
2347 has_extended_chars = 1;
2348 /* Frumble-bumble, we've found some extended chars.
2349 Need to start over, process everything using
2350 the general extended-char mechanism, and need
2351 to use charset_mule and charset_mule_not instead
2352 of charset and charset_not. */
2353 goto start_over_with_extended;
2356 /* \ might escape characters inside [...] and [^...]. */
2357 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\')
2359 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
2363 if (c1 >= 0x80 && !has_extended_chars)
2365 has_extended_chars = 1;
2366 goto start_over_with_extended;
2369 SET_EITHER_BIT (c1);
2373 /* Could be the end of the bracket expression. If it's
2374 not (i.e., when the bracket expression is `[]' so
2375 far), the ']' character bit gets set way below. */
2376 if (c == ']' && p != p1 + 1)
2379 /* Look ahead to see if it's a range when the last thing
2380 was a character class. */
2381 if (had_char_class && c == '-' && *p != ']')
2382 FREE_STACK_RETURN (REG_ERANGE);
2384 /* Look ahead to see if it's a range when the last thing
2385 was a character: if this is a hyphen not at the
2386 beginning or the end of a list, then it's the range
2389 && !(p - 2 >= pattern && p[-2] == '[')
2390 && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^')
2396 if (* (unsigned char *) p >= 0x80 && !has_extended_chars)
2398 has_extended_chars = 1;
2399 goto start_over_with_extended;
2401 if (has_extended_chars)
2402 ret = compile_extended_range (&p, pend, translate,
2406 ret = compile_range (&p, pend, translate, syntax, buf_end);
2407 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret);
2410 else if (p[0] == '-' && p[1] != ']')
2411 { /* This handles ranges made up of characters only. */
2414 /* Move past the `-'. */
2418 if (* (unsigned char *) p >= 0x80 && !has_extended_chars)
2420 has_extended_chars = 1;
2421 goto start_over_with_extended;
2423 if (has_extended_chars)
2424 ret = compile_extended_range (&p, pend, translate,
2428 ret = compile_range (&p, pend, translate, syntax, buf_end);
2429 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret);
2432 /* See if we're at the beginning of a possible character
2435 else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == ':')
2436 { /* Leave room for the null. */
2437 char str[CHAR_CLASS_MAX_LENGTH + 1];
2442 /* If pattern is `[[:'. */
2443 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2447 /* #### This code is unused.
2448 Correctness is not checked after TRT
2451 if (c == ':' || c == ']' || p == pend
2452 || c1 == CHAR_CLASS_MAX_LENGTH)
2454 str[c1++] = (char) c;
2458 /* If isn't a word bracketed by `[:' and `:]':
2459 undo the ending character, the letters, and leave
2460 the leading `:' and `[' (but set bits for them). */
2461 if (c == ':' && *p == ']')
2464 re_bool is_alnum = STREQ (str, "alnum");
2465 re_bool is_alpha = STREQ (str, "alpha");
2466 re_bool is_blank = STREQ (str, "blank");
2467 re_bool is_cntrl = STREQ (str, "cntrl");
2468 re_bool is_digit = STREQ (str, "digit");
2469 re_bool is_graph = STREQ (str, "graph");
2470 re_bool is_lower = STREQ (str, "lower");
2471 re_bool is_print = STREQ (str, "print");
2472 re_bool is_punct = STREQ (str, "punct");
2473 re_bool is_space = STREQ (str, "space");
2474 re_bool is_upper = STREQ (str, "upper");
2475 re_bool is_xdigit = STREQ (str, "xdigit");
2477 if (!IS_CHAR_CLASS (str))
2478 FREE_STACK_RETURN (REG_ECTYPE);
2480 /* Throw away the ] at the end of the character
2484 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2486 for (ch = 0; ch < 1 << BYTEWIDTH; ch++)
2488 /* This was split into 3 if's to
2489 avoid an arbitrary limit in some compiler. */
2490 if ( (is_alnum && ISALNUM (ch))
2491 || (is_alpha && ISALPHA (ch))
2492 || (is_blank && ISBLANK (ch))
2493 || (is_cntrl && ISCNTRL (ch)))
2494 SET_EITHER_BIT (ch);
2495 if ( (is_digit && ISDIGIT (ch))
2496 || (is_graph && ISGRAPH (ch))
2497 || (is_lower && ISLOWER (ch))
2498 || (is_print && ISPRINT (ch)))
2499 SET_EITHER_BIT (ch);
2500 if ( (is_punct && ISPUNCT (ch))
2501 || (is_space && ISSPACE (ch))
2502 || (is_upper && ISUPPER (ch))
2503 || (is_xdigit && ISXDIGIT (ch)))
2504 SET_EITHER_BIT (ch);
2506 had_char_class = true;
2513 SET_EITHER_BIT ('[');
2514 SET_EITHER_BIT (':');
2515 had_char_class = false;
2520 had_char_class = false;
2526 if (has_extended_chars)
2528 /* We have a range table, not a bit vector. */
2530 unified_range_table_bytes_needed (rtab);
2531 GET_BUFFER_SPACE (bytes_needed);
2532 unified_range_table_copy_data (rtab, buf_end);
2533 buf_end += unified_range_table_bytes_used (buf_end);
2537 /* Discard any (non)matching list bytes that are all 0 at the
2538 end of the map. Decrease the map-length byte too. */
2539 while ((int) buf_end[-1] > 0 && buf_end[buf_end[-1] - 1] == 0)
2541 buf_end += buf_end[-1];
2547 if (syntax & RE_NO_BK_PARENS)
2554 if (syntax & RE_NO_BK_PARENS)
2561 if (syntax & RE_NEWLINE_ALT)
2568 if (syntax & RE_NO_BK_VBAR)
2575 if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES)
2576 goto handle_interval;
2582 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
2584 /* Do not translate the character after the \, so that we can
2585 distinguish, e.g., \B from \b, even if we normally would
2586 translate, e.g., B to b. */
2592 if (syntax & RE_NO_BK_PARENS)
2593 goto normal_backslash;
2600 if (!(syntax & RE_NO_SHY_GROUPS)
2608 case ':': /* shy groups */
2612 /* All others are reserved for future constructs. */
2614 FREE_STACK_RETURN (REG_BADPAT);
2621 /* Record the translation from capturing group index to
2622 register number, reallocating table as needed. */
2625 while (bufp->external_to_internal_register_size <=
2630 bufp->external_to_internal_register_size;
2631 bufp->external_to_internal_register_size += 5;
2632 RETALLOC (bufp->external_to_internal_register,
2633 bufp->external_to_internal_register_size,
2637 i < bufp->external_to_internal_register_size; i++)
2638 bufp->external_to_internal_register[i] =
2642 bufp->external_to_internal_register[bufp->re_nsub] =
2646 if (COMPILE_STACK_FULL)
2648 RETALLOC (compile_stack.stack, compile_stack.size << 1,
2649 compile_stack_elt_t);
2650 if (compile_stack.stack == NULL) return REG_ESPACE;
2652 compile_stack.size <<= 1;
2655 /* These are the values to restore when we hit end of this
2656 group. They are all relative offsets, so that if the
2657 whole pattern moves because of realloc, they will still
2659 COMPILE_STACK_TOP.begalt_offset = begalt - bufp->buffer;
2660 COMPILE_STACK_TOP.fixup_alt_jump
2661 = fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0;
2662 COMPILE_STACK_TOP.laststart_offset = buf_end - bufp->buffer;
2663 COMPILE_STACK_TOP.regnum = r;
2665 /* We will eventually replace the 0 with the number of
2666 groups inner to this one. But do not push a
2667 start_memory for groups beyond the last one we can
2668 represent in the compiled pattern.
2669 #### bad bad bad. this will fail in lots of ways, if we
2670 ever have to backtrack for these groups.
2672 if (r <= MAX_REGNUM)
2674 COMPILE_STACK_TOP.inner_group_offset
2675 = buf_end - bufp->buffer + 2;
2676 BUF_PUSH_3 (start_memory, r, 0);
2679 compile_stack.avail++;
2684 /* If we've reached MAX_REGNUM groups, then this open
2685 won't actually generate any code, so we'll have to
2686 clear pending_exact explicitly. */
2693 if (syntax & RE_NO_BK_PARENS) goto normal_backslash;
2695 if (COMPILE_STACK_EMPTY) {
2696 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
2697 goto normal_backslash;
2699 FREE_STACK_RETURN (REG_ERPAREN);
2704 { /* Push a dummy failure point at the end of the
2705 alternative for a possible future
2706 `pop_failure_jump' to pop. See comments at
2707 `push_dummy_failure' in `re_match_2'. */
2708 BUF_PUSH (push_dummy_failure);
2710 /* We allocated space for this jump when we assigned
2711 to `fixup_alt_jump', in the `handle_alt' case below. */
2712 STORE_JUMP (jump_past_alt, fixup_alt_jump, buf_end - 1);
2715 /* See similar code for backslashed left paren above. */
2716 if (COMPILE_STACK_EMPTY) {
2717 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
2720 FREE_STACK_RETURN (REG_ERPAREN);
2723 /* Since we just checked for an empty stack above, this
2724 ``can't happen''. */
2725 assert (compile_stack.avail != 0);
2727 /* We don't just want to restore into `regnum', because
2728 later groups should continue to be numbered higher,
2729 as in `(ab)c(de)' -- the second group is #2. */
2730 regnum_t this_group_regnum;
2732 compile_stack.avail--;
2733 begalt = bufp->buffer + COMPILE_STACK_TOP.begalt_offset;
2735 = COMPILE_STACK_TOP.fixup_alt_jump
2736 ? bufp->buffer + COMPILE_STACK_TOP.fixup_alt_jump - 1
2738 laststart = bufp->buffer + COMPILE_STACK_TOP.laststart_offset;
2739 this_group_regnum = COMPILE_STACK_TOP.regnum;
2740 /* If we've reached MAX_REGNUM groups, then this open
2741 won't actually generate any code, so we'll have to
2742 clear pending_exact explicitly. */
2745 /* We're at the end of the group, so now we know how many
2746 groups were inside this one. */
2747 if (this_group_regnum <= MAX_REGNUM)
2749 unsigned char *inner_group_loc
2750 = bufp->buffer + COMPILE_STACK_TOP.inner_group_offset;
2752 *inner_group_loc = regnum - this_group_regnum;
2753 BUF_PUSH_3 (stop_memory, this_group_regnum,
2754 regnum - this_group_regnum);
2760 case '|': /* `\|'. */
2761 if (syntax & RE_LIMITED_OPS || syntax & RE_NO_BK_VBAR)
2762 goto normal_backslash;
2764 if (syntax & RE_LIMITED_OPS)
2767 /* Insert before the previous alternative a jump which
2768 jumps to this alternative if the former fails. */
2769 GET_BUFFER_SPACE (3);
2770 INSERT_JUMP (on_failure_jump, begalt, buf_end + 6);
2774 /* The alternative before this one has a jump after it
2775 which gets executed if it gets matched. Adjust that
2776 jump so it will jump to this alternative's analogous
2777 jump (put in below, which in turn will jump to the next
2778 (if any) alternative's such jump, etc.). The last such
2779 jump jumps to the correct final destination. A picture:
2785 If we are at `b', then fixup_alt_jump right now points to a
2786 three-byte space after `a'. We'll put in the jump, set
2787 fixup_alt_jump to right after `b', and leave behind three
2788 bytes which we'll fill in when we get to after `c'. */
2791 STORE_JUMP (jump_past_alt, fixup_alt_jump, buf_end);
2793 /* Mark and leave space for a jump after this alternative,
2794 to be filled in later either by next alternative or
2795 when know we're at the end of a series of alternatives. */
2796 fixup_alt_jump = buf_end;
2797 GET_BUFFER_SPACE (3);
2806 /* If \{ is a literal. */
2807 if (!(syntax & RE_INTERVALS)
2808 /* If we're at `\{' and it's not the open-interval
2810 || ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
2811 || (p - 2 == pattern && p == pend))
2812 goto normal_backslash;
2816 /* If got here, then the syntax allows intervals. */
2818 /* At least (most) this many matches must be made. */
2819 int lower_bound = -1, upper_bound = -1;
2821 beg_interval = p - 1;
2825 if (syntax & RE_NO_BK_BRACES)
2826 goto unfetch_interval;
2828 FREE_STACK_RETURN (REG_EBRACE);
2831 GET_UNSIGNED_NUMBER (lower_bound);
2835 GET_UNSIGNED_NUMBER (upper_bound);
2836 if (upper_bound < 0) upper_bound = RE_DUP_MAX;
2839 /* Interval such as `{1}' => match exactly once. */
2840 upper_bound = lower_bound;
2842 if (lower_bound < 0 || upper_bound > RE_DUP_MAX
2843 || lower_bound > upper_bound)
2845 if (syntax & RE_NO_BK_BRACES)
2846 goto unfetch_interval;
2848 FREE_STACK_RETURN (REG_BADBR);
2851 if (!(syntax & RE_NO_BK_BRACES))
2853 if (c != '\\') FREE_STACK_RETURN (REG_EBRACE);
2860 if (syntax & RE_NO_BK_BRACES)
2861 goto unfetch_interval;
2863 FREE_STACK_RETURN (REG_BADBR);
2866 /* We just parsed a valid interval. */
2868 /* If it's invalid to have no preceding re. */
2871 if (syntax & RE_CONTEXT_INVALID_OPS)
2872 FREE_STACK_RETURN (REG_BADRPT);
2873 else if (syntax & RE_CONTEXT_INDEP_OPS)
2874 laststart = buf_end;
2876 goto unfetch_interval;
2879 /* If the upper bound is zero, don't want to succeed at
2880 all; jump from `laststart' to `b + 3', which will be
2881 the end of the buffer after we insert the jump. */
2882 if (upper_bound == 0)
2884 GET_BUFFER_SPACE (3);
2885 INSERT_JUMP (jump, laststart, buf_end + 3);
2889 /* Otherwise, we have a nontrivial interval. When
2890 we're all done, the pattern will look like:
2891 set_number_at <jump count> <upper bound>
2892 set_number_at <succeed_n count> <lower bound>
2893 succeed_n <after jump addr> <succeed_n count>
2895 jump_n <succeed_n addr> <jump count>
2896 (The upper bound and `jump_n' are omitted if
2897 `upper_bound' is 1, though.) */
2899 { /* If the upper bound is > 1, we need to insert
2900 more at the end of the loop. */
2901 Memory_count nbytes = 10 + (upper_bound > 1) * 10;
2903 GET_BUFFER_SPACE (nbytes);
2905 /* Initialize lower bound of the `succeed_n', even
2906 though it will be set during matching by its
2907 attendant `set_number_at' (inserted next),
2908 because `re_compile_fastmap' needs to know.
2909 Jump to the `jump_n' we might insert below. */
2910 INSERT_JUMP2 (succeed_n, laststart,
2911 buf_end + 5 + (upper_bound > 1) * 5,
2915 /* Code to initialize the lower bound. Insert
2916 before the `succeed_n'. The `5' is the last two
2917 bytes of this `set_number_at', plus 3 bytes of
2918 the following `succeed_n'. */
2919 insert_op2 (set_number_at, laststart, 5, lower_bound, buf_end);
2922 if (upper_bound > 1)
2923 { /* More than one repetition is allowed, so
2924 append a backward jump to the `succeed_n'
2925 that starts this interval.
2927 When we've reached this during matching,
2928 we'll have matched the interval once, so
2929 jump back only `upper_bound - 1' times. */
2930 STORE_JUMP2 (jump_n, buf_end, laststart + 5,
2934 /* The location we want to set is the second
2935 parameter of the `jump_n'; that is `b-2' as
2936 an absolute address. `laststart' will be
2937 the `set_number_at' we're about to insert;
2938 `laststart+3' the number to set, the source
2939 for the relative address. But we are
2940 inserting into the middle of the pattern --
2941 so everything is getting moved up by 5.
2942 Conclusion: (b - 2) - (laststart + 3) + 5,
2943 i.e., b - laststart.
2945 We insert this at the beginning of the loop
2946 so that if we fail during matching, we'll
2947 reinitialize the bounds. */
2948 insert_op2 (set_number_at, laststart,
2949 buf_end - laststart,
2950 upper_bound - 1, buf_end);
2955 beg_interval = NULL;
2960 /* If an invalid interval, match the characters as literals. */
2961 assert (beg_interval);
2963 beg_interval = NULL;
2965 /* normal_char and normal_backslash need `c'. */
2968 if (!(syntax & RE_NO_BK_BRACES))
2970 if (p > pattern && p[-1] == '\\')
2971 goto normal_backslash;
2976 /* There is no way to specify the before_dot and after_dot
2977 operators. rms says this is ok. --karl */
2983 laststart = buf_end;
2985 /* XEmacs addition */
2986 if (c >= 0x80 || syntax_spec_code[c] == 0377)
2987 FREE_STACK_RETURN (REG_ESYNTAX);
2988 BUF_PUSH_2 (syntaxspec, syntax_spec_code[c]);
2992 laststart = buf_end;
2994 /* XEmacs addition */
2995 if (c >= 0x80 || syntax_spec_code[c] == 0377)
2996 FREE_STACK_RETURN (REG_ESYNTAX);
2997 BUF_PUSH_2 (notsyntaxspec, syntax_spec_code[c]);
3001 /* 97.2.17 jhod merged in to XEmacs from mule-2.3 */
3003 laststart = buf_end;
3005 if (c < 32 || c > 127)
3006 FREE_STACK_RETURN (REG_ECATEGORY);
3007 BUF_PUSH_2 (categoryspec, c);
3011 laststart = buf_end;
3013 if (c < 32 || c > 127)
3014 FREE_STACK_RETURN (REG_ECATEGORY);
3015 BUF_PUSH_2 (notcategoryspec, c);
3017 /* end of category patch */
3023 laststart = buf_end;
3024 BUF_PUSH (wordchar);
3029 laststart = buf_end;
3030 BUF_PUSH (notwordchar);
3043 BUF_PUSH (wordbound);
3047 BUF_PUSH (notwordbound);
3058 case '1': case '2': case '3': case '4': case '5':
3059 case '6': case '7': case '8': case '9':
3063 if (syntax & RE_NO_BK_REFS)
3066 /* External register indexing. */
3069 if (reg > bufp->re_nsub)
3070 FREE_STACK_RETURN (REG_ESUBREG);
3072 /* Convert external to internal as soon as possible. */
3073 reg = bufp->external_to_internal_register[reg];
3075 /* Can't back reference to a subexpression if inside it. */
3076 if (group_in_compile_stack (compile_stack, reg))
3079 laststart = buf_end;
3080 BUF_PUSH_2 (duplicate, reg);
3087 if (syntax & RE_BK_PLUS_QM)
3090 goto normal_backslash;
3094 /* You might think it would be useful for \ to mean
3095 not to translate; but if we don't translate it,
3096 it will never match anything. */
3104 /* Expects the character in `c'. */
3105 /* `p' points to the location after where `c' came from. */
3108 /* XEmacs: modifications here for Mule. */
3109 /* `q' points to the beginning of the next char. */
3112 /* If no exactn currently being built. */
3115 /* If last exactn not at current position. */
3116 || pending_exact + *pending_exact + 1 != buf_end
3118 /* We have only one byte following the exactn for the count. */
3119 || ((unsigned int) (*pending_exact + (q - p)) >=
3120 ((unsigned int) (1 << BYTEWIDTH) - 1))
3122 /* If followed by a repetition operator. */
3123 || *q == '*' || *q == '^'
3124 || ((syntax & RE_BK_PLUS_QM)
3125 ? *q == '\\' && (q[1] == '+' || q[1] == '?')
3126 : (*q == '+' || *q == '?'))
3127 || ((syntax & RE_INTERVALS)
3128 && ((syntax & RE_NO_BK_BRACES)
3130 : (q[0] == '\\' && q[1] == '{'))))
3132 /* Start building a new exactn. */
3134 laststart = buf_end;
3136 BUF_PUSH_2 (exactn, 0);
3137 pending_exact = buf_end - 1;
3146 Bufbyte tmp_buf[MAX_EMCHAR_LEN];
3149 bt_count = set_charptr_emchar (tmp_buf, c);
3151 for (i = 0; i < bt_count; i++)
3153 BUF_PUSH (tmp_buf[i]);
3161 } /* while p != pend */
3164 /* Through the pattern now. */
3167 STORE_JUMP (jump_past_alt, fixup_alt_jump, buf_end);
3169 if (!COMPILE_STACK_EMPTY)
3170 FREE_STACK_RETURN (REG_EPAREN);
3172 /* If we don't want backtracking, force success
3173 the first time we reach the end of the compiled pattern. */
3174 if (syntax & RE_NO_POSIX_BACKTRACKING)
3177 free (compile_stack.stack);
3179 /* We have succeeded; set the length of the buffer. */
3180 bufp->used = buf_end - bufp->buffer;
3185 DEBUG_PRINT1 ("\nCompiled pattern: \n");
3186 print_compiled_pattern (bufp);
3190 #ifndef MATCH_MAY_ALLOCATE
3191 /* Initialize the failure stack to the largest possible stack. This
3192 isn't necessary unless we're trying to avoid calling alloca in
3193 the search and match routines. */
3195 int num_regs = bufp->re_ngroups + 1;
3197 /* Since DOUBLE_FAIL_STACK refuses to double only if the current size
3198 is strictly greater than re_max_failures, the largest possible stack
3199 is 2 * re_max_failures failure points. */
3200 if (fail_stack.size < (2 * re_max_failures * MAX_FAILURE_ITEMS))
3202 fail_stack.size = (2 * re_max_failures * MAX_FAILURE_ITEMS);
3205 if (! fail_stack.stack)
3207 = (fail_stack_elt_t *) xmalloc (fail_stack.size
3208 * sizeof (fail_stack_elt_t));
3211 = (fail_stack_elt_t *) xrealloc (fail_stack.stack,
3213 * sizeof (fail_stack_elt_t)));
3214 #else /* not emacs */
3215 if (! fail_stack.stack)
3217 = (fail_stack_elt_t *) malloc (fail_stack.size
3218 * sizeof (fail_stack_elt_t));
3221 = (fail_stack_elt_t *) realloc (fail_stack.stack,
3223 * sizeof (fail_stack_elt_t)));
3227 regex_grow_registers (num_regs);
3229 #endif /* not MATCH_MAY_ALLOCATE */
3232 } /* regex_compile */
3234 /* Subroutines for `regex_compile'. */
3236 /* Store OP at LOC followed by two-byte integer parameter ARG. */
3239 store_op1 (re_opcode_t op, unsigned char *loc, int arg)
3241 *loc = (unsigned char) op;
3242 STORE_NUMBER (loc + 1, arg);
3246 /* Like `store_op1', but for two two-byte parameters ARG1 and ARG2. */
3249 store_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2)
3251 *loc = (unsigned char) op;
3252 STORE_NUMBER (loc + 1, arg1);
3253 STORE_NUMBER (loc + 3, arg2);
3257 /* Copy the bytes from LOC to END to open up three bytes of space at LOC
3258 for OP followed by two-byte integer parameter ARG. */
3261 insert_op1 (re_opcode_t op, unsigned char *loc, int arg, unsigned char *end)
3263 REGISTER unsigned char *pfrom = end;
3264 REGISTER unsigned char *pto = end + 3;
3266 while (pfrom != loc)
3269 store_op1 (op, loc, arg);
3273 /* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2. */
3276 insert_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2,
3279 REGISTER unsigned char *pfrom = end;
3280 REGISTER unsigned char *pto = end + 5;
3282 while (pfrom != loc)
3285 store_op2 (op, loc, arg1, arg2);
3289 /* P points to just after a ^ in PATTERN. Return true if that ^ comes
3290 after an alternative or a begin-subexpression. We assume there is at
3291 least one character before the ^. */
3294 at_begline_loc_p (re_char *pattern, re_char *p, reg_syntax_t syntax)
3296 re_char *prev = p - 2;
3297 re_bool prev_prev_backslash = prev > pattern && prev[-1] == '\\';
3300 /* After a subexpression? */
3301 (*prev == '(' && (syntax & RE_NO_BK_PARENS || prev_prev_backslash))
3302 /* After an alternative? */
3303 || (*prev == '|' && (syntax & RE_NO_BK_VBAR || prev_prev_backslash));
3307 /* The dual of at_begline_loc_p. This one is for $. We assume there is
3308 at least one character after the $, i.e., `P < PEND'. */
3311 at_endline_loc_p (re_char *p, re_char *pend, int syntax)
3314 re_bool next_backslash = *next == '\\';
3315 re_char *next_next = p + 1 < pend ? p + 1 : 0;
3318 /* Before a subexpression? */
3319 (syntax & RE_NO_BK_PARENS ? *next == ')'
3320 : next_backslash && next_next && *next_next == ')')
3321 /* Before an alternative? */
3322 || (syntax & RE_NO_BK_VBAR ? *next == '|'
3323 : next_backslash && next_next && *next_next == '|');
3327 /* Returns true if REGNUM is in one of COMPILE_STACK's elements and
3328 false if it's not. */
3331 group_in_compile_stack (compile_stack_type compile_stack, regnum_t regnum)
3335 for (this_element = compile_stack.avail - 1;
3338 if (compile_stack.stack[this_element].regnum == regnum)
3345 /* Read the ending character of a range (in a bracket expression) from the
3346 uncompiled pattern *P_PTR (which ends at PEND). We assume the
3347 starting character is in `P[-2]'. (`P[-1]' is the character `-'.)
3348 Then we set the translation of all bits between the starting and
3349 ending characters (inclusive) in the compiled pattern B.
3351 Return an error code.
3353 We use these short variable names so we can use the same macros as
3354 `regex_compile' itself. */
3356 static reg_errcode_t
3357 compile_range (re_char **p_ptr, re_char *pend, RE_TRANSLATE_TYPE translate,
3358 reg_syntax_t syntax, unsigned char *buf_end)
3360 Element_count this_char;
3362 re_char *p = *p_ptr;
3363 int range_start, range_end;
3368 /* Even though the pattern is a signed `char *', we need to fetch
3369 with unsigned char *'s; if the high bit of the pattern character
3370 is set, the range endpoints will be negative if we fetch using a
3373 We also want to fetch the endpoints without translating them; the
3374 appropriate translation is done in the bit-setting loop below. */
3375 /* The SVR4 compiler on the 3B2 had trouble with unsigned const char *. */
3376 range_start = ((const unsigned char *) p)[-2];
3377 range_end = ((const unsigned char *) p)[0];
3379 /* Have to increment the pointer into the pattern string, so the
3380 caller isn't still at the ending character. */
3383 /* If the start is after the end, the range is empty. */
3384 if (range_start > range_end)
3385 return syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR;
3387 /* Here we see why `this_char' has to be larger than an `unsigned
3388 char' -- the range is inclusive, so if `range_end' == 0xff
3389 (assuming 8-bit characters), we would otherwise go into an infinite
3390 loop, since all characters <= 0xff. */
3391 for (this_char = range_start; this_char <= range_end; this_char++)
3393 SET_LIST_BIT (TRANSLATE (this_char));
3401 static reg_errcode_t
3402 compile_extended_range (re_char **p_ptr, re_char *pend,
3403 RE_TRANSLATE_TYPE translate,
3404 reg_syntax_t syntax, Lisp_Object rtab)
3406 Emchar this_char, range_start, range_end;
3412 p = (const Bufbyte *) *p_ptr;
3413 range_end = charptr_emchar (p);
3414 p--; /* back to '-' */
3415 DEC_CHARPTR (p); /* back to start of range */
3416 /* We also want to fetch the endpoints without translating them; the
3417 appropriate translation is done in the bit-setting loop below. */
3418 range_start = charptr_emchar (p);
3419 INC_CHARPTR (*p_ptr);
3421 /* If the start is after the end, the range is empty. */
3422 if (range_start > range_end)
3423 return syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR;
3425 /* Can't have ranges spanning different charsets, except maybe for
3426 ranges entirely within the first 256 chars. */
3428 if ((range_start >= 0x100 || range_end >= 0x100)
3429 && CHAR_LEADING_BYTE (range_start) !=
3430 CHAR_LEADING_BYTE (range_end))
3431 return REG_ERANGESPAN;
3433 /* As advertised, translations only work over the 0 - 0x7F range.
3434 Making this kind of stuff work generally is much harder.
3435 Iterating over the whole range like this would be way efficient
3436 if the range encompasses 10,000 chars or something. You'd have
3437 to do something like this:
3441 map over translation table in [range_start, range_end] of
3442 (put the mapped range in a;
3443 put the translation in b)
3444 invert the range in a and truncate to [range_start, range_end]
3445 compute the union of a, b
3446 union the result into rtab
3448 for (this_char = range_start;
3449 this_char <= range_end && this_char < 0x80; this_char++)
3451 SET_RANGETAB_BIT (TRANSLATE (this_char));
3454 if (this_char <= range_end)
3455 put_range_table (rtab, this_char, range_end, Qt);
3462 /* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in
3463 BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible
3464 characters can start a string that matches the pattern. This fastmap
3465 is used by re_search to skip quickly over impossible starting points.
3467 The caller must supply the address of a (1 << BYTEWIDTH)-byte data
3468 area as BUFP->fastmap.
3470 We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in
3473 Returns 0 if we succeed, -2 if an internal error. */
3476 re_compile_fastmap (struct re_pattern_buffer *bufp)
3479 #ifdef MATCH_MAY_ALLOCATE
3480 fail_stack_type fail_stack;
3482 DECLARE_DESTINATION;
3483 /* We don't push any register information onto the failure stack. */
3485 REGISTER char *fastmap = bufp->fastmap;
3486 unsigned char *pattern = bufp->buffer;
3487 unsigned long size = bufp->used;
3488 unsigned char *p = pattern;
3489 REGISTER unsigned char *pend = pattern + size;
3492 /* This holds the pointer to the failure stack, when
3493 it is allocated relocatably. */
3494 fail_stack_elt_t *failure_stack_ptr;
3497 /* Assume that each path through the pattern can be null until
3498 proven otherwise. We set this false at the bottom of switch
3499 statement, to which we get only if a particular path doesn't
3500 match the empty string. */
3501 re_bool path_can_be_null = true;
3503 /* We aren't doing a `succeed_n' to begin with. */
3504 re_bool succeed_n_p = false;
3506 assert (fastmap != NULL && p != NULL);
3509 memset (fastmap, 0, 1 << BYTEWIDTH); /* Assume nothing's valid. */
3510 bufp->fastmap_accurate = 1; /* It will be when we're done. */
3511 bufp->can_be_null = 0;
3515 if (p == pend || *p == succeed)
3517 /* We have reached the (effective) end of pattern. */
3518 if (!FAIL_STACK_EMPTY ())
3520 bufp->can_be_null |= path_can_be_null;
3522 /* Reset for next path. */
3523 path_can_be_null = true;
3525 p = (unsigned char *) fail_stack.stack[--fail_stack.avail].pointer;
3533 /* We should never be about to go beyond the end of the pattern. */
3536 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
3539 /* I guess the idea here is to simply not bother with a fastmap
3540 if a backreference is used, since it's too hard to figure out
3541 the fastmap for the corresponding group. Setting
3542 `can_be_null' stops `re_search_2' from using the fastmap, so
3543 that is all we do. */
3545 bufp->can_be_null = 1;
3549 /* Following are the cases which match a character. These end
3558 /* XEmacs: Under Mule, these bit vectors will
3559 only contain values for characters below 0x80. */
3560 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
3561 if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))
3567 /* Chars beyond end of map must be allowed. */
3569 for (j = *p * BYTEWIDTH; j < 0x80; j++)
3571 /* And all extended characters must be allowed, too. */
3572 for (j = 0x80; j < 0xA0; j++)
3574 #else /* not MULE */
3575 for (j = *p * BYTEWIDTH; j < (1 << BYTEWIDTH); j++)
3579 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
3580 if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))))
3590 nentries = unified_range_table_nentries (p);
3591 for (i = 0; i < nentries; i++)
3593 EMACS_INT first, last;
3594 Lisp_Object dummy_val;
3596 Bufbyte strr[MAX_EMCHAR_LEN];
3598 unified_range_table_get_range (p, i, &first, &last,
3600 for (jj = first; jj <= last && jj < 0x80; jj++)
3602 /* Ranges below 0x100 can span charsets, but there
3603 are only two (Control-1 and Latin-1), and
3604 either first or last has to be in them. */
3605 set_charptr_emchar (strr, first);
3609 set_charptr_emchar (strr, last);
3616 case charset_mule_not:
3621 nentries = unified_range_table_nentries (p);
3622 for (i = 0; i < nentries; i++)
3624 EMACS_INT first, last;
3625 Lisp_Object dummy_val;
3627 int smallest_prev = 0;
3629 unified_range_table_get_range (p, i, &first, &last,
3631 for (jj = smallest_prev; jj < first && jj < 0x80; jj++)
3633 smallest_prev = last + 1;
3634 if (smallest_prev >= 0x80)
3637 /* Calculating which leading bytes are actually allowed
3638 here is rather difficult, so we just punt and allow
3640 for (i = 0x80; i < 0xA0; i++)
3652 for (j = 0; j < (1 << BYTEWIDTH); j++)
3655 (regex_emacs_buffer->mirror_syntax_table), j) == Sword)
3664 goto matchnotsyntax;
3666 for (j = 0; j < (1 << BYTEWIDTH); j++)
3669 (regex_emacs_buffer->mirror_syntax_table), j) != Sword)
3677 int fastmap_newline = fastmap['\n'];
3679 /* `.' matches anything ... */
3681 /* "anything" only includes bytes that can be the
3682 first byte of a character. */
3683 for (j = 0; j < 0xA0; j++)
3686 for (j = 0; j < (1 << BYTEWIDTH); j++)
3690 /* ... except perhaps newline. */
3691 if (!(bufp->syntax & RE_DOT_NEWLINE))
3692 fastmap['\n'] = fastmap_newline;
3694 /* Return if we have already set `can_be_null'; if we have,
3695 then the fastmap is irrelevant. Something's wrong here. */
3696 else if (bufp->can_be_null)
3699 /* Otherwise, have to check alternative paths. */
3710 /* This match depends on text properties. These end with
3711 aborting optimizations. */
3712 bufp->can_be_null = 1;
3716 #if 0 /* Removed during syntax-table properties patch -- 2000/12/07 mct */
3722 for (j = 0; j < 0x80; j++)
3725 (regex_emacs_buffer->mirror_syntax_table), j) ==
3726 (enum syntaxcode) k)
3728 for (j = 0x80; j < 0xA0; j++)
3730 if (LEADING_BYTE_PREFIX_P(j))
3731 /* too complicated to calculate this right */
3738 cset = CHARSET_BY_LEADING_BYTE (j);
3739 if (CHARSETP (cset))
3741 if (charset_syntax (regex_emacs_buffer, cset,
3743 == Sword || multi_p)
3748 #else /* not MULE */
3749 for (j = 0; j < (1 << BYTEWIDTH); j++)
3752 (regex_emacs_buffer->mirror_syntax_table), j) ==
3753 (enum syntaxcode) k)
3759 #if 0 /* Removed during syntax-table properties patch -- 2000/12/07 mct */
3765 for (j = 0; j < 0x80; j++)
3768 (regex_emacs_buffer->mirror_syntax_table), j) !=
3769 (enum syntaxcode) k)
3771 for (j = 0x80; j < 0xA0; j++)
3773 if (LEADING_BYTE_PREFIX_P(j))
3774 /* too complicated to calculate this right */
3781 cset = CHARSET_BY_LEADING_BYTE (j);
3782 if (CHARSETP (cset))
3784 if (charset_syntax (regex_emacs_buffer, cset,
3786 != Sword || multi_p)
3791 #else /* not MULE */
3792 for (j = 0; j < (1 << BYTEWIDTH); j++)
3795 (regex_emacs_buffer->mirror_syntax_table), j) !=
3796 (enum syntaxcode) k)
3803 /* 97/2/17 jhod category patch */
3805 case notcategoryspec:
3806 bufp->can_be_null = 1;
3808 /* end if category patch */
3811 /* All cases after this match the empty string. These end with
3833 case push_dummy_failure:
3838 case pop_failure_jump:
3839 case maybe_pop_jump:
3842 case dummy_failure_jump:
3843 EXTRACT_NUMBER_AND_INCR (j, p);
3848 /* Jump backward implies we just went through the body of a
3849 loop and matched nothing. Opcode jumped to should be
3850 `on_failure_jump' or `succeed_n'. Just treat it like an
3851 ordinary jump. For a * loop, it has pushed its failure
3852 point already; if so, discard that as redundant. */
3853 if ((re_opcode_t) *p != on_failure_jump
3854 && (re_opcode_t) *p != succeed_n)
3858 EXTRACT_NUMBER_AND_INCR (j, p);
3861 /* If what's on the stack is where we are now, pop it. */
3862 if (!FAIL_STACK_EMPTY ()
3863 && fail_stack.stack[fail_stack.avail - 1].pointer == p)
3869 case on_failure_jump:
3870 case on_failure_keep_string_jump:
3871 handle_on_failure_jump:
3872 EXTRACT_NUMBER_AND_INCR (j, p);
3874 /* For some patterns, e.g., `(a?)?', `p+j' here points to the
3875 end of the pattern. We don't want to push such a point,
3876 since when we restore it above, entering the switch will
3877 increment `p' past the end of the pattern. We don't need
3878 to push such a point since we obviously won't find any more
3879 fastmap entries beyond `pend'. Such a pattern can match
3880 the null string, though. */
3883 if (!PUSH_PATTERN_OP (p + j, fail_stack))
3885 RESET_FAIL_STACK ();
3890 bufp->can_be_null = 1;
3894 EXTRACT_NUMBER_AND_INCR (k, p); /* Skip the n. */
3895 succeed_n_p = false;
3902 /* Get to the number of times to succeed. */
3905 /* Increment p past the n for when k != 0. */
3906 EXTRACT_NUMBER_AND_INCR (k, p);
3910 succeed_n_p = true; /* Spaghetti code alert. */
3911 goto handle_on_failure_jump;
3928 ABORT (); /* We have listed all the cases. */
3931 /* Getting here means we have found the possible starting
3932 characters for one path of the pattern -- and that the empty
3933 string does not match. We need not follow this path further.
3934 Instead, look at the next alternative (remembered on the
3935 stack), or quit if no more. The test at the top of the loop
3936 does these things. */
3937 path_can_be_null = false;
3941 /* Set `can_be_null' for the last path (also the first path, if the
3942 pattern is empty). */
3943 bufp->can_be_null |= path_can_be_null;
3946 RESET_FAIL_STACK ();
3948 } /* re_compile_fastmap */
3950 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
3951 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
3952 this memory for recording register information. STARTS and ENDS
3953 must be allocated using the malloc library routine, and must each
3954 be at least NUM_REGS * sizeof (regoff_t) bytes long.
3956 If NUM_REGS == 0, then subsequent matches should allocate their own
3959 Unless this function is called, the first search or match using
3960 PATTERN_BUFFER will allocate its own register data, without
3961 freeing the old data. */
3964 re_set_registers (struct re_pattern_buffer *bufp, struct re_registers *regs,
3965 unsigned num_regs, regoff_t *starts, regoff_t *ends)
3969 bufp->regs_allocated = REGS_REALLOCATE;
3970 regs->num_regs = num_regs;
3971 regs->start = starts;
3976 bufp->regs_allocated = REGS_UNALLOCATED;
3978 regs->start = regs->end = (regoff_t *) 0;
3982 /* Searching routines. */
3984 /* Like re_search_2, below, but only one string is specified, and
3985 doesn't let you say where to stop matching. */
3988 re_search (struct re_pattern_buffer *bufp, const char *string, int size,
3989 int startpos, int range, struct re_registers *regs)
3991 return re_search_2 (bufp, NULL, 0, string, size, startpos, range,
3996 /* Snarfed from src/lisp.h, needed for compiling [ce]tags. */
3997 # define bytecount_to_charcount(ptr, len) (len)
3998 # define charcount_to_bytecount(ptr, len) (len)
3999 typedef int Charcount;
4002 /* Using the compiled pattern in BUFP->buffer, first tries to match the
4003 virtual concatenation of STRING1 and STRING2, starting first at index
4004 STARTPOS, then at STARTPOS + 1, and so on.
4006 With MULE, STARTPOS is a byte position, not a char position. And the
4007 search will increment STARTPOS by the width of the current leading
4010 STRING1 and STRING2 have length SIZE1 and SIZE2, respectively.
4012 RANGE is how far to scan while trying to match. RANGE = 0 means try
4013 only at STARTPOS; in general, the last start tried is STARTPOS +
4016 With MULE, RANGE is a byte position, not a char position. The last
4017 start tried is the character starting <= STARTPOS + RANGE.
4019 In REGS, return the indices of the virtual concatenation of STRING1
4020 and STRING2 that matched the entire BUFP->buffer and its contained
4023 Do not consider matching one past the index STOP in the virtual
4024 concatenation of STRING1 and STRING2.
4026 We return either the position in the strings at which the match was
4027 found, -1 if no match, or -2 if error (such as failure
4031 re_search_2 (struct re_pattern_buffer *bufp, const char *str1,
4032 int size1, const char *str2, int size2, int startpos,
4033 int range, struct re_registers *regs, int stop)
4036 re_char *string1 = (re_char *) str1;
4037 re_char *string2 = (re_char *) str2;
4038 REGISTER char *fastmap = bufp->fastmap;
4039 REGISTER RE_TRANSLATE_TYPE translate = bufp->translate;
4040 int total_size = size1 + size2;
4041 int endpos = startpos + range;
4042 #ifdef REGEX_BEGLINE_CHECK
4043 int anchored_at_begline = 0;
4048 /* Check for out-of-range STARTPOS. */
4049 if (startpos < 0 || startpos > total_size)
4052 /* Fix up RANGE if it might eventually take us outside
4053 the virtual concatenation of STRING1 and STRING2. */
4055 range = 0 - startpos;
4056 else if (endpos > total_size)
4057 range = total_size - startpos;
4059 /* If the search isn't to be a backwards one, don't waste time in a
4060 search for a pattern that must be anchored. */
4061 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == begbuf && range > 0)
4067 d = ((const unsigned char *)
4068 (startpos >= size1 ? string2 - size1 : string1) + startpos);
4069 range = charcount_to_bytecount (d, 1);
4074 /* In a forward search for something that starts with \=.
4075 don't keep searching past point. */
4076 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == at_dot && range > 0)
4078 range = BUF_PT (regex_emacs_buffer) - BUF_BEGV (regex_emacs_buffer)
4085 /* Update the fastmap now if not correct already. */
4086 if (fastmap && !bufp->fastmap_accurate)
4087 if (re_compile_fastmap (bufp) == -2)
4090 #ifdef REGEX_BEGLINE_CHECK
4092 unsigned long i = 0;
4094 while (i < bufp->used)
4096 if (bufp->buffer[i] == start_memory ||
4097 bufp->buffer[i] == stop_memory)
4102 anchored_at_begline = i < bufp->used && bufp->buffer[i] == begline;
4107 SETUP_SYNTAX_CACHE_FOR_OBJECT (regex_match_object,
4109 SYNTAX_CACHE_OBJECT_BYTE_TO_CHAR (regex_match_object,
4115 /* Loop through the string, looking for a place to start matching. */
4118 #ifdef REGEX_BEGLINE_CHECK
4119 /* If the regex is anchored at the beginning of a line (i.e. with a ^),
4120 then we can speed things up by skipping to the next beginning-of-
4122 if (anchored_at_begline && startpos > 0 && startpos != size1 &&
4125 /* whose stupid idea was it anyway to make this
4126 function take two strings to match?? */
4130 if (startpos < size1 && startpos + range >= size1)
4131 lim = range - (size1 - startpos);
4133 d = ((const unsigned char *)
4134 (startpos >= size1 ? string2 - size1 : string1) + startpos);
4135 DEC_CHARPTR(d); /* Ok, since startpos != size1. */
4136 d_size = charcount_to_bytecount (d, 1);
4138 if (TRANSLATE_P (translate))
4139 while (range > lim && *d != '\n')
4141 d += d_size; /* Speedier INC_CHARPTR(d) */
4142 d_size = charcount_to_bytecount (d, 1);
4146 while (range > lim && *d != '\n')
4148 d += d_size; /* Speedier INC_CHARPTR(d) */
4149 d_size = charcount_to_bytecount (d, 1);
4153 startpos += irange - range;
4155 #endif /* REGEX_BEGLINE_CHECK */
4157 /* If a fastmap is supplied, skip quickly over characters that
4158 cannot be the start of a match. If the pattern can match the
4159 null string, however, we don't need to skip characters; we want
4160 the first null string. */
4161 if (fastmap && startpos < total_size && !bufp->can_be_null)
4163 if (range > 0) /* Searching forwards. */
4168 if (startpos < size1 && startpos + range >= size1)
4169 lim = range - (size1 - startpos);
4171 d = ((const unsigned char *)
4172 (startpos >= size1 ? string2 - size1 : string1) + startpos);
4174 /* Written out as an if-else to avoid testing `translate'
4176 if (TRANSLATE_P (translate))
4181 Bufbyte str[MAX_EMCHAR_LEN];
4183 buf_ch = charptr_emchar (d);
4184 buf_ch = RE_TRANSLATE (buf_ch);
4185 set_charptr_emchar (str, buf_ch);
4186 if (buf_ch >= 0200 || fastmap[(unsigned char) *str])
4189 if (fastmap[(unsigned char)RE_TRANSLATE (*d)])
4192 d_size = charcount_to_bytecount (d, 1);
4194 d += d_size; /* Speedier INC_CHARPTR(d) */
4197 while (range > lim && !fastmap[*d])
4199 d_size = charcount_to_bytecount (d, 1);
4201 d += d_size; /* Speedier INC_CHARPTR(d) */
4204 startpos += irange - range;
4206 else /* Searching backwards. */
4208 Emchar c = (size1 == 0 || startpos >= size1
4209 ? charptr_emchar (string2 + startpos - size1)
4210 : charptr_emchar (string1 + startpos));
4213 if (!(c >= 0200 || fastmap[(unsigned char) c]))
4216 if (!fastmap[(unsigned char) c])
4222 /* If can't match the null string, and that's all we have left, fail. */
4223 if (range >= 0 && startpos == total_size && fastmap
4224 && !bufp->can_be_null)
4227 #ifdef emacs /* XEmacs added, w/removal of immediate_quit */
4228 if (!no_quit_in_re_search)
4231 val = re_match_2_internal (bufp, string1, size1, string2, size2,
4232 startpos, regs, stop);
4233 #ifndef REGEX_MALLOC
4250 d = ((const unsigned char *)
4251 (startpos >= size1 ? string2 - size1 : string1) + startpos);
4252 d_size = charcount_to_bytecount (d, 1);
4258 /* Note startpos > size1 not >=. If we are on the
4259 string1/string2 boundary, we want to backup into string1. */
4260 d = ((const unsigned char *)
4261 (startpos > size1 ? string2 - size1 : string1) + startpos);
4263 d_size = charcount_to_bytecount (d, 1);
4271 /* Declarations and macros for re_match_2. */
4273 /* This converts PTR, a pointer into one of the search strings `string1'
4274 and `string2' into an offset from the beginning of that string. */
4275 #define POINTER_TO_OFFSET(ptr) \
4276 (FIRST_STRING_P (ptr) \
4277 ? ((regoff_t) ((ptr) - string1)) \
4278 : ((regoff_t) ((ptr) - string2 + size1)))
4280 /* Macros for dealing with the split strings in re_match_2. */
4282 #define MATCHING_IN_FIRST_STRING (dend == end_match_1)
4284 /* Call before fetching a character with *d. This switches over to
4285 string2 if necessary. */
4286 #define REGEX_PREFETCH() \
4289 /* End of string2 => fail. */ \
4290 if (dend == end_match_2) \
4292 /* End of string1 => advance to string2. */ \
4294 dend = end_match_2; \
4298 /* Test if at very beginning or at very end of the virtual concatenation
4299 of `string1' and `string2'. If only one string, it's `string2'. */
4300 #define AT_STRINGS_BEG(d) ((d) == (size1 ? string1 : string2) || !size2)
4301 #define AT_STRINGS_END(d) ((d) == end2)
4304 If the given position straddles the string gap, return the equivalent
4305 position that is before or after the gap, respectively; otherwise,
4306 return the same position. */
4307 #define POS_BEFORE_GAP_UNSAFE(d) ((d) == string2 ? end1 : (d))
4308 #define POS_AFTER_GAP_UNSAFE(d) ((d) == end1 ? string2 : (d))
4310 /* Test if CH is a word-constituent character. (XEmacs change) */
4311 #define WORDCHAR_P_UNSAFE(ch) \
4312 (SYNTAX_UNSAFE (XCHAR_TABLE (regex_emacs_buffer->mirror_syntax_table), \
4315 /* Free everything we malloc. */
4316 #ifdef MATCH_MAY_ALLOCATE
4317 #define FREE_VAR(var) if (var) REGEX_FREE (var); var = NULL
4318 #define FREE_VARIABLES() \
4320 REGEX_FREE_STACK (fail_stack.stack); \
4321 FREE_VAR (regstart); \
4322 FREE_VAR (regend); \
4323 FREE_VAR (old_regstart); \
4324 FREE_VAR (old_regend); \
4325 FREE_VAR (best_regstart); \
4326 FREE_VAR (best_regend); \
4327 FREE_VAR (reg_info); \
4328 FREE_VAR (reg_dummy); \
4329 FREE_VAR (reg_info_dummy); \
4331 #else /* not MATCH_MAY_ALLOCATE */
4332 #define FREE_VARIABLES() ((void)0) /* Do nothing! But inhibit gcc warning. */
4333 #endif /* MATCH_MAY_ALLOCATE */
4335 /* These values must meet several constraints. They must not be valid
4336 register values; since we have a limit of 255 registers (because
4337 we use only one byte in the pattern for the register number), we can
4338 use numbers larger than 255. They must differ by 1, because of
4339 NUM_FAILURE_ITEMS above. And the value for the lowest register must
4340 be larger than the value for the highest register, so we do not try
4341 to actually save any registers when none are active. */
4342 #define NO_HIGHEST_ACTIVE_REG (1 << BYTEWIDTH)
4343 #define NO_LOWEST_ACTIVE_REG (NO_HIGHEST_ACTIVE_REG + 1)
4345 /* Matching routines. */
4347 #ifndef emacs /* Emacs never uses this. */
4348 /* re_match is like re_match_2 except it takes only a single string. */
4351 re_match (struct re_pattern_buffer *bufp, const char *string, int size,
4352 int pos, struct re_registers *regs)
4354 int result = re_match_2_internal (bufp, NULL, 0, (re_char *) string, size,
4359 #endif /* not emacs */
4362 /* re_match_2 matches the compiled pattern in BUFP against the
4363 (virtual) concatenation of STRING1 and STRING2 (of length SIZE1 and
4364 SIZE2, respectively). We start matching at POS, and stop matching
4367 If REGS is non-null and the `no_sub' field of BUFP is nonzero, we
4368 store offsets for the substring each group matched in REGS. See the
4369 documentation for exactly how many groups we fill.
4371 We return -1 if no match, -2 if an internal error (such as the
4372 failure stack overflowing). Otherwise, we return the length of the
4373 matched substring. */
4376 re_match_2 (struct re_pattern_buffer *bufp, const char *string1,
4377 int size1, const char *string2, int size2, int pos,
4378 struct re_registers *regs, int stop)
4383 SETUP_SYNTAX_CACHE_FOR_OBJECT (regex_match_object,
4385 SYNTAX_CACHE_OBJECT_BYTE_TO_CHAR (regex_match_object,
4391 result = re_match_2_internal (bufp, (re_char *) string1, size1,
4392 (re_char *) string2, size2,
4399 /* This is a separate function so that we can force an alloca cleanup
4402 re_match_2_internal (struct re_pattern_buffer *bufp, re_char *string1,
4403 int size1, re_char *string2, int size2, int pos,
4404 struct re_registers *regs, int stop)
4406 /* General temporaries. */
4409 int should_succeed; /* XEmacs change */
4411 /* Just past the end of the corresponding string. */
4412 re_char *end1, *end2;
4414 /* Pointers into string1 and string2, just past the last characters in
4415 each to consider matching. */
4416 re_char *end_match_1, *end_match_2;
4418 /* Where we are in the data, and the end of the current string. */
4421 /* Where we are in the pattern, and the end of the pattern. */
4422 unsigned char *p = bufp->buffer;
4423 REGISTER unsigned char *pend = p + bufp->used;
4425 /* Mark the opcode just after a start_memory, so we can test for an
4426 empty subpattern when we get to the stop_memory. */
4427 re_char *just_past_start_mem = 0;
4429 /* We use this to map every character in the string. */
4430 RE_TRANSLATE_TYPE translate = bufp->translate;
4432 /* Failure point stack. Each place that can handle a failure further
4433 down the line pushes a failure point on this stack. It consists of
4434 restart, regend, and reg_info for all registers corresponding to
4435 the subexpressions we're currently inside, plus the number of such
4436 registers, and, finally, two char *'s. The first char * is where
4437 to resume scanning the pattern; the second one is where to resume
4438 scanning the strings. If the latter is zero, the failure point is
4439 a ``dummy''; if a failure happens and the failure point is a dummy,
4440 it gets discarded and the next one is tried. */
4441 #ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
4442 fail_stack_type fail_stack;
4445 static unsigned failure_id;
4446 int nfailure_points_pushed = 0, nfailure_points_popped = 0;
4450 /* This holds the pointer to the failure stack, when
4451 it is allocated relocatably. */
4452 fail_stack_elt_t *failure_stack_ptr;
4455 /* We fill all the registers internally, independent of what we
4456 return, for use in backreferences. The number here includes
4457 an element for register zero. */
4458 int num_regs = bufp->re_ngroups + 1;
4460 /* The currently active registers. */
4461 int lowest_active_reg = NO_LOWEST_ACTIVE_REG;
4462 int highest_active_reg = NO_HIGHEST_ACTIVE_REG;
4464 /* Information on the contents of registers. These are pointers into
4465 the input strings; they record just what was matched (on this
4466 attempt) by a subexpression part of the pattern, that is, the
4467 regnum-th regstart pointer points to where in the pattern we began
4468 matching and the regnum-th regend points to right after where we
4469 stopped matching the regnum-th subexpression. (The zeroth register
4470 keeps track of what the whole pattern matches.) */
4471 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
4472 re_char **regstart, **regend;
4475 /* If a group that's operated upon by a repetition operator fails to
4476 match anything, then the register for its start will need to be
4477 restored because it will have been set to wherever in the string we
4478 are when we last see its open-group operator. Similarly for a
4480 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
4481 re_char **old_regstart, **old_regend;
4484 /* The is_active field of reg_info helps us keep track of which (possibly
4485 nested) subexpressions we are currently in. The matched_something
4486 field of reg_info[reg_num] helps us tell whether or not we have
4487 matched any of the pattern so far this time through the reg_num-th
4488 subexpression. These two fields get reset each time through any
4489 loop their register is in. */
4490 #ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
4491 register_info_type *reg_info;
4494 /* The following record the register info as found in the above
4495 variables when we find a match better than any we've seen before.
4496 This happens as we backtrack through the failure points, which in
4497 turn happens only if we have not yet matched the entire string. */
4498 unsigned best_regs_set = false;
4499 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
4500 re_char **best_regstart, **best_regend;
4503 /* Logically, this is `best_regend[0]'. But we don't want to have to
4504 allocate space for that if we're not allocating space for anything
4505 else (see below). Also, we never need info about register 0 for
4506 any of the other register vectors, and it seems rather a kludge to
4507 treat `best_regend' differently than the rest. So we keep track of
4508 the end of the best match so far in a separate variable. We
4509 initialize this to NULL so that when we backtrack the first time
4510 and need to test it, it's not garbage. */
4511 re_char *match_end = NULL;
4513 /* This helps SET_REGS_MATCHED avoid doing redundant work. */
4514 int set_regs_matched_done = 0;
4516 /* Used when we pop values we don't care about. */
4517 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
4518 re_char **reg_dummy;
4519 register_info_type *reg_info_dummy;
4523 /* Counts the total number of registers pushed. */
4524 unsigned num_regs_pushed = 0;
4527 /* 1 if this match ends in the same string (string1 or string2)
4528 as the best previous match. */
4531 /* 1 if this match is the best seen so far. */
4532 re_bool best_match_p;
4534 DEBUG_PRINT1 ("\n\nEntering re_match_2.\n");
4538 #ifdef MATCH_MAY_ALLOCATE
4539 /* Do not bother to initialize all the register variables if there are
4540 no groups in the pattern, as it takes a fair amount of time. If
4541 there are groups, we include space for register 0 (the whole
4542 pattern), even though we never use it, since it simplifies the
4543 array indexing. We should fix this. */
4544 if (bufp->re_ngroups)
4546 regstart = REGEX_TALLOC (num_regs, re_char *);
4547 regend = REGEX_TALLOC (num_regs, re_char *);
4548 old_regstart = REGEX_TALLOC (num_regs, re_char *);
4549 old_regend = REGEX_TALLOC (num_regs, re_char *);
4550 best_regstart = REGEX_TALLOC (num_regs, re_char *);
4551 best_regend = REGEX_TALLOC (num_regs, re_char *);
4552 reg_info = REGEX_TALLOC (num_regs, register_info_type);
4553 reg_dummy = REGEX_TALLOC (num_regs, re_char *);
4554 reg_info_dummy = REGEX_TALLOC (num_regs, register_info_type);
4556 if (!(regstart && regend && old_regstart && old_regend && reg_info
4557 && best_regstart && best_regend && reg_dummy && reg_info_dummy))
4565 /* We must initialize all our variables to NULL, so that
4566 `FREE_VARIABLES' doesn't try to free them. */
4567 regstart = regend = old_regstart = old_regend = best_regstart
4568 = best_regend = reg_dummy = NULL;
4569 reg_info = reg_info_dummy = (register_info_type *) NULL;
4571 #endif /* MATCH_MAY_ALLOCATE */
4573 /* The starting position is bogus. */
4574 if (pos < 0 || pos > size1 + size2)
4580 /* Initialize subexpression text positions to -1 to mark ones that no
4581 start_memory/stop_memory has been seen for. Also initialize the
4582 register information struct. */
4583 for (mcnt = 1; mcnt < num_regs; mcnt++)
4585 regstart[mcnt] = regend[mcnt]
4586 = old_regstart[mcnt] = old_regend[mcnt] = REG_UNSET_VALUE;
4588 REG_MATCH_NULL_STRING_P (reg_info[mcnt]) = MATCH_NULL_UNSET_VALUE;
4589 IS_ACTIVE (reg_info[mcnt]) = 0;
4590 MATCHED_SOMETHING (reg_info[mcnt]) = 0;
4591 EVER_MATCHED_SOMETHING (reg_info[mcnt]) = 0;
4593 /* We move `string1' into `string2' if the latter's empty -- but not if
4594 `string1' is null. */
4595 if (size2 == 0 && string1 != NULL)
4602 end1 = string1 + size1;
4603 end2 = string2 + size2;
4605 /* Compute where to stop matching, within the two strings. */
4608 end_match_1 = string1 + stop;
4609 end_match_2 = string2;
4614 end_match_2 = string2 + stop - size1;
4617 /* `p' scans through the pattern as `d' scans through the data.
4618 `dend' is the end of the input string that `d' points within. `d'
4619 is advanced into the following input string whenever necessary, but
4620 this happens before fetching; therefore, at the beginning of the
4621 loop, `d' can be pointing at the end of a string, but it cannot
4623 if (size1 > 0 && pos <= size1)
4630 d = string2 + pos - size1;
4634 DEBUG_PRINT1 ("The compiled pattern is: \n");
4635 DEBUG_PRINT_COMPILED_PATTERN (bufp, p, pend);
4636 DEBUG_PRINT1 ("The string to match is: `");
4637 DEBUG_PRINT_DOUBLE_STRING (d, string1, size1, string2, size2);
4638 DEBUG_PRINT1 ("'\n");
4640 /* This loops over pattern commands. It exits by returning from the
4641 function if the match is complete, or it drops through if the match
4642 fails at this starting point in the input data. */
4645 DEBUG_PRINT2 ("\n0x%lx: ", (long) p);
4646 #ifdef emacs /* XEmacs added, w/removal of immediate_quit */
4647 if (!no_quit_in_re_search)
4652 { /* End of pattern means we might have succeeded. */
4653 DEBUG_PRINT1 ("end of pattern ... ");
4655 /* If we haven't matched the entire string, and we want the
4656 longest match, try backtracking. */
4657 if (d != end_match_2)
4659 same_str_p = (FIRST_STRING_P (match_end)
4660 == MATCHING_IN_FIRST_STRING);
4662 /* AIX compiler got confused when this was combined
4663 with the previous declaration. */
4665 best_match_p = d > match_end;
4667 best_match_p = !MATCHING_IN_FIRST_STRING;
4669 DEBUG_PRINT1 ("backtracking.\n");
4671 if (!FAIL_STACK_EMPTY ())
4672 { /* More failure points to try. */
4674 /* If exceeds best match so far, save it. */
4675 if (!best_regs_set || best_match_p)
4677 best_regs_set = true;
4680 DEBUG_PRINT1 ("\nSAVING match as best so far.\n");
4682 for (mcnt = 1; mcnt < num_regs; mcnt++)
4684 best_regstart[mcnt] = regstart[mcnt];
4685 best_regend[mcnt] = regend[mcnt];
4691 /* If no failure points, don't restore garbage. And if
4692 last match is real best match, don't restore second
4694 else if (best_regs_set && !best_match_p)
4697 /* Restore best match. It may happen that `dend ==
4698 end_match_1' while the restored d is in string2.
4699 For example, the pattern `x.*y.*z' against the
4700 strings `x-' and `y-z-', if the two strings are
4701 not consecutive in memory. */
4702 DEBUG_PRINT1 ("Restoring best registers.\n");
4705 dend = ((d >= string1 && d <= end1)
4706 ? end_match_1 : end_match_2);
4708 for (mcnt = 1; mcnt < num_regs; mcnt++)
4710 regstart[mcnt] = best_regstart[mcnt];
4711 regend[mcnt] = best_regend[mcnt];
4714 } /* d != end_match_2 */
4717 DEBUG_PRINT1 ("Accepting match.\n");
4720 /* If caller wants register contents data back, fill REGS. */
4721 int num_nonshy_regs = bufp->re_nsub + 1;
4722 if (regs && !bufp->no_sub)
4724 /* Have the register data arrays been allocated? */
4725 if (bufp->regs_allocated == REGS_UNALLOCATED)
4726 { /* No. So allocate them with malloc. We need one
4727 extra element beyond `num_regs' for the `-1' marker
4729 regs->num_regs = MAX (RE_NREGS, num_nonshy_regs + 1);
4730 regs->start = TALLOC (regs->num_regs, regoff_t);
4731 regs->end = TALLOC (regs->num_regs, regoff_t);
4732 if (regs->start == NULL || regs->end == NULL)
4737 bufp->regs_allocated = REGS_REALLOCATE;
4739 else if (bufp->regs_allocated == REGS_REALLOCATE)
4740 { /* Yes. If we need more elements than were already
4741 allocated, reallocate them. If we need fewer, just
4743 if (regs->num_regs < num_nonshy_regs + 1)
4745 regs->num_regs = num_nonshy_regs + 1;
4746 RETALLOC (regs->start, regs->num_regs, regoff_t);
4747 RETALLOC (regs->end, regs->num_regs, regoff_t);
4748 if (regs->start == NULL || regs->end == NULL)
4757 /* The braces fend off a "empty body in an else-statement"
4758 warning under GCC when assert expands to nothing. */
4759 assert (bufp->regs_allocated == REGS_FIXED);
4762 /* Convert the pointer data in `regstart' and `regend' to
4763 indices. Register zero has to be set differently,
4764 since we haven't kept track of any info for it. */
4765 if (regs->num_regs > 0)
4767 regs->start[0] = pos;
4768 regs->end[0] = (MATCHING_IN_FIRST_STRING
4769 ? ((regoff_t) (d - string1))
4770 : ((regoff_t) (d - string2 + size1)));
4773 /* Map over the NUM_NONSHY_REGS non-shy internal registers.
4774 Copy each into the corresponding external register.
4775 N.B. MCNT indexes external registers. */
4777 mcnt < MIN (num_nonshy_regs, regs->num_regs);
4780 int ireg = bufp->external_to_internal_register[mcnt];
4782 if (REG_UNSET (regstart[ireg]) || REG_UNSET (regend[ireg]))
4783 regs->start[mcnt] = regs->end[mcnt] = -1;
4787 = (regoff_t) POINTER_TO_OFFSET (regstart[ireg]);
4789 = (regoff_t) POINTER_TO_OFFSET (regend[ireg]);
4792 } /* regs && !bufp->no_sub */
4794 /* If we have regs and the regs structure has more elements than
4795 were in the pattern, set the extra elements to -1. If we
4796 (re)allocated the registers, this is the case, because we
4797 always allocate enough to have at least one -1 at the end.
4799 We do this even when no_sub is set because some applications
4800 (XEmacs) reuse register structures which may contain stale
4801 information, and permit attempts to access those registers.
4803 It would be possible to require the caller to do this, but we'd
4804 have to change the API for this function to reflect that, and
4805 audit all callers. */
4806 if (regs && regs->num_regs > 0)
4807 for (mcnt = num_nonshy_regs; mcnt < regs->num_regs; mcnt++)
4808 regs->start[mcnt] = regs->end[mcnt] = -1;
4811 DEBUG_PRINT4 ("%u failure points pushed, %u popped (%u remain).\n",
4812 nfailure_points_pushed, nfailure_points_popped,
4813 nfailure_points_pushed - nfailure_points_popped);
4814 DEBUG_PRINT2 ("%u registers pushed.\n", num_regs_pushed);
4816 mcnt = d - pos - (MATCHING_IN_FIRST_STRING
4820 DEBUG_PRINT2 ("Returning %d from re_match_2.\n", mcnt);
4826 /* Otherwise match next pattern command. */
4827 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
4829 /* Ignore these. Used to ignore the n of succeed_n's which
4830 currently have n == 0. */
4832 DEBUG_PRINT1 ("EXECUTING no_op.\n");
4836 DEBUG_PRINT1 ("EXECUTING succeed.\n");
4839 /* Match the next n pattern characters exactly. The following
4840 byte in the pattern defines n, and the n bytes after that
4841 are the characters to match. */
4844 DEBUG_PRINT2 ("EXECUTING exactn %d.\n", mcnt);
4846 /* This is written out as an if-else so we don't waste time
4847 testing `translate' inside the loop. */
4848 if (TRANSLATE_P (translate))
4853 Emchar pat_ch, buf_ch;
4857 pat_ch = charptr_emchar (p);
4858 buf_ch = charptr_emchar (d);
4859 if (RE_TRANSLATE (buf_ch) != pat_ch)
4862 pat_len = charcount_to_bytecount (p, 1);
4867 #else /* not MULE */
4869 if ((unsigned char) RE_TRANSLATE (*d++) != *p++)
4881 if (*d++ != *p++) goto fail;
4885 SET_REGS_MATCHED ();
4889 /* Match any character except possibly a newline or a null. */
4891 DEBUG_PRINT1 ("EXECUTING anychar.\n");
4895 if ((!(bufp->syntax & RE_DOT_NEWLINE) && TRANSLATE (*d) == '\n')
4896 || (bufp->syntax & RE_DOT_NOT_NULL && TRANSLATE (*d) == '\000'))
4899 SET_REGS_MATCHED ();
4900 DEBUG_PRINT2 (" Matched `%d'.\n", *d);
4901 INC_CHARPTR (d); /* XEmacs change */
4908 REGISTER unsigned char c;
4909 re_bool not_p = (re_opcode_t) *(p - 1) == charset_not;
4911 DEBUG_PRINT2 ("EXECUTING charset%s.\n", not_p ? "_not" : "");
4914 c = TRANSLATE (*d); /* The character to match. */
4916 /* Cast to `unsigned' instead of `unsigned char' in case the
4917 bit list is a full 32 bytes long. */
4918 if (c < (unsigned) (*p * BYTEWIDTH)
4919 && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
4924 if (!not_p) goto fail;
4926 SET_REGS_MATCHED ();
4927 INC_CHARPTR (d); /* XEmacs change */
4933 case charset_mule_not:
4936 re_bool not_p = (re_opcode_t) *(p - 1) == charset_mule_not;
4938 DEBUG_PRINT2 ("EXECUTING charset_mule%s.\n", not_p ? "_not" : "");
4941 c = charptr_emchar ((const Bufbyte *) d);
4942 c = TRANSLATE (c); /* The character to match. */
4944 if (EQ (Qt, unified_range_table_lookup (p, c, Qnil)))
4947 p += unified_range_table_bytes_used (p);
4949 if (!not_p) goto fail;
4951 SET_REGS_MATCHED ();
4958 /* The beginning of a group is represented by start_memory.
4959 The arguments are the register number in the next byte, and the
4960 number of groups inner to this one in the next. The text
4961 matched within the group is recorded (in the internal
4962 registers data structure) under the register number. */
4964 DEBUG_PRINT3 ("EXECUTING start_memory %d (%d):\n", *p, p[1]);
4966 /* Find out if this group can match the empty string. */
4967 p1 = p; /* To send to group_match_null_string_p. */
4969 if (REG_MATCH_NULL_STRING_P (reg_info[*p]) == MATCH_NULL_UNSET_VALUE)
4970 REG_MATCH_NULL_STRING_P (reg_info[*p])
4971 = group_match_null_string_p (&p1, pend, reg_info);
4973 /* Save the position in the string where we were the last time
4974 we were at this open-group operator in case the group is
4975 operated upon by a repetition operator, e.g., with `(a*)*b'
4976 against `ab'; then we want to ignore where we are now in
4977 the string in case this attempt to match fails. */
4978 old_regstart[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p])
4979 ? REG_UNSET (regstart[*p]) ? d : regstart[*p]
4981 DEBUG_PRINT2 (" old_regstart: %d\n",
4982 POINTER_TO_OFFSET (old_regstart[*p]));
4985 DEBUG_PRINT2 (" regstart: %d\n", POINTER_TO_OFFSET (regstart[*p]));
4987 IS_ACTIVE (reg_info[*p]) = 1;
4988 MATCHED_SOMETHING (reg_info[*p]) = 0;
4990 /* Clear this whenever we change the register activity status. */
4991 set_regs_matched_done = 0;
4993 /* This is the new highest active register. */
4994 highest_active_reg = *p;
4996 /* If nothing was active before, this is the new lowest active
4998 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG)
4999 lowest_active_reg = *p;
5001 /* Move past the register number and inner group count. */
5003 just_past_start_mem = p;
5008 /* The stop_memory opcode represents the end of a group. Its
5009 arguments are the same as start_memory's: the register
5010 number, and the number of inner groups. */
5012 DEBUG_PRINT3 ("EXECUTING stop_memory %d (%d):\n", *p, p[1]);
5014 /* We need to save the string position the last time we were at
5015 this close-group operator in case the group is operated
5016 upon by a repetition operator, e.g., with `((a*)*(b*)*)*'
5017 against `aba'; then we want to ignore where we are now in
5018 the string in case this attempt to match fails. */
5019 old_regend[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p])
5020 ? REG_UNSET (regend[*p]) ? d : regend[*p]
5022 DEBUG_PRINT2 (" old_regend: %d\n",
5023 POINTER_TO_OFFSET (old_regend[*p]));
5026 DEBUG_PRINT2 (" regend: %d\n", POINTER_TO_OFFSET (regend[*p]));
5028 /* This register isn't active anymore. */
5029 IS_ACTIVE (reg_info[*p]) = 0;
5031 /* Clear this whenever we change the register activity status. */
5032 set_regs_matched_done = 0;
5034 /* If this was the only register active, nothing is active
5036 if (lowest_active_reg == highest_active_reg)
5038 lowest_active_reg = NO_LOWEST_ACTIVE_REG;
5039 highest_active_reg = NO_HIGHEST_ACTIVE_REG;
5042 { /* We must scan for the new highest active register, since
5043 it isn't necessarily one less than now: consider
5044 (a(b)c(d(e)f)g). When group 3 ends, after the f), the
5045 new highest active register is 1. */
5046 unsigned char r = *p - 1;
5047 while (r > 0 && !IS_ACTIVE (reg_info[r]))
5050 /* If we end up at register zero, that means that we saved
5051 the registers as the result of an `on_failure_jump', not
5052 a `start_memory', and we jumped to past the innermost
5053 `stop_memory'. For example, in ((.)*) we save
5054 registers 1 and 2 as a result of the *, but when we pop
5055 back to the second ), we are at the stop_memory 1.
5056 Thus, nothing is active. */
5059 lowest_active_reg = NO_LOWEST_ACTIVE_REG;
5060 highest_active_reg = NO_HIGHEST_ACTIVE_REG;
5064 highest_active_reg = r;
5066 /* 98/9/21 jhod: We've also gotta set lowest_active_reg, don't we? */
5068 while (r < highest_active_reg && !IS_ACTIVE(reg_info[r]))
5070 lowest_active_reg = r;
5074 /* If just failed to match something this time around with a
5075 group that's operated on by a repetition operator, try to
5076 force exit from the ``loop'', and restore the register
5077 information for this group that we had before trying this
5079 if ((!MATCHED_SOMETHING (reg_info[*p])
5080 || just_past_start_mem == p - 1)
5083 re_bool is_a_jump_n = false;
5087 switch ((re_opcode_t) *p1++)
5091 case pop_failure_jump:
5092 case maybe_pop_jump:
5094 case dummy_failure_jump:
5095 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
5105 /* If the next operation is a jump backwards in the pattern
5106 to an on_failure_jump right before the start_memory
5107 corresponding to this stop_memory, exit from the loop
5108 by forcing a failure after pushing on the stack the
5109 on_failure_jump's jump in the pattern, and d. */
5110 if (mcnt < 0 && (re_opcode_t) *p1 == on_failure_jump
5111 && (re_opcode_t) p1[3] == start_memory && p1[4] == *p)
5113 /* If this group ever matched anything, then restore
5114 what its registers were before trying this last
5115 failed match, e.g., with `(a*)*b' against `ab' for
5116 regstart[1], and, e.g., with `((a*)*(b*)*)*'
5117 against `aba' for regend[3].
5119 Also restore the registers for inner groups for,
5120 e.g., `((a*)(b*))*' against `aba' (register 3 would
5121 otherwise get trashed). */
5123 if (EVER_MATCHED_SOMETHING (reg_info[*p]))
5127 EVER_MATCHED_SOMETHING (reg_info[*p]) = 0;
5129 /* Restore this and inner groups' (if any) registers. */
5130 for (r = *p; r < *p + *(p + 1); r++)
5132 regstart[r] = old_regstart[r];
5134 /* xx why this test? */
5135 if (old_regend[r] >= regstart[r])
5136 regend[r] = old_regend[r];
5140 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
5141 PUSH_FAILURE_POINT (p1 + mcnt, d, -2);
5147 /* Move past the register number and the inner group count. */
5152 /* \<digit> has been turned into a `duplicate' command which is
5153 followed by the numeric value of <digit> as the register number.
5154 (Already passed through external-to-internal-register mapping,
5155 so it refers to the actual group number, not the non-shy-only
5156 numbering used in the external world.) */
5159 REGISTER re_char *d2, *dend2;
5160 /* Get which register to match against. */
5162 DEBUG_PRINT2 ("EXECUTING duplicate %d.\n", regno);
5164 /* Can't back reference a group which we've never matched. */
5165 if (REG_UNSET (regstart[regno]) || REG_UNSET (regend[regno]))
5168 /* Where in input to try to start matching. */
5169 d2 = regstart[regno];
5171 /* Where to stop matching; if both the place to start and
5172 the place to stop matching are in the same string, then
5173 set to the place to stop, otherwise, for now have to use
5174 the end of the first string. */
5176 dend2 = ((FIRST_STRING_P (regstart[regno])
5177 == FIRST_STRING_P (regend[regno]))
5178 ? regend[regno] : end_match_1);
5181 /* If necessary, advance to next segment in register
5185 if (dend2 == end_match_2) break;
5186 if (dend2 == regend[regno]) break;
5188 /* End of string1 => advance to string2. */
5190 dend2 = regend[regno];
5192 /* At end of register contents => success */
5193 if (d2 == dend2) break;
5195 /* If necessary, advance to next segment in data. */
5198 /* How many characters left in this segment to match. */
5201 /* Want how many consecutive characters we can match in
5202 one shot, so, if necessary, adjust the count. */
5203 if (mcnt > dend2 - d2)
5206 /* Compare that many; failure if mismatch, else move
5208 if (TRANSLATE_P (translate)
5209 ? bcmp_translate ((unsigned char *) d,
5210 (unsigned char *) d2, mcnt, translate)
5211 : memcmp (d, d2, mcnt))
5213 d += mcnt, d2 += mcnt;
5215 /* Do this because we've match some characters. */
5216 SET_REGS_MATCHED ();
5222 /* begline matches the empty string at the beginning of the string
5223 (unless `not_bol' is set in `bufp'), and, if
5224 `newline_anchor' is set, after newlines. */
5226 DEBUG_PRINT1 ("EXECUTING begline.\n");
5228 if (AT_STRINGS_BEG (d))
5230 if (!bufp->not_bol) break;
5232 else if (d[-1] == '\n' && bufp->newline_anchor)
5236 /* In all other cases, we fail. */
5240 /* endline is the dual of begline. */
5242 DEBUG_PRINT1 ("EXECUTING endline.\n");
5244 if (AT_STRINGS_END (d))
5246 if (!bufp->not_eol) break;
5249 /* We have to ``prefetch'' the next character. */
5250 else if ((d == end1 ? *string2 : *d) == '\n'
5251 && bufp->newline_anchor)
5258 /* Match at the very beginning of the data. */
5260 DEBUG_PRINT1 ("EXECUTING begbuf.\n");
5261 if (AT_STRINGS_BEG (d))
5266 /* Match at the very end of the data. */
5268 DEBUG_PRINT1 ("EXECUTING endbuf.\n");
5269 if (AT_STRINGS_END (d))
5274 /* on_failure_keep_string_jump is used to optimize `.*\n'. It
5275 pushes NULL as the value for the string on the stack. Then
5276 `pop_failure_point' will keep the current value for the
5277 string, instead of restoring it. To see why, consider
5278 matching `foo\nbar' against `.*\n'. The .* matches the foo;
5279 then the . fails against the \n. But the next thing we want
5280 to do is match the \n against the \n; if we restored the
5281 string value, we would be back at the foo.
5283 Because this is used only in specific cases, we don't need to
5284 check all the things that `on_failure_jump' does, to make
5285 sure the right things get saved on the stack. Hence we don't
5286 share its code. The only reason to push anything on the
5287 stack at all is that otherwise we would have to change
5288 `anychar's code to do something besides goto fail in this
5289 case; that seems worse than this. */
5290 case on_failure_keep_string_jump:
5291 DEBUG_PRINT1 ("EXECUTING on_failure_keep_string_jump");
5293 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5294 DEBUG_PRINT3 (" %d (to 0x%lx):\n", mcnt, (long) (p + mcnt));
5296 PUSH_FAILURE_POINT (p + mcnt, (unsigned char *) 0, -2);
5300 /* Uses of on_failure_jump:
5302 Each alternative starts with an on_failure_jump that points
5303 to the beginning of the next alternative. Each alternative
5304 except the last ends with a jump that in effect jumps past
5305 the rest of the alternatives. (They really jump to the
5306 ending jump of the following alternative, because tensioning
5307 these jumps is a hassle.)
5309 Repeats start with an on_failure_jump that points past both
5310 the repetition text and either the following jump or
5311 pop_failure_jump back to this on_failure_jump. */
5312 case on_failure_jump:
5314 DEBUG_PRINT1 ("EXECUTING on_failure_jump");
5316 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5317 DEBUG_PRINT3 (" %d (to 0x%lx)", mcnt, (long) (p + mcnt));
5319 /* If this on_failure_jump comes right before a group (i.e.,
5320 the original * applied to a group), save the information
5321 for that group and all inner ones, so that if we fail back
5322 to this point, the group's information will be correct.
5323 For example, in \(a*\)*\1, we need the preceding group,
5324 and in \(\(a*\)b*\)\2, we need the inner group. */
5326 /* We can't use `p' to check ahead because we push
5327 a failure point to `p + mcnt' after we do this. */
5330 /* We need to skip no_op's before we look for the
5331 start_memory in case this on_failure_jump is happening as
5332 the result of a completed succeed_n, as in \(a\)\{1,3\}b\1
5334 while (p1 < pend && (re_opcode_t) *p1 == no_op)
5337 if (p1 < pend && (re_opcode_t) *p1 == start_memory)
5339 /* We have a new highest active register now. This will
5340 get reset at the start_memory we are about to get to,
5341 but we will have saved all the registers relevant to
5342 this repetition op, as described above. */
5343 highest_active_reg = *(p1 + 1) + *(p1 + 2);
5344 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG)
5345 lowest_active_reg = *(p1 + 1);
5348 DEBUG_PRINT1 (":\n");
5349 PUSH_FAILURE_POINT (p + mcnt, d, -2);
5353 /* A smart repeat ends with `maybe_pop_jump'.
5354 We change it to either `pop_failure_jump' or `jump'. */
5355 case maybe_pop_jump:
5356 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5357 DEBUG_PRINT2 ("EXECUTING maybe_pop_jump %d.\n", mcnt);
5359 REGISTER unsigned char *p2 = p;
5361 /* Compare the beginning of the repeat with what in the
5362 pattern follows its end. If we can establish that there
5363 is nothing that they would both match, i.e., that we
5364 would have to backtrack because of (as in, e.g., `a*a')
5365 then we can change to pop_failure_jump, because we'll
5366 never have to backtrack.
5368 This is not true in the case of alternatives: in
5369 `(a|ab)*' we do need to backtrack to the `ab' alternative
5370 (e.g., if the string was `ab'). But instead of trying to
5371 detect that here, the alternative has put on a dummy
5372 failure point which is what we will end up popping. */
5374 /* Skip over open/close-group commands.
5375 If what follows this loop is a ...+ construct,
5376 look at what begins its body, since we will have to
5377 match at least one of that. */
5381 && ((re_opcode_t) *p2 == stop_memory
5382 || (re_opcode_t) *p2 == start_memory))
5384 else if (p2 + 6 < pend
5385 && (re_opcode_t) *p2 == dummy_failure_jump)
5392 /* p1[0] ... p1[2] are the `on_failure_jump' corresponding
5393 to the `maybe_finalize_jump' of this case. Examine what
5396 /* If we're at the end of the pattern, we can change. */
5399 /* Consider what happens when matching ":\(.*\)"
5400 against ":/". I don't really understand this code
5402 p[-3] = (unsigned char) pop_failure_jump;
5404 (" End of pattern: change to `pop_failure_jump'.\n");
5407 else if ((re_opcode_t) *p2 == exactn
5408 || (bufp->newline_anchor && (re_opcode_t) *p2 == endline))
5410 REGISTER unsigned char c
5411 = *p2 == (unsigned char) endline ? '\n' : p2[2];
5413 if ((re_opcode_t) p1[3] == exactn && p1[5] != c)
5415 p[-3] = (unsigned char) pop_failure_jump;
5416 DEBUG_PRINT3 (" %c != %c => pop_failure_jump.\n",
5420 else if ((re_opcode_t) p1[3] == charset
5421 || (re_opcode_t) p1[3] == charset_not)
5423 int not_p = (re_opcode_t) p1[3] == charset_not;
5425 if (c < (unsigned char) (p1[4] * BYTEWIDTH)
5426 && p1[5 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
5429 /* `not_p' is equal to 1 if c would match, which means
5430 that we can't change to pop_failure_jump. */
5433 p[-3] = (unsigned char) pop_failure_jump;
5434 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
5438 else if ((re_opcode_t) *p2 == charset)
5441 REGISTER unsigned char c
5442 = *p2 == (unsigned char) endline ? '\n' : p2[2];
5445 if ((re_opcode_t) p1[3] == exactn
5446 && ! ((int) p2[1] * BYTEWIDTH > (int) p1[5]
5447 && (p2[2 + p1[5] / BYTEWIDTH]
5448 & (1 << (p1[5] % BYTEWIDTH)))))
5450 p[-3] = (unsigned char) pop_failure_jump;
5451 DEBUG_PRINT3 (" %c != %c => pop_failure_jump.\n",
5455 else if ((re_opcode_t) p1[3] == charset_not)
5458 /* We win if the charset_not inside the loop
5459 lists every character listed in the charset after. */
5460 for (idx = 0; idx < (int) p2[1]; idx++)
5461 if (! (p2[2 + idx] == 0
5462 || (idx < (int) p1[4]
5463 && ((p2[2 + idx] & ~ p1[5 + idx]) == 0))))
5468 p[-3] = (unsigned char) pop_failure_jump;
5469 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
5472 else if ((re_opcode_t) p1[3] == charset)
5475 /* We win if the charset inside the loop
5476 has no overlap with the one after the loop. */
5478 idx < (int) p2[1] && idx < (int) p1[4];
5480 if ((p2[2 + idx] & p1[5 + idx]) != 0)
5483 if (idx == p2[1] || idx == p1[4])
5485 p[-3] = (unsigned char) pop_failure_jump;
5486 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
5491 p -= 2; /* Point at relative address again. */
5492 if ((re_opcode_t) p[-1] != pop_failure_jump)
5494 p[-1] = (unsigned char) jump;
5495 DEBUG_PRINT1 (" Match => jump.\n");
5496 goto unconditional_jump;
5498 /* Note fall through. */
5501 /* The end of a simple repeat has a pop_failure_jump back to
5502 its matching on_failure_jump, where the latter will push a
5503 failure point. The pop_failure_jump takes off failure
5504 points put on by this pop_failure_jump's matching
5505 on_failure_jump; we got through the pattern to here from the
5506 matching on_failure_jump, so didn't fail. */
5507 case pop_failure_jump:
5509 /* We need to pass separate storage for the lowest and
5510 highest registers, even though we don't care about the
5511 actual values. Otherwise, we will restore only one
5512 register from the stack, since lowest will == highest in
5513 `pop_failure_point'. */
5514 int dummy_low_reg, dummy_high_reg;
5515 unsigned char *pdummy;
5516 re_char *sdummy = NULL;
5518 DEBUG_PRINT1 ("EXECUTING pop_failure_jump.\n");
5519 POP_FAILURE_POINT (sdummy, pdummy,
5520 dummy_low_reg, dummy_high_reg,
5521 reg_dummy, reg_dummy, reg_info_dummy);
5523 /* Note fall through. */
5526 /* Unconditionally jump (without popping any failure points). */
5529 EXTRACT_NUMBER_AND_INCR (mcnt, p); /* Get the amount to jump. */
5530 DEBUG_PRINT2 ("EXECUTING jump %d ", mcnt);
5531 p += mcnt; /* Do the jump. */
5532 DEBUG_PRINT2 ("(to 0x%lx).\n", (long) p);
5536 /* We need this opcode so we can detect where alternatives end
5537 in `group_match_null_string_p' et al. */
5539 DEBUG_PRINT1 ("EXECUTING jump_past_alt.\n");
5540 goto unconditional_jump;
5543 /* Normally, the on_failure_jump pushes a failure point, which
5544 then gets popped at pop_failure_jump. We will end up at
5545 pop_failure_jump, also, and with a pattern of, say, `a+', we
5546 are skipping over the on_failure_jump, so we have to push
5547 something meaningless for pop_failure_jump to pop. */
5548 case dummy_failure_jump:
5549 DEBUG_PRINT1 ("EXECUTING dummy_failure_jump.\n");
5550 /* It doesn't matter what we push for the string here. What
5551 the code at `fail' tests is the value for the pattern. */
5552 PUSH_FAILURE_POINT ((unsigned char *) 0, (unsigned char *) 0, -2);
5553 goto unconditional_jump;
5556 /* At the end of an alternative, we need to push a dummy failure
5557 point in case we are followed by a `pop_failure_jump', because
5558 we don't want the failure point for the alternative to be
5559 popped. For example, matching `(a|ab)*' against `aab'
5560 requires that we match the `ab' alternative. */
5561 case push_dummy_failure:
5562 DEBUG_PRINT1 ("EXECUTING push_dummy_failure.\n");
5563 /* See comments just above at `dummy_failure_jump' about the
5565 PUSH_FAILURE_POINT ((unsigned char *) 0, (unsigned char *) 0, -2);
5568 /* Have to succeed matching what follows at least n times.
5569 After that, handle like `on_failure_jump'. */
5571 EXTRACT_NUMBER (mcnt, p + 2);
5572 DEBUG_PRINT2 ("EXECUTING succeed_n %d.\n", mcnt);
5575 /* Originally, this is how many times we HAVE to succeed. */
5580 STORE_NUMBER_AND_INCR (p, mcnt);
5581 DEBUG_PRINT3 (" Setting 0x%lx to %d.\n", (long) p, mcnt);
5585 DEBUG_PRINT2 (" Setting two bytes from 0x%lx to no_op.\n",
5587 p[2] = (unsigned char) no_op;
5588 p[3] = (unsigned char) no_op;
5594 EXTRACT_NUMBER (mcnt, p + 2);
5595 DEBUG_PRINT2 ("EXECUTING jump_n %d.\n", mcnt);
5597 /* Originally, this is how many times we CAN jump. */
5601 STORE_NUMBER (p + 2, mcnt);
5602 goto unconditional_jump;
5604 /* If don't have to jump any more, skip over the rest of command. */
5611 DEBUG_PRINT1 ("EXECUTING set_number_at.\n");
5613 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5615 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5616 DEBUG_PRINT3 (" Setting 0x%lx to %d.\n", (long) p1, mcnt);
5617 STORE_NUMBER (p1, mcnt);
5622 DEBUG_PRINT1 ("EXECUTING wordbound.\n");
5627 /* Straightforward and (I hope) correct implementation.
5628 Probably should be optimized by arranging to compute
5630 /* emch1 is the character before d, syn1 is the syntax of
5631 emch1, emch2 is the character at d, and syn2 is the
5633 Emchar emch1, emch2;
5634 /* GCC isn't smart enough to see these are initialized if used. */
5635 int syn1 = 0, syn2 = 0;
5636 re_char *d_before, *d_after;
5638 at_beg = AT_STRINGS_BEG (d),
5639 at_end = AT_STRINGS_END (d);
5644 if (at_beg && at_end)
5652 d_before = POS_BEFORE_GAP_UNSAFE (d);
5653 DEC_CHARPTR (d_before);
5654 emch1 = charptr_emchar (d_before);
5656 xpos = SYNTAX_CACHE_BYTE_TO_CHAR (PTR_TO_OFFSET (d)) - 1;
5657 UPDATE_SYNTAX_CACHE (xpos);
5659 syn1 = SYNTAX_FROM_CACHE
5660 (XCHAR_TABLE (regex_emacs_buffer
5661 ->mirror_syntax_table),
5666 d_after = POS_AFTER_GAP_UNSAFE (d);
5667 emch2 = charptr_emchar (d_after);
5669 xpos = SYNTAX_CACHE_BYTE_TO_CHAR (PTR_TO_OFFSET (d));
5670 UPDATE_SYNTAX_CACHE_FORWARD (xpos + 1);
5672 syn2 = SYNTAX_FROM_CACHE
5673 (XCHAR_TABLE (regex_emacs_buffer
5674 ->mirror_syntax_table),
5679 result = (syn2 == Sword);
5681 result = (syn1 == Sword);
5683 result = ((syn1 == Sword) != (syn2 == Sword));
5686 if (result == should_succeed)
5692 DEBUG_PRINT1 ("EXECUTING notwordbound.\n");
5694 goto matchwordbound;
5697 DEBUG_PRINT1 ("EXECUTING wordbeg.\n");
5698 if (AT_STRINGS_END (d))
5701 /* XEmacs: this originally read:
5703 if (WORDCHAR_P (d) && (AT_STRINGS_BEG (d) || !WORDCHAR_P (d - 1)))
5707 re_char *dtmp = POS_AFTER_GAP_UNSAFE (d);
5708 Emchar emch = charptr_emchar (dtmp);
5710 int charpos = SYNTAX_CACHE_BYTE_TO_CHAR (PTR_TO_OFFSET (d));
5711 UPDATE_SYNTAX_CACHE (charpos);
5713 if (SYNTAX_FROM_CACHE (XCHAR_TABLE (regex_emacs_buffer->mirror_syntax_table),
5716 if (AT_STRINGS_BEG (d))
5718 dtmp = POS_BEFORE_GAP_UNSAFE (d);
5720 emch = charptr_emchar (dtmp);
5722 UPDATE_SYNTAX_CACHE_BACKWARD (charpos - 1);
5724 if (SYNTAX_FROM_CACHE (XCHAR_TABLE (regex_emacs_buffer->mirror_syntax_table),
5731 DEBUG_PRINT1 ("EXECUTING wordend.\n");
5732 if (AT_STRINGS_BEG (d))
5735 /* XEmacs: this originally read:
5737 if (!AT_STRINGS_BEG (d) && WORDCHAR_P (d - 1)
5738 && (!WORDCHAR_P (d) || AT_STRINGS_END (d)))
5741 The or condition is incorrect (reversed).
5746 int charpos = SYNTAX_CACHE_BYTE_TO_CHAR (PTR_TO_OFFSET (d)) - 1;
5747 UPDATE_SYNTAX_CACHE (charpos);
5749 dtmp = POS_BEFORE_GAP_UNSAFE (d);
5751 emch = charptr_emchar (dtmp);
5752 if (SYNTAX_FROM_CACHE (XCHAR_TABLE (regex_emacs_buffer->mirror_syntax_table),
5755 if (AT_STRINGS_END (d))
5757 dtmp = POS_AFTER_GAP_UNSAFE (d);
5758 emch = charptr_emchar (dtmp);
5760 UPDATE_SYNTAX_CACHE_FORWARD (charpos + 1);
5762 if (SYNTAX_FROM_CACHE (XCHAR_TABLE (regex_emacs_buffer->mirror_syntax_table),
5770 DEBUG_PRINT1 ("EXECUTING before_dot.\n");
5771 if (! (NILP (regex_match_object) || BUFFERP (regex_match_object))
5772 || (BUF_PTR_BYTE_POS (regex_emacs_buffer, (unsigned char *) d)
5773 >= BUF_PT (regex_emacs_buffer)))
5778 DEBUG_PRINT1 ("EXECUTING at_dot.\n");
5779 if (! (NILP (regex_match_object) || BUFFERP (regex_match_object))
5780 || (BUF_PTR_BYTE_POS (regex_emacs_buffer, (unsigned char *) d)
5781 != BUF_PT (regex_emacs_buffer)))
5786 DEBUG_PRINT1 ("EXECUTING after_dot.\n");
5787 if (! (NILP (regex_match_object) || BUFFERP (regex_match_object))
5788 || (BUF_PTR_BYTE_POS (regex_emacs_buffer, (unsigned char *) d)
5789 <= BUF_PT (regex_emacs_buffer)))
5792 #if 0 /* not emacs19 */
5794 DEBUG_PRINT1 ("EXECUTING at_dot.\n");
5795 if (BUF_PTR_BYTE_POS (regex_emacs_buffer, (unsigned char *) d) + 1
5796 != BUF_PT (regex_emacs_buffer))
5799 #endif /* not emacs19 */
5802 DEBUG_PRINT2 ("EXECUTING syntaxspec %d.\n", mcnt);
5807 DEBUG_PRINT1 ("EXECUTING Emacs wordchar.\n");
5819 int charpos = SYNTAX_CACHE_BYTE_TO_CHAR (PTR_TO_OFFSET (d));
5820 UPDATE_SYNTAX_CACHE (charpos);
5824 emch = charptr_emchar ((const Bufbyte *) d);
5825 matches = (SYNTAX_FROM_CACHE (XCHAR_TABLE (regex_emacs_buffer->mirror_syntax_table),
5826 emch) == (enum syntaxcode) mcnt);
5828 if (matches != should_succeed)
5830 SET_REGS_MATCHED ();
5835 DEBUG_PRINT2 ("EXECUTING notsyntaxspec %d.\n", mcnt);
5837 goto matchnotsyntax;
5840 DEBUG_PRINT1 ("EXECUTING Emacs notwordchar.\n");
5844 goto matchornotsyntax;
5847 /* 97/2/17 jhod Mule category code patch */
5856 emch = charptr_emchar ((const Bufbyte *) d);
5858 if (check_category_char(emch, regex_emacs_buffer->category_table,
5859 mcnt, should_succeed))
5861 SET_REGS_MATCHED ();
5865 case notcategoryspec:
5867 goto matchornotcategory;
5868 /* end of category patch */
5870 #else /* not emacs */
5872 DEBUG_PRINT1 ("EXECUTING non-Emacs wordchar.\n");
5874 if (!WORDCHAR_P_UNSAFE ((int) (*d)))
5876 SET_REGS_MATCHED ();
5881 DEBUG_PRINT1 ("EXECUTING non-Emacs notwordchar.\n");
5883 if (!WORDCHAR_P_UNSAFE ((int) (*d)))
5885 SET_REGS_MATCHED ();
5893 continue; /* Successfully executed one pattern command; keep going. */
5896 /* We goto here if a matching operation fails. */
5898 if (!FAIL_STACK_EMPTY ())
5899 { /* A restart point is known. Restore to that state. */
5900 DEBUG_PRINT1 ("\nFAIL:\n");
5901 POP_FAILURE_POINT (d, p,
5902 lowest_active_reg, highest_active_reg,
5903 regstart, regend, reg_info);
5905 /* If this failure point is a dummy, try the next one. */
5909 /* If we failed to the end of the pattern, don't examine *p. */
5913 re_bool is_a_jump_n = false;
5915 /* If failed to a backwards jump that's part of a repetition
5916 loop, need to pop this failure point and use the next one. */
5917 switch ((re_opcode_t) *p)
5921 case maybe_pop_jump:
5922 case pop_failure_jump:
5925 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
5928 if ((is_a_jump_n && (re_opcode_t) *p1 == succeed_n)
5930 && (re_opcode_t) *p1 == on_failure_jump))
5938 if (d >= string1 && d <= end1)
5942 break; /* Matching at this starting point really fails. */
5946 goto restore_best_regs;
5950 return -1; /* Failure to match. */
5953 /* Subroutine definitions for re_match_2. */
5956 /* We are passed P pointing to a register number after a start_memory.
5958 Return true if the pattern up to the corresponding stop_memory can
5959 match the empty string, and false otherwise.
5961 If we find the matching stop_memory, sets P to point to one past its number.
5962 Otherwise, sets P to an undefined byte less than or equal to END.
5964 We don't handle duplicates properly (yet). */
5967 group_match_null_string_p (unsigned char **p, unsigned char *end,
5968 register_info_type *register_info)
5971 /* Point to after the args to the start_memory. */
5972 unsigned char *p1 = *p + 2;
5976 /* Skip over opcodes that can match nothing, and return true or
5977 false, as appropriate, when we get to one that can't, or to the
5978 matching stop_memory. */
5980 switch ((re_opcode_t) *p1)
5982 /* Could be either a loop or a series of alternatives. */
5983 case on_failure_jump:
5985 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
5987 /* If the next operation is not a jump backwards in the
5992 /* Go through the on_failure_jumps of the alternatives,
5993 seeing if any of the alternatives cannot match nothing.
5994 The last alternative starts with only a jump,
5995 whereas the rest start with on_failure_jump and end
5996 with a jump, e.g., here is the pattern for `a|b|c':
5998 /on_failure_jump/0/6/exactn/1/a/jump_past_alt/0/6
5999 /on_failure_jump/0/6/exactn/1/b/jump_past_alt/0/3
6002 So, we have to first go through the first (n-1)
6003 alternatives and then deal with the last one separately. */
6006 /* Deal with the first (n-1) alternatives, which start
6007 with an on_failure_jump (see above) that jumps to right
6008 past a jump_past_alt. */
6010 while ((re_opcode_t) p1[mcnt-3] == jump_past_alt)
6012 /* `mcnt' holds how many bytes long the alternative
6013 is, including the ending `jump_past_alt' and
6016 if (!alt_match_null_string_p (p1, p1 + mcnt - 3,
6020 /* Move to right after this alternative, including the
6024 /* Break if it's the beginning of an n-th alternative
6025 that doesn't begin with an on_failure_jump. */
6026 if ((re_opcode_t) *p1 != on_failure_jump)
6029 /* Still have to check that it's not an n-th
6030 alternative that starts with an on_failure_jump. */
6032 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
6033 if ((re_opcode_t) p1[mcnt-3] != jump_past_alt)
6035 /* Get to the beginning of the n-th alternative. */
6041 /* Deal with the last alternative: go back and get number
6042 of the `jump_past_alt' just before it. `mcnt' contains
6043 the length of the alternative. */
6044 EXTRACT_NUMBER (mcnt, p1 - 2);
6046 if (!alt_match_null_string_p (p1, p1 + mcnt, register_info))
6049 p1 += mcnt; /* Get past the n-th alternative. */
6055 assert (p1[1] == **p);
6061 if (!common_op_match_null_string_p (&p1, end, register_info))
6064 } /* while p1 < end */
6067 } /* group_match_null_string_p */
6070 /* Similar to group_match_null_string_p, but doesn't deal with alternatives:
6071 It expects P to be the first byte of a single alternative and END one
6072 byte past the last. The alternative can contain groups. */
6075 alt_match_null_string_p (unsigned char *p, unsigned char *end,
6076 register_info_type *register_info)
6079 unsigned char *p1 = p;
6083 /* Skip over opcodes that can match nothing, and break when we get
6084 to one that can't. */
6086 switch ((re_opcode_t) *p1)
6089 case on_failure_jump:
6091 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
6096 if (!common_op_match_null_string_p (&p1, end, register_info))
6099 } /* while p1 < end */
6102 } /* alt_match_null_string_p */
6105 /* Deals with the ops common to group_match_null_string_p and
6106 alt_match_null_string_p.
6108 Sets P to one after the op and its arguments, if any. */
6111 common_op_match_null_string_p (unsigned char **p, unsigned char *end,
6112 register_info_type *register_info)
6117 unsigned char *p1 = *p;
6119 switch ((re_opcode_t) *p1++)
6139 assert (reg_no > 0 && reg_no <= MAX_REGNUM);
6140 ret = group_match_null_string_p (&p1, end, register_info);
6142 /* Have to set this here in case we're checking a group which
6143 contains a group and a back reference to it. */
6145 if (REG_MATCH_NULL_STRING_P (register_info[reg_no]) ==
6146 MATCH_NULL_UNSET_VALUE)
6147 REG_MATCH_NULL_STRING_P (register_info[reg_no]) = ret;
6153 /* If this is an optimized succeed_n for zero times, make the jump. */
6155 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
6163 /* Get to the number of times to succeed. */
6165 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
6170 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
6178 if (!REG_MATCH_NULL_STRING_P (register_info[*p1]))
6186 /* All other opcodes mean we cannot match the empty string. */
6192 } /* common_op_match_null_string_p */
6195 /* Return zero if TRANSLATE[S1] and TRANSLATE[S2] are identical for LEN
6196 bytes; nonzero otherwise. */
6199 bcmp_translate (re_char *s1, re_char *s2,
6200 REGISTER int len, RE_TRANSLATE_TYPE translate)
6202 REGISTER const unsigned char *p1 = s1, *p2 = s2;
6204 const unsigned char *p1_end = s1 + len;
6205 const unsigned char *p2_end = s2 + len;
6207 while (p1 != p1_end && p2 != p2_end)
6209 Emchar p1_ch, p2_ch;
6211 p1_ch = charptr_emchar (p1);
6212 p2_ch = charptr_emchar (p2);
6214 if (RE_TRANSLATE (p1_ch)
6215 != RE_TRANSLATE (p2_ch))
6220 #else /* not MULE */
6223 if (RE_TRANSLATE (*p1++) != RE_TRANSLATE (*p2++)) return 1;
6230 /* Entry points for GNU code. */
6232 /* re_compile_pattern is the GNU regular expression compiler: it
6233 compiles PATTERN (of length SIZE) and puts the result in BUFP.
6234 Returns 0 if the pattern was valid, otherwise an error string.
6236 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
6237 are set in BUFP on entry.
6239 We call regex_compile to do the actual compilation. */
6242 re_compile_pattern (const char *pattern, int length,
6243 struct re_pattern_buffer *bufp)
6247 /* GNU code is written to assume at least RE_NREGS registers will be set
6248 (and at least one extra will be -1). */
6249 bufp->regs_allocated = REGS_UNALLOCATED;
6251 /* And GNU code determines whether or not to get register information
6252 by passing null for the REGS argument to re_match, etc., not by
6256 /* Match anchors at newline. */
6257 bufp->newline_anchor = 1;
6259 ret = regex_compile ((unsigned char *) pattern, length, re_syntax_options, bufp);
6263 return gettext (re_error_msgid[(int) ret]);
6266 /* Entry points compatible with 4.2 BSD regex library. We don't define
6267 them unless specifically requested. */
6269 #ifdef _REGEX_RE_COMP
6271 /* BSD has one and only one pattern buffer. */
6272 static struct re_pattern_buffer re_comp_buf;
6275 re_comp (const char *s)
6281 if (!re_comp_buf.buffer)
6282 return gettext ("No previous regular expression");
6286 if (!re_comp_buf.buffer)
6288 re_comp_buf.buffer = (unsigned char *) malloc (200);
6289 if (re_comp_buf.buffer == NULL)
6290 return gettext (re_error_msgid[(int) REG_ESPACE]);
6291 re_comp_buf.allocated = 200;
6293 re_comp_buf.fastmap = (char *) malloc (1 << BYTEWIDTH);
6294 if (re_comp_buf.fastmap == NULL)
6295 return gettext (re_error_msgid[(int) REG_ESPACE]);
6298 /* Since `re_exec' always passes NULL for the `regs' argument, we
6299 don't need to initialize the pattern buffer fields which affect it. */
6301 /* Match anchors at newlines. */
6302 re_comp_buf.newline_anchor = 1;
6304 ret = regex_compile ((unsigned char *)s, strlen (s), re_syntax_options, &re_comp_buf);
6309 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
6310 return (char *) gettext (re_error_msgid[(int) ret]);
6315 re_exec (const char *s)
6317 const int len = strlen (s);
6319 0 <= re_search (&re_comp_buf, s, len, 0, len, (struct re_registers *) 0);
6321 #endif /* _REGEX_RE_COMP */
6323 /* POSIX.2 functions. Don't define these for Emacs. */
6327 /* regcomp takes a regular expression as a string and compiles it.
6329 PREG is a regex_t *. We do not expect any fields to be initialized,
6330 since POSIX says we shouldn't. Thus, we set
6332 `buffer' to the compiled pattern;
6333 `used' to the length of the compiled pattern;
6334 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
6335 REG_EXTENDED bit in CFLAGS is set; otherwise, to
6336 RE_SYNTAX_POSIX_BASIC;
6337 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
6338 `fastmap' and `fastmap_accurate' to zero;
6339 `re_nsub' to the number of subexpressions in PATTERN.
6340 (non-shy of course. POSIX probably doesn't know about
6341 shy ones, and in any case they should be invisible.)
6343 PATTERN is the address of the pattern string.
6345 CFLAGS is a series of bits which affect compilation.
6347 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
6348 use POSIX basic syntax.
6350 If REG_NEWLINE is set, then . and [^...] don't match newline.
6351 Also, regexec will try a match beginning after every newline.
6353 If REG_ICASE is set, then we considers upper- and lowercase
6354 versions of letters to be equivalent when matching.
6356 If REG_NOSUB is set, then when PREG is passed to regexec, that
6357 routine will report only success or failure, and nothing about the
6360 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
6361 the return codes and their meanings.) */
6364 regcomp (regex_t *preg, const char *pattern, int cflags)
6368 = (cflags & REG_EXTENDED) ?
6369 RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC;
6371 /* regex_compile will allocate the space for the compiled pattern. */
6373 preg->allocated = 0;
6376 /* Don't bother to use a fastmap when searching. This simplifies the
6377 REG_NEWLINE case: if we used a fastmap, we'd have to put all the
6378 characters after newlines into the fastmap. This way, we just try
6382 if (cflags & REG_ICASE)
6386 preg->translate = (char *) malloc (CHAR_SET_SIZE);
6387 if (preg->translate == NULL)
6388 return (int) REG_ESPACE;
6390 /* Map uppercase characters to corresponding lowercase ones. */
6391 for (i = 0; i < CHAR_SET_SIZE; i++)
6392 preg->translate[i] = ISUPPER (i) ? tolower (i) : i;
6395 preg->translate = NULL;
6397 /* If REG_NEWLINE is set, newlines are treated differently. */
6398 if (cflags & REG_NEWLINE)
6399 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
6400 syntax &= ~RE_DOT_NEWLINE;
6401 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
6402 /* It also changes the matching behavior. */
6403 preg->newline_anchor = 1;
6406 preg->newline_anchor = 0;
6408 preg->no_sub = !!(cflags & REG_NOSUB);
6410 /* POSIX says a null character in the pattern terminates it, so we
6411 can use strlen here in compiling the pattern. */
6412 ret = regex_compile ((unsigned char *) pattern, strlen (pattern), syntax, preg);
6414 /* POSIX doesn't distinguish between an unmatched open-group and an
6415 unmatched close-group: both are REG_EPAREN. */
6416 if (ret == REG_ERPAREN) ret = REG_EPAREN;
6422 /* regexec searches for a given pattern, specified by PREG, in the
6425 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
6426 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
6427 least NMATCH elements, and we set them to the offsets of the
6428 corresponding matched substrings.
6430 EFLAGS specifies `execution flags' which affect matching: if
6431 REG_NOTBOL is set, then ^ does not match at the beginning of the
6432 string; if REG_NOTEOL is set, then $ does not match at the end.
6434 We return 0 if we find a match and REG_NOMATCH if not. */
6437 regexec (const regex_t *preg, const char *string, Element_count nmatch,
6438 regmatch_t pmatch[], int eflags)
6441 struct re_registers regs;
6442 regex_t private_preg;
6443 int len = strlen (string);
6444 re_bool want_reg_info = !preg->no_sub && nmatch > 0;
6446 private_preg = *preg;
6448 private_preg.not_bol = !!(eflags & REG_NOTBOL);
6449 private_preg.not_eol = !!(eflags & REG_NOTEOL);
6451 /* The user has told us exactly how many registers to return
6452 information about, via `nmatch'. We have to pass that on to the
6453 matching routines. */
6454 private_preg.regs_allocated = REGS_FIXED;
6458 regs.num_regs = nmatch;
6459 regs.start = TALLOC (nmatch, regoff_t);
6460 regs.end = TALLOC (nmatch, regoff_t);
6461 if (regs.start == NULL || regs.end == NULL)
6462 return (int) REG_NOMATCH;
6465 /* Perform the searching operation. */
6466 ret = re_search (&private_preg, string, len,
6467 /* start: */ 0, /* range: */ len,
6468 want_reg_info ? ®s : (struct re_registers *) 0);
6470 /* Copy the register information to the POSIX structure. */
6477 for (r = 0; r < nmatch; r++)
6479 pmatch[r].rm_so = regs.start[r];
6480 pmatch[r].rm_eo = regs.end[r];
6484 /* If we needed the temporary register info, free the space now. */
6489 /* We want zero return to mean success, unlike `re_search'. */
6490 return ret >= 0 ? (int) REG_NOERROR : (int) REG_NOMATCH;
6494 /* Returns a message corresponding to an error code, ERRCODE, returned
6495 from either regcomp or regexec. We don't use PREG here. */
6498 regerror (int errcode, const regex_t *preg, char *errbuf,
6499 Memory_count errbuf_size)
6502 Memory_count msg_size;
6505 || (size_t) errcode >= (sizeof (re_error_msgid)
6506 / sizeof (re_error_msgid[0])))
6507 /* Only error codes returned by the rest of the code should be passed
6508 to this routine. If we are given anything else, or if other regex
6509 code generates an invalid error code, then the program has a bug.
6510 Dump core so we can fix it. */
6513 msg = gettext (re_error_msgid[errcode]);
6515 msg_size = strlen (msg) + 1; /* Includes the null. */
6517 if (errbuf_size != 0)
6519 if (msg_size > errbuf_size)
6521 strncpy (errbuf, msg, errbuf_size - 1);
6522 errbuf[errbuf_size - 1] = 0;
6525 strcpy (errbuf, msg);
6532 /* Free dynamically allocated space used by PREG. */
6535 regfree (regex_t *preg)
6537 if (preg->buffer != NULL)
6538 free (preg->buffer);
6539 preg->buffer = NULL;
6541 preg->allocated = 0;
6544 if (preg->fastmap != NULL)
6545 free (preg->fastmap);
6546 preg->fastmap = NULL;
6547 preg->fastmap_accurate = 0;
6549 if (preg->translate != NULL)
6550 free (preg->translate);
6551 preg->translate = NULL;
6554 #endif /* not emacs */
6558 make-backup-files: t
6560 trim-versions-without-asking: nil