1/* Extended regular expression matching and search library.
2 Copyright (C) 2002-2019 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <https://www.gnu.org/licenses/>. */
19
20#ifdef _LIBC
21# include <locale/weight.h>
22#endif
23
24static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
25 size_t length, reg_syntax_t syntax);
26static void re_compile_fastmap_iter (regex_t *bufp,
27 const re_dfastate_t *init_state,
28 char *fastmap);
29static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
30#ifdef RE_ENABLE_I18N
31static void free_charset (re_charset_t *cset);
32#endif /* RE_ENABLE_I18N */
33static void free_workarea_compile (regex_t *preg);
34static reg_errcode_t create_initial_state (re_dfa_t *dfa);
35#ifdef RE_ENABLE_I18N
36static void optimize_utf8 (re_dfa_t *dfa);
37#endif
38static reg_errcode_t analyze (regex_t *preg);
39static reg_errcode_t preorder (bin_tree_t *root,
40 reg_errcode_t (fn (void *, bin_tree_t *)),
41 void *extra);
42static reg_errcode_t postorder (bin_tree_t *root,
43 reg_errcode_t (fn (void *, bin_tree_t *)),
44 void *extra);
45static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
46static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
47static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
48 bin_tree_t *node);
49static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
50static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
51static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
52static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
53static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
54 unsigned int constraint);
55static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
56static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
57 Idx node, bool root);
58static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
59static Idx fetch_number (re_string_t *input, re_token_t *token,
60 reg_syntax_t syntax);
61static int peek_token (re_token_t *token, re_string_t *input,
62 reg_syntax_t syntax);
63static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
64 reg_syntax_t syntax, reg_errcode_t *err);
65static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
66 re_token_t *token, reg_syntax_t syntax,
67 Idx nest, reg_errcode_t *err);
68static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
69 re_token_t *token, reg_syntax_t syntax,
70 Idx nest, reg_errcode_t *err);
71static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
72 re_token_t *token, reg_syntax_t syntax,
73 Idx nest, reg_errcode_t *err);
74static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
75 re_token_t *token, reg_syntax_t syntax,
76 Idx nest, reg_errcode_t *err);
77static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
78 re_dfa_t *dfa, re_token_t *token,
79 reg_syntax_t syntax, reg_errcode_t *err);
80static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
81 re_token_t *token, reg_syntax_t syntax,
82 reg_errcode_t *err);
83static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
84 re_string_t *regexp,
85 re_token_t *token, int token_len,
86 re_dfa_t *dfa,
87 reg_syntax_t syntax,
88 bool accept_hyphen);
89static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
90 re_string_t *regexp,
91 re_token_t *token);
92#ifdef RE_ENABLE_I18N
93static reg_errcode_t build_equiv_class (bitset_t sbcset,
94 re_charset_t *mbcset,
95 Idx *equiv_class_alloc,
96 const unsigned char *name);
97static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
98 bitset_t sbcset,
99 re_charset_t *mbcset,
100 Idx *char_class_alloc,
101 const char *class_name,
102 reg_syntax_t syntax);
103#else /* not RE_ENABLE_I18N */
104static reg_errcode_t build_equiv_class (bitset_t sbcset,
105 const unsigned char *name);
106static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
107 bitset_t sbcset,
108 const char *class_name,
109 reg_syntax_t syntax);
110#endif /* not RE_ENABLE_I18N */
111static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
112 RE_TRANSLATE_TYPE trans,
113 const char *class_name,
114 const char *extra,
115 bool non_match, reg_errcode_t *err);
116static bin_tree_t *create_tree (re_dfa_t *dfa,
117 bin_tree_t *left, bin_tree_t *right,
118 re_token_type_t type);
119static bin_tree_t *create_token_tree (re_dfa_t *dfa,
120 bin_tree_t *left, bin_tree_t *right,
121 const re_token_t *token);
122static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
123static void free_token (re_token_t *node);
124static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
125static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
126
127/* This table gives an error message for each of the error codes listed
128 in regex.h. Obviously the order here has to be same as there.
129 POSIX doesn't require that we do anything for REG_NOERROR,
130 but why not be nice? */
131
132static const char __re_error_msgid[] =
133 {
134#define REG_NOERROR_IDX 0
135 gettext_noop ("Success") /* REG_NOERROR */
136 "\0"
137#define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
138 gettext_noop ("No match") /* REG_NOMATCH */
139 "\0"
140#define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
141 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
142 "\0"
143#define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
144 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
145 "\0"
146#define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
147 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
148 "\0"
149#define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
150 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
151 "\0"
152#define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
153 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
154 "\0"
155#define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
156 gettext_noop ("Unmatched [, [^, [:, [., or [=") /* REG_EBRACK */
157 "\0"
158#define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [, [^, [:, [., or [=")
159 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
160 "\0"
161#define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
162 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
163 "\0"
164#define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
165 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
166 "\0"
167#define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
168 gettext_noop ("Invalid range end") /* REG_ERANGE */
169 "\0"
170#define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
171 gettext_noop ("Memory exhausted") /* REG_ESPACE */
172 "\0"
173#define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
174 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
175 "\0"
176#define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
177 gettext_noop ("Premature end of regular expression") /* REG_EEND */
178 "\0"
179#define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
180 gettext_noop ("Regular expression too big") /* REG_ESIZE */
181 "\0"
182#define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
183 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
184 };
185
186static const size_t __re_error_msgid_idx[] =
187 {
188 REG_NOERROR_IDX,
189 REG_NOMATCH_IDX,
190 REG_BADPAT_IDX,
191 REG_ECOLLATE_IDX,
192 REG_ECTYPE_IDX,
193 REG_EESCAPE_IDX,
194 REG_ESUBREG_IDX,
195 REG_EBRACK_IDX,
196 REG_EPAREN_IDX,
197 REG_EBRACE_IDX,
198 REG_BADBR_IDX,
199 REG_ERANGE_IDX,
200 REG_ESPACE_IDX,
201 REG_BADRPT_IDX,
202 REG_EEND_IDX,
203 REG_ESIZE_IDX,
204 REG_ERPAREN_IDX
205 };
206
207/* Entry points for GNU code. */
208
209/* re_compile_pattern is the GNU regular expression compiler: it
210 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
211 Returns 0 if the pattern was valid, otherwise an error string.
212
213 Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
214 are set in BUFP on entry. */
215
216const char *
217re_compile_pattern (const char *pattern, size_t length,
218 struct re_pattern_buffer *bufp)
219{
220 reg_errcode_t ret;
221
222 /* And GNU code determines whether or not to get register information
223 by passing null for the REGS argument to re_match, etc., not by
224 setting no_sub, unless RE_NO_SUB is set. */
225 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
226
227 /* Match anchors at newline. */
228 bufp->newline_anchor = 1;
229
230 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
231
232 if (!ret)
233 return NULL;
234 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
235}
236weak_alias (__re_compile_pattern, re_compile_pattern)
237
238/* Set by 're_set_syntax' to the current regexp syntax to recognize. Can
239 also be assigned to arbitrarily: each pattern buffer stores its own
240 syntax, so it can be changed between regex compilations. */
241/* This has no initializer because initialized variables in Emacs
242 become read-only after dumping. */
243reg_syntax_t re_syntax_options;
244
245
246/* Specify the precise syntax of regexps for compilation. This provides
247 for compatibility for various utilities which historically have
248 different, incompatible syntaxes.
249
250 The argument SYNTAX is a bit mask comprised of the various bits
251 defined in regex.h. We return the old syntax. */
252
253reg_syntax_t
254re_set_syntax (reg_syntax_t syntax)
255{
256 reg_syntax_t ret = re_syntax_options;
257
258 re_syntax_options = syntax;
259 return ret;
260}
261weak_alias (__re_set_syntax, re_set_syntax)
262
263int
264re_compile_fastmap (struct re_pattern_buffer *bufp)
265{
266 re_dfa_t *dfa = bufp->buffer;
267 char *fastmap = bufp->fastmap;
268
269 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
270 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
271 if (dfa->init_state != dfa->init_state_word)
272 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
273 if (dfa->init_state != dfa->init_state_nl)
274 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
275 if (dfa->init_state != dfa->init_state_begbuf)
276 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
277 bufp->fastmap_accurate = 1;
278 return 0;
279}
280weak_alias (__re_compile_fastmap, re_compile_fastmap)
281
282static inline void
283__attribute__ ((always_inline))
284re_set_fastmap (char *fastmap, bool icase, int ch)
285{
286 fastmap[ch] = 1;
287 if (icase)
288 fastmap[tolower (ch)] = 1;
289}
290
291/* Helper function for re_compile_fastmap.
292 Compile fastmap for the initial_state INIT_STATE. */
293
294static void
295re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
296 char *fastmap)
297{
298 re_dfa_t *dfa = bufp->buffer;
299 Idx node_cnt;
300 bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
301 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
302 {
303 Idx node = init_state->nodes.elems[node_cnt];
304 re_token_type_t type = dfa->nodes[node].type;
305
306 if (type == CHARACTER)
307 {
308 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
309#ifdef RE_ENABLE_I18N
310 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
311 {
312 unsigned char buf[MB_LEN_MAX];
313 unsigned char *p;
314 wchar_t wc;
315 mbstate_t state;
316
317 p = buf;
318 *p++ = dfa->nodes[node].opr.c;
319 while (++node < dfa->nodes_len
320 && dfa->nodes[node].type == CHARACTER
321 && dfa->nodes[node].mb_partial)
322 *p++ = dfa->nodes[node].opr.c;
323 memset (&state, '\0', sizeof (state));
324 if (__mbrtowc (&wc, (const char *) buf, p - buf,
325 &state) == p - buf
326 && (__wcrtomb ((char *) buf, __towlower (wc), &state)
327 != (size_t) -1))
328 re_set_fastmap (fastmap, false, buf[0]);
329 }
330#endif
331 }
332 else if (type == SIMPLE_BRACKET)
333 {
334 int i, ch;
335 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
336 {
337 int j;
338 bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
339 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
340 if (w & ((bitset_word_t) 1 << j))
341 re_set_fastmap (fastmap, icase, ch);
342 }
343 }
344#ifdef RE_ENABLE_I18N
345 else if (type == COMPLEX_BRACKET)
346 {
347 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
348 Idx i;
349
350# ifdef _LIBC
351 /* See if we have to try all bytes which start multiple collation
352 elements.
353 e.g. In da_DK, we want to catch 'a' since "aa" is a valid
354 collation element, and don't catch 'b' since 'b' is
355 the only collation element which starts from 'b' (and
356 it is caught by SIMPLE_BRACKET). */
357 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
358 && (cset->ncoll_syms || cset->nranges))
359 {
360 const int32_t *table = (const int32_t *)
361 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
362 for (i = 0; i < SBC_MAX; ++i)
363 if (table[i] < 0)
364 re_set_fastmap (fastmap, icase, i);
365 }
366# endif /* _LIBC */
367
368 /* See if we have to start the match at all multibyte characters,
369 i.e. where we would not find an invalid sequence. This only
370 applies to multibyte character sets; for single byte character
371 sets, the SIMPLE_BRACKET again suffices. */
372 if (dfa->mb_cur_max > 1
373 && (cset->nchar_classes || cset->non_match || cset->nranges
374# ifdef _LIBC
375 || cset->nequiv_classes
376# endif /* _LIBC */
377 ))
378 {
379 unsigned char c = 0;
380 do
381 {
382 mbstate_t mbs;
383 memset (&mbs, 0, sizeof (mbs));
384 if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
385 re_set_fastmap (fastmap, false, (int) c);
386 }
387 while (++c != 0);
388 }
389
390 else
391 {
392 /* ... Else catch all bytes which can start the mbchars. */
393 for (i = 0; i < cset->nmbchars; ++i)
394 {
395 char buf[256];
396 mbstate_t state;
397 memset (&state, '\0', sizeof (state));
398 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
399 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
400 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
401 {
402 if (__wcrtomb (buf, __towlower (cset->mbchars[i]), &state)
403 != (size_t) -1)
404 re_set_fastmap (fastmap, false, *(unsigned char *) buf);
405 }
406 }
407 }
408 }
409#endif /* RE_ENABLE_I18N */
410 else if (type == OP_PERIOD
411#ifdef RE_ENABLE_I18N
412 || type == OP_UTF8_PERIOD
413#endif /* RE_ENABLE_I18N */
414 || type == END_OF_RE)
415 {
416 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
417 if (type == END_OF_RE)
418 bufp->can_be_null = 1;
419 return;
420 }
421 }
422}
423
424/* Entry point for POSIX code. */
425/* regcomp takes a regular expression as a string and compiles it.
426
427 PREG is a regex_t *. We do not expect any fields to be initialized,
428 since POSIX says we shouldn't. Thus, we set
429
430 'buffer' to the compiled pattern;
431 'used' to the length of the compiled pattern;
432 'syntax' to RE_SYNTAX_POSIX_EXTENDED if the
433 REG_EXTENDED bit in CFLAGS is set; otherwise, to
434 RE_SYNTAX_POSIX_BASIC;
435 'newline_anchor' to REG_NEWLINE being set in CFLAGS;
436 'fastmap' to an allocated space for the fastmap;
437 'fastmap_accurate' to zero;
438 're_nsub' to the number of subexpressions in PATTERN.
439
440 PATTERN is the address of the pattern string.
441
442 CFLAGS is a series of bits which affect compilation.
443
444 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
445 use POSIX basic syntax.
446
447 If REG_NEWLINE is set, then . and [^...] don't match newline.
448 Also, regexec will try a match beginning after every newline.
449
450 If REG_ICASE is set, then we considers upper- and lowercase
451 versions of letters to be equivalent when matching.
452
453 If REG_NOSUB is set, then when PREG is passed to regexec, that
454 routine will report only success or failure, and nothing about the
455 registers.
456
457 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
458 the return codes and their meanings.) */
459
460int
461regcomp (regex_t *__restrict preg, const char *__restrict pattern, int cflags)
462{
463 reg_errcode_t ret;
464 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
465 : RE_SYNTAX_POSIX_BASIC);
466
467 preg->buffer = NULL;
468 preg->allocated = 0;
469 preg->used = 0;
470
471 /* Try to allocate space for the fastmap. */
472 preg->fastmap = re_malloc (char, SBC_MAX);
473 if (__glibc_unlikely (preg->fastmap == NULL))
474 return REG_ESPACE;
475
476 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
477
478 /* If REG_NEWLINE is set, newlines are treated differently. */
479 if (cflags & REG_NEWLINE)
480 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
481 syntax &= ~RE_DOT_NEWLINE;
482 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
483 /* It also changes the matching behavior. */
484 preg->newline_anchor = 1;
485 }
486 else
487 preg->newline_anchor = 0;
488 preg->no_sub = !!(cflags & REG_NOSUB);
489 preg->translate = NULL;
490
491 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
492
493 /* POSIX doesn't distinguish between an unmatched open-group and an
494 unmatched close-group: both are REG_EPAREN. */
495 if (ret == REG_ERPAREN)
496 ret = REG_EPAREN;
497
498 /* We have already checked preg->fastmap != NULL. */
499 if (__glibc_likely (ret == REG_NOERROR))
500 /* Compute the fastmap now, since regexec cannot modify the pattern
501 buffer. This function never fails in this implementation. */
502 (void) re_compile_fastmap (preg);
503 else
504 {
505 /* Some error occurred while compiling the expression. */
506 re_free (preg->fastmap);
507 preg->fastmap = NULL;
508 }
509
510 return (int) ret;
511}
512libc_hidden_def (__regcomp)
513weak_alias (__regcomp, regcomp)
514
515/* Returns a message corresponding to an error code, ERRCODE, returned
516 from either regcomp or regexec. We don't use PREG here. */
517
518size_t
519regerror (int errcode, const regex_t *__restrict preg, char *__restrict errbuf,
520 size_t errbuf_size)
521{
522 const char *msg;
523 size_t msg_size;
524 int nerrcodes = sizeof __re_error_msgid_idx / sizeof __re_error_msgid_idx[0];
525
526 if (__glibc_unlikely (errcode < 0 || errcode >= nerrcodes))
527 /* Only error codes returned by the rest of the code should be passed
528 to this routine. If we are given anything else, or if other regex
529 code generates an invalid error code, then the program has a bug.
530 Dump core so we can fix it. */
531 abort ();
532
533 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
534
535 msg_size = strlen (msg) + 1; /* Includes the null. */
536
537 if (__glibc_likely (errbuf_size != 0))
538 {
539 size_t cpy_size = msg_size;
540 if (__glibc_unlikely (msg_size > errbuf_size))
541 {
542 cpy_size = errbuf_size - 1;
543 errbuf[cpy_size] = '\0';
544 }
545 memcpy (errbuf, msg, cpy_size);
546 }
547
548 return msg_size;
549}
550weak_alias (__regerror, regerror)
551
552
553#ifdef RE_ENABLE_I18N
554/* This static array is used for the map to single-byte characters when
555 UTF-8 is used. Otherwise we would allocate memory just to initialize
556 it the same all the time. UTF-8 is the preferred encoding so this is
557 a worthwhile optimization. */
558static const bitset_t utf8_sb_map =
559{
560 /* Set the first 128 bits. */
561# if defined __GNUC__ && !defined __STRICT_ANSI__
562 [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
563# else
564# if 4 * BITSET_WORD_BITS < ASCII_CHARS
565# error "bitset_word_t is narrower than 32 bits"
566# elif 3 * BITSET_WORD_BITS < ASCII_CHARS
567 BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
568# elif 2 * BITSET_WORD_BITS < ASCII_CHARS
569 BITSET_WORD_MAX, BITSET_WORD_MAX,
570# elif 1 * BITSET_WORD_BITS < ASCII_CHARS
571 BITSET_WORD_MAX,
572# endif
573 (BITSET_WORD_MAX
574 >> (SBC_MAX % BITSET_WORD_BITS == 0
575 ? 0
576 : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
577# endif
578};
579#endif
580
581
582static void
583free_dfa_content (re_dfa_t *dfa)
584{
585 Idx i, j;
586
587 if (dfa->nodes)
588 for (i = 0; i < dfa->nodes_len; ++i)
589 free_token (dfa->nodes + i);
590 re_free (dfa->nexts);
591 for (i = 0; i < dfa->nodes_len; ++i)
592 {
593 if (dfa->eclosures != NULL)
594 re_node_set_free (dfa->eclosures + i);
595 if (dfa->inveclosures != NULL)
596 re_node_set_free (dfa->inveclosures + i);
597 if (dfa->edests != NULL)
598 re_node_set_free (dfa->edests + i);
599 }
600 re_free (dfa->edests);
601 re_free (dfa->eclosures);
602 re_free (dfa->inveclosures);
603 re_free (dfa->nodes);
604
605 if (dfa->state_table)
606 for (i = 0; i <= dfa->state_hash_mask; ++i)
607 {
608 struct re_state_table_entry *entry = dfa->state_table + i;
609 for (j = 0; j < entry->num; ++j)
610 {
611 re_dfastate_t *state = entry->array[j];
612 free_state (state);
613 }
614 re_free (entry->array);
615 }
616 re_free (dfa->state_table);
617#ifdef RE_ENABLE_I18N
618 if (dfa->sb_char != utf8_sb_map)
619 re_free (dfa->sb_char);
620#endif
621 re_free (dfa->subexp_map);
622#ifdef DEBUG
623 re_free (dfa->re_str);
624#endif
625
626 re_free (dfa);
627}
628
629
630/* Free dynamically allocated space used by PREG. */
631
632void
633regfree (regex_t *preg)
634{
635 re_dfa_t *dfa = preg->buffer;
636 if (__glibc_likely (dfa != NULL))
637 {
638 lock_fini (dfa->lock);
639 free_dfa_content (dfa);
640 }
641 preg->buffer = NULL;
642 preg->allocated = 0;
643
644 re_free (preg->fastmap);
645 preg->fastmap = NULL;
646
647 re_free (preg->translate);
648 preg->translate = NULL;
649}
650libc_hidden_def (__regfree)
651weak_alias (__regfree, regfree)
652
653/* Entry points compatible with 4.2 BSD regex library. We don't define
654 them unless specifically requested. */
655
656#if defined _REGEX_RE_COMP || defined _LIBC
657
658/* BSD has one and only one pattern buffer. */
659static struct re_pattern_buffer re_comp_buf;
660
661char *
662# ifdef _LIBC
663/* Make these definitions weak in libc, so POSIX programs can redefine
664 these names if they don't use our functions, and still use
665 regcomp/regexec above without link errors. */
666weak_function
667# endif
668re_comp (const char *s)
669{
670 reg_errcode_t ret;
671 char *fastmap;
672
673 if (!s)
674 {
675 if (!re_comp_buf.buffer)
676 return gettext ("No previous regular expression");
677 return 0;
678 }
679
680 if (re_comp_buf.buffer)
681 {
682 fastmap = re_comp_buf.fastmap;
683 re_comp_buf.fastmap = NULL;
684 __regfree (&re_comp_buf);
685 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
686 re_comp_buf.fastmap = fastmap;
687 }
688
689 if (re_comp_buf.fastmap == NULL)
690 {
691 re_comp_buf.fastmap = re_malloc (char, SBC_MAX);
692 if (re_comp_buf.fastmap == NULL)
693 return (char *) gettext (__re_error_msgid
694 + __re_error_msgid_idx[(int) REG_ESPACE]);
695 }
696
697 /* Since 're_exec' always passes NULL for the 'regs' argument, we
698 don't need to initialize the pattern buffer fields which affect it. */
699
700 /* Match anchors at newlines. */
701 re_comp_buf.newline_anchor = 1;
702
703 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
704
705 if (!ret)
706 return NULL;
707
708 /* Yes, we're discarding 'const' here if !HAVE_LIBINTL. */
709 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
710}
711
712#ifdef _LIBC
713libc_freeres_fn (free_mem)
714{
715 __regfree (&re_comp_buf);
716}
717#endif
718
719#endif /* _REGEX_RE_COMP */
720
721/* Internal entry point.
722 Compile the regular expression PATTERN, whose length is LENGTH.
723 SYNTAX indicate regular expression's syntax. */
724
725static reg_errcode_t
726re_compile_internal (regex_t *preg, const char * pattern, size_t length,
727 reg_syntax_t syntax)
728{
729 reg_errcode_t err = REG_NOERROR;
730 re_dfa_t *dfa;
731 re_string_t regexp;
732
733 /* Initialize the pattern buffer. */
734 preg->fastmap_accurate = 0;
735 preg->syntax = syntax;
736 preg->not_bol = preg->not_eol = 0;
737 preg->used = 0;
738 preg->re_nsub = 0;
739 preg->can_be_null = 0;
740 preg->regs_allocated = REGS_UNALLOCATED;
741
742 /* Initialize the dfa. */
743 dfa = preg->buffer;
744 if (__glibc_unlikely (preg->allocated < sizeof (re_dfa_t)))
745 {
746 /* If zero allocated, but buffer is non-null, try to realloc
747 enough space. This loses if buffer's address is bogus, but
748 that is the user's responsibility. If ->buffer is NULL this
749 is a simple allocation. */
750 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
751 if (dfa == NULL)
752 return REG_ESPACE;
753 preg->allocated = sizeof (re_dfa_t);
754 preg->buffer = dfa;
755 }
756 preg->used = sizeof (re_dfa_t);
757
758 err = init_dfa (dfa, length);
759 if (__glibc_unlikely (err == REG_NOERROR && lock_init (dfa->lock) != 0))
760 err = REG_ESPACE;
761 if (__glibc_unlikely (err != REG_NOERROR))
762 {
763 free_dfa_content (dfa);
764 preg->buffer = NULL;
765 preg->allocated = 0;
766 return err;
767 }
768#ifdef DEBUG
769 /* Note: length+1 will not overflow since it is checked in init_dfa. */
770 dfa->re_str = re_malloc (char, length + 1);
771 strncpy (dfa->re_str, pattern, length + 1);
772#endif
773
774 err = re_string_construct (&regexp, pattern, length, preg->translate,
775 (syntax & RE_ICASE) != 0, dfa);
776 if (__glibc_unlikely (err != REG_NOERROR))
777 {
778 re_compile_internal_free_return:
779 free_workarea_compile (preg);
780 re_string_destruct (&regexp);
781 lock_fini (dfa->lock);
782 free_dfa_content (dfa);
783 preg->buffer = NULL;
784 preg->allocated = 0;
785 return err;
786 }
787
788 /* Parse the regular expression, and build a structure tree. */
789 preg->re_nsub = 0;
790 dfa->str_tree = parse (&regexp, preg, syntax, &err);
791 if (__glibc_unlikely (dfa->str_tree == NULL))
792 goto re_compile_internal_free_return;
793
794 /* Analyze the tree and create the nfa. */
795 err = analyze (preg);
796 if (__glibc_unlikely (err != REG_NOERROR))
797 goto re_compile_internal_free_return;
798
799#ifdef RE_ENABLE_I18N
800 /* If possible, do searching in single byte encoding to speed things up. */
801 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
802 optimize_utf8 (dfa);
803#endif
804
805 /* Then create the initial state of the dfa. */
806 err = create_initial_state (dfa);
807
808 /* Release work areas. */
809 free_workarea_compile (preg);
810 re_string_destruct (&regexp);
811
812 if (__glibc_unlikely (err != REG_NOERROR))
813 {
814 lock_fini (dfa->lock);
815 free_dfa_content (dfa);
816 preg->buffer = NULL;
817 preg->allocated = 0;
818 }
819
820 return err;
821}
822
823/* Initialize DFA. We use the length of the regular expression PAT_LEN
824 as the initial length of some arrays. */
825
826static reg_errcode_t
827init_dfa (re_dfa_t *dfa, size_t pat_len)
828{
829 __re_size_t table_size;
830#ifndef _LIBC
831 const char *codeset_name;
832#endif
833#ifdef RE_ENABLE_I18N
834 size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
835#else
836 size_t max_i18n_object_size = 0;
837#endif
838 size_t max_object_size =
839 MAX (sizeof (struct re_state_table_entry),
840 MAX (sizeof (re_token_t),
841 MAX (sizeof (re_node_set),
842 MAX (sizeof (regmatch_t),
843 max_i18n_object_size))));
844
845 memset (dfa, '\0', sizeof (re_dfa_t));
846
847 /* Force allocation of str_tree_storage the first time. */
848 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
849
850 /* Avoid overflows. The extra "/ 2" is for the table_size doubling
851 calculation below, and for similar doubling calculations
852 elsewhere. And it's <= rather than <, because some of the
853 doubling calculations add 1 afterwards. */
854 if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size) / 2
855 <= pat_len))
856 return REG_ESPACE;
857
858 dfa->nodes_alloc = pat_len + 1;
859 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
860
861 /* table_size = 2 ^ ceil(log pat_len) */
862 for (table_size = 1; ; table_size <<= 1)
863 if (table_size > pat_len)
864 break;
865
866 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
867 dfa->state_hash_mask = table_size - 1;
868
869 dfa->mb_cur_max = MB_CUR_MAX;
870#ifdef _LIBC
871 if (dfa->mb_cur_max == 6
872 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
873 dfa->is_utf8 = 1;
874 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
875 != 0);
876#else
877 codeset_name = nl_langinfo (CODESET);
878 if ((codeset_name[0] == 'U' || codeset_name[0] == 'u')
879 && (codeset_name[1] == 'T' || codeset_name[1] == 't')
880 && (codeset_name[2] == 'F' || codeset_name[2] == 'f')
881 && strcmp (codeset_name + 3 + (codeset_name[3] == '-'), "8") == 0)
882 dfa->is_utf8 = 1;
883
884 /* We check exhaustively in the loop below if this charset is a
885 superset of ASCII. */
886 dfa->map_notascii = 0;
887#endif
888
889#ifdef RE_ENABLE_I18N
890 if (dfa->mb_cur_max > 1)
891 {
892 if (dfa->is_utf8)
893 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
894 else
895 {
896 int i, j, ch;
897
898 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
899 if (__glibc_unlikely (dfa->sb_char == NULL))
900 return REG_ESPACE;
901
902 /* Set the bits corresponding to single byte chars. */
903 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
904 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
905 {
906 wint_t wch = __btowc (ch);
907 if (wch != WEOF)
908 dfa->sb_char[i] |= (bitset_word_t) 1 << j;
909# ifndef _LIBC
910 if (isascii (ch) && wch != ch)
911 dfa->map_notascii = 1;
912# endif
913 }
914 }
915 }
916#endif
917
918 if (__glibc_unlikely (dfa->nodes == NULL || dfa->state_table == NULL))
919 return REG_ESPACE;
920 return REG_NOERROR;
921}
922
923/* Initialize WORD_CHAR table, which indicate which character is
924 "word". In this case "word" means that it is the word construction
925 character used by some operators like "\<", "\>", etc. */
926
927static void
928init_word_char (re_dfa_t *dfa)
929{
930 int i = 0;
931 int j;
932 int ch = 0;
933 dfa->word_ops_used = 1;
934 if (__glibc_likely (dfa->map_notascii == 0))
935 {
936 /* Avoid uint32_t and uint64_t as some non-GCC platforms lack
937 them, an issue when this code is used in Gnulib. */
938 bitset_word_t bits0 = 0x00000000;
939 bitset_word_t bits1 = 0x03ff0000;
940 bitset_word_t bits2 = 0x87fffffe;
941 bitset_word_t bits3 = 0x07fffffe;
942 if (BITSET_WORD_BITS == 64)
943 {
944 /* Pacify gcc -Woverflow on 32-bit platformns. */
945 dfa->word_char[0] = bits1 << 31 << 1 | bits0;
946 dfa->word_char[1] = bits3 << 31 << 1 | bits2;
947 i = 2;
948 }
949 else if (BITSET_WORD_BITS == 32)
950 {
951 dfa->word_char[0] = bits0;
952 dfa->word_char[1] = bits1;
953 dfa->word_char[2] = bits2;
954 dfa->word_char[3] = bits3;
955 i = 4;
956 }
957 else
958 goto general_case;
959 ch = 128;
960
961 if (__glibc_likely (dfa->is_utf8))
962 {
963 memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
964 return;
965 }
966 }
967
968 general_case:
969 for (; i < BITSET_WORDS; ++i)
970 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
971 if (isalnum (ch) || ch == '_')
972 dfa->word_char[i] |= (bitset_word_t) 1 << j;
973}
974
975/* Free the work area which are only used while compiling. */
976
977static void
978free_workarea_compile (regex_t *preg)
979{
980 re_dfa_t *dfa = preg->buffer;
981 bin_tree_storage_t *storage, *next;
982 for (storage = dfa->str_tree_storage; storage; storage = next)
983 {
984 next = storage->next;
985 re_free (storage);
986 }
987 dfa->str_tree_storage = NULL;
988 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
989 dfa->str_tree = NULL;
990 re_free (dfa->org_indices);
991 dfa->org_indices = NULL;
992}
993
994/* Create initial states for all contexts. */
995
996static reg_errcode_t
997create_initial_state (re_dfa_t *dfa)
998{
999 Idx first, i;
1000 reg_errcode_t err;
1001 re_node_set init_nodes;
1002
1003 /* Initial states have the epsilon closure of the node which is
1004 the first node of the regular expression. */
1005 first = dfa->str_tree->first->node_idx;
1006 dfa->init_node = first;
1007 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1008 if (__glibc_unlikely (err != REG_NOERROR))
1009 return err;
1010
1011 /* The back-references which are in initial states can epsilon transit,
1012 since in this case all of the subexpressions can be null.
1013 Then we add epsilon closures of the nodes which are the next nodes of
1014 the back-references. */
1015 if (dfa->nbackref > 0)
1016 for (i = 0; i < init_nodes.nelem; ++i)
1017 {
1018 Idx node_idx = init_nodes.elems[i];
1019 re_token_type_t type = dfa->nodes[node_idx].type;
1020
1021 Idx clexp_idx;
1022 if (type != OP_BACK_REF)
1023 continue;
1024 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1025 {
1026 re_token_t *clexp_node;
1027 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1028 if (clexp_node->type == OP_CLOSE_SUBEXP
1029 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1030 break;
1031 }
1032 if (clexp_idx == init_nodes.nelem)
1033 continue;
1034
1035 if (type == OP_BACK_REF)
1036 {
1037 Idx dest_idx = dfa->edests[node_idx].elems[0];
1038 if (!re_node_set_contains (&init_nodes, dest_idx))
1039 {
1040 reg_errcode_t merge_err
1041 = re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1042 if (merge_err != REG_NOERROR)
1043 return merge_err;
1044 i = 0;
1045 }
1046 }
1047 }
1048
1049 /* It must be the first time to invoke acquire_state. */
1050 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1051 /* We don't check ERR here, since the initial state must not be NULL. */
1052 if (__glibc_unlikely (dfa->init_state == NULL))
1053 return err;
1054 if (dfa->init_state->has_constraint)
1055 {
1056 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1057 CONTEXT_WORD);
1058 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1059 CONTEXT_NEWLINE);
1060 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1061 &init_nodes,
1062 CONTEXT_NEWLINE
1063 | CONTEXT_BEGBUF);
1064 if (__glibc_unlikely (dfa->init_state_word == NULL
1065 || dfa->init_state_nl == NULL
1066 || dfa->init_state_begbuf == NULL))
1067 return err;
1068 }
1069 else
1070 dfa->init_state_word = dfa->init_state_nl
1071 = dfa->init_state_begbuf = dfa->init_state;
1072
1073 re_node_set_free (&init_nodes);
1074 return REG_NOERROR;
1075}
1076
1077#ifdef RE_ENABLE_I18N
1078/* If it is possible to do searching in single byte encoding instead of UTF-8
1079 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1080 DFA nodes where needed. */
1081
1082static void
1083optimize_utf8 (re_dfa_t *dfa)
1084{
1085 Idx node;
1086 int i;
1087 bool mb_chars = false;
1088 bool has_period = false;
1089
1090 for (node = 0; node < dfa->nodes_len; ++node)
1091 switch (dfa->nodes[node].type)
1092 {
1093 case CHARACTER:
1094 if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1095 mb_chars = true;
1096 break;
1097 case ANCHOR:
1098 switch (dfa->nodes[node].opr.ctx_type)
1099 {
1100 case LINE_FIRST:
1101 case LINE_LAST:
1102 case BUF_FIRST:
1103 case BUF_LAST:
1104 break;
1105 default:
1106 /* Word anchors etc. cannot be handled. It's okay to test
1107 opr.ctx_type since constraints (for all DFA nodes) are
1108 created by ORing one or more opr.ctx_type values. */
1109 return;
1110 }
1111 break;
1112 case OP_PERIOD:
1113 has_period = true;
1114 break;
1115 case OP_BACK_REF:
1116 case OP_ALT:
1117 case END_OF_RE:
1118 case OP_DUP_ASTERISK:
1119 case OP_OPEN_SUBEXP:
1120 case OP_CLOSE_SUBEXP:
1121 break;
1122 case COMPLEX_BRACKET:
1123 return;
1124 case SIMPLE_BRACKET:
1125 /* Just double check. */
1126 {
1127 int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1128 ? 0
1129 : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1130 for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1131 {
1132 if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1133 return;
1134 rshift = 0;
1135 }
1136 }
1137 break;
1138 default:
1139 abort ();
1140 }
1141
1142 if (mb_chars || has_period)
1143 for (node = 0; node < dfa->nodes_len; ++node)
1144 {
1145 if (dfa->nodes[node].type == CHARACTER
1146 && dfa->nodes[node].opr.c >= ASCII_CHARS)
1147 dfa->nodes[node].mb_partial = 0;
1148 else if (dfa->nodes[node].type == OP_PERIOD)
1149 dfa->nodes[node].type = OP_UTF8_PERIOD;
1150 }
1151
1152 /* The search can be in single byte locale. */
1153 dfa->mb_cur_max = 1;
1154 dfa->is_utf8 = 0;
1155 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1156}
1157#endif
1158
1159/* Analyze the structure tree, and calculate "first", "next", "edest",
1160 "eclosure", and "inveclosure". */
1161
1162static reg_errcode_t
1163analyze (regex_t *preg)
1164{
1165 re_dfa_t *dfa = preg->buffer;
1166 reg_errcode_t ret;
1167
1168 /* Allocate arrays. */
1169 dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1170 dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1171 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1172 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1173 if (__glibc_unlikely (dfa->nexts == NULL || dfa->org_indices == NULL
1174 || dfa->edests == NULL || dfa->eclosures == NULL))
1175 return REG_ESPACE;
1176
1177 dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1178 if (dfa->subexp_map != NULL)
1179 {
1180 Idx i;
1181 for (i = 0; i < preg->re_nsub; i++)
1182 dfa->subexp_map[i] = i;
1183 preorder (dfa->str_tree, optimize_subexps, dfa);
1184 for (i = 0; i < preg->re_nsub; i++)
1185 if (dfa->subexp_map[i] != i)
1186 break;
1187 if (i == preg->re_nsub)
1188 {
1189 re_free (dfa->subexp_map);
1190 dfa->subexp_map = NULL;
1191 }
1192 }
1193
1194 ret = postorder (dfa->str_tree, lower_subexps, preg);
1195 if (__glibc_unlikely (ret != REG_NOERROR))
1196 return ret;
1197 ret = postorder (dfa->str_tree, calc_first, dfa);
1198 if (__glibc_unlikely (ret != REG_NOERROR))
1199 return ret;
1200 preorder (dfa->str_tree, calc_next, dfa);
1201 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1202 if (__glibc_unlikely (ret != REG_NOERROR))
1203 return ret;
1204 ret = calc_eclosure (dfa);
1205 if (__glibc_unlikely (ret != REG_NOERROR))
1206 return ret;
1207
1208 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1209 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1210 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1211 || dfa->nbackref)
1212 {
1213 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1214 if (__glibc_unlikely (dfa->inveclosures == NULL))
1215 return REG_ESPACE;
1216 ret = calc_inveclosure (dfa);
1217 }
1218
1219 return ret;
1220}
1221
1222/* Our parse trees are very unbalanced, so we cannot use a stack to
1223 implement parse tree visits. Instead, we use parent pointers and
1224 some hairy code in these two functions. */
1225static reg_errcode_t
1226postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1227 void *extra)
1228{
1229 bin_tree_t *node, *prev;
1230
1231 for (node = root; ; )
1232 {
1233 /* Descend down the tree, preferably to the left (or to the right
1234 if that's the only child). */
1235 while (node->left || node->right)
1236 if (node->left)
1237 node = node->left;
1238 else
1239 node = node->right;
1240
1241 do
1242 {
1243 reg_errcode_t err = fn (extra, node);
1244 if (__glibc_unlikely (err != REG_NOERROR))
1245 return err;
1246 if (node->parent == NULL)
1247 return REG_NOERROR;
1248 prev = node;
1249 node = node->parent;
1250 }
1251 /* Go up while we have a node that is reached from the right. */
1252 while (node->right == prev || node->right == NULL);
1253 node = node->right;
1254 }
1255}
1256
1257static reg_errcode_t
1258preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1259 void *extra)
1260{
1261 bin_tree_t *node;
1262
1263 for (node = root; ; )
1264 {
1265 reg_errcode_t err = fn (extra, node);
1266 if (__glibc_unlikely (err != REG_NOERROR))
1267 return err;
1268
1269 /* Go to the left node, or up and to the right. */
1270 if (node->left)
1271 node = node->left;
1272 else
1273 {
1274 bin_tree_t *prev = NULL;
1275 while (node->right == prev || node->right == NULL)
1276 {
1277 prev = node;
1278 node = node->parent;
1279 if (!node)
1280 return REG_NOERROR;
1281 }
1282 node = node->right;
1283 }
1284 }
1285}
1286
1287/* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1288 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1289 backreferences as well. Requires a preorder visit. */
1290static reg_errcode_t
1291optimize_subexps (void *extra, bin_tree_t *node)
1292{
1293 re_dfa_t *dfa = (re_dfa_t *) extra;
1294
1295 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1296 {
1297 int idx = node->token.opr.idx;
1298 node->token.opr.idx = dfa->subexp_map[idx];
1299 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1300 }
1301
1302 else if (node->token.type == SUBEXP
1303 && node->left && node->left->token.type == SUBEXP)
1304 {
1305 Idx other_idx = node->left->token.opr.idx;
1306
1307 node->left = node->left->left;
1308 if (node->left)
1309 node->left->parent = node;
1310
1311 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1312 if (other_idx < BITSET_WORD_BITS)
1313 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1314 }
1315
1316 return REG_NOERROR;
1317}
1318
1319/* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1320 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1321static reg_errcode_t
1322lower_subexps (void *extra, bin_tree_t *node)
1323{
1324 regex_t *preg = (regex_t *) extra;
1325 reg_errcode_t err = REG_NOERROR;
1326
1327 if (node->left && node->left->token.type == SUBEXP)
1328 {
1329 node->left = lower_subexp (&err, preg, node->left);
1330 if (node->left)
1331 node->left->parent = node;
1332 }
1333 if (node->right && node->right->token.type == SUBEXP)
1334 {
1335 node->right = lower_subexp (&err, preg, node->right);
1336 if (node->right)
1337 node->right->parent = node;
1338 }
1339
1340 return err;
1341}
1342
1343static bin_tree_t *
1344lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1345{
1346 re_dfa_t *dfa = preg->buffer;
1347 bin_tree_t *body = node->left;
1348 bin_tree_t *op, *cls, *tree1, *tree;
1349
1350 if (preg->no_sub
1351 /* We do not optimize empty subexpressions, because otherwise we may
1352 have bad CONCAT nodes with NULL children. This is obviously not
1353 very common, so we do not lose much. An example that triggers
1354 this case is the sed "script" /\(\)/x. */
1355 && node->left != NULL
1356 && (node->token.opr.idx >= BITSET_WORD_BITS
1357 || !(dfa->used_bkref_map
1358 & ((bitset_word_t) 1 << node->token.opr.idx))))
1359 return node->left;
1360
1361 /* Convert the SUBEXP node to the concatenation of an
1362 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1363 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1364 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1365 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1366 tree = create_tree (dfa, op, tree1, CONCAT);
1367 if (__glibc_unlikely (tree == NULL || tree1 == NULL
1368 || op == NULL || cls == NULL))
1369 {
1370 *err = REG_ESPACE;
1371 return NULL;
1372 }
1373
1374 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1375 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1376 return tree;
1377}
1378
1379/* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1380 nodes. Requires a postorder visit. */
1381static reg_errcode_t
1382calc_first (void *extra, bin_tree_t *node)
1383{
1384 re_dfa_t *dfa = (re_dfa_t *) extra;
1385 if (node->token.type == CONCAT)
1386 {
1387 node->first = node->left->first;
1388 node->node_idx = node->left->node_idx;
1389 }
1390 else
1391 {
1392 node->first = node;
1393 node->node_idx = re_dfa_add_node (dfa, node->token);
1394 if (__glibc_unlikely (node->node_idx == -1))
1395 return REG_ESPACE;
1396 if (node->token.type == ANCHOR)
1397 dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1398 }
1399 return REG_NOERROR;
1400}
1401
1402/* Pass 2: compute NEXT on the tree. Preorder visit. */
1403static reg_errcode_t
1404calc_next (void *extra, bin_tree_t *node)
1405{
1406 switch (node->token.type)
1407 {
1408 case OP_DUP_ASTERISK:
1409 node->left->next = node;
1410 break;
1411 case CONCAT:
1412 node->left->next = node->right->first;
1413 node->right->next = node->next;
1414 break;
1415 default:
1416 if (node->left)
1417 node->left->next = node->next;
1418 if (node->right)
1419 node->right->next = node->next;
1420 break;
1421 }
1422 return REG_NOERROR;
1423}
1424
1425/* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1426static reg_errcode_t
1427link_nfa_nodes (void *extra, bin_tree_t *node)
1428{
1429 re_dfa_t *dfa = (re_dfa_t *) extra;
1430 Idx idx = node->node_idx;
1431 reg_errcode_t err = REG_NOERROR;
1432
1433 switch (node->token.type)
1434 {
1435 case CONCAT:
1436 break;
1437
1438 case END_OF_RE:
1439 assert (node->next == NULL);
1440 break;
1441
1442 case OP_DUP_ASTERISK:
1443 case OP_ALT:
1444 {
1445 Idx left, right;
1446 dfa->has_plural_match = 1;
1447 if (node->left != NULL)
1448 left = node->left->first->node_idx;
1449 else
1450 left = node->next->node_idx;
1451 if (node->right != NULL)
1452 right = node->right->first->node_idx;
1453 else
1454 right = node->next->node_idx;
1455 assert (left > -1);
1456 assert (right > -1);
1457 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1458 }
1459 break;
1460
1461 case ANCHOR:
1462 case OP_OPEN_SUBEXP:
1463 case OP_CLOSE_SUBEXP:
1464 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1465 break;
1466
1467 case OP_BACK_REF:
1468 dfa->nexts[idx] = node->next->node_idx;
1469 if (node->token.type == OP_BACK_REF)
1470 err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1471 break;
1472
1473 default:
1474 assert (!IS_EPSILON_NODE (node->token.type));
1475 dfa->nexts[idx] = node->next->node_idx;
1476 break;
1477 }
1478
1479 return err;
1480}
1481
1482/* Duplicate the epsilon closure of the node ROOT_NODE.
1483 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1484 to their own constraint. */
1485
1486static reg_errcode_t
1487duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1488 Idx root_node, unsigned int init_constraint)
1489{
1490 Idx org_node, clone_node;
1491 bool ok;
1492 unsigned int constraint = init_constraint;
1493 for (org_node = top_org_node, clone_node = top_clone_node;;)
1494 {
1495 Idx org_dest, clone_dest;
1496 if (dfa->nodes[org_node].type == OP_BACK_REF)
1497 {
1498 /* If the back reference epsilon-transit, its destination must
1499 also have the constraint. Then duplicate the epsilon closure
1500 of the destination of the back reference, and store it in
1501 edests of the back reference. */
1502 org_dest = dfa->nexts[org_node];
1503 re_node_set_empty (dfa->edests + clone_node);
1504 clone_dest = duplicate_node (dfa, org_dest, constraint);
1505 if (__glibc_unlikely (clone_dest == -1))
1506 return REG_ESPACE;
1507 dfa->nexts[clone_node] = dfa->nexts[org_node];
1508 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1509 if (__glibc_unlikely (! ok))
1510 return REG_ESPACE;
1511 }
1512 else if (dfa->edests[org_node].nelem == 0)
1513 {
1514 /* In case of the node can't epsilon-transit, don't duplicate the
1515 destination and store the original destination as the
1516 destination of the node. */
1517 dfa->nexts[clone_node] = dfa->nexts[org_node];
1518 break;
1519 }
1520 else if (dfa->edests[org_node].nelem == 1)
1521 {
1522 /* In case of the node can epsilon-transit, and it has only one
1523 destination. */
1524 org_dest = dfa->edests[org_node].elems[0];
1525 re_node_set_empty (dfa->edests + clone_node);
1526 /* If the node is root_node itself, it means the epsilon closure
1527 has a loop. Then tie it to the destination of the root_node. */
1528 if (org_node == root_node && clone_node != org_node)
1529 {
1530 ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1531 if (__glibc_unlikely (! ok))
1532 return REG_ESPACE;
1533 break;
1534 }
1535 /* In case the node has another constraint, append it. */
1536 constraint |= dfa->nodes[org_node].constraint;
1537 clone_dest = duplicate_node (dfa, org_dest, constraint);
1538 if (__glibc_unlikely (clone_dest == -1))
1539 return REG_ESPACE;
1540 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1541 if (__glibc_unlikely (! ok))
1542 return REG_ESPACE;
1543 }
1544 else /* dfa->edests[org_node].nelem == 2 */
1545 {
1546 /* In case of the node can epsilon-transit, and it has two
1547 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1548 org_dest = dfa->edests[org_node].elems[0];
1549 re_node_set_empty (dfa->edests + clone_node);
1550 /* Search for a duplicated node which satisfies the constraint. */
1551 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1552 if (clone_dest == -1)
1553 {
1554 /* There is no such duplicated node, create a new one. */
1555 reg_errcode_t err;
1556 clone_dest = duplicate_node (dfa, org_dest, constraint);
1557 if (__glibc_unlikely (clone_dest == -1))
1558 return REG_ESPACE;
1559 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1560 if (__glibc_unlikely (! ok))
1561 return REG_ESPACE;
1562 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1563 root_node, constraint);
1564 if (__glibc_unlikely (err != REG_NOERROR))
1565 return err;
1566 }
1567 else
1568 {
1569 /* There is a duplicated node which satisfies the constraint,
1570 use it to avoid infinite loop. */
1571 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1572 if (__glibc_unlikely (! ok))
1573 return REG_ESPACE;
1574 }
1575
1576 org_dest = dfa->edests[org_node].elems[1];
1577 clone_dest = duplicate_node (dfa, org_dest, constraint);
1578 if (__glibc_unlikely (clone_dest == -1))
1579 return REG_ESPACE;
1580 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1581 if (__glibc_unlikely (! ok))
1582 return REG_ESPACE;
1583 }
1584 org_node = org_dest;
1585 clone_node = clone_dest;
1586 }
1587 return REG_NOERROR;
1588}
1589
1590/* Search for a node which is duplicated from the node ORG_NODE, and
1591 satisfies the constraint CONSTRAINT. */
1592
1593static Idx
1594search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1595 unsigned int constraint)
1596{
1597 Idx idx;
1598 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1599 {
1600 if (org_node == dfa->org_indices[idx]
1601 && constraint == dfa->nodes[idx].constraint)
1602 return idx; /* Found. */
1603 }
1604 return -1; /* Not found. */
1605}
1606
1607/* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1608 Return the index of the new node, or -1 if insufficient storage is
1609 available. */
1610
1611static Idx
1612duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1613{
1614 Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1615 if (__glibc_likely (dup_idx != -1))
1616 {
1617 dfa->nodes[dup_idx].constraint = constraint;
1618 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1619 dfa->nodes[dup_idx].duplicated = 1;
1620
1621 /* Store the index of the original node. */
1622 dfa->org_indices[dup_idx] = org_idx;
1623 }
1624 return dup_idx;
1625}
1626
1627static reg_errcode_t
1628calc_inveclosure (re_dfa_t *dfa)
1629{
1630 Idx src, idx;
1631 bool ok;
1632 for (idx = 0; idx < dfa->nodes_len; ++idx)
1633 re_node_set_init_empty (dfa->inveclosures + idx);
1634
1635 for (src = 0; src < dfa->nodes_len; ++src)
1636 {
1637 Idx *elems = dfa->eclosures[src].elems;
1638 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1639 {
1640 ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1641 if (__glibc_unlikely (! ok))
1642 return REG_ESPACE;
1643 }
1644 }
1645
1646 return REG_NOERROR;
1647}
1648
1649/* Calculate "eclosure" for all the node in DFA. */
1650
1651static reg_errcode_t
1652calc_eclosure (re_dfa_t *dfa)
1653{
1654 Idx node_idx;
1655 bool incomplete;
1656#ifdef DEBUG
1657 assert (dfa->nodes_len > 0);
1658#endif
1659 incomplete = false;
1660 /* For each nodes, calculate epsilon closure. */
1661 for (node_idx = 0; ; ++node_idx)
1662 {
1663 reg_errcode_t err;
1664 re_node_set eclosure_elem;
1665 if (node_idx == dfa->nodes_len)
1666 {
1667 if (!incomplete)
1668 break;
1669 incomplete = false;
1670 node_idx = 0;
1671 }
1672
1673#ifdef DEBUG
1674 assert (dfa->eclosures[node_idx].nelem != -1);
1675#endif
1676
1677 /* If we have already calculated, skip it. */
1678 if (dfa->eclosures[node_idx].nelem != 0)
1679 continue;
1680 /* Calculate epsilon closure of 'node_idx'. */
1681 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1682 if (__glibc_unlikely (err != REG_NOERROR))
1683 return err;
1684
1685 if (dfa->eclosures[node_idx].nelem == 0)
1686 {
1687 incomplete = true;
1688 re_node_set_free (&eclosure_elem);
1689 }
1690 }
1691 return REG_NOERROR;
1692}
1693
1694/* Calculate epsilon closure of NODE. */
1695
1696static reg_errcode_t
1697calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1698{
1699 reg_errcode_t err;
1700 Idx i;
1701 re_node_set eclosure;
1702 bool ok;
1703 bool incomplete = false;
1704 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1705 if (__glibc_unlikely (err != REG_NOERROR))
1706 return err;
1707
1708 /* This indicates that we are calculating this node now.
1709 We reference this value to avoid infinite loop. */
1710 dfa->eclosures[node].nelem = -1;
1711
1712 /* If the current node has constraints, duplicate all nodes
1713 since they must inherit the constraints. */
1714 if (dfa->nodes[node].constraint
1715 && dfa->edests[node].nelem
1716 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1717 {
1718 err = duplicate_node_closure (dfa, node, node, node,
1719 dfa->nodes[node].constraint);
1720 if (__glibc_unlikely (err != REG_NOERROR))
1721 return err;
1722 }
1723
1724 /* Expand each epsilon destination nodes. */
1725 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1726 for (i = 0; i < dfa->edests[node].nelem; ++i)
1727 {
1728 re_node_set eclosure_elem;
1729 Idx edest = dfa->edests[node].elems[i];
1730 /* If calculating the epsilon closure of 'edest' is in progress,
1731 return intermediate result. */
1732 if (dfa->eclosures[edest].nelem == -1)
1733 {
1734 incomplete = true;
1735 continue;
1736 }
1737 /* If we haven't calculated the epsilon closure of 'edest' yet,
1738 calculate now. Otherwise use calculated epsilon closure. */
1739 if (dfa->eclosures[edest].nelem == 0)
1740 {
1741 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1742 if (__glibc_unlikely (err != REG_NOERROR))
1743 return err;
1744 }
1745 else
1746 eclosure_elem = dfa->eclosures[edest];
1747 /* Merge the epsilon closure of 'edest'. */
1748 err = re_node_set_merge (&eclosure, &eclosure_elem);
1749 if (__glibc_unlikely (err != REG_NOERROR))
1750 return err;
1751 /* If the epsilon closure of 'edest' is incomplete,
1752 the epsilon closure of this node is also incomplete. */
1753 if (dfa->eclosures[edest].nelem == 0)
1754 {
1755 incomplete = true;
1756 re_node_set_free (&eclosure_elem);
1757 }
1758 }
1759
1760 /* An epsilon closure includes itself. */
1761 ok = re_node_set_insert (&eclosure, node);
1762 if (__glibc_unlikely (! ok))
1763 return REG_ESPACE;
1764 if (incomplete && !root)
1765 dfa->eclosures[node].nelem = 0;
1766 else
1767 dfa->eclosures[node] = eclosure;
1768 *new_set = eclosure;
1769 return REG_NOERROR;
1770}
1771
1772/* Functions for token which are used in the parser. */
1773
1774/* Fetch a token from INPUT.
1775 We must not use this function inside bracket expressions. */
1776
1777static void
1778fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1779{
1780 re_string_skip_bytes (input, peek_token (result, input, syntax));
1781}
1782
1783/* Peek a token from INPUT, and return the length of the token.
1784 We must not use this function inside bracket expressions. */
1785
1786static int
1787peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1788{
1789 unsigned char c;
1790
1791 if (re_string_eoi (input))
1792 {
1793 token->type = END_OF_RE;
1794 return 0;
1795 }
1796
1797 c = re_string_peek_byte (input, 0);
1798 token->opr.c = c;
1799
1800 token->word_char = 0;
1801#ifdef RE_ENABLE_I18N
1802 token->mb_partial = 0;
1803 if (input->mb_cur_max > 1
1804 && !re_string_first_byte (input, re_string_cur_idx (input)))
1805 {
1806 token->type = CHARACTER;
1807 token->mb_partial = 1;
1808 return 1;
1809 }
1810#endif
1811 if (c == '\\')
1812 {
1813 unsigned char c2;
1814 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1815 {
1816 token->type = BACK_SLASH;
1817 return 1;
1818 }
1819
1820 c2 = re_string_peek_byte_case (input, 1);
1821 token->opr.c = c2;
1822 token->type = CHARACTER;
1823#ifdef RE_ENABLE_I18N
1824 if (input->mb_cur_max > 1)
1825 {
1826 wint_t wc = re_string_wchar_at (input,
1827 re_string_cur_idx (input) + 1);
1828 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1829 }
1830 else
1831#endif
1832 token->word_char = IS_WORD_CHAR (c2) != 0;
1833
1834 switch (c2)
1835 {
1836 case '|':
1837 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1838 token->type = OP_ALT;
1839 break;
1840 case '1': case '2': case '3': case '4': case '5':
1841 case '6': case '7': case '8': case '9':
1842 if (!(syntax & RE_NO_BK_REFS))
1843 {
1844 token->type = OP_BACK_REF;
1845 token->opr.idx = c2 - '1';
1846 }
1847 break;
1848 case '<':
1849 if (!(syntax & RE_NO_GNU_OPS))
1850 {
1851 token->type = ANCHOR;
1852 token->opr.ctx_type = WORD_FIRST;
1853 }
1854 break;
1855 case '>':
1856 if (!(syntax & RE_NO_GNU_OPS))
1857 {
1858 token->type = ANCHOR;
1859 token->opr.ctx_type = WORD_LAST;
1860 }
1861 break;
1862 case 'b':
1863 if (!(syntax & RE_NO_GNU_OPS))
1864 {
1865 token->type = ANCHOR;
1866 token->opr.ctx_type = WORD_DELIM;
1867 }
1868 break;
1869 case 'B':
1870 if (!(syntax & RE_NO_GNU_OPS))
1871 {
1872 token->type = ANCHOR;
1873 token->opr.ctx_type = NOT_WORD_DELIM;
1874 }
1875 break;
1876 case 'w':
1877 if (!(syntax & RE_NO_GNU_OPS))
1878 token->type = OP_WORD;
1879 break;
1880 case 'W':
1881 if (!(syntax & RE_NO_GNU_OPS))
1882 token->type = OP_NOTWORD;
1883 break;
1884 case 's':
1885 if (!(syntax & RE_NO_GNU_OPS))
1886 token->type = OP_SPACE;
1887 break;
1888 case 'S':
1889 if (!(syntax & RE_NO_GNU_OPS))
1890 token->type = OP_NOTSPACE;
1891 break;
1892 case '`':
1893 if (!(syntax & RE_NO_GNU_OPS))
1894 {
1895 token->type = ANCHOR;
1896 token->opr.ctx_type = BUF_FIRST;
1897 }
1898 break;
1899 case '\'':
1900 if (!(syntax & RE_NO_GNU_OPS))
1901 {
1902 token->type = ANCHOR;
1903 token->opr.ctx_type = BUF_LAST;
1904 }
1905 break;
1906 case '(':
1907 if (!(syntax & RE_NO_BK_PARENS))
1908 token->type = OP_OPEN_SUBEXP;
1909 break;
1910 case ')':
1911 if (!(syntax & RE_NO_BK_PARENS))
1912 token->type = OP_CLOSE_SUBEXP;
1913 break;
1914 case '+':
1915 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1916 token->type = OP_DUP_PLUS;
1917 break;
1918 case '?':
1919 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1920 token->type = OP_DUP_QUESTION;
1921 break;
1922 case '{':
1923 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1924 token->type = OP_OPEN_DUP_NUM;
1925 break;
1926 case '}':
1927 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1928 token->type = OP_CLOSE_DUP_NUM;
1929 break;
1930 default:
1931 break;
1932 }
1933 return 2;
1934 }
1935
1936 token->type = CHARACTER;
1937#ifdef RE_ENABLE_I18N
1938 if (input->mb_cur_max > 1)
1939 {
1940 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1941 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1942 }
1943 else
1944#endif
1945 token->word_char = IS_WORD_CHAR (token->opr.c);
1946
1947 switch (c)
1948 {
1949 case '\n':
1950 if (syntax & RE_NEWLINE_ALT)
1951 token->type = OP_ALT;
1952 break;
1953 case '|':
1954 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1955 token->type = OP_ALT;
1956 break;
1957 case '*':
1958 token->type = OP_DUP_ASTERISK;
1959 break;
1960 case '+':
1961 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1962 token->type = OP_DUP_PLUS;
1963 break;
1964 case '?':
1965 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1966 token->type = OP_DUP_QUESTION;
1967 break;
1968 case '{':
1969 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1970 token->type = OP_OPEN_DUP_NUM;
1971 break;
1972 case '}':
1973 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1974 token->type = OP_CLOSE_DUP_NUM;
1975 break;
1976 case '(':
1977 if (syntax & RE_NO_BK_PARENS)
1978 token->type = OP_OPEN_SUBEXP;
1979 break;
1980 case ')':
1981 if (syntax & RE_NO_BK_PARENS)
1982 token->type = OP_CLOSE_SUBEXP;
1983 break;
1984 case '[':
1985 token->type = OP_OPEN_BRACKET;
1986 break;
1987 case '.':
1988 token->type = OP_PERIOD;
1989 break;
1990 case '^':
1991 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE))
1992 && re_string_cur_idx (input) != 0)
1993 {
1994 char prev = re_string_peek_byte (input, -1);
1995 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1996 break;
1997 }
1998 token->type = ANCHOR;
1999 token->opr.ctx_type = LINE_FIRST;
2000 break;
2001 case '$':
2002 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS)
2003 && re_string_cur_idx (input) + 1 != re_string_length (input))
2004 {
2005 re_token_t next;
2006 re_string_skip_bytes (input, 1);
2007 peek_token (&next, input, syntax);
2008 re_string_skip_bytes (input, -1);
2009 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2010 break;
2011 }
2012 token->type = ANCHOR;
2013 token->opr.ctx_type = LINE_LAST;
2014 break;
2015 default:
2016 break;
2017 }
2018 return 1;
2019}
2020
2021/* Peek a token from INPUT, and return the length of the token.
2022 We must not use this function out of bracket expressions. */
2023
2024static int
2025peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2026{
2027 unsigned char c;
2028 if (re_string_eoi (input))
2029 {
2030 token->type = END_OF_RE;
2031 return 0;
2032 }
2033 c = re_string_peek_byte (input, 0);
2034 token->opr.c = c;
2035
2036#ifdef RE_ENABLE_I18N
2037 if (input->mb_cur_max > 1
2038 && !re_string_first_byte (input, re_string_cur_idx (input)))
2039 {
2040 token->type = CHARACTER;
2041 return 1;
2042 }
2043#endif /* RE_ENABLE_I18N */
2044
2045 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2046 && re_string_cur_idx (input) + 1 < re_string_length (input))
2047 {
2048 /* In this case, '\' escape a character. */
2049 unsigned char c2;
2050 re_string_skip_bytes (input, 1);
2051 c2 = re_string_peek_byte (input, 0);
2052 token->opr.c = c2;
2053 token->type = CHARACTER;
2054 return 1;
2055 }
2056 if (c == '[') /* '[' is a special char in a bracket exps. */
2057 {
2058 unsigned char c2;
2059 int token_len;
2060 if (re_string_cur_idx (input) + 1 < re_string_length (input))
2061 c2 = re_string_peek_byte (input, 1);
2062 else
2063 c2 = 0;
2064 token->opr.c = c2;
2065 token_len = 2;
2066 switch (c2)
2067 {
2068 case '.':
2069 token->type = OP_OPEN_COLL_ELEM;
2070 break;
2071
2072 case '=':
2073 token->type = OP_OPEN_EQUIV_CLASS;
2074 break;
2075
2076 case ':':
2077 if (syntax & RE_CHAR_CLASSES)
2078 {
2079 token->type = OP_OPEN_CHAR_CLASS;
2080 break;
2081 }
2082 FALLTHROUGH;
2083 default:
2084 token->type = CHARACTER;
2085 token->opr.c = c;
2086 token_len = 1;
2087 break;
2088 }
2089 return token_len;
2090 }
2091 switch (c)
2092 {
2093 case '-':
2094 token->type = OP_CHARSET_RANGE;
2095 break;
2096 case ']':
2097 token->type = OP_CLOSE_BRACKET;
2098 break;
2099 case '^':
2100 token->type = OP_NON_MATCH_LIST;
2101 break;
2102 default:
2103 token->type = CHARACTER;
2104 }
2105 return 1;
2106}
2107
2108/* Functions for parser. */
2109
2110/* Entry point of the parser.
2111 Parse the regular expression REGEXP and return the structure tree.
2112 If an error occurs, ERR is set by error code, and return NULL.
2113 This function build the following tree, from regular expression <reg_exp>:
2114 CAT
2115 / \
2116 / \
2117 <reg_exp> EOR
2118
2119 CAT means concatenation.
2120 EOR means end of regular expression. */
2121
2122static bin_tree_t *
2123parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2124 reg_errcode_t *err)
2125{
2126 re_dfa_t *dfa = preg->buffer;
2127 bin_tree_t *tree, *eor, *root;
2128 re_token_t current_token;
2129 dfa->syntax = syntax;
2130 fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2131 tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2132 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2133 return NULL;
2134 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2135 if (tree != NULL)
2136 root = create_tree (dfa, tree, eor, CONCAT);
2137 else
2138 root = eor;
2139 if (__glibc_unlikely (eor == NULL || root == NULL))
2140 {
2141 *err = REG_ESPACE;
2142 return NULL;
2143 }
2144 return root;
2145}
2146
2147/* This function build the following tree, from regular expression
2148 <branch1>|<branch2>:
2149 ALT
2150 / \
2151 / \
2152 <branch1> <branch2>
2153
2154 ALT means alternative, which represents the operator '|'. */
2155
2156static bin_tree_t *
2157parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2158 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2159{
2160 re_dfa_t *dfa = preg->buffer;
2161 bin_tree_t *tree, *branch = NULL;
2162 bitset_word_t initial_bkref_map = dfa->completed_bkref_map;
2163 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2164 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2165 return NULL;
2166
2167 while (token->type == OP_ALT)
2168 {
2169 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2170 if (token->type != OP_ALT && token->type != END_OF_RE
2171 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2172 {
2173 bitset_word_t accumulated_bkref_map = dfa->completed_bkref_map;
2174 dfa->completed_bkref_map = initial_bkref_map;
2175 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2176 if (__glibc_unlikely (*err != REG_NOERROR && branch == NULL))
2177 {
2178 if (tree != NULL)
2179 postorder (tree, free_tree, NULL);
2180 return NULL;
2181 }
2182 dfa->completed_bkref_map |= accumulated_bkref_map;
2183 }
2184 else
2185 branch = NULL;
2186 tree = create_tree (dfa, tree, branch, OP_ALT);
2187 if (__glibc_unlikely (tree == NULL))
2188 {
2189 *err = REG_ESPACE;
2190 return NULL;
2191 }
2192 }
2193 return tree;
2194}
2195
2196/* This function build the following tree, from regular expression
2197 <exp1><exp2>:
2198 CAT
2199 / \
2200 / \
2201 <exp1> <exp2>
2202
2203 CAT means concatenation. */
2204
2205static bin_tree_t *
2206parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2207 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2208{
2209 bin_tree_t *tree, *expr;
2210 re_dfa_t *dfa = preg->buffer;
2211 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2212 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2213 return NULL;
2214
2215 while (token->type != OP_ALT && token->type != END_OF_RE
2216 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2217 {
2218 expr = parse_expression (regexp, preg, token, syntax, nest, err);
2219 if (__glibc_unlikely (*err != REG_NOERROR && expr == NULL))
2220 {
2221 if (tree != NULL)
2222 postorder (tree, free_tree, NULL);
2223 return NULL;
2224 }
2225 if (tree != NULL && expr != NULL)
2226 {
2227 bin_tree_t *newtree = create_tree (dfa, tree, expr, CONCAT);
2228 if (newtree == NULL)
2229 {
2230 postorder (expr, free_tree, NULL);
2231 postorder (tree, free_tree, NULL);
2232 *err = REG_ESPACE;
2233 return NULL;
2234 }
2235 tree = newtree;
2236 }
2237 else if (tree == NULL)
2238 tree = expr;
2239 /* Otherwise expr == NULL, we don't need to create new tree. */
2240 }
2241 return tree;
2242}
2243
2244/* This function build the following tree, from regular expression a*:
2245 *
2246 |
2247 a
2248*/
2249
2250static bin_tree_t *
2251parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2252 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2253{
2254 re_dfa_t *dfa = preg->buffer;
2255 bin_tree_t *tree;
2256 switch (token->type)
2257 {
2258 case CHARACTER:
2259 tree = create_token_tree (dfa, NULL, NULL, token);
2260 if (__glibc_unlikely (tree == NULL))
2261 {
2262 *err = REG_ESPACE;
2263 return NULL;
2264 }
2265#ifdef RE_ENABLE_I18N
2266 if (dfa->mb_cur_max > 1)
2267 {
2268 while (!re_string_eoi (regexp)
2269 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2270 {
2271 bin_tree_t *mbc_remain;
2272 fetch_token (token, regexp, syntax);
2273 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2274 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2275 if (__glibc_unlikely (mbc_remain == NULL || tree == NULL))
2276 {
2277 *err = REG_ESPACE;
2278 return NULL;
2279 }
2280 }
2281 }
2282#endif
2283 break;
2284
2285 case OP_OPEN_SUBEXP:
2286 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2287 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2288 return NULL;
2289 break;
2290
2291 case OP_OPEN_BRACKET:
2292 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2293 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2294 return NULL;
2295 break;
2296
2297 case OP_BACK_REF:
2298 if (!__glibc_likely (dfa->completed_bkref_map & (1 << token->opr.idx)))
2299 {
2300 *err = REG_ESUBREG;
2301 return NULL;
2302 }
2303 dfa->used_bkref_map |= 1 << token->opr.idx;
2304 tree = create_token_tree (dfa, NULL, NULL, token);
2305 if (__glibc_unlikely (tree == NULL))
2306 {
2307 *err = REG_ESPACE;
2308 return NULL;
2309 }
2310 ++dfa->nbackref;
2311 dfa->has_mb_node = 1;
2312 break;
2313
2314 case OP_OPEN_DUP_NUM:
2315 if (syntax & RE_CONTEXT_INVALID_DUP)
2316 {
2317 *err = REG_BADRPT;
2318 return NULL;
2319 }
2320 FALLTHROUGH;
2321 case OP_DUP_ASTERISK:
2322 case OP_DUP_PLUS:
2323 case OP_DUP_QUESTION:
2324 if (syntax & RE_CONTEXT_INVALID_OPS)
2325 {
2326 *err = REG_BADRPT;
2327 return NULL;
2328 }
2329 else if (syntax & RE_CONTEXT_INDEP_OPS)
2330 {
2331 fetch_token (token, regexp, syntax);
2332 return parse_expression (regexp, preg, token, syntax, nest, err);
2333 }
2334 FALLTHROUGH;
2335 case OP_CLOSE_SUBEXP:
2336 if ((token->type == OP_CLOSE_SUBEXP)
2337 && !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2338 {
2339 *err = REG_ERPAREN;
2340 return NULL;
2341 }
2342 FALLTHROUGH;
2343 case OP_CLOSE_DUP_NUM:
2344 /* We treat it as a normal character. */
2345
2346 /* Then we can these characters as normal characters. */
2347 token->type = CHARACTER;
2348 /* mb_partial and word_char bits should be initialized already
2349 by peek_token. */
2350 tree = create_token_tree (dfa, NULL, NULL, token);
2351 if (__glibc_unlikely (tree == NULL))
2352 {
2353 *err = REG_ESPACE;
2354 return NULL;
2355 }
2356 break;
2357
2358 case ANCHOR:
2359 if ((token->opr.ctx_type
2360 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2361 && dfa->word_ops_used == 0)
2362 init_word_char (dfa);
2363 if (token->opr.ctx_type == WORD_DELIM
2364 || token->opr.ctx_type == NOT_WORD_DELIM)
2365 {
2366 bin_tree_t *tree_first, *tree_last;
2367 if (token->opr.ctx_type == WORD_DELIM)
2368 {
2369 token->opr.ctx_type = WORD_FIRST;
2370 tree_first = create_token_tree (dfa, NULL, NULL, token);
2371 token->opr.ctx_type = WORD_LAST;
2372 }
2373 else
2374 {
2375 token->opr.ctx_type = INSIDE_WORD;
2376 tree_first = create_token_tree (dfa, NULL, NULL, token);
2377 token->opr.ctx_type = INSIDE_NOTWORD;
2378 }
2379 tree_last = create_token_tree (dfa, NULL, NULL, token);
2380 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2381 if (__glibc_unlikely (tree_first == NULL || tree_last == NULL
2382 || tree == NULL))
2383 {
2384 *err = REG_ESPACE;
2385 return NULL;
2386 }
2387 }
2388 else
2389 {
2390 tree = create_token_tree (dfa, NULL, NULL, token);
2391 if (__glibc_unlikely (tree == NULL))
2392 {
2393 *err = REG_ESPACE;
2394 return NULL;
2395 }
2396 }
2397 /* We must return here, since ANCHORs can't be followed
2398 by repetition operators.
2399 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2400 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2401 fetch_token (token, regexp, syntax);
2402 return tree;
2403
2404 case OP_PERIOD:
2405 tree = create_token_tree (dfa, NULL, NULL, token);
2406 if (__glibc_unlikely (tree == NULL))
2407 {
2408 *err = REG_ESPACE;
2409 return NULL;
2410 }
2411 if (dfa->mb_cur_max > 1)
2412 dfa->has_mb_node = 1;
2413 break;
2414
2415 case OP_WORD:
2416 case OP_NOTWORD:
2417 tree = build_charclass_op (dfa, regexp->trans,
2418 "alnum",
2419 "_",
2420 token->type == OP_NOTWORD, err);
2421 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2422 return NULL;
2423 break;
2424
2425 case OP_SPACE:
2426 case OP_NOTSPACE:
2427 tree = build_charclass_op (dfa, regexp->trans,
2428 "space",
2429 "",
2430 token->type == OP_NOTSPACE, err);
2431 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2432 return NULL;
2433 break;
2434
2435 case OP_ALT:
2436 case END_OF_RE:
2437 return NULL;
2438
2439 case BACK_SLASH:
2440 *err = REG_EESCAPE;
2441 return NULL;
2442
2443 default:
2444 /* Must not happen? */
2445#ifdef DEBUG
2446 assert (0);
2447#endif
2448 return NULL;
2449 }
2450 fetch_token (token, regexp, syntax);
2451
2452 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2453 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2454 {
2455 bin_tree_t *dup_tree = parse_dup_op (tree, regexp, dfa, token,
2456 syntax, err);
2457 if (__glibc_unlikely (*err != REG_NOERROR && dup_tree == NULL))
2458 {
2459 if (tree != NULL)
2460 postorder (tree, free_tree, NULL);
2461 return NULL;
2462 }
2463 tree = dup_tree;
2464 /* In BRE consecutive duplications are not allowed. */
2465 if ((syntax & RE_CONTEXT_INVALID_DUP)
2466 && (token->type == OP_DUP_ASTERISK
2467 || token->type == OP_OPEN_DUP_NUM))
2468 {
2469 if (tree != NULL)
2470 postorder (tree, free_tree, NULL);
2471 *err = REG_BADRPT;
2472 return NULL;
2473 }
2474 }
2475
2476 return tree;
2477}
2478
2479/* This function build the following tree, from regular expression
2480 (<reg_exp>):
2481 SUBEXP
2482 |
2483 <reg_exp>
2484*/
2485
2486static bin_tree_t *
2487parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2488 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2489{
2490 re_dfa_t *dfa = preg->buffer;
2491 bin_tree_t *tree;
2492 size_t cur_nsub;
2493 cur_nsub = preg->re_nsub++;
2494
2495 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2496
2497 /* The subexpression may be a null string. */
2498 if (token->type == OP_CLOSE_SUBEXP)
2499 tree = NULL;
2500 else
2501 {
2502 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2503 if (__glibc_unlikely (*err == REG_NOERROR
2504 && token->type != OP_CLOSE_SUBEXP))
2505 {
2506 if (tree != NULL)
2507 postorder (tree, free_tree, NULL);
2508 *err = REG_EPAREN;
2509 }
2510 if (__glibc_unlikely (*err != REG_NOERROR))
2511 return NULL;
2512 }
2513
2514 if (cur_nsub <= '9' - '1')
2515 dfa->completed_bkref_map |= 1 << cur_nsub;
2516
2517 tree = create_tree (dfa, tree, NULL, SUBEXP);
2518 if (__glibc_unlikely (tree == NULL))
2519 {
2520 *err = REG_ESPACE;
2521 return NULL;
2522 }
2523 tree->token.opr.idx = cur_nsub;
2524 return tree;
2525}
2526
2527/* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2528
2529static bin_tree_t *
2530parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2531 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2532{
2533 bin_tree_t *tree = NULL, *old_tree = NULL;
2534 Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2535 re_token_t start_token = *token;
2536
2537 if (token->type == OP_OPEN_DUP_NUM)
2538 {
2539 end = 0;
2540 start = fetch_number (regexp, token, syntax);
2541 if (start == -1)
2542 {
2543 if (token->type == CHARACTER && token->opr.c == ',')
2544 start = 0; /* We treat "{,m}" as "{0,m}". */
2545 else
2546 {
2547 *err = REG_BADBR; /* <re>{} is invalid. */
2548 return NULL;
2549 }
2550 }
2551 if (__glibc_likely (start != -2))
2552 {
2553 /* We treat "{n}" as "{n,n}". */
2554 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2555 : ((token->type == CHARACTER && token->opr.c == ',')
2556 ? fetch_number (regexp, token, syntax) : -2));
2557 }
2558 if (__glibc_unlikely (start == -2 || end == -2))
2559 {
2560 /* Invalid sequence. */
2561 if (__glibc_unlikely (!(syntax & RE_INVALID_INTERVAL_ORD)))
2562 {
2563 if (token->type == END_OF_RE)
2564 *err = REG_EBRACE;
2565 else
2566 *err = REG_BADBR;
2567
2568 return NULL;
2569 }
2570
2571 /* If the syntax bit is set, rollback. */
2572 re_string_set_index (regexp, start_idx);
2573 *token = start_token;
2574 token->type = CHARACTER;
2575 /* mb_partial and word_char bits should be already initialized by
2576 peek_token. */
2577 return elem;
2578 }
2579
2580 if (__glibc_unlikely ((end != -1 && start > end)
2581 || token->type != OP_CLOSE_DUP_NUM))
2582 {
2583 /* First number greater than second. */
2584 *err = REG_BADBR;
2585 return NULL;
2586 }
2587
2588 if (__glibc_unlikely (RE_DUP_MAX < (end == -1 ? start : end)))
2589 {
2590 *err = REG_ESIZE;
2591 return NULL;
2592 }
2593 }
2594 else
2595 {
2596 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2597 end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2598 }
2599
2600 fetch_token (token, regexp, syntax);
2601
2602 if (__glibc_unlikely (elem == NULL))
2603 return NULL;
2604 if (__glibc_unlikely (start == 0 && end == 0))
2605 {
2606 postorder (elem, free_tree, NULL);
2607 return NULL;
2608 }
2609
2610 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2611 if (__glibc_unlikely (start > 0))
2612 {
2613 tree = elem;
2614 for (i = 2; i <= start; ++i)
2615 {
2616 elem = duplicate_tree (elem, dfa);
2617 tree = create_tree (dfa, tree, elem, CONCAT);
2618 if (__glibc_unlikely (elem == NULL || tree == NULL))
2619 goto parse_dup_op_espace;
2620 }
2621
2622 if (start == end)
2623 return tree;
2624
2625 /* Duplicate ELEM before it is marked optional. */
2626 elem = duplicate_tree (elem, dfa);
2627 if (__glibc_unlikely (elem == NULL))
2628 goto parse_dup_op_espace;
2629 old_tree = tree;
2630 }
2631 else
2632 old_tree = NULL;
2633
2634 if (elem->token.type == SUBEXP)
2635 {
2636 uintptr_t subidx = elem->token.opr.idx;
2637 postorder (elem, mark_opt_subexp, (void *) subidx);
2638 }
2639
2640 tree = create_tree (dfa, elem, NULL,
2641 (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2642 if (__glibc_unlikely (tree == NULL))
2643 goto parse_dup_op_espace;
2644
2645 /* This loop is actually executed only when end != -1,
2646 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2647 already created the start+1-th copy. */
2648 if (TYPE_SIGNED (Idx) || end != -1)
2649 for (i = start + 2; i <= end; ++i)
2650 {
2651 elem = duplicate_tree (elem, dfa);
2652 tree = create_tree (dfa, tree, elem, CONCAT);
2653 if (__glibc_unlikely (elem == NULL || tree == NULL))
2654 goto parse_dup_op_espace;
2655
2656 tree = create_tree (dfa, tree, NULL, OP_ALT);
2657 if (__glibc_unlikely (tree == NULL))
2658 goto parse_dup_op_espace;
2659 }
2660
2661 if (old_tree)
2662 tree = create_tree (dfa, old_tree, tree, CONCAT);
2663
2664 return tree;
2665
2666 parse_dup_op_espace:
2667 *err = REG_ESPACE;
2668 return NULL;
2669}
2670
2671/* Size of the names for collating symbol/equivalence_class/character_class.
2672 I'm not sure, but maybe enough. */
2673#define BRACKET_NAME_BUF_SIZE 32
2674
2675#ifndef _LIBC
2676
2677# ifdef RE_ENABLE_I18N
2678/* Convert the byte B to the corresponding wide character. In a
2679 unibyte locale, treat B as itself. In a multibyte locale, return
2680 WEOF if B is an encoding error. */
2681static wint_t
2682parse_byte (unsigned char b, re_charset_t *mbcset)
2683{
2684 return mbcset == NULL ? b : __btowc (b);
2685}
2686# endif
2687
2688 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2689 Build the range expression which starts from START_ELEM, and ends
2690 at END_ELEM. The result are written to MBCSET and SBCSET.
2691 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2692 mbcset->range_ends, is a pointer argument since we may
2693 update it. */
2694
2695static reg_errcode_t
2696# ifdef RE_ENABLE_I18N
2697build_range_exp (const reg_syntax_t syntax,
2698 bitset_t sbcset,
2699 re_charset_t *mbcset,
2700 Idx *range_alloc,
2701 const bracket_elem_t *start_elem,
2702 const bracket_elem_t *end_elem)
2703# else /* not RE_ENABLE_I18N */
2704build_range_exp (const reg_syntax_t syntax,
2705 bitset_t sbcset,
2706 const bracket_elem_t *start_elem,
2707 const bracket_elem_t *end_elem)
2708# endif /* not RE_ENABLE_I18N */
2709{
2710 unsigned int start_ch, end_ch;
2711 /* Equivalence Classes and Character Classes can't be a range start/end. */
2712 if (__glibc_unlikely (start_elem->type == EQUIV_CLASS
2713 || start_elem->type == CHAR_CLASS
2714 || end_elem->type == EQUIV_CLASS
2715 || end_elem->type == CHAR_CLASS))
2716 return REG_ERANGE;
2717
2718 /* We can handle no multi character collating elements without libc
2719 support. */
2720 if (__glibc_unlikely ((start_elem->type == COLL_SYM
2721 && strlen ((char *) start_elem->opr.name) > 1)
2722 || (end_elem->type == COLL_SYM
2723 && strlen ((char *) end_elem->opr.name) > 1)))
2724 return REG_ECOLLATE;
2725
2726# ifdef RE_ENABLE_I18N
2727 {
2728 wchar_t wc;
2729 wint_t start_wc;
2730 wint_t end_wc;
2731
2732 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2733 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2734 : 0));
2735 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2736 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2737 : 0));
2738 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2739 ? parse_byte (start_ch, mbcset) : start_elem->opr.wch);
2740 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2741 ? parse_byte (end_ch, mbcset) : end_elem->opr.wch);
2742 if (start_wc == WEOF || end_wc == WEOF)
2743 return REG_ECOLLATE;
2744 else if (__glibc_unlikely ((syntax & RE_NO_EMPTY_RANGES)
2745 && start_wc > end_wc))
2746 return REG_ERANGE;
2747
2748 /* Got valid collation sequence values, add them as a new entry.
2749 However, for !_LIBC we have no collation elements: if the
2750 character set is single byte, the single byte character set
2751 that we build below suffices. parse_bracket_exp passes
2752 no MBCSET if dfa->mb_cur_max == 1. */
2753 if (mbcset)
2754 {
2755 /* Check the space of the arrays. */
2756 if (__glibc_unlikely (*range_alloc == mbcset->nranges))
2757 {
2758 /* There is not enough space, need realloc. */
2759 wchar_t *new_array_start, *new_array_end;
2760 Idx new_nranges;
2761
2762 /* +1 in case of mbcset->nranges is 0. */
2763 new_nranges = 2 * mbcset->nranges + 1;
2764 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2765 are NULL if *range_alloc == 0. */
2766 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2767 new_nranges);
2768 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2769 new_nranges);
2770
2771 if (__glibc_unlikely (new_array_start == NULL
2772 || new_array_end == NULL))
2773 {
2774 re_free (new_array_start);
2775 re_free (new_array_end);
2776 return REG_ESPACE;
2777 }
2778
2779 mbcset->range_starts = new_array_start;
2780 mbcset->range_ends = new_array_end;
2781 *range_alloc = new_nranges;
2782 }
2783
2784 mbcset->range_starts[mbcset->nranges] = start_wc;
2785 mbcset->range_ends[mbcset->nranges++] = end_wc;
2786 }
2787
2788 /* Build the table for single byte characters. */
2789 for (wc = 0; wc < SBC_MAX; ++wc)
2790 {
2791 if (start_wc <= wc && wc <= end_wc)
2792 bitset_set (sbcset, wc);
2793 }
2794 }
2795# else /* not RE_ENABLE_I18N */
2796 {
2797 unsigned int ch;
2798 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2799 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2800 : 0));
2801 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2802 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2803 : 0));
2804 if (start_ch > end_ch)
2805 return REG_ERANGE;
2806 /* Build the table for single byte characters. */
2807 for (ch = 0; ch < SBC_MAX; ++ch)
2808 if (start_ch <= ch && ch <= end_ch)
2809 bitset_set (sbcset, ch);
2810 }
2811# endif /* not RE_ENABLE_I18N */
2812 return REG_NOERROR;
2813}
2814#endif /* not _LIBC */
2815
2816#ifndef _LIBC
2817/* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2818 Build the collating element which is represented by NAME.
2819 The result are written to MBCSET and SBCSET.
2820 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2821 pointer argument since we may update it. */
2822
2823static reg_errcode_t
2824# ifdef RE_ENABLE_I18N
2825build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2826 Idx *coll_sym_alloc, const unsigned char *name)
2827# else /* not RE_ENABLE_I18N */
2828build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2829# endif /* not RE_ENABLE_I18N */
2830{
2831 size_t name_len = strlen ((const char *) name);
2832 if (__glibc_unlikely (name_len != 1))
2833 return REG_ECOLLATE;
2834 else
2835 {
2836 bitset_set (sbcset, name[0]);
2837 return REG_NOERROR;
2838 }
2839}
2840#endif /* not _LIBC */
2841
2842/* This function parse bracket expression like "[abc]", "[a-c]",
2843 "[[.a-a.]]" etc. */
2844
2845static bin_tree_t *
2846parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2847 reg_syntax_t syntax, reg_errcode_t *err)
2848{
2849#ifdef _LIBC
2850 const unsigned char *collseqmb;
2851 const char *collseqwc;
2852 uint32_t nrules;
2853 int32_t table_size;
2854 const int32_t *symb_table;
2855 const unsigned char *extra;
2856
2857 /* Local function for parse_bracket_exp used in _LIBC environment.
2858 Seek the collating symbol entry corresponding to NAME.
2859 Return the index of the symbol in the SYMB_TABLE,
2860 or -1 if not found. */
2861
2862 auto inline int32_t
2863 __attribute__ ((always_inline))
2864 seek_collating_symbol_entry (const unsigned char *name, size_t name_len)
2865 {
2866 int32_t elem;
2867
2868 for (elem = 0; elem < table_size; elem++)
2869 if (symb_table[2 * elem] != 0)
2870 {
2871 int32_t idx = symb_table[2 * elem + 1];
2872 /* Skip the name of collating element name. */
2873 idx += 1 + extra[idx];
2874 if (/* Compare the length of the name. */
2875 name_len == extra[idx]
2876 /* Compare the name. */
2877 && memcmp (name, &extra[idx + 1], name_len) == 0)
2878 /* Yep, this is the entry. */
2879 return elem;
2880 }
2881 return -1;
2882 }
2883
2884 /* Local function for parse_bracket_exp used in _LIBC environment.
2885 Look up the collation sequence value of BR_ELEM.
2886 Return the value if succeeded, UINT_MAX otherwise. */
2887
2888 auto inline unsigned int
2889 __attribute__ ((always_inline))
2890 lookup_collation_sequence_value (bracket_elem_t *br_elem)
2891 {
2892 if (br_elem->type == SB_CHAR)
2893 {
2894 /*
2895 if (MB_CUR_MAX == 1)
2896 */
2897 if (nrules == 0)
2898 return collseqmb[br_elem->opr.ch];
2899 else
2900 {
2901 wint_t wc = __btowc (br_elem->opr.ch);
2902 return __collseq_table_lookup (collseqwc, wc);
2903 }
2904 }
2905 else if (br_elem->type == MB_CHAR)
2906 {
2907 if (nrules != 0)
2908 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2909 }
2910 else if (br_elem->type == COLL_SYM)
2911 {
2912 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2913 if (nrules != 0)
2914 {
2915 int32_t elem, idx;
2916 elem = seek_collating_symbol_entry (br_elem->opr.name,
2917 sym_name_len);
2918 if (elem != -1)
2919 {
2920 /* We found the entry. */
2921 idx = symb_table[2 * elem + 1];
2922 /* Skip the name of collating element name. */
2923 idx += 1 + extra[idx];
2924 /* Skip the byte sequence of the collating element. */
2925 idx += 1 + extra[idx];
2926 /* Adjust for the alignment. */
2927 idx = (idx + 3) & ~3;
2928 /* Skip the multibyte collation sequence value. */
2929 idx += sizeof (unsigned int);
2930 /* Skip the wide char sequence of the collating element. */
2931 idx += sizeof (unsigned int) *
2932 (1 + *(unsigned int *) (extra + idx));
2933 /* Return the collation sequence value. */
2934 return *(unsigned int *) (extra + idx);
2935 }
2936 else if (sym_name_len == 1)
2937 {
2938 /* No valid character. Match it as a single byte
2939 character. */
2940 return collseqmb[br_elem->opr.name[0]];
2941 }
2942 }
2943 else if (sym_name_len == 1)
2944 return collseqmb[br_elem->opr.name[0]];
2945 }
2946 return UINT_MAX;
2947 }
2948
2949 /* Local function for parse_bracket_exp used in _LIBC environment.
2950 Build the range expression which starts from START_ELEM, and ends
2951 at END_ELEM. The result are written to MBCSET and SBCSET.
2952 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2953 mbcset->range_ends, is a pointer argument since we may
2954 update it. */
2955
2956 auto inline reg_errcode_t
2957 __attribute__ ((always_inline))
2958 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2959 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2960 {
2961 unsigned int ch;
2962 uint32_t start_collseq;
2963 uint32_t end_collseq;
2964
2965 /* Equivalence Classes and Character Classes can't be a range
2966 start/end. */
2967 if (__glibc_unlikely (start_elem->type == EQUIV_CLASS
2968 || start_elem->type == CHAR_CLASS
2969 || end_elem->type == EQUIV_CLASS
2970 || end_elem->type == CHAR_CLASS))
2971 return REG_ERANGE;
2972
2973 /* FIXME: Implement rational ranges here, too. */
2974 start_collseq = lookup_collation_sequence_value (start_elem);
2975 end_collseq = lookup_collation_sequence_value (end_elem);
2976 /* Check start/end collation sequence values. */
2977 if (__glibc_unlikely (start_collseq == UINT_MAX
2978 || end_collseq == UINT_MAX))
2979 return REG_ECOLLATE;
2980 if (__glibc_unlikely ((syntax & RE_NO_EMPTY_RANGES)
2981 && start_collseq > end_collseq))
2982 return REG_ERANGE;
2983
2984 /* Got valid collation sequence values, add them as a new entry.
2985 However, if we have no collation elements, and the character set
2986 is single byte, the single byte character set that we
2987 build below suffices. */
2988 if (nrules > 0 || dfa->mb_cur_max > 1)
2989 {
2990 /* Check the space of the arrays. */
2991 if (__glibc_unlikely (*range_alloc == mbcset->nranges))
2992 {
2993 /* There is not enough space, need realloc. */
2994 uint32_t *new_array_start;
2995 uint32_t *new_array_end;
2996 Idx new_nranges;
2997
2998 /* +1 in case of mbcset->nranges is 0. */
2999 new_nranges = 2 * mbcset->nranges + 1;
3000 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
3001 new_nranges);
3002 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
3003 new_nranges);
3004
3005 if (__glibc_unlikely (new_array_start == NULL
3006 || new_array_end == NULL))
3007 return REG_ESPACE;
3008
3009 mbcset->range_starts = new_array_start;
3010 mbcset->range_ends = new_array_end;
3011 *range_alloc = new_nranges;
3012 }
3013
3014 mbcset->range_starts[mbcset->nranges] = start_collseq;
3015 mbcset->range_ends[mbcset->nranges++] = end_collseq;
3016 }
3017
3018 /* Build the table for single byte characters. */
3019 for (ch = 0; ch < SBC_MAX; ch++)
3020 {
3021 uint32_t ch_collseq;
3022 /*
3023 if (MB_CUR_MAX == 1)
3024 */
3025 if (nrules == 0)
3026 ch_collseq = collseqmb[ch];
3027 else
3028 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
3029 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
3030 bitset_set (sbcset, ch);
3031 }
3032 return REG_NOERROR;
3033 }
3034
3035 /* Local function for parse_bracket_exp used in _LIBC environment.
3036 Build the collating element which is represented by NAME.
3037 The result are written to MBCSET and SBCSET.
3038 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
3039 pointer argument since we may update it. */
3040
3041 auto inline reg_errcode_t
3042 __attribute__ ((always_inline))
3043 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
3044 Idx *coll_sym_alloc, const unsigned char *name)
3045 {
3046 int32_t elem, idx;
3047 size_t name_len = strlen ((const char *) name);
3048 if (nrules != 0)
3049 {
3050 elem = seek_collating_symbol_entry (name, name_len);
3051 if (elem != -1)
3052 {
3053 /* We found the entry. */
3054 idx = symb_table[2 * elem + 1];
3055 /* Skip the name of collating element name. */
3056 idx += 1 + extra[idx];
3057 }
3058 else if (name_len == 1)
3059 {
3060 /* No valid character, treat it as a normal
3061 character. */
3062 bitset_set (sbcset, name[0]);
3063 return REG_NOERROR;
3064 }
3065 else
3066 return REG_ECOLLATE;
3067
3068 /* Got valid collation sequence, add it as a new entry. */
3069 /* Check the space of the arrays. */
3070 if (__glibc_unlikely (*coll_sym_alloc == mbcset->ncoll_syms))
3071 {
3072 /* Not enough, realloc it. */
3073 /* +1 in case of mbcset->ncoll_syms is 0. */
3074 Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3075 /* Use realloc since mbcset->coll_syms is NULL
3076 if *alloc == 0. */
3077 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3078 new_coll_sym_alloc);
3079 if (__glibc_unlikely (new_coll_syms == NULL))
3080 return REG_ESPACE;
3081 mbcset->coll_syms = new_coll_syms;
3082 *coll_sym_alloc = new_coll_sym_alloc;
3083 }
3084 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3085 return REG_NOERROR;
3086 }
3087 else
3088 {
3089 if (__glibc_unlikely (name_len != 1))
3090 return REG_ECOLLATE;
3091 else
3092 {
3093 bitset_set (sbcset, name[0]);
3094 return REG_NOERROR;
3095 }
3096 }
3097 }
3098#endif
3099
3100 re_token_t br_token;
3101 re_bitset_ptr_t sbcset;
3102#ifdef RE_ENABLE_I18N
3103 re_charset_t *mbcset;
3104 Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3105 Idx equiv_class_alloc = 0, char_class_alloc = 0;
3106#endif /* not RE_ENABLE_I18N */
3107 bool non_match = false;
3108 bin_tree_t *work_tree;
3109 int token_len;
3110 bool first_round = true;
3111#ifdef _LIBC
3112 collseqmb = (const unsigned char *)
3113 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3114 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3115 if (nrules)
3116 {
3117 /*
3118 if (MB_CUR_MAX > 1)
3119 */
3120 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3121 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3122 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3123 _NL_COLLATE_SYMB_TABLEMB);
3124 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3125 _NL_COLLATE_SYMB_EXTRAMB);
3126 }
3127#endif
3128 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3129#ifdef RE_ENABLE_I18N
3130 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3131#endif /* RE_ENABLE_I18N */
3132#ifdef RE_ENABLE_I18N
3133 if (__glibc_unlikely (sbcset == NULL || mbcset == NULL))
3134#else
3135 if (__glibc_unlikely (sbcset == NULL))
3136#endif /* RE_ENABLE_I18N */
3137 {
3138 re_free (sbcset);
3139#ifdef RE_ENABLE_I18N
3140 re_free (mbcset);
3141#endif
3142 *err = REG_ESPACE;
3143 return NULL;
3144 }
3145
3146 token_len = peek_token_bracket (token, regexp, syntax);
3147 if (__glibc_unlikely (token->type == END_OF_RE))
3148 {
3149 *err = REG_BADPAT;
3150 goto parse_bracket_exp_free_return;
3151 }
3152 if (token->type == OP_NON_MATCH_LIST)
3153 {
3154#ifdef RE_ENABLE_I18N
3155 mbcset->non_match = 1;
3156#endif /* not RE_ENABLE_I18N */
3157 non_match = true;
3158 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3159 bitset_set (sbcset, '\n');
3160 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3161 token_len = peek_token_bracket (token, regexp, syntax);
3162 if (__glibc_unlikely (token->type == END_OF_RE))
3163 {
3164 *err = REG_BADPAT;
3165 goto parse_bracket_exp_free_return;
3166 }
3167 }
3168
3169 /* We treat the first ']' as a normal character. */
3170 if (token->type == OP_CLOSE_BRACKET)
3171 token->type = CHARACTER;
3172
3173 while (1)
3174 {
3175 bracket_elem_t start_elem, end_elem;
3176 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3177 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3178 reg_errcode_t ret;
3179 int token_len2 = 0;
3180 bool is_range_exp = false;
3181 re_token_t token2;
3182
3183 start_elem.opr.name = start_name_buf;
3184 start_elem.type = COLL_SYM;
3185 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3186 syntax, first_round);
3187 if (__glibc_unlikely (ret != REG_NOERROR))
3188 {
3189 *err = ret;
3190 goto parse_bracket_exp_free_return;
3191 }
3192 first_round = false;
3193
3194 /* Get information about the next token. We need it in any case. */
3195 token_len = peek_token_bracket (token, regexp, syntax);
3196
3197 /* Do not check for ranges if we know they are not allowed. */
3198 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3199 {
3200 if (__glibc_unlikely (token->type == END_OF_RE))
3201 {
3202 *err = REG_EBRACK;
3203 goto parse_bracket_exp_free_return;
3204 }
3205 if (token->type == OP_CHARSET_RANGE)
3206 {
3207 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3208 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3209 if (__glibc_unlikely (token2.type == END_OF_RE))
3210 {
3211 *err = REG_EBRACK;
3212 goto parse_bracket_exp_free_return;
3213 }
3214 if (token2.type == OP_CLOSE_BRACKET)
3215 {
3216 /* We treat the last '-' as a normal character. */
3217 re_string_skip_bytes (regexp, -token_len);
3218 token->type = CHARACTER;
3219 }
3220 else
3221 is_range_exp = true;
3222 }
3223 }
3224
3225 if (is_range_exp == true)
3226 {
3227 end_elem.opr.name = end_name_buf;
3228 end_elem.type = COLL_SYM;
3229 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3230 dfa, syntax, true);
3231 if (__glibc_unlikely (ret != REG_NOERROR))
3232 {
3233 *err = ret;
3234 goto parse_bracket_exp_free_return;
3235 }
3236
3237 token_len = peek_token_bracket (token, regexp, syntax);
3238
3239#ifdef _LIBC
3240 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3241 &start_elem, &end_elem);
3242#else
3243# ifdef RE_ENABLE_I18N
3244 *err = build_range_exp (syntax, sbcset,
3245 dfa->mb_cur_max > 1 ? mbcset : NULL,
3246 &range_alloc, &start_elem, &end_elem);
3247# else
3248 *err = build_range_exp (syntax, sbcset, &start_elem, &end_elem);
3249# endif
3250#endif /* RE_ENABLE_I18N */
3251 if (__glibc_unlikely (*err != REG_NOERROR))
3252 goto parse_bracket_exp_free_return;
3253 }
3254 else
3255 {
3256 switch (start_elem.type)
3257 {
3258 case SB_CHAR:
3259 bitset_set (sbcset, start_elem.opr.ch);
3260 break;
3261#ifdef RE_ENABLE_I18N
3262 case MB_CHAR:
3263 /* Check whether the array has enough space. */
3264 if (__glibc_unlikely (mbchar_alloc == mbcset->nmbchars))
3265 {
3266 wchar_t *new_mbchars;
3267 /* Not enough, realloc it. */
3268 /* +1 in case of mbcset->nmbchars is 0. */
3269 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3270 /* Use realloc since array is NULL if *alloc == 0. */
3271 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3272 mbchar_alloc);
3273 if (__glibc_unlikely (new_mbchars == NULL))
3274 goto parse_bracket_exp_espace;
3275 mbcset->mbchars = new_mbchars;
3276 }
3277 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3278 break;
3279#endif /* RE_ENABLE_I18N */
3280 case EQUIV_CLASS:
3281 *err = build_equiv_class (sbcset,
3282#ifdef RE_ENABLE_I18N
3283 mbcset, &equiv_class_alloc,
3284#endif /* RE_ENABLE_I18N */
3285 start_elem.opr.name);
3286 if (__glibc_unlikely (*err != REG_NOERROR))
3287 goto parse_bracket_exp_free_return;
3288 break;
3289 case COLL_SYM:
3290 *err = build_collating_symbol (sbcset,
3291#ifdef RE_ENABLE_I18N
3292 mbcset, &coll_sym_alloc,
3293#endif /* RE_ENABLE_I18N */
3294 start_elem.opr.name);
3295 if (__glibc_unlikely (*err != REG_NOERROR))
3296 goto parse_bracket_exp_free_return;
3297 break;
3298 case CHAR_CLASS:
3299 *err = build_charclass (regexp->trans, sbcset,
3300#ifdef RE_ENABLE_I18N
3301 mbcset, &char_class_alloc,
3302#endif /* RE_ENABLE_I18N */
3303 (const char *) start_elem.opr.name,
3304 syntax);
3305 if (__glibc_unlikely (*err != REG_NOERROR))
3306 goto parse_bracket_exp_free_return;
3307 break;
3308 default:
3309 assert (0);
3310 break;
3311 }
3312 }
3313 if (__glibc_unlikely (token->type == END_OF_RE))
3314 {
3315 *err = REG_EBRACK;
3316 goto parse_bracket_exp_free_return;
3317 }
3318 if (token->type == OP_CLOSE_BRACKET)
3319 break;
3320 }
3321
3322 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3323
3324 /* If it is non-matching list. */
3325 if (non_match)
3326 bitset_not (sbcset);
3327
3328#ifdef RE_ENABLE_I18N
3329 /* Ensure only single byte characters are set. */
3330 if (dfa->mb_cur_max > 1)
3331 bitset_mask (sbcset, dfa->sb_char);
3332
3333 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3334 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3335 || mbcset->non_match)))
3336 {
3337 bin_tree_t *mbc_tree;
3338 int sbc_idx;
3339 /* Build a tree for complex bracket. */
3340 dfa->has_mb_node = 1;
3341 br_token.type = COMPLEX_BRACKET;
3342 br_token.opr.mbcset = mbcset;
3343 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3344 if (__glibc_unlikely (mbc_tree == NULL))
3345 goto parse_bracket_exp_espace;
3346 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3347 if (sbcset[sbc_idx])
3348 break;
3349 /* If there are no bits set in sbcset, there is no point
3350 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3351 if (sbc_idx < BITSET_WORDS)
3352 {
3353 /* Build a tree for simple bracket. */
3354 br_token.type = SIMPLE_BRACKET;
3355 br_token.opr.sbcset = sbcset;
3356 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3357 if (__glibc_unlikely (work_tree == NULL))
3358 goto parse_bracket_exp_espace;
3359
3360 /* Then join them by ALT node. */
3361 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3362 if (__glibc_unlikely (work_tree == NULL))
3363 goto parse_bracket_exp_espace;
3364 }
3365 else
3366 {
3367 re_free (sbcset);
3368 work_tree = mbc_tree;
3369 }
3370 }
3371 else
3372#endif /* not RE_ENABLE_I18N */
3373 {
3374#ifdef RE_ENABLE_I18N
3375 free_charset (mbcset);
3376#endif
3377 /* Build a tree for simple bracket. */
3378 br_token.type = SIMPLE_BRACKET;
3379 br_token.opr.sbcset = sbcset;
3380 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3381 if (__glibc_unlikely (work_tree == NULL))
3382 goto parse_bracket_exp_espace;
3383 }
3384 return work_tree;
3385
3386 parse_bracket_exp_espace:
3387 *err = REG_ESPACE;
3388 parse_bracket_exp_free_return:
3389 re_free (sbcset);
3390#ifdef RE_ENABLE_I18N
3391 free_charset (mbcset);
3392#endif /* RE_ENABLE_I18N */
3393 return NULL;
3394}
3395
3396/* Parse an element in the bracket expression. */
3397
3398static reg_errcode_t
3399parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3400 re_token_t *token, int token_len, re_dfa_t *dfa,
3401 reg_syntax_t syntax, bool accept_hyphen)
3402{
3403#ifdef RE_ENABLE_I18N
3404 int cur_char_size;
3405 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3406 if (cur_char_size > 1)
3407 {
3408 elem->type = MB_CHAR;
3409 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3410 re_string_skip_bytes (regexp, cur_char_size);
3411 return REG_NOERROR;
3412 }
3413#endif /* RE_ENABLE_I18N */
3414 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3415 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3416 || token->type == OP_OPEN_EQUIV_CLASS)
3417 return parse_bracket_symbol (elem, regexp, token);
3418 if (__glibc_unlikely (token->type == OP_CHARSET_RANGE) && !accept_hyphen)
3419 {
3420 /* A '-' must only appear as anything but a range indicator before
3421 the closing bracket. Everything else is an error. */
3422 re_token_t token2;
3423 (void) peek_token_bracket (&token2, regexp, syntax);
3424 if (token2.type != OP_CLOSE_BRACKET)
3425 /* The actual error value is not standardized since this whole
3426 case is undefined. But ERANGE makes good sense. */
3427 return REG_ERANGE;
3428 }
3429 elem->type = SB_CHAR;
3430 elem->opr.ch = token->opr.c;
3431 return REG_NOERROR;
3432}
3433
3434/* Parse a bracket symbol in the bracket expression. Bracket symbols are
3435 such as [:<character_class>:], [.<collating_element>.], and
3436 [=<equivalent_class>=]. */
3437
3438static reg_errcode_t
3439parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3440 re_token_t *token)
3441{
3442 unsigned char ch, delim = token->opr.c;
3443 int i = 0;
3444 if (re_string_eoi(regexp))
3445 return REG_EBRACK;
3446 for (;; ++i)
3447 {
3448 if (i >= BRACKET_NAME_BUF_SIZE)
3449 return REG_EBRACK;
3450 if (token->type == OP_OPEN_CHAR_CLASS)
3451 ch = re_string_fetch_byte_case (regexp);
3452 else
3453 ch = re_string_fetch_byte (regexp);
3454 if (re_string_eoi(regexp))
3455 return REG_EBRACK;
3456 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3457 break;
3458 elem->opr.name[i] = ch;
3459 }
3460 re_string_skip_bytes (regexp, 1);
3461 elem->opr.name[i] = '\0';
3462 switch (token->type)
3463 {
3464 case OP_OPEN_COLL_ELEM:
3465 elem->type = COLL_SYM;
3466 break;
3467 case OP_OPEN_EQUIV_CLASS:
3468 elem->type = EQUIV_CLASS;
3469 break;
3470 case OP_OPEN_CHAR_CLASS:
3471 elem->type = CHAR_CLASS;
3472 break;
3473 default:
3474 break;
3475 }
3476 return REG_NOERROR;
3477}
3478
3479 /* Helper function for parse_bracket_exp.
3480 Build the equivalence class which is represented by NAME.
3481 The result are written to MBCSET and SBCSET.
3482 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3483 is a pointer argument since we may update it. */
3484
3485static reg_errcode_t
3486#ifdef RE_ENABLE_I18N
3487build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3488 Idx *equiv_class_alloc, const unsigned char *name)
3489#else /* not RE_ENABLE_I18N */
3490build_equiv_class (bitset_t sbcset, const unsigned char *name)
3491#endif /* not RE_ENABLE_I18N */
3492{
3493#ifdef _LIBC
3494 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3495 if (nrules != 0)
3496 {
3497 const int32_t *table, *indirect;
3498 const unsigned char *weights, *extra, *cp;
3499 unsigned char char_buf[2];
3500 int32_t idx1, idx2;
3501 unsigned int ch;
3502 size_t len;
3503 /* Calculate the index for equivalence class. */
3504 cp = name;
3505 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3506 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3507 _NL_COLLATE_WEIGHTMB);
3508 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3509 _NL_COLLATE_EXTRAMB);
3510 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3511 _NL_COLLATE_INDIRECTMB);
3512 idx1 = findidx (table, indirect, extra, &cp, -1);
3513 if (__glibc_unlikely (idx1 == 0 || *cp != '\0'))
3514 /* This isn't a valid character. */
3515 return REG_ECOLLATE;
3516
3517 /* Build single byte matching table for this equivalence class. */
3518 len = weights[idx1 & 0xffffff];
3519 for (ch = 0; ch < SBC_MAX; ++ch)
3520 {
3521 char_buf[0] = ch;
3522 cp = char_buf;
3523 idx2 = findidx (table, indirect, extra, &cp, 1);
3524/*
3525 idx2 = table[ch];
3526*/
3527 if (idx2 == 0)
3528 /* This isn't a valid character. */
3529 continue;
3530 /* Compare only if the length matches and the collation rule
3531 index is the same. */
3532 if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24)
3533 && memcmp (weights + (idx1 & 0xffffff) + 1,
3534 weights + (idx2 & 0xffffff) + 1, len) == 0)
3535 bitset_set (sbcset, ch);
3536 }
3537 /* Check whether the array has enough space. */
3538 if (__glibc_unlikely (*equiv_class_alloc == mbcset->nequiv_classes))
3539 {
3540 /* Not enough, realloc it. */
3541 /* +1 in case of mbcset->nequiv_classes is 0. */
3542 Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3543 /* Use realloc since the array is NULL if *alloc == 0. */
3544 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3545 int32_t,
3546 new_equiv_class_alloc);
3547 if (__glibc_unlikely (new_equiv_classes == NULL))
3548 return REG_ESPACE;
3549 mbcset->equiv_classes = new_equiv_classes;
3550 *equiv_class_alloc = new_equiv_class_alloc;
3551 }
3552 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3553 }
3554 else
3555#endif /* _LIBC */
3556 {
3557 if (__glibc_unlikely (strlen ((const char *) name) != 1))
3558 return REG_ECOLLATE;
3559 bitset_set (sbcset, *name);
3560 }
3561 return REG_NOERROR;
3562}
3563
3564 /* Helper function for parse_bracket_exp.
3565 Build the character class which is represented by NAME.
3566 The result are written to MBCSET and SBCSET.
3567 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3568 is a pointer argument since we may update it. */
3569
3570static reg_errcode_t
3571#ifdef RE_ENABLE_I18N
3572build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3573 re_charset_t *mbcset, Idx *char_class_alloc,
3574 const char *class_name, reg_syntax_t syntax)
3575#else /* not RE_ENABLE_I18N */
3576build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3577 const char *class_name, reg_syntax_t syntax)
3578#endif /* not RE_ENABLE_I18N */
3579{
3580 int i;
3581 const char *name = class_name;
3582
3583 /* In case of REG_ICASE "upper" and "lower" match the both of
3584 upper and lower cases. */
3585 if ((syntax & RE_ICASE)
3586 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3587 name = "alpha";
3588
3589#ifdef RE_ENABLE_I18N
3590 /* Check the space of the arrays. */
3591 if (__glibc_unlikely (*char_class_alloc == mbcset->nchar_classes))
3592 {
3593 /* Not enough, realloc it. */
3594 /* +1 in case of mbcset->nchar_classes is 0. */
3595 Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3596 /* Use realloc since array is NULL if *alloc == 0. */
3597 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3598 new_char_class_alloc);
3599 if (__glibc_unlikely (new_char_classes == NULL))
3600 return REG_ESPACE;
3601 mbcset->char_classes = new_char_classes;
3602 *char_class_alloc = new_char_class_alloc;
3603 }
3604 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3605#endif /* RE_ENABLE_I18N */
3606
3607#define BUILD_CHARCLASS_LOOP(ctype_func) \
3608 do { \
3609 if (__glibc_unlikely (trans != NULL)) \
3610 { \
3611 for (i = 0; i < SBC_MAX; ++i) \
3612 if (ctype_func (i)) \
3613 bitset_set (sbcset, trans[i]); \
3614 } \
3615 else \
3616 { \
3617 for (i = 0; i < SBC_MAX; ++i) \
3618 if (ctype_func (i)) \
3619 bitset_set (sbcset, i); \
3620 } \
3621 } while (0)
3622
3623 if (strcmp (name, "alnum") == 0)
3624 BUILD_CHARCLASS_LOOP (isalnum);
3625 else if (strcmp (name, "cntrl") == 0)
3626 BUILD_CHARCLASS_LOOP (iscntrl);
3627 else if (strcmp (name, "lower") == 0)
3628 BUILD_CHARCLASS_LOOP (islower);
3629 else if (strcmp (name, "space") == 0)
3630 BUILD_CHARCLASS_LOOP (isspace);
3631 else if (strcmp (name, "alpha") == 0)
3632 BUILD_CHARCLASS_LOOP (isalpha);
3633 else if (strcmp (name, "digit") == 0)
3634 BUILD_CHARCLASS_LOOP (isdigit);
3635 else if (strcmp (name, "print") == 0)
3636 BUILD_CHARCLASS_LOOP (isprint);
3637 else if (strcmp (name, "upper") == 0)
3638 BUILD_CHARCLASS_LOOP (isupper);
3639 else if (strcmp (name, "blank") == 0)
3640 BUILD_CHARCLASS_LOOP (isblank);
3641 else if (strcmp (name, "graph") == 0)
3642 BUILD_CHARCLASS_LOOP (isgraph);
3643 else if (strcmp (name, "punct") == 0)
3644 BUILD_CHARCLASS_LOOP (ispunct);
3645 else if (strcmp (name, "xdigit") == 0)
3646 BUILD_CHARCLASS_LOOP (isxdigit);
3647 else
3648 return REG_ECTYPE;
3649
3650 return REG_NOERROR;
3651}
3652
3653static bin_tree_t *
3654build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3655 const char *class_name,
3656 const char *extra, bool non_match,
3657 reg_errcode_t *err)
3658{
3659 re_bitset_ptr_t sbcset;
3660#ifdef RE_ENABLE_I18N
3661 re_charset_t *mbcset;
3662 Idx alloc = 0;
3663#endif /* not RE_ENABLE_I18N */
3664 reg_errcode_t ret;
3665 re_token_t br_token;
3666 bin_tree_t *tree;
3667
3668 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3669 if (__glibc_unlikely (sbcset == NULL))
3670 {
3671 *err = REG_ESPACE;
3672 return NULL;
3673 }
3674#ifdef RE_ENABLE_I18N
3675 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3676 if (__glibc_unlikely (mbcset == NULL))
3677 {
3678 re_free (sbcset);
3679 *err = REG_ESPACE;
3680 return NULL;
3681 }
3682 mbcset->non_match = non_match;
3683#endif /* RE_ENABLE_I18N */
3684
3685 /* We don't care the syntax in this case. */
3686 ret = build_charclass (trans, sbcset,
3687#ifdef RE_ENABLE_I18N
3688 mbcset, &alloc,
3689#endif /* RE_ENABLE_I18N */
3690 class_name, 0);
3691
3692 if (__glibc_unlikely (ret != REG_NOERROR))
3693 {
3694 re_free (sbcset);
3695#ifdef RE_ENABLE_I18N
3696 free_charset (mbcset);
3697#endif /* RE_ENABLE_I18N */
3698 *err = ret;
3699 return NULL;
3700 }
3701 /* \w match '_' also. */
3702 for (; *extra; extra++)
3703 bitset_set (sbcset, *extra);
3704
3705 /* If it is non-matching list. */
3706 if (non_match)
3707 bitset_not (sbcset);
3708
3709#ifdef RE_ENABLE_I18N
3710 /* Ensure only single byte characters are set. */
3711 if (dfa->mb_cur_max > 1)
3712 bitset_mask (sbcset, dfa->sb_char);
3713#endif
3714
3715 /* Build a tree for simple bracket. */
3716#if defined GCC_LINT || defined lint
3717 memset (&br_token, 0, sizeof br_token);
3718#endif
3719 br_token.type = SIMPLE_BRACKET;
3720 br_token.opr.sbcset = sbcset;
3721 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3722 if (__glibc_unlikely (tree == NULL))
3723 goto build_word_op_espace;
3724
3725#ifdef RE_ENABLE_I18N
3726 if (dfa->mb_cur_max > 1)
3727 {
3728 bin_tree_t *mbc_tree;
3729 /* Build a tree for complex bracket. */
3730 br_token.type = COMPLEX_BRACKET;
3731 br_token.opr.mbcset = mbcset;
3732 dfa->has_mb_node = 1;
3733 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3734 if (__glibc_unlikely (mbc_tree == NULL))
3735 goto build_word_op_espace;
3736 /* Then join them by ALT node. */
3737 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3738 if (__glibc_likely (mbc_tree != NULL))
3739 return tree;
3740 }
3741 else
3742 {
3743 free_charset (mbcset);
3744 return tree;
3745 }
3746#else /* not RE_ENABLE_I18N */
3747 return tree;
3748#endif /* not RE_ENABLE_I18N */
3749
3750 build_word_op_espace:
3751 re_free (sbcset);
3752#ifdef RE_ENABLE_I18N
3753 free_charset (mbcset);
3754#endif /* RE_ENABLE_I18N */
3755 *err = REG_ESPACE;
3756 return NULL;
3757}
3758
3759/* This is intended for the expressions like "a{1,3}".
3760 Fetch a number from 'input', and return the number.
3761 Return -1 if the number field is empty like "{,1}".
3762 Return RE_DUP_MAX + 1 if the number field is too large.
3763 Return -2 if an error occurred. */
3764
3765static Idx
3766fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3767{
3768 Idx num = -1;
3769 unsigned char c;
3770 while (1)
3771 {
3772 fetch_token (token, input, syntax);
3773 c = token->opr.c;
3774 if (__glibc_unlikely (token->type == END_OF_RE))
3775 return -2;
3776 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3777 break;
3778 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3779 ? -2
3780 : num == -1
3781 ? c - '0'
3782 : MIN (RE_DUP_MAX + 1, num * 10 + c - '0'));
3783 }
3784 return num;
3785}
3786
3787#ifdef RE_ENABLE_I18N
3788static void
3789free_charset (re_charset_t *cset)
3790{
3791 re_free (cset->mbchars);
3792# ifdef _LIBC
3793 re_free (cset->coll_syms);
3794 re_free (cset->equiv_classes);
3795# endif
3796 re_free (cset->range_starts);
3797 re_free (cset->range_ends);
3798 re_free (cset->char_classes);
3799 re_free (cset);
3800}
3801#endif /* RE_ENABLE_I18N */
3802
3803/* Functions for binary tree operation. */
3804
3805/* Create a tree node. */
3806
3807static bin_tree_t *
3808create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3809 re_token_type_t type)
3810{
3811 re_token_t t;
3812#if defined GCC_LINT || defined lint
3813 memset (&t, 0, sizeof t);
3814#endif
3815 t.type = type;
3816 return create_token_tree (dfa, left, right, &t);
3817}
3818
3819static bin_tree_t *
3820create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3821 const re_token_t *token)
3822{
3823 bin_tree_t *tree;
3824 if (__glibc_unlikely (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE))
3825 {
3826 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3827
3828 if (storage == NULL)
3829 return NULL;
3830 storage->next = dfa->str_tree_storage;
3831 dfa->str_tree_storage = storage;
3832 dfa->str_tree_storage_idx = 0;
3833 }
3834 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3835
3836 tree->parent = NULL;
3837 tree->left = left;
3838 tree->right = right;
3839 tree->token = *token;
3840 tree->token.duplicated = 0;
3841 tree->token.opt_subexp = 0;
3842 tree->first = NULL;
3843 tree->next = NULL;
3844 tree->node_idx = -1;
3845
3846 if (left != NULL)
3847 left->parent = tree;
3848 if (right != NULL)
3849 right->parent = tree;
3850 return tree;
3851}
3852
3853/* Mark the tree SRC as an optional subexpression.
3854 To be called from preorder or postorder. */
3855
3856static reg_errcode_t
3857mark_opt_subexp (void *extra, bin_tree_t *node)
3858{
3859 Idx idx = (uintptr_t) extra;
3860 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3861 node->token.opt_subexp = 1;
3862
3863 return REG_NOERROR;
3864}
3865
3866/* Free the allocated memory inside NODE. */
3867
3868static void
3869free_token (re_token_t *node)
3870{
3871#ifdef RE_ENABLE_I18N
3872 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3873 free_charset (node->opr.mbcset);
3874 else
3875#endif /* RE_ENABLE_I18N */
3876 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3877 re_free (node->opr.sbcset);
3878}
3879
3880/* Worker function for tree walking. Free the allocated memory inside NODE
3881 and its children. */
3882
3883static reg_errcode_t
3884free_tree (void *extra, bin_tree_t *node)
3885{
3886 free_token (&node->token);
3887 return REG_NOERROR;
3888}
3889
3890
3891/* Duplicate the node SRC, and return new node. This is a preorder
3892 visit similar to the one implemented by the generic visitor, but
3893 we need more infrastructure to maintain two parallel trees --- so,
3894 it's easier to duplicate. */
3895
3896static bin_tree_t *
3897duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3898{
3899 const bin_tree_t *node;
3900 bin_tree_t *dup_root;
3901 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3902
3903 for (node = root; ; )
3904 {
3905 /* Create a new tree and link it back to the current parent. */
3906 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3907 if (*p_new == NULL)
3908 return NULL;
3909 (*p_new)->parent = dup_node;
3910 (*p_new)->token.duplicated = 1;
3911 dup_node = *p_new;
3912
3913 /* Go to the left node, or up and to the right. */
3914 if (node->left)
3915 {
3916 node = node->left;
3917 p_new = &dup_node->left;
3918 }
3919 else
3920 {
3921 const bin_tree_t *prev = NULL;
3922 while (node->right == prev || node->right == NULL)
3923 {
3924 prev = node;
3925 node = node->parent;
3926 dup_node = dup_node->parent;
3927 if (!node)
3928 return dup_root;
3929 }
3930 node = node->right;
3931 p_new = &dup_node->right;
3932 }
3933 }
3934}
3935