regcomp.c source code [glibc_src_2.30/posix/regcomp.c]

1	/ Extended regular expression matching and search library.*
2	Copyright (C) 2002-2019 Free Software Foundation, Inc.
3	This file is part of the GNU C Library.
4	Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6	The GNU C Library is free software; you can redistribute it and/or
7	modify it under the terms of the GNU Lesser General Public
8	License as published by the Free Software Foundation; either
9	version 2.1 of the License, or (at your option) any later version.
10
11	The GNU C Library is distributed in the hope that it will be useful,
12	but WITHOUT ANY WARRANTY; without even the implied warranty of
13	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14	Lesser General Public License for more details.
15
16	You should have received a copy of the GNU Lesser General Public
17	License along with the GNU C Library; if not, see
18	<https://www.gnu.org/licenses/>. /*
19
20	#ifdef _LIBC
21	# include <locale/weight.h>
22	#endif
23
24	static reg_errcode_t re_compile_internal (regex_t preg, const* char * pattern,
25	size_t length, reg_syntax_t syntax);
26	static void re_compile_fastmap_iter (regex_t *bufp,
27	const re_dfastate_t *init_state,
28	char *fastmap);
29	static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
30	#ifdef RE_ENABLE_I18N
31	static void free_charset (re_charset_t *cset);
32	#endif /* RE_ENABLE_I18N */
33	static void free_workarea_compile (regex_t *preg);
34	static reg_errcode_t create_initial_state (re_dfa_t *dfa);
35	#ifdef RE_ENABLE_I18N
36	static void optimize_utf8 (re_dfa_t *dfa);
37	#endif
38	static reg_errcode_t analyze (regex_t *preg);
39	static reg_errcode_t preorder (bin_tree_t *root,
40	reg_errcode_t (fn (void , bin_tree_t )),
41	void *extra);
42	static reg_errcode_t postorder (bin_tree_t *root,
43	reg_errcode_t (fn (void , bin_tree_t )),
44	void *extra);
45	static reg_errcode_t optimize_subexps (void extra, bin_tree_t node);
46	static reg_errcode_t lower_subexps (void extra, bin_tree_t node);
47	static bin_tree_t lower_subexp (reg_errcode_t err, regex_t *preg,
48	bin_tree_t *node);
49	static reg_errcode_t calc_first (void extra, bin_tree_t node);
50	static reg_errcode_t calc_next (void extra, bin_tree_t node);
51	static reg_errcode_t link_nfa_nodes (void extra, bin_tree_t node);
52	static Idx duplicate_node (re_dfa_t dfa, Idx org_idx, unsigned* int constraint);
53	static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
54	unsigned int constraint);
55	static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
56	static reg_errcode_t calc_eclosure_iter (re_node_set new_set, re_dfa_t dfa,
57	Idx node, bool root);
58	static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
59	static Idx fetch_number (re_string_t input, re_token_t token,
60	reg_syntax_t syntax);
61	static int peek_token (re_token_t token, re_string_t input,
62	reg_syntax_t syntax);
63	static bin_tree_t parse (re_string_t regexp, regex_t *preg,
64	reg_syntax_t syntax, reg_errcode_t *err);
65	static bin_tree_t parse_reg_exp (re_string_t regexp, regex_t *preg,
66	re_token_t *token, reg_syntax_t syntax,
67	Idx nest, reg_errcode_t *err);
68	static bin_tree_t parse_branch (re_string_t regexp, regex_t *preg,
69	re_token_t *token, reg_syntax_t syntax,
70	Idx nest, reg_errcode_t *err);
71	static bin_tree_t parse_expression (re_string_t regexp, regex_t *preg,
72	re_token_t *token, reg_syntax_t syntax,
73	Idx nest, reg_errcode_t *err);
74	static bin_tree_t parse_sub_exp (re_string_t regexp, regex_t *preg,
75	re_token_t *token, reg_syntax_t syntax,
76	Idx nest, reg_errcode_t *err);
77	static bin_tree_t parse_dup_op (bin_tree_t dup_elem, re_string_t *regexp,
78	re_dfa_t dfa, re_token_t token,
79	reg_syntax_t syntax, reg_errcode_t *err);
80	static bin_tree_t parse_bracket_exp (re_string_t regexp, re_dfa_t *dfa,
81	re_token_t *token, reg_syntax_t syntax,
82	reg_errcode_t *err);
83	static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
84	re_string_t *regexp,
85	re_token_t token, int* token_len,
86	re_dfa_t *dfa,
87	reg_syntax_t syntax,
88	bool accept_hyphen);
89	static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
90	re_string_t *regexp,
91	re_token_t *token);
92	#ifdef RE_ENABLE_I18N
93	static reg_errcode_t build_equiv_class (bitset_t sbcset,
94	re_charset_t *mbcset,
95	Idx *equiv_class_alloc,
96	const unsigned char *name);
97	static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
98	bitset_t sbcset,
99	re_charset_t *mbcset,
100	Idx *char_class_alloc,
101	const char *class_name,
102	reg_syntax_t syntax);
103	#else /* not RE_ENABLE_I18N */
104	static reg_errcode_t build_equiv_class (bitset_t sbcset,
105	const unsigned char *name);
106	static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
107	bitset_t sbcset,
108	const char *class_name,
109	reg_syntax_t syntax);
110	#endif /* not RE_ENABLE_I18N */
111	static bin_tree_t build_charclass_op (re_dfa_t dfa,
112	RE_TRANSLATE_TYPE trans,
113	const char *class_name,
114	const char *extra,
115	bool non_match, reg_errcode_t *err);
116	static bin_tree_t create_tree (re_dfa_t dfa,
117	bin_tree_t left, bin_tree_t right,
118	re_token_type_t type);
119	static bin_tree_t create_token_tree (re_dfa_t dfa,
120	bin_tree_t left, bin_tree_t right,
121	const re_token_t *token);
122	static bin_tree_t duplicate_tree (const* bin_tree_t src, re_dfa_t dfa);
123	static void free_token (re_token_t *node);
124	static reg_errcode_t free_tree (void extra, bin_tree_t node);
125	static reg_errcode_t mark_opt_subexp (void extra, bin_tree_t node);
126
127	/ This table gives an error message for each of the error codes listed*
128	in regex.h. Obviously the order here has to be same as there.
129	POSIX doesn't require that we do anything for REG_NOERROR,
130	but why not be nice? /*
131
132	static const char __re_error_msgid[] =
133	{
134	#define REG_NOERROR_IDX 0
135	gettext_noop ("Success") / REG_NOERROR /
136	"\0"
137	#define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
138	gettext_noop ("No match") / REG_NOMATCH /
139	"\0"
140	#define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
141	gettext_noop ("Invalid regular expression") / REG_BADPAT /
142	"\0"
143	#define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
144	gettext_noop ("Invalid collation character") / REG_ECOLLATE /
145	"\0"
146	#define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
147	gettext_noop ("Invalid character class name") / REG_ECTYPE /
148	"\0"
149	#define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
150	gettext_noop ("Trailing backslash") / REG_EESCAPE /
151	"\0"
152	#define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
153	gettext_noop ("Invalid back reference") / REG_ESUBREG /
154	"\0"
155	#define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
156	gettext_noop ("Unmatched [, [^, [:, [., or [=") / REG_EBRACK /
157	"\0"
158	#define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [, [^, [:, [., or [=")
159	gettext_noop ("Unmatched ( or \\(") / REG_EPAREN /
160	"\0"
161	#define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
162	gettext_noop ("Unmatched \\{") / REG_EBRACE /
163	"\0"
164	#define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
165	gettext_noop ("Invalid content of \\{\\}") / REG_BADBR /
166	"\0"
167	#define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
168	gettext_noop ("Invalid range end") / REG_ERANGE /
169	"\0"
170	#define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
171	gettext_noop ("Memory exhausted") / REG_ESPACE /
172	"\0"
173	#define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
174	gettext_noop ("Invalid preceding regular expression") / REG_BADRPT /
175	"\0"
176	#define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
177	gettext_noop ("Premature end of regular expression") / REG_EEND /
178	"\0"
179	#define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
180	gettext_noop ("Regular expression too big") / REG_ESIZE /
181	"\0"
182	#define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
183	gettext_noop ("Unmatched ) or \\)") / REG_ERPAREN /
184	};
185
186	static const size_t __re_error_msgid_idx[] =
187	{
188	REG_NOERROR_IDX,
189	REG_NOMATCH_IDX,
190	REG_BADPAT_IDX,
191	REG_ECOLLATE_IDX,
192	REG_ECTYPE_IDX,
193	REG_EESCAPE_IDX,
194	REG_ESUBREG_IDX,
195	REG_EBRACK_IDX,
196	REG_EPAREN_IDX,
197	REG_EBRACE_IDX,
198	REG_BADBR_IDX,
199	REG_ERANGE_IDX,
200	REG_ESPACE_IDX,
201	REG_BADRPT_IDX,
202	REG_EEND_IDX,
203	REG_ESIZE_IDX,
204	REG_ERPAREN_IDX
205	};
206
207	/ Entry points for GNU code. /
208
209	/ re_compile_pattern is the GNU regular expression compiler: it*
210	compiles PATTERN (of length LENGTH) and puts the result in BUFP.
211	Returns 0 if the pattern was valid, otherwise an error string.
212
213	Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
214	are set in BUFP on entry. /*
215
216	const char *
217	re_compile_pattern (const char *pattern, size_t length,
218	struct re_pattern_buffer *bufp)
219	{
220	reg_errcode_t ret;
221
222	/ And GNU code determines whether or not to get register information*
223	by passing null for the REGS argument to re_match, etc., not by
224	setting no_sub, unless RE_NO_SUB is set. /*
225	bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
226
227	/ Match anchors at newline. /
228	bufp->newline_anchor = `1`;
229
230	ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
231
232	if (!ret)
233	return NULL;
234	return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
235	}
236	weak_alias (__re_compile_pattern, re_compile_pattern)
237
238	/ Set by 're_set_syntax' to the current regexp syntax to recognize. Can*
239	also be assigned to arbitrarily: each pattern buffer stores its own
240	syntax, so it can be changed between regex compilations. /*
241	/ This has no initializer because initialized variables in Emacs*
242	become read-only after dumping. /*
243	reg_syntax_t re_syntax_options;
244
245
246	/ Specify the precise syntax of regexps for compilation. This provides*
247	for compatibility for various utilities which historically have
248	different, incompatible syntaxes.
249
250	The argument SYNTAX is a bit mask comprised of the various bits
251	defined in regex.h. We return the old syntax. /*
252
253	reg_syntax_t
254	re_set_syntax (reg_syntax_t syntax)
255	{
256	reg_syntax_t ret = re_syntax_options;
257
258	re_syntax_options = syntax;
259	return ret;
260	}
261	weak_alias (__re_set_syntax, re_set_syntax)
262
263	int
264	re_compile_fastmap (struct re_pattern_buffer *bufp)
265	{
266	re_dfa_t *dfa = bufp->buffer;
267	char *fastmap = bufp->fastmap;
268
269	memset (fastmap, `'\0'`, sizeof (char) * SBC_MAX);
270	re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
271	if (dfa->init_state != dfa->init_state_word)
272	re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
273	if (dfa->init_state != dfa->init_state_nl)
274	re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
275	if (dfa->init_state != dfa->init_state_begbuf)
276	re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
277	bufp->fastmap_accurate = `1`;
278	return `0`;
279	}
280	weak_alias (__re_compile_fastmap, re_compile_fastmap)
281
282	static inline void
283	__attribute__ ((always_inline))
284	re_set_fastmap (char fastmap, bool icase, int* ch)
285	{
286	fastmap[ch] = `1`;
287	if (icase)
288	fastmap[tolower (ch)] = `1`;
289	}
290
291	/ Helper function for re_compile_fastmap.*
292	Compile fastmap for the initial_state INIT_STATE. /*
293
294	static void
295	re_compile_fastmap_iter (regex_t bufp, const* re_dfastate_t *init_state,
296	char *fastmap)
297	{
298	re_dfa_t *dfa = bufp->buffer;
299	Idx node_cnt;
300	bool icase = (dfa->mb_cur_max == `1` && (bufp->syntax & RE_ICASE));
301	for (node_cnt = `0`; node_cnt < init_state->nodes.nelem; ++node_cnt)
302	{
303	Idx node = init_state->nodes.elems[node_cnt];
304	re_token_type_t type = dfa->nodes[node].type;
305
306	if (type == CHARACTER)
307	{
308	re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
309	#ifdef RE_ENABLE_I18N
310	if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > `1`)
311	{
312	unsigned char buf[MB_LEN_MAX];
313	unsigned char *p;
314	wchar_t wc;
315	mbstate_t state;
316
317	p = buf;
318	*p++ = dfa->nodes[node].opr.c;
319	while (++node < dfa->nodes_len
320	&& dfa->nodes[node].type == CHARACTER
321	&& dfa->nodes[node].mb_partial)
322	*p++ = dfa->nodes[node].opr.c;
323	memset (&state, `'\0'`, sizeof (state));
324	if (__mbrtowc (&wc, (const char *) buf, p - buf,
325	&state) == p - buf
326	&& (__wcrtomb ((char *) buf, __towlower (wc), &state)
327	!= (size_t) -`1`))
328	re_set_fastmap (fastmap, false, buf[`0`]);
329	}
330	#endif
331	}
332	else if (type == SIMPLE_BRACKET)
333	{
334	int i, ch;
335	for (i = `0`, ch = `0`; i < BITSET_WORDS; ++i)
336	{
337	int j;
338	bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
339	for (j = `0`; j < BITSET_WORD_BITS; ++j, ++ch)
340	if (w & ((bitset_word_t) `1` << j))
341	re_set_fastmap (fastmap, icase, ch);
342	}
343	}
344	#ifdef RE_ENABLE_I18N
345	else if (type == COMPLEX_BRACKET)
346	{
347	re_charset_t *cset = dfa->nodes[node].opr.mbcset;
348	Idx i;
349
350	# ifdef _LIBC
351	/ See if we have to try all bytes which start multiple collation*
352	elements.
353	e.g. In da_DK, we want to catch 'a' since "aa" is a valid
354	collation element, and don't catch 'b' since 'b' is
355	the only collation element which starts from 'b' (and
356	it is caught by SIMPLE_BRACKET). /*
357	if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != `0`
358	&& (cset->ncoll_syms \|\| cset->nranges))
359	{
360	const int32_t table = (const* int32_t *)
361	_NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
362	for (i = `0`; i < SBC_MAX; ++i)
363	if (table[i] < `0`)
364	re_set_fastmap (fastmap, icase, i);
365	}
366	# endif /* _LIBC */
367
368	/ See if we have to start the match at all multibyte characters,*
369	i.e. where we would not find an invalid sequence. This only
370	applies to multibyte character sets; for single byte character
371	sets, the SIMPLE_BRACKET again suffices. /*
372	if (dfa->mb_cur_max > `1`
373	&& (cset->nchar_classes \|\| cset->non_match \|\| cset->nranges
374	# ifdef _LIBC
375	\|\| cset->nequiv_classes
376	# endif /* _LIBC */
377	))
378	{
379	unsigned char c = `0`;
380	do
381	{
382	mbstate_t mbs;
383	memset (&mbs, `0`, sizeof (mbs));
384	if (__mbrtowc (NULL, (char *) &c, `1`, &mbs) == (size_t) -`2`)
385	re_set_fastmap (fastmap, false, (int) c);
386	}
387	while (++c != `0`);
388	}
389
390	else
391	{
392	/ ... Else catch all bytes which can start the mbchars. /
393	for (i = `0`; i < cset->nmbchars; ++i)
394	{
395	char buf[`256`];
396	mbstate_t state;
397	memset (&state, `'\0'`, sizeof (state));
398	if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -`1`)
399	re_set_fastmap (fastmap, icase, (unsigned* char *) buf);
400	if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > `1`)
401	{
402	if (__wcrtomb (buf, __towlower (cset->mbchars[i]), &state)
403	!= (size_t) -`1`)
404	re_set_fastmap (fastmap, false, (unsigned* char *) buf);
405	}
406	}
407	}
408	}
409	#endif /* RE_ENABLE_I18N */
410	else if (type == OP_PERIOD
411	#ifdef RE_ENABLE_I18N
412	\|\| type == OP_UTF8_PERIOD
413	#endif /* RE_ENABLE_I18N */
414	\|\| type == END_OF_RE)
415	{
416	memset (fastmap, `'\1'`, sizeof (char) * SBC_MAX);
417	if (type == END_OF_RE)
418	bufp->can_be_null = `1`;
419	return;
420	}
421	}
422	}
423
424	/ Entry point for POSIX code. /
425	/ regcomp takes a regular expression as a string and compiles it.*
426
427	PREG is a regex_t . We do not expect any fields to be initialized,*
428	since POSIX says we shouldn't. Thus, we set
429
430	'buffer' to the compiled pattern;
431	'used' to the length of the compiled pattern;
432	'syntax' to RE_SYNTAX_POSIX_EXTENDED if the
433	REG_EXTENDED bit in CFLAGS is set; otherwise, to
434	RE_SYNTAX_POSIX_BASIC;
435	'newline_anchor' to REG_NEWLINE being set in CFLAGS;
436	'fastmap' to an allocated space for the fastmap;
437	'fastmap_accurate' to zero;
438	're_nsub' to the number of subexpressions in PATTERN.
439
440	PATTERN is the address of the pattern string.
441
442	CFLAGS is a series of bits which affect compilation.
443
444	If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
445	use POSIX basic syntax.
446
447	If REG_NEWLINE is set, then . and [^...] don't match newline.
448	Also, regexec will try a match beginning after every newline.
449
450	If REG_ICASE is set, then we considers upper- and lowercase
451	versions of letters to be equivalent when matching.
452
453	If REG_NOSUB is set, then when PREG is passed to regexec, that
454	routine will report only success or failure, and nothing about the
455	registers.
456
457	It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
458	the return codes and their meanings.) /*
459
460	int
461	regcomp (regex_t *__restrict preg, const char *__restrict pattern, int cflags)
462	{
463	reg_errcode_t ret;
464	reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
465	: RE_SYNTAX_POSIX_BASIC);
466
467	preg->buffer = NULL;
468	preg->allocated = `0`;
469	preg->used = `0`;
470
471	/ Try to allocate space for the fastmap. /
472	preg->fastmap = re_malloc (char, SBC_MAX);
473	if (__glibc_unlikely (preg->fastmap == NULL))
474	return REG_ESPACE;
475
476	syntax \|= (cflags & REG_ICASE) ? RE_ICASE : `0`;
477
478	/ If REG_NEWLINE is set, newlines are treated differently. /
479	if (cflags & REG_NEWLINE)
480	{ / REG_NEWLINE implies neither . nor [^...] match newline. /
481	syntax &= ~RE_DOT_NEWLINE;
482	syntax \|= RE_HAT_LISTS_NOT_NEWLINE;
483	/ It also changes the matching behavior. /
484	preg->newline_anchor = `1`;
485	}
486	else
487	preg->newline_anchor = `0`;
488	preg->no_sub = !!(cflags & REG_NOSUB);
489	preg->translate = NULL;
490
491	ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
492
493	/ POSIX doesn't distinguish between an unmatched open-group and an*
494	unmatched close-group: both are REG_EPAREN. /*
495	if (ret == REG_ERPAREN)
496	ret = REG_EPAREN;
497
498	/ We have already checked preg->fastmap != NULL. /
499	if (__glibc_likely (ret == REG_NOERROR))
500	/ Compute the fastmap now, since regexec cannot modify the pattern*
501	buffer. This function never fails in this implementation. /*
502	(void) re_compile_fastmap (preg);
503	else
504	{
505	/ Some error occurred while compiling the expression. /
506	re_free (preg->fastmap);
507	preg->fastmap = NULL;
508	}
509
510	return (int) ret;
511	}
512	libc_hidden_def (__regcomp)
513	weak_alias (__regcomp, regcomp)
514
515	/ Returns a message corresponding to an error code, ERRCODE, returned*
516	from either regcomp or regexec. We don't use PREG here. /*
517
518	size_t
519	regerror (int errcode, const regex_t *__restrict preg, char *__restrict errbuf,
520	size_t errbuf_size)
521	{
522	const char *msg;
523	size_t msg_size;
524	int nerrcodes = sizeof __re_error_msgid_idx / sizeof __re_error_msgid_idx[`0`];
525
526	if (__glibc_unlikely (errcode < `0` \|\| errcode >= nerrcodes))
527	/ Only error codes returned by the rest of the code should be passed*
528	to this routine. If we are given anything else, or if other regex
529	code generates an invalid error code, then the program has a bug.
530	Dump core so we can fix it. /*
531	abort ();
532
533	msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
534
535	msg_size = strlen (msg) + `1`; / Includes the null. /
536
537	if (__glibc_likely (errbuf_size != `0`))
538	{
539	size_t cpy_size = msg_size;
540	if (__glibc_unlikely (msg_size > errbuf_size))
541	{
542	cpy_size = errbuf_size - `1`;
543	errbuf[cpy_size] = `'\0'`;
544	}
545	memcpy (errbuf, msg, cpy_size);
546	}
547
548	return msg_size;
549	}
550	weak_alias (__regerror, regerror)
551
552
553	#ifdef RE_ENABLE_I18N
554	/ This static array is used for the map to single-byte characters when*
555	UTF-8 is used. Otherwise we would allocate memory just to initialize
556	it the same all the time. UTF-8 is the preferred encoding so this is
557	a worthwhile optimization. /*
558	static const bitset_t utf8_sb_map =
559	{
560	/ Set the first 128 bits. /
561	# if defined __GNUC__ && !defined __STRICT_ANSI__
562	[`0` ... `0x80` / BITSET_WORD_BITS - `1`] = BITSET_WORD_MAX
563	# else
564	# if 4 * BITSET_WORD_BITS < ASCII_CHARS
565	# error "bitset_word_t is narrower than 32 bits"
566	# elif 3 * BITSET_WORD_BITS < ASCII_CHARS
567	BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
568	# elif 2 * BITSET_WORD_BITS < ASCII_CHARS
569	BITSET_WORD_MAX, BITSET_WORD_MAX,
570	# elif 1 * BITSET_WORD_BITS < ASCII_CHARS
571	BITSET_WORD_MAX,
572	# endif
573	(BITSET_WORD_MAX
574	>> (SBC_MAX % BITSET_WORD_BITS == `0`
575	? `0`
576	: BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
577	# endif
578	};
579	#endif
580
581
582	static void
583	free_dfa_content (re_dfa_t *dfa)
584	{
585	Idx i, j;
586
587	if (dfa->nodes)
588	for (i = `0`; i < dfa->nodes_len; ++i)
589	free_token (dfa->nodes + i);
590	re_free (dfa->nexts);
591	for (i = `0`; i < dfa->nodes_len; ++i)
592	{
593	if (dfa->eclosures != NULL)
594	re_node_set_free (dfa->eclosures + i);
595	if (dfa->inveclosures != NULL)
596	re_node_set_free (dfa->inveclosures + i);
597	if (dfa->edests != NULL)
598	re_node_set_free (dfa->edests + i);
599	}
600	re_free (dfa->edests);
601	re_free (dfa->eclosures);
602	re_free (dfa->inveclosures);
603	re_free (dfa->nodes);
604
605	if (dfa->state_table)
606	for (i = `0`; i <= dfa->state_hash_mask; ++i)
607	{
608	struct re_state_table_entry *entry = dfa->state_table + i;
609	for (j = `0`; j < entry->num; ++j)
610	{
611	re_dfastate_t *state = entry->array[j];
612	free_state (state);
613	}
614	re_free (entry->array);
615	}
616	re_free (dfa->state_table);
617	#ifdef RE_ENABLE_I18N
618	if (dfa->sb_char != utf8_sb_map)
619	re_free (dfa->sb_char);
620	#endif
621	re_free (dfa->subexp_map);
622	#ifdef DEBUG
623	re_free (dfa->re_str);
624	#endif
625
626	re_free (dfa);
627	}
628
629
630	/ Free dynamically allocated space used by PREG. /
631
632	void
633	regfree (regex_t *preg)
634	{
635	re_dfa_t *dfa = preg->buffer;
636	if (__glibc_likely (dfa != NULL))
637	{
638	lock_fini (dfa->lock);
639	free_dfa_content (dfa);
640	}
641	preg->buffer = NULL;
642	preg->allocated = `0`;
643
644	re_free (preg->fastmap);
645	preg->fastmap = NULL;
646
647	re_free (preg->translate);
648	preg->translate = NULL;
649	}
650	libc_hidden_def (__regfree)
651	weak_alias (__regfree, regfree)
652
653	/ Entry points compatible with 4.2 BSD regex library. We don't define*
654	them unless specifically requested. /*
655
656	#if defined _REGEX_RE_COMP \|\| defined _LIBC
657
658	/ BSD has one and only one pattern buffer. /
659	static struct re_pattern_buffer re_comp_buf;
660
661	char *
662	# ifdef _LIBC
663	/ Make these definitions weak in libc, so POSIX programs can redefine*
664	these names if they don't use our functions, and still use
665	regcomp/regexec above without link errors. /*
666	weak_function
667	# endif
668	re_comp (const char *s)
669	{
670	reg_errcode_t ret;
671	char *fastmap;
672
673	if (!s)
674	{
675	if (!re_comp_buf.buffer)
676	return gettext ("No previous regular expression");
677	return `0`;
678	}
679
680	if (re_comp_buf.buffer)
681	{
682	fastmap = re_comp_buf.fastmap;
683	re_comp_buf.fastmap = NULL;
684	__regfree (&re_comp_buf);
685	memset (&re_comp_buf, `'\0'`, sizeof (re_comp_buf));
686	re_comp_buf.fastmap = fastmap;
687	}
688
689	if (re_comp_buf.fastmap == NULL)
690	{
691	re_comp_buf.fastmap = re_malloc (char, SBC_MAX);
692	if (re_comp_buf.fastmap == NULL)
693	return (char *) gettext (__re_error_msgid
694	+ __re_error_msgid_idx[(int) REG_ESPACE]);
695	}
696
697	/ Since 're_exec' always passes NULL for the 'regs' argument, we*
698	don't need to initialize the pattern buffer fields which affect it. /*
699
700	/ Match anchors at newlines. /
701	re_comp_buf.newline_anchor = `1`;
702
703	ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
704
705	if (!ret)
706	return NULL;
707
708	/ Yes, we're discarding 'const' here if !HAVE_LIBINTL. /
709	return (char ) gettext (__re_error_msgid + __re_error_msgid_idx[(int*) ret]);
710	}
711
712	#ifdef _LIBC
713	libc_freeres_fn (free_mem)
714	{
715	__regfree (&re_comp_buf);
716	}
717	#endif
718
719	#endif /* _REGEX_RE_COMP */
720
721	/ Internal entry point.*
722	Compile the regular expression PATTERN, whose length is LENGTH.
723	SYNTAX indicate regular expression's syntax. /*
724
725	static reg_errcode_t
726	re_compile_internal (regex_t preg, const* char * pattern, size_t length,
727	reg_syntax_t syntax)
728	{
729	reg_errcode_t err = REG_NOERROR;
730	re_dfa_t *dfa;
731	re_string_t regexp;
732
733	/ Initialize the pattern buffer. /
734	preg->fastmap_accurate = `0`;
735	preg->syntax = syntax;
736	preg->not_bol = preg->not_eol = `0`;
737	preg->used = `0`;
738	preg->re_nsub = `0`;
739	preg->can_be_null = `0`;
740	preg->regs_allocated = REGS_UNALLOCATED;
741
742	/ Initialize the dfa. /
743	dfa = preg->buffer;
744	if (__glibc_unlikely (preg->allocated < sizeof (re_dfa_t)))
745	{
746	/ If zero allocated, but buffer is non-null, try to realloc*
747	enough space. This loses if buffer's address is bogus, but
748	that is the user's responsibility. If ->buffer is NULL this
749	is a simple allocation. /*
750	dfa = re_realloc (preg->buffer, re_dfa_t, `1`);
751	if (dfa == NULL)
752	return REG_ESPACE;
753	preg->allocated = sizeof (re_dfa_t);
754	preg->buffer = dfa;
755	}
756	preg->used = sizeof (re_dfa_t);
757
758	err = init_dfa (dfa, length);
759	if (__glibc_unlikely (err == REG_NOERROR && lock_init (dfa->lock) != `0`))
760	err = REG_ESPACE;
761	if (__glibc_unlikely (err != REG_NOERROR))
762	{
763	free_dfa_content (dfa);
764	preg->buffer = NULL;
765	preg->allocated = `0`;
766	return err;
767	}
768	#ifdef DEBUG
769	/ Note: length+1 will not overflow since it is checked in init_dfa. /
770	dfa->re_str = re_malloc (char, length + `1`);
771	strncpy (dfa->re_str, pattern, length + `1`);
772	#endif
773
774	err = re_string_construct (&regexp, pattern, length, preg->translate,
775	(syntax & RE_ICASE) != `0`, dfa);
776	if (__glibc_unlikely (err != REG_NOERROR))
777	{
778	re_compile_internal_free_return:
779	free_workarea_compile (preg);
780	re_string_destruct (&regexp);
781	lock_fini (dfa->lock);
782	free_dfa_content (dfa);
783	preg->buffer = NULL;
784	preg->allocated = `0`;
785	return err;
786	}
787
788	/ Parse the regular expression, and build a structure tree. /
789	preg->re_nsub = `0`;
790	dfa->str_tree = parse (&regexp, preg, syntax, &err);
791	if (__glibc_unlikely (dfa->str_tree == NULL))
792	goto re_compile_internal_free_return;
793
794	/ Analyze the tree and create the nfa. /
795	err = analyze (preg);
796	if (__glibc_unlikely (err != REG_NOERROR))
797	goto re_compile_internal_free_return;
798
799	#ifdef RE_ENABLE_I18N
800	/ If possible, do searching in single byte encoding to speed things up. /
801	if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
802	optimize_utf8 (dfa);
803	#endif
804
805	/ Then create the initial state of the dfa. /
806	err = create_initial_state (dfa);
807
808	/ Release work areas. /
809	free_workarea_compile (preg);
810	re_string_destruct (&regexp);
811
812	if (__glibc_unlikely (err != REG_NOERROR))
813	{
814	lock_fini (dfa->lock);
815	free_dfa_content (dfa);
816	preg->buffer = NULL;
817	preg->allocated = `0`;
818	}
819
820	return err;
821	}
822
823	/ Initialize DFA. We use the length of the regular expression PAT_LEN*
824	as the initial length of some arrays. /*
825
826	static reg_errcode_t
827	init_dfa (re_dfa_t *dfa, size_t pat_len)
828	{
829	__re_size_t table_size;
830	#ifndef _LIBC
831	const char *codeset_name;
832	#endif
833	#ifdef RE_ENABLE_I18N
834	size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
835	#else
836	size_t max_i18n_object_size = `0`;
837	#endif
838	size_t max_object_size =
839	MAX (sizeof (struct re_state_table_entry),
840	MAX (sizeof (re_token_t),
841	MAX (sizeof (re_node_set),
842	MAX (sizeof (regmatch_t),
843	max_i18n_object_size))));
844
845	memset (dfa, `'\0'`, sizeof (re_dfa_t));
846
847	/ Force allocation of str_tree_storage the first time. /
848	dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
849
850	/ Avoid overflows. The extra "/ 2" is for the table_size doubling*
851	calculation below, and for similar doubling calculations
852	elsewhere. And it's <= rather than <, because some of the
853	doubling calculations add 1 afterwards. /*
854	if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size) / `2`
855	<= pat_len))
856	return REG_ESPACE;
857
858	dfa->nodes_alloc = pat_len + `1`;
859	dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
860
861	/ table_size = 2 ^ ceil(log pat_len) /
862	for (table_size = `1`; ; table_size <<= `1`)
863	if (table_size > pat_len)
864	break;
865
866	dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
867	dfa->state_hash_mask = table_size - `1`;
868
869	dfa->mb_cur_max = MB_CUR_MAX;
870	#ifdef _LIBC
871	if (dfa->mb_cur_max == `6`
872	&& strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == `0`)
873	dfa->is_utf8 = `1`;
874	dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
875	!= `0`);
876	#else
877	codeset_name = nl_langinfo (CODESET);
878	if ((codeset_name[`0`] == `'U'` \|\| codeset_name[`0`] == `'u'`)
879	&& (codeset_name[`1`] == `'T'` \|\| codeset_name[`1`] == `'t'`)
880	&& (codeset_name[`2`] == `'F'` \|\| codeset_name[`2`] == `'f'`)
881	&& strcmp (codeset_name + `3` + (codeset_name[`3`] == `'-'`), "8") == `0`)
882	dfa->is_utf8 = `1`;
883
884	/ We check exhaustively in the loop below if this charset is a*
885	superset of ASCII. /*
886	dfa->map_notascii = `0`;
887	#endif
888
889	#ifdef RE_ENABLE_I18N
890	if (dfa->mb_cur_max > `1`)
891	{
892	if (dfa->is_utf8)
893	dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
894	else
895	{
896	int i, j, ch;
897
898	dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), `1`);
899	if (__glibc_unlikely (dfa->sb_char == NULL))
900	return REG_ESPACE;
901
902	/ Set the bits corresponding to single byte chars. /
903	for (i = `0`, ch = `0`; i < BITSET_WORDS; ++i)
904	for (j = `0`; j < BITSET_WORD_BITS; ++j, ++ch)
905	{
906	wint_t wch = __btowc (ch);
907	if (wch != WEOF)
908	dfa->sb_char[i] \|= (bitset_word_t) `1` << j;
909	# ifndef _LIBC
910	if (isascii (ch) && wch != ch)
911	dfa->map_notascii = `1`;
912	# endif
913	}
914	}
915	}
916	#endif
917
918	if (__glibc_unlikely (dfa->nodes == NULL \|\| dfa->state_table == NULL))
919	return REG_ESPACE;
920	return REG_NOERROR;
921	}
922
923	/ Initialize WORD_CHAR table, which indicate which character is*
924	"word". In this case "word" means that it is the word construction
925	character used by some operators like "\<", "\>", etc. /*
926
927	static void
928	init_word_char (re_dfa_t *dfa)
929	{
930	int i = `0`;
931	int j;
932	int ch = `0`;
933	dfa->word_ops_used = `1`;
934	if (__glibc_likely (dfa->map_notascii == `0`))
935	{
936	/ Avoid uint32_t and uint64_t as some non-GCC platforms lack*
937	them, an issue when this code is used in Gnulib. /*
938	bitset_word_t bits0 = `0x00000000`;
939	bitset_word_t bits1 = `0x03ff0000`;
940	bitset_word_t bits2 = `0x87fffffe`;
941	bitset_word_t bits3 = `0x07fffffe`;
942	if (BITSET_WORD_BITS == `64`)
943	{
944	/ Pacify gcc -Woverflow on 32-bit platformns. /
945	dfa->word_char[`0`] = bits1 << `31` << `1` \| bits0;
946	dfa->word_char[`1`] = bits3 << `31` << `1` \| bits2;
947	i = `2`;
948	}
949	else if (BITSET_WORD_BITS == `32`)
950	{
951	dfa->word_char[`0`] = bits0;
952	dfa->word_char[`1`] = bits1;
953	dfa->word_char[`2`] = bits2;
954	dfa->word_char[`3`] = bits3;
955	i = `4`;
956	}
957	else
958	goto general_case;
959	ch = `128`;
960
961	if (__glibc_likely (dfa->is_utf8))
962	{
963	memset (&dfa->word_char[i], `'\0'`, (SBC_MAX - ch) / `8`);
964	return;
965	}
966	}
967
968	general_case:
969	for (; i < BITSET_WORDS; ++i)
970	for (j = `0`; j < BITSET_WORD_BITS; ++j, ++ch)
971	if (isalnum (ch) \|\| ch == `'_'`)
972	dfa->word_char[i] \|= (bitset_word_t) `1` << j;
973	}
974
975	/ Free the work area which are only used while compiling. /
976
977	static void
978	free_workarea_compile (regex_t *preg)
979	{
980	re_dfa_t *dfa = preg->buffer;
981	bin_tree_storage_t storage, next;
982	for (storage = dfa->str_tree_storage; storage; storage = next)
983	{
984	next = storage->next;
985	re_free (storage);
986	}
987	dfa->str_tree_storage = NULL;
988	dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
989	dfa->str_tree = NULL;
990	re_free (dfa->org_indices);
991	dfa->org_indices = NULL;
992	}
993
994	/ Create initial states for all contexts. /
995
996	static reg_errcode_t
997	create_initial_state (re_dfa_t *dfa)
998	{
999	Idx first, i;
1000	reg_errcode_t err;
1001	re_node_set init_nodes;
1002
1003	/ Initial states have the epsilon closure of the node which is*
1004	the first node of the regular expression. /*
1005	first = dfa->str_tree->first->node_idx;
1006	dfa->init_node = first;
1007	err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1008	if (__glibc_unlikely (err != REG_NOERROR))
1009	return err;
1010
1011	/ The back-references which are in initial states can epsilon transit,*
1012	since in this case all of the subexpressions can be null.
1013	Then we add epsilon closures of the nodes which are the next nodes of
1014	the back-references. /*
1015	if (dfa->nbackref > `0`)
1016	for (i = `0`; i < init_nodes.nelem; ++i)
1017	{
1018	Idx node_idx = init_nodes.elems[i];
1019	re_token_type_t type = dfa->nodes[node_idx].type;
1020
1021	Idx clexp_idx;
1022	if (type != OP_BACK_REF)
1023	continue;
1024	for (clexp_idx = `0`; clexp_idx < init_nodes.nelem; ++clexp_idx)
1025	{
1026	re_token_t *clexp_node;
1027	clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1028	if (clexp_node->type == OP_CLOSE_SUBEXP
1029	&& clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1030	break;
1031	}
1032	if (clexp_idx == init_nodes.nelem)
1033	continue;
1034
1035	if (type == OP_BACK_REF)
1036	{
1037	Idx dest_idx = dfa->edests[node_idx].elems[`0`];
1038	if (!re_node_set_contains (&init_nodes, dest_idx))
1039	{
1040	reg_errcode_t merge_err
1041	= re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1042	if (merge_err != REG_NOERROR)
1043	return merge_err;
1044	i = `0`;
1045	}
1046	}
1047	}
1048
1049	/ It must be the first time to invoke acquire_state. /
1050	dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, `0`);
1051	/ We don't check ERR here, since the initial state must not be NULL. /
1052	if (__glibc_unlikely (dfa->init_state == NULL))
1053	return err;
1054	if (dfa->init_state->has_constraint)
1055	{
1056	dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1057	CONTEXT_WORD);
1058	dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1059	CONTEXT_NEWLINE);
1060	dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1061	&init_nodes,
1062	CONTEXT_NEWLINE
1063	\| CONTEXT_BEGBUF);
1064	if (__glibc_unlikely (dfa->init_state_word == NULL
1065	\|\| dfa->init_state_nl == NULL
1066	\|\| dfa->init_state_begbuf == NULL))
1067	return err;
1068	}
1069	else
1070	dfa->init_state_word = dfa->init_state_nl
1071	= dfa->init_state_begbuf = dfa->init_state;
1072
1073	re_node_set_free (&init_nodes);
1074	return REG_NOERROR;
1075	}
1076
1077	#ifdef RE_ENABLE_I18N
1078	/ If it is possible to do searching in single byte encoding instead of UTF-8*
1079	to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1080	DFA nodes where needed. /*
1081
1082	static void
1083	optimize_utf8 (re_dfa_t *dfa)
1084	{
1085	Idx node;
1086	int i;
1087	bool mb_chars = false;
1088	bool has_period = false;
1089
1090	for (node = `0`; node < dfa->nodes_len; ++node)
1091	switch (dfa->nodes[node].type)
1092	{
1093	case CHARACTER:
1094	if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1095	mb_chars = true;
1096	break;
1097	case ANCHOR:
1098	switch (dfa->nodes[node].opr.ctx_type)
1099	{
1100	case LINE_FIRST:
1101	case LINE_LAST:
1102	case BUF_FIRST:
1103	case BUF_LAST:
1104	break;
1105	default:
1106	/ Word anchors etc. cannot be handled. It's okay to test*
1107	opr.ctx_type since constraints (for all DFA nodes) are
1108	created by ORing one or more opr.ctx_type values. /*
1109	return;
1110	}
1111	break;
1112	case OP_PERIOD:
1113	has_period = true;
1114	break;
1115	case OP_BACK_REF:
1116	case OP_ALT:
1117	case END_OF_RE:
1118	case OP_DUP_ASTERISK:
1119	case OP_OPEN_SUBEXP:
1120	case OP_CLOSE_SUBEXP:
1121	break;
1122	case COMPLEX_BRACKET:
1123	return;
1124	case SIMPLE_BRACKET:
1125	/ Just double check. /
1126	{
1127	int rshift = (ASCII_CHARS % BITSET_WORD_BITS == `0`
1128	? `0`
1129	: BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1130	for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1131	{
1132	if (dfa->nodes[node].opr.sbcset[i] >> rshift != `0`)
1133	return;
1134	rshift = `0`;
1135	}
1136	}
1137	break;
1138	default:
1139	abort ();
1140	}
1141
1142	if (mb_chars \|\| has_period)
1143	for (node = `0`; node < dfa->nodes_len; ++node)
1144	{
1145	if (dfa->nodes[node].type == CHARACTER
1146	&& dfa->nodes[node].opr.c >= ASCII_CHARS)
1147	dfa->nodes[node].mb_partial = `0`;
1148	else if (dfa->nodes[node].type == OP_PERIOD)
1149	dfa->nodes[node].type = OP_UTF8_PERIOD;
1150	}
1151
1152	/ The search can be in single byte locale. /
1153	dfa->mb_cur_max = `1`;
1154	dfa->is_utf8 = `0`;
1155	dfa->has_mb_node = dfa->nbackref > `0` \|\| has_period;
1156	}
1157	#endif
1158
1159	/ Analyze the structure tree, and calculate "first", "next", "edest",*
1160	"eclosure", and "inveclosure". /*
1161
1162	static reg_errcode_t
1163	analyze (regex_t *preg)
1164	{
1165	re_dfa_t *dfa = preg->buffer;
1166	reg_errcode_t ret;
1167
1168	/ Allocate arrays. /
1169	dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1170	dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1171	dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1172	dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1173	if (__glibc_unlikely (dfa->nexts == NULL \|\| dfa->org_indices == NULL
1174	\|\| dfa->edests == NULL \|\| dfa->eclosures == NULL))
1175	return REG_ESPACE;
1176
1177	dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1178	if (dfa->subexp_map != NULL)
1179	{
1180	Idx i;
1181	for (i = `0`; i < preg->re_nsub; i++)
1182	dfa->subexp_map[i] = i;
1183	preorder (dfa->str_tree, optimize_subexps, dfa);
1184	for (i = `0`; i < preg->re_nsub; i++)
1185	if (dfa->subexp_map[i] != i)
1186	break;
1187	if (i == preg->re_nsub)
1188	{
1189	re_free (dfa->subexp_map);
1190	dfa->subexp_map = NULL;
1191	}
1192	}
1193
1194	ret = postorder (dfa->str_tree, lower_subexps, preg);
1195	if (__glibc_unlikely (ret != REG_NOERROR))
1196	return ret;
1197	ret = postorder (dfa->str_tree, calc_first, dfa);
1198	if (__glibc_unlikely (ret != REG_NOERROR))
1199	return ret;
1200	preorder (dfa->str_tree, calc_next, dfa);
1201	ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1202	if (__glibc_unlikely (ret != REG_NOERROR))
1203	return ret;
1204	ret = calc_eclosure (dfa);
1205	if (__glibc_unlikely (ret != REG_NOERROR))
1206	return ret;
1207
1208	/ We only need this during the prune_impossible_nodes pass in regexec.c;*
1209	skip it if p_i_n will not run, as calc_inveclosure can be quadratic. /*
1210	if ((!preg->no_sub && preg->re_nsub > `0` && dfa->has_plural_match)
1211	\|\| dfa->nbackref)
1212	{
1213	dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1214	if (__glibc_unlikely (dfa->inveclosures == NULL))
1215	return REG_ESPACE;
1216	ret = calc_inveclosure (dfa);
1217	}
1218
1219	return ret;
1220	}
1221
1222	/ Our parse trees are very unbalanced, so we cannot use a stack to*
1223	implement parse tree visits. Instead, we use parent pointers and
1224	some hairy code in these two functions. /*
1225	static reg_errcode_t
1226	postorder (bin_tree_t root, reg_errcode_t (fn (void* , bin_tree_t )),
1227	void *extra)
1228	{
1229	bin_tree_t node, prev;
1230
1231	for (node = root; ; )
1232	{
1233	/ Descend down the tree, preferably to the left (or to the right*
1234	if that's the only child). /*
1235	while (node->left \|\| node->right)
1236	if (node->left)
1237	node = node->left;
1238	else
1239	node = node->right;
1240
1241	do
1242	{
1243	reg_errcode_t err = fn (extra, node);
1244	if (__glibc_unlikely (err != REG_NOERROR))
1245	return err;
1246	if (node->parent == NULL)
1247	return REG_NOERROR;
1248	prev = node;
1249	node = node->parent;
1250	}
1251	/ Go up while we have a node that is reached from the right. /
1252	while (node->right == prev \|\| node->right == NULL);
1253	node = node->right;
1254	}
1255	}
1256
1257	static reg_errcode_t
1258	preorder (bin_tree_t root, reg_errcode_t (fn (void* , bin_tree_t )),
1259	void *extra)
1260	{
1261	bin_tree_t *node;
1262
1263	for (node = root; ; )
1264	{
1265	reg_errcode_t err = fn (extra, node);
1266	if (__glibc_unlikely (err != REG_NOERROR))
1267	return err;
1268
1269	/ Go to the left node, or up and to the right. /
1270	if (node->left)
1271	node = node->left;
1272	else
1273	{
1274	bin_tree_t *prev = NULL;
1275	while (node->right == prev \|\| node->right == NULL)
1276	{
1277	prev = node;
1278	node = node->parent;
1279	if (!node)
1280	return REG_NOERROR;
1281	}
1282	node = node->right;
1283	}
1284	}
1285	}
1286
1287	/ Optimization pass: if a SUBEXP is entirely contained, strip it and tell*
1288	re_search_internal to map the inner one's opr.idx to this one's. Adjust
1289	backreferences as well. Requires a preorder visit. /*
1290	static reg_errcode_t
1291	optimize_subexps (void extra, bin_tree_t node)
1292	{
1293	re_dfa_t dfa = (re_dfa_t ) extra;
1294
1295	if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1296	{
1297	int idx = node->token.opr.idx;
1298	node->token.opr.idx = dfa->subexp_map[idx];
1299	dfa->used_bkref_map \|= `1` << node->token.opr.idx;
1300	}
1301
1302	else if (node->token.type == SUBEXP
1303	&& node->left && node->left->token.type == SUBEXP)
1304	{
1305	Idx other_idx = node->left->token.opr.idx;
1306
1307	node->left = node->left->left;
1308	if (node->left)
1309	node->left->parent = node;
1310
1311	dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1312	if (other_idx < BITSET_WORD_BITS)
1313	dfa->used_bkref_map &= ~((bitset_word_t) `1` << other_idx);
1314	}
1315
1316	return REG_NOERROR;
1317	}
1318
1319	/ Lowering pass: Turn each SUBEXP node into the appropriate concatenation*
1320	of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. /*
1321	static reg_errcode_t
1322	lower_subexps (void extra, bin_tree_t node)
1323	{
1324	regex_t preg = (regex_t ) extra;
1325	reg_errcode_t err = REG_NOERROR;
1326
1327	if (node->left && node->left->token.type == SUBEXP)
1328	{
1329	node->left = lower_subexp (&err, preg, node->left);
1330	if (node->left)
1331	node->left->parent = node;
1332	}
1333	if (node->right && node->right->token.type == SUBEXP)
1334	{
1335	node->right = lower_subexp (&err, preg, node->right);
1336	if (node->right)
1337	node->right->parent = node;
1338	}
1339
1340	return err;
1341	}
1342
1343	static bin_tree_t *
1344	lower_subexp (reg_errcode_t err, regex_t preg, bin_tree_t *node)
1345	{
1346	re_dfa_t *dfa = preg->buffer;
1347	bin_tree_t *body = node->left;
1348	bin_tree_t op, cls, tree1, tree;
1349
1350	if (preg->no_sub
1351	/ We do not optimize empty subexpressions, because otherwise we may*
1352	have bad CONCAT nodes with NULL children. This is obviously not
1353	very common, so we do not lose much. An example that triggers
1354	this case is the sed "script" //x. /*
1355	&& node->left != NULL
1356	&& (node->token.opr.idx >= BITSET_WORD_BITS
1357	\|\| !(dfa->used_bkref_map
1358	& ((bitset_word_t) `1` << node->token.opr.idx))))
1359	return node->left;
1360
1361	/ Convert the SUBEXP node to the concatenation of an*
1362	OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. /*
1363	op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1364	cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1365	tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1366	tree = create_tree (dfa, op, tree1, CONCAT);
1367	if (__glibc_unlikely (tree == NULL \|\| tree1 == NULL
1368	\|\| op == NULL \|\| cls == NULL))
1369	{
1370	*err = REG_ESPACE;
1371	return NULL;
1372	}
1373
1374	op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1375	op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1376	return tree;
1377	}
1378
1379	/ Pass 1 in building the NFA: compute FIRST and create unlinked automaton*
1380	nodes. Requires a postorder visit. /*
1381	static reg_errcode_t
1382	calc_first (void extra, bin_tree_t node)
1383	{
1384	re_dfa_t dfa = (re_dfa_t ) extra;
1385	if (node->token.type == CONCAT)
1386	{
1387	node->first = node->left->first;
1388	node->node_idx = node->left->node_idx;
1389	}
1390	else
1391	{
1392	node->first = node;
1393	node->node_idx = re_dfa_add_node (dfa, node->token);
1394	if (__glibc_unlikely (node->node_idx == -`1`))
1395	return REG_ESPACE;
1396	if (node->token.type == ANCHOR)
1397	dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1398	}
1399	return REG_NOERROR;
1400	}
1401
1402	/ Pass 2: compute NEXT on the tree. Preorder visit. /
1403	static reg_errcode_t
1404	calc_next (void extra, bin_tree_t node)
1405	{
1406	switch (node->token.type)
1407	{
1408	case OP_DUP_ASTERISK:
1409	node->left->next = node;
1410	break;
1411	case CONCAT:
1412	node->left->next = node->right->first;
1413	node->right->next = node->next;
1414	break;
1415	default:
1416	if (node->left)
1417	node->left->next = node->next;
1418	if (node->right)
1419	node->right->next = node->next;
1420	break;
1421	}
1422	return REG_NOERROR;
1423	}
1424
1425	/ Pass 3: link all DFA nodes to their NEXT node (any order will do). /
1426	static reg_errcode_t
1427	link_nfa_nodes (void extra, bin_tree_t node)
1428	{
1429	re_dfa_t dfa = (re_dfa_t ) extra;
1430	Idx idx = node->node_idx;
1431	reg_errcode_t err = REG_NOERROR;
1432
1433	switch (node->token.type)
1434	{
1435	case CONCAT:
1436	break;
1437
1438	case END_OF_RE:
1439	assert (node->next == NULL);
1440	break;
1441
1442	case OP_DUP_ASTERISK:
1443	case OP_ALT:
1444	{
1445	Idx left, right;
1446	dfa->has_plural_match = `1`;
1447	if (node->left != NULL)
1448	left = node->left->first->node_idx;
1449	else
1450	left = node->next->node_idx;
1451	if (node->right != NULL)
1452	right = node->right->first->node_idx;
1453	else
1454	right = node->next->node_idx;
1455	assert (left > -`1`);
1456	assert (right > -`1`);
1457	err = re_node_set_init_2 (dfa->edests + idx, left, right);
1458	}
1459	break;
1460
1461	case ANCHOR:
1462	case OP_OPEN_SUBEXP:
1463	case OP_CLOSE_SUBEXP:
1464	err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1465	break;
1466
1467	case OP_BACK_REF:
1468	dfa->nexts[idx] = node->next->node_idx;
1469	if (node->token.type == OP_BACK_REF)
1470	err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1471	break;
1472
1473	default:
1474	assert (!IS_EPSILON_NODE (node->token.type));
1475	dfa->nexts[idx] = node->next->node_idx;
1476	break;
1477	}
1478
1479	return err;
1480	}
1481
1482	/ Duplicate the epsilon closure of the node ROOT_NODE.*
1483	Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1484	to their own constraint. /*
1485
1486	static reg_errcode_t
1487	duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1488	Idx root_node, unsigned int init_constraint)
1489	{
1490	Idx org_node, clone_node;
1491	bool ok;
1492	unsigned int constraint = init_constraint;
1493	for (org_node = top_org_node, clone_node = top_clone_node;;)
1494	{
1495	Idx org_dest, clone_dest;
1496	if (dfa->nodes[org_node].type == OP_BACK_REF)
1497	{
1498	/ If the back reference epsilon-transit, its destination must*
1499	also have the constraint. Then duplicate the epsilon closure
1500	of the destination of the back reference, and store it in
1501	edests of the back reference. /*
1502	org_dest = dfa->nexts[org_node];
1503	re_node_set_empty (dfa->edests + clone_node);
1504	clone_dest = duplicate_node (dfa, org_dest, constraint);
1505	if (__glibc_unlikely (clone_dest == -`1`))
1506	return REG_ESPACE;
1507	dfa->nexts[clone_node] = dfa->nexts[org_node];
1508	ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1509	if (__glibc_unlikely (! ok))
1510	return REG_ESPACE;
1511	}
1512	else if (dfa->edests[org_node].nelem == `0`)
1513	{
1514	/ In case of the node can't epsilon-transit, don't duplicate the*
1515	destination and store the original destination as the
1516	destination of the node. /*
1517	dfa->nexts[clone_node] = dfa->nexts[org_node];
1518	break;
1519	}
1520	else if (dfa->edests[org_node].nelem == `1`)
1521	{
1522	/ In case of the node can epsilon-transit, and it has only one*
1523	destination. /*
1524	org_dest = dfa->edests[org_node].elems[`0`];
1525	re_node_set_empty (dfa->edests + clone_node);
1526	/ If the node is root_node itself, it means the epsilon closure*
1527	has a loop. Then tie it to the destination of the root_node. /*
1528	if (org_node == root_node && clone_node != org_node)
1529	{
1530	ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1531	if (__glibc_unlikely (! ok))
1532	return REG_ESPACE;
1533	break;
1534	}
1535	/ In case the node has another constraint, append it. /
1536	constraint \|= dfa->nodes[org_node].constraint;
1537	clone_dest = duplicate_node (dfa, org_dest, constraint);
1538	if (__glibc_unlikely (clone_dest == -`1`))
1539	return REG_ESPACE;
1540	ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1541	if (__glibc_unlikely (! ok))
1542	return REG_ESPACE;
1543	}
1544	else / dfa->edests[org_node].nelem == 2 /
1545	{
1546	/ In case of the node can epsilon-transit, and it has two*
1547	destinations. In the bin_tree_t and DFA, that's '\|' and ''. /
1548	org_dest = dfa->edests[org_node].elems[`0`];
1549	re_node_set_empty (dfa->edests + clone_node);
1550	/ Search for a duplicated node which satisfies the constraint. /
1551	clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1552	if (clone_dest == -`1`)
1553	{
1554	/ There is no such duplicated node, create a new one. /
1555	reg_errcode_t err;
1556	clone_dest = duplicate_node (dfa, org_dest, constraint);
1557	if (__glibc_unlikely (clone_dest == -`1`))
1558	return REG_ESPACE;
1559	ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1560	if (__glibc_unlikely (! ok))
1561	return REG_ESPACE;
1562	err = duplicate_node_closure (dfa, org_dest, clone_dest,
1563	root_node, constraint);
1564	if (__glibc_unlikely (err != REG_NOERROR))
1565	return err;
1566	}
1567	else
1568	{
1569	/ There is a duplicated node which satisfies the constraint,*
1570	use it to avoid infinite loop. /*
1571	ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1572	if (__glibc_unlikely (! ok))
1573	return REG_ESPACE;
1574	}
1575
1576	org_dest = dfa->edests[org_node].elems[`1`];
1577	clone_dest = duplicate_node (dfa, org_dest, constraint);
1578	if (__glibc_unlikely (clone_dest == -`1`))
1579	return REG_ESPACE;
1580	ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1581	if (__glibc_unlikely (! ok))
1582	return REG_ESPACE;
1583	}
1584	org_node = org_dest;
1585	clone_node = clone_dest;
1586	}
1587	return REG_NOERROR;
1588	}
1589
1590	/ Search for a node which is duplicated from the node ORG_NODE, and*
1591	satisfies the constraint CONSTRAINT. /*
1592
1593	static Idx
1594	search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1595	unsigned int constraint)
1596	{
1597	Idx idx;
1598	for (idx = dfa->nodes_len - `1`; dfa->nodes[idx].duplicated && idx > `0`; --idx)
1599	{
1600	if (org_node == dfa->org_indices[idx]
1601	&& constraint == dfa->nodes[idx].constraint)
1602	return idx; / Found. /
1603	}
1604	return -`1`; / Not found. /
1605	}
1606
1607	/ Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.*
1608	Return the index of the new node, or -1 if insufficient storage is
1609	available. /*
1610
1611	static Idx
1612	duplicate_node (re_dfa_t dfa, Idx org_idx, unsigned* int constraint)
1613	{
1614	Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1615	if (__glibc_likely (dup_idx != -`1`))
1616	{
1617	dfa->nodes[dup_idx].constraint = constraint;
1618	dfa->nodes[dup_idx].constraint \|= dfa->nodes[org_idx].constraint;
1619	dfa->nodes[dup_idx].duplicated = `1`;
1620
1621	/ Store the index of the original node. /
1622	dfa->org_indices[dup_idx] = org_idx;
1623	}
1624	return dup_idx;
1625	}
1626
1627	static reg_errcode_t
1628	calc_inveclosure (re_dfa_t *dfa)
1629	{
1630	Idx src, idx;
1631	bool ok;
1632	for (idx = `0`; idx < dfa->nodes_len; ++idx)
1633	re_node_set_init_empty (dfa->inveclosures + idx);
1634
1635	for (src = `0`; src < dfa->nodes_len; ++src)
1636	{
1637	Idx *elems = dfa->eclosures[src].elems;
1638	for (idx = `0`; idx < dfa->eclosures[src].nelem; ++idx)
1639	{
1640	ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1641	if (__glibc_unlikely (! ok))
1642	return REG_ESPACE;
1643	}
1644	}
1645
1646	return REG_NOERROR;
1647	}
1648
1649	/ Calculate "eclosure" for all the node in DFA. /
1650
1651	static reg_errcode_t
1652	calc_eclosure (re_dfa_t *dfa)
1653	{
1654	Idx node_idx;
1655	bool incomplete;
1656	#ifdef DEBUG
1657	assert (dfa->nodes_len > `0`);
1658	#endif
1659	incomplete = false;
1660	/ For each nodes, calculate epsilon closure. /
1661	for (node_idx = `0`; ; ++node_idx)
1662	{
1663	reg_errcode_t err;
1664	re_node_set eclosure_elem;
1665	if (node_idx == dfa->nodes_len)
1666	{
1667	if (!incomplete)
1668	break;
1669	incomplete = false;
1670	node_idx = `0`;
1671	}
1672
1673	#ifdef DEBUG
1674	assert (dfa->eclosures[node_idx].nelem != -`1`);
1675	#endif
1676
1677	/ If we have already calculated, skip it. /
1678	if (dfa->eclosures[node_idx].nelem != `0`)
1679	continue;
1680	/ Calculate epsilon closure of 'node_idx'. /
1681	err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1682	if (__glibc_unlikely (err != REG_NOERROR))
1683	return err;
1684
1685	if (dfa->eclosures[node_idx].nelem == `0`)
1686	{
1687	incomplete = true;
1688	re_node_set_free (&eclosure_elem);
1689	}
1690	}
1691	return REG_NOERROR;
1692	}
1693
1694	/ Calculate epsilon closure of NODE. /
1695
1696	static reg_errcode_t
1697	calc_eclosure_iter (re_node_set new_set, re_dfa_t dfa, Idx node, bool root)
1698	{
1699	reg_errcode_t err;
1700	Idx i;
1701	re_node_set eclosure;
1702	bool ok;
1703	bool incomplete = false;
1704	err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + `1`);
1705	if (__glibc_unlikely (err != REG_NOERROR))
1706	return err;
1707
1708	/ This indicates that we are calculating this node now.*
1709	We reference this value to avoid infinite loop. /*
1710	dfa->eclosures[node].nelem = -`1`;
1711
1712	/ If the current node has constraints, duplicate all nodes*
1713	since they must inherit the constraints. /*
1714	if (dfa->nodes[node].constraint
1715	&& dfa->edests[node].nelem
1716	&& !dfa->nodes[dfa->edests[node].elems[`0`]].duplicated)
1717	{
1718	err = duplicate_node_closure (dfa, node, node, node,
1719	dfa->nodes[node].constraint);
1720	if (__glibc_unlikely (err != REG_NOERROR))
1721	return err;
1722	}
1723
1724	/ Expand each epsilon destination nodes. /
1725	if (IS_EPSILON_NODE(dfa->nodes[node].type))
1726	for (i = `0`; i < dfa->edests[node].nelem; ++i)
1727	{
1728	re_node_set eclosure_elem;
1729	Idx edest = dfa->edests[node].elems[i];
1730	/ If calculating the epsilon closure of 'edest' is in progress,*
1731	return intermediate result. /*
1732	if (dfa->eclosures[edest].nelem == -`1`)
1733	{
1734	incomplete = true;
1735	continue;
1736	}
1737	/ If we haven't calculated the epsilon closure of 'edest' yet,*
1738	calculate now. Otherwise use calculated epsilon closure. /*
1739	if (dfa->eclosures[edest].nelem == `0`)
1740	{
1741	err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1742	if (__glibc_unlikely (err != REG_NOERROR))
1743	return err;
1744	}
1745	else
1746	eclosure_elem = dfa->eclosures[edest];
1747	/ Merge the epsilon closure of 'edest'. /
1748	err = re_node_set_merge (&eclosure, &eclosure_elem);
1749	if (__glibc_unlikely (err != REG_NOERROR))
1750	return err;
1751	/ If the epsilon closure of 'edest' is incomplete,*
1752	the epsilon closure of this node is also incomplete. /*
1753	if (dfa->eclosures[edest].nelem == `0`)
1754	{
1755	incomplete = true;
1756	re_node_set_free (&eclosure_elem);
1757	}
1758	}
1759
1760	/ An epsilon closure includes itself. /
1761	ok = re_node_set_insert (&eclosure, node);
1762	if (__glibc_unlikely (! ok))
1763	return REG_ESPACE;
1764	if (incomplete && !root)
1765	dfa->eclosures[node].nelem = `0`;
1766	else
1767	dfa->eclosures[node] = eclosure;
1768	*new_set = eclosure;
1769	return REG_NOERROR;
1770	}
1771
1772	/ Functions for token which are used in the parser. /
1773
1774	/ Fetch a token from INPUT.*
1775	We must not use this function inside bracket expressions. /*
1776
1777	static void
1778	fetch_token (re_token_t result, re_string_t input, reg_syntax_t syntax)
1779	{
1780	re_string_skip_bytes (input, peek_token (result, input, syntax));
1781	}
1782
1783	/ Peek a token from INPUT, and return the length of the token.*
1784	We must not use this function inside bracket expressions. /*
1785
1786	static int
1787	peek_token (re_token_t token, re_string_t input, reg_syntax_t syntax)
1788	{
1789	unsigned char c;
1790
1791	if (re_string_eoi (input))
1792	{
1793	token->type = END_OF_RE;
1794	return `0`;
1795	}
1796
1797	c = re_string_peek_byte (input, `0`);
1798	token->opr.c = c;
1799
1800	token->word_char = `0`;
1801	#ifdef RE_ENABLE_I18N
1802	token->mb_partial = `0`;
1803	if (input->mb_cur_max > `1`
1804	&& !re_string_first_byte (input, re_string_cur_idx (input)))
1805	{
1806	token->type = CHARACTER;
1807	token->mb_partial = `1`;
1808	return `1`;
1809	}
1810	#endif
1811	if (c == `'\\'`)
1812	{
1813	unsigned char c2;
1814	if (re_string_cur_idx (input) + `1` >= re_string_length (input))
1815	{
1816	token->type = BACK_SLASH;
1817	return `1`;
1818	}
1819
1820	c2 = re_string_peek_byte_case (input, `1`);
1821	token->opr.c = c2;
1822	token->type = CHARACTER;
1823	#ifdef RE_ENABLE_I18N
1824	if (input->mb_cur_max > `1`)
1825	{
1826	wint_t wc = re_string_wchar_at (input,
1827	re_string_cur_idx (input) + `1`);
1828	token->word_char = IS_WIDE_WORD_CHAR (wc) != `0`;
1829	}
1830	else
1831	#endif
1832	token->word_char = IS_WORD_CHAR (c2) != `0`;
1833
1834	switch (c2)
1835	{
1836	case `'\|'`:
1837	if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1838	token->type = OP_ALT;
1839	break;
1840	case `'1'`: case `'2'`: case `'3'`: case `'4'`: case `'5'`:
1841	case `'6'`: case `'7'`: case `'8'`: case `'9'`:
1842	if (!(syntax & RE_NO_BK_REFS))
1843	{
1844	token->type = OP_BACK_REF;
1845	token->opr.idx = c2 - `'1'`;
1846	}
1847	break;
1848	case `'<'`:
1849	if (!(syntax & RE_NO_GNU_OPS))
1850	{
1851	token->type = ANCHOR;
1852	token->opr.ctx_type = WORD_FIRST;
1853	}
1854	break;
1855	case `'>'`:
1856	if (!(syntax & RE_NO_GNU_OPS))
1857	{
1858	token->type = ANCHOR;
1859	token->opr.ctx_type = WORD_LAST;
1860	}
1861	break;
1862	case `'b'`:
1863	if (!(syntax & RE_NO_GNU_OPS))
1864	{
1865	token->type = ANCHOR;
1866	token->opr.ctx_type = WORD_DELIM;
1867	}
1868	break;
1869	case `'B'`:
1870	if (!(syntax & RE_NO_GNU_OPS))
1871	{
1872	token->type = ANCHOR;
1873	token->opr.ctx_type = NOT_WORD_DELIM;
1874	}
1875	break;
1876	case `'w'`:
1877	if (!(syntax & RE_NO_GNU_OPS))
1878	token->type = OP_WORD;
1879	break;
1880	case `'W'`:
1881	if (!(syntax & RE_NO_GNU_OPS))
1882	token->type = OP_NOTWORD;
1883	break;
1884	case `'s'`:
1885	if (!(syntax & RE_NO_GNU_OPS))
1886	token->type = OP_SPACE;
1887	break;
1888	case `'S'`:
1889	if (!(syntax & RE_NO_GNU_OPS))
1890	token->type = OP_NOTSPACE;
1891	break;
1892	case '`':
1893	if (!(syntax & RE_NO_GNU_OPS))
1894	{
1895	token->type = ANCHOR;
1896	token->opr.ctx_type = BUF_FIRST;
1897	}
1898	break;
1899	case `'\''`:
1900	if (!(syntax & RE_NO_GNU_OPS))
1901	{
1902	token->type = ANCHOR;
1903	token->opr.ctx_type = BUF_LAST;
1904	}
1905	break;
1906	case `'('`:
1907	if (!(syntax & RE_NO_BK_PARENS))
1908	token->type = OP_OPEN_SUBEXP;
1909	break;
1910	case `')'`:
1911	if (!(syntax & RE_NO_BK_PARENS))
1912	token->type = OP_CLOSE_SUBEXP;
1913	break;
1914	case `'+'`:
1915	if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1916	token->type = OP_DUP_PLUS;
1917	break;
1918	case `'?'`:
1919	if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1920	token->type = OP_DUP_QUESTION;
1921	break;
1922	case `'{'`:
1923	if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1924	token->type = OP_OPEN_DUP_NUM;
1925	break;
1926	case `'}'`:
1927	if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1928	token->type = OP_CLOSE_DUP_NUM;
1929	break;
1930	default:
1931	break;
1932	}
1933	return `2`;
1934	}
1935
1936	token->type = CHARACTER;
1937	#ifdef RE_ENABLE_I18N
1938	if (input->mb_cur_max > `1`)
1939	{
1940	wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1941	token->word_char = IS_WIDE_WORD_CHAR (wc) != `0`;
1942	}
1943	else
1944	#endif
1945	token->word_char = IS_WORD_CHAR (token->opr.c);
1946
1947	switch (c)
1948	{
1949	case `'\n'`:
1950	if (syntax & RE_NEWLINE_ALT)
1951	token->type = OP_ALT;
1952	break;
1953	case `'\|'`:
1954	if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1955	token->type = OP_ALT;
1956	break;
1957	case `'*'`:
1958	token->type = OP_DUP_ASTERISK;
1959	break;
1960	case `'+'`:
1961	if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1962	token->type = OP_DUP_PLUS;
1963	break;
1964	case `'?'`:
1965	if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1966	token->type = OP_DUP_QUESTION;
1967	break;
1968	case `'{'`:
1969	if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1970	token->type = OP_OPEN_DUP_NUM;
1971	break;
1972	case `'}'`:
1973	if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1974	token->type = OP_CLOSE_DUP_NUM;
1975	break;
1976	case `'('`:
1977	if (syntax & RE_NO_BK_PARENS)
1978	token->type = OP_OPEN_SUBEXP;
1979	break;
1980	case `')'`:
1981	if (syntax & RE_NO_BK_PARENS)
1982	token->type = OP_CLOSE_SUBEXP;
1983	break;
1984	case `'['`:
1985	token->type = OP_OPEN_BRACKET;
1986	break;
1987	case `'.'`:
1988	token->type = OP_PERIOD;
1989	break;
1990	case `'^'`:
1991	if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS \| RE_CARET_ANCHORS_HERE))
1992	&& re_string_cur_idx (input) != `0`)
1993	{
1994	char prev = re_string_peek_byte (input, -`1`);
1995	if (!(syntax & RE_NEWLINE_ALT) \|\| prev != `'\n'`)
1996	break;
1997	}
1998	token->type = ANCHOR;
1999	token->opr.ctx_type = LINE_FIRST;
2000	break;
2001	case `'$'`:
2002	if (!(syntax & RE_CONTEXT_INDEP_ANCHORS)
2003	&& re_string_cur_idx (input) + `1` != re_string_length (input))
2004	{
2005	re_token_t next;
2006	re_string_skip_bytes (input, `1`);
2007	peek_token (&next, input, syntax);
2008	re_string_skip_bytes (input, -`1`);
2009	if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2010	break;
2011	}
2012	token->type = ANCHOR;
2013	token->opr.ctx_type = LINE_LAST;
2014	break;
2015	default:
2016	break;
2017	}
2018	return `1`;
2019	}
2020
2021	/ Peek a token from INPUT, and return the length of the token.*
2022	We must not use this function out of bracket expressions. /*
2023
2024	static int
2025	peek_token_bracket (re_token_t token, re_string_t input, reg_syntax_t syntax)
2026	{
2027	unsigned char c;
2028	if (re_string_eoi (input))
2029	{
2030	token->type = END_OF_RE;
2031	return `0`;
2032	}
2033	c = re_string_peek_byte (input, `0`);
2034	token->opr.c = c;
2035
2036	#ifdef RE_ENABLE_I18N
2037	if (input->mb_cur_max > `1`
2038	&& !re_string_first_byte (input, re_string_cur_idx (input)))
2039	{
2040	token->type = CHARACTER;
2041	return `1`;
2042	}
2043	#endif /* RE_ENABLE_I18N */
2044
2045	if (c == `'\\'` && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2046	&& re_string_cur_idx (input) + `1` < re_string_length (input))
2047	{
2048	/ In this case, '\' escape a character. /
2049	unsigned char c2;
2050	re_string_skip_bytes (input, `1`);
2051	c2 = re_string_peek_byte (input, `0`);
2052	token->opr.c = c2;
2053	token->type = CHARACTER;
2054	return `1`;
2055	}
2056	if (c == `'['`) / '[' is a special char in a bracket exps. /
2057	{
2058	unsigned char c2;
2059	int token_len;
2060	if (re_string_cur_idx (input) + `1` < re_string_length (input))
2061	c2 = re_string_peek_byte (input, `1`);
2062	else
2063	c2 = `0`;
2064	token->opr.c = c2;
2065	token_len = `2`;
2066	switch (c2)
2067	{
2068	case `'.'`:
2069	token->type = OP_OPEN_COLL_ELEM;
2070	break;
2071
2072	case `'='`:
2073	token->type = OP_OPEN_EQUIV_CLASS;
2074	break;
2075
2076	case `':'`:
2077	if (syntax & RE_CHAR_CLASSES)
2078	{
2079	token->type = OP_OPEN_CHAR_CLASS;
2080	break;
2081	}
2082	FALLTHROUGH;
2083	default:
2084	token->type = CHARACTER;
2085	token->opr.c = c;
2086	token_len = `1`;
2087	break;
2088	}
2089	return token_len;
2090	}
2091	switch (c)
2092	{
2093	case `'-'`:
2094	token->type = OP_CHARSET_RANGE;
2095	break;
2096	case `']'`:
2097	token->type = OP_CLOSE_BRACKET;
2098	break;
2099	case `'^'`:
2100	token->type = OP_NON_MATCH_LIST;
2101	break;
2102	default:
2103	token->type = CHARACTER;
2104	}
2105	return `1`;
2106	}
2107
2108	/ Functions for parser. /
2109
2110	/ Entry point of the parser.*
2111	Parse the regular expression REGEXP and return the structure tree.
2112	If an error occurs, ERR is set by error code, and return NULL.
2113	This function build the following tree, from regular expression <reg_exp>:
2114	CAT
2115	/ \
2116	/ \
2117	<reg_exp> EOR
2118
2119	CAT means concatenation.
2120	EOR means end of regular expression. /*
2121
2122	static bin_tree_t *
2123	parse (re_string_t regexp, regex_t preg, reg_syntax_t syntax,
2124	reg_errcode_t *err)
2125	{
2126	re_dfa_t *dfa = preg->buffer;
2127	bin_tree_t tree, eor, *root;
2128	re_token_t current_token;
2129	dfa->syntax = syntax;
2130	fetch_token (&current_token, regexp, syntax \| RE_CARET_ANCHORS_HERE);
2131	tree = parse_reg_exp (regexp, preg, &current_token, syntax, `0`, err);
2132	if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2133	return NULL;
2134	eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2135	if (tree != NULL)
2136	root = create_tree (dfa, tree, eor, CONCAT);
2137	else
2138	root = eor;
2139	if (__glibc_unlikely (eor == NULL \|\| root == NULL))
2140	{
2141	*err = REG_ESPACE;
2142	return NULL;
2143	}
2144	return root;
2145	}
2146
2147	/ This function build the following tree, from regular expression*
2148	<branch1>\|<branch2>:
2149	ALT
2150	/ \
2151	/ \
2152	<branch1> <branch2>
2153
2154	ALT means alternative, which represents the operator '\|'. /*
2155
2156	static bin_tree_t *
2157	parse_reg_exp (re_string_t regexp, regex_t preg, re_token_t *token,
2158	reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2159	{
2160	re_dfa_t *dfa = preg->buffer;
2161	bin_tree_t tree, branch = NULL;
2162	bitset_word_t initial_bkref_map = dfa->completed_bkref_map;
2163	tree = parse_branch (regexp, preg, token, syntax, nest, err);
2164	if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2165	return NULL;
2166
2167	while (token->type == OP_ALT)
2168	{
2169	fetch_token (token, regexp, syntax \| RE_CARET_ANCHORS_HERE);
2170	if (token->type != OP_ALT && token->type != END_OF_RE
2171	&& (nest == `0` \|\| token->type != OP_CLOSE_SUBEXP))
2172	{
2173	bitset_word_t accumulated_bkref_map = dfa->completed_bkref_map;
2174	dfa->completed_bkref_map = initial_bkref_map;
2175	branch = parse_branch (regexp, preg, token, syntax, nest, err);
2176	if (__glibc_unlikely (*err != REG_NOERROR && branch == NULL))
2177	{
2178	if (tree != NULL)
2179	postorder (tree, free_tree, NULL);
2180	return NULL;
2181	}
2182	dfa->completed_bkref_map \|= accumulated_bkref_map;
2183	}
2184	else
2185	branch = NULL;
2186	tree = create_tree (dfa, tree, branch, OP_ALT);
2187	if (__glibc_unlikely (tree == NULL))
2188	{
2189	*err = REG_ESPACE;
2190	return NULL;
2191	}
2192	}
2193	return tree;
2194	}
2195
2196	/ This function build the following tree, from regular expression*
2197	<exp1><exp2>:
2198	CAT
2199	/ \
2200	/ \
2201	<exp1> <exp2>
2202
2203	CAT means concatenation. /*
2204
2205	static bin_tree_t *
2206	parse_branch (re_string_t regexp, regex_t preg, re_token_t *token,
2207	reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2208	{
2209	bin_tree_t tree, expr;
2210	re_dfa_t *dfa = preg->buffer;
2211	tree = parse_expression (regexp, preg, token, syntax, nest, err);
2212	if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2213	return NULL;
2214
2215	while (token->type != OP_ALT && token->type != END_OF_RE
2216	&& (nest == `0` \|\| token->type != OP_CLOSE_SUBEXP))
2217	{
2218	expr = parse_expression (regexp, preg, token, syntax, nest, err);
2219	if (__glibc_unlikely (*err != REG_NOERROR && expr == NULL))
2220	{
2221	if (tree != NULL)
2222	postorder (tree, free_tree, NULL);
2223	return NULL;
2224	}
2225	if (tree != NULL && expr != NULL)
2226	{
2227	bin_tree_t *newtree = create_tree (dfa, tree, expr, CONCAT);
2228	if (newtree == NULL)
2229	{
2230	postorder (expr, free_tree, NULL);
2231	postorder (tree, free_tree, NULL);
2232	*err = REG_ESPACE;
2233	return NULL;
2234	}
2235	tree = newtree;
2236	}
2237	else if (tree == NULL)
2238	tree = expr;
2239	/ Otherwise expr == NULL, we don't need to create new tree. /
2240	}
2241	return tree;
2242	}
2243
2244	/ This function build the following tree, from regular expression a:
2245	*
2246	\|
2247	a
2248	*/
2249
2250	static bin_tree_t *
2251	parse_expression (re_string_t regexp, regex_t preg, re_token_t *token,
2252	reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2253	{
2254	re_dfa_t *dfa = preg->buffer;
2255	bin_tree_t *tree;
2256	switch (token->type)
2257	{
2258	case CHARACTER:
2259	tree = create_token_tree (dfa, NULL, NULL, token);
2260	if (__glibc_unlikely (tree == NULL))
2261	{
2262	*err = REG_ESPACE;
2263	return NULL;
2264	}
2265	#ifdef RE_ENABLE_I18N
2266	if (dfa->mb_cur_max > `1`)
2267	{
2268	while (!re_string_eoi (regexp)
2269	&& !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2270	{
2271	bin_tree_t *mbc_remain;
2272	fetch_token (token, regexp, syntax);
2273	mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2274	tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2275	if (__glibc_unlikely (mbc_remain == NULL \|\| tree == NULL))
2276	{
2277	*err = REG_ESPACE;
2278	return NULL;
2279	}
2280	}
2281	}
2282	#endif
2283	break;
2284
2285	case OP_OPEN_SUBEXP:
2286	tree = parse_sub_exp (regexp, preg, token, syntax, nest + `1`, err);
2287	if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2288	return NULL;
2289	break;
2290
2291	case OP_OPEN_BRACKET:
2292	tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2293	if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2294	return NULL;
2295	break;
2296
2297	case OP_BACK_REF:
2298	if (!__glibc_likely (dfa->completed_bkref_map & (`1` << token->opr.idx)))
2299	{
2300	*err = REG_ESUBREG;
2301	return NULL;
2302	}
2303	dfa->used_bkref_map \|= `1` << token->opr.idx;
2304	tree = create_token_tree (dfa, NULL, NULL, token);
2305	if (__glibc_unlikely (tree == NULL))
2306	{
2307	*err = REG_ESPACE;
2308	return NULL;
2309	}
2310	++dfa->nbackref;
2311	dfa->has_mb_node = `1`;
2312	break;
2313
2314	case OP_OPEN_DUP_NUM:
2315	if (syntax & RE_CONTEXT_INVALID_DUP)
2316	{
2317	*err = REG_BADRPT;
2318	return NULL;
2319	}
2320	FALLTHROUGH;
2321	case OP_DUP_ASTERISK:
2322	case OP_DUP_PLUS:
2323	case OP_DUP_QUESTION:
2324	if (syntax & RE_CONTEXT_INVALID_OPS)
2325	{
2326	*err = REG_BADRPT;
2327	return NULL;
2328	}
2329	else if (syntax & RE_CONTEXT_INDEP_OPS)
2330	{
2331	fetch_token (token, regexp, syntax);
2332	return parse_expression (regexp, preg, token, syntax, nest, err);
2333	}
2334	FALLTHROUGH;
2335	case OP_CLOSE_SUBEXP:
2336	if ((token->type == OP_CLOSE_SUBEXP)
2337	&& !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2338	{
2339	*err = REG_ERPAREN;
2340	return NULL;
2341	}
2342	FALLTHROUGH;
2343	case OP_CLOSE_DUP_NUM:
2344	/ We treat it as a normal character. /
2345
2346	/ Then we can these characters as normal characters. /
2347	token->type = CHARACTER;
2348	/ mb_partial and word_char bits should be initialized already*
2349	by peek_token. /*
2350	tree = create_token_tree (dfa, NULL, NULL, token);
2351	if (__glibc_unlikely (tree == NULL))
2352	{
2353	*err = REG_ESPACE;
2354	return NULL;
2355	}
2356	break;
2357
2358	case ANCHOR:
2359	if ((token->opr.ctx_type
2360	& (WORD_DELIM \| NOT_WORD_DELIM \| WORD_FIRST \| WORD_LAST))
2361	&& dfa->word_ops_used == `0`)
2362	init_word_char (dfa);
2363	if (token->opr.ctx_type == WORD_DELIM
2364	\|\| token->opr.ctx_type == NOT_WORD_DELIM)
2365	{
2366	bin_tree_t tree_first, tree_last;
2367	if (token->opr.ctx_type == WORD_DELIM)
2368	{
2369	token->opr.ctx_type = WORD_FIRST;
2370	tree_first = create_token_tree (dfa, NULL, NULL, token);
2371	token->opr.ctx_type = WORD_LAST;
2372	}
2373	else
2374	{
2375	token->opr.ctx_type = INSIDE_WORD;
2376	tree_first = create_token_tree (dfa, NULL, NULL, token);
2377	token->opr.ctx_type = INSIDE_NOTWORD;
2378	}
2379	tree_last = create_token_tree (dfa, NULL, NULL, token);
2380	tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2381	if (__glibc_unlikely (tree_first == NULL \|\| tree_last == NULL
2382	\|\| tree == NULL))
2383	{
2384	*err = REG_ESPACE;
2385	return NULL;
2386	}
2387	}
2388	else
2389	{
2390	tree = create_token_tree (dfa, NULL, NULL, token);
2391	if (__glibc_unlikely (tree == NULL))
2392	{
2393	*err = REG_ESPACE;
2394	return NULL;
2395	}
2396	}
2397	/ We must return here, since ANCHORs can't be followed*
2398	by repetition operators.
2399	eg. RE"^" is invalid or "<ANCHOR(^)><CHAR()>",
2400	it must not be "<ANCHOR(^)><REPEAT()>". /
2401	fetch_token (token, regexp, syntax);
2402	return tree;
2403
2404	case OP_PERIOD:
2405	tree = create_token_tree (dfa, NULL, NULL, token);
2406	if (__glibc_unlikely (tree == NULL))
2407	{
2408	*err = REG_ESPACE;
2409	return NULL;
2410	}
2411	if (dfa->mb_cur_max > `1`)
2412	dfa->has_mb_node = `1`;
2413	break;
2414
2415	case OP_WORD:
2416	case OP_NOTWORD:
2417	tree = build_charclass_op (dfa, regexp->trans,
2418	"alnum",
2419	"_",
2420	token->type == OP_NOTWORD, err);
2421	if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2422	return NULL;
2423	break;
2424
2425	case OP_SPACE:
2426	case OP_NOTSPACE:
2427	tree = build_charclass_op (dfa, regexp->trans,
2428	"space",
2429	"",
2430	token->type == OP_NOTSPACE, err);
2431	if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2432	return NULL;
2433	break;
2434
2435	case OP_ALT:
2436	case END_OF_RE:
2437	return NULL;
2438
2439	case BACK_SLASH:
2440	*err = REG_EESCAPE;
2441	return NULL;
2442
2443	default:
2444	/ Must not happen? /
2445	#ifdef DEBUG
2446	assert (`0`);
2447	#endif
2448	return NULL;
2449	}
2450	fetch_token (token, regexp, syntax);
2451
2452	while (token->type == OP_DUP_ASTERISK \|\| token->type == OP_DUP_PLUS
2453	\|\| token->type == OP_DUP_QUESTION \|\| token->type == OP_OPEN_DUP_NUM)
2454	{
2455	bin_tree_t *dup_tree = parse_dup_op (tree, regexp, dfa, token,
2456	syntax, err);
2457	if (__glibc_unlikely (*err != REG_NOERROR && dup_tree == NULL))
2458	{
2459	if (tree != NULL)
2460	postorder (tree, free_tree, NULL);
2461	return NULL;
2462	}
2463	tree = dup_tree;
2464	/ In BRE consecutive duplications are not allowed. /
2465	if ((syntax & RE_CONTEXT_INVALID_DUP)
2466	&& (token->type == OP_DUP_ASTERISK
2467	\|\| token->type == OP_OPEN_DUP_NUM))
2468	{
2469	if (tree != NULL)
2470	postorder (tree, free_tree, NULL);
2471	*err = REG_BADRPT;
2472	return NULL;
2473	}
2474	}
2475
2476	return tree;
2477	}
2478
2479	/ This function build the following tree, from regular expression*
2480	(<reg_exp>):
2481	SUBEXP
2482	\|
2483	<reg_exp>
2484	*/
2485
2486	static bin_tree_t *
2487	parse_sub_exp (re_string_t regexp, regex_t preg, re_token_t *token,
2488	reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2489	{
2490	re_dfa_t *dfa = preg->buffer;
2491	bin_tree_t *tree;
2492	size_t cur_nsub;
2493	cur_nsub = preg->re_nsub++;
2494
2495	fetch_token (token, regexp, syntax \| RE_CARET_ANCHORS_HERE);
2496
2497	/ The subexpression may be a null string. /
2498	if (token->type == OP_CLOSE_SUBEXP)
2499	tree = NULL;
2500	else
2501	{
2502	tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2503	if (__glibc_unlikely (*err == REG_NOERROR
2504	&& token->type != OP_CLOSE_SUBEXP))
2505	{
2506	if (tree != NULL)
2507	postorder (tree, free_tree, NULL);
2508	*err = REG_EPAREN;
2509	}
2510	if (__glibc_unlikely (*err != REG_NOERROR))
2511	return NULL;
2512	}
2513
2514	if (cur_nsub <= `'9'` - `'1'`)
2515	dfa->completed_bkref_map \|= `1` << cur_nsub;
2516
2517	tree = create_tree (dfa, tree, NULL, SUBEXP);
2518	if (__glibc_unlikely (tree == NULL))
2519	{
2520	*err = REG_ESPACE;
2521	return NULL;
2522	}
2523	tree->token.opr.idx = cur_nsub;
2524	return tree;
2525	}
2526
2527	/ This function parse repetition operators like "", "+", "{1,3}" etc. /*
2528
2529	static bin_tree_t *
2530	parse_dup_op (bin_tree_t elem, re_string_t regexp, re_dfa_t *dfa,
2531	re_token_t token, reg_syntax_t syntax, reg_errcode_t err)
2532	{
2533	bin_tree_t tree = NULL, old_tree = NULL;
2534	Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2535	re_token_t start_token = *token;
2536
2537	if (token->type == OP_OPEN_DUP_NUM)
2538	{
2539	end = `0`;
2540	start = fetch_number (regexp, token, syntax);
2541	if (start == -`1`)
2542	{
2543	if (token->type == CHARACTER && token->opr.c == `','`)
2544	start = `0`; / We treat "{,m}" as "{0,m}". /
2545	else
2546	{
2547	err = REG_BADBR; /* <re>{} is invalid. /
2548	return NULL;
2549	}
2550	}
2551	if (__glibc_likely (start != -`2`))
2552	{
2553	/ We treat "{n}" as "{n,n}". /
2554	end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2555	: ((token->type == CHARACTER && token->opr.c == `','`)
2556	? fetch_number (regexp, token, syntax) : -`2`));
2557	}
2558	if (__glibc_unlikely (start == -`2` \|\| end == -`2`))
2559	{
2560	/ Invalid sequence. /
2561	if (__glibc_unlikely (!(syntax & RE_INVALID_INTERVAL_ORD)))
2562	{
2563	if (token->type == END_OF_RE)
2564	*err = REG_EBRACE;
2565	else
2566	*err = REG_BADBR;
2567
2568	return NULL;
2569	}
2570
2571	/ If the syntax bit is set, rollback. /
2572	re_string_set_index (regexp, start_idx);
2573	*token = start_token;
2574	token->type = CHARACTER;
2575	/ mb_partial and word_char bits should be already initialized by*
2576	peek_token. /*
2577	return elem;
2578	}
2579
2580	if (__glibc_unlikely ((end != -`1` && start > end)
2581	\|\| token->type != OP_CLOSE_DUP_NUM))
2582	{
2583	/ First number greater than second. /
2584	*err = REG_BADBR;
2585	return NULL;
2586	}
2587
2588	if (__glibc_unlikely (RE_DUP_MAX < (end == -`1` ? start : end)))
2589	{
2590	*err = REG_ESIZE;
2591	return NULL;
2592	}
2593	}
2594	else
2595	{
2596	start = (token->type == OP_DUP_PLUS) ? `1` : `0`;
2597	end = (token->type == OP_DUP_QUESTION) ? `1` : -`1`;
2598	}
2599
2600	fetch_token (token, regexp, syntax);
2601
2602	if (__glibc_unlikely (elem == NULL))
2603	return NULL;
2604	if (__glibc_unlikely (start == `0` && end == `0`))
2605	{
2606	postorder (elem, free_tree, NULL);
2607	return NULL;
2608	}
2609
2610	/ Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". /
2611	if (__glibc_unlikely (start > `0`))
2612	{
2613	tree = elem;
2614	for (i = `2`; i <= start; ++i)
2615	{
2616	elem = duplicate_tree (elem, dfa);
2617	tree = create_tree (dfa, tree, elem, CONCAT);
2618	if (__glibc_unlikely (elem == NULL \|\| tree == NULL))
2619	goto parse_dup_op_espace;
2620	}
2621
2622	if (start == end)
2623	return tree;
2624
2625	/ Duplicate ELEM before it is marked optional. /
2626	elem = duplicate_tree (elem, dfa);
2627	if (__glibc_unlikely (elem == NULL))
2628	goto parse_dup_op_espace;
2629	old_tree = tree;
2630	}
2631	else
2632	old_tree = NULL;
2633
2634	if (elem->token.type == SUBEXP)
2635	{
2636	uintptr_t subidx = elem->token.opr.idx;
2637	postorder (elem, mark_opt_subexp, (void *) subidx);
2638	}
2639
2640	tree = create_tree (dfa, elem, NULL,
2641	(end == -`1` ? OP_DUP_ASTERISK : OP_ALT));
2642	if (__glibc_unlikely (tree == NULL))
2643	goto parse_dup_op_espace;
2644
2645	/ This loop is actually executed only when end != -1,*
2646	to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2647	already created the start+1-th copy. /*
2648	if (TYPE_SIGNED (Idx) \|\| end != -`1`)
2649	for (i = start + `2`; i <= end; ++i)
2650	{
2651	elem = duplicate_tree (elem, dfa);
2652	tree = create_tree (dfa, tree, elem, CONCAT);
2653	if (__glibc_unlikely (elem == NULL \|\| tree == NULL))
2654	goto parse_dup_op_espace;
2655
2656	tree = create_tree (dfa, tree, NULL, OP_ALT);
2657	if (__glibc_unlikely (tree == NULL))
2658	goto parse_dup_op_espace;
2659	}
2660
2661	if (old_tree)
2662	tree = create_tree (dfa, old_tree, tree, CONCAT);
2663
2664	return tree;
2665
2666	parse_dup_op_espace:
2667	*err = REG_ESPACE;
2668	return NULL;
2669	}
2670
2671	/ Size of the names for collating symbol/equivalence_class/character_class.*
2672	I'm not sure, but maybe enough. /*
2673	#define BRACKET_NAME_BUF_SIZE 32
2674
2675	#ifndef _LIBC
2676
2677	# ifdef RE_ENABLE_I18N
2678	/ Convert the byte B to the corresponding wide character. In a*
2679	unibyte locale, treat B as itself. In a multibyte locale, return
2680	WEOF if B is an encoding error. /*
2681	static wint_t
2682	parse_byte (unsigned char b, re_charset_t *mbcset)
2683	{
2684	return mbcset == NULL ? b : __btowc (b);
2685	}
2686	# endif
2687
2688	/ Local function for parse_bracket_exp only used in case of NOT _LIBC.*
2689	Build the range expression which starts from START_ELEM, and ends
2690	at END_ELEM. The result are written to MBCSET and SBCSET.
2691	RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2692	mbcset->range_ends, is a pointer argument since we may
2693	update it. /*
2694
2695	static reg_errcode_t
2696	# ifdef RE_ENABLE_I18N
2697	build_range_exp (const reg_syntax_t syntax,
2698	bitset_t sbcset,
2699	re_charset_t *mbcset,
2700	Idx *range_alloc,
2701	const bracket_elem_t *start_elem,
2702	const bracket_elem_t *end_elem)
2703	# else /* not RE_ENABLE_I18N */
2704	build_range_exp (const reg_syntax_t syntax,
2705	bitset_t sbcset,
2706	const bracket_elem_t *start_elem,
2707	const bracket_elem_t *end_elem)
2708	# endif /* not RE_ENABLE_I18N */
2709	{
2710	unsigned int start_ch, end_ch;
2711	/ Equivalence Classes and Character Classes can't be a range start/end. /
2712	if (__glibc_unlikely (start_elem->type == EQUIV_CLASS
2713	\|\| start_elem->type == CHAR_CLASS
2714	\|\| end_elem->type == EQUIV_CLASS
2715	\|\| end_elem->type == CHAR_CLASS))
2716	return REG_ERANGE;
2717
2718	/ We can handle no multi character collating elements without libc*
2719	support. /*
2720	if (__glibc_unlikely ((start_elem->type == COLL_SYM
2721	&& strlen ((char *) start_elem->opr.name) > `1`)
2722	\|\| (end_elem->type == COLL_SYM
2723	&& strlen ((char *) end_elem->opr.name) > `1`)))
2724	return REG_ECOLLATE;
2725
2726	# ifdef RE_ENABLE_I18N
2727	{
2728	wchar_t wc;
2729	wint_t start_wc;
2730	wint_t end_wc;
2731
2732	start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2733	: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[`0`]
2734	: `0`));
2735	end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2736	: ((end_elem->type == COLL_SYM) ? end_elem->opr.name[`0`]
2737	: `0`));
2738	start_wc = ((start_elem->type == SB_CHAR \|\| start_elem->type == COLL_SYM)
2739	? parse_byte (start_ch, mbcset) : start_elem->opr.wch);
2740	end_wc = ((end_elem->type == SB_CHAR \|\| end_elem->type == COLL_SYM)
2741	? parse_byte (end_ch, mbcset) : end_elem->opr.wch);
2742	if (start_wc == WEOF \|\| end_wc == WEOF)
2743	return REG_ECOLLATE;
2744	else if (__glibc_unlikely ((syntax & RE_NO_EMPTY_RANGES)
2745	&& start_wc > end_wc))
2746	return REG_ERANGE;
2747
2748	/ Got valid collation sequence values, add them as a new entry.*
2749	However, for !_LIBC we have no collation elements: if the
2750	character set is single byte, the single byte character set
2751	that we build below suffices. parse_bracket_exp passes
2752	no MBCSET if dfa->mb_cur_max == 1. /*
2753	if (mbcset)
2754	{
2755	/ Check the space of the arrays. /
2756	if (__glibc_unlikely (*range_alloc == mbcset->nranges))
2757	{
2758	/ There is not enough space, need realloc. /
2759	wchar_t new_array_start, new_array_end;
2760	Idx new_nranges;
2761
2762	/ +1 in case of mbcset->nranges is 0. /
2763	new_nranges = `2` * mbcset->nranges + `1`;
2764	/ Use realloc since mbcset->range_starts and mbcset->range_ends*
2765	are NULL if range_alloc == 0. /
2766	new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2767	new_nranges);
2768	new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2769	new_nranges);
2770
2771	if (__glibc_unlikely (new_array_start == NULL
2772	\|\| new_array_end == NULL))
2773	{
2774	re_free (new_array_start);
2775	re_free (new_array_end);
2776	return REG_ESPACE;
2777	}
2778
2779	mbcset->range_starts = new_array_start;
2780	mbcset->range_ends = new_array_end;
2781	*range_alloc = new_nranges;
2782	}
2783
2784	mbcset->range_starts[mbcset->nranges] = start_wc;
2785	mbcset->range_ends[mbcset->nranges++] = end_wc;
2786	}
2787
2788	/ Build the table for single byte characters. /
2789	for (wc = `0`; wc < SBC_MAX; ++wc)
2790	{
2791	if (start_wc <= wc && wc <= end_wc)
2792	bitset_set (sbcset, wc);
2793	}
2794	}
2795	# else /* not RE_ENABLE_I18N */
2796	{
2797	unsigned int ch;
2798	start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2799	: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[`0`]
2800	: `0`));
2801	end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2802	: ((end_elem->type == COLL_SYM) ? end_elem->opr.name[`0`]
2803	: `0`));
2804	if (start_ch > end_ch)
2805	return REG_ERANGE;
2806	/ Build the table for single byte characters. /
2807	for (ch = `0`; ch < SBC_MAX; ++ch)
2808	if (start_ch <= ch && ch <= end_ch)
2809	bitset_set (sbcset, ch);
2810	}
2811	# endif /* not RE_ENABLE_I18N */
2812	return REG_NOERROR;
2813	}
2814	#endif /* not _LIBC */
2815
2816	#ifndef _LIBC
2817	/ Helper function for parse_bracket_exp only used in case of NOT _LIBC..*
2818	Build the collating element which is represented by NAME.
2819	The result are written to MBCSET and SBCSET.
2820	COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2821	pointer argument since we may update it. /*
2822
2823	static reg_errcode_t
2824	# ifdef RE_ENABLE_I18N
2825	build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2826	Idx coll_sym_alloc, const* unsigned char *name)
2827	# else /* not RE_ENABLE_I18N */
2828	build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2829	# endif /* not RE_ENABLE_I18N */
2830	{
2831	size_t name_len = strlen ((const char *) name);
2832	if (__glibc_unlikely (name_len != `1`))
2833	return REG_ECOLLATE;
2834	else
2835	{
2836	bitset_set (sbcset, name[`0`]);
2837	return REG_NOERROR;
2838	}
2839	}
2840	#endif /* not _LIBC */
2841
2842	/ This function parse bracket expression like "[abc]", "[a-c]",*
2843	"[[.a-a.]]" etc. /*
2844
2845	static bin_tree_t *
2846	parse_bracket_exp (re_string_t regexp, re_dfa_t dfa, re_token_t *token,
2847	reg_syntax_t syntax, reg_errcode_t *err)
2848	{
2849	#ifdef _LIBC
2850	const unsigned char *collseqmb;
2851	const char *collseqwc;
2852	uint32_t nrules;
2853	int32_t table_size;
2854	const int32_t *symb_table;
2855	const unsigned char *extra;
2856
2857	/ Local function for parse_bracket_exp used in _LIBC environment.*
2858	Seek the collating symbol entry corresponding to NAME.
2859	Return the index of the symbol in the SYMB_TABLE,
2860	or -1 if not found. /*
2861
2862	auto inline int32_t
2863	__attribute__ ((always_inline))
2864	seek_collating_symbol_entry (const unsigned char *name, size_t name_len)
2865	{
2866	int32_t elem;
2867
2868	for (elem = `0`; elem < table_size; elem++)
2869	if (symb_table[`2` * elem] != `0`)
2870	{
2871	int32_t idx = symb_table[`2` * elem + `1`];
2872	/ Skip the name of collating element name. /
2873	idx += `1` + extra[idx];
2874	if (/ Compare the length of the name. /
2875	name_len == extra[idx]
2876	/ Compare the name. /
2877	&& memcmp (name, &extra[idx + `1`], name_len) == `0`)
2878	/ Yep, this is the entry. /
2879	return elem;
2880	}
2881	return -`1`;
2882	}
2883
2884	/ Local function for parse_bracket_exp used in _LIBC environment.*
2885	Look up the collation sequence value of BR_ELEM.
2886	Return the value if succeeded, UINT_MAX otherwise. /*
2887
2888	auto inline unsigned int
2889	__attribute__ ((always_inline))
2890	lookup_collation_sequence_value (bracket_elem_t *br_elem)
2891	{
2892	if (br_elem->type == SB_CHAR)
2893	{
2894	/*
2895	if (MB_CUR_MAX == 1)
2896	*/
2897	if (nrules == `0`)
2898	return collseqmb[br_elem->opr.ch];
2899	else
2900	{
2901	wint_t wc = __btowc (br_elem->opr.ch);
2902	return __collseq_table_lookup (collseqwc, wc);
2903	}
2904	}
2905	else if (br_elem->type == MB_CHAR)
2906	{
2907	if (nrules != `0`)
2908	return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2909	}
2910	else if (br_elem->type == COLL_SYM)
2911	{
2912	size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2913	if (nrules != `0`)
2914	{
2915	int32_t elem, idx;
2916	elem = seek_collating_symbol_entry (br_elem->opr.name,
2917	sym_name_len);
2918	if (elem != -`1`)
2919	{
2920	/ We found the entry. /
2921	idx = symb_table[`2` * elem + `1`];
2922	/ Skip the name of collating element name. /
2923	idx += `1` + extra[idx];
2924	/ Skip the byte sequence of the collating element. /
2925	idx += `1` + extra[idx];
2926	/ Adjust for the alignment. /
2927	idx = (idx + `3`) & ~`3`;
2928	/ Skip the multibyte collation sequence value. /
2929	idx += sizeof (unsigned int);
2930	/ Skip the wide char sequence of the collating element. /
2931	idx += sizeof (unsigned int) *
2932	(`1` + (unsigned* int *) (extra + idx));
2933	/ Return the collation sequence value. /
2934	return (unsigned* int *) (extra + idx);
2935	}
2936	else if (sym_name_len == `1`)
2937	{
2938	/ No valid character. Match it as a single byte*
2939	character. /*
2940	return collseqmb[br_elem->opr.name[`0`]];
2941	}
2942	}
2943	else if (sym_name_len == `1`)
2944	return collseqmb[br_elem->opr.name[`0`]];
2945	}
2946	return UINT_MAX;
2947	}
2948
2949	/ Local function for parse_bracket_exp used in _LIBC environment.*
2950	Build the range expression which starts from START_ELEM, and ends
2951	at END_ELEM. The result are written to MBCSET and SBCSET.
2952	RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2953	mbcset->range_ends, is a pointer argument since we may
2954	update it. /*
2955
2956	auto inline reg_errcode_t
2957	__attribute__ ((always_inline))
2958	build_range_exp (bitset_t sbcset, re_charset_t mbcset, int* *range_alloc,
2959	bracket_elem_t start_elem, bracket_elem_t end_elem)
2960	{
2961	unsigned int ch;
2962	uint32_t start_collseq;
2963	uint32_t end_collseq;
2964
2965	/ Equivalence Classes and Character Classes can't be a range*
2966	start/end. /*
2967	if (__glibc_unlikely (start_elem->type == EQUIV_CLASS
2968	\|\| start_elem->type == CHAR_CLASS
2969	\|\| end_elem->type == EQUIV_CLASS
2970	\|\| end_elem->type == CHAR_CLASS))
2971	return REG_ERANGE;
2972
2973	/ FIXME: Implement rational ranges here, too. /
2974	start_collseq = lookup_collation_sequence_value (start_elem);
2975	end_collseq = lookup_collation_sequence_value (end_elem);
2976	/ Check start/end collation sequence values. /
2977	if (__glibc_unlikely (start_collseq == UINT_MAX
2978	\|\| end_collseq == UINT_MAX))
2979	return REG_ECOLLATE;
2980	if (__glibc_unlikely ((syntax & RE_NO_EMPTY_RANGES)
2981	&& start_collseq > end_collseq))
2982	return REG_ERANGE;
2983
2984	/ Got valid collation sequence values, add them as a new entry.*
2985	However, if we have no collation elements, and the character set
2986	is single byte, the single byte character set that we
2987	build below suffices. /*
2988	if (nrules > `0` \|\| dfa->mb_cur_max > `1`)
2989	{
2990	/ Check the space of the arrays. /
2991	if (__glibc_unlikely (*range_alloc == mbcset->nranges))
2992	{
2993	/ There is not enough space, need realloc. /
2994	uint32_t *new_array_start;
2995	uint32_t *new_array_end;
2996	Idx new_nranges;
2997
2998	/ +1 in case of mbcset->nranges is 0. /
2999	new_nranges = `2` * mbcset->nranges + `1`;
3000	new_array_start = re_realloc (mbcset->range_starts, uint32_t,
3001	new_nranges);
3002	new_array_end = re_realloc (mbcset->range_ends, uint32_t,
3003	new_nranges);
3004
3005	if (__glibc_unlikely (new_array_start == NULL
3006	\|\| new_array_end == NULL))
3007	return REG_ESPACE;
3008
3009	mbcset->range_starts = new_array_start;
3010	mbcset->range_ends = new_array_end;
3011	*range_alloc = new_nranges;
3012	}
3013
3014	mbcset->range_starts[mbcset->nranges] = start_collseq;
3015	mbcset->range_ends[mbcset->nranges++] = end_collseq;
3016	}
3017
3018	/ Build the table for single byte characters. /
3019	for (ch = `0`; ch < SBC_MAX; ch++)
3020	{
3021	uint32_t ch_collseq;
3022	/*
3023	if (MB_CUR_MAX == 1)
3024	*/
3025	if (nrules == `0`)
3026	ch_collseq = collseqmb[ch];
3027	else
3028	ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
3029	if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
3030	bitset_set (sbcset, ch);
3031	}
3032	return REG_NOERROR;
3033	}
3034
3035	/ Local function for parse_bracket_exp used in _LIBC environment.*
3036	Build the collating element which is represented by NAME.
3037	The result are written to MBCSET and SBCSET.
3038	COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
3039	pointer argument since we may update it. /*
3040
3041	auto inline reg_errcode_t
3042	__attribute__ ((always_inline))
3043	build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
3044	Idx coll_sym_alloc, const* unsigned char *name)
3045	{
3046	int32_t elem, idx;
3047	size_t name_len = strlen ((const char *) name);
3048	if (nrules != `0`)
3049	{
3050	elem = seek_collating_symbol_entry (name, name_len);
3051	if (elem != -`1`)
3052	{
3053	/ We found the entry. /
3054	idx = symb_table[`2` * elem + `1`];
3055	/ Skip the name of collating element name. /
3056	idx += `1` + extra[idx];
3057	}
3058	else if (name_len == `1`)
3059	{
3060	/ No valid character, treat it as a normal*
3061	character. /*
3062	bitset_set (sbcset, name[`0`]);
3063	return REG_NOERROR;
3064	}
3065	else
3066	return REG_ECOLLATE;
3067
3068	/ Got valid collation sequence, add it as a new entry. /
3069	/ Check the space of the arrays. /
3070	if (__glibc_unlikely (*coll_sym_alloc == mbcset->ncoll_syms))
3071	{
3072	/ Not enough, realloc it. /
3073	/ +1 in case of mbcset->ncoll_syms is 0. /
3074	Idx new_coll_sym_alloc = `2` * mbcset->ncoll_syms + `1`;
3075	/ Use realloc since mbcset->coll_syms is NULL*
3076	if alloc == 0. /
3077	int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3078	new_coll_sym_alloc);
3079	if (__glibc_unlikely (new_coll_syms == NULL))
3080	return REG_ESPACE;
3081	mbcset->coll_syms = new_coll_syms;
3082	*coll_sym_alloc = new_coll_sym_alloc;
3083	}
3084	mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3085	return REG_NOERROR;
3086	}
3087	else
3088	{
3089	if (__glibc_unlikely (name_len != `1`))
3090	return REG_ECOLLATE;
3091	else
3092	{
3093	bitset_set (sbcset, name[`0`]);
3094	return REG_NOERROR;
3095	}
3096	}
3097	}
3098	#endif
3099
3100	re_token_t br_token;
3101	re_bitset_ptr_t sbcset;
3102	#ifdef RE_ENABLE_I18N
3103	re_charset_t *mbcset;
3104	Idx coll_sym_alloc = `0`, range_alloc = `0`, mbchar_alloc = `0`;
3105	Idx equiv_class_alloc = `0`, char_class_alloc = `0`;
3106	#endif /* not RE_ENABLE_I18N */
3107	bool non_match = false;
3108	bin_tree_t *work_tree;
3109	int token_len;
3110	bool first_round = true;
3111	#ifdef _LIBC
3112	collseqmb = (const unsigned char *)
3113	_NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3114	nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3115	if (nrules)
3116	{
3117	/*
3118	if (MB_CUR_MAX > 1)
3119	*/
3120	collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3121	table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3122	symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3123	_NL_COLLATE_SYMB_TABLEMB);
3124	extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3125	_NL_COLLATE_SYMB_EXTRAMB);
3126	}
3127	#endif
3128	sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), `1`);
3129	#ifdef RE_ENABLE_I18N
3130	mbcset = (re_charset_t ) calloc (sizeof* (re_charset_t), `1`);
3131	#endif /* RE_ENABLE_I18N */
3132	#ifdef RE_ENABLE_I18N
3133	if (__glibc_unlikely (sbcset == NULL \|\| mbcset == NULL))
3134	#else
3135	if (__glibc_unlikely (sbcset == NULL))
3136	#endif /* RE_ENABLE_I18N */
3137	{
3138	re_free (sbcset);
3139	#ifdef RE_ENABLE_I18N
3140	re_free (mbcset);
3141	#endif
3142	*err = REG_ESPACE;
3143	return NULL;
3144	}
3145
3146	token_len = peek_token_bracket (token, regexp, syntax);
3147	if (__glibc_unlikely (token->type == END_OF_RE))
3148	{
3149	*err = REG_BADPAT;
3150	goto parse_bracket_exp_free_return;
3151	}
3152	if (token->type == OP_NON_MATCH_LIST)
3153	{
3154	#ifdef RE_ENABLE_I18N
3155	mbcset->non_match = `1`;
3156	#endif /* not RE_ENABLE_I18N */
3157	non_match = true;
3158	if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3159	bitset_set (sbcset, `'\n'`);
3160	re_string_skip_bytes (regexp, token_len); / Skip a token. /
3161	token_len = peek_token_bracket (token, regexp, syntax);
3162	if (__glibc_unlikely (token->type == END_OF_RE))
3163	{
3164	*err = REG_BADPAT;
3165	goto parse_bracket_exp_free_return;
3166	}
3167	}
3168
3169	/ We treat the first ']' as a normal character. /
3170	if (token->type == OP_CLOSE_BRACKET)
3171	token->type = CHARACTER;
3172
3173	while (`1`)
3174	{
3175	bracket_elem_t start_elem, end_elem;
3176	unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3177	unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3178	reg_errcode_t ret;
3179	int token_len2 = `0`;
3180	bool is_range_exp = false;
3181	re_token_t token2;
3182
3183	start_elem.opr.name = start_name_buf;
3184	start_elem.type = COLL_SYM;
3185	ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3186	syntax, first_round);
3187	if (__glibc_unlikely (ret != REG_NOERROR))
3188	{
3189	*err = ret;
3190	goto parse_bracket_exp_free_return;
3191	}
3192	first_round = false;
3193
3194	/ Get information about the next token. We need it in any case. /
3195	token_len = peek_token_bracket (token, regexp, syntax);
3196
3197	/ Do not check for ranges if we know they are not allowed. /
3198	if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3199	{
3200	if (__glibc_unlikely (token->type == END_OF_RE))
3201	{
3202	*err = REG_EBRACK;
3203	goto parse_bracket_exp_free_return;
3204	}
3205	if (token->type == OP_CHARSET_RANGE)
3206	{
3207	re_string_skip_bytes (regexp, token_len); / Skip '-'. /
3208	token_len2 = peek_token_bracket (&token2, regexp, syntax);
3209	if (__glibc_unlikely (token2.type == END_OF_RE))
3210	{
3211	*err = REG_EBRACK;
3212	goto parse_bracket_exp_free_return;
3213	}
3214	if (token2.type == OP_CLOSE_BRACKET)
3215	{
3216	/ We treat the last '-' as a normal character. /
3217	re_string_skip_bytes (regexp, -token_len);
3218	token->type = CHARACTER;
3219	}
3220	else
3221	is_range_exp = true;
3222	}
3223	}
3224
3225	if (is_range_exp == true)
3226	{
3227	end_elem.opr.name = end_name_buf;
3228	end_elem.type = COLL_SYM;
3229	ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3230	dfa, syntax, true);
3231	if (__glibc_unlikely (ret != REG_NOERROR))
3232	{
3233	*err = ret;
3234	goto parse_bracket_exp_free_return;
3235	}
3236
3237	token_len = peek_token_bracket (token, regexp, syntax);
3238
3239	#ifdef _LIBC
3240	*err = build_range_exp (sbcset, mbcset, &range_alloc,
3241	&start_elem, &end_elem);
3242	#else
3243	# ifdef RE_ENABLE_I18N
3244	*err = build_range_exp (syntax, sbcset,
3245	dfa->mb_cur_max > `1` ? mbcset : NULL,
3246	&range_alloc, &start_elem, &end_elem);
3247	# else
3248	*err = build_range_exp (syntax, sbcset, &start_elem, &end_elem);
3249	# endif
3250	#endif /* RE_ENABLE_I18N */
3251	if (__glibc_unlikely (*err != REG_NOERROR))
3252	goto parse_bracket_exp_free_return;
3253	}
3254	else
3255	{
3256	switch (start_elem.type)
3257	{
3258	case SB_CHAR:
3259	bitset_set (sbcset, start_elem.opr.ch);
3260	break;
3261	#ifdef RE_ENABLE_I18N
3262	case MB_CHAR:
3263	/ Check whether the array has enough space. /
3264	if (__glibc_unlikely (mbchar_alloc == mbcset->nmbchars))
3265	{
3266	wchar_t *new_mbchars;
3267	/ Not enough, realloc it. /
3268	/ +1 in case of mbcset->nmbchars is 0. /
3269	mbchar_alloc = `2` * mbcset->nmbchars + `1`;
3270	/ Use realloc since array is NULL if alloc == 0. /*
3271	new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3272	mbchar_alloc);
3273	if (__glibc_unlikely (new_mbchars == NULL))
3274	goto parse_bracket_exp_espace;
3275	mbcset->mbchars = new_mbchars;
3276	}
3277	mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3278	break;
3279	#endif /* RE_ENABLE_I18N */
3280	case EQUIV_CLASS:
3281	*err = build_equiv_class (sbcset,
3282	#ifdef RE_ENABLE_I18N
3283	mbcset, &equiv_class_alloc,
3284	#endif /* RE_ENABLE_I18N */
3285	start_elem.opr.name);
3286	if (__glibc_unlikely (*err != REG_NOERROR))
3287	goto parse_bracket_exp_free_return;
3288	break;
3289	case COLL_SYM:
3290	*err = build_collating_symbol (sbcset,
3291	#ifdef RE_ENABLE_I18N
3292	mbcset, &coll_sym_alloc,
3293	#endif /* RE_ENABLE_I18N */
3294	start_elem.opr.name);
3295	if (__glibc_unlikely (*err != REG_NOERROR))
3296	goto parse_bracket_exp_free_return;
3297	break;
3298	case CHAR_CLASS:
3299	*err = build_charclass (regexp->trans, sbcset,
3300	#ifdef RE_ENABLE_I18N
3301	mbcset, &char_class_alloc,
3302	#endif /* RE_ENABLE_I18N */
3303	(const char *) start_elem.opr.name,
3304	syntax);
3305	if (__glibc_unlikely (*err != REG_NOERROR))
3306	goto parse_bracket_exp_free_return;
3307	break;
3308	default:
3309	assert (`0`);
3310	break;
3311	}
3312	}
3313	if (__glibc_unlikely (token->type == END_OF_RE))
3314	{
3315	*err = REG_EBRACK;
3316	goto parse_bracket_exp_free_return;
3317	}
3318	if (token->type == OP_CLOSE_BRACKET)
3319	break;
3320	}
3321
3322	re_string_skip_bytes (regexp, token_len); / Skip a token. /
3323
3324	/ If it is non-matching list. /
3325	if (non_match)
3326	bitset_not (sbcset);
3327
3328	#ifdef RE_ENABLE_I18N
3329	/ Ensure only single byte characters are set. /
3330	if (dfa->mb_cur_max > `1`)
3331	bitset_mask (sbcset, dfa->sb_char);
3332
3333	if (mbcset->nmbchars \|\| mbcset->ncoll_syms \|\| mbcset->nequiv_classes
3334	\|\| mbcset->nranges \|\| (dfa->mb_cur_max > `1` && (mbcset->nchar_classes
3335	\|\| mbcset->non_match)))
3336	{
3337	bin_tree_t *mbc_tree;
3338	int sbc_idx;
3339	/ Build a tree for complex bracket. /
3340	dfa->has_mb_node = `1`;
3341	br_token.type = COMPLEX_BRACKET;
3342	br_token.opr.mbcset = mbcset;
3343	mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3344	if (__glibc_unlikely (mbc_tree == NULL))
3345	goto parse_bracket_exp_espace;
3346	for (sbc_idx = `0`; sbc_idx < BITSET_WORDS; ++sbc_idx)
3347	if (sbcset[sbc_idx])
3348	break;
3349	/ If there are no bits set in sbcset, there is no point*
3350	of having both SIMPLE_BRACKET and COMPLEX_BRACKET. /*
3351	if (sbc_idx < BITSET_WORDS)
3352	{
3353	/ Build a tree for simple bracket. /
3354	br_token.type = SIMPLE_BRACKET;
3355	br_token.opr.sbcset = sbcset;
3356	work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3357	if (__glibc_unlikely (work_tree == NULL))
3358	goto parse_bracket_exp_espace;
3359
3360	/ Then join them by ALT node. /
3361	work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3362	if (__glibc_unlikely (work_tree == NULL))
3363	goto parse_bracket_exp_espace;
3364	}
3365	else
3366	{
3367	re_free (sbcset);
3368	work_tree = mbc_tree;
3369	}
3370	}
3371	else
3372	#endif /* not RE_ENABLE_I18N */
3373	{
3374	#ifdef RE_ENABLE_I18N
3375	free_charset (mbcset);
3376	#endif
3377	/ Build a tree for simple bracket. /
3378	br_token.type = SIMPLE_BRACKET;
3379	br_token.opr.sbcset = sbcset;
3380	work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3381	if (__glibc_unlikely (work_tree == NULL))
3382	goto parse_bracket_exp_espace;
3383	}
3384	return work_tree;
3385
3386	parse_bracket_exp_espace:
3387	*err = REG_ESPACE;
3388	parse_bracket_exp_free_return:
3389	re_free (sbcset);
3390	#ifdef RE_ENABLE_I18N
3391	free_charset (mbcset);
3392	#endif /* RE_ENABLE_I18N */
3393	return NULL;
3394	}
3395
3396	/ Parse an element in the bracket expression. /
3397
3398	static reg_errcode_t
3399	parse_bracket_element (bracket_elem_t elem, re_string_t regexp,
3400	re_token_t token, int* token_len, re_dfa_t *dfa,
3401	reg_syntax_t syntax, bool accept_hyphen)
3402	{
3403	#ifdef RE_ENABLE_I18N
3404	int cur_char_size;
3405	cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3406	if (cur_char_size > `1`)
3407	{
3408	elem->type = MB_CHAR;
3409	elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3410	re_string_skip_bytes (regexp, cur_char_size);
3411	return REG_NOERROR;
3412	}
3413	#endif /* RE_ENABLE_I18N */
3414	re_string_skip_bytes (regexp, token_len); / Skip a token. /
3415	if (token->type == OP_OPEN_COLL_ELEM \|\| token->type == OP_OPEN_CHAR_CLASS
3416	\|\| token->type == OP_OPEN_EQUIV_CLASS)
3417	return parse_bracket_symbol (elem, regexp, token);
3418	if (__glibc_unlikely (token->type == OP_CHARSET_RANGE) && !accept_hyphen)
3419	{
3420	/ A '-' must only appear as anything but a range indicator before*
3421	the closing bracket. Everything else is an error. /*
3422	re_token_t token2;
3423	(void) peek_token_bracket (&token2, regexp, syntax);
3424	if (token2.type != OP_CLOSE_BRACKET)
3425	/ The actual error value is not standardized since this whole*
3426	case is undefined. But ERANGE makes good sense. /*
3427	return REG_ERANGE;
3428	}
3429	elem->type = SB_CHAR;
3430	elem->opr.ch = token->opr.c;
3431	return REG_NOERROR;
3432	}
3433
3434	/ Parse a bracket symbol in the bracket expression. Bracket symbols are*
3435	such as [:<character_class>:], [.<collating_element>.], and
3436	[=<equivalent_class>=]. /*
3437
3438	static reg_errcode_t
3439	parse_bracket_symbol (bracket_elem_t elem, re_string_t regexp,
3440	re_token_t *token)
3441	{
3442	unsigned char ch, delim = token->opr.c;
3443	int i = `0`;
3444	if (re_string_eoi(regexp))
3445	return REG_EBRACK;
3446	for (;; ++i)
3447	{
3448	if (i >= BRACKET_NAME_BUF_SIZE)
3449	return REG_EBRACK;
3450	if (token->type == OP_OPEN_CHAR_CLASS)
3451	ch = re_string_fetch_byte_case (regexp);
3452	else
3453	ch = re_string_fetch_byte (regexp);
3454	if (re_string_eoi(regexp))
3455	return REG_EBRACK;
3456	if (ch == delim && re_string_peek_byte (regexp, `0`) == `']'`)
3457	break;
3458	elem->opr.name[i] = ch;
3459	}
3460	re_string_skip_bytes (regexp, `1`);
3461	elem->opr.name[i] = `'\0'`;
3462	switch (token->type)
3463	{
3464	case OP_OPEN_COLL_ELEM:
3465	elem->type = COLL_SYM;
3466	break;
3467	case OP_OPEN_EQUIV_CLASS:
3468	elem->type = EQUIV_CLASS;
3469	break;
3470	case OP_OPEN_CHAR_CLASS:
3471	elem->type = CHAR_CLASS;
3472	break;
3473	default:
3474	break;
3475	}
3476	return REG_NOERROR;
3477	}
3478
3479	/ Helper function for parse_bracket_exp.*
3480	Build the equivalence class which is represented by NAME.
3481	The result are written to MBCSET and SBCSET.
3482	EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3483	is a pointer argument since we may update it. /*
3484
3485	static reg_errcode_t
3486	#ifdef RE_ENABLE_I18N
3487	build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3488	Idx equiv_class_alloc, const* unsigned char *name)
3489	#else /* not RE_ENABLE_I18N */
3490	build_equiv_class (bitset_t sbcset, const unsigned char *name)
3491	#endif /* not RE_ENABLE_I18N */
3492	{
3493	#ifdef _LIBC
3494	uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3495	if (nrules != `0`)
3496	{
3497	const int32_t table, indirect;
3498	const unsigned char weights, extra, *cp;
3499	unsigned char char_buf[`2`];
3500	int32_t idx1, idx2;
3501	unsigned int ch;
3502	size_t len;
3503	/ Calculate the index for equivalence class. /
3504	cp = name;
3505	table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3506	weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3507	_NL_COLLATE_WEIGHTMB);
3508	extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3509	_NL_COLLATE_EXTRAMB);
3510	indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3511	_NL_COLLATE_INDIRECTMB);
3512	idx1 = findidx (table, indirect, extra, &cp, -`1`);
3513	if (__glibc_unlikely (idx1 == `0` \|\| *cp != `'\0'`))
3514	/ This isn't a valid character. /
3515	return REG_ECOLLATE;
3516
3517	/ Build single byte matching table for this equivalence class. /
3518	len = weights[idx1 & `0xffffff`];
3519	for (ch = `0`; ch < SBC_MAX; ++ch)
3520	{
3521	char_buf[`0`] = ch;
3522	cp = char_buf;
3523	idx2 = findidx (table, indirect, extra, &cp, `1`);
3524	/*
3525	idx2 = table[ch];
3526	*/
3527	if (idx2 == `0`)
3528	/ This isn't a valid character. /
3529	continue;
3530	/ Compare only if the length matches and the collation rule*
3531	index is the same. /*
3532	if (len == weights[idx2 & `0xffffff`] && (idx1 >> `24`) == (idx2 >> `24`)
3533	&& memcmp (weights + (idx1 & `0xffffff`) + `1`,
3534	weights + (idx2 & `0xffffff`) + `1`, len) == `0`)
3535	bitset_set (sbcset, ch);
3536	}
3537	/ Check whether the array has enough space. /
3538	if (__glibc_unlikely (*equiv_class_alloc == mbcset->nequiv_classes))
3539	{
3540	/ Not enough, realloc it. /
3541	/ +1 in case of mbcset->nequiv_classes is 0. /
3542	Idx new_equiv_class_alloc = `2` * mbcset->nequiv_classes + `1`;
3543	/ Use realloc since the array is NULL if alloc == 0. /*
3544	int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3545	int32_t,
3546	new_equiv_class_alloc);
3547	if (__glibc_unlikely (new_equiv_classes == NULL))
3548	return REG_ESPACE;
3549	mbcset->equiv_classes = new_equiv_classes;
3550	*equiv_class_alloc = new_equiv_class_alloc;
3551	}
3552	mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3553	}
3554	else
3555	#endif /* _LIBC */
3556	{
3557	if (__glibc_unlikely (strlen ((const char *) name) != `1`))
3558	return REG_ECOLLATE;
3559	bitset_set (sbcset, *name);
3560	}
3561	return REG_NOERROR;
3562	}
3563
3564	/ Helper function for parse_bracket_exp.*
3565	Build the character class which is represented by NAME.
3566	The result are written to MBCSET and SBCSET.
3567	CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3568	is a pointer argument since we may update it. /*
3569
3570	static reg_errcode_t
3571	#ifdef RE_ENABLE_I18N
3572	build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3573	re_charset_t mbcset, Idx char_class_alloc,
3574	const char *class_name, reg_syntax_t syntax)
3575	#else /* not RE_ENABLE_I18N */
3576	build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3577	const char *class_name, reg_syntax_t syntax)
3578	#endif /* not RE_ENABLE_I18N */
3579	{
3580	int i;
3581	const char *name = class_name;
3582
3583	/ In case of REG_ICASE "upper" and "lower" match the both of*
3584	upper and lower cases. /*
3585	if ((syntax & RE_ICASE)
3586	&& (strcmp (name, "upper") == `0` \|\| strcmp (name, "lower") == `0`))
3587	name = "alpha";
3588
3589	#ifdef RE_ENABLE_I18N
3590	/ Check the space of the arrays. /
3591	if (__glibc_unlikely (*char_class_alloc == mbcset->nchar_classes))
3592	{
3593	/ Not enough, realloc it. /
3594	/ +1 in case of mbcset->nchar_classes is 0. /
3595	Idx new_char_class_alloc = `2` * mbcset->nchar_classes + `1`;
3596	/ Use realloc since array is NULL if alloc == 0. /*
3597	wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3598	new_char_class_alloc);
3599	if (__glibc_unlikely (new_char_classes == NULL))
3600	return REG_ESPACE;
3601	mbcset->char_classes = new_char_classes;
3602	*char_class_alloc = new_char_class_alloc;
3603	}
3604	mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3605	#endif /* RE_ENABLE_I18N */
3606
3607	#define BUILD_CHARCLASS_LOOP(ctype_func) \
3608	do { \
3609	if (__glibc_unlikely (trans != NULL)) \
3610	{ \
3611	for (i = 0; i < SBC_MAX; ++i) \
3612	if (ctype_func (i)) \
3613	bitset_set (sbcset, trans[i]); \
3614	} \
3615	else \
3616	{ \
3617	for (i = 0; i < SBC_MAX; ++i) \
3618	if (ctype_func (i)) \
3619	bitset_set (sbcset, i); \
3620	} \
3621	} while (0)
3622
3623	if (strcmp (name, "alnum") == `0`)
3624	BUILD_CHARCLASS_LOOP (isalnum);
3625	else if (strcmp (name, "cntrl") == `0`)
3626	BUILD_CHARCLASS_LOOP (iscntrl);
3627	else if (strcmp (name, "lower") == `0`)
3628	BUILD_CHARCLASS_LOOP (islower);
3629	else if (strcmp (name, "space") == `0`)
3630	BUILD_CHARCLASS_LOOP (isspace);
3631	else if (strcmp (name, "alpha") == `0`)
3632	BUILD_CHARCLASS_LOOP (isalpha);
3633	else if (strcmp (name, "digit") == `0`)
3634	BUILD_CHARCLASS_LOOP (isdigit);
3635	else if (strcmp (name, "print") == `0`)
3636	BUILD_CHARCLASS_LOOP (isprint);
3637	else if (strcmp (name, "upper") == `0`)
3638	BUILD_CHARCLASS_LOOP (isupper);
3639	else if (strcmp (name, "blank") == `0`)
3640	BUILD_CHARCLASS_LOOP (isblank);
3641	else if (strcmp (name, "graph") == `0`)
3642	BUILD_CHARCLASS_LOOP (isgraph);
3643	else if (strcmp (name, "punct") == `0`)
3644	BUILD_CHARCLASS_LOOP (ispunct);
3645	else if (strcmp (name, "xdigit") == `0`)
3646	BUILD_CHARCLASS_LOOP (isxdigit);
3647	else
3648	return REG_ECTYPE;
3649
3650	return REG_NOERROR;
3651	}
3652
3653	static bin_tree_t *
3654	build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3655	const char *class_name,
3656	const char *extra, bool non_match,
3657	reg_errcode_t *err)
3658	{
3659	re_bitset_ptr_t sbcset;
3660	#ifdef RE_ENABLE_I18N
3661	re_charset_t *mbcset;
3662	Idx alloc = `0`;
3663	#endif /* not RE_ENABLE_I18N */
3664	reg_errcode_t ret;
3665	re_token_t br_token;
3666	bin_tree_t *tree;
3667
3668	sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), `1`);
3669	if (__glibc_unlikely (sbcset == NULL))
3670	{
3671	*err = REG_ESPACE;
3672	return NULL;
3673	}
3674	#ifdef RE_ENABLE_I18N
3675	mbcset = (re_charset_t ) calloc (sizeof* (re_charset_t), `1`);
3676	if (__glibc_unlikely (mbcset == NULL))
3677	{
3678	re_free (sbcset);
3679	*err = REG_ESPACE;
3680	return NULL;
3681	}
3682	mbcset->non_match = non_match;
3683	#endif /* RE_ENABLE_I18N */
3684
3685	/ We don't care the syntax in this case. /
3686	ret = build_charclass (trans, sbcset,
3687	#ifdef RE_ENABLE_I18N
3688	mbcset, &alloc,
3689	#endif /* RE_ENABLE_I18N */
3690	class_name, `0`);
3691
3692	if (__glibc_unlikely (ret != REG_NOERROR))
3693	{
3694	re_free (sbcset);
3695	#ifdef RE_ENABLE_I18N
3696	free_charset (mbcset);
3697	#endif /* RE_ENABLE_I18N */
3698	*err = ret;
3699	return NULL;
3700	}
3701	/ \w match '_' also. /
3702	for (; *extra; extra++)
3703	bitset_set (sbcset, *extra);
3704
3705	/ If it is non-matching list. /
3706	if (non_match)
3707	bitset_not (sbcset);
3708
3709	#ifdef RE_ENABLE_I18N
3710	/ Ensure only single byte characters are set. /
3711	if (dfa->mb_cur_max > `1`)
3712	bitset_mask (sbcset, dfa->sb_char);
3713	#endif
3714
3715	/ Build a tree for simple bracket. /
3716	#if defined GCC_LINT \|\| defined lint
3717	memset (&br_token, `0`, sizeof br_token);
3718	#endif
3719	br_token.type = SIMPLE_BRACKET;
3720	br_token.opr.sbcset = sbcset;
3721	tree = create_token_tree (dfa, NULL, NULL, &br_token);
3722	if (__glibc_unlikely (tree == NULL))
3723	goto build_word_op_espace;
3724
3725	#ifdef RE_ENABLE_I18N
3726	if (dfa->mb_cur_max > `1`)
3727	{
3728	bin_tree_t *mbc_tree;
3729	/ Build a tree for complex bracket. /
3730	br_token.type = COMPLEX_BRACKET;
3731	br_token.opr.mbcset = mbcset;
3732	dfa->has_mb_node = `1`;
3733	mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3734	if (__glibc_unlikely (mbc_tree == NULL))
3735	goto build_word_op_espace;
3736	/ Then join them by ALT node. /
3737	tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3738	if (__glibc_likely (mbc_tree != NULL))
3739	return tree;
3740	}
3741	else
3742	{
3743	free_charset (mbcset);
3744	return tree;
3745	}
3746	#else /* not RE_ENABLE_I18N */
3747	return tree;
3748	#endif /* not RE_ENABLE_I18N */
3749
3750	build_word_op_espace:
3751	re_free (sbcset);
3752	#ifdef RE_ENABLE_I18N
3753	free_charset (mbcset);
3754	#endif /* RE_ENABLE_I18N */
3755	*err = REG_ESPACE;
3756	return NULL;
3757	}
3758
3759	/ This is intended for the expressions like "a{1,3}".*
3760	Fetch a number from 'input', and return the number.
3761	Return -1 if the number field is empty like "{,1}".
3762	Return RE_DUP_MAX + 1 if the number field is too large.
3763	Return -2 if an error occurred. /*
3764
3765	static Idx
3766	fetch_number (re_string_t input, re_token_t token, reg_syntax_t syntax)
3767	{
3768	Idx num = -`1`;
3769	unsigned char c;
3770	while (`1`)
3771	{
3772	fetch_token (token, input, syntax);
3773	c = token->opr.c;
3774	if (__glibc_unlikely (token->type == END_OF_RE))
3775	return -`2`;
3776	if (token->type == OP_CLOSE_DUP_NUM \|\| c == `','`)
3777	break;
3778	num = ((token->type != CHARACTER \|\| c < `'0'` \|\| `'9'` < c \|\| num == -`2`)
3779	? -`2`
3780	: num == -`1`
3781	? c - `'0'`
3782	: MIN (RE_DUP_MAX + `1`, num * `10` + c - `'0'`));
3783	}
3784	return num;
3785	}
3786
3787	#ifdef RE_ENABLE_I18N
3788	static void
3789	free_charset (re_charset_t *cset)
3790	{
3791	re_free (cset->mbchars);
3792	# ifdef _LIBC
3793	re_free (cset->coll_syms);
3794	re_free (cset->equiv_classes);
3795	# endif
3796	re_free (cset->range_starts);
3797	re_free (cset->range_ends);
3798	re_free (cset->char_classes);
3799	re_free (cset);
3800	}
3801	#endif /* RE_ENABLE_I18N */
3802
3803	/ Functions for binary tree operation. /
3804
3805	/ Create a tree node. /
3806
3807	static bin_tree_t *
3808	create_tree (re_dfa_t dfa, bin_tree_t left, bin_tree_t *right,
3809	re_token_type_t type)
3810	{
3811	re_token_t t;
3812	#if defined GCC_LINT \|\| defined lint
3813	memset (&t, `0`, sizeof t);
3814	#endif
3815	t.type = type;
3816	return create_token_tree (dfa, left, right, &t);
3817	}
3818
3819	static bin_tree_t *
3820	create_token_tree (re_dfa_t dfa, bin_tree_t left, bin_tree_t *right,
3821	const re_token_t *token)
3822	{
3823	bin_tree_t *tree;
3824	if (__glibc_unlikely (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE))
3825	{
3826	bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, `1`);
3827
3828	if (storage == NULL)
3829	return NULL;
3830	storage->next = dfa->str_tree_storage;
3831	dfa->str_tree_storage = storage;
3832	dfa->str_tree_storage_idx = `0`;
3833	}
3834	tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3835
3836	tree->parent = NULL;
3837	tree->left = left;
3838	tree->right = right;
3839	tree->token = *token;
3840	tree->token.duplicated = `0`;
3841	tree->token.opt_subexp = `0`;
3842	tree->first = NULL;
3843	tree->next = NULL;
3844	tree->node_idx = -`1`;
3845
3846	if (left != NULL)
3847	left->parent = tree;
3848	if (right != NULL)
3849	right->parent = tree;
3850	return tree;
3851	}
3852
3853	/ Mark the tree SRC as an optional subexpression.*
3854	To be called from preorder or postorder. /*
3855
3856	static reg_errcode_t
3857	mark_opt_subexp (void extra, bin_tree_t node)
3858	{
3859	Idx idx = (uintptr_t) extra;
3860	if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3861	node->token.opt_subexp = `1`;
3862
3863	return REG_NOERROR;
3864	}
3865
3866	/ Free the allocated memory inside NODE. /
3867
3868	static void
3869	free_token (re_token_t *node)
3870	{
3871	#ifdef RE_ENABLE_I18N
3872	if (node->type == COMPLEX_BRACKET && node->duplicated == `0`)
3873	free_charset (node->opr.mbcset);
3874	else
3875	#endif /* RE_ENABLE_I18N */
3876	if (node->type == SIMPLE_BRACKET && node->duplicated == `0`)
3877	re_free (node->opr.sbcset);
3878	}
3879
3880	/ Worker function for tree walking. Free the allocated memory inside NODE*
3881	and its children. /*
3882
3883	static reg_errcode_t
3884	free_tree (void extra, bin_tree_t node)
3885	{
3886	free_token (&node->token);
3887	return REG_NOERROR;
3888	}
3889
3890
3891	/ Duplicate the node SRC, and return new node. This is a preorder*
3892	visit similar to the one implemented by the generic visitor, but
3893	we need more infrastructure to maintain two parallel trees --- so,
3894	it's easier to duplicate. /*
3895
3896	static bin_tree_t *
3897	duplicate_tree (const bin_tree_t root, re_dfa_t dfa)
3898	{
3899	const bin_tree_t *node;
3900	bin_tree_t *dup_root;
3901	bin_tree_t *p_new = &dup_root, dup_node = root->parent;
3902
3903	for (node = root; ; )
3904	{
3905	/ Create a new tree and link it back to the current parent. /
3906	*p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3907	if (*p_new == NULL)
3908	return NULL;
3909	(*p_new)->parent = dup_node;
3910	(*p_new)->token.duplicated = `1`;
3911	dup_node = *p_new;
3912
3913	/ Go to the left node, or up and to the right. /
3914	if (node->left)
3915	{
3916	node = node->left;
3917	p_new = &dup_node->left;
3918	}
3919	else
3920	{
3921	const bin_tree_t *prev = NULL;
3922	while (node->right == prev \|\| node->right == NULL)
3923	{
3924	prev = node;
3925	node = node->parent;
3926	dup_node = dup_node->parent;
3927	if (!node)
3928	return dup_root;
3929	}
3930	node = node->right;
3931	p_new = &dup_node->right;
3932	}
3933	}
3934	}
3935

Browse the source code of glibc_src_2.30/posix/regcomp.c