regex_internal.c source code [glibc_src_2.25/posix/regex_internal.c]

1	/ Extended regular expression matching and search library.*
2	Copyright (C) 2002-2017 Free Software Foundation, Inc.
3	This file is part of the GNU C Library.
4	Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6	The GNU C Library is free software; you can redistribute it and/or
7	modify it under the terms of the GNU Lesser General Public
8	License as published by the Free Software Foundation; either
9	version 2.1 of the License, or (at your option) any later version.
10
11	The GNU C Library is distributed in the hope that it will be useful,
12	but WITHOUT ANY WARRANTY; without even the implied warranty of
13	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14	Lesser General Public License for more details.
15
16	You should have received a copy of the GNU Lesser General Public
17	License along with the GNU C Library; if not, see
18	<http://www.gnu.org/licenses/>. /*
19
20	static void re_string_construct_common (const char str, int* len,
21	re_string_t *pstr,
22	RE_TRANSLATE_TYPE trans, int icase,
23	const re_dfa_t *dfa) internal_function;
24	static re_dfastate_t create_ci_newstate (const* re_dfa_t *dfa,
25	const re_node_set *nodes,
26	unsigned int hash) internal_function;
27	static re_dfastate_t create_cd_newstate (const* re_dfa_t *dfa,
28	const re_node_set *nodes,
29	unsigned int context,
30	unsigned int hash) internal_function;
31
32	/ Functions for string operation. /
33
34	/ This function allocate the buffers. It is necessary to call*
35	re_string_reconstruct before using the object. /*
36
37	static reg_errcode_t
38	internal_function __attribute_warn_unused_result__
39	re_string_allocate (re_string_t pstr, const* char str, int* len, int init_len,
40	RE_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa)
41	{
42	reg_errcode_t ret;
43	int init_buf_len;
44
45	/ Ensure at least one character fits into the buffers. /
46	if (init_len < dfa->mb_cur_max)
47	init_len = dfa->mb_cur_max;
48	init_buf_len = (len + `1` < init_len) ? len + `1`: init_len;
49	re_string_construct_common (str, len, pstr, trans, icase, dfa);
50
51	ret = re_string_realloc_buffers (pstr, init_buf_len);
52	if (BE (ret != REG_NOERROR, `0`))
53	return ret;
54
55	pstr->word_char = dfa->word_char;
56	pstr->word_ops_used = dfa->word_ops_used;
57	pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
58	pstr->valid_len = (pstr->mbs_allocated \|\| dfa->mb_cur_max > `1`) ? `0` : len;
59	pstr->valid_raw_len = pstr->valid_len;
60	return REG_NOERROR;
61	}
62
63	/ This function allocate the buffers, and initialize them. /
64
65	static reg_errcode_t
66	internal_function __attribute_warn_unused_result__
67	re_string_construct (re_string_t pstr, const* char str, int* len,
68	RE_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa)
69	{
70	reg_errcode_t ret;
71	memset (pstr, `'\0'`, sizeof (re_string_t));
72	re_string_construct_common (str, len, pstr, trans, icase, dfa);
73
74	if (len > `0`)
75	{
76	ret = re_string_realloc_buffers (pstr, len + `1`);
77	if (BE (ret != REG_NOERROR, `0`))
78	return ret;
79	}
80	pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
81
82	if (icase)
83	{
84	#ifdef RE_ENABLE_I18N
85	if (dfa->mb_cur_max > `1`)
86	{
87	while (`1`)
88	{
89	ret = build_wcs_upper_buffer (pstr);
90	if (BE (ret != REG_NOERROR, `0`))
91	return ret;
92	if (pstr->valid_raw_len >= len)
93	break;
94	if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
95	break;
96	ret = re_string_realloc_buffers (pstr, pstr->bufs_len * `2`);
97	if (BE (ret != REG_NOERROR, `0`))
98	return ret;
99	}
100	}
101	else
102	#endif /* RE_ENABLE_I18N */
103	build_upper_buffer (pstr);
104	}
105	else
106	{
107	#ifdef RE_ENABLE_I18N
108	if (dfa->mb_cur_max > `1`)
109	build_wcs_buffer (pstr);
110	else
111	#endif /* RE_ENABLE_I18N */
112	{
113	if (trans != NULL)
114	re_string_translate_buffer (pstr);
115	else
116	{
117	pstr->valid_len = pstr->bufs_len;
118	pstr->valid_raw_len = pstr->bufs_len;
119	}
120	}
121	}
122
123	return REG_NOERROR;
124	}
125
126	/ Helper functions for re_string_allocate, and re_string_construct. /
127
128	static reg_errcode_t
129	internal_function __attribute_warn_unused_result__
130	re_string_realloc_buffers (re_string_t pstr, int* new_buf_len)
131	{
132	#ifdef RE_ENABLE_I18N
133	if (pstr->mb_cur_max > `1`)
134	{
135	wint_t *new_wcs;
136
137	/ Avoid overflow in realloc. /
138	const size_t max_object_size = MAX (sizeof (wint_t), sizeof (int));
139	if (BE (SIZE_MAX / max_object_size < new_buf_len, `0`))
140	return REG_ESPACE;
141
142	new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
143	if (BE (new_wcs == NULL, `0`))
144	return REG_ESPACE;
145	pstr->wcs = new_wcs;
146	if (pstr->offsets != NULL)
147	{
148	int new_offsets = re_realloc (pstr->offsets, int*, new_buf_len);
149	if (BE (new_offsets == NULL, `0`))
150	return REG_ESPACE;
151	pstr->offsets = new_offsets;
152	}
153	}
154	#endif /* RE_ENABLE_I18N */
155	if (pstr->mbs_allocated)
156	{
157	unsigned char new_mbs = re_realloc (pstr->mbs, unsigned* char,
158	new_buf_len);
159	if (BE (new_mbs == NULL, `0`))
160	return REG_ESPACE;
161	pstr->mbs = new_mbs;
162	}
163	pstr->bufs_len = new_buf_len;
164	return REG_NOERROR;
165	}
166
167
168	static void
169	internal_function
170	re_string_construct_common (const char str, int* len, re_string_t *pstr,
171	RE_TRANSLATE_TYPE trans, int icase,
172	const re_dfa_t *dfa)
173	{
174	pstr->raw_mbs = (const unsigned char *) str;
175	pstr->len = len;
176	pstr->raw_len = len;
177	pstr->trans = trans;
178	pstr->icase = icase ? `1` : `0`;
179	pstr->mbs_allocated = (trans != NULL \|\| icase);
180	pstr->mb_cur_max = dfa->mb_cur_max;
181	pstr->is_utf8 = dfa->is_utf8;
182	pstr->map_notascii = dfa->map_notascii;
183	pstr->stop = pstr->len;
184	pstr->raw_stop = pstr->stop;
185	}
186
187	#ifdef RE_ENABLE_I18N
188
189	/ Build wide character buffer PSTR->WCS.*
190	If the byte sequence of the string are:
191	<mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
192	Then wide character buffer will be:
193	<wc1> , WEOF , <wc2> , WEOF , <wc3>
194	We use WEOF for padding, they indicate that the position isn't
195	a first byte of a multibyte character.
196
197	Note that this function assumes PSTR->VALID_LEN elements are already
198	built and starts from PSTR->VALID_LEN. /*
199
200	static void
201	internal_function
202	build_wcs_buffer (re_string_t *pstr)
203	{
204	#ifdef _LIBC
205	unsigned char buf[MB_LEN_MAX];
206	assert (MB_LEN_MAX >= pstr->mb_cur_max);
207	#else
208	unsigned char buf[`64`];
209	#endif
210	mbstate_t prev_st;
211	int byte_idx, end_idx, remain_len;
212	size_t mbclen;
213
214	/ Build the buffers from pstr->valid_len to either pstr->len or*
215	pstr->bufs_len. /*
216	end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
217	for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
218	{
219	wchar_t wc;
220	const char *p;
221
222	remain_len = end_idx - byte_idx;
223	prev_st = pstr->cur_state;
224	/ Apply the translation if we need. /
225	if (BE (pstr->trans != NULL, `0`))
226	{
227	int i, ch;
228
229	for (i = `0`; i < pstr->mb_cur_max && i < remain_len; ++i)
230	{
231	ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
232	buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
233	}
234	p = (const char *) buf;
235	}
236	else
237	p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
238	mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
239	if (BE (mbclen == (size_t) -`1` \|\| mbclen == `0`
240	\|\| (mbclen == (size_t) -`2` && pstr->bufs_len >= pstr->len), `0`))
241	{
242	/ We treat these cases as a singlebyte character. /
243	mbclen = `1`;
244	wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
245	if (BE (pstr->trans != NULL, `0`))
246	wc = pstr->trans[wc];
247	pstr->cur_state = prev_st;
248	}
249	else if (BE (mbclen == (size_t) -`2`, `0`))
250	{
251	/ The buffer doesn't have enough space, finish to build. /
252	pstr->cur_state = prev_st;
253	break;
254	}
255
256	/ Write wide character and padding. /
257	pstr->wcs[byte_idx++] = wc;
258	/ Write paddings. /
259	for (remain_len = byte_idx + mbclen - `1`; byte_idx < remain_len ;)
260	pstr->wcs[byte_idx++] = WEOF;
261	}
262	pstr->valid_len = byte_idx;
263	pstr->valid_raw_len = byte_idx;
264	}
265
266	/ Build wide character buffer PSTR->WCS like build_wcs_buffer,*
267	but for REG_ICASE. /*
268
269	static reg_errcode_t
270	internal_function __attribute_warn_unused_result__
271	build_wcs_upper_buffer (re_string_t *pstr)
272	{
273	mbstate_t prev_st;
274	int src_idx, byte_idx, end_idx, remain_len;
275	size_t mbclen;
276	#ifdef _LIBC
277	char buf[MB_LEN_MAX];
278	assert (MB_LEN_MAX >= pstr->mb_cur_max);
279	#else
280	char buf[`64`];
281	#endif
282
283	byte_idx = pstr->valid_len;
284	end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
285
286	/ The following optimization assumes that ASCII characters can be*
287	mapped to wide characters with a simple cast. /*
288	if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
289	{
290	while (byte_idx < end_idx)
291	{
292	wchar_t wc;
293
294	if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
295	&& mbsinit (&pstr->cur_state))
296	{
297	/ In case of a singlebyte character. /
298	pstr->mbs[byte_idx]
299	= toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
300	/ The next step uses the assumption that wchar_t is encoded*
301	ASCII-safe: all ASCII values can be converted like this. /*
302	pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
303	++byte_idx;
304	continue;
305	}
306
307	remain_len = end_idx - byte_idx;
308	prev_st = pstr->cur_state;
309	mbclen = __mbrtowc (&wc,
310	((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
311	+ byte_idx), remain_len, &pstr->cur_state);
312	if (BE (mbclen + `2` > `2`, `1`))
313	{
314	wchar_t wcu = wc;
315	if (__iswlower (wc))
316	{
317	size_t mbcdlen;
318
319	wcu = __towupper (wc);
320	mbcdlen = __wcrtomb (buf, wcu, &prev_st);
321	if (BE (mbclen == mbcdlen, `1`))
322	memcpy (pstr->mbs + byte_idx, buf, mbclen);
323	else
324	{
325	src_idx = byte_idx;
326	goto offsets_needed;
327	}
328	}
329	else
330	memcpy (pstr->mbs + byte_idx,
331	pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
332	pstr->wcs[byte_idx++] = wcu;
333	/ Write paddings. /
334	for (remain_len = byte_idx + mbclen - `1`; byte_idx < remain_len ;)
335	pstr->wcs[byte_idx++] = WEOF;
336	}
337	else if (mbclen == (size_t) -`1` \|\| mbclen == `0`
338	\|\| (mbclen == (size_t) -`2` && pstr->bufs_len >= pstr->len))
339	{
340	/ It is an invalid character, an incomplete character*
341	at the end of the string, or '\0'. Just use the byte. /*
342	int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
343	pstr->mbs[byte_idx] = ch;
344	/ And also cast it to wide char. /
345	pstr->wcs[byte_idx++] = (wchar_t) ch;
346	if (BE (mbclen == (size_t) -`1`, `0`))
347	pstr->cur_state = prev_st;
348	}
349	else
350	{
351	/ The buffer doesn't have enough space, finish to build. /
352	pstr->cur_state = prev_st;
353	break;
354	}
355	}
356	pstr->valid_len = byte_idx;
357	pstr->valid_raw_len = byte_idx;
358	return REG_NOERROR;
359	}
360	else
361	for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
362	{
363	wchar_t wc;
364	const char *p;
365	offsets_needed:
366	remain_len = end_idx - byte_idx;
367	prev_st = pstr->cur_state;
368	if (BE (pstr->trans != NULL, `0`))
369	{
370	int i, ch;
371
372	for (i = `0`; i < pstr->mb_cur_max && i < remain_len; ++i)
373	{
374	ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
375	buf[i] = pstr->trans[ch];
376	}
377	p = (const char *) buf;
378	}
379	else
380	p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
381	mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
382	if (BE (mbclen + `2` > `2`, `1`))
383	{
384	wchar_t wcu = wc;
385	if (__iswlower (wc))
386	{
387	size_t mbcdlen;
388
389	wcu = __towupper (wc);
390	mbcdlen = __wcrtomb ((char *) buf, wcu, &prev_st);
391	if (BE (mbclen == mbcdlen, `1`))
392	memcpy (pstr->mbs + byte_idx, buf, mbclen);
393	else if (mbcdlen != (size_t) -`1`)
394	{
395	size_t i;
396
397	if (byte_idx + mbcdlen > pstr->bufs_len)
398	{
399	pstr->cur_state = prev_st;
400	break;
401	}
402
403	if (pstr->offsets == NULL)
404	{
405	pstr->offsets = re_malloc (int, pstr->bufs_len);
406
407	if (pstr->offsets == NULL)
408	return REG_ESPACE;
409	}
410	if (!pstr->offsets_needed)
411	{
412	for (i = `0`; i < (size_t) byte_idx; ++i)
413	pstr->offsets[i] = i;
414	pstr->offsets_needed = `1`;
415	}
416
417	memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
418	pstr->wcs[byte_idx] = wcu;
419	pstr->offsets[byte_idx] = src_idx;
420	for (i = `1`; i < mbcdlen; ++i)
421	{
422	pstr->offsets[byte_idx + i]
423	= src_idx + (i < mbclen ? i : mbclen - `1`);
424	pstr->wcs[byte_idx + i] = WEOF;
425	}
426	pstr->len += mbcdlen - mbclen;
427	if (pstr->raw_stop > src_idx)
428	pstr->stop += mbcdlen - mbclen;
429	end_idx = (pstr->bufs_len > pstr->len)
430	? pstr->len : pstr->bufs_len;
431	byte_idx += mbcdlen;
432	src_idx += mbclen;
433	continue;
434	}
435	else
436	memcpy (pstr->mbs + byte_idx, p, mbclen);
437	}
438	else
439	memcpy (pstr->mbs + byte_idx, p, mbclen);
440
441	if (BE (pstr->offsets_needed != `0`, `0`))
442	{
443	size_t i;
444	for (i = `0`; i < mbclen; ++i)
445	pstr->offsets[byte_idx + i] = src_idx + i;
446	}
447	src_idx += mbclen;
448
449	pstr->wcs[byte_idx++] = wcu;
450	/ Write paddings. /
451	for (remain_len = byte_idx + mbclen - `1`; byte_idx < remain_len ;)
452	pstr->wcs[byte_idx++] = WEOF;
453	}
454	else if (mbclen == (size_t) -`1` \|\| mbclen == `0`
455	\|\| (mbclen == (size_t) -`2` && pstr->bufs_len >= pstr->len))
456	{
457	/ It is an invalid character or '\0'. Just use the byte. /
458	int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
459
460	if (BE (pstr->trans != NULL, `0`))
461	ch = pstr->trans [ch];
462	pstr->mbs[byte_idx] = ch;
463
464	if (BE (pstr->offsets_needed != `0`, `0`))
465	pstr->offsets[byte_idx] = src_idx;
466	++src_idx;
467
468	/ And also cast it to wide char. /
469	pstr->wcs[byte_idx++] = (wchar_t) ch;
470	if (BE (mbclen == (size_t) -`1`, `0`))
471	pstr->cur_state = prev_st;
472	}
473	else
474	{
475	/ The buffer doesn't have enough space, finish to build. /
476	pstr->cur_state = prev_st;
477	break;
478	}
479	}
480	pstr->valid_len = byte_idx;
481	pstr->valid_raw_len = src_idx;
482	return REG_NOERROR;
483	}
484
485	/ Skip characters until the index becomes greater than NEW_RAW_IDX.*
486	Return the index. /*
487
488	static int
489	internal_function
490	re_string_skip_chars (re_string_t pstr, int* new_raw_idx, wint_t *last_wc)
491	{
492	mbstate_t prev_st;
493	int rawbuf_idx;
494	size_t mbclen;
495	wint_t wc = WEOF;
496
497	/ Skip the characters which are not necessary to check. /
498	for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
499	rawbuf_idx < new_raw_idx;)
500	{
501	wchar_t wc2;
502	int remain_len = pstr->raw_len - rawbuf_idx;
503	prev_st = pstr->cur_state;
504	mbclen = __mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx,
505	remain_len, &pstr->cur_state);
506	if (BE ((ssize_t) mbclen <= `0`, `0`))
507	{
508	/ We treat these cases as a single byte character. /
509	if (mbclen == `0` \|\| remain_len == `0`)
510	wc = L`'\0'`;
511	else
512	wc = (unsigned* char *) (pstr->raw_mbs + rawbuf_idx);
513	mbclen = `1`;
514	pstr->cur_state = prev_st;
515	}
516	else
517	wc = (wint_t) wc2;
518	/ Then proceed the next character. /
519	rawbuf_idx += mbclen;
520	}
521	*last_wc = wc;
522	return rawbuf_idx;
523	}
524	#endif /* RE_ENABLE_I18N */
525
526	/ Build the buffer PSTR->MBS, and apply the translation if we need.*
527	This function is used in case of REG_ICASE. /*
528
529	static void
530	internal_function
531	build_upper_buffer (re_string_t *pstr)
532	{
533	int char_idx, end_idx;
534	end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
535
536	for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
537	{
538	int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
539	if (BE (pstr->trans != NULL, `0`))
540	ch = pstr->trans[ch];
541	if (islower (ch))
542	pstr->mbs[char_idx] = toupper (ch);
543	else
544	pstr->mbs[char_idx] = ch;
545	}
546	pstr->valid_len = char_idx;
547	pstr->valid_raw_len = char_idx;
548	}
549
550	/ Apply TRANS to the buffer in PSTR. /
551
552	static void
553	internal_function
554	re_string_translate_buffer (re_string_t *pstr)
555	{
556	int buf_idx, end_idx;
557	end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
558
559	for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
560	{
561	int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
562	pstr->mbs[buf_idx] = pstr->trans[ch];
563	}
564
565	pstr->valid_len = buf_idx;
566	pstr->valid_raw_len = buf_idx;
567	}
568
569	/ This function re-construct the buffers.*
570	Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
571	convert to upper case in case of REG_ICASE, apply translation. /*
572
573	static reg_errcode_t
574	internal_function __attribute_warn_unused_result__
575	re_string_reconstruct (re_string_t pstr, int* idx, int eflags)
576	{
577	int offset = idx - pstr->raw_mbs_idx;
578	if (BE (offset < `0`, `0`))
579	{
580	/ Reset buffer. /
581	#ifdef RE_ENABLE_I18N
582	if (pstr->mb_cur_max > `1`)
583	memset (&pstr->cur_state, `'\0'`, sizeof (mbstate_t));
584	#endif /* RE_ENABLE_I18N */
585	pstr->len = pstr->raw_len;
586	pstr->stop = pstr->raw_stop;
587	pstr->valid_len = `0`;
588	pstr->raw_mbs_idx = `0`;
589	pstr->valid_raw_len = `0`;
590	pstr->offsets_needed = `0`;
591	pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
592	: CONTEXT_NEWLINE \| CONTEXT_BEGBUF);
593	if (!pstr->mbs_allocated)
594	pstr->mbs = (unsigned char *) pstr->raw_mbs;
595	offset = idx;
596	}
597
598	if (BE (offset != `0`, `1`))
599	{
600	/ Should the already checked characters be kept? /
601	if (BE (offset < pstr->valid_raw_len, `1`))
602	{
603	/ Yes, move them to the front of the buffer. /
604	#ifdef RE_ENABLE_I18N
605	if (BE (pstr->offsets_needed, `0`))
606	{
607	int low = `0`, high = pstr->valid_len, mid;
608	do
609	{
610	mid = (high + low) / `2`;
611	if (pstr->offsets[mid] > offset)
612	high = mid;
613	else if (pstr->offsets[mid] < offset)
614	low = mid + `1`;
615	else
616	break;
617	}
618	while (low < high);
619	if (pstr->offsets[mid] < offset)
620	++mid;
621	pstr->tip_context = re_string_context_at (pstr, mid - `1`,
622	eflags);
623	/ This can be quite complicated, so handle specially*
624	only the common and easy case where the character with
625	different length representation of lower and upper
626	case is present at or after offset. /*
627	if (pstr->valid_len > offset
628	&& mid == offset && pstr->offsets[mid] == offset)
629	{
630	memmove (pstr->wcs, pstr->wcs + offset,
631	(pstr->valid_len - offset) * sizeof (wint_t));
632	memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
633	pstr->valid_len -= offset;
634	pstr->valid_raw_len -= offset;
635	for (low = `0`; low < pstr->valid_len; low++)
636	pstr->offsets[low] = pstr->offsets[low + offset] - offset;
637	}
638	else
639	{
640	/ Otherwise, just find out how long the partial multibyte*
641	character at offset is and fill it with WEOF/255. /*
642	pstr->len = pstr->raw_len - idx + offset;
643	pstr->stop = pstr->raw_stop - idx + offset;
644	pstr->offsets_needed = `0`;
645	while (mid > `0` && pstr->offsets[mid - `1`] == offset)
646	--mid;
647	while (mid < pstr->valid_len)
648	if (pstr->wcs[mid] != WEOF)
649	break;
650	else
651	++mid;
652	if (mid == pstr->valid_len)
653	pstr->valid_len = `0`;
654	else
655	{
656	pstr->valid_len = pstr->offsets[mid] - offset;
657	if (pstr->valid_len)
658	{
659	for (low = `0`; low < pstr->valid_len; ++low)
660	pstr->wcs[low] = WEOF;
661	memset (pstr->mbs, `255`, pstr->valid_len);
662	}
663	}
664	pstr->valid_raw_len = pstr->valid_len;
665	}
666	}
667	else
668	#endif
669	{
670	pstr->tip_context = re_string_context_at (pstr, offset - `1`,
671	eflags);
672	#ifdef RE_ENABLE_I18N
673	if (pstr->mb_cur_max > `1`)
674	memmove (pstr->wcs, pstr->wcs + offset,
675	(pstr->valid_len - offset) * sizeof (wint_t));
676	#endif /* RE_ENABLE_I18N */
677	if (BE (pstr->mbs_allocated, `0`))
678	memmove (pstr->mbs, pstr->mbs + offset,
679	pstr->valid_len - offset);
680	pstr->valid_len -= offset;
681	pstr->valid_raw_len -= offset;
682	#if defined DEBUG && DEBUG
683	assert (pstr->valid_len > `0`);
684	#endif
685	}
686	}
687	else
688	{
689	/ No, skip all characters until IDX. /
690	int prev_valid_len = pstr->valid_len;
691
692	#ifdef RE_ENABLE_I18N
693	if (BE (pstr->offsets_needed, `0`))
694	{
695	pstr->len = pstr->raw_len - idx + offset;
696	pstr->stop = pstr->raw_stop - idx + offset;
697	pstr->offsets_needed = `0`;
698	}
699	#endif
700	pstr->valid_len = `0`;
701	#ifdef RE_ENABLE_I18N
702	if (pstr->mb_cur_max > `1`)
703	{
704	int wcs_idx;
705	wint_t wc = WEOF;
706
707	if (pstr->is_utf8)
708	{
709	const unsigned char raw, p, *end;
710
711	/ Special case UTF-8. Multi-byte chars start with any*
712	byte other than 0x80 - 0xbf. /*
713	raw = pstr->raw_mbs + pstr->raw_mbs_idx;
714	end = raw + (offset - pstr->mb_cur_max);
715	if (end < pstr->raw_mbs)
716	end = pstr->raw_mbs;
717	p = raw + offset - `1`;
718	#ifdef _LIBC
719	/ We know the wchar_t encoding is UCS4, so for the simple*
720	case, ASCII characters, skip the conversion step. /*
721	if (isascii (*p) && BE (pstr->trans == NULL, `1`))
722	{
723	memset (&pstr->cur_state, `'\0'`, sizeof (mbstate_t));
724	/ pstr->valid_len = 0; /
725	wc = (wchar_t) *p;
726	}
727	else
728	#endif
729	for (; p >= end; --p)
730	if ((*p & `0xc0`) != `0x80`)
731	{
732	mbstate_t cur_state;
733	wchar_t wc2;
734	int mlen = raw + pstr->len - p;
735	unsigned char buf[`6`];
736	size_t mbclen;
737
738	const unsigned char *pp = p;
739	if (BE (pstr->trans != NULL, `0`))
740	{
741	int i = mlen < `6` ? mlen : `6`;
742	while (--i >= `0`)
743	buf[i] = pstr->trans[p[i]];
744	pp = buf;
745	}
746	/ XXX Don't use mbrtowc, we know which conversion*
747	to use (UTF-8 -> UCS4). /*
748	memset (&cur_state, `0`, sizeof (cur_state));
749	mbclen = __mbrtowc (&wc2, (const char *) pp, mlen,
750	&cur_state);
751	if (raw + offset - p <= mbclen
752	&& mbclen < (size_t) -`2`)
753	{
754	memset (&pstr->cur_state, `'\0'`,
755	sizeof (mbstate_t));
756	pstr->valid_len = mbclen - (raw + offset - p);
757	wc = wc2;
758	}
759	break;
760	}
761	}
762
763	if (wc == WEOF)
764	pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
765	if (wc == WEOF)
766	pstr->tip_context
767	= re_string_context_at (pstr, prev_valid_len - `1`, eflags);
768	else
769	pstr->tip_context = ((BE (pstr->word_ops_used != `0`, `0`)
770	&& IS_WIDE_WORD_CHAR (wc))
771	? CONTEXT_WORD
772	: ((IS_WIDE_NEWLINE (wc)
773	&& pstr->newline_anchor)
774	? CONTEXT_NEWLINE : `0`));
775	if (BE (pstr->valid_len, `0`))
776	{
777	for (wcs_idx = `0`; wcs_idx < pstr->valid_len; ++wcs_idx)
778	pstr->wcs[wcs_idx] = WEOF;
779	if (pstr->mbs_allocated)
780	memset (pstr->mbs, `255`, pstr->valid_len);
781	}
782	pstr->valid_raw_len = pstr->valid_len;
783	}
784	else
785	#endif /* RE_ENABLE_I18N */
786	{
787	int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - `1`];
788	pstr->valid_raw_len = `0`;
789	if (pstr->trans)
790	c = pstr->trans[c];
791	pstr->tip_context = (bitset_contain (pstr->word_char, c)
792	? CONTEXT_WORD
793	: ((IS_NEWLINE (c) && pstr->newline_anchor)
794	? CONTEXT_NEWLINE : `0`));
795	}
796	}
797	if (!BE (pstr->mbs_allocated, `0`))
798	pstr->mbs += offset;
799	}
800	pstr->raw_mbs_idx = idx;
801	pstr->len -= offset;
802	pstr->stop -= offset;
803
804	/ Then build the buffers. /
805	#ifdef RE_ENABLE_I18N
806	if (pstr->mb_cur_max > `1`)
807	{
808	if (pstr->icase)
809	{
810	reg_errcode_t ret = build_wcs_upper_buffer (pstr);
811	if (BE (ret != REG_NOERROR, `0`))
812	return ret;
813	}
814	else
815	build_wcs_buffer (pstr);
816	}
817	else
818	#endif /* RE_ENABLE_I18N */
819	if (BE (pstr->mbs_allocated, `0`))
820	{
821	if (pstr->icase)
822	build_upper_buffer (pstr);
823	else if (pstr->trans != NULL)
824	re_string_translate_buffer (pstr);
825	}
826	else
827	pstr->valid_len = pstr->len;
828
829	pstr->cur_idx = `0`;
830	return REG_NOERROR;
831	}
832
833	static unsigned char
834	internal_function __attribute ((pure))
835	re_string_peek_byte_case (const re_string_t pstr, int* idx)
836	{
837	int ch, off;
838
839	/ Handle the common (easiest) cases first. /
840	if (BE (!pstr->mbs_allocated, `1`))
841	return re_string_peek_byte (pstr, idx);
842
843	#ifdef RE_ENABLE_I18N
844	if (pstr->mb_cur_max > `1`
845	&& ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
846	return re_string_peek_byte (pstr, idx);
847	#endif
848
849	off = pstr->cur_idx + idx;
850	#ifdef RE_ENABLE_I18N
851	if (pstr->offsets_needed)
852	off = pstr->offsets[off];
853	#endif
854
855	ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
856
857	#ifdef RE_ENABLE_I18N
858	/ Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I*
859	this function returns CAPITAL LETTER I instead of first byte of
860	DOTLESS SMALL LETTER I. The latter would confuse the parser,
861	since peek_byte_case doesn't advance cur_idx in any way. /*
862	if (pstr->offsets_needed && !isascii (ch))
863	return re_string_peek_byte (pstr, idx);
864	#endif
865
866	return ch;
867	}
868
869	static unsigned char
870	internal_function
871	re_string_fetch_byte_case (re_string_t *pstr)
872	{
873	if (BE (!pstr->mbs_allocated, `1`))
874	return re_string_fetch_byte (pstr);
875
876	#ifdef RE_ENABLE_I18N
877	if (pstr->offsets_needed)
878	{
879	int off, ch;
880
881	/ For tr_TR.UTF-8 [[:islower:]] there is*
882	[[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip
883	in that case the whole multi-byte character and return
884	the original letter. On the other side, with
885	[[: DOTLESS SMALL LETTER I return [[:I, as doing
886	anything else would complicate things too much. /*
887
888	if (!re_string_first_byte (pstr, pstr->cur_idx))
889	return re_string_fetch_byte (pstr);
890
891	off = pstr->offsets[pstr->cur_idx];
892	ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
893
894	if (! isascii (ch))
895	return re_string_fetch_byte (pstr);
896
897	re_string_skip_bytes (pstr,
898	re_string_char_size_at (pstr, pstr->cur_idx));
899	return ch;
900	}
901	#endif
902
903	return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
904	}
905
906	static void
907	internal_function
908	re_string_destruct (re_string_t *pstr)
909	{
910	#ifdef RE_ENABLE_I18N
911	re_free (pstr->wcs);
912	re_free (pstr->offsets);
913	#endif /* RE_ENABLE_I18N */
914	if (pstr->mbs_allocated)
915	re_free (pstr->mbs);
916	}
917
918	/ Return the context at IDX in INPUT. /
919
920	static unsigned int
921	internal_function
922	re_string_context_at (const re_string_t input, int* idx, int eflags)
923	{
924	int c;
925	if (BE (idx < `0`, `0`))
926	/ In this case, we use the value stored in input->tip_context,*
927	since we can't know the character in input->mbs[-1] here. /*
928	return input->tip_context;
929	if (BE (idx == input->len, `0`))
930	return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
931	: CONTEXT_NEWLINE \| CONTEXT_ENDBUF);
932	#ifdef RE_ENABLE_I18N
933	if (input->mb_cur_max > `1`)
934	{
935	wint_t wc;
936	int wc_idx = idx;
937	while(input->wcs[wc_idx] == WEOF)
938	{
939	#if defined DEBUG && DEBUG
940	/ It must not happen. /
941	assert (wc_idx >= `0`);
942	#endif
943	--wc_idx;
944	if (wc_idx < `0`)
945	return input->tip_context;
946	}
947	wc = input->wcs[wc_idx];
948	if (BE (input->word_ops_used != `0`, `0`) && IS_WIDE_WORD_CHAR (wc))
949	return CONTEXT_WORD;
950	return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
951	? CONTEXT_NEWLINE : `0`);
952	}
953	else
954	#endif
955	{
956	c = re_string_byte_at (input, idx);
957	if (bitset_contain (input->word_char, c))
958	return CONTEXT_WORD;
959	return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : `0`;
960	}
961	}
962
963	/ Functions for set operation. /
964
965	static reg_errcode_t
966	internal_function __attribute_warn_unused_result__
967	re_node_set_alloc (re_node_set set, int* size)
968	{
969	set->alloc = size;
970	set->nelem = `0`;
971	set->elems = re_malloc (int, size);
972	if (BE (set->elems == NULL, `0`))
973	return REG_ESPACE;
974	return REG_NOERROR;
975	}
976
977	static reg_errcode_t
978	internal_function __attribute_warn_unused_result__
979	re_node_set_init_1 (re_node_set set, int* elem)
980	{
981	set->alloc = `1`;
982	set->nelem = `1`;
983	set->elems = re_malloc (int, `1`);
984	if (BE (set->elems == NULL, `0`))
985	{
986	set->alloc = set->nelem = `0`;
987	return REG_ESPACE;
988	}
989	set->elems[`0`] = elem;
990	return REG_NOERROR;
991	}
992
993	static reg_errcode_t
994	internal_function __attribute_warn_unused_result__
995	re_node_set_init_2 (re_node_set set, int* elem1, int elem2)
996	{
997	set->alloc = `2`;
998	set->elems = re_malloc (int, `2`);
999	if (BE (set->elems == NULL, `0`))
1000	return REG_ESPACE;
1001	if (elem1 == elem2)
1002	{
1003	set->nelem = `1`;
1004	set->elems[`0`] = elem1;
1005	}
1006	else
1007	{
1008	set->nelem = `2`;
1009	if (elem1 < elem2)
1010	{
1011	set->elems[`0`] = elem1;
1012	set->elems[`1`] = elem2;
1013	}
1014	else
1015	{
1016	set->elems[`0`] = elem2;
1017	set->elems[`1`] = elem1;
1018	}
1019	}
1020	return REG_NOERROR;
1021	}
1022
1023	static reg_errcode_t
1024	internal_function __attribute_warn_unused_result__
1025	re_node_set_init_copy (re_node_set dest, const* re_node_set *src)
1026	{
1027	dest->nelem = src->nelem;
1028	if (src->nelem > `0`)
1029	{
1030	dest->alloc = dest->nelem;
1031	dest->elems = re_malloc (int, dest->alloc);
1032	if (BE (dest->elems == NULL, `0`))
1033	{
1034	dest->alloc = dest->nelem = `0`;
1035	return REG_ESPACE;
1036	}
1037	memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
1038	}
1039	else
1040	re_node_set_init_empty (dest);
1041	return REG_NOERROR;
1042	}
1043
1044	/ Calculate the intersection of the sets SRC1 and SRC2. And merge it to*
1045	DEST. Return value indicate the error code or REG_NOERROR if succeeded.
1046	Note: We assume dest->elems is NULL, when dest->alloc is 0. /*
1047
1048	static reg_errcode_t
1049	internal_function __attribute_warn_unused_result__
1050	re_node_set_add_intersect (re_node_set dest, const* re_node_set *src1,
1051	const re_node_set *src2)
1052	{
1053	int i1, i2, is, id, delta, sbase;
1054	if (src1->nelem == `0` \|\| src2->nelem == `0`)
1055	return REG_NOERROR;
1056
1057	/ We need dest->nelem + 2 * elems_in_intersection; this is a*
1058	conservative estimate. /*
1059	if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
1060	{
1061	int new_alloc = src1->nelem + src2->nelem + dest->alloc;
1062	int new_elems = re_realloc (dest->elems, int*, new_alloc);
1063	if (BE (new_elems == NULL, `0`))
1064	return REG_ESPACE;
1065	dest->elems = new_elems;
1066	dest->alloc = new_alloc;
1067	}
1068
1069	/ Find the items in the intersection of SRC1 and SRC2, and copy*
1070	into the top of DEST those that are not already in DEST itself. /*
1071	sbase = dest->nelem + src1->nelem + src2->nelem;
1072	i1 = src1->nelem - `1`;
1073	i2 = src2->nelem - `1`;
1074	id = dest->nelem - `1`;
1075	for (;;)
1076	{
1077	if (src1->elems[i1] == src2->elems[i2])
1078	{
1079	/ Try to find the item in DEST. Maybe we could binary search? /
1080	while (id >= `0` && dest->elems[id] > src1->elems[i1])
1081	--id;
1082
1083	if (id < `0` \|\| dest->elems[id] != src1->elems[i1])
1084	dest->elems[--sbase] = src1->elems[i1];
1085
1086	if (--i1 < `0` \|\| --i2 < `0`)
1087	break;
1088	}
1089
1090	/ Lower the highest of the two items. /
1091	else if (src1->elems[i1] < src2->elems[i2])
1092	{
1093	if (--i2 < `0`)
1094	break;
1095	}
1096	else
1097	{
1098	if (--i1 < `0`)
1099	break;
1100	}
1101	}
1102
1103	id = dest->nelem - `1`;
1104	is = dest->nelem + src1->nelem + src2->nelem - `1`;
1105	delta = is - sbase + `1`;
1106
1107	/ Now copy. When DELTA becomes zero, the remaining*
1108	DEST elements are already in place; this is more or
1109	less the same loop that is in re_node_set_merge. /*
1110	dest->nelem += delta;
1111	if (delta > `0` && id >= `0`)
1112	for (;;)
1113	{
1114	if (dest->elems[is] > dest->elems[id])
1115	{
1116	/ Copy from the top. /
1117	dest->elems[id + delta--] = dest->elems[is--];
1118	if (delta == `0`)
1119	break;
1120	}
1121	else
1122	{
1123	/ Slide from the bottom. /
1124	dest->elems[id + delta] = dest->elems[id];
1125	if (--id < `0`)
1126	break;
1127	}
1128	}
1129
1130	/ Copy remaining SRC elements. /
1131	memcpy (dest->elems, dest->elems + sbase, delta * sizeof (int));
1132
1133	return REG_NOERROR;
1134	}
1135
1136	/ Calculate the union set of the sets SRC1 and SRC2. And store it to*
1137	DEST. Return value indicate the error code or REG_NOERROR if succeeded. /*
1138
1139	static reg_errcode_t
1140	internal_function __attribute_warn_unused_result__
1141	re_node_set_init_union (re_node_set dest, const* re_node_set *src1,
1142	const re_node_set *src2)
1143	{
1144	int i1, i2, id;
1145	if (src1 != NULL && src1->nelem > `0` && src2 != NULL && src2->nelem > `0`)
1146	{
1147	dest->alloc = src1->nelem + src2->nelem;
1148	dest->elems = re_malloc (int, dest->alloc);
1149	if (BE (dest->elems == NULL, `0`))
1150	return REG_ESPACE;
1151	}
1152	else
1153	{
1154	if (src1 != NULL && src1->nelem > `0`)
1155	return re_node_set_init_copy (dest, src1);
1156	else if (src2 != NULL && src2->nelem > `0`)
1157	return re_node_set_init_copy (dest, src2);
1158	else
1159	re_node_set_init_empty (dest);
1160	return REG_NOERROR;
1161	}
1162	for (i1 = i2 = id = `0` ; i1 < src1->nelem && i2 < src2->nelem ;)
1163	{
1164	if (src1->elems[i1] > src2->elems[i2])
1165	{
1166	dest->elems[id++] = src2->elems[i2++];
1167	continue;
1168	}
1169	if (src1->elems[i1] == src2->elems[i2])
1170	++i2;
1171	dest->elems[id++] = src1->elems[i1++];
1172	}
1173	if (i1 < src1->nelem)
1174	{
1175	memcpy (dest->elems + id, src1->elems + i1,
1176	(src1->nelem - i1) * sizeof (int));
1177	id += src1->nelem - i1;
1178	}
1179	else if (i2 < src2->nelem)
1180	{
1181	memcpy (dest->elems + id, src2->elems + i2,
1182	(src2->nelem - i2) * sizeof (int));
1183	id += src2->nelem - i2;
1184	}
1185	dest->nelem = id;
1186	return REG_NOERROR;
1187	}
1188
1189	/ Calculate the union set of the sets DEST and SRC. And store it to*
1190	DEST. Return value indicate the error code or REG_NOERROR if succeeded. /*
1191
1192	static reg_errcode_t
1193	internal_function __attribute_warn_unused_result__
1194	re_node_set_merge (re_node_set dest, const* re_node_set *src)
1195	{
1196	int is, id, sbase, delta;
1197	if (src == NULL \|\| src->nelem == `0`)
1198	return REG_NOERROR;
1199	if (dest->alloc < `2` * src->nelem + dest->nelem)
1200	{
1201	int new_alloc = `2` * (src->nelem + dest->alloc);
1202	int new_buffer = re_realloc (dest->elems, int*, new_alloc);
1203	if (BE (new_buffer == NULL, `0`))
1204	return REG_ESPACE;
1205	dest->elems = new_buffer;
1206	dest->alloc = new_alloc;
1207	}
1208
1209	if (BE (dest->nelem == `0`, `0`))
1210	{
1211	dest->nelem = src->nelem;
1212	memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
1213	return REG_NOERROR;
1214	}
1215
1216	/ Copy into the top of DEST the items of SRC that are not*
1217	found in DEST. Maybe we could binary search in DEST? /*
1218	for (sbase = dest->nelem + `2` * src->nelem,
1219	is = src->nelem - `1`, id = dest->nelem - `1`; is >= `0` && id >= `0`; )
1220	{
1221	if (dest->elems[id] == src->elems[is])
1222	is--, id--;
1223	else if (dest->elems[id] < src->elems[is])
1224	dest->elems[--sbase] = src->elems[is--];
1225	else / if (dest->elems[id] > src->elems[is]) /
1226	--id;
1227	}
1228
1229	if (is >= `0`)
1230	{
1231	/ If DEST is exhausted, the remaining items of SRC must be unique. /
1232	sbase -= is + `1`;
1233	memcpy (dest->elems + sbase, src->elems, (is + `1`) * sizeof (int));
1234	}
1235
1236	id = dest->nelem - `1`;
1237	is = dest->nelem + `2` * src->nelem - `1`;
1238	delta = is - sbase + `1`;
1239	if (delta == `0`)
1240	return REG_NOERROR;
1241
1242	/ Now copy. When DELTA becomes zero, the remaining*
1243	DEST elements are already in place. /*
1244	dest->nelem += delta;
1245	for (;;)
1246	{
1247	if (dest->elems[is] > dest->elems[id])
1248	{
1249	/ Copy from the top. /
1250	dest->elems[id + delta--] = dest->elems[is--];
1251	if (delta == `0`)
1252	break;
1253	}
1254	else
1255	{
1256	/ Slide from the bottom. /
1257	dest->elems[id + delta] = dest->elems[id];
1258	if (--id < `0`)
1259	{
1260	/ Copy remaining SRC elements. /
1261	memcpy (dest->elems, dest->elems + sbase,
1262	delta * sizeof (int));
1263	break;
1264	}
1265	}
1266	}
1267
1268	return REG_NOERROR;
1269	}
1270
1271	/ Insert the new element ELEM to the re_node_set* SET.*
1272	SET should not already have ELEM.
1273	return -1 if an error is occured, return 1 otherwise. /*
1274
1275	static int
1276	internal_function __attribute_warn_unused_result__
1277	re_node_set_insert (re_node_set set, int* elem)
1278	{
1279	int idx;
1280	/ In case the set is empty. /
1281	if (set->alloc == `0`)
1282	{
1283	if (BE (re_node_set_init_1 (set, elem) == REG_NOERROR, `1`))
1284	return `1`;
1285	else
1286	return -`1`;
1287	}
1288
1289	if (BE (set->nelem, `0`) == `0`)
1290	{
1291	/ We already guaranteed above that set->alloc != 0. /
1292	set->elems[`0`] = elem;
1293	++set->nelem;
1294	return `1`;
1295	}
1296
1297	/ Realloc if we need. /
1298	if (set->alloc == set->nelem)
1299	{
1300	int *new_elems;
1301	set->alloc = set->alloc * `2`;
1302	new_elems = re_realloc (set->elems, int, set->alloc);
1303	if (BE (new_elems == NULL, `0`))
1304	return -`1`;
1305	set->elems = new_elems;
1306	}
1307
1308	/ Move the elements which follows the new element. Test the*
1309	first element separately to skip a check in the inner loop. /*
1310	if (elem < set->elems[`0`])
1311	{
1312	idx = `0`;
1313	for (idx = set->nelem; idx > `0`; idx--)
1314	set->elems[idx] = set->elems[idx - `1`];
1315	}
1316	else
1317	{
1318	for (idx = set->nelem; set->elems[idx - `1`] > elem; idx--)
1319	set->elems[idx] = set->elems[idx - `1`];
1320	}
1321
1322	/ Insert the new element. /
1323	set->elems[idx] = elem;
1324	++set->nelem;
1325	return `1`;
1326	}
1327
1328	/ Insert the new element ELEM to the re_node_set* SET.*
1329	SET should not already have any element greater than or equal to ELEM.
1330	Return -1 if an error is occured, return 1 otherwise. /*
1331
1332	static int
1333	internal_function __attribute_warn_unused_result__
1334	re_node_set_insert_last (re_node_set set, int* elem)
1335	{
1336	/ Realloc if we need. /
1337	if (set->alloc == set->nelem)
1338	{
1339	int *new_elems;
1340	set->alloc = (set->alloc + `1`) * `2`;
1341	new_elems = re_realloc (set->elems, int, set->alloc);
1342	if (BE (new_elems == NULL, `0`))
1343	return -`1`;
1344	set->elems = new_elems;
1345	}
1346
1347	/ Insert the new element. /
1348	set->elems[set->nelem++] = elem;
1349	return `1`;
1350	}
1351
1352	/ Compare two node sets SET1 and SET2.*
1353	return 1 if SET1 and SET2 are equivalent, return 0 otherwise. /*
1354
1355	static int
1356	internal_function __attribute ((pure))
1357	re_node_set_compare (const re_node_set set1, const* re_node_set *set2)
1358	{
1359	int i;
1360	if (set1 == NULL \|\| set2 == NULL \|\| set1->nelem != set2->nelem)
1361	return `0`;
1362	for (i = set1->nelem ; --i >= `0` ; )
1363	if (set1->elems[i] != set2->elems[i])
1364	return `0`;
1365	return `1`;
1366	}
1367
1368	/ Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. /
1369
1370	static int
1371	internal_function __attribute ((pure))
1372	re_node_set_contains (const re_node_set set, int* elem)
1373	{
1374	unsigned int idx, right, mid;
1375	if (set->nelem <= `0`)
1376	return `0`;
1377
1378	/ Binary search the element. /
1379	idx = `0`;
1380	right = set->nelem - `1`;
1381	while (idx < right)
1382	{
1383	mid = (idx + right) / `2`;
1384	if (set->elems[mid] < elem)
1385	idx = mid + `1`;
1386	else
1387	right = mid;
1388	}
1389	return set->elems[idx] == elem ? idx + `1` : `0`;
1390	}
1391
1392	static void
1393	internal_function
1394	re_node_set_remove_at (re_node_set set, int* idx)
1395	{
1396	if (idx < `0` \|\| idx >= set->nelem)
1397	return;
1398	--set->nelem;
1399	for (; idx < set->nelem; idx++)
1400	set->elems[idx] = set->elems[idx + `1`];
1401	}
1402
1403
1404	/ Add the token TOKEN to dfa->nodes, and return the index of the token.*
1405	Or return -1, if an error will be occured. /*
1406
1407	static int
1408	internal_function
1409	re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1410	{
1411	int type = token.type;
1412	if (BE (dfa->nodes_len >= dfa->nodes_alloc, `0`))
1413	{
1414	size_t new_nodes_alloc = dfa->nodes_alloc * `2`;
1415	int new_nexts, new_indices;
1416	re_node_set new_edests, new_eclosures;
1417	re_token_t *new_nodes;
1418
1419	/ Avoid overflows in realloc. /
1420	const size_t max_object_size = MAX (sizeof (re_token_t),
1421	MAX (sizeof (re_node_set),
1422	sizeof (int)));
1423	if (BE (SIZE_MAX / max_object_size < new_nodes_alloc, `0`))
1424	return -`1`;
1425
1426	new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
1427	if (BE (new_nodes == NULL, `0`))
1428	return -`1`;
1429	dfa->nodes = new_nodes;
1430	new_nexts = re_realloc (dfa->nexts, int, new_nodes_alloc);
1431	new_indices = re_realloc (dfa->org_indices, int, new_nodes_alloc);
1432	new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1433	new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1434	if (BE (new_nexts == NULL \|\| new_indices == NULL
1435	\|\| new_edests == NULL \|\| new_eclosures == NULL, `0`))
1436	return -`1`;
1437	dfa->nexts = new_nexts;
1438	dfa->org_indices = new_indices;
1439	dfa->edests = new_edests;
1440	dfa->eclosures = new_eclosures;
1441	dfa->nodes_alloc = new_nodes_alloc;
1442	}
1443	dfa->nodes[dfa->nodes_len] = token;
1444	dfa->nodes[dfa->nodes_len].constraint = `0`;
1445	#ifdef RE_ENABLE_I18N
1446	dfa->nodes[dfa->nodes_len].accept_mb =
1447	(type == OP_PERIOD && dfa->mb_cur_max > `1`) \|\| type == COMPLEX_BRACKET;
1448	#endif
1449	dfa->nexts[dfa->nodes_len] = -`1`;
1450	re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1451	re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1452	return dfa->nodes_len++;
1453	}
1454
1455	static inline unsigned int
1456	internal_function
1457	calc_state_hash (const re_node_set nodes, unsigned* int context)
1458	{
1459	unsigned int hash = nodes->nelem + context;
1460	int i;
1461	for (i = `0` ; i < nodes->nelem ; i++)
1462	hash += nodes->elems[i];
1463	return hash;
1464	}
1465
1466	/ Search for the state whose node_set is equivalent to NODES.*
1467	Return the pointer to the state, if we found it in the DFA.
1468	Otherwise create the new one and return it. In case of an error
1469	return NULL and set the error code in ERR.
1470	Note: - We assume NULL as the invalid state, then it is possible that
1471	return value is NULL and ERR is REG_NOERROR.
1472	- We never return non-NULL value in case of any errors, it is for
1473	optimization. /*
1474
1475	static re_dfastate_t *
1476	internal_function __attribute_warn_unused_result__
1477	re_acquire_state (reg_errcode_t err, const* re_dfa_t *dfa,
1478	const re_node_set *nodes)
1479	{
1480	unsigned int hash;
1481	re_dfastate_t *new_state;
1482	struct re_state_table_entry *spot;
1483	int i;
1484	if (BE (nodes->nelem == `0`, `0`))
1485	{
1486	*err = REG_NOERROR;
1487	return NULL;
1488	}
1489	hash = calc_state_hash (nodes, `0`);
1490	spot = dfa->state_table + (hash & dfa->state_hash_mask);
1491
1492	for (i = `0` ; i < spot->num ; i++)
1493	{
1494	re_dfastate_t *state = spot->array[i];
1495	if (hash != state->hash)
1496	continue;
1497	if (re_node_set_compare (&state->nodes, nodes))
1498	return state;
1499	}
1500
1501	/ There are no appropriate state in the dfa, create the new one. /
1502	new_state = create_ci_newstate (dfa, nodes, hash);
1503	if (BE (new_state == NULL, `0`))
1504	*err = REG_ESPACE;
1505
1506	return new_state;
1507	}
1508
1509	/ Search for the state whose node_set is equivalent to NODES and*
1510	whose context is equivalent to CONTEXT.
1511	Return the pointer to the state, if we found it in the DFA.
1512	Otherwise create the new one and return it. In case of an error
1513	return NULL and set the error code in ERR.
1514	Note: - We assume NULL as the invalid state, then it is possible that
1515	return value is NULL and ERR is REG_NOERROR.
1516	- We never return non-NULL value in case of any errors, it is for
1517	optimization. /*
1518
1519	static re_dfastate_t *
1520	internal_function __attribute_warn_unused_result__
1521	re_acquire_state_context (reg_errcode_t err, const* re_dfa_t *dfa,
1522	const re_node_set nodes, unsigned* int context)
1523	{
1524	unsigned int hash;
1525	re_dfastate_t *new_state;
1526	struct re_state_table_entry *spot;
1527	int i;
1528	if (nodes->nelem == `0`)
1529	{
1530	*err = REG_NOERROR;
1531	return NULL;
1532	}
1533	hash = calc_state_hash (nodes, context);
1534	spot = dfa->state_table + (hash & dfa->state_hash_mask);
1535
1536	for (i = `0` ; i < spot->num ; i++)
1537	{
1538	re_dfastate_t *state = spot->array[i];
1539	if (state->hash == hash
1540	&& state->context == context
1541	&& re_node_set_compare (state->entrance_nodes, nodes))
1542	return state;
1543	}
1544	/ There are no appropriate state in `dfa', create the new one. /
1545	new_state = create_cd_newstate (dfa, nodes, context, hash);
1546	if (BE (new_state == NULL, `0`))
1547	*err = REG_ESPACE;
1548
1549	return new_state;
1550	}
1551
1552	/ Finish initialization of the new state NEWSTATE, and using its hash value*
1553	HASH put in the appropriate bucket of DFA's state table. Return value
1554	indicates the error code if failed. /*
1555
1556	static reg_errcode_t
1557	__attribute_warn_unused_result__
1558	register_state (const re_dfa_t dfa, re_dfastate_t newstate,
1559	unsigned int hash)
1560	{
1561	struct re_state_table_entry *spot;
1562	reg_errcode_t err;
1563	int i;
1564
1565	newstate->hash = hash;
1566	err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1567	if (BE (err != REG_NOERROR, `0`))
1568	return REG_ESPACE;
1569	for (i = `0`; i < newstate->nodes.nelem; i++)
1570	{
1571	int elem = newstate->nodes.elems[i];
1572	if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1573	if (re_node_set_insert_last (&newstate->non_eps_nodes, elem) < `0`)
1574	return REG_ESPACE;
1575	}
1576
1577	spot = dfa->state_table + (hash & dfa->state_hash_mask);
1578	if (BE (spot->alloc <= spot->num, `0`))
1579	{
1580	int new_alloc = `2` * spot->num + `2`;
1581	re_dfastate_t *new_array = re_realloc (spot->array, re_dfastate_t ,
1582	new_alloc);
1583	if (BE (new_array == NULL, `0`))
1584	return REG_ESPACE;
1585	spot->array = new_array;
1586	spot->alloc = new_alloc;
1587	}
1588	spot->array[spot->num++] = newstate;
1589	return REG_NOERROR;
1590	}
1591
1592	static void
1593	free_state (re_dfastate_t *state)
1594	{
1595	re_node_set_free (&state->non_eps_nodes);
1596	re_node_set_free (&state->inveclosure);
1597	if (state->entrance_nodes != &state->nodes)
1598	{
1599	re_node_set_free (state->entrance_nodes);
1600	re_free (state->entrance_nodes);
1601	}
1602	re_node_set_free (&state->nodes);
1603	re_free (state->word_trtable);
1604	re_free (state->trtable);
1605	re_free (state);
1606	}
1607
1608	/ Create the new state which is independ of contexts.*
1609	Return the new state if succeeded, otherwise return NULL. /*
1610
1611	static re_dfastate_t *
1612	internal_function __attribute_warn_unused_result__
1613	create_ci_newstate (const re_dfa_t dfa, const* re_node_set *nodes,
1614	unsigned int hash)
1615	{
1616	int i;
1617	reg_errcode_t err;
1618	re_dfastate_t *newstate;
1619
1620	newstate = (re_dfastate_t ) calloc (sizeof* (re_dfastate_t), `1`);
1621	if (BE (newstate == NULL, `0`))
1622	return NULL;
1623	err = re_node_set_init_copy (&newstate->nodes, nodes);
1624	if (BE (err != REG_NOERROR, `0`))
1625	{
1626	re_free (newstate);
1627	return NULL;
1628	}
1629
1630	newstate->entrance_nodes = &newstate->nodes;
1631	for (i = `0` ; i < nodes->nelem ; i++)
1632	{
1633	re_token_t *node = dfa->nodes + nodes->elems[i];
1634	re_token_type_t type = node->type;
1635	if (type == CHARACTER && !node->constraint)
1636	continue;
1637	#ifdef RE_ENABLE_I18N
1638	newstate->accept_mb \|= node->accept_mb;
1639	#endif /* RE_ENABLE_I18N */
1640
1641	/ If the state has the halt node, the state is a halt state. /
1642	if (type == END_OF_RE)
1643	newstate->halt = `1`;
1644	else if (type == OP_BACK_REF)
1645	newstate->has_backref = `1`;
1646	else if (type == ANCHOR \|\| node->constraint)
1647	newstate->has_constraint = `1`;
1648	}
1649	err = register_state (dfa, newstate, hash);
1650	if (BE (err != REG_NOERROR, `0`))
1651	{
1652	free_state (newstate);
1653	newstate = NULL;
1654	}
1655	return newstate;
1656	}
1657
1658	/ Create the new state which is depend on the context CONTEXT.*
1659	Return the new state if succeeded, otherwise return NULL. /*
1660
1661	static re_dfastate_t *
1662	internal_function __attribute_warn_unused_result__
1663	create_cd_newstate (const re_dfa_t dfa, const* re_node_set *nodes,
1664	unsigned int context, unsigned int hash)
1665	{
1666	int i, nctx_nodes = `0`;
1667	reg_errcode_t err;
1668	re_dfastate_t *newstate;
1669
1670	newstate = (re_dfastate_t ) calloc (sizeof* (re_dfastate_t), `1`);
1671	if (BE (newstate == NULL, `0`))
1672	return NULL;
1673	err = re_node_set_init_copy (&newstate->nodes, nodes);
1674	if (BE (err != REG_NOERROR, `0`))
1675	{
1676	re_free (newstate);
1677	return NULL;
1678	}
1679
1680	newstate->context = context;
1681	newstate->entrance_nodes = &newstate->nodes;
1682
1683	for (i = `0` ; i < nodes->nelem ; i++)
1684	{
1685	re_token_t *node = dfa->nodes + nodes->elems[i];
1686	re_token_type_t type = node->type;
1687	unsigned int constraint = node->constraint;
1688
1689	if (type == CHARACTER && !constraint)
1690	continue;
1691	#ifdef RE_ENABLE_I18N
1692	newstate->accept_mb \|= node->accept_mb;
1693	#endif /* RE_ENABLE_I18N */
1694
1695	/ If the state has the halt node, the state is a halt state. /
1696	if (type == END_OF_RE)
1697	newstate->halt = `1`;
1698	else if (type == OP_BACK_REF)
1699	newstate->has_backref = `1`;
1700
1701	if (constraint)
1702	{
1703	if (newstate->entrance_nodes == &newstate->nodes)
1704	{
1705	newstate->entrance_nodes = re_malloc (re_node_set, `1`);
1706	if (BE (newstate->entrance_nodes == NULL, `0`))
1707	{
1708	free_state (newstate);
1709	return NULL;
1710	}
1711	if (re_node_set_init_copy (newstate->entrance_nodes, nodes)
1712	!= REG_NOERROR)
1713	return NULL;
1714	nctx_nodes = `0`;
1715	newstate->has_constraint = `1`;
1716	}
1717
1718	if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1719	{
1720	re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1721	++nctx_nodes;
1722	}
1723	}
1724	}
1725	err = register_state (dfa, newstate, hash);
1726	if (BE (err != REG_NOERROR, `0`))
1727	{
1728	free_state (newstate);
1729	newstate = NULL;
1730	}
1731	return newstate;
1732	}
1733

Browse the source code of glibc_src_2.25/posix/regex_internal.c