1/* Extended regular expression matching and search library.
2 Copyright (C) 2002-2020 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <https://www.gnu.org/licenses/>. */
19
20static void re_string_construct_common (const char *str, Idx len,
21 re_string_t *pstr,
22 RE_TRANSLATE_TYPE trans, bool icase,
23 const re_dfa_t *dfa);
24static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
25 const re_node_set *nodes,
26 re_hashval_t hash);
27static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
28 const re_node_set *nodes,
29 unsigned int context,
30 re_hashval_t hash);
31static reg_errcode_t re_string_realloc_buffers (re_string_t *pstr,
32 Idx new_buf_len);
33#ifdef RE_ENABLE_I18N
34static void build_wcs_buffer (re_string_t *pstr);
35static reg_errcode_t build_wcs_upper_buffer (re_string_t *pstr);
36#endif /* RE_ENABLE_I18N */
37static void build_upper_buffer (re_string_t *pstr);
38static void re_string_translate_buffer (re_string_t *pstr);
39static unsigned int re_string_context_at (const re_string_t *input, Idx idx,
40 int eflags) __attribute__ ((pure));
41
42/* Functions for string operation. */
43
44/* This function allocate the buffers. It is necessary to call
45 re_string_reconstruct before using the object. */
46
47static reg_errcode_t
48__attribute_warn_unused_result__
49re_string_allocate (re_string_t *pstr, const char *str, Idx len, Idx init_len,
50 RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
51{
52 reg_errcode_t ret;
53 Idx init_buf_len;
54
55 /* Ensure at least one character fits into the buffers. */
56 if (init_len < dfa->mb_cur_max)
57 init_len = dfa->mb_cur_max;
58 init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
59 re_string_construct_common (str, len, pstr, trans, icase, dfa);
60
61 ret = re_string_realloc_buffers (pstr, init_buf_len);
62 if (__glibc_unlikely (ret != REG_NOERROR))
63 return ret;
64
65 pstr->word_char = dfa->word_char;
66 pstr->word_ops_used = dfa->word_ops_used;
67 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
68 pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
69 pstr->valid_raw_len = pstr->valid_len;
70 return REG_NOERROR;
71}
72
73/* This function allocate the buffers, and initialize them. */
74
75static reg_errcode_t
76__attribute_warn_unused_result__
77re_string_construct (re_string_t *pstr, const char *str, Idx len,
78 RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
79{
80 reg_errcode_t ret;
81 memset (pstr, '\0', sizeof (re_string_t));
82 re_string_construct_common (str, len, pstr, trans, icase, dfa);
83
84 if (len > 0)
85 {
86 ret = re_string_realloc_buffers (pstr, len + 1);
87 if (__glibc_unlikely (ret != REG_NOERROR))
88 return ret;
89 }
90 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
91
92 if (icase)
93 {
94#ifdef RE_ENABLE_I18N
95 if (dfa->mb_cur_max > 1)
96 {
97 while (1)
98 {
99 ret = build_wcs_upper_buffer (pstr);
100 if (__glibc_unlikely (ret != REG_NOERROR))
101 return ret;
102 if (pstr->valid_raw_len >= len)
103 break;
104 if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
105 break;
106 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
107 if (__glibc_unlikely (ret != REG_NOERROR))
108 return ret;
109 }
110 }
111 else
112#endif /* RE_ENABLE_I18N */
113 build_upper_buffer (pstr);
114 }
115 else
116 {
117#ifdef RE_ENABLE_I18N
118 if (dfa->mb_cur_max > 1)
119 build_wcs_buffer (pstr);
120 else
121#endif /* RE_ENABLE_I18N */
122 {
123 if (trans != NULL)
124 re_string_translate_buffer (pstr);
125 else
126 {
127 pstr->valid_len = pstr->bufs_len;
128 pstr->valid_raw_len = pstr->bufs_len;
129 }
130 }
131 }
132
133 return REG_NOERROR;
134}
135
136/* Helper functions for re_string_allocate, and re_string_construct. */
137
138static reg_errcode_t
139__attribute_warn_unused_result__
140re_string_realloc_buffers (re_string_t *pstr, Idx new_buf_len)
141{
142#ifdef RE_ENABLE_I18N
143 if (pstr->mb_cur_max > 1)
144 {
145 wint_t *new_wcs;
146
147 /* Avoid overflow in realloc. */
148 const size_t max_object_size = MAX (sizeof (wint_t), sizeof (Idx));
149 if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size)
150 < new_buf_len))
151 return REG_ESPACE;
152
153 new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
154 if (__glibc_unlikely (new_wcs == NULL))
155 return REG_ESPACE;
156 pstr->wcs = new_wcs;
157 if (pstr->offsets != NULL)
158 {
159 Idx *new_offsets = re_realloc (pstr->offsets, Idx, new_buf_len);
160 if (__glibc_unlikely (new_offsets == NULL))
161 return REG_ESPACE;
162 pstr->offsets = new_offsets;
163 }
164 }
165#endif /* RE_ENABLE_I18N */
166 if (pstr->mbs_allocated)
167 {
168 unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
169 new_buf_len);
170 if (__glibc_unlikely (new_mbs == NULL))
171 return REG_ESPACE;
172 pstr->mbs = new_mbs;
173 }
174 pstr->bufs_len = new_buf_len;
175 return REG_NOERROR;
176}
177
178
179static void
180re_string_construct_common (const char *str, Idx len, re_string_t *pstr,
181 RE_TRANSLATE_TYPE trans, bool icase,
182 const re_dfa_t *dfa)
183{
184 pstr->raw_mbs = (const unsigned char *) str;
185 pstr->len = len;
186 pstr->raw_len = len;
187 pstr->trans = trans;
188 pstr->icase = icase;
189 pstr->mbs_allocated = (trans != NULL || icase);
190 pstr->mb_cur_max = dfa->mb_cur_max;
191 pstr->is_utf8 = dfa->is_utf8;
192 pstr->map_notascii = dfa->map_notascii;
193 pstr->stop = pstr->len;
194 pstr->raw_stop = pstr->stop;
195}
196
197#ifdef RE_ENABLE_I18N
198
199/* Build wide character buffer PSTR->WCS.
200 If the byte sequence of the string are:
201 <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
202 Then wide character buffer will be:
203 <wc1> , WEOF , <wc2> , WEOF , <wc3>
204 We use WEOF for padding, they indicate that the position isn't
205 a first byte of a multibyte character.
206
207 Note that this function assumes PSTR->VALID_LEN elements are already
208 built and starts from PSTR->VALID_LEN. */
209
210static void
211build_wcs_buffer (re_string_t *pstr)
212{
213#ifdef _LIBC
214 unsigned char buf[MB_LEN_MAX];
215 DEBUG_ASSERT (MB_LEN_MAX >= pstr->mb_cur_max);
216#else
217 unsigned char buf[64];
218#endif
219 mbstate_t prev_st;
220 Idx byte_idx, end_idx, remain_len;
221 size_t mbclen;
222
223 /* Build the buffers from pstr->valid_len to either pstr->len or
224 pstr->bufs_len. */
225 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
226 for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
227 {
228 wchar_t wc;
229 const char *p;
230
231 remain_len = end_idx - byte_idx;
232 prev_st = pstr->cur_state;
233 /* Apply the translation if we need. */
234 if (__glibc_unlikely (pstr->trans != NULL))
235 {
236 int i, ch;
237
238 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
239 {
240 ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
241 buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
242 }
243 p = (const char *) buf;
244 }
245 else
246 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
247 mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
248 if (__glibc_unlikely (mbclen == (size_t) -1 || mbclen == 0
249 || (mbclen == (size_t) -2
250 && pstr->bufs_len >= pstr->len)))
251 {
252 /* We treat these cases as a singlebyte character. */
253 mbclen = 1;
254 wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
255 if (__glibc_unlikely (pstr->trans != NULL))
256 wc = pstr->trans[wc];
257 pstr->cur_state = prev_st;
258 }
259 else if (__glibc_unlikely (mbclen == (size_t) -2))
260 {
261 /* The buffer doesn't have enough space, finish to build. */
262 pstr->cur_state = prev_st;
263 break;
264 }
265
266 /* Write wide character and padding. */
267 pstr->wcs[byte_idx++] = wc;
268 /* Write paddings. */
269 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
270 pstr->wcs[byte_idx++] = WEOF;
271 }
272 pstr->valid_len = byte_idx;
273 pstr->valid_raw_len = byte_idx;
274}
275
276/* Build wide character buffer PSTR->WCS like build_wcs_buffer,
277 but for REG_ICASE. */
278
279static reg_errcode_t
280__attribute_warn_unused_result__
281build_wcs_upper_buffer (re_string_t *pstr)
282{
283 mbstate_t prev_st;
284 Idx src_idx, byte_idx, end_idx, remain_len;
285 size_t mbclen;
286#ifdef _LIBC
287 char buf[MB_LEN_MAX];
288 DEBUG_ASSERT (pstr->mb_cur_max <= MB_LEN_MAX);
289#else
290 char buf[64];
291#endif
292
293 byte_idx = pstr->valid_len;
294 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
295
296 /* The following optimization assumes that ASCII characters can be
297 mapped to wide characters with a simple cast. */
298 if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
299 {
300 while (byte_idx < end_idx)
301 {
302 wchar_t wc;
303
304 if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
305 && mbsinit (&pstr->cur_state))
306 {
307 /* In case of a singlebyte character. */
308 pstr->mbs[byte_idx]
309 = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
310 /* The next step uses the assumption that wchar_t is encoded
311 ASCII-safe: all ASCII values can be converted like this. */
312 pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
313 ++byte_idx;
314 continue;
315 }
316
317 remain_len = end_idx - byte_idx;
318 prev_st = pstr->cur_state;
319 mbclen = __mbrtowc (&wc,
320 ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
321 + byte_idx), remain_len, &pstr->cur_state);
322 if (__glibc_likely (0 < mbclen && mbclen < (size_t) -2))
323 {
324 wchar_t wcu = __towupper (wc);
325 if (wcu != wc)
326 {
327 size_t mbcdlen;
328
329 mbcdlen = __wcrtomb (buf, wcu, &prev_st);
330 if (__glibc_likely (mbclen == mbcdlen))
331 memcpy (pstr->mbs + byte_idx, buf, mbclen);
332 else
333 {
334 src_idx = byte_idx;
335 goto offsets_needed;
336 }
337 }
338 else
339 memcpy (pstr->mbs + byte_idx,
340 pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
341 pstr->wcs[byte_idx++] = wcu;
342 /* Write paddings. */
343 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
344 pstr->wcs[byte_idx++] = WEOF;
345 }
346 else if (mbclen == (size_t) -1 || mbclen == 0
347 || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
348 {
349 /* It is an invalid character, an incomplete character
350 at the end of the string, or '\0'. Just use the byte. */
351 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
352 pstr->mbs[byte_idx] = ch;
353 /* And also cast it to wide char. */
354 pstr->wcs[byte_idx++] = (wchar_t) ch;
355 if (__glibc_unlikely (mbclen == (size_t) -1))
356 pstr->cur_state = prev_st;
357 }
358 else
359 {
360 /* The buffer doesn't have enough space, finish to build. */
361 pstr->cur_state = prev_st;
362 break;
363 }
364 }
365 pstr->valid_len = byte_idx;
366 pstr->valid_raw_len = byte_idx;
367 return REG_NOERROR;
368 }
369 else
370 for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
371 {
372 wchar_t wc;
373 const char *p;
374 offsets_needed:
375 remain_len = end_idx - byte_idx;
376 prev_st = pstr->cur_state;
377 if (__glibc_unlikely (pstr->trans != NULL))
378 {
379 int i, ch;
380
381 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
382 {
383 ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
384 buf[i] = pstr->trans[ch];
385 }
386 p = (const char *) buf;
387 }
388 else
389 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
390 mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
391 if (__glibc_likely (0 < mbclen && mbclen < (size_t) -2))
392 {
393 wchar_t wcu = __towupper (wc);
394 if (wcu != wc)
395 {
396 size_t mbcdlen;
397
398 mbcdlen = __wcrtomb ((char *) buf, wcu, &prev_st);
399 if (__glibc_likely (mbclen == mbcdlen))
400 memcpy (pstr->mbs + byte_idx, buf, mbclen);
401 else if (mbcdlen != (size_t) -1)
402 {
403 size_t i;
404
405 if (byte_idx + mbcdlen > pstr->bufs_len)
406 {
407 pstr->cur_state = prev_st;
408 break;
409 }
410
411 if (pstr->offsets == NULL)
412 {
413 pstr->offsets = re_malloc (Idx, pstr->bufs_len);
414
415 if (pstr->offsets == NULL)
416 return REG_ESPACE;
417 }
418 if (!pstr->offsets_needed)
419 {
420 for (i = 0; i < (size_t) byte_idx; ++i)
421 pstr->offsets[i] = i;
422 pstr->offsets_needed = 1;
423 }
424
425 memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
426 pstr->wcs[byte_idx] = wcu;
427 pstr->offsets[byte_idx] = src_idx;
428 for (i = 1; i < mbcdlen; ++i)
429 {
430 pstr->offsets[byte_idx + i]
431 = src_idx + (i < mbclen ? i : mbclen - 1);
432 pstr->wcs[byte_idx + i] = WEOF;
433 }
434 pstr->len += mbcdlen - mbclen;
435 if (pstr->raw_stop > src_idx)
436 pstr->stop += mbcdlen - mbclen;
437 end_idx = (pstr->bufs_len > pstr->len)
438 ? pstr->len : pstr->bufs_len;
439 byte_idx += mbcdlen;
440 src_idx += mbclen;
441 continue;
442 }
443 else
444 memcpy (pstr->mbs + byte_idx, p, mbclen);
445 }
446 else
447 memcpy (pstr->mbs + byte_idx, p, mbclen);
448
449 if (__glibc_unlikely (pstr->offsets_needed != 0))
450 {
451 size_t i;
452 for (i = 0; i < mbclen; ++i)
453 pstr->offsets[byte_idx + i] = src_idx + i;
454 }
455 src_idx += mbclen;
456
457 pstr->wcs[byte_idx++] = wcu;
458 /* Write paddings. */
459 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
460 pstr->wcs[byte_idx++] = WEOF;
461 }
462 else if (mbclen == (size_t) -1 || mbclen == 0
463 || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
464 {
465 /* It is an invalid character or '\0'. Just use the byte. */
466 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
467
468 if (__glibc_unlikely (pstr->trans != NULL))
469 ch = pstr->trans [ch];
470 pstr->mbs[byte_idx] = ch;
471
472 if (__glibc_unlikely (pstr->offsets_needed != 0))
473 pstr->offsets[byte_idx] = src_idx;
474 ++src_idx;
475
476 /* And also cast it to wide char. */
477 pstr->wcs[byte_idx++] = (wchar_t) ch;
478 if (__glibc_unlikely (mbclen == (size_t) -1))
479 pstr->cur_state = prev_st;
480 }
481 else
482 {
483 /* The buffer doesn't have enough space, finish to build. */
484 pstr->cur_state = prev_st;
485 break;
486 }
487 }
488 pstr->valid_len = byte_idx;
489 pstr->valid_raw_len = src_idx;
490 return REG_NOERROR;
491}
492
493/* Skip characters until the index becomes greater than NEW_RAW_IDX.
494 Return the index. */
495
496static Idx
497re_string_skip_chars (re_string_t *pstr, Idx new_raw_idx, wint_t *last_wc)
498{
499 mbstate_t prev_st;
500 Idx rawbuf_idx;
501 size_t mbclen;
502 wint_t wc = WEOF;
503
504 /* Skip the characters which are not necessary to check. */
505 for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
506 rawbuf_idx < new_raw_idx;)
507 {
508 wchar_t wc2;
509 Idx remain_len = pstr->raw_len - rawbuf_idx;
510 prev_st = pstr->cur_state;
511 mbclen = __mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx,
512 remain_len, &pstr->cur_state);
513 if (__glibc_unlikely (mbclen == (size_t) -2 || mbclen == (size_t) -1
514 || mbclen == 0))
515 {
516 /* We treat these cases as a single byte character. */
517 if (mbclen == 0 || remain_len == 0)
518 wc = L'\0';
519 else
520 wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx);
521 mbclen = 1;
522 pstr->cur_state = prev_st;
523 }
524 else
525 wc = wc2;
526 /* Then proceed the next character. */
527 rawbuf_idx += mbclen;
528 }
529 *last_wc = wc;
530 return rawbuf_idx;
531}
532#endif /* RE_ENABLE_I18N */
533
534/* Build the buffer PSTR->MBS, and apply the translation if we need.
535 This function is used in case of REG_ICASE. */
536
537static void
538build_upper_buffer (re_string_t *pstr)
539{
540 Idx char_idx, end_idx;
541 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
542
543 for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
544 {
545 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
546 if (__glibc_unlikely (pstr->trans != NULL))
547 ch = pstr->trans[ch];
548 pstr->mbs[char_idx] = toupper (ch);
549 }
550 pstr->valid_len = char_idx;
551 pstr->valid_raw_len = char_idx;
552}
553
554/* Apply TRANS to the buffer in PSTR. */
555
556static void
557re_string_translate_buffer (re_string_t *pstr)
558{
559 Idx buf_idx, end_idx;
560 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
561
562 for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
563 {
564 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
565 pstr->mbs[buf_idx] = pstr->trans[ch];
566 }
567
568 pstr->valid_len = buf_idx;
569 pstr->valid_raw_len = buf_idx;
570}
571
572/* This function re-construct the buffers.
573 Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
574 convert to upper case in case of REG_ICASE, apply translation. */
575
576static reg_errcode_t
577__attribute_warn_unused_result__
578re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags)
579{
580 Idx offset;
581
582 if (__glibc_unlikely (pstr->raw_mbs_idx <= idx))
583 offset = idx - pstr->raw_mbs_idx;
584 else
585 {
586 /* Reset buffer. */
587#ifdef RE_ENABLE_I18N
588 if (pstr->mb_cur_max > 1)
589 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
590#endif /* RE_ENABLE_I18N */
591 pstr->len = pstr->raw_len;
592 pstr->stop = pstr->raw_stop;
593 pstr->valid_len = 0;
594 pstr->raw_mbs_idx = 0;
595 pstr->valid_raw_len = 0;
596 pstr->offsets_needed = 0;
597 pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
598 : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
599 if (!pstr->mbs_allocated)
600 pstr->mbs = (unsigned char *) pstr->raw_mbs;
601 offset = idx;
602 }
603
604 if (__glibc_likely (offset != 0))
605 {
606 /* Should the already checked characters be kept? */
607 if (__glibc_likely (offset < pstr->valid_raw_len))
608 {
609 /* Yes, move them to the front of the buffer. */
610#ifdef RE_ENABLE_I18N
611 if (__glibc_unlikely (pstr->offsets_needed))
612 {
613 Idx low = 0, high = pstr->valid_len, mid;
614 do
615 {
616 mid = (high + low) / 2;
617 if (pstr->offsets[mid] > offset)
618 high = mid;
619 else if (pstr->offsets[mid] < offset)
620 low = mid + 1;
621 else
622 break;
623 }
624 while (low < high);
625 if (pstr->offsets[mid] < offset)
626 ++mid;
627 pstr->tip_context = re_string_context_at (pstr, mid - 1,
628 eflags);
629 /* This can be quite complicated, so handle specially
630 only the common and easy case where the character with
631 different length representation of lower and upper
632 case is present at or after offset. */
633 if (pstr->valid_len > offset
634 && mid == offset && pstr->offsets[mid] == offset)
635 {
636 memmove (pstr->wcs, pstr->wcs + offset,
637 (pstr->valid_len - offset) * sizeof (wint_t));
638 memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
639 pstr->valid_len -= offset;
640 pstr->valid_raw_len -= offset;
641 for (low = 0; low < pstr->valid_len; low++)
642 pstr->offsets[low] = pstr->offsets[low + offset] - offset;
643 }
644 else
645 {
646 /* Otherwise, just find out how long the partial multibyte
647 character at offset is and fill it with WEOF/255. */
648 pstr->len = pstr->raw_len - idx + offset;
649 pstr->stop = pstr->raw_stop - idx + offset;
650 pstr->offsets_needed = 0;
651 while (mid > 0 && pstr->offsets[mid - 1] == offset)
652 --mid;
653 while (mid < pstr->valid_len)
654 if (pstr->wcs[mid] != WEOF)
655 break;
656 else
657 ++mid;
658 if (mid == pstr->valid_len)
659 pstr->valid_len = 0;
660 else
661 {
662 pstr->valid_len = pstr->offsets[mid] - offset;
663 if (pstr->valid_len)
664 {
665 for (low = 0; low < pstr->valid_len; ++low)
666 pstr->wcs[low] = WEOF;
667 memset (pstr->mbs, 255, pstr->valid_len);
668 }
669 }
670 pstr->valid_raw_len = pstr->valid_len;
671 }
672 }
673 else
674#endif
675 {
676 pstr->tip_context = re_string_context_at (pstr, offset - 1,
677 eflags);
678#ifdef RE_ENABLE_I18N
679 if (pstr->mb_cur_max > 1)
680 memmove (pstr->wcs, pstr->wcs + offset,
681 (pstr->valid_len - offset) * sizeof (wint_t));
682#endif /* RE_ENABLE_I18N */
683 if (__glibc_unlikely (pstr->mbs_allocated))
684 memmove (pstr->mbs, pstr->mbs + offset,
685 pstr->valid_len - offset);
686 pstr->valid_len -= offset;
687 pstr->valid_raw_len -= offset;
688 DEBUG_ASSERT (pstr->valid_len > 0);
689 }
690 }
691 else
692 {
693#ifdef RE_ENABLE_I18N
694 /* No, skip all characters until IDX. */
695 Idx prev_valid_len = pstr->valid_len;
696
697 if (__glibc_unlikely (pstr->offsets_needed))
698 {
699 pstr->len = pstr->raw_len - idx + offset;
700 pstr->stop = pstr->raw_stop - idx + offset;
701 pstr->offsets_needed = 0;
702 }
703#endif
704 pstr->valid_len = 0;
705#ifdef RE_ENABLE_I18N
706 if (pstr->mb_cur_max > 1)
707 {
708 Idx wcs_idx;
709 wint_t wc = WEOF;
710
711 if (pstr->is_utf8)
712 {
713 const unsigned char *raw, *p, *end;
714
715 /* Special case UTF-8. Multi-byte chars start with any
716 byte other than 0x80 - 0xbf. */
717 raw = pstr->raw_mbs + pstr->raw_mbs_idx;
718 end = raw + (offset - pstr->mb_cur_max);
719 if (end < pstr->raw_mbs)
720 end = pstr->raw_mbs;
721 p = raw + offset - 1;
722#ifdef _LIBC
723 /* We know the wchar_t encoding is UCS4, so for the simple
724 case, ASCII characters, skip the conversion step. */
725 if (isascii (*p) && __glibc_likely (pstr->trans == NULL))
726 {
727 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
728 /* pstr->valid_len = 0; */
729 wc = (wchar_t) *p;
730 }
731 else
732#endif
733 for (; p >= end; --p)
734 if ((*p & 0xc0) != 0x80)
735 {
736 mbstate_t cur_state;
737 wchar_t wc2;
738 Idx mlen = raw + pstr->len - p;
739 unsigned char buf[6];
740 size_t mbclen;
741
742 const unsigned char *pp = p;
743 if (__glibc_unlikely (pstr->trans != NULL))
744 {
745 int i = mlen < 6 ? mlen : 6;
746 while (--i >= 0)
747 buf[i] = pstr->trans[p[i]];
748 pp = buf;
749 }
750 /* XXX Don't use mbrtowc, we know which conversion
751 to use (UTF-8 -> UCS4). */
752 memset (&cur_state, 0, sizeof (cur_state));
753 mbclen = __mbrtowc (&wc2, (const char *) pp, mlen,
754 &cur_state);
755 if (raw + offset - p <= mbclen
756 && mbclen < (size_t) -2)
757 {
758 memset (&pstr->cur_state, '\0',
759 sizeof (mbstate_t));
760 pstr->valid_len = mbclen - (raw + offset - p);
761 wc = wc2;
762 }
763 break;
764 }
765 }
766
767 if (wc == WEOF)
768 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
769 if (wc == WEOF)
770 pstr->tip_context
771 = re_string_context_at (pstr, prev_valid_len - 1, eflags);
772 else
773 pstr->tip_context = ((__glibc_unlikely (pstr->word_ops_used != 0)
774 && IS_WIDE_WORD_CHAR (wc))
775 ? CONTEXT_WORD
776 : ((IS_WIDE_NEWLINE (wc)
777 && pstr->newline_anchor)
778 ? CONTEXT_NEWLINE : 0));
779 if (__glibc_unlikely (pstr->valid_len))
780 {
781 for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
782 pstr->wcs[wcs_idx] = WEOF;
783 if (pstr->mbs_allocated)
784 memset (pstr->mbs, 255, pstr->valid_len);
785 }
786 pstr->valid_raw_len = pstr->valid_len;
787 }
788 else
789#endif /* RE_ENABLE_I18N */
790 {
791 int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
792 pstr->valid_raw_len = 0;
793 if (pstr->trans)
794 c = pstr->trans[c];
795 pstr->tip_context = (bitset_contain (pstr->word_char, c)
796 ? CONTEXT_WORD
797 : ((IS_NEWLINE (c) && pstr->newline_anchor)
798 ? CONTEXT_NEWLINE : 0));
799 }
800 }
801 if (!__glibc_unlikely (pstr->mbs_allocated))
802 pstr->mbs += offset;
803 }
804 pstr->raw_mbs_idx = idx;
805 pstr->len -= offset;
806 pstr->stop -= offset;
807
808 /* Then build the buffers. */
809#ifdef RE_ENABLE_I18N
810 if (pstr->mb_cur_max > 1)
811 {
812 if (pstr->icase)
813 {
814 reg_errcode_t ret = build_wcs_upper_buffer (pstr);
815 if (__glibc_unlikely (ret != REG_NOERROR))
816 return ret;
817 }
818 else
819 build_wcs_buffer (pstr);
820 }
821 else
822#endif /* RE_ENABLE_I18N */
823 if (__glibc_unlikely (pstr->mbs_allocated))
824 {
825 if (pstr->icase)
826 build_upper_buffer (pstr);
827 else if (pstr->trans != NULL)
828 re_string_translate_buffer (pstr);
829 }
830 else
831 pstr->valid_len = pstr->len;
832
833 pstr->cur_idx = 0;
834 return REG_NOERROR;
835}
836
837static unsigned char
838__attribute__ ((pure))
839re_string_peek_byte_case (const re_string_t *pstr, Idx idx)
840{
841 int ch;
842 Idx off;
843
844 /* Handle the common (easiest) cases first. */
845 if (__glibc_likely (!pstr->mbs_allocated))
846 return re_string_peek_byte (pstr, idx);
847
848#ifdef RE_ENABLE_I18N
849 if (pstr->mb_cur_max > 1
850 && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
851 return re_string_peek_byte (pstr, idx);
852#endif
853
854 off = pstr->cur_idx + idx;
855#ifdef RE_ENABLE_I18N
856 if (pstr->offsets_needed)
857 off = pstr->offsets[off];
858#endif
859
860 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
861
862#ifdef RE_ENABLE_I18N
863 /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
864 this function returns CAPITAL LETTER I instead of first byte of
865 DOTLESS SMALL LETTER I. The latter would confuse the parser,
866 since peek_byte_case doesn't advance cur_idx in any way. */
867 if (pstr->offsets_needed && !isascii (ch))
868 return re_string_peek_byte (pstr, idx);
869#endif
870
871 return ch;
872}
873
874static unsigned char
875re_string_fetch_byte_case (re_string_t *pstr)
876{
877 if (__glibc_likely (!pstr->mbs_allocated))
878 return re_string_fetch_byte (pstr);
879
880#ifdef RE_ENABLE_I18N
881 if (pstr->offsets_needed)
882 {
883 Idx off;
884 int ch;
885
886 /* For tr_TR.UTF-8 [[:islower:]] there is
887 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip
888 in that case the whole multi-byte character and return
889 the original letter. On the other side, with
890 [[: DOTLESS SMALL LETTER I return [[:I, as doing
891 anything else would complicate things too much. */
892
893 if (!re_string_first_byte (pstr, pstr->cur_idx))
894 return re_string_fetch_byte (pstr);
895
896 off = pstr->offsets[pstr->cur_idx];
897 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
898
899 if (! isascii (ch))
900 return re_string_fetch_byte (pstr);
901
902 re_string_skip_bytes (pstr,
903 re_string_char_size_at (pstr, pstr->cur_idx));
904 return ch;
905 }
906#endif
907
908 return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
909}
910
911static void
912re_string_destruct (re_string_t *pstr)
913{
914#ifdef RE_ENABLE_I18N
915 re_free (pstr->wcs);
916 re_free (pstr->offsets);
917#endif /* RE_ENABLE_I18N */
918 if (pstr->mbs_allocated)
919 re_free (pstr->mbs);
920}
921
922/* Return the context at IDX in INPUT. */
923
924static unsigned int
925re_string_context_at (const re_string_t *input, Idx idx, int eflags)
926{
927 int c;
928 if (__glibc_unlikely (idx < 0))
929 /* In this case, we use the value stored in input->tip_context,
930 since we can't know the character in input->mbs[-1] here. */
931 return input->tip_context;
932 if (__glibc_unlikely (idx == input->len))
933 return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
934 : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
935#ifdef RE_ENABLE_I18N
936 if (input->mb_cur_max > 1)
937 {
938 wint_t wc;
939 Idx wc_idx = idx;
940 while(input->wcs[wc_idx] == WEOF)
941 {
942 DEBUG_ASSERT (wc_idx >= 0);
943 --wc_idx;
944 if (wc_idx < 0)
945 return input->tip_context;
946 }
947 wc = input->wcs[wc_idx];
948 if (__glibc_unlikely (input->word_ops_used != 0)
949 && IS_WIDE_WORD_CHAR (wc))
950 return CONTEXT_WORD;
951 return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
952 ? CONTEXT_NEWLINE : 0);
953 }
954 else
955#endif
956 {
957 c = re_string_byte_at (input, idx);
958 if (bitset_contain (input->word_char, c))
959 return CONTEXT_WORD;
960 return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
961 }
962}
963
964/* Functions for set operation. */
965
966static reg_errcode_t
967__attribute_warn_unused_result__
968re_node_set_alloc (re_node_set *set, Idx size)
969{
970 set->alloc = size;
971 set->nelem = 0;
972 set->elems = re_malloc (Idx, size);
973 if (__glibc_unlikely (set->elems == NULL)
974 && (MALLOC_0_IS_NONNULL || size != 0))
975 return REG_ESPACE;
976 return REG_NOERROR;
977}
978
979static reg_errcode_t
980__attribute_warn_unused_result__
981re_node_set_init_1 (re_node_set *set, Idx elem)
982{
983 set->alloc = 1;
984 set->nelem = 1;
985 set->elems = re_malloc (Idx, 1);
986 if (__glibc_unlikely (set->elems == NULL))
987 {
988 set->alloc = set->nelem = 0;
989 return REG_ESPACE;
990 }
991 set->elems[0] = elem;
992 return REG_NOERROR;
993}
994
995static reg_errcode_t
996__attribute_warn_unused_result__
997re_node_set_init_2 (re_node_set *set, Idx elem1, Idx elem2)
998{
999 set->alloc = 2;
1000 set->elems = re_malloc (Idx, 2);
1001 if (__glibc_unlikely (set->elems == NULL))
1002 return REG_ESPACE;
1003 if (elem1 == elem2)
1004 {
1005 set->nelem = 1;
1006 set->elems[0] = elem1;
1007 }
1008 else
1009 {
1010 set->nelem = 2;
1011 if (elem1 < elem2)
1012 {
1013 set->elems[0] = elem1;
1014 set->elems[1] = elem2;
1015 }
1016 else
1017 {
1018 set->elems[0] = elem2;
1019 set->elems[1] = elem1;
1020 }
1021 }
1022 return REG_NOERROR;
1023}
1024
1025static reg_errcode_t
1026__attribute_warn_unused_result__
1027re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
1028{
1029 dest->nelem = src->nelem;
1030 if (src->nelem > 0)
1031 {
1032 dest->alloc = dest->nelem;
1033 dest->elems = re_malloc (Idx, dest->alloc);
1034 if (__glibc_unlikely (dest->elems == NULL))
1035 {
1036 dest->alloc = dest->nelem = 0;
1037 return REG_ESPACE;
1038 }
1039 memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1040 }
1041 else
1042 re_node_set_init_empty (dest);
1043 return REG_NOERROR;
1044}
1045
1046/* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
1047 DEST. Return value indicate the error code or REG_NOERROR if succeeded.
1048 Note: We assume dest->elems is NULL, when dest->alloc is 0. */
1049
1050static reg_errcode_t
1051__attribute_warn_unused_result__
1052re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
1053 const re_node_set *src2)
1054{
1055 Idx i1, i2, is, id, delta, sbase;
1056 if (src1->nelem == 0 || src2->nelem == 0)
1057 return REG_NOERROR;
1058
1059 /* We need dest->nelem + 2 * elems_in_intersection; this is a
1060 conservative estimate. */
1061 if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
1062 {
1063 Idx new_alloc = src1->nelem + src2->nelem + dest->alloc;
1064 Idx *new_elems = re_realloc (dest->elems, Idx, new_alloc);
1065 if (__glibc_unlikely (new_elems == NULL))
1066 return REG_ESPACE;
1067 dest->elems = new_elems;
1068 dest->alloc = new_alloc;
1069 }
1070
1071 /* Find the items in the intersection of SRC1 and SRC2, and copy
1072 into the top of DEST those that are not already in DEST itself. */
1073 sbase = dest->nelem + src1->nelem + src2->nelem;
1074 i1 = src1->nelem - 1;
1075 i2 = src2->nelem - 1;
1076 id = dest->nelem - 1;
1077 for (;;)
1078 {
1079 if (src1->elems[i1] == src2->elems[i2])
1080 {
1081 /* Try to find the item in DEST. Maybe we could binary search? */
1082 while (id >= 0 && dest->elems[id] > src1->elems[i1])
1083 --id;
1084
1085 if (id < 0 || dest->elems[id] != src1->elems[i1])
1086 dest->elems[--sbase] = src1->elems[i1];
1087
1088 if (--i1 < 0 || --i2 < 0)
1089 break;
1090 }
1091
1092 /* Lower the highest of the two items. */
1093 else if (src1->elems[i1] < src2->elems[i2])
1094 {
1095 if (--i2 < 0)
1096 break;
1097 }
1098 else
1099 {
1100 if (--i1 < 0)
1101 break;
1102 }
1103 }
1104
1105 id = dest->nelem - 1;
1106 is = dest->nelem + src1->nelem + src2->nelem - 1;
1107 delta = is - sbase + 1;
1108
1109 /* Now copy. When DELTA becomes zero, the remaining
1110 DEST elements are already in place; this is more or
1111 less the same loop that is in re_node_set_merge. */
1112 dest->nelem += delta;
1113 if (delta > 0 && id >= 0)
1114 for (;;)
1115 {
1116 if (dest->elems[is] > dest->elems[id])
1117 {
1118 /* Copy from the top. */
1119 dest->elems[id + delta--] = dest->elems[is--];
1120 if (delta == 0)
1121 break;
1122 }
1123 else
1124 {
1125 /* Slide from the bottom. */
1126 dest->elems[id + delta] = dest->elems[id];
1127 if (--id < 0)
1128 break;
1129 }
1130 }
1131
1132 /* Copy remaining SRC elements. */
1133 memcpy (dest->elems, dest->elems + sbase, delta * sizeof (Idx));
1134
1135 return REG_NOERROR;
1136}
1137
1138/* Calculate the union set of the sets SRC1 and SRC2. And store it to
1139 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1140
1141static reg_errcode_t
1142__attribute_warn_unused_result__
1143re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1144 const re_node_set *src2)
1145{
1146 Idx i1, i2, id;
1147 if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1148 {
1149 dest->alloc = src1->nelem + src2->nelem;
1150 dest->elems = re_malloc (Idx, dest->alloc);
1151 if (__glibc_unlikely (dest->elems == NULL))
1152 return REG_ESPACE;
1153 }
1154 else
1155 {
1156 if (src1 != NULL && src1->nelem > 0)
1157 return re_node_set_init_copy (dest, src1);
1158 else if (src2 != NULL && src2->nelem > 0)
1159 return re_node_set_init_copy (dest, src2);
1160 else
1161 re_node_set_init_empty (dest);
1162 return REG_NOERROR;
1163 }
1164 for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1165 {
1166 if (src1->elems[i1] > src2->elems[i2])
1167 {
1168 dest->elems[id++] = src2->elems[i2++];
1169 continue;
1170 }
1171 if (src1->elems[i1] == src2->elems[i2])
1172 ++i2;
1173 dest->elems[id++] = src1->elems[i1++];
1174 }
1175 if (i1 < src1->nelem)
1176 {
1177 memcpy (dest->elems + id, src1->elems + i1,
1178 (src1->nelem - i1) * sizeof (Idx));
1179 id += src1->nelem - i1;
1180 }
1181 else if (i2 < src2->nelem)
1182 {
1183 memcpy (dest->elems + id, src2->elems + i2,
1184 (src2->nelem - i2) * sizeof (Idx));
1185 id += src2->nelem - i2;
1186 }
1187 dest->nelem = id;
1188 return REG_NOERROR;
1189}
1190
1191/* Calculate the union set of the sets DEST and SRC. And store it to
1192 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1193
1194static reg_errcode_t
1195__attribute_warn_unused_result__
1196re_node_set_merge (re_node_set *dest, const re_node_set *src)
1197{
1198 Idx is, id, sbase, delta;
1199 if (src == NULL || src->nelem == 0)
1200 return REG_NOERROR;
1201 if (dest->alloc < 2 * src->nelem + dest->nelem)
1202 {
1203 Idx new_alloc = 2 * (src->nelem + dest->alloc);
1204 Idx *new_buffer = re_realloc (dest->elems, Idx, new_alloc);
1205 if (__glibc_unlikely (new_buffer == NULL))
1206 return REG_ESPACE;
1207 dest->elems = new_buffer;
1208 dest->alloc = new_alloc;
1209 }
1210
1211 if (__glibc_unlikely (dest->nelem == 0))
1212 {
1213 dest->nelem = src->nelem;
1214 memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1215 return REG_NOERROR;
1216 }
1217
1218 /* Copy into the top of DEST the items of SRC that are not
1219 found in DEST. Maybe we could binary search in DEST? */
1220 for (sbase = dest->nelem + 2 * src->nelem,
1221 is = src->nelem - 1, id = dest->nelem - 1; is >= 0 && id >= 0; )
1222 {
1223 if (dest->elems[id] == src->elems[is])
1224 is--, id--;
1225 else if (dest->elems[id] < src->elems[is])
1226 dest->elems[--sbase] = src->elems[is--];
1227 else /* if (dest->elems[id] > src->elems[is]) */
1228 --id;
1229 }
1230
1231 if (is >= 0)
1232 {
1233 /* If DEST is exhausted, the remaining items of SRC must be unique. */
1234 sbase -= is + 1;
1235 memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (Idx));
1236 }
1237
1238 id = dest->nelem - 1;
1239 is = dest->nelem + 2 * src->nelem - 1;
1240 delta = is - sbase + 1;
1241 if (delta == 0)
1242 return REG_NOERROR;
1243
1244 /* Now copy. When DELTA becomes zero, the remaining
1245 DEST elements are already in place. */
1246 dest->nelem += delta;
1247 for (;;)
1248 {
1249 if (dest->elems[is] > dest->elems[id])
1250 {
1251 /* Copy from the top. */
1252 dest->elems[id + delta--] = dest->elems[is--];
1253 if (delta == 0)
1254 break;
1255 }
1256 else
1257 {
1258 /* Slide from the bottom. */
1259 dest->elems[id + delta] = dest->elems[id];
1260 if (--id < 0)
1261 {
1262 /* Copy remaining SRC elements. */
1263 memcpy (dest->elems, dest->elems + sbase,
1264 delta * sizeof (Idx));
1265 break;
1266 }
1267 }
1268 }
1269
1270 return REG_NOERROR;
1271}
1272
1273/* Insert the new element ELEM to the re_node_set* SET.
1274 SET should not already have ELEM.
1275 Return true if successful. */
1276
1277static bool
1278__attribute_warn_unused_result__
1279re_node_set_insert (re_node_set *set, Idx elem)
1280{
1281 Idx idx;
1282 /* In case the set is empty. */
1283 if (set->alloc == 0)
1284 return __glibc_likely (re_node_set_init_1 (set, elem) == REG_NOERROR);
1285
1286 if (__glibc_unlikely (set->nelem) == 0)
1287 {
1288 /* We already guaranteed above that set->alloc != 0. */
1289 set->elems[0] = elem;
1290 ++set->nelem;
1291 return true;
1292 }
1293
1294 /* Realloc if we need. */
1295 if (set->alloc == set->nelem)
1296 {
1297 Idx *new_elems;
1298 set->alloc = set->alloc * 2;
1299 new_elems = re_realloc (set->elems, Idx, set->alloc);
1300 if (__glibc_unlikely (new_elems == NULL))
1301 return false;
1302 set->elems = new_elems;
1303 }
1304
1305 /* Move the elements which follows the new element. Test the
1306 first element separately to skip a check in the inner loop. */
1307 if (elem < set->elems[0])
1308 {
1309 for (idx = set->nelem; idx > 0; idx--)
1310 set->elems[idx] = set->elems[idx - 1];
1311 }
1312 else
1313 {
1314 for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1315 set->elems[idx] = set->elems[idx - 1];
1316 }
1317
1318 /* Insert the new element. */
1319 set->elems[idx] = elem;
1320 ++set->nelem;
1321 return true;
1322}
1323
1324/* Insert the new element ELEM to the re_node_set* SET.
1325 SET should not already have any element greater than or equal to ELEM.
1326 Return true if successful. */
1327
1328static bool
1329__attribute_warn_unused_result__
1330re_node_set_insert_last (re_node_set *set, Idx elem)
1331{
1332 /* Realloc if we need. */
1333 if (set->alloc == set->nelem)
1334 {
1335 Idx *new_elems;
1336 set->alloc = (set->alloc + 1) * 2;
1337 new_elems = re_realloc (set->elems, Idx, set->alloc);
1338 if (__glibc_unlikely (new_elems == NULL))
1339 return false;
1340 set->elems = new_elems;
1341 }
1342
1343 /* Insert the new element. */
1344 set->elems[set->nelem++] = elem;
1345 return true;
1346}
1347
1348/* Compare two node sets SET1 and SET2.
1349 Return true if SET1 and SET2 are equivalent. */
1350
1351static bool
1352__attribute__ ((pure))
1353re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
1354{
1355 Idx i;
1356 if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1357 return false;
1358 for (i = set1->nelem ; --i >= 0 ; )
1359 if (set1->elems[i] != set2->elems[i])
1360 return false;
1361 return true;
1362}
1363
1364/* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */
1365
1366static Idx
1367__attribute__ ((pure))
1368re_node_set_contains (const re_node_set *set, Idx elem)
1369{
1370 __re_size_t idx, right, mid;
1371 if (set->nelem <= 0)
1372 return 0;
1373
1374 /* Binary search the element. */
1375 idx = 0;
1376 right = set->nelem - 1;
1377 while (idx < right)
1378 {
1379 mid = (idx + right) / 2;
1380 if (set->elems[mid] < elem)
1381 idx = mid + 1;
1382 else
1383 right = mid;
1384 }
1385 return set->elems[idx] == elem ? idx + 1 : 0;
1386}
1387
1388static void
1389re_node_set_remove_at (re_node_set *set, Idx idx)
1390{
1391 if (idx < 0 || idx >= set->nelem)
1392 return;
1393 --set->nelem;
1394 for (; idx < set->nelem; idx++)
1395 set->elems[idx] = set->elems[idx + 1];
1396}
1397
1398
1399/* Add the token TOKEN to dfa->nodes, and return the index of the token.
1400 Or return -1 if an error occurred. */
1401
1402static Idx
1403re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1404{
1405 if (__glibc_unlikely (dfa->nodes_len >= dfa->nodes_alloc))
1406 {
1407 size_t new_nodes_alloc = dfa->nodes_alloc * 2;
1408 Idx *new_nexts, *new_indices;
1409 re_node_set *new_edests, *new_eclosures;
1410 re_token_t *new_nodes;
1411
1412 /* Avoid overflows in realloc. */
1413 const size_t max_object_size = MAX (sizeof (re_token_t),
1414 MAX (sizeof (re_node_set),
1415 sizeof (Idx)));
1416 if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size)
1417 < new_nodes_alloc))
1418 return -1;
1419
1420 new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
1421 if (__glibc_unlikely (new_nodes == NULL))
1422 return -1;
1423 dfa->nodes = new_nodes;
1424 new_nexts = re_realloc (dfa->nexts, Idx, new_nodes_alloc);
1425 new_indices = re_realloc (dfa->org_indices, Idx, new_nodes_alloc);
1426 new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1427 new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1428 if (__glibc_unlikely (new_nexts == NULL || new_indices == NULL
1429 || new_edests == NULL || new_eclosures == NULL))
1430 {
1431 re_free (new_nexts);
1432 re_free (new_indices);
1433 re_free (new_edests);
1434 re_free (new_eclosures);
1435 return -1;
1436 }
1437 dfa->nexts = new_nexts;
1438 dfa->org_indices = new_indices;
1439 dfa->edests = new_edests;
1440 dfa->eclosures = new_eclosures;
1441 dfa->nodes_alloc = new_nodes_alloc;
1442 }
1443 dfa->nodes[dfa->nodes_len] = token;
1444 dfa->nodes[dfa->nodes_len].constraint = 0;
1445#ifdef RE_ENABLE_I18N
1446 dfa->nodes[dfa->nodes_len].accept_mb =
1447 ((token.type == OP_PERIOD && dfa->mb_cur_max > 1)
1448 || token.type == COMPLEX_BRACKET);
1449#endif
1450 dfa->nexts[dfa->nodes_len] = -1;
1451 re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1452 re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1453 return dfa->nodes_len++;
1454}
1455
1456static re_hashval_t
1457calc_state_hash (const re_node_set *nodes, unsigned int context)
1458{
1459 re_hashval_t hash = nodes->nelem + context;
1460 Idx i;
1461 for (i = 0 ; i < nodes->nelem ; i++)
1462 hash += nodes->elems[i];
1463 return hash;
1464}
1465
1466/* Search for the state whose node_set is equivalent to NODES.
1467 Return the pointer to the state, if we found it in the DFA.
1468 Otherwise create the new one and return it. In case of an error
1469 return NULL and set the error code in ERR.
1470 Note: - We assume NULL as the invalid state, then it is possible that
1471 return value is NULL and ERR is REG_NOERROR.
1472 - We never return non-NULL value in case of any errors, it is for
1473 optimization. */
1474
1475static re_dfastate_t *
1476__attribute_warn_unused_result__
1477re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,
1478 const re_node_set *nodes)
1479{
1480 re_hashval_t hash;
1481 re_dfastate_t *new_state;
1482 struct re_state_table_entry *spot;
1483 Idx i;
1484#if defined GCC_LINT || defined lint
1485 /* Suppress bogus uninitialized-variable warnings. */
1486 *err = REG_NOERROR;
1487#endif
1488 if (__glibc_unlikely (nodes->nelem == 0))
1489 {
1490 *err = REG_NOERROR;
1491 return NULL;
1492 }
1493 hash = calc_state_hash (nodes, 0);
1494 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1495
1496 for (i = 0 ; i < spot->num ; i++)
1497 {
1498 re_dfastate_t *state = spot->array[i];
1499 if (hash != state->hash)
1500 continue;
1501 if (re_node_set_compare (&state->nodes, nodes))
1502 return state;
1503 }
1504
1505 /* There are no appropriate state in the dfa, create the new one. */
1506 new_state = create_ci_newstate (dfa, nodes, hash);
1507 if (__glibc_unlikely (new_state == NULL))
1508 *err = REG_ESPACE;
1509
1510 return new_state;
1511}
1512
1513/* Search for the state whose node_set is equivalent to NODES and
1514 whose context is equivalent to CONTEXT.
1515 Return the pointer to the state, if we found it in the DFA.
1516 Otherwise create the new one and return it. In case of an error
1517 return NULL and set the error code in ERR.
1518 Note: - We assume NULL as the invalid state, then it is possible that
1519 return value is NULL and ERR is REG_NOERROR.
1520 - We never return non-NULL value in case of any errors, it is for
1521 optimization. */
1522
1523static re_dfastate_t *
1524__attribute_warn_unused_result__
1525re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,
1526 const re_node_set *nodes, unsigned int context)
1527{
1528 re_hashval_t hash;
1529 re_dfastate_t *new_state;
1530 struct re_state_table_entry *spot;
1531 Idx i;
1532#if defined GCC_LINT || defined lint
1533 /* Suppress bogus uninitialized-variable warnings. */
1534 *err = REG_NOERROR;
1535#endif
1536 if (nodes->nelem == 0)
1537 {
1538 *err = REG_NOERROR;
1539 return NULL;
1540 }
1541 hash = calc_state_hash (nodes, context);
1542 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1543
1544 for (i = 0 ; i < spot->num ; i++)
1545 {
1546 re_dfastate_t *state = spot->array[i];
1547 if (state->hash == hash
1548 && state->context == context
1549 && re_node_set_compare (state->entrance_nodes, nodes))
1550 return state;
1551 }
1552 /* There are no appropriate state in 'dfa', create the new one. */
1553 new_state = create_cd_newstate (dfa, nodes, context, hash);
1554 if (__glibc_unlikely (new_state == NULL))
1555 *err = REG_ESPACE;
1556
1557 return new_state;
1558}
1559
1560/* Finish initialization of the new state NEWSTATE, and using its hash value
1561 HASH put in the appropriate bucket of DFA's state table. Return value
1562 indicates the error code if failed. */
1563
1564static reg_errcode_t
1565__attribute_warn_unused_result__
1566register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,
1567 re_hashval_t hash)
1568{
1569 struct re_state_table_entry *spot;
1570 reg_errcode_t err;
1571 Idx i;
1572
1573 newstate->hash = hash;
1574 err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1575 if (__glibc_unlikely (err != REG_NOERROR))
1576 return REG_ESPACE;
1577 for (i = 0; i < newstate->nodes.nelem; i++)
1578 {
1579 Idx elem = newstate->nodes.elems[i];
1580 if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1581 if (! re_node_set_insert_last (&newstate->non_eps_nodes, elem))
1582 return REG_ESPACE;
1583 }
1584
1585 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1586 if (__glibc_unlikely (spot->alloc <= spot->num))
1587 {
1588 Idx new_alloc = 2 * spot->num + 2;
1589 re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1590 new_alloc);
1591 if (__glibc_unlikely (new_array == NULL))
1592 return REG_ESPACE;
1593 spot->array = new_array;
1594 spot->alloc = new_alloc;
1595 }
1596 spot->array[spot->num++] = newstate;
1597 return REG_NOERROR;
1598}
1599
1600static void
1601free_state (re_dfastate_t *state)
1602{
1603 re_node_set_free (&state->non_eps_nodes);
1604 re_node_set_free (&state->inveclosure);
1605 if (state->entrance_nodes != &state->nodes)
1606 {
1607 re_node_set_free (state->entrance_nodes);
1608 re_free (state->entrance_nodes);
1609 }
1610 re_node_set_free (&state->nodes);
1611 re_free (state->word_trtable);
1612 re_free (state->trtable);
1613 re_free (state);
1614}
1615
1616/* Create the new state which is independent of contexts.
1617 Return the new state if succeeded, otherwise return NULL. */
1618
1619static re_dfastate_t *
1620__attribute_warn_unused_result__
1621create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1622 re_hashval_t hash)
1623{
1624 Idx i;
1625 reg_errcode_t err;
1626 re_dfastate_t *newstate;
1627
1628 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1629 if (__glibc_unlikely (newstate == NULL))
1630 return NULL;
1631 err = re_node_set_init_copy (&newstate->nodes, nodes);
1632 if (__glibc_unlikely (err != REG_NOERROR))
1633 {
1634 re_free (newstate);
1635 return NULL;
1636 }
1637
1638 newstate->entrance_nodes = &newstate->nodes;
1639 for (i = 0 ; i < nodes->nelem ; i++)
1640 {
1641 re_token_t *node = dfa->nodes + nodes->elems[i];
1642 re_token_type_t type = node->type;
1643 if (type == CHARACTER && !node->constraint)
1644 continue;
1645#ifdef RE_ENABLE_I18N
1646 newstate->accept_mb |= node->accept_mb;
1647#endif /* RE_ENABLE_I18N */
1648
1649 /* If the state has the halt node, the state is a halt state. */
1650 if (type == END_OF_RE)
1651 newstate->halt = 1;
1652 else if (type == OP_BACK_REF)
1653 newstate->has_backref = 1;
1654 else if (type == ANCHOR || node->constraint)
1655 newstate->has_constraint = 1;
1656 }
1657 err = register_state (dfa, newstate, hash);
1658 if (__glibc_unlikely (err != REG_NOERROR))
1659 {
1660 free_state (newstate);
1661 newstate = NULL;
1662 }
1663 return newstate;
1664}
1665
1666/* Create the new state which is depend on the context CONTEXT.
1667 Return the new state if succeeded, otherwise return NULL. */
1668
1669static re_dfastate_t *
1670__attribute_warn_unused_result__
1671create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1672 unsigned int context, re_hashval_t hash)
1673{
1674 Idx i, nctx_nodes = 0;
1675 reg_errcode_t err;
1676 re_dfastate_t *newstate;
1677
1678 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1679 if (__glibc_unlikely (newstate == NULL))
1680 return NULL;
1681 err = re_node_set_init_copy (&newstate->nodes, nodes);
1682 if (__glibc_unlikely (err != REG_NOERROR))
1683 {
1684 re_free (newstate);
1685 return NULL;
1686 }
1687
1688 newstate->context = context;
1689 newstate->entrance_nodes = &newstate->nodes;
1690
1691 for (i = 0 ; i < nodes->nelem ; i++)
1692 {
1693 re_token_t *node = dfa->nodes + nodes->elems[i];
1694 re_token_type_t type = node->type;
1695 unsigned int constraint = node->constraint;
1696
1697 if (type == CHARACTER && !constraint)
1698 continue;
1699#ifdef RE_ENABLE_I18N
1700 newstate->accept_mb |= node->accept_mb;
1701#endif /* RE_ENABLE_I18N */
1702
1703 /* If the state has the halt node, the state is a halt state. */
1704 if (type == END_OF_RE)
1705 newstate->halt = 1;
1706 else if (type == OP_BACK_REF)
1707 newstate->has_backref = 1;
1708
1709 if (constraint)
1710 {
1711 if (newstate->entrance_nodes == &newstate->nodes)
1712 {
1713 re_node_set *entrance_nodes = re_malloc (re_node_set, 1);
1714 if (__glibc_unlikely (entrance_nodes == NULL))
1715 {
1716 free_state (newstate);
1717 return NULL;
1718 }
1719 newstate->entrance_nodes = entrance_nodes;
1720 if (re_node_set_init_copy (newstate->entrance_nodes, nodes)
1721 != REG_NOERROR)
1722 {
1723 free_state (newstate);
1724 return NULL;
1725 }
1726 nctx_nodes = 0;
1727 newstate->has_constraint = 1;
1728 }
1729
1730 if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1731 {
1732 re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1733 ++nctx_nodes;
1734 }
1735 }
1736 }
1737 err = register_state (dfa, newstate, hash);
1738 if (__glibc_unlikely (err != REG_NOERROR))
1739 {
1740 free_state (newstate);
1741 newstate = NULL;
1742 }
1743 return newstate;
1744}
1745