1/* Extended regular expression matching and search library.
2 Copyright (C) 2002-2019 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <https://www.gnu.org/licenses/>. */
19
20static void re_string_construct_common (const char *str, Idx len,
21 re_string_t *pstr,
22 RE_TRANSLATE_TYPE trans, bool icase,
23 const re_dfa_t *dfa);
24static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
25 const re_node_set *nodes,
26 re_hashval_t hash);
27static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
28 const re_node_set *nodes,
29 unsigned int context,
30 re_hashval_t hash);
31static reg_errcode_t re_string_realloc_buffers (re_string_t *pstr,
32 Idx new_buf_len);
33#ifdef RE_ENABLE_I18N
34static void build_wcs_buffer (re_string_t *pstr);
35static reg_errcode_t build_wcs_upper_buffer (re_string_t *pstr);
36#endif /* RE_ENABLE_I18N */
37static void build_upper_buffer (re_string_t *pstr);
38static void re_string_translate_buffer (re_string_t *pstr);
39static unsigned int re_string_context_at (const re_string_t *input, Idx idx,
40 int eflags) __attribute__ ((pure));
41
42/* Functions for string operation. */
43
44/* This function allocate the buffers. It is necessary to call
45 re_string_reconstruct before using the object. */
46
47static reg_errcode_t
48__attribute_warn_unused_result__
49re_string_allocate (re_string_t *pstr, const char *str, Idx len, Idx init_len,
50 RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
51{
52 reg_errcode_t ret;
53 Idx init_buf_len;
54
55 /* Ensure at least one character fits into the buffers. */
56 if (init_len < dfa->mb_cur_max)
57 init_len = dfa->mb_cur_max;
58 init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
59 re_string_construct_common (str, len, pstr, trans, icase, dfa);
60
61 ret = re_string_realloc_buffers (pstr, init_buf_len);
62 if (__glibc_unlikely (ret != REG_NOERROR))
63 return ret;
64
65 pstr->word_char = dfa->word_char;
66 pstr->word_ops_used = dfa->word_ops_used;
67 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
68 pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
69 pstr->valid_raw_len = pstr->valid_len;
70 return REG_NOERROR;
71}
72
73/* This function allocate the buffers, and initialize them. */
74
75static reg_errcode_t
76__attribute_warn_unused_result__
77re_string_construct (re_string_t *pstr, const char *str, Idx len,
78 RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
79{
80 reg_errcode_t ret;
81 memset (pstr, '\0', sizeof (re_string_t));
82 re_string_construct_common (str, len, pstr, trans, icase, dfa);
83
84 if (len > 0)
85 {
86 ret = re_string_realloc_buffers (pstr, len + 1);
87 if (__glibc_unlikely (ret != REG_NOERROR))
88 return ret;
89 }
90 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
91
92 if (icase)
93 {
94#ifdef RE_ENABLE_I18N
95 if (dfa->mb_cur_max > 1)
96 {
97 while (1)
98 {
99 ret = build_wcs_upper_buffer (pstr);
100 if (__glibc_unlikely (ret != REG_NOERROR))
101 return ret;
102 if (pstr->valid_raw_len >= len)
103 break;
104 if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
105 break;
106 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
107 if (__glibc_unlikely (ret != REG_NOERROR))
108 return ret;
109 }
110 }
111 else
112#endif /* RE_ENABLE_I18N */
113 build_upper_buffer (pstr);
114 }
115 else
116 {
117#ifdef RE_ENABLE_I18N
118 if (dfa->mb_cur_max > 1)
119 build_wcs_buffer (pstr);
120 else
121#endif /* RE_ENABLE_I18N */
122 {
123 if (trans != NULL)
124 re_string_translate_buffer (pstr);
125 else
126 {
127 pstr->valid_len = pstr->bufs_len;
128 pstr->valid_raw_len = pstr->bufs_len;
129 }
130 }
131 }
132
133 return REG_NOERROR;
134}
135
136/* Helper functions for re_string_allocate, and re_string_construct. */
137
138static reg_errcode_t
139__attribute_warn_unused_result__
140re_string_realloc_buffers (re_string_t *pstr, Idx new_buf_len)
141{
142#ifdef RE_ENABLE_I18N
143 if (pstr->mb_cur_max > 1)
144 {
145 wint_t *new_wcs;
146
147 /* Avoid overflow in realloc. */
148 const size_t max_object_size = MAX (sizeof (wint_t), sizeof (Idx));
149 if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size)
150 < new_buf_len))
151 return REG_ESPACE;
152
153 new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
154 if (__glibc_unlikely (new_wcs == NULL))
155 return REG_ESPACE;
156 pstr->wcs = new_wcs;
157 if (pstr->offsets != NULL)
158 {
159 Idx *new_offsets = re_realloc (pstr->offsets, Idx, new_buf_len);
160 if (__glibc_unlikely (new_offsets == NULL))
161 return REG_ESPACE;
162 pstr->offsets = new_offsets;
163 }
164 }
165#endif /* RE_ENABLE_I18N */
166 if (pstr->mbs_allocated)
167 {
168 unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
169 new_buf_len);
170 if (__glibc_unlikely (new_mbs == NULL))
171 return REG_ESPACE;
172 pstr->mbs = new_mbs;
173 }
174 pstr->bufs_len = new_buf_len;
175 return REG_NOERROR;
176}
177
178
179static void
180re_string_construct_common (const char *str, Idx len, re_string_t *pstr,
181 RE_TRANSLATE_TYPE trans, bool icase,
182 const re_dfa_t *dfa)
183{
184 pstr->raw_mbs = (const unsigned char *) str;
185 pstr->len = len;
186 pstr->raw_len = len;
187 pstr->trans = trans;
188 pstr->icase = icase;
189 pstr->mbs_allocated = (trans != NULL || icase);
190 pstr->mb_cur_max = dfa->mb_cur_max;
191 pstr->is_utf8 = dfa->is_utf8;
192 pstr->map_notascii = dfa->map_notascii;
193 pstr->stop = pstr->len;
194 pstr->raw_stop = pstr->stop;
195}
196
197#ifdef RE_ENABLE_I18N
198
199/* Build wide character buffer PSTR->WCS.
200 If the byte sequence of the string are:
201 <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
202 Then wide character buffer will be:
203 <wc1> , WEOF , <wc2> , WEOF , <wc3>
204 We use WEOF for padding, they indicate that the position isn't
205 a first byte of a multibyte character.
206
207 Note that this function assumes PSTR->VALID_LEN elements are already
208 built and starts from PSTR->VALID_LEN. */
209
210static void
211build_wcs_buffer (re_string_t *pstr)
212{
213#ifdef _LIBC
214 unsigned char buf[MB_LEN_MAX];
215 assert (MB_LEN_MAX >= pstr->mb_cur_max);
216#else
217 unsigned char buf[64];
218#endif
219 mbstate_t prev_st;
220 Idx byte_idx, end_idx, remain_len;
221 size_t mbclen;
222
223 /* Build the buffers from pstr->valid_len to either pstr->len or
224 pstr->bufs_len. */
225 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
226 for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
227 {
228 wchar_t wc;
229 const char *p;
230
231 remain_len = end_idx - byte_idx;
232 prev_st = pstr->cur_state;
233 /* Apply the translation if we need. */
234 if (__glibc_unlikely (pstr->trans != NULL))
235 {
236 int i, ch;
237
238 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
239 {
240 ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
241 buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
242 }
243 p = (const char *) buf;
244 }
245 else
246 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
247 mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
248 if (__glibc_unlikely (mbclen == (size_t) -1 || mbclen == 0
249 || (mbclen == (size_t) -2
250 && pstr->bufs_len >= pstr->len)))
251 {
252 /* We treat these cases as a singlebyte character. */
253 mbclen = 1;
254 wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
255 if (__glibc_unlikely (pstr->trans != NULL))
256 wc = pstr->trans[wc];
257 pstr->cur_state = prev_st;
258 }
259 else if (__glibc_unlikely (mbclen == (size_t) -2))
260 {
261 /* The buffer doesn't have enough space, finish to build. */
262 pstr->cur_state = prev_st;
263 break;
264 }
265
266 /* Write wide character and padding. */
267 pstr->wcs[byte_idx++] = wc;
268 /* Write paddings. */
269 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
270 pstr->wcs[byte_idx++] = WEOF;
271 }
272 pstr->valid_len = byte_idx;
273 pstr->valid_raw_len = byte_idx;
274}
275
276/* Build wide character buffer PSTR->WCS like build_wcs_buffer,
277 but for REG_ICASE. */
278
279static reg_errcode_t
280__attribute_warn_unused_result__
281build_wcs_upper_buffer (re_string_t *pstr)
282{
283 mbstate_t prev_st;
284 Idx src_idx, byte_idx, end_idx, remain_len;
285 size_t mbclen;
286#ifdef _LIBC
287 char buf[MB_LEN_MAX];
288 assert (MB_LEN_MAX >= pstr->mb_cur_max);
289#else
290 char buf[64];
291#endif
292
293 byte_idx = pstr->valid_len;
294 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
295
296 /* The following optimization assumes that ASCII characters can be
297 mapped to wide characters with a simple cast. */
298 if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
299 {
300 while (byte_idx < end_idx)
301 {
302 wchar_t wc;
303
304 if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
305 && mbsinit (&pstr->cur_state))
306 {
307 /* In case of a singlebyte character. */
308 pstr->mbs[byte_idx]
309 = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
310 /* The next step uses the assumption that wchar_t is encoded
311 ASCII-safe: all ASCII values can be converted like this. */
312 pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
313 ++byte_idx;
314 continue;
315 }
316
317 remain_len = end_idx - byte_idx;
318 prev_st = pstr->cur_state;
319 mbclen = __mbrtowc (&wc,
320 ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
321 + byte_idx), remain_len, &pstr->cur_state);
322 if (__glibc_likely (0 < mbclen && mbclen < (size_t) -2))
323 {
324 wchar_t wcu = __towupper (wc);
325 if (wcu != wc)
326 {
327 size_t mbcdlen;
328
329 mbcdlen = __wcrtomb (buf, wcu, &prev_st);
330 if (__glibc_likely (mbclen == mbcdlen))
331 memcpy (pstr->mbs + byte_idx, buf, mbclen);
332 else
333 {
334 src_idx = byte_idx;
335 goto offsets_needed;
336 }
337 }
338 else
339 memcpy (pstr->mbs + byte_idx,
340 pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
341 pstr->wcs[byte_idx++] = wcu;
342 /* Write paddings. */
343 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
344 pstr->wcs[byte_idx++] = WEOF;
345 }
346 else if (mbclen == (size_t) -1 || mbclen == 0
347 || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
348 {
349 /* It is an invalid character, an incomplete character
350 at the end of the string, or '\0'. Just use the byte. */
351 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
352 pstr->mbs[byte_idx] = ch;
353 /* And also cast it to wide char. */
354 pstr->wcs[byte_idx++] = (wchar_t) ch;
355 if (__glibc_unlikely (mbclen == (size_t) -1))
356 pstr->cur_state = prev_st;
357 }
358 else
359 {
360 /* The buffer doesn't have enough space, finish to build. */
361 pstr->cur_state = prev_st;
362 break;
363 }
364 }
365 pstr->valid_len = byte_idx;
366 pstr->valid_raw_len = byte_idx;
367 return REG_NOERROR;
368 }
369 else
370 for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
371 {
372 wchar_t wc;
373 const char *p;
374 offsets_needed:
375 remain_len = end_idx - byte_idx;
376 prev_st = pstr->cur_state;
377 if (__glibc_unlikely (pstr->trans != NULL))
378 {
379 int i, ch;
380
381 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
382 {
383 ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
384 buf[i] = pstr->trans[ch];
385 }
386 p = (const char *) buf;
387 }
388 else
389 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
390 mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
391 if (__glibc_likely (0 < mbclen && mbclen < (size_t) -2))
392 {
393 wchar_t wcu = __towupper (wc);
394 if (wcu != wc)
395 {
396 size_t mbcdlen;
397
398 mbcdlen = __wcrtomb ((char *) buf, wcu, &prev_st);
399 if (__glibc_likely (mbclen == mbcdlen))
400 memcpy (pstr->mbs + byte_idx, buf, mbclen);
401 else if (mbcdlen != (size_t) -1)
402 {
403 size_t i;
404
405 if (byte_idx + mbcdlen > pstr->bufs_len)
406 {
407 pstr->cur_state = prev_st;
408 break;
409 }
410
411 if (pstr->offsets == NULL)
412 {
413 pstr->offsets = re_malloc (Idx, pstr->bufs_len);
414
415 if (pstr->offsets == NULL)
416 return REG_ESPACE;
417 }
418 if (!pstr->offsets_needed)
419 {
420 for (i = 0; i < (size_t) byte_idx; ++i)
421 pstr->offsets[i] = i;
422 pstr->offsets_needed = 1;
423 }
424
425 memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
426 pstr->wcs[byte_idx] = wcu;
427 pstr->offsets[byte_idx] = src_idx;
428 for (i = 1; i < mbcdlen; ++i)
429 {
430 pstr->offsets[byte_idx + i]
431 = src_idx + (i < mbclen ? i : mbclen - 1);
432 pstr->wcs[byte_idx + i] = WEOF;
433 }
434 pstr->len += mbcdlen - mbclen;
435 if (pstr->raw_stop > src_idx)
436 pstr->stop += mbcdlen - mbclen;
437 end_idx = (pstr->bufs_len > pstr->len)
438 ? pstr->len : pstr->bufs_len;
439 byte_idx += mbcdlen;
440 src_idx += mbclen;
441 continue;
442 }
443 else
444 memcpy (pstr->mbs + byte_idx, p, mbclen);
445 }
446 else
447 memcpy (pstr->mbs + byte_idx, p, mbclen);
448
449 if (__glibc_unlikely (pstr->offsets_needed != 0))
450 {
451 size_t i;
452 for (i = 0; i < mbclen; ++i)
453 pstr->offsets[byte_idx + i] = src_idx + i;
454 }
455 src_idx += mbclen;
456
457 pstr->wcs[byte_idx++] = wcu;
458 /* Write paddings. */
459 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
460 pstr->wcs[byte_idx++] = WEOF;
461 }
462 else if (mbclen == (size_t) -1 || mbclen == 0
463 || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
464 {
465 /* It is an invalid character or '\0'. Just use the byte. */
466 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
467
468 if (__glibc_unlikely (pstr->trans != NULL))
469 ch = pstr->trans [ch];
470 pstr->mbs[byte_idx] = ch;
471
472 if (__glibc_unlikely (pstr->offsets_needed != 0))
473 pstr->offsets[byte_idx] = src_idx;
474 ++src_idx;
475
476 /* And also cast it to wide char. */
477 pstr->wcs[byte_idx++] = (wchar_t) ch;
478 if (__glibc_unlikely (mbclen == (size_t) -1))
479 pstr->cur_state = prev_st;
480 }
481 else
482 {
483 /* The buffer doesn't have enough space, finish to build. */
484 pstr->cur_state = prev_st;
485 break;
486 }
487 }
488 pstr->valid_len = byte_idx;
489 pstr->valid_raw_len = src_idx;
490 return REG_NOERROR;
491}
492
493/* Skip characters until the index becomes greater than NEW_RAW_IDX.
494 Return the index. */
495
496static Idx
497re_string_skip_chars (re_string_t *pstr, Idx new_raw_idx, wint_t *last_wc)
498{
499 mbstate_t prev_st;
500 Idx rawbuf_idx;
501 size_t mbclen;
502 wint_t wc = WEOF;
503
504 /* Skip the characters which are not necessary to check. */
505 for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
506 rawbuf_idx < new_raw_idx;)
507 {
508 wchar_t wc2;
509 Idx remain_len = pstr->raw_len - rawbuf_idx;
510 prev_st = pstr->cur_state;
511 mbclen = __mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx,
512 remain_len, &pstr->cur_state);
513 if (__glibc_unlikely (mbclen == (size_t) -2 || mbclen == (size_t) -1
514 || mbclen == 0))
515 {
516 /* We treat these cases as a single byte character. */
517 if (mbclen == 0 || remain_len == 0)
518 wc = L'\0';
519 else
520 wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx);
521 mbclen = 1;
522 pstr->cur_state = prev_st;
523 }
524 else
525 wc = wc2;
526 /* Then proceed the next character. */
527 rawbuf_idx += mbclen;
528 }
529 *last_wc = wc;
530 return rawbuf_idx;
531}
532#endif /* RE_ENABLE_I18N */
533
534/* Build the buffer PSTR->MBS, and apply the translation if we need.
535 This function is used in case of REG_ICASE. */
536
537static void
538build_upper_buffer (re_string_t *pstr)
539{
540 Idx char_idx, end_idx;
541 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
542
543 for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
544 {
545 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
546 if (__glibc_unlikely (pstr->trans != NULL))
547 ch = pstr->trans[ch];
548 pstr->mbs[char_idx] = toupper (ch);
549 }
550 pstr->valid_len = char_idx;
551 pstr->valid_raw_len = char_idx;
552}
553
554/* Apply TRANS to the buffer in PSTR. */
555
556static void
557re_string_translate_buffer (re_string_t *pstr)
558{
559 Idx buf_idx, end_idx;
560 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
561
562 for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
563 {
564 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
565 pstr->mbs[buf_idx] = pstr->trans[ch];
566 }
567
568 pstr->valid_len = buf_idx;
569 pstr->valid_raw_len = buf_idx;
570}
571
572/* This function re-construct the buffers.
573 Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
574 convert to upper case in case of REG_ICASE, apply translation. */
575
576static reg_errcode_t
577__attribute_warn_unused_result__
578re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags)
579{
580 Idx offset;
581
582 if (__glibc_unlikely (pstr->raw_mbs_idx <= idx))
583 offset = idx - pstr->raw_mbs_idx;
584 else
585 {
586 /* Reset buffer. */
587#ifdef RE_ENABLE_I18N
588 if (pstr->mb_cur_max > 1)
589 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
590#endif /* RE_ENABLE_I18N */
591 pstr->len = pstr->raw_len;
592 pstr->stop = pstr->raw_stop;
593 pstr->valid_len = 0;
594 pstr->raw_mbs_idx = 0;
595 pstr->valid_raw_len = 0;
596 pstr->offsets_needed = 0;
597 pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
598 : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
599 if (!pstr->mbs_allocated)
600 pstr->mbs = (unsigned char *) pstr->raw_mbs;
601 offset = idx;
602 }
603
604 if (__glibc_likely (offset != 0))
605 {
606 /* Should the already checked characters be kept? */
607 if (__glibc_likely (offset < pstr->valid_raw_len))
608 {
609 /* Yes, move them to the front of the buffer. */
610#ifdef RE_ENABLE_I18N
611 if (__glibc_unlikely (pstr->offsets_needed))
612 {
613 Idx low = 0, high = pstr->valid_len, mid;
614 do
615 {
616 mid = (high + low) / 2;
617 if (pstr->offsets[mid] > offset)
618 high = mid;
619 else if (pstr->offsets[mid] < offset)
620 low = mid + 1;
621 else
622 break;
623 }
624 while (low < high);
625 if (pstr->offsets[mid] < offset)
626 ++mid;
627 pstr->tip_context = re_string_context_at (pstr, mid - 1,
628 eflags);
629 /* This can be quite complicated, so handle specially
630 only the common and easy case where the character with
631 different length representation of lower and upper
632 case is present at or after offset. */
633 if (pstr->valid_len > offset
634 && mid == offset && pstr->offsets[mid] == offset)
635 {
636 memmove (pstr->wcs, pstr->wcs + offset,
637 (pstr->valid_len - offset) * sizeof (wint_t));
638 memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
639 pstr->valid_len -= offset;
640 pstr->valid_raw_len -= offset;
641 for (low = 0; low < pstr->valid_len; low++)
642 pstr->offsets[low] = pstr->offsets[low + offset] - offset;
643 }
644 else
645 {
646 /* Otherwise, just find out how long the partial multibyte
647 character at offset is and fill it with WEOF/255. */
648 pstr->len = pstr->raw_len - idx + offset;
649 pstr->stop = pstr->raw_stop - idx + offset;
650 pstr->offsets_needed = 0;
651 while (mid > 0 && pstr->offsets[mid - 1] == offset)
652 --mid;
653 while (mid < pstr->valid_len)
654 if (pstr->wcs[mid] != WEOF)
655 break;
656 else
657 ++mid;
658 if (mid == pstr->valid_len)
659 pstr->valid_len = 0;
660 else
661 {
662 pstr->valid_len = pstr->offsets[mid] - offset;
663 if (pstr->valid_len)
664 {
665 for (low = 0; low < pstr->valid_len; ++low)
666 pstr->wcs[low] = WEOF;
667 memset (pstr->mbs, 255, pstr->valid_len);
668 }
669 }
670 pstr->valid_raw_len = pstr->valid_len;
671 }
672 }
673 else
674#endif
675 {
676 pstr->tip_context = re_string_context_at (pstr, offset - 1,
677 eflags);
678#ifdef RE_ENABLE_I18N
679 if (pstr->mb_cur_max > 1)
680 memmove (pstr->wcs, pstr->wcs + offset,
681 (pstr->valid_len - offset) * sizeof (wint_t));
682#endif /* RE_ENABLE_I18N */
683 if (__glibc_unlikely (pstr->mbs_allocated))
684 memmove (pstr->mbs, pstr->mbs + offset,
685 pstr->valid_len - offset);
686 pstr->valid_len -= offset;
687 pstr->valid_raw_len -= offset;
688#if defined DEBUG && DEBUG
689 assert (pstr->valid_len > 0);
690#endif
691 }
692 }
693 else
694 {
695#ifdef RE_ENABLE_I18N
696 /* No, skip all characters until IDX. */
697 Idx prev_valid_len = pstr->valid_len;
698
699 if (__glibc_unlikely (pstr->offsets_needed))
700 {
701 pstr->len = pstr->raw_len - idx + offset;
702 pstr->stop = pstr->raw_stop - idx + offset;
703 pstr->offsets_needed = 0;
704 }
705#endif
706 pstr->valid_len = 0;
707#ifdef RE_ENABLE_I18N
708 if (pstr->mb_cur_max > 1)
709 {
710 Idx wcs_idx;
711 wint_t wc = WEOF;
712
713 if (pstr->is_utf8)
714 {
715 const unsigned char *raw, *p, *end;
716
717 /* Special case UTF-8. Multi-byte chars start with any
718 byte other than 0x80 - 0xbf. */
719 raw = pstr->raw_mbs + pstr->raw_mbs_idx;
720 end = raw + (offset - pstr->mb_cur_max);
721 if (end < pstr->raw_mbs)
722 end = pstr->raw_mbs;
723 p = raw + offset - 1;
724#ifdef _LIBC
725 /* We know the wchar_t encoding is UCS4, so for the simple
726 case, ASCII characters, skip the conversion step. */
727 if (isascii (*p) && __glibc_likely (pstr->trans == NULL))
728 {
729 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
730 /* pstr->valid_len = 0; */
731 wc = (wchar_t) *p;
732 }
733 else
734#endif
735 for (; p >= end; --p)
736 if ((*p & 0xc0) != 0x80)
737 {
738 mbstate_t cur_state;
739 wchar_t wc2;
740 Idx mlen = raw + pstr->len - p;
741 unsigned char buf[6];
742 size_t mbclen;
743
744 const unsigned char *pp = p;
745 if (__glibc_unlikely (pstr->trans != NULL))
746 {
747 int i = mlen < 6 ? mlen : 6;
748 while (--i >= 0)
749 buf[i] = pstr->trans[p[i]];
750 pp = buf;
751 }
752 /* XXX Don't use mbrtowc, we know which conversion
753 to use (UTF-8 -> UCS4). */
754 memset (&cur_state, 0, sizeof (cur_state));
755 mbclen = __mbrtowc (&wc2, (const char *) pp, mlen,
756 &cur_state);
757 if (raw + offset - p <= mbclen
758 && mbclen < (size_t) -2)
759 {
760 memset (&pstr->cur_state, '\0',
761 sizeof (mbstate_t));
762 pstr->valid_len = mbclen - (raw + offset - p);
763 wc = wc2;
764 }
765 break;
766 }
767 }
768
769 if (wc == WEOF)
770 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
771 if (wc == WEOF)
772 pstr->tip_context
773 = re_string_context_at (pstr, prev_valid_len - 1, eflags);
774 else
775 pstr->tip_context = ((__glibc_unlikely (pstr->word_ops_used != 0)
776 && IS_WIDE_WORD_CHAR (wc))
777 ? CONTEXT_WORD
778 : ((IS_WIDE_NEWLINE (wc)
779 && pstr->newline_anchor)
780 ? CONTEXT_NEWLINE : 0));
781 if (__glibc_unlikely (pstr->valid_len))
782 {
783 for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
784 pstr->wcs[wcs_idx] = WEOF;
785 if (pstr->mbs_allocated)
786 memset (pstr->mbs, 255, pstr->valid_len);
787 }
788 pstr->valid_raw_len = pstr->valid_len;
789 }
790 else
791#endif /* RE_ENABLE_I18N */
792 {
793 int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
794 pstr->valid_raw_len = 0;
795 if (pstr->trans)
796 c = pstr->trans[c];
797 pstr->tip_context = (bitset_contain (pstr->word_char, c)
798 ? CONTEXT_WORD
799 : ((IS_NEWLINE (c) && pstr->newline_anchor)
800 ? CONTEXT_NEWLINE : 0));
801 }
802 }
803 if (!__glibc_unlikely (pstr->mbs_allocated))
804 pstr->mbs += offset;
805 }
806 pstr->raw_mbs_idx = idx;
807 pstr->len -= offset;
808 pstr->stop -= offset;
809
810 /* Then build the buffers. */
811#ifdef RE_ENABLE_I18N
812 if (pstr->mb_cur_max > 1)
813 {
814 if (pstr->icase)
815 {
816 reg_errcode_t ret = build_wcs_upper_buffer (pstr);
817 if (__glibc_unlikely (ret != REG_NOERROR))
818 return ret;
819 }
820 else
821 build_wcs_buffer (pstr);
822 }
823 else
824#endif /* RE_ENABLE_I18N */
825 if (__glibc_unlikely (pstr->mbs_allocated))
826 {
827 if (pstr->icase)
828 build_upper_buffer (pstr);
829 else if (pstr->trans != NULL)
830 re_string_translate_buffer (pstr);
831 }
832 else
833 pstr->valid_len = pstr->len;
834
835 pstr->cur_idx = 0;
836 return REG_NOERROR;
837}
838
839static unsigned char
840__attribute__ ((pure))
841re_string_peek_byte_case (const re_string_t *pstr, Idx idx)
842{
843 int ch;
844 Idx off;
845
846 /* Handle the common (easiest) cases first. */
847 if (__glibc_likely (!pstr->mbs_allocated))
848 return re_string_peek_byte (pstr, idx);
849
850#ifdef RE_ENABLE_I18N
851 if (pstr->mb_cur_max > 1
852 && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
853 return re_string_peek_byte (pstr, idx);
854#endif
855
856 off = pstr->cur_idx + idx;
857#ifdef RE_ENABLE_I18N
858 if (pstr->offsets_needed)
859 off = pstr->offsets[off];
860#endif
861
862 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
863
864#ifdef RE_ENABLE_I18N
865 /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
866 this function returns CAPITAL LETTER I instead of first byte of
867 DOTLESS SMALL LETTER I. The latter would confuse the parser,
868 since peek_byte_case doesn't advance cur_idx in any way. */
869 if (pstr->offsets_needed && !isascii (ch))
870 return re_string_peek_byte (pstr, idx);
871#endif
872
873 return ch;
874}
875
876static unsigned char
877re_string_fetch_byte_case (re_string_t *pstr)
878{
879 if (__glibc_likely (!pstr->mbs_allocated))
880 return re_string_fetch_byte (pstr);
881
882#ifdef RE_ENABLE_I18N
883 if (pstr->offsets_needed)
884 {
885 Idx off;
886 int ch;
887
888 /* For tr_TR.UTF-8 [[:islower:]] there is
889 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip
890 in that case the whole multi-byte character and return
891 the original letter. On the other side, with
892 [[: DOTLESS SMALL LETTER I return [[:I, as doing
893 anything else would complicate things too much. */
894
895 if (!re_string_first_byte (pstr, pstr->cur_idx))
896 return re_string_fetch_byte (pstr);
897
898 off = pstr->offsets[pstr->cur_idx];
899 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
900
901 if (! isascii (ch))
902 return re_string_fetch_byte (pstr);
903
904 re_string_skip_bytes (pstr,
905 re_string_char_size_at (pstr, pstr->cur_idx));
906 return ch;
907 }
908#endif
909
910 return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
911}
912
913static void
914re_string_destruct (re_string_t *pstr)
915{
916#ifdef RE_ENABLE_I18N
917 re_free (pstr->wcs);
918 re_free (pstr->offsets);
919#endif /* RE_ENABLE_I18N */
920 if (pstr->mbs_allocated)
921 re_free (pstr->mbs);
922}
923
924/* Return the context at IDX in INPUT. */
925
926static unsigned int
927re_string_context_at (const re_string_t *input, Idx idx, int eflags)
928{
929 int c;
930 if (__glibc_unlikely (idx < 0))
931 /* In this case, we use the value stored in input->tip_context,
932 since we can't know the character in input->mbs[-1] here. */
933 return input->tip_context;
934 if (__glibc_unlikely (idx == input->len))
935 return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
936 : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
937#ifdef RE_ENABLE_I18N
938 if (input->mb_cur_max > 1)
939 {
940 wint_t wc;
941 Idx wc_idx = idx;
942 while(input->wcs[wc_idx] == WEOF)
943 {
944#if defined DEBUG && DEBUG
945 /* It must not happen. */
946 assert (wc_idx >= 0);
947#endif
948 --wc_idx;
949 if (wc_idx < 0)
950 return input->tip_context;
951 }
952 wc = input->wcs[wc_idx];
953 if (__glibc_unlikely (input->word_ops_used != 0)
954 && IS_WIDE_WORD_CHAR (wc))
955 return CONTEXT_WORD;
956 return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
957 ? CONTEXT_NEWLINE : 0);
958 }
959 else
960#endif
961 {
962 c = re_string_byte_at (input, idx);
963 if (bitset_contain (input->word_char, c))
964 return CONTEXT_WORD;
965 return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
966 }
967}
968
969/* Functions for set operation. */
970
971static reg_errcode_t
972__attribute_warn_unused_result__
973re_node_set_alloc (re_node_set *set, Idx size)
974{
975 set->alloc = size;
976 set->nelem = 0;
977 set->elems = re_malloc (Idx, size);
978 if (__glibc_unlikely (set->elems == NULL)
979 && (MALLOC_0_IS_NONNULL || size != 0))
980 return REG_ESPACE;
981 return REG_NOERROR;
982}
983
984static reg_errcode_t
985__attribute_warn_unused_result__
986re_node_set_init_1 (re_node_set *set, Idx elem)
987{
988 set->alloc = 1;
989 set->nelem = 1;
990 set->elems = re_malloc (Idx, 1);
991 if (__glibc_unlikely (set->elems == NULL))
992 {
993 set->alloc = set->nelem = 0;
994 return REG_ESPACE;
995 }
996 set->elems[0] = elem;
997 return REG_NOERROR;
998}
999
1000static reg_errcode_t
1001__attribute_warn_unused_result__
1002re_node_set_init_2 (re_node_set *set, Idx elem1, Idx elem2)
1003{
1004 set->alloc = 2;
1005 set->elems = re_malloc (Idx, 2);
1006 if (__glibc_unlikely (set->elems == NULL))
1007 return REG_ESPACE;
1008 if (elem1 == elem2)
1009 {
1010 set->nelem = 1;
1011 set->elems[0] = elem1;
1012 }
1013 else
1014 {
1015 set->nelem = 2;
1016 if (elem1 < elem2)
1017 {
1018 set->elems[0] = elem1;
1019 set->elems[1] = elem2;
1020 }
1021 else
1022 {
1023 set->elems[0] = elem2;
1024 set->elems[1] = elem1;
1025 }
1026 }
1027 return REG_NOERROR;
1028}
1029
1030static reg_errcode_t
1031__attribute_warn_unused_result__
1032re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
1033{
1034 dest->nelem = src->nelem;
1035 if (src->nelem > 0)
1036 {
1037 dest->alloc = dest->nelem;
1038 dest->elems = re_malloc (Idx, dest->alloc);
1039 if (__glibc_unlikely (dest->elems == NULL))
1040 {
1041 dest->alloc = dest->nelem = 0;
1042 return REG_ESPACE;
1043 }
1044 memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1045 }
1046 else
1047 re_node_set_init_empty (dest);
1048 return REG_NOERROR;
1049}
1050
1051/* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
1052 DEST. Return value indicate the error code or REG_NOERROR if succeeded.
1053 Note: We assume dest->elems is NULL, when dest->alloc is 0. */
1054
1055static reg_errcode_t
1056__attribute_warn_unused_result__
1057re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
1058 const re_node_set *src2)
1059{
1060 Idx i1, i2, is, id, delta, sbase;
1061 if (src1->nelem == 0 || src2->nelem == 0)
1062 return REG_NOERROR;
1063
1064 /* We need dest->nelem + 2 * elems_in_intersection; this is a
1065 conservative estimate. */
1066 if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
1067 {
1068 Idx new_alloc = src1->nelem + src2->nelem + dest->alloc;
1069 Idx *new_elems = re_realloc (dest->elems, Idx, new_alloc);
1070 if (__glibc_unlikely (new_elems == NULL))
1071 return REG_ESPACE;
1072 dest->elems = new_elems;
1073 dest->alloc = new_alloc;
1074 }
1075
1076 /* Find the items in the intersection of SRC1 and SRC2, and copy
1077 into the top of DEST those that are not already in DEST itself. */
1078 sbase = dest->nelem + src1->nelem + src2->nelem;
1079 i1 = src1->nelem - 1;
1080 i2 = src2->nelem - 1;
1081 id = dest->nelem - 1;
1082 for (;;)
1083 {
1084 if (src1->elems[i1] == src2->elems[i2])
1085 {
1086 /* Try to find the item in DEST. Maybe we could binary search? */
1087 while (id >= 0 && dest->elems[id] > src1->elems[i1])
1088 --id;
1089
1090 if (id < 0 || dest->elems[id] != src1->elems[i1])
1091 dest->elems[--sbase] = src1->elems[i1];
1092
1093 if (--i1 < 0 || --i2 < 0)
1094 break;
1095 }
1096
1097 /* Lower the highest of the two items. */
1098 else if (src1->elems[i1] < src2->elems[i2])
1099 {
1100 if (--i2 < 0)
1101 break;
1102 }
1103 else
1104 {
1105 if (--i1 < 0)
1106 break;
1107 }
1108 }
1109
1110 id = dest->nelem - 1;
1111 is = dest->nelem + src1->nelem + src2->nelem - 1;
1112 delta = is - sbase + 1;
1113
1114 /* Now copy. When DELTA becomes zero, the remaining
1115 DEST elements are already in place; this is more or
1116 less the same loop that is in re_node_set_merge. */
1117 dest->nelem += delta;
1118 if (delta > 0 && id >= 0)
1119 for (;;)
1120 {
1121 if (dest->elems[is] > dest->elems[id])
1122 {
1123 /* Copy from the top. */
1124 dest->elems[id + delta--] = dest->elems[is--];
1125 if (delta == 0)
1126 break;
1127 }
1128 else
1129 {
1130 /* Slide from the bottom. */
1131 dest->elems[id + delta] = dest->elems[id];
1132 if (--id < 0)
1133 break;
1134 }
1135 }
1136
1137 /* Copy remaining SRC elements. */
1138 memcpy (dest->elems, dest->elems + sbase, delta * sizeof (Idx));
1139
1140 return REG_NOERROR;
1141}
1142
1143/* Calculate the union set of the sets SRC1 and SRC2. And store it to
1144 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1145
1146static reg_errcode_t
1147__attribute_warn_unused_result__
1148re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1149 const re_node_set *src2)
1150{
1151 Idx i1, i2, id;
1152 if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1153 {
1154 dest->alloc = src1->nelem + src2->nelem;
1155 dest->elems = re_malloc (Idx, dest->alloc);
1156 if (__glibc_unlikely (dest->elems == NULL))
1157 return REG_ESPACE;
1158 }
1159 else
1160 {
1161 if (src1 != NULL && src1->nelem > 0)
1162 return re_node_set_init_copy (dest, src1);
1163 else if (src2 != NULL && src2->nelem > 0)
1164 return re_node_set_init_copy (dest, src2);
1165 else
1166 re_node_set_init_empty (dest);
1167 return REG_NOERROR;
1168 }
1169 for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1170 {
1171 if (src1->elems[i1] > src2->elems[i2])
1172 {
1173 dest->elems[id++] = src2->elems[i2++];
1174 continue;
1175 }
1176 if (src1->elems[i1] == src2->elems[i2])
1177 ++i2;
1178 dest->elems[id++] = src1->elems[i1++];
1179 }
1180 if (i1 < src1->nelem)
1181 {
1182 memcpy (dest->elems + id, src1->elems + i1,
1183 (src1->nelem - i1) * sizeof (Idx));
1184 id += src1->nelem - i1;
1185 }
1186 else if (i2 < src2->nelem)
1187 {
1188 memcpy (dest->elems + id, src2->elems + i2,
1189 (src2->nelem - i2) * sizeof (Idx));
1190 id += src2->nelem - i2;
1191 }
1192 dest->nelem = id;
1193 return REG_NOERROR;
1194}
1195
1196/* Calculate the union set of the sets DEST and SRC. And store it to
1197 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1198
1199static reg_errcode_t
1200__attribute_warn_unused_result__
1201re_node_set_merge (re_node_set *dest, const re_node_set *src)
1202{
1203 Idx is, id, sbase, delta;
1204 if (src == NULL || src->nelem == 0)
1205 return REG_NOERROR;
1206 if (dest->alloc < 2 * src->nelem + dest->nelem)
1207 {
1208 Idx new_alloc = 2 * (src->nelem + dest->alloc);
1209 Idx *new_buffer = re_realloc (dest->elems, Idx, new_alloc);
1210 if (__glibc_unlikely (new_buffer == NULL))
1211 return REG_ESPACE;
1212 dest->elems = new_buffer;
1213 dest->alloc = new_alloc;
1214 }
1215
1216 if (__glibc_unlikely (dest->nelem == 0))
1217 {
1218 dest->nelem = src->nelem;
1219 memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1220 return REG_NOERROR;
1221 }
1222
1223 /* Copy into the top of DEST the items of SRC that are not
1224 found in DEST. Maybe we could binary search in DEST? */
1225 for (sbase = dest->nelem + 2 * src->nelem,
1226 is = src->nelem - 1, id = dest->nelem - 1; is >= 0 && id >= 0; )
1227 {
1228 if (dest->elems[id] == src->elems[is])
1229 is--, id--;
1230 else if (dest->elems[id] < src->elems[is])
1231 dest->elems[--sbase] = src->elems[is--];
1232 else /* if (dest->elems[id] > src->elems[is]) */
1233 --id;
1234 }
1235
1236 if (is >= 0)
1237 {
1238 /* If DEST is exhausted, the remaining items of SRC must be unique. */
1239 sbase -= is + 1;
1240 memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (Idx));
1241 }
1242
1243 id = dest->nelem - 1;
1244 is = dest->nelem + 2 * src->nelem - 1;
1245 delta = is - sbase + 1;
1246 if (delta == 0)
1247 return REG_NOERROR;
1248
1249 /* Now copy. When DELTA becomes zero, the remaining
1250 DEST elements are already in place. */
1251 dest->nelem += delta;
1252 for (;;)
1253 {
1254 if (dest->elems[is] > dest->elems[id])
1255 {
1256 /* Copy from the top. */
1257 dest->elems[id + delta--] = dest->elems[is--];
1258 if (delta == 0)
1259 break;
1260 }
1261 else
1262 {
1263 /* Slide from the bottom. */
1264 dest->elems[id + delta] = dest->elems[id];
1265 if (--id < 0)
1266 {
1267 /* Copy remaining SRC elements. */
1268 memcpy (dest->elems, dest->elems + sbase,
1269 delta * sizeof (Idx));
1270 break;
1271 }
1272 }
1273 }
1274
1275 return REG_NOERROR;
1276}
1277
1278/* Insert the new element ELEM to the re_node_set* SET.
1279 SET should not already have ELEM.
1280 Return true if successful. */
1281
1282static bool
1283__attribute_warn_unused_result__
1284re_node_set_insert (re_node_set *set, Idx elem)
1285{
1286 Idx idx;
1287 /* In case the set is empty. */
1288 if (set->alloc == 0)
1289 return __glibc_likely (re_node_set_init_1 (set, elem) == REG_NOERROR);
1290
1291 if (__glibc_unlikely (set->nelem) == 0)
1292 {
1293 /* We already guaranteed above that set->alloc != 0. */
1294 set->elems[0] = elem;
1295 ++set->nelem;
1296 return true;
1297 }
1298
1299 /* Realloc if we need. */
1300 if (set->alloc == set->nelem)
1301 {
1302 Idx *new_elems;
1303 set->alloc = set->alloc * 2;
1304 new_elems = re_realloc (set->elems, Idx, set->alloc);
1305 if (__glibc_unlikely (new_elems == NULL))
1306 return false;
1307 set->elems = new_elems;
1308 }
1309
1310 /* Move the elements which follows the new element. Test the
1311 first element separately to skip a check in the inner loop. */
1312 if (elem < set->elems[0])
1313 {
1314 idx = 0;
1315 for (idx = set->nelem; idx > 0; idx--)
1316 set->elems[idx] = set->elems[idx - 1];
1317 }
1318 else
1319 {
1320 for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1321 set->elems[idx] = set->elems[idx - 1];
1322 }
1323
1324 /* Insert the new element. */
1325 set->elems[idx] = elem;
1326 ++set->nelem;
1327 return true;
1328}
1329
1330/* Insert the new element ELEM to the re_node_set* SET.
1331 SET should not already have any element greater than or equal to ELEM.
1332 Return true if successful. */
1333
1334static bool
1335__attribute_warn_unused_result__
1336re_node_set_insert_last (re_node_set *set, Idx elem)
1337{
1338 /* Realloc if we need. */
1339 if (set->alloc == set->nelem)
1340 {
1341 Idx *new_elems;
1342 set->alloc = (set->alloc + 1) * 2;
1343 new_elems = re_realloc (set->elems, Idx, set->alloc);
1344 if (__glibc_unlikely (new_elems == NULL))
1345 return false;
1346 set->elems = new_elems;
1347 }
1348
1349 /* Insert the new element. */
1350 set->elems[set->nelem++] = elem;
1351 return true;
1352}
1353
1354/* Compare two node sets SET1 and SET2.
1355 Return true if SET1 and SET2 are equivalent. */
1356
1357static bool
1358__attribute__ ((pure))
1359re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
1360{
1361 Idx i;
1362 if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1363 return false;
1364 for (i = set1->nelem ; --i >= 0 ; )
1365 if (set1->elems[i] != set2->elems[i])
1366 return false;
1367 return true;
1368}
1369
1370/* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */
1371
1372static Idx
1373__attribute__ ((pure))
1374re_node_set_contains (const re_node_set *set, Idx elem)
1375{
1376 __re_size_t idx, right, mid;
1377 if (set->nelem <= 0)
1378 return 0;
1379
1380 /* Binary search the element. */
1381 idx = 0;
1382 right = set->nelem - 1;
1383 while (idx < right)
1384 {
1385 mid = (idx + right) / 2;
1386 if (set->elems[mid] < elem)
1387 idx = mid + 1;
1388 else
1389 right = mid;
1390 }
1391 return set->elems[idx] == elem ? idx + 1 : 0;
1392}
1393
1394static void
1395re_node_set_remove_at (re_node_set *set, Idx idx)
1396{
1397 if (idx < 0 || idx >= set->nelem)
1398 return;
1399 --set->nelem;
1400 for (; idx < set->nelem; idx++)
1401 set->elems[idx] = set->elems[idx + 1];
1402}
1403
1404
1405/* Add the token TOKEN to dfa->nodes, and return the index of the token.
1406 Or return -1 if an error occurred. */
1407
1408static Idx
1409re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1410{
1411 if (__glibc_unlikely (dfa->nodes_len >= dfa->nodes_alloc))
1412 {
1413 size_t new_nodes_alloc = dfa->nodes_alloc * 2;
1414 Idx *new_nexts, *new_indices;
1415 re_node_set *new_edests, *new_eclosures;
1416 re_token_t *new_nodes;
1417
1418 /* Avoid overflows in realloc. */
1419 const size_t max_object_size = MAX (sizeof (re_token_t),
1420 MAX (sizeof (re_node_set),
1421 sizeof (Idx)));
1422 if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size)
1423 < new_nodes_alloc))
1424 return -1;
1425
1426 new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
1427 if (__glibc_unlikely (new_nodes == NULL))
1428 return -1;
1429 dfa->nodes = new_nodes;
1430 new_nexts = re_realloc (dfa->nexts, Idx, new_nodes_alloc);
1431 new_indices = re_realloc (dfa->org_indices, Idx, new_nodes_alloc);
1432 new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1433 new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1434 if (__glibc_unlikely (new_nexts == NULL || new_indices == NULL
1435 || new_edests == NULL || new_eclosures == NULL))
1436 {
1437 re_free (new_nexts);
1438 re_free (new_indices);
1439 re_free (new_edests);
1440 re_free (new_eclosures);
1441 return -1;
1442 }
1443 dfa->nexts = new_nexts;
1444 dfa->org_indices = new_indices;
1445 dfa->edests = new_edests;
1446 dfa->eclosures = new_eclosures;
1447 dfa->nodes_alloc = new_nodes_alloc;
1448 }
1449 dfa->nodes[dfa->nodes_len] = token;
1450 dfa->nodes[dfa->nodes_len].constraint = 0;
1451#ifdef RE_ENABLE_I18N
1452 dfa->nodes[dfa->nodes_len].accept_mb =
1453 ((token.type == OP_PERIOD && dfa->mb_cur_max > 1)
1454 || token.type == COMPLEX_BRACKET);
1455#endif
1456 dfa->nexts[dfa->nodes_len] = -1;
1457 re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1458 re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1459 return dfa->nodes_len++;
1460}
1461
1462static re_hashval_t
1463calc_state_hash (const re_node_set *nodes, unsigned int context)
1464{
1465 re_hashval_t hash = nodes->nelem + context;
1466 Idx i;
1467 for (i = 0 ; i < nodes->nelem ; i++)
1468 hash += nodes->elems[i];
1469 return hash;
1470}
1471
1472/* Search for the state whose node_set is equivalent to NODES.
1473 Return the pointer to the state, if we found it in the DFA.
1474 Otherwise create the new one and return it. In case of an error
1475 return NULL and set the error code in ERR.
1476 Note: - We assume NULL as the invalid state, then it is possible that
1477 return value is NULL and ERR is REG_NOERROR.
1478 - We never return non-NULL value in case of any errors, it is for
1479 optimization. */
1480
1481static re_dfastate_t *
1482__attribute_warn_unused_result__
1483re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,
1484 const re_node_set *nodes)
1485{
1486 re_hashval_t hash;
1487 re_dfastate_t *new_state;
1488 struct re_state_table_entry *spot;
1489 Idx i;
1490#if defined GCC_LINT || defined lint
1491 /* Suppress bogus uninitialized-variable warnings. */
1492 *err = REG_NOERROR;
1493#endif
1494 if (__glibc_unlikely (nodes->nelem == 0))
1495 {
1496 *err = REG_NOERROR;
1497 return NULL;
1498 }
1499 hash = calc_state_hash (nodes, 0);
1500 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1501
1502 for (i = 0 ; i < spot->num ; i++)
1503 {
1504 re_dfastate_t *state = spot->array[i];
1505 if (hash != state->hash)
1506 continue;
1507 if (re_node_set_compare (&state->nodes, nodes))
1508 return state;
1509 }
1510
1511 /* There are no appropriate state in the dfa, create the new one. */
1512 new_state = create_ci_newstate (dfa, nodes, hash);
1513 if (__glibc_unlikely (new_state == NULL))
1514 *err = REG_ESPACE;
1515
1516 return new_state;
1517}
1518
1519/* Search for the state whose node_set is equivalent to NODES and
1520 whose context is equivalent to CONTEXT.
1521 Return the pointer to the state, if we found it in the DFA.
1522 Otherwise create the new one and return it. In case of an error
1523 return NULL and set the error code in ERR.
1524 Note: - We assume NULL as the invalid state, then it is possible that
1525 return value is NULL and ERR is REG_NOERROR.
1526 - We never return non-NULL value in case of any errors, it is for
1527 optimization. */
1528
1529static re_dfastate_t *
1530__attribute_warn_unused_result__
1531re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,
1532 const re_node_set *nodes, unsigned int context)
1533{
1534 re_hashval_t hash;
1535 re_dfastate_t *new_state;
1536 struct re_state_table_entry *spot;
1537 Idx i;
1538#if defined GCC_LINT || defined lint
1539 /* Suppress bogus uninitialized-variable warnings. */
1540 *err = REG_NOERROR;
1541#endif
1542 if (nodes->nelem == 0)
1543 {
1544 *err = REG_NOERROR;
1545 return NULL;
1546 }
1547 hash = calc_state_hash (nodes, context);
1548 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1549
1550 for (i = 0 ; i < spot->num ; i++)
1551 {
1552 re_dfastate_t *state = spot->array[i];
1553 if (state->hash == hash
1554 && state->context == context
1555 && re_node_set_compare (state->entrance_nodes, nodes))
1556 return state;
1557 }
1558 /* There are no appropriate state in 'dfa', create the new one. */
1559 new_state = create_cd_newstate (dfa, nodes, context, hash);
1560 if (__glibc_unlikely (new_state == NULL))
1561 *err = REG_ESPACE;
1562
1563 return new_state;
1564}
1565
1566/* Finish initialization of the new state NEWSTATE, and using its hash value
1567 HASH put in the appropriate bucket of DFA's state table. Return value
1568 indicates the error code if failed. */
1569
1570static reg_errcode_t
1571__attribute_warn_unused_result__
1572register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,
1573 re_hashval_t hash)
1574{
1575 struct re_state_table_entry *spot;
1576 reg_errcode_t err;
1577 Idx i;
1578
1579 newstate->hash = hash;
1580 err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1581 if (__glibc_unlikely (err != REG_NOERROR))
1582 return REG_ESPACE;
1583 for (i = 0; i < newstate->nodes.nelem; i++)
1584 {
1585 Idx elem = newstate->nodes.elems[i];
1586 if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1587 if (! re_node_set_insert_last (&newstate->non_eps_nodes, elem))
1588 return REG_ESPACE;
1589 }
1590
1591 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1592 if (__glibc_unlikely (spot->alloc <= spot->num))
1593 {
1594 Idx new_alloc = 2 * spot->num + 2;
1595 re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1596 new_alloc);
1597 if (__glibc_unlikely (new_array == NULL))
1598 return REG_ESPACE;
1599 spot->array = new_array;
1600 spot->alloc = new_alloc;
1601 }
1602 spot->array[spot->num++] = newstate;
1603 return REG_NOERROR;
1604}
1605
1606static void
1607free_state (re_dfastate_t *state)
1608{
1609 re_node_set_free (&state->non_eps_nodes);
1610 re_node_set_free (&state->inveclosure);
1611 if (state->entrance_nodes != &state->nodes)
1612 {
1613 re_node_set_free (state->entrance_nodes);
1614 re_free (state->entrance_nodes);
1615 }
1616 re_node_set_free (&state->nodes);
1617 re_free (state->word_trtable);
1618 re_free (state->trtable);
1619 re_free (state);
1620}
1621
1622/* Create the new state which is independent of contexts.
1623 Return the new state if succeeded, otherwise return NULL. */
1624
1625static re_dfastate_t *
1626__attribute_warn_unused_result__
1627create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1628 re_hashval_t hash)
1629{
1630 Idx i;
1631 reg_errcode_t err;
1632 re_dfastate_t *newstate;
1633
1634 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1635 if (__glibc_unlikely (newstate == NULL))
1636 return NULL;
1637 err = re_node_set_init_copy (&newstate->nodes, nodes);
1638 if (__glibc_unlikely (err != REG_NOERROR))
1639 {
1640 re_free (newstate);
1641 return NULL;
1642 }
1643
1644 newstate->entrance_nodes = &newstate->nodes;
1645 for (i = 0 ; i < nodes->nelem ; i++)
1646 {
1647 re_token_t *node = dfa->nodes + nodes->elems[i];
1648 re_token_type_t type = node->type;
1649 if (type == CHARACTER && !node->constraint)
1650 continue;
1651#ifdef RE_ENABLE_I18N
1652 newstate->accept_mb |= node->accept_mb;
1653#endif /* RE_ENABLE_I18N */
1654
1655 /* If the state has the halt node, the state is a halt state. */
1656 if (type == END_OF_RE)
1657 newstate->halt = 1;
1658 else if (type == OP_BACK_REF)
1659 newstate->has_backref = 1;
1660 else if (type == ANCHOR || node->constraint)
1661 newstate->has_constraint = 1;
1662 }
1663 err = register_state (dfa, newstate, hash);
1664 if (__glibc_unlikely (err != REG_NOERROR))
1665 {
1666 free_state (newstate);
1667 newstate = NULL;
1668 }
1669 return newstate;
1670}
1671
1672/* Create the new state which is depend on the context CONTEXT.
1673 Return the new state if succeeded, otherwise return NULL. */
1674
1675static re_dfastate_t *
1676__attribute_warn_unused_result__
1677create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1678 unsigned int context, re_hashval_t hash)
1679{
1680 Idx i, nctx_nodes = 0;
1681 reg_errcode_t err;
1682 re_dfastate_t *newstate;
1683
1684 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1685 if (__glibc_unlikely (newstate == NULL))
1686 return NULL;
1687 err = re_node_set_init_copy (&newstate->nodes, nodes);
1688 if (__glibc_unlikely (err != REG_NOERROR))
1689 {
1690 re_free (newstate);
1691 return NULL;
1692 }
1693
1694 newstate->context = context;
1695 newstate->entrance_nodes = &newstate->nodes;
1696
1697 for (i = 0 ; i < nodes->nelem ; i++)
1698 {
1699 re_token_t *node = dfa->nodes + nodes->elems[i];
1700 re_token_type_t type = node->type;
1701 unsigned int constraint = node->constraint;
1702
1703 if (type == CHARACTER && !constraint)
1704 continue;
1705#ifdef RE_ENABLE_I18N
1706 newstate->accept_mb |= node->accept_mb;
1707#endif /* RE_ENABLE_I18N */
1708
1709 /* If the state has the halt node, the state is a halt state. */
1710 if (type == END_OF_RE)
1711 newstate->halt = 1;
1712 else if (type == OP_BACK_REF)
1713 newstate->has_backref = 1;
1714
1715 if (constraint)
1716 {
1717 if (newstate->entrance_nodes == &newstate->nodes)
1718 {
1719 newstate->entrance_nodes = re_malloc (re_node_set, 1);
1720 if (__glibc_unlikely (newstate->entrance_nodes == NULL))
1721 {
1722 free_state (newstate);
1723 return NULL;
1724 }
1725 if (re_node_set_init_copy (newstate->entrance_nodes, nodes)
1726 != REG_NOERROR)
1727 return NULL;
1728 nctx_nodes = 0;
1729 newstate->has_constraint = 1;
1730 }
1731
1732 if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1733 {
1734 re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1735 ++nctx_nodes;
1736 }
1737 }
1738 }
1739 err = register_state (dfa, newstate, hash);
1740 if (__glibc_unlikely (err != REG_NOERROR))
1741 {
1742 free_state (newstate);
1743 newstate = NULL;
1744 }
1745 return newstate;
1746}
1747