1/* Extended regular expression matching and search library.
2 Copyright (C) 2002-2017 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
19
20#include <stdint.h>
21
22static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
23 int n) internal_function;
24static void match_ctx_clean (re_match_context_t *mctx) internal_function;
25static void match_ctx_free (re_match_context_t *cache) internal_function;
26static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node,
27 int str_idx, int from, int to)
28 internal_function;
29static int search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx)
30 internal_function;
31static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, int node,
32 int str_idx) internal_function;
33static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
34 int node, int str_idx)
35 internal_function;
36static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
37 re_dfastate_t **limited_sts, int last_node,
38 int last_str_idx)
39 internal_function;
40static reg_errcode_t re_search_internal (const regex_t *preg,
41 const char *string, int length,
42 int start, int range, int stop,
43 size_t nmatch, regmatch_t pmatch[],
44 int eflags) internal_function;
45static int re_search_2_stub (struct re_pattern_buffer *bufp,
46 const char *string1, int length1,
47 const char *string2, int length2,
48 int start, int range, struct re_registers *regs,
49 int stop, int ret_len) internal_function;
50static int re_search_stub (struct re_pattern_buffer *bufp,
51 const char *string, int length, int start,
52 int range, int stop, struct re_registers *regs,
53 int ret_len) internal_function;
54static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
55 int nregs, int regs_allocated) internal_function;
56static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx)
57 internal_function;
58static int check_matching (re_match_context_t *mctx, int fl_longest_match,
59 int *p_match_first) internal_function;
60static int check_halt_state_context (const re_match_context_t *mctx,
61 const re_dfastate_t *state, int idx)
62 internal_function;
63static void update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
64 regmatch_t *prev_idx_match, int cur_node,
65 int cur_idx, int nmatch) internal_function;
66static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
67 int str_idx, int dest_node, int nregs,
68 regmatch_t *regs,
69 re_node_set *eps_via_nodes)
70 internal_function;
71static reg_errcode_t set_regs (const regex_t *preg,
72 const re_match_context_t *mctx,
73 size_t nmatch, regmatch_t *pmatch,
74 int fl_backtrack) internal_function;
75static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs)
76 internal_function;
77
78#ifdef RE_ENABLE_I18N
79static int sift_states_iter_mb (const re_match_context_t *mctx,
80 re_sift_context_t *sctx,
81 int node_idx, int str_idx, int max_str_idx)
82 internal_function;
83#endif /* RE_ENABLE_I18N */
84static reg_errcode_t sift_states_backward (const re_match_context_t *mctx,
85 re_sift_context_t *sctx)
86 internal_function;
87static reg_errcode_t build_sifted_states (const re_match_context_t *mctx,
88 re_sift_context_t *sctx, int str_idx,
89 re_node_set *cur_dest)
90 internal_function;
91static reg_errcode_t update_cur_sifted_state (const re_match_context_t *mctx,
92 re_sift_context_t *sctx,
93 int str_idx,
94 re_node_set *dest_nodes)
95 internal_function;
96static reg_errcode_t add_epsilon_src_nodes (const re_dfa_t *dfa,
97 re_node_set *dest_nodes,
98 const re_node_set *candidates)
99 internal_function;
100static int check_dst_limits (const re_match_context_t *mctx,
101 re_node_set *limits,
102 int dst_node, int dst_idx, int src_node,
103 int src_idx) internal_function;
104static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx,
105 int boundaries, int subexp_idx,
106 int from_node, int bkref_idx)
107 internal_function;
108static int check_dst_limits_calc_pos (const re_match_context_t *mctx,
109 int limit, int subexp_idx,
110 int node, int str_idx,
111 int bkref_idx) internal_function;
112static reg_errcode_t check_subexp_limits (const re_dfa_t *dfa,
113 re_node_set *dest_nodes,
114 const re_node_set *candidates,
115 re_node_set *limits,
116 struct re_backref_cache_entry *bkref_ents,
117 int str_idx) internal_function;
118static reg_errcode_t sift_states_bkref (const re_match_context_t *mctx,
119 re_sift_context_t *sctx,
120 int str_idx, const re_node_set *candidates)
121 internal_function;
122static reg_errcode_t merge_state_array (const re_dfa_t *dfa,
123 re_dfastate_t **dst,
124 re_dfastate_t **src, int num)
125 internal_function;
126static re_dfastate_t *find_recover_state (reg_errcode_t *err,
127 re_match_context_t *mctx) internal_function;
128static re_dfastate_t *transit_state (reg_errcode_t *err,
129 re_match_context_t *mctx,
130 re_dfastate_t *state) internal_function;
131static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
132 re_match_context_t *mctx,
133 re_dfastate_t *next_state)
134 internal_function;
135static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
136 re_node_set *cur_nodes,
137 int str_idx) internal_function;
138#if 0
139static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
140 re_match_context_t *mctx,
141 re_dfastate_t *pstate)
142 internal_function;
143#endif
144#ifdef RE_ENABLE_I18N
145static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
146 re_dfastate_t *pstate)
147 internal_function;
148#endif /* RE_ENABLE_I18N */
149static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
150 const re_node_set *nodes)
151 internal_function;
152static reg_errcode_t get_subexp (re_match_context_t *mctx,
153 int bkref_node, int bkref_str_idx)
154 internal_function;
155static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
156 const re_sub_match_top_t *sub_top,
157 re_sub_match_last_t *sub_last,
158 int bkref_node, int bkref_str)
159 internal_function;
160static int find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
161 int subexp_idx, int type) internal_function;
162static reg_errcode_t check_arrival (re_match_context_t *mctx,
163 state_array_t *path, int top_node,
164 int top_str, int last_node, int last_str,
165 int type) internal_function;
166static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
167 int str_idx,
168 re_node_set *cur_nodes,
169 re_node_set *next_nodes)
170 internal_function;
171static reg_errcode_t check_arrival_expand_ecl (const re_dfa_t *dfa,
172 re_node_set *cur_nodes,
173 int ex_subexp, int type)
174 internal_function;
175static reg_errcode_t check_arrival_expand_ecl_sub (const re_dfa_t *dfa,
176 re_node_set *dst_nodes,
177 int target, int ex_subexp,
178 int type) internal_function;
179static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
180 re_node_set *cur_nodes, int cur_str,
181 int subexp_num, int type)
182 internal_function;
183static int build_trtable (const re_dfa_t *dfa,
184 re_dfastate_t *state) internal_function;
185#ifdef RE_ENABLE_I18N
186static int check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
187 const re_string_t *input, int idx)
188 internal_function;
189# ifdef _LIBC
190static unsigned int find_collation_sequence_value (const unsigned char *mbs,
191 size_t name_len)
192 internal_function;
193# endif /* _LIBC */
194#endif /* RE_ENABLE_I18N */
195static int group_nodes_into_DFAstates (const re_dfa_t *dfa,
196 const re_dfastate_t *state,
197 re_node_set *states_node,
198 bitset_t *states_ch) internal_function;
199static int check_node_accept (const re_match_context_t *mctx,
200 const re_token_t *node, int idx)
201 internal_function;
202static reg_errcode_t extend_buffers (re_match_context_t *mctx, int min_len)
203 internal_function;
204
205/* Entry point for POSIX code. */
206
207/* regexec searches for a given pattern, specified by PREG, in the
208 string STRING.
209
210 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
211 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
212 least NMATCH elements, and we set them to the offsets of the
213 corresponding matched substrings.
214
215 EFLAGS specifies `execution flags' which affect matching: if
216 REG_NOTBOL is set, then ^ does not match at the beginning of the
217 string; if REG_NOTEOL is set, then $ does not match at the end.
218
219 We return 0 if we find a match and REG_NOMATCH if not. */
220
221int
222regexec (const regex_t *__restrict preg, const char *__restrict string,
223 size_t nmatch, regmatch_t pmatch[], int eflags)
224{
225 reg_errcode_t err;
226 int start, length;
227 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
228
229 if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
230 return REG_BADPAT;
231
232 if (eflags & REG_STARTEND)
233 {
234 start = pmatch[0].rm_so;
235 length = pmatch[0].rm_eo;
236 }
237 else
238 {
239 start = 0;
240 length = strlen (string);
241 }
242
243 __libc_lock_lock (dfa->lock);
244 if (preg->no_sub)
245 err = re_search_internal (preg, string, length, start, length - start,
246 length, 0, NULL, eflags);
247 else
248 err = re_search_internal (preg, string, length, start, length - start,
249 length, nmatch, pmatch, eflags);
250 __libc_lock_unlock (dfa->lock);
251 return err != REG_NOERROR;
252}
253
254#ifdef _LIBC
255# include <shlib-compat.h>
256versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
257
258# if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
259__typeof__ (__regexec) __compat_regexec;
260
261int
262attribute_compat_text_section
263__compat_regexec (const regex_t *__restrict preg,
264 const char *__restrict string, size_t nmatch,
265 regmatch_t pmatch[], int eflags)
266{
267 return regexec (preg, string, nmatch, pmatch,
268 eflags & (REG_NOTBOL | REG_NOTEOL));
269}
270compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
271# endif
272#endif
273
274/* Entry points for GNU code. */
275
276/* re_match, re_search, re_match_2, re_search_2
277
278 The former two functions operate on STRING with length LENGTH,
279 while the later two operate on concatenation of STRING1 and STRING2
280 with lengths LENGTH1 and LENGTH2, respectively.
281
282 re_match() matches the compiled pattern in BUFP against the string,
283 starting at index START.
284
285 re_search() first tries matching at index START, then it tries to match
286 starting from index START + 1, and so on. The last start position tried
287 is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same
288 way as re_match().)
289
290 The parameter STOP of re_{match,search}_2 specifies that no match exceeding
291 the first STOP characters of the concatenation of the strings should be
292 concerned.
293
294 If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
295 and all groups is stroed in REGS. (For the "_2" variants, the offsets are
296 computed relative to the concatenation, not relative to the individual
297 strings.)
298
299 On success, re_match* functions return the length of the match, re_search*
300 return the position of the start of the match. Return value -1 means no
301 match was found and -2 indicates an internal error. */
302
303int
304re_match (struct re_pattern_buffer *bufp, const char *string, int length,
305 int start, struct re_registers *regs)
306{
307 return re_search_stub (bufp, string, length, start, 0, length, regs, 1);
308}
309#ifdef _LIBC
310weak_alias (__re_match, re_match)
311#endif
312
313int
314re_search (struct re_pattern_buffer *bufp, const char *string, int length,
315 int start, int range, struct re_registers *regs)
316{
317 return re_search_stub (bufp, string, length, start, range, length, regs, 0);
318}
319#ifdef _LIBC
320weak_alias (__re_search, re_search)
321#endif
322
323int
324re_match_2 (struct re_pattern_buffer *bufp, const char *string1, int length1,
325 const char *string2, int length2, int start,
326 struct re_registers *regs, int stop)
327{
328 return re_search_2_stub (bufp, string1, length1, string2, length2,
329 start, 0, regs, stop, 1);
330}
331#ifdef _LIBC
332weak_alias (__re_match_2, re_match_2)
333#endif
334
335int
336re_search_2 (struct re_pattern_buffer *bufp, const char *string1, int length1,
337 const char *string2, int length2, int start, int range,
338 struct re_registers *regs, int stop)
339{
340 return re_search_2_stub (bufp, string1, length1, string2, length2,
341 start, range, regs, stop, 0);
342}
343#ifdef _LIBC
344weak_alias (__re_search_2, re_search_2)
345#endif
346
347static int
348internal_function
349re_search_2_stub (struct re_pattern_buffer *bufp, const char *string1,
350 int length1, const char *string2, int length2, int start,
351 int range, struct re_registers *regs,
352 int stop, int ret_len)
353{
354 const char *str;
355 int rval;
356 int len = length1 + length2;
357 char *s = NULL;
358
359 if (BE (length1 < 0 || length2 < 0 || stop < 0 || len < length1, 0))
360 return -2;
361
362 /* Concatenate the strings. */
363 if (length2 > 0)
364 if (length1 > 0)
365 {
366 s = re_malloc (char, len);
367
368 if (BE (s == NULL, 0))
369 return -2;
370#ifdef _LIBC
371 memcpy (__mempcpy (s, string1, length1), string2, length2);
372#else
373 memcpy (s, string1, length1);
374 memcpy (s + length1, string2, length2);
375#endif
376 str = s;
377 }
378 else
379 str = string2;
380 else
381 str = string1;
382
383 rval = re_search_stub (bufp, str, len, start, range, stop, regs, ret_len);
384 re_free (s);
385 return rval;
386}
387
388/* The parameters have the same meaning as those of re_search.
389 Additional parameters:
390 If RET_LEN is nonzero the length of the match is returned (re_match style);
391 otherwise the position of the match is returned. */
392
393static int
394internal_function
395re_search_stub (struct re_pattern_buffer *bufp, const char *string, int length,
396 int start, int range, int stop, struct re_registers *regs,
397 int ret_len)
398{
399 reg_errcode_t result;
400 regmatch_t *pmatch;
401 int nregs, rval;
402 int eflags = 0;
403 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
404
405 /* Check for out-of-range. */
406 if (BE (start < 0 || start > length, 0))
407 return -1;
408 if (BE (start + range > length, 0))
409 range = length - start;
410 else if (BE (start + range < 0, 0))
411 range = -start;
412
413 __libc_lock_lock (dfa->lock);
414
415 eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
416 eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
417
418 /* Compile fastmap if we haven't yet. */
419 if (range > 0 && bufp->fastmap != NULL && !bufp->fastmap_accurate)
420 re_compile_fastmap (bufp);
421
422 if (BE (bufp->no_sub, 0))
423 regs = NULL;
424
425 /* We need at least 1 register. */
426 if (regs == NULL)
427 nregs = 1;
428 else if (BE (bufp->regs_allocated == REGS_FIXED &&
429 regs->num_regs < bufp->re_nsub + 1, 0))
430 {
431 nregs = regs->num_regs;
432 if (BE (nregs < 1, 0))
433 {
434 /* Nothing can be copied to regs. */
435 regs = NULL;
436 nregs = 1;
437 }
438 }
439 else
440 nregs = bufp->re_nsub + 1;
441 pmatch = re_malloc (regmatch_t, nregs);
442 if (BE (pmatch == NULL, 0))
443 {
444 rval = -2;
445 goto out;
446 }
447
448 result = re_search_internal (bufp, string, length, start, range, stop,
449 nregs, pmatch, eflags);
450
451 rval = 0;
452
453 /* I hope we needn't fill ther regs with -1's when no match was found. */
454 if (result != REG_NOERROR)
455 rval = -1;
456 else if (regs != NULL)
457 {
458 /* If caller wants register contents data back, copy them. */
459 bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
460 bufp->regs_allocated);
461 if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))
462 rval = -2;
463 }
464
465 if (BE (rval == 0, 1))
466 {
467 if (ret_len)
468 {
469 assert (pmatch[0].rm_so == start);
470 rval = pmatch[0].rm_eo - start;
471 }
472 else
473 rval = pmatch[0].rm_so;
474 }
475 re_free (pmatch);
476 out:
477 __libc_lock_unlock (dfa->lock);
478 return rval;
479}
480
481static unsigned
482internal_function
483re_copy_regs (struct re_registers *regs, regmatch_t *pmatch, int nregs,
484 int regs_allocated)
485{
486 int rval = REGS_REALLOCATE;
487 int i;
488 int need_regs = nregs + 1;
489 /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
490 uses. */
491
492 /* Have the register data arrays been allocated? */
493 if (regs_allocated == REGS_UNALLOCATED)
494 { /* No. So allocate them with malloc. */
495 regs->start = re_malloc (regoff_t, need_regs);
496 if (BE (regs->start == NULL, 0))
497 return REGS_UNALLOCATED;
498 regs->end = re_malloc (regoff_t, need_regs);
499 if (BE (regs->end == NULL, 0))
500 {
501 re_free (regs->start);
502 return REGS_UNALLOCATED;
503 }
504 regs->num_regs = need_regs;
505 }
506 else if (regs_allocated == REGS_REALLOCATE)
507 { /* Yes. If we need more elements than were already
508 allocated, reallocate them. If we need fewer, just
509 leave it alone. */
510 if (BE (need_regs > regs->num_regs, 0))
511 {
512 regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
513 regoff_t *new_end;
514 if (BE (new_start == NULL, 0))
515 return REGS_UNALLOCATED;
516 new_end = re_realloc (regs->end, regoff_t, need_regs);
517 if (BE (new_end == NULL, 0))
518 {
519 re_free (new_start);
520 return REGS_UNALLOCATED;
521 }
522 regs->start = new_start;
523 regs->end = new_end;
524 regs->num_regs = need_regs;
525 }
526 }
527 else
528 {
529 assert (regs_allocated == REGS_FIXED);
530 /* This function may not be called with REGS_FIXED and nregs too big. */
531 assert (regs->num_regs >= nregs);
532 rval = REGS_FIXED;
533 }
534
535 /* Copy the regs. */
536 for (i = 0; i < nregs; ++i)
537 {
538 regs->start[i] = pmatch[i].rm_so;
539 regs->end[i] = pmatch[i].rm_eo;
540 }
541 for ( ; i < regs->num_regs; ++i)
542 regs->start[i] = regs->end[i] = -1;
543
544 return rval;
545}
546
547/* Set REGS to hold NUM_REGS registers, storing them in STARTS and
548 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
549 this memory for recording register information. STARTS and ENDS
550 must be allocated using the malloc library routine, and must each
551 be at least NUM_REGS * sizeof (regoff_t) bytes long.
552
553 If NUM_REGS == 0, then subsequent matches should allocate their own
554 register data.
555
556 Unless this function is called, the first search or match using
557 PATTERN_BUFFER will allocate its own register data, without
558 freeing the old data. */
559
560void
561re_set_registers (struct re_pattern_buffer *bufp, struct re_registers *regs,
562 unsigned num_regs, regoff_t *starts, regoff_t *ends)
563{
564 if (num_regs)
565 {
566 bufp->regs_allocated = REGS_REALLOCATE;
567 regs->num_regs = num_regs;
568 regs->start = starts;
569 regs->end = ends;
570 }
571 else
572 {
573 bufp->regs_allocated = REGS_UNALLOCATED;
574 regs->num_regs = 0;
575 regs->start = regs->end = (regoff_t *) 0;
576 }
577}
578#ifdef _LIBC
579weak_alias (__re_set_registers, re_set_registers)
580#endif
581
582/* Entry points compatible with 4.2 BSD regex library. We don't define
583 them unless specifically requested. */
584
585#if defined _REGEX_RE_COMP || defined _LIBC
586int
587# ifdef _LIBC
588weak_function
589# endif
590re_exec (const char *s)
591{
592 return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
593}
594#endif /* _REGEX_RE_COMP */
595
596/* Internal entry point. */
597
598/* Searches for a compiled pattern PREG in the string STRING, whose
599 length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same
600 mingings with regexec. START, and RANGE have the same meanings
601 with re_search.
602 Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
603 otherwise return the error code.
604 Note: We assume front end functions already check ranges.
605 (START + RANGE >= 0 && START + RANGE <= LENGTH) */
606
607static reg_errcode_t
608__attribute_warn_unused_result__ internal_function
609re_search_internal (const regex_t *preg, const char *string, int length,
610 int start, int range, int stop, size_t nmatch,
611 regmatch_t pmatch[], int eflags)
612{
613 reg_errcode_t err;
614 const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
615 int left_lim, right_lim, incr;
616 int fl_longest_match, match_first, match_kind, match_last = -1;
617 int extra_nmatch;
618 int sb, ch;
619#if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
620 re_match_context_t mctx = { .dfa = dfa };
621#else
622 re_match_context_t mctx;
623#endif
624 char *fastmap = (preg->fastmap != NULL && preg->fastmap_accurate
625 && range && !preg->can_be_null) ? preg->fastmap : NULL;
626 RE_TRANSLATE_TYPE t = preg->translate;
627
628#if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
629 memset (&mctx, '\0', sizeof (re_match_context_t));
630 mctx.dfa = dfa;
631#endif
632
633 extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
634 nmatch -= extra_nmatch;
635
636 /* Check if the DFA haven't been compiled. */
637 if (BE (preg->used == 0 || dfa->init_state == NULL
638 || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
639 || dfa->init_state_begbuf == NULL, 0))
640 return REG_NOMATCH;
641
642#ifdef DEBUG
643 /* We assume front-end functions already check them. */
644 assert (start + range >= 0 && start + range <= length);
645#endif
646
647 /* If initial states with non-begbuf contexts have no elements,
648 the regex must be anchored. If preg->newline_anchor is set,
649 we'll never use init_state_nl, so do not check it. */
650 if (dfa->init_state->nodes.nelem == 0
651 && dfa->init_state_word->nodes.nelem == 0
652 && (dfa->init_state_nl->nodes.nelem == 0
653 || !preg->newline_anchor))
654 {
655 if (start != 0 && start + range != 0)
656 return REG_NOMATCH;
657 start = range = 0;
658 }
659
660 /* We must check the longest matching, if nmatch > 0. */
661 fl_longest_match = (nmatch != 0 || dfa->nbackref);
662
663 err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
664 preg->translate, preg->syntax & RE_ICASE, dfa);
665 if (BE (err != REG_NOERROR, 0))
666 goto free_return;
667 mctx.input.stop = stop;
668 mctx.input.raw_stop = stop;
669 mctx.input.newline_anchor = preg->newline_anchor;
670
671 err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
672 if (BE (err != REG_NOERROR, 0))
673 goto free_return;
674
675 /* We will log all the DFA states through which the dfa pass,
676 if nmatch > 1, or this dfa has "multibyte node", which is a
677 back-reference or a node which can accept multibyte character or
678 multi character collating element. */
679 if (nmatch > 1 || dfa->has_mb_node)
680 {
681 /* Avoid overflow. */
682 if (BE (SIZE_MAX / sizeof (re_dfastate_t *) <= mctx.input.bufs_len, 0))
683 {
684 err = REG_ESPACE;
685 goto free_return;
686 }
687
688 mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
689 if (BE (mctx.state_log == NULL, 0))
690 {
691 err = REG_ESPACE;
692 goto free_return;
693 }
694 }
695 else
696 mctx.state_log = NULL;
697
698 match_first = start;
699 mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
700 : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
701
702 /* Check incrementally whether of not the input string match. */
703 incr = (range < 0) ? -1 : 1;
704 left_lim = (range < 0) ? start + range : start;
705 right_lim = (range < 0) ? start : start + range;
706 sb = dfa->mb_cur_max == 1;
707 match_kind =
708 (fastmap
709 ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
710 | (range >= 0 ? 2 : 0)
711 | (t != NULL ? 1 : 0))
712 : 8);
713
714 for (;; match_first += incr)
715 {
716 err = REG_NOMATCH;
717 if (match_first < left_lim || right_lim < match_first)
718 goto free_return;
719
720 /* Advance as rapidly as possible through the string, until we
721 find a plausible place to start matching. This may be done
722 with varying efficiency, so there are various possibilities:
723 only the most common of them are specialized, in order to
724 save on code size. We use a switch statement for speed. */
725 switch (match_kind)
726 {
727 case 8:
728 /* No fastmap. */
729 break;
730
731 case 7:
732 /* Fastmap with single-byte translation, match forward. */
733 while (BE (match_first < right_lim, 1)
734 && !fastmap[t[(unsigned char) string[match_first]]])
735 ++match_first;
736 goto forward_match_found_start_or_reached_end;
737
738 case 6:
739 /* Fastmap without translation, match forward. */
740 while (BE (match_first < right_lim, 1)
741 && !fastmap[(unsigned char) string[match_first]])
742 ++match_first;
743
744 forward_match_found_start_or_reached_end:
745 if (BE (match_first == right_lim, 0))
746 {
747 ch = match_first >= length
748 ? 0 : (unsigned char) string[match_first];
749 if (!fastmap[t ? t[ch] : ch])
750 goto free_return;
751 }
752 break;
753
754 case 4:
755 case 5:
756 /* Fastmap without multi-byte translation, match backwards. */
757 while (match_first >= left_lim)
758 {
759 ch = match_first >= length
760 ? 0 : (unsigned char) string[match_first];
761 if (fastmap[t ? t[ch] : ch])
762 break;
763 --match_first;
764 }
765 if (match_first < left_lim)
766 goto free_return;
767 break;
768
769 default:
770 /* In this case, we can't determine easily the current byte,
771 since it might be a component byte of a multibyte
772 character. Then we use the constructed buffer instead. */
773 for (;;)
774 {
775 /* If MATCH_FIRST is out of the valid range, reconstruct the
776 buffers. */
777 unsigned int offset = match_first - mctx.input.raw_mbs_idx;
778 if (BE (offset >= (unsigned int) mctx.input.valid_raw_len, 0))
779 {
780 err = re_string_reconstruct (&mctx.input, match_first,
781 eflags);
782 if (BE (err != REG_NOERROR, 0))
783 goto free_return;
784
785 offset = match_first - mctx.input.raw_mbs_idx;
786 }
787 /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
788 Note that MATCH_FIRST must not be smaller than 0. */
789 ch = (match_first >= length
790 ? 0 : re_string_byte_at (&mctx.input, offset));
791 if (fastmap[ch])
792 break;
793 match_first += incr;
794 if (match_first < left_lim || match_first > right_lim)
795 {
796 err = REG_NOMATCH;
797 goto free_return;
798 }
799 }
800 break;
801 }
802
803 /* Reconstruct the buffers so that the matcher can assume that
804 the matching starts from the beginning of the buffer. */
805 err = re_string_reconstruct (&mctx.input, match_first, eflags);
806 if (BE (err != REG_NOERROR, 0))
807 goto free_return;
808
809#ifdef RE_ENABLE_I18N
810 /* Don't consider this char as a possible match start if it part,
811 yet isn't the head, of a multibyte character. */
812 if (!sb && !re_string_first_byte (&mctx.input, 0))
813 continue;
814#endif
815
816 /* It seems to be appropriate one, then use the matcher. */
817 /* We assume that the matching starts from 0. */
818 mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
819 match_last = check_matching (&mctx, fl_longest_match,
820 range >= 0 ? &match_first : NULL);
821 if (match_last != -1)
822 {
823 if (BE (match_last == -2, 0))
824 {
825 err = REG_ESPACE;
826 goto free_return;
827 }
828 else
829 {
830 mctx.match_last = match_last;
831 if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
832 {
833 re_dfastate_t *pstate = mctx.state_log[match_last];
834 mctx.last_node = check_halt_state_context (&mctx, pstate,
835 match_last);
836 }
837 if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
838 || dfa->nbackref)
839 {
840 err = prune_impossible_nodes (&mctx);
841 if (err == REG_NOERROR)
842 break;
843 if (BE (err != REG_NOMATCH, 0))
844 goto free_return;
845 match_last = -1;
846 }
847 else
848 break; /* We found a match. */
849 }
850 }
851
852 match_ctx_clean (&mctx);
853 }
854
855#ifdef DEBUG
856 assert (match_last != -1);
857 assert (err == REG_NOERROR);
858#endif
859
860 /* Set pmatch[] if we need. */
861 if (nmatch > 0)
862 {
863 int reg_idx;
864
865 /* Initialize registers. */
866 for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
867 pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
868
869 /* Set the points where matching start/end. */
870 pmatch[0].rm_so = 0;
871 pmatch[0].rm_eo = mctx.match_last;
872
873 if (!preg->no_sub && nmatch > 1)
874 {
875 err = set_regs (preg, &mctx, nmatch, pmatch,
876 dfa->has_plural_match && dfa->nbackref > 0);
877 if (BE (err != REG_NOERROR, 0))
878 goto free_return;
879 }
880
881 /* At last, add the offset to the each registers, since we slided
882 the buffers so that we could assume that the matching starts
883 from 0. */
884 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
885 if (pmatch[reg_idx].rm_so != -1)
886 {
887#ifdef RE_ENABLE_I18N
888 if (BE (mctx.input.offsets_needed != 0, 0))
889 {
890 pmatch[reg_idx].rm_so =
891 (pmatch[reg_idx].rm_so == mctx.input.valid_len
892 ? mctx.input.valid_raw_len
893 : mctx.input.offsets[pmatch[reg_idx].rm_so]);
894 pmatch[reg_idx].rm_eo =
895 (pmatch[reg_idx].rm_eo == mctx.input.valid_len
896 ? mctx.input.valid_raw_len
897 : mctx.input.offsets[pmatch[reg_idx].rm_eo]);
898 }
899#else
900 assert (mctx.input.offsets_needed == 0);
901#endif
902 pmatch[reg_idx].rm_so += match_first;
903 pmatch[reg_idx].rm_eo += match_first;
904 }
905 for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
906 {
907 pmatch[nmatch + reg_idx].rm_so = -1;
908 pmatch[nmatch + reg_idx].rm_eo = -1;
909 }
910
911 if (dfa->subexp_map)
912 for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
913 if (dfa->subexp_map[reg_idx] != reg_idx)
914 {
915 pmatch[reg_idx + 1].rm_so
916 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
917 pmatch[reg_idx + 1].rm_eo
918 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
919 }
920 }
921
922 free_return:
923 re_free (mctx.state_log);
924 if (dfa->nbackref)
925 match_ctx_free (&mctx);
926 re_string_destruct (&mctx.input);
927 return err;
928}
929
930static reg_errcode_t
931internal_function __attribute_warn_unused_result__
932prune_impossible_nodes (re_match_context_t *mctx)
933{
934 const re_dfa_t *const dfa = mctx->dfa;
935 int halt_node, match_last;
936 reg_errcode_t ret;
937 re_dfastate_t **sifted_states;
938 re_dfastate_t **lim_states = NULL;
939 re_sift_context_t sctx;
940#ifdef DEBUG
941 assert (mctx->state_log != NULL);
942#endif
943 match_last = mctx->match_last;
944 halt_node = mctx->last_node;
945
946 /* Avoid overflow. */
947 if (BE (SIZE_MAX / sizeof (re_dfastate_t *) <= match_last, 0))
948 return REG_ESPACE;
949
950 sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
951 if (BE (sifted_states == NULL, 0))
952 {
953 ret = REG_ESPACE;
954 goto free_return;
955 }
956 if (dfa->nbackref)
957 {
958 lim_states = re_malloc (re_dfastate_t *, match_last + 1);
959 if (BE (lim_states == NULL, 0))
960 {
961 ret = REG_ESPACE;
962 goto free_return;
963 }
964 while (1)
965 {
966 memset (lim_states, '\0',
967 sizeof (re_dfastate_t *) * (match_last + 1));
968 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
969 match_last);
970 ret = sift_states_backward (mctx, &sctx);
971 re_node_set_free (&sctx.limits);
972 if (BE (ret != REG_NOERROR, 0))
973 goto free_return;
974 if (sifted_states[0] != NULL || lim_states[0] != NULL)
975 break;
976 do
977 {
978 --match_last;
979 if (match_last < 0)
980 {
981 ret = REG_NOMATCH;
982 goto free_return;
983 }
984 } while (mctx->state_log[match_last] == NULL
985 || !mctx->state_log[match_last]->halt);
986 halt_node = check_halt_state_context (mctx,
987 mctx->state_log[match_last],
988 match_last);
989 }
990 ret = merge_state_array (dfa, sifted_states, lim_states,
991 match_last + 1);
992 re_free (lim_states);
993 lim_states = NULL;
994 if (BE (ret != REG_NOERROR, 0))
995 goto free_return;
996 }
997 else
998 {
999 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
1000 ret = sift_states_backward (mctx, &sctx);
1001 re_node_set_free (&sctx.limits);
1002 if (BE (ret != REG_NOERROR, 0))
1003 goto free_return;
1004 if (sifted_states[0] == NULL)
1005 {
1006 ret = REG_NOMATCH;
1007 goto free_return;
1008 }
1009 }
1010 re_free (mctx->state_log);
1011 mctx->state_log = sifted_states;
1012 sifted_states = NULL;
1013 mctx->last_node = halt_node;
1014 mctx->match_last = match_last;
1015 ret = REG_NOERROR;
1016 free_return:
1017 re_free (sifted_states);
1018 re_free (lim_states);
1019 return ret;
1020}
1021
1022/* Acquire an initial state and return it.
1023 We must select appropriate initial state depending on the context,
1024 since initial states may have constraints like "\<", "^", etc.. */
1025
1026static inline re_dfastate_t *
1027__attribute ((always_inline)) internal_function
1028acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx,
1029 int idx)
1030{
1031 const re_dfa_t *const dfa = mctx->dfa;
1032 if (dfa->init_state->has_constraint)
1033 {
1034 unsigned int context;
1035 context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
1036 if (IS_WORD_CONTEXT (context))
1037 return dfa->init_state_word;
1038 else if (IS_ORDINARY_CONTEXT (context))
1039 return dfa->init_state;
1040 else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
1041 return dfa->init_state_begbuf;
1042 else if (IS_NEWLINE_CONTEXT (context))
1043 return dfa->init_state_nl;
1044 else if (IS_BEGBUF_CONTEXT (context))
1045 {
1046 /* It is relatively rare case, then calculate on demand. */
1047 return re_acquire_state_context (err, dfa,
1048 dfa->init_state->entrance_nodes,
1049 context);
1050 }
1051 else
1052 /* Must not happen? */
1053 return dfa->init_state;
1054 }
1055 else
1056 return dfa->init_state;
1057}
1058
1059/* Check whether the regular expression match input string INPUT or not,
1060 and return the index where the matching end, return -1 if not match,
1061 or return -2 in case of an error.
1062 FL_LONGEST_MATCH means we want the POSIX longest matching.
1063 If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1064 next place where we may want to try matching.
1065 Note that the matcher assume that the maching starts from the current
1066 index of the buffer. */
1067
1068static int
1069internal_function __attribute_warn_unused_result__
1070check_matching (re_match_context_t *mctx, int fl_longest_match,
1071 int *p_match_first)
1072{
1073 const re_dfa_t *const dfa = mctx->dfa;
1074 reg_errcode_t err;
1075 int match = 0;
1076 int match_last = -1;
1077 int cur_str_idx = re_string_cur_idx (&mctx->input);
1078 re_dfastate_t *cur_state;
1079 int at_init_state = p_match_first != NULL;
1080 int next_start_idx = cur_str_idx;
1081
1082 err = REG_NOERROR;
1083 cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1084 /* An initial state must not be NULL (invalid). */
1085 if (BE (cur_state == NULL, 0))
1086 {
1087 assert (err == REG_ESPACE);
1088 return -2;
1089 }
1090
1091 if (mctx->state_log != NULL)
1092 {
1093 mctx->state_log[cur_str_idx] = cur_state;
1094
1095 /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1096 later. E.g. Processing back references. */
1097 if (BE (dfa->nbackref, 0))
1098 {
1099 at_init_state = 0;
1100 err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1101 if (BE (err != REG_NOERROR, 0))
1102 return err;
1103
1104 if (cur_state->has_backref)
1105 {
1106 err = transit_state_bkref (mctx, &cur_state->nodes);
1107 if (BE (err != REG_NOERROR, 0))
1108 return err;
1109 }
1110 }
1111 }
1112
1113 /* If the RE accepts NULL string. */
1114 if (BE (cur_state->halt, 0))
1115 {
1116 if (!cur_state->has_constraint
1117 || check_halt_state_context (mctx, cur_state, cur_str_idx))
1118 {
1119 if (!fl_longest_match)
1120 return cur_str_idx;
1121 else
1122 {
1123 match_last = cur_str_idx;
1124 match = 1;
1125 }
1126 }
1127 }
1128
1129 while (!re_string_eoi (&mctx->input))
1130 {
1131 re_dfastate_t *old_state = cur_state;
1132 int next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1133
1134 if ((BE (next_char_idx >= mctx->input.bufs_len, 0)
1135 && mctx->input.bufs_len < mctx->input.len)
1136 || (BE (next_char_idx >= mctx->input.valid_len, 0)
1137 && mctx->input.valid_len < mctx->input.len))
1138 {
1139 err = extend_buffers (mctx, next_char_idx + 1);
1140 if (BE (err != REG_NOERROR, 0))
1141 {
1142 assert (err == REG_ESPACE);
1143 return -2;
1144 }
1145 }
1146
1147 cur_state = transit_state (&err, mctx, cur_state);
1148 if (mctx->state_log != NULL)
1149 cur_state = merge_state_with_log (&err, mctx, cur_state);
1150
1151 if (cur_state == NULL)
1152 {
1153 /* Reached the invalid state or an error. Try to recover a valid
1154 state using the state log, if available and if we have not
1155 already found a valid (even if not the longest) match. */
1156 if (BE (err != REG_NOERROR, 0))
1157 return -2;
1158
1159 if (mctx->state_log == NULL
1160 || (match && !fl_longest_match)
1161 || (cur_state = find_recover_state (&err, mctx)) == NULL)
1162 break;
1163 }
1164
1165 if (BE (at_init_state, 0))
1166 {
1167 if (old_state == cur_state)
1168 next_start_idx = next_char_idx;
1169 else
1170 at_init_state = 0;
1171 }
1172
1173 if (cur_state->halt)
1174 {
1175 /* Reached a halt state.
1176 Check the halt state can satisfy the current context. */
1177 if (!cur_state->has_constraint
1178 || check_halt_state_context (mctx, cur_state,
1179 re_string_cur_idx (&mctx->input)))
1180 {
1181 /* We found an appropriate halt state. */
1182 match_last = re_string_cur_idx (&mctx->input);
1183 match = 1;
1184
1185 /* We found a match, do not modify match_first below. */
1186 p_match_first = NULL;
1187 if (!fl_longest_match)
1188 break;
1189 }
1190 }
1191 }
1192
1193 if (p_match_first)
1194 *p_match_first += next_start_idx;
1195
1196 return match_last;
1197}
1198
1199/* Check NODE match the current context. */
1200
1201static int
1202internal_function
1203check_halt_node_context (const re_dfa_t *dfa, int node, unsigned int context)
1204{
1205 re_token_type_t type = dfa->nodes[node].type;
1206 unsigned int constraint = dfa->nodes[node].constraint;
1207 if (type != END_OF_RE)
1208 return 0;
1209 if (!constraint)
1210 return 1;
1211 if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1212 return 0;
1213 return 1;
1214}
1215
1216/* Check the halt state STATE match the current context.
1217 Return 0 if not match, if the node, STATE has, is a halt node and
1218 match the context, return the node. */
1219
1220static int
1221internal_function
1222check_halt_state_context (const re_match_context_t *mctx,
1223 const re_dfastate_t *state, int idx)
1224{
1225 int i;
1226 unsigned int context;
1227#ifdef DEBUG
1228 assert (state->halt);
1229#endif
1230 context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1231 for (i = 0; i < state->nodes.nelem; ++i)
1232 if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1233 return state->nodes.elems[i];
1234 return 0;
1235}
1236
1237/* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1238 corresponding to the DFA).
1239 Return the destination node, and update EPS_VIA_NODES, return -1 in case
1240 of errors. */
1241
1242static int
1243internal_function
1244proceed_next_node (const re_match_context_t *mctx, int nregs, regmatch_t *regs,
1245 int *pidx, int node, re_node_set *eps_via_nodes,
1246 struct re_fail_stack_t *fs)
1247{
1248 const re_dfa_t *const dfa = mctx->dfa;
1249 int i, err;
1250 if (IS_EPSILON_NODE (dfa->nodes[node].type))
1251 {
1252 re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1253 re_node_set *edests = &dfa->edests[node];
1254 int dest_node;
1255 err = re_node_set_insert (eps_via_nodes, node);
1256 if (BE (err < 0, 0))
1257 return -2;
1258 /* Pick up a valid destination, or return -1 if none is found. */
1259 for (dest_node = -1, i = 0; i < edests->nelem; ++i)
1260 {
1261 int candidate = edests->elems[i];
1262 if (!re_node_set_contains (cur_nodes, candidate))
1263 continue;
1264 if (dest_node == -1)
1265 dest_node = candidate;
1266
1267 else
1268 {
1269 /* In order to avoid infinite loop like "(a*)*", return the second
1270 epsilon-transition if the first was already considered. */
1271 if (re_node_set_contains (eps_via_nodes, dest_node))
1272 return candidate;
1273
1274 /* Otherwise, push the second epsilon-transition on the fail stack. */
1275 else if (fs != NULL
1276 && push_fail_stack (fs, *pidx, candidate, nregs, regs,
1277 eps_via_nodes))
1278 return -2;
1279
1280 /* We know we are going to exit. */
1281 break;
1282 }
1283 }
1284 return dest_node;
1285 }
1286 else
1287 {
1288 int naccepted = 0;
1289 re_token_type_t type = dfa->nodes[node].type;
1290
1291#ifdef RE_ENABLE_I18N
1292 if (dfa->nodes[node].accept_mb)
1293 naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1294 else
1295#endif /* RE_ENABLE_I18N */
1296 if (type == OP_BACK_REF)
1297 {
1298 int subexp_idx = dfa->nodes[node].opr.idx + 1;
1299 naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1300 if (fs != NULL)
1301 {
1302 if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1303 return -1;
1304 else if (naccepted)
1305 {
1306 char *buf = (char *) re_string_get_buffer (&mctx->input);
1307 if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1308 naccepted) != 0)
1309 return -1;
1310 }
1311 }
1312
1313 if (naccepted == 0)
1314 {
1315 int dest_node;
1316 err = re_node_set_insert (eps_via_nodes, node);
1317 if (BE (err < 0, 0))
1318 return -2;
1319 dest_node = dfa->edests[node].elems[0];
1320 if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1321 dest_node))
1322 return dest_node;
1323 }
1324 }
1325
1326 if (naccepted != 0
1327 || check_node_accept (mctx, dfa->nodes + node, *pidx))
1328 {
1329 int dest_node = dfa->nexts[node];
1330 *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1331 if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1332 || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1333 dest_node)))
1334 return -1;
1335 re_node_set_empty (eps_via_nodes);
1336 return dest_node;
1337 }
1338 }
1339 return -1;
1340}
1341
1342static reg_errcode_t
1343internal_function __attribute_warn_unused_result__
1344push_fail_stack (struct re_fail_stack_t *fs, int str_idx, int dest_node,
1345 int nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
1346{
1347 reg_errcode_t err;
1348 int num = fs->num++;
1349 if (fs->num == fs->alloc)
1350 {
1351 struct re_fail_stack_ent_t *new_array;
1352 new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t)
1353 * fs->alloc * 2));
1354 if (new_array == NULL)
1355 return REG_ESPACE;
1356 fs->alloc *= 2;
1357 fs->stack = new_array;
1358 }
1359 fs->stack[num].idx = str_idx;
1360 fs->stack[num].node = dest_node;
1361 fs->stack[num].regs = re_malloc (regmatch_t, nregs);
1362 if (fs->stack[num].regs == NULL)
1363 return REG_ESPACE;
1364 memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1365 err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1366 return err;
1367}
1368
1369static int
1370internal_function
1371pop_fail_stack (struct re_fail_stack_t *fs, int *pidx, int nregs,
1372 regmatch_t *regs, re_node_set *eps_via_nodes)
1373{
1374 int num = --fs->num;
1375 assert (num >= 0);
1376 *pidx = fs->stack[num].idx;
1377 memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1378 re_node_set_free (eps_via_nodes);
1379 re_free (fs->stack[num].regs);
1380 *eps_via_nodes = fs->stack[num].eps_via_nodes;
1381 return fs->stack[num].node;
1382}
1383
1384/* Set the positions where the subexpressions are starts/ends to registers
1385 PMATCH.
1386 Note: We assume that pmatch[0] is already set, and
1387 pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch. */
1388
1389static reg_errcode_t
1390internal_function __attribute_warn_unused_result__
1391set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch,
1392 regmatch_t *pmatch, int fl_backtrack)
1393{
1394 const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
1395 int idx, cur_node;
1396 re_node_set eps_via_nodes;
1397 struct re_fail_stack_t *fs;
1398 struct re_fail_stack_t fs_body = { 0, 2, NULL };
1399 regmatch_t *prev_idx_match;
1400 int prev_idx_match_malloced = 0;
1401
1402#ifdef DEBUG
1403 assert (nmatch > 1);
1404 assert (mctx->state_log != NULL);
1405#endif
1406 if (fl_backtrack)
1407 {
1408 fs = &fs_body;
1409 fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1410 if (fs->stack == NULL)
1411 return REG_ESPACE;
1412 }
1413 else
1414 fs = NULL;
1415
1416 cur_node = dfa->init_node;
1417 re_node_set_init_empty (&eps_via_nodes);
1418
1419 if (__libc_use_alloca (nmatch * sizeof (regmatch_t)))
1420 prev_idx_match = (regmatch_t *) alloca (nmatch * sizeof (regmatch_t));
1421 else
1422 {
1423 prev_idx_match = re_malloc (regmatch_t, nmatch);
1424 if (prev_idx_match == NULL)
1425 {
1426 free_fail_stack_return (fs);
1427 return REG_ESPACE;
1428 }
1429 prev_idx_match_malloced = 1;
1430 }
1431 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1432
1433 for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1434 {
1435 update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
1436
1437 if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1438 {
1439 int reg_idx;
1440 if (fs)
1441 {
1442 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1443 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1444 break;
1445 if (reg_idx == nmatch)
1446 {
1447 re_node_set_free (&eps_via_nodes);
1448 if (prev_idx_match_malloced)
1449 re_free (prev_idx_match);
1450 return free_fail_stack_return (fs);
1451 }
1452 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1453 &eps_via_nodes);
1454 }
1455 else
1456 {
1457 re_node_set_free (&eps_via_nodes);
1458 if (prev_idx_match_malloced)
1459 re_free (prev_idx_match);
1460 return REG_NOERROR;
1461 }
1462 }
1463
1464 /* Proceed to next node. */
1465 cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
1466 &eps_via_nodes, fs);
1467
1468 if (BE (cur_node < 0, 0))
1469 {
1470 if (BE (cur_node == -2, 0))
1471 {
1472 re_node_set_free (&eps_via_nodes);
1473 if (prev_idx_match_malloced)
1474 re_free (prev_idx_match);
1475 free_fail_stack_return (fs);
1476 return REG_ESPACE;
1477 }
1478 if (fs)
1479 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1480 &eps_via_nodes);
1481 else
1482 {
1483 re_node_set_free (&eps_via_nodes);
1484 if (prev_idx_match_malloced)
1485 re_free (prev_idx_match);
1486 return REG_NOMATCH;
1487 }
1488 }
1489 }
1490 re_node_set_free (&eps_via_nodes);
1491 if (prev_idx_match_malloced)
1492 re_free (prev_idx_match);
1493 return free_fail_stack_return (fs);
1494}
1495
1496static reg_errcode_t
1497internal_function
1498free_fail_stack_return (struct re_fail_stack_t *fs)
1499{
1500 if (fs)
1501 {
1502 int fs_idx;
1503 for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1504 {
1505 re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1506 re_free (fs->stack[fs_idx].regs);
1507 }
1508 re_free (fs->stack);
1509 }
1510 return REG_NOERROR;
1511}
1512
1513static void
1514internal_function
1515update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
1516 regmatch_t *prev_idx_match, int cur_node, int cur_idx, int nmatch)
1517{
1518 int type = dfa->nodes[cur_node].type;
1519 if (type == OP_OPEN_SUBEXP)
1520 {
1521 int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1522
1523 /* We are at the first node of this sub expression. */
1524 if (reg_num < nmatch)
1525 {
1526 pmatch[reg_num].rm_so = cur_idx;
1527 pmatch[reg_num].rm_eo = -1;
1528 }
1529 }
1530 else if (type == OP_CLOSE_SUBEXP)
1531 {
1532 int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1533 if (reg_num < nmatch)
1534 {
1535 /* We are at the last node of this sub expression. */
1536 if (pmatch[reg_num].rm_so < cur_idx)
1537 {
1538 pmatch[reg_num].rm_eo = cur_idx;
1539 /* This is a non-empty match or we are not inside an optional
1540 subexpression. Accept this right away. */
1541 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1542 }
1543 else
1544 {
1545 if (dfa->nodes[cur_node].opt_subexp
1546 && prev_idx_match[reg_num].rm_so != -1)
1547 /* We transited through an empty match for an optional
1548 subexpression, like (a?)*, and this is not the subexp's
1549 first match. Copy back the old content of the registers
1550 so that matches of an inner subexpression are undone as
1551 well, like in ((a?))*. */
1552 memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1553 else
1554 /* We completed a subexpression, but it may be part of
1555 an optional one, so do not update PREV_IDX_MATCH. */
1556 pmatch[reg_num].rm_eo = cur_idx;
1557 }
1558 }
1559 }
1560}
1561
1562/* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1563 and sift the nodes in each states according to the following rules.
1564 Updated state_log will be wrote to STATE_LOG.
1565
1566 Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1567 1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1568 If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1569 the LAST_NODE, we throw away the node `a'.
1570 2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1571 string `s' and transit to `b':
1572 i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1573 away the node `a'.
1574 ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1575 thrown away, we throw away the node `a'.
1576 3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1577 i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1578 node `a'.
1579 ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1580 we throw away the node `a'. */
1581
1582#define STATE_NODE_CONTAINS(state,node) \
1583 ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1584
1585static reg_errcode_t
1586internal_function
1587sift_states_backward (const re_match_context_t *mctx, re_sift_context_t *sctx)
1588{
1589 reg_errcode_t err;
1590 int null_cnt = 0;
1591 int str_idx = sctx->last_str_idx;
1592 re_node_set cur_dest;
1593
1594#ifdef DEBUG
1595 assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1596#endif
1597
1598 /* Build sifted state_log[str_idx]. It has the nodes which can epsilon
1599 transit to the last_node and the last_node itself. */
1600 err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1601 if (BE (err != REG_NOERROR, 0))
1602 return err;
1603 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1604 if (BE (err != REG_NOERROR, 0))
1605 goto free_return;
1606
1607 /* Then check each states in the state_log. */
1608 while (str_idx > 0)
1609 {
1610 /* Update counters. */
1611 null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1612 if (null_cnt > mctx->max_mb_elem_len)
1613 {
1614 memset (sctx->sifted_states, '\0',
1615 sizeof (re_dfastate_t *) * str_idx);
1616 re_node_set_free (&cur_dest);
1617 return REG_NOERROR;
1618 }
1619 re_node_set_empty (&cur_dest);
1620 --str_idx;
1621
1622 if (mctx->state_log[str_idx])
1623 {
1624 err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1625 if (BE (err != REG_NOERROR, 0))
1626 goto free_return;
1627 }
1628
1629 /* Add all the nodes which satisfy the following conditions:
1630 - It can epsilon transit to a node in CUR_DEST.
1631 - It is in CUR_SRC.
1632 And update state_log. */
1633 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1634 if (BE (err != REG_NOERROR, 0))
1635 goto free_return;
1636 }
1637 err = REG_NOERROR;
1638 free_return:
1639 re_node_set_free (&cur_dest);
1640 return err;
1641}
1642
1643static reg_errcode_t
1644internal_function __attribute_warn_unused_result__
1645build_sifted_states (const re_match_context_t *mctx, re_sift_context_t *sctx,
1646 int str_idx, re_node_set *cur_dest)
1647{
1648 const re_dfa_t *const dfa = mctx->dfa;
1649 const re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1650 int i;
1651
1652 /* Then build the next sifted state.
1653 We build the next sifted state on `cur_dest', and update
1654 `sifted_states[str_idx]' with `cur_dest'.
1655 Note:
1656 `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1657 `cur_src' points the node_set of the old `state_log[str_idx]'
1658 (with the epsilon nodes pre-filtered out). */
1659 for (i = 0; i < cur_src->nelem; i++)
1660 {
1661 int prev_node = cur_src->elems[i];
1662 int naccepted = 0;
1663 int ret;
1664
1665#ifdef DEBUG
1666 re_token_type_t type = dfa->nodes[prev_node].type;
1667 assert (!IS_EPSILON_NODE (type));
1668#endif
1669#ifdef RE_ENABLE_I18N
1670 /* If the node may accept `multi byte'. */
1671 if (dfa->nodes[prev_node].accept_mb)
1672 naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1673 str_idx, sctx->last_str_idx);
1674#endif /* RE_ENABLE_I18N */
1675
1676 /* We don't check backreferences here.
1677 See update_cur_sifted_state(). */
1678 if (!naccepted
1679 && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1680 && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1681 dfa->nexts[prev_node]))
1682 naccepted = 1;
1683
1684 if (naccepted == 0)
1685 continue;
1686
1687 if (sctx->limits.nelem)
1688 {
1689 int to_idx = str_idx + naccepted;
1690 if (check_dst_limits (mctx, &sctx->limits,
1691 dfa->nexts[prev_node], to_idx,
1692 prev_node, str_idx))
1693 continue;
1694 }
1695 ret = re_node_set_insert (cur_dest, prev_node);
1696 if (BE (ret == -1, 0))
1697 return REG_ESPACE;
1698 }
1699
1700 return REG_NOERROR;
1701}
1702
1703/* Helper functions. */
1704
1705static reg_errcode_t
1706internal_function
1707clean_state_log_if_needed (re_match_context_t *mctx, int next_state_log_idx)
1708{
1709 int top = mctx->state_log_top;
1710
1711 if ((next_state_log_idx >= mctx->input.bufs_len
1712 && mctx->input.bufs_len < mctx->input.len)
1713 || (next_state_log_idx >= mctx->input.valid_len
1714 && mctx->input.valid_len < mctx->input.len))
1715 {
1716 reg_errcode_t err;
1717 err = extend_buffers (mctx, next_state_log_idx + 1);
1718 if (BE (err != REG_NOERROR, 0))
1719 return err;
1720 }
1721
1722 if (top < next_state_log_idx)
1723 {
1724 memset (mctx->state_log + top + 1, '\0',
1725 sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1726 mctx->state_log_top = next_state_log_idx;
1727 }
1728 return REG_NOERROR;
1729}
1730
1731static reg_errcode_t
1732internal_function
1733merge_state_array (const re_dfa_t *dfa, re_dfastate_t **dst,
1734 re_dfastate_t **src, int num)
1735{
1736 int st_idx;
1737 reg_errcode_t err;
1738 for (st_idx = 0; st_idx < num; ++st_idx)
1739 {
1740 if (dst[st_idx] == NULL)
1741 dst[st_idx] = src[st_idx];
1742 else if (src[st_idx] != NULL)
1743 {
1744 re_node_set merged_set;
1745 err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1746 &src[st_idx]->nodes);
1747 if (BE (err != REG_NOERROR, 0))
1748 return err;
1749 dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1750 re_node_set_free (&merged_set);
1751 if (BE (err != REG_NOERROR, 0))
1752 return err;
1753 }
1754 }
1755 return REG_NOERROR;
1756}
1757
1758static reg_errcode_t
1759internal_function
1760update_cur_sifted_state (const re_match_context_t *mctx,
1761 re_sift_context_t *sctx, int str_idx,
1762 re_node_set *dest_nodes)
1763{
1764 const re_dfa_t *const dfa = mctx->dfa;
1765 reg_errcode_t err = REG_NOERROR;
1766 const re_node_set *candidates;
1767 candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1768 : &mctx->state_log[str_idx]->nodes);
1769
1770 if (dest_nodes->nelem == 0)
1771 sctx->sifted_states[str_idx] = NULL;
1772 else
1773 {
1774 if (candidates)
1775 {
1776 /* At first, add the nodes which can epsilon transit to a node in
1777 DEST_NODE. */
1778 err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1779 if (BE (err != REG_NOERROR, 0))
1780 return err;
1781
1782 /* Then, check the limitations in the current sift_context. */
1783 if (sctx->limits.nelem)
1784 {
1785 err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1786 mctx->bkref_ents, str_idx);
1787 if (BE (err != REG_NOERROR, 0))
1788 return err;
1789 }
1790 }
1791
1792 sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1793 if (BE (err != REG_NOERROR, 0))
1794 return err;
1795 }
1796
1797 if (candidates && mctx->state_log[str_idx]->has_backref)
1798 {
1799 err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1800 if (BE (err != REG_NOERROR, 0))
1801 return err;
1802 }
1803 return REG_NOERROR;
1804}
1805
1806static reg_errcode_t
1807internal_function __attribute_warn_unused_result__
1808add_epsilon_src_nodes (const re_dfa_t *dfa, re_node_set *dest_nodes,
1809 const re_node_set *candidates)
1810{
1811 reg_errcode_t err = REG_NOERROR;
1812 int i;
1813
1814 re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1815 if (BE (err != REG_NOERROR, 0))
1816 return err;
1817
1818 if (!state->inveclosure.alloc)
1819 {
1820 err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1821 if (BE (err != REG_NOERROR, 0))
1822 return REG_ESPACE;
1823 for (i = 0; i < dest_nodes->nelem; i++)
1824 {
1825 err = re_node_set_merge (&state->inveclosure,
1826 dfa->inveclosures + dest_nodes->elems[i]);
1827 if (BE (err != REG_NOERROR, 0))
1828 return REG_ESPACE;
1829 }
1830 }
1831 return re_node_set_add_intersect (dest_nodes, candidates,
1832 &state->inveclosure);
1833}
1834
1835static reg_errcode_t
1836internal_function
1837sub_epsilon_src_nodes (const re_dfa_t *dfa, int node, re_node_set *dest_nodes,
1838 const re_node_set *candidates)
1839{
1840 int ecl_idx;
1841 reg_errcode_t err;
1842 re_node_set *inv_eclosure = dfa->inveclosures + node;
1843 re_node_set except_nodes;
1844 re_node_set_init_empty (&except_nodes);
1845 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1846 {
1847 int cur_node = inv_eclosure->elems[ecl_idx];
1848 if (cur_node == node)
1849 continue;
1850 if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1851 {
1852 int edst1 = dfa->edests[cur_node].elems[0];
1853 int edst2 = ((dfa->edests[cur_node].nelem > 1)
1854 ? dfa->edests[cur_node].elems[1] : -1);
1855 if ((!re_node_set_contains (inv_eclosure, edst1)
1856 && re_node_set_contains (dest_nodes, edst1))
1857 || (edst2 > 0
1858 && !re_node_set_contains (inv_eclosure, edst2)
1859 && re_node_set_contains (dest_nodes, edst2)))
1860 {
1861 err = re_node_set_add_intersect (&except_nodes, candidates,
1862 dfa->inveclosures + cur_node);
1863 if (BE (err != REG_NOERROR, 0))
1864 {
1865 re_node_set_free (&except_nodes);
1866 return err;
1867 }
1868 }
1869 }
1870 }
1871 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1872 {
1873 int cur_node = inv_eclosure->elems[ecl_idx];
1874 if (!re_node_set_contains (&except_nodes, cur_node))
1875 {
1876 int idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1877 re_node_set_remove_at (dest_nodes, idx);
1878 }
1879 }
1880 re_node_set_free (&except_nodes);
1881 return REG_NOERROR;
1882}
1883
1884static int
1885internal_function
1886check_dst_limits (const re_match_context_t *mctx, re_node_set *limits,
1887 int dst_node, int dst_idx, int src_node, int src_idx)
1888{
1889 const re_dfa_t *const dfa = mctx->dfa;
1890 int lim_idx, src_pos, dst_pos;
1891
1892 int dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1893 int src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1894 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1895 {
1896 int subexp_idx;
1897 struct re_backref_cache_entry *ent;
1898 ent = mctx->bkref_ents + limits->elems[lim_idx];
1899 subexp_idx = dfa->nodes[ent->node].opr.idx;
1900
1901 dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1902 subexp_idx, dst_node, dst_idx,
1903 dst_bkref_idx);
1904 src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1905 subexp_idx, src_node, src_idx,
1906 src_bkref_idx);
1907
1908 /* In case of:
1909 <src> <dst> ( <subexp> )
1910 ( <subexp> ) <src> <dst>
1911 ( <subexp1> <src> <subexp2> <dst> <subexp3> ) */
1912 if (src_pos == dst_pos)
1913 continue; /* This is unrelated limitation. */
1914 else
1915 return 1;
1916 }
1917 return 0;
1918}
1919
1920static int
1921internal_function
1922check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries,
1923 int subexp_idx, int from_node, int bkref_idx)
1924{
1925 const re_dfa_t *const dfa = mctx->dfa;
1926 const re_node_set *eclosures = dfa->eclosures + from_node;
1927 int node_idx;
1928
1929 /* Else, we are on the boundary: examine the nodes on the epsilon
1930 closure. */
1931 for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1932 {
1933 int node = eclosures->elems[node_idx];
1934 switch (dfa->nodes[node].type)
1935 {
1936 case OP_BACK_REF:
1937 if (bkref_idx != -1)
1938 {
1939 struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1940 do
1941 {
1942 int dst, cpos;
1943
1944 if (ent->node != node)
1945 continue;
1946
1947 if (subexp_idx < BITSET_WORD_BITS
1948 && !(ent->eps_reachable_subexps_map
1949 & ((bitset_word_t) 1 << subexp_idx)))
1950 continue;
1951
1952 /* Recurse trying to reach the OP_OPEN_SUBEXP and
1953 OP_CLOSE_SUBEXP cases below. But, if the
1954 destination node is the same node as the source
1955 node, don't recurse because it would cause an
1956 infinite loop: a regex that exhibits this behavior
1957 is ()\1*\1* */
1958 dst = dfa->edests[node].elems[0];
1959 if (dst == from_node)
1960 {
1961 if (boundaries & 1)
1962 return -1;
1963 else /* if (boundaries & 2) */
1964 return 0;
1965 }
1966
1967 cpos =
1968 check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1969 dst, bkref_idx);
1970 if (cpos == -1 /* && (boundaries & 1) */)
1971 return -1;
1972 if (cpos == 0 && (boundaries & 2))
1973 return 0;
1974
1975 if (subexp_idx < BITSET_WORD_BITS)
1976 ent->eps_reachable_subexps_map
1977 &= ~((bitset_word_t) 1 << subexp_idx);
1978 }
1979 while (ent++->more);
1980 }
1981 break;
1982
1983 case OP_OPEN_SUBEXP:
1984 if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
1985 return -1;
1986 break;
1987
1988 case OP_CLOSE_SUBEXP:
1989 if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
1990 return 0;
1991 break;
1992
1993 default:
1994 break;
1995 }
1996 }
1997
1998 return (boundaries & 2) ? 1 : 0;
1999}
2000
2001static int
2002internal_function
2003check_dst_limits_calc_pos (const re_match_context_t *mctx, int limit,
2004 int subexp_idx, int from_node, int str_idx,
2005 int bkref_idx)
2006{
2007 struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
2008 int boundaries;
2009
2010 /* If we are outside the range of the subexpression, return -1 or 1. */
2011 if (str_idx < lim->subexp_from)
2012 return -1;
2013
2014 if (lim->subexp_to < str_idx)
2015 return 1;
2016
2017 /* If we are within the subexpression, return 0. */
2018 boundaries = (str_idx == lim->subexp_from);
2019 boundaries |= (str_idx == lim->subexp_to) << 1;
2020 if (boundaries == 0)
2021 return 0;
2022
2023 /* Else, examine epsilon closure. */
2024 return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
2025 from_node, bkref_idx);
2026}
2027
2028/* Check the limitations of sub expressions LIMITS, and remove the nodes
2029 which are against limitations from DEST_NODES. */
2030
2031static reg_errcode_t
2032internal_function
2033check_subexp_limits (const re_dfa_t *dfa, re_node_set *dest_nodes,
2034 const re_node_set *candidates, re_node_set *limits,
2035 struct re_backref_cache_entry *bkref_ents, int str_idx)
2036{
2037 reg_errcode_t err;
2038 int node_idx, lim_idx;
2039
2040 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
2041 {
2042 int subexp_idx;
2043 struct re_backref_cache_entry *ent;
2044 ent = bkref_ents + limits->elems[lim_idx];
2045
2046 if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
2047 continue; /* This is unrelated limitation. */
2048
2049 subexp_idx = dfa->nodes[ent->node].opr.idx;
2050 if (ent->subexp_to == str_idx)
2051 {
2052 int ops_node = -1;
2053 int cls_node = -1;
2054 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2055 {
2056 int node = dest_nodes->elems[node_idx];
2057 re_token_type_t type = dfa->nodes[node].type;
2058 if (type == OP_OPEN_SUBEXP
2059 && subexp_idx == dfa->nodes[node].opr.idx)
2060 ops_node = node;
2061 else if (type == OP_CLOSE_SUBEXP
2062 && subexp_idx == dfa->nodes[node].opr.idx)
2063 cls_node = node;
2064 }
2065
2066 /* Check the limitation of the open subexpression. */
2067 /* Note that (ent->subexp_to = str_idx != ent->subexp_from). */
2068 if (ops_node >= 0)
2069 {
2070 err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2071 candidates);
2072 if (BE (err != REG_NOERROR, 0))
2073 return err;
2074 }
2075
2076 /* Check the limitation of the close subexpression. */
2077 if (cls_node >= 0)
2078 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2079 {
2080 int node = dest_nodes->elems[node_idx];
2081 if (!re_node_set_contains (dfa->inveclosures + node,
2082 cls_node)
2083 && !re_node_set_contains (dfa->eclosures + node,
2084 cls_node))
2085 {
2086 /* It is against this limitation.
2087 Remove it form the current sifted state. */
2088 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2089 candidates);
2090 if (BE (err != REG_NOERROR, 0))
2091 return err;
2092 --node_idx;
2093 }
2094 }
2095 }
2096 else /* (ent->subexp_to != str_idx) */
2097 {
2098 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2099 {
2100 int node = dest_nodes->elems[node_idx];
2101 re_token_type_t type = dfa->nodes[node].type;
2102 if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2103 {
2104 if (subexp_idx != dfa->nodes[node].opr.idx)
2105 continue;
2106 /* It is against this limitation.
2107 Remove it form the current sifted state. */
2108 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2109 candidates);
2110 if (BE (err != REG_NOERROR, 0))
2111 return err;
2112 }
2113 }
2114 }
2115 }
2116 return REG_NOERROR;
2117}
2118
2119static reg_errcode_t
2120internal_function __attribute_warn_unused_result__
2121sift_states_bkref (const re_match_context_t *mctx, re_sift_context_t *sctx,
2122 int str_idx, const re_node_set *candidates)
2123{
2124 const re_dfa_t *const dfa = mctx->dfa;
2125 reg_errcode_t err;
2126 int node_idx, node;
2127 re_sift_context_t local_sctx;
2128 int first_idx = search_cur_bkref_entry (mctx, str_idx);
2129
2130 if (first_idx == -1)
2131 return REG_NOERROR;
2132
2133 local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized. */
2134
2135 for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2136 {
2137 int enabled_idx;
2138 re_token_type_t type;
2139 struct re_backref_cache_entry *entry;
2140 node = candidates->elems[node_idx];
2141 type = dfa->nodes[node].type;
2142 /* Avoid infinite loop for the REs like "()\1+". */
2143 if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2144 continue;
2145 if (type != OP_BACK_REF)
2146 continue;
2147
2148 entry = mctx->bkref_ents + first_idx;
2149 enabled_idx = first_idx;
2150 do
2151 {
2152 int subexp_len;
2153 int to_idx;
2154 int dst_node;
2155 int ret;
2156 re_dfastate_t *cur_state;
2157
2158 if (entry->node != node)
2159 continue;
2160 subexp_len = entry->subexp_to - entry->subexp_from;
2161 to_idx = str_idx + subexp_len;
2162 dst_node = (subexp_len ? dfa->nexts[node]
2163 : dfa->edests[node].elems[0]);
2164
2165 if (to_idx > sctx->last_str_idx
2166 || sctx->sifted_states[to_idx] == NULL
2167 || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2168 || check_dst_limits (mctx, &sctx->limits, node,
2169 str_idx, dst_node, to_idx))
2170 continue;
2171
2172 if (local_sctx.sifted_states == NULL)
2173 {
2174 local_sctx = *sctx;
2175 err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2176 if (BE (err != REG_NOERROR, 0))
2177 goto free_return;
2178 }
2179 local_sctx.last_node = node;
2180 local_sctx.last_str_idx = str_idx;
2181 ret = re_node_set_insert (&local_sctx.limits, enabled_idx);
2182 if (BE (ret < 0, 0))
2183 {
2184 err = REG_ESPACE;
2185 goto free_return;
2186 }
2187 cur_state = local_sctx.sifted_states[str_idx];
2188 err = sift_states_backward (mctx, &local_sctx);
2189 if (BE (err != REG_NOERROR, 0))
2190 goto free_return;
2191 if (sctx->limited_states != NULL)
2192 {
2193 err = merge_state_array (dfa, sctx->limited_states,
2194 local_sctx.sifted_states,
2195 str_idx + 1);
2196 if (BE (err != REG_NOERROR, 0))
2197 goto free_return;
2198 }
2199 local_sctx.sifted_states[str_idx] = cur_state;
2200 re_node_set_remove (&local_sctx.limits, enabled_idx);
2201
2202 /* mctx->bkref_ents may have changed, reload the pointer. */
2203 entry = mctx->bkref_ents + enabled_idx;
2204 }
2205 while (enabled_idx++, entry++->more);
2206 }
2207 err = REG_NOERROR;
2208 free_return:
2209 if (local_sctx.sifted_states != NULL)
2210 {
2211 re_node_set_free (&local_sctx.limits);
2212 }
2213
2214 return err;
2215}
2216
2217
2218#ifdef RE_ENABLE_I18N
2219static int
2220internal_function
2221sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx,
2222 int node_idx, int str_idx, int max_str_idx)
2223{
2224 const re_dfa_t *const dfa = mctx->dfa;
2225 int naccepted;
2226 /* Check the node can accept `multi byte'. */
2227 naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2228 if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2229 !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2230 dfa->nexts[node_idx]))
2231 /* The node can't accept the `multi byte', or the
2232 destination was already thrown away, then the node
2233 could't accept the current input `multi byte'. */
2234 naccepted = 0;
2235 /* Otherwise, it is sure that the node could accept
2236 `naccepted' bytes input. */
2237 return naccepted;
2238}
2239#endif /* RE_ENABLE_I18N */
2240
2241
2242/* Functions for state transition. */
2243
2244/* Return the next state to which the current state STATE will transit by
2245 accepting the current input byte, and update STATE_LOG if necessary.
2246 If STATE can accept a multibyte char/collating element/back reference
2247 update the destination of STATE_LOG. */
2248
2249static re_dfastate_t *
2250internal_function __attribute_warn_unused_result__
2251transit_state (reg_errcode_t *err, re_match_context_t *mctx,
2252 re_dfastate_t *state)
2253{
2254 re_dfastate_t **trtable;
2255 unsigned char ch;
2256
2257#ifdef RE_ENABLE_I18N
2258 /* If the current state can accept multibyte. */
2259 if (BE (state->accept_mb, 0))
2260 {
2261 *err = transit_state_mb (mctx, state);
2262 if (BE (*err != REG_NOERROR, 0))
2263 return NULL;
2264 }
2265#endif /* RE_ENABLE_I18N */
2266
2267 /* Then decide the next state with the single byte. */
2268#if 0
2269 if (0)
2270 /* don't use transition table */
2271 return transit_state_sb (err, mctx, state);
2272#endif
2273
2274 /* Use transition table */
2275 ch = re_string_fetch_byte (&mctx->input);
2276 for (;;)
2277 {
2278 trtable = state->trtable;
2279 if (BE (trtable != NULL, 1))
2280 return trtable[ch];
2281
2282 trtable = state->word_trtable;
2283 if (BE (trtable != NULL, 1))
2284 {
2285 unsigned int context;
2286 context
2287 = re_string_context_at (&mctx->input,
2288 re_string_cur_idx (&mctx->input) - 1,
2289 mctx->eflags);
2290 if (IS_WORD_CONTEXT (context))
2291 return trtable[ch + SBC_MAX];
2292 else
2293 return trtable[ch];
2294 }
2295
2296 if (!build_trtable (mctx->dfa, state))
2297 {
2298 *err = REG_ESPACE;
2299 return NULL;
2300 }
2301
2302 /* Retry, we now have a transition table. */
2303 }
2304}
2305
2306/* Update the state_log if we need */
2307re_dfastate_t *
2308internal_function
2309merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx,
2310 re_dfastate_t *next_state)
2311{
2312 const re_dfa_t *const dfa = mctx->dfa;
2313 int cur_idx = re_string_cur_idx (&mctx->input);
2314
2315 if (cur_idx > mctx->state_log_top)
2316 {
2317 mctx->state_log[cur_idx] = next_state;
2318 mctx->state_log_top = cur_idx;
2319 }
2320 else if (mctx->state_log[cur_idx] == 0)
2321 {
2322 mctx->state_log[cur_idx] = next_state;
2323 }
2324 else
2325 {
2326 re_dfastate_t *pstate;
2327 unsigned int context;
2328 re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2329 /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2330 the destination of a multibyte char/collating element/
2331 back reference. Then the next state is the union set of
2332 these destinations and the results of the transition table. */
2333 pstate = mctx->state_log[cur_idx];
2334 log_nodes = pstate->entrance_nodes;
2335 if (next_state != NULL)
2336 {
2337 table_nodes = next_state->entrance_nodes;
2338 *err = re_node_set_init_union (&next_nodes, table_nodes,
2339 log_nodes);
2340 if (BE (*err != REG_NOERROR, 0))
2341 return NULL;
2342 }
2343 else
2344 next_nodes = *log_nodes;
2345 /* Note: We already add the nodes of the initial state,
2346 then we don't need to add them here. */
2347
2348 context = re_string_context_at (&mctx->input,
2349 re_string_cur_idx (&mctx->input) - 1,
2350 mctx->eflags);
2351 next_state = mctx->state_log[cur_idx]
2352 = re_acquire_state_context (err, dfa, &next_nodes, context);
2353 /* We don't need to check errors here, since the return value of
2354 this function is next_state and ERR is already set. */
2355
2356 if (table_nodes != NULL)
2357 re_node_set_free (&next_nodes);
2358 }
2359
2360 if (BE (dfa->nbackref, 0) && next_state != NULL)
2361 {
2362 /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2363 later. We must check them here, since the back references in the
2364 next state might use them. */
2365 *err = check_subexp_matching_top (mctx, &next_state->nodes,
2366 cur_idx);
2367 if (BE (*err != REG_NOERROR, 0))
2368 return NULL;
2369
2370 /* If the next state has back references. */
2371 if (next_state->has_backref)
2372 {
2373 *err = transit_state_bkref (mctx, &next_state->nodes);
2374 if (BE (*err != REG_NOERROR, 0))
2375 return NULL;
2376 next_state = mctx->state_log[cur_idx];
2377 }
2378 }
2379
2380 return next_state;
2381}
2382
2383/* Skip bytes in the input that correspond to part of a
2384 multi-byte match, then look in the log for a state
2385 from which to restart matching. */
2386re_dfastate_t *
2387internal_function
2388find_recover_state (reg_errcode_t *err, re_match_context_t *mctx)
2389{
2390 re_dfastate_t *cur_state;
2391 do
2392 {
2393 int max = mctx->state_log_top;
2394 int cur_str_idx = re_string_cur_idx (&mctx->input);
2395
2396 do
2397 {
2398 if (++cur_str_idx > max)
2399 return NULL;
2400 re_string_skip_bytes (&mctx->input, 1);
2401 }
2402 while (mctx->state_log[cur_str_idx] == NULL);
2403
2404 cur_state = merge_state_with_log (err, mctx, NULL);
2405 }
2406 while (*err == REG_NOERROR && cur_state == NULL);
2407 return cur_state;
2408}
2409
2410/* Helper functions for transit_state. */
2411
2412/* From the node set CUR_NODES, pick up the nodes whose types are
2413 OP_OPEN_SUBEXP and which have corresponding back references in the regular
2414 expression. And register them to use them later for evaluating the
2415 correspoding back references. */
2416
2417static reg_errcode_t
2418internal_function
2419check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes,
2420 int str_idx)
2421{
2422 const re_dfa_t *const dfa = mctx->dfa;
2423 int node_idx;
2424 reg_errcode_t err;
2425
2426 /* TODO: This isn't efficient.
2427 Because there might be more than one nodes whose types are
2428 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2429 nodes.
2430 E.g. RE: (a){2} */
2431 for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2432 {
2433 int node = cur_nodes->elems[node_idx];
2434 if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2435 && dfa->nodes[node].opr.idx < BITSET_WORD_BITS
2436 && (dfa->used_bkref_map
2437 & ((bitset_word_t) 1 << dfa->nodes[node].opr.idx)))
2438 {
2439 err = match_ctx_add_subtop (mctx, node, str_idx);
2440 if (BE (err != REG_NOERROR, 0))
2441 return err;
2442 }
2443 }
2444 return REG_NOERROR;
2445}
2446
2447#if 0
2448/* Return the next state to which the current state STATE will transit by
2449 accepting the current input byte. */
2450
2451static re_dfastate_t *
2452transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx,
2453 re_dfastate_t *state)
2454{
2455 const re_dfa_t *const dfa = mctx->dfa;
2456 re_node_set next_nodes;
2457 re_dfastate_t *next_state;
2458 int node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2459 unsigned int context;
2460
2461 *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2462 if (BE (*err != REG_NOERROR, 0))
2463 return NULL;
2464 for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2465 {
2466 int cur_node = state->nodes.elems[node_cnt];
2467 if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2468 {
2469 *err = re_node_set_merge (&next_nodes,
2470 dfa->eclosures + dfa->nexts[cur_node]);
2471 if (BE (*err != REG_NOERROR, 0))
2472 {
2473 re_node_set_free (&next_nodes);
2474 return NULL;
2475 }
2476 }
2477 }
2478 context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2479 next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2480 /* We don't need to check errors here, since the return value of
2481 this function is next_state and ERR is already set. */
2482
2483 re_node_set_free (&next_nodes);
2484 re_string_skip_bytes (&mctx->input, 1);
2485 return next_state;
2486}
2487#endif
2488
2489#ifdef RE_ENABLE_I18N
2490static reg_errcode_t
2491internal_function
2492transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate)
2493{
2494 const re_dfa_t *const dfa = mctx->dfa;
2495 reg_errcode_t err;
2496 int i;
2497
2498 for (i = 0; i < pstate->nodes.nelem; ++i)
2499 {
2500 re_node_set dest_nodes, *new_nodes;
2501 int cur_node_idx = pstate->nodes.elems[i];
2502 int naccepted, dest_idx;
2503 unsigned int context;
2504 re_dfastate_t *dest_state;
2505
2506 if (!dfa->nodes[cur_node_idx].accept_mb)
2507 continue;
2508
2509 if (dfa->nodes[cur_node_idx].constraint)
2510 {
2511 context = re_string_context_at (&mctx->input,
2512 re_string_cur_idx (&mctx->input),
2513 mctx->eflags);
2514 if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2515 context))
2516 continue;
2517 }
2518
2519 /* How many bytes the node can accept? */
2520 naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2521 re_string_cur_idx (&mctx->input));
2522 if (naccepted == 0)
2523 continue;
2524
2525 /* The node can accepts `naccepted' bytes. */
2526 dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2527 mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2528 : mctx->max_mb_elem_len);
2529 err = clean_state_log_if_needed (mctx, dest_idx);
2530 if (BE (err != REG_NOERROR, 0))
2531 return err;
2532#ifdef DEBUG
2533 assert (dfa->nexts[cur_node_idx] != -1);
2534#endif
2535 new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
2536
2537 dest_state = mctx->state_log[dest_idx];
2538 if (dest_state == NULL)
2539 dest_nodes = *new_nodes;
2540 else
2541 {
2542 err = re_node_set_init_union (&dest_nodes,
2543 dest_state->entrance_nodes, new_nodes);
2544 if (BE (err != REG_NOERROR, 0))
2545 return err;
2546 }
2547 context = re_string_context_at (&mctx->input, dest_idx - 1,
2548 mctx->eflags);
2549 mctx->state_log[dest_idx]
2550 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2551 if (dest_state != NULL)
2552 re_node_set_free (&dest_nodes);
2553 if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
2554 return err;
2555 }
2556 return REG_NOERROR;
2557}
2558#endif /* RE_ENABLE_I18N */
2559
2560static reg_errcode_t
2561internal_function
2562transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes)
2563{
2564 const re_dfa_t *const dfa = mctx->dfa;
2565 reg_errcode_t err;
2566 int i;
2567 int cur_str_idx = re_string_cur_idx (&mctx->input);
2568
2569 for (i = 0; i < nodes->nelem; ++i)
2570 {
2571 int dest_str_idx, prev_nelem, bkc_idx;
2572 int node_idx = nodes->elems[i];
2573 unsigned int context;
2574 const re_token_t *node = dfa->nodes + node_idx;
2575 re_node_set *new_dest_nodes;
2576
2577 /* Check whether `node' is a backreference or not. */
2578 if (node->type != OP_BACK_REF)
2579 continue;
2580
2581 if (node->constraint)
2582 {
2583 context = re_string_context_at (&mctx->input, cur_str_idx,
2584 mctx->eflags);
2585 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2586 continue;
2587 }
2588
2589 /* `node' is a backreference.
2590 Check the substring which the substring matched. */
2591 bkc_idx = mctx->nbkref_ents;
2592 err = get_subexp (mctx, node_idx, cur_str_idx);
2593 if (BE (err != REG_NOERROR, 0))
2594 goto free_return;
2595
2596 /* And add the epsilon closures (which is `new_dest_nodes') of
2597 the backreference to appropriate state_log. */
2598#ifdef DEBUG
2599 assert (dfa->nexts[node_idx] != -1);
2600#endif
2601 for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2602 {
2603 int subexp_len;
2604 re_dfastate_t *dest_state;
2605 struct re_backref_cache_entry *bkref_ent;
2606 bkref_ent = mctx->bkref_ents + bkc_idx;
2607 if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2608 continue;
2609 subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2610 new_dest_nodes = (subexp_len == 0
2611 ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2612 : dfa->eclosures + dfa->nexts[node_idx]);
2613 dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2614 - bkref_ent->subexp_from);
2615 context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2616 mctx->eflags);
2617 dest_state = mctx->state_log[dest_str_idx];
2618 prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2619 : mctx->state_log[cur_str_idx]->nodes.nelem);
2620 /* Add `new_dest_node' to state_log. */
2621 if (dest_state == NULL)
2622 {
2623 mctx->state_log[dest_str_idx]
2624 = re_acquire_state_context (&err, dfa, new_dest_nodes,
2625 context);
2626 if (BE (mctx->state_log[dest_str_idx] == NULL
2627 && err != REG_NOERROR, 0))
2628 goto free_return;
2629 }
2630 else
2631 {
2632 re_node_set dest_nodes;
2633 err = re_node_set_init_union (&dest_nodes,
2634 dest_state->entrance_nodes,
2635 new_dest_nodes);
2636 if (BE (err != REG_NOERROR, 0))
2637 {
2638 re_node_set_free (&dest_nodes);
2639 goto free_return;
2640 }
2641 mctx->state_log[dest_str_idx]
2642 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2643 re_node_set_free (&dest_nodes);
2644 if (BE (mctx->state_log[dest_str_idx] == NULL
2645 && err != REG_NOERROR, 0))
2646 goto free_return;
2647 }
2648 /* We need to check recursively if the backreference can epsilon
2649 transit. */
2650 if (subexp_len == 0
2651 && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2652 {
2653 err = check_subexp_matching_top (mctx, new_dest_nodes,
2654 cur_str_idx);
2655 if (BE (err != REG_NOERROR, 0))
2656 goto free_return;
2657 err = transit_state_bkref (mctx, new_dest_nodes);
2658 if (BE (err != REG_NOERROR, 0))
2659 goto free_return;
2660 }
2661 }
2662 }
2663 err = REG_NOERROR;
2664 free_return:
2665 return err;
2666}
2667
2668/* Enumerate all the candidates which the backreference BKREF_NODE can match
2669 at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2670 Note that we might collect inappropriate candidates here.
2671 However, the cost of checking them strictly here is too high, then we
2672 delay these checking for prune_impossible_nodes(). */
2673
2674static reg_errcode_t
2675internal_function __attribute_warn_unused_result__
2676get_subexp (re_match_context_t *mctx, int bkref_node, int bkref_str_idx)
2677{
2678 const re_dfa_t *const dfa = mctx->dfa;
2679 int subexp_num, sub_top_idx;
2680 const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2681 /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX. */
2682 int cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2683 if (cache_idx != -1)
2684 {
2685 const struct re_backref_cache_entry *entry
2686 = mctx->bkref_ents + cache_idx;
2687 do
2688 if (entry->node == bkref_node)
2689 return REG_NOERROR; /* We already checked it. */
2690 while (entry++->more);
2691 }
2692
2693 subexp_num = dfa->nodes[bkref_node].opr.idx;
2694
2695 /* For each sub expression */
2696 for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2697 {
2698 reg_errcode_t err;
2699 re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2700 re_sub_match_last_t *sub_last;
2701 int sub_last_idx, sl_str, bkref_str_off;
2702
2703 if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2704 continue; /* It isn't related. */
2705
2706 sl_str = sub_top->str_idx;
2707 bkref_str_off = bkref_str_idx;
2708 /* At first, check the last node of sub expressions we already
2709 evaluated. */
2710 for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2711 {
2712 int sl_str_diff;
2713 sub_last = sub_top->lasts[sub_last_idx];
2714 sl_str_diff = sub_last->str_idx - sl_str;
2715 /* The matched string by the sub expression match with the substring
2716 at the back reference? */
2717 if (sl_str_diff > 0)
2718 {
2719 if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
2720 {
2721 /* Not enough chars for a successful match. */
2722 if (bkref_str_off + sl_str_diff > mctx->input.len)
2723 break;
2724
2725 err = clean_state_log_if_needed (mctx,
2726 bkref_str_off
2727 + sl_str_diff);
2728 if (BE (err != REG_NOERROR, 0))
2729 return err;
2730 buf = (const char *) re_string_get_buffer (&mctx->input);
2731 }
2732 if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2733 /* We don't need to search this sub expression any more. */
2734 break;
2735 }
2736 bkref_str_off += sl_str_diff;
2737 sl_str += sl_str_diff;
2738 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2739 bkref_str_idx);
2740
2741 /* Reload buf, since the preceding call might have reallocated
2742 the buffer. */
2743 buf = (const char *) re_string_get_buffer (&mctx->input);
2744
2745 if (err == REG_NOMATCH)
2746 continue;
2747 if (BE (err != REG_NOERROR, 0))
2748 return err;
2749 }
2750
2751 if (sub_last_idx < sub_top->nlasts)
2752 continue;
2753 if (sub_last_idx > 0)
2754 ++sl_str;
2755 /* Then, search for the other last nodes of the sub expression. */
2756 for (; sl_str <= bkref_str_idx; ++sl_str)
2757 {
2758 int cls_node, sl_str_off;
2759 const re_node_set *nodes;
2760 sl_str_off = sl_str - sub_top->str_idx;
2761 /* The matched string by the sub expression match with the substring
2762 at the back reference? */
2763 if (sl_str_off > 0)
2764 {
2765 if (BE (bkref_str_off >= mctx->input.valid_len, 0))
2766 {
2767 /* If we are at the end of the input, we cannot match. */
2768 if (bkref_str_off >= mctx->input.len)
2769 break;
2770
2771 err = extend_buffers (mctx, bkref_str_off + 1);
2772 if (BE (err != REG_NOERROR, 0))
2773 return err;
2774
2775 buf = (const char *) re_string_get_buffer (&mctx->input);
2776 }
2777 if (buf [bkref_str_off++] != buf[sl_str - 1])
2778 break; /* We don't need to search this sub expression
2779 any more. */
2780 }
2781 if (mctx->state_log[sl_str] == NULL)
2782 continue;
2783 /* Does this state have a ')' of the sub expression? */
2784 nodes = &mctx->state_log[sl_str]->nodes;
2785 cls_node = find_subexp_node (dfa, nodes, subexp_num,
2786 OP_CLOSE_SUBEXP);
2787 if (cls_node == -1)
2788 continue; /* No. */
2789 if (sub_top->path == NULL)
2790 {
2791 sub_top->path = calloc (sizeof (state_array_t),
2792 sl_str - sub_top->str_idx + 1);
2793 if (sub_top->path == NULL)
2794 return REG_ESPACE;
2795 }
2796 /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2797 in the current context? */
2798 err = check_arrival (mctx, sub_top->path, sub_top->node,
2799 sub_top->str_idx, cls_node, sl_str,
2800 OP_CLOSE_SUBEXP);
2801 if (err == REG_NOMATCH)
2802 continue;
2803 if (BE (err != REG_NOERROR, 0))
2804 return err;
2805 sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2806 if (BE (sub_last == NULL, 0))
2807 return REG_ESPACE;
2808 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2809 bkref_str_idx);
2810 if (err == REG_NOMATCH)
2811 continue;
2812 }
2813 }
2814 return REG_NOERROR;
2815}
2816
2817/* Helper functions for get_subexp(). */
2818
2819/* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2820 If it can arrive, register the sub expression expressed with SUB_TOP
2821 and SUB_LAST. */
2822
2823static reg_errcode_t
2824internal_function
2825get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top,
2826 re_sub_match_last_t *sub_last, int bkref_node, int bkref_str)
2827{
2828 reg_errcode_t err;
2829 int to_idx;
2830 /* Can the subexpression arrive the back reference? */
2831 err = check_arrival (mctx, &sub_last->path, sub_last->node,
2832 sub_last->str_idx, bkref_node, bkref_str,
2833 OP_OPEN_SUBEXP);
2834 if (err != REG_NOERROR)
2835 return err;
2836 err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2837 sub_last->str_idx);
2838 if (BE (err != REG_NOERROR, 0))
2839 return err;
2840 to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2841 return clean_state_log_if_needed (mctx, to_idx);
2842}
2843
2844/* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2845 Search '(' if FL_OPEN, or search ')' otherwise.
2846 TODO: This function isn't efficient...
2847 Because there might be more than one nodes whose types are
2848 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2849 nodes.
2850 E.g. RE: (a){2} */
2851
2852static int
2853internal_function
2854find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
2855 int subexp_idx, int type)
2856{
2857 int cls_idx;
2858 for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2859 {
2860 int cls_node = nodes->elems[cls_idx];
2861 const re_token_t *node = dfa->nodes + cls_node;
2862 if (node->type == type
2863 && node->opr.idx == subexp_idx)
2864 return cls_node;
2865 }
2866 return -1;
2867}
2868
2869/* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2870 LAST_NODE at LAST_STR. We record the path onto PATH since it will be
2871 heavily reused.
2872 Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise. */
2873
2874static reg_errcode_t
2875internal_function __attribute_warn_unused_result__
2876check_arrival (re_match_context_t *mctx, state_array_t *path, int top_node,
2877 int top_str, int last_node, int last_str, int type)
2878{
2879 const re_dfa_t *const dfa = mctx->dfa;
2880 reg_errcode_t err = REG_NOERROR;
2881 int subexp_num, backup_cur_idx, str_idx, null_cnt;
2882 re_dfastate_t *cur_state = NULL;
2883 re_node_set *cur_nodes, next_nodes;
2884 re_dfastate_t **backup_state_log;
2885 unsigned int context;
2886
2887 subexp_num = dfa->nodes[top_node].opr.idx;
2888 /* Extend the buffer if we need. */
2889 if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
2890 {
2891 re_dfastate_t **new_array;
2892 int old_alloc = path->alloc;
2893 path->alloc += last_str + mctx->max_mb_elem_len + 1;
2894 new_array = re_realloc (path->array, re_dfastate_t *, path->alloc);
2895 if (BE (new_array == NULL, 0))
2896 {
2897 path->alloc = old_alloc;
2898 return REG_ESPACE;
2899 }
2900 path->array = new_array;
2901 memset (new_array + old_alloc, '\0',
2902 sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2903 }
2904
2905 str_idx = path->next_idx ?: top_str;
2906
2907 /* Temporary modify MCTX. */
2908 backup_state_log = mctx->state_log;
2909 backup_cur_idx = mctx->input.cur_idx;
2910 mctx->state_log = path->array;
2911 mctx->input.cur_idx = str_idx;
2912
2913 /* Setup initial node set. */
2914 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2915 if (str_idx == top_str)
2916 {
2917 err = re_node_set_init_1 (&next_nodes, top_node);
2918 if (BE (err != REG_NOERROR, 0))
2919 return err;
2920 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2921 if (BE (err != REG_NOERROR, 0))
2922 {
2923 re_node_set_free (&next_nodes);
2924 return err;
2925 }
2926 }
2927 else
2928 {
2929 cur_state = mctx->state_log[str_idx];
2930 if (cur_state && cur_state->has_backref)
2931 {
2932 err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2933 if (BE (err != REG_NOERROR, 0))
2934 return err;
2935 }
2936 else
2937 re_node_set_init_empty (&next_nodes);
2938 }
2939 if (str_idx == top_str || (cur_state && cur_state->has_backref))
2940 {
2941 if (next_nodes.nelem)
2942 {
2943 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2944 subexp_num, type);
2945 if (BE (err != REG_NOERROR, 0))
2946 {
2947 re_node_set_free (&next_nodes);
2948 return err;
2949 }
2950 }
2951 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2952 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2953 {
2954 re_node_set_free (&next_nodes);
2955 return err;
2956 }
2957 mctx->state_log[str_idx] = cur_state;
2958 }
2959
2960 for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2961 {
2962 re_node_set_empty (&next_nodes);
2963 if (mctx->state_log[str_idx + 1])
2964 {
2965 err = re_node_set_merge (&next_nodes,
2966 &mctx->state_log[str_idx + 1]->nodes);
2967 if (BE (err != REG_NOERROR, 0))
2968 {
2969 re_node_set_free (&next_nodes);
2970 return err;
2971 }
2972 }
2973 if (cur_state)
2974 {
2975 err = check_arrival_add_next_nodes (mctx, str_idx,
2976 &cur_state->non_eps_nodes,
2977 &next_nodes);
2978 if (BE (err != REG_NOERROR, 0))
2979 {
2980 re_node_set_free (&next_nodes);
2981 return err;
2982 }
2983 }
2984 ++str_idx;
2985 if (next_nodes.nelem)
2986 {
2987 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2988 if (BE (err != REG_NOERROR, 0))
2989 {
2990 re_node_set_free (&next_nodes);
2991 return err;
2992 }
2993 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2994 subexp_num, type);
2995 if (BE (err != REG_NOERROR, 0))
2996 {
2997 re_node_set_free (&next_nodes);
2998 return err;
2999 }
3000 }
3001 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
3002 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
3003 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
3004 {
3005 re_node_set_free (&next_nodes);
3006 return err;
3007 }
3008 mctx->state_log[str_idx] = cur_state;
3009 null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
3010 }
3011 re_node_set_free (&next_nodes);
3012 cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
3013 : &mctx->state_log[last_str]->nodes);
3014 path->next_idx = str_idx;
3015
3016 /* Fix MCTX. */
3017 mctx->state_log = backup_state_log;
3018 mctx->input.cur_idx = backup_cur_idx;
3019
3020 /* Then check the current node set has the node LAST_NODE. */
3021 if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
3022 return REG_NOERROR;
3023
3024 return REG_NOMATCH;
3025}
3026
3027/* Helper functions for check_arrival. */
3028
3029/* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
3030 to NEXT_NODES.
3031 TODO: This function is similar to the functions transit_state*(),
3032 however this function has many additional works.
3033 Can't we unify them? */
3034
3035static reg_errcode_t
3036internal_function __attribute_warn_unused_result__
3037check_arrival_add_next_nodes (re_match_context_t *mctx, int str_idx,
3038 re_node_set *cur_nodes, re_node_set *next_nodes)
3039{
3040 const re_dfa_t *const dfa = mctx->dfa;
3041 int result;
3042 int cur_idx;
3043 reg_errcode_t err = REG_NOERROR;
3044 re_node_set union_set;
3045 re_node_set_init_empty (&union_set);
3046 for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
3047 {
3048 int naccepted = 0;
3049 int cur_node = cur_nodes->elems[cur_idx];
3050#ifdef DEBUG
3051 re_token_type_t type = dfa->nodes[cur_node].type;
3052 assert (!IS_EPSILON_NODE (type));
3053#endif
3054#ifdef RE_ENABLE_I18N
3055 /* If the node may accept `multi byte'. */
3056 if (dfa->nodes[cur_node].accept_mb)
3057 {
3058 naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
3059 str_idx);
3060 if (naccepted > 1)
3061 {
3062 re_dfastate_t *dest_state;
3063 int next_node = dfa->nexts[cur_node];
3064 int next_idx = str_idx + naccepted;
3065 dest_state = mctx->state_log[next_idx];
3066 re_node_set_empty (&union_set);
3067 if (dest_state)
3068 {
3069 err = re_node_set_merge (&union_set, &dest_state->nodes);
3070 if (BE (err != REG_NOERROR, 0))
3071 {
3072 re_node_set_free (&union_set);
3073 return err;
3074 }
3075 }
3076 result = re_node_set_insert (&union_set, next_node);
3077 if (BE (result < 0, 0))
3078 {
3079 re_node_set_free (&union_set);
3080 return REG_ESPACE;
3081 }
3082 mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3083 &union_set);
3084 if (BE (mctx->state_log[next_idx] == NULL
3085 && err != REG_NOERROR, 0))
3086 {
3087 re_node_set_free (&union_set);
3088 return err;
3089 }
3090 }
3091 }
3092#endif /* RE_ENABLE_I18N */
3093 if (naccepted
3094 || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3095 {
3096 result = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3097 if (BE (result < 0, 0))
3098 {
3099 re_node_set_free (&union_set);
3100 return REG_ESPACE;
3101 }
3102 }
3103 }
3104 re_node_set_free (&union_set);
3105 return REG_NOERROR;
3106}
3107
3108/* For all the nodes in CUR_NODES, add the epsilon closures of them to
3109 CUR_NODES, however exclude the nodes which are:
3110 - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3111 - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3112*/
3113
3114static reg_errcode_t
3115internal_function
3116check_arrival_expand_ecl (const re_dfa_t *dfa, re_node_set *cur_nodes,
3117 int ex_subexp, int type)
3118{
3119 reg_errcode_t err;
3120 int idx, outside_node;
3121 re_node_set new_nodes;
3122#ifdef DEBUG
3123 assert (cur_nodes->nelem);
3124#endif
3125 err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3126 if (BE (err != REG_NOERROR, 0))
3127 return err;
3128 /* Create a new node set NEW_NODES with the nodes which are epsilon
3129 closures of the node in CUR_NODES. */
3130
3131 for (idx = 0; idx < cur_nodes->nelem; ++idx)
3132 {
3133 int cur_node = cur_nodes->elems[idx];
3134 const re_node_set *eclosure = dfa->eclosures + cur_node;
3135 outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3136 if (outside_node == -1)
3137 {
3138 /* There are no problematic nodes, just merge them. */
3139 err = re_node_set_merge (&new_nodes, eclosure);
3140 if (BE (err != REG_NOERROR, 0))
3141 {
3142 re_node_set_free (&new_nodes);
3143 return err;
3144 }
3145 }
3146 else
3147 {
3148 /* There are problematic nodes, re-calculate incrementally. */
3149 err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3150 ex_subexp, type);
3151 if (BE (err != REG_NOERROR, 0))
3152 {
3153 re_node_set_free (&new_nodes);
3154 return err;
3155 }
3156 }
3157 }
3158 re_node_set_free (cur_nodes);
3159 *cur_nodes = new_nodes;
3160 return REG_NOERROR;
3161}
3162
3163/* Helper function for check_arrival_expand_ecl.
3164 Check incrementally the epsilon closure of TARGET, and if it isn't
3165 problematic append it to DST_NODES. */
3166
3167static reg_errcode_t
3168internal_function __attribute_warn_unused_result__
3169check_arrival_expand_ecl_sub (const re_dfa_t *dfa, re_node_set *dst_nodes,
3170 int target, int ex_subexp, int type)
3171{
3172 int cur_node;
3173 for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3174 {
3175 int err;
3176
3177 if (dfa->nodes[cur_node].type == type
3178 && dfa->nodes[cur_node].opr.idx == ex_subexp)
3179 {
3180 if (type == OP_CLOSE_SUBEXP)
3181 {
3182 err = re_node_set_insert (dst_nodes, cur_node);
3183 if (BE (err == -1, 0))
3184 return REG_ESPACE;
3185 }
3186 break;
3187 }
3188 err = re_node_set_insert (dst_nodes, cur_node);
3189 if (BE (err == -1, 0))
3190 return REG_ESPACE;
3191 if (dfa->edests[cur_node].nelem == 0)
3192 break;
3193 if (dfa->edests[cur_node].nelem == 2)
3194 {
3195 err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3196 dfa->edests[cur_node].elems[1],
3197 ex_subexp, type);
3198 if (BE (err != REG_NOERROR, 0))
3199 return err;
3200 }
3201 cur_node = dfa->edests[cur_node].elems[0];
3202 }
3203 return REG_NOERROR;
3204}
3205
3206
3207/* For all the back references in the current state, calculate the
3208 destination of the back references by the appropriate entry
3209 in MCTX->BKREF_ENTS. */
3210
3211static reg_errcode_t
3212internal_function __attribute_warn_unused_result__
3213expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes,
3214 int cur_str, int subexp_num, int type)
3215{
3216 const re_dfa_t *const dfa = mctx->dfa;
3217 reg_errcode_t err;
3218 int cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3219 struct re_backref_cache_entry *ent;
3220
3221 if (cache_idx_start == -1)
3222 return REG_NOERROR;
3223
3224 restart:
3225 ent = mctx->bkref_ents + cache_idx_start;
3226 do
3227 {
3228 int to_idx, next_node;
3229
3230 /* Is this entry ENT is appropriate? */
3231 if (!re_node_set_contains (cur_nodes, ent->node))
3232 continue; /* No. */
3233
3234 to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3235 /* Calculate the destination of the back reference, and append it
3236 to MCTX->STATE_LOG. */
3237 if (to_idx == cur_str)
3238 {
3239 /* The backreference did epsilon transit, we must re-check all the
3240 node in the current state. */
3241 re_node_set new_dests;
3242 reg_errcode_t err2, err3;
3243 next_node = dfa->edests[ent->node].elems[0];
3244 if (re_node_set_contains (cur_nodes, next_node))
3245 continue;
3246 err = re_node_set_init_1 (&new_dests, next_node);
3247 err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3248 err3 = re_node_set_merge (cur_nodes, &new_dests);
3249 re_node_set_free (&new_dests);
3250 if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3251 || err3 != REG_NOERROR, 0))
3252 {
3253 err = (err != REG_NOERROR ? err
3254 : (err2 != REG_NOERROR ? err2 : err3));
3255 return err;
3256 }
3257 /* TODO: It is still inefficient... */
3258 goto restart;
3259 }
3260 else
3261 {
3262 re_node_set union_set;
3263 next_node = dfa->nexts[ent->node];
3264 if (mctx->state_log[to_idx])
3265 {
3266 int ret;
3267 if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3268 next_node))
3269 continue;
3270 err = re_node_set_init_copy (&union_set,
3271 &mctx->state_log[to_idx]->nodes);
3272 ret = re_node_set_insert (&union_set, next_node);
3273 if (BE (err != REG_NOERROR || ret < 0, 0))
3274 {
3275 re_node_set_free (&union_set);
3276 err = err != REG_NOERROR ? err : REG_ESPACE;
3277 return err;
3278 }
3279 }
3280 else
3281 {
3282 err = re_node_set_init_1 (&union_set, next_node);
3283 if (BE (err != REG_NOERROR, 0))
3284 return err;
3285 }
3286 mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3287 re_node_set_free (&union_set);
3288 if (BE (mctx->state_log[to_idx] == NULL
3289 && err != REG_NOERROR, 0))
3290 return err;
3291 }
3292 }
3293 while (ent++->more);
3294 return REG_NOERROR;
3295}
3296
3297/* Build transition table for the state.
3298 Return 1 if succeeded, otherwise return NULL. */
3299
3300static int
3301internal_function
3302build_trtable (const re_dfa_t *dfa, re_dfastate_t *state)
3303{
3304 reg_errcode_t err;
3305 int i, j, ch, need_word_trtable = 0;
3306 bitset_word_t elem, mask;
3307 bool dests_node_malloced = false;
3308 bool dest_states_malloced = false;
3309 int ndests; /* Number of the destination states from `state'. */
3310 re_dfastate_t **trtable;
3311 re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3312 re_node_set follows, *dests_node;
3313 bitset_t *dests_ch;
3314 bitset_t acceptable;
3315
3316 struct dests_alloc
3317 {
3318 re_node_set dests_node[SBC_MAX];
3319 bitset_t dests_ch[SBC_MAX];
3320 } *dests_alloc;
3321
3322 /* We build DFA states which corresponds to the destination nodes
3323 from `state'. `dests_node[i]' represents the nodes which i-th
3324 destination state contains, and `dests_ch[i]' represents the
3325 characters which i-th destination state accepts. */
3326 if (__libc_use_alloca (sizeof (struct dests_alloc)))
3327 dests_alloc = (struct dests_alloc *) alloca (sizeof (struct dests_alloc));
3328 else
3329 {
3330 dests_alloc = re_malloc (struct dests_alloc, 1);
3331 if (BE (dests_alloc == NULL, 0))
3332 return 0;
3333 dests_node_malloced = true;
3334 }
3335 dests_node = dests_alloc->dests_node;
3336 dests_ch = dests_alloc->dests_ch;
3337
3338 /* Initialize transiton table. */
3339 state->word_trtable = state->trtable = NULL;
3340
3341 /* At first, group all nodes belonging to `state' into several
3342 destinations. */
3343 ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3344 if (BE (ndests <= 0, 0))
3345 {
3346 if (dests_node_malloced)
3347 free (dests_alloc);
3348 /* Return 0 in case of an error, 1 otherwise. */
3349 if (ndests == 0)
3350 {
3351 state->trtable = (re_dfastate_t **)
3352 calloc (sizeof (re_dfastate_t *), SBC_MAX);
3353 if (BE (state->trtable == NULL, 0))
3354 return 0;
3355 return 1;
3356 }
3357 return 0;
3358 }
3359
3360 err = re_node_set_alloc (&follows, ndests + 1);
3361 if (BE (err != REG_NOERROR, 0))
3362 goto out_free;
3363
3364 /* Avoid arithmetic overflow in size calculation. */
3365 if (BE ((((SIZE_MAX - (sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX)
3366 / (3 * sizeof (re_dfastate_t *)))
3367 < ndests),
3368 0))
3369 goto out_free;
3370
3371 if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX
3372 + ndests * 3 * sizeof (re_dfastate_t *)))
3373 dest_states = (re_dfastate_t **)
3374 alloca (ndests * 3 * sizeof (re_dfastate_t *));
3375 else
3376 {
3377 dest_states = (re_dfastate_t **)
3378 malloc (ndests * 3 * sizeof (re_dfastate_t *));
3379 if (BE (dest_states == NULL, 0))
3380 {
3381out_free:
3382 if (dest_states_malloced)
3383 free (dest_states);
3384 re_node_set_free (&follows);
3385 for (i = 0; i < ndests; ++i)
3386 re_node_set_free (dests_node + i);
3387 if (dests_node_malloced)
3388 free (dests_alloc);
3389 return 0;
3390 }
3391 dest_states_malloced = true;
3392 }
3393 dest_states_word = dest_states + ndests;
3394 dest_states_nl = dest_states_word + ndests;
3395 bitset_empty (acceptable);
3396
3397 /* Then build the states for all destinations. */
3398 for (i = 0; i < ndests; ++i)
3399 {
3400 int next_node;
3401 re_node_set_empty (&follows);
3402 /* Merge the follows of this destination states. */
3403 for (j = 0; j < dests_node[i].nelem; ++j)
3404 {
3405 next_node = dfa->nexts[dests_node[i].elems[j]];
3406 if (next_node != -1)
3407 {
3408 err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3409 if (BE (err != REG_NOERROR, 0))
3410 goto out_free;
3411 }
3412 }
3413 dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3414 if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
3415 goto out_free;
3416 /* If the new state has context constraint,
3417 build appropriate states for these contexts. */
3418 if (dest_states[i]->has_constraint)
3419 {
3420 dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3421 CONTEXT_WORD);
3422 if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
3423 goto out_free;
3424
3425 if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
3426 need_word_trtable = 1;
3427
3428 dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3429 CONTEXT_NEWLINE);
3430 if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
3431 goto out_free;
3432 }
3433 else
3434 {
3435 dest_states_word[i] = dest_states[i];
3436 dest_states_nl[i] = dest_states[i];
3437 }
3438 bitset_merge (acceptable, dests_ch[i]);
3439 }
3440
3441 if (!BE (need_word_trtable, 0))
3442 {
3443 /* We don't care about whether the following character is a word
3444 character, or we are in a single-byte character set so we can
3445 discern by looking at the character code: allocate a
3446 256-entry transition table. */
3447 trtable = state->trtable =
3448 (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
3449 if (BE (trtable == NULL, 0))
3450 goto out_free;
3451
3452 /* For all characters ch...: */
3453 for (i = 0; i < BITSET_WORDS; ++i)
3454 for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3455 elem;
3456 mask <<= 1, elem >>= 1, ++ch)
3457 if (BE (elem & 1, 0))
3458 {
3459 /* There must be exactly one destination which accepts
3460 character ch. See group_nodes_into_DFAstates. */
3461 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3462 ;
3463
3464 /* j-th destination accepts the word character ch. */
3465 if (dfa->word_char[i] & mask)
3466 trtable[ch] = dest_states_word[j];
3467 else
3468 trtable[ch] = dest_states[j];
3469 }
3470 }
3471 else
3472 {
3473 /* We care about whether the following character is a word
3474 character, and we are in a multi-byte character set: discern
3475 by looking at the character code: build two 256-entry
3476 transition tables, one starting at trtable[0] and one
3477 starting at trtable[SBC_MAX]. */
3478 trtable = state->word_trtable =
3479 (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX);
3480 if (BE (trtable == NULL, 0))
3481 goto out_free;
3482
3483 /* For all characters ch...: */
3484 for (i = 0; i < BITSET_WORDS; ++i)
3485 for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3486 elem;
3487 mask <<= 1, elem >>= 1, ++ch)
3488 if (BE (elem & 1, 0))
3489 {
3490 /* There must be exactly one destination which accepts
3491 character ch. See group_nodes_into_DFAstates. */
3492 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3493 ;
3494
3495 /* j-th destination accepts the word character ch. */
3496 trtable[ch] = dest_states[j];
3497 trtable[ch + SBC_MAX] = dest_states_word[j];
3498 }
3499 }
3500
3501 /* new line */
3502 if (bitset_contain (acceptable, NEWLINE_CHAR))
3503 {
3504 /* The current state accepts newline character. */
3505 for (j = 0; j < ndests; ++j)
3506 if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3507 {
3508 /* k-th destination accepts newline character. */
3509 trtable[NEWLINE_CHAR] = dest_states_nl[j];
3510 if (need_word_trtable)
3511 trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3512 /* There must be only one destination which accepts
3513 newline. See group_nodes_into_DFAstates. */
3514 break;
3515 }
3516 }
3517
3518 if (dest_states_malloced)
3519 free (dest_states);
3520
3521 re_node_set_free (&follows);
3522 for (i = 0; i < ndests; ++i)
3523 re_node_set_free (dests_node + i);
3524
3525 if (dests_node_malloced)
3526 free (dests_alloc);
3527
3528 return 1;
3529}
3530
3531/* Group all nodes belonging to STATE into several destinations.
3532 Then for all destinations, set the nodes belonging to the destination
3533 to DESTS_NODE[i] and set the characters accepted by the destination
3534 to DEST_CH[i]. This function return the number of destinations. */
3535
3536static int
3537internal_function
3538group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state,
3539 re_node_set *dests_node, bitset_t *dests_ch)
3540{
3541 reg_errcode_t err;
3542 int result;
3543 int i, j, k;
3544 int ndests; /* Number of the destinations from `state'. */
3545 bitset_t accepts; /* Characters a node can accept. */
3546 const re_node_set *cur_nodes = &state->nodes;
3547 bitset_empty (accepts);
3548 ndests = 0;
3549
3550 /* For all the nodes belonging to `state', */
3551 for (i = 0; i < cur_nodes->nelem; ++i)
3552 {
3553 re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3554 re_token_type_t type = node->type;
3555 unsigned int constraint = node->constraint;
3556
3557 /* Enumerate all single byte character this node can accept. */
3558 if (type == CHARACTER)
3559 bitset_set (accepts, node->opr.c);
3560 else if (type == SIMPLE_BRACKET)
3561 {
3562 bitset_merge (accepts, node->opr.sbcset);
3563 }
3564 else if (type == OP_PERIOD)
3565 {
3566#ifdef RE_ENABLE_I18N
3567 if (dfa->mb_cur_max > 1)
3568 bitset_merge (accepts, dfa->sb_char);
3569 else
3570#endif
3571 bitset_set_all (accepts);
3572 if (!(dfa->syntax & RE_DOT_NEWLINE))
3573 bitset_clear (accepts, '\n');
3574 if (dfa->syntax & RE_DOT_NOT_NULL)
3575 bitset_clear (accepts, '\0');
3576 }
3577#ifdef RE_ENABLE_I18N
3578 else if (type == OP_UTF8_PERIOD)
3579 {
3580 memset (accepts, '\xff', sizeof (bitset_t) / 2);
3581 if (!(dfa->syntax & RE_DOT_NEWLINE))
3582 bitset_clear (accepts, '\n');
3583 if (dfa->syntax & RE_DOT_NOT_NULL)
3584 bitset_clear (accepts, '\0');
3585 }
3586#endif
3587 else
3588 continue;
3589
3590 /* Check the `accepts' and sift the characters which are not
3591 match it the context. */
3592 if (constraint)
3593 {
3594 if (constraint & NEXT_NEWLINE_CONSTRAINT)
3595 {
3596 bool accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3597 bitset_empty (accepts);
3598 if (accepts_newline)
3599 bitset_set (accepts, NEWLINE_CHAR);
3600 else
3601 continue;
3602 }
3603 if (constraint & NEXT_ENDBUF_CONSTRAINT)
3604 {
3605 bitset_empty (accepts);
3606 continue;
3607 }
3608
3609 if (constraint & NEXT_WORD_CONSTRAINT)
3610 {
3611 bitset_word_t any_set = 0;
3612 if (type == CHARACTER && !node->word_char)
3613 {
3614 bitset_empty (accepts);
3615 continue;
3616 }
3617#ifdef RE_ENABLE_I18N
3618 if (dfa->mb_cur_max > 1)
3619 for (j = 0; j < BITSET_WORDS; ++j)
3620 any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3621 else
3622#endif
3623 for (j = 0; j < BITSET_WORDS; ++j)
3624 any_set |= (accepts[j] &= dfa->word_char[j]);
3625 if (!any_set)
3626 continue;
3627 }
3628 if (constraint & NEXT_NOTWORD_CONSTRAINT)
3629 {
3630 bitset_word_t any_set = 0;
3631 if (type == CHARACTER && node->word_char)
3632 {
3633 bitset_empty (accepts);
3634 continue;
3635 }
3636#ifdef RE_ENABLE_I18N
3637 if (dfa->mb_cur_max > 1)
3638 for (j = 0; j < BITSET_WORDS; ++j)
3639 any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3640 else
3641#endif
3642 for (j = 0; j < BITSET_WORDS; ++j)
3643 any_set |= (accepts[j] &= ~dfa->word_char[j]);
3644 if (!any_set)
3645 continue;
3646 }
3647 }
3648
3649 /* Then divide `accepts' into DFA states, or create a new
3650 state. Above, we make sure that accepts is not empty. */
3651 for (j = 0; j < ndests; ++j)
3652 {
3653 bitset_t intersec; /* Intersection sets, see below. */
3654 bitset_t remains;
3655 /* Flags, see below. */
3656 bitset_word_t has_intersec, not_subset, not_consumed;
3657
3658 /* Optimization, skip if this state doesn't accept the character. */
3659 if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3660 continue;
3661
3662 /* Enumerate the intersection set of this state and `accepts'. */
3663 has_intersec = 0;
3664 for (k = 0; k < BITSET_WORDS; ++k)
3665 has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3666 /* And skip if the intersection set is empty. */
3667 if (!has_intersec)
3668 continue;
3669
3670 /* Then check if this state is a subset of `accepts'. */
3671 not_subset = not_consumed = 0;
3672 for (k = 0; k < BITSET_WORDS; ++k)
3673 {
3674 not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3675 not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3676 }
3677
3678 /* If this state isn't a subset of `accepts', create a
3679 new group state, which has the `remains'. */
3680 if (not_subset)
3681 {
3682 bitset_copy (dests_ch[ndests], remains);
3683 bitset_copy (dests_ch[j], intersec);
3684 err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3685 if (BE (err != REG_NOERROR, 0))
3686 goto error_return;
3687 ++ndests;
3688 }
3689
3690 /* Put the position in the current group. */
3691 result = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3692 if (BE (result < 0, 0))
3693 goto error_return;
3694
3695 /* If all characters are consumed, go to next node. */
3696 if (!not_consumed)
3697 break;
3698 }
3699 /* Some characters remain, create a new group. */
3700 if (j == ndests)
3701 {
3702 bitset_copy (dests_ch[ndests], accepts);
3703 err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3704 if (BE (err != REG_NOERROR, 0))
3705 goto error_return;
3706 ++ndests;
3707 bitset_empty (accepts);
3708 }
3709 }
3710 return ndests;
3711 error_return:
3712 for (j = 0; j < ndests; ++j)
3713 re_node_set_free (dests_node + j);
3714 return -1;
3715}
3716
3717#ifdef RE_ENABLE_I18N
3718/* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3719 Return the number of the bytes the node accepts.
3720 STR_IDX is the current index of the input string.
3721
3722 This function handles the nodes which can accept one character, or
3723 one collating element like '.', '[a-z]', opposite to the other nodes
3724 can only accept one byte. */
3725
3726# ifdef _LIBC
3727# include <locale/weight.h>
3728# endif
3729
3730static int
3731internal_function
3732check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
3733 const re_string_t *input, int str_idx)
3734{
3735 const re_token_t *node = dfa->nodes + node_idx;
3736 int char_len, elem_len;
3737 int i;
3738
3739 if (BE (node->type == OP_UTF8_PERIOD, 0))
3740 {
3741 unsigned char c = re_string_byte_at (input, str_idx), d;
3742 if (BE (c < 0xc2, 1))
3743 return 0;
3744
3745 if (str_idx + 2 > input->len)
3746 return 0;
3747
3748 d = re_string_byte_at (input, str_idx + 1);
3749 if (c < 0xe0)
3750 return (d < 0x80 || d > 0xbf) ? 0 : 2;
3751 else if (c < 0xf0)
3752 {
3753 char_len = 3;
3754 if (c == 0xe0 && d < 0xa0)
3755 return 0;
3756 }
3757 else if (c < 0xf8)
3758 {
3759 char_len = 4;
3760 if (c == 0xf0 && d < 0x90)
3761 return 0;
3762 }
3763 else if (c < 0xfc)
3764 {
3765 char_len = 5;
3766 if (c == 0xf8 && d < 0x88)
3767 return 0;
3768 }
3769 else if (c < 0xfe)
3770 {
3771 char_len = 6;
3772 if (c == 0xfc && d < 0x84)
3773 return 0;
3774 }
3775 else
3776 return 0;
3777
3778 if (str_idx + char_len > input->len)
3779 return 0;
3780
3781 for (i = 1; i < char_len; ++i)
3782 {
3783 d = re_string_byte_at (input, str_idx + i);
3784 if (d < 0x80 || d > 0xbf)
3785 return 0;
3786 }
3787 return char_len;
3788 }
3789
3790 char_len = re_string_char_size_at (input, str_idx);
3791 if (node->type == OP_PERIOD)
3792 {
3793 if (char_len <= 1)
3794 return 0;
3795 /* FIXME: I don't think this if is needed, as both '\n'
3796 and '\0' are char_len == 1. */
3797 /* '.' accepts any one character except the following two cases. */
3798 if ((!(dfa->syntax & RE_DOT_NEWLINE) &&
3799 re_string_byte_at (input, str_idx) == '\n') ||
3800 ((dfa->syntax & RE_DOT_NOT_NULL) &&
3801 re_string_byte_at (input, str_idx) == '\0'))
3802 return 0;
3803 return char_len;
3804 }
3805
3806 elem_len = re_string_elem_size_at (input, str_idx);
3807 if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
3808 return 0;
3809
3810 if (node->type == COMPLEX_BRACKET)
3811 {
3812 const re_charset_t *cset = node->opr.mbcset;
3813# ifdef _LIBC
3814 const unsigned char *pin
3815 = ((const unsigned char *) re_string_get_buffer (input) + str_idx);
3816 int j;
3817 uint32_t nrules;
3818# endif /* _LIBC */
3819 int match_len = 0;
3820 wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3821 ? re_string_wchar_at (input, str_idx) : 0);
3822
3823 /* match with multibyte character? */
3824 for (i = 0; i < cset->nmbchars; ++i)
3825 if (wc == cset->mbchars[i])
3826 {
3827 match_len = char_len;
3828 goto check_node_accept_bytes_match;
3829 }
3830 /* match with character_class? */
3831 for (i = 0; i < cset->nchar_classes; ++i)
3832 {
3833 wctype_t wt = cset->char_classes[i];
3834 if (__iswctype (wc, wt))
3835 {
3836 match_len = char_len;
3837 goto check_node_accept_bytes_match;
3838 }
3839 }
3840
3841# ifdef _LIBC
3842 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3843 if (nrules != 0)
3844 {
3845 unsigned int in_collseq = 0;
3846 const int32_t *table, *indirect;
3847 const unsigned char *weights, *extra;
3848 const char *collseqwc;
3849
3850 /* match with collating_symbol? */
3851 if (cset->ncoll_syms)
3852 extra = (const unsigned char *)
3853 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3854 for (i = 0; i < cset->ncoll_syms; ++i)
3855 {
3856 const unsigned char *coll_sym = extra + cset->coll_syms[i];
3857 /* Compare the length of input collating element and
3858 the length of current collating element. */
3859 if (*coll_sym != elem_len)
3860 continue;
3861 /* Compare each bytes. */
3862 for (j = 0; j < *coll_sym; j++)
3863 if (pin[j] != coll_sym[1 + j])
3864 break;
3865 if (j == *coll_sym)
3866 {
3867 /* Match if every bytes is equal. */
3868 match_len = j;
3869 goto check_node_accept_bytes_match;
3870 }
3871 }
3872
3873 if (cset->nranges)
3874 {
3875 if (elem_len <= char_len)
3876 {
3877 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3878 in_collseq = __collseq_table_lookup (collseqwc, wc);
3879 }
3880 else
3881 in_collseq = find_collation_sequence_value (pin, elem_len);
3882 }
3883 /* match with range expression? */
3884 for (i = 0; i < cset->nranges; ++i)
3885 if (cset->range_starts[i] <= in_collseq
3886 && in_collseq <= cset->range_ends[i])
3887 {
3888 match_len = elem_len;
3889 goto check_node_accept_bytes_match;
3890 }
3891
3892 /* match with equivalence_class? */
3893 if (cset->nequiv_classes)
3894 {
3895 const unsigned char *cp = pin;
3896 table = (const int32_t *)
3897 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3898 weights = (const unsigned char *)
3899 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3900 extra = (const unsigned char *)
3901 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3902 indirect = (const int32_t *)
3903 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3904 int32_t idx = findidx (table, indirect, extra, &cp, elem_len);
3905 if (idx > 0)
3906 for (i = 0; i < cset->nequiv_classes; ++i)
3907 {
3908 int32_t equiv_class_idx = cset->equiv_classes[i];
3909 size_t weight_len = weights[idx & 0xffffff];
3910 if (weight_len == weights[equiv_class_idx & 0xffffff]
3911 && (idx >> 24) == (equiv_class_idx >> 24))
3912 {
3913 int cnt = 0;
3914
3915 idx &= 0xffffff;
3916 equiv_class_idx &= 0xffffff;
3917
3918 while (cnt <= weight_len
3919 && (weights[equiv_class_idx + 1 + cnt]
3920 == weights[idx + 1 + cnt]))
3921 ++cnt;
3922 if (cnt > weight_len)
3923 {
3924 match_len = elem_len;
3925 goto check_node_accept_bytes_match;
3926 }
3927 }
3928 }
3929 }
3930 }
3931 else
3932# endif /* _LIBC */
3933 {
3934 /* match with range expression? */
3935#if __GNUC__ >= 2
3936 wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
3937#else
3938 wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
3939 cmp_buf[2] = wc;
3940#endif
3941 for (i = 0; i < cset->nranges; ++i)
3942 {
3943 cmp_buf[0] = cset->range_starts[i];
3944 cmp_buf[4] = cset->range_ends[i];
3945 if (__wcscoll (cmp_buf, cmp_buf + 2) <= 0
3946 && __wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
3947 {
3948 match_len = char_len;
3949 goto check_node_accept_bytes_match;
3950 }
3951 }
3952 }
3953 check_node_accept_bytes_match:
3954 if (!cset->non_match)
3955 return match_len;
3956 else
3957 {
3958 if (match_len > 0)
3959 return 0;
3960 else
3961 return (elem_len > char_len) ? elem_len : char_len;
3962 }
3963 }
3964 return 0;
3965}
3966
3967# ifdef _LIBC
3968static unsigned int
3969internal_function
3970find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len)
3971{
3972 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3973 if (nrules == 0)
3974 {
3975 if (mbs_len == 1)
3976 {
3977 /* No valid character. Match it as a single byte character. */
3978 const unsigned char *collseq = (const unsigned char *)
3979 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3980 return collseq[mbs[0]];
3981 }
3982 return UINT_MAX;
3983 }
3984 else
3985 {
3986 int32_t idx;
3987 const unsigned char *extra = (const unsigned char *)
3988 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3989 int32_t extrasize = (const unsigned char *)
3990 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
3991
3992 for (idx = 0; idx < extrasize;)
3993 {
3994 int mbs_cnt, found = 0;
3995 int32_t elem_mbs_len;
3996 /* Skip the name of collating element name. */
3997 idx = idx + extra[idx] + 1;
3998 elem_mbs_len = extra[idx++];
3999 if (mbs_len == elem_mbs_len)
4000 {
4001 for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
4002 if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
4003 break;
4004 if (mbs_cnt == elem_mbs_len)
4005 /* Found the entry. */
4006 found = 1;
4007 }
4008 /* Skip the byte sequence of the collating element. */
4009 idx += elem_mbs_len;
4010 /* Adjust for the alignment. */
4011 idx = (idx + 3) & ~3;
4012 /* Skip the collation sequence value. */
4013 idx += sizeof (uint32_t);
4014 /* Skip the wide char sequence of the collating element. */
4015 idx = idx + sizeof (uint32_t) * (*(int32_t *) (extra + idx) + 1);
4016 /* If we found the entry, return the sequence value. */
4017 if (found)
4018 return *(uint32_t *) (extra + idx);
4019 /* Skip the collation sequence value. */
4020 idx += sizeof (uint32_t);
4021 }
4022 return UINT_MAX;
4023 }
4024}
4025# endif /* _LIBC */
4026#endif /* RE_ENABLE_I18N */
4027
4028/* Check whether the node accepts the byte which is IDX-th
4029 byte of the INPUT. */
4030
4031static int
4032internal_function
4033check_node_accept (const re_match_context_t *mctx, const re_token_t *node,
4034 int idx)
4035{
4036 unsigned char ch;
4037 ch = re_string_byte_at (&mctx->input, idx);
4038 switch (node->type)
4039 {
4040 case CHARACTER:
4041 if (node->opr.c != ch)
4042 return 0;
4043 break;
4044
4045 case SIMPLE_BRACKET:
4046 if (!bitset_contain (node->opr.sbcset, ch))
4047 return 0;
4048 break;
4049
4050#ifdef RE_ENABLE_I18N
4051 case OP_UTF8_PERIOD:
4052 if (ch >= 0x80)
4053 return 0;
4054 /* FALLTHROUGH */
4055#endif
4056 case OP_PERIOD:
4057 if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE))
4058 || (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL)))
4059 return 0;
4060 break;
4061
4062 default:
4063 return 0;
4064 }
4065
4066 if (node->constraint)
4067 {
4068 /* The node has constraints. Check whether the current context
4069 satisfies the constraints. */
4070 unsigned int context = re_string_context_at (&mctx->input, idx,
4071 mctx->eflags);
4072 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
4073 return 0;
4074 }
4075
4076 return 1;
4077}
4078
4079/* Extend the buffers, if the buffers have run out. */
4080
4081static reg_errcode_t
4082internal_function __attribute_warn_unused_result__
4083extend_buffers (re_match_context_t *mctx, int min_len)
4084{
4085 reg_errcode_t ret;
4086 re_string_t *pstr = &mctx->input;
4087
4088 /* Avoid overflow. */
4089 if (BE (INT_MAX / 2 / sizeof (re_dfastate_t *) <= pstr->bufs_len, 0))
4090 return REG_ESPACE;
4091
4092 /* Double the lengthes of the buffers, but allocate at least MIN_LEN. */
4093 ret = re_string_realloc_buffers (pstr,
4094 MAX (min_len,
4095 MIN (pstr->len, pstr->bufs_len * 2)));
4096 if (BE (ret != REG_NOERROR, 0))
4097 return ret;
4098
4099 if (mctx->state_log != NULL)
4100 {
4101 /* And double the length of state_log. */
4102 /* XXX We have no indication of the size of this buffer. If this
4103 allocation fail we have no indication that the state_log array
4104 does not have the right size. */
4105 re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
4106 pstr->bufs_len + 1);
4107 if (BE (new_array == NULL, 0))
4108 return REG_ESPACE;
4109 mctx->state_log = new_array;
4110 }
4111
4112 /* Then reconstruct the buffers. */
4113 if (pstr->icase)
4114 {
4115#ifdef RE_ENABLE_I18N
4116 if (pstr->mb_cur_max > 1)
4117 {
4118 ret = build_wcs_upper_buffer (pstr);
4119 if (BE (ret != REG_NOERROR, 0))
4120 return ret;
4121 }
4122 else
4123#endif /* RE_ENABLE_I18N */
4124 build_upper_buffer (pstr);
4125 }
4126 else
4127 {
4128#ifdef RE_ENABLE_I18N
4129 if (pstr->mb_cur_max > 1)
4130 build_wcs_buffer (pstr);
4131 else
4132#endif /* RE_ENABLE_I18N */
4133 {
4134 if (pstr->trans != NULL)
4135 re_string_translate_buffer (pstr);
4136 }
4137 }
4138 return REG_NOERROR;
4139}
4140
4141
4142/* Functions for matching context. */
4143
4144/* Initialize MCTX. */
4145
4146static reg_errcode_t
4147internal_function __attribute_warn_unused_result__
4148match_ctx_init (re_match_context_t *mctx, int eflags, int n)
4149{
4150 mctx->eflags = eflags;
4151 mctx->match_last = -1;
4152 if (n > 0)
4153 {
4154 mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4155 mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4156 if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
4157 return REG_ESPACE;
4158 }
4159 /* Already zero-ed by the caller.
4160 else
4161 mctx->bkref_ents = NULL;
4162 mctx->nbkref_ents = 0;
4163 mctx->nsub_tops = 0; */
4164 mctx->abkref_ents = n;
4165 mctx->max_mb_elem_len = 1;
4166 mctx->asub_tops = n;
4167 return REG_NOERROR;
4168}
4169
4170/* Clean the entries which depend on the current input in MCTX.
4171 This function must be invoked when the matcher changes the start index
4172 of the input, or changes the input string. */
4173
4174static void
4175internal_function
4176match_ctx_clean (re_match_context_t *mctx)
4177{
4178 int st_idx;
4179 for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4180 {
4181 int sl_idx;
4182 re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4183 for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4184 {
4185 re_sub_match_last_t *last = top->lasts[sl_idx];
4186 re_free (last->path.array);
4187 re_free (last);
4188 }
4189 re_free (top->lasts);
4190 if (top->path)
4191 {
4192 re_free (top->path->array);
4193 re_free (top->path);
4194 }
4195 free (top);
4196 }
4197
4198 mctx->nsub_tops = 0;
4199 mctx->nbkref_ents = 0;
4200}
4201
4202/* Free all the memory associated with MCTX. */
4203
4204static void
4205internal_function
4206match_ctx_free (re_match_context_t *mctx)
4207{
4208 /* First, free all the memory associated with MCTX->SUB_TOPS. */
4209 match_ctx_clean (mctx);
4210 re_free (mctx->sub_tops);
4211 re_free (mctx->bkref_ents);
4212}
4213
4214/* Add a new backreference entry to MCTX.
4215 Note that we assume that caller never call this function with duplicate
4216 entry, and call with STR_IDX which isn't smaller than any existing entry.
4217*/
4218
4219static reg_errcode_t
4220internal_function __attribute_warn_unused_result__
4221match_ctx_add_entry (re_match_context_t *mctx, int node, int str_idx, int from,
4222 int to)
4223{
4224 if (mctx->nbkref_ents >= mctx->abkref_ents)
4225 {
4226 struct re_backref_cache_entry* new_entry;
4227 new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4228 mctx->abkref_ents * 2);
4229 if (BE (new_entry == NULL, 0))
4230 {
4231 re_free (mctx->bkref_ents);
4232 return REG_ESPACE;
4233 }
4234 mctx->bkref_ents = new_entry;
4235 memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4236 sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4237 mctx->abkref_ents *= 2;
4238 }
4239 if (mctx->nbkref_ents > 0
4240 && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4241 mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4242
4243 mctx->bkref_ents[mctx->nbkref_ents].node = node;
4244 mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4245 mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4246 mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4247
4248 /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4249 If bit N is clear, means that this entry won't epsilon-transition to
4250 an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression. If
4251 it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4252 such node.
4253
4254 A backreference does not epsilon-transition unless it is empty, so set
4255 to all zeros if FROM != TO. */
4256 mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4257 = (from == to ? ~0 : 0);
4258
4259 mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4260 if (mctx->max_mb_elem_len < to - from)
4261 mctx->max_mb_elem_len = to - from;
4262 return REG_NOERROR;
4263}
4264
4265/* Search for the first entry which has the same str_idx, or -1 if none is
4266 found. Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX. */
4267
4268static int
4269internal_function
4270search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx)
4271{
4272 int left, right, mid, last;
4273 last = right = mctx->nbkref_ents;
4274 for (left = 0; left < right;)
4275 {
4276 mid = (left + right) / 2;
4277 if (mctx->bkref_ents[mid].str_idx < str_idx)
4278 left = mid + 1;
4279 else
4280 right = mid;
4281 }
4282 if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4283 return left;
4284 else
4285 return -1;
4286}
4287
4288/* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4289 at STR_IDX. */
4290
4291static reg_errcode_t
4292internal_function __attribute_warn_unused_result__
4293match_ctx_add_subtop (re_match_context_t *mctx, int node, int str_idx)
4294{
4295#ifdef DEBUG
4296 assert (mctx->sub_tops != NULL);
4297 assert (mctx->asub_tops > 0);
4298#endif
4299 if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
4300 {
4301 int new_asub_tops = mctx->asub_tops * 2;
4302 re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4303 re_sub_match_top_t *,
4304 new_asub_tops);
4305 if (BE (new_array == NULL, 0))
4306 return REG_ESPACE;
4307 mctx->sub_tops = new_array;
4308 mctx->asub_tops = new_asub_tops;
4309 }
4310 mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
4311 if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
4312 return REG_ESPACE;
4313 mctx->sub_tops[mctx->nsub_tops]->node = node;
4314 mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4315 return REG_NOERROR;
4316}
4317
4318/* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4319 at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP. */
4320
4321static re_sub_match_last_t *
4322internal_function
4323match_ctx_add_sublast (re_sub_match_top_t *subtop, int node, int str_idx)
4324{
4325 re_sub_match_last_t *new_entry;
4326 if (BE (subtop->nlasts == subtop->alasts, 0))
4327 {
4328 int new_alasts = 2 * subtop->alasts + 1;
4329 re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4330 re_sub_match_last_t *,
4331 new_alasts);
4332 if (BE (new_array == NULL, 0))
4333 return NULL;
4334 subtop->lasts = new_array;
4335 subtop->alasts = new_alasts;
4336 }
4337 new_entry = calloc (1, sizeof (re_sub_match_last_t));
4338 if (BE (new_entry != NULL, 1))
4339 {
4340 subtop->lasts[subtop->nlasts] = new_entry;
4341 new_entry->node = node;
4342 new_entry->str_idx = str_idx;
4343 ++subtop->nlasts;
4344 }
4345 return new_entry;
4346}
4347
4348static void
4349internal_function
4350sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
4351 re_dfastate_t **limited_sts, int last_node, int last_str_idx)
4352{
4353 sctx->sifted_states = sifted_sts;
4354 sctx->limited_states = limited_sts;
4355 sctx->last_node = last_node;
4356 sctx->last_str_idx = last_str_idx;
4357 re_node_set_init_empty (&sctx->limits);
4358}
4359