1/* Extended regular expression matching and search library.
2 Copyright (C) 2002-2018 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <https://www.gnu.org/licenses/>. */
19
20static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
21 Idx n);
22static void match_ctx_clean (re_match_context_t *mctx);
23static void match_ctx_free (re_match_context_t *cache);
24static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, Idx node,
25 Idx str_idx, Idx from, Idx to);
26static Idx search_cur_bkref_entry (const re_match_context_t *mctx, Idx str_idx);
27static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, Idx node,
28 Idx str_idx);
29static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
30 Idx node, Idx str_idx);
31static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
32 re_dfastate_t **limited_sts, Idx last_node,
33 Idx last_str_idx);
34static reg_errcode_t re_search_internal (const regex_t *preg,
35 const char *string, Idx length,
36 Idx start, Idx last_start, Idx stop,
37 size_t nmatch, regmatch_t pmatch[],
38 int eflags);
39static regoff_t re_search_2_stub (struct re_pattern_buffer *bufp,
40 const char *string1, Idx length1,
41 const char *string2, Idx length2,
42 Idx start, regoff_t range,
43 struct re_registers *regs,
44 Idx stop, bool ret_len);
45static regoff_t re_search_stub (struct re_pattern_buffer *bufp,
46 const char *string, Idx length, Idx start,
47 regoff_t range, Idx stop,
48 struct re_registers *regs,
49 bool ret_len);
50static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
51 Idx nregs, int regs_allocated);
52static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx);
53static Idx check_matching (re_match_context_t *mctx, bool fl_longest_match,
54 Idx *p_match_first);
55static Idx check_halt_state_context (const re_match_context_t *mctx,
56 const re_dfastate_t *state, Idx idx);
57static void update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
58 regmatch_t *prev_idx_match, Idx cur_node,
59 Idx cur_idx, Idx nmatch);
60static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
61 Idx str_idx, Idx dest_node, Idx nregs,
62 regmatch_t *regs,
63 re_node_set *eps_via_nodes);
64static reg_errcode_t set_regs (const regex_t *preg,
65 const re_match_context_t *mctx,
66 size_t nmatch, regmatch_t *pmatch,
67 bool fl_backtrack);
68static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs);
69
70#ifdef RE_ENABLE_I18N
71static int sift_states_iter_mb (const re_match_context_t *mctx,
72 re_sift_context_t *sctx,
73 Idx node_idx, Idx str_idx, Idx max_str_idx);
74#endif /* RE_ENABLE_I18N */
75static reg_errcode_t sift_states_backward (const re_match_context_t *mctx,
76 re_sift_context_t *sctx);
77static reg_errcode_t build_sifted_states (const re_match_context_t *mctx,
78 re_sift_context_t *sctx, Idx str_idx,
79 re_node_set *cur_dest);
80static reg_errcode_t update_cur_sifted_state (const re_match_context_t *mctx,
81 re_sift_context_t *sctx,
82 Idx str_idx,
83 re_node_set *dest_nodes);
84static reg_errcode_t add_epsilon_src_nodes (const re_dfa_t *dfa,
85 re_node_set *dest_nodes,
86 const re_node_set *candidates);
87static bool check_dst_limits (const re_match_context_t *mctx,
88 const re_node_set *limits,
89 Idx dst_node, Idx dst_idx, Idx src_node,
90 Idx src_idx);
91static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx,
92 int boundaries, Idx subexp_idx,
93 Idx from_node, Idx bkref_idx);
94static int check_dst_limits_calc_pos (const re_match_context_t *mctx,
95 Idx limit, Idx subexp_idx,
96 Idx node, Idx str_idx,
97 Idx bkref_idx);
98static reg_errcode_t check_subexp_limits (const re_dfa_t *dfa,
99 re_node_set *dest_nodes,
100 const re_node_set *candidates,
101 re_node_set *limits,
102 struct re_backref_cache_entry *bkref_ents,
103 Idx str_idx);
104static reg_errcode_t sift_states_bkref (const re_match_context_t *mctx,
105 re_sift_context_t *sctx,
106 Idx str_idx, const re_node_set *candidates);
107static reg_errcode_t merge_state_array (const re_dfa_t *dfa,
108 re_dfastate_t **dst,
109 re_dfastate_t **src, Idx num);
110static re_dfastate_t *find_recover_state (reg_errcode_t *err,
111 re_match_context_t *mctx);
112static re_dfastate_t *transit_state (reg_errcode_t *err,
113 re_match_context_t *mctx,
114 re_dfastate_t *state);
115static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
116 re_match_context_t *mctx,
117 re_dfastate_t *next_state);
118static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
119 re_node_set *cur_nodes,
120 Idx str_idx);
121#if 0
122static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
123 re_match_context_t *mctx,
124 re_dfastate_t *pstate);
125#endif
126#ifdef RE_ENABLE_I18N
127static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
128 re_dfastate_t *pstate);
129#endif /* RE_ENABLE_I18N */
130static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
131 const re_node_set *nodes);
132static reg_errcode_t get_subexp (re_match_context_t *mctx,
133 Idx bkref_node, Idx bkref_str_idx);
134static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
135 const re_sub_match_top_t *sub_top,
136 re_sub_match_last_t *sub_last,
137 Idx bkref_node, Idx bkref_str);
138static Idx find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
139 Idx subexp_idx, int type);
140static reg_errcode_t check_arrival (re_match_context_t *mctx,
141 state_array_t *path, Idx top_node,
142 Idx top_str, Idx last_node, Idx last_str,
143 int type);
144static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
145 Idx str_idx,
146 re_node_set *cur_nodes,
147 re_node_set *next_nodes);
148static reg_errcode_t check_arrival_expand_ecl (const re_dfa_t *dfa,
149 re_node_set *cur_nodes,
150 Idx ex_subexp, int type);
151static reg_errcode_t check_arrival_expand_ecl_sub (const re_dfa_t *dfa,
152 re_node_set *dst_nodes,
153 Idx target, Idx ex_subexp,
154 int type);
155static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
156 re_node_set *cur_nodes, Idx cur_str,
157 Idx subexp_num, int type);
158static bool build_trtable (const re_dfa_t *dfa, re_dfastate_t *state);
159#ifdef RE_ENABLE_I18N
160static int check_node_accept_bytes (const re_dfa_t *dfa, Idx node_idx,
161 const re_string_t *input, Idx idx);
162# ifdef _LIBC
163static unsigned int find_collation_sequence_value (const unsigned char *mbs,
164 size_t name_len);
165# endif /* _LIBC */
166#endif /* RE_ENABLE_I18N */
167static Idx group_nodes_into_DFAstates (const re_dfa_t *dfa,
168 const re_dfastate_t *state,
169 re_node_set *states_node,
170 bitset_t *states_ch);
171static bool check_node_accept (const re_match_context_t *mctx,
172 const re_token_t *node, Idx idx);
173static reg_errcode_t extend_buffers (re_match_context_t *mctx, int min_len);
174
175/* Entry point for POSIX code. */
176
177/* regexec searches for a given pattern, specified by PREG, in the
178 string STRING.
179
180 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
181 'regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
182 least NMATCH elements, and we set them to the offsets of the
183 corresponding matched substrings.
184
185 EFLAGS specifies "execution flags" which affect matching: if
186 REG_NOTBOL is set, then ^ does not match at the beginning of the
187 string; if REG_NOTEOL is set, then $ does not match at the end.
188
189 We return 0 if we find a match and REG_NOMATCH if not. */
190
191int
192regexec (const regex_t *_Restrict_ preg, const char *_Restrict_ string,
193 size_t nmatch, regmatch_t pmatch[], int eflags)
194{
195 reg_errcode_t err;
196 Idx start, length;
197 re_dfa_t *dfa = preg->buffer;
198
199 if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
200 return REG_BADPAT;
201
202 if (eflags & REG_STARTEND)
203 {
204 start = pmatch[0].rm_so;
205 length = pmatch[0].rm_eo;
206 }
207 else
208 {
209 start = 0;
210 length = strlen (string);
211 }
212
213 lock_lock (dfa->lock);
214 if (preg->no_sub)
215 err = re_search_internal (preg, string, length, start, length,
216 length, 0, NULL, eflags);
217 else
218 err = re_search_internal (preg, string, length, start, length,
219 length, nmatch, pmatch, eflags);
220 lock_unlock (dfa->lock);
221 return err != REG_NOERROR;
222}
223
224#ifdef _LIBC
225libc_hidden_def (__regexec)
226
227# include <shlib-compat.h>
228versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
229
230# if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
231__typeof__ (__regexec) __compat_regexec;
232
233int
234attribute_compat_text_section
235__compat_regexec (const regex_t *_Restrict_ preg,
236 const char *_Restrict_ string, size_t nmatch,
237 regmatch_t pmatch[], int eflags)
238{
239 return regexec (preg, string, nmatch, pmatch,
240 eflags & (REG_NOTBOL | REG_NOTEOL));
241}
242compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
243# endif
244#endif
245
246/* Entry points for GNU code. */
247
248/* re_match, re_search, re_match_2, re_search_2
249
250 The former two functions operate on STRING with length LENGTH,
251 while the later two operate on concatenation of STRING1 and STRING2
252 with lengths LENGTH1 and LENGTH2, respectively.
253
254 re_match() matches the compiled pattern in BUFP against the string,
255 starting at index START.
256
257 re_search() first tries matching at index START, then it tries to match
258 starting from index START + 1, and so on. The last start position tried
259 is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same
260 way as re_match().)
261
262 The parameter STOP of re_{match,search}_2 specifies that no match exceeding
263 the first STOP characters of the concatenation of the strings should be
264 concerned.
265
266 If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
267 and all groups is stored in REGS. (For the "_2" variants, the offsets are
268 computed relative to the concatenation, not relative to the individual
269 strings.)
270
271 On success, re_match* functions return the length of the match, re_search*
272 return the position of the start of the match. Return value -1 means no
273 match was found and -2 indicates an internal error. */
274
275regoff_t
276re_match (struct re_pattern_buffer *bufp, const char *string, Idx length,
277 Idx start, struct re_registers *regs)
278{
279 return re_search_stub (bufp, string, length, start, 0, length, regs, true);
280}
281#ifdef _LIBC
282weak_alias (__re_match, re_match)
283#endif
284
285regoff_t
286re_search (struct re_pattern_buffer *bufp, const char *string, Idx length,
287 Idx start, regoff_t range, struct re_registers *regs)
288{
289 return re_search_stub (bufp, string, length, start, range, length, regs,
290 false);
291}
292#ifdef _LIBC
293weak_alias (__re_search, re_search)
294#endif
295
296regoff_t
297re_match_2 (struct re_pattern_buffer *bufp, const char *string1, Idx length1,
298 const char *string2, Idx length2, Idx start,
299 struct re_registers *regs, Idx stop)
300{
301 return re_search_2_stub (bufp, string1, length1, string2, length2,
302 start, 0, regs, stop, true);
303}
304#ifdef _LIBC
305weak_alias (__re_match_2, re_match_2)
306#endif
307
308regoff_t
309re_search_2 (struct re_pattern_buffer *bufp, const char *string1, Idx length1,
310 const char *string2, Idx length2, Idx start, regoff_t range,
311 struct re_registers *regs, Idx stop)
312{
313 return re_search_2_stub (bufp, string1, length1, string2, length2,
314 start, range, regs, stop, false);
315}
316#ifdef _LIBC
317weak_alias (__re_search_2, re_search_2)
318#endif
319
320static regoff_t
321re_search_2_stub (struct re_pattern_buffer *bufp, const char *string1,
322 Idx length1, const char *string2, Idx length2, Idx start,
323 regoff_t range, struct re_registers *regs,
324 Idx stop, bool ret_len)
325{
326 const char *str;
327 regoff_t rval;
328 Idx len;
329 char *s = NULL;
330
331 if (BE ((length1 < 0 || length2 < 0 || stop < 0
332 || INT_ADD_WRAPV (length1, length2, &len)),
333 0))
334 return -2;
335
336 /* Concatenate the strings. */
337 if (length2 > 0)
338 if (length1 > 0)
339 {
340 s = re_malloc (char, len);
341
342 if (BE (s == NULL, 0))
343 return -2;
344#ifdef _LIBC
345 memcpy (__mempcpy (s, string1, length1), string2, length2);
346#else
347 memcpy (s, string1, length1);
348 memcpy (s + length1, string2, length2);
349#endif
350 str = s;
351 }
352 else
353 str = string2;
354 else
355 str = string1;
356
357 rval = re_search_stub (bufp, str, len, start, range, stop, regs,
358 ret_len);
359 re_free (s);
360 return rval;
361}
362
363/* The parameters have the same meaning as those of re_search.
364 Additional parameters:
365 If RET_LEN is true the length of the match is returned (re_match style);
366 otherwise the position of the match is returned. */
367
368static regoff_t
369re_search_stub (struct re_pattern_buffer *bufp, const char *string, Idx length,
370 Idx start, regoff_t range, Idx stop, struct re_registers *regs,
371 bool ret_len)
372{
373 reg_errcode_t result;
374 regmatch_t *pmatch;
375 Idx nregs;
376 regoff_t rval;
377 int eflags = 0;
378 re_dfa_t *dfa = bufp->buffer;
379 Idx last_start = start + range;
380
381 /* Check for out-of-range. */
382 if (BE (start < 0 || start > length, 0))
383 return -1;
384 if (BE (length < last_start || (0 <= range && last_start < start), 0))
385 last_start = length;
386 else if (BE (last_start < 0 || (range < 0 && start <= last_start), 0))
387 last_start = 0;
388
389 lock_lock (dfa->lock);
390
391 eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
392 eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
393
394 /* Compile fastmap if we haven't yet. */
395 if (start < last_start && bufp->fastmap != NULL && !bufp->fastmap_accurate)
396 re_compile_fastmap (bufp);
397
398 if (BE (bufp->no_sub, 0))
399 regs = NULL;
400
401 /* We need at least 1 register. */
402 if (regs == NULL)
403 nregs = 1;
404 else if (BE (bufp->regs_allocated == REGS_FIXED
405 && regs->num_regs <= bufp->re_nsub, 0))
406 {
407 nregs = regs->num_regs;
408 if (BE (nregs < 1, 0))
409 {
410 /* Nothing can be copied to regs. */
411 regs = NULL;
412 nregs = 1;
413 }
414 }
415 else
416 nregs = bufp->re_nsub + 1;
417 pmatch = re_malloc (regmatch_t, nregs);
418 if (BE (pmatch == NULL, 0))
419 {
420 rval = -2;
421 goto out;
422 }
423
424 result = re_search_internal (bufp, string, length, start, last_start, stop,
425 nregs, pmatch, eflags);
426
427 rval = 0;
428
429 /* I hope we needn't fill their regs with -1's when no match was found. */
430 if (result != REG_NOERROR)
431 rval = result == REG_NOMATCH ? -1 : -2;
432 else if (regs != NULL)
433 {
434 /* If caller wants register contents data back, copy them. */
435 bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
436 bufp->regs_allocated);
437 if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))
438 rval = -2;
439 }
440
441 if (BE (rval == 0, 1))
442 {
443 if (ret_len)
444 {
445 assert (pmatch[0].rm_so == start);
446 rval = pmatch[0].rm_eo - start;
447 }
448 else
449 rval = pmatch[0].rm_so;
450 }
451 re_free (pmatch);
452 out:
453 lock_unlock (dfa->lock);
454 return rval;
455}
456
457static unsigned
458re_copy_regs (struct re_registers *regs, regmatch_t *pmatch, Idx nregs,
459 int regs_allocated)
460{
461 int rval = REGS_REALLOCATE;
462 Idx i;
463 Idx need_regs = nregs + 1;
464 /* We need one extra element beyond 'num_regs' for the '-1' marker GNU code
465 uses. */
466
467 /* Have the register data arrays been allocated? */
468 if (regs_allocated == REGS_UNALLOCATED)
469 { /* No. So allocate them with malloc. */
470 regs->start = re_malloc (regoff_t, need_regs);
471 if (BE (regs->start == NULL, 0))
472 return REGS_UNALLOCATED;
473 regs->end = re_malloc (regoff_t, need_regs);
474 if (BE (regs->end == NULL, 0))
475 {
476 re_free (regs->start);
477 return REGS_UNALLOCATED;
478 }
479 regs->num_regs = need_regs;
480 }
481 else if (regs_allocated == REGS_REALLOCATE)
482 { /* Yes. If we need more elements than were already
483 allocated, reallocate them. If we need fewer, just
484 leave it alone. */
485 if (BE (need_regs > regs->num_regs, 0))
486 {
487 regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
488 regoff_t *new_end;
489 if (BE (new_start == NULL, 0))
490 return REGS_UNALLOCATED;
491 new_end = re_realloc (regs->end, regoff_t, need_regs);
492 if (BE (new_end == NULL, 0))
493 {
494 re_free (new_start);
495 return REGS_UNALLOCATED;
496 }
497 regs->start = new_start;
498 regs->end = new_end;
499 regs->num_regs = need_regs;
500 }
501 }
502 else
503 {
504 assert (regs_allocated == REGS_FIXED);
505 /* This function may not be called with REGS_FIXED and nregs too big. */
506 assert (regs->num_regs >= nregs);
507 rval = REGS_FIXED;
508 }
509
510 /* Copy the regs. */
511 for (i = 0; i < nregs; ++i)
512 {
513 regs->start[i] = pmatch[i].rm_so;
514 regs->end[i] = pmatch[i].rm_eo;
515 }
516 for ( ; i < regs->num_regs; ++i)
517 regs->start[i] = regs->end[i] = -1;
518
519 return rval;
520}
521
522/* Set REGS to hold NUM_REGS registers, storing them in STARTS and
523 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
524 this memory for recording register information. STARTS and ENDS
525 must be allocated using the malloc library routine, and must each
526 be at least NUM_REGS * sizeof (regoff_t) bytes long.
527
528 If NUM_REGS == 0, then subsequent matches should allocate their own
529 register data.
530
531 Unless this function is called, the first search or match using
532 PATTERN_BUFFER will allocate its own register data, without
533 freeing the old data. */
534
535void
536re_set_registers (struct re_pattern_buffer *bufp, struct re_registers *regs,
537 __re_size_t num_regs, regoff_t *starts, regoff_t *ends)
538{
539 if (num_regs)
540 {
541 bufp->regs_allocated = REGS_REALLOCATE;
542 regs->num_regs = num_regs;
543 regs->start = starts;
544 regs->end = ends;
545 }
546 else
547 {
548 bufp->regs_allocated = REGS_UNALLOCATED;
549 regs->num_regs = 0;
550 regs->start = regs->end = NULL;
551 }
552}
553#ifdef _LIBC
554weak_alias (__re_set_registers, re_set_registers)
555#endif
556
557/* Entry points compatible with 4.2 BSD regex library. We don't define
558 them unless specifically requested. */
559
560#if defined _REGEX_RE_COMP || defined _LIBC
561int
562# ifdef _LIBC
563weak_function
564# endif
565re_exec (const char *s)
566{
567 return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
568}
569#endif /* _REGEX_RE_COMP */
570
571/* Internal entry point. */
572
573/* Searches for a compiled pattern PREG in the string STRING, whose
574 length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same
575 meaning as with regexec. LAST_START is START + RANGE, where
576 START and RANGE have the same meaning as with re_search.
577 Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
578 otherwise return the error code.
579 Note: We assume front end functions already check ranges.
580 (0 <= LAST_START && LAST_START <= LENGTH) */
581
582static reg_errcode_t
583__attribute_warn_unused_result__
584re_search_internal (const regex_t *preg, const char *string, Idx length,
585 Idx start, Idx last_start, Idx stop, size_t nmatch,
586 regmatch_t pmatch[], int eflags)
587{
588 reg_errcode_t err;
589 const re_dfa_t *dfa = preg->buffer;
590 Idx left_lim, right_lim;
591 int incr;
592 bool fl_longest_match;
593 int match_kind;
594 Idx match_first;
595 Idx match_last = -1;
596 Idx extra_nmatch;
597 bool sb;
598 int ch;
599#if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
600 re_match_context_t mctx = { .dfa = dfa };
601#else
602 re_match_context_t mctx;
603#endif
604 char *fastmap = ((preg->fastmap != NULL && preg->fastmap_accurate
605 && start != last_start && !preg->can_be_null)
606 ? preg->fastmap : NULL);
607 RE_TRANSLATE_TYPE t = preg->translate;
608
609#if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
610 memset (&mctx, '\0', sizeof (re_match_context_t));
611 mctx.dfa = dfa;
612#endif
613
614 extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
615 nmatch -= extra_nmatch;
616
617 /* Check if the DFA haven't been compiled. */
618 if (BE (preg->used == 0 || dfa->init_state == NULL
619 || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
620 || dfa->init_state_begbuf == NULL, 0))
621 return REG_NOMATCH;
622
623#ifdef DEBUG
624 /* We assume front-end functions already check them. */
625 assert (0 <= last_start && last_start <= length);
626#endif
627
628 /* If initial states with non-begbuf contexts have no elements,
629 the regex must be anchored. If preg->newline_anchor is set,
630 we'll never use init_state_nl, so do not check it. */
631 if (dfa->init_state->nodes.nelem == 0
632 && dfa->init_state_word->nodes.nelem == 0
633 && (dfa->init_state_nl->nodes.nelem == 0
634 || !preg->newline_anchor))
635 {
636 if (start != 0 && last_start != 0)
637 return REG_NOMATCH;
638 start = last_start = 0;
639 }
640
641 /* We must check the longest matching, if nmatch > 0. */
642 fl_longest_match = (nmatch != 0 || dfa->nbackref);
643
644 err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
645 preg->translate, (preg->syntax & RE_ICASE) != 0,
646 dfa);
647 if (BE (err != REG_NOERROR, 0))
648 goto free_return;
649 mctx.input.stop = stop;
650 mctx.input.raw_stop = stop;
651 mctx.input.newline_anchor = preg->newline_anchor;
652
653 err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
654 if (BE (err != REG_NOERROR, 0))
655 goto free_return;
656
657 /* We will log all the DFA states through which the dfa pass,
658 if nmatch > 1, or this dfa has "multibyte node", which is a
659 back-reference or a node which can accept multibyte character or
660 multi character collating element. */
661 if (nmatch > 1 || dfa->has_mb_node)
662 {
663 /* Avoid overflow. */
664 if (BE ((MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *))
665 <= mctx.input.bufs_len), 0))
666 {
667 err = REG_ESPACE;
668 goto free_return;
669 }
670
671 mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
672 if (BE (mctx.state_log == NULL, 0))
673 {
674 err = REG_ESPACE;
675 goto free_return;
676 }
677 }
678 else
679 mctx.state_log = NULL;
680
681 match_first = start;
682 mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
683 : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
684
685 /* Check incrementally whether the input string matches. */
686 incr = (last_start < start) ? -1 : 1;
687 left_lim = (last_start < start) ? last_start : start;
688 right_lim = (last_start < start) ? start : last_start;
689 sb = dfa->mb_cur_max == 1;
690 match_kind =
691 (fastmap
692 ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
693 | (start <= last_start ? 2 : 0)
694 | (t != NULL ? 1 : 0))
695 : 8);
696
697 for (;; match_first += incr)
698 {
699 err = REG_NOMATCH;
700 if (match_first < left_lim || right_lim < match_first)
701 goto free_return;
702
703 /* Advance as rapidly as possible through the string, until we
704 find a plausible place to start matching. This may be done
705 with varying efficiency, so there are various possibilities:
706 only the most common of them are specialized, in order to
707 save on code size. We use a switch statement for speed. */
708 switch (match_kind)
709 {
710 case 8:
711 /* No fastmap. */
712 break;
713
714 case 7:
715 /* Fastmap with single-byte translation, match forward. */
716 while (BE (match_first < right_lim, 1)
717 && !fastmap[t[(unsigned char) string[match_first]]])
718 ++match_first;
719 goto forward_match_found_start_or_reached_end;
720
721 case 6:
722 /* Fastmap without translation, match forward. */
723 while (BE (match_first < right_lim, 1)
724 && !fastmap[(unsigned char) string[match_first]])
725 ++match_first;
726
727 forward_match_found_start_or_reached_end:
728 if (BE (match_first == right_lim, 0))
729 {
730 ch = match_first >= length
731 ? 0 : (unsigned char) string[match_first];
732 if (!fastmap[t ? t[ch] : ch])
733 goto free_return;
734 }
735 break;
736
737 case 4:
738 case 5:
739 /* Fastmap without multi-byte translation, match backwards. */
740 while (match_first >= left_lim)
741 {
742 ch = match_first >= length
743 ? 0 : (unsigned char) string[match_first];
744 if (fastmap[t ? t[ch] : ch])
745 break;
746 --match_first;
747 }
748 if (match_first < left_lim)
749 goto free_return;
750 break;
751
752 default:
753 /* In this case, we can't determine easily the current byte,
754 since it might be a component byte of a multibyte
755 character. Then we use the constructed buffer instead. */
756 for (;;)
757 {
758 /* If MATCH_FIRST is out of the valid range, reconstruct the
759 buffers. */
760 __re_size_t offset = match_first - mctx.input.raw_mbs_idx;
761 if (BE (offset >= (__re_size_t) mctx.input.valid_raw_len, 0))
762 {
763 err = re_string_reconstruct (&mctx.input, match_first,
764 eflags);
765 if (BE (err != REG_NOERROR, 0))
766 goto free_return;
767
768 offset = match_first - mctx.input.raw_mbs_idx;
769 }
770 /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
771 Note that MATCH_FIRST must not be smaller than 0. */
772 ch = (match_first >= length
773 ? 0 : re_string_byte_at (&mctx.input, offset));
774 if (fastmap[ch])
775 break;
776 match_first += incr;
777 if (match_first < left_lim || match_first > right_lim)
778 {
779 err = REG_NOMATCH;
780 goto free_return;
781 }
782 }
783 break;
784 }
785
786 /* Reconstruct the buffers so that the matcher can assume that
787 the matching starts from the beginning of the buffer. */
788 err = re_string_reconstruct (&mctx.input, match_first, eflags);
789 if (BE (err != REG_NOERROR, 0))
790 goto free_return;
791
792#ifdef RE_ENABLE_I18N
793 /* Don't consider this char as a possible match start if it part,
794 yet isn't the head, of a multibyte character. */
795 if (!sb && !re_string_first_byte (&mctx.input, 0))
796 continue;
797#endif
798
799 /* It seems to be appropriate one, then use the matcher. */
800 /* We assume that the matching starts from 0. */
801 mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
802 match_last = check_matching (&mctx, fl_longest_match,
803 start <= last_start ? &match_first : NULL);
804 if (match_last != -1)
805 {
806 if (BE (match_last == -2, 0))
807 {
808 err = REG_ESPACE;
809 goto free_return;
810 }
811 else
812 {
813 mctx.match_last = match_last;
814 if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
815 {
816 re_dfastate_t *pstate = mctx.state_log[match_last];
817 mctx.last_node = check_halt_state_context (&mctx, pstate,
818 match_last);
819 }
820 if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
821 || dfa->nbackref)
822 {
823 err = prune_impossible_nodes (&mctx);
824 if (err == REG_NOERROR)
825 break;
826 if (BE (err != REG_NOMATCH, 0))
827 goto free_return;
828 match_last = -1;
829 }
830 else
831 break; /* We found a match. */
832 }
833 }
834
835 match_ctx_clean (&mctx);
836 }
837
838#ifdef DEBUG
839 assert (match_last != -1);
840 assert (err == REG_NOERROR);
841#endif
842
843 /* Set pmatch[] if we need. */
844 if (nmatch > 0)
845 {
846 Idx reg_idx;
847
848 /* Initialize registers. */
849 for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
850 pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
851
852 /* Set the points where matching start/end. */
853 pmatch[0].rm_so = 0;
854 pmatch[0].rm_eo = mctx.match_last;
855 /* FIXME: This function should fail if mctx.match_last exceeds
856 the maximum possible regoff_t value. We need a new error
857 code REG_OVERFLOW. */
858
859 if (!preg->no_sub && nmatch > 1)
860 {
861 err = set_regs (preg, &mctx, nmatch, pmatch,
862 dfa->has_plural_match && dfa->nbackref > 0);
863 if (BE (err != REG_NOERROR, 0))
864 goto free_return;
865 }
866
867 /* At last, add the offset to each register, since we slid
868 the buffers so that we could assume that the matching starts
869 from 0. */
870 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
871 if (pmatch[reg_idx].rm_so != -1)
872 {
873#ifdef RE_ENABLE_I18N
874 if (BE (mctx.input.offsets_needed != 0, 0))
875 {
876 pmatch[reg_idx].rm_so =
877 (pmatch[reg_idx].rm_so == mctx.input.valid_len
878 ? mctx.input.valid_raw_len
879 : mctx.input.offsets[pmatch[reg_idx].rm_so]);
880 pmatch[reg_idx].rm_eo =
881 (pmatch[reg_idx].rm_eo == mctx.input.valid_len
882 ? mctx.input.valid_raw_len
883 : mctx.input.offsets[pmatch[reg_idx].rm_eo]);
884 }
885#else
886 assert (mctx.input.offsets_needed == 0);
887#endif
888 pmatch[reg_idx].rm_so += match_first;
889 pmatch[reg_idx].rm_eo += match_first;
890 }
891 for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
892 {
893 pmatch[nmatch + reg_idx].rm_so = -1;
894 pmatch[nmatch + reg_idx].rm_eo = -1;
895 }
896
897 if (dfa->subexp_map)
898 for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
899 if (dfa->subexp_map[reg_idx] != reg_idx)
900 {
901 pmatch[reg_idx + 1].rm_so
902 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
903 pmatch[reg_idx + 1].rm_eo
904 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
905 }
906 }
907
908 free_return:
909 re_free (mctx.state_log);
910 if (dfa->nbackref)
911 match_ctx_free (&mctx);
912 re_string_destruct (&mctx.input);
913 return err;
914}
915
916static reg_errcode_t
917__attribute_warn_unused_result__
918prune_impossible_nodes (re_match_context_t *mctx)
919{
920 const re_dfa_t *const dfa = mctx->dfa;
921 Idx halt_node, match_last;
922 reg_errcode_t ret;
923 re_dfastate_t **sifted_states;
924 re_dfastate_t **lim_states = NULL;
925 re_sift_context_t sctx;
926#ifdef DEBUG
927 assert (mctx->state_log != NULL);
928#endif
929 match_last = mctx->match_last;
930 halt_node = mctx->last_node;
931
932 /* Avoid overflow. */
933 if (BE (MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *)) <= match_last, 0))
934 return REG_ESPACE;
935
936 sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
937 if (BE (sifted_states == NULL, 0))
938 {
939 ret = REG_ESPACE;
940 goto free_return;
941 }
942 if (dfa->nbackref)
943 {
944 lim_states = re_malloc (re_dfastate_t *, match_last + 1);
945 if (BE (lim_states == NULL, 0))
946 {
947 ret = REG_ESPACE;
948 goto free_return;
949 }
950 while (1)
951 {
952 memset (lim_states, '\0',
953 sizeof (re_dfastate_t *) * (match_last + 1));
954 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
955 match_last);
956 ret = sift_states_backward (mctx, &sctx);
957 re_node_set_free (&sctx.limits);
958 if (BE (ret != REG_NOERROR, 0))
959 goto free_return;
960 if (sifted_states[0] != NULL || lim_states[0] != NULL)
961 break;
962 do
963 {
964 --match_last;
965 if (match_last < 0)
966 {
967 ret = REG_NOMATCH;
968 goto free_return;
969 }
970 } while (mctx->state_log[match_last] == NULL
971 || !mctx->state_log[match_last]->halt);
972 halt_node = check_halt_state_context (mctx,
973 mctx->state_log[match_last],
974 match_last);
975 }
976 ret = merge_state_array (dfa, sifted_states, lim_states,
977 match_last + 1);
978 re_free (lim_states);
979 lim_states = NULL;
980 if (BE (ret != REG_NOERROR, 0))
981 goto free_return;
982 }
983 else
984 {
985 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
986 ret = sift_states_backward (mctx, &sctx);
987 re_node_set_free (&sctx.limits);
988 if (BE (ret != REG_NOERROR, 0))
989 goto free_return;
990 if (sifted_states[0] == NULL)
991 {
992 ret = REG_NOMATCH;
993 goto free_return;
994 }
995 }
996 re_free (mctx->state_log);
997 mctx->state_log = sifted_states;
998 sifted_states = NULL;
999 mctx->last_node = halt_node;
1000 mctx->match_last = match_last;
1001 ret = REG_NOERROR;
1002 free_return:
1003 re_free (sifted_states);
1004 re_free (lim_states);
1005 return ret;
1006}
1007
1008/* Acquire an initial state and return it.
1009 We must select appropriate initial state depending on the context,
1010 since initial states may have constraints like "\<", "^", etc.. */
1011
1012static inline re_dfastate_t *
1013__attribute__ ((always_inline))
1014acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx,
1015 Idx idx)
1016{
1017 const re_dfa_t *const dfa = mctx->dfa;
1018 if (dfa->init_state->has_constraint)
1019 {
1020 unsigned int context;
1021 context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
1022 if (IS_WORD_CONTEXT (context))
1023 return dfa->init_state_word;
1024 else if (IS_ORDINARY_CONTEXT (context))
1025 return dfa->init_state;
1026 else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
1027 return dfa->init_state_begbuf;
1028 else if (IS_NEWLINE_CONTEXT (context))
1029 return dfa->init_state_nl;
1030 else if (IS_BEGBUF_CONTEXT (context))
1031 {
1032 /* It is relatively rare case, then calculate on demand. */
1033 return re_acquire_state_context (err, dfa,
1034 dfa->init_state->entrance_nodes,
1035 context);
1036 }
1037 else
1038 /* Must not happen? */
1039 return dfa->init_state;
1040 }
1041 else
1042 return dfa->init_state;
1043}
1044
1045/* Check whether the regular expression match input string INPUT or not,
1046 and return the index where the matching end. Return -1 if
1047 there is no match, and return -2 in case of an error.
1048 FL_LONGEST_MATCH means we want the POSIX longest matching.
1049 If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1050 next place where we may want to try matching.
1051 Note that the matcher assumes that the matching starts from the current
1052 index of the buffer. */
1053
1054static Idx
1055__attribute_warn_unused_result__
1056check_matching (re_match_context_t *mctx, bool fl_longest_match,
1057 Idx *p_match_first)
1058{
1059 const re_dfa_t *const dfa = mctx->dfa;
1060 reg_errcode_t err;
1061 Idx match = 0;
1062 Idx match_last = -1;
1063 Idx cur_str_idx = re_string_cur_idx (&mctx->input);
1064 re_dfastate_t *cur_state;
1065 bool at_init_state = p_match_first != NULL;
1066 Idx next_start_idx = cur_str_idx;
1067
1068 err = REG_NOERROR;
1069 cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1070 /* An initial state must not be NULL (invalid). */
1071 if (BE (cur_state == NULL, 0))
1072 {
1073 assert (err == REG_ESPACE);
1074 return -2;
1075 }
1076
1077 if (mctx->state_log != NULL)
1078 {
1079 mctx->state_log[cur_str_idx] = cur_state;
1080
1081 /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1082 later. E.g. Processing back references. */
1083 if (BE (dfa->nbackref, 0))
1084 {
1085 at_init_state = false;
1086 err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1087 if (BE (err != REG_NOERROR, 0))
1088 return err;
1089
1090 if (cur_state->has_backref)
1091 {
1092 err = transit_state_bkref (mctx, &cur_state->nodes);
1093 if (BE (err != REG_NOERROR, 0))
1094 return err;
1095 }
1096 }
1097 }
1098
1099 /* If the RE accepts NULL string. */
1100 if (BE (cur_state->halt, 0))
1101 {
1102 if (!cur_state->has_constraint
1103 || check_halt_state_context (mctx, cur_state, cur_str_idx))
1104 {
1105 if (!fl_longest_match)
1106 return cur_str_idx;
1107 else
1108 {
1109 match_last = cur_str_idx;
1110 match = 1;
1111 }
1112 }
1113 }
1114
1115 while (!re_string_eoi (&mctx->input))
1116 {
1117 re_dfastate_t *old_state = cur_state;
1118 Idx next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1119
1120 if ((BE (next_char_idx >= mctx->input.bufs_len, 0)
1121 && mctx->input.bufs_len < mctx->input.len)
1122 || (BE (next_char_idx >= mctx->input.valid_len, 0)
1123 && mctx->input.valid_len < mctx->input.len))
1124 {
1125 err = extend_buffers (mctx, next_char_idx + 1);
1126 if (BE (err != REG_NOERROR, 0))
1127 {
1128 assert (err == REG_ESPACE);
1129 return -2;
1130 }
1131 }
1132
1133 cur_state = transit_state (&err, mctx, cur_state);
1134 if (mctx->state_log != NULL)
1135 cur_state = merge_state_with_log (&err, mctx, cur_state);
1136
1137 if (cur_state == NULL)
1138 {
1139 /* Reached the invalid state or an error. Try to recover a valid
1140 state using the state log, if available and if we have not
1141 already found a valid (even if not the longest) match. */
1142 if (BE (err != REG_NOERROR, 0))
1143 return -2;
1144
1145 if (mctx->state_log == NULL
1146 || (match && !fl_longest_match)
1147 || (cur_state = find_recover_state (&err, mctx)) == NULL)
1148 break;
1149 }
1150
1151 if (BE (at_init_state, 0))
1152 {
1153 if (old_state == cur_state)
1154 next_start_idx = next_char_idx;
1155 else
1156 at_init_state = false;
1157 }
1158
1159 if (cur_state->halt)
1160 {
1161 /* Reached a halt state.
1162 Check the halt state can satisfy the current context. */
1163 if (!cur_state->has_constraint
1164 || check_halt_state_context (mctx, cur_state,
1165 re_string_cur_idx (&mctx->input)))
1166 {
1167 /* We found an appropriate halt state. */
1168 match_last = re_string_cur_idx (&mctx->input);
1169 match = 1;
1170
1171 /* We found a match, do not modify match_first below. */
1172 p_match_first = NULL;
1173 if (!fl_longest_match)
1174 break;
1175 }
1176 }
1177 }
1178
1179 if (p_match_first)
1180 *p_match_first += next_start_idx;
1181
1182 return match_last;
1183}
1184
1185/* Check NODE match the current context. */
1186
1187static bool
1188check_halt_node_context (const re_dfa_t *dfa, Idx node, unsigned int context)
1189{
1190 re_token_type_t type = dfa->nodes[node].type;
1191 unsigned int constraint = dfa->nodes[node].constraint;
1192 if (type != END_OF_RE)
1193 return false;
1194 if (!constraint)
1195 return true;
1196 if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1197 return false;
1198 return true;
1199}
1200
1201/* Check the halt state STATE match the current context.
1202 Return 0 if not match, if the node, STATE has, is a halt node and
1203 match the context, return the node. */
1204
1205static Idx
1206check_halt_state_context (const re_match_context_t *mctx,
1207 const re_dfastate_t *state, Idx idx)
1208{
1209 Idx i;
1210 unsigned int context;
1211#ifdef DEBUG
1212 assert (state->halt);
1213#endif
1214 context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1215 for (i = 0; i < state->nodes.nelem; ++i)
1216 if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1217 return state->nodes.elems[i];
1218 return 0;
1219}
1220
1221/* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1222 corresponding to the DFA).
1223 Return the destination node, and update EPS_VIA_NODES;
1224 return -1 in case of errors. */
1225
1226static Idx
1227proceed_next_node (const re_match_context_t *mctx, Idx nregs, regmatch_t *regs,
1228 Idx *pidx, Idx node, re_node_set *eps_via_nodes,
1229 struct re_fail_stack_t *fs)
1230{
1231 const re_dfa_t *const dfa = mctx->dfa;
1232 Idx i;
1233 bool ok;
1234 if (IS_EPSILON_NODE (dfa->nodes[node].type))
1235 {
1236 re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1237 re_node_set *edests = &dfa->edests[node];
1238 Idx dest_node;
1239 ok = re_node_set_insert (eps_via_nodes, node);
1240 if (BE (! ok, 0))
1241 return -2;
1242 /* Pick up a valid destination, or return -1 if none
1243 is found. */
1244 for (dest_node = -1, i = 0; i < edests->nelem; ++i)
1245 {
1246 Idx candidate = edests->elems[i];
1247 if (!re_node_set_contains (cur_nodes, candidate))
1248 continue;
1249 if (dest_node == -1)
1250 dest_node = candidate;
1251
1252 else
1253 {
1254 /* In order to avoid infinite loop like "(a*)*", return the second
1255 epsilon-transition if the first was already considered. */
1256 if (re_node_set_contains (eps_via_nodes, dest_node))
1257 return candidate;
1258
1259 /* Otherwise, push the second epsilon-transition on the fail stack. */
1260 else if (fs != NULL
1261 && push_fail_stack (fs, *pidx, candidate, nregs, regs,
1262 eps_via_nodes))
1263 return -2;
1264
1265 /* We know we are going to exit. */
1266 break;
1267 }
1268 }
1269 return dest_node;
1270 }
1271 else
1272 {
1273 Idx naccepted = 0;
1274 re_token_type_t type = dfa->nodes[node].type;
1275
1276#ifdef RE_ENABLE_I18N
1277 if (dfa->nodes[node].accept_mb)
1278 naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1279 else
1280#endif /* RE_ENABLE_I18N */
1281 if (type == OP_BACK_REF)
1282 {
1283 Idx subexp_idx = dfa->nodes[node].opr.idx + 1;
1284 naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1285 if (fs != NULL)
1286 {
1287 if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1288 return -1;
1289 else if (naccepted)
1290 {
1291 char *buf = (char *) re_string_get_buffer (&mctx->input);
1292 if (mctx->input.valid_len - *pidx < naccepted
1293 || (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1294 naccepted)
1295 != 0))
1296 return -1;
1297 }
1298 }
1299
1300 if (naccepted == 0)
1301 {
1302 Idx dest_node;
1303 ok = re_node_set_insert (eps_via_nodes, node);
1304 if (BE (! ok, 0))
1305 return -2;
1306 dest_node = dfa->edests[node].elems[0];
1307 if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1308 dest_node))
1309 return dest_node;
1310 }
1311 }
1312
1313 if (naccepted != 0
1314 || check_node_accept (mctx, dfa->nodes + node, *pidx))
1315 {
1316 Idx dest_node = dfa->nexts[node];
1317 *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1318 if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1319 || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1320 dest_node)))
1321 return -1;
1322 re_node_set_empty (eps_via_nodes);
1323 return dest_node;
1324 }
1325 }
1326 return -1;
1327}
1328
1329static reg_errcode_t
1330__attribute_warn_unused_result__
1331push_fail_stack (struct re_fail_stack_t *fs, Idx str_idx, Idx dest_node,
1332 Idx nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
1333{
1334 reg_errcode_t err;
1335 Idx num = fs->num++;
1336 if (fs->num == fs->alloc)
1337 {
1338 struct re_fail_stack_ent_t *new_array;
1339 new_array = re_realloc (fs->stack, struct re_fail_stack_ent_t,
1340 fs->alloc * 2);
1341 if (new_array == NULL)
1342 return REG_ESPACE;
1343 fs->alloc *= 2;
1344 fs->stack = new_array;
1345 }
1346 fs->stack[num].idx = str_idx;
1347 fs->stack[num].node = dest_node;
1348 fs->stack[num].regs = re_malloc (regmatch_t, nregs);
1349 if (fs->stack[num].regs == NULL)
1350 return REG_ESPACE;
1351 memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1352 err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1353 return err;
1354}
1355
1356static Idx
1357pop_fail_stack (struct re_fail_stack_t *fs, Idx *pidx, Idx nregs,
1358 regmatch_t *regs, re_node_set *eps_via_nodes)
1359{
1360 Idx num = --fs->num;
1361 assert (num >= 0);
1362 *pidx = fs->stack[num].idx;
1363 memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1364 re_node_set_free (eps_via_nodes);
1365 re_free (fs->stack[num].regs);
1366 *eps_via_nodes = fs->stack[num].eps_via_nodes;
1367 return fs->stack[num].node;
1368}
1369
1370/* Set the positions where the subexpressions are starts/ends to registers
1371 PMATCH.
1372 Note: We assume that pmatch[0] is already set, and
1373 pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch. */
1374
1375static reg_errcode_t
1376__attribute_warn_unused_result__
1377set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch,
1378 regmatch_t *pmatch, bool fl_backtrack)
1379{
1380 const re_dfa_t *dfa = preg->buffer;
1381 Idx idx, cur_node;
1382 re_node_set eps_via_nodes;
1383 struct re_fail_stack_t *fs;
1384 struct re_fail_stack_t fs_body = { 0, 2, NULL };
1385 regmatch_t *prev_idx_match;
1386 bool prev_idx_match_malloced = false;
1387
1388#ifdef DEBUG
1389 assert (nmatch > 1);
1390 assert (mctx->state_log != NULL);
1391#endif
1392 if (fl_backtrack)
1393 {
1394 fs = &fs_body;
1395 fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1396 if (fs->stack == NULL)
1397 return REG_ESPACE;
1398 }
1399 else
1400 fs = NULL;
1401
1402 cur_node = dfa->init_node;
1403 re_node_set_init_empty (&eps_via_nodes);
1404
1405 if (__libc_use_alloca (nmatch * sizeof (regmatch_t)))
1406 prev_idx_match = (regmatch_t *) alloca (nmatch * sizeof (regmatch_t));
1407 else
1408 {
1409 prev_idx_match = re_malloc (regmatch_t, nmatch);
1410 if (prev_idx_match == NULL)
1411 {
1412 free_fail_stack_return (fs);
1413 return REG_ESPACE;
1414 }
1415 prev_idx_match_malloced = true;
1416 }
1417 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1418
1419 for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1420 {
1421 update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
1422
1423 if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1424 {
1425 Idx reg_idx;
1426 if (fs)
1427 {
1428 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1429 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1430 break;
1431 if (reg_idx == nmatch)
1432 {
1433 re_node_set_free (&eps_via_nodes);
1434 if (prev_idx_match_malloced)
1435 re_free (prev_idx_match);
1436 return free_fail_stack_return (fs);
1437 }
1438 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1439 &eps_via_nodes);
1440 }
1441 else
1442 {
1443 re_node_set_free (&eps_via_nodes);
1444 if (prev_idx_match_malloced)
1445 re_free (prev_idx_match);
1446 return REG_NOERROR;
1447 }
1448 }
1449
1450 /* Proceed to next node. */
1451 cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
1452 &eps_via_nodes, fs);
1453
1454 if (BE (cur_node < 0, 0))
1455 {
1456 if (BE (cur_node == -2, 0))
1457 {
1458 re_node_set_free (&eps_via_nodes);
1459 if (prev_idx_match_malloced)
1460 re_free (prev_idx_match);
1461 free_fail_stack_return (fs);
1462 return REG_ESPACE;
1463 }
1464 if (fs)
1465 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1466 &eps_via_nodes);
1467 else
1468 {
1469 re_node_set_free (&eps_via_nodes);
1470 if (prev_idx_match_malloced)
1471 re_free (prev_idx_match);
1472 return REG_NOMATCH;
1473 }
1474 }
1475 }
1476 re_node_set_free (&eps_via_nodes);
1477 if (prev_idx_match_malloced)
1478 re_free (prev_idx_match);
1479 return free_fail_stack_return (fs);
1480}
1481
1482static reg_errcode_t
1483free_fail_stack_return (struct re_fail_stack_t *fs)
1484{
1485 if (fs)
1486 {
1487 Idx fs_idx;
1488 for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1489 {
1490 re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1491 re_free (fs->stack[fs_idx].regs);
1492 }
1493 re_free (fs->stack);
1494 }
1495 return REG_NOERROR;
1496}
1497
1498static void
1499update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
1500 regmatch_t *prev_idx_match, Idx cur_node, Idx cur_idx, Idx nmatch)
1501{
1502 int type = dfa->nodes[cur_node].type;
1503 if (type == OP_OPEN_SUBEXP)
1504 {
1505 Idx reg_num = dfa->nodes[cur_node].opr.idx + 1;
1506
1507 /* We are at the first node of this sub expression. */
1508 if (reg_num < nmatch)
1509 {
1510 pmatch[reg_num].rm_so = cur_idx;
1511 pmatch[reg_num].rm_eo = -1;
1512 }
1513 }
1514 else if (type == OP_CLOSE_SUBEXP)
1515 {
1516 Idx reg_num = dfa->nodes[cur_node].opr.idx + 1;
1517 if (reg_num < nmatch)
1518 {
1519 /* We are at the last node of this sub expression. */
1520 if (pmatch[reg_num].rm_so < cur_idx)
1521 {
1522 pmatch[reg_num].rm_eo = cur_idx;
1523 /* This is a non-empty match or we are not inside an optional
1524 subexpression. Accept this right away. */
1525 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1526 }
1527 else
1528 {
1529 if (dfa->nodes[cur_node].opt_subexp
1530 && prev_idx_match[reg_num].rm_so != -1)
1531 /* We transited through an empty match for an optional
1532 subexpression, like (a?)*, and this is not the subexp's
1533 first match. Copy back the old content of the registers
1534 so that matches of an inner subexpression are undone as
1535 well, like in ((a?))*. */
1536 memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1537 else
1538 /* We completed a subexpression, but it may be part of
1539 an optional one, so do not update PREV_IDX_MATCH. */
1540 pmatch[reg_num].rm_eo = cur_idx;
1541 }
1542 }
1543 }
1544}
1545
1546/* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1547 and sift the nodes in each states according to the following rules.
1548 Updated state_log will be wrote to STATE_LOG.
1549
1550 Rules: We throw away the Node 'a' in the STATE_LOG[STR_IDX] if...
1551 1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1552 If 'a' isn't the LAST_NODE and 'a' can't epsilon transit to
1553 the LAST_NODE, we throw away the node 'a'.
1554 2. When 0 <= STR_IDX < MATCH_LAST and 'a' accepts
1555 string 's' and transit to 'b':
1556 i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1557 away the node 'a'.
1558 ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1559 thrown away, we throw away the node 'a'.
1560 3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1561 i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1562 node 'a'.
1563 ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1564 we throw away the node 'a'. */
1565
1566#define STATE_NODE_CONTAINS(state,node) \
1567 ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1568
1569static reg_errcode_t
1570sift_states_backward (const re_match_context_t *mctx, re_sift_context_t *sctx)
1571{
1572 reg_errcode_t err;
1573 int null_cnt = 0;
1574 Idx str_idx = sctx->last_str_idx;
1575 re_node_set cur_dest;
1576
1577#ifdef DEBUG
1578 assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1579#endif
1580
1581 /* Build sifted state_log[str_idx]. It has the nodes which can epsilon
1582 transit to the last_node and the last_node itself. */
1583 err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1584 if (BE (err != REG_NOERROR, 0))
1585 return err;
1586 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1587 if (BE (err != REG_NOERROR, 0))
1588 goto free_return;
1589
1590 /* Then check each states in the state_log. */
1591 while (str_idx > 0)
1592 {
1593 /* Update counters. */
1594 null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1595 if (null_cnt > mctx->max_mb_elem_len)
1596 {
1597 memset (sctx->sifted_states, '\0',
1598 sizeof (re_dfastate_t *) * str_idx);
1599 re_node_set_free (&cur_dest);
1600 return REG_NOERROR;
1601 }
1602 re_node_set_empty (&cur_dest);
1603 --str_idx;
1604
1605 if (mctx->state_log[str_idx])
1606 {
1607 err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1608 if (BE (err != REG_NOERROR, 0))
1609 goto free_return;
1610 }
1611
1612 /* Add all the nodes which satisfy the following conditions:
1613 - It can epsilon transit to a node in CUR_DEST.
1614 - It is in CUR_SRC.
1615 And update state_log. */
1616 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1617 if (BE (err != REG_NOERROR, 0))
1618 goto free_return;
1619 }
1620 err = REG_NOERROR;
1621 free_return:
1622 re_node_set_free (&cur_dest);
1623 return err;
1624}
1625
1626static reg_errcode_t
1627__attribute_warn_unused_result__
1628build_sifted_states (const re_match_context_t *mctx, re_sift_context_t *sctx,
1629 Idx str_idx, re_node_set *cur_dest)
1630{
1631 const re_dfa_t *const dfa = mctx->dfa;
1632 const re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1633 Idx i;
1634
1635 /* Then build the next sifted state.
1636 We build the next sifted state on 'cur_dest', and update
1637 'sifted_states[str_idx]' with 'cur_dest'.
1638 Note:
1639 'cur_dest' is the sifted state from 'state_log[str_idx + 1]'.
1640 'cur_src' points the node_set of the old 'state_log[str_idx]'
1641 (with the epsilon nodes pre-filtered out). */
1642 for (i = 0; i < cur_src->nelem; i++)
1643 {
1644 Idx prev_node = cur_src->elems[i];
1645 int naccepted = 0;
1646 bool ok;
1647
1648#ifdef DEBUG
1649 re_token_type_t type = dfa->nodes[prev_node].type;
1650 assert (!IS_EPSILON_NODE (type));
1651#endif
1652#ifdef RE_ENABLE_I18N
1653 /* If the node may accept "multi byte". */
1654 if (dfa->nodes[prev_node].accept_mb)
1655 naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1656 str_idx, sctx->last_str_idx);
1657#endif /* RE_ENABLE_I18N */
1658
1659 /* We don't check backreferences here.
1660 See update_cur_sifted_state(). */
1661 if (!naccepted
1662 && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1663 && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1664 dfa->nexts[prev_node]))
1665 naccepted = 1;
1666
1667 if (naccepted == 0)
1668 continue;
1669
1670 if (sctx->limits.nelem)
1671 {
1672 Idx to_idx = str_idx + naccepted;
1673 if (check_dst_limits (mctx, &sctx->limits,
1674 dfa->nexts[prev_node], to_idx,
1675 prev_node, str_idx))
1676 continue;
1677 }
1678 ok = re_node_set_insert (cur_dest, prev_node);
1679 if (BE (! ok, 0))
1680 return REG_ESPACE;
1681 }
1682
1683 return REG_NOERROR;
1684}
1685
1686/* Helper functions. */
1687
1688static reg_errcode_t
1689clean_state_log_if_needed (re_match_context_t *mctx, Idx next_state_log_idx)
1690{
1691 Idx top = mctx->state_log_top;
1692
1693 if ((next_state_log_idx >= mctx->input.bufs_len
1694 && mctx->input.bufs_len < mctx->input.len)
1695 || (next_state_log_idx >= mctx->input.valid_len
1696 && mctx->input.valid_len < mctx->input.len))
1697 {
1698 reg_errcode_t err;
1699 err = extend_buffers (mctx, next_state_log_idx + 1);
1700 if (BE (err != REG_NOERROR, 0))
1701 return err;
1702 }
1703
1704 if (top < next_state_log_idx)
1705 {
1706 memset (mctx->state_log + top + 1, '\0',
1707 sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1708 mctx->state_log_top = next_state_log_idx;
1709 }
1710 return REG_NOERROR;
1711}
1712
1713static reg_errcode_t
1714merge_state_array (const re_dfa_t *dfa, re_dfastate_t **dst,
1715 re_dfastate_t **src, Idx num)
1716{
1717 Idx st_idx;
1718 reg_errcode_t err;
1719 for (st_idx = 0; st_idx < num; ++st_idx)
1720 {
1721 if (dst[st_idx] == NULL)
1722 dst[st_idx] = src[st_idx];
1723 else if (src[st_idx] != NULL)
1724 {
1725 re_node_set merged_set;
1726 err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1727 &src[st_idx]->nodes);
1728 if (BE (err != REG_NOERROR, 0))
1729 return err;
1730 dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1731 re_node_set_free (&merged_set);
1732 if (BE (err != REG_NOERROR, 0))
1733 return err;
1734 }
1735 }
1736 return REG_NOERROR;
1737}
1738
1739static reg_errcode_t
1740update_cur_sifted_state (const re_match_context_t *mctx,
1741 re_sift_context_t *sctx, Idx str_idx,
1742 re_node_set *dest_nodes)
1743{
1744 const re_dfa_t *const dfa = mctx->dfa;
1745 reg_errcode_t err = REG_NOERROR;
1746 const re_node_set *candidates;
1747 candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1748 : &mctx->state_log[str_idx]->nodes);
1749
1750 if (dest_nodes->nelem == 0)
1751 sctx->sifted_states[str_idx] = NULL;
1752 else
1753 {
1754 if (candidates)
1755 {
1756 /* At first, add the nodes which can epsilon transit to a node in
1757 DEST_NODE. */
1758 err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1759 if (BE (err != REG_NOERROR, 0))
1760 return err;
1761
1762 /* Then, check the limitations in the current sift_context. */
1763 if (sctx->limits.nelem)
1764 {
1765 err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1766 mctx->bkref_ents, str_idx);
1767 if (BE (err != REG_NOERROR, 0))
1768 return err;
1769 }
1770 }
1771
1772 sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1773 if (BE (err != REG_NOERROR, 0))
1774 return err;
1775 }
1776
1777 if (candidates && mctx->state_log[str_idx]->has_backref)
1778 {
1779 err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1780 if (BE (err != REG_NOERROR, 0))
1781 return err;
1782 }
1783 return REG_NOERROR;
1784}
1785
1786static reg_errcode_t
1787__attribute_warn_unused_result__
1788add_epsilon_src_nodes (const re_dfa_t *dfa, re_node_set *dest_nodes,
1789 const re_node_set *candidates)
1790{
1791 reg_errcode_t err = REG_NOERROR;
1792 Idx i;
1793
1794 re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1795 if (BE (err != REG_NOERROR, 0))
1796 return err;
1797
1798 if (!state->inveclosure.alloc)
1799 {
1800 err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1801 if (BE (err != REG_NOERROR, 0))
1802 return REG_ESPACE;
1803 for (i = 0; i < dest_nodes->nelem; i++)
1804 {
1805 err = re_node_set_merge (&state->inveclosure,
1806 dfa->inveclosures + dest_nodes->elems[i]);
1807 if (BE (err != REG_NOERROR, 0))
1808 return REG_ESPACE;
1809 }
1810 }
1811 return re_node_set_add_intersect (dest_nodes, candidates,
1812 &state->inveclosure);
1813}
1814
1815static reg_errcode_t
1816sub_epsilon_src_nodes (const re_dfa_t *dfa, Idx node, re_node_set *dest_nodes,
1817 const re_node_set *candidates)
1818{
1819 Idx ecl_idx;
1820 reg_errcode_t err;
1821 re_node_set *inv_eclosure = dfa->inveclosures + node;
1822 re_node_set except_nodes;
1823 re_node_set_init_empty (&except_nodes);
1824 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1825 {
1826 Idx cur_node = inv_eclosure->elems[ecl_idx];
1827 if (cur_node == node)
1828 continue;
1829 if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1830 {
1831 Idx edst1 = dfa->edests[cur_node].elems[0];
1832 Idx edst2 = ((dfa->edests[cur_node].nelem > 1)
1833 ? dfa->edests[cur_node].elems[1] : -1);
1834 if ((!re_node_set_contains (inv_eclosure, edst1)
1835 && re_node_set_contains (dest_nodes, edst1))
1836 || (edst2 > 0
1837 && !re_node_set_contains (inv_eclosure, edst2)
1838 && re_node_set_contains (dest_nodes, edst2)))
1839 {
1840 err = re_node_set_add_intersect (&except_nodes, candidates,
1841 dfa->inveclosures + cur_node);
1842 if (BE (err != REG_NOERROR, 0))
1843 {
1844 re_node_set_free (&except_nodes);
1845 return err;
1846 }
1847 }
1848 }
1849 }
1850 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1851 {
1852 Idx cur_node = inv_eclosure->elems[ecl_idx];
1853 if (!re_node_set_contains (&except_nodes, cur_node))
1854 {
1855 Idx idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1856 re_node_set_remove_at (dest_nodes, idx);
1857 }
1858 }
1859 re_node_set_free (&except_nodes);
1860 return REG_NOERROR;
1861}
1862
1863static bool
1864check_dst_limits (const re_match_context_t *mctx, const re_node_set *limits,
1865 Idx dst_node, Idx dst_idx, Idx src_node, Idx src_idx)
1866{
1867 const re_dfa_t *const dfa = mctx->dfa;
1868 Idx lim_idx, src_pos, dst_pos;
1869
1870 Idx dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1871 Idx src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1872 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1873 {
1874 Idx subexp_idx;
1875 struct re_backref_cache_entry *ent;
1876 ent = mctx->bkref_ents + limits->elems[lim_idx];
1877 subexp_idx = dfa->nodes[ent->node].opr.idx;
1878
1879 dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1880 subexp_idx, dst_node, dst_idx,
1881 dst_bkref_idx);
1882 src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1883 subexp_idx, src_node, src_idx,
1884 src_bkref_idx);
1885
1886 /* In case of:
1887 <src> <dst> ( <subexp> )
1888 ( <subexp> ) <src> <dst>
1889 ( <subexp1> <src> <subexp2> <dst> <subexp3> ) */
1890 if (src_pos == dst_pos)
1891 continue; /* This is unrelated limitation. */
1892 else
1893 return true;
1894 }
1895 return false;
1896}
1897
1898static int
1899check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries,
1900 Idx subexp_idx, Idx from_node, Idx bkref_idx)
1901{
1902 const re_dfa_t *const dfa = mctx->dfa;
1903 const re_node_set *eclosures = dfa->eclosures + from_node;
1904 Idx node_idx;
1905
1906 /* Else, we are on the boundary: examine the nodes on the epsilon
1907 closure. */
1908 for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1909 {
1910 Idx node = eclosures->elems[node_idx];
1911 switch (dfa->nodes[node].type)
1912 {
1913 case OP_BACK_REF:
1914 if (bkref_idx != -1)
1915 {
1916 struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1917 do
1918 {
1919 Idx dst;
1920 int cpos;
1921
1922 if (ent->node != node)
1923 continue;
1924
1925 if (subexp_idx < BITSET_WORD_BITS
1926 && !(ent->eps_reachable_subexps_map
1927 & ((bitset_word_t) 1 << subexp_idx)))
1928 continue;
1929
1930 /* Recurse trying to reach the OP_OPEN_SUBEXP and
1931 OP_CLOSE_SUBEXP cases below. But, if the
1932 destination node is the same node as the source
1933 node, don't recurse because it would cause an
1934 infinite loop: a regex that exhibits this behavior
1935 is ()\1*\1* */
1936 dst = dfa->edests[node].elems[0];
1937 if (dst == from_node)
1938 {
1939 if (boundaries & 1)
1940 return -1;
1941 else /* if (boundaries & 2) */
1942 return 0;
1943 }
1944
1945 cpos =
1946 check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1947 dst, bkref_idx);
1948 if (cpos == -1 /* && (boundaries & 1) */)
1949 return -1;
1950 if (cpos == 0 && (boundaries & 2))
1951 return 0;
1952
1953 if (subexp_idx < BITSET_WORD_BITS)
1954 ent->eps_reachable_subexps_map
1955 &= ~((bitset_word_t) 1 << subexp_idx);
1956 }
1957 while (ent++->more);
1958 }
1959 break;
1960
1961 case OP_OPEN_SUBEXP:
1962 if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
1963 return -1;
1964 break;
1965
1966 case OP_CLOSE_SUBEXP:
1967 if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
1968 return 0;
1969 break;
1970
1971 default:
1972 break;
1973 }
1974 }
1975
1976 return (boundaries & 2) ? 1 : 0;
1977}
1978
1979static int
1980check_dst_limits_calc_pos (const re_match_context_t *mctx, Idx limit,
1981 Idx subexp_idx, Idx from_node, Idx str_idx,
1982 Idx bkref_idx)
1983{
1984 struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
1985 int boundaries;
1986
1987 /* If we are outside the range of the subexpression, return -1 or 1. */
1988 if (str_idx < lim->subexp_from)
1989 return -1;
1990
1991 if (lim->subexp_to < str_idx)
1992 return 1;
1993
1994 /* If we are within the subexpression, return 0. */
1995 boundaries = (str_idx == lim->subexp_from);
1996 boundaries |= (str_idx == lim->subexp_to) << 1;
1997 if (boundaries == 0)
1998 return 0;
1999
2000 /* Else, examine epsilon closure. */
2001 return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
2002 from_node, bkref_idx);
2003}
2004
2005/* Check the limitations of sub expressions LIMITS, and remove the nodes
2006 which are against limitations from DEST_NODES. */
2007
2008static reg_errcode_t
2009check_subexp_limits (const re_dfa_t *dfa, re_node_set *dest_nodes,
2010 const re_node_set *candidates, re_node_set *limits,
2011 struct re_backref_cache_entry *bkref_ents, Idx str_idx)
2012{
2013 reg_errcode_t err;
2014 Idx node_idx, lim_idx;
2015
2016 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
2017 {
2018 Idx subexp_idx;
2019 struct re_backref_cache_entry *ent;
2020 ent = bkref_ents + limits->elems[lim_idx];
2021
2022 if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
2023 continue; /* This is unrelated limitation. */
2024
2025 subexp_idx = dfa->nodes[ent->node].opr.idx;
2026 if (ent->subexp_to == str_idx)
2027 {
2028 Idx ops_node = -1;
2029 Idx cls_node = -1;
2030 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2031 {
2032 Idx node = dest_nodes->elems[node_idx];
2033 re_token_type_t type = dfa->nodes[node].type;
2034 if (type == OP_OPEN_SUBEXP
2035 && subexp_idx == dfa->nodes[node].opr.idx)
2036 ops_node = node;
2037 else if (type == OP_CLOSE_SUBEXP
2038 && subexp_idx == dfa->nodes[node].opr.idx)
2039 cls_node = node;
2040 }
2041
2042 /* Check the limitation of the open subexpression. */
2043 /* Note that (ent->subexp_to = str_idx != ent->subexp_from). */
2044 if (ops_node >= 0)
2045 {
2046 err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2047 candidates);
2048 if (BE (err != REG_NOERROR, 0))
2049 return err;
2050 }
2051
2052 /* Check the limitation of the close subexpression. */
2053 if (cls_node >= 0)
2054 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2055 {
2056 Idx node = dest_nodes->elems[node_idx];
2057 if (!re_node_set_contains (dfa->inveclosures + node,
2058 cls_node)
2059 && !re_node_set_contains (dfa->eclosures + node,
2060 cls_node))
2061 {
2062 /* It is against this limitation.
2063 Remove it form the current sifted state. */
2064 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2065 candidates);
2066 if (BE (err != REG_NOERROR, 0))
2067 return err;
2068 --node_idx;
2069 }
2070 }
2071 }
2072 else /* (ent->subexp_to != str_idx) */
2073 {
2074 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2075 {
2076 Idx node = dest_nodes->elems[node_idx];
2077 re_token_type_t type = dfa->nodes[node].type;
2078 if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2079 {
2080 if (subexp_idx != dfa->nodes[node].opr.idx)
2081 continue;
2082 /* It is against this limitation.
2083 Remove it form the current sifted state. */
2084 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2085 candidates);
2086 if (BE (err != REG_NOERROR, 0))
2087 return err;
2088 }
2089 }
2090 }
2091 }
2092 return REG_NOERROR;
2093}
2094
2095static reg_errcode_t
2096__attribute_warn_unused_result__
2097sift_states_bkref (const re_match_context_t *mctx, re_sift_context_t *sctx,
2098 Idx str_idx, const re_node_set *candidates)
2099{
2100 const re_dfa_t *const dfa = mctx->dfa;
2101 reg_errcode_t err;
2102 Idx node_idx, node;
2103 re_sift_context_t local_sctx;
2104 Idx first_idx = search_cur_bkref_entry (mctx, str_idx);
2105
2106 if (first_idx == -1)
2107 return REG_NOERROR;
2108
2109 local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized. */
2110
2111 for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2112 {
2113 Idx enabled_idx;
2114 re_token_type_t type;
2115 struct re_backref_cache_entry *entry;
2116 node = candidates->elems[node_idx];
2117 type = dfa->nodes[node].type;
2118 /* Avoid infinite loop for the REs like "()\1+". */
2119 if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2120 continue;
2121 if (type != OP_BACK_REF)
2122 continue;
2123
2124 entry = mctx->bkref_ents + first_idx;
2125 enabled_idx = first_idx;
2126 do
2127 {
2128 Idx subexp_len;
2129 Idx to_idx;
2130 Idx dst_node;
2131 bool ok;
2132 re_dfastate_t *cur_state;
2133
2134 if (entry->node != node)
2135 continue;
2136 subexp_len = entry->subexp_to - entry->subexp_from;
2137 to_idx = str_idx + subexp_len;
2138 dst_node = (subexp_len ? dfa->nexts[node]
2139 : dfa->edests[node].elems[0]);
2140
2141 if (to_idx > sctx->last_str_idx
2142 || sctx->sifted_states[to_idx] == NULL
2143 || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2144 || check_dst_limits (mctx, &sctx->limits, node,
2145 str_idx, dst_node, to_idx))
2146 continue;
2147
2148 if (local_sctx.sifted_states == NULL)
2149 {
2150 local_sctx = *sctx;
2151 err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2152 if (BE (err != REG_NOERROR, 0))
2153 goto free_return;
2154 }
2155 local_sctx.last_node = node;
2156 local_sctx.last_str_idx = str_idx;
2157 ok = re_node_set_insert (&local_sctx.limits, enabled_idx);
2158 if (BE (! ok, 0))
2159 {
2160 err = REG_ESPACE;
2161 goto free_return;
2162 }
2163 cur_state = local_sctx.sifted_states[str_idx];
2164 err = sift_states_backward (mctx, &local_sctx);
2165 if (BE (err != REG_NOERROR, 0))
2166 goto free_return;
2167 if (sctx->limited_states != NULL)
2168 {
2169 err = merge_state_array (dfa, sctx->limited_states,
2170 local_sctx.sifted_states,
2171 str_idx + 1);
2172 if (BE (err != REG_NOERROR, 0))
2173 goto free_return;
2174 }
2175 local_sctx.sifted_states[str_idx] = cur_state;
2176 re_node_set_remove (&local_sctx.limits, enabled_idx);
2177
2178 /* mctx->bkref_ents may have changed, reload the pointer. */
2179 entry = mctx->bkref_ents + enabled_idx;
2180 }
2181 while (enabled_idx++, entry++->more);
2182 }
2183 err = REG_NOERROR;
2184 free_return:
2185 if (local_sctx.sifted_states != NULL)
2186 {
2187 re_node_set_free (&local_sctx.limits);
2188 }
2189
2190 return err;
2191}
2192
2193
2194#ifdef RE_ENABLE_I18N
2195static int
2196sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx,
2197 Idx node_idx, Idx str_idx, Idx max_str_idx)
2198{
2199 const re_dfa_t *const dfa = mctx->dfa;
2200 int naccepted;
2201 /* Check the node can accept "multi byte". */
2202 naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2203 if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2204 !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2205 dfa->nexts[node_idx]))
2206 /* The node can't accept the "multi byte", or the
2207 destination was already thrown away, then the node
2208 could't accept the current input "multi byte". */
2209 naccepted = 0;
2210 /* Otherwise, it is sure that the node could accept
2211 'naccepted' bytes input. */
2212 return naccepted;
2213}
2214#endif /* RE_ENABLE_I18N */
2215
2216
2217/* Functions for state transition. */
2218
2219/* Return the next state to which the current state STATE will transit by
2220 accepting the current input byte, and update STATE_LOG if necessary.
2221 If STATE can accept a multibyte char/collating element/back reference
2222 update the destination of STATE_LOG. */
2223
2224static re_dfastate_t *
2225__attribute_warn_unused_result__
2226transit_state (reg_errcode_t *err, re_match_context_t *mctx,
2227 re_dfastate_t *state)
2228{
2229 re_dfastate_t **trtable;
2230 unsigned char ch;
2231
2232#ifdef RE_ENABLE_I18N
2233 /* If the current state can accept multibyte. */
2234 if (BE (state->accept_mb, 0))
2235 {
2236 *err = transit_state_mb (mctx, state);
2237 if (BE (*err != REG_NOERROR, 0))
2238 return NULL;
2239 }
2240#endif /* RE_ENABLE_I18N */
2241
2242 /* Then decide the next state with the single byte. */
2243#if 0
2244 if (0)
2245 /* don't use transition table */
2246 return transit_state_sb (err, mctx, state);
2247#endif
2248
2249 /* Use transition table */
2250 ch = re_string_fetch_byte (&mctx->input);
2251 for (;;)
2252 {
2253 trtable = state->trtable;
2254 if (BE (trtable != NULL, 1))
2255 return trtable[ch];
2256
2257 trtable = state->word_trtable;
2258 if (BE (trtable != NULL, 1))
2259 {
2260 unsigned int context;
2261 context
2262 = re_string_context_at (&mctx->input,
2263 re_string_cur_idx (&mctx->input) - 1,
2264 mctx->eflags);
2265 if (IS_WORD_CONTEXT (context))
2266 return trtable[ch + SBC_MAX];
2267 else
2268 return trtable[ch];
2269 }
2270
2271 if (!build_trtable (mctx->dfa, state))
2272 {
2273 *err = REG_ESPACE;
2274 return NULL;
2275 }
2276
2277 /* Retry, we now have a transition table. */
2278 }
2279}
2280
2281/* Update the state_log if we need */
2282static re_dfastate_t *
2283merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx,
2284 re_dfastate_t *next_state)
2285{
2286 const re_dfa_t *const dfa = mctx->dfa;
2287 Idx cur_idx = re_string_cur_idx (&mctx->input);
2288
2289 if (cur_idx > mctx->state_log_top)
2290 {
2291 mctx->state_log[cur_idx] = next_state;
2292 mctx->state_log_top = cur_idx;
2293 }
2294 else if (mctx->state_log[cur_idx] == 0)
2295 {
2296 mctx->state_log[cur_idx] = next_state;
2297 }
2298 else
2299 {
2300 re_dfastate_t *pstate;
2301 unsigned int context;
2302 re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2303 /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2304 the destination of a multibyte char/collating element/
2305 back reference. Then the next state is the union set of
2306 these destinations and the results of the transition table. */
2307 pstate = mctx->state_log[cur_idx];
2308 log_nodes = pstate->entrance_nodes;
2309 if (next_state != NULL)
2310 {
2311 table_nodes = next_state->entrance_nodes;
2312 *err = re_node_set_init_union (&next_nodes, table_nodes,
2313 log_nodes);
2314 if (BE (*err != REG_NOERROR, 0))
2315 return NULL;
2316 }
2317 else
2318 next_nodes = *log_nodes;
2319 /* Note: We already add the nodes of the initial state,
2320 then we don't need to add them here. */
2321
2322 context = re_string_context_at (&mctx->input,
2323 re_string_cur_idx (&mctx->input) - 1,
2324 mctx->eflags);
2325 next_state = mctx->state_log[cur_idx]
2326 = re_acquire_state_context (err, dfa, &next_nodes, context);
2327 /* We don't need to check errors here, since the return value of
2328 this function is next_state and ERR is already set. */
2329
2330 if (table_nodes != NULL)
2331 re_node_set_free (&next_nodes);
2332 }
2333
2334 if (BE (dfa->nbackref, 0) && next_state != NULL)
2335 {
2336 /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2337 later. We must check them here, since the back references in the
2338 next state might use them. */
2339 *err = check_subexp_matching_top (mctx, &next_state->nodes,
2340 cur_idx);
2341 if (BE (*err != REG_NOERROR, 0))
2342 return NULL;
2343
2344 /* If the next state has back references. */
2345 if (next_state->has_backref)
2346 {
2347 *err = transit_state_bkref (mctx, &next_state->nodes);
2348 if (BE (*err != REG_NOERROR, 0))
2349 return NULL;
2350 next_state = mctx->state_log[cur_idx];
2351 }
2352 }
2353
2354 return next_state;
2355}
2356
2357/* Skip bytes in the input that correspond to part of a
2358 multi-byte match, then look in the log for a state
2359 from which to restart matching. */
2360static re_dfastate_t *
2361find_recover_state (reg_errcode_t *err, re_match_context_t *mctx)
2362{
2363 re_dfastate_t *cur_state;
2364 do
2365 {
2366 Idx max = mctx->state_log_top;
2367 Idx cur_str_idx = re_string_cur_idx (&mctx->input);
2368
2369 do
2370 {
2371 if (++cur_str_idx > max)
2372 return NULL;
2373 re_string_skip_bytes (&mctx->input, 1);
2374 }
2375 while (mctx->state_log[cur_str_idx] == NULL);
2376
2377 cur_state = merge_state_with_log (err, mctx, NULL);
2378 }
2379 while (*err == REG_NOERROR && cur_state == NULL);
2380 return cur_state;
2381}
2382
2383/* Helper functions for transit_state. */
2384
2385/* From the node set CUR_NODES, pick up the nodes whose types are
2386 OP_OPEN_SUBEXP and which have corresponding back references in the regular
2387 expression. And register them to use them later for evaluating the
2388 corresponding back references. */
2389
2390static reg_errcode_t
2391check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes,
2392 Idx str_idx)
2393{
2394 const re_dfa_t *const dfa = mctx->dfa;
2395 Idx node_idx;
2396 reg_errcode_t err;
2397
2398 /* TODO: This isn't efficient.
2399 Because there might be more than one nodes whose types are
2400 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2401 nodes.
2402 E.g. RE: (a){2} */
2403 for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2404 {
2405 Idx node = cur_nodes->elems[node_idx];
2406 if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2407 && dfa->nodes[node].opr.idx < BITSET_WORD_BITS
2408 && (dfa->used_bkref_map
2409 & ((bitset_word_t) 1 << dfa->nodes[node].opr.idx)))
2410 {
2411 err = match_ctx_add_subtop (mctx, node, str_idx);
2412 if (BE (err != REG_NOERROR, 0))
2413 return err;
2414 }
2415 }
2416 return REG_NOERROR;
2417}
2418
2419#if 0
2420/* Return the next state to which the current state STATE will transit by
2421 accepting the current input byte. */
2422
2423static re_dfastate_t *
2424transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx,
2425 re_dfastate_t *state)
2426{
2427 const re_dfa_t *const dfa = mctx->dfa;
2428 re_node_set next_nodes;
2429 re_dfastate_t *next_state;
2430 Idx node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2431 unsigned int context;
2432
2433 *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2434 if (BE (*err != REG_NOERROR, 0))
2435 return NULL;
2436 for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2437 {
2438 Idx cur_node = state->nodes.elems[node_cnt];
2439 if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2440 {
2441 *err = re_node_set_merge (&next_nodes,
2442 dfa->eclosures + dfa->nexts[cur_node]);
2443 if (BE (*err != REG_NOERROR, 0))
2444 {
2445 re_node_set_free (&next_nodes);
2446 return NULL;
2447 }
2448 }
2449 }
2450 context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2451 next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2452 /* We don't need to check errors here, since the return value of
2453 this function is next_state and ERR is already set. */
2454
2455 re_node_set_free (&next_nodes);
2456 re_string_skip_bytes (&mctx->input, 1);
2457 return next_state;
2458}
2459#endif
2460
2461#ifdef RE_ENABLE_I18N
2462static reg_errcode_t
2463transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate)
2464{
2465 const re_dfa_t *const dfa = mctx->dfa;
2466 reg_errcode_t err;
2467 Idx i;
2468
2469 for (i = 0; i < pstate->nodes.nelem; ++i)
2470 {
2471 re_node_set dest_nodes, *new_nodes;
2472 Idx cur_node_idx = pstate->nodes.elems[i];
2473 int naccepted;
2474 Idx dest_idx;
2475 unsigned int context;
2476 re_dfastate_t *dest_state;
2477
2478 if (!dfa->nodes[cur_node_idx].accept_mb)
2479 continue;
2480
2481 if (dfa->nodes[cur_node_idx].constraint)
2482 {
2483 context = re_string_context_at (&mctx->input,
2484 re_string_cur_idx (&mctx->input),
2485 mctx->eflags);
2486 if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2487 context))
2488 continue;
2489 }
2490
2491 /* How many bytes the node can accept? */
2492 naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2493 re_string_cur_idx (&mctx->input));
2494 if (naccepted == 0)
2495 continue;
2496
2497 /* The node can accepts 'naccepted' bytes. */
2498 dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2499 mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2500 : mctx->max_mb_elem_len);
2501 err = clean_state_log_if_needed (mctx, dest_idx);
2502 if (BE (err != REG_NOERROR, 0))
2503 return err;
2504#ifdef DEBUG
2505 assert (dfa->nexts[cur_node_idx] != -1);
2506#endif
2507 new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
2508
2509 dest_state = mctx->state_log[dest_idx];
2510 if (dest_state == NULL)
2511 dest_nodes = *new_nodes;
2512 else
2513 {
2514 err = re_node_set_init_union (&dest_nodes,
2515 dest_state->entrance_nodes, new_nodes);
2516 if (BE (err != REG_NOERROR, 0))
2517 return err;
2518 }
2519 context = re_string_context_at (&mctx->input, dest_idx - 1,
2520 mctx->eflags);
2521 mctx->state_log[dest_idx]
2522 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2523 if (dest_state != NULL)
2524 re_node_set_free (&dest_nodes);
2525 if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
2526 return err;
2527 }
2528 return REG_NOERROR;
2529}
2530#endif /* RE_ENABLE_I18N */
2531
2532static reg_errcode_t
2533transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes)
2534{
2535 const re_dfa_t *const dfa = mctx->dfa;
2536 reg_errcode_t err;
2537 Idx i;
2538 Idx cur_str_idx = re_string_cur_idx (&mctx->input);
2539
2540 for (i = 0; i < nodes->nelem; ++i)
2541 {
2542 Idx dest_str_idx, prev_nelem, bkc_idx;
2543 Idx node_idx = nodes->elems[i];
2544 unsigned int context;
2545 const re_token_t *node = dfa->nodes + node_idx;
2546 re_node_set *new_dest_nodes;
2547
2548 /* Check whether 'node' is a backreference or not. */
2549 if (node->type != OP_BACK_REF)
2550 continue;
2551
2552 if (node->constraint)
2553 {
2554 context = re_string_context_at (&mctx->input, cur_str_idx,
2555 mctx->eflags);
2556 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2557 continue;
2558 }
2559
2560 /* 'node' is a backreference.
2561 Check the substring which the substring matched. */
2562 bkc_idx = mctx->nbkref_ents;
2563 err = get_subexp (mctx, node_idx, cur_str_idx);
2564 if (BE (err != REG_NOERROR, 0))
2565 goto free_return;
2566
2567 /* And add the epsilon closures (which is 'new_dest_nodes') of
2568 the backreference to appropriate state_log. */
2569#ifdef DEBUG
2570 assert (dfa->nexts[node_idx] != -1);
2571#endif
2572 for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2573 {
2574 Idx subexp_len;
2575 re_dfastate_t *dest_state;
2576 struct re_backref_cache_entry *bkref_ent;
2577 bkref_ent = mctx->bkref_ents + bkc_idx;
2578 if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2579 continue;
2580 subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2581 new_dest_nodes = (subexp_len == 0
2582 ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2583 : dfa->eclosures + dfa->nexts[node_idx]);
2584 dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2585 - bkref_ent->subexp_from);
2586 context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2587 mctx->eflags);
2588 dest_state = mctx->state_log[dest_str_idx];
2589 prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2590 : mctx->state_log[cur_str_idx]->nodes.nelem);
2591 /* Add 'new_dest_node' to state_log. */
2592 if (dest_state == NULL)
2593 {
2594 mctx->state_log[dest_str_idx]
2595 = re_acquire_state_context (&err, dfa, new_dest_nodes,
2596 context);
2597 if (BE (mctx->state_log[dest_str_idx] == NULL
2598 && err != REG_NOERROR, 0))
2599 goto free_return;
2600 }
2601 else
2602 {
2603 re_node_set dest_nodes;
2604 err = re_node_set_init_union (&dest_nodes,
2605 dest_state->entrance_nodes,
2606 new_dest_nodes);
2607 if (BE (err != REG_NOERROR, 0))
2608 {
2609 re_node_set_free (&dest_nodes);
2610 goto free_return;
2611 }
2612 mctx->state_log[dest_str_idx]
2613 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2614 re_node_set_free (&dest_nodes);
2615 if (BE (mctx->state_log[dest_str_idx] == NULL
2616 && err != REG_NOERROR, 0))
2617 goto free_return;
2618 }
2619 /* We need to check recursively if the backreference can epsilon
2620 transit. */
2621 if (subexp_len == 0
2622 && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2623 {
2624 err = check_subexp_matching_top (mctx, new_dest_nodes,
2625 cur_str_idx);
2626 if (BE (err != REG_NOERROR, 0))
2627 goto free_return;
2628 err = transit_state_bkref (mctx, new_dest_nodes);
2629 if (BE (err != REG_NOERROR, 0))
2630 goto free_return;
2631 }
2632 }
2633 }
2634 err = REG_NOERROR;
2635 free_return:
2636 return err;
2637}
2638
2639/* Enumerate all the candidates which the backreference BKREF_NODE can match
2640 at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2641 Note that we might collect inappropriate candidates here.
2642 However, the cost of checking them strictly here is too high, then we
2643 delay these checking for prune_impossible_nodes(). */
2644
2645static reg_errcode_t
2646__attribute_warn_unused_result__
2647get_subexp (re_match_context_t *mctx, Idx bkref_node, Idx bkref_str_idx)
2648{
2649 const re_dfa_t *const dfa = mctx->dfa;
2650 Idx subexp_num, sub_top_idx;
2651 const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2652 /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX. */
2653 Idx cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2654 if (cache_idx != -1)
2655 {
2656 const struct re_backref_cache_entry *entry
2657 = mctx->bkref_ents + cache_idx;
2658 do
2659 if (entry->node == bkref_node)
2660 return REG_NOERROR; /* We already checked it. */
2661 while (entry++->more);
2662 }
2663
2664 subexp_num = dfa->nodes[bkref_node].opr.idx;
2665
2666 /* For each sub expression */
2667 for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2668 {
2669 reg_errcode_t err;
2670 re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2671 re_sub_match_last_t *sub_last;
2672 Idx sub_last_idx, sl_str, bkref_str_off;
2673
2674 if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2675 continue; /* It isn't related. */
2676
2677 sl_str = sub_top->str_idx;
2678 bkref_str_off = bkref_str_idx;
2679 /* At first, check the last node of sub expressions we already
2680 evaluated. */
2681 for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2682 {
2683 regoff_t sl_str_diff;
2684 sub_last = sub_top->lasts[sub_last_idx];
2685 sl_str_diff = sub_last->str_idx - sl_str;
2686 /* The matched string by the sub expression match with the substring
2687 at the back reference? */
2688 if (sl_str_diff > 0)
2689 {
2690 if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
2691 {
2692 /* Not enough chars for a successful match. */
2693 if (bkref_str_off + sl_str_diff > mctx->input.len)
2694 break;
2695
2696 err = clean_state_log_if_needed (mctx,
2697 bkref_str_off
2698 + sl_str_diff);
2699 if (BE (err != REG_NOERROR, 0))
2700 return err;
2701 buf = (const char *) re_string_get_buffer (&mctx->input);
2702 }
2703 if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2704 /* We don't need to search this sub expression any more. */
2705 break;
2706 }
2707 bkref_str_off += sl_str_diff;
2708 sl_str += sl_str_diff;
2709 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2710 bkref_str_idx);
2711
2712 /* Reload buf, since the preceding call might have reallocated
2713 the buffer. */
2714 buf = (const char *) re_string_get_buffer (&mctx->input);
2715
2716 if (err == REG_NOMATCH)
2717 continue;
2718 if (BE (err != REG_NOERROR, 0))
2719 return err;
2720 }
2721
2722 if (sub_last_idx < sub_top->nlasts)
2723 continue;
2724 if (sub_last_idx > 0)
2725 ++sl_str;
2726 /* Then, search for the other last nodes of the sub expression. */
2727 for (; sl_str <= bkref_str_idx; ++sl_str)
2728 {
2729 Idx cls_node;
2730 regoff_t sl_str_off;
2731 const re_node_set *nodes;
2732 sl_str_off = sl_str - sub_top->str_idx;
2733 /* The matched string by the sub expression match with the substring
2734 at the back reference? */
2735 if (sl_str_off > 0)
2736 {
2737 if (BE (bkref_str_off >= mctx->input.valid_len, 0))
2738 {
2739 /* If we are at the end of the input, we cannot match. */
2740 if (bkref_str_off >= mctx->input.len)
2741 break;
2742
2743 err = extend_buffers (mctx, bkref_str_off + 1);
2744 if (BE (err != REG_NOERROR, 0))
2745 return err;
2746
2747 buf = (const char *) re_string_get_buffer (&mctx->input);
2748 }
2749 if (buf [bkref_str_off++] != buf[sl_str - 1])
2750 break; /* We don't need to search this sub expression
2751 any more. */
2752 }
2753 if (mctx->state_log[sl_str] == NULL)
2754 continue;
2755 /* Does this state have a ')' of the sub expression? */
2756 nodes = &mctx->state_log[sl_str]->nodes;
2757 cls_node = find_subexp_node (dfa, nodes, subexp_num,
2758 OP_CLOSE_SUBEXP);
2759 if (cls_node == -1)
2760 continue; /* No. */
2761 if (sub_top->path == NULL)
2762 {
2763 sub_top->path = calloc (sizeof (state_array_t),
2764 sl_str - sub_top->str_idx + 1);
2765 if (sub_top->path == NULL)
2766 return REG_ESPACE;
2767 }
2768 /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2769 in the current context? */
2770 err = check_arrival (mctx, sub_top->path, sub_top->node,
2771 sub_top->str_idx, cls_node, sl_str,
2772 OP_CLOSE_SUBEXP);
2773 if (err == REG_NOMATCH)
2774 continue;
2775 if (BE (err != REG_NOERROR, 0))
2776 return err;
2777 sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2778 if (BE (sub_last == NULL, 0))
2779 return REG_ESPACE;
2780 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2781 bkref_str_idx);
2782 if (err == REG_NOMATCH)
2783 continue;
2784 }
2785 }
2786 return REG_NOERROR;
2787}
2788
2789/* Helper functions for get_subexp(). */
2790
2791/* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2792 If it can arrive, register the sub expression expressed with SUB_TOP
2793 and SUB_LAST. */
2794
2795static reg_errcode_t
2796get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top,
2797 re_sub_match_last_t *sub_last, Idx bkref_node, Idx bkref_str)
2798{
2799 reg_errcode_t err;
2800 Idx to_idx;
2801 /* Can the subexpression arrive the back reference? */
2802 err = check_arrival (mctx, &sub_last->path, sub_last->node,
2803 sub_last->str_idx, bkref_node, bkref_str,
2804 OP_OPEN_SUBEXP);
2805 if (err != REG_NOERROR)
2806 return err;
2807 err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2808 sub_last->str_idx);
2809 if (BE (err != REG_NOERROR, 0))
2810 return err;
2811 to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2812 return clean_state_log_if_needed (mctx, to_idx);
2813}
2814
2815/* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2816 Search '(' if FL_OPEN, or search ')' otherwise.
2817 TODO: This function isn't efficient...
2818 Because there might be more than one nodes whose types are
2819 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2820 nodes.
2821 E.g. RE: (a){2} */
2822
2823static Idx
2824find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
2825 Idx subexp_idx, int type)
2826{
2827 Idx cls_idx;
2828 for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2829 {
2830 Idx cls_node = nodes->elems[cls_idx];
2831 const re_token_t *node = dfa->nodes + cls_node;
2832 if (node->type == type
2833 && node->opr.idx == subexp_idx)
2834 return cls_node;
2835 }
2836 return -1;
2837}
2838
2839/* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2840 LAST_NODE at LAST_STR. We record the path onto PATH since it will be
2841 heavily reused.
2842 Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise. */
2843
2844static reg_errcode_t
2845__attribute_warn_unused_result__
2846check_arrival (re_match_context_t *mctx, state_array_t *path, Idx top_node,
2847 Idx top_str, Idx last_node, Idx last_str, int type)
2848{
2849 const re_dfa_t *const dfa = mctx->dfa;
2850 reg_errcode_t err = REG_NOERROR;
2851 Idx subexp_num, backup_cur_idx, str_idx, null_cnt;
2852 re_dfastate_t *cur_state = NULL;
2853 re_node_set *cur_nodes, next_nodes;
2854 re_dfastate_t **backup_state_log;
2855 unsigned int context;
2856
2857 subexp_num = dfa->nodes[top_node].opr.idx;
2858 /* Extend the buffer if we need. */
2859 if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
2860 {
2861 re_dfastate_t **new_array;
2862 Idx old_alloc = path->alloc;
2863 Idx incr_alloc = last_str + mctx->max_mb_elem_len + 1;
2864 Idx new_alloc;
2865 if (BE (IDX_MAX - old_alloc < incr_alloc, 0))
2866 return REG_ESPACE;
2867 new_alloc = old_alloc + incr_alloc;
2868 if (BE (SIZE_MAX / sizeof (re_dfastate_t *) < new_alloc, 0))
2869 return REG_ESPACE;
2870 new_array = re_realloc (path->array, re_dfastate_t *, new_alloc);
2871 if (BE (new_array == NULL, 0))
2872 return REG_ESPACE;
2873 path->array = new_array;
2874 path->alloc = new_alloc;
2875 memset (new_array + old_alloc, '\0',
2876 sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2877 }
2878
2879 str_idx = path->next_idx ? path->next_idx : top_str;
2880
2881 /* Temporary modify MCTX. */
2882 backup_state_log = mctx->state_log;
2883 backup_cur_idx = mctx->input.cur_idx;
2884 mctx->state_log = path->array;
2885 mctx->input.cur_idx = str_idx;
2886
2887 /* Setup initial node set. */
2888 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2889 if (str_idx == top_str)
2890 {
2891 err = re_node_set_init_1 (&next_nodes, top_node);
2892 if (BE (err != REG_NOERROR, 0))
2893 return err;
2894 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2895 if (BE (err != REG_NOERROR, 0))
2896 {
2897 re_node_set_free (&next_nodes);
2898 return err;
2899 }
2900 }
2901 else
2902 {
2903 cur_state = mctx->state_log[str_idx];
2904 if (cur_state && cur_state->has_backref)
2905 {
2906 err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2907 if (BE (err != REG_NOERROR, 0))
2908 return err;
2909 }
2910 else
2911 re_node_set_init_empty (&next_nodes);
2912 }
2913 if (str_idx == top_str || (cur_state && cur_state->has_backref))
2914 {
2915 if (next_nodes.nelem)
2916 {
2917 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2918 subexp_num, type);
2919 if (BE (err != REG_NOERROR, 0))
2920 {
2921 re_node_set_free (&next_nodes);
2922 return err;
2923 }
2924 }
2925 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2926 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2927 {
2928 re_node_set_free (&next_nodes);
2929 return err;
2930 }
2931 mctx->state_log[str_idx] = cur_state;
2932 }
2933
2934 for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2935 {
2936 re_node_set_empty (&next_nodes);
2937 if (mctx->state_log[str_idx + 1])
2938 {
2939 err = re_node_set_merge (&next_nodes,
2940 &mctx->state_log[str_idx + 1]->nodes);
2941 if (BE (err != REG_NOERROR, 0))
2942 {
2943 re_node_set_free (&next_nodes);
2944 return err;
2945 }
2946 }
2947 if (cur_state)
2948 {
2949 err = check_arrival_add_next_nodes (mctx, str_idx,
2950 &cur_state->non_eps_nodes,
2951 &next_nodes);
2952 if (BE (err != REG_NOERROR, 0))
2953 {
2954 re_node_set_free (&next_nodes);
2955 return err;
2956 }
2957 }
2958 ++str_idx;
2959 if (next_nodes.nelem)
2960 {
2961 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2962 if (BE (err != REG_NOERROR, 0))
2963 {
2964 re_node_set_free (&next_nodes);
2965 return err;
2966 }
2967 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2968 subexp_num, type);
2969 if (BE (err != REG_NOERROR, 0))
2970 {
2971 re_node_set_free (&next_nodes);
2972 return err;
2973 }
2974 }
2975 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2976 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2977 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2978 {
2979 re_node_set_free (&next_nodes);
2980 return err;
2981 }
2982 mctx->state_log[str_idx] = cur_state;
2983 null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
2984 }
2985 re_node_set_free (&next_nodes);
2986 cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
2987 : &mctx->state_log[last_str]->nodes);
2988 path->next_idx = str_idx;
2989
2990 /* Fix MCTX. */
2991 mctx->state_log = backup_state_log;
2992 mctx->input.cur_idx = backup_cur_idx;
2993
2994 /* Then check the current node set has the node LAST_NODE. */
2995 if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
2996 return REG_NOERROR;
2997
2998 return REG_NOMATCH;
2999}
3000
3001/* Helper functions for check_arrival. */
3002
3003/* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
3004 to NEXT_NODES.
3005 TODO: This function is similar to the functions transit_state*(),
3006 however this function has many additional works.
3007 Can't we unify them? */
3008
3009static reg_errcode_t
3010__attribute_warn_unused_result__
3011check_arrival_add_next_nodes (re_match_context_t *mctx, Idx str_idx,
3012 re_node_set *cur_nodes, re_node_set *next_nodes)
3013{
3014 const re_dfa_t *const dfa = mctx->dfa;
3015 bool ok;
3016 Idx cur_idx;
3017#ifdef RE_ENABLE_I18N
3018 reg_errcode_t err = REG_NOERROR;
3019#endif
3020 re_node_set union_set;
3021 re_node_set_init_empty (&union_set);
3022 for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
3023 {
3024 int naccepted = 0;
3025 Idx cur_node = cur_nodes->elems[cur_idx];
3026#ifdef DEBUG
3027 re_token_type_t type = dfa->nodes[cur_node].type;
3028 assert (!IS_EPSILON_NODE (type));
3029#endif
3030#ifdef RE_ENABLE_I18N
3031 /* If the node may accept "multi byte". */
3032 if (dfa->nodes[cur_node].accept_mb)
3033 {
3034 naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
3035 str_idx);
3036 if (naccepted > 1)
3037 {
3038 re_dfastate_t *dest_state;
3039 Idx next_node = dfa->nexts[cur_node];
3040 Idx next_idx = str_idx + naccepted;
3041 dest_state = mctx->state_log[next_idx];
3042 re_node_set_empty (&union_set);
3043 if (dest_state)
3044 {
3045 err = re_node_set_merge (&union_set, &dest_state->nodes);
3046 if (BE (err != REG_NOERROR, 0))
3047 {
3048 re_node_set_free (&union_set);
3049 return err;
3050 }
3051 }
3052 ok = re_node_set_insert (&union_set, next_node);
3053 if (BE (! ok, 0))
3054 {
3055 re_node_set_free (&union_set);
3056 return REG_ESPACE;
3057 }
3058 mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3059 &union_set);
3060 if (BE (mctx->state_log[next_idx] == NULL
3061 && err != REG_NOERROR, 0))
3062 {
3063 re_node_set_free (&union_set);
3064 return err;
3065 }
3066 }
3067 }
3068#endif /* RE_ENABLE_I18N */
3069 if (naccepted
3070 || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3071 {
3072 ok = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3073 if (BE (! ok, 0))
3074 {
3075 re_node_set_free (&union_set);
3076 return REG_ESPACE;
3077 }
3078 }
3079 }
3080 re_node_set_free (&union_set);
3081 return REG_NOERROR;
3082}
3083
3084/* For all the nodes in CUR_NODES, add the epsilon closures of them to
3085 CUR_NODES, however exclude the nodes which are:
3086 - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3087 - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3088*/
3089
3090static reg_errcode_t
3091check_arrival_expand_ecl (const re_dfa_t *dfa, re_node_set *cur_nodes,
3092 Idx ex_subexp, int type)
3093{
3094 reg_errcode_t err;
3095 Idx idx, outside_node;
3096 re_node_set new_nodes;
3097#ifdef DEBUG
3098 assert (cur_nodes->nelem);
3099#endif
3100 err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3101 if (BE (err != REG_NOERROR, 0))
3102 return err;
3103 /* Create a new node set NEW_NODES with the nodes which are epsilon
3104 closures of the node in CUR_NODES. */
3105
3106 for (idx = 0; idx < cur_nodes->nelem; ++idx)
3107 {
3108 Idx cur_node = cur_nodes->elems[idx];
3109 const re_node_set *eclosure = dfa->eclosures + cur_node;
3110 outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3111 if (outside_node == -1)
3112 {
3113 /* There are no problematic nodes, just merge them. */
3114 err = re_node_set_merge (&new_nodes, eclosure);
3115 if (BE (err != REG_NOERROR, 0))
3116 {
3117 re_node_set_free (&new_nodes);
3118 return err;
3119 }
3120 }
3121 else
3122 {
3123 /* There are problematic nodes, re-calculate incrementally. */
3124 err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3125 ex_subexp, type);
3126 if (BE (err != REG_NOERROR, 0))
3127 {
3128 re_node_set_free (&new_nodes);
3129 return err;
3130 }
3131 }
3132 }
3133 re_node_set_free (cur_nodes);
3134 *cur_nodes = new_nodes;
3135 return REG_NOERROR;
3136}
3137
3138/* Helper function for check_arrival_expand_ecl.
3139 Check incrementally the epsilon closure of TARGET, and if it isn't
3140 problematic append it to DST_NODES. */
3141
3142static reg_errcode_t
3143__attribute_warn_unused_result__
3144check_arrival_expand_ecl_sub (const re_dfa_t *dfa, re_node_set *dst_nodes,
3145 Idx target, Idx ex_subexp, int type)
3146{
3147 Idx cur_node;
3148 for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3149 {
3150 bool ok;
3151
3152 if (dfa->nodes[cur_node].type == type
3153 && dfa->nodes[cur_node].opr.idx == ex_subexp)
3154 {
3155 if (type == OP_CLOSE_SUBEXP)
3156 {
3157 ok = re_node_set_insert (dst_nodes, cur_node);
3158 if (BE (! ok, 0))
3159 return REG_ESPACE;
3160 }
3161 break;
3162 }
3163 ok = re_node_set_insert (dst_nodes, cur_node);
3164 if (BE (! ok, 0))
3165 return REG_ESPACE;
3166 if (dfa->edests[cur_node].nelem == 0)
3167 break;
3168 if (dfa->edests[cur_node].nelem == 2)
3169 {
3170 reg_errcode_t err;
3171 err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3172 dfa->edests[cur_node].elems[1],
3173 ex_subexp, type);
3174 if (BE (err != REG_NOERROR, 0))
3175 return err;
3176 }
3177 cur_node = dfa->edests[cur_node].elems[0];
3178 }
3179 return REG_NOERROR;
3180}
3181
3182
3183/* For all the back references in the current state, calculate the
3184 destination of the back references by the appropriate entry
3185 in MCTX->BKREF_ENTS. */
3186
3187static reg_errcode_t
3188__attribute_warn_unused_result__
3189expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes,
3190 Idx cur_str, Idx subexp_num, int type)
3191{
3192 const re_dfa_t *const dfa = mctx->dfa;
3193 reg_errcode_t err;
3194 Idx cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3195 struct re_backref_cache_entry *ent;
3196
3197 if (cache_idx_start == -1)
3198 return REG_NOERROR;
3199
3200 restart:
3201 ent = mctx->bkref_ents + cache_idx_start;
3202 do
3203 {
3204 Idx to_idx, next_node;
3205
3206 /* Is this entry ENT is appropriate? */
3207 if (!re_node_set_contains (cur_nodes, ent->node))
3208 continue; /* No. */
3209
3210 to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3211 /* Calculate the destination of the back reference, and append it
3212 to MCTX->STATE_LOG. */
3213 if (to_idx == cur_str)
3214 {
3215 /* The backreference did epsilon transit, we must re-check all the
3216 node in the current state. */
3217 re_node_set new_dests;
3218 reg_errcode_t err2, err3;
3219 next_node = dfa->edests[ent->node].elems[0];
3220 if (re_node_set_contains (cur_nodes, next_node))
3221 continue;
3222 err = re_node_set_init_1 (&new_dests, next_node);
3223 err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3224 err3 = re_node_set_merge (cur_nodes, &new_dests);
3225 re_node_set_free (&new_dests);
3226 if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3227 || err3 != REG_NOERROR, 0))
3228 {
3229 err = (err != REG_NOERROR ? err
3230 : (err2 != REG_NOERROR ? err2 : err3));
3231 return err;
3232 }
3233 /* TODO: It is still inefficient... */
3234 goto restart;
3235 }
3236 else
3237 {
3238 re_node_set union_set;
3239 next_node = dfa->nexts[ent->node];
3240 if (mctx->state_log[to_idx])
3241 {
3242 bool ok;
3243 if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3244 next_node))
3245 continue;
3246 err = re_node_set_init_copy (&union_set,
3247 &mctx->state_log[to_idx]->nodes);
3248 ok = re_node_set_insert (&union_set, next_node);
3249 if (BE (err != REG_NOERROR || ! ok, 0))
3250 {
3251 re_node_set_free (&union_set);
3252 err = err != REG_NOERROR ? err : REG_ESPACE;
3253 return err;
3254 }
3255 }
3256 else
3257 {
3258 err = re_node_set_init_1 (&union_set, next_node);
3259 if (BE (err != REG_NOERROR, 0))
3260 return err;
3261 }
3262 mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3263 re_node_set_free (&union_set);
3264 if (BE (mctx->state_log[to_idx] == NULL
3265 && err != REG_NOERROR, 0))
3266 return err;
3267 }
3268 }
3269 while (ent++->more);
3270 return REG_NOERROR;
3271}
3272
3273/* Build transition table for the state.
3274 Return true if successful. */
3275
3276static bool
3277build_trtable (const re_dfa_t *dfa, re_dfastate_t *state)
3278{
3279 reg_errcode_t err;
3280 Idx i, j;
3281 int ch;
3282 bool need_word_trtable = false;
3283 bitset_word_t elem, mask;
3284 bool dests_node_malloced = false;
3285 bool dest_states_malloced = false;
3286 Idx ndests; /* Number of the destination states from 'state'. */
3287 re_dfastate_t **trtable;
3288 re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3289 re_node_set follows, *dests_node;
3290 bitset_t *dests_ch;
3291 bitset_t acceptable;
3292
3293 struct dests_alloc
3294 {
3295 re_node_set dests_node[SBC_MAX];
3296 bitset_t dests_ch[SBC_MAX];
3297 } *dests_alloc;
3298
3299 /* We build DFA states which corresponds to the destination nodes
3300 from 'state'. 'dests_node[i]' represents the nodes which i-th
3301 destination state contains, and 'dests_ch[i]' represents the
3302 characters which i-th destination state accepts. */
3303 if (__libc_use_alloca (sizeof (struct dests_alloc)))
3304 dests_alloc = (struct dests_alloc *) alloca (sizeof (struct dests_alloc));
3305 else
3306 {
3307 dests_alloc = re_malloc (struct dests_alloc, 1);
3308 if (BE (dests_alloc == NULL, 0))
3309 return false;
3310 dests_node_malloced = true;
3311 }
3312 dests_node = dests_alloc->dests_node;
3313 dests_ch = dests_alloc->dests_ch;
3314
3315 /* Initialize transition table. */
3316 state->word_trtable = state->trtable = NULL;
3317
3318 /* At first, group all nodes belonging to 'state' into several
3319 destinations. */
3320 ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3321 if (BE (ndests <= 0, 0))
3322 {
3323 if (dests_node_malloced)
3324 re_free (dests_alloc);
3325 /* Return false in case of an error, true otherwise. */
3326 if (ndests == 0)
3327 {
3328 state->trtable = (re_dfastate_t **)
3329 calloc (sizeof (re_dfastate_t *), SBC_MAX);
3330 if (BE (state->trtable == NULL, 0))
3331 return false;
3332 return true;
3333 }
3334 return false;
3335 }
3336
3337 err = re_node_set_alloc (&follows, ndests + 1);
3338 if (BE (err != REG_NOERROR, 0))
3339 goto out_free;
3340
3341 /* Avoid arithmetic overflow in size calculation. */
3342 if (BE ((((SIZE_MAX - (sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX)
3343 / (3 * sizeof (re_dfastate_t *)))
3344 < ndests),
3345 0))
3346 goto out_free;
3347
3348 if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX
3349 + ndests * 3 * sizeof (re_dfastate_t *)))
3350 dest_states = (re_dfastate_t **)
3351 alloca (ndests * 3 * sizeof (re_dfastate_t *));
3352 else
3353 {
3354 dest_states = re_malloc (re_dfastate_t *, ndests * 3);
3355 if (BE (dest_states == NULL, 0))
3356 {
3357out_free:
3358 if (dest_states_malloced)
3359 re_free (dest_states);
3360 re_node_set_free (&follows);
3361 for (i = 0; i < ndests; ++i)
3362 re_node_set_free (dests_node + i);
3363 if (dests_node_malloced)
3364 re_free (dests_alloc);
3365 return false;
3366 }
3367 dest_states_malloced = true;
3368 }
3369 dest_states_word = dest_states + ndests;
3370 dest_states_nl = dest_states_word + ndests;
3371 bitset_empty (acceptable);
3372
3373 /* Then build the states for all destinations. */
3374 for (i = 0; i < ndests; ++i)
3375 {
3376 Idx next_node;
3377 re_node_set_empty (&follows);
3378 /* Merge the follows of this destination states. */
3379 for (j = 0; j < dests_node[i].nelem; ++j)
3380 {
3381 next_node = dfa->nexts[dests_node[i].elems[j]];
3382 if (next_node != -1)
3383 {
3384 err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3385 if (BE (err != REG_NOERROR, 0))
3386 goto out_free;
3387 }
3388 }
3389 dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3390 if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
3391 goto out_free;
3392 /* If the new state has context constraint,
3393 build appropriate states for these contexts. */
3394 if (dest_states[i]->has_constraint)
3395 {
3396 dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3397 CONTEXT_WORD);
3398 if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
3399 goto out_free;
3400
3401 if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
3402 need_word_trtable = true;
3403
3404 dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3405 CONTEXT_NEWLINE);
3406 if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
3407 goto out_free;
3408 }
3409 else
3410 {
3411 dest_states_word[i] = dest_states[i];
3412 dest_states_nl[i] = dest_states[i];
3413 }
3414 bitset_merge (acceptable, dests_ch[i]);
3415 }
3416
3417 if (!BE (need_word_trtable, 0))
3418 {
3419 /* We don't care about whether the following character is a word
3420 character, or we are in a single-byte character set so we can
3421 discern by looking at the character code: allocate a
3422 256-entry transition table. */
3423 trtable = state->trtable =
3424 (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
3425 if (BE (trtable == NULL, 0))
3426 goto out_free;
3427
3428 /* For all characters ch...: */
3429 for (i = 0; i < BITSET_WORDS; ++i)
3430 for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3431 elem;
3432 mask <<= 1, elem >>= 1, ++ch)
3433 if (BE (elem & 1, 0))
3434 {
3435 /* There must be exactly one destination which accepts
3436 character ch. See group_nodes_into_DFAstates. */
3437 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3438 ;
3439
3440 /* j-th destination accepts the word character ch. */
3441 if (dfa->word_char[i] & mask)
3442 trtable[ch] = dest_states_word[j];
3443 else
3444 trtable[ch] = dest_states[j];
3445 }
3446 }
3447 else
3448 {
3449 /* We care about whether the following character is a word
3450 character, and we are in a multi-byte character set: discern
3451 by looking at the character code: build two 256-entry
3452 transition tables, one starting at trtable[0] and one
3453 starting at trtable[SBC_MAX]. */
3454 trtable = state->word_trtable =
3455 (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX);
3456 if (BE (trtable == NULL, 0))
3457 goto out_free;
3458
3459 /* For all characters ch...: */
3460 for (i = 0; i < BITSET_WORDS; ++i)
3461 for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3462 elem;
3463 mask <<= 1, elem >>= 1, ++ch)
3464 if (BE (elem & 1, 0))
3465 {
3466 /* There must be exactly one destination which accepts
3467 character ch. See group_nodes_into_DFAstates. */
3468 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3469 ;
3470
3471 /* j-th destination accepts the word character ch. */
3472 trtable[ch] = dest_states[j];
3473 trtable[ch + SBC_MAX] = dest_states_word[j];
3474 }
3475 }
3476
3477 /* new line */
3478 if (bitset_contain (acceptable, NEWLINE_CHAR))
3479 {
3480 /* The current state accepts newline character. */
3481 for (j = 0; j < ndests; ++j)
3482 if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3483 {
3484 /* k-th destination accepts newline character. */
3485 trtable[NEWLINE_CHAR] = dest_states_nl[j];
3486 if (need_word_trtable)
3487 trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3488 /* There must be only one destination which accepts
3489 newline. See group_nodes_into_DFAstates. */
3490 break;
3491 }
3492 }
3493
3494 if (dest_states_malloced)
3495 re_free (dest_states);
3496
3497 re_node_set_free (&follows);
3498 for (i = 0; i < ndests; ++i)
3499 re_node_set_free (dests_node + i);
3500
3501 if (dests_node_malloced)
3502 re_free (dests_alloc);
3503
3504 return true;
3505}
3506
3507/* Group all nodes belonging to STATE into several destinations.
3508 Then for all destinations, set the nodes belonging to the destination
3509 to DESTS_NODE[i] and set the characters accepted by the destination
3510 to DEST_CH[i]. This function return the number of destinations. */
3511
3512static Idx
3513group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state,
3514 re_node_set *dests_node, bitset_t *dests_ch)
3515{
3516 reg_errcode_t err;
3517 bool ok;
3518 Idx i, j, k;
3519 Idx ndests; /* Number of the destinations from 'state'. */
3520 bitset_t accepts; /* Characters a node can accept. */
3521 const re_node_set *cur_nodes = &state->nodes;
3522 bitset_empty (accepts);
3523 ndests = 0;
3524
3525 /* For all the nodes belonging to 'state', */
3526 for (i = 0; i < cur_nodes->nelem; ++i)
3527 {
3528 re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3529 re_token_type_t type = node->type;
3530 unsigned int constraint = node->constraint;
3531
3532 /* Enumerate all single byte character this node can accept. */
3533 if (type == CHARACTER)
3534 bitset_set (accepts, node->opr.c);
3535 else if (type == SIMPLE_BRACKET)
3536 {
3537 bitset_merge (accepts, node->opr.sbcset);
3538 }
3539 else if (type == OP_PERIOD)
3540 {
3541#ifdef RE_ENABLE_I18N
3542 if (dfa->mb_cur_max > 1)
3543 bitset_merge (accepts, dfa->sb_char);
3544 else
3545#endif
3546 bitset_set_all (accepts);
3547 if (!(dfa->syntax & RE_DOT_NEWLINE))
3548 bitset_clear (accepts, '\n');
3549 if (dfa->syntax & RE_DOT_NOT_NULL)
3550 bitset_clear (accepts, '\0');
3551 }
3552#ifdef RE_ENABLE_I18N
3553 else if (type == OP_UTF8_PERIOD)
3554 {
3555 if (ASCII_CHARS % BITSET_WORD_BITS == 0)
3556 memset (accepts, -1, ASCII_CHARS / CHAR_BIT);
3557 else
3558 bitset_merge (accepts, utf8_sb_map);
3559 if (!(dfa->syntax & RE_DOT_NEWLINE))
3560 bitset_clear (accepts, '\n');
3561 if (dfa->syntax & RE_DOT_NOT_NULL)
3562 bitset_clear (accepts, '\0');
3563 }
3564#endif
3565 else
3566 continue;
3567
3568 /* Check the 'accepts' and sift the characters which are not
3569 match it the context. */
3570 if (constraint)
3571 {
3572 if (constraint & NEXT_NEWLINE_CONSTRAINT)
3573 {
3574 bool accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3575 bitset_empty (accepts);
3576 if (accepts_newline)
3577 bitset_set (accepts, NEWLINE_CHAR);
3578 else
3579 continue;
3580 }
3581 if (constraint & NEXT_ENDBUF_CONSTRAINT)
3582 {
3583 bitset_empty (accepts);
3584 continue;
3585 }
3586
3587 if (constraint & NEXT_WORD_CONSTRAINT)
3588 {
3589 bitset_word_t any_set = 0;
3590 if (type == CHARACTER && !node->word_char)
3591 {
3592 bitset_empty (accepts);
3593 continue;
3594 }
3595#ifdef RE_ENABLE_I18N
3596 if (dfa->mb_cur_max > 1)
3597 for (j = 0; j < BITSET_WORDS; ++j)
3598 any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3599 else
3600#endif
3601 for (j = 0; j < BITSET_WORDS; ++j)
3602 any_set |= (accepts[j] &= dfa->word_char[j]);
3603 if (!any_set)
3604 continue;
3605 }
3606 if (constraint & NEXT_NOTWORD_CONSTRAINT)
3607 {
3608 bitset_word_t any_set = 0;
3609 if (type == CHARACTER && node->word_char)
3610 {
3611 bitset_empty (accepts);
3612 continue;
3613 }
3614#ifdef RE_ENABLE_I18N
3615 if (dfa->mb_cur_max > 1)
3616 for (j = 0; j < BITSET_WORDS; ++j)
3617 any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3618 else
3619#endif
3620 for (j = 0; j < BITSET_WORDS; ++j)
3621 any_set |= (accepts[j] &= ~dfa->word_char[j]);
3622 if (!any_set)
3623 continue;
3624 }
3625 }
3626
3627 /* Then divide 'accepts' into DFA states, or create a new
3628 state. Above, we make sure that accepts is not empty. */
3629 for (j = 0; j < ndests; ++j)
3630 {
3631 bitset_t intersec; /* Intersection sets, see below. */
3632 bitset_t remains;
3633 /* Flags, see below. */
3634 bitset_word_t has_intersec, not_subset, not_consumed;
3635
3636 /* Optimization, skip if this state doesn't accept the character. */
3637 if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3638 continue;
3639
3640 /* Enumerate the intersection set of this state and 'accepts'. */
3641 has_intersec = 0;
3642 for (k = 0; k < BITSET_WORDS; ++k)
3643 has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3644 /* And skip if the intersection set is empty. */
3645 if (!has_intersec)
3646 continue;
3647
3648 /* Then check if this state is a subset of 'accepts'. */
3649 not_subset = not_consumed = 0;
3650 for (k = 0; k < BITSET_WORDS; ++k)
3651 {
3652 not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3653 not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3654 }
3655
3656 /* If this state isn't a subset of 'accepts', create a
3657 new group state, which has the 'remains'. */
3658 if (not_subset)
3659 {
3660 bitset_copy (dests_ch[ndests], remains);
3661 bitset_copy (dests_ch[j], intersec);
3662 err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3663 if (BE (err != REG_NOERROR, 0))
3664 goto error_return;
3665 ++ndests;
3666 }
3667
3668 /* Put the position in the current group. */
3669 ok = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3670 if (BE (! ok, 0))
3671 goto error_return;
3672
3673 /* If all characters are consumed, go to next node. */
3674 if (!not_consumed)
3675 break;
3676 }
3677 /* Some characters remain, create a new group. */
3678 if (j == ndests)
3679 {
3680 bitset_copy (dests_ch[ndests], accepts);
3681 err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3682 if (BE (err != REG_NOERROR, 0))
3683 goto error_return;
3684 ++ndests;
3685 bitset_empty (accepts);
3686 }
3687 }
3688 return ndests;
3689 error_return:
3690 for (j = 0; j < ndests; ++j)
3691 re_node_set_free (dests_node + j);
3692 return -1;
3693}
3694
3695#ifdef RE_ENABLE_I18N
3696/* Check how many bytes the node 'dfa->nodes[node_idx]' accepts.
3697 Return the number of the bytes the node accepts.
3698 STR_IDX is the current index of the input string.
3699
3700 This function handles the nodes which can accept one character, or
3701 one collating element like '.', '[a-z]', opposite to the other nodes
3702 can only accept one byte. */
3703
3704# ifdef _LIBC
3705# include <locale/weight.h>
3706# endif
3707
3708static int
3709check_node_accept_bytes (const re_dfa_t *dfa, Idx node_idx,
3710 const re_string_t *input, Idx str_idx)
3711{
3712 const re_token_t *node = dfa->nodes + node_idx;
3713 int char_len, elem_len;
3714 Idx i;
3715
3716 if (BE (node->type == OP_UTF8_PERIOD, 0))
3717 {
3718 unsigned char c = re_string_byte_at (input, str_idx), d;
3719 if (BE (c < 0xc2, 1))
3720 return 0;
3721
3722 if (str_idx + 2 > input->len)
3723 return 0;
3724
3725 d = re_string_byte_at (input, str_idx + 1);
3726 if (c < 0xe0)
3727 return (d < 0x80 || d > 0xbf) ? 0 : 2;
3728 else if (c < 0xf0)
3729 {
3730 char_len = 3;
3731 if (c == 0xe0 && d < 0xa0)
3732 return 0;
3733 }
3734 else if (c < 0xf8)
3735 {
3736 char_len = 4;
3737 if (c == 0xf0 && d < 0x90)
3738 return 0;
3739 }
3740 else if (c < 0xfc)
3741 {
3742 char_len = 5;
3743 if (c == 0xf8 && d < 0x88)
3744 return 0;
3745 }
3746 else if (c < 0xfe)
3747 {
3748 char_len = 6;
3749 if (c == 0xfc && d < 0x84)
3750 return 0;
3751 }
3752 else
3753 return 0;
3754
3755 if (str_idx + char_len > input->len)
3756 return 0;
3757
3758 for (i = 1; i < char_len; ++i)
3759 {
3760 d = re_string_byte_at (input, str_idx + i);
3761 if (d < 0x80 || d > 0xbf)
3762 return 0;
3763 }
3764 return char_len;
3765 }
3766
3767 char_len = re_string_char_size_at (input, str_idx);
3768 if (node->type == OP_PERIOD)
3769 {
3770 if (char_len <= 1)
3771 return 0;
3772 /* FIXME: I don't think this if is needed, as both '\n'
3773 and '\0' are char_len == 1. */
3774 /* '.' accepts any one character except the following two cases. */
3775 if ((!(dfa->syntax & RE_DOT_NEWLINE) &&
3776 re_string_byte_at (input, str_idx) == '\n') ||
3777 ((dfa->syntax & RE_DOT_NOT_NULL) &&
3778 re_string_byte_at (input, str_idx) == '\0'))
3779 return 0;
3780 return char_len;
3781 }
3782
3783 elem_len = re_string_elem_size_at (input, str_idx);
3784 if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
3785 return 0;
3786
3787 if (node->type == COMPLEX_BRACKET)
3788 {
3789 const re_charset_t *cset = node->opr.mbcset;
3790# ifdef _LIBC
3791 const unsigned char *pin
3792 = ((const unsigned char *) re_string_get_buffer (input) + str_idx);
3793 Idx j;
3794 uint32_t nrules;
3795# endif /* _LIBC */
3796 int match_len = 0;
3797 wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3798 ? re_string_wchar_at (input, str_idx) : 0);
3799
3800 /* match with multibyte character? */
3801 for (i = 0; i < cset->nmbchars; ++i)
3802 if (wc == cset->mbchars[i])
3803 {
3804 match_len = char_len;
3805 goto check_node_accept_bytes_match;
3806 }
3807 /* match with character_class? */
3808 for (i = 0; i < cset->nchar_classes; ++i)
3809 {
3810 wctype_t wt = cset->char_classes[i];
3811 if (__iswctype (wc, wt))
3812 {
3813 match_len = char_len;
3814 goto check_node_accept_bytes_match;
3815 }
3816 }
3817
3818# ifdef _LIBC
3819 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3820 if (nrules != 0)
3821 {
3822 unsigned int in_collseq = 0;
3823 const int32_t *table, *indirect;
3824 const unsigned char *weights, *extra;
3825 const char *collseqwc;
3826
3827 /* match with collating_symbol? */
3828 if (cset->ncoll_syms)
3829 extra = (const unsigned char *)
3830 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3831 for (i = 0; i < cset->ncoll_syms; ++i)
3832 {
3833 const unsigned char *coll_sym = extra + cset->coll_syms[i];
3834 /* Compare the length of input collating element and
3835 the length of current collating element. */
3836 if (*coll_sym != elem_len)
3837 continue;
3838 /* Compare each bytes. */
3839 for (j = 0; j < *coll_sym; j++)
3840 if (pin[j] != coll_sym[1 + j])
3841 break;
3842 if (j == *coll_sym)
3843 {
3844 /* Match if every bytes is equal. */
3845 match_len = j;
3846 goto check_node_accept_bytes_match;
3847 }
3848 }
3849
3850 if (cset->nranges)
3851 {
3852 if (elem_len <= char_len)
3853 {
3854 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3855 in_collseq = __collseq_table_lookup (collseqwc, wc);
3856 }
3857 else
3858 in_collseq = find_collation_sequence_value (pin, elem_len);
3859 }
3860 /* match with range expression? */
3861 /* FIXME: Implement rational ranges here, too. */
3862 for (i = 0; i < cset->nranges; ++i)
3863 if (cset->range_starts[i] <= in_collseq
3864 && in_collseq <= cset->range_ends[i])
3865 {
3866 match_len = elem_len;
3867 goto check_node_accept_bytes_match;
3868 }
3869
3870 /* match with equivalence_class? */
3871 if (cset->nequiv_classes)
3872 {
3873 const unsigned char *cp = pin;
3874 table = (const int32_t *)
3875 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3876 weights = (const unsigned char *)
3877 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3878 extra = (const unsigned char *)
3879 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3880 indirect = (const int32_t *)
3881 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3882 int32_t idx = findidx (table, indirect, extra, &cp, elem_len);
3883 int32_t rule = idx >> 24;
3884 idx &= 0xffffff;
3885 if (idx > 0)
3886 {
3887 size_t weight_len = weights[idx];
3888 for (i = 0; i < cset->nequiv_classes; ++i)
3889 {
3890 int32_t equiv_class_idx = cset->equiv_classes[i];
3891 int32_t equiv_class_rule = equiv_class_idx >> 24;
3892 equiv_class_idx &= 0xffffff;
3893 if (weights[equiv_class_idx] == weight_len
3894 && equiv_class_rule == rule
3895 && memcmp (weights + idx + 1,
3896 weights + equiv_class_idx + 1,
3897 weight_len) == 0)
3898 {
3899 match_len = elem_len;
3900 goto check_node_accept_bytes_match;
3901 }
3902 }
3903 }
3904 }
3905 }
3906 else
3907# endif /* _LIBC */
3908 {
3909 /* match with range expression? */
3910 for (i = 0; i < cset->nranges; ++i)
3911 {
3912 if (cset->range_starts[i] <= wc && wc <= cset->range_ends[i])
3913 {
3914 match_len = char_len;
3915 goto check_node_accept_bytes_match;
3916 }
3917 }
3918 }
3919 check_node_accept_bytes_match:
3920 if (!cset->non_match)
3921 return match_len;
3922 else
3923 {
3924 if (match_len > 0)
3925 return 0;
3926 else
3927 return (elem_len > char_len) ? elem_len : char_len;
3928 }
3929 }
3930 return 0;
3931}
3932
3933# ifdef _LIBC
3934static unsigned int
3935find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len)
3936{
3937 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3938 if (nrules == 0)
3939 {
3940 if (mbs_len == 1)
3941 {
3942 /* No valid character. Match it as a single byte character. */
3943 const unsigned char *collseq = (const unsigned char *)
3944 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3945 return collseq[mbs[0]];
3946 }
3947 return UINT_MAX;
3948 }
3949 else
3950 {
3951 int32_t idx;
3952 const unsigned char *extra = (const unsigned char *)
3953 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3954 int32_t extrasize = (const unsigned char *)
3955 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
3956
3957 for (idx = 0; idx < extrasize;)
3958 {
3959 int mbs_cnt;
3960 bool found = false;
3961 int32_t elem_mbs_len;
3962 /* Skip the name of collating element name. */
3963 idx = idx + extra[idx] + 1;
3964 elem_mbs_len = extra[idx++];
3965 if (mbs_len == elem_mbs_len)
3966 {
3967 for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
3968 if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
3969 break;
3970 if (mbs_cnt == elem_mbs_len)
3971 /* Found the entry. */
3972 found = true;
3973 }
3974 /* Skip the byte sequence of the collating element. */
3975 idx += elem_mbs_len;
3976 /* Adjust for the alignment. */
3977 idx = (idx + 3) & ~3;
3978 /* Skip the collation sequence value. */
3979 idx += sizeof (uint32_t);
3980 /* Skip the wide char sequence of the collating element. */
3981 idx = idx + sizeof (uint32_t) * (*(int32_t *) (extra + idx) + 1);
3982 /* If we found the entry, return the sequence value. */
3983 if (found)
3984 return *(uint32_t *) (extra + idx);
3985 /* Skip the collation sequence value. */
3986 idx += sizeof (uint32_t);
3987 }
3988 return UINT_MAX;
3989 }
3990}
3991# endif /* _LIBC */
3992#endif /* RE_ENABLE_I18N */
3993
3994/* Check whether the node accepts the byte which is IDX-th
3995 byte of the INPUT. */
3996
3997static bool
3998check_node_accept (const re_match_context_t *mctx, const re_token_t *node,
3999 Idx idx)
4000{
4001 unsigned char ch;
4002 ch = re_string_byte_at (&mctx->input, idx);
4003 switch (node->type)
4004 {
4005 case CHARACTER:
4006 if (node->opr.c != ch)
4007 return false;
4008 break;
4009
4010 case SIMPLE_BRACKET:
4011 if (!bitset_contain (node->opr.sbcset, ch))
4012 return false;
4013 break;
4014
4015#ifdef RE_ENABLE_I18N
4016 case OP_UTF8_PERIOD:
4017 if (ch >= ASCII_CHARS)
4018 return false;
4019 FALLTHROUGH;
4020#endif
4021 case OP_PERIOD:
4022 if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE))
4023 || (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL)))
4024 return false;
4025 break;
4026
4027 default:
4028 return false;
4029 }
4030
4031 if (node->constraint)
4032 {
4033 /* The node has constraints. Check whether the current context
4034 satisfies the constraints. */
4035 unsigned int context = re_string_context_at (&mctx->input, idx,
4036 mctx->eflags);
4037 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
4038 return false;
4039 }
4040
4041 return true;
4042}
4043
4044/* Extend the buffers, if the buffers have run out. */
4045
4046static reg_errcode_t
4047__attribute_warn_unused_result__
4048extend_buffers (re_match_context_t *mctx, int min_len)
4049{
4050 reg_errcode_t ret;
4051 re_string_t *pstr = &mctx->input;
4052
4053 /* Avoid overflow. */
4054 if (BE (MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *)) / 2
4055 <= pstr->bufs_len, 0))
4056 return REG_ESPACE;
4057
4058 /* Double the lengths of the buffers, but allocate at least MIN_LEN. */
4059 ret = re_string_realloc_buffers (pstr,
4060 MAX (min_len,
4061 MIN (pstr->len, pstr->bufs_len * 2)));
4062 if (BE (ret != REG_NOERROR, 0))
4063 return ret;
4064
4065 if (mctx->state_log != NULL)
4066 {
4067 /* And double the length of state_log. */
4068 /* XXX We have no indication of the size of this buffer. If this
4069 allocation fail we have no indication that the state_log array
4070 does not have the right size. */
4071 re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
4072 pstr->bufs_len + 1);
4073 if (BE (new_array == NULL, 0))
4074 return REG_ESPACE;
4075 mctx->state_log = new_array;
4076 }
4077
4078 /* Then reconstruct the buffers. */
4079 if (pstr->icase)
4080 {
4081#ifdef RE_ENABLE_I18N
4082 if (pstr->mb_cur_max > 1)
4083 {
4084 ret = build_wcs_upper_buffer (pstr);
4085 if (BE (ret != REG_NOERROR, 0))
4086 return ret;
4087 }
4088 else
4089#endif /* RE_ENABLE_I18N */
4090 build_upper_buffer (pstr);
4091 }
4092 else
4093 {
4094#ifdef RE_ENABLE_I18N
4095 if (pstr->mb_cur_max > 1)
4096 build_wcs_buffer (pstr);
4097 else
4098#endif /* RE_ENABLE_I18N */
4099 {
4100 if (pstr->trans != NULL)
4101 re_string_translate_buffer (pstr);
4102 }
4103 }
4104 return REG_NOERROR;
4105}
4106
4107
4108/* Functions for matching context. */
4109
4110/* Initialize MCTX. */
4111
4112static reg_errcode_t
4113__attribute_warn_unused_result__
4114match_ctx_init (re_match_context_t *mctx, int eflags, Idx n)
4115{
4116 mctx->eflags = eflags;
4117 mctx->match_last = -1;
4118 if (n > 0)
4119 {
4120 /* Avoid overflow. */
4121 size_t max_object_size =
4122 MAX (sizeof (struct re_backref_cache_entry),
4123 sizeof (re_sub_match_top_t *));
4124 if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) < n, 0))
4125 return REG_ESPACE;
4126
4127 mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4128 mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4129 if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
4130 return REG_ESPACE;
4131 }
4132 /* Already zero-ed by the caller.
4133 else
4134 mctx->bkref_ents = NULL;
4135 mctx->nbkref_ents = 0;
4136 mctx->nsub_tops = 0; */
4137 mctx->abkref_ents = n;
4138 mctx->max_mb_elem_len = 1;
4139 mctx->asub_tops = n;
4140 return REG_NOERROR;
4141}
4142
4143/* Clean the entries which depend on the current input in MCTX.
4144 This function must be invoked when the matcher changes the start index
4145 of the input, or changes the input string. */
4146
4147static void
4148match_ctx_clean (re_match_context_t *mctx)
4149{
4150 Idx st_idx;
4151 for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4152 {
4153 Idx sl_idx;
4154 re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4155 for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4156 {
4157 re_sub_match_last_t *last = top->lasts[sl_idx];
4158 re_free (last->path.array);
4159 re_free (last);
4160 }
4161 re_free (top->lasts);
4162 if (top->path)
4163 {
4164 re_free (top->path->array);
4165 re_free (top->path);
4166 }
4167 re_free (top);
4168 }
4169
4170 mctx->nsub_tops = 0;
4171 mctx->nbkref_ents = 0;
4172}
4173
4174/* Free all the memory associated with MCTX. */
4175
4176static void
4177match_ctx_free (re_match_context_t *mctx)
4178{
4179 /* First, free all the memory associated with MCTX->SUB_TOPS. */
4180 match_ctx_clean (mctx);
4181 re_free (mctx->sub_tops);
4182 re_free (mctx->bkref_ents);
4183}
4184
4185/* Add a new backreference entry to MCTX.
4186 Note that we assume that caller never call this function with duplicate
4187 entry, and call with STR_IDX which isn't smaller than any existing entry.
4188*/
4189
4190static reg_errcode_t
4191__attribute_warn_unused_result__
4192match_ctx_add_entry (re_match_context_t *mctx, Idx node, Idx str_idx, Idx from,
4193 Idx to)
4194{
4195 if (mctx->nbkref_ents >= mctx->abkref_ents)
4196 {
4197 struct re_backref_cache_entry* new_entry;
4198 new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4199 mctx->abkref_ents * 2);
4200 if (BE (new_entry == NULL, 0))
4201 {
4202 re_free (mctx->bkref_ents);
4203 return REG_ESPACE;
4204 }
4205 mctx->bkref_ents = new_entry;
4206 memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4207 sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4208 mctx->abkref_ents *= 2;
4209 }
4210 if (mctx->nbkref_ents > 0
4211 && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4212 mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4213
4214 mctx->bkref_ents[mctx->nbkref_ents].node = node;
4215 mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4216 mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4217 mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4218
4219 /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4220 If bit N is clear, means that this entry won't epsilon-transition to
4221 an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression. If
4222 it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4223 such node.
4224
4225 A backreference does not epsilon-transition unless it is empty, so set
4226 to all zeros if FROM != TO. */
4227 mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4228 = (from == to ? -1 : 0);
4229
4230 mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4231 if (mctx->max_mb_elem_len < to - from)
4232 mctx->max_mb_elem_len = to - from;
4233 return REG_NOERROR;
4234}
4235
4236/* Return the first entry with the same str_idx, or -1 if none is
4237 found. Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX. */
4238
4239static Idx
4240search_cur_bkref_entry (const re_match_context_t *mctx, Idx str_idx)
4241{
4242 Idx left, right, mid, last;
4243 last = right = mctx->nbkref_ents;
4244 for (left = 0; left < right;)
4245 {
4246 mid = (left + right) / 2;
4247 if (mctx->bkref_ents[mid].str_idx < str_idx)
4248 left = mid + 1;
4249 else
4250 right = mid;
4251 }
4252 if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4253 return left;
4254 else
4255 return -1;
4256}
4257
4258/* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4259 at STR_IDX. */
4260
4261static reg_errcode_t
4262__attribute_warn_unused_result__
4263match_ctx_add_subtop (re_match_context_t *mctx, Idx node, Idx str_idx)
4264{
4265#ifdef DEBUG
4266 assert (mctx->sub_tops != NULL);
4267 assert (mctx->asub_tops > 0);
4268#endif
4269 if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
4270 {
4271 Idx new_asub_tops = mctx->asub_tops * 2;
4272 re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4273 re_sub_match_top_t *,
4274 new_asub_tops);
4275 if (BE (new_array == NULL, 0))
4276 return REG_ESPACE;
4277 mctx->sub_tops = new_array;
4278 mctx->asub_tops = new_asub_tops;
4279 }
4280 mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
4281 if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
4282 return REG_ESPACE;
4283 mctx->sub_tops[mctx->nsub_tops]->node = node;
4284 mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4285 return REG_NOERROR;
4286}
4287
4288/* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4289 at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP. */
4290
4291static re_sub_match_last_t *
4292match_ctx_add_sublast (re_sub_match_top_t *subtop, Idx node, Idx str_idx)
4293{
4294 re_sub_match_last_t *new_entry;
4295 if (BE (subtop->nlasts == subtop->alasts, 0))
4296 {
4297 Idx new_alasts = 2 * subtop->alasts + 1;
4298 re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4299 re_sub_match_last_t *,
4300 new_alasts);
4301 if (BE (new_array == NULL, 0))
4302 return NULL;
4303 subtop->lasts = new_array;
4304 subtop->alasts = new_alasts;
4305 }
4306 new_entry = calloc (1, sizeof (re_sub_match_last_t));
4307 if (BE (new_entry != NULL, 1))
4308 {
4309 subtop->lasts[subtop->nlasts] = new_entry;
4310 new_entry->node = node;
4311 new_entry->str_idx = str_idx;
4312 ++subtop->nlasts;
4313 }
4314 return new_entry;
4315}
4316
4317static void
4318sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
4319 re_dfastate_t **limited_sts, Idx last_node, Idx last_str_idx)
4320{
4321 sctx->sifted_states = sifted_sts;
4322 sctx->limited_states = limited_sts;
4323 sctx->last_node = last_node;
4324 sctx->last_str_idx = last_str_idx;
4325 re_node_set_init_empty (&sctx->limits);
4326}
4327