1/* Copyright (C) 1995-2016 Free Software Foundation, Inc.
2 This file is part of the GNU C Library.
3 Written by Ulrich Drepper <drepper@gnu.org>, 1995.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <http://www.gnu.org/licenses/>. */
18
19#include <assert.h>
20#include <langinfo.h>
21#include <locale.h>
22#include <stddef.h>
23#include <stdint.h>
24#include <stdlib.h>
25#include <string.h>
26#include <sys/param.h>
27
28#ifndef STRING_TYPE
29# define STRING_TYPE char
30# define USTRING_TYPE unsigned char
31# define STRXFRM __strxfrm_l
32# define STRLEN strlen
33# define STPNCPY __stpncpy
34# define WEIGHT_H "../locale/weight.h"
35# define SUFFIX MB
36# define L(arg) arg
37#endif
38
39#define CONCAT(a,b) CONCAT1(a,b)
40#define CONCAT1(a,b) a##b
41
42/* Maximum string size that is calculated with cached indices. Right now this
43 is an arbitrary value open to optimizations. SMALL_STR_SIZE * 4 has to be
44 lower than __MAX_ALLOCA_CUTOFF. Keep localedata/xfrm-test.c in sync. */
45#define SMALL_STR_SIZE 4095
46
47#include "../locale/localeinfo.h"
48#include WEIGHT_H
49
50/* Group locale data for shorter parameter lists. */
51typedef struct
52{
53 uint_fast32_t nrules;
54 unsigned char *rulesets;
55 USTRING_TYPE *weights;
56 int32_t *table;
57 USTRING_TYPE *extra;
58 int32_t *indirect;
59} locale_data_t;
60
61#ifndef WIDE_CHAR_VERSION
62
63/* We need UTF-8 encoding of numbers. */
64static int
65utf8_encode (char *buf, int val)
66{
67 int retval;
68
69 if (val < 0x80)
70 {
71 *buf++ = (char) val;
72 retval = 1;
73 }
74 else
75 {
76 int step;
77
78 for (step = 2; step < 6; ++step)
79 if ((val & (~(uint32_t)0 << (5 * step + 1))) == 0)
80 break;
81 retval = step;
82
83 *buf = (unsigned char) (~0xff >> step);
84 --step;
85 do
86 {
87 buf[step] = 0x80 | (val & 0x3f);
88 val >>= 6;
89 }
90 while (--step > 0);
91 *buf |= val;
92 }
93
94 return retval;
95}
96#endif
97
98/* Find next weight and rule index. Inlined since called for every char. */
99static __always_inline size_t
100find_idx (const USTRING_TYPE **us, int32_t *weight_idx,
101 unsigned char *rule_idx, const locale_data_t *l_data, const int pass)
102{
103 int32_t tmp = findidx (l_data->table, l_data->indirect, l_data->extra, us,
104 -1);
105 *rule_idx = tmp >> 24;
106 int32_t idx = tmp & 0xffffff;
107 size_t len = l_data->weights[idx++];
108
109 /* Skip over indices of previous levels. */
110 for (int i = 0; i < pass; i++)
111 {
112 idx += len;
113 len = l_data->weights[idx++];
114 }
115
116 *weight_idx = idx;
117 return len;
118}
119
120static int
121find_position (const USTRING_TYPE *us, const locale_data_t *l_data,
122 const int pass)
123{
124 int32_t weight_idx;
125 unsigned char rule_idx;
126 const USTRING_TYPE *usrc = us;
127
128 find_idx (&usrc, &weight_idx, &rule_idx, l_data, pass);
129 return l_data->rulesets[rule_idx * l_data->nrules + pass] & sort_position;
130}
131
132/* Do the transformation. */
133static size_t
134do_xfrm (const USTRING_TYPE *usrc, STRING_TYPE *dest, size_t n,
135 const locale_data_t *l_data)
136{
137 int32_t weight_idx;
138 unsigned char rule_idx;
139 uint_fast32_t pass;
140 size_t needed = 0;
141 size_t last_needed;
142
143 /* Now the passes over the weights. */
144 for (pass = 0; pass < l_data->nrules; ++pass)
145 {
146 size_t backw_len = 0;
147 last_needed = needed;
148 const USTRING_TYPE *cur = usrc;
149 const USTRING_TYPE *backw_start = NULL;
150
151 /* We assume that if a rule has defined `position' in one section
152 this is true for all of them. */
153 int position = find_position (cur, l_data, pass);
154
155 if (position == 0)
156 {
157 while (*cur != L('\0'))
158 {
159 const USTRING_TYPE *pos = cur;
160 size_t len = find_idx (&cur, &weight_idx, &rule_idx, l_data,
161 pass);
162 int rule = l_data->rulesets[rule_idx * l_data->nrules + pass];
163
164 if ((rule & sort_forward) != 0)
165 {
166 /* Handle the pushed backward sequence. */
167 if (backw_start != NULL)
168 {
169 for (size_t i = backw_len; i > 0; )
170 {
171 int32_t weight_idx;
172 unsigned char rule_idx;
173 size_t len = find_idx (&backw_start, &weight_idx,
174 &rule_idx, l_data, pass);
175 if (needed + i < n)
176 for (size_t j = len; j > 0; j--)
177 dest[needed + i - j] =
178 l_data->weights[weight_idx++];
179
180 i -= len;
181 }
182
183 needed += backw_len;
184 backw_start = NULL;
185 backw_len = 0;
186 }
187
188 /* Now handle the forward element. */
189 if (needed + len < n)
190 while (len-- > 0)
191 dest[needed++] = l_data->weights[weight_idx++];
192 else
193 /* No more characters fit into the buffer. */
194 needed += len;
195 }
196 else
197 {
198 /* Remember start of the backward sequence & track length. */
199 if (backw_start == NULL)
200 backw_start = pos;
201 backw_len += len;
202 }
203 }
204
205
206 /* Handle the pushed backward sequence. */
207 if (backw_start != NULL)
208 {
209 for (size_t i = backw_len; i > 0; )
210 {
211 size_t len = find_idx (&backw_start, &weight_idx, &rule_idx,
212 l_data, pass);
213 if (needed + i < n)
214 for (size_t j = len; j > 0; j--)
215 dest[needed + i - j] =
216 l_data->weights[weight_idx++];
217
218 i -= len;
219 }
220
221 needed += backw_len;
222 }
223 }
224 else
225 {
226 int val = 1;
227#ifndef WIDE_CHAR_VERSION
228 char buf[7];
229 size_t buflen;
230#endif
231 size_t i;
232
233 while (*cur != L('\0'))
234 {
235 const USTRING_TYPE *pos = cur;
236 size_t len = find_idx (&cur, &weight_idx, &rule_idx, l_data,
237 pass);
238 int rule = l_data->rulesets[rule_idx * l_data->nrules + pass];
239
240 if ((rule & sort_forward) != 0)
241 {
242 /* Handle the pushed backward sequence. */
243 if (backw_start != NULL)
244 {
245 for (size_t p = backw_len; p > 0; p--)
246 {
247 size_t len;
248 int32_t weight_idx;
249 unsigned char rule_idx;
250 const USTRING_TYPE *backw_cur = backw_start;
251
252 /* To prevent a warning init the used vars. */
253 len = find_idx (&backw_cur, &weight_idx,
254 &rule_idx, l_data, pass);
255
256 for (i = 1; i < p; i++)
257 len = find_idx (&backw_cur, &weight_idx,
258 &rule_idx, l_data, pass);
259
260 if (len != 0)
261 {
262#ifdef WIDE_CHAR_VERSION
263 if (needed + 1 + len < n)
264 {
265 dest[needed] = val;
266 for (i = 0; i < len; ++i)
267 dest[needed + 1 + i] =
268 l_data->weights[weight_idx + i];
269 }
270 needed += 1 + len;
271#else
272 buflen = utf8_encode (buf, val);
273 if (needed + buflen + len < n)
274 {
275 for (i = 0; i < buflen; ++i)
276 dest[needed + i] = buf[i];
277 for (i = 0; i < len; ++i)
278 dest[needed + buflen + i] =
279 l_data->weights[weight_idx + i];
280 }
281 needed += buflen + len;
282#endif
283 val = 1;
284 }
285 else
286 ++val;
287 }
288
289 backw_start = NULL;
290 backw_len = 0;
291 }
292
293 /* Now handle the forward element. */
294 if (len != 0)
295 {
296#ifdef WIDE_CHAR_VERSION
297 if (needed + 1 + len < n)
298 {
299 dest[needed] = val;
300 for (i = 0; i < len; ++i)
301 dest[needed + 1 + i] =
302 l_data->weights[weight_idx + i];
303 }
304 needed += 1 + len;
305#else
306 buflen = utf8_encode (buf, val);
307 if (needed + buflen + len < n)
308 {
309 for (i = 0; i < buflen; ++i)
310 dest[needed + i] = buf[i];
311 for (i = 0; i < len; ++i)
312 dest[needed + buflen + i] =
313 l_data->weights[weight_idx + i];
314 }
315 needed += buflen + len;
316#endif
317 val = 1;
318 }
319 else
320 ++val;
321 }
322 else
323 {
324 /* Remember start of the backward sequence & track length. */
325 if (backw_start == NULL)
326 backw_start = pos;
327 backw_len++;
328 }
329 }
330
331 /* Handle the pushed backward sequence. */
332 if (backw_start != NULL)
333 {
334 for (size_t p = backw_len; p > 0; p--)
335 {
336 size_t len;
337 int32_t weight_idx;
338 unsigned char rule_idx;
339 const USTRING_TYPE *backw_cur = backw_start;
340
341 /* To prevent a warning init the used vars. */
342 len = find_idx (&backw_cur, &weight_idx,
343 &rule_idx, l_data, pass);
344
345 for (i = 1; i < p; i++)
346 len = find_idx (&backw_cur, &weight_idx,
347 &rule_idx, l_data, pass);
348
349 if (len != 0)
350 {
351#ifdef WIDE_CHAR_VERSION
352 if (needed + 1 + len < n)
353 {
354 dest[needed] = val;
355 for (i = 0; i < len; ++i)
356 dest[needed + 1 + i] =
357 l_data->weights[weight_idx + i];
358 }
359 needed += 1 + len;
360#else
361 buflen = utf8_encode (buf, val);
362 if (needed + buflen + len < n)
363 {
364 for (i = 0; i < buflen; ++i)
365 dest[needed + i] = buf[i];
366 for (i = 0; i < len; ++i)
367 dest[needed + buflen + i] =
368 l_data->weights[weight_idx + i];
369 }
370 needed += buflen + len;
371#endif
372 val = 1;
373 }
374 else
375 ++val;
376 }
377 }
378 }
379
380 /* Finally store the byte to separate the passes or terminate
381 the string. */
382 if (needed < n)
383 dest[needed] = pass + 1 < l_data->nrules ? L('\1') : L('\0');
384 ++needed;
385 }
386
387 /* This is a little optimization: many collation specifications have
388 a `position' rule at the end and if no non-ignored character
389 is found the last \1 byte is immediately followed by a \0 byte
390 signalling this. We can avoid the \1 byte(s). */
391 if (needed > 2 && needed == last_needed + 1)
392 {
393 /* Remove the \1 byte. */
394 if (--needed <= n)
395 dest[needed - 1] = L('\0');
396 }
397
398 /* Return the number of bytes/words we need, but don't count the NUL
399 byte/word at the end. */
400 return needed - 1;
401}
402
403/* Do the transformation using weight-index and rule cache. */
404static size_t
405do_xfrm_cached (STRING_TYPE *dest, size_t n, const locale_data_t *l_data,
406 size_t idxmax, int32_t *idxarr, const unsigned char *rulearr)
407{
408 uint_fast32_t nrules = l_data->nrules;
409 unsigned char *rulesets = l_data->rulesets;
410 USTRING_TYPE *weights = l_data->weights;
411 uint_fast32_t pass;
412 size_t needed = 0;
413 size_t last_needed;
414 size_t idxcnt;
415
416 /* Now the passes over the weights. */
417 for (pass = 0; pass < nrules; ++pass)
418 {
419 size_t backw_stop = ~0ul;
420 int rule = rulesets[rulearr[0] * nrules + pass];
421 /* We assume that if a rule has defined `position' in one section
422 this is true for all of them. */
423 int position = rule & sort_position;
424
425 last_needed = needed;
426 if (position == 0)
427 {
428 for (idxcnt = 0; idxcnt < idxmax; ++idxcnt)
429 {
430 if ((rule & sort_forward) != 0)
431 {
432 size_t len;
433
434 if (backw_stop != ~0ul)
435 {
436 /* Handle the pushed elements now. */
437 size_t backw;
438
439 for (backw = idxcnt; backw > backw_stop; )
440 {
441 --backw;
442 len = weights[idxarr[backw]++];
443
444 if (needed + len < n)
445 while (len-- > 0)
446 dest[needed++] = weights[idxarr[backw]++];
447 else
448 {
449 /* No more characters fit into the buffer. */
450 needed += len;
451 idxarr[backw] += len;
452 }
453 }
454
455 backw_stop = ~0ul;
456 }
457
458 /* Now handle the forward element. */
459 len = weights[idxarr[idxcnt]++];
460 if (needed + len < n)
461 while (len-- > 0)
462 dest[needed++] = weights[idxarr[idxcnt]++];
463 else
464 {
465 /* No more characters fit into the buffer. */
466 needed += len;
467 idxarr[idxcnt] += len;
468 }
469 }
470 else
471 {
472 /* Remember where the backwards series started. */
473 if (backw_stop == ~0ul)
474 backw_stop = idxcnt;
475 }
476
477 rule = rulesets[rulearr[idxcnt + 1] * nrules + pass];
478 }
479
480
481 if (backw_stop != ~0ul)
482 {
483 /* Handle the pushed elements now. */
484 size_t backw;
485
486 backw = idxcnt;
487 while (backw > backw_stop)
488 {
489 size_t len = weights[idxarr[--backw]++];
490
491 if (needed + len < n)
492 while (len-- > 0)
493 dest[needed++] = weights[idxarr[backw]++];
494 else
495 {
496 /* No more characters fit into the buffer. */
497 needed += len;
498 idxarr[backw] += len;
499 }
500 }
501 }
502 }
503 else
504 {
505 int val = 1;
506#ifndef WIDE_CHAR_VERSION
507 char buf[7];
508 size_t buflen;
509#endif
510 size_t i;
511
512 for (idxcnt = 0; idxcnt < idxmax; ++idxcnt)
513 {
514 if ((rule & sort_forward) != 0)
515 {
516 size_t len;
517
518 if (backw_stop != ~0ul)
519 {
520 /* Handle the pushed elements now. */
521 size_t backw;
522
523 for (backw = idxcnt; backw > backw_stop; )
524 {
525 --backw;
526 len = weights[idxarr[backw]++];
527 if (len != 0)
528 {
529#ifdef WIDE_CHAR_VERSION
530 if (needed + 1 + len < n)
531 {
532 dest[needed] = val;
533 for (i = 0; i < len; ++i)
534 dest[needed + 1 + i] =
535 weights[idxarr[backw] + i];
536 }
537 needed += 1 + len;
538#else
539 buflen = utf8_encode (buf, val);
540 if (needed + buflen + len < n)
541 {
542 for (i = 0; i < buflen; ++i)
543 dest[needed + i] = buf[i];
544 for (i = 0; i < len; ++i)
545 dest[needed + buflen + i] =
546 weights[idxarr[backw] + i];
547 }
548 needed += buflen + len;
549#endif
550 idxarr[backw] += len;
551 val = 1;
552 }
553 else
554 ++val;
555 }
556
557 backw_stop = ~0ul;
558 }
559
560 /* Now handle the forward element. */
561 len = weights[idxarr[idxcnt]++];
562 if (len != 0)
563 {
564#ifdef WIDE_CHAR_VERSION
565 if (needed + 1+ len < n)
566 {
567 dest[needed] = val;
568 for (i = 0; i < len; ++i)
569 dest[needed + 1 + i] =
570 weights[idxarr[idxcnt] + i];
571 }
572 needed += 1 + len;
573#else
574 buflen = utf8_encode (buf, val);
575 if (needed + buflen + len < n)
576 {
577 for (i = 0; i < buflen; ++i)
578 dest[needed + i] = buf[i];
579 for (i = 0; i < len; ++i)
580 dest[needed + buflen + i] =
581 weights[idxarr[idxcnt] + i];
582 }
583 needed += buflen + len;
584#endif
585 idxarr[idxcnt] += len;
586 val = 1;
587 }
588 else
589 /* Note that we don't have to increment `idxarr[idxcnt]'
590 since the length is zero. */
591 ++val;
592 }
593 else
594 {
595 /* Remember where the backwards series started. */
596 if (backw_stop == ~0ul)
597 backw_stop = idxcnt;
598 }
599
600 rule = rulesets[rulearr[idxcnt + 1] * nrules + pass];
601 }
602
603 if (backw_stop != ~0ul)
604 {
605 /* Handle the pushed elements now. */
606 size_t backw;
607
608 backw = idxmax - 1;
609 while (backw > backw_stop)
610 {
611 size_t len = weights[idxarr[--backw]++];
612 if (len != 0)
613 {
614#ifdef WIDE_CHAR_VERSION
615 if (needed + 1 + len < n)
616 {
617 dest[needed] = val;
618 for (i = 0; i < len; ++i)
619 dest[needed + 1 + i] =
620 weights[idxarr[backw] + i];
621 }
622 needed += 1 + len;
623#else
624 buflen = utf8_encode (buf, val);
625 if (needed + buflen + len < n)
626 {
627 for (i = 0; i < buflen; ++i)
628 dest[needed + i] = buf[i];
629 for (i = 0; i < len; ++i)
630 dest[needed + buflen + i] =
631 weights[idxarr[backw] + i];
632 }
633 needed += buflen + len;
634#endif
635 idxarr[backw] += len;
636 val = 1;
637 }
638 else
639 ++val;
640 }
641 }
642 }
643
644 /* Finally store the byte to separate the passes or terminate
645 the string. */
646 if (needed < n)
647 dest[needed] = pass + 1 < nrules ? L('\1') : L('\0');
648 ++needed;
649 }
650
651 /* This is a little optimization: many collation specifications have
652 a `position' rule at the end and if no non-ignored character
653 is found the last \1 byte is immediately followed by a \0 byte
654 signalling this. We can avoid the \1 byte(s). */
655 if (needed > 2 && needed == last_needed + 1)
656 {
657 /* Remove the \1 byte. */
658 if (--needed <= n)
659 dest[needed - 1] = L('\0');
660 }
661
662 /* Return the number of bytes/words we need, but don't count the NUL
663 byte/word at the end. */
664 return needed - 1;
665}
666
667size_t
668STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n, __locale_t l)
669{
670 locale_data_t l_data;
671 struct __locale_data *current = l->__locales[LC_COLLATE];
672 l_data.nrules = current->values[_NL_ITEM_INDEX (_NL_COLLATE_NRULES)].word;
673
674 /* Handle byte comparison case. */
675 if (l_data.nrules == 0)
676 {
677 size_t srclen = STRLEN (src);
678
679 if (n != 0)
680 STPNCPY (dest, src, MIN (srclen + 1, n));
681
682 return srclen;
683 }
684
685 /* Handle an empty string, code hereafter relies on strlen (src) > 0. */
686 if (*src == L('\0'))
687 {
688 if (n != 0)
689 *dest = L('\0');
690 return 0;
691 }
692
693 /* Get the locale data. */
694 l_data.rulesets = (unsigned char *)
695 current->values[_NL_ITEM_INDEX (_NL_COLLATE_RULESETS)].string;
696 l_data.table = (int32_t *)
697 current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_TABLE,SUFFIX))].string;
698 l_data.weights = (USTRING_TYPE *)
699 current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_WEIGHT,SUFFIX))].string;
700 l_data.extra = (USTRING_TYPE *)
701 current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_EXTRA,SUFFIX))].string;
702 l_data.indirect = (int32_t *)
703 current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_INDIRECT,SUFFIX))].string;
704
705 assert (((uintptr_t) l_data.table) % __alignof__ (l_data.table[0]) == 0);
706 assert (((uintptr_t) l_data.weights) % __alignof__ (l_data.weights[0]) == 0);
707 assert (((uintptr_t) l_data.extra) % __alignof__ (l_data.extra[0]) == 0);
708 assert (((uintptr_t) l_data.indirect) % __alignof__ (l_data.indirect[0]) == 0);
709
710 /* We need the elements of the string as unsigned values since they
711 are used as indeces. */
712 const USTRING_TYPE *usrc = (const USTRING_TYPE *) src;
713
714 /* Allocate cache for small strings on the stack and fill it with weight and
715 rule indices. If the cache size is not sufficient, continue with the
716 uncached xfrm version. */
717 size_t idxmax = 0;
718 const USTRING_TYPE *cur = usrc;
719 int32_t *idxarr = alloca (SMALL_STR_SIZE * sizeof (int32_t));
720 unsigned char *rulearr = alloca (SMALL_STR_SIZE + 1);
721
722 do
723 {
724 int32_t tmp = findidx (l_data.table, l_data.indirect, l_data.extra, &cur,
725 -1);
726 rulearr[idxmax] = tmp >> 24;
727 idxarr[idxmax] = tmp & 0xffffff;
728
729 ++idxmax;
730 }
731 while (*cur != L('\0') && idxmax < SMALL_STR_SIZE);
732
733 /* This element is only read, the value never used but to determine
734 another value which then is ignored. */
735 rulearr[idxmax] = '\0';
736
737 /* Do the transformation. */
738 if (*cur == L('\0'))
739 return do_xfrm_cached (dest, n, &l_data, idxmax, idxarr, rulearr);
740 else
741 return do_xfrm (usrc, dest, n, &l_data);
742}
743libc_hidden_def (STRXFRM)
744
745#ifndef WIDE_CHAR_VERSION
746weak_alias (__strxfrm_l, strxfrm_l)
747#endif
748