1/* Copyright (C) 1995-2019 Free Software Foundation, Inc.
2 This file is part of the GNU C Library.
3 Contributed by Ulrich Drepper <drepper@gnu.org>, 1995.
4
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published
7 by the Free Software Foundation; version 2 of the License, or
8 (at your option) any later version.
9
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, see <http://www.gnu.org/licenses/>. */
17
18#ifdef HAVE_CONFIG_H
19# include <config.h>
20#endif
21
22#include <alloca.h>
23#include <byteswap.h>
24#include <endian.h>
25#include <errno.h>
26#include <limits.h>
27#include <obstack.h>
28#include <stdlib.h>
29#include <string.h>
30#include <wchar.h>
31#include <wctype.h>
32#include <stdint.h>
33#include <sys/uio.h>
34
35#include "localedef.h"
36#include "charmap.h"
37#include "localeinfo.h"
38#include "langinfo.h"
39#include "linereader.h"
40#include "locfile-token.h"
41#include "locfile.h"
42
43#include <assert.h>
44
45
/* The bit used for representing a special class.
   BITPOS yields the position of CLASS counted from the token tok_upper
   (so tok_upper itself is position 0).  BIT and BITw turn that position
   into a mask by way of _ISbit and _ISwbit respectively; in this file
   BIT masks are applied to the class256_collection entries and BITw
   masks to the class_collection entries.  */
#define BITPOS(class) ((class) - tok_upper)
#define BIT(class) (_ISbit (BITPOS (class)))
#define BITw(class) (_ISwbit (BITPOS (class)))

/* Expands to the dereferenced result of find_idx for VALUE in the table
   CTYPE->COLLECTION, passing along the companion members
   COLLECTION_max and COLLECTION_act.  The result is used as an lvalue.
   IDX is pasted verbatim after each member name; it is left empty for
   the uses visible here (presumably a subscript is supplied for members
   that are arrays of tables -- confirm against the other callers).  */
#define ELEM(ctype, collection, idx, value) \
  *find_idx (ctype, &ctype->collection idx, &ctype->collection##_max idx, \
             &ctype->collection##_act idx, value)
54
55
56/* To be compatible with former implementations we for now restrict
57 the number of bits for character classes to 16. When compatibility
58 is not necessary anymore increase the number to 32. */
59#define char_class_t uint16_t
60#define char_class32_t uint32_t
61
62
/* Type to describe a transliteration action.  We have a possibly
   multiple character from-string and a set of multiple character
   to-strings.  All are 32bit values since this is what is used in
   the gconv functions.  */
struct translit_to_t
{
  /* One to-string, made of 32-bit values.  */
  uint32_t *str;

  /* Next entry in the singly linked list of to-strings.  */
  struct translit_to_t *next;
};
73
/* One transliteration rule: a from-string together with the list of
   to-strings it may be replaced by.  */
struct translit_t
{
  /* The from-string, made of 32-bit values.  */
  uint32_t *from;

  /* File name and line number associated with this rule (presumably
     where it was defined, for diagnostics -- confirm against the
     code which fills these in).  */
  const char *fname;
  size_t lineno;

  /* Head of the list of to-strings.  */
  struct translit_to_t *to;

  /* Next rule in the singly linked list.  */
  struct translit_t *next;
};
85
/* One entry of the translit_ignore list.  ctype_finish orders the list
   by the FROM member and ctype_output writes each entry as the three
   32-bit values FROM, TO, STEP.  */
struct translit_ignore_t
{
  /* Presumably the first and last value of the ignored range and the
     increment between members of the range -- confirm against the
     code which parses these entries.  */
  uint32_t from;
  uint32_t to;
  uint32_t step;

  /* File name and line number associated with this entry (presumably
     where it was defined).  */
  const char *fname;
  size_t lineno;

  /* Next entry in the singly linked list.  */
  struct translit_ignore_t *next;
};
97
98
/* Type to describe a transliteration include statement.  */
struct translit_include_t
{
  /* Names given in the include statement (presumably the locale to
     take the transliteration data from and the repertoire map to use
     with it -- confirm against the parser).  */
  const char *copy_locale;
  const char *copy_repertoire;

  /* Next include statement in the singly linked list.  */
  struct translit_include_t *next;
};
107
108/* Provide some dummy pointer for empty string. */
109static uint32_t no_str[] = { 0 };
110
111
112/* Sparse table of uint32_t. */
113#define TABLE idx_table
114#define ELEMENT uint32_t
115#define DEFAULT ((uint32_t) ~0)
116#define NO_ADD_LOCALE
117#include "3level.h"
118
119#define TABLE wcwidth_table
120#define ELEMENT uint8_t
121#define DEFAULT 0xff
122#include "3level.h"
123
124#define TABLE wctrans_table
125#define ELEMENT int32_t
126#define DEFAULT 0
127#define wctrans_table_add wctrans_table_add_internal
128#include "3level.h"
129#undef wctrans_table_add
/* Record in T that WC maps to MAPPED_WC.  The table does not hold the
   mapped value itself but the difference between the desired result
   and the argument, so that is what gets handed to the generated
   3-level table routine.  */
static inline void
wctrans_table_add (struct wctrans_table *t, uint32_t wc, uint32_t mapped_wc)
{
  /* Computed in unsigned 32-bit arithmetic, hence wraps modulo 2^32.  */
  const uint32_t delta = mapped_wc - wc;

  wctrans_table_add_internal (t, wc, delta);
}
137
/* Construction of sparse 3-level tables.
   See wchar-lookup.h for their structure and the meaning of p and q.  */

struct wctype_table
{
  /* Parameters.  */
  unsigned int p;
  unsigned int q;
  /* Working representation.  For each of the three levels: a pointer
     to an array of 32-bit words plus two size fields (presumably the
     allocated capacity and the part in use -- confirm against the
     routines which build the table).  */
  size_t level1_alloc;
  size_t level1_size;
  uint32_t *level1;
  size_t level2_alloc;
  size_t level2_size;
  uint32_t *level2;
  size_t level3_alloc;
  size_t level3_size;
  uint32_t *level3;
  /* Presumably the size of the finished, serialized table -- confirm.  */
  size_t result_size;
};
158
159static void add_locale_wctype_table (struct locale_file *file,
160 struct wctype_table *t);
161
/* The real definition of the struct for the LC_CTYPE locale.  */
struct locale_ctype_t
{
  /* Wide character values known so far.  ctype_startup seeds the first
     256 slots with the values 0..255; class_collection below is indexed
     in parallel with this array.  */
  uint32_t *charnames;
  size_t charnames_max;
  size_t charnames_act;
  /* An index lookup table, to speedup find_idx.  */
  struct idx_table charnames_idx;

  /* Repertoire map used to translate symbolic names to UCS4 values;
     may be NULL when no repertoire name is known.  */
  struct repertoire_t *repertoire;

  /* We will allow up to 8 * sizeof (uint32_t) character classes.  */
#define MAX_NR_CHARCLASS (8 * sizeof (uint32_t))
  /* Number of used entries in classnames.  */
  size_t nr_charclass;
  const char *classnames[MAX_NR_CHARCLASS];
  uint32_t last_class_char;
  /* Class bits for each single-byte value (tested with _ISbit masks).  */
  uint32_t class256_collection[256];
  /* Class bits for the characters listed in charnames (tested with
     _ISwbit masks).  */
  uint32_t *class_collection;
  size_t class_collection_max;
  size_t class_collection_act;
  uint32_t class_done;
  uint32_t class_offset;

  /* Input digits as multibyte sequences and as wide characters.
     ctype_finish only keeps complete groups of ten.  */
  struct charseq **mbdigits;
  size_t mbdigits_act;
  size_t mbdigits_max;
  uint32_t *wcdigits;
  size_t wcdigits_act;
  size_t wcdigits_max;

  /* The ten output digits as multibyte sequences and as wide
     characters.  Missing entries are replaced by '?' in ctype_finish.  */
  struct charseq *mboutdigits[10];
  uint32_t wcoutdigits[10];
  size_t outdigits_act;

  /* If the following number ever turns out to be too small simply
     increase it.  But I doubt it will.  --drepper@gnu */
#define MAX_NR_CHARMAP 16
  const char *mapnames[MAX_NR_CHARMAP];
  uint32_t *map_collection[MAX_NR_CHARMAP];
  /* Single-byte case mappings: ctype_finish reads [0] as the uppercase
     and [1] as the lowercase mapping.  */
  uint32_t map256_collection[2][256];
  size_t map_collection_max[MAX_NR_CHARMAP];
  size_t map_collection_act[MAX_NR_CHARMAP];
  /* Number of used entries in mapnames / map_collection.  */
  size_t map_collection_nr;
  size_t last_map_idx;
  int tomap_done[MAX_NR_CHARMAP];
  uint32_t map_offset;

  /* Transliteration information.  */
  struct translit_include_t *translit_include;
  struct translit_t *translit;
  struct translit_ignore_t *translit_ignore;
  /* Number of entries in the translit_ignore list, counted in
     ctype_finish.  */
  uint32_t ntranslit_ignore;

  /* Zero-terminated string of 32-bit values, or NULL; ctype_output
     measures it with wcslen.  */
  uint32_t *default_missing;
  const char *default_missing_file;
  size_t default_missing_lineno;

  /* Set to 1 by ctype_startup when enc_not_ascii_compatible is set.  */
  uint32_t to_nonascii;
  /* Set to 1 by ctype_finish when the single-byte case mappings differ
     from the ASCII rules.  */
  uint32_t nonascii_case;

  /* The arrays for the binary representation.  */
  char_class_t *ctype_b;
  char_class32_t *ctype32_b;
  uint32_t **map_b;
  uint32_t **map32_b;
  uint32_t **class_b;
  struct wctype_table *class_3level;
  struct wctrans_table *map_3level;
  uint32_t *class_name_ptr;
  uint32_t *map_name_ptr;
  struct wcwidth_table width;
  uint32_t mb_cur_max;
  /* Taken from the charmap in ctype_finish; "//UNKNOWN//" when the
     charmap does not name its code set.  */
  const char *codeset_name;
  uint32_t *translit_from_idx;
  uint32_t *translit_from_tbl;
  uint32_t *translit_to_idx;
  uint32_t *translit_to_tbl;
  uint32_t translit_idx_size;
  /* ctype_output divides these two by sizeof (uint32_t) to obtain
     element counts, i.e. they are sizes in bytes.  */
  size_t translit_from_tbl_size;
  size_t translit_to_tbl_size;

  /* Obstack initialized by ctype_startup.  */
  struct obstack mempool;
};
245
246
247/* Marker for an empty slot. This has the value 0xFFFFFFFF, regardless
248 whether 'int' is 16 bit, 32 bit, or 64 bit. */
249#define EMPTY ((uint32_t) ~0)
250
251
252#define obstack_chunk_alloc xmalloc
253#define obstack_chunk_free free
254
255
256/* Prototypes for local functions. */
257static void ctype_startup (struct linereader *lr, struct localedef_t *locale,
258 const struct charmap_t *charmap,
259 struct localedef_t *copy_locale,
260 int ignore_content);
261static void ctype_class_new (struct linereader *lr,
262 struct locale_ctype_t *ctype, const char *name);
263static void ctype_map_new (struct linereader *lr,
264 struct locale_ctype_t *ctype,
265 const char *name, const struct charmap_t *charmap);
266static uint32_t *find_idx (struct locale_ctype_t *ctype, uint32_t **table,
267 size_t *max, size_t *act, uint32_t idx);
268static void set_class_defaults (struct locale_ctype_t *ctype,
269 const struct charmap_t *charmap,
270 struct repertoire_t *repertoire);
271static void allocate_arrays (struct locale_ctype_t *ctype,
272 const struct charmap_t *charmap,
273 struct repertoire_t *repertoire);
274
275
/* Symbolic names of the ten decimal digits.  ctype_finish falls back to
   these when looking up the default input digits in the charmap.  */
static const char *longnames[] =
{
  "zero", "one", "two", "three", "four",
  "five", "six", "seven", "eight", "nine"
};
/* The ten decimal digits written as <Uxxxxxxxx>-style names (U+0030
   through U+0039).  Not referenced in the part of the file that
   precedes ctype_map_new; presumably used further down.  */
static const char *uninames[] =
{
  "U00000030", "U00000031", "U00000032", "U00000033", "U00000034",
  "U00000035", "U00000036", "U00000037", "U00000038", "U00000039"
};
/* The ten decimal digits as single bytes; ctype_finish uses each byte
   both as a one-character charmap symbol name and as the last-resort
   byte value for a default input digit.  */
static const unsigned char digits[] = "0123456789";
287
288
289static void
290ctype_startup (struct linereader *lr, struct localedef_t *locale,
291 const struct charmap_t *charmap,
292 struct localedef_t *copy_locale, int ignore_content)
293{
294 unsigned int cnt;
295 struct locale_ctype_t *ctype;
296
297 if (!ignore_content && locale->categories[LC_CTYPE].ctype == NULL)
298 {
299 if (copy_locale == NULL)
300 {
301 /* Allocate the needed room. */
302 locale->categories[LC_CTYPE].ctype = ctype =
303 (struct locale_ctype_t *) xcalloc (1,
304 sizeof (struct locale_ctype_t));
305
306 /* We have seen no names yet. */
307 ctype->charnames_max = charmap->mb_cur_max == 1 ? 256 : 512;
308 ctype->charnames = (uint32_t *) xmalloc (ctype->charnames_max
309 * sizeof (uint32_t));
310 for (cnt = 0; cnt < 256; ++cnt)
311 ctype->charnames[cnt] = cnt;
312 ctype->charnames_act = 256;
313 idx_table_init (&ctype->charnames_idx);
314
315 /* Fill character class information. */
316 ctype->last_class_char = ILLEGAL_CHAR_VALUE;
317 /* The order of the following instructions determines the bit
318 positions! */
319 ctype_class_new (lr, ctype, "upper");
320 ctype_class_new (lr, ctype, "lower");
321 ctype_class_new (lr, ctype, "alpha");
322 ctype_class_new (lr, ctype, "digit");
323 ctype_class_new (lr, ctype, "xdigit");
324 ctype_class_new (lr, ctype, "space");
325 ctype_class_new (lr, ctype, "print");
326 ctype_class_new (lr, ctype, "graph");
327 ctype_class_new (lr, ctype, "blank");
328 ctype_class_new (lr, ctype, "cntrl");
329 ctype_class_new (lr, ctype, "punct");
330 ctype_class_new (lr, ctype, "alnum");
331
332 ctype->class_collection_max = charmap->mb_cur_max == 1 ? 256 : 512;
333 ctype->class_collection
334 = (uint32_t *) xcalloc (sizeof (unsigned long int),
335 ctype->class_collection_max);
336 ctype->class_collection_act = 256;
337
338 /* Fill character map information. */
339 ctype->last_map_idx = MAX_NR_CHARMAP;
340 ctype_map_new (lr, ctype, "toupper", charmap);
341 ctype_map_new (lr, ctype, "tolower", charmap);
342
343 /* Fill first 256 entries in `toXXX' arrays. */
344 for (cnt = 0; cnt < 256; ++cnt)
345 {
346 ctype->map_collection[0][cnt] = cnt;
347 ctype->map_collection[1][cnt] = cnt;
348
349 ctype->map256_collection[0][cnt] = cnt;
350 ctype->map256_collection[1][cnt] = cnt;
351 }
352
353 if (enc_not_ascii_compatible)
354 ctype->to_nonascii = 1;
355
356 obstack_init (&ctype->mempool);
357 }
358 else
359 ctype = locale->categories[LC_CTYPE].ctype =
360 copy_locale->categories[LC_CTYPE].ctype;
361 }
362}
363
364
365void
366ctype_finish (struct localedef_t *locale, const struct charmap_t *charmap)
367{
368 /* See POSIX.2, table 2-6 for the meaning of the following table. */
369#define NCLASS 12
370 static const struct
371 {
372 const char *name;
373 const char allow[NCLASS];
374 }
375 valid_table[NCLASS] =
376 {
377 /* The order is important. See token.h for more information.
378 M = Always, D = Default, - = Permitted, X = Mutually exclusive */
379 { "upper", "--MX-XDDXXX-" },
380 { "lower", "--MX-XDDXXX-" },
381 { "alpha", "---X-XDDXXX-" },
382 { "digit", "XXX--XDDXXX-" },
383 { "xdigit", "-----XDDXXX-" },
384 { "space", "XXXXX------X" },
385 { "print", "---------X--" },
386 { "graph", "---------X--" },
387 { "blank", "XXXXXM-----X" },
388 { "cntrl", "XXXXX-XX--XX" },
389 { "punct", "XXXXX-DD-X-X" },
390 { "alnum", "-----XDDXXX-" }
391 };
392 size_t cnt;
393 int cls1, cls2;
394 uint32_t space_value;
395 struct charseq *space_seq;
396 struct locale_ctype_t *ctype = locale->categories[LC_CTYPE].ctype;
397 int warned;
398 const void *key;
399 size_t len;
400 void *vdata;
401 void *curs;
402
403 /* Now resolve copying and also handle completely missing definitions. */
404 if (ctype == NULL)
405 {
406 const char *repertoire_name;
407
408 /* First see whether we were supposed to copy. If yes, find the
409 actual definition. */
410 if (locale->copy_name[LC_CTYPE] != NULL)
411 {
412 /* Find the copying locale. This has to happen transitively since
413 the locale we are copying from might also copying another one. */
414 struct localedef_t *from = locale;
415
416 do
417 from = find_locale (LC_CTYPE, from->copy_name[LC_CTYPE],
418 from->repertoire_name, charmap);
419 while (from->categories[LC_CTYPE].ctype == NULL
420 && from->copy_name[LC_CTYPE] != NULL);
421
422 ctype = locale->categories[LC_CTYPE].ctype
423 = from->categories[LC_CTYPE].ctype;
424 }
425
426 /* If there is still no definition issue an warning and create an
427 empty one. */
428 if (ctype == NULL)
429 {
430 record_warning (_("\
431No definition for %s category found"), "LC_CTYPE");
432 ctype_startup (NULL, locale, charmap, NULL, 0);
433 ctype = locale->categories[LC_CTYPE].ctype;
434 }
435
436 /* Get the repertoire we have to use. */
437 repertoire_name = locale->repertoire_name ?: repertoire_global;
438 if (repertoire_name != NULL)
439 ctype->repertoire = repertoire_read (repertoire_name);
440 }
441
442 /* We need the name of the currently used 8-bit character set to
443 make correct conversion between this 8-bit representation and the
444 ISO 10646 character set used internally for wide characters. */
445 ctype->codeset_name = charmap->code_set_name;
446 if (ctype->codeset_name == NULL)
447 {
448 record_error (0, 0, _("\
449No character set name specified in charmap"));
450 ctype->codeset_name = "//UNKNOWN//";
451 }
452
453 /* Set default value for classes not specified. */
454 set_class_defaults (ctype, charmap, ctype->repertoire);
455
456 /* Check according to table. */
457 for (cnt = 0; cnt < ctype->class_collection_act; ++cnt)
458 {
459 uint32_t tmp = ctype->class_collection[cnt];
460
461 if (tmp != 0)
462 {
463 for (cls1 = 0; cls1 < NCLASS; ++cls1)
464 if ((tmp & _ISwbit (cls1)) != 0)
465 for (cls2 = 0; cls2 < NCLASS; ++cls2)
466 if (valid_table[cls1].allow[cls2] != '-')
467 {
468 int eq = (tmp & _ISwbit (cls2)) != 0;
469 switch (valid_table[cls1].allow[cls2])
470 {
471 case 'M':
472 if (!eq)
473 {
474 uint32_t value = ctype->charnames[cnt];
475
476 record_error (0, 0, _("\
477character L'\\u%0*x' in class `%s' must be in class `%s'"),
478 value > 0xffff ? 8 : 4,
479 value,
480 valid_table[cls1].name,
481 valid_table[cls2].name);
482 }
483 break;
484
485 case 'X':
486 if (eq)
487 {
488 uint32_t value = ctype->charnames[cnt];
489
490 record_error (0, 0, _("\
491character L'\\u%0*x' in class `%s' must not be in class `%s'"),
492 value > 0xffff ? 8 : 4,
493 value,
494 valid_table[cls1].name,
495 valid_table[cls2].name);
496 }
497 break;
498
499 case 'D':
500 ctype->class_collection[cnt] |= _ISwbit (cls2);
501 break;
502
503 default:
504 record_error (5, 0, _("\
505internal error in %s, line %u"), __FUNCTION__, __LINE__);
506 }
507 }
508 }
509 }
510
511 for (cnt = 0; cnt < 256; ++cnt)
512 {
513 uint32_t tmp = ctype->class256_collection[cnt];
514
515 if (tmp != 0)
516 {
517 for (cls1 = 0; cls1 < NCLASS; ++cls1)
518 if ((tmp & _ISbit (cls1)) != 0)
519 for (cls2 = 0; cls2 < NCLASS; ++cls2)
520 if (valid_table[cls1].allow[cls2] != '-')
521 {
522 int eq = (tmp & _ISbit (cls2)) != 0;
523 switch (valid_table[cls1].allow[cls2])
524 {
525 case 'M':
526 if (!eq)
527 {
528 char buf[17];
529
530 snprintf (buf, sizeof buf, "\\%Zo", cnt);
531
532 record_error (0, 0, _("\
533character '%s' in class `%s' must be in class `%s'"),
534 buf,
535 valid_table[cls1].name,
536 valid_table[cls2].name);
537 }
538 break;
539
540 case 'X':
541 if (eq)
542 {
543 char buf[17];
544
545 snprintf (buf, sizeof buf, "\\%Zo", cnt);
546
547 record_error (0, 0, _("\
548character '%s' in class `%s' must not be in class `%s'"),
549 buf,
550 valid_table[cls1].name,
551 valid_table[cls2].name);
552 }
553 break;
554
555 case 'D':
556 ctype->class256_collection[cnt] |= _ISbit (cls2);
557 break;
558
559 default:
560 record_error (5, 0, _("\
561internal error in %s, line %u"), __FUNCTION__, __LINE__);
562 }
563 }
564 }
565 }
566
567 /* ... and now test <SP> as a special case. */
568 space_value = 32;
569 if (((cnt = BITPOS (tok_space),
570 (ELEM (ctype, class_collection, , space_value)
571 & BITw (tok_space)) == 0)
572 || (cnt = BITPOS (tok_blank),
573 (ELEM (ctype, class_collection, , space_value)
574 & BITw (tok_blank)) == 0)))
575 {
576 record_error (0, 0, _("<SP> character not in class `%s'"),
577 valid_table[cnt].name);
578 }
579 else if (((cnt = BITPOS (tok_punct),
580 (ELEM (ctype, class_collection, , space_value)
581 & BITw (tok_punct)) != 0)
582 || (cnt = BITPOS (tok_graph),
583 (ELEM (ctype, class_collection, , space_value)
584 & BITw (tok_graph))
585 != 0)))
586 {
587 record_error (0, 0, _("\
588<SP> character must not be in class `%s'"),
589 valid_table[cnt].name);
590 }
591 else
592 ELEM (ctype, class_collection, , space_value) |= BITw (tok_print);
593
594 space_seq = charmap_find_value (charmap, "SP", 2);
595 if (space_seq == NULL)
596 space_seq = charmap_find_value (charmap, "space", 5);
597 if (space_seq == NULL)
598 space_seq = charmap_find_value (charmap, "U00000020", 9);
599 if (space_seq == NULL || space_seq->nbytes != 1)
600 {
601 record_error (0, 0, _("\
602character <SP> not defined in character map"));
603 }
604 else if (((cnt = BITPOS (tok_space),
605 (ctype->class256_collection[space_seq->bytes[0]]
606 & BIT (tok_space)) == 0)
607 || (cnt = BITPOS (tok_blank),
608 (ctype->class256_collection[space_seq->bytes[0]]
609 & BIT (tok_blank)) == 0)))
610 {
611 record_error (0, 0, _("<SP> character not in class `%s'"),
612 valid_table[cnt].name);
613 }
614 else if (((cnt = BITPOS (tok_punct),
615 (ctype->class256_collection[space_seq->bytes[0]]
616 & BIT (tok_punct)) != 0)
617 || (cnt = BITPOS (tok_graph),
618 (ctype->class256_collection[space_seq->bytes[0]]
619 & BIT (tok_graph)) != 0)))
620 {
621 record_error (0, 0, _("\
622<SP> character must not be in class `%s'"),
623 valid_table[cnt].name);
624 }
625 else
626 ctype->class256_collection[space_seq->bytes[0]] |= BIT (tok_print);
627
628 /* Check whether all single-byte characters make to their upper/lowercase
629 equivalent according to the ASCII rules. */
630 for (cnt = 'A'; cnt <= 'Z'; ++cnt)
631 {
632 uint32_t uppval = ctype->map256_collection[0][cnt];
633 uint32_t lowval = ctype->map256_collection[1][cnt];
634 uint32_t lowuppval = ctype->map256_collection[0][lowval];
635 uint32_t lowlowval = ctype->map256_collection[1][lowval];
636
637 if (uppval != cnt
638 || lowval != cnt + 0x20
639 || lowuppval != cnt
640 || lowlowval != cnt + 0x20)
641 ctype->nonascii_case = 1;
642 }
643 for (cnt = 0; cnt < 256; ++cnt)
644 if (cnt < 'A' || (cnt > 'Z' && cnt < 'a') || cnt > 'z')
645 if (ctype->map256_collection[0][cnt] != cnt
646 || ctype->map256_collection[1][cnt] != cnt)
647 ctype->nonascii_case = 1;
648
649 /* Now that the tests are done make sure the name array contains all
650 characters which are handled in the WIDTH section of the
651 character set definition file. */
652 if (charmap->width_rules != NULL)
653 for (cnt = 0; cnt < charmap->nwidth_rules; ++cnt)
654 {
655 unsigned char bytes[charmap->mb_cur_max];
656 int nbytes = charmap->width_rules[cnt].from->nbytes;
657
658 /* We have the range of character for which the width is
659 specified described using byte sequences of the multibyte
660 charset. We have to convert this to UCS4 now. And we
661 cannot simply convert the beginning and the end of the
662 sequence, we have to iterate over the byte sequence and
663 convert it for every single character. */
664 memcpy (bytes, charmap->width_rules[cnt].from->bytes, nbytes);
665
666 while (nbytes < charmap->width_rules[cnt].to->nbytes
667 || memcmp (bytes, charmap->width_rules[cnt].to->bytes,
668 nbytes) <= 0)
669 {
670 /* Find the UCS value for `bytes'. */
671 int inner;
672 uint32_t wch;
673 struct charseq *seq
674 = charmap_find_symbol (charmap, (char *) bytes, nbytes);
675
676 if (seq == NULL)
677 wch = ILLEGAL_CHAR_VALUE;
678 else if (seq->ucs4 != UNINITIALIZED_CHAR_VALUE)
679 wch = seq->ucs4;
680 else
681 wch = repertoire_find_value (ctype->repertoire, seq->name,
682 strlen (seq->name));
683
684 if (wch != ILLEGAL_CHAR_VALUE)
685 /* We are only interested in the side-effects of the
686 `find_idx' call. It will add appropriate entries in
687 the name array if this is necessary. */
688 (void) find_idx (ctype, NULL, NULL, NULL, wch);
689
690 /* "Increment" the bytes sequence. */
691 inner = nbytes - 1;
692 while (inner >= 0 && bytes[inner] == 0xff)
693 --inner;
694
695 if (inner < 0)
696 {
697 /* We have to extend the byte sequence. */
698 if (nbytes >= charmap->width_rules[cnt].to->nbytes)
699 break;
700
701 bytes[0] = 1;
702 memset (&bytes[1], 0, nbytes);
703 ++nbytes;
704 }
705 else
706 {
707 ++bytes[inner];
708 while (++inner < nbytes)
709 bytes[inner] = 0;
710 }
711 }
712 }
713
714 /* Now set all the other characters of the character set to the
715 default width. */
716 curs = NULL;
717 while (iterate_table (&charmap->char_table, &curs, &key, &len, &vdata) == 0)
718 {
719 struct charseq *data = (struct charseq *) vdata;
720
721 if (data->ucs4 == UNINITIALIZED_CHAR_VALUE)
722 data->ucs4 = repertoire_find_value (ctype->repertoire,
723 data->name, len);
724
725 if (data->ucs4 != ILLEGAL_CHAR_VALUE)
726 (void) find_idx (ctype, NULL, NULL, NULL, data->ucs4);
727 }
728
729 /* There must be a multiple of 10 digits. */
730 if (ctype->mbdigits_act % 10 != 0)
731 {
732 assert (ctype->mbdigits_act == ctype->wcdigits_act);
733 ctype->wcdigits_act -= ctype->mbdigits_act % 10;
734 ctype->mbdigits_act -= ctype->mbdigits_act % 10;
735 record_error (0, 0, _("\
736`digit' category has not entries in groups of ten"));
737 }
738
739 /* Check the input digits. There must be a multiple of ten available.
740 In each group it could be that one or the other character is missing.
741 In this case the whole group must be removed. */
742 cnt = 0;
743 while (cnt < ctype->mbdigits_act)
744 {
745 size_t inner;
746 for (inner = 0; inner < 10; ++inner)
747 if (ctype->mbdigits[cnt + inner] == NULL)
748 break;
749
750 if (inner == 10)
751 cnt += 10;
752 else
753 {
754 /* Remove the group. */
755 memmove (&ctype->mbdigits[cnt], &ctype->mbdigits[cnt + 10],
756 ((ctype->wcdigits_act - cnt - 10)
757 * sizeof (ctype->mbdigits[0])));
758 ctype->mbdigits_act -= 10;
759 }
760 }
761
762 /* If no input digits are given use the default. */
763 if (ctype->mbdigits_act == 0)
764 {
765 if (ctype->mbdigits_max == 0)
766 {
767 ctype->mbdigits = obstack_alloc (&((struct charmap_t *) charmap)->mem_pool,
768 10 * sizeof (struct charseq *));
769 ctype->mbdigits_max = 10;
770 }
771
772 for (cnt = 0; cnt < 10; ++cnt)
773 {
774 ctype->mbdigits[cnt] = charmap_find_symbol (charmap,
775 (char *) digits + cnt, 1);
776 if (ctype->mbdigits[cnt] == NULL)
777 {
778 ctype->mbdigits[cnt] = charmap_find_symbol (charmap,
779 longnames[cnt],
780 strlen (longnames[cnt]));
781 if (ctype->mbdigits[cnt] == NULL)
782 {
783 /* Hum, this ain't good. */
784 record_error (0, 0, _("\
785no input digits defined and none of the standard names in the charmap"));
786
787 ctype->mbdigits[cnt] = obstack_alloc (&((struct charmap_t *) charmap)->mem_pool,
788 sizeof (struct charseq) + 1);
789
790 /* This is better than nothing. */
791 ctype->mbdigits[cnt]->bytes[0] = digits[cnt];
792 ctype->mbdigits[cnt]->nbytes = 1;
793 }
794 }
795 }
796
797 ctype->mbdigits_act = 10;
798 }
799
800 /* Check the wide character input digits. There must be a multiple
801 of ten available. In each group it could be that one or the other
802 character is missing. In this case the whole group must be
803 removed. */
804 cnt = 0;
805 while (cnt < ctype->wcdigits_act)
806 {
807 size_t inner;
808 for (inner = 0; inner < 10; ++inner)
809 if (ctype->wcdigits[cnt + inner] == ILLEGAL_CHAR_VALUE)
810 break;
811
812 if (inner == 10)
813 cnt += 10;
814 else
815 {
816 /* Remove the group. */
817 memmove (&ctype->wcdigits[cnt], &ctype->wcdigits[cnt + 10],
818 ((ctype->wcdigits_act - cnt - 10)
819 * sizeof (ctype->wcdigits[0])));
820 ctype->wcdigits_act -= 10;
821 }
822 }
823
824 /* If no input digits are given use the default. */
825 if (ctype->wcdigits_act == 0)
826 {
827 if (ctype->wcdigits_max == 0)
828 {
829 ctype->wcdigits = obstack_alloc (&((struct charmap_t *) charmap)->mem_pool,
830 10 * sizeof (uint32_t));
831 ctype->wcdigits_max = 10;
832 }
833
834 for (cnt = 0; cnt < 10; ++cnt)
835 ctype->wcdigits[cnt] = L'0' + cnt;
836
837 ctype->mbdigits_act = 10;
838 }
839
840 /* Check the outdigits. */
841 warned = 0;
842 for (cnt = 0; cnt < 10; ++cnt)
843 if (ctype->mboutdigits[cnt] == NULL)
844 {
845 static struct charseq replace[2];
846
847 if (!warned)
848 {
849 record_error (0, 0, _("\
850not all characters used in `outdigit' are available in the charmap"));
851 warned = 1;
852 }
853
854 replace[0].nbytes = 1;
855 replace[0].bytes[0] = '?';
856 replace[0].bytes[1] = '\0';
857 ctype->mboutdigits[cnt] = &replace[0];
858 }
859
860 warned = 0;
861 for (cnt = 0; cnt < 10; ++cnt)
862 if (ctype->wcoutdigits[cnt] == 0)
863 {
864 if (!warned)
865 {
866 record_error (0, 0, _("\
867not all characters used in `outdigit' are available in the repertoire"));
868 warned = 1;
869 }
870
871 ctype->wcoutdigits[cnt] = L'?';
872 }
873
874 /* Sort the entries in the translit_ignore list. */
875 if (ctype->translit_ignore != NULL)
876 {
877 struct translit_ignore_t *firstp = ctype->translit_ignore;
878 struct translit_ignore_t *runp;
879
880 ctype->ntranslit_ignore = 1;
881
882 for (runp = firstp->next; runp != NULL; runp = runp->next)
883 {
884 struct translit_ignore_t *lastp = NULL;
885 struct translit_ignore_t *cmpp;
886
887 ++ctype->ntranslit_ignore;
888
889 for (cmpp = firstp; cmpp != NULL; lastp = cmpp, cmpp = cmpp->next)
890 if (runp->from < cmpp->from)
891 break;
892
893 runp->next = lastp;
894 if (lastp == NULL)
895 firstp = runp;
896 }
897
898 ctype->translit_ignore = firstp;
899 }
900}
901
902
903void
904ctype_output (struct localedef_t *locale, const struct charmap_t *charmap,
905 const char *output_path)
906{
907 struct locale_ctype_t *ctype = locale->categories[LC_CTYPE].ctype;
908 const size_t nelems = (_NL_ITEM_INDEX (_NL_CTYPE_EXTRA_MAP_1)
909 + ctype->nr_charclass + ctype->map_collection_nr);
910 struct locale_file file;
911 uint32_t default_missing_len;
912 size_t elem, cnt;
913
914 /* Now prepare the output: Find the sizes of the table we can use. */
915 allocate_arrays (ctype, charmap, ctype->repertoire);
916
917 default_missing_len = (ctype->default_missing
918 ? wcslen ((wchar_t *) ctype->default_missing)
919 : 0);
920
921 init_locale_data (&file, nelems);
922 for (elem = 0; elem < nelems; ++elem)
923 {
924 if (elem < _NL_ITEM_INDEX (_NL_CTYPE_EXTRA_MAP_1))
925 switch (elem)
926 {
927#define CTYPE_EMPTY(name) \
928 case name: \
929 add_locale_empty (&file); \
930 break
931
932 CTYPE_EMPTY(_NL_CTYPE_GAP1);
933 CTYPE_EMPTY(_NL_CTYPE_GAP2);
934 CTYPE_EMPTY(_NL_CTYPE_GAP3);
935 CTYPE_EMPTY(_NL_CTYPE_GAP4);
936 CTYPE_EMPTY(_NL_CTYPE_GAP5);
937 CTYPE_EMPTY(_NL_CTYPE_GAP6);
938
939#define CTYPE_RAW_DATA(name, base, size) \
940 case _NL_ITEM_INDEX (name): \
941 add_locale_raw_data (&file, base, size); \
942 break
943
944 CTYPE_RAW_DATA (_NL_CTYPE_CLASS,
945 ctype->ctype_b,
946 (256 + 128) * sizeof (char_class_t));
947
948#define CTYPE_UINT32_ARRAY(name, base, n_elems) \
949 case _NL_ITEM_INDEX (name): \
950 add_locale_uint32_array (&file, base, n_elems); \
951 break
952
953 CTYPE_UINT32_ARRAY (_NL_CTYPE_TOUPPER, ctype->map_b[0], 256 + 128);
954 CTYPE_UINT32_ARRAY (_NL_CTYPE_TOLOWER, ctype->map_b[1], 256 + 128);
955 CTYPE_UINT32_ARRAY (_NL_CTYPE_TOUPPER32, ctype->map32_b[0], 256);
956 CTYPE_UINT32_ARRAY (_NL_CTYPE_TOLOWER32, ctype->map32_b[1], 256);
957 CTYPE_RAW_DATA (_NL_CTYPE_CLASS32,
958 ctype->ctype32_b,
959 256 * sizeof (char_class32_t));
960
961#define CTYPE_UINT32(name, value) \
962 case _NL_ITEM_INDEX (name): \
963 add_locale_uint32 (&file, value); \
964 break
965
966 CTYPE_UINT32 (_NL_CTYPE_CLASS_OFFSET, ctype->class_offset);
967 CTYPE_UINT32 (_NL_CTYPE_MAP_OFFSET, ctype->map_offset);
968 CTYPE_UINT32 (_NL_CTYPE_TRANSLIT_TAB_SIZE, ctype->translit_idx_size);
969
970 CTYPE_UINT32_ARRAY (_NL_CTYPE_TRANSLIT_FROM_IDX,
971 ctype->translit_from_idx,
972 ctype->translit_idx_size);
973
974 CTYPE_UINT32_ARRAY (_NL_CTYPE_TRANSLIT_FROM_TBL,
975 ctype->translit_from_tbl,
976 ctype->translit_from_tbl_size
977 / sizeof (uint32_t));
978
979 CTYPE_UINT32_ARRAY (_NL_CTYPE_TRANSLIT_TO_IDX,
980 ctype->translit_to_idx,
981 ctype->translit_idx_size);
982
983 CTYPE_UINT32_ARRAY (_NL_CTYPE_TRANSLIT_TO_TBL,
984 ctype->translit_to_tbl,
985 ctype->translit_to_tbl_size / sizeof (uint32_t));
986
987 case _NL_ITEM_INDEX (_NL_CTYPE_CLASS_NAMES):
988 /* The class name array. */
989 start_locale_structure (&file);
990 for (cnt = 0; cnt < ctype->nr_charclass; ++cnt)
991 add_locale_string (&file, ctype->classnames[cnt]);
992 add_locale_char (&file, 0);
993 align_locale_data (&file, LOCFILE_ALIGN);
994 end_locale_structure (&file);
995 break;
996
997 case _NL_ITEM_INDEX (_NL_CTYPE_MAP_NAMES):
998 /* The class name array. */
999 start_locale_structure (&file);
1000 for (cnt = 0; cnt < ctype->map_collection_nr; ++cnt)
1001 add_locale_string (&file, ctype->mapnames[cnt]);
1002 add_locale_char (&file, 0);
1003 align_locale_data (&file, LOCFILE_ALIGN);
1004 end_locale_structure (&file);
1005 break;
1006
1007 case _NL_ITEM_INDEX (_NL_CTYPE_WIDTH):
1008 add_locale_wcwidth_table (&file, &ctype->width);
1009 break;
1010
1011 CTYPE_UINT32 (_NL_CTYPE_MB_CUR_MAX, ctype->mb_cur_max);
1012
1013 case _NL_ITEM_INDEX (_NL_CTYPE_CODESET_NAME):
1014 add_locale_string (&file, ctype->codeset_name);
1015 break;
1016
1017 CTYPE_UINT32 (_NL_CTYPE_MAP_TO_NONASCII, ctype->to_nonascii);
1018
1019 CTYPE_UINT32 (_NL_CTYPE_NONASCII_CASE, ctype->nonascii_case);
1020
1021 case _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS_MB_LEN):
1022 add_locale_uint32 (&file, ctype->mbdigits_act / 10);
1023 break;
1024
1025 case _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS_WC_LEN):
1026 add_locale_uint32 (&file, ctype->wcdigits_act / 10);
1027 break;
1028
1029 case _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS0_MB) ... _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS9_MB):
1030 start_locale_structure (&file);
1031 for (cnt = elem - _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS0_MB);
1032 cnt < ctype->mbdigits_act; cnt += 10)
1033 {
1034 add_locale_raw_data (&file, ctype->mbdigits[cnt]->bytes,
1035 ctype->mbdigits[cnt]->nbytes);
1036 add_locale_char (&file, 0);
1037 }
1038 end_locale_structure (&file);
1039 break;
1040
1041 case _NL_ITEM_INDEX (_NL_CTYPE_OUTDIGIT0_MB) ... _NL_ITEM_INDEX (_NL_CTYPE_OUTDIGIT9_MB):
1042 start_locale_structure (&file);
1043 cnt = elem - _NL_ITEM_INDEX (_NL_CTYPE_OUTDIGIT0_MB);
1044 add_locale_raw_data (&file, ctype->mboutdigits[cnt]->bytes,
1045 ctype->mboutdigits[cnt]->nbytes);
1046 add_locale_char (&file, 0);
1047 end_locale_structure (&file);
1048 break;
1049
1050 case _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS0_WC) ... _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS9_WC):
1051 start_locale_structure (&file);
1052 for (cnt = elem - _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS0_WC);
1053 cnt < ctype->wcdigits_act; cnt += 10)
1054 add_locale_uint32 (&file, ctype->wcdigits[cnt]);
1055 end_locale_structure (&file);
1056 break;
1057
1058 case _NL_ITEM_INDEX (_NL_CTYPE_OUTDIGIT0_WC) ... _NL_ITEM_INDEX (_NL_CTYPE_OUTDIGIT9_WC):
1059 cnt = elem - _NL_ITEM_INDEX (_NL_CTYPE_OUTDIGIT0_WC);
1060 add_locale_uint32 (&file, ctype->wcoutdigits[cnt]);
1061 break;
1062
1063 case _NL_ITEM_INDEX(_NL_CTYPE_TRANSLIT_DEFAULT_MISSING_LEN):
1064 add_locale_uint32 (&file, default_missing_len);
1065 break;
1066
1067 case _NL_ITEM_INDEX(_NL_CTYPE_TRANSLIT_DEFAULT_MISSING):
1068 add_locale_uint32_array (&file, ctype->default_missing,
1069 default_missing_len);
1070 break;
1071
1072 case _NL_ITEM_INDEX(_NL_CTYPE_TRANSLIT_IGNORE_LEN):
1073 add_locale_uint32 (&file, ctype->ntranslit_ignore);
1074 break;
1075
1076 case _NL_ITEM_INDEX(_NL_CTYPE_TRANSLIT_IGNORE):
1077 start_locale_structure (&file);
1078 {
1079 struct translit_ignore_t *runp;
1080 for (runp = ctype->translit_ignore; runp != NULL;
1081 runp = runp->next)
1082 {
1083 add_locale_uint32 (&file, runp->from);
1084 add_locale_uint32 (&file, runp->to);
1085 add_locale_uint32 (&file, runp->step);
1086 }
1087 }
1088 end_locale_structure (&file);
1089 break;
1090
1091 default:
1092 assert (! "unknown CTYPE element");
1093 }
1094 else
1095 {
1096 /* Handle extra maps. */
1097 size_t nr = elem - _NL_ITEM_INDEX (_NL_CTYPE_EXTRA_MAP_1);
1098 if (nr < ctype->nr_charclass)
1099 {
1100 start_locale_prelude (&file);
1101 add_locale_uint32_array (&file, ctype->class_b[nr], 256 / 32);
1102 end_locale_prelude (&file);
1103 add_locale_wctype_table (&file, &ctype->class_3level[nr]);
1104 }
1105 else
1106 {
1107 nr -= ctype->nr_charclass;
1108 assert (nr < ctype->map_collection_nr);
1109 add_locale_wctrans_table (&file, &ctype->map_3level[nr]);
1110 }
1111 }
1112 }
1113
1114 write_locale_data (output_path, LC_CTYPE, "LC_CTYPE", &file);
1115}
1116
1117
1118/* Local functions. */
1119static void
1120ctype_class_new (struct linereader *lr, struct locale_ctype_t *ctype,
1121 const char *name)
1122{
1123 size_t cnt;
1124
1125 for (cnt = 0; cnt < ctype->nr_charclass; ++cnt)
1126 if (strcmp (ctype->classnames[cnt], name) == 0)
1127 break;
1128
1129 if (cnt < ctype->nr_charclass)
1130 {
1131 lr_error (lr, _("character class `%s' already defined"), name);
1132 return;
1133 }
1134
1135 if (ctype->nr_charclass == MAX_NR_CHARCLASS)
1136 /* Exit code 2 is prescribed in P1003.2b. */
1137 record_error (2, 0, _("\
1138implementation limit: no more than %Zd character classes allowed"),
1139 MAX_NR_CHARCLASS);
1140
1141 ctype->classnames[ctype->nr_charclass++] = name;
1142}
1143
1144
1145static void
1146ctype_map_new (struct linereader *lr, struct locale_ctype_t *ctype,
1147 const char *name, const struct charmap_t *charmap)
1148{
1149 size_t max_chars = 0;
1150 size_t cnt;
1151
1152 for (cnt = 0; cnt < ctype->map_collection_nr; ++cnt)
1153 {
1154 if (strcmp (ctype->mapnames[cnt], name) == 0)
1155 break;
1156
1157 if (max_chars < ctype->map_collection_max[cnt])
1158 max_chars = ctype->map_collection_max[cnt];
1159 }
1160
1161 if (cnt < ctype->map_collection_nr)
1162 {
1163 lr_error (lr, _("character map `%s' already defined"), name);
1164 return;
1165 }
1166
1167 if (ctype->map_collection_nr == MAX_NR_CHARMAP)
1168 /* Exit code 2 is prescribed in P1003.2b. */
1169 record_error (2, 0, _("\
1170implementation limit: no more than %d character maps allowed"),
1171 MAX_NR_CHARMAP);
1172
1173 ctype->mapnames[cnt] = name;
1174
1175 if (max_chars == 0)
1176 ctype->map_collection_max[cnt] = charmap->mb_cur_max == 1 ? 256 : 512;
1177 else
1178 ctype->map_collection_max[cnt] = max_chars;
1179
1180 ctype->map_collection[cnt] = (uint32_t *)
1181 xcalloc (sizeof (uint32_t), ctype->map_collection_max[cnt]);
1182 ctype->map_collection_act[cnt] = 256;
1183
1184 ++ctype->map_collection_nr;
1185}
1186
1187
1188/* We have to be prepared that TABLE, MAX, and ACT can be NULL. This
1189 is possible if we only want to extend the name array. */
1190static uint32_t *
1191find_idx (struct locale_ctype_t *ctype, uint32_t **table, size_t *max,
1192 size_t *act, uint32_t idx)
1193{
1194 size_t cnt;
1195
1196 if (idx < 256)
1197 return table == NULL ? NULL : &(*table)[idx];
1198
1199 /* Use the charnames_idx lookup table instead of the slow search loop. */
1200#if 1
1201 cnt = idx_table_get (&ctype->charnames_idx, idx);
1202 if (cnt == EMPTY)
1203 /* Not found. */
1204 cnt = ctype->charnames_act;
1205#else
1206 for (cnt = 256; cnt < ctype->charnames_act; ++cnt)
1207 if (ctype->charnames[cnt] == idx)
1208 break;
1209#endif
1210
1211 /* We have to distinguish two cases: the name is found or not. */
1212 if (cnt == ctype->charnames_act)
1213 {
1214 /* Extend the name array. */
1215 if (ctype->charnames_act == ctype->charnames_max)
1216 {
1217 ctype->charnames_max *= 2;
1218 ctype->charnames = (uint32_t *)
1219 xrealloc (ctype->charnames,
1220 sizeof (uint32_t) * ctype->charnames_max);
1221 }
1222 ctype->charnames[ctype->charnames_act++] = idx;
1223 idx_table_add (&ctype->charnames_idx, idx, cnt);
1224 }
1225
1226 if (table == NULL)
1227 /* We have done everything we are asked to do. */
1228 return NULL;
1229
1230 if (max == NULL)
1231 /* The caller does not want to extend the table. */
1232 return (cnt >= *act ? NULL : &(*table)[cnt]);
1233
1234 if (cnt >= *act)
1235 {
1236 if (cnt >= *max)
1237 {
1238 size_t old_max = *max;
1239 do
1240 *max *= 2;
1241 while (*max <= cnt);
1242
1243 *table =
1244 (uint32_t *) xrealloc (*table, *max * sizeof (uint32_t));
1245 memset (&(*table)[old_max], '\0',
1246 (*max - old_max) * sizeof (uint32_t));
1247 }
1248
1249 *act = cnt + 1;
1250 }
1251
1252 return &(*table)[cnt];
1253}
1254
1255
1256static int
1257get_character (struct token *now, const struct charmap_t *charmap,
1258 struct repertoire_t *repertoire,
1259 struct charseq **seqp, uint32_t *wchp)
1260{
1261 if (now->tok == tok_bsymbol)
1262 {
1263 /* This will hopefully be the normal case. */
1264 *wchp = repertoire_find_value (repertoire, now->val.str.startmb,
1265 now->val.str.lenmb);
1266 *seqp = charmap_find_value (charmap, now->val.str.startmb,
1267 now->val.str.lenmb);
1268 }
1269 else if (now->tok == tok_ucs4)
1270 {
1271 char utmp[10];
1272
1273 snprintf (utmp, sizeof (utmp), "U%08X", now->val.ucs4);
1274 *seqp = charmap_find_value (charmap, utmp, 9);
1275
1276 if (*seqp == NULL)
1277 *seqp = repertoire_find_seq (repertoire, now->val.ucs4);
1278
1279 if (*seqp == NULL)
1280 {
1281 /* Compute the value in the charmap from the UCS value. */
1282 const char *symbol = repertoire_find_symbol (repertoire,
1283 now->val.ucs4);
1284
1285 if (symbol == NULL)
1286 *seqp = NULL;
1287 else
1288 *seqp = charmap_find_value (charmap, symbol, strlen (symbol));
1289
1290 if (*seqp == NULL)
1291 {
1292 if (repertoire != NULL)
1293 {
1294 /* Insert a negative entry. */
1295 static const struct charseq negative
1296 = { .ucs4 = ILLEGAL_CHAR_VALUE };
1297 uint32_t *newp = obstack_alloc (&repertoire->mem_pool,
1298 sizeof (uint32_t));
1299 *newp = now->val.ucs4;
1300
1301 insert_entry (&repertoire->seq_table, newp,
1302 sizeof (uint32_t), (void *) &negative);
1303 }
1304 }
1305 else
1306 (*seqp)->ucs4 = now->val.ucs4;
1307 }
1308 else if ((*seqp)->ucs4 != now->val.ucs4)
1309 *seqp = NULL;
1310
1311 *wchp = now->val.ucs4;
1312 }
1313 else if (now->tok == tok_charcode)
1314 {
1315 /* We must map from the byte code to UCS4. */
1316 *seqp = charmap_find_symbol (charmap, now->val.str.startmb,
1317 now->val.str.lenmb);
1318
1319 if (*seqp == NULL)
1320 *wchp = ILLEGAL_CHAR_VALUE;
1321 else
1322 {
1323 if ((*seqp)->ucs4 == UNINITIALIZED_CHAR_VALUE)
1324 (*seqp)->ucs4 = repertoire_find_value (repertoire, (*seqp)->name,
1325 strlen ((*seqp)->name));
1326 *wchp = (*seqp)->ucs4;
1327 }
1328 }
1329 else
1330 return 1;
1331
1332 return 0;
1333}
1334
1335
1336/* Ellipsis like in `<foo123>..<foo12a>' or `<j1234>....<j1245>' and
1337 the .(2). counterparts. */
1338static void
1339charclass_symbolic_ellipsis (struct linereader *ldfile,
1340 struct locale_ctype_t *ctype,
1341 const struct charmap_t *charmap,
1342 struct repertoire_t *repertoire,
1343 struct token *now,
1344 const char *last_str,
1345 unsigned long int class256_bit,
1346 unsigned long int class_bit, int base,
1347 int ignore_content, int handle_digits, int step)
1348{
1349 const char *nowstr = now->val.str.startmb;
1350 char tmp[now->val.str.lenmb + 1];
1351 const char *cp;
1352 char *endp;
1353 unsigned long int from;
1354 unsigned long int to;
1355
1356 /* We have to compute the ellipsis values using the symbolic names. */
1357 assert (last_str != NULL);
1358
1359 if (strlen (last_str) != now->val.str.lenmb)
1360 {
1361 invalid_range:
1362 lr_error (ldfile,
1363 _("`%s' and `%.*s' are not valid names for symbolic range"),
1364 last_str, (int) now->val.str.lenmb, nowstr);
1365 return;
1366 }
1367
1368 if (memcmp (last_str, nowstr, now->val.str.lenmb) == 0)
1369 /* Nothing to do, the names are the same. */
1370 return;
1371
1372 for (cp = last_str; *cp == *(nowstr + (cp - last_str)); ++cp)
1373 ;
1374
1375 errno = 0;
1376 from = strtoul (cp, &endp, base);
1377 if ((from == UINT_MAX && errno == ERANGE) || *endp != '\0')
1378 goto invalid_range;
1379
1380 to = strtoul (nowstr + (cp - last_str), &endp, base);
1381 if ((to == UINT_MAX && errno == ERANGE)
1382 || (endp - nowstr) != now->val.str.lenmb || from >= to)
1383 goto invalid_range;
1384
1385 /* OK, we have a range FROM - TO. Now we can create the symbolic names. */
1386 if (!ignore_content)
1387 {
1388 now->val.str.startmb = tmp;
1389 while ((from += step) <= to)
1390 {
1391 struct charseq *seq;
1392 uint32_t wch;
1393
1394 sprintf (tmp, (base == 10 ? "%.*s%0*ld" : "%.*s%0*lX"),
1395 (int) (cp - last_str), last_str,
1396 (int) (now->val.str.lenmb - (cp - last_str)),
1397 from);
1398
1399 if (get_character (now, charmap, repertoire, &seq, &wch))
1400 goto invalid_range;
1401
1402 if (seq != NULL && seq->nbytes == 1)
1403 /* Yep, we can store information about this byte sequence. */
1404 ctype->class256_collection[seq->bytes[0]] |= class256_bit;
1405
1406 if (wch != ILLEGAL_CHAR_VALUE && class_bit != 0)
1407 /* We have the UCS4 position. */
1408 *find_idx (ctype, &ctype->class_collection,
1409 &ctype->class_collection_max,
1410 &ctype->class_collection_act, wch) |= class_bit;
1411
1412 if (handle_digits == 1)
1413 {
1414 /* We must store the digit values. */
1415 if (ctype->mbdigits_act == ctype->mbdigits_max)
1416 {
1417 ctype->mbdigits_max *= 2;
1418 ctype->mbdigits = xrealloc (ctype->mbdigits,
1419 (ctype->mbdigits_max
1420 * sizeof (char *)));
1421 ctype->wcdigits_max *= 2;
1422 ctype->wcdigits = xrealloc (ctype->wcdigits,
1423 (ctype->wcdigits_max
1424 * sizeof (uint32_t)));
1425 }
1426
1427 ctype->mbdigits[ctype->mbdigits_act++] = seq;
1428 ctype->wcdigits[ctype->wcdigits_act++] = wch;
1429 }
1430 else if (handle_digits == 2)
1431 {
1432 /* We must store the digit values. */
1433 if (ctype->outdigits_act >= 10)
1434 {
1435 lr_error (ldfile, _("\
1436%s: field `%s' does not contain exactly ten entries"),
1437 "LC_CTYPE", "outdigit");
1438 return;
1439 }
1440
1441 ctype->mboutdigits[ctype->outdigits_act] = seq;
1442 ctype->wcoutdigits[ctype->outdigits_act] = wch;
1443 ++ctype->outdigits_act;
1444 }
1445 }
1446 }
1447}
1448
1449
1450/* Ellipsis like in `<U1234>..<U2345>' or `<U1234>..(2)..<U2345>'. */
1451static void
1452charclass_ucs4_ellipsis (struct linereader *ldfile,
1453 struct locale_ctype_t *ctype,
1454 const struct charmap_t *charmap,
1455 struct repertoire_t *repertoire,
1456 struct token *now, uint32_t last_wch,
1457 unsigned long int class256_bit,
1458 unsigned long int class_bit, int ignore_content,
1459 int handle_digits, int step)
1460{
1461 if (last_wch > now->val.ucs4)
1462 {
1463 lr_error (ldfile, _("\
1464to-value <U%0*X> of range is smaller than from-value <U%0*X>"),
1465 (now->val.ucs4 | last_wch) < 65536 ? 4 : 8, now->val.ucs4,
1466 (now->val.ucs4 | last_wch) < 65536 ? 4 : 8, last_wch);
1467 return;
1468 }
1469
1470 if (!ignore_content)
1471 while ((last_wch += step) <= now->val.ucs4)
1472 {
1473 /* We have to find out whether there is a byte sequence corresponding
1474 to this UCS4 value. */
1475 struct charseq *seq;
1476 char utmp[10];
1477
1478 snprintf (utmp, sizeof (utmp), "U%08X", last_wch);
1479 seq = charmap_find_value (charmap, utmp, 9);
1480 if (seq == NULL)
1481 {
1482 snprintf (utmp, sizeof (utmp), "U%04X", last_wch);
1483 seq = charmap_find_value (charmap, utmp, 5);
1484 }
1485
1486 if (seq == NULL)
1487 /* Try looking in the repertoire map. */
1488 seq = repertoire_find_seq (repertoire, last_wch);
1489
1490 /* If this is the first time we look for this sequence create a new
1491 entry. */
1492 if (seq == NULL)
1493 {
1494 static const struct charseq negative
1495 = { .ucs4 = ILLEGAL_CHAR_VALUE };
1496
1497 /* Find the symbolic name for this UCS4 value. */
1498 if (repertoire != NULL)
1499 {
1500 const char *symbol = repertoire_find_symbol (repertoire,
1501 last_wch);
1502 uint32_t *newp = obstack_alloc (&repertoire->mem_pool,
1503 sizeof (uint32_t));
1504 *newp = last_wch;
1505
1506 if (symbol != NULL)
1507 /* We have a name, now search the multibyte value. */
1508 seq = charmap_find_value (charmap, symbol, strlen (symbol));
1509
1510 if (seq == NULL)
1511 /* We have to create a fake entry. */
1512 seq = (struct charseq *) &negative;
1513 else
1514 seq->ucs4 = last_wch;
1515
1516 insert_entry (&repertoire->seq_table, newp, sizeof (uint32_t),
1517 seq);
1518 }
1519 else
1520 /* We have to create a fake entry. */
1521 seq = (struct charseq *) &negative;
1522 }
1523
1524 /* We have a name, now search the multibyte value. */
1525 if (seq->ucs4 == last_wch && seq->nbytes == 1)
1526 /* Yep, we can store information about this byte sequence. */
1527 ctype->class256_collection[(size_t) seq->bytes[0]]
1528 |= class256_bit;
1529
1530 /* And of course we have the UCS4 position. */
1531 if (class_bit != 0)
1532 *find_idx (ctype, &ctype->class_collection,
1533 &ctype->class_collection_max,
1534 &ctype->class_collection_act, last_wch) |= class_bit;
1535
1536 if (handle_digits == 1)
1537 {
1538 /* We must store the digit values. */
1539 if (ctype->mbdigits_act == ctype->mbdigits_max)
1540 {
1541 ctype->mbdigits_max *= 2;
1542 ctype->mbdigits = xrealloc (ctype->mbdigits,
1543 (ctype->mbdigits_max
1544 * sizeof (char *)));
1545 ctype->wcdigits_max *= 2;
1546 ctype->wcdigits = xrealloc (ctype->wcdigits,
1547 (ctype->wcdigits_max
1548 * sizeof (uint32_t)));
1549 }
1550
1551 ctype->mbdigits[ctype->mbdigits_act++] = (seq->ucs4 == last_wch
1552 ? seq : NULL);
1553 ctype->wcdigits[ctype->wcdigits_act++] = last_wch;
1554 }
1555 else if (handle_digits == 2)
1556 {
1557 /* We must store the digit values. */
1558 if (ctype->outdigits_act >= 10)
1559 {
1560 lr_error (ldfile, _("\
1561%s: field `%s' does not contain exactly ten entries"),
1562 "LC_CTYPE", "outdigit");
1563 return;
1564 }
1565
1566 ctype->mboutdigits[ctype->outdigits_act] = (seq->ucs4 == last_wch
1567 ? seq : NULL);
1568 ctype->wcoutdigits[ctype->outdigits_act] = last_wch;
1569 ++ctype->outdigits_act;
1570 }
1571 }
1572}
1573
1574
1575/* Ellipsis as in `/xea/x12.../xea/x34'. */
1576static void
1577charclass_charcode_ellipsis (struct linereader *ldfile,
1578 struct locale_ctype_t *ctype,
1579 const struct charmap_t *charmap,
1580 struct repertoire_t *repertoire,
1581 struct token *now, char *last_charcode,
1582 uint32_t last_charcode_len,
1583 unsigned long int class256_bit,
1584 unsigned long int class_bit, int ignore_content,
1585 int handle_digits)
1586{
1587 /* First check whether the to-value is larger. */
1588 if (now->val.charcode.nbytes != last_charcode_len)
1589 {
1590 lr_error (ldfile, _("\
1591start and end character sequence of range must have the same length"));
1592 return;
1593 }
1594
1595 if (memcmp (last_charcode, now->val.charcode.bytes, last_charcode_len) > 0)
1596 {
1597 lr_error (ldfile, _("\
1598to-value character sequence is smaller than from-value sequence"));
1599 return;
1600 }
1601
1602 if (!ignore_content)
1603 {
1604 do
1605 {
1606 /* Increment the byte sequence value. */
1607 struct charseq *seq;
1608 uint32_t wch;
1609 int i;
1610
1611 for (i = last_charcode_len - 1; i >= 0; --i)
1612 if (++last_charcode[i] != 0)
1613 break;
1614
1615 if (last_charcode_len == 1)
1616 /* Of course we have the charcode value. */
1617 ctype->class256_collection[(size_t) last_charcode[0]]
1618 |= class256_bit;
1619
1620 /* Find the symbolic name. */
1621 seq = charmap_find_symbol (charmap, last_charcode,
1622 last_charcode_len);
1623 if (seq != NULL)
1624 {
1625 if (seq->ucs4 == UNINITIALIZED_CHAR_VALUE)
1626 seq->ucs4 = repertoire_find_value (repertoire, seq->name,
1627 strlen (seq->name));
1628 wch = seq == NULL ? ILLEGAL_CHAR_VALUE : seq->ucs4;
1629
1630 if (wch != ILLEGAL_CHAR_VALUE && class_bit != 0)
1631 *find_idx (ctype, &ctype->class_collection,
1632 &ctype->class_collection_max,
1633 &ctype->class_collection_act, wch) |= class_bit;
1634 }
1635 else
1636 wch = ILLEGAL_CHAR_VALUE;
1637
1638 if (handle_digits == 1)
1639 {
1640 /* We must store the digit values. */
1641 if (ctype->mbdigits_act == ctype->mbdigits_max)
1642 {
1643 ctype->mbdigits_max *= 2;
1644 ctype->mbdigits = xrealloc (ctype->mbdigits,
1645 (ctype->mbdigits_max
1646 * sizeof (char *)));
1647 ctype->wcdigits_max *= 2;
1648 ctype->wcdigits = xrealloc (ctype->wcdigits,
1649 (ctype->wcdigits_max
1650 * sizeof (uint32_t)));
1651 }
1652
1653 seq = xmalloc (sizeof (struct charseq) + last_charcode_len);
1654 memcpy ((char *) (seq + 1), last_charcode, last_charcode_len);
1655 seq->nbytes = last_charcode_len;
1656
1657 ctype->mbdigits[ctype->mbdigits_act++] = seq;
1658 ctype->wcdigits[ctype->wcdigits_act++] = wch;
1659 }
1660 else if (handle_digits == 2)
1661 {
1662 struct charseq *seq;
1663 /* We must store the digit values. */
1664 if (ctype->outdigits_act >= 10)
1665 {
1666 lr_error (ldfile, _("\
1667%s: field `%s' does not contain exactly ten entries"),
1668 "LC_CTYPE", "outdigit");
1669 return;
1670 }
1671
1672 seq = xmalloc (sizeof (struct charseq) + last_charcode_len);
1673 memcpy ((char *) (seq + 1), last_charcode, last_charcode_len);
1674 seq->nbytes = last_charcode_len;
1675
1676 ctype->mboutdigits[ctype->outdigits_act] = seq;
1677 ctype->wcoutdigits[ctype->outdigits_act] = wch;
1678 ++ctype->outdigits_act;
1679 }
1680 }
1681 while (memcmp (last_charcode, now->val.charcode.bytes,
1682 last_charcode_len) != 0);
1683 }
1684}
1685
1686
1687static uint32_t *
1688find_translit2 (struct locale_ctype_t *ctype, const struct charmap_t *charmap,
1689 uint32_t wch)
1690{
1691 struct translit_t *trunp = ctype->translit;
1692 struct translit_ignore_t *tirunp = ctype->translit_ignore;
1693
1694 while (trunp != NULL)
1695 {
1696 /* XXX We simplify things here. The transliterations we look
1697 for are only allowed to have one character. */
1698 if (trunp->from[0] == wch && trunp->from[1] == 0)
1699 {
1700 /* Found it. Now look for a transliteration which can be
1701 represented with the character set. */
1702 struct translit_to_t *torunp = trunp->to;
1703
1704 while (torunp != NULL)
1705 {
1706 int i;
1707
1708 for (i = 0; torunp->str[i] != 0; ++i)
1709 {
1710 char utmp[10];
1711
1712 snprintf (utmp, sizeof (utmp), "U%08X", torunp->str[i]);
1713 if (charmap_find_value (charmap, utmp, 9) == NULL)
1714 /* This character cannot be represented. */
1715 break;
1716 }
1717
1718 if (torunp->str[i] == 0)
1719 return torunp->str;
1720
1721 torunp = torunp->next;
1722 }
1723
1724 break;
1725 }
1726
1727 trunp = trunp->next;
1728 }
1729
1730 /* Check for ignored chars. */
1731 while (tirunp != NULL)
1732 {
1733 if (tirunp->from <= wch && tirunp->to >= wch)
1734 {
1735 uint32_t wi;
1736
1737 for (wi = tirunp->from; wi <= wch; wi += tirunp->step)
1738 if (wi == wch)
1739 return no_str;
1740 }
1741 }
1742
1743 /* Nothing found. */
1744 return NULL;
1745}
1746
1747
1748uint32_t *
1749find_translit (struct localedef_t *locale, const struct charmap_t *charmap,
1750 uint32_t wch)
1751{
1752 struct locale_ctype_t *ctype;
1753 uint32_t *result = NULL;
1754
1755 assert (locale != NULL);
1756 ctype = locale->categories[LC_CTYPE].ctype;
1757
1758 if (ctype == NULL)
1759 return NULL;
1760
1761 if (ctype->translit != NULL)
1762 result = find_translit2 (ctype, charmap, wch);
1763
1764 if (result == NULL)
1765 {
1766 struct translit_include_t *irunp = ctype->translit_include;
1767
1768 while (irunp != NULL && result == NULL)
1769 {
1770 result = find_translit (find_locale (CTYPE_LOCALE,
1771 irunp->copy_locale,
1772 irunp->copy_repertoire,
1773 charmap),
1774 charmap, wch);
1775 irunp = irunp->next;
1776 }
1777 }
1778
1779 return result;
1780}
1781
1782
1783/* Read one transliteration entry. */
1784static uint32_t *
1785read_widestring (struct linereader *ldfile, struct token *now,
1786 const struct charmap_t *charmap,
1787 struct repertoire_t *repertoire)
1788{
1789 uint32_t *wstr;
1790
1791 if (now->tok == tok_default_missing)
1792 /* The special name "" will denote this case. */
1793 wstr = no_str;
1794 else if (now->tok == tok_bsymbol)
1795 {
1796 /* Get the value from the repertoire. */
1797 wstr = (uint32_t *) xmalloc (2 * sizeof (uint32_t));
1798 wstr[0] = repertoire_find_value (repertoire, now->val.str.startmb,
1799 now->val.str.lenmb);
1800 if (wstr[0] == ILLEGAL_CHAR_VALUE)
1801 {
1802 /* We cannot proceed, we don't know the UCS4 value. */
1803 free (wstr);
1804 return NULL;
1805 }
1806
1807 wstr[1] = 0;
1808 }
1809 else if (now->tok == tok_ucs4)
1810 {
1811 wstr = (uint32_t *) xmalloc (2 * sizeof (uint32_t));
1812 wstr[0] = now->val.ucs4;
1813 wstr[1] = 0;
1814 }
1815 else if (now->tok == tok_charcode)
1816 {
1817 /* Argh, we have to convert to the symbol name first and then to the
1818 UCS4 value. */
1819 struct charseq *seq = charmap_find_symbol (charmap,
1820 now->val.str.startmb,
1821 now->val.str.lenmb);
1822 if (seq == NULL)
1823 /* Cannot find the UCS4 value. */
1824 return NULL;
1825
1826 if (seq->ucs4 == UNINITIALIZED_CHAR_VALUE)
1827 seq->ucs4 = repertoire_find_value (repertoire, seq->name,
1828 strlen (seq->name));
1829 if (seq->ucs4 == ILLEGAL_CHAR_VALUE)
1830 /* We cannot proceed, we don't know the UCS4 value. */
1831 return NULL;
1832
1833 wstr = (uint32_t *) xmalloc (2 * sizeof (uint32_t));
1834 wstr[0] = seq->ucs4;
1835 wstr[1] = 0;
1836 }
1837 else if (now->tok == tok_string)
1838 {
1839 wstr = now->val.str.startwc;
1840 if (wstr == NULL || wstr[0] == 0)
1841 return NULL;
1842 }
1843 else
1844 {
1845 if (now->tok != tok_eol && now->tok != tok_eof)
1846 lr_ignore_rest (ldfile, 0);
1847 SYNTAX_ERROR (_("%s: syntax error"), "LC_CTYPE");
1848 return (uint32_t *) -1l;
1849 }
1850
1851 return wstr;
1852}
1853
1854
1855static void
1856read_translit_entry (struct linereader *ldfile, struct locale_ctype_t *ctype,
1857 struct token *now, const struct charmap_t *charmap,
1858 struct repertoire_t *repertoire)
1859{
1860 uint32_t *from_wstr = read_widestring (ldfile, now, charmap, repertoire);
1861 struct translit_t *result;
1862 struct translit_to_t **top;
1863 struct obstack *ob = &ctype->mempool;
1864 int first;
1865 int ignore;
1866
1867 if (from_wstr == NULL)
1868 /* There is no valid from string. */
1869 return;
1870
1871 result = (struct translit_t *) obstack_alloc (ob,
1872 sizeof (struct translit_t));
1873 result->from = from_wstr;
1874 result->fname = ldfile->fname;
1875 result->lineno = ldfile->lineno;
1876 result->next = NULL;
1877 result->to = NULL;
1878 top = &result->to;
1879 first = 1;
1880 ignore = 0;
1881
1882 while (1)
1883 {
1884 uint32_t *to_wstr;
1885
1886 /* Next we have one or more transliterations. They are
1887 separated by semicolons. */
1888 now = lr_token (ldfile, charmap, NULL, repertoire, verbose);
1889
1890 if (!first && (now->tok == tok_semicolon || now->tok == tok_eol))
1891 {
1892 /* One string read. */
1893 const uint32_t zero = 0;
1894
1895 if (!ignore)
1896 {
1897 obstack_grow (ob, &zero, 4);
1898 to_wstr = obstack_finish (ob);
1899
1900 *top = obstack_alloc (ob, sizeof (struct translit_to_t));
1901 (*top)->str = to_wstr;
1902 (*top)->next = NULL;
1903 }
1904
1905 if (now->tok == tok_eol)
1906 {
1907 result->next = ctype->translit;
1908 ctype->translit = result;
1909 return;
1910 }
1911
1912 if (!ignore)
1913 top = &(*top)->next;
1914 ignore = 0;
1915 }
1916 else
1917 {
1918 to_wstr = read_widestring (ldfile, now, charmap, repertoire);
1919 if (to_wstr == (uint32_t *) -1l)
1920 {
1921 /* An error occurred. */
1922 obstack_free (ob, result);
1923 return;
1924 }
1925
1926 if (to_wstr == NULL)
1927 ignore = 1;
1928 else
1929 /* This value is usable. */
1930 obstack_grow (ob, to_wstr, wcslen ((wchar_t *) to_wstr) * 4);
1931
1932 first = 0;
1933 }
1934 }
1935}
1936
1937
1938static void
1939read_translit_ignore_entry (struct linereader *ldfile,
1940 struct locale_ctype_t *ctype,
1941 const struct charmap_t *charmap,
1942 struct repertoire_t *repertoire)
1943{
1944 /* We expect a semicolon-separated list of characters we ignore. We are
1945 only interested in the wide character definitions. These must be
1946 single characters, possibly defining a range when an ellipsis is used. */
1947 while (1)
1948 {
1949 struct token *now = lr_token (ldfile, charmap, NULL, repertoire,
1950 verbose);
1951 struct translit_ignore_t *newp;
1952 uint32_t from;
1953
1954 if (now->tok == tok_eol || now->tok == tok_eof)
1955 {
1956 lr_error (ldfile,
1957 _("premature end of `translit_ignore' definition"));
1958 return;
1959 }
1960
1961 if (now->tok != tok_bsymbol && now->tok != tok_ucs4)
1962 {
1963 lr_error (ldfile, _("syntax error"));
1964 lr_ignore_rest (ldfile, 0);
1965 return;
1966 }
1967
1968 if (now->tok == tok_ucs4)
1969 from = now->val.ucs4;
1970 else
1971 /* Try to get the value. */
1972 from = repertoire_find_value (repertoire, now->val.str.startmb,
1973 now->val.str.lenmb);
1974
1975 if (from == ILLEGAL_CHAR_VALUE)
1976 {
1977 lr_error (ldfile, "invalid character name");
1978 newp = NULL;
1979 }
1980 else
1981 {
1982 newp = (struct translit_ignore_t *)
1983 obstack_alloc (&ctype->mempool, sizeof (struct translit_ignore_t));
1984 newp->from = from;
1985 newp->to = from;
1986 newp->step = 1;
1987
1988 newp->next = ctype->translit_ignore;
1989 ctype->translit_ignore = newp;
1990 }
1991
1992 /* Now we expect either a semicolon, an ellipsis, or the end of the
1993 line. */
1994 now = lr_token (ldfile, charmap, NULL, repertoire, verbose);
1995
1996 if (now->tok == tok_ellipsis2 || now->tok == tok_ellipsis2_2)
1997 {
1998 /* XXX Should we bother implementing `....'? `...' certainly
1999 will not be implemented. */
2000 uint32_t to;
2001 int step = now->tok == tok_ellipsis2_2 ? 2 : 1;
2002
2003 now = lr_token (ldfile, charmap, NULL, repertoire, verbose);
2004
2005 if (now->tok == tok_eol || now->tok == tok_eof)
2006 {
2007 lr_error (ldfile,
2008 _("premature end of `translit_ignore' definition"));
2009 return;
2010 }
2011
2012 if (now->tok != tok_bsymbol && now->tok != tok_ucs4)
2013 {
2014 lr_error (ldfile, _("syntax error"));
2015 lr_ignore_rest (ldfile, 0);
2016 return;
2017 }
2018
2019 if (now->tok == tok_ucs4)
2020 to = now->val.ucs4;
2021 else
2022 /* Try to get the value. */
2023 to = repertoire_find_value (repertoire, now->val.str.startmb,
2024 now->val.str.lenmb);
2025
2026 if (to == ILLEGAL_CHAR_VALUE)
2027 lr_error (ldfile, "invalid character name");
2028 else
2029 {
2030 /* Make sure the `to'-value is larger. */
2031 if (to >= from)
2032 {
2033 newp->to = to;
2034 newp->step = step;
2035 }
2036 else
2037 lr_error (ldfile, _("\
2038to-value <U%0*X> of range is smaller than from-value <U%0*X>"),
2039 (to | from) < 65536 ? 4 : 8, to,
2040 (to | from) < 65536 ? 4 : 8, from);
2041 }
2042
2043 /* And the next token. */
2044 now = lr_token (ldfile, charmap, NULL, repertoire, verbose);
2045 }
2046
2047 if (now->tok == tok_eol || now->tok == tok_eof)
2048 /* We are done. */
2049 return;
2050
2051 if (now->tok == tok_semicolon)
2052 /* Next round. */
2053 continue;
2054
2055 /* If we come here something is wrong. */
2056 lr_error (ldfile, _("syntax error"));
2057 lr_ignore_rest (ldfile, 0);
2058 return;
2059 }
2060}
2061
2062
2063/* The parser for the LC_CTYPE section of the locale definition. */
2064void
2065ctype_read (struct linereader *ldfile, struct localedef_t *result,
2066 const struct charmap_t *charmap, const char *repertoire_name,
2067 int ignore_content)
2068{
2069 struct repertoire_t *repertoire = NULL;
2070 struct locale_ctype_t *ctype;
2071 struct token *now;
2072 enum token_t nowtok;
2073 size_t cnt;
2074 uint32_t last_wch = 0;
2075 enum token_t last_token;
2076 enum token_t ellipsis_token;
2077 int step;
2078 char last_charcode[16];
2079 size_t last_charcode_len = 0;
2080 const char *last_str = NULL;
2081 int mapidx;
2082 struct localedef_t *copy_locale = NULL;
2083
2084 /* Get the repertoire we have to use. */
2085 if (repertoire_name != NULL)
2086 repertoire = repertoire_read (repertoire_name);
2087
2088 /* The rest of the line containing `LC_CTYPE' must be free. */
2089 lr_ignore_rest (ldfile, 1);
2090
2091
2092 do
2093 {
2094 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2095 nowtok = now->tok;
2096 }
2097 while (nowtok == tok_eol);
2098
2099 /* If we see `copy' now we are almost done. */
2100 if (nowtok == tok_copy)
2101 {
2102 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2103 if (now->tok != tok_string)
2104 {
2105 SYNTAX_ERROR (_("%s: syntax error"), "LC_CTYPE");
2106
2107 skip_category:
2108 do
2109 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2110 while (now->tok != tok_eof && now->tok != tok_end);
2111
2112 if (now->tok != tok_eof
2113 || (now = lr_token (ldfile, charmap, NULL, NULL, verbose),
2114 now->tok == tok_eof))
2115 lr_error (ldfile, _("%s: premature end of file"), "LC_CTYPE");
2116 else if (now->tok != tok_lc_ctype)
2117 {
2118 lr_error (ldfile, _("\
2119%1$s: definition does not end with `END %1$s'"), "LC_CTYPE");
2120 lr_ignore_rest (ldfile, 0);
2121 }
2122 else
2123 lr_ignore_rest (ldfile, 1);
2124
2125 return;
2126 }
2127
2128 if (! ignore_content)
2129 {
2130 /* Get the locale definition. */
2131 copy_locale = load_locale (LC_CTYPE, now->val.str.startmb,
2132 repertoire_name, charmap, NULL);
2133 if ((copy_locale->avail & CTYPE_LOCALE) == 0)
2134 {
2135 /* Not yet loaded. So do it now. */
2136 if (locfile_read (copy_locale, charmap) != 0)
2137 goto skip_category;
2138 }
2139
2140 if (copy_locale->categories[LC_CTYPE].ctype == NULL)
2141 return;
2142 }
2143
2144 lr_ignore_rest (ldfile, 1);
2145
2146 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2147 nowtok = now->tok;
2148 }
2149
2150 /* Prepare the data structures. */
2151 ctype_startup (ldfile, result, charmap, copy_locale, ignore_content);
2152 ctype = result->categories[LC_CTYPE].ctype;
2153
2154 /* Remember the repertoire we use. */
2155 if (!ignore_content)
2156 ctype->repertoire = repertoire;
2157
2158 while (1)
2159 {
2160 unsigned long int class_bit = 0;
2161 unsigned long int class256_bit = 0;
2162 int handle_digits = 0;
2163
2164 /* Of course we don't proceed beyond the end of file. */
2165 if (nowtok == tok_eof)
2166 break;
2167
2168 /* Ingore empty lines. */
2169 if (nowtok == tok_eol)
2170 {
2171 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2172 nowtok = now->tok;
2173 continue;
2174 }
2175
2176 switch (nowtok)
2177 {
2178 case tok_charclass:
2179 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2180 while (now->tok == tok_ident || now->tok == tok_string)
2181 {
2182 ctype_class_new (ldfile, ctype, now->val.str.startmb);
2183 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2184 if (now->tok != tok_semicolon)
2185 break;
2186 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2187 }
2188 if (now->tok != tok_eol)
2189 SYNTAX_ERROR (_("\
2190%s: syntax error in definition of new character class"), "LC_CTYPE");
2191 break;
2192
2193 case tok_charconv:
2194 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2195 while (now->tok == tok_ident || now->tok == tok_string)
2196 {
2197 ctype_map_new (ldfile, ctype, now->val.str.startmb, charmap);
2198 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2199 if (now->tok != tok_semicolon)
2200 break;
2201 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2202 }
2203 if (now->tok != tok_eol)
2204 SYNTAX_ERROR (_("\
2205%s: syntax error in definition of new character map"), "LC_CTYPE");
2206 break;
2207
2208 case tok_class:
2209 /* Ignore the rest of the line if we don't need the input of
2210 this line. */
2211 if (ignore_content)
2212 {
2213 lr_ignore_rest (ldfile, 0);
2214 break;
2215 }
2216
2217 /* We simply forget the `class' keyword and use the following
2218 operand to determine the bit. */
2219 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2220 if (now->tok == tok_ident || now->tok == tok_string)
2221 {
2222 /* Check whether it is one of the already defined class names. */
2223 for (cnt = 0; cnt < ctype->nr_charclass; ++cnt)
2224 if (strcmp (ctype->classnames[cnt], now->val.str.startmb) == 0)
2225 break;
2226 if (cnt >= ctype->nr_charclass)
2227 {
2228 /* OK, it's a new class. */
2229 ctype_class_new (ldfile, ctype, now->val.str.startmb);
2230
2231 class_bit = _ISwbit (ctype->nr_charclass - 1);
2232 }
2233 else
2234 {
2235 class_bit = _ISwbit (cnt);
2236
2237 free (now->val.str.startmb);
2238 }
2239 }
2240 else if (now->tok == tok_digit)
2241 goto handle_tok_digit;
2242 else if (now->tok < tok_upper || now->tok > tok_blank)
2243 goto err_label;
2244 else
2245 {
2246 class_bit = BITw (now->tok);
2247 class256_bit = BIT (now->tok);
2248 }
2249
2250 /* The next character must be a semicolon. */
2251 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2252 if (now->tok != tok_semicolon)
2253 goto err_label;
2254 goto read_charclass;
2255
2256 case tok_upper:
2257 case tok_lower:
2258 case tok_alpha:
2259 case tok_alnum:
2260 case tok_space:
2261 case tok_cntrl:
2262 case tok_punct:
2263 case tok_graph:
2264 case tok_print:
2265 case tok_xdigit:
2266 case tok_blank:
2267 /* Ignore the rest of the line if we don't need the input of
2268 this line. */
2269 if (ignore_content)
2270 {
2271 lr_ignore_rest (ldfile, 0);
2272 break;
2273 }
2274
2275 class_bit = BITw (now->tok);
2276 class256_bit = BIT (now->tok);
2277 handle_digits = 0;
2278 read_charclass:
2279 ctype->class_done |= class_bit;
2280 last_token = tok_none;
2281 ellipsis_token = tok_none;
2282 step = 1;
2283 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2284 while (now->tok != tok_eol && now->tok != tok_eof)
2285 {
2286 uint32_t wch;
2287 struct charseq *seq;
2288
2289 if (ellipsis_token == tok_none)
2290 {
2291 if (get_character (now, charmap, repertoire, &seq, &wch))
2292 goto err_label;
2293
2294 if (!ignore_content && seq != NULL && seq->nbytes == 1)
2295 /* Yep, we can store information about this byte
2296 sequence. */
2297 ctype->class256_collection[seq->bytes[0]] |= class256_bit;
2298
2299 if (!ignore_content && wch != ILLEGAL_CHAR_VALUE
2300 && class_bit != 0)
2301 /* We have the UCS4 position. */
2302 *find_idx (ctype, &ctype->class_collection,
2303 &ctype->class_collection_max,
2304 &ctype->class_collection_act, wch) |= class_bit;
2305
2306 last_token = now->tok;
2307 /* Terminate the string. */
2308 if (last_token == tok_bsymbol)
2309 {
2310 now->val.str.startmb[now->val.str.lenmb] = '\0';
2311 last_str = now->val.str.startmb;
2312 }
2313 else
2314 last_str = NULL;
2315 last_wch = wch;
2316 memcpy (last_charcode, now->val.charcode.bytes, 16);
2317 last_charcode_len = now->val.charcode.nbytes;
2318
2319 if (!ignore_content && handle_digits == 1)
2320 {
2321 /* We must store the digit values. */
2322 if (ctype->mbdigits_act == ctype->mbdigits_max)
2323 {
2324 ctype->mbdigits_max += 10;
2325 ctype->mbdigits = xrealloc (ctype->mbdigits,
2326 (ctype->mbdigits_max
2327 * sizeof (char *)));
2328 ctype->wcdigits_max += 10;
2329 ctype->wcdigits = xrealloc (ctype->wcdigits,
2330 (ctype->wcdigits_max
2331 * sizeof (uint32_t)));
2332 }
2333
2334 ctype->mbdigits[ctype->mbdigits_act++] = seq;
2335 ctype->wcdigits[ctype->wcdigits_act++] = wch;
2336 }
2337 else if (!ignore_content && handle_digits == 2)
2338 {
2339 /* We must store the digit values. */
2340 if (ctype->outdigits_act >= 10)
2341 {
2342 lr_error (ldfile, _("\
2343%s: field `%s' does not contain exactly ten entries"),
2344 "LC_CTYPE", "outdigit");
2345 lr_ignore_rest (ldfile, 0);
2346 break;
2347 }
2348
2349 ctype->mboutdigits[ctype->outdigits_act] = seq;
2350 ctype->wcoutdigits[ctype->outdigits_act] = wch;
2351 ++ctype->outdigits_act;
2352 }
2353 }
2354 else
2355 {
2356 /* Now it gets complicated. We have to resolve the
2357 ellipsis problem. First we must distinguish between
2358 the different kind of ellipsis and this must match the
2359 tokens we have seen. */
2360 assert (last_token != tok_none);
2361
2362 if (last_token != now->tok)
2363 {
2364 lr_error (ldfile, _("\
2365ellipsis range must be marked by two operands of same type"));
2366 lr_ignore_rest (ldfile, 0);
2367 break;
2368 }
2369
2370 if (last_token == tok_bsymbol)
2371 {
2372 if (ellipsis_token == tok_ellipsis3)
2373 lr_error (ldfile, _("with symbolic name range values \
2374the absolute ellipsis `...' must not be used"));
2375
2376 charclass_symbolic_ellipsis (ldfile, ctype, charmap,
2377 repertoire, now, last_str,
2378 class256_bit, class_bit,
2379 (ellipsis_token
2380 == tok_ellipsis4
2381 ? 10 : 16),
2382 ignore_content,
2383 handle_digits, step);
2384 }
2385 else if (last_token == tok_ucs4)
2386 {
2387 if (ellipsis_token != tok_ellipsis2)
2388 lr_error (ldfile, _("\
2389with UCS range values one must use the hexadecimal symbolic ellipsis `..'"));
2390
2391 charclass_ucs4_ellipsis (ldfile, ctype, charmap,
2392 repertoire, now, last_wch,
2393 class256_bit, class_bit,
2394 ignore_content, handle_digits,
2395 step);
2396 }
2397 else
2398 {
2399 assert (last_token == tok_charcode);
2400
2401 if (ellipsis_token != tok_ellipsis3)
2402 lr_error (ldfile, _("\
2403with character code range values one must use the absolute ellipsis `...'"));
2404
2405 charclass_charcode_ellipsis (ldfile, ctype, charmap,
2406 repertoire, now,
2407 last_charcode,
2408 last_charcode_len,
2409 class256_bit, class_bit,
2410 ignore_content,
2411 handle_digits);
2412 }
2413
2414 /* Now we have used the last value. */
2415 last_token = tok_none;
2416 }
2417
2418 /* Next we expect a semicolon or the end of the line. */
2419 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2420 if (now->tok == tok_eol || now->tok == tok_eof)
2421 break;
2422
2423 if (last_token != tok_none
2424 && now->tok >= tok_ellipsis2 && now->tok <= tok_ellipsis4_2)
2425 {
2426 if (now->tok == tok_ellipsis2_2)
2427 {
2428 now->tok = tok_ellipsis2;
2429 step = 2;
2430 }
2431 else if (now->tok == tok_ellipsis4_2)
2432 {
2433 now->tok = tok_ellipsis4;
2434 step = 2;
2435 }
2436
2437 ellipsis_token = now->tok;
2438
2439 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2440 continue;
2441 }
2442
2443 if (now->tok != tok_semicolon)
2444 goto err_label;
2445
2446 /* And get the next character. */
2447 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2448
2449 ellipsis_token = tok_none;
2450 step = 1;
2451 }
2452 break;
2453
2454 case tok_digit:
2455 /* Ignore the rest of the line if we don't need the input of
2456 this line. */
2457 if (ignore_content)
2458 {
2459 lr_ignore_rest (ldfile, 0);
2460 break;
2461 }
2462
2463 handle_tok_digit:
2464 class_bit = _ISwdigit;
2465 class256_bit = _ISdigit;
2466 handle_digits = 1;
2467 goto read_charclass;
2468
2469 case tok_outdigit:
2470 /* Ignore the rest of the line if we don't need the input of
2471 this line. */
2472 if (ignore_content)
2473 {
2474 lr_ignore_rest (ldfile, 0);
2475 break;
2476 }
2477
2478 if (ctype->outdigits_act != 0)
2479 lr_error (ldfile, _("\
2480%s: field `%s' declared more than once"),
2481 "LC_CTYPE", "outdigit");
2482 class_bit = 0;
2483 class256_bit = 0;
2484 handle_digits = 2;
2485 goto read_charclass;
2486
2487 case tok_toupper:
2488 /* Ignore the rest of the line if we don't need the input of
2489 this line. */
2490 if (ignore_content)
2491 {
2492 lr_ignore_rest (ldfile, 0);
2493 break;
2494 }
2495
2496 mapidx = 0;
2497 goto read_mapping;
2498
2499 case tok_tolower:
2500 /* Ignore the rest of the line if we don't need the input of
2501 this line. */
2502 if (ignore_content)
2503 {
2504 lr_ignore_rest (ldfile, 0);
2505 break;
2506 }
2507
2508 mapidx = 1;
2509 goto read_mapping;
2510
2511 case tok_map:
2512 /* Ignore the rest of the line if we don't need the input of
2513 this line. */
2514 if (ignore_content)
2515 {
2516 lr_ignore_rest (ldfile, 0);
2517 break;
2518 }
2519
2520 /* We simply forget the `map' keyword and use the following
2521 operand to determine the mapping. */
2522 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2523 if (now->tok == tok_ident || now->tok == tok_string)
2524 {
2525 size_t cnt;
2526
2527 for (cnt = 2; cnt < ctype->map_collection_nr; ++cnt)
2528 if (strcmp (now->val.str.startmb, ctype->mapnames[cnt]) == 0)
2529 break;
2530
2531 if (cnt < ctype->map_collection_nr)
2532 free (now->val.str.startmb);
2533 else
2534 /* OK, it's a new map. */
2535 ctype_map_new (ldfile, ctype, now->val.str.startmb, charmap);
2536
2537 mapidx = cnt;
2538 }
2539 else if (now->tok < tok_toupper || now->tok > tok_tolower)
2540 goto err_label;
2541 else
2542 mapidx = now->tok - tok_toupper;
2543
2544 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2545 /* This better should be a semicolon. */
2546 if (now->tok != tok_semicolon)
2547 goto err_label;
2548
2549 read_mapping:
2550 /* Test whether this mapping was already defined. */
2551 if (ctype->tomap_done[mapidx])
2552 {
2553 lr_error (ldfile, _("duplicated definition for mapping `%s'"),
2554 ctype->mapnames[mapidx]);
2555 lr_ignore_rest (ldfile, 0);
2556 break;
2557 }
2558 ctype->tomap_done[mapidx] = 1;
2559
2560 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2561 while (now->tok != tok_eol && now->tok != tok_eof)
2562 {
2563 struct charseq *from_seq;
2564 uint32_t from_wch;
2565 struct charseq *to_seq;
2566 uint32_t to_wch;
2567
2568 /* Every pair starts with an opening brace. */
2569 if (now->tok != tok_open_brace)
2570 goto err_label;
2571
2572 /* Next comes the from-value. */
2573 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2574 if (get_character (now, charmap, repertoire, &from_seq,
2575 &from_wch) != 0)
2576 goto err_label;
2577
2578 /* The next is a comma. */
2579 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2580 if (now->tok != tok_comma)
2581 goto err_label;
2582
2583 /* And the other value. */
2584 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2585 if (get_character (now, charmap, repertoire, &to_seq,
2586 &to_wch) != 0)
2587 goto err_label;
2588
2589 /* And the last thing is the closing brace. */
2590 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2591 if (now->tok != tok_close_brace)
2592 goto err_label;
2593
2594 if (!ignore_content)
2595 {
2596 /* Check whether the mapping converts from an ASCII value
2597 to a non-ASCII value. */
2598 if (from_seq != NULL && from_seq->nbytes == 1
2599 && isascii (from_seq->bytes[0])
2600 && to_seq != NULL && (to_seq->nbytes != 1
2601 || !isascii (to_seq->bytes[0])))
2602 ctype->to_nonascii = 1;
2603
2604 if (mapidx < 2 && from_seq != NULL && to_seq != NULL
2605 && from_seq->nbytes == 1 && to_seq->nbytes == 1)
2606 /* We can use this value. */
2607 ctype->map256_collection[mapidx][from_seq->bytes[0]]
2608 = to_seq->bytes[0];
2609
2610 if (from_wch != ILLEGAL_CHAR_VALUE
2611 && to_wch != ILLEGAL_CHAR_VALUE)
2612 /* Both correct values. */
2613 *find_idx (ctype, &ctype->map_collection[mapidx],
2614 &ctype->map_collection_max[mapidx],
2615 &ctype->map_collection_act[mapidx],
2616 from_wch) = to_wch;
2617 }
2618
2619 /* Now comes a semicolon or the end of the line/file. */
2620 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2621 if (now->tok == tok_semicolon)
2622 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2623 }
2624 break;
2625
2626 case tok_translit_start:
2627 /* Ignore the entire translit section with its peculiar syntax
2628 if we don't need the input. */
2629 if (ignore_content)
2630 {
2631 do
2632 {
2633 lr_ignore_rest (ldfile, 0);
2634 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2635 }
2636 while (now->tok != tok_translit_end && now->tok != tok_eof);
2637
2638 if (now->tok == tok_eof)
2639 lr_error (ldfile, _(\
2640"%s: `translit_start' section does not end with `translit_end'"),
2641 "LC_CTYPE");
2642
2643 break;
2644 }
2645
2646 /* The rest of the line better should be empty. */
2647 lr_ignore_rest (ldfile, 1);
2648
2649 /* We count here the number of allocated entries in the `translit'
2650 array. */
2651 cnt = 0;
2652
2653 ldfile->translate_strings = 1;
2654 ldfile->return_widestr = 1;
2655
2656 /* We proceed until we see the `translit_end' token. */
2657 while (now = lr_token (ldfile, charmap, NULL, repertoire, verbose),
2658 now->tok != tok_translit_end && now->tok != tok_eof)
2659 {
2660 if (now->tok == tok_eol)
2661 /* Ignore empty lines. */
2662 continue;
2663
2664 if (now->tok == tok_include)
2665 {
2666 /* We have to include locale. */
2667 const char *locale_name;
2668 const char *repertoire_name;
2669 struct translit_include_t *include_stmt, **include_ptr;
2670
2671 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2672 /* This should be a string or an identifier. In any
2673 case something to name a locale. */
2674 if (now->tok != tok_string && now->tok != tok_ident)
2675 {
2676 translit_syntax:
2677 lr_error (ldfile, _("%s: syntax error"), "LC_CTYPE");
2678 lr_ignore_rest (ldfile, 0);
2679 continue;
2680 }
2681 locale_name = now->val.str.startmb;
2682
2683 /* Next should be a semicolon. */
2684 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2685 if (now->tok != tok_semicolon)
2686 goto translit_syntax;
2687
2688 /* Now the repertoire name. */
2689 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2690 if ((now->tok != tok_string && now->tok != tok_ident)
2691 || now->val.str.startmb == NULL)
2692 goto translit_syntax;
2693 repertoire_name = now->val.str.startmb;
2694 if (repertoire_name[0] == '\0')
2695 /* Ignore the empty string. */
2696 repertoire_name = NULL;
2697
2698 /* Save the include statement for later processing. */
2699 include_stmt = (struct translit_include_t *)
2700 xmalloc (sizeof (struct translit_include_t));
2701 include_stmt->copy_locale = locale_name;
2702 include_stmt->copy_repertoire = repertoire_name;
2703 include_stmt->next = NULL;
2704
2705 include_ptr = &ctype->translit_include;
2706 while (*include_ptr != NULL)
2707 include_ptr = &(*include_ptr)->next;
2708 *include_ptr = include_stmt;
2709
2710 /* The rest of the line must be empty. */
2711 lr_ignore_rest (ldfile, 1);
2712
2713 /* Make sure the locale is read. */
2714 add_to_readlist (LC_CTYPE, locale_name, repertoire_name,
2715 1, NULL);
2716 continue;
2717 }
2718 else if (now->tok == tok_default_missing)
2719 {
2720 uint32_t *wstr;
2721
2722 while (1)
2723 {
2724 /* We expect a single character or string as the
2725 argument. */
2726 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2727 wstr = read_widestring (ldfile, now, charmap,
2728 repertoire);
2729
2730 if (wstr != NULL)
2731 {
2732 if (ctype->default_missing != NULL)
2733 {
2734 lr_error (ldfile, _("\
2735%s: duplicate `default_missing' definition"), "LC_CTYPE");
2736 record_error_at_line (0, 0,
2737 ctype->default_missing_file,
2738 ctype->default_missing_lineno,
2739 _("\
2740previous definition was here"));
2741 }
2742 else
2743 {
2744 ctype->default_missing = wstr;
2745 ctype->default_missing_file = ldfile->fname;
2746 ctype->default_missing_lineno = ldfile->lineno;
2747 }
2748 /* We can have more entries, ignore them. */
2749 lr_ignore_rest (ldfile, 0);
2750 break;
2751 }
2752 else if (wstr == (uint32_t *) -1l)
2753 /* This was a syntax error. */
2754 break;
2755
2756 /* Maybe there is another replacement we can use. */
2757 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2758 if (now->tok == tok_eol || now->tok == tok_eof)
2759 {
2760 /* Nothing found. We tell the user. */
2761 lr_error (ldfile, _("\
2762%s: no representable `default_missing' definition found"), "LC_CTYPE");
2763 break;
2764 }
2765 if (now->tok != tok_semicolon)
2766 goto translit_syntax;
2767 }
2768
2769 continue;
2770 }
2771 else if (now->tok == tok_translit_ignore)
2772 {
2773 read_translit_ignore_entry (ldfile, ctype, charmap,
2774 repertoire);
2775 continue;
2776 }
2777
2778 read_translit_entry (ldfile, ctype, now, charmap, repertoire);
2779 }
2780 ldfile->return_widestr = 0;
2781
2782 if (now->tok == tok_eof)
2783 lr_error (ldfile, _(\
2784"%s: `translit_start' section does not end with `translit_end'"),
2785 "LC_CTYPE");
2786
2787 break;
2788
2789 case tok_ident:
2790 /* Ignore the rest of the line if we don't need the input of
2791 this line. */
2792 if (ignore_content)
2793 {
2794 lr_ignore_rest (ldfile, 0);
2795 break;
2796 }
2797
2798 /* This could mean one of several things. First test whether
2799 it's a character class name. */
2800 for (cnt = 0; cnt < ctype->nr_charclass; ++cnt)
2801 if (strcmp (now->val.str.startmb, ctype->classnames[cnt]) == 0)
2802 break;
2803 if (cnt < ctype->nr_charclass)
2804 {
2805 class_bit = _ISwbit (cnt);
2806 class256_bit = cnt <= 11 ? _ISbit (cnt) : 0;
2807 free (now->val.str.startmb);
2808 goto read_charclass;
2809 }
2810 for (cnt = 0; cnt < ctype->map_collection_nr; ++cnt)
2811 if (strcmp (now->val.str.startmb, ctype->mapnames[cnt]) == 0)
2812 break;
2813 if (cnt < ctype->map_collection_nr)
2814 {
2815 mapidx = cnt;
2816 free (now->val.str.startmb);
2817 goto read_mapping;
2818 }
2819 break;
2820
2821 case tok_end:
2822 /* Next we assume `LC_CTYPE'. */
2823 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2824 if (now->tok == tok_eof)
2825 break;
2826 if (now->tok == tok_eol)
2827 lr_error (ldfile, _("%s: incomplete `END' line"),
2828 "LC_CTYPE");
2829 else if (now->tok != tok_lc_ctype)
2830 lr_error (ldfile, _("\
2831%1$s: definition does not end with `END %1$s'"), "LC_CTYPE");
2832 lr_ignore_rest (ldfile, now->tok == tok_lc_ctype);
2833 return;
2834
2835 default:
2836 err_label:
2837 if (now->tok != tok_eof)
2838 SYNTAX_ERROR (_("%s: syntax error"), "LC_CTYPE");
2839 }
2840
2841 /* Prepare for the next round. */
2842 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2843 nowtok = now->tok;
2844 }
2845
2846 /* When we come here we reached the end of the file. */
2847 lr_error (ldfile, _("%s: premature end of file"), "LC_CTYPE");
2848}
2849
2850
2851/* Subroutine of set_class_defaults, below. */
2852static void
2853set_one_default (struct locale_ctype_t *ctype,
2854 const struct charmap_t *charmap,
2855 int bitpos, int from, int to)
2856{
2857 char tmp[2];
2858 int ch;
2859 int bit = _ISbit (bitpos);
2860 int bitw = _ISwbit (bitpos);
2861 /* Define string. */
2862 strcpy (tmp, "?");
2863
2864 for (ch = from; ch <= to; ++ch)
2865 {
2866 struct charseq *seq;
2867 tmp[0] = ch;
2868
2869 seq = charmap_find_value (charmap, tmp, 1);
2870 if (seq == NULL)
2871 {
2872 char buf[10];
2873 sprintf (buf, "U%08X", ch);
2874 seq = charmap_find_value (charmap, buf, 9);
2875 }
2876 if (seq == NULL)
2877 {
2878 record_error (0, 0, _("\
2879%s: character `%s' not defined while needed as default value"),
2880 "LC_CTYPE", tmp);
2881 }
2882 else if (seq->nbytes != 1)
2883 record_error (0, 0, _("\
2884%s: character `%s' in charmap not representable with one byte"),
2885 "LC_CTYPE", tmp);
2886 else
2887 ctype->class256_collection[seq->bytes[0]] |= bit;
2888
2889 /* No need to search here, the ASCII value is also the Unicode
2890 value. */
2891 ELEM (ctype, class_collection, , ch) |= bitw;
2892 }
2893}
2894
2895static void
2896set_class_defaults (struct locale_ctype_t *ctype,
2897 const struct charmap_t *charmap,
2898 struct repertoire_t *repertoire)
2899{
2900#define set_default(bitpos, from, to) \
2901 set_one_default (ctype, charmap, bitpos, from, to)
2902
2903 /* This function defines the default values for the classes and conversions
2904 according to POSIX.2 2.5.2.1.
2905 It may seem that the order of these if-blocks is arbitrary but it is NOT.
2906 Don't move them unless you know what you do! */
2907
2908 /* Set default values if keyword was not present. */
2909 if ((ctype->class_done & BITw (tok_upper)) == 0)
2910 /* "If this keyword [upper] is not specified, the uppercase letters
2911 `A' through `Z', ..., shall automatically belong to this class,
2912 with implementation defined character values." [P1003.2, 2.5.2.1] */
2913 set_default (BITPOS (tok_upper), 'A', 'Z');
2914
2915 if ((ctype->class_done & BITw (tok_lower)) == 0)
2916 /* "If this keyword [lower] is not specified, the lowercase letters
2917 `a' through `z', ..., shall automatically belong to this class,
2918 with implementation defined character values." [P1003.2, 2.5.2.1] */
2919 set_default (BITPOS (tok_lower), 'a', 'z');
2920
2921 if ((ctype->class_done & BITw (tok_alpha)) == 0)
2922 {
2923 /* Table 2-6 in P1003.2 says that characters in class `upper' or
2924 class `lower' *must* be in class `alpha'. */
2925 unsigned long int mask = BIT (tok_upper) | BIT (tok_lower);
2926 unsigned long int maskw = BITw (tok_upper) | BITw (tok_lower);
2927
2928 for (size_t cnt = 0; cnt < 256; ++cnt)
2929 if ((ctype->class256_collection[cnt] & mask) != 0)
2930 ctype->class256_collection[cnt] |= BIT (tok_alpha);
2931
2932 for (size_t cnt = 0; cnt < ctype->class_collection_act; ++cnt)
2933 if ((ctype->class_collection[cnt] & maskw) != 0)
2934 ctype->class_collection[cnt] |= BITw (tok_alpha);
2935 }
2936
2937 if ((ctype->class_done & BITw (tok_digit)) == 0)
2938 /* "If this keyword [digit] is not specified, the digits `0' through
2939 `9', ..., shall automatically belong to this class, with
2940 implementation-defined character values." [P1003.2, 2.5.2.1] */
2941 set_default (BITPOS (tok_digit), '0', '9');
2942
2943 /* "Only characters specified for the `alpha' and `digit' keyword
2944 shall be specified. Characters specified for the keyword `alpha'
2945 and `digit' are automatically included in this class. */
2946 {
2947 unsigned long int mask = BIT (tok_alpha) | BIT (tok_digit);
2948 unsigned long int maskw = BITw (tok_alpha) | BITw (tok_digit);
2949
2950 for (size_t cnt = 0; cnt < 256; ++cnt)
2951 if ((ctype->class256_collection[cnt] & mask) != 0)
2952 ctype->class256_collection[cnt] |= BIT (tok_alnum);
2953
2954 for (size_t cnt = 0; cnt < ctype->class_collection_act; ++cnt)
2955 if ((ctype->class_collection[cnt] & maskw) != 0)
2956 ctype->class_collection[cnt] |= BITw (tok_alnum);
2957 }
2958
2959 if ((ctype->class_done & BITw (tok_space)) == 0)
2960 /* "If this keyword [space] is not specified, the characters <space>,
2961 <form-feed>, <newline>, <carriage-return>, <tab>, and
2962 <vertical-tab>, ..., shall automatically belong to this class,
2963 with implementation-defined character values." [P1003.2, 2.5.2.1] */
2964 {
2965 struct charseq *seq;
2966
2967 seq = charmap_find_value (charmap, "space", 5);
2968 if (seq == NULL)
2969 seq = charmap_find_value (charmap, "SP", 2);
2970 if (seq == NULL)
2971 seq = charmap_find_value (charmap, "U00000020", 9);
2972 if (seq == NULL)
2973 {
2974 record_error (0, 0, _("\
2975%s: character `%s' not defined while needed as default value"),
2976 "LC_CTYPE", "<space>");
2977 }
2978 else if (seq->nbytes != 1)
2979 record_error (0, 0, _("\
2980%s: character `%s' in charmap not representable with one byte"),
2981 "LC_CTYPE", "<space>");
2982 else
2983 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_space);
2984
2985 /* No need to search. */
2986 ELEM (ctype, class_collection, , L' ') |= BITw (tok_space);
2987
2988 seq = charmap_find_value (charmap, "form-feed", 9);
2989 if (seq == NULL)
2990 seq = charmap_find_value (charmap, "U0000000C", 9);
2991 if (seq == NULL)
2992 {
2993 record_error (0, 0, _("\
2994%s: character `%s' not defined while needed as default value"),
2995 "LC_CTYPE", "<form-feed>");
2996 }
2997 else if (seq->nbytes != 1)
2998 record_error (0, 0, _("\
2999%s: character `%s' in charmap not representable with one byte"),
3000 "LC_CTYPE", "<form-feed>");
3001 else
3002 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_space);
3003
3004 /* No need to search. */
3005 ELEM (ctype, class_collection, , L'\f') |= BITw (tok_space);
3006
3007
3008 seq = charmap_find_value (charmap, "newline", 7);
3009 if (seq == NULL)
3010 seq = charmap_find_value (charmap, "U0000000A", 9);
3011 if (seq == NULL)
3012 {
3013 record_error (0, 0, _("\
3014%s: character `%s' not defined while needed as default value"),
3015 "LC_CTYPE", "<newline>");
3016 }
3017 else if (seq->nbytes != 1)
3018 record_error (0, 0, _("\
3019%s: character `%s' in charmap not representable with one byte"),
3020 "LC_CTYPE", "<newline>");
3021 else
3022 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_space);
3023
3024 /* No need to search. */
3025 ELEM (ctype, class_collection, , L'\n') |= BITw (tok_space);
3026
3027
3028 seq = charmap_find_value (charmap, "carriage-return", 15);
3029 if (seq == NULL)
3030 seq = charmap_find_value (charmap, "U0000000D", 9);
3031 if (seq == NULL)
3032 {
3033 record_error (0, 0, _("\
3034%s: character `%s' not defined while needed as default value"),
3035 "LC_CTYPE", "<carriage-return>");
3036 }
3037 else if (seq->nbytes != 1)
3038 record_error (0, 0, _("\
3039%s: character `%s' in charmap not representable with one byte"),
3040 "LC_CTYPE", "<carriage-return>");
3041 else
3042 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_space);
3043
3044 /* No need to search. */
3045 ELEM (ctype, class_collection, , L'\r') |= BITw (tok_space);
3046
3047
3048 seq = charmap_find_value (charmap, "tab", 3);
3049 if (seq == NULL)
3050 seq = charmap_find_value (charmap, "U00000009", 9);
3051 if (seq == NULL)
3052 {
3053 record_error (0, 0, _("\
3054%s: character `%s' not defined while needed as default value"),
3055 "LC_CTYPE", "<tab>");
3056 }
3057 else if (seq->nbytes != 1)
3058 record_error (0, 0, _("\
3059%s: character `%s' in charmap not representable with one byte"),
3060 "LC_CTYPE", "<tab>");
3061 else
3062 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_space);
3063
3064 /* No need to search. */
3065 ELEM (ctype, class_collection, , L'\t') |= BITw (tok_space);
3066
3067
3068 seq = charmap_find_value (charmap, "vertical-tab", 12);
3069 if (seq == NULL)
3070 seq = charmap_find_value (charmap, "U0000000B", 9);
3071 if (seq == NULL)
3072 {
3073 record_error (0, 0, _("\
3074%s: character `%s' not defined while needed as default value"),
3075 "LC_CTYPE", "<vertical-tab>");
3076 }
3077 else if (seq->nbytes != 1)
3078 record_error (0, 0, _("\
3079%s: character `%s' in charmap not representable with one byte"),
3080 "LC_CTYPE", "<vertical-tab>");
3081 else
3082 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_space);
3083
3084 /* No need to search. */
3085 ELEM (ctype, class_collection, , L'\v') |= BITw (tok_space);
3086 }
3087
3088 if ((ctype->class_done & BITw (tok_xdigit)) == 0)
3089 /* "If this keyword is not specified, the digits `0' to `9', the
3090 uppercase letters `A' through `F', and the lowercase letters `a'
3091 through `f', ..., shall automatically belong to this class, with
3092 implementation defined character values." [P1003.2, 2.5.2.1] */
3093 {
3094 set_default (BITPOS (tok_xdigit), '0', '9');
3095 set_default (BITPOS (tok_xdigit), 'A', 'F');
3096 set_default (BITPOS (tok_xdigit), 'a', 'f');
3097 }
3098
3099 if ((ctype->class_done & BITw (tok_blank)) == 0)
3100 /* "If this keyword [blank] is unspecified, the characters <space> and
3101 <tab> shall belong to this character class." [P1003.2, 2.5.2.1] */
3102 {
3103 struct charseq *seq;
3104
3105 seq = charmap_find_value (charmap, "space", 5);
3106 if (seq == NULL)
3107 seq = charmap_find_value (charmap, "SP", 2);
3108 if (seq == NULL)
3109 seq = charmap_find_value (charmap, "U00000020", 9);
3110 if (seq == NULL)
3111 {
3112 record_error (0, 0, _("\
3113%s: character `%s' not defined while needed as default value"),
3114 "LC_CTYPE", "<space>");
3115 }
3116 else if (seq->nbytes != 1)
3117 record_error (0, 0, _("\
3118%s: character `%s' in charmap not representable with one byte"),
3119 "LC_CTYPE", "<space>");
3120 else
3121 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_blank);
3122
3123 /* No need to search. */
3124 ELEM (ctype, class_collection, , L' ') |= BITw (tok_blank);
3125
3126
3127 seq = charmap_find_value (charmap, "tab", 3);
3128 if (seq == NULL)
3129 seq = charmap_find_value (charmap, "U00000009", 9);
3130 if (seq == NULL)
3131 {
3132 record_error (0, 0, _("\
3133%s: character `%s' not defined while needed as default value"),
3134 "LC_CTYPE", "<tab>");
3135 }
3136 else if (seq->nbytes != 1)
3137 record_error (0, 0, _("\
3138%s: character `%s' in charmap not representable with one byte"),
3139 "LC_CTYPE", "<tab>");
3140 else
3141 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_blank);
3142
3143 /* No need to search. */
3144 ELEM (ctype, class_collection, , L'\t') |= BITw (tok_blank);
3145 }
3146
3147 if ((ctype->class_done & BITw (tok_graph)) == 0)
3148 /* "If this keyword [graph] is not specified, characters specified for
3149 the keywords `upper', `lower', `alpha', `digit', `xdigit' and `punct',
3150 shall belong to this character class." [P1003.2, 2.5.2.1] */
3151 {
3152 unsigned long int mask = BIT (tok_upper) | BIT (tok_lower)
3153 | BIT (tok_alpha) | BIT (tok_digit) | BIT (tok_xdigit)
3154 | BIT (tok_punct);
3155 unsigned long int maskw = BITw (tok_upper) | BITw (tok_lower)
3156 | BITw (tok_alpha) | BITw (tok_digit) | BITw (tok_xdigit)
3157 | BITw (tok_punct);
3158
3159 for (size_t cnt = 0; cnt < ctype->class_collection_act; ++cnt)
3160 if ((ctype->class_collection[cnt] & maskw) != 0)
3161 ctype->class_collection[cnt] |= BITw (tok_graph);
3162
3163 for (size_t cnt = 0; cnt < 256; ++cnt)
3164 if ((ctype->class256_collection[cnt] & mask) != 0)
3165 ctype->class256_collection[cnt] |= BIT (tok_graph);
3166 }
3167
3168 if ((ctype->class_done & BITw (tok_print)) == 0)
3169 /* "If this keyword [print] is not provided, characters specified for
3170 the keywords `upper', `lower', `alpha', `digit', `xdigit', `punct',
3171 and the <space> character shall belong to this character class."
3172 [P1003.2, 2.5.2.1] */
3173 {
3174 unsigned long int mask = BIT (tok_upper) | BIT (tok_lower)
3175 | BIT (tok_alpha) | BIT (tok_digit) | BIT (tok_xdigit)
3176 | BIT (tok_punct);
3177 unsigned long int maskw = BITw (tok_upper) | BITw (tok_lower)
3178 | BITw (tok_alpha) | BITw (tok_digit) | BITw (tok_xdigit)
3179 | BITw (tok_punct);
3180 struct charseq *seq;
3181
3182 for (size_t cnt = 0; cnt < ctype->class_collection_act; ++cnt)
3183 if ((ctype->class_collection[cnt] & maskw) != 0)
3184 ctype->class_collection[cnt] |= BITw (tok_print);
3185
3186 for (size_t cnt = 0; cnt < 256; ++cnt)
3187 if ((ctype->class256_collection[cnt] & mask) != 0)
3188 ctype->class256_collection[cnt] |= BIT (tok_print);
3189
3190
3191 seq = charmap_find_value (charmap, "space", 5);
3192 if (seq == NULL)
3193 seq = charmap_find_value (charmap, "SP", 2);
3194 if (seq == NULL)
3195 seq = charmap_find_value (charmap, "U00000020", 9);
3196 if (seq == NULL)
3197 {
3198 record_error (0, 0, _("\
3199%s: character `%s' not defined while needed as default value"),
3200 "LC_CTYPE", "<space>");
3201 }
3202 else if (seq->nbytes != 1)
3203 record_error (0, 0, _("\
3204%s: character `%s' in charmap not representable with one byte"),
3205 "LC_CTYPE", "<space>");
3206 else
3207 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_print);
3208
3209 /* No need to search. */
3210 ELEM (ctype, class_collection, , L' ') |= BITw (tok_print);
3211 }
3212
3213 if (ctype->tomap_done[0] == 0)
3214 /* "If this keyword [toupper] is not specified, the lowercase letters
3215 `a' through `z', and their corresponding uppercase letters `A' to
3216 `Z', ..., shall automatically be included, with implementation-
3217 defined character values." [P1003.2, 2.5.2.1] */
3218 {
3219 char tmp[4];
3220 int ch;
3221
3222 strcpy (tmp, "<?>");
3223
3224 for (ch = 'a'; ch <= 'z'; ++ch)
3225 {
3226 struct charseq *seq_from, *seq_to;
3227
3228 tmp[1] = (char) ch;
3229
3230 seq_from = charmap_find_value (charmap, &tmp[1], 1);
3231 if (seq_from == NULL)
3232 {
3233 char buf[10];
3234 sprintf (buf, "U%08X", ch);
3235 seq_from = charmap_find_value (charmap, buf, 9);
3236 }
3237 if (seq_from == NULL)
3238 {
3239 record_error (0, 0, _("\
3240%s: character `%s' not defined while needed as default value"),
3241 "LC_CTYPE", tmp);
3242 }
3243 else if (seq_from->nbytes != 1)
3244 {
3245 record_error (0, 0, _("\
3246%s: character `%s' needed as default value not representable with one byte"),
3247 "LC_CTYPE", tmp);
3248 }
3249 else
3250 {
3251 /* This conversion is implementation defined. */
3252 tmp[1] = (char) (ch + ('A' - 'a'));
3253 seq_to = charmap_find_value (charmap, &tmp[1], 1);
3254 if (seq_to == NULL)
3255 {
3256 char buf[10];
3257 sprintf (buf, "U%08X", ch + ('A' - 'a'));
3258 seq_to = charmap_find_value (charmap, buf, 9);
3259 }
3260 if (seq_to == NULL)
3261 {
3262 record_error (0, 0, _("\
3263%s: character `%s' not defined while needed as default value"),
3264 "LC_CTYPE", tmp);
3265 }
3266 else if (seq_to->nbytes != 1)
3267 {
3268 record_error (0, 0, _("\
3269%s: character `%s' needed as default value not representable with one byte"),
3270 "LC_CTYPE", tmp);
3271 }
3272 else
3273 /* The index [0] is determined by the order of the
3274 `ctype_map_newP' calls in `ctype_startup'. */
3275 ctype->map256_collection[0][seq_from->bytes[0]]
3276 = seq_to->bytes[0];
3277 }
3278
3279 /* No need to search. */
3280 ELEM (ctype, map_collection, [0], ch) = ch + ('A' - 'a');
3281 }
3282 }
3283
3284 if (ctype->tomap_done[1] == 0)
3285 /* "If this keyword [tolower] is not specified, the mapping shall be
3286 the reverse mapping of the one specified to `toupper'." [P1003.2] */
3287 {
3288 for (size_t cnt = 0; cnt < ctype->map_collection_act[0]; ++cnt)
3289 if (ctype->map_collection[0][cnt] != 0)
3290 ELEM (ctype, map_collection, [1],
3291 ctype->map_collection[0][cnt])
3292 = ctype->charnames[cnt];
3293
3294 for (size_t cnt = 0; cnt < 256; ++cnt)
3295 if (ctype->map256_collection[0][cnt] != 0)
3296 ctype->map256_collection[1][ctype->map256_collection[0][cnt]] = cnt;
3297 }
3298
3299 if (ctype->outdigits_act != 10)
3300 {
3301 if (ctype->outdigits_act != 0)
3302 record_error (0, 0, _("\
3303%s: field `%s' does not contain exactly ten entries"),
3304 "LC_CTYPE", "outdigit");
3305
3306 for (size_t cnt = ctype->outdigits_act; cnt < 10; ++cnt)
3307 {
3308 ctype->mboutdigits[cnt] = charmap_find_symbol (charmap,
3309 (char *) digits + cnt,
3310 1);
3311
3312 if (ctype->mboutdigits[cnt] == NULL)
3313 ctype->mboutdigits[cnt] = charmap_find_symbol (charmap,
3314 longnames[cnt],
3315 strlen (longnames[cnt]));
3316
3317 if (ctype->mboutdigits[cnt] == NULL)
3318 ctype->mboutdigits[cnt] = charmap_find_symbol (charmap,
3319 uninames[cnt], 9);
3320
3321 if (ctype->mboutdigits[cnt] == NULL)
3322 {
3323 /* Provide a replacement. */
3324 record_error (0, 0, _("\
3325no output digits defined and none of the standard names in the charmap"));
3326
3327 ctype->mboutdigits[cnt] = obstack_alloc (&((struct charmap_t *) charmap)->mem_pool,
3328 sizeof (struct charseq)
3329 + 1);
3330
3331 /* This is better than nothing. */
3332 ctype->mboutdigits[cnt]->bytes[0] = digits[cnt];
3333 ctype->mboutdigits[cnt]->nbytes = 1;
3334 }
3335
3336 ctype->wcoutdigits[cnt] = L'0' + cnt;
3337 }
3338
3339 ctype->outdigits_act = 10;
3340 }
3341
3342#undef set_default
3343}
3344
3345
3346/* Initialize. Assumes t->p and t->q have already been set. */
3347static inline void
3348wctype_table_init (struct wctype_table *t)
3349{
3350 t->level1 = NULL;
3351 t->level1_alloc = t->level1_size = 0;
3352 t->level2 = NULL;
3353 t->level2_alloc = t->level2_size = 0;
3354 t->level3 = NULL;
3355 t->level3_alloc = t->level3_size = 0;
3356}
3357
3358/* Retrieve an entry. */
3359static inline int
3360wctype_table_get (struct wctype_table *t, uint32_t wc)
3361{
3362 uint32_t index1 = wc >> (t->q + t->p + 5);
3363 if (index1 < t->level1_size)
3364 {
3365 uint32_t lookup1 = t->level1[index1];
3366 if (lookup1 != EMPTY)
3367 {
3368 uint32_t index2 = ((wc >> (t->p + 5)) & ((1 << t->q) - 1))
3369 + (lookup1 << t->q);
3370 uint32_t lookup2 = t->level2[index2];
3371 if (lookup2 != EMPTY)
3372 {
3373 uint32_t index3 = ((wc >> 5) & ((1 << t->p) - 1))
3374 + (lookup2 << t->p);
3375 uint32_t lookup3 = t->level3[index3];
3376 uint32_t index4 = wc & 0x1f;
3377
3378 return (lookup3 >> index4) & 1;
3379 }
3380 }
3381 }
3382 return 0;
3383}
3384
3385/* Add one entry. */
3386static void
3387wctype_table_add (struct wctype_table *t, uint32_t wc)
3388{
3389 uint32_t index1 = wc >> (t->q + t->p + 5);
3390 uint32_t index2 = (wc >> (t->p + 5)) & ((1 << t->q) - 1);
3391 uint32_t index3 = (wc >> 5) & ((1 << t->p) - 1);
3392 uint32_t index4 = wc & 0x1f;
3393 size_t i, i1, i2;
3394
3395 if (index1 >= t->level1_size)
3396 {
3397 if (index1 >= t->level1_alloc)
3398 {
3399 size_t alloc = 2 * t->level1_alloc;
3400 if (alloc <= index1)
3401 alloc = index1 + 1;
3402 t->level1 = (uint32_t *) xrealloc ((char *) t->level1,
3403 alloc * sizeof (uint32_t));
3404 t->level1_alloc = alloc;
3405 }
3406 while (index1 >= t->level1_size)
3407 t->level1[t->level1_size++] = EMPTY;
3408 }
3409
3410 if (t->level1[index1] == EMPTY)
3411 {
3412 if (t->level2_size == t->level2_alloc)
3413 {
3414 size_t alloc = 2 * t->level2_alloc + 1;
3415 t->level2 = (uint32_t *) xrealloc ((char *) t->level2,
3416 (alloc << t->q) * sizeof (uint32_t));
3417 t->level2_alloc = alloc;
3418 }
3419 i1 = t->level2_size << t->q;
3420 i2 = (t->level2_size + 1) << t->q;
3421 for (i = i1; i < i2; i++)
3422 t->level2[i] = EMPTY;
3423 t->level1[index1] = t->level2_size++;
3424 }
3425
3426 index2 += t->level1[index1] << t->q;
3427
3428 if (t->level2[index2] == EMPTY)
3429 {
3430 if (t->level3_size == t->level3_alloc)
3431 {
3432 size_t alloc = 2 * t->level3_alloc + 1;
3433 t->level3 = (uint32_t *) xrealloc ((char *) t->level3,
3434 (alloc << t->p) * sizeof (uint32_t));
3435 t->level3_alloc = alloc;
3436 }
3437 i1 = t->level3_size << t->p;
3438 i2 = (t->level3_size + 1) << t->p;
3439 for (i = i1; i < i2; i++)
3440 t->level3[i] = 0;
3441 t->level2[index2] = t->level3_size++;
3442 }
3443
3444 index3 += t->level2[index2] << t->p;
3445
3446 t->level3[index3] |= (uint32_t)1 << index4;
3447}
3448
3449/* Finalize and shrink. */
3450static void
3451add_locale_wctype_table (struct locale_file *file, struct wctype_table *t)
3452{
3453 size_t i, j, k;
3454 uint32_t reorder3[t->level3_size];
3455 uint32_t reorder2[t->level2_size];
3456 uint32_t level2_offset, level3_offset;
3457
3458 /* Uniquify level3 blocks. */
3459 k = 0;
3460 for (j = 0; j < t->level3_size; j++)
3461 {
3462 for (i = 0; i < k; i++)
3463 if (memcmp (&t->level3[i << t->p], &t->level3[j << t->p],
3464 (1 << t->p) * sizeof (uint32_t)) == 0)
3465 break;
3466 /* Relocate block j to block i. */
3467 reorder3[j] = i;
3468 if (i == k)
3469 {
3470 if (i != j)
3471 memcpy (&t->level3[i << t->p], &t->level3[j << t->p],
3472 (1 << t->p) * sizeof (uint32_t));
3473 k++;
3474 }
3475 }
3476 t->level3_size = k;
3477
3478 for (i = 0; i < (t->level2_size << t->q); i++)
3479 if (t->level2[i] != EMPTY)
3480 t->level2[i] = reorder3[t->level2[i]];
3481
3482 /* Uniquify level2 blocks. */
3483 k = 0;
3484 for (j = 0; j < t->level2_size; j++)
3485 {
3486 for (i = 0; i < k; i++)
3487 if (memcmp (&t->level2[i << t->q], &t->level2[j << t->q],
3488 (1 << t->q) * sizeof (uint32_t)) == 0)
3489 break;
3490 /* Relocate block j to block i. */
3491 reorder2[j] = i;
3492 if (i == k)
3493 {
3494 if (i != j)
3495 memcpy (&t->level2[i << t->q], &t->level2[j << t->q],
3496 (1 << t->q) * sizeof (uint32_t));
3497 k++;
3498 }
3499 }
3500 t->level2_size = k;
3501
3502 for (i = 0; i < t->level1_size; i++)
3503 if (t->level1[i] != EMPTY)
3504 t->level1[i] = reorder2[t->level1[i]];
3505
3506 t->result_size =
3507 5 * sizeof (uint32_t)
3508 + t->level1_size * sizeof (uint32_t)
3509 + (t->level2_size << t->q) * sizeof (uint32_t)
3510 + (t->level3_size << t->p) * sizeof (uint32_t);
3511
3512 level2_offset =
3513 5 * sizeof (uint32_t)
3514 + t->level1_size * sizeof (uint32_t);
3515 level3_offset =
3516 5 * sizeof (uint32_t)
3517 + t->level1_size * sizeof (uint32_t)
3518 + (t->level2_size << t->q) * sizeof (uint32_t);
3519
3520 start_locale_structure (file);
3521 add_locale_uint32 (file, t->q + t->p + 5);
3522 add_locale_uint32 (file, t->level1_size);
3523 add_locale_uint32 (file, t->p + 5);
3524 add_locale_uint32 (file, (1 << t->q) - 1);
3525 add_locale_uint32 (file, (1 << t->p) - 1);
3526
3527 for (i = 0; i < t->level1_size; i++)
3528 add_locale_uint32
3529 (file,
3530 t->level1[i] == EMPTY
3531 ? 0
3532 : (t->level1[i] << t->q) * sizeof (uint32_t) + level2_offset);
3533
3534 for (i = 0; i < (t->level2_size << t->q); i++)
3535 add_locale_uint32
3536 (file,
3537 t->level2[i] == EMPTY
3538 ? 0
3539 : (t->level2[i] << t->p) * sizeof (uint32_t) + level3_offset);
3540
3541 add_locale_uint32_array (file, t->level3, t->level3_size << t->p);
3542 end_locale_structure (file);
3543
3544 if (t->level1_alloc > 0)
3545 free (t->level1);
3546 if (t->level2_alloc > 0)
3547 free (t->level2);
3548 if (t->level3_alloc > 0)
3549 free (t->level3);
3550}
3551
3552/* Flattens the included transliterations into a translit list.
3553 Inserts them in the list at `cursor', and returns the new cursor. */
3554static struct translit_t **
3555translit_flatten (struct locale_ctype_t *ctype,
3556 const struct charmap_t *charmap,
3557 struct translit_t **cursor)
3558{
3559 while (ctype->translit_include != NULL)
3560 {
3561 const char *copy_locale = ctype->translit_include->copy_locale;
3562 const char *copy_repertoire = ctype->translit_include->copy_repertoire;
3563 struct localedef_t *other;
3564
3565 /* Unchain the include statement. During the depth-first traversal
3566 we don't want to visit any locale more than once. */
3567 ctype->translit_include = ctype->translit_include->next;
3568
3569 other = find_locale (LC_CTYPE, copy_locale, copy_repertoire, charmap);
3570
3571 if (other == NULL || other->categories[LC_CTYPE].ctype == NULL)
3572 {
3573 record_error (0, 0, _("\
3574%s: transliteration data from locale `%s' not available"),
3575 "LC_CTYPE", copy_locale);
3576 }
3577 else
3578 {
3579 struct locale_ctype_t *other_ctype =
3580 other->categories[LC_CTYPE].ctype;
3581
3582 cursor = translit_flatten (other_ctype, charmap, cursor);
3583 assert (other_ctype->translit_include == NULL);
3584
3585 if (other_ctype->translit != NULL)
3586 {
3587 /* Insert the other_ctype->translit list at *cursor. */
3588 struct translit_t *endp = other_ctype->translit;
3589 while (endp->next != NULL)
3590 endp = endp->next;
3591
3592 endp->next = *cursor;
3593 *cursor = other_ctype->translit;
3594
3595 /* Avoid any risk of circular lists. */
3596 other_ctype->translit = NULL;
3597
3598 cursor = &endp->next;
3599 }
3600
3601 if (ctype->default_missing == NULL)
3602 ctype->default_missing = other_ctype->default_missing;
3603 }
3604 }
3605
3606 return cursor;
3607}
3608
3609static void
3610allocate_arrays (struct locale_ctype_t *ctype, const struct charmap_t *charmap,
3611 struct repertoire_t *repertoire)
3612{
3613 size_t idx, nr;
3614 const void *key;
3615 size_t len;
3616 void *vdata;
3617 void *curs;
3618
3619 /* You wonder about this amount of memory? This is only because some
3620 users do not manage to address the array with unsigned values or
3621 data types with range >= 256. '\200' would result in the array
3622 index -128. To help these poor people we duplicate the entries for
3623 128 up to 255 below the entry for \0. */
3624 ctype->ctype_b = (char_class_t *) xcalloc (256 + 128, sizeof (char_class_t));
3625 ctype->ctype32_b = (char_class32_t *) xcalloc (256, sizeof (char_class32_t));
3626 ctype->class_b = (uint32_t **)
3627 xmalloc (ctype->nr_charclass * sizeof (uint32_t *));
3628 ctype->class_3level = (struct wctype_table *)
3629 xmalloc (ctype->nr_charclass * sizeof (struct wctype_table));
3630
3631 /* This is the array accessed using the multibyte string elements. */
3632 for (idx = 0; idx < 256; ++idx)
3633 ctype->ctype_b[128 + idx] = ctype->class256_collection[idx];
3634
3635 /* Mirror first 127 entries. We must take care that entry -1 is not
3636 mirrored because EOF == -1. */
3637 for (idx = 0; idx < 127; ++idx)
3638 ctype->ctype_b[idx] = ctype->ctype_b[256 + idx];
3639
3640 /* The 32 bit array contains all characters < 0x100. */
3641 for (idx = 0; idx < ctype->class_collection_act; ++idx)
3642 if (ctype->charnames[idx] < 0x100)
3643 ctype->ctype32_b[ctype->charnames[idx]] = ctype->class_collection[idx];
3644
3645 for (nr = 0; nr < ctype->nr_charclass; nr++)
3646 {
3647 ctype->class_b[nr] = (uint32_t *) xcalloc (256 / 32, sizeof (uint32_t));
3648
3649 /* We only set CLASS_B for the bits in the ISO C classes, not
3650 the user defined classes. The number should not change but
3651 who knows. */
3652#define LAST_ISO_C_BIT 11
3653 if (nr <= LAST_ISO_C_BIT)
3654 for (idx = 0; idx < 256; ++idx)
3655 if (ctype->class256_collection[idx] & _ISbit (nr))
3656 ctype->class_b[nr][idx >> 5] |= (uint32_t) 1 << (idx & 0x1f);
3657 }
3658
3659 for (nr = 0; nr < ctype->nr_charclass; nr++)
3660 {
3661 struct wctype_table *t;
3662
3663 t = &ctype->class_3level[nr];
3664 t->p = 4; /* or: 5 */
3665 t->q = 7; /* or: 6 */
3666 wctype_table_init (t);
3667
3668 for (idx = 0; idx < ctype->class_collection_act; ++idx)
3669 if (ctype->class_collection[idx] & _ISwbit (nr))
3670 wctype_table_add (t, ctype->charnames[idx]);
3671
3672 record_verbose (stderr, _("\
3673%s: table for class \"%s\": %lu bytes"),
3674 "LC_CTYPE", ctype->classnames[nr],
3675 (unsigned long int) t->result_size);
3676 }
3677
3678 /* Room for table of mappings. */
3679 ctype->map_b = (uint32_t **) xmalloc (2 * sizeof (uint32_t *));
3680 ctype->map32_b = (uint32_t **) xmalloc (ctype->map_collection_nr
3681 * sizeof (uint32_t *));
3682 ctype->map_3level = (struct wctrans_table *)
3683 xmalloc (ctype->map_collection_nr * sizeof (struct wctrans_table));
3684
3685 /* Fill in all mappings. */
3686 for (idx = 0; idx < 2; ++idx)
3687 {
3688 unsigned int idx2;
3689
3690 /* Allocate table. */
3691 ctype->map_b[idx] = (uint32_t *)
3692 xmalloc ((256 + 128) * sizeof (uint32_t));
3693
3694 /* Copy values from collection. */
3695 for (idx2 = 0; idx2 < 256; ++idx2)
3696 ctype->map_b[idx][128 + idx2] = ctype->map256_collection[idx][idx2];
3697
3698 /* Mirror first 127 entries. We must take care not to map entry
3699 -1 because EOF == -1. */
3700 for (idx2 = 0; idx2 < 127; ++idx2)
3701 ctype->map_b[idx][idx2] = ctype->map_b[idx][256 + idx2];
3702
3703 /* EOF must map to EOF. */
3704 ctype->map_b[idx][127] = EOF;
3705 }
3706
3707 for (idx = 0; idx < ctype->map_collection_nr; ++idx)
3708 {
3709 unsigned int idx2;
3710
3711 /* Allocate table. */
3712 ctype->map32_b[idx] = (uint32_t *) xmalloc (256 * sizeof (uint32_t));
3713
3714 /* Copy values from collection. Default is identity mapping. */
3715 for (idx2 = 0; idx2 < 256; ++idx2)
3716 ctype->map32_b[idx][idx2] =
3717 (ctype->map_collection[idx][idx2] != 0
3718 ? ctype->map_collection[idx][idx2]
3719 : idx2);
3720 }
3721
3722 for (nr = 0; nr < ctype->map_collection_nr; nr++)
3723 {
3724 struct wctrans_table *t;
3725
3726 t = &ctype->map_3level[nr];
3727 t->p = 7;
3728 t->q = 9;
3729 wctrans_table_init (t);
3730
3731 for (idx = 0; idx < ctype->map_collection_act[nr]; ++idx)
3732 if (ctype->map_collection[nr][idx] != 0)
3733 wctrans_table_add (t, ctype->charnames[idx],
3734 ctype->map_collection[nr][idx]);
3735
3736 record_verbose (stderr, _("\
3737%s: table for map \"%s\": %lu bytes"),
3738 "LC_CTYPE", ctype->mapnames[nr],
3739 (unsigned long int) t->result_size);
3740 }
3741
3742 /* Extra array for class and map names. */
3743 ctype->class_name_ptr = (uint32_t *) xmalloc (ctype->nr_charclass
3744 * sizeof (uint32_t));
3745 ctype->map_name_ptr = (uint32_t *) xmalloc (ctype->map_collection_nr
3746 * sizeof (uint32_t));
3747
3748 ctype->class_offset = _NL_ITEM_INDEX (_NL_CTYPE_EXTRA_MAP_1);
3749 ctype->map_offset = ctype->class_offset + ctype->nr_charclass;
3750
3751 /* Array for width information. Because the expected widths are very
3752 small (never larger than 2) we use only one single byte. This
3753 saves space.
3754 We put only printable characters in the table. wcwidth is specified
3755 to return -1 for non-printable characters. Doing the check here
3756 saves a run-time check.
3757 But we put L'\0' in the table. This again saves a run-time check. */
3758 {
3759 struct wcwidth_table *t;
3760
3761 t = &ctype->width;
3762 t->p = 7;
3763 t->q = 9;
3764 wcwidth_table_init (t);
3765
3766 /* First set all the printable characters of the character set to
3767 the default width. */
3768 curs = NULL;
3769 while (iterate_table (&charmap->char_table, &curs, &key, &len, &vdata) == 0)
3770 {
3771 struct charseq *data = (struct charseq *) vdata;
3772
3773 if (data->ucs4 == UNINITIALIZED_CHAR_VALUE)
3774 data->ucs4 = repertoire_find_value (ctype->repertoire,
3775 data->name, len);
3776
3777 if (data->ucs4 != ILLEGAL_CHAR_VALUE)
3778 {
3779 uint32_t *class_bits =
3780 find_idx (ctype, &ctype->class_collection, NULL,
3781 &ctype->class_collection_act, data->ucs4);
3782
3783 if (class_bits != NULL && (*class_bits & BITw (tok_print)))
3784 wcwidth_table_add (t, data->ucs4, charmap->width_default);
3785 }
3786 }
3787
3788 /* Now add the explicitly specified widths. */
3789 if (charmap->width_rules != NULL)
3790 for (size_t cnt = 0; cnt < charmap->nwidth_rules; ++cnt)
3791 {
3792 unsigned char bytes[charmap->mb_cur_max];
3793 int nbytes = charmap->width_rules[cnt].from->nbytes;
3794
3795 /* We have the range of character for which the width is
3796 specified described using byte sequences of the multibyte
3797 charset. We have to convert this to UCS4 now. And we
3798 cannot simply convert the beginning and the end of the
3799 sequence, we have to iterate over the byte sequence and
3800 convert it for every single character. */
3801 memcpy (bytes, charmap->width_rules[cnt].from->bytes, nbytes);
3802
3803 while (nbytes < charmap->width_rules[cnt].to->nbytes
3804 || memcmp (bytes, charmap->width_rules[cnt].to->bytes,
3805 nbytes) <= 0)
3806 {
3807 /* Find the UCS value for `bytes'. */
3808 int inner;
3809 uint32_t wch;
3810 struct charseq *seq =
3811 charmap_find_symbol (charmap, (char *) bytes, nbytes);
3812
3813 if (seq == NULL)
3814 wch = ILLEGAL_CHAR_VALUE;
3815 else if (seq->ucs4 != UNINITIALIZED_CHAR_VALUE)
3816 wch = seq->ucs4;
3817 else
3818 wch = repertoire_find_value (ctype->repertoire, seq->name,
3819 strlen (seq->name));
3820
3821 if (wch != ILLEGAL_CHAR_VALUE)
3822 {
3823 /* Store the value. */
3824 uint32_t *class_bits =
3825 find_idx (ctype, &ctype->class_collection, NULL,
3826 &ctype->class_collection_act, wch);
3827
3828 if (class_bits != NULL && (*class_bits & BITw (tok_print)))
3829 wcwidth_table_add (t, wch,
3830 charmap->width_rules[cnt].width);
3831 }
3832
3833 /* "Increment" the bytes sequence. */
3834 inner = nbytes - 1;
3835 while (inner >= 0 && bytes[inner] == 0xff)
3836 --inner;
3837
3838 if (inner < 0)
3839 {
3840 /* We have to extend the byte sequence. */
3841 if (nbytes >= charmap->width_rules[cnt].to->nbytes)
3842 break;
3843
3844 bytes[0] = 1;
3845 memset (&bytes[1], 0, nbytes);
3846 ++nbytes;
3847 }
3848 else
3849 {
3850 ++bytes[inner];
3851 while (++inner < nbytes)
3852 bytes[inner] = 0;
3853 }
3854 }
3855 }
3856
3857 /* Set the width of L'\0' to 0. */
3858 wcwidth_table_add (t, 0, 0);
3859
3860 record_verbose (stderr, _("%s: table for width: %lu bytes"),
3861 "LC_CTYPE", (unsigned long int) t->result_size);
3862 }
3863
3864 /* Set MB_CUR_MAX. */
3865 ctype->mb_cur_max = charmap->mb_cur_max;
3866
3867 /* Now determine the table for the transliteration information.
3868
3869 XXX It is not yet clear to me whether it is worth implementing a
3870 complicated algorithm which uses a hash table to locate the entries.
3871 For now I'll use a simple array which can be searching using binary
3872 search. */
3873 if (ctype->translit_include != NULL)
3874 /* Traverse the locales mentioned in the `include' statements in a
3875 depth-first way and fold in their transliteration information. */
3876 translit_flatten (ctype, charmap, &ctype->translit);
3877
3878 if (ctype->translit != NULL)
3879 {
3880 /* First count how many entries we have. This is the upper limit
3881 since some entries from the included files might be overwritten. */
3882 size_t number = 0;
3883 struct translit_t *runp = ctype->translit;
3884 struct translit_t **sorted;
3885 size_t from_len, to_len;
3886
3887 while (runp != NULL)
3888 {
3889 ++number;
3890 runp = runp->next;
3891 }
3892
3893 /* Next we allocate an array large enough and fill in the values. */
3894 sorted = (struct translit_t **) alloca (number
3895 * sizeof (struct translit_t **));
3896 runp = ctype->translit;
3897 number = 0;
3898 do
3899 {
3900 /* Search for the place where to insert this string.
3901 XXX Better use a real sorting algorithm later. */
3902 size_t idx = 0;
3903 int replace = 0;
3904
3905 while (idx < number)
3906 {
3907 int res = wcscmp ((const wchar_t *) sorted[idx]->from,
3908 (const wchar_t *) runp->from);
3909 if (res == 0)
3910 {
3911 replace = 1;
3912 break;
3913 }
3914 if (res > 0)
3915 break;
3916 ++idx;
3917 }
3918
3919 if (replace)
3920 sorted[idx] = runp;
3921 else
3922 {
3923 memmove (&sorted[idx + 1], &sorted[idx],
3924 (number - idx) * sizeof (struct translit_t *));
3925 sorted[idx] = runp;
3926 ++number;
3927 }
3928
3929 runp = runp->next;
3930 }
3931 while (runp != NULL);
3932
3933 /* The next step is putting all the possible transliteration
3934 strings in one memory block so that we can write it out.
3935 We need several different blocks:
3936 - index to the from-string array
3937 - from-string array
3938 - index to the to-string array
3939 - to-string array.
3940 */
3941 from_len = to_len = 0;
3942 for (size_t cnt = 0; cnt < number; ++cnt)
3943 {
3944 struct translit_to_t *srunp;
3945 from_len += wcslen ((const wchar_t *) sorted[cnt]->from) + 1;
3946 srunp = sorted[cnt]->to;
3947 while (srunp != NULL)
3948 {
3949 to_len += wcslen ((const wchar_t *) srunp->str) + 1;
3950 srunp = srunp->next;
3951 }
3952 /* Plus one for the extra NUL character marking the end of
3953 the list for the current entry. */
3954 ++to_len;
3955 }
3956
3957 /* We can allocate the arrays for the results. */
3958 ctype->translit_from_idx = xmalloc (number * sizeof (uint32_t));
3959 ctype->translit_from_tbl = xmalloc (from_len * sizeof (uint32_t));
3960 ctype->translit_to_idx = xmalloc (number * sizeof (uint32_t));
3961 ctype->translit_to_tbl = xmalloc (to_len * sizeof (uint32_t));
3962
3963 from_len = 0;
3964 to_len = 0;
3965 for (size_t cnt = 0; cnt < number; ++cnt)
3966 {
3967 size_t len;
3968 struct translit_to_t *srunp;
3969
3970 ctype->translit_from_idx[cnt] = from_len;
3971 ctype->translit_to_idx[cnt] = to_len;
3972
3973 len = wcslen ((const wchar_t *) sorted[cnt]->from) + 1;
3974 wmemcpy ((wchar_t *) &ctype->translit_from_tbl[from_len],
3975 (const wchar_t *) sorted[cnt]->from, len);
3976 from_len += len;
3977
3978 ctype->translit_to_idx[cnt] = to_len;
3979 srunp = sorted[cnt]->to;
3980 while (srunp != NULL)
3981 {
3982 len = wcslen ((const wchar_t *) srunp->str) + 1;
3983 wmemcpy ((wchar_t *) &ctype->translit_to_tbl[to_len],
3984 (const wchar_t *) srunp->str, len);
3985 to_len += len;
3986 srunp = srunp->next;
3987 }
3988 ctype->translit_to_tbl[to_len++] = L'\0';
3989 }
3990
3991 /* Store the information about the length. */
3992 ctype->translit_idx_size = number;
3993 ctype->translit_from_tbl_size = from_len * sizeof (uint32_t);
3994 ctype->translit_to_tbl_size = to_len * sizeof (uint32_t);
3995 }
3996 else
3997 {
3998 ctype->translit_from_idx = no_str;
3999 ctype->translit_from_tbl = no_str;
4000 ctype->translit_to_tbl = no_str;
4001 ctype->translit_idx_size = 0;
4002 ctype->translit_from_tbl_size = 0;
4003 ctype->translit_to_tbl_size = 0;
4004 }
4005}
4006