1/* Copyright (C) 1996-2018 Free Software Foundation, Inc.
2 This file is part of the GNU C Library.
3 Contributed by Ulrich Drepper <drepper@gnu.org>, 1996.
4
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published
7 by the Free Software Foundation; version 2 of the License, or
8 (at your option) any later version.
9
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, see <http://www.gnu.org/licenses/>. */
17
18#ifdef HAVE_CONFIG_H
19# include <config.h>
20#endif
21
22#include <ctype.h>
23#include <errno.h>
24#include <libintl.h>
25#include <limits.h>
26#include <stdio.h>
27#include <stdlib.h>
28#include <string.h>
29#include <stdint.h>
30
31#include "localedef.h"
32#include "linereader.h"
33#include "charmap.h"
34#include "charmap-dir.h"
35
36#include <assert.h>
37
38
39/* Define the lookup function. */
40#include "charmap-kw.h"
41
42
/* Forward declarations for the helpers that are private to this file.  */

/* Runs the charmap state machine over CMFILE and returns the result.  */
static struct charmap_t *parse_charmap (struct linereader *cmfile,
                                        int verbose, int be_quiet);

/* Records a WIDTH rule for the character range FROM..TO (TO may be
   NULL for a single character).  */
static void new_width (struct linereader *cmfile, struct charmap_t *result,
                       const char *from, const char *to,
                       unsigned long int width);

/* Enters one character, or a whole range of characters, into the name
   and byte-sequence tables of CM.  */
static void charmap_new_char (struct linereader *lr, struct charmap_t *cm,
                              size_t nbytes, unsigned char *bytes,
                              const char *from, const char *to,
                              int decimal_ellipsis, int step);


/* Set to true by charmap_read when the loaded character map fails the
   test against the basic character set (ASCII compatibility).  */
bool enc_not_ascii_compatible;


#ifdef NEED_NULL_POINTER
static const char *null_pointer;
#endif
61
62static struct linereader *
63cmlr_open (const char *directory, const char *name, kw_hash_fct_t hf)
64{
65 FILE *fp;
66
67 fp = charmap_open (directory, name);
68 if (fp == NULL)
69 return NULL;
70 else
71 {
72 size_t dlen = strlen (directory);
73 int add_slash = (dlen == 0 || directory[dlen - 1] != '/');
74 size_t nlen = strlen (name);
75 char *pathname;
76 char *p;
77
78 pathname = alloca (dlen + add_slash + nlen + 1);
79 p = stpcpy (pathname, directory);
80 if (add_slash)
81 *p++ = '/';
82 stpcpy (p, name);
83
84 return lr_create (fp, pathname, hf);
85 }
86}
87
88struct charmap_t *
89charmap_read (const char *filename, int verbose, int error_not_found,
90 int be_quiet, int use_default)
91{
92 struct charmap_t *result = NULL;
93
94 if (filename != NULL)
95 {
96 struct linereader *cmfile;
97
98 /* First try the name as found in the parameter. */
99 cmfile = lr_open (filename, charmap_hash);
100 if (cmfile == NULL)
101 {
102 /* No successful. So start looking through the directories
103 in the I18NPATH if this is a simple name. */
104 if (strchr (filename, '/') == NULL)
105 {
106 char *i18npath = getenv ("I18NPATH");
107 if (i18npath != NULL && *i18npath != '\0')
108 {
109 const size_t pathlen = strlen (i18npath);
110 char i18npathbuf[pathlen + 1];
111 char path[pathlen + sizeof ("/charmaps")];
112 char *next;
113 i18npath = memcpy (i18npathbuf, i18npath, pathlen + 1);
114
115 while (cmfile == NULL
116 && (next = strsep (&i18npath, ":")) != NULL)
117 {
118 stpcpy (stpcpy (path, next), "/charmaps");
119 cmfile = cmlr_open (path, filename, charmap_hash);
120
121 if (cmfile == NULL)
122 /* Try without the "/charmaps" part. */
123 cmfile = cmlr_open (next, filename, charmap_hash);
124 }
125 }
126
127 if (cmfile == NULL)
128 /* Try the default directory. */
129 cmfile = cmlr_open (CHARMAP_PATH, filename, charmap_hash);
130 }
131 }
132
133 if (cmfile != NULL)
134 result = parse_charmap (cmfile, verbose, be_quiet);
135
136 if (result == NULL && error_not_found)
137 record_error (0, errno,
138 _("character map file `%s' not found"),
139 filename);
140 }
141
142 if (result == NULL && filename != NULL && strchr (filename, '/') == NULL)
143 {
144 /* OK, one more try. We also accept the names given to the
145 character sets in the files. Sometimes they differ from the
146 file name. */
147 CHARMAP_DIR *dir;
148
149 dir = charmap_opendir (CHARMAP_PATH);
150 if (dir != NULL)
151 {
152 const char *dirent;
153
154 while ((dirent = charmap_readdir (dir)) != NULL)
155 {
156 char **aliases;
157 char **p;
158 int found;
159
160 aliases = charmap_aliases (CHARMAP_PATH, dirent);
161 found = 0;
162 for (p = aliases; *p; p++)
163 if (strcasecmp (*p, filename) == 0)
164 {
165 found = 1;
166 break;
167 }
168 charmap_free_aliases (aliases);
169
170 if (found)
171 {
172 struct linereader *cmfile;
173
174 cmfile = cmlr_open (CHARMAP_PATH, dirent, charmap_hash);
175 if (cmfile != NULL)
176 result = parse_charmap (cmfile, verbose, be_quiet);
177
178 break;
179 }
180 }
181
182 charmap_closedir (dir);
183 }
184 }
185
186 if (result == NULL && DEFAULT_CHARMAP != NULL)
187 {
188 struct linereader *cmfile;
189
190 cmfile = cmlr_open (CHARMAP_PATH, DEFAULT_CHARMAP, charmap_hash);
191 if (cmfile != NULL)
192 result = parse_charmap (cmfile, verbose, be_quiet);
193
194 if (result == NULL)
195 record_error (4, errno,
196 _("default character map file `%s' not found"),
197 DEFAULT_CHARMAP);
198 }
199
200 if (result != NULL && result->code_set_name == NULL)
201 /* The input file does not specify a code set name. This
202 shouldn't happen but we should cope with it. */
203 result->code_set_name = basename (filename);
204
205 /* Test of ASCII compatibility of locale encoding.
206
207 Verify that the encoding to be used in a locale is ASCII compatible,
208 at least for the graphic characters, excluding the control characters,
209 '$' and '@'. This constraint comes from an ISO C 99 restriction.
210
211 ISO C 99 section 7.17.(2) (about wchar_t):
212 the null character shall have the code value zero and each member of
213 the basic character set shall have a code value equal to its value
214 when used as the lone character in an integer character constant.
215 ISO C 99 section 5.2.1.(3):
216 Both the basic source and basic execution character sets shall have
217 the following members: the 26 uppercase letters of the Latin alphabet
218 A B C D E F G H I J K L M N O P Q R S T U V W X Y Z
219 the 26 lowercase letters of the Latin alphabet
220 a b c d e f g h i j k l m n o p q r s t u v w x y z
221 the 10 decimal digits
222 0 1 2 3 4 5 6 7 8 9
223 the following 29 graphic characters
224 ! " # % & ' ( ) * + , - . / : ; < = > ? [ \ ] ^ _ { | } ~
225 the space character, and control characters representing horizontal
226 tab, vertical tab, and form feed.
227
228 Therefore, for all members of the "basic character set", the 'char' code
229 must have the same value as the 'wchar_t' code, which in glibc is the
230 same as the Unicode code, which for all of the enumerated characters
231 is identical to the ASCII code. */
232 if (result != NULL && use_default)
233 {
234 static const char basic_charset[] =
235 {
236 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
237 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
238 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
239 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
240 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
241 '!', '"', '#', '%', '&', '\'', '(', ')', '*', '+', ',', '-',
242 '.', '/', ':', ';', '<', '=', '>', '?', '[', '\\', ']', '^',
243 '_', '{', '|', '}', '~', ' ', '\t', '\v', '\f', '\0'
244 };
245 int failed = 0;
246 const char *p = basic_charset;
247
248 do
249 {
250 struct charseq *seq = charmap_find_symbol (result, p, 1);
251
252 if (seq == NULL || seq->ucs4 != (uint32_t) *p)
253 failed = 1;
254 }
255 while (*p++ != '\0');
256
257 if (failed)
258 {
259 /* A user may disable the ASCII compatibility warning check,
260 but we must remember that the encoding is not ASCII
261 compatible, since it may have other implications. Later
262 we will set _NL_CTYPE_MAP_TO_NONASCII from this value. */
263 if (warn_ascii)
264 record_warning (_(
265"character map `%s' is not ASCII compatible, locale not ISO C compliant "
266"[--no-warnings=ascii]"),
267 result->code_set_name);
268 enc_not_ascii_compatible = true;
269 }
270 }
271
272 return result;
273}
274
275
276static struct charmap_t *
277parse_charmap (struct linereader *cmfile, int verbose, int be_quiet)
278{
279 struct charmap_t *result;
280 int state;
281 enum token_t expected_tok = tok_error;
282 const char *expected_str = NULL;
283 char *from_name = NULL;
284 char *to_name = NULL;
285 enum token_t ellipsis = 0;
286 int step = 1;
287
288 /* We don't want symbolic names in string to be translated. */
289 cmfile->translate_strings = 0;
290
291 /* Allocate room for result. */
292 result = (struct charmap_t *) xmalloc (sizeof (struct charmap_t));
293 memset (result, '\0', sizeof (struct charmap_t));
294 /* The default DEFAULT_WIDTH is 1. */
295 result->width_default = 1;
296
297#define obstack_chunk_alloc malloc
298#define obstack_chunk_free free
299 obstack_init (&result->mem_pool);
300
301 if (init_hash (&result->char_table, 256)
302 || init_hash (&result->byte_table, 256))
303 {
304 free (result);
305 return NULL;
306 }
307
308 /* We use a state machine to describe the charmap description file
309 format. */
310 state = 1;
311 while (1)
312 {
313 /* What's on? */
314 struct token *now = lr_token (cmfile, NULL, NULL, NULL, verbose);
315 enum token_t nowtok = now->tok;
316 struct token *arg;
317
318 if (nowtok == tok_eof)
319 break;
320
321 switch (state)
322 {
323 case 1:
324 /* The beginning. We expect the special declarations, EOL or
325 `CHARMAP'. */
326 if (nowtok == tok_eol)
327 /* Ignore empty lines. */
328 continue;
329
330 if (nowtok == tok_charmap)
331 {
332 from_name = NULL;
333 to_name = NULL;
334
335 /* We have to set up the real work. Fill in some
336 default values. */
337 if (result->mb_cur_max == 0)
338 result->mb_cur_max = 1;
339 if (result->mb_cur_min == 0)
340 result->mb_cur_min = result->mb_cur_max;
341 if (result->mb_cur_min > result->mb_cur_max)
342 {
343 record_error (0, 0, _("\
344%s: <mb_cur_max> must be greater than <mb_cur_min>\n"),
345 cmfile->fname);
346
347 result->mb_cur_min = result->mb_cur_max;
348 }
349
350 lr_ignore_rest (cmfile, 1);
351
352 state = 2;
353 continue;
354 }
355
356 if (nowtok != tok_code_set_name && nowtok != tok_mb_cur_max
357 && nowtok != tok_mb_cur_min && nowtok != tok_escape_char
358 && nowtok != tok_comment_char && nowtok != tok_g0esc
359 && nowtok != tok_g1esc && nowtok != tok_g2esc
360 && nowtok != tok_g3esc && nowtok != tok_repertoiremap
361 && nowtok != tok_include)
362 {
363 lr_error (cmfile, _("syntax error in prolog: %s"),
364 _("invalid definition"));
365
366 lr_ignore_rest (cmfile, 0);
367 continue;
368 }
369
370 /* We know that we need an argument. */
371 arg = lr_token (cmfile, NULL, NULL, NULL, verbose);
372
373 switch (nowtok)
374 {
375 case tok_code_set_name:
376 case tok_repertoiremap:
377 if (arg->tok != tok_ident && arg->tok != tok_string)
378 {
379 badarg:
380 lr_error (cmfile, _("syntax error in prolog: %s"),
381 _("bad argument"));
382
383 lr_ignore_rest (cmfile, 0);
384 continue;
385 }
386
387 if (nowtok == tok_code_set_name)
388 result->code_set_name = obstack_copy0 (&result->mem_pool,
389 arg->val.str.startmb,
390 arg->val.str.lenmb);
391 else
392 result->repertoiremap = obstack_copy0 (&result->mem_pool,
393 arg->val.str.startmb,
394 arg->val.str.lenmb);
395
396 lr_ignore_rest (cmfile, 1);
397 continue;
398
399 case tok_mb_cur_max:
400 case tok_mb_cur_min:
401 if (arg->tok != tok_number)
402 goto badarg;
403
404 if ((nowtok == tok_mb_cur_max
405 && result->mb_cur_max != 0)
406 || (nowtok == tok_mb_cur_max
407 && result->mb_cur_max != 0))
408 lr_error (cmfile, _("duplicate definition of <%s>"),
409 nowtok == tok_mb_cur_min
410 ? "mb_cur_min" : "mb_cur_max");
411
412 if (arg->val.num < 1)
413 {
414 lr_error (cmfile,
415 _("value for <%s> must be 1 or greater"),
416 nowtok == tok_mb_cur_min
417 ? "mb_cur_min" : "mb_cur_max");
418
419 lr_ignore_rest (cmfile, 0);
420 continue;
421 }
422 if ((nowtok == tok_mb_cur_max && result->mb_cur_min != 0
423 && (int) arg->val.num < result->mb_cur_min)
424 || (nowtok == tok_mb_cur_min && result->mb_cur_max != 0
425 && (int) arg->val.num > result->mb_cur_max))
426 {
427 lr_error (cmfile, _("\
428value of <%s> must be greater or equal than the value of <%s>"),
429 "mb_cur_max", "mb_cur_min");
430
431 lr_ignore_rest (cmfile, 0);
432 continue;
433 }
434
435 if (nowtok == tok_mb_cur_max)
436 result->mb_cur_max = arg->val.num;
437 else
438 result->mb_cur_min = arg->val.num;
439
440 lr_ignore_rest (cmfile, 1);
441 continue;
442
443 case tok_escape_char:
444 case tok_comment_char:
445 if (arg->tok != tok_ident)
446 goto badarg;
447
448 if (arg->val.str.lenmb != 1)
449 {
450 lr_error (cmfile, _("\
451argument to <%s> must be a single character"),
452 nowtok == tok_escape_char ? "escape_char"
453 : "comment_char");
454
455 lr_ignore_rest (cmfile, 0);
456 continue;
457 }
458
459 if (nowtok == tok_escape_char)
460 cmfile->escape_char = *arg->val.str.startmb;
461 else
462 cmfile->comment_char = *arg->val.str.startmb;
463
464 lr_ignore_rest (cmfile, 1);
465 continue;
466
467 case tok_g0esc:
468 case tok_g1esc:
469 case tok_g2esc:
470 case tok_g3esc:
471 case tok_escseq:
472 lr_ignore_rest (cmfile, 0); /* XXX */
473 continue;
474
475 case tok_include:
476 lr_error (cmfile, _("\
477character sets with locking states are not supported"));
478 exit (4);
479
480 default:
481 /* Cannot happen. */
482 assert (! "Should not happen");
483 }
484 break;
485
486 case 2:
487 /* We have seen `CHARMAP' and now are in the body. Each line
488 must have the format "%s %s %s\n" or "%s...%s %s %s\n". */
489 if (nowtok == tok_eol)
490 /* Ignore empty lines. */
491 continue;
492
493 if (nowtok == tok_end)
494 {
495 expected_tok = tok_charmap;
496 expected_str = "CHARMAP";
497 state = 90;
498 continue;
499 }
500
501 if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
502 {
503 lr_error (cmfile, _("syntax error in %s definition: %s"),
504 "CHARMAP", _("no symbolic name given"));
505
506 lr_ignore_rest (cmfile, 0);
507 continue;
508 }
509
510 /* If the previous line was not completely correct free the
511 used memory. */
512 if (from_name != NULL)
513 obstack_free (&result->mem_pool, from_name);
514
515 if (nowtok == tok_bsymbol)
516 from_name = (char *) obstack_copy0 (&result->mem_pool,
517 now->val.str.startmb,
518 now->val.str.lenmb);
519 else
520 {
521 obstack_printf (&result->mem_pool, "U%08X",
522 cmfile->token.val.ucs4);
523 obstack_1grow (&result->mem_pool, '\0');
524 from_name = (char *) obstack_finish (&result->mem_pool);
525 }
526 to_name = NULL;
527
528 state = 3;
529 continue;
530
531 case 3:
532 /* We have two possibilities: We can see an ellipsis or an
533 encoding value. */
534 if (nowtok == tok_ellipsis3 || nowtok == tok_ellipsis4
535 || nowtok == tok_ellipsis2 || nowtok == tok_ellipsis4_2
536 || nowtok == tok_ellipsis2_2)
537 {
538 ellipsis = nowtok;
539 if (nowtok == tok_ellipsis4_2)
540 {
541 step = 2;
542 nowtok = tok_ellipsis4;
543 }
544 else if (nowtok == tok_ellipsis2_2)
545 {
546 step = 2;
547 nowtok = tok_ellipsis2;
548 }
549 state = 4;
550 continue;
551 }
552 /* FALLTHROUGH */
553
554 case 5:
555 if (nowtok != tok_charcode)
556 {
557 lr_error (cmfile, _("syntax error in %s definition: %s"),
558 "CHARMAP", _("invalid encoding given"));
559
560 lr_ignore_rest (cmfile, 0);
561
562 state = 2;
563 continue;
564 }
565
566 if (now->val.charcode.nbytes < result->mb_cur_min)
567 lr_error (cmfile, _("too few bytes in character encoding"));
568 else if (now->val.charcode.nbytes > result->mb_cur_max)
569 lr_error (cmfile, _("too many bytes in character encoding"));
570 else
571 charmap_new_char (cmfile, result, now->val.charcode.nbytes,
572 now->val.charcode.bytes, from_name, to_name,
573 ellipsis != tok_ellipsis2, step);
574
575 /* Ignore trailing comment silently. */
576 lr_ignore_rest (cmfile, 0);
577
578 from_name = NULL;
579 to_name = NULL;
580 ellipsis = tok_none;
581 step = 1;
582
583 state = 2;
584 continue;
585
586 case 4:
587 if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
588 {
589 lr_error (cmfile, _("syntax error in %s definition: %s"),
590 "CHARMAP",
591 _("no symbolic name given for end of range"));
592
593 lr_ignore_rest (cmfile, 0);
594 continue;
595 }
596
597 /* Copy the to-name in a safe place. */
598 if (nowtok == tok_bsymbol)
599 to_name = (char *) obstack_copy0 (&result->mem_pool,
600 cmfile->token.val.str.startmb,
601 cmfile->token.val.str.lenmb);
602 else
603 {
604 obstack_printf (&result->mem_pool, "U%08X",
605 cmfile->token.val.ucs4);
606 obstack_1grow (&result->mem_pool, '\0');
607 to_name = (char *) obstack_finish (&result->mem_pool);
608 }
609
610 state = 5;
611 continue;
612
613 case 90:
614 if (nowtok != expected_tok)
615 lr_error (cmfile, _("\
616%1$s: definition does not end with `END %1$s'"), expected_str);
617
618 lr_ignore_rest (cmfile, nowtok == expected_tok);
619 state = 91;
620 continue;
621
622 case 91:
623 /* Waiting for WIDTH... */
624 if (nowtok == tok_eol)
625 /* Ignore empty lines. */
626 continue;
627
628 if (nowtok == tok_width_default)
629 {
630 state = 92;
631 continue;
632 }
633
634 if (nowtok == tok_width)
635 {
636 lr_ignore_rest (cmfile, 1);
637 state = 93;
638 continue;
639 }
640
641 if (nowtok == tok_width_variable)
642 {
643 lr_ignore_rest (cmfile, 1);
644 state = 98;
645 continue;
646 }
647
648 lr_error (cmfile, _("\
649only WIDTH definitions are allowed to follow the CHARMAP definition"));
650
651 lr_ignore_rest (cmfile, 0);
652 continue;
653
654 case 92:
655 if (nowtok != tok_number)
656 lr_error (cmfile, _("value for %s must be an integer"),
657 "WIDTH_DEFAULT");
658 else
659 result->width_default = now->val.num;
660
661 lr_ignore_rest (cmfile, nowtok == tok_number);
662
663 state = 91;
664 continue;
665
666 case 93:
667 /* We now expect `END WIDTH' or lines of the format "%s %d\n" or
668 "%s...%s %d\n". */
669 if (nowtok == tok_eol)
670 /* ignore empty lines. */
671 continue;
672
673 if (nowtok == tok_end)
674 {
675 expected_tok = tok_width;
676 expected_str = "WIDTH";
677 state = 90;
678 continue;
679 }
680
681 if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
682 {
683 lr_error (cmfile, _("syntax error in %s definition: %s"),
684 "WIDTH", _("no symbolic name given"));
685
686 lr_ignore_rest (cmfile, 0);
687 continue;
688 }
689
690 if (from_name != NULL)
691 obstack_free (&result->mem_pool, from_name);
692
693 if (nowtok == tok_bsymbol)
694 from_name = (char *) obstack_copy0 (&result->mem_pool,
695 now->val.str.startmb,
696 now->val.str.lenmb);
697 else
698 {
699 obstack_printf (&result->mem_pool, "U%08X",
700 cmfile->token.val.ucs4);
701 obstack_1grow (&result->mem_pool, '\0');
702 from_name = (char *) obstack_finish (&result->mem_pool);
703 }
704
705 to_name = NULL;
706
707 state = 94;
708 continue;
709
710 case 94:
711 if (nowtok == tok_ellipsis3)
712 {
713 state = 95;
714 continue;
715 }
716
717 case 96:
718 if (nowtok != tok_number)
719 lr_error (cmfile, _("value for %s must be an integer"),
720 "WIDTH");
721 else
722 {
723 /* Store width for chars. */
724 new_width (cmfile, result, from_name, to_name, now->val.num);
725
726 from_name = NULL;
727 to_name = NULL;
728 }
729
730 lr_ignore_rest (cmfile, nowtok == tok_number);
731
732 state = 93;
733 continue;
734
735 case 95:
736 if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
737 {
738 lr_error (cmfile, _("syntax error in %s definition: %s"),
739 "WIDTH", _("no symbolic name given for end of range"));
740
741 lr_ignore_rest (cmfile, 0);
742
743 state = 93;
744 continue;
745 }
746
747 if (nowtok == tok_bsymbol)
748 to_name = (char *) obstack_copy0 (&result->mem_pool,
749 now->val.str.startmb,
750 now->val.str.lenmb);
751 else
752 {
753 obstack_printf (&result->mem_pool, "U%08X",
754 cmfile->token.val.ucs4);
755 obstack_1grow (&result->mem_pool, '\0');
756 to_name = (char *) obstack_finish (&result->mem_pool);
757 }
758
759 state = 96;
760 continue;
761
762 case 98:
763 /* We now expect `END WIDTH_VARIABLE' or lines of the format
764 "%s\n" or "%s...%s\n". */
765 if (nowtok == tok_eol)
766 /* ignore empty lines. */
767 continue;
768
769 if (nowtok == tok_end)
770 {
771 expected_tok = tok_width_variable;
772 expected_str = "WIDTH_VARIABLE";
773 state = 90;
774 continue;
775 }
776
777 if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
778 {
779 lr_error (cmfile, _("syntax error in %s definition: %s"),
780 "WIDTH_VARIABLE", _("no symbolic name given"));
781
782 lr_ignore_rest (cmfile, 0);
783
784 continue;
785 }
786
787 if (from_name != NULL)
788 obstack_free (&result->mem_pool, from_name);
789
790 if (nowtok == tok_bsymbol)
791 from_name = (char *) obstack_copy0 (&result->mem_pool,
792 now->val.str.startmb,
793 now->val.str.lenmb);
794 else
795 {
796 obstack_printf (&result->mem_pool, "U%08X",
797 cmfile->token.val.ucs4);
798 obstack_1grow (&result->mem_pool, '\0');
799 from_name = (char *) obstack_finish (&result->mem_pool);
800 }
801 to_name = NULL;
802
803 state = 99;
804 continue;
805
806 case 99:
807 if (nowtok == tok_ellipsis3)
808 state = 100;
809
810 /* Store info. */
811 from_name = NULL;
812
813 /* Warn */
814 state = 98;
815 continue;
816
817 case 100:
818 if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
819 {
820 lr_error (cmfile, _("syntax error in %s definition: %s"),
821 "WIDTH_VARIABLE",
822 _("no symbolic name given for end of range"));
823 lr_ignore_rest (cmfile, 0);
824 continue;
825 }
826
827 if (nowtok == tok_bsymbol)
828 to_name = (char *) obstack_copy0 (&result->mem_pool,
829 now->val.str.startmb,
830 now->val.str.lenmb);
831 else
832 {
833 obstack_printf (&result->mem_pool, "U%08X",
834 cmfile->token.val.ucs4);
835 obstack_1grow (&result->mem_pool, '\0');
836 to_name = (char *) obstack_finish (&result->mem_pool);
837 }
838
839 /* XXX Enter value into table. */
840
841 lr_ignore_rest (cmfile, 1);
842
843 state = 98;
844 continue;
845
846 default:
847 record_error (5, 0, _("%s: error in state machine"),
848 __FILE__);
849 /* NOTREACHED */
850 }
851 break;
852 }
853
854 if (state != 91)
855 record_error (0, 0, _("%s: premature end of file"),
856 cmfile->fname);
857
858 lr_close (cmfile);
859
860 return result;
861}
862
863
864static void
865new_width (struct linereader *cmfile, struct charmap_t *result,
866 const char *from, const char *to, unsigned long int width)
867{
868 struct charseq *from_val;
869 struct charseq *to_val;
870
871 from_val = charmap_find_value (result, from, strlen (from));
872 if (from_val == NULL)
873 {
874 lr_error (cmfile, _("unknown character `%s'"), from);
875 return;
876 }
877
878 if (to == NULL)
879 to_val = from_val;
880 else
881 {
882 to_val = charmap_find_value (result, to, strlen (to));
883 if (to_val == NULL)
884 {
885 lr_error (cmfile, _("unknown character `%s'"), to);
886 return;
887 }
888
889 /* Make sure the number of bytes for the end points of the range
890 is correct. */
891 if (from_val->nbytes != to_val->nbytes)
892 {
893 lr_error (cmfile, _("\
894number of bytes for byte sequence of beginning and end of range not the same: %d vs %d"),
895 from_val->nbytes, to_val->nbytes);
896 return;
897 }
898 }
899
900 if (result->nwidth_rules >= result->nwidth_rules_max)
901 {
902 size_t new_size = result->nwidth_rules + 32;
903 struct width_rule *new_rules =
904 (struct width_rule *) obstack_alloc (&result->mem_pool,
905 (new_size
906 * sizeof (struct width_rule)));
907
908 memcpy (new_rules, result->width_rules,
909 result->nwidth_rules_max * sizeof (struct width_rule));
910
911 result->width_rules = new_rules;
912 result->nwidth_rules_max = new_size;
913 }
914
915 result->width_rules[result->nwidth_rules].from = from_val;
916 result->width_rules[result->nwidth_rules].to = to_val;
917 result->width_rules[result->nwidth_rules].width = (unsigned int) width;
918 ++result->nwidth_rules;
919}
920
921
922struct charseq *
923charmap_find_value (const struct charmap_t *cm, const char *name, size_t len)
924{
925 void *result;
926
927 return (find_entry ((hash_table *) &cm->char_table, name, len, &result)
928 < 0 ? NULL : (struct charseq *) result);
929}
930
931
932static void
933charmap_new_char (struct linereader *lr, struct charmap_t *cm,
934 size_t nbytes, unsigned char *bytes,
935 const char *from, const char *to,
936 int decimal_ellipsis, int step)
937{
938 hash_table *ht = &cm->char_table;
939 hash_table *bt = &cm->byte_table;
940 struct obstack *ob = &cm->mem_pool;
941 char *from_end;
942 char *to_end;
943 const char *cp;
944 int prefix_len, len1, len2;
945 unsigned int from_nr, to_nr, cnt;
946 struct charseq *newp;
947
948 len1 = strlen (from);
949
950 if (to == NULL)
951 {
952 newp = (struct charseq *) obstack_alloc (ob, sizeof (*newp) + nbytes);
953 newp->nbytes = nbytes;
954 memcpy (newp->bytes, bytes, nbytes);
955 newp->name = from;
956
957 newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
958 if ((from[0] == 'U' || from[0] == 'P') && (len1 == 5 || len1 == 9))
959 {
960 /* Maybe the name is of the form `Uxxxx' or `Uxxxxxxxx' where
961 xxxx and xxxxxxxx are hexadecimal numbers. In this case
962 we use the value of xxxx or xxxxxxxx as the UCS4 value of
963 this character and we don't have to consult the repertoire
964 map.
965
966 If the name is of the form `Pxxxx' or `Pxxxxxxxx' the xxxx
967 and xxxxxxxx also give the code point in UCS4 but this must
968 be in the private, i.e., unassigned, area. This should be
969 used for characters which do not (yet) have an equivalent
970 in ISO 10646 and Unicode. */
971 char *endp;
972
973 errno = 0;
974 newp->ucs4 = strtoul (from + 1, &endp, 16);
975 if (endp - from != len1
976 || (newp->ucs4 == ~((uint32_t) 0) && errno == ERANGE)
977 || newp->ucs4 >= 0x80000000)
978 /* This wasn't successful. Signal this name cannot be a
979 correct UCS value. */
980 newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
981 }
982
983 insert_entry (ht, from, len1, newp);
984 insert_entry (bt, newp->bytes, nbytes, newp);
985 /* Please note that it isn't a bug if a symbol is defined more
986 than once. All later definitions are simply discarded. */
987 return;
988 }
989
990 /* We have a range: the names must have names with equal prefixes
991 and an equal number of digits, where the second number is greater
992 or equal than the first. */
993 len2 = strlen (to);
994
995 if (len1 != len2)
996 {
997 illegal_range:
998 lr_error (lr, _("invalid names for character range"));
999 return;
1000 }
1001
1002 cp = &from[len1 - 1];
1003 if (decimal_ellipsis)
1004 while (isdigit (*cp) && cp >= from)
1005 --cp;
1006 else
1007 while (isxdigit (*cp) && cp >= from)
1008 {
1009 if (!isdigit (*cp) && !isupper (*cp))
1010 lr_error (lr, _("\
1011hexadecimal range format should use only capital characters"));
1012 --cp;
1013 }
1014
1015 prefix_len = (cp - from) + 1;
1016
1017 if (cp == &from[len1 - 1] || strncmp (from, to, prefix_len) != 0)
1018 goto illegal_range;
1019
1020 errno = 0;
1021 from_nr = strtoul (&from[prefix_len], &from_end, decimal_ellipsis ? 10 : 16);
1022 if (*from_end != '\0' || (from_nr == UINT_MAX && errno == ERANGE)
1023 || ((to_nr = strtoul (&to[prefix_len], &to_end,
1024 decimal_ellipsis ? 10 : 16)) == UINT_MAX
1025 && errno == ERANGE)
1026 || *to_end != '\0')
1027 {
1028 lr_error (lr, _("<%s> and <%s> are invalid names for range"), from, to);
1029 return;
1030 }
1031
1032 if (from_nr > to_nr)
1033 {
1034 lr_error (lr, _("upper limit in range is smaller than lower limit"));
1035 return;
1036 }
1037
1038 for (cnt = from_nr; cnt <= to_nr; cnt += step)
1039 {
1040 char *name_end;
1041 obstack_printf (ob, decimal_ellipsis ? "%.*s%0*d" : "%.*s%0*X",
1042 prefix_len, from, len1 - prefix_len, cnt);
1043 obstack_1grow (ob, '\0');
1044 name_end = obstack_finish (ob);
1045
1046 newp = (struct charseq *) obstack_alloc (ob, sizeof (*newp) + nbytes);
1047 newp->nbytes = nbytes;
1048 memcpy (newp->bytes, bytes, nbytes);
1049 newp->name = name_end;
1050
1051 newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
1052 if ((name_end[0] == 'U' || name_end[0] == 'P')
1053 && (len1 == 5 || len1 == 9))
1054 {
1055 /* Maybe the name is of the form `Uxxxx' or `Uxxxxxxxx' where
1056 xxxx and xxxxxxxx are hexadecimal numbers. In this case
1057 we use the value of xxxx or xxxxxxxx as the UCS4 value of
1058 this character and we don't have to consult the repertoire
1059 map.
1060
1061 If the name is of the form `Pxxxx' or `Pxxxxxxxx' the xxxx
1062 and xxxxxxxx also give the code point in UCS4 but this must
1063 be in the private, i.e., unassigned, area. This should be
1064 used for characters which do not (yet) have an equivalent
1065 in ISO 10646 and Unicode. */
1066 char *endp;
1067
1068 errno = 0;
1069 newp->ucs4 = strtoul (name_end + 1, &endp, 16);
1070 if (endp - name_end != len1
1071 || (newp->ucs4 == ~((uint32_t) 0) && errno == ERANGE)
1072 || newp->ucs4 >= 0x80000000)
1073 /* This wasn't successful. Signal this name cannot be a
1074 correct UCS value. */
1075 newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
1076 }
1077
1078 insert_entry (ht, name_end, len1, newp);
1079 insert_entry (bt, newp->bytes, nbytes, newp);
1080 /* Please note we don't examine the return value since it is no error
1081 if we have two definitions for a symbol. */
1082
1083 /* Increment the value in the byte sequence. */
1084 if (++bytes[nbytes - 1] == '\0')
1085 {
1086 int b = nbytes - 2;
1087
1088 do
1089 if (b < 0)
1090 {
1091 lr_error (lr,
1092 _("resulting bytes for range not representable."));
1093 return;
1094 }
1095 while (++bytes[b--] == 0);
1096 }
1097 }
1098}
1099
1100
1101struct charseq *
1102charmap_find_symbol (const struct charmap_t *cm, const char *bytes,
1103 size_t nbytes)
1104{
1105 void *result;
1106
1107 return (find_entry ((hash_table *) &cm->byte_table, bytes, nbytes, &result)
1108 < 0 ? NULL : (struct charseq *) result);
1109}
1110