1 | /* Copyright (C) 1996-2016 Free Software Foundation, Inc. |
2 | This file is part of the GNU C Library. |
3 | Contributed by Ulrich Drepper <drepper@gnu.org>, 1996. |
4 | |
5 | This program is free software; you can redistribute it and/or modify |
6 | it under the terms of the GNU General Public License as published |
7 | by the Free Software Foundation; version 2 of the License, or |
8 | (at your option) any later version. |
9 | |
10 | This program is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
13 | GNU General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU General Public License |
16 | along with this program; if not, see <http://www.gnu.org/licenses/>. */ |
17 | |
18 | #ifdef HAVE_CONFIG_H |
19 | # include <config.h> |
20 | #endif |
21 | |
22 | #include <ctype.h> |
23 | #include <errno.h> |
24 | #include <libintl.h> |
25 | #include <limits.h> |
26 | #include <stdio.h> |
27 | #include <stdlib.h> |
28 | #include <string.h> |
29 | #include <error.h> |
30 | #include <stdint.h> |
31 | |
32 | #include "localedef.h" |
33 | #include "linereader.h" |
34 | #include "charmap.h" |
35 | #include "charmap-dir.h" |
36 | |
37 | #include <assert.h> |
38 | |
39 | |
40 | /* Define the lookup function. */ |
41 | #include "charmap-kw.h" |
42 | |
43 | |
/* Prototypes for local functions.  */

/* Parse the charmap description read through CMFILE and return the
   resulting charmap.  */
static struct charmap_t *parse_charmap (struct linereader *cmfile,
                                        int verbose, int be_quiet);
/* Append a width rule covering the characters named FROM through TO
   (TO may be NULL for a single character) to RESULT.  */
static void new_width (struct linereader *cmfile, struct charmap_t *result,
                       const char *from, const char *to,
                       unsigned long int width);
/* Enter the character named FROM, or the range FROM..TO when TO is not
   NULL, with the byte sequence BYTES of length NBYTES into CM.  */
static void charmap_new_char (struct linereader *lr, struct charmap_t *cm,
                              size_t nbytes, unsigned char *bytes,
                              const char *from, const char *to,
                              int decimal_ellipsis, int step);


/* Set to true by charmap_read when the charmap it loaded fails the
   ASCII compatibility test for the basic character set.  */
bool enc_not_ascii_compatible;


#ifdef NEED_NULL_POINTER
/* File-scope object without initializer, hence a null pointer.
   NOTE(review): presumably exists so the build can define
   DEFAULT_CHARMAP as `null_pointer' -- confirm against the Makefile.  */
static const char *null_pointer;
#endif
62 | |
63 | static struct linereader * |
64 | cmlr_open (const char *directory, const char *name, kw_hash_fct_t hf) |
65 | { |
66 | FILE *fp; |
67 | |
68 | fp = charmap_open (directory, name); |
69 | if (fp == NULL) |
70 | return NULL; |
71 | else |
72 | { |
73 | size_t dlen = strlen (directory); |
74 | int add_slash = (dlen == 0 || directory[dlen - 1] != '/'); |
75 | size_t nlen = strlen (name); |
76 | char *pathname; |
77 | char *p; |
78 | |
79 | pathname = alloca (dlen + add_slash + nlen + 1); |
80 | p = stpcpy (pathname, directory); |
81 | if (add_slash) |
82 | *p++ = '/'; |
83 | stpcpy (p, name); |
84 | |
85 | return lr_create (fp, pathname, hf); |
86 | } |
87 | } |
88 | |
89 | struct charmap_t * |
90 | charmap_read (const char *filename, int verbose, int error_not_found, |
91 | int be_quiet, int use_default) |
92 | { |
93 | struct charmap_t *result = NULL; |
94 | |
95 | if (filename != NULL) |
96 | { |
97 | struct linereader *cmfile; |
98 | |
99 | /* First try the name as found in the parameter. */ |
100 | cmfile = lr_open (filename, charmap_hash); |
101 | if (cmfile == NULL) |
102 | { |
103 | /* No successful. So start looking through the directories |
104 | in the I18NPATH if this is a simple name. */ |
105 | if (strchr (filename, '/') == NULL) |
106 | { |
107 | char *i18npath = getenv ("I18NPATH" ); |
108 | if (i18npath != NULL && *i18npath != '\0') |
109 | { |
110 | const size_t pathlen = strlen (i18npath); |
111 | char i18npathbuf[pathlen + 1]; |
112 | char path[pathlen + sizeof ("/charmaps" )]; |
113 | char *next; |
114 | i18npath = memcpy (i18npathbuf, i18npath, pathlen + 1); |
115 | |
116 | while (cmfile == NULL |
117 | && (next = strsep (&i18npath, ":" )) != NULL) |
118 | { |
119 | stpcpy (stpcpy (path, next), "/charmaps" ); |
120 | cmfile = cmlr_open (path, filename, charmap_hash); |
121 | |
122 | if (cmfile == NULL) |
123 | /* Try without the "/charmaps" part. */ |
124 | cmfile = cmlr_open (next, filename, charmap_hash); |
125 | } |
126 | } |
127 | |
128 | if (cmfile == NULL) |
129 | /* Try the default directory. */ |
130 | cmfile = cmlr_open (CHARMAP_PATH, filename, charmap_hash); |
131 | } |
132 | } |
133 | |
134 | if (cmfile != NULL) |
135 | result = parse_charmap (cmfile, verbose, be_quiet); |
136 | |
137 | if (result == NULL && error_not_found) |
138 | WITH_CUR_LOCALE (error (0, errno, _("\ |
139 | character map file `%s' not found" ), filename)); |
140 | } |
141 | |
142 | if (result == NULL && filename != NULL && strchr (filename, '/') == NULL) |
143 | { |
144 | /* OK, one more try. We also accept the names given to the |
145 | character sets in the files. Sometimes they differ from the |
146 | file name. */ |
147 | CHARMAP_DIR *dir; |
148 | |
149 | dir = charmap_opendir (CHARMAP_PATH); |
150 | if (dir != NULL) |
151 | { |
152 | const char *dirent; |
153 | |
154 | while ((dirent = charmap_readdir (dir)) != NULL) |
155 | { |
156 | char **aliases; |
157 | char **p; |
158 | int found; |
159 | |
160 | aliases = charmap_aliases (CHARMAP_PATH, dirent); |
161 | found = 0; |
162 | for (p = aliases; *p; p++) |
163 | if (strcasecmp (*p, filename) == 0) |
164 | { |
165 | found = 1; |
166 | break; |
167 | } |
168 | charmap_free_aliases (aliases); |
169 | |
170 | if (found) |
171 | { |
172 | struct linereader *cmfile; |
173 | |
174 | cmfile = cmlr_open (CHARMAP_PATH, dirent, charmap_hash); |
175 | if (cmfile != NULL) |
176 | result = parse_charmap (cmfile, verbose, be_quiet); |
177 | |
178 | break; |
179 | } |
180 | } |
181 | |
182 | charmap_closedir (dir); |
183 | } |
184 | } |
185 | |
186 | if (result == NULL && DEFAULT_CHARMAP != NULL) |
187 | { |
188 | struct linereader *cmfile; |
189 | |
190 | cmfile = cmlr_open (CHARMAP_PATH, DEFAULT_CHARMAP, charmap_hash); |
191 | if (cmfile != NULL) |
192 | result = parse_charmap (cmfile, verbose, be_quiet); |
193 | |
194 | if (result == NULL) |
195 | WITH_CUR_LOCALE (error (4, errno, _("\ |
196 | default character map file `%s' not found" ), DEFAULT_CHARMAP)); |
197 | } |
198 | |
199 | if (result != NULL && result->code_set_name == NULL) |
200 | /* The input file does not specify a code set name. This |
201 | shouldn't happen but we should cope with it. */ |
202 | result->code_set_name = basename (filename); |
203 | |
204 | /* Test of ASCII compatibility of locale encoding. |
205 | |
206 | Verify that the encoding to be used in a locale is ASCII compatible, |
207 | at least for the graphic characters, excluding the control characters, |
208 | '$' and '@'. This constraint comes from an ISO C 99 restriction. |
209 | |
210 | ISO C 99 section 7.17.(2) (about wchar_t): |
211 | the null character shall have the code value zero and each member of |
212 | the basic character set shall have a code value equal to its value |
213 | when used as the lone character in an integer character constant. |
214 | ISO C 99 section 5.2.1.(3): |
215 | Both the basic source and basic execution character sets shall have |
216 | the following members: the 26 uppercase letters of the Latin alphabet |
217 | A B C D E F G H I J K L M N O P Q R S T U V W X Y Z |
218 | the 26 lowercase letters of the Latin alphabet |
219 | a b c d e f g h i j k l m n o p q r s t u v w x y z |
220 | the 10 decimal digits |
221 | 0 1 2 3 4 5 6 7 8 9 |
222 | the following 29 graphic characters |
223 | ! " # % & ' ( ) * + , - . / : ; < = > ? [ \ ] ^ _ { | } ~ |
224 | the space character, and control characters representing horizontal |
225 | tab, vertical tab, and form feed. |
226 | |
227 | Therefore, for all members of the "basic character set", the 'char' code |
228 | must have the same value as the 'wchar_t' code, which in glibc is the |
229 | same as the Unicode code, which for all of the enumerated characters |
230 | is identical to the ASCII code. */ |
231 | if (result != NULL && use_default) |
232 | { |
233 | static const char basic_charset[] = |
234 | { |
235 | 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', |
236 | 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', |
237 | 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', |
238 | 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', |
239 | '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', |
240 | '!', '"', '#', '%', '&', '\'', '(', ')', '*', '+', ',', '-', |
241 | '.', '/', ':', ';', '<', '=', '>', '?', '[', '\\', ']', '^', |
242 | '_', '{', '|', '}', '~', ' ', '\t', '\v', '\f', '\0' |
243 | }; |
244 | int failed = 0; |
245 | const char *p = basic_charset; |
246 | |
247 | do |
248 | { |
249 | struct charseq *seq = charmap_find_symbol (result, p, 1); |
250 | |
251 | if (seq == NULL || seq->ucs4 != (uint32_t) *p) |
252 | failed = 1; |
253 | } |
254 | while (*p++ != '\0'); |
255 | |
256 | if (failed) |
257 | { |
258 | WITH_CUR_LOCALE (fprintf (stderr, _("\ |
259 | character map `%s' is not ASCII compatible, locale not ISO C compliant\n" ), |
260 | result->code_set_name)); |
261 | enc_not_ascii_compatible = true; |
262 | } |
263 | } |
264 | |
265 | return result; |
266 | } |
267 | |
268 | |
269 | static struct charmap_t * |
270 | parse_charmap (struct linereader *cmfile, int verbose, int be_quiet) |
271 | { |
272 | struct charmap_t *result; |
273 | int state; |
274 | enum token_t expected_tok = tok_error; |
275 | const char *expected_str = NULL; |
276 | char *from_name = NULL; |
277 | char *to_name = NULL; |
278 | enum token_t ellipsis = 0; |
279 | int step = 1; |
280 | |
281 | /* We don't want symbolic names in string to be translated. */ |
282 | cmfile->translate_strings = 0; |
283 | |
284 | /* Allocate room for result. */ |
285 | result = (struct charmap_t *) xmalloc (sizeof (struct charmap_t)); |
286 | memset (result, '\0', sizeof (struct charmap_t)); |
287 | /* The default DEFAULT_WIDTH is 1. */ |
288 | result->width_default = 1; |
289 | |
290 | #define obstack_chunk_alloc malloc |
291 | #define obstack_chunk_free free |
292 | obstack_init (&result->mem_pool); |
293 | |
294 | if (init_hash (&result->char_table, 256) |
295 | || init_hash (&result->byte_table, 256)) |
296 | { |
297 | free (result); |
298 | return NULL; |
299 | } |
300 | |
301 | /* We use a state machine to describe the charmap description file |
302 | format. */ |
303 | state = 1; |
304 | while (1) |
305 | { |
306 | /* What's on? */ |
307 | struct token *now = lr_token (cmfile, NULL, NULL, NULL, verbose); |
308 | enum token_t nowtok = now->tok; |
309 | struct token *arg; |
310 | |
311 | if (nowtok == tok_eof) |
312 | break; |
313 | |
314 | switch (state) |
315 | { |
316 | case 1: |
317 | /* The beginning. We expect the special declarations, EOL or |
318 | `CHARMAP'. */ |
319 | if (nowtok == tok_eol) |
320 | /* Ignore empty lines. */ |
321 | continue; |
322 | |
323 | if (nowtok == tok_charmap) |
324 | { |
325 | from_name = NULL; |
326 | to_name = NULL; |
327 | |
328 | /* We have to set up the real work. Fill in some |
329 | default values. */ |
330 | if (result->mb_cur_max == 0) |
331 | result->mb_cur_max = 1; |
332 | if (result->mb_cur_min == 0) |
333 | result->mb_cur_min = result->mb_cur_max; |
334 | if (result->mb_cur_min > result->mb_cur_max) |
335 | { |
336 | if (!be_quiet) |
337 | WITH_CUR_LOCALE (error (0, 0, _("\ |
338 | %s: <mb_cur_max> must be greater than <mb_cur_min>\n" ), |
339 | cmfile->fname)); |
340 | |
341 | result->mb_cur_min = result->mb_cur_max; |
342 | } |
343 | |
344 | lr_ignore_rest (cmfile, 1); |
345 | |
346 | state = 2; |
347 | continue; |
348 | } |
349 | |
350 | if (nowtok != tok_code_set_name && nowtok != tok_mb_cur_max |
351 | && nowtok != tok_mb_cur_min && nowtok != tok_escape_char |
352 | && nowtok != tok_comment_char && nowtok != tok_g0esc |
353 | && nowtok != tok_g1esc && nowtok != tok_g2esc |
354 | && nowtok != tok_g3esc && nowtok != tok_repertoiremap |
355 | && nowtok != tok_include) |
356 | { |
357 | lr_error (cmfile, _("syntax error in prolog: %s" ), |
358 | _("invalid definition" )); |
359 | |
360 | lr_ignore_rest (cmfile, 0); |
361 | continue; |
362 | } |
363 | |
364 | /* We know that we need an argument. */ |
365 | arg = lr_token (cmfile, NULL, NULL, NULL, verbose); |
366 | |
367 | switch (nowtok) |
368 | { |
369 | case tok_code_set_name: |
370 | case tok_repertoiremap: |
371 | if (arg->tok != tok_ident && arg->tok != tok_string) |
372 | { |
373 | badarg: |
374 | lr_error (cmfile, _("syntax error in prolog: %s" ), |
375 | _("bad argument" )); |
376 | |
377 | lr_ignore_rest (cmfile, 0); |
378 | continue; |
379 | } |
380 | |
381 | if (nowtok == tok_code_set_name) |
382 | result->code_set_name = obstack_copy0 (&result->mem_pool, |
383 | arg->val.str.startmb, |
384 | arg->val.str.lenmb); |
385 | else |
386 | result->repertoiremap = obstack_copy0 (&result->mem_pool, |
387 | arg->val.str.startmb, |
388 | arg->val.str.lenmb); |
389 | |
390 | lr_ignore_rest (cmfile, 1); |
391 | continue; |
392 | |
393 | case tok_mb_cur_max: |
394 | case tok_mb_cur_min: |
395 | if (arg->tok != tok_number) |
396 | goto badarg; |
397 | |
398 | if (verbose |
399 | && ((nowtok == tok_mb_cur_max |
400 | && result->mb_cur_max != 0) |
401 | || (nowtok == tok_mb_cur_max |
402 | && result->mb_cur_max != 0))) |
403 | lr_error (cmfile, _("duplicate definition of <%s>" ), |
404 | nowtok == tok_mb_cur_min |
405 | ? "mb_cur_min" : "mb_cur_max" ); |
406 | |
407 | if (arg->val.num < 1) |
408 | { |
409 | lr_error (cmfile, |
410 | _("value for <%s> must be 1 or greater" ), |
411 | nowtok == tok_mb_cur_min |
412 | ? "mb_cur_min" : "mb_cur_max" ); |
413 | |
414 | lr_ignore_rest (cmfile, 0); |
415 | continue; |
416 | } |
417 | if ((nowtok == tok_mb_cur_max && result->mb_cur_min != 0 |
418 | && (int) arg->val.num < result->mb_cur_min) |
419 | || (nowtok == tok_mb_cur_min && result->mb_cur_max != 0 |
420 | && (int) arg->val.num > result->mb_cur_max)) |
421 | { |
422 | lr_error (cmfile, _("\ |
423 | value of <%s> must be greater or equal than the value of <%s>" ), |
424 | "mb_cur_max" , "mb_cur_min" ); |
425 | |
426 | lr_ignore_rest (cmfile, 0); |
427 | continue; |
428 | } |
429 | |
430 | if (nowtok == tok_mb_cur_max) |
431 | result->mb_cur_max = arg->val.num; |
432 | else |
433 | result->mb_cur_min = arg->val.num; |
434 | |
435 | lr_ignore_rest (cmfile, 1); |
436 | continue; |
437 | |
438 | case tok_escape_char: |
439 | case tok_comment_char: |
440 | if (arg->tok != tok_ident) |
441 | goto badarg; |
442 | |
443 | if (arg->val.str.lenmb != 1) |
444 | { |
445 | lr_error (cmfile, _("\ |
446 | argument to <%s> must be a single character" ), |
447 | nowtok == tok_escape_char ? "escape_char" |
448 | : "comment_char" ); |
449 | |
450 | lr_ignore_rest (cmfile, 0); |
451 | continue; |
452 | } |
453 | |
454 | if (nowtok == tok_escape_char) |
455 | cmfile->escape_char = *arg->val.str.startmb; |
456 | else |
457 | cmfile->comment_char = *arg->val.str.startmb; |
458 | |
459 | lr_ignore_rest (cmfile, 1); |
460 | continue; |
461 | |
462 | case tok_g0esc: |
463 | case tok_g1esc: |
464 | case tok_g2esc: |
465 | case tok_g3esc: |
466 | case tok_escseq: |
467 | lr_ignore_rest (cmfile, 0); /* XXX */ |
468 | continue; |
469 | |
470 | case tok_include: |
471 | lr_error (cmfile, _("\ |
472 | character sets with locking states are not supported" )); |
473 | exit (4); |
474 | |
475 | default: |
476 | /* Cannot happen. */ |
477 | assert (! "Should not happen" ); |
478 | } |
479 | break; |
480 | |
481 | case 2: |
482 | /* We have seen `CHARMAP' and now are in the body. Each line |
483 | must have the format "%s %s %s\n" or "%s...%s %s %s\n". */ |
484 | if (nowtok == tok_eol) |
485 | /* Ignore empty lines. */ |
486 | continue; |
487 | |
488 | if (nowtok == tok_end) |
489 | { |
490 | expected_tok = tok_charmap; |
491 | expected_str = "CHARMAP" ; |
492 | state = 90; |
493 | continue; |
494 | } |
495 | |
496 | if (nowtok != tok_bsymbol && nowtok != tok_ucs4) |
497 | { |
498 | lr_error (cmfile, _("syntax error in %s definition: %s" ), |
499 | "CHARMAP" , _("no symbolic name given" )); |
500 | |
501 | lr_ignore_rest (cmfile, 0); |
502 | continue; |
503 | } |
504 | |
505 | /* If the previous line was not completely correct free the |
506 | used memory. */ |
507 | if (from_name != NULL) |
508 | obstack_free (&result->mem_pool, from_name); |
509 | |
510 | if (nowtok == tok_bsymbol) |
511 | from_name = (char *) obstack_copy0 (&result->mem_pool, |
512 | now->val.str.startmb, |
513 | now->val.str.lenmb); |
514 | else |
515 | { |
516 | obstack_printf (&result->mem_pool, "U%08X" , |
517 | cmfile->token.val.ucs4); |
518 | obstack_1grow (&result->mem_pool, '\0'); |
519 | from_name = (char *) obstack_finish (&result->mem_pool); |
520 | } |
521 | to_name = NULL; |
522 | |
523 | state = 3; |
524 | continue; |
525 | |
526 | case 3: |
527 | /* We have two possibilities: We can see an ellipsis or an |
528 | encoding value. */ |
529 | if (nowtok == tok_ellipsis3 || nowtok == tok_ellipsis4 |
530 | || nowtok == tok_ellipsis2 || nowtok == tok_ellipsis4_2 |
531 | || nowtok == tok_ellipsis2_2) |
532 | { |
533 | ellipsis = nowtok; |
534 | if (nowtok == tok_ellipsis4_2) |
535 | { |
536 | step = 2; |
537 | nowtok = tok_ellipsis4; |
538 | } |
539 | else if (nowtok == tok_ellipsis2_2) |
540 | { |
541 | step = 2; |
542 | nowtok = tok_ellipsis2; |
543 | } |
544 | state = 4; |
545 | continue; |
546 | } |
547 | /* FALLTHROUGH */ |
548 | |
549 | case 5: |
550 | if (nowtok != tok_charcode) |
551 | { |
552 | lr_error (cmfile, _("syntax error in %s definition: %s" ), |
553 | "CHARMAP" , _("invalid encoding given" )); |
554 | |
555 | lr_ignore_rest (cmfile, 0); |
556 | |
557 | state = 2; |
558 | continue; |
559 | } |
560 | |
561 | if (now->val.charcode.nbytes < result->mb_cur_min) |
562 | lr_error (cmfile, _("too few bytes in character encoding" )); |
563 | else if (now->val.charcode.nbytes > result->mb_cur_max) |
564 | lr_error (cmfile, _("too many bytes in character encoding" )); |
565 | else |
566 | charmap_new_char (cmfile, result, now->val.charcode.nbytes, |
567 | now->val.charcode.bytes, from_name, to_name, |
568 | ellipsis != tok_ellipsis2, step); |
569 | |
570 | /* Ignore trailing comment silently. */ |
571 | lr_ignore_rest (cmfile, 0); |
572 | |
573 | from_name = NULL; |
574 | to_name = NULL; |
575 | ellipsis = tok_none; |
576 | step = 1; |
577 | |
578 | state = 2; |
579 | continue; |
580 | |
581 | case 4: |
582 | if (nowtok != tok_bsymbol && nowtok != tok_ucs4) |
583 | { |
584 | lr_error (cmfile, _("syntax error in %s definition: %s" ), |
585 | "CHARMAP" , |
586 | _("no symbolic name given for end of range" )); |
587 | |
588 | lr_ignore_rest (cmfile, 0); |
589 | continue; |
590 | } |
591 | |
592 | /* Copy the to-name in a safe place. */ |
593 | if (nowtok == tok_bsymbol) |
594 | to_name = (char *) obstack_copy0 (&result->mem_pool, |
595 | cmfile->token.val.str.startmb, |
596 | cmfile->token.val.str.lenmb); |
597 | else |
598 | { |
599 | obstack_printf (&result->mem_pool, "U%08X" , |
600 | cmfile->token.val.ucs4); |
601 | obstack_1grow (&result->mem_pool, '\0'); |
602 | to_name = (char *) obstack_finish (&result->mem_pool); |
603 | } |
604 | |
605 | state = 5; |
606 | continue; |
607 | |
608 | case 90: |
609 | if (nowtok != expected_tok) |
610 | lr_error (cmfile, _("\ |
611 | %1$s: definition does not end with `END %1$s'" ), expected_str); |
612 | |
613 | lr_ignore_rest (cmfile, nowtok == expected_tok); |
614 | state = 91; |
615 | continue; |
616 | |
617 | case 91: |
618 | /* Waiting for WIDTH... */ |
619 | if (nowtok == tok_eol) |
620 | /* Ignore empty lines. */ |
621 | continue; |
622 | |
623 | if (nowtok == tok_width_default) |
624 | { |
625 | state = 92; |
626 | continue; |
627 | } |
628 | |
629 | if (nowtok == tok_width) |
630 | { |
631 | lr_ignore_rest (cmfile, 1); |
632 | state = 93; |
633 | continue; |
634 | } |
635 | |
636 | if (nowtok == tok_width_variable) |
637 | { |
638 | lr_ignore_rest (cmfile, 1); |
639 | state = 98; |
640 | continue; |
641 | } |
642 | |
643 | lr_error (cmfile, _("\ |
644 | only WIDTH definitions are allowed to follow the CHARMAP definition" )); |
645 | |
646 | lr_ignore_rest (cmfile, 0); |
647 | continue; |
648 | |
649 | case 92: |
650 | if (nowtok != tok_number) |
651 | lr_error (cmfile, _("value for %s must be an integer" ), |
652 | "WIDTH_DEFAULT" ); |
653 | else |
654 | result->width_default = now->val.num; |
655 | |
656 | lr_ignore_rest (cmfile, nowtok == tok_number); |
657 | |
658 | state = 91; |
659 | continue; |
660 | |
661 | case 93: |
662 | /* We now expect `END WIDTH' or lines of the format "%s %d\n" or |
663 | "%s...%s %d\n". */ |
664 | if (nowtok == tok_eol) |
665 | /* ignore empty lines. */ |
666 | continue; |
667 | |
668 | if (nowtok == tok_end) |
669 | { |
670 | expected_tok = tok_width; |
671 | expected_str = "WIDTH" ; |
672 | state = 90; |
673 | continue; |
674 | } |
675 | |
676 | if (nowtok != tok_bsymbol && nowtok != tok_ucs4) |
677 | { |
678 | lr_error (cmfile, _("syntax error in %s definition: %s" ), |
679 | "WIDTH" , _("no symbolic name given" )); |
680 | |
681 | lr_ignore_rest (cmfile, 0); |
682 | continue; |
683 | } |
684 | |
685 | if (from_name != NULL) |
686 | obstack_free (&result->mem_pool, from_name); |
687 | |
688 | if (nowtok == tok_bsymbol) |
689 | from_name = (char *) obstack_copy0 (&result->mem_pool, |
690 | now->val.str.startmb, |
691 | now->val.str.lenmb); |
692 | else |
693 | { |
694 | obstack_printf (&result->mem_pool, "U%08X" , |
695 | cmfile->token.val.ucs4); |
696 | obstack_1grow (&result->mem_pool, '\0'); |
697 | from_name = (char *) obstack_finish (&result->mem_pool); |
698 | } |
699 | |
700 | to_name = NULL; |
701 | |
702 | state = 94; |
703 | continue; |
704 | |
705 | case 94: |
706 | if (nowtok == tok_ellipsis3) |
707 | { |
708 | state = 95; |
709 | continue; |
710 | } |
711 | |
712 | case 96: |
713 | if (nowtok != tok_number) |
714 | lr_error (cmfile, _("value for %s must be an integer" ), |
715 | "WIDTH" ); |
716 | else |
717 | { |
718 | /* Store width for chars. */ |
719 | new_width (cmfile, result, from_name, to_name, now->val.num); |
720 | |
721 | from_name = NULL; |
722 | to_name = NULL; |
723 | } |
724 | |
725 | lr_ignore_rest (cmfile, nowtok == tok_number); |
726 | |
727 | state = 93; |
728 | continue; |
729 | |
730 | case 95: |
731 | if (nowtok != tok_bsymbol && nowtok != tok_ucs4) |
732 | { |
733 | lr_error (cmfile, _("syntax error in %s definition: %s" ), |
734 | "WIDTH" , _("no symbolic name given for end of range" )); |
735 | |
736 | lr_ignore_rest (cmfile, 0); |
737 | |
738 | state = 93; |
739 | continue; |
740 | } |
741 | |
742 | if (nowtok == tok_bsymbol) |
743 | to_name = (char *) obstack_copy0 (&result->mem_pool, |
744 | now->val.str.startmb, |
745 | now->val.str.lenmb); |
746 | else |
747 | { |
748 | obstack_printf (&result->mem_pool, "U%08X" , |
749 | cmfile->token.val.ucs4); |
750 | obstack_1grow (&result->mem_pool, '\0'); |
751 | to_name = (char *) obstack_finish (&result->mem_pool); |
752 | } |
753 | |
754 | state = 96; |
755 | continue; |
756 | |
757 | case 98: |
758 | /* We now expect `END WIDTH_VARIABLE' or lines of the format |
759 | "%s\n" or "%s...%s\n". */ |
760 | if (nowtok == tok_eol) |
761 | /* ignore empty lines. */ |
762 | continue; |
763 | |
764 | if (nowtok == tok_end) |
765 | { |
766 | expected_tok = tok_width_variable; |
767 | expected_str = "WIDTH_VARIABLE" ; |
768 | state = 90; |
769 | continue; |
770 | } |
771 | |
772 | if (nowtok != tok_bsymbol && nowtok != tok_ucs4) |
773 | { |
774 | lr_error (cmfile, _("syntax error in %s definition: %s" ), |
775 | "WIDTH_VARIABLE" , _("no symbolic name given" )); |
776 | |
777 | lr_ignore_rest (cmfile, 0); |
778 | |
779 | continue; |
780 | } |
781 | |
782 | if (from_name != NULL) |
783 | obstack_free (&result->mem_pool, from_name); |
784 | |
785 | if (nowtok == tok_bsymbol) |
786 | from_name = (char *) obstack_copy0 (&result->mem_pool, |
787 | now->val.str.startmb, |
788 | now->val.str.lenmb); |
789 | else |
790 | { |
791 | obstack_printf (&result->mem_pool, "U%08X" , |
792 | cmfile->token.val.ucs4); |
793 | obstack_1grow (&result->mem_pool, '\0'); |
794 | from_name = (char *) obstack_finish (&result->mem_pool); |
795 | } |
796 | to_name = NULL; |
797 | |
798 | state = 99; |
799 | continue; |
800 | |
801 | case 99: |
802 | if (nowtok == tok_ellipsis3) |
803 | state = 100; |
804 | |
805 | /* Store info. */ |
806 | from_name = NULL; |
807 | |
808 | /* Warn */ |
809 | state = 98; |
810 | continue; |
811 | |
812 | case 100: |
813 | if (nowtok != tok_bsymbol && nowtok != tok_ucs4) |
814 | { |
815 | lr_error (cmfile, _("syntax error in %s definition: %s" ), |
816 | "WIDTH_VARIABLE" , |
817 | _("no symbolic name given for end of range" )); |
818 | lr_ignore_rest (cmfile, 0); |
819 | continue; |
820 | } |
821 | |
822 | if (nowtok == tok_bsymbol) |
823 | to_name = (char *) obstack_copy0 (&result->mem_pool, |
824 | now->val.str.startmb, |
825 | now->val.str.lenmb); |
826 | else |
827 | { |
828 | obstack_printf (&result->mem_pool, "U%08X" , |
829 | cmfile->token.val.ucs4); |
830 | obstack_1grow (&result->mem_pool, '\0'); |
831 | to_name = (char *) obstack_finish (&result->mem_pool); |
832 | } |
833 | |
834 | /* XXX Enter value into table. */ |
835 | |
836 | lr_ignore_rest (cmfile, 1); |
837 | |
838 | state = 98; |
839 | continue; |
840 | |
841 | default: |
842 | WITH_CUR_LOCALE (error (5, 0, _("%s: error in state machine" ), |
843 | __FILE__)); |
844 | /* NOTREACHED */ |
845 | } |
846 | break; |
847 | } |
848 | |
849 | if (state != 91 && !be_quiet) |
850 | WITH_CUR_LOCALE (error (0, 0, _("%s: premature end of file" ), |
851 | cmfile->fname)); |
852 | |
853 | lr_close (cmfile); |
854 | |
855 | return result; |
856 | } |
857 | |
858 | |
859 | static void |
860 | new_width (struct linereader *cmfile, struct charmap_t *result, |
861 | const char *from, const char *to, unsigned long int width) |
862 | { |
863 | struct charseq *from_val; |
864 | struct charseq *to_val; |
865 | |
866 | from_val = charmap_find_value (result, from, strlen (from)); |
867 | if (from_val == NULL) |
868 | { |
869 | lr_error (cmfile, _("unknown character `%s'" ), from); |
870 | return; |
871 | } |
872 | |
873 | if (to == NULL) |
874 | to_val = from_val; |
875 | else |
876 | { |
877 | to_val = charmap_find_value (result, to, strlen (to)); |
878 | if (to_val == NULL) |
879 | { |
880 | lr_error (cmfile, _("unknown character `%s'" ), to); |
881 | return; |
882 | } |
883 | |
884 | /* Make sure the number of bytes for the end points of the range |
885 | is correct. */ |
886 | if (from_val->nbytes != to_val->nbytes) |
887 | { |
888 | lr_error (cmfile, _("\ |
889 | number of bytes for byte sequence of beginning and end of range not the same: %d vs %d" ), |
890 | from_val->nbytes, to_val->nbytes); |
891 | return; |
892 | } |
893 | } |
894 | |
895 | if (result->nwidth_rules >= result->nwidth_rules_max) |
896 | { |
897 | size_t new_size = result->nwidth_rules + 32; |
898 | struct width_rule *new_rules = |
899 | (struct width_rule *) obstack_alloc (&result->mem_pool, |
900 | (new_size |
901 | * sizeof (struct width_rule))); |
902 | |
903 | memcpy (new_rules, result->width_rules, |
904 | result->nwidth_rules_max * sizeof (struct width_rule)); |
905 | |
906 | result->width_rules = new_rules; |
907 | result->nwidth_rules_max = new_size; |
908 | } |
909 | |
910 | result->width_rules[result->nwidth_rules].from = from_val; |
911 | result->width_rules[result->nwidth_rules].to = to_val; |
912 | result->width_rules[result->nwidth_rules].width = (unsigned int) width; |
913 | ++result->nwidth_rules; |
914 | } |
915 | |
916 | |
917 | struct charseq * |
918 | charmap_find_value (const struct charmap_t *cm, const char *name, size_t len) |
919 | { |
920 | void *result; |
921 | |
922 | return (find_entry ((hash_table *) &cm->char_table, name, len, &result) |
923 | < 0 ? NULL : (struct charseq *) result); |
924 | } |
925 | |
926 | |
927 | static void |
928 | charmap_new_char (struct linereader *lr, struct charmap_t *cm, |
929 | size_t nbytes, unsigned char *bytes, |
930 | const char *from, const char *to, |
931 | int decimal_ellipsis, int step) |
932 | { |
933 | hash_table *ht = &cm->char_table; |
934 | hash_table *bt = &cm->byte_table; |
935 | struct obstack *ob = &cm->mem_pool; |
936 | char *from_end; |
937 | char *to_end; |
938 | const char *cp; |
939 | int prefix_len, len1, len2; |
940 | unsigned int from_nr, to_nr, cnt; |
941 | struct charseq *newp; |
942 | |
943 | len1 = strlen (from); |
944 | |
945 | if (to == NULL) |
946 | { |
947 | newp = (struct charseq *) obstack_alloc (ob, sizeof (*newp) + nbytes); |
948 | newp->nbytes = nbytes; |
949 | memcpy (newp->bytes, bytes, nbytes); |
950 | newp->name = from; |
951 | |
952 | newp->ucs4 = UNINITIALIZED_CHAR_VALUE; |
953 | if ((from[0] == 'U' || from[0] == 'P') && (len1 == 5 || len1 == 9)) |
954 | { |
955 | /* Maybe the name is of the form `Uxxxx' or `Uxxxxxxxx' where |
956 | xxxx and xxxxxxxx are hexadecimal numbers. In this case |
957 | we use the value of xxxx or xxxxxxxx as the UCS4 value of |
958 | this character and we don't have to consult the repertoire |
959 | map. |
960 | |
961 | If the name is of the form `Pxxxx' or `Pxxxxxxxx' the xxxx |
962 | and xxxxxxxx also give the code point in UCS4 but this must |
963 | be in the private, i.e., unassigned, area. This should be |
964 | used for characters which do not (yet) have an equivalent |
965 | in ISO 10646 and Unicode. */ |
966 | char *endp; |
967 | |
968 | errno = 0; |
969 | newp->ucs4 = strtoul (from + 1, &endp, 16); |
970 | if (endp - from != len1 |
971 | || (newp->ucs4 == ~((uint32_t) 0) && errno == ERANGE) |
972 | || newp->ucs4 >= 0x80000000) |
973 | /* This wasn't successful. Signal this name cannot be a |
974 | correct UCS value. */ |
975 | newp->ucs4 = UNINITIALIZED_CHAR_VALUE; |
976 | } |
977 | |
978 | insert_entry (ht, from, len1, newp); |
979 | insert_entry (bt, newp->bytes, nbytes, newp); |
980 | /* Please note that it isn't a bug if a symbol is defined more |
981 | than once. All later definitions are simply discarded. */ |
982 | return; |
983 | } |
984 | |
985 | /* We have a range: the names must have names with equal prefixes |
986 | and an equal number of digits, where the second number is greater |
987 | or equal than the first. */ |
988 | len2 = strlen (to); |
989 | |
990 | if (len1 != len2) |
991 | { |
992 | illegal_range: |
993 | lr_error (lr, _("invalid names for character range" )); |
994 | return; |
995 | } |
996 | |
997 | cp = &from[len1 - 1]; |
998 | if (decimal_ellipsis) |
999 | while (isdigit (*cp) && cp >= from) |
1000 | --cp; |
1001 | else |
1002 | while (isxdigit (*cp) && cp >= from) |
1003 | { |
1004 | if (!isdigit (*cp) && !isupper (*cp)) |
1005 | lr_error (lr, _("\ |
1006 | hexadecimal range format should use only capital characters" )); |
1007 | --cp; |
1008 | } |
1009 | |
1010 | prefix_len = (cp - from) + 1; |
1011 | |
1012 | if (cp == &from[len1 - 1] || strncmp (from, to, prefix_len) != 0) |
1013 | goto illegal_range; |
1014 | |
1015 | errno = 0; |
1016 | from_nr = strtoul (&from[prefix_len], &from_end, decimal_ellipsis ? 10 : 16); |
1017 | if (*from_end != '\0' || (from_nr == UINT_MAX && errno == ERANGE) |
1018 | || ((to_nr = strtoul (&to[prefix_len], &to_end, |
1019 | decimal_ellipsis ? 10 : 16)) == UINT_MAX |
1020 | && errno == ERANGE) |
1021 | || *to_end != '\0') |
1022 | { |
1023 | lr_error (lr, _("<%s> and <%s> are invalid names for range" ), from, to); |
1024 | return; |
1025 | } |
1026 | |
1027 | if (from_nr > to_nr) |
1028 | { |
1029 | lr_error (lr, _("upper limit in range is smaller than lower limit" )); |
1030 | return; |
1031 | } |
1032 | |
1033 | for (cnt = from_nr; cnt <= to_nr; cnt += step) |
1034 | { |
1035 | char *name_end; |
1036 | obstack_printf (ob, decimal_ellipsis ? "%.*s%0*d" : "%.*s%0*X" , |
1037 | prefix_len, from, len1 - prefix_len, cnt); |
1038 | obstack_1grow (ob, '\0'); |
1039 | name_end = obstack_finish (ob); |
1040 | |
1041 | newp = (struct charseq *) obstack_alloc (ob, sizeof (*newp) + nbytes); |
1042 | newp->nbytes = nbytes; |
1043 | memcpy (newp->bytes, bytes, nbytes); |
1044 | newp->name = name_end; |
1045 | |
1046 | newp->ucs4 = UNINITIALIZED_CHAR_VALUE; |
1047 | if ((name_end[0] == 'U' || name_end[0] == 'P') |
1048 | && (len1 == 5 || len1 == 9)) |
1049 | { |
1050 | /* Maybe the name is of the form `Uxxxx' or `Uxxxxxxxx' where |
1051 | xxxx and xxxxxxxx are hexadecimal numbers. In this case |
1052 | we use the value of xxxx or xxxxxxxx as the UCS4 value of |
1053 | this character and we don't have to consult the repertoire |
1054 | map. |
1055 | |
1056 | If the name is of the form `Pxxxx' or `Pxxxxxxxx' the xxxx |
1057 | and xxxxxxxx also give the code point in UCS4 but this must |
1058 | be in the private, i.e., unassigned, area. This should be |
1059 | used for characters which do not (yet) have an equivalent |
1060 | in ISO 10646 and Unicode. */ |
1061 | char *endp; |
1062 | |
1063 | errno = 0; |
1064 | newp->ucs4 = strtoul (name_end + 1, &endp, 16); |
1065 | if (endp - name_end != len1 |
1066 | || (newp->ucs4 == ~((uint32_t) 0) && errno == ERANGE) |
1067 | || newp->ucs4 >= 0x80000000) |
1068 | /* This wasn't successful. Signal this name cannot be a |
1069 | correct UCS value. */ |
1070 | newp->ucs4 = UNINITIALIZED_CHAR_VALUE; |
1071 | } |
1072 | |
1073 | insert_entry (ht, name_end, len1, newp); |
1074 | insert_entry (bt, newp->bytes, nbytes, newp); |
1075 | /* Please note we don't examine the return value since it is no error |
1076 | if we have two definitions for a symbol. */ |
1077 | |
1078 | /* Increment the value in the byte sequence. */ |
1079 | if (++bytes[nbytes - 1] == '\0') |
1080 | { |
1081 | int b = nbytes - 2; |
1082 | |
1083 | do |
1084 | if (b < 0) |
1085 | { |
1086 | lr_error (lr, |
1087 | _("resulting bytes for range not representable." )); |
1088 | return; |
1089 | } |
1090 | while (++bytes[b--] == 0); |
1091 | } |
1092 | } |
1093 | } |
1094 | |
1095 | |
1096 | struct charseq * |
1097 | charmap_find_symbol (const struct charmap_t *cm, const char *bytes, |
1098 | size_t nbytes) |
1099 | { |
1100 | void *result; |
1101 | |
1102 | return (find_entry ((hash_table *) &cm->byte_table, bytes, nbytes, &result) |
1103 | < 0 ? NULL : (struct charseq *) result); |
1104 | } |
1105 | |