1/* Copyright (C) 1998-2019 Free Software Foundation, Inc.
2 This file is part of the GNU C Library.
3 Contributed by Ulrich Drepper <drepper@cygnus.com>, 1998.
4
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published
7 by the Free Software Foundation; version 2 of the License, or
8 (at your option) any later version.
9
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, see <http://www.gnu.org/licenses/>. */
17
18#ifdef HAVE_CONFIG_H
19# include <config.h>
20#endif
21
22#include <errno.h>
23#include <limits.h>
24#include <obstack.h>
25#include <search.h>
26#include <stdlib.h>
27#include <string.h>
28#include <unistd.h>
29#include <stdint.h>
30
31#include "localedef.h"
32#include "linereader.h"
33#include "charmap.h"
34#include "repertoire.h"
35#include "simple-hash.h"
36
37
/* Simple keyword hashing for the repertoiremap.  This function is handed
   to lr_open so the line reader can recognize the repertoire map
   keywords.  */
static const struct keyword_t *repertoiremap_hash (const char *str,
						   unsigned int len);
/* Enter a single symbolic name (TO == NULL) or a FROM..TO range of
   names, starting at VALUE, into the tables HT and RT.  Storage is taken
   from OB; problems are reported through LR.  */
static void repertoire_new_char (struct linereader *lr, hash_table *ht,
				 hash_table *rt, struct obstack *ob,
				 uint32_t value, const char *from,
				 const char *to, int decimal_ellipsis);
/* Comparison function for tsearch/tfind on the KNOWN tree; orders
   repertoire maps by their name.  */
static int repertoire_compare (const void *p1, const void *p2);

/* Already known repertoire maps.  Root of a tsearch tree whose nodes are
   `struct repertoire_t' objects compared with repertoire_compare.  */
static void *known;

/* List of repertoire maps which are not available and which have been
   reported to not be.  Root of a tsearch tree whose keys are the name
   strings themselves, compared with strcmp.  */
static void *unavailable;
53
54
55struct repertoire_t *
56repertoire_read (const char *filename)
57{
58 struct linereader *repfile;
59 struct repertoire_t *result;
60 struct repertoire_t **resultp;
61 struct repertoire_t search;
62 int state;
63 char *from_name = NULL;
64 char *to_name = NULL;
65 enum token_t ellipsis = tok_none;
66
67 search.name = filename;
68 resultp = tfind (&search, &known, &repertoire_compare);
69 if (resultp != NULL)
70 return *resultp;
71
72 /* Determine path. */
73 repfile = lr_open (filename, repertoiremap_hash);
74 if (repfile == NULL)
75 {
76 if (strchr (filename, '/') == NULL)
77 {
78 char *i18npath = getenv ("I18NPATH");
79 if (i18npath != NULL && *i18npath != '\0')
80 {
81 const size_t pathlen = strlen (i18npath);
82 char i18npathbuf[pathlen + 1];
83 char path[strlen (filename) + 1 + pathlen
84 + sizeof ("/repertoiremaps/") - 1];
85 char *next;
86 i18npath = memcpy (i18npathbuf, i18npath, pathlen + 1);
87
88 while (repfile == NULL
89 && (next = strsep (&i18npath, ":")) != NULL)
90 {
91 stpcpy (stpcpy (stpcpy (path, next), "/repertoiremaps/"),
92 filename);
93
94 repfile = lr_open (path, repertoiremap_hash);
95
96 if (repfile == NULL)
97 {
98 stpcpy (stpcpy (stpcpy (path, next), "/"), filename);
99
100 repfile = lr_open (path, repertoiremap_hash);
101 }
102 }
103 }
104
105 if (repfile == NULL)
106 {
107 /* Look in the systems charmap directory. */
108 char *buf = xmalloc (strlen (filename) + 1
109 + sizeof (REPERTOIREMAP_PATH));
110
111 stpcpy (stpcpy (stpcpy (buf, REPERTOIREMAP_PATH), "/"),
112 filename);
113 repfile = lr_open (buf, repertoiremap_hash);
114
115 free (buf);
116 }
117 }
118
119 if (repfile == NULL)
120 return NULL;
121 }
122
123 /* We don't want symbolic names in string to be translated. */
124 repfile->translate_strings = 0;
125
126 /* Allocate room for result. */
127 result = (struct repertoire_t *) xmalloc (sizeof (struct repertoire_t));
128 memset (result, '\0', sizeof (struct repertoire_t));
129
130 result->name = xstrdup (filename);
131
132#define obstack_chunk_alloc malloc
133#define obstack_chunk_free free
134 obstack_init (&result->mem_pool);
135
136 if (init_hash (&result->char_table, 256)
137 || init_hash (&result->reverse_table, 256)
138 || init_hash (&result->seq_table, 256))
139 {
140 free (result);
141 return NULL;
142 }
143
144 /* We use a state machine to describe the charmap description file
145 format. */
146 state = 1;
147 while (1)
148 {
149 /* What's on? */
150 struct token *now = lr_token (repfile, NULL, NULL, NULL, verbose);
151 enum token_t nowtok = now->tok;
152 struct token *arg;
153
154 if (nowtok == tok_eof)
155 break;
156
157 switch (state)
158 {
159 case 1:
160 /* We haven't yet read any character definition. This is where
161 we accept escape_char and comment_char definitions. */
162 if (nowtok == tok_eol)
163 /* Ignore empty lines. */
164 continue;
165
166 if (nowtok == tok_escape_char || nowtok == tok_comment_char)
167 {
168 /* We know that we need an argument. */
169 arg = lr_token (repfile, NULL, NULL, NULL, verbose);
170
171 if (arg->tok != tok_ident)
172 {
173 lr_error (repfile, _("syntax error in prolog: %s"),
174 _("bad argument"));
175
176 lr_ignore_rest (repfile, 0);
177 continue;
178 }
179
180 if (arg->val.str.lenmb != 1)
181 {
182 lr_error (repfile, _("\
183argument to <%s> must be a single character"),
184 nowtok == tok_escape_char ? "escape_char"
185 : "comment_char");
186
187 lr_ignore_rest (repfile, 0);
188 continue;
189 }
190
191 if (nowtok == tok_escape_char)
192 repfile->escape_char = *arg->val.str.startmb;
193 else
194 repfile->comment_char = *arg->val.str.startmb;
195
196 lr_ignore_rest (repfile, 1);
197 continue;
198 }
199
200 if (nowtok == tok_charids)
201 {
202 lr_ignore_rest (repfile, 1);
203
204 state = 2;
205 continue;
206 }
207
208 /* Otherwise we start reading the character definitions. */
209 state = 2;
210 /* FALLTHROUGH */
211
212 case 2:
213 /* We are now are in the body. Each line
214 must have the format "%s %s %s\n" or "%s...%s %s %s\n". */
215 if (nowtok == tok_eol)
216 /* Ignore empty lines. */
217 continue;
218
219 if (nowtok == tok_end)
220 {
221 state = 90;
222 continue;
223 }
224
225 if (nowtok != tok_bsymbol)
226 {
227 lr_error (repfile,
228 _("syntax error in repertoire map definition: %s"),
229 _("no symbolic name given"));
230
231 lr_ignore_rest (repfile, 0);
232 continue;
233 }
234
235 /* If the previous line was not completely correct free the
236 used memory. */
237 if (from_name != NULL)
238 obstack_free (&result->mem_pool, from_name);
239
240 from_name = (char *) obstack_copy0 (&result->mem_pool,
241 now->val.str.startmb,
242 now->val.str.lenmb);
243 to_name = NULL;
244
245 state = 3;
246 continue;
247
248 case 3:
249 /* We have two possibilities: We can see an ellipsis or an
250 encoding value. */
251 if (nowtok == tok_ellipsis3 || nowtok == tok_ellipsis4
252 || nowtok == tok_ellipsis2)
253 {
254 ellipsis = nowtok;
255 state = 4;
256 continue;
257 }
258 /* FALLTHROUGH */
259
260 case 5:
261 /* We expect a value of the form <Uxxxx> or <Uxxxxxxxx> where
262 the xxx mean a hexadecimal value. */
263 state = 2;
264
265 errno = 0;
266 if (nowtok != tok_ucs4)
267 {
268 lr_error (repfile,
269 _("syntax error in repertoire map definition: %s"),
270 _("no <Uxxxx> or <Uxxxxxxxx> value given"));
271
272 lr_ignore_rest (repfile, 0);
273 continue;
274 }
275
276 /* We've found a new valid definition. */
277 repertoire_new_char (repfile, &result->char_table,
278 &result->reverse_table, &result->mem_pool,
279 now->val.ucs4, from_name, to_name,
280 ellipsis != tok_ellipsis2);
281
282 /* Ignore the rest of the line. */
283 lr_ignore_rest (repfile, 0);
284
285 from_name = NULL;
286 to_name = NULL;
287
288 continue;
289
290 case 4:
291 if (nowtok != tok_bsymbol)
292 {
293 lr_error (repfile,
294 _("syntax error in repertoire map definition: %s"),
295 _("no symbolic name given for end of range"));
296
297 lr_ignore_rest (repfile, 0);
298 state = 2;
299 continue;
300 }
301
302 /* Copy the to-name in a safe place. */
303 to_name = (char *) obstack_copy0 (&result->mem_pool,
304 repfile->token.val.str.startmb,
305 repfile->token.val.str.lenmb);
306
307 state = 5;
308 continue;
309
310 case 90:
311 if (nowtok != tok_charids)
312 lr_error (repfile, _("\
313%1$s: definition does not end with `END %1$s'"), "CHARIDS");
314
315 lr_ignore_rest (repfile, nowtok == tok_charids);
316 break;
317 }
318
319 break;
320 }
321
322 if (state != 2 && state != 90 && !be_quiet)
323 record_error (0, 0, _("%s: premature end of file"),
324 repfile->fname);
325
326 lr_close (repfile);
327
328 if (tsearch (result, &known, &repertoire_compare) == NULL)
329 /* Something went wrong. */
330 record_error (0, errno, _("cannot save new repertoire map"));
331
332 return result;
333}
334
335
336void
337repertoire_complain (const char *name)
338{
339 if (tfind (name, &unavailable, (__compar_fn_t) strcmp) == NULL)
340 {
341 record_error (0, errno, _("\
342repertoire map file `%s' not found"), name);
343
344 /* Remember that we reported this map. */
345 tsearch (name, &unavailable, (__compar_fn_t) strcmp);
346 }
347}
348
349
350static int
351repertoire_compare (const void *p1, const void *p2)
352{
353 struct repertoire_t *r1 = (struct repertoire_t *) p1;
354 struct repertoire_t *r2 = (struct repertoire_t *) p2;
355
356 return strcmp (r1->name, r2->name);
357}
358
359
360static const struct keyword_t *
361repertoiremap_hash (const char *str, unsigned int len)
362{
363 static const struct keyword_t wordlist[] =
364 {
365 {"escape_char", tok_escape_char, 0},
366 {"comment_char", tok_comment_char, 0},
367 {"CHARIDS", tok_charids, 0},
368 {"END", tok_end, 0},
369 };
370
371 if (len == 11 && memcmp (wordlist[0].name, str, 11) == 0)
372 return &wordlist[0];
373 if (len == 12 && memcmp (wordlist[1].name, str, 12) == 0)
374 return &wordlist[1];
375 if (len == 7 && memcmp (wordlist[2].name, str, 7) == 0)
376 return &wordlist[2];
377 if (len == 3 && memcmp (wordlist[3].name, str, 3) == 0)
378 return &wordlist[3];
379
380 return NULL;
381}
382
383
384static void
385repertoire_new_char (struct linereader *lr, hash_table *ht, hash_table *rt,
386 struct obstack *ob, uint32_t value, const char *from,
387 const char *to, int decimal_ellipsis)
388{
389 char *from_end;
390 char *to_end;
391 const char *cp;
392 char *buf = NULL;
393 int prefix_len, len1, len2;
394 unsigned long int from_nr, to_nr, cnt;
395
396 if (to == NULL)
397 {
398 insert_entry (ht, from, strlen (from),
399 (void *) (unsigned long int) value);
400 /* Please note that it isn't a bug if a symbol is defined more
401 than once. All later definitions are simply discarded. */
402
403 insert_entry (rt, obstack_copy (ob, &value, sizeof (value)),
404 sizeof (value), (void *) from);
405
406 return;
407 }
408
409 /* We have a range: the names must have names with equal prefixes
410 and an equal number of digits, where the second number is greater
411 or equal than the first. */
412 len1 = strlen (from);
413 len2 = strlen (to);
414
415 if (len1 != len2)
416 {
417 invalid_range:
418 lr_error (lr, _("invalid names for character range"));
419 return;
420 }
421
422 cp = &from[len1 - 1];
423 if (decimal_ellipsis)
424 while (isdigit (*cp) && cp >= from)
425 --cp;
426 else
427 while (isxdigit (*cp) && cp >= from)
428 {
429 if (!isdigit (*cp) && !isupper (*cp))
430 lr_error (lr, _("\
431hexadecimal range format should use only capital characters"));
432 --cp;
433 }
434
435 prefix_len = (cp - from) + 1;
436
437 if (cp == &from[len1 - 1] || strncmp (from, to, prefix_len) != 0)
438 goto invalid_range;
439
440 errno = 0;
441 from_nr = strtoul (&from[prefix_len], &from_end, decimal_ellipsis ? 10 : 16);
442 if (*from_end != '\0' || (from_nr == ULONG_MAX && errno == ERANGE)
443 || ((to_nr = strtoul (&to[prefix_len], &to_end,
444 decimal_ellipsis ? 10 : 16)) == ULONG_MAX
445 && errno == ERANGE)
446 || *to_end != '\0')
447 {
448 lr_error (lr, _("<%s> and <%s> are invalid names for range"),
449 from, to);
450 return;
451 }
452
453 if (from_nr > to_nr)
454 {
455 lr_error (lr, _("upper limit in range is smaller than lower limit"));
456 return;
457 }
458
459 for (cnt = from_nr; cnt <= to_nr; ++cnt)
460 {
461 uint32_t this_value = value + (cnt - from_nr);
462
463 obstack_printf (ob, decimal_ellipsis ? "%.*s%0*ld" : "%.*s%0*lX",
464 prefix_len, from, len1 - prefix_len, cnt);
465 obstack_1grow (ob, '\0');
466
467 insert_entry (ht, buf, len1,
468 (void *) (unsigned long int) this_value);
469 /* Please note we don't examine the return value since it is no error
470 if we have two definitions for a symbol. */
471
472 insert_entry (rt, obstack_copy (ob, &this_value, sizeof (this_value)),
473 sizeof (this_value), (void *) from);
474 }
475}
476
477
478uint32_t
479repertoire_find_value (const struct repertoire_t *rep, const char *name,
480 size_t len)
481{
482 void *result;
483
484 if (rep == NULL)
485 return ILLEGAL_CHAR_VALUE;
486
487 if (find_entry ((hash_table *) &rep->char_table, name, len, &result) < 0)
488 return ILLEGAL_CHAR_VALUE;
489
490 return (uint32_t) ((unsigned long int) result);
491}
492
493
494const char *
495repertoire_find_symbol (const struct repertoire_t *rep, uint32_t ucs)
496{
497 void *result;
498
499 if (rep == NULL)
500 return NULL;
501
502 if (find_entry ((hash_table *) &rep->reverse_table, &ucs, sizeof (ucs),
503 &result) < 0)
504 return NULL;
505
506 return (const char *) result;
507}
508
509
510struct charseq *
511repertoire_find_seq (const struct repertoire_t *rep, uint32_t ucs)
512{
513 void *result;
514
515 if (rep == NULL)
516 return NULL;
517
518 if (find_entry ((hash_table *) &rep->seq_table, &ucs, sizeof (ucs),
519 &result) < 0)
520 return NULL;
521
522 return (struct charseq *) result;
523}
524