1/* Copyright (C) 1996-2019 Free Software Foundation, Inc.
2 This file is part of the GNU C Library.
3 Contributed by Ulrich Drepper <drepper@gnu.org>, 1996.
4
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published
7 by the Free Software Foundation; version 2 of the License, or
8 (at your option) any later version.
9
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, see <http://www.gnu.org/licenses/>. */
17
18#ifdef HAVE_CONFIG_H
19# include <config.h>
20#endif
21
22#include <assert.h>
23#include <ctype.h>
24#include <errno.h>
25#include <libintl.h>
26#include <stdarg.h>
27#include <stdlib.h>
28#include <string.h>
29#include <stdint.h>
30
31#include "localedef.h"
32#include "charmap.h"
33#include "error.h"
34#include "linereader.h"
35#include "locfile.h"
36
/* Prototypes for local functions.  Each of these is called from lr_token
   after it has consumed the character that introduces the token (the
   escape character, `<', the first identifier character, or `"'
   respectively) and returns a pointer to the reader's embedded token.  */
static struct token *get_toplvl_escape (struct linereader *lr);
static struct token *get_symname (struct linereader *lr);
static struct token *get_ident (struct linereader *lr);
static struct token *get_string (struct linereader *lr,
                                 const struct charmap_t *charmap,
                                 struct localedef_t *locale,
                                 const struct repertoire_t *repertoire,
                                 int verbose);
46
47
48struct linereader *
49lr_open (const char *fname, kw_hash_fct_t hf)
50{
51 FILE *fp;
52
53 if (fname == NULL || strcmp (fname, "-") == 0
54 || strcmp (fname, "/dev/stdin") == 0)
55 return lr_create (stdin, "<stdin>", hf);
56 else
57 {
58 fp = fopen (fname, "rm");
59 if (fp == NULL)
60 return NULL;
61 return lr_create (fp, fname, hf);
62 }
63}
64
65struct linereader *
66lr_create (FILE *fp, const char *fname, kw_hash_fct_t hf)
67{
68 struct linereader *result;
69 int n;
70
71 result = (struct linereader *) xmalloc (sizeof (*result));
72
73 result->fp = fp;
74 result->fname = xstrdup (fname);
75 result->buf = NULL;
76 result->bufsize = 0;
77 result->lineno = 1;
78 result->idx = 0;
79 result->comment_char = '#';
80 result->escape_char = '\\';
81 result->translate_strings = 1;
82 result->return_widestr = 0;
83
84 n = getdelim (&result->buf, &result->bufsize, '\n', result->fp);
85 if (n < 0)
86 {
87 int save = errno;
88 fclose (result->fp);
89 free ((char *) result->fname);
90 free (result);
91 errno = save;
92 return NULL;
93 }
94
95 if (n > 1 && result->buf[n - 2] == '\\' && result->buf[n - 1] == '\n')
96 n -= 2;
97
98 result->buf[n] = '\0';
99 result->bufact = n;
100 result->hash_fct = hf;
101
102 return result;
103}
104
105
106int
107lr_eof (struct linereader *lr)
108{
109 return lr->bufact = 0;
110}
111
112
113void
114lr_ignore_rest (struct linereader *lr, int verbose)
115{
116 if (verbose)
117 {
118 while (isspace (lr->buf[lr->idx]) && lr->buf[lr->idx] != '\n'
119 && lr->buf[lr->idx] != lr->comment_char)
120 if (lr->buf[lr->idx] == '\0')
121 {
122 if (lr_next (lr) < 0)
123 return;
124 }
125 else
126 ++lr->idx;
127
128 if (lr->buf[lr->idx] != '\n' && ! feof (lr->fp)
129 && lr->buf[lr->idx] != lr->comment_char)
130 lr_error (lr, _("trailing garbage at end of line"));
131 }
132
133 /* Ignore continued line. */
134 while (lr->bufact > 0 && lr->buf[lr->bufact - 1] != '\n')
135 if (lr_next (lr) < 0)
136 break;
137
138 lr->idx = lr->bufact;
139}
140
141
142void
143lr_close (struct linereader *lr)
144{
145 fclose (lr->fp);
146 free (lr->buf);
147 free (lr);
148}
149
150
151int
152lr_next (struct linereader *lr)
153{
154 int n;
155
156 n = getdelim (&lr->buf, &lr->bufsize, '\n', lr->fp);
157 if (n < 0)
158 return -1;
159
160 ++lr->lineno;
161
162 if (n > 1 && lr->buf[n - 2] == lr->escape_char && lr->buf[n - 1] == '\n')
163 {
164#if 0
165 /* XXX Is this correct? */
166 /* An escaped newline character is substituted with a single <SP>. */
167 --n;
168 lr->buf[n - 1] = ' ';
169#else
170 n -= 2;
171#endif
172 }
173
174 lr->buf[n] = '\0';
175 lr->bufact = n;
176 lr->idx = 0;
177
178 return 0;
179}
180
181
/* Defined in error.c: a counter that is incremented each time `error'
   is called.  */
extern unsigned int error_message_count;

/* The calling program should define program_name and set it to the
   name of the executing program.  */
extern char *program_name;
189
190
191struct token *
192lr_token (struct linereader *lr, const struct charmap_t *charmap,
193 struct localedef_t *locale, const struct repertoire_t *repertoire,
194 int verbose)
195{
196 int ch;
197
198 while (1)
199 {
200 do
201 {
202 ch = lr_getc (lr);
203
204 if (ch == EOF)
205 {
206 lr->token.tok = tok_eof;
207 return &lr->token;
208 };
209
210 if (ch == '\n')
211 {
212 lr->token.tok = tok_eol;
213 return &lr->token;
214 }
215 }
216 while (isspace (ch));
217
218 if (ch != lr->comment_char)
219 break;
220
221 /* Is there an newline at the end of the buffer? */
222 if (lr->buf[lr->bufact - 1] != '\n')
223 {
224 /* No. Some people want this to mean that only the line in
225 the file not the logical, concatenated line is ignored.
226 Let's try this. */
227 lr->idx = lr->bufact;
228 continue;
229 }
230
231 /* Ignore rest of line. */
232 lr_ignore_rest (lr, 0);
233 lr->token.tok = tok_eol;
234 return &lr->token;
235 }
236
237 /* Match escape sequences. */
238 if (ch == lr->escape_char)
239 return get_toplvl_escape (lr);
240
241 /* Match ellipsis. */
242 if (ch == '.')
243 {
244 if (strncmp (&lr->buf[lr->idx], "...(2)....", 10) == 0)
245 {
246 int cnt;
247 for (cnt = 0; cnt < 10; ++cnt)
248 lr_getc (lr);
249 lr->token.tok = tok_ellipsis4_2;
250 return &lr->token;
251 }
252 if (strncmp (&lr->buf[lr->idx], "...", 3) == 0)
253 {
254 lr_getc (lr);
255 lr_getc (lr);
256 lr_getc (lr);
257 lr->token.tok = tok_ellipsis4;
258 return &lr->token;
259 }
260 if (strncmp (&lr->buf[lr->idx], "..", 2) == 0)
261 {
262 lr_getc (lr);
263 lr_getc (lr);
264 lr->token.tok = tok_ellipsis3;
265 return &lr->token;
266 }
267 if (strncmp (&lr->buf[lr->idx], ".(2)..", 6) == 0)
268 {
269 int cnt;
270 for (cnt = 0; cnt < 6; ++cnt)
271 lr_getc (lr);
272 lr->token.tok = tok_ellipsis2_2;
273 return &lr->token;
274 }
275 if (lr->buf[lr->idx] == '.')
276 {
277 lr_getc (lr);
278 lr->token.tok = tok_ellipsis2;
279 return &lr->token;
280 }
281 }
282
283 switch (ch)
284 {
285 case '<':
286 return get_symname (lr);
287
288 case '0' ... '9':
289 lr->token.tok = tok_number;
290 lr->token.val.num = ch - '0';
291
292 while (isdigit (ch = lr_getc (lr)))
293 {
294 lr->token.val.num *= 10;
295 lr->token.val.num += ch - '0';
296 }
297 if (isalpha (ch))
298 lr_error (lr, _("garbage at end of number"));
299 lr_ungetn (lr, 1);
300
301 return &lr->token;
302
303 case ';':
304 lr->token.tok = tok_semicolon;
305 return &lr->token;
306
307 case ',':
308 lr->token.tok = tok_comma;
309 return &lr->token;
310
311 case '(':
312 lr->token.tok = tok_open_brace;
313 return &lr->token;
314
315 case ')':
316 lr->token.tok = tok_close_brace;
317 return &lr->token;
318
319 case '"':
320 return get_string (lr, charmap, locale, repertoire, verbose);
321
322 case '-':
323 ch = lr_getc (lr);
324 if (ch == '1')
325 {
326 lr->token.tok = tok_minus1;
327 return &lr->token;
328 }
329 lr_ungetn (lr, 2);
330 break;
331 }
332
333 return get_ident (lr);
334}
335
336
337static struct token *
338get_toplvl_escape (struct linereader *lr)
339{
340 /* This is supposed to be a numeric value. We return the
341 numerical value and the number of bytes. */
342 size_t start_idx = lr->idx - 1;
343 unsigned char *bytes = lr->token.val.charcode.bytes;
344 size_t nbytes = 0;
345 int ch;
346
347 do
348 {
349 unsigned int byte = 0;
350 unsigned int base = 8;
351
352 ch = lr_getc (lr);
353
354 if (ch == 'd')
355 {
356 base = 10;
357 ch = lr_getc (lr);
358 }
359 else if (ch == 'x')
360 {
361 base = 16;
362 ch = lr_getc (lr);
363 }
364
365 if ((base == 16 && !isxdigit (ch))
366 || (base != 16 && (ch < '0' || ch >= (int) ('0' + base))))
367 {
368 esc_error:
369 lr->token.val.str.startmb = &lr->buf[start_idx];
370
371 while (ch != EOF && !isspace (ch))
372 ch = lr_getc (lr);
373 lr->token.val.str.lenmb = lr->idx - start_idx;
374
375 lr->token.tok = tok_error;
376 return &lr->token;
377 }
378
379 if (isdigit (ch))
380 byte = ch - '0';
381 else
382 byte = tolower (ch) - 'a' + 10;
383
384 ch = lr_getc (lr);
385 if ((base == 16 && !isxdigit (ch))
386 || (base != 16 && (ch < '0' || ch >= (int) ('0' + base))))
387 goto esc_error;
388
389 byte *= base;
390 if (isdigit (ch))
391 byte += ch - '0';
392 else
393 byte += tolower (ch) - 'a' + 10;
394
395 ch = lr_getc (lr);
396 if (base != 16 && isdigit (ch))
397 {
398 byte *= base;
399 byte += ch - '0';
400
401 ch = lr_getc (lr);
402 }
403
404 bytes[nbytes++] = byte;
405 }
406 while (ch == lr->escape_char
407 && nbytes < (int) sizeof (lr->token.val.charcode.bytes));
408
409 if (!isspace (ch))
410 lr_error (lr, _("garbage at end of character code specification"));
411
412 lr_ungetn (lr, 1);
413
414 lr->token.tok = tok_charcode;
415 lr->token.val.charcode.nbytes = nbytes;
416
417 return &lr->token;
418}
419
420
/* Append the single byte CH to the byte buffer described by the local
   variables `buf' (storage), `bufact' (bytes in use) and `bufmax'
   (capacity) of the calling function.  The capacity is doubled when the
   buffer is full.  */
#define ADDC(ch) \
  do                                                                          \
    {                                                                         \
      if (bufact == bufmax)                                                   \
        buf = xrealloc (buf, bufmax *= 2);                                    \
      buf[bufact++] = (ch);                                                   \
    }                                                                         \
  while (0)
432
433
/* Append the L bytes starting at S to the byte buffer described by the
   local variables `buf' (storage), `bufact' (bytes in use) and `bufmax'
   (capacity) of the calling function.

   When the data does not fit, the capacity becomes twice the larger of
   the old capacity and L.  Since bufact never exceeds the old capacity
   this always leaves room for bufact + L bytes.  Only the capacity is
   adjusted: the fill level bufact must keep marking the end of the data
   already stored, because that is where the new bytes are copied to.  */
#define ADDS(s, l) \
  do                                                                          \
    {                                                                         \
      size_t _l = (l);                                                        \
      if (bufact + _l > bufmax)                                               \
        {                                                                     \
          if (bufmax < _l)                                                    \
            bufmax = _l;                                                      \
          bufmax *= 2;                                                        \
          buf = xrealloc (buf, bufmax);                                       \
        }                                                                     \
      memcpy (&buf[bufact], (s), _l);                                         \
      bufact += _l;                                                           \
    }                                                                         \
  while (0)
449
450
/* Append the wide character CH to the wide character buffer described
   by the local variables `buf2' (storage), `buf2act' (elements in use)
   and `buf2max' (capacity) of the calling function.  The capacity is
   counted in elements, each of which takes sizeof (uint32_t) bytes, and
   is doubled when the buffer is full.  */
#define ADDWC(ch) \
  do                                                                          \
    {                                                                         \
      if (buf2act == buf2max)                                                 \
        buf2 = xrealloc (buf2, (buf2max *= 2) * sizeof (uint32_t));           \
      buf2[buf2act++] = (ch);                                                 \
    }                                                                         \
  while (0)
462
463
464static struct token *
465get_symname (struct linereader *lr)
466{
467 /* Symbol in brackets. We must distinguish three kinds:
468 1. reserved words
469 2. ISO 10646 position values
470 3. all other. */
471 char *buf;
472 size_t bufact = 0;
473 size_t bufmax = 56;
474 const struct keyword_t *kw;
475 int ch;
476
477 buf = (char *) xmalloc (bufmax);
478
479 do
480 {
481 ch = lr_getc (lr);
482 if (ch == lr->escape_char)
483 {
484 int c2 = lr_getc (lr);
485 ADDC (c2);
486
487 if (c2 == '\n')
488 ch = '\n';
489 }
490 else
491 ADDC (ch);
492 }
493 while (ch != '>' && ch != '\n');
494
495 if (ch == '\n')
496 lr_error (lr, _("unterminated symbolic name"));
497
498 /* Test for ISO 10646 position value. */
499 if (buf[0] == 'U' && (bufact == 6 || bufact == 10))
500 {
501 char *cp = buf + 1;
502 while (cp < &buf[bufact - 1] && isxdigit (*cp))
503 ++cp;
504
505 if (cp == &buf[bufact - 1])
506 {
507 /* Yes, it is. */
508 lr->token.tok = tok_ucs4;
509 lr->token.val.ucs4 = strtoul (buf + 1, NULL, 16);
510
511 return &lr->token;
512 }
513 }
514
515 /* It is a symbolic name. Test for reserved words. */
516 kw = lr->hash_fct (buf, bufact - 1);
517
518 if (kw != NULL && kw->symname_or_ident == 1)
519 {
520 lr->token.tok = kw->token;
521 free (buf);
522 }
523 else
524 {
525 lr->token.tok = tok_bsymbol;
526
527 buf = xrealloc (buf, bufact + 1);
528 buf[bufact] = '\0';
529
530 lr->token.val.str.startmb = buf;
531 lr->token.val.str.lenmb = bufact - 1;
532 }
533
534 return &lr->token;
535}
536
537
538static struct token *
539get_ident (struct linereader *lr)
540{
541 char *buf;
542 size_t bufact;
543 size_t bufmax = 56;
544 const struct keyword_t *kw;
545 int ch;
546
547 buf = xmalloc (bufmax);
548 bufact = 0;
549
550 ADDC (lr->buf[lr->idx - 1]);
551
552 while (!isspace ((ch = lr_getc (lr))) && ch != '"' && ch != ';'
553 && ch != '<' && ch != ',' && ch != EOF)
554 {
555 if (ch == lr->escape_char)
556 {
557 ch = lr_getc (lr);
558 if (ch == '\n' || ch == EOF)
559 {
560 lr_error (lr, _("invalid escape sequence"));
561 break;
562 }
563 }
564 ADDC (ch);
565 }
566
567 lr_ungetc (lr, ch);
568
569 kw = lr->hash_fct (buf, bufact);
570
571 if (kw != NULL && kw->symname_or_ident == 0)
572 {
573 lr->token.tok = kw->token;
574 free (buf);
575 }
576 else
577 {
578 lr->token.tok = tok_ident;
579
580 buf = xrealloc (buf, bufact + 1);
581 buf[bufact] = '\0';
582
583 lr->token.val.str.startmb = buf;
584 lr->token.val.str.lenmb = bufact;
585 }
586
587 return &lr->token;
588}
589
590
591static struct token *
592get_string (struct linereader *lr, const struct charmap_t *charmap,
593 struct localedef_t *locale, const struct repertoire_t *repertoire,
594 int verbose)
595{
596 int return_widestr = lr->return_widestr;
597 char *buf;
598 wchar_t *buf2 = NULL;
599 size_t bufact;
600 size_t bufmax = 56;
601
602 /* We must return two different strings. */
603 buf = xmalloc (bufmax);
604 bufact = 0;
605
606 /* We know it'll be a string. */
607 lr->token.tok = tok_string;
608
609 /* If we need not translate the strings (i.e., expand <...> parts)
610 we can run a simple loop. */
611 if (!lr->translate_strings)
612 {
613 int ch;
614
615 buf2 = NULL;
616 while ((ch = lr_getc (lr)) != '"' && ch != '\n' && ch != EOF)
617 ADDC (ch);
618
619 /* Catch errors with trailing escape character. */
620 if (bufact > 0 && buf[bufact - 1] == lr->escape_char
621 && (bufact == 1 || buf[bufact - 2] != lr->escape_char))
622 {
623 lr_error (lr, _("illegal escape sequence at end of string"));
624 --bufact;
625 }
626 else if (ch == '\n' || ch == EOF)
627 lr_error (lr, _("unterminated string"));
628
629 ADDC ('\0');
630 }
631 else
632 {
633 int illegal_string = 0;
634 size_t buf2act = 0;
635 size_t buf2max = 56 * sizeof (uint32_t);
636 int ch;
637
638 /* We have to provide the wide character result as well. */
639 if (return_widestr)
640 buf2 = xmalloc (buf2max);
641
642 /* Read until the end of the string (or end of the line or file). */
643 while ((ch = lr_getc (lr)) != '"' && ch != '\n' && ch != EOF)
644 {
645 size_t startidx;
646 uint32_t wch;
647 struct charseq *seq;
648
649 if (ch != '<')
650 {
651 /* The standards leave it up to the implementation to decide
652 what to do with character which stand for themself. We
653 could jump through hoops to find out the value relative to
654 the charmap and the repertoire map, but instead we leave
655 it up to the locale definition author to write a better
656 definition. We assume here that every character which
657 stands for itself is encoded using ISO 8859-1. Using the
658 escape character is allowed. */
659 if (ch == lr->escape_char)
660 {
661 ch = lr_getc (lr);
662 if (ch == '\n' || ch == EOF)
663 break;
664 }
665
666 ADDC (ch);
667 if (return_widestr)
668 ADDWC ((uint32_t) ch);
669
670 continue;
671 }
672
673 /* Now we have to search for the end of the symbolic name, i.e.,
674 the closing '>'. */
675 startidx = bufact;
676 while ((ch = lr_getc (lr)) != '>' && ch != '\n' && ch != EOF)
677 {
678 if (ch == lr->escape_char)
679 {
680 ch = lr_getc (lr);
681 if (ch == '\n' || ch == EOF)
682 break;
683 }
684 ADDC (ch);
685 }
686 if (ch == '\n' || ch == EOF)
687 /* Not a correct string. */
688 break;
689 if (bufact == startidx)
690 {
691 /* <> is no correct name. Ignore it and also signal an
692 error. */
693 illegal_string = 1;
694 continue;
695 }
696
697 /* It might be a Uxxxx symbol. */
698 if (buf[startidx] == 'U'
699 && (bufact - startidx == 5 || bufact - startidx == 9))
700 {
701 char *cp = buf + startidx + 1;
702 while (cp < &buf[bufact] && isxdigit (*cp))
703 ++cp;
704
705 if (cp == &buf[bufact])
706 {
707 char utmp[10];
708
709 /* Yes, it is. */
710 ADDC ('\0');
711 wch = strtoul (buf + startidx + 1, NULL, 16);
712
713 /* Now forget about the name we just added. */
714 bufact = startidx;
715
716 if (return_widestr)
717 ADDWC (wch);
718
719 /* See whether the charmap contains the Uxxxxxxxx names. */
720 snprintf (utmp, sizeof (utmp), "U%08X", wch);
721 seq = charmap_find_value (charmap, utmp, 9);
722
723 if (seq == NULL)
724 {
725 /* No, this isn't the case. Now determine from
726 the repertoire the name of the character and
727 find it in the charmap. */
728 if (repertoire != NULL)
729 {
730 const char *symbol;
731
732 symbol = repertoire_find_symbol (repertoire, wch);
733
734 if (symbol != NULL)
735 seq = charmap_find_value (charmap, symbol,
736 strlen (symbol));
737 }
738
739 if (seq == NULL)
740 {
741#ifndef NO_TRANSLITERATION
742 /* Transliterate if possible. */
743 if (locale != NULL)
744 {
745 uint32_t *translit;
746
747 if ((locale->avail & CTYPE_LOCALE) == 0)
748 {
749 /* Load the CTYPE data now. */
750 int old_needed = locale->needed;
751
752 locale->needed = 0;
753 locale = load_locale (LC_CTYPE,
754 locale->name,
755 locale->repertoire_name,
756 charmap, locale);
757 locale->needed = old_needed;
758 }
759
760 if ((locale->avail & CTYPE_LOCALE) != 0
761 && ((translit = find_translit (locale,
762 charmap, wch))
763 != NULL))
764 /* The CTYPE data contains a matching
765 transliteration. */
766 {
767 int i;
768
769 for (i = 0; translit[i] != 0; ++i)
770 {
771 char utmp[10];
772
773 snprintf (utmp, sizeof (utmp), "U%08X",
774 translit[i]);
775 seq = charmap_find_value (charmap, utmp,
776 9);
777 assert (seq != NULL);
778 ADDS (seq->bytes, seq->nbytes);
779 }
780
781 continue;
782 }
783 }
784#endif /* NO_TRANSLITERATION */
785
786 /* Not a known name. */
787 illegal_string = 1;
788 }
789 }
790
791 if (seq != NULL)
792 ADDS (seq->bytes, seq->nbytes);
793
794 continue;
795 }
796 }
797
798 /* We now have the symbolic name in buf[startidx] to
799 buf[bufact-1]. Now find out the value for this character
800 in the charmap as well as in the repertoire map (in this
801 order). */
802 seq = charmap_find_value (charmap, &buf[startidx],
803 bufact - startidx);
804
805 if (seq == NULL)
806 {
807 /* This name is not in the charmap. */
808 lr_error (lr, _("symbol `%.*s' not in charmap"),
809 (int) (bufact - startidx), &buf[startidx]);
810 illegal_string = 1;
811 }
812
813 if (return_widestr)
814 {
815 /* Now the same for the multibyte representation. */
816 if (seq != NULL && seq->ucs4 != UNINITIALIZED_CHAR_VALUE)
817 wch = seq->ucs4;
818 else
819 {
820 wch = repertoire_find_value (repertoire, &buf[startidx],
821 bufact - startidx);
822 if (seq != NULL)
823 seq->ucs4 = wch;
824 }
825
826 if (wch == ILLEGAL_CHAR_VALUE)
827 {
828 /* This name is not in the repertoire map. */
829 lr_error (lr, _("symbol `%.*s' not in repertoire map"),
830 (int) (bufact - startidx), &buf[startidx]);
831 illegal_string = 1;
832 }
833 else
834 ADDWC (wch);
835 }
836
837 /* Now forget about the name we just added. */
838 bufact = startidx;
839
840 /* And copy the bytes. */
841 if (seq != NULL)
842 ADDS (seq->bytes, seq->nbytes);
843 }
844
845 if (ch == '\n' || ch == EOF)
846 {
847 lr_error (lr, _("unterminated string"));
848 illegal_string = 1;
849 }
850
851 if (illegal_string)
852 {
853 free (buf);
854 free (buf2);
855 lr->token.val.str.startmb = NULL;
856 lr->token.val.str.lenmb = 0;
857 lr->token.val.str.startwc = NULL;
858 lr->token.val.str.lenwc = 0;
859
860 return &lr->token;
861 }
862
863 ADDC ('\0');
864
865 if (return_widestr)
866 {
867 ADDWC (0);
868 lr->token.val.str.startwc = xrealloc (buf2,
869 buf2act * sizeof (uint32_t));
870 lr->token.val.str.lenwc = buf2act;
871 }
872 }
873
874 lr->token.val.str.startmb = xrealloc (buf, bufact);
875 lr->token.val.str.lenmb = bufact;
876
877 return &lr->token;
878}
879