charmap.c source code [glibc_src_2.23/locale/programs/charmap.c]

/* Copyright (C) 1996-2016 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   Contributed by Ulrich Drepper <drepper@gnu.org>, 1996.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published
   by the Free Software Foundation; version 2 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, see <http://www.gnu.org/licenses/>.  */
17
18	#ifdef HAVE_CONFIG_H
19	# include <config.h>
20	#endif
21
22	#include <ctype.h>
23	#include <errno.h>
24	#include <libintl.h>
25	#include <limits.h>
26	#include <stdio.h>
27	#include <stdlib.h>
28	#include <string.h>
29	#include <error.h>
30	#include <stdint.h>
31
32	#include "localedef.h"
33	#include "linereader.h"
34	#include "charmap.h"
35	#include "charmap-dir.h"
36
37	#include <assert.h>
38
39
/* Define the lookup function.  */
41	#include "charmap-kw.h"
42
43
/* Prototypes for local functions.  */
static struct charmap_t *parse_charmap (struct linereader *cmfile,
                                        int verbose, int be_quiet);
static void new_width (struct linereader *cmfile, struct charmap_t *result,
                       const char *from, const char *to,
                       unsigned long int width);
static void charmap_new_char (struct linereader *lr, struct charmap_t *cm,
                              size_t nbytes, unsigned char *bytes,
                              const char *from, const char *to,
                              int decimal_ellipsis, int step);


/* Set to true by charmap_read when its basic-character-set check finds
   that the charmap is not ASCII compatible.  */
bool enc_not_ascii_compatible;


#ifdef NEED_NULL_POINTER
static const char *null_pointer;
#endif
62
63	static struct linereader *
64	cmlr_open (const char directory, const* char *name, kw_hash_fct_t hf)
65	{
66	FILE *fp;
67
68	fp = charmap_open (directory, name);
69	if (fp == NULL)
70	return NULL;
71	else
72	{
73	size_t dlen = strlen (directory);
74	int add_slash = (dlen == `0` \|\| directory[dlen - `1`] != `'/'`);
75	size_t nlen = strlen (name);
76	char *pathname;
77	char *p;
78
79	pathname = alloca (dlen + add_slash + nlen + `1`);
80	p = stpcpy (pathname, directory);
81	if (add_slash)
82	*p++ = `'/'`;
83	stpcpy (p, name);
84
85	return lr_create (fp, pathname, hf);
86	}
87	}
88
89	struct charmap_t *
90	charmap_read (const char filename, int* verbose, int error_not_found,
91	int be_quiet, int use_default)
92	{
93	struct charmap_t *result = NULL;
94
95	if (filename != NULL)
96	{
97	struct linereader *cmfile;
98
99	/ First try the name as found in the parameter. /
100	cmfile = lr_open (filename, charmap_hash);
101	if (cmfile == NULL)
102	{
103	/ No successful. So start looking through the directories*
104	in the I18NPATH if this is a simple name. /*
105	if (strchr (filename, `'/'`) == NULL)
106	{
107	char *i18npath = getenv ("I18NPATH");
108	if (i18npath != NULL && *i18npath != `'\0'`)
109	{
110	const size_t pathlen = strlen (i18npath);
111	char i18npathbuf[pathlen + `1`];
112	char path[pathlen + sizeof ("/charmaps")];
113	char *next;
114	i18npath = memcpy (i18npathbuf, i18npath, pathlen + `1`);
115
116	while (cmfile == NULL
117	&& (next = strsep (&i18npath, ":")) != NULL)
118	{
119	stpcpy (stpcpy (path, next), "/charmaps");
120	cmfile = cmlr_open (path, filename, charmap_hash);
121
122	if (cmfile == NULL)
123	/ Try without the "/charmaps" part. /
124	cmfile = cmlr_open (next, filename, charmap_hash);
125	}
126	}
127
128	if (cmfile == NULL)
129	/ Try the default directory. /
130	cmfile = cmlr_open (CHARMAP_PATH, filename, charmap_hash);
131	}
132	}
133
134	if (cmfile != NULL)
135	result = parse_charmap (cmfile, verbose, be_quiet);
136
137	if (result == NULL && error_not_found)
138	WITH_CUR_LOCALE (error (`0`, errno, _("\
139	character map file `%s' not found"), filename));
140	}
141
142	if (result == NULL && filename != NULL && strchr (filename, `'/'`) == NULL)
143	{
144	/ OK, one more try. We also accept the names given to the*
145	character sets in the files. Sometimes they differ from the
146	file name. /*
147	CHARMAP_DIR *dir;
148
149	dir = charmap_opendir (CHARMAP_PATH);
150	if (dir != NULL)
151	{
152	const char *dirent;
153
154	while ((dirent = charmap_readdir (dir)) != NULL)
155	{
156	char **aliases;
157	char **p;
158	int found;
159
160	aliases = charmap_aliases (CHARMAP_PATH, dirent);
161	found = `0`;
162	for (p = aliases; *p; p++)
163	if (strcasecmp (*p, filename) == `0`)
164	{
165	found = `1`;
166	break;
167	}
168	charmap_free_aliases (aliases);
169
170	if (found)
171	{
172	struct linereader *cmfile;
173
174	cmfile = cmlr_open (CHARMAP_PATH, dirent, charmap_hash);
175	if (cmfile != NULL)
176	result = parse_charmap (cmfile, verbose, be_quiet);
177
178	break;
179	}
180	}
181
182	charmap_closedir (dir);
183	}
184	}
185
186	if (result == NULL && DEFAULT_CHARMAP != NULL)
187	{
188	struct linereader *cmfile;
189
190	cmfile = cmlr_open (CHARMAP_PATH, DEFAULT_CHARMAP, charmap_hash);
191	if (cmfile != NULL)
192	result = parse_charmap (cmfile, verbose, be_quiet);
193
194	if (result == NULL)
195	WITH_CUR_LOCALE (error (`4`, errno, _("\
196	default character map file `%s' not found"), DEFAULT_CHARMAP));
197	}
198
199	if (result != NULL && result->code_set_name == NULL)
200	/ The input file does not specify a code set name. This*
201	shouldn't happen but we should cope with it. /*
202	result->code_set_name = basename (filename);
203
204	/ Test of ASCII compatibility of locale encoding.*
205
206	Verify that the encoding to be used in a locale is ASCII compatible,
207	at least for the graphic characters, excluding the control characters,
208	'$' and '@'. This constraint comes from an ISO C 99 restriction.
209
210	ISO C 99 section 7.17.(2) (about wchar_t):
211	the null character shall have the code value zero and each member of
212	the basic character set shall have a code value equal to its value
213	when used as the lone character in an integer character constant.
214	ISO C 99 section 5.2.1.(3):
215	Both the basic source and basic execution character sets shall have
216	the following members: the 26 uppercase letters of the Latin alphabet
217	A B C D E F G H I J K L M N O P Q R S T U V W X Y Z
218	the 26 lowercase letters of the Latin alphabet
219	a b c d e f g h i j k l m n o p q r s t u v w x y z
220	the 10 decimal digits
221	0 1 2 3 4 5 6 7 8 9
222	the following 29 graphic characters
223	! " # % & ' ( ) + , - . / : ; < = > ? [ \ ] ^ _ { \| } ~*
224	the space character, and control characters representing horizontal
225	tab, vertical tab, and form feed.
226
227	Therefore, for all members of the "basic character set", the 'char' code
228	must have the same value as the 'wchar_t' code, which in glibc is the
229	same as the Unicode code, which for all of the enumerated characters
230	is identical to the ASCII code. /*
231	if (result != NULL && use_default)
232	{
233	static const char basic_charset[] =
234	{
235	`'A'`, `'B'`, `'C'`, `'D'`, `'E'`, `'F'`, `'G'`, `'H'`, `'I'`, `'J'`, `'K'`, `'L'`, `'M'`,
236	`'N'`, `'O'`, `'P'`, `'Q'`, `'R'`, `'S'`, `'T'`, `'U'`, `'V'`, `'W'`, `'X'`, `'Y'`, `'Z'`,
237	`'a'`, `'b'`, `'c'`, `'d'`, `'e'`, `'f'`, `'g'`, `'h'`, `'i'`, `'j'`, `'k'`, `'l'`, `'m'`,
238	`'n'`, `'o'`, `'p'`, `'q'`, `'r'`, `'s'`, `'t'`, `'u'`, `'v'`, `'w'`, `'x'`, `'y'`, `'z'`,
239	`'0'`, `'1'`, `'2'`, `'3'`, `'4'`, `'5'`, `'6'`, `'7'`, `'8'`, `'9'`,
240	`'!'`, `'"'`, `'#'`, `'%'`, `'&'`, `'\''`, `'('`, `')'`, `'*'`, `'+'`, `','`, `'-'`,
241	`'.'`, `'/'`, `':'`, `';'`, `'<'`, `'='`, `'>'`, `'?'`, `'['`, `'\\'`, `']'`, `'^'`,
242	`'_'`, `'{'`, `'\|'`, `'}'`, `'~'`, `' '`, `'\t'`, `'\v'`, `'\f'`, `'\0'`
243	};
244	int failed = `0`;
245	const char *p = basic_charset;
246
247	do
248	{
249	struct charseq *seq = charmap_find_symbol (result, p, `1`);
250
251	if (seq == NULL \|\| seq->ucs4 != (uint32_t) *p)
252	failed = `1`;
253	}
254	while (*p++ != `'\0'`);
255
256	if (failed)
257	{
258	WITH_CUR_LOCALE (fprintf (stderr, _("\
259	character map `%s' is not ASCII compatible, locale not ISO C compliant\n"),
260	result->code_set_name));
261	enc_not_ascii_compatible = true;
262	}
263	}
264
265	return result;
266	}
267
268
269	static struct charmap_t *
270	parse_charmap (struct linereader cmfile, int* verbose, int be_quiet)
271	{
272	struct charmap_t *result;
273	int state;
274	enum token_t expected_tok = tok_error;
275	const char *expected_str = NULL;
276	char *from_name = NULL;
277	char *to_name = NULL;
278	enum token_t ellipsis = `0`;
279	int step = `1`;
280
281	/ We don't want symbolic names in string to be translated. /
282	cmfile->translate_strings = `0`;
283
284	/ Allocate room for result. /
285	result = (struct charmap_t ) xmalloc (sizeof* (struct charmap_t));
286	memset (result, `'\0'`, sizeof (struct charmap_t));
287	/ The default DEFAULT_WIDTH is 1. /
288	result->width_default = `1`;
289
290	#define obstack_chunk_alloc malloc
291	#define obstack_chunk_free free
292	obstack_init (&result->mem_pool);
293
294	if (init_hash (&result->char_table, `256`)
295	\|\| init_hash (&result->byte_table, `256`))
296	{
297	free (result);
298	return NULL;
299	}
300
301	/ We use a state machine to describe the charmap description file*
302	format. /*
303	state = `1`;
304	while (`1`)
305	{
306	/ What's on? /
307	struct token *now = lr_token (cmfile, NULL, NULL, NULL, verbose);
308	enum token_t nowtok = now->tok;
309	struct token *arg;
310
311	if (nowtok == tok_eof)
312	break;
313
314	switch (state)
315	{
316	case `1`:
317	/ The beginning. We expect the special declarations, EOL or*
318	`CHARMAP'. /*
319	if (nowtok == tok_eol)
320	/ Ignore empty lines. /
321	continue;
322
323	if (nowtok == tok_charmap)
324	{
325	from_name = NULL;
326	to_name = NULL;
327
328	/ We have to set up the real work. Fill in some*
329	default values. /*
330	if (result->mb_cur_max == `0`)
331	result->mb_cur_max = `1`;
332	if (result->mb_cur_min == `0`)
333	result->mb_cur_min = result->mb_cur_max;
334	if (result->mb_cur_min > result->mb_cur_max)
335	{
336	if (!be_quiet)
337	WITH_CUR_LOCALE (error (`0`, `0`, _("\
338	%s: <mb_cur_max> must be greater than <mb_cur_min>\n"),
339	cmfile->fname));
340
341	result->mb_cur_min = result->mb_cur_max;
342	}
343
344	lr_ignore_rest (cmfile, `1`);
345
346	state = `2`;
347	continue;
348	}
349
350	if (nowtok != tok_code_set_name && nowtok != tok_mb_cur_max
351	&& nowtok != tok_mb_cur_min && nowtok != tok_escape_char
352	&& nowtok != tok_comment_char && nowtok != tok_g0esc
353	&& nowtok != tok_g1esc && nowtok != tok_g2esc
354	&& nowtok != tok_g3esc && nowtok != tok_repertoiremap
355	&& nowtok != tok_include)
356	{
357	lr_error (cmfile, _("syntax error in prolog: %s"),
358	_("invalid definition"));
359
360	lr_ignore_rest (cmfile, `0`);
361	continue;
362	}
363
364	/ We know that we need an argument. /
365	arg = lr_token (cmfile, NULL, NULL, NULL, verbose);
366
367	switch (nowtok)
368	{
369	case tok_code_set_name:
370	case tok_repertoiremap:
371	if (arg->tok != tok_ident && arg->tok != tok_string)
372	{
373	badarg:
374	lr_error (cmfile, _("syntax error in prolog: %s"),
375	_("bad argument"));
376
377	lr_ignore_rest (cmfile, `0`);
378	continue;
379	}
380
381	if (nowtok == tok_code_set_name)
382	result->code_set_name = obstack_copy0 (&result->mem_pool,
383	arg->val.str.startmb,
384	arg->val.str.lenmb);
385	else
386	result->repertoiremap = obstack_copy0 (&result->mem_pool,
387	arg->val.str.startmb,
388	arg->val.str.lenmb);
389
390	lr_ignore_rest (cmfile, `1`);
391	continue;
392
393	case tok_mb_cur_max:
394	case tok_mb_cur_min:
395	if (arg->tok != tok_number)
396	goto badarg;
397
398	if (verbose
399	&& ((nowtok == tok_mb_cur_max
400	&& result->mb_cur_max != `0`)
401	\|\| (nowtok == tok_mb_cur_max
402	&& result->mb_cur_max != `0`)))
403	lr_error (cmfile, _("duplicate definition of <%s>"),
404	nowtok == tok_mb_cur_min
405	? "mb_cur_min" : "mb_cur_max");
406
407	if (arg->val.num < `1`)
408	{
409	lr_error (cmfile,
410	_("value for <%s> must be 1 or greater"),
411	nowtok == tok_mb_cur_min
412	? "mb_cur_min" : "mb_cur_max");
413
414	lr_ignore_rest (cmfile, `0`);
415	continue;
416	}
417	if ((nowtok == tok_mb_cur_max && result->mb_cur_min != `0`
418	&& (int) arg->val.num < result->mb_cur_min)
419	\|\| (nowtok == tok_mb_cur_min && result->mb_cur_max != `0`
420	&& (int) arg->val.num > result->mb_cur_max))
421	{
422	lr_error (cmfile, _("\
423	value of <%s> must be greater or equal than the value of <%s>"),
424	"mb_cur_max", "mb_cur_min");
425
426	lr_ignore_rest (cmfile, `0`);
427	continue;
428	}
429
430	if (nowtok == tok_mb_cur_max)
431	result->mb_cur_max = arg->val.num;
432	else
433	result->mb_cur_min = arg->val.num;
434
435	lr_ignore_rest (cmfile, `1`);
436	continue;
437
438	case tok_escape_char:
439	case tok_comment_char:
440	if (arg->tok != tok_ident)
441	goto badarg;
442
443	if (arg->val.str.lenmb != `1`)
444	{
445	lr_error (cmfile, _("\
446	argument to <%s> must be a single character"),
447	nowtok == tok_escape_char ? "escape_char"
448	: "comment_char");
449
450	lr_ignore_rest (cmfile, `0`);
451	continue;
452	}
453
454	if (nowtok == tok_escape_char)
455	cmfile->escape_char = *arg->val.str.startmb;
456	else
457	cmfile->comment_char = *arg->val.str.startmb;
458
459	lr_ignore_rest (cmfile, `1`);
460	continue;
461
462	case tok_g0esc:
463	case tok_g1esc:
464	case tok_g2esc:
465	case tok_g3esc:
466	case tok_escseq:
467	lr_ignore_rest (cmfile, `0`); / XXX /
468	continue;
469
470	case tok_include:
471	lr_error (cmfile, _("\
472	character sets with locking states are not supported"));
473	exit (`4`);
474
475	default:
476	/ Cannot happen. /
477	assert (! "Should not happen");
478	}
479	break;
480
481	case `2`:
482	/ We have seen `CHARMAP' and now are in the body. Each line*
483	must have the format "%s %s %s\n" or "%s...%s %s %s\n". /*
484	if (nowtok == tok_eol)
485	/ Ignore empty lines. /
486	continue;
487
488	if (nowtok == tok_end)
489	{
490	expected_tok = tok_charmap;
491	expected_str = "CHARMAP";
492	state = `90`;
493	continue;
494	}
495
496	if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
497	{
498	lr_error (cmfile, _("syntax error in %s definition: %s"),
499	"CHARMAP", _("no symbolic name given"));
500
501	lr_ignore_rest (cmfile, `0`);
502	continue;
503	}
504
505	/ If the previous line was not completely correct free the*
506	used memory. /*
507	if (from_name != NULL)
508	obstack_free (&result->mem_pool, from_name);
509
510	if (nowtok == tok_bsymbol)
511	from_name = (char *) obstack_copy0 (&result->mem_pool,
512	now->val.str.startmb,
513	now->val.str.lenmb);
514	else
515	{
516	obstack_printf (&result->mem_pool, "U%08X",
517	cmfile->token.val.ucs4);
518	obstack_1grow (&result->mem_pool, `'\0'`);
519	from_name = (char *) obstack_finish (&result->mem_pool);
520	}
521	to_name = NULL;
522
523	state = `3`;
524	continue;
525
526	case `3`:
527	/ We have two possibilities: We can see an ellipsis or an*
528	encoding value. /*
529	if (nowtok == tok_ellipsis3 \|\| nowtok == tok_ellipsis4
530	\|\| nowtok == tok_ellipsis2 \|\| nowtok == tok_ellipsis4_2
531	\|\| nowtok == tok_ellipsis2_2)
532	{
533	ellipsis = nowtok;
534	if (nowtok == tok_ellipsis4_2)
535	{
536	step = `2`;
537	nowtok = tok_ellipsis4;
538	}
539	else if (nowtok == tok_ellipsis2_2)
540	{
541	step = `2`;
542	nowtok = tok_ellipsis2;
543	}
544	state = `4`;
545	continue;
546	}
547	/ FALLTHROUGH /
548
549	case `5`:
550	if (nowtok != tok_charcode)
551	{
552	lr_error (cmfile, _("syntax error in %s definition: %s"),
553	"CHARMAP", _("invalid encoding given"));
554
555	lr_ignore_rest (cmfile, `0`);
556
557	state = `2`;
558	continue;
559	}
560
561	if (now->val.charcode.nbytes < result->mb_cur_min)
562	lr_error (cmfile, _("too few bytes in character encoding"));
563	else if (now->val.charcode.nbytes > result->mb_cur_max)
564	lr_error (cmfile, _("too many bytes in character encoding"));
565	else
566	charmap_new_char (cmfile, result, now->val.charcode.nbytes,
567	now->val.charcode.bytes, from_name, to_name,
568	ellipsis != tok_ellipsis2, step);
569
570	/ Ignore trailing comment silently. /
571	lr_ignore_rest (cmfile, `0`);
572
573	from_name = NULL;
574	to_name = NULL;
575	ellipsis = tok_none;
576	step = `1`;
577
578	state = `2`;
579	continue;
580
581	case `4`:
582	if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
583	{
584	lr_error (cmfile, _("syntax error in %s definition: %s"),
585	"CHARMAP",
586	_("no symbolic name given for end of range"));
587
588	lr_ignore_rest (cmfile, `0`);
589	continue;
590	}
591
592	/ Copy the to-name in a safe place. /
593	if (nowtok == tok_bsymbol)
594	to_name = (char *) obstack_copy0 (&result->mem_pool,
595	cmfile->token.val.str.startmb,
596	cmfile->token.val.str.lenmb);
597	else
598	{
599	obstack_printf (&result->mem_pool, "U%08X",
600	cmfile->token.val.ucs4);
601	obstack_1grow (&result->mem_pool, `'\0'`);
602	to_name = (char *) obstack_finish (&result->mem_pool);
603	}
604
605	state = `5`;
606	continue;
607
608	case `90`:
609	if (nowtok != expected_tok)
610	lr_error (cmfile, _("\
611	%1$s: definition does not end with `END %1$s'"), expected_str);
612
613	lr_ignore_rest (cmfile, nowtok == expected_tok);
614	state = `91`;
615	continue;
616
617	case `91`:
618	/ Waiting for WIDTH... /
619	if (nowtok == tok_eol)
620	/ Ignore empty lines. /
621	continue;
622
623	if (nowtok == tok_width_default)
624	{
625	state = `92`;
626	continue;
627	}
628
629	if (nowtok == tok_width)
630	{
631	lr_ignore_rest (cmfile, `1`);
632	state = `93`;
633	continue;
634	}
635
636	if (nowtok == tok_width_variable)
637	{
638	lr_ignore_rest (cmfile, `1`);
639	state = `98`;
640	continue;
641	}
642
643	lr_error (cmfile, _("\
644	only WIDTH definitions are allowed to follow the CHARMAP definition"));
645
646	lr_ignore_rest (cmfile, `0`);
647	continue;
648
649	case `92`:
650	if (nowtok != tok_number)
651	lr_error (cmfile, _("value for %s must be an integer"),
652	"WIDTH_DEFAULT");
653	else
654	result->width_default = now->val.num;
655
656	lr_ignore_rest (cmfile, nowtok == tok_number);
657
658	state = `91`;
659	continue;
660
661	case `93`:
662	/ We now expect `END WIDTH' or lines of the format "%s %d\n" or*
663	"%s...%s %d\n". /*
664	if (nowtok == tok_eol)
665	/ ignore empty lines. /
666	continue;
667
668	if (nowtok == tok_end)
669	{
670	expected_tok = tok_width;
671	expected_str = "WIDTH";
672	state = `90`;
673	continue;
674	}
675
676	if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
677	{
678	lr_error (cmfile, _("syntax error in %s definition: %s"),
679	"WIDTH", _("no symbolic name given"));
680
681	lr_ignore_rest (cmfile, `0`);
682	continue;
683	}
684
685	if (from_name != NULL)
686	obstack_free (&result->mem_pool, from_name);
687
688	if (nowtok == tok_bsymbol)
689	from_name = (char *) obstack_copy0 (&result->mem_pool,
690	now->val.str.startmb,
691	now->val.str.lenmb);
692	else
693	{
694	obstack_printf (&result->mem_pool, "U%08X",
695	cmfile->token.val.ucs4);
696	obstack_1grow (&result->mem_pool, `'\0'`);
697	from_name = (char *) obstack_finish (&result->mem_pool);
698	}
699
700	to_name = NULL;
701
702	state = `94`;
703	continue;
704
705	case `94`:
706	if (nowtok == tok_ellipsis3)
707	{
708	state = `95`;
709	continue;
710	}
711
712	case `96`:
713	if (nowtok != tok_number)
714	lr_error (cmfile, _("value for %s must be an integer"),
715	"WIDTH");
716	else
717	{
718	/ Store width for chars. /
719	new_width (cmfile, result, from_name, to_name, now->val.num);
720
721	from_name = NULL;
722	to_name = NULL;
723	}
724
725	lr_ignore_rest (cmfile, nowtok == tok_number);
726
727	state = `93`;
728	continue;
729
730	case `95`:
731	if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
732	{
733	lr_error (cmfile, _("syntax error in %s definition: %s"),
734	"WIDTH", _("no symbolic name given for end of range"));
735
736	lr_ignore_rest (cmfile, `0`);
737
738	state = `93`;
739	continue;
740	}
741
742	if (nowtok == tok_bsymbol)
743	to_name = (char *) obstack_copy0 (&result->mem_pool,
744	now->val.str.startmb,
745	now->val.str.lenmb);
746	else
747	{
748	obstack_printf (&result->mem_pool, "U%08X",
749	cmfile->token.val.ucs4);
750	obstack_1grow (&result->mem_pool, `'\0'`);
751	to_name = (char *) obstack_finish (&result->mem_pool);
752	}
753
754	state = `96`;
755	continue;
756
757	case `98`:
758	/ We now expect `END WIDTH_VARIABLE' or lines of the format*
759	"%s\n" or "%s...%s\n". /*
760	if (nowtok == tok_eol)
761	/ ignore empty lines. /
762	continue;
763
764	if (nowtok == tok_end)
765	{
766	expected_tok = tok_width_variable;
767	expected_str = "WIDTH_VARIABLE";
768	state = `90`;
769	continue;
770	}
771
772	if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
773	{
774	lr_error (cmfile, _("syntax error in %s definition: %s"),
775	"WIDTH_VARIABLE", _("no symbolic name given"));
776
777	lr_ignore_rest (cmfile, `0`);
778
779	continue;
780	}
781
782	if (from_name != NULL)
783	obstack_free (&result->mem_pool, from_name);
784
785	if (nowtok == tok_bsymbol)
786	from_name = (char *) obstack_copy0 (&result->mem_pool,
787	now->val.str.startmb,
788	now->val.str.lenmb);
789	else
790	{
791	obstack_printf (&result->mem_pool, "U%08X",
792	cmfile->token.val.ucs4);
793	obstack_1grow (&result->mem_pool, `'\0'`);
794	from_name = (char *) obstack_finish (&result->mem_pool);
795	}
796	to_name = NULL;
797
798	state = `99`;
799	continue;
800
801	case `99`:
802	if (nowtok == tok_ellipsis3)
803	state = `100`;
804
805	/ Store info. /
806	from_name = NULL;
807
808	/ Warn /
809	state = `98`;
810	continue;
811
812	case `100`:
813	if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
814	{
815	lr_error (cmfile, _("syntax error in %s definition: %s"),
816	"WIDTH_VARIABLE",
817	_("no symbolic name given for end of range"));
818	lr_ignore_rest (cmfile, `0`);
819	continue;
820	}
821
822	if (nowtok == tok_bsymbol)
823	to_name = (char *) obstack_copy0 (&result->mem_pool,
824	now->val.str.startmb,
825	now->val.str.lenmb);
826	else
827	{
828	obstack_printf (&result->mem_pool, "U%08X",
829	cmfile->token.val.ucs4);
830	obstack_1grow (&result->mem_pool, `'\0'`);
831	to_name = (char *) obstack_finish (&result->mem_pool);
832	}
833
834	/ XXX Enter value into table. /
835
836	lr_ignore_rest (cmfile, `1`);
837
838	state = `98`;
839	continue;
840
841	default:
842	WITH_CUR_LOCALE (error (`5`, `0`, _("%s: error in state machine"),
843	__FILE__));
844	/ NOTREACHED /
845	}
846	break;
847	}
848
849	if (state != `91` && !be_quiet)
850	WITH_CUR_LOCALE (error (`0`, `0`, _("%s: premature end of file"),
851	cmfile->fname));
852
853	lr_close (cmfile);
854
855	return result;
856	}
857
858
859	static void
860	new_width (struct linereader cmfile, struct* charmap_t *result,
861	const char from, const* char to, unsigned* long int width)
862	{
863	struct charseq *from_val;
864	struct charseq *to_val;
865
866	from_val = charmap_find_value (result, from, strlen (from));
867	if (from_val == NULL)
868	{
869	lr_error (cmfile, _("unknown character `%s'"), from);
870	return;
871	}
872
873	if (to == NULL)
874	to_val = from_val;
875	else
876	{
877	to_val = charmap_find_value (result, to, strlen (to));
878	if (to_val == NULL)
879	{
880	lr_error (cmfile, _("unknown character `%s'"), to);
881	return;
882	}
883
884	/ Make sure the number of bytes for the end points of the range*
885	is correct. /*
886	if (from_val->nbytes != to_val->nbytes)
887	{
888	lr_error (cmfile, _("\
889	number of bytes for byte sequence of beginning and end of range not the same: %d vs %d"),
890	from_val->nbytes, to_val->nbytes);
891	return;
892	}
893	}
894
895	if (result->nwidth_rules >= result->nwidth_rules_max)
896	{
897	size_t new_size = result->nwidth_rules + `32`;
898	struct width_rule *new_rules =
899	(struct width_rule *) obstack_alloc (&result->mem_pool,
900	(new_size
901	* sizeof (struct width_rule)));
902
903	memcpy (new_rules, result->width_rules,
904	result->nwidth_rules_max * sizeof (struct width_rule));
905
906	result->width_rules = new_rules;
907	result->nwidth_rules_max = new_size;
908	}
909
910	result->width_rules[result->nwidth_rules].from = from_val;
911	result->width_rules[result->nwidth_rules].to = to_val;
912	result->width_rules[result->nwidth_rules].width = (unsigned int) width;
913	++result->nwidth_rules;
914	}
915
916
917	struct charseq *
918	charmap_find_value (const struct charmap_t cm, const* char *name, size_t len)
919	{
920	void *result;
921
922	return (find_entry ((hash_table *) &cm->char_table, name, len, &result)
923	< `0` ? NULL : (struct charseq *) result);
924	}
925
926
927	static void
928	charmap_new_char (struct linereader lr, struct* charmap_t *cm,
929	size_t nbytes, unsigned char *bytes,
930	const char from, const* char *to,
931	int decimal_ellipsis, int step)
932	{
933	hash_table *ht = &cm->char_table;
934	hash_table *bt = &cm->byte_table;
935	struct obstack *ob = &cm->mem_pool;
936	char *from_end;
937	char *to_end;
938	const char *cp;
939	int prefix_len, len1, len2;
940	unsigned int from_nr, to_nr, cnt;
941	struct charseq *newp;
942
943	len1 = strlen (from);
944
945	if (to == NULL)
946	{
947	newp = (struct charseq ) obstack_alloc (ob, sizeof* (*newp) + nbytes);
948	newp->nbytes = nbytes;
949	memcpy (newp->bytes, bytes, nbytes);
950	newp->name = from;
951
952	newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
953	if ((from[`0`] == `'U'` \|\| from[`0`] == `'P'`) && (len1 == `5` \|\| len1 == `9`))
954	{
955	/ Maybe the name is of the form `Uxxxx' or `Uxxxxxxxx' where*
956	xxxx and xxxxxxxx are hexadecimal numbers. In this case
957	we use the value of xxxx or xxxxxxxx as the UCS4 value of
958	this character and we don't have to consult the repertoire
959	map.
960
961	If the name is of the form `Pxxxx' or `Pxxxxxxxx' the xxxx
962	and xxxxxxxx also give the code point in UCS4 but this must
963	be in the private, i.e., unassigned, area. This should be
964	used for characters which do not (yet) have an equivalent
965	in ISO 10646 and Unicode. /*
966	char *endp;
967
968	errno = `0`;
969	newp->ucs4 = strtoul (from + `1`, &endp, `16`);
970	if (endp - from != len1
971	\|\| (newp->ucs4 == ~((uint32_t) `0`) && errno == ERANGE)
972	\|\| newp->ucs4 >= `0x80000000`)
973	/ This wasn't successful. Signal this name cannot be a*
974	correct UCS value. /*
975	newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
976	}
977
978	insert_entry (ht, from, len1, newp);
979	insert_entry (bt, newp->bytes, nbytes, newp);
980	/ Please note that it isn't a bug if a symbol is defined more*
981	than once. All later definitions are simply discarded. /*
982	return;
983	}
984
985	/ We have a range: the names must have names with equal prefixes*
986	and an equal number of digits, where the second number is greater
987	or equal than the first. /*
988	len2 = strlen (to);
989
990	if (len1 != len2)
991	{
992	illegal_range:
993	lr_error (lr, _("invalid names for character range"));
994	return;
995	}
996
997	cp = &from[len1 - `1`];
998	if (decimal_ellipsis)
999	while (isdigit (*cp) && cp >= from)
1000	--cp;
1001	else
1002	while (isxdigit (*cp) && cp >= from)
1003	{
1004	if (!isdigit (cp) && !isupper (cp))
1005	lr_error (lr, _("\
1006	hexadecimal range format should use only capital characters"));
1007	--cp;
1008	}
1009
1010	prefix_len = (cp - from) + `1`;
1011
1012	if (cp == &from[len1 - `1`] \|\| strncmp (from, to, prefix_len) != `0`)
1013	goto illegal_range;
1014
1015	errno = `0`;
1016	from_nr = strtoul (&from[prefix_len], &from_end, decimal_ellipsis ? `10` : `16`);
1017	if (*from_end != `'\0'` \|\| (from_nr == UINT_MAX && errno == ERANGE)
1018	\|\| ((to_nr = strtoul (&to[prefix_len], &to_end,
1019	decimal_ellipsis ? `10` : `16`)) == UINT_MAX
1020	&& errno == ERANGE)
1021	\|\| *to_end != `'\0'`)
1022	{
1023	lr_error (lr, _("<%s> and <%s> are invalid names for range"), from, to);
1024	return;
1025	}
1026
1027	if (from_nr > to_nr)
1028	{
1029	lr_error (lr, _("upper limit in range is smaller than lower limit"));
1030	return;
1031	}
1032
1033	for (cnt = from_nr; cnt <= to_nr; cnt += step)
1034	{
1035	char *name_end;
1036	obstack_printf (ob, decimal_ellipsis ? "%.s%0d" : "%.s%0X",
1037	prefix_len, from, len1 - prefix_len, cnt);
1038	obstack_1grow (ob, `'\0'`);
1039	name_end = obstack_finish (ob);
1040
1041	newp = (struct charseq ) obstack_alloc (ob, sizeof* (*newp) + nbytes);
1042	newp->nbytes = nbytes;
1043	memcpy (newp->bytes, bytes, nbytes);
1044	newp->name = name_end;
1045
1046	newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
1047	if ((name_end[`0`] == `'U'` \|\| name_end[`0`] == `'P'`)
1048	&& (len1 == `5` \|\| len1 == `9`))
1049	{
1050	/ Maybe the name is of the form `Uxxxx' or `Uxxxxxxxx' where*
1051	xxxx and xxxxxxxx are hexadecimal numbers. In this case
1052	we use the value of xxxx or xxxxxxxx as the UCS4 value of
1053	this character and we don't have to consult the repertoire
1054	map.
1055
1056	If the name is of the form `Pxxxx' or `Pxxxxxxxx' the xxxx
1057	and xxxxxxxx also give the code point in UCS4 but this must
1058	be in the private, i.e., unassigned, area. This should be
1059	used for characters which do not (yet) have an equivalent
1060	in ISO 10646 and Unicode. /*
1061	char *endp;
1062
1063	errno = `0`;
1064	newp->ucs4 = strtoul (name_end + `1`, &endp, `16`);
1065	if (endp - name_end != len1
1066	\|\| (newp->ucs4 == ~((uint32_t) `0`) && errno == ERANGE)
1067	\|\| newp->ucs4 >= `0x80000000`)
1068	/ This wasn't successful. Signal this name cannot be a*
1069	correct UCS value. /*
1070	newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
1071	}
1072
1073	insert_entry (ht, name_end, len1, newp);
1074	insert_entry (bt, newp->bytes, nbytes, newp);
1075	/ Please note we don't examine the return value since it is no error*
1076	if we have two definitions for a symbol. /*
1077
1078	/ Increment the value in the byte sequence. /
1079	if (++bytes[nbytes - `1`] == `'\0'`)
1080	{
1081	int b = nbytes - `2`;
1082
1083	do
1084	if (b < `0`)
1085	{
1086	lr_error (lr,
1087	_("resulting bytes for range not representable."));
1088	return;
1089	}
1090	while (++bytes[b--] == `0`);
1091	}
1092	}
1093	}
1094
1095
1096	struct charseq *
1097	charmap_find_symbol (const struct charmap_t cm, const* char *bytes,
1098	size_t nbytes)
1099	{
1100	void *result;
1101
1102	return (find_entry ((hash_table *) &cm->byte_table, bytes, nbytes, &result)
1103	< `0` ? NULL : (struct charseq *) result);
1104	}
1105

Browse the source code of glibc_src_2.23/locale/programs/charmap.c