charmap.c source code [glibc_src_2.30/locale/programs/charmap.c]

/* Copyright (C) 1996-2019 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   Contributed by Ulrich Drepper <drepper@gnu.org>, 1996.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published
   by the Free Software Foundation; version 2 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, see <http://www.gnu.org/licenses/>.  */
17
18	#ifdef HAVE_CONFIG_H
19	# include <config.h>
20	#endif
21
22	#include <ctype.h>
23	#include <errno.h>
24	#include <libintl.h>
25	#include <limits.h>
26	#include <stdio.h>
27	#include <stdlib.h>
28	#include <string.h>
29	#include <stdint.h>
30
31	#include "localedef.h"
32	#include "linereader.h"
33	#include "charmap.h"
34	#include "charmap-dir.h"
35
36	#include <assert.h>
37
38
/* Define the lookup function.  */
40	#include "charmap-kw.h"
41
42
/* Prototypes for local functions.  The pointer declarators must match
   the definitions further down in this file.  */
static struct charmap_t *parse_charmap (struct linereader *cmfile,
					int verbose, int be_quiet);
static void new_width (struct linereader *cmfile, struct charmap_t *result,
		       const char *from, const char *to,
		       unsigned long int width);
static void charmap_new_char (struct linereader *lr, struct charmap_t *cm,
			      size_t nbytes, unsigned char *bytes,
			      const char *from, const char *to,
			      int decimal_ellipsis, int step);


/* Set to true by charmap_read when the ASCII compatibility test of the
   loaded character map fails.  */
bool enc_not_ascii_compatible;


#ifdef NEED_NULL_POINTER
static const char *null_pointer;
#endif
61
62	static struct linereader *
63	cmlr_open (const char directory, const* char *name, kw_hash_fct_t hf)
64	{
65	FILE *fp;
66
67	fp = charmap_open (directory, name);
68	if (fp == NULL)
69	return NULL;
70	else
71	{
72	size_t dlen = strlen (directory);
73	int add_slash = (dlen == `0` \|\| directory[dlen - `1`] != `'/'`);
74	size_t nlen = strlen (name);
75	char *pathname;
76	char *p;
77
78	pathname = alloca (dlen + add_slash + nlen + `1`);
79	p = stpcpy (pathname, directory);
80	if (add_slash)
81	*p++ = `'/'`;
82	stpcpy (p, name);
83
84	return lr_create (fp, pathname, hf);
85	}
86	}
87
88	struct charmap_t *
89	charmap_read (const char filename, int* verbose, int error_not_found,
90	int be_quiet, int use_default)
91	{
92	struct charmap_t *result = NULL;
93
94	if (filename != NULL)
95	{
96	struct linereader *cmfile;
97
98	/ First try the name as found in the parameter. /
99	cmfile = lr_open (filename, charmap_hash);
100	if (cmfile == NULL)
101	{
102	/ No successful. So start looking through the directories*
103	in the I18NPATH if this is a simple name. /*
104	if (strchr (filename, `'/'`) == NULL)
105	{
106	char *i18npath = getenv ("I18NPATH");
107	if (i18npath != NULL && *i18npath != `'\0'`)
108	{
109	const size_t pathlen = strlen (i18npath);
110	char i18npathbuf[pathlen + `1`];
111	char path[pathlen + sizeof ("/charmaps")];
112	char *next;
113	i18npath = memcpy (i18npathbuf, i18npath, pathlen + `1`);
114
115	while (cmfile == NULL
116	&& (next = strsep (&i18npath, ":")) != NULL)
117	{
118	stpcpy (stpcpy (path, next), "/charmaps");
119	cmfile = cmlr_open (path, filename, charmap_hash);
120
121	if (cmfile == NULL)
122	/ Try without the "/charmaps" part. /
123	cmfile = cmlr_open (next, filename, charmap_hash);
124	}
125	}
126
127	if (cmfile == NULL)
128	/ Try the default directory. /
129	cmfile = cmlr_open (CHARMAP_PATH, filename, charmap_hash);
130	}
131	}
132
133	if (cmfile != NULL)
134	result = parse_charmap (cmfile, verbose, be_quiet);
135
136	if (result == NULL && error_not_found)
137	record_error (`0`, errno,
138	_("character map file `%s' not found"),
139	filename);
140	}
141
142	if (result == NULL && filename != NULL && strchr (filename, `'/'`) == NULL)
143	{
144	/ OK, one more try. We also accept the names given to the*
145	character sets in the files. Sometimes they differ from the
146	file name. /*
147	CHARMAP_DIR *dir;
148
149	dir = charmap_opendir (CHARMAP_PATH);
150	if (dir != NULL)
151	{
152	const char *dirent;
153
154	while ((dirent = charmap_readdir (dir)) != NULL)
155	{
156	char **aliases;
157	char **p;
158	int found;
159
160	aliases = charmap_aliases (CHARMAP_PATH, dirent);
161	found = `0`;
162	for (p = aliases; *p; p++)
163	if (strcasecmp (*p, filename) == `0`)
164	{
165	found = `1`;
166	break;
167	}
168	charmap_free_aliases (aliases);
169
170	if (found)
171	{
172	struct linereader *cmfile;
173
174	cmfile = cmlr_open (CHARMAP_PATH, dirent, charmap_hash);
175	if (cmfile != NULL)
176	result = parse_charmap (cmfile, verbose, be_quiet);
177
178	break;
179	}
180	}
181
182	charmap_closedir (dir);
183	}
184	}
185
186	if (result == NULL && DEFAULT_CHARMAP != NULL)
187	{
188	struct linereader *cmfile;
189
190	cmfile = cmlr_open (CHARMAP_PATH, DEFAULT_CHARMAP, charmap_hash);
191	if (cmfile != NULL)
192	result = parse_charmap (cmfile, verbose, be_quiet);
193
194	if (result == NULL)
195	record_error (`4`, errno,
196	_("default character map file `%s' not found"),
197	DEFAULT_CHARMAP);
198	}
199
200	if (result != NULL && result->code_set_name == NULL)
201	/ The input file does not specify a code set name. This*
202	shouldn't happen but we should cope with it. /*
203	result->code_set_name = basename (filename);
204
205	/ Test of ASCII compatibility of locale encoding.*
206
207	Verify that the encoding to be used in a locale is ASCII compatible,
208	at least for the graphic characters, excluding the control characters,
209	'$' and '@'. This constraint comes from an ISO C 99 restriction.
210
211	ISO C 99 section 7.17.(2) (about wchar_t):
212	the null character shall have the code value zero and each member of
213	the basic character set shall have a code value equal to its value
214	when used as the lone character in an integer character constant.
215	ISO C 99 section 5.2.1.(3):
216	Both the basic source and basic execution character sets shall have
217	the following members: the 26 uppercase letters of the Latin alphabet
218	A B C D E F G H I J K L M N O P Q R S T U V W X Y Z
219	the 26 lowercase letters of the Latin alphabet
220	a b c d e f g h i j k l m n o p q r s t u v w x y z
221	the 10 decimal digits
222	0 1 2 3 4 5 6 7 8 9
223	the following 29 graphic characters
224	! " # % & ' ( ) + , - . / : ; < = > ? [ \ ] ^ _ { \| } ~*
225	the space character, and control characters representing horizontal
226	tab, vertical tab, and form feed.
227
228	Therefore, for all members of the "basic character set", the 'char' code
229	must have the same value as the 'wchar_t' code, which in glibc is the
230	same as the Unicode code, which for all of the enumerated characters
231	is identical to the ASCII code. /*
232	if (result != NULL && use_default)
233	{
234	static const char basic_charset[] =
235	{
236	`'A'`, `'B'`, `'C'`, `'D'`, `'E'`, `'F'`, `'G'`, `'H'`, `'I'`, `'J'`, `'K'`, `'L'`, `'M'`,
237	`'N'`, `'O'`, `'P'`, `'Q'`, `'R'`, `'S'`, `'T'`, `'U'`, `'V'`, `'W'`, `'X'`, `'Y'`, `'Z'`,
238	`'a'`, `'b'`, `'c'`, `'d'`, `'e'`, `'f'`, `'g'`, `'h'`, `'i'`, `'j'`, `'k'`, `'l'`, `'m'`,
239	`'n'`, `'o'`, `'p'`, `'q'`, `'r'`, `'s'`, `'t'`, `'u'`, `'v'`, `'w'`, `'x'`, `'y'`, `'z'`,
240	`'0'`, `'1'`, `'2'`, `'3'`, `'4'`, `'5'`, `'6'`, `'7'`, `'8'`, `'9'`,
241	`'!'`, `'"'`, `'#'`, `'%'`, `'&'`, `'\''`, `'('`, `')'`, `'*'`, `'+'`, `','`, `'-'`,
242	`'.'`, `'/'`, `':'`, `';'`, `'<'`, `'='`, `'>'`, `'?'`, `'['`, `'\\'`, `']'`, `'^'`,
243	`'_'`, `'{'`, `'\|'`, `'}'`, `'~'`, `' '`, `'\t'`, `'\v'`, `'\f'`, `'\0'`
244	};
245	int failed = `0`;
246	const char *p = basic_charset;
247
248	do
249	{
250	struct charseq *seq = charmap_find_symbol (result, p, `1`);
251
252	if (seq == NULL \|\| seq->ucs4 != (uint32_t) *p)
253	failed = `1`;
254	}
255	while (*p++ != `'\0'`);
256
257	if (failed)
258	{
259	/ A user may disable the ASCII compatibility warning check,*
260	but we must remember that the encoding is not ASCII
261	compatible, since it may have other implications. Later
262	we will set _NL_CTYPE_MAP_TO_NONASCII from this value. /*
263	if (warn_ascii)
264	record_warning (_(
265	"character map `%s' is not ASCII compatible, locale not ISO C compliant "
266	"[--no-warnings=ascii]"),
267	result->code_set_name);
268	enc_not_ascii_compatible = true;
269	}
270	}
271
272	return result;
273	}
274
275
276	static struct charmap_t *
277	parse_charmap (struct linereader cmfile, int* verbose, int be_quiet)
278	{
279	struct charmap_t *result;
280	int state;
281	enum token_t expected_tok = tok_error;
282	const char *expected_str = NULL;
283	char *from_name = NULL;
284	char *to_name = NULL;
285	enum token_t ellipsis = `0`;
286	int step = `1`;
287
288	/ We don't want symbolic names in string to be translated. /
289	cmfile->translate_strings = `0`;
290
291	/ Allocate room for result. /
292	result = (struct charmap_t ) xmalloc (sizeof* (struct charmap_t));
293	memset (result, `'\0'`, sizeof (struct charmap_t));
294	/ The default DEFAULT_WIDTH is 1. /
295	result->width_default = `1`;
296
297	#define obstack_chunk_alloc malloc
298	#define obstack_chunk_free free
299	obstack_init (&result->mem_pool);
300
301	if (init_hash (&result->char_table, `256`)
302	\|\| init_hash (&result->byte_table, `256`))
303	{
304	free (result);
305	return NULL;
306	}
307
308	/ We use a state machine to describe the charmap description file*
309	format. /*
310	state = `1`;
311	while (`1`)
312	{
313	/ What's on? /
314	struct token *now = lr_token (cmfile, NULL, NULL, NULL, verbose);
315	enum token_t nowtok = now->tok;
316	struct token *arg;
317
318	if (nowtok == tok_eof)
319	break;
320
321	switch (state)
322	{
323	case `1`:
324	/ The beginning. We expect the special declarations, EOL or*
325	`CHARMAP'. /*
326	if (nowtok == tok_eol)
327	/ Ignore empty lines. /
328	continue;
329
330	if (nowtok == tok_charmap)
331	{
332	from_name = NULL;
333	to_name = NULL;
334
335	/ We have to set up the real work. Fill in some*
336	default values. /*
337	if (result->mb_cur_max == `0`)
338	result->mb_cur_max = `1`;
339	if (result->mb_cur_min == `0`)
340	result->mb_cur_min = result->mb_cur_max;
341	if (result->mb_cur_min > result->mb_cur_max)
342	{
343	record_error (`0`, `0`, _("\
344	%s: <mb_cur_max> must be greater than <mb_cur_min>\n"),
345	cmfile->fname);
346
347	result->mb_cur_min = result->mb_cur_max;
348	}
349
350	lr_ignore_rest (cmfile, `1`);
351
352	state = `2`;
353	continue;
354	}
355
356	if (nowtok != tok_code_set_name && nowtok != tok_mb_cur_max
357	&& nowtok != tok_mb_cur_min && nowtok != tok_escape_char
358	&& nowtok != tok_comment_char && nowtok != tok_g0esc
359	&& nowtok != tok_g1esc && nowtok != tok_g2esc
360	&& nowtok != tok_g3esc && nowtok != tok_repertoiremap
361	&& nowtok != tok_include)
362	{
363	lr_error (cmfile, _("syntax error in prolog: %s"),
364	_("invalid definition"));
365
366	lr_ignore_rest (cmfile, `0`);
367	continue;
368	}
369
370	/ We know that we need an argument. /
371	arg = lr_token (cmfile, NULL, NULL, NULL, verbose);
372
373	switch (nowtok)
374	{
375	case tok_code_set_name:
376	case tok_repertoiremap:
377	if (arg->tok != tok_ident && arg->tok != tok_string)
378	{
379	badarg:
380	lr_error (cmfile, _("syntax error in prolog: %s"),
381	_("bad argument"));
382
383	lr_ignore_rest (cmfile, `0`);
384	continue;
385	}
386
387	if (nowtok == tok_code_set_name)
388	result->code_set_name = obstack_copy0 (&result->mem_pool,
389	arg->val.str.startmb,
390	arg->val.str.lenmb);
391	else
392	result->repertoiremap = obstack_copy0 (&result->mem_pool,
393	arg->val.str.startmb,
394	arg->val.str.lenmb);
395
396	lr_ignore_rest (cmfile, `1`);
397	continue;
398
399	case tok_mb_cur_max:
400	case tok_mb_cur_min:
401	if (arg->tok != tok_number)
402	goto badarg;
403
404	if ((nowtok == tok_mb_cur_max
405	&& result->mb_cur_max != `0`)
406	\|\| (nowtok == tok_mb_cur_max
407	&& result->mb_cur_max != `0`))
408	lr_error (cmfile, _("duplicate definition of <%s>"),
409	nowtok == tok_mb_cur_min
410	? "mb_cur_min" : "mb_cur_max");
411
412	if (arg->val.num < `1`)
413	{
414	lr_error (cmfile,
415	_("value for <%s> must be 1 or greater"),
416	nowtok == tok_mb_cur_min
417	? "mb_cur_min" : "mb_cur_max");
418
419	lr_ignore_rest (cmfile, `0`);
420	continue;
421	}
422	if ((nowtok == tok_mb_cur_max && result->mb_cur_min != `0`
423	&& (int) arg->val.num < result->mb_cur_min)
424	\|\| (nowtok == tok_mb_cur_min && result->mb_cur_max != `0`
425	&& (int) arg->val.num > result->mb_cur_max))
426	{
427	lr_error (cmfile, _("\
428	value of <%s> must be greater or equal than the value of <%s>"),
429	"mb_cur_max", "mb_cur_min");
430
431	lr_ignore_rest (cmfile, `0`);
432	continue;
433	}
434
435	if (nowtok == tok_mb_cur_max)
436	result->mb_cur_max = arg->val.num;
437	else
438	result->mb_cur_min = arg->val.num;
439
440	lr_ignore_rest (cmfile, `1`);
441	continue;
442
443	case tok_escape_char:
444	case tok_comment_char:
445	if (arg->tok != tok_ident)
446	goto badarg;
447
448	if (arg->val.str.lenmb != `1`)
449	{
450	lr_error (cmfile, _("\
451	argument to <%s> must be a single character"),
452	nowtok == tok_escape_char ? "escape_char"
453	: "comment_char");
454
455	lr_ignore_rest (cmfile, `0`);
456	continue;
457	}
458
459	if (nowtok == tok_escape_char)
460	cmfile->escape_char = *arg->val.str.startmb;
461	else
462	cmfile->comment_char = *arg->val.str.startmb;
463
464	lr_ignore_rest (cmfile, `1`);
465	continue;
466
467	case tok_g0esc:
468	case tok_g1esc:
469	case tok_g2esc:
470	case tok_g3esc:
471	case tok_escseq:
472	lr_ignore_rest (cmfile, `0`); / XXX /
473	continue;
474
475	case tok_include:
476	lr_error (cmfile, _("\
477	character sets with locking states are not supported"));
478	exit (`4`);
479
480	default:
481	/ Cannot happen. /
482	assert (! "Should not happen");
483	}
484	break;
485
486	case `2`:
487	/ We have seen `CHARMAP' and now are in the body. Each line*
488	must have the format "%s %s %s\n" or "%s...%s %s %s\n". /*
489	if (nowtok == tok_eol)
490	/ Ignore empty lines. /
491	continue;
492
493	if (nowtok == tok_end)
494	{
495	expected_tok = tok_charmap;
496	expected_str = "CHARMAP";
497	state = `90`;
498	continue;
499	}
500
501	if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
502	{
503	lr_error (cmfile, _("syntax error in %s definition: %s"),
504	"CHARMAP", _("no symbolic name given"));
505
506	lr_ignore_rest (cmfile, `0`);
507	continue;
508	}
509
510	/ If the previous line was not completely correct free the*
511	used memory. /*
512	if (from_name != NULL)
513	obstack_free (&result->mem_pool, from_name);
514
515	if (nowtok == tok_bsymbol)
516	from_name = (char *) obstack_copy0 (&result->mem_pool,
517	now->val.str.startmb,
518	now->val.str.lenmb);
519	else
520	{
521	obstack_printf (&result->mem_pool, "U%08X",
522	cmfile->token.val.ucs4);
523	obstack_1grow (&result->mem_pool, `'\0'`);
524	from_name = (char *) obstack_finish (&result->mem_pool);
525	}
526	to_name = NULL;
527
528	state = `3`;
529	continue;
530
531	case `3`:
532	/ We have two possibilities: We can see an ellipsis or an*
533	encoding value. /*
534	if (nowtok == tok_ellipsis3 \|\| nowtok == tok_ellipsis4
535	\|\| nowtok == tok_ellipsis2 \|\| nowtok == tok_ellipsis4_2
536	\|\| nowtok == tok_ellipsis2_2)
537	{
538	ellipsis = nowtok;
539	if (nowtok == tok_ellipsis4_2)
540	{
541	step = `2`;
542	nowtok = tok_ellipsis4;
543	}
544	else if (nowtok == tok_ellipsis2_2)
545	{
546	step = `2`;
547	nowtok = tok_ellipsis2;
548	}
549	state = `4`;
550	continue;
551	}
552	/ FALLTHROUGH /
553
554	case `5`:
555	if (nowtok != tok_charcode)
556	{
557	lr_error (cmfile, _("syntax error in %s definition: %s"),
558	"CHARMAP", _("invalid encoding given"));
559
560	lr_ignore_rest (cmfile, `0`);
561
562	state = `2`;
563	continue;
564	}
565
566	if (now->val.charcode.nbytes < result->mb_cur_min)
567	lr_error (cmfile, _("too few bytes in character encoding"));
568	else if (now->val.charcode.nbytes > result->mb_cur_max)
569	lr_error (cmfile, _("too many bytes in character encoding"));
570	else
571	charmap_new_char (cmfile, result, now->val.charcode.nbytes,
572	now->val.charcode.bytes, from_name, to_name,
573	ellipsis != tok_ellipsis2, step);
574
575	/ Ignore trailing comment silently. /
576	lr_ignore_rest (cmfile, `0`);
577
578	from_name = NULL;
579	to_name = NULL;
580	ellipsis = tok_none;
581	step = `1`;
582
583	state = `2`;
584	continue;
585
586	case `4`:
587	if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
588	{
589	lr_error (cmfile, _("syntax error in %s definition: %s"),
590	"CHARMAP",
591	_("no symbolic name given for end of range"));
592
593	lr_ignore_rest (cmfile, `0`);
594	continue;
595	}
596
597	/ Copy the to-name in a safe place. /
598	if (nowtok == tok_bsymbol)
599	to_name = (char *) obstack_copy0 (&result->mem_pool,
600	cmfile->token.val.str.startmb,
601	cmfile->token.val.str.lenmb);
602	else
603	{
604	obstack_printf (&result->mem_pool, "U%08X",
605	cmfile->token.val.ucs4);
606	obstack_1grow (&result->mem_pool, `'\0'`);
607	to_name = (char *) obstack_finish (&result->mem_pool);
608	}
609
610	state = `5`;
611	continue;
612
613	case `90`:
614	if (nowtok != expected_tok)
615	lr_error (cmfile, _("\
616	%1$s: definition does not end with `END %1$s'"), expected_str);
617
618	lr_ignore_rest (cmfile, nowtok == expected_tok);
619	state = `91`;
620	continue;
621
622	case `91`:
623	/ Waiting for WIDTH... /
624	if (nowtok == tok_eol)
625	/ Ignore empty lines. /
626	continue;
627
628	if (nowtok == tok_width_default)
629	{
630	state = `92`;
631	continue;
632	}
633
634	if (nowtok == tok_width)
635	{
636	lr_ignore_rest (cmfile, `1`);
637	state = `93`;
638	continue;
639	}
640
641	if (nowtok == tok_width_variable)
642	{
643	lr_ignore_rest (cmfile, `1`);
644	state = `98`;
645	continue;
646	}
647
648	lr_error (cmfile, _("\
649	only WIDTH definitions are allowed to follow the CHARMAP definition"));
650
651	lr_ignore_rest (cmfile, `0`);
652	continue;
653
654	case `92`:
655	if (nowtok != tok_number)
656	lr_error (cmfile, _("value for %s must be an integer"),
657	"WIDTH_DEFAULT");
658	else
659	result->width_default = now->val.num;
660
661	lr_ignore_rest (cmfile, nowtok == tok_number);
662
663	state = `91`;
664	continue;
665
666	case `93`:
667	/ We now expect `END WIDTH' or lines of the format "%s %d\n" or*
668	"%s...%s %d\n". /*
669	if (nowtok == tok_eol)
670	/ ignore empty lines. /
671	continue;
672
673	if (nowtok == tok_end)
674	{
675	expected_tok = tok_width;
676	expected_str = "WIDTH";
677	state = `90`;
678	continue;
679	}
680
681	if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
682	{
683	lr_error (cmfile, _("syntax error in %s definition: %s"),
684	"WIDTH", _("no symbolic name given"));
685
686	lr_ignore_rest (cmfile, `0`);
687	continue;
688	}
689
690	if (from_name != NULL)
691	obstack_free (&result->mem_pool, from_name);
692
693	if (nowtok == tok_bsymbol)
694	from_name = (char *) obstack_copy0 (&result->mem_pool,
695	now->val.str.startmb,
696	now->val.str.lenmb);
697	else
698	{
699	obstack_printf (&result->mem_pool, "U%08X",
700	cmfile->token.val.ucs4);
701	obstack_1grow (&result->mem_pool, `'\0'`);
702	from_name = (char *) obstack_finish (&result->mem_pool);
703	}
704
705	to_name = NULL;
706
707	state = `94`;
708	continue;
709
710	case `94`:
711	if (nowtok == tok_ellipsis3)
712	{
713	state = `95`;
714	continue;
715	}
716	/ Fall through. /
717
718	case `96`:
719	if (nowtok != tok_number)
720	lr_error (cmfile, _("value for %s must be an integer"),
721	"WIDTH");
722	else
723	{
724	/ Store width for chars. /
725	new_width (cmfile, result, from_name, to_name, now->val.num);
726
727	from_name = NULL;
728	to_name = NULL;
729	}
730
731	lr_ignore_rest (cmfile, nowtok == tok_number);
732
733	state = `93`;
734	continue;
735
736	case `95`:
737	if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
738	{
739	lr_error (cmfile, _("syntax error in %s definition: %s"),
740	"WIDTH", _("no symbolic name given for end of range"));
741
742	lr_ignore_rest (cmfile, `0`);
743
744	state = `93`;
745	continue;
746	}
747
748	if (nowtok == tok_bsymbol)
749	to_name = (char *) obstack_copy0 (&result->mem_pool,
750	now->val.str.startmb,
751	now->val.str.lenmb);
752	else
753	{
754	obstack_printf (&result->mem_pool, "U%08X",
755	cmfile->token.val.ucs4);
756	obstack_1grow (&result->mem_pool, `'\0'`);
757	to_name = (char *) obstack_finish (&result->mem_pool);
758	}
759
760	state = `96`;
761	continue;
762
763	case `98`:
764	/ We now expect `END WIDTH_VARIABLE' or lines of the format*
765	"%s\n" or "%s...%s\n". /*
766	if (nowtok == tok_eol)
767	/ ignore empty lines. /
768	continue;
769
770	if (nowtok == tok_end)
771	{
772	expected_tok = tok_width_variable;
773	expected_str = "WIDTH_VARIABLE";
774	state = `90`;
775	continue;
776	}
777
778	if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
779	{
780	lr_error (cmfile, _("syntax error in %s definition: %s"),
781	"WIDTH_VARIABLE", _("no symbolic name given"));
782
783	lr_ignore_rest (cmfile, `0`);
784
785	continue;
786	}
787
788	if (from_name != NULL)
789	obstack_free (&result->mem_pool, from_name);
790
791	if (nowtok == tok_bsymbol)
792	from_name = (char *) obstack_copy0 (&result->mem_pool,
793	now->val.str.startmb,
794	now->val.str.lenmb);
795	else
796	{
797	obstack_printf (&result->mem_pool, "U%08X",
798	cmfile->token.val.ucs4);
799	obstack_1grow (&result->mem_pool, `'\0'`);
800	from_name = (char *) obstack_finish (&result->mem_pool);
801	}
802	to_name = NULL;
803
804	state = `99`;
805	continue;
806
807	case `99`:
808	if (nowtok == tok_ellipsis3)
809	state = `100`;
810
811	/ Store info. /
812	from_name = NULL;
813
814	/ Warn /
815	state = `98`;
816	continue;
817
818	case `100`:
819	if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
820	{
821	lr_error (cmfile, _("syntax error in %s definition: %s"),
822	"WIDTH_VARIABLE",
823	_("no symbolic name given for end of range"));
824	lr_ignore_rest (cmfile, `0`);
825	continue;
826	}
827
828	if (nowtok == tok_bsymbol)
829	to_name = (char *) obstack_copy0 (&result->mem_pool,
830	now->val.str.startmb,
831	now->val.str.lenmb);
832	else
833	{
834	obstack_printf (&result->mem_pool, "U%08X",
835	cmfile->token.val.ucs4);
836	obstack_1grow (&result->mem_pool, `'\0'`);
837	to_name = (char *) obstack_finish (&result->mem_pool);
838	}
839
840	/ XXX Enter value into table. /
841
842	lr_ignore_rest (cmfile, `1`);
843
844	state = `98`;
845	continue;
846
847	default:
848	record_error (`5`, `0`, _("%s: error in state machine"),
849	__FILE__);
850	/ NOTREACHED /
851	}
852	break;
853	}
854
855	if (state != `91`)
856	record_error (`0`, `0`, _("%s: premature end of file"),
857	cmfile->fname);
858
859	lr_close (cmfile);
860
861	return result;
862	}
863
864
865	static void
866	new_width (struct linereader cmfile, struct* charmap_t *result,
867	const char from, const* char to, unsigned* long int width)
868	{
869	struct charseq *from_val;
870	struct charseq *to_val;
871
872	from_val = charmap_find_value (result, from, strlen (from));
873	if (from_val == NULL)
874	{
875	lr_error (cmfile, _("unknown character `%s'"), from);
876	return;
877	}
878
879	if (to == NULL)
880	to_val = from_val;
881	else
882	{
883	to_val = charmap_find_value (result, to, strlen (to));
884	if (to_val == NULL)
885	{
886	lr_error (cmfile, _("unknown character `%s'"), to);
887	return;
888	}
889
890	/ Make sure the number of bytes for the end points of the range*
891	is correct. /*
892	if (from_val->nbytes != to_val->nbytes)
893	{
894	lr_error (cmfile, _("\
895	number of bytes for byte sequence of beginning and end of range not the same: %d vs %d"),
896	from_val->nbytes, to_val->nbytes);
897	return;
898	}
899	}
900
901	if (result->nwidth_rules >= result->nwidth_rules_max)
902	{
903	size_t new_size = result->nwidth_rules + `32`;
904	struct width_rule *new_rules =
905	(struct width_rule *) obstack_alloc (&result->mem_pool,
906	(new_size
907	* sizeof (struct width_rule)));
908
909	memcpy (new_rules, result->width_rules,
910	result->nwidth_rules_max * sizeof (struct width_rule));
911
912	result->width_rules = new_rules;
913	result->nwidth_rules_max = new_size;
914	}
915
916	result->width_rules[result->nwidth_rules].from = from_val;
917	result->width_rules[result->nwidth_rules].to = to_val;
918	result->width_rules[result->nwidth_rules].width = (unsigned int) width;
919	++result->nwidth_rules;
920	}
921
922
923	struct charseq *
924	charmap_find_value (const struct charmap_t cm, const* char *name, size_t len)
925	{
926	void *result;
927
928	return (find_entry ((hash_table *) &cm->char_table, name, len, &result)
929	< `0` ? NULL : (struct charseq *) result);
930	}
931
932
933	static void
934	charmap_new_char (struct linereader lr, struct* charmap_t *cm,
935	size_t nbytes, unsigned char *bytes,
936	const char from, const* char *to,
937	int decimal_ellipsis, int step)
938	{
939	hash_table *ht = &cm->char_table;
940	hash_table *bt = &cm->byte_table;
941	struct obstack *ob = &cm->mem_pool;
942	char *from_end;
943	char *to_end;
944	const char *cp;
945	int prefix_len, len1, len2;
946	unsigned int from_nr, to_nr, cnt;
947	struct charseq *newp;
948
949	len1 = strlen (from);
950
951	if (to == NULL)
952	{
953	newp = (struct charseq ) obstack_alloc (ob, sizeof* (*newp) + nbytes);
954	newp->nbytes = nbytes;
955	memcpy (newp->bytes, bytes, nbytes);
956	newp->name = from;
957
958	newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
959	if ((from[`0`] == `'U'` \|\| from[`0`] == `'P'`) && (len1 == `5` \|\| len1 == `9`))
960	{
961	/ Maybe the name is of the form `Uxxxx' or `Uxxxxxxxx' where*
962	xxxx and xxxxxxxx are hexadecimal numbers. In this case
963	we use the value of xxxx or xxxxxxxx as the UCS4 value of
964	this character and we don't have to consult the repertoire
965	map.
966
967	If the name is of the form `Pxxxx' or `Pxxxxxxxx' the xxxx
968	and xxxxxxxx also give the code point in UCS4 but this must
969	be in the private, i.e., unassigned, area. This should be
970	used for characters which do not (yet) have an equivalent
971	in ISO 10646 and Unicode. /*
972	char *endp;
973
974	errno = `0`;
975	newp->ucs4 = strtoul (from + `1`, &endp, `16`);
976	if (endp - from != len1
977	\|\| (newp->ucs4 == ~((uint32_t) `0`) && errno == ERANGE)
978	\|\| newp->ucs4 >= `0x80000000`)
979	/ This wasn't successful. Signal this name cannot be a*
980	correct UCS value. /*
981	newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
982	}
983
984	insert_entry (ht, from, len1, newp);
985	insert_entry (bt, newp->bytes, nbytes, newp);
986	/ Please note that it isn't a bug if a symbol is defined more*
987	than once. All later definitions are simply discarded. /*
988	return;
989	}
990
991	/ We have a range: the names must have names with equal prefixes*
992	and an equal number of digits, where the second number is greater
993	or equal than the first. /*
994	len2 = strlen (to);
995
996	if (len1 != len2)
997	{
998	illegal_range:
999	lr_error (lr, _("invalid names for character range"));
1000	return;
1001	}
1002
1003	cp = &from[len1 - `1`];
1004	if (decimal_ellipsis)
1005	while (isdigit (*cp) && cp >= from)
1006	--cp;
1007	else
1008	while (isxdigit (*cp) && cp >= from)
1009	{
1010	if (!isdigit (cp) && !isupper (cp))
1011	lr_error (lr, _("\
1012	hexadecimal range format should use only capital characters"));
1013	--cp;
1014	}
1015
1016	prefix_len = (cp - from) + `1`;
1017
1018	if (cp == &from[len1 - `1`] \|\| strncmp (from, to, prefix_len) != `0`)
1019	goto illegal_range;
1020
1021	errno = `0`;
1022	from_nr = strtoul (&from[prefix_len], &from_end, decimal_ellipsis ? `10` : `16`);
1023	if (*from_end != `'\0'` \|\| (from_nr == UINT_MAX && errno == ERANGE)
1024	\|\| ((to_nr = strtoul (&to[prefix_len], &to_end,
1025	decimal_ellipsis ? `10` : `16`)) == UINT_MAX
1026	&& errno == ERANGE)
1027	\|\| *to_end != `'\0'`)
1028	{
1029	lr_error (lr, _("<%s> and <%s> are invalid names for range"), from, to);
1030	return;
1031	}
1032
1033	if (from_nr > to_nr)
1034	{
1035	lr_error (lr, _("upper limit in range is smaller than lower limit"));
1036	return;
1037	}
1038
1039	for (cnt = from_nr; cnt <= to_nr; cnt += step)
1040	{
1041	char *name_end;
1042	obstack_printf (ob, decimal_ellipsis ? "%.s%0d" : "%.s%0X",
1043	prefix_len, from, len1 - prefix_len, cnt);
1044	obstack_1grow (ob, `'\0'`);
1045	name_end = obstack_finish (ob);
1046
1047	newp = (struct charseq ) obstack_alloc (ob, sizeof* (*newp) + nbytes);
1048	newp->nbytes = nbytes;
1049	memcpy (newp->bytes, bytes, nbytes);
1050	newp->name = name_end;
1051
1052	newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
1053	if ((name_end[`0`] == `'U'` \|\| name_end[`0`] == `'P'`)
1054	&& (len1 == `5` \|\| len1 == `9`))
1055	{
1056	/ Maybe the name is of the form `Uxxxx' or `Uxxxxxxxx' where*
1057	xxxx and xxxxxxxx are hexadecimal numbers. In this case
1058	we use the value of xxxx or xxxxxxxx as the UCS4 value of
1059	this character and we don't have to consult the repertoire
1060	map.
1061
1062	If the name is of the form `Pxxxx' or `Pxxxxxxxx' the xxxx
1063	and xxxxxxxx also give the code point in UCS4 but this must
1064	be in the private, i.e., unassigned, area. This should be
1065	used for characters which do not (yet) have an equivalent
1066	in ISO 10646 and Unicode. /*
1067	char *endp;
1068
1069	errno = `0`;
1070	newp->ucs4 = strtoul (name_end + `1`, &endp, `16`);
1071	if (endp - name_end != len1
1072	\|\| (newp->ucs4 == ~((uint32_t) `0`) && errno == ERANGE)
1073	\|\| newp->ucs4 >= `0x80000000`)
1074	/ This wasn't successful. Signal this name cannot be a*
1075	correct UCS value. /*
1076	newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
1077	}
1078
1079	insert_entry (ht, name_end, len1, newp);
1080	insert_entry (bt, newp->bytes, nbytes, newp);
1081	/ Please note we don't examine the return value since it is no error*
1082	if we have two definitions for a symbol. /*
1083
1084	/ Increment the value in the byte sequence. /
1085	if (++bytes[nbytes - `1`] == `'\0'`)
1086	{
1087	int b = nbytes - `2`;
1088
1089	do
1090	if (b < `0`)
1091	{
1092	lr_error (lr,
1093	_("resulting bytes for range not representable."));
1094	return;
1095	}
1096	while (++bytes[b--] == `0`);
1097	}
1098	}
1099	}
1100
1101
1102	struct charseq *
1103	charmap_find_symbol (const struct charmap_t cm, const* char *bytes,
1104	size_t nbytes)
1105	{
1106	void *result;
1107
1108	return (find_entry ((hash_table *) &cm->byte_table, bytes, nbytes, &result)
1109	< `0` ? NULL : (struct charseq *) result);
1110	}
1111

Browse the source code of glibc_src_2.30/locale/programs/charmap.c