gconv_simple.c source code [glibc_src_2.31/iconv/gconv_simple.c]

/* Simple transformations functions.
   Copyright (C) 1997-2020 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
19
20	#include <byteswap.h>
21	#include <dlfcn.h>
22	#include <endian.h>
23	#include <errno.h>
24	#include <gconv.h>
25	#include <stdint.h>
26	#include <stdlib.h>
27	#include <string.h>
28	#include <wchar.h>
29	#include <sys/param.h>
30	#include <gconv_int.h>
31
32	#define BUILTIN_ALIAS(s1, s2) /* nothing */
33	#define BUILTIN_TRANSFORMATION(From, To, Cost, Name, Fct, BtowcFct, \
34	MinF, MaxF, MinT, MaxT) \
35	extern int Fct (struct __gconv_step , struct __gconv_step_data , \
36	const unsigned char *, const unsigned char , \
37	unsigned char *, size_t , int, int);
38	#include "gconv_builtin.h"
39
40
41	#ifndef EILSEQ
42	# define EILSEQ EINVAL
43	#endif
44
45
/* Specialized conversion function for a single byte to INTERNAL, recognizing
   only ASCII characters.

   STEP is not used.  Returns C unchanged when it is below 0x80 and WEOF
   for every other byte value.  */
wint_t
__gconv_btwoc_ascii (struct __gconv_step *step, unsigned char c)
{
  if (c < 0x80)
    return c;
  else
    return WEOF;
}
56
57
/* Transform from the internal, UCS4-like format, to UCS4.  The
   difference between the internal ucs4 format and the real UCS4
   format is, if any, the endianess.  The Unicode/ISO 10646 says that
   unless some higher protocol specifies it differently, the byte
   order is big endian.  */
#define DEFINE_INIT             0
#define DEFINE_FINI             0
#define MIN_NEEDED_FROM         4
#define MIN_NEEDED_TO           4
#define FROM_DIRECTION          1
#define FROM_LOOP               internal_ucs4_loop
#define TO_LOOP                 internal_ucs4_loop /* This is not used.  */
#define FUNCTION_NAME           __gconv_transform_internal_ucs4
#define ONE_DIRECTION           0
72
73
/* Convert every complete 4-byte character that fits from the internal
   (host byte order) form to UCS4 (big endian).  The input is accessed
   through uint32_t pointers on little endian hosts.

   *INPTRP and *OUTPTRP are advanced past the bytes consumed and
   written.  STEP, STEP_DATA and IRREVERSIBLE are not used.  Returns
   __GCONV_EMPTY_INPUT if the input was consumed up to INEND,
   __GCONV_FULL_OUTPUT if fewer than four bytes are left before OUTEND,
   and __GCONV_INCOMPLETE_INPUT otherwise.  */
static inline int
__attribute ((always_inline))
internal_ucs4_loop (struct __gconv_step *step,
                    struct __gconv_step_data *step_data,
                    const unsigned char **inptrp, const unsigned char *inend,
                    unsigned char **outptrp, const unsigned char *outend,
                    size_t *irreversible)
{
  const unsigned char *inptr = *inptrp;
  unsigned char *outptr = *outptrp;
  /* Number of whole characters both buffers can accommodate.  */
  size_t n_convert = MIN (inend - inptr, outend - outptr) / 4;
  int result;

#if __BYTE_ORDER == __LITTLE_ENDIAN
  /* Sigh, we have to do some real work.  */
  size_t cnt;
  uint32_t *outptr32 = (uint32_t *) outptr;

  for (cnt = 0; cnt < n_convert; ++cnt, inptr += 4)
    *outptr32++ = bswap_32 (*(const uint32_t *) inptr);

  *inptrp = inptr;
  *outptrp = (unsigned char *) outptr32;
#elif __BYTE_ORDER == __BIG_ENDIAN
  /* Simply copy the data.  */
  *inptrp = inptr + n_convert * 4;
  *outptrp = __mempcpy (outptr, inptr, n_convert * 4);
#else
# error "This endianess is not supported."
#endif

  /* Determine the status.  */
  if (*inptrp == inend)
    result = __GCONV_EMPTY_INPUT;
  else if (*outptrp + 4 > outend)
    result = __GCONV_FULL_OUTPUT;
  else
    result = __GCONV_INCOMPLETE_INPUT;

  return result;
}
115
#if !_STRING_ARCH_unaligned
/* Variant of internal_ucs4_loop that only touches the buffers one byte
   at a time, so neither pointer has to be aligned for uint32_t access.

   *INPTRP and *OUTPTRP are advanced past the bytes consumed and
   written.  STEP, STEP_DATA and IRREVERSIBLE are not used.  Returns
   __GCONV_EMPTY_INPUT if the input was consumed up to INEND,
   __GCONV_FULL_OUTPUT if fewer than four bytes are left before OUTEND,
   and __GCONV_INCOMPLETE_INPUT otherwise.  */
static inline int
__attribute ((always_inline))
internal_ucs4_loop_unaligned (struct __gconv_step *step,
                              struct __gconv_step_data *step_data,
                              const unsigned char **inptrp,
                              const unsigned char *inend,
                              unsigned char **outptrp,
                              const unsigned char *outend,
                              size_t *irreversible)
{
  const unsigned char *inptr = *inptrp;
  unsigned char *outptr = *outptrp;
  /* Number of whole characters both buffers can accommodate.  */
  size_t n_convert = MIN (inend - inptr, outend - outptr) / 4;
  int result;

# if __BYTE_ORDER == __LITTLE_ENDIAN
  /* Sigh, we have to do some real work.  */
  size_t cnt;

  for (cnt = 0; cnt < n_convert; ++cnt, inptr += 4, outptr += 4)
    {
      /* Reverse the four bytes of the character.  */
      outptr[0] = inptr[3];
      outptr[1] = inptr[2];
      outptr[2] = inptr[1];
      outptr[3] = inptr[0];
    }

  *inptrp = inptr;
  *outptrp = outptr;
# elif __BYTE_ORDER == __BIG_ENDIAN
  /* Simply copy the data.  */
  *inptrp = inptr + n_convert * 4;
  *outptrp = __mempcpy (outptr, inptr, n_convert * 4);
# else
#  error "This endianess is not supported."
# endif

  /* Determine the status.  */
  if (*inptrp == inend)
    result = __GCONV_EMPTY_INPUT;
  else if (*outptrp + 4 > outend)
    result = __GCONV_FULL_OUTPUT;
  else
    result = __GCONV_INCOMPLETE_INPUT;

  return result;
}
#endif
165
166
/* Convert one character whose leading bytes may already be stored in
   the conversion state from the internal form to UCS4 (big endian).

   The low three bits of STEP_DATA->__statep->__count hold the number of
   bytes saved in __value.__wchb.  Bytes are appended from *INPTRP until
   four are available.  If the input runs out first, the new count is
   saved and __GCONV_INCOMPLETE_INPUT is returned.  Otherwise four bytes
   are written to *OUTPTRP, which is advanced, the stored count is
   cleared and __GCONV_OK is returned.  STEP, OUTEND and IRREVERSIBLE
   are not used.  */
static inline int
__attribute ((always_inline))
internal_ucs4_loop_single (struct __gconv_step *step,
                           struct __gconv_step_data *step_data,
                           const unsigned char **inptrp,
                           const unsigned char *inend,
                           unsigned char **outptrp,
                           const unsigned char *outend,
                           size_t *irreversible)
{
  mbstate_t *state = step_data->__statep;
  size_t cnt = state->__count & 7;

  while (*inptrp < inend && cnt < 4)
    state->__value.__wchb[cnt++] = *(*inptrp)++;

  if (__glibc_unlikely (cnt < 4))
    {
      /* Still not enough bytes.  Store the ones in the input buffer.  */
      state->__count &= ~7;
      state->__count |= cnt;

      return __GCONV_INCOMPLETE_INPUT;
    }

#if __BYTE_ORDER == __LITTLE_ENDIAN
  (*outptrp)[0] = state->__value.__wchb[3];
  (*outptrp)[1] = state->__value.__wchb[2];
  (*outptrp)[2] = state->__value.__wchb[1];
  (*outptrp)[3] = state->__value.__wchb[0];

#elif __BYTE_ORDER == __BIG_ENDIAN
  /* XXX unaligned */
  (*outptrp)[0] = state->__value.__wchb[0];
  (*outptrp)[1] = state->__value.__wchb[1];
  (*outptrp)[2] = state->__value.__wchb[2];
  (*outptrp)[3] = state->__value.__wchb[3];
#else
# error "This endianess is not supported."
#endif
  *outptrp += 4;

  /* Clear the state buffer.  */
  state->__count &= ~7;

  return __GCONV_OK;
}
214
215	#include <iconv/skeleton.c>
216
217
/* Transform from UCS4 to the internal, UCS4-like format.  Unlike
   for the other direction we have to check for correct values here.  */
#define DEFINE_INIT             0
#define DEFINE_FINI             0
#define MIN_NEEDED_FROM         4
#define MIN_NEEDED_TO           4
#define FROM_DIRECTION          1
#define FROM_LOOP               ucs4_internal_loop
#define TO_LOOP                 ucs4_internal_loop /* This is not used.  */
#define FUNCTION_NAME           __gconv_transform_ucs4_internal
#define ONE_DIRECTION           0
229
230
/* Convert complete 4-byte UCS4 (big endian) characters to the internal
   (host byte order) form, rejecting values above 0x7fffffff.  Both
   buffers are accessed through uint32_t pointers.

   *INPTRP and *OUTPTRP are advanced past the bytes consumed and
   written.  An out-of-range value yields __GCONV_ILLEGAL_INPUT, unless
   IRREVERSIBLE is non-null and STEP_DATA->__flags contains
   __GCONV_IGNORE_ERRORS; then the character is skipped and
   *IRREVERSIBLE incremented.  Otherwise returns __GCONV_EMPTY_INPUT if
   the input was consumed up to INEND, __GCONV_FULL_OUTPUT if fewer than
   four bytes are left before OUTEND, and __GCONV_INCOMPLETE_INPUT if
   only a partial character remains.  STEP is not used.  */
static inline int
__attribute ((always_inline))
ucs4_internal_loop (struct __gconv_step *step,
                    struct __gconv_step_data *step_data,
                    const unsigned char **inptrp, const unsigned char *inend,
                    unsigned char **outptrp, const unsigned char *outend,
                    size_t *irreversible)
{
  int flags = step_data->__flags;
  const unsigned char *inptr = *inptrp;
  unsigned char *outptr = *outptrp;
  int result;

  /* The bounds are re-evaluated on every iteration.  A skipped
     character consumes input without producing output, so a count
     computed up front would end the loop while input and output space
     are both still available.  */
  for (; inend - inptr >= 4 && outend - outptr >= 4; inptr += 4)
    {
      uint32_t inval;

#if __BYTE_ORDER == __LITTLE_ENDIAN
      inval = bswap_32 (*(const uint32_t *) inptr);
#else
      inval = *(const uint32_t *) inptr;
#endif

      if (__glibc_unlikely (inval > 0x7fffffff))
        {
          /* The value is too large.  We don't try transliteration here since
             this is not an error because of the lack of possibilities to
             represent the result.  This is a genuine bug in the input since
             UCS4 does not allow such values.  */
          if (irreversible == NULL)
            /* We are transliterating, don't try to correct anything.  */
            return __GCONV_ILLEGAL_INPUT;

          if (flags & __GCONV_IGNORE_ERRORS)
            {
              /* Just ignore this character.  */
              ++*irreversible;
              continue;
            }

          *inptrp = inptr;
          *outptrp = outptr;
          return __GCONV_ILLEGAL_INPUT;
        }

      *((uint32_t *) outptr) = inval;
      outptr += sizeof (uint32_t);
    }

  *inptrp = inptr;
  *outptrp = outptr;

  /* Determine the status.  */
  if (*inptrp == inend)
    result = __GCONV_EMPTY_INPUT;
  else if (*outptrp + 4 > outend)
    result = __GCONV_FULL_OUTPUT;
  else
    result = __GCONV_INCOMPLETE_INPUT;

  return result;
}
295
#if !_STRING_ARCH_unaligned
/* Variant of ucs4_internal_loop that only touches the buffers one byte
   at a time, so neither pointer has to be aligned for uint32_t access.

   A character is out of range when its first (most significant) byte
   is above 0x7f, the same set of values the aligned loop rejects with
   its test against 0x7fffffff.  Such a character yields
   __GCONV_ILLEGAL_INPUT, unless IRREVERSIBLE is non-null and
   STEP_DATA->__flags contains __GCONV_IGNORE_ERRORS; then it is skipped
   and *IRREVERSIBLE incremented.  Otherwise returns
   __GCONV_EMPTY_INPUT, __GCONV_FULL_OUTPUT or __GCONV_INCOMPLETE_INPUT
   depending on which buffer stopped the loop.  *INPTRP and *OUTPTRP are
   advanced past the bytes consumed and written.  STEP is not used.  */
static inline int
__attribute ((always_inline))
ucs4_internal_loop_unaligned (struct __gconv_step *step,
                              struct __gconv_step_data *step_data,
                              const unsigned char **inptrp,
                              const unsigned char *inend,
                              unsigned char **outptrp,
                              const unsigned char *outend,
                              size_t *irreversible)
{
  int flags = step_data->__flags;
  const unsigned char *inptr = *inptrp;
  unsigned char *outptr = *outptrp;
  int result;

  /* The bounds are re-evaluated on every iteration.  A skipped
     character consumes input without producing output, so a count
     computed up front would end the loop too early.  */
  for (; inend - inptr >= 4 && outend - outptr >= 4; inptr += 4)
    {
      if (__glibc_unlikely (inptr[0] > 0x7f))
        {
          /* The value is too large.  We don't try transliteration here since
             this is not an error because of the lack of possibilities to
             represent the result.  This is a genuine bug in the input since
             UCS4 does not allow such values.  */
          if (irreversible == NULL)
            /* We are transliterating, don't try to correct anything.  */
            return __GCONV_ILLEGAL_INPUT;

          if (flags & __GCONV_IGNORE_ERRORS)
            {
              /* Just ignore this character.  */
              ++*irreversible;
              continue;
            }

          *inptrp = inptr;
          *outptrp = outptr;
          return __GCONV_ILLEGAL_INPUT;
        }

# if __BYTE_ORDER == __LITTLE_ENDIAN
      outptr[3] = inptr[0];
      outptr[2] = inptr[1];
      outptr[1] = inptr[2];
      outptr[0] = inptr[3];
# else
      outptr[0] = inptr[0];
      outptr[1] = inptr[1];
      outptr[2] = inptr[2];
      outptr[3] = inptr[3];
# endif
      outptr += 4;
    }

  *inptrp = inptr;
  *outptrp = outptr;

  /* Determine the status.  */
  if (*inptrp == inend)
    result = __GCONV_EMPTY_INPUT;
  else if (*outptrp + 4 > outend)
    result = __GCONV_FULL_OUTPUT;
  else
    result = __GCONV_INCOMPLETE_INPUT;

  return result;
}
#endif
366
367
/* Convert one UCS4 (big endian) character whose leading bytes may
   already be stored in the conversion state to the internal form.

   The low three bits of STEP_DATA->__statep->__count hold the number of
   bytes saved in __value.__wchb.  Bytes are appended from *INPTRP until
   four are available.  If the input runs out first, the new count is
   saved and __GCONV_INCOMPLETE_INPUT is returned.  A character whose
   first (most significant) byte is above 0x7f is out of range: without
   __GCONV_IGNORE_ERRORS in STEP_DATA->__flags, *INPTRP is moved back
   over the bytes read by this call and __GCONV_ILLEGAL_INPUT is
   returned; with the flag the character is dropped.  In all remaining
   cases the stored count is cleared and __GCONV_OK is returned, after
   writing four bytes to *OUTPTRP for a valid character.  STEP, OUTEND
   and IRREVERSIBLE are not used.  */
static inline int
__attribute ((always_inline))
ucs4_internal_loop_single (struct __gconv_step *step,
                           struct __gconv_step_data *step_data,
                           const unsigned char **inptrp,
                           const unsigned char *inend,
                           unsigned char **outptrp,
                           const unsigned char *outend,
                           size_t *irreversible)
{
  mbstate_t *state = step_data->__statep;
  int flags = step_data->__flags;
  size_t cnt = state->__count & 7;

  while (*inptrp < inend && cnt < 4)
    state->__value.__wchb[cnt++] = *(*inptrp)++;

  if (__glibc_unlikely (cnt < 4))
    {
      /* Still not enough bytes.  Store the ones in the input buffer.  */
      state->__count &= ~7;
      state->__count |= cnt;

      return __GCONV_INCOMPLETE_INPUT;
    }

  if (__builtin_expect (((unsigned char *) state->__value.__wchb)[0] > 0x7f,
                        0))
    {
      /* The value is too large.  We don't try transliteration here since
         this is not an error because of the lack of possibilities to
         represent the result.  This is a genuine bug in the input since
         UCS4 does not allow such values.  */
      if (!(flags & __GCONV_IGNORE_ERRORS))
        {
          /* Give back the bytes taken from the input by this call;
             state->__count still holds the count from before it.  */
          *inptrp -= cnt - (state->__count & 7);
          return __GCONV_ILLEGAL_INPUT;
        }
    }
  else
    {
#if __BYTE_ORDER == __LITTLE_ENDIAN
      (*outptrp)[0] = state->__value.__wchb[3];
      (*outptrp)[1] = state->__value.__wchb[2];
      (*outptrp)[2] = state->__value.__wchb[1];
      (*outptrp)[3] = state->__value.__wchb[0];
#elif __BYTE_ORDER == __BIG_ENDIAN
      (*outptrp)[0] = state->__value.__wchb[0];
      (*outptrp)[1] = state->__value.__wchb[1];
      (*outptrp)[2] = state->__value.__wchb[2];
      (*outptrp)[3] = state->__value.__wchb[3];
#endif

      *outptrp += 4;
    }

  /* Clear the state buffer.  */
  state->__count &= ~7;

  return __GCONV_OK;
}
429
430	#include <iconv/skeleton.c>
431
432
/* Similarly for the little endian form.  */
#define DEFINE_INIT             0
#define DEFINE_FINI             0
#define MIN_NEEDED_FROM         4
#define MIN_NEEDED_TO           4
#define FROM_DIRECTION          1
#define FROM_LOOP               internal_ucs4le_loop
#define TO_LOOP                 internal_ucs4le_loop /* This is not used.  */
#define FUNCTION_NAME           __gconv_transform_internal_ucs4le
#define ONE_DIRECTION           0
443
444
/* Convert every complete 4-byte character that fits from the internal
   (host byte order) form to little endian UCS4.  The input is accessed
   through uint32_t pointers on big endian hosts.

   *INPTRP and *OUTPTRP are advanced past the bytes consumed and
   written.  STEP, STEP_DATA and IRREVERSIBLE are not used.  Returns
   __GCONV_EMPTY_INPUT if the input was consumed up to INEND,
   __GCONV_FULL_OUTPUT if fewer than four bytes are left before OUTEND,
   and __GCONV_INCOMPLETE_INPUT otherwise.  */
static inline int
__attribute ((always_inline))
internal_ucs4le_loop (struct __gconv_step *step,
                      struct __gconv_step_data *step_data,
                      const unsigned char **inptrp, const unsigned char *inend,
                      unsigned char **outptrp, const unsigned char *outend,
                      size_t *irreversible)
{
  const unsigned char *inptr = *inptrp;
  unsigned char *outptr = *outptrp;
  /* Number of whole characters both buffers can accommodate.  */
  size_t n_convert = MIN (inend - inptr, outend - outptr) / 4;
  int result;

#if __BYTE_ORDER == __BIG_ENDIAN
  /* Sigh, we have to do some real work.  */
  size_t cnt;
  uint32_t *outptr32 = (uint32_t *) outptr;

  for (cnt = 0; cnt < n_convert; ++cnt, inptr += 4)
    *outptr32++ = bswap_32 (*(const uint32_t *) inptr);
  outptr = (unsigned char *) outptr32;

  *inptrp = inptr;
  *outptrp = outptr;
#elif __BYTE_ORDER == __LITTLE_ENDIAN
  /* Simply copy the data.  */
  *inptrp = inptr + n_convert * 4;
  *outptrp = __mempcpy (outptr, inptr, n_convert * 4);
#else
# error "This endianess is not supported."
#endif

  /* Determine the status.  */
  if (*inptrp == inend)
    result = __GCONV_EMPTY_INPUT;
  else if (*outptrp + 4 > outend)
    result = __GCONV_FULL_OUTPUT;
  else
    result = __GCONV_INCOMPLETE_INPUT;

  return result;
}
487
#if !_STRING_ARCH_unaligned
/* Variant of internal_ucs4le_loop that only touches the buffers one
   byte at a time, so neither pointer has to be aligned for uint32_t
   access.

   *INPTRP and *OUTPTRP are advanced past the bytes consumed and
   written.  STEP, STEP_DATA and IRREVERSIBLE are not used.  Returns
   __GCONV_EMPTY_INPUT if the input was consumed up to INEND,
   __GCONV_INCOMPLETE_INPUT if fewer than four input bytes remain, and
   __GCONV_FULL_OUTPUT otherwise.  */
static inline int
__attribute ((always_inline))
internal_ucs4le_loop_unaligned (struct __gconv_step *step,
                                struct __gconv_step_data *step_data,
                                const unsigned char **inptrp,
                                const unsigned char *inend,
                                unsigned char **outptrp,
                                const unsigned char *outend,
                                size_t *irreversible)
{
  const unsigned char *inptr = *inptrp;
  unsigned char *outptr = *outptrp;
  /* Number of whole characters both buffers can accommodate.  */
  size_t n_convert = MIN (inend - inptr, outend - outptr) / 4;
  int result;

# if __BYTE_ORDER == __BIG_ENDIAN
  /* Sigh, we have to do some real work.  */
  size_t cnt;

  for (cnt = 0; cnt < n_convert; ++cnt, inptr += 4, outptr += 4)
    {
      /* Reverse the four bytes of the character.  */
      outptr[0] = inptr[3];
      outptr[1] = inptr[2];
      outptr[2] = inptr[1];
      outptr[3] = inptr[0];
    }

  *inptrp = inptr;
  *outptrp = outptr;
# elif __BYTE_ORDER == __LITTLE_ENDIAN
  /* Simply copy the data.  */
  *inptrp = inptr + n_convert * 4;
  *outptrp = __mempcpy (outptr, inptr, n_convert * 4);
# else
#  error "This endianess is not supported."
# endif

  /* Determine the status.  */
  if (*inptrp == inend)
    result = __GCONV_EMPTY_INPUT;
  else if (*inptrp + 4 > inend)
    result = __GCONV_INCOMPLETE_INPUT;
  else
    {
      /* A whole character is still pending, so the output must be what
         stopped the conversion.  */
      assert (*outptrp + 4 > outend);
      result = __GCONV_FULL_OUTPUT;
    }

  return result;
}
#endif
540
541
/* Convert one character whose leading bytes may already be stored in
   the conversion state from the internal form to little endian UCS4.

   The low three bits of STEP_DATA->__statep->__count hold the number of
   bytes saved in __value.__wchb.  Bytes are appended from *INPTRP until
   four are available.  If the input runs out first, the new count is
   saved and __GCONV_INCOMPLETE_INPUT is returned.  Otherwise four bytes
   are written to *OUTPTRP, which is advanced, the stored count is
   cleared and __GCONV_OK is returned.  STEP, OUTEND and IRREVERSIBLE
   are not used.  */
static inline int
__attribute ((always_inline))
internal_ucs4le_loop_single (struct __gconv_step *step,
                             struct __gconv_step_data *step_data,
                             const unsigned char **inptrp,
                             const unsigned char *inend,
                             unsigned char **outptrp,
                             const unsigned char *outend,
                             size_t *irreversible)
{
  mbstate_t *state = step_data->__statep;
  size_t cnt = state->__count & 7;

  while (*inptrp < inend && cnt < 4)
    state->__value.__wchb[cnt++] = *(*inptrp)++;

  if (__glibc_unlikely (cnt < 4))
    {
      /* Still not enough bytes.  Store the ones in the input buffer.  */
      state->__count &= ~7;
      state->__count |= cnt;

      return __GCONV_INCOMPLETE_INPUT;
    }

#if __BYTE_ORDER == __BIG_ENDIAN
  (*outptrp)[0] = state->__value.__wchb[3];
  (*outptrp)[1] = state->__value.__wchb[2];
  (*outptrp)[2] = state->__value.__wchb[1];
  (*outptrp)[3] = state->__value.__wchb[0];

#else
  /* XXX unaligned */
  (*outptrp)[0] = state->__value.__wchb[0];
  (*outptrp)[1] = state->__value.__wchb[1];
  (*outptrp)[2] = state->__value.__wchb[2];
  (*outptrp)[3] = state->__value.__wchb[3];

#endif

  *outptrp += 4;

  /* Clear the state buffer.  */
  state->__count &= ~7;

  return __GCONV_OK;
}
589
590	#include <iconv/skeleton.c>
591
592
/* And finally from UCS4-LE to the internal encoding.  */
#define DEFINE_INIT             0
#define DEFINE_FINI             0
#define MIN_NEEDED_FROM         4
#define MIN_NEEDED_TO           4
#define FROM_DIRECTION          1
#define FROM_LOOP               ucs4le_internal_loop
#define TO_LOOP                 ucs4le_internal_loop /* This is not used.  */
#define FUNCTION_NAME           __gconv_transform_ucs4le_internal
#define ONE_DIRECTION           0
603
604
/* Convert complete 4-byte little endian UCS4 characters to the internal
   (host byte order) form, rejecting values above 0x7fffffff.  Both
   buffers are accessed through uint32_t pointers.

   *INPTRP and *OUTPTRP are advanced past the bytes consumed and
   written.  An out-of-range value yields __GCONV_ILLEGAL_INPUT, unless
   IRREVERSIBLE is non-null and STEP_DATA->__flags contains
   __GCONV_IGNORE_ERRORS; then the character is skipped and
   *IRREVERSIBLE incremented.  Otherwise returns __GCONV_EMPTY_INPUT if
   the input was consumed up to INEND, __GCONV_INCOMPLETE_INPUT if fewer
   than four input bytes remain, and __GCONV_FULL_OUTPUT otherwise.
   STEP is not used.  */
static inline int
__attribute ((always_inline))
ucs4le_internal_loop (struct __gconv_step *step,
                      struct __gconv_step_data *step_data,
                      const unsigned char **inptrp, const unsigned char *inend,
                      unsigned char **outptrp, const unsigned char *outend,
                      size_t *irreversible)
{
  int flags = step_data->__flags;
  const unsigned char *inptr = *inptrp;
  unsigned char *outptr = *outptrp;
  int result;

  /* The bounds are re-evaluated on every iteration.  A skipped
     character consumes input without producing output, so a count
     computed up front would end the loop while input and output space
     are both still available, and the assertion below would fail.  */
  for (; inend - inptr >= 4 && outend - outptr >= 4; inptr += 4)
    {
      uint32_t inval;

#if __BYTE_ORDER == __BIG_ENDIAN
      inval = bswap_32 (*(const uint32_t *) inptr);
#else
      inval = *(const uint32_t *) inptr;
#endif

      if (__glibc_unlikely (inval > 0x7fffffff))
        {
          /* The value is too large.  We don't try transliteration here since
             this is not an error because of the lack of possibilities to
             represent the result.  This is a genuine bug in the input since
             UCS4 does not allow such values.  */
          if (irreversible == NULL)
            /* We are transliterating, don't try to correct anything.  */
            return __GCONV_ILLEGAL_INPUT;

          if (flags & __GCONV_IGNORE_ERRORS)
            {
              /* Just ignore this character.  */
              ++*irreversible;
              continue;
            }

          *inptrp = inptr;
          *outptrp = outptr;
          return __GCONV_ILLEGAL_INPUT;
        }

      *((uint32_t *) outptr) = inval;
      outptr += sizeof (uint32_t);
    }

  *inptrp = inptr;
  *outptrp = outptr;

  /* Determine the status.  */
  if (*inptrp == inend)
    result = __GCONV_EMPTY_INPUT;
  else if (*inptrp + 4 > inend)
    result = __GCONV_INCOMPLETE_INPUT;
  else
    {
      /* A whole character is still pending, so the output must be what
         stopped the conversion.  */
      assert (*outptrp + 4 > outend);
      result = __GCONV_FULL_OUTPUT;
    }

  return result;
}
672
#if !_STRING_ARCH_unaligned
/* Variant of ucs4le_internal_loop that only touches the buffers one
   byte at a time, so neither pointer has to be aligned for uint32_t
   access.

   A character is out of range when its last (most significant) byte is
   above 0x7f, the same set of values the aligned loop rejects with its
   test against 0x7fffffff.  Such a character yields
   __GCONV_ILLEGAL_INPUT, unless IRREVERSIBLE is non-null and
   STEP_DATA->__flags contains __GCONV_IGNORE_ERRORS; then it is skipped
   and *IRREVERSIBLE incremented.  Otherwise returns
   __GCONV_EMPTY_INPUT, __GCONV_INCOMPLETE_INPUT or __GCONV_FULL_OUTPUT
   depending on which buffer stopped the loop.  *INPTRP and *OUTPTRP are
   advanced past the bytes consumed and written.  STEP is not used.  */
static inline int
__attribute ((always_inline))
ucs4le_internal_loop_unaligned (struct __gconv_step *step,
                                struct __gconv_step_data *step_data,
                                const unsigned char **inptrp,
                                const unsigned char *inend,
                                unsigned char **outptrp,
                                const unsigned char *outend,
                                size_t *irreversible)
{
  int flags = step_data->__flags;
  const unsigned char *inptr = *inptrp;
  unsigned char *outptr = *outptrp;
  int result;

  /* The bounds are re-evaluated on every iteration.  A skipped
     character consumes input without producing output, so a count
     computed up front would end the loop too early and the assertion
     below would fail.  */
  for (; inend - inptr >= 4 && outend - outptr >= 4; inptr += 4)
    {
      if (__glibc_unlikely (inptr[3] > 0x7f))
        {
          /* The value is too large.  We don't try transliteration here since
             this is not an error because of the lack of possibilities to
             represent the result.  This is a genuine bug in the input since
             UCS4 does not allow such values.  */
          if (irreversible == NULL)
            /* We are transliterating, don't try to correct anything.  */
            return __GCONV_ILLEGAL_INPUT;

          if (flags & __GCONV_IGNORE_ERRORS)
            {
              /* Just ignore this character.  */
              ++*irreversible;
              continue;
            }

          *inptrp = inptr;
          *outptrp = outptr;
          return __GCONV_ILLEGAL_INPUT;
        }

# if __BYTE_ORDER == __BIG_ENDIAN
      outptr[3] = inptr[0];
      outptr[2] = inptr[1];
      outptr[1] = inptr[2];
      outptr[0] = inptr[3];
# else
      outptr[0] = inptr[0];
      outptr[1] = inptr[1];
      outptr[2] = inptr[2];
      outptr[3] = inptr[3];
# endif

      outptr += 4;
    }

  *inptrp = inptr;
  *outptrp = outptr;

  /* Determine the status.  */
  if (*inptrp == inend)
    result = __GCONV_EMPTY_INPUT;
  else if (*inptrp + 4 > inend)
    result = __GCONV_INCOMPLETE_INPUT;
  else
    {
      /* A whole character is still pending, so the output must be what
         stopped the conversion.  */
      assert (*outptrp + 4 > outend);
      result = __GCONV_FULL_OUTPUT;
    }

  return result;
}
#endif
747
748
/* Convert one little endian UCS4 character whose leading bytes may
   already be stored in the conversion state to the internal form.

   The low three bits of STEP_DATA->__statep->__count hold the number of
   bytes saved in __value.__wchb.  Bytes are appended from *INPTRP until
   four are available.  If the input runs out first, the new count is
   saved and __GCONV_INCOMPLETE_INPUT is returned.  A character whose
   last (most significant) byte is above 0x7f is out of range: without
   __GCONV_IGNORE_ERRORS in STEP_DATA->__flags __GCONV_ILLEGAL_INPUT is
   returned; with the flag the character is dropped.  In all remaining
   cases the stored count is cleared and __GCONV_OK is returned, after
   writing four bytes to *OUTPTRP for a valid character.  STEP, OUTEND
   and IRREVERSIBLE are not used.  */
static inline int
__attribute ((always_inline))
ucs4le_internal_loop_single (struct __gconv_step *step,
                             struct __gconv_step_data *step_data,
                             const unsigned char **inptrp,
                             const unsigned char *inend,
                             unsigned char **outptrp,
                             const unsigned char *outend,
                             size_t *irreversible)
{
  mbstate_t *state = step_data->__statep;
  int flags = step_data->__flags;
  size_t cnt = state->__count & 7;

  while (*inptrp < inend && cnt < 4)
    state->__value.__wchb[cnt++] = *(*inptrp)++;

  if (__glibc_unlikely (cnt < 4))
    {
      /* Still not enough bytes.  Store the ones in the input buffer.  */
      state->__count &= ~7;
      state->__count |= cnt;

      return __GCONV_INCOMPLETE_INPUT;
    }

  if (__builtin_expect (((unsigned char *) state->__value.__wchb)[3] > 0x7f,
                        0))
    {
      /* The value is too large.  We don't try transliteration here since
         this is not an error because of the lack of possibilities to
         represent the result.  This is a genuine bug in the input since
         UCS4 does not allow such values.  */
      if (!(flags & __GCONV_IGNORE_ERRORS))
        return __GCONV_ILLEGAL_INPUT;
    }
  else
    {
#if __BYTE_ORDER == __BIG_ENDIAN
      (*outptrp)[0] = state->__value.__wchb[3];
      (*outptrp)[1] = state->__value.__wchb[2];
      (*outptrp)[2] = state->__value.__wchb[1];
      (*outptrp)[3] = state->__value.__wchb[0];
#else
      (*outptrp)[0] = state->__value.__wchb[0];
      (*outptrp)[1] = state->__value.__wchb[1];
      (*outptrp)[2] = state->__value.__wchb[2];
      (*outptrp)[3] = state->__value.__wchb[3];
#endif

      *outptrp += 4;
    }

  /* Clear the state buffer.  */
  state->__count &= ~7;

  return __GCONV_OK;
}
807
808	#include <iconv/skeleton.c>
809
810
811	/ Convert from ISO 646-IRV to the internal (UCS4-like) format. /
812	#define DEFINE_INIT 0
813	#define DEFINE_FINI 0
814	#define MIN_NEEDED_FROM 1
815	#define MIN_NEEDED_TO 4
816	#define FROM_DIRECTION 1
817	#define FROM_LOOP ascii_internal_loop
818	#define TO_LOOP ascii_internal_loop /* This is not used. */
819	#define FUNCTION_NAME __gconv_transform_ascii_internal
820	#define ONE_DIRECTION 1
821
822	#define MIN_NEEDED_INPUT MIN_NEEDED_FROM
823	#define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
824	#define LOOPFCT FROM_LOOP
825	#define BODY \
826	{ \
827	if (__glibc_unlikely (*inptr > '\x7f')) \
828	{ \
829	/* The value is too large. We don't try transliteration here since \
830	this is not an error because of the lack of possibilities to \
831	represent the result. This is a genuine bug in the input since \
832	ASCII does not allow such values. */ \
833	STANDARD_FROM_LOOP_ERR_HANDLER (1); \
834	} \
835	else \
836	{ \
837	/* It's an one byte sequence. */ \
838	((uint32_t ) outptr) = *inptr++; \
839	outptr += sizeof (uint32_t); \
840	} \
841	}
842	#define LOOP_NEED_FLAGS
843	#include <iconv/loop.c>
844	#include <iconv/skeleton.c>
845
846
847	/ Convert from the internal (UCS4-like) format to ISO 646-IRV. /
848	#define DEFINE_INIT 0
849	#define DEFINE_FINI 0
850	#define MIN_NEEDED_FROM 4
851	#define MIN_NEEDED_TO 1
852	#define FROM_DIRECTION 1
853	#define FROM_LOOP internal_ascii_loop
854	#define TO_LOOP internal_ascii_loop /* This is not used. */
855	#define FUNCTION_NAME __gconv_transform_internal_ascii
856	#define ONE_DIRECTION 1
857
858	#define MIN_NEEDED_INPUT MIN_NEEDED_FROM
859	#define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
860	#define LOOPFCT FROM_LOOP
861	#define BODY \
862	{ \
863	if (__glibc_unlikely (((const uint32_t ) inptr) > 0x7f)) \
864	{ \
865	UNICODE_TAG_HANDLER (((const uint32_t ) inptr), 4); \
866	STANDARD_TO_LOOP_ERR_HANDLER (4); \
867	} \
868	else \
869	{ \
870	/* It's an one byte sequence. */ \
871	outptr++ = ((const uint32_t *) inptr); \
872	inptr += sizeof (uint32_t); \
873	} \
874	}
875	#define LOOP_NEED_FLAGS
876	#include <iconv/loop.c>
877	#include <iconv/skeleton.c>
878
879
880	/ Convert from the internal (UCS4-like) format to UTF-8. /
881	#define DEFINE_INIT 0
882	#define DEFINE_FINI 0
883	#define MIN_NEEDED_FROM 4
884	#define MIN_NEEDED_TO 1
885	#define MAX_NEEDED_TO 6
886	#define FROM_DIRECTION 1
887	#define FROM_LOOP internal_utf8_loop
888	#define TO_LOOP internal_utf8_loop /* This is not used. */
889	#define FUNCTION_NAME __gconv_transform_internal_utf8
890	#define ONE_DIRECTION 1
891
892	#define MIN_NEEDED_INPUT MIN_NEEDED_FROM
893	#define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
894	#define MAX_NEEDED_OUTPUT MAX_NEEDED_TO
895	#define LOOPFCT FROM_LOOP
896	#define BODY \
897	{ \
898	uint32_t wc = ((const uint32_t ) inptr); \
899	\
900	if (__glibc_likely (wc < 0x80)) \
901	/* It's an one byte sequence. */ \
902	*outptr++ = (unsigned char) wc; \
903	else if (__glibc_likely (wc <= 0x7fffffff \
904	&& (wc < 0xd800 \|\| wc > 0xdfff))) \
905	{ \
906	size_t step; \
907	unsigned char *start; \
908	\
909	for (step = 2; step < 6; ++step) \
910	if ((wc & (~(uint32_t)0 << (5 * step + 1))) == 0) \
911	break; \
912	\
913	if (__glibc_unlikely (outptr + step > outend)) \
914	{ \
915	/* Too long. */ \
916	result = __GCONV_FULL_OUTPUT; \
917	break; \
918	} \
919	\
920	start = outptr; \
921	*outptr = (unsigned char) (~0xff >> step); \
922	outptr += step; \
923	do \
924	{ \
925	start[--step] = 0x80 \| (wc & 0x3f); \
926	wc >>= 6; \
927	} \
928	while (step > 1); \
929	start[0] \|= wc; \
930	} \
931	else \
932	{ \
933	STANDARD_TO_LOOP_ERR_HANDLER (4); \
934	} \
935	\
936	inptr += 4; \
937	}
938	#define LOOP_NEED_FLAGS
939	#include <iconv/loop.c>
940	#include <iconv/skeleton.c>
941
942
/* Convert from UTF-8 to the internal (UCS4-like) format.  */
#define DEFINE_INIT             0
#define DEFINE_FINI             0
#define MIN_NEEDED_FROM         1
#define MAX_NEEDED_FROM         6
#define MIN_NEEDED_TO           4
#define FROM_DIRECTION          1
#define FROM_LOOP               utf8_internal_loop
#define TO_LOOP                 utf8_internal_loop /* This is not used.  */
#define FUNCTION_NAME           __gconv_transform_utf8_internal
#define ONE_DIRECTION           1

#define MIN_NEEDED_INPUT        MIN_NEEDED_FROM
#define MAX_NEEDED_INPUT        MAX_NEEDED_FROM
#define MIN_NEEDED_OUTPUT       MIN_NEEDED_TO
#define LOOPFCT                 FROM_LOOP
/* Per-character body: decode one UTF-8 sequence of one to six bytes
   and store it as a 32-bit value.  Invalid lead bytes, bad or missing
   trail bytes, overlong forms and surrogates go to the error handler;
   a sequence cut off by the end of the input is reported as
   __GCONV_INCOMPLETE_INPUT.  */
#define BODY \
  { \
    /* Next input byte.  */ \
    uint32_t ch = *inptr; \
 \
    if (__glibc_likely (ch < 0x80)) \
      { \
        /* One byte sequence.  */ \
        ++inptr; \
      } \
    else \
      { \
        uint_fast32_t cnt; \
        uint_fast32_t i; \
 \
        if (ch >= 0xc2 && ch < 0xe0) \
          { \
            /* We expect two bytes.  The first byte cannot be 0xc0 or 0xc1, \
               otherwise the wide character could have been represented \
               using a single byte.  */ \
            cnt = 2; \
            ch &= 0x1f; \
          } \
        else if (__glibc_likely ((ch & 0xf0) == 0xe0)) \
          { \
            /* We expect three bytes.  */ \
            cnt = 3; \
            ch &= 0x0f; \
          } \
        else if (__glibc_likely ((ch & 0xf8) == 0xf0)) \
          { \
            /* We expect four bytes.  */ \
            cnt = 4; \
            ch &= 0x07; \
          } \
        else if (__glibc_likely ((ch & 0xfc) == 0xf8)) \
          { \
            /* We expect five bytes.  */ \
            cnt = 5; \
            ch &= 0x03; \
          } \
        else if (__glibc_likely ((ch & 0xfe) == 0xfc)) \
          { \
            /* We expect six bytes.  */ \
            cnt = 6; \
            ch &= 0x01; \
          } \
        else \
          { \
            /* Search the end of this ill-formed UTF-8 character.  This \
               is the next byte with (x & 0xc0) != 0x80.  */ \
            i = 0; \
            do \
              ++i; \
            while (inptr + i < inend \
                   && (*(inptr + i) & 0xc0) == 0x80 \
                   && i < 5); \
 \
          errout: \
            STANDARD_FROM_LOOP_ERR_HANDLER (i); \
          } \
 \
        if (__glibc_unlikely (inptr + cnt > inend)) \
          { \
            /* We don't have enough input.  But before we report that check \
               that all the bytes are correct.  */ \
            for (i = 1; inptr + i < inend; ++i) \
              if ((inptr[i] & 0xc0) != 0x80) \
                break; \
 \
            if (__glibc_likely (inptr + i == inend)) \
              { \
                result = __GCONV_INCOMPLETE_INPUT; \
                break; \
              } \
 \
            goto errout; \
          } \
 \
        /* Read the possible remaining bytes.  */ \
        for (i = 1; i < cnt; ++i) \
          { \
            uint32_t byte = inptr[i]; \
 \
            if ((byte & 0xc0) != 0x80) \
              /* This is an illegal encoding.  */ \
              break; \
 \
            ch <<= 6; \
            ch |= byte & 0x3f; \
          } \
 \
        /* If i < cnt, some trail byte was not >= 0x80, < 0xc0. \
           If cnt > 2 and ch < 2^(5*cnt-4), the wide character ch could \
           have been represented with fewer than cnt bytes.  */ \
        if (i < cnt || (cnt > 2 && (ch >> (5 * cnt - 4)) == 0) \
            /* Do not accept UTF-16 surrogates.  */ \
            || (ch >= 0xd800 && ch <= 0xdfff)) \
          { \
            /* This is an illegal encoding.  */ \
            goto errout; \
          } \
 \
        inptr += cnt; \
      } \
 \
    /* Now adjust the pointers and store the result.  */ \
    *((uint32_t *) outptr) = ch; \
    outptr += sizeof (uint32_t); \
  }
#define LOOP_NEED_FLAGS
1070
1071	#define STORE_REST \
1072	{ \
1073	/* We store the remaining bytes while converting them into the UCS4 \
1074	format. We can assume that the first byte in the buffer is \
1075	correct and that it requires a larger number of bytes than there \
1076	are in the input buffer. */ \
1077	wint_t ch = **inptrp; \
1078	size_t cnt, r; \
1079	\
1080	state->__count = inend - *inptrp; \
1081	\
1082	assert (ch != 0xc0 && ch != 0xc1); \
1083	if (ch >= 0xc2 && ch < 0xe0) \
1084	{ \
1085	/* We expect two bytes. The first byte cannot be 0xc0 or \
1086	0xc1, otherwise the wide character could have been \
1087	represented using a single byte. */ \
1088	cnt = 2; \
1089	ch &= 0x1f; \
1090	} \
1091	else if (__glibc_likely ((ch & 0xf0) == 0xe0)) \
1092	{ \
1093	/* We expect three bytes. */ \
1094	cnt = 3; \
1095	ch &= 0x0f; \
1096	} \
1097	else if (__glibc_likely ((ch & 0xf8) == 0xf0)) \
1098	{ \
1099	/* We expect four bytes. */ \
1100	cnt = 4; \
1101	ch &= 0x07; \
1102	} \
1103	else if (__glibc_likely ((ch & 0xfc) == 0xf8)) \
1104	{ \
1105	/* We expect five bytes. */ \
1106	cnt = 5; \
1107	ch &= 0x03; \
1108	} \
1109	else \
1110	{ \
1111	/* We expect six bytes. */ \
1112	cnt = 6; \
1113	ch &= 0x01; \
1114	} \
1115	\
1116	/* The first byte is already consumed. */ \
1117	r = cnt - 1; \
1118	while (++(*inptrp) < inend) \
1119	{ \
1120	ch <<= 6; \
1121	ch \|= **inptrp & 0x3f; \
1122	--r; \
1123	} \
1124	\
1125	/* Shift for the so far missing bytes. */ \
1126	ch <<= r * 6; \
1127	\
1128	/* Store the number of bytes expected for the entire sequence. */ \
1129	state->__count \|= cnt << 8; \
1130	\
1131	/* Store the value. */ \
1132	state->__value.__wch = ch; \
1133	}
1134
/* Rebuild in BYTEBUF the input bytes of a partial UTF-8 sequence that
   was saved in STATE.

   The total sequence length is taken from STATE->__count >> 8 and the
   number of bytes that had been seen from STATE->__count & 255; the
   latter is also stored in INLEN.  BYTEBUF[0] receives the lead byte
   (length marker from INMASK combined with the top payload bits) and
   BYTEBUF[1] .. BYTEBUF[INLEN - 1] receive continuation bytes of the
   form 0x80 | six payload bits.  */
#define UNPACK_BYTES \
  {                                                                     \
    static const unsigned char inmask[5] = { 0xc0, 0xe0, 0xf0, 0xf8, 0xfc }; \
    wint_t wch = state->__value.__wch;                                  \
    size_t ntotal = state->__count >> 8;                                \
                                                                        \
    inlen = state->__count & 255;                                       \
                                                                        \
    /* INMASK is indexed by sequence length minus two.  */              \
    bytebuf[0] = inmask[ntotal - 2];                                    \
                                                                        \
    /* Walk from the last position towards the lead byte; only the      \
       positions below INLEN are written, the payload bits of the      \
       others are just shifted out.  */                                 \
    do                                                                  \
      {                                                                 \
        if (--ntotal < inlen)                                           \
          bytebuf[ntotal] = 0x80 | (wch & 0x3f);                        \
        wch >>= 6;                                                      \
      }                                                                 \
    while (ntotal > 1);                                                 \
                                                                        \
    /* What is left of WCH are the payload bits of the lead byte.  */   \
    bytebuf[0] |= wch;                                                  \
  }
1155
/* Reset STATE->__count to zero (STORE_REST keeps the byte counts of a
   partial sequence in that field).  */
#define CLEAR_STATE	state->__count = 0
1158
1159
1160	#include <iconv/loop.c>
1161	#include <iconv/skeleton.c>
1162
1163
1164	/ Convert from UCS2 to the internal (UCS4-like) format. /
1165	#define DEFINE_INIT 0
1166	#define DEFINE_FINI 0
1167	#define MIN_NEEDED_FROM 2
1168	#define MIN_NEEDED_TO 4
1169	#define FROM_DIRECTION 1
1170	#define FROM_LOOP ucs2_internal_loop
1171	#define TO_LOOP ucs2_internal_loop /* This is not used. */
1172	#define FUNCTION_NAME __gconv_transform_ucs2_internal
1173	#define ONE_DIRECTION 1
1174
1175	#define MIN_NEEDED_INPUT MIN_NEEDED_FROM
1176	#define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
1177	#define LOOPFCT FROM_LOOP
1178	#define BODY \
1179	{ \
1180	uint16_t u1 = get16 (inptr); \
1181	\
1182	if (__glibc_unlikely (u1 >= 0xd800 && u1 < 0xe000)) \
1183	{ \
1184	/* Surrogate characters in UCS-2 input are not valid. Reject \
1185	them. (Catching this here is not security relevant.) */ \
1186	STANDARD_FROM_LOOP_ERR_HANDLER (2); \
1187	} \
1188	\
1189	((uint32_t ) outptr) = u1; \
1190	outptr += sizeof (uint32_t); \
1191	inptr += 2; \
1192	}
1193	#define LOOP_NEED_FLAGS
1194	#include <iconv/loop.c>
1195	#include <iconv/skeleton.c>
1196
1197
1198	/ Convert from the internal (UCS4-like) format to UCS2. /
1199	#define DEFINE_INIT 0
1200	#define DEFINE_FINI 0
1201	#define MIN_NEEDED_FROM 4
1202	#define MIN_NEEDED_TO 2
1203	#define FROM_DIRECTION 1
1204	#define FROM_LOOP internal_ucs2_loop
1205	#define TO_LOOP internal_ucs2_loop /* This is not used. */
1206	#define FUNCTION_NAME __gconv_transform_internal_ucs2
1207	#define ONE_DIRECTION 1
1208
1209	#define MIN_NEEDED_INPUT MIN_NEEDED_FROM
1210	#define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
1211	#define LOOPFCT FROM_LOOP
1212	#define BODY \
1213	{ \
1214	uint32_t val = ((const uint32_t ) inptr); \
1215	\
1216	if (__glibc_unlikely (val >= 0x10000)) \
1217	{ \
1218	UNICODE_TAG_HANDLER (val, 4); \
1219	STANDARD_TO_LOOP_ERR_HANDLER (4); \
1220	} \
1221	else if (__glibc_unlikely (val >= 0xd800 && val < 0xe000)) \
1222	{ \
1223	/* Surrogate characters in UCS-4 input are not valid. \
1224	We must catch this, because the UCS-2 output might be \
1225	interpreted as UTF-16 by other programs. If we let \
1226	surrogates pass through, attackers could make a security \
1227	hole exploit by synthesizing any desired plane 1-16 \
1228	character. */ \
1229	result = __GCONV_ILLEGAL_INPUT; \
1230	if (! ignore_errors_p ()) \
1231	break; \
1232	inptr += 4; \
1233	++*irreversible; \
1234	continue; \
1235	} \
1236	else \
1237	{ \
1238	put16 (outptr, val); \
1239	outptr += sizeof (uint16_t); \
1240	inptr += 4; \
1241	} \
1242	}
1243	#define LOOP_NEED_FLAGS
1244	#include <iconv/loop.c>
1245	#include <iconv/skeleton.c>
1246
1247
1248	/ Convert from UCS2 in other endianness to the internal (UCS4-like) format. /
1249	#define DEFINE_INIT 0
1250	#define DEFINE_FINI 0
1251	#define MIN_NEEDED_FROM 2
1252	#define MIN_NEEDED_TO 4
1253	#define FROM_DIRECTION 1
1254	#define FROM_LOOP ucs2reverse_internal_loop
1255	#define TO_LOOP ucs2reverse_internal_loop/* This is not used.*/
1256	#define FUNCTION_NAME __gconv_transform_ucs2reverse_internal
1257	#define ONE_DIRECTION 1
1258
1259	#define MIN_NEEDED_INPUT MIN_NEEDED_FROM
1260	#define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
1261	#define LOOPFCT FROM_LOOP
1262	#define BODY \
1263	{ \
1264	uint16_t u1 = bswap_16 (get16 (inptr)); \
1265	\
1266	if (__glibc_unlikely (u1 >= 0xd800 && u1 < 0xe000)) \
1267	{ \
1268	/* Surrogate characters in UCS-2 input are not valid. Reject \
1269	them. (Catching this here is not security relevant.) */ \
1270	if (! ignore_errors_p ()) \
1271	{ \
1272	result = __GCONV_ILLEGAL_INPUT; \
1273	break; \
1274	} \
1275	inptr += 2; \
1276	++*irreversible; \
1277	continue; \
1278	} \
1279	\
1280	((uint32_t ) outptr) = u1; \
1281	outptr += sizeof (uint32_t); \
1282	inptr += 2; \
1283	}
1284	#define LOOP_NEED_FLAGS
1285	#include <iconv/loop.c>
1286	#include <iconv/skeleton.c>
1287
1288
1289	/ Convert from the internal (UCS4-like) format to UCS2 in other endianness. /
1290	#define DEFINE_INIT 0
1291	#define DEFINE_FINI 0
1292	#define MIN_NEEDED_FROM 4
1293	#define MIN_NEEDED_TO 2
1294	#define FROM_DIRECTION 1
1295	#define FROM_LOOP internal_ucs2reverse_loop
1296	#define TO_LOOP internal_ucs2reverse_loop/* This is not used.*/
1297	#define FUNCTION_NAME __gconv_transform_internal_ucs2reverse
1298	#define ONE_DIRECTION 1
1299
1300	#define MIN_NEEDED_INPUT MIN_NEEDED_FROM
1301	#define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
1302	#define LOOPFCT FROM_LOOP
1303	#define BODY \
1304	{ \
1305	uint32_t val = ((const uint32_t ) inptr); \
1306	if (__glibc_unlikely (val >= 0x10000)) \
1307	{ \
1308	UNICODE_TAG_HANDLER (val, 4); \
1309	STANDARD_TO_LOOP_ERR_HANDLER (4); \
1310	} \
1311	else if (__glibc_unlikely (val >= 0xd800 && val < 0xe000)) \
1312	{ \
1313	/* Surrogate characters in UCS-4 input are not valid. \
1314	We must catch this, because the UCS-2 output might be \
1315	interpreted as UTF-16 by other programs. If we let \
1316	surrogates pass through, attackers could make a security \
1317	hole exploit by synthesizing any desired plane 1-16 \
1318	character. */ \
1319	if (! ignore_errors_p ()) \
1320	{ \
1321	result = __GCONV_ILLEGAL_INPUT; \
1322	break; \
1323	} \
1324	inptr += 4; \
1325	++*irreversible; \
1326	continue; \
1327	} \
1328	else \
1329	{ \
1330	put16 (outptr, bswap_16 (val)); \
1331	outptr += sizeof (uint16_t); \
1332	inptr += 4; \
1333	} \
1334	}
1335	#define LOOP_NEED_FLAGS
1336	#include <iconv/loop.c>
1337	#include <iconv/skeleton.c>
1338

Browse the source code of glibc_src_2.31/iconv/gconv_simple.c