/* Conversion loop framework.
2 | Copyright (C) 1998-2017 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | Contributed by Ulrich Drepper <drepper@cygnus.com>, 1998. |
5 | |
6 | The GNU C Library is free software; you can redistribute it and/or |
7 | modify it under the terms of the GNU Lesser General Public |
8 | License as published by the Free Software Foundation; either |
9 | version 2.1 of the License, or (at your option) any later version. |
10 | |
11 | The GNU C Library is distributed in the hope that it will be useful, |
12 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
14 | Lesser General Public License for more details. |
15 | |
16 | You should have received a copy of the GNU Lesser General Public |
17 | License along with the GNU C Library; if not, see |
18 | <http://www.gnu.org/licenses/>. */ |
19 | |
20 | /* This file provides a frame for the reader loop in all conversion modules. |
21 | The actual code must (of course) be provided in the actual module source |
22 | code but certain actions can be written down generically, with some |
23 | customization options which are these: |
24 | |
25 | MIN_NEEDED_INPUT minimal number of input bytes needed for the next |
26 | conversion. |
27 | MIN_NEEDED_OUTPUT minimal number of bytes produced by the next round |
28 | of conversion. |
29 | |
     MAX_NEEDED_INPUT   you guessed it, this is the maximal number of input
                        bytes needed.  It defaults to MIN_NEEDED_INPUT.
32 | MAX_NEEDED_OUTPUT likewise for output bytes. |
33 | |
34 | LOOPFCT name of the function created. If not specified |
35 | the name is `loop' but this prevents the use |
36 | of multiple functions in the same file. |
37 | |
38 | BODY this is supposed to expand to the body of the loop. |
39 | The user must provide this. |
40 | |
41 | EXTRA_LOOP_DECLS extra arguments passed from conversion loop call. |
42 | |
43 | INIT_PARAMS code to define and initialize variables from params. |
44 | UPDATE_PARAMS code to store result in params. |
45 | |
46 | ONEBYTE_BODY body of the specialized conversion function for a |
47 | single byte from the current character set to INTERNAL. |
48 | */ |
49 | |
50 | #include <assert.h> |
51 | #include <endian.h> |
52 | #include <gconv.h> |
53 | #include <stdint.h> |
54 | #include <string.h> |
55 | #include <wchar.h> |
56 | #include <sys/param.h> /* For MIN. */ |
57 | #define __need_size_t |
58 | #include <stddef.h> |
59 | #include <libc-internal.h> |
60 | |
/* We have to provide support for machines which are not able to handle
   unaligned memory accesses.  Some of the character encodings have
   representations with a fixed width of 2 or 4 bytes.  But if we cannot
   access unaligned memory we still have to read byte-wise.

   get16/get32 load and put16/put32 store a 16- resp. 32-bit value in host
   byte order at ADDR.  FCTNAME maps a loop function name to the variant
   currently being compiled: the plain name when direct accesses are used,
   the name with an `_unaligned' suffix for the byte-wise variant.  */
#undef FCTNAME2
#if _STRING_ARCH_unaligned || !defined DEFINE_UNALIGNED
/* We can handle unaligned memory access.  */
# define get16(addr) (*((const uint16_t *) (addr)))
# define get32(addr) (*((const uint32_t *) (addr)))

/* We need no special support for writing values either.  */
# define put16(addr, val) (*((uint16_t *) (addr)) = (val))
# define put32(addr, val) (*((uint32_t *) (addr)) = (val))

# define FCTNAME2(name) name
#else
/* Distinguish between big endian and little endian.  */
# if __BYTE_ORDER == __LITTLE_ENDIAN
#  define get16(addr) \
     (((const unsigned char *) (addr))[1] << 8 \
      | ((const unsigned char *) (addr))[0])
/* The most significant byte is widened to uint32_t before it is shifted so
   that a byte >= 0x80 never ends up in the sign bit of an int.  */
#  define get32(addr) \
     ((((uint32_t) ((const unsigned char *) (addr))[3] << 8 \
        | ((const unsigned char *) (addr))[2]) << 8 \
       | ((const unsigned char *) (addr))[1]) << 8 \
      | ((const unsigned char *) (addr))[0])

#  define put16(addr, val) \
     ({ uint16_t __val = (val); \
        ((unsigned char *) (addr))[0] = __val; \
        ((unsigned char *) (addr))[1] = __val >> 8; \
        (void) 0; })
#  define put32(addr, val) \
     ({ uint32_t __val = (val); \
        ((unsigned char *) (addr))[0] = __val; \
        __val >>= 8; \
        ((unsigned char *) (addr))[1] = __val; \
        __val >>= 8; \
        ((unsigned char *) (addr))[2] = __val; \
        __val >>= 8; \
        ((unsigned char *) (addr))[3] = __val; \
        (void) 0; })
# else
#  define get16(addr) \
     (((const unsigned char *) (addr))[0] << 8 \
      | ((const unsigned char *) (addr))[1])
/* See the little endian variant: widen before shifting.  */
#  define get32(addr) \
     ((((uint32_t) ((const unsigned char *) (addr))[0] << 8 \
        | ((const unsigned char *) (addr))[1]) << 8 \
       | ((const unsigned char *) (addr))[2]) << 8 \
      | ((const unsigned char *) (addr))[3])

#  define put16(addr, val) \
     ({ uint16_t __val = (val); \
        ((unsigned char *) (addr))[1] = __val; \
        ((unsigned char *) (addr))[0] = __val >> 8; \
        (void) 0; })
#  define put32(addr, val) \
     ({ uint32_t __val = (val); \
        ((unsigned char *) (addr))[3] = __val; \
        __val >>= 8; \
        ((unsigned char *) (addr))[2] = __val; \
        __val >>= 8; \
        ((unsigned char *) (addr))[1] = __val; \
        __val >>= 8; \
        ((unsigned char *) (addr))[0] = __val; \
        (void) 0; })
# endif

# define FCTNAME2(name) name##_unaligned
#endif
#define FCTNAME(name) FCTNAME2(name)
133 | |
134 | |
135 | /* We need at least one byte for the next round. */ |
136 | #ifndef MIN_NEEDED_INPUT |
137 | # error "MIN_NEEDED_INPUT definition missing" |
138 | #elif MIN_NEEDED_INPUT < 1 |
139 | # error "MIN_NEEDED_INPUT must be >= 1" |
140 | #endif |
141 | |
142 | /* Let's see how many bytes we produce. */ |
143 | #ifndef MAX_NEEDED_INPUT |
144 | # define MAX_NEEDED_INPUT MIN_NEEDED_INPUT |
145 | #endif |
146 | |
147 | /* We produce at least one byte in the next round. */ |
148 | #ifndef MIN_NEEDED_OUTPUT |
149 | # error "MIN_NEEDED_OUTPUT definition missing" |
150 | #elif MIN_NEEDED_OUTPUT < 1 |
151 | # error "MIN_NEEDED_OUTPUT must be >= 1" |
152 | #endif |
153 | |
154 | /* Let's see how many bytes we produce. */ |
155 | #ifndef MAX_NEEDED_OUTPUT |
156 | # define MAX_NEEDED_OUTPUT MIN_NEEDED_OUTPUT |
157 | #endif |
158 | |
159 | /* Default name for the function. */ |
160 | #ifndef LOOPFCT |
161 | # define LOOPFCT loop |
162 | #endif |
163 | |
164 | /* Make sure we have a loop body. */ |
165 | #ifndef BODY |
166 | # error "Definition of BODY missing for function" LOOPFCT |
167 | #endif |
168 | |
169 | |
/* If no arguments have to be passed to the loop function define the macro
   as empty.  It is appended directly to the parameter lists of the
   generated functions, so a non-empty definition must start with a
   comma.  */
#ifndef EXTRA_LOOP_DECLS
# define EXTRA_LOOP_DECLS
#endif

/* Allow using UPDATE_PARAMS in macros where #ifdef UPDATE_PARAMS test
   isn't possible.  The defaults are statements which do nothing.  */
#ifndef UPDATE_PARAMS
# define UPDATE_PARAMS do { } while (0)
#endif
#ifndef REINIT_PARAMS
# define REINIT_PARAMS do { } while (0)
#endif
184 | |
185 | |
/* Convenience test for module writers: nonzero iff conversion errors are
   to be skipped.  This requires a counter to record them in (IRREVERSIBLE
   is not NULL) and the __GCONV_IGNORE_ERRORS bit set in FLAGS.  Both
   variables are taken from the scope the macro is used in.  */
#define ignore_errors_p() \
  (irreversible != NULL && (flags & __GCONV_IGNORE_ERRORS) != 0)
190 | |
191 | |
/* Error handling for the FROM_LOOP direction, with ignoring of errors.
   RESULT is set to __GCONV_ILLEGAL_INPUT.  If errors are ignored the
   input pointer is advanced by INCR bytes, the counter of irreversible
   conversions is incremented and the enclosing loop continues; otherwise
   the enclosing loop is left.  This cannot be wrapped in do while (0)
   since `break' and `continue' must act on the loop of the user.  */
#define STANDARD_FROM_LOOP_ERR_HANDLER(Incr) \
  { \
    result = __GCONV_ILLEGAL_INPUT; \
 \
    if (ignore_errors_p ()) \
      { \
        /* Skip the invalid input byte sequence.  RESULT stays */ \
        /* __GCONV_ILLEGAL_INPUT because "iconv -c" must give the */ \
        /* same exitcode as "iconv".  */ \
        inptr += (Incr); \
        ++*irreversible; \
        continue; \
      } \
 \
    break; \
  }
209 | |
/* Error handling for the TO_LOOP direction, with use of transliteration/
   transcription functions and ignoring of errors.  RESULT is set to
   __GCONV_ILLEGAL_INPUT first.  If IRREVERSIBLE is NULL the enclosing
   loop is left at once.  Otherwise, if the step has __GCONV_TRANSLIT
   set, __gconv_transliterate is tried; any result other than
   __GCONV_ILLEGAL_INPUT continues the loop, except __GCONV_FULL_OUTPUT
   which leaves it.  If the input is still illegal it is skipped by INCR
   bytes when errors are ignored, else the loop is left.  Note that we
   cannot use the do while (0) trick since `break' and `continue' must
   reach certain points.  */
#define STANDARD_TO_LOOP_ERR_HANDLER(Incr) \
  { \
    result = __GCONV_ILLEGAL_INPUT; \
 \
    if (irreversible == NULL) \
      /* This means we are in call from __gconv_transliterate.  In this */ \
      /* case we are not doing any error recovery ourselves.  */ \
      break; \
 \
    /* If needed, flush any conversion state, so that */ \
    /* __gconv_transliterate starts with current shift state.  */ \
    UPDATE_PARAMS; \
 \
    /* First try the transliteration methods.  */ \
    if ((step_data->__flags & __GCONV_TRANSLIT) != 0) \
      result = __gconv_transliterate \
        (step, step_data, *inptrp, \
         &inptr, inend, &outptr, irreversible); \
 \
    REINIT_PARAMS; \
 \
    /* If any of them recognized the input continue with the loop.  */ \
    if (result != __GCONV_ILLEGAL_INPUT) \
      { \
        if (__glibc_unlikely (result == __GCONV_FULL_OUTPUT)) \
          break; \
 \
        continue; \
      } \
 \
    /* Next see whether we have to ignore the error.  If not, stop.  */ \
    if (! ignore_errors_p ()) \
      break; \
 \
    /* When we come here it means we ignore the character.  */ \
    ++*irreversible; \
    inptr += (Incr); \
    /* But we keep result == __GCONV_ILLEGAL_INPUT, because of the */ \
    /* constraint that "iconv -c" must give the same exitcode as */ \
    /* "iconv".  */ \
    continue; \
  }
255 | |
256 | |
/* Handling of Unicode 3.1 TAG characters.  Unicode recommends
   "If language codes are not relevant to the particular processing
   operation, then they should be ignored."  If CHARACTER is a TAG
   character the input pointer is advanced by INCR bytes and the enclosing
   loop continues; otherwise nothing happens.  This macro is usually
   called right before STANDARD_TO_LOOP_ERR_HANDLER (Incr).  */
#define UNICODE_TAG_HANDLER(Character, Incr) \
  { \
    /* TAG characters are those in the range U+E0000..U+E007F.  */ \
    if (((Character) >> 7) == (0xe0000 >> 7)) \
      { \
        inptr += (Incr); \
        continue; \
      } \
  }
270 | |
271 | |
272 | /* The function returns the status, as defined in gconv.h. */ |
273 | static inline int |
274 | __attribute ((always_inline)) |
275 | FCTNAME (LOOPFCT) (struct __gconv_step *step, |
276 | struct __gconv_step_data *step_data, |
277 | const unsigned char **inptrp, const unsigned char *inend, |
278 | unsigned char **outptrp, const unsigned char *outend, |
279 | size_t *irreversible EXTRA_LOOP_DECLS) |
280 | { |
281 | #ifdef LOOP_NEED_STATE |
282 | mbstate_t *state = step_data->__statep; |
283 | #endif |
284 | #ifdef LOOP_NEED_FLAGS |
285 | int flags = step_data->__flags; |
286 | #endif |
287 | #ifdef LOOP_NEED_DATA |
288 | void *data = step->__data; |
289 | #endif |
290 | int result = __GCONV_EMPTY_INPUT; |
291 | const unsigned char *inptr = *inptrp; |
292 | unsigned char *outptr = *outptrp; |
293 | |
294 | #ifdef INIT_PARAMS |
295 | INIT_PARAMS; |
296 | #endif |
297 | |
298 | while (inptr != inend) |
299 | { |
300 | /* `if' cases for MIN_NEEDED_OUTPUT ==/!= 1 is made to help the |
301 | compiler generating better code. They will be optimized away |
302 | since MIN_NEEDED_OUTPUT is always a constant. */ |
303 | if (MIN_NEEDED_INPUT > 1 |
304 | && __builtin_expect (inptr + MIN_NEEDED_INPUT > inend, 0)) |
305 | { |
306 | /* We don't have enough input for another complete input |
307 | character. */ |
308 | result = __GCONV_INCOMPLETE_INPUT; |
309 | break; |
310 | } |
311 | if ((MIN_NEEDED_OUTPUT != 1 |
312 | && __builtin_expect (outptr + MIN_NEEDED_OUTPUT > outend, 0)) |
313 | || (MIN_NEEDED_OUTPUT == 1 |
314 | && __builtin_expect (outptr >= outend, 0))) |
315 | { |
316 | /* Overflow in the output buffer. */ |
317 | result = __GCONV_FULL_OUTPUT; |
318 | break; |
319 | } |
320 | |
321 | /* Here comes the body the user provides. It can stop with |
322 | RESULT set to GCONV_INCOMPLETE_INPUT (if the size of the |
323 | input characters vary in size), GCONV_ILLEGAL_INPUT, or |
324 | GCONV_FULL_OUTPUT (if the output characters vary in size). */ |
325 | BODY |
326 | } |
327 | |
328 | /* Update the pointers pointed to by the parameters. */ |
329 | *inptrp = inptr; |
330 | *outptrp = outptr; |
331 | UPDATE_PARAMS; |
332 | |
333 | return result; |
334 | } |
335 | |
336 | |
337 | /* Include the file a second time to define the function to handle |
338 | unaligned access. */ |
339 | #if !defined DEFINE_UNALIGNED && !_STRING_ARCH_unaligned \ |
340 | && MIN_NEEDED_INPUT != 1 && MAX_NEEDED_INPUT % MIN_NEEDED_INPUT == 0 \ |
341 | && MIN_NEEDED_OUTPUT != 1 && MAX_NEEDED_OUTPUT % MIN_NEEDED_OUTPUT == 0 |
342 | # undef get16 |
343 | # undef get32 |
344 | # undef put16 |
345 | # undef put32 |
346 | # undef unaligned |
347 | |
348 | # define DEFINE_UNALIGNED |
349 | # include "loop.c" |
350 | # undef DEFINE_UNALIGNED |
351 | #else |
# if MAX_NEEDED_INPUT > 1
#  define SINGLE(fct) SINGLE2 (fct)
#  define SINGLE2(fct) fct##_single
/* Convert a single character whose leading bytes were saved in the
   conversion state.  The saved bytes (the low three bits of
   STATE->__count give their number unless UNPACK_BYTES is provided) and
   bytes from *INPTRP are collected in a local buffer on which BODY is run
   exactly once.

   Returns __GCONV_INCOMPLETE_INPUT if even with the new input the
   character is not complete (all input is then consumed and saved),
   __GCONV_FULL_OUTPUT if the output buffer has no room for
   MIN_NEEDED_OUTPUT bytes, __GCONV_OK if BODY consumed a character (the
   saved bytes are then cleared and *INPTRP/*OUTPTRP advanced), or
   otherwise the value BODY stored in RESULT.  */
static inline int
__attribute ((always_inline))
SINGLE(LOOPFCT) (struct __gconv_step *step,
                 struct __gconv_step_data *step_data,
                 const unsigned char **inptrp, const unsigned char *inend,
                 unsigned char **outptrp, unsigned char *outend,
                 size_t *irreversible EXTRA_LOOP_DECLS)
{
  mbstate_t *state = step_data->__statep;
#  ifdef LOOP_NEED_FLAGS
  int flags = step_data->__flags;
#  endif
#  ifdef LOOP_NEED_DATA
  void *data = step->__data;
#  endif
  int result = __GCONV_OK;
  unsigned char bytebuf[MAX_NEEDED_INPUT];
  const unsigned char *inptr = *inptrp;
  unsigned char *outptr = *outptrp;
  size_t inlen;

#  ifdef INIT_PARAMS
  INIT_PARAMS;
#  endif

#  ifdef UNPACK_BYTES
  UNPACK_BYTES
#  else
  /* Add the bytes from the state to the input buffer.  */
  assert ((state->__count & 7) <= sizeof (state->__value));
  for (inlen = 0; inlen < (size_t) (state->__count & 7); ++inlen)
    bytebuf[inlen] = state->__value.__wchb[inlen];
#  endif

  /* Are there enough bytes in the input buffer?  */
  if (MIN_NEEDED_INPUT > 1
      && __builtin_expect (inptr + (MIN_NEEDED_INPUT - inlen) > inend, 0))
    {
      *inptrp = inend;
#  ifdef STORE_REST

      /* Building with -O3 GCC emits a `array subscript is above array
         bounds' warning.  GCC BZ #64739 has been opened for this.  */
      DIAG_PUSH_NEEDS_COMMENT;
      DIAG_IGNORE_NEEDS_COMMENT (4.9, "-Warray-bounds");
      while (inptr < inend)
        bytebuf[inlen++] = *inptr++;
      DIAG_POP_NEEDS_COMMENT;

      /* STORE_REST works on INPTR/INPTRP/INEND; point them at the
         collected bytes.  */
      inptr = bytebuf;
      inptrp = &inptr;
      inend = &bytebuf[inlen];

      STORE_REST
#  else
      /* We don't have enough input for another complete input
         character.  */
      while (inptr < inend)
        state->__value.__wchb[inlen++] = *inptr++;
#  endif

      return __GCONV_INCOMPLETE_INPUT;
    }

  /* Enough space in output buffer.  */
  if ((MIN_NEEDED_OUTPUT != 1 && outptr + MIN_NEEDED_OUTPUT > outend)
      || (MIN_NEEDED_OUTPUT == 1 && outptr >= outend))
    /* Overflow in the output buffer.  */
    return __GCONV_FULL_OUTPUT;

  /* Now add characters from the normal input buffer.  */
  do
    bytebuf[inlen++] = *inptr++;
  while (inlen < MAX_NEEDED_INPUT && inptr < inend);

  inptr = bytebuf;
  inend = &bytebuf[inlen];

  /* The one-trip loop gives `break' and `continue' inside BODY a loop to
     act on.  */
  do
    {
      BODY
    }
  while (0);

  /* Now we either have produced an output character and consumed all the
     bytes from the state and at least one more, or the character is still
     incomplete, or we have some other error (like illegal input character,
     no space in output buffer).  */
  if (__glibc_likely (inptr != bytebuf))
    {
      /* We found a new character.  */
      assert (inptr - bytebuf > (state->__count & 7));

      *inptrp += inptr - bytebuf - (state->__count & 7);
      *outptrp = outptr;

      result = __GCONV_OK;

      /* Clear the state buffer.  */
#  ifdef CLEAR_STATE
      CLEAR_STATE;
#  else
      state->__count &= ~7;
#  endif
    }
  else if (result == __GCONV_INCOMPLETE_INPUT)
    {
      /* This can only happen if we have less than MAX_NEEDED_INPUT bytes
         available.  */
      assert (inend != &bytebuf[MAX_NEEDED_INPUT]);

      *inptrp += inend - bytebuf - (state->__count & 7);
#  ifdef STORE_REST
      inptrp = &inptr;

      STORE_REST
#  else
      /* We don't have enough input for another complete input
         character.  The buffer now holds the previously saved bytes plus
         at least one new one, so its length must exceed the old saved
         count kept in the low three bits of __count; the remaining bits
         are not a length and must not take part in the comparison.  */
      assert (inend - inptr > (state->__count & 7));
      assert (inend - inptr <= sizeof (state->__value));
      state->__count = (state->__count & ~7) | (inend - inptr);
      inlen = 0;
      while (inptr < inend)
        state->__value.__wchb[inlen++] = *inptr++;
#  endif
    }

  return result;
}
#  undef SINGLE
#  undef SINGLE2
# endif
488 | |
489 | |
# ifdef ONEBYTE_BODY
/* Define the shortcut function for btowc.  It takes the conversion STEP
   and a single byte C and returns a wint_t.  The entire function body,
   braces included, is supplied by the module through ONEBYTE_BODY.  */
static wint_t
gconv_btowc (struct __gconv_step *step, unsigned char c)
ONEBYTE_BODY
/* Make the function known under the name FROM_ONEBYTE; presumably the
   including skeleton code picks it up from there -- confirm at the
   point of use.  */
# define FROM_ONEBYTE gconv_btowc
# endif
497 | |
498 | #endif |
499 | |
/* We remove the macro definitions so that we can include this file again
   for the definition of another function.  Each parameter and each access
   helper is removed exactly once.  */
#undef MIN_NEEDED_INPUT
#undef MAX_NEEDED_INPUT
#undef MIN_NEEDED_OUTPUT
#undef MAX_NEEDED_OUTPUT
#undef LOOPFCT
#undef BODY
#undef EXTRA_LOOP_DECLS
#undef INIT_PARAMS
#undef UPDATE_PARAMS
#undef REINIT_PARAMS
#undef ONEBYTE_BODY
#undef UNPACK_BYTES
#undef CLEAR_STATE
#undef LOOP_NEED_STATE
#undef LOOP_NEED_FLAGS
#undef LOOP_NEED_DATA
#undef get16
#undef get32
#undef put16
#undef put32
#undef unaligned
524 | |