1 | /* nfkc.c Unicode normalization utilities. |
2 | * Copyright (C) 2002, 2003 Simon Josefsson |
3 | * |
4 | * This file is part of GNU Libidn. |
5 | * |
6 | * GNU Libidn is free software; you can redistribute it and/or |
7 | * modify it under the terms of the GNU Lesser General Public |
8 | * License as published by the Free Software Foundation; either |
9 | * version 2.1 of the License, or (at your option) any later version. |
10 | * |
11 | * GNU Libidn is distributed in the hope that it will be useful, |
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
14 | * Lesser General Public License for more details. |
15 | * |
16 | * You should have received a copy of the GNU Lesser General Public |
17 | * License along with GNU Libidn; if not, see <http://www.gnu.org/licenses/>. |
18 | */ |
19 | |
20 | #if HAVE_CONFIG_H |
21 | # include "config.h" |
22 | #endif |
23 | |
24 | #include <stdlib.h> |
25 | #include <string.h> |
26 | #include <stdint.h> |
27 | |
28 | #include "stringprep.h" |
29 | |
30 | /* This file contains functions from GLIB, including gutf8.c and |
 * gunidecomp.c, all licensed under LGPL and copyright held by:
32 | * |
33 | * Copyright (C) 1999, 2000 Tom Tromey |
34 | * Copyright 2000 Red Hat, Inc. |
35 | */ |
36 | |
/* GLib compatibility shims.  They let the code borrowed from GLib
   further down compile against plain C, which keeps re-syncing with
   the GLib sources easy. */

/* Scalar type aliases. */
#define gboolean int
#define gchar char
#define guchar unsigned char
#define gint int
#define guint unsigned int
#define glong long
#define gushort unsigned short
#define gint16 int16_t
#define guint16 uint16_t
#define gunichar uint32_t
#define gsize size_t
#define gssize ssize_t

/* Boolean constants. */
#define TRUE 1
#define FALSE 0

/* Allocation maps straight onto the C allocator. */
#define g_malloc malloc
#define g_free free
#define g_new(struct_type, n_structs) \
  ((struct_type *) g_malloc (((gsize) sizeof (struct_type)) * ((gsize) (n_structs))))

/* Error reporting is compiled out: g_set_error() discards all of its
   arguments. */
#define GError void
#define g_set_error(a,b,c,d) ((void) 0)

/* Wrappers that turn a braced group into a single statement. */
#if defined (__GNUC__) && !defined (__STRICT_ANSI__) && !defined (__cplusplus)
# define G_STMT_START (void)(
# define G_STMT_END )
#elif (defined (sun) || defined (__sun__))
# define G_STMT_START if (1)
# define G_STMT_END else (void)0
#else
# define G_STMT_START do
# define G_STMT_END while (0)
#endif

/* Precondition checks are disabled: neither expr nor val is
   evaluated, so callers get no protection from this macro. */
#define g_return_val_if_fail(expr,val) G_STMT_START{ (void)0; }G_STMT_END

/* Number of elements in a true array (not a pointer). */
#define G_N_ELEMENTS(arr) (sizeof (arr) / sizeof ((arr)[0]))
72 | |
73 | /* Code from GLIB gunicode.h starts here. */ |
74 | |
/* Normalization forms understood by the normalizer.  Each form has a
   GLib-style name and a Unicode-style alias with the same value. */
typedef enum
{
  G_NORMALIZE_DEFAULT = 0,          /* canonical decomposition */
  G_NORMALIZE_NFD = 0,
  G_NORMALIZE_DEFAULT_COMPOSE = 1,  /* canonical decomposition + composition */
  G_NORMALIZE_NFC = 1,
  G_NORMALIZE_ALL = 2,              /* compatibility decomposition */
  G_NORMALIZE_NFKD = 2,
  G_NORMALIZE_ALL_COMPOSE = 3,      /* compatibility decomposition + composition */
  G_NORMALIZE_NFKC = 3
}
GNormalizeMode;
87 | |
88 | /* Code from GLIB gutf8.c starts here. */ |
89 | |
/* UTF8_COMPUTE: classify the lead byte Char of a UTF-8 sequence.
   Len receives the sequence length in bytes (1..6), or -1 if Char is
   not a valid lead byte; Mask receives the bit mask that extracts the
   payload bits from the lead byte (left untouched when Len is -1).
   Wrapped in do/while (0) so it behaves as one statement, e.g. inside
   an unbraced if/else. */
#define UTF8_COMPUTE(Char, Mask, Len) \
  do \
    { \
      if ((Char) < 128) \
        { \
          (Len) = 1; \
          (Mask) = 0x7f; \
        } \
      else if (((Char) & 0xe0) == 0xc0) \
        { \
          (Len) = 2; \
          (Mask) = 0x1f; \
        } \
      else if (((Char) & 0xf0) == 0xe0) \
        { \
          (Len) = 3; \
          (Mask) = 0x0f; \
        } \
      else if (((Char) & 0xf8) == 0xf0) \
        { \
          (Len) = 4; \
          (Mask) = 0x07; \
        } \
      else if (((Char) & 0xfc) == 0xf8) \
        { \
          (Len) = 5; \
          (Mask) = 0x03; \
        } \
      else if (((Char) & 0xfe) == 0xfc) \
        { \
          (Len) = 6; \
          (Mask) = 0x01; \
        } \
      else \
        (Len) = -1; \
    } \
  while (0)

/* UTF8_LENGTH: number of bytes needed to encode code point Char. */
#define UTF8_LENGTH(Char) \
  ((Char) < 0x80 ? 1 : \
   ((Char) < 0x800 ? 2 : \
    ((Char) < 0x10000 ? 3 : \
     ((Char) < 0x200000 ? 4 : \
      ((Char) < 0x4000000 ? 5 : 6)))))

/* UTF8_GET: decode the Len-byte sequence at Chars into Result, using
   Mask for the lead byte and Count as the loop counter.  Result is
   set to -1 as soon as a byte that is not a continuation byte
   (10xxxxxx) is met.  Wrapped in do/while (0) so it behaves as one
   statement. */
#define UTF8_GET(Result, Chars, Count, Mask, Len) \
  do \
    { \
      (Result) = (Chars)[0] & (Mask); \
      for ((Count) = 1; (Count) < (Len); ++(Count)) \
        { \
          if (((Chars)[(Count)] & 0xc0) != 0x80) \
            { \
              (Result) = -1; \
              break; \
            } \
          (Result) <<= 6; \
          (Result) |= ((Chars)[(Count)] & 0x3f); \
        } \
    } \
  while (0)

/* UNICODE_VALID: true for code points below 0x110000 that are not
   surrogates, not in 0xFDD0..0xFDEF, and do not end in FFFE/FFFF. */
#define UNICODE_VALID(Char) \
  ((Char) < 0x110000 && \
   (((Char) & 0xFFFFF800) != 0xD800) && \
   ((Char) < 0xFDD0 || (Char) > 0xFDEF) && \
   (((Char) & 0xFFFE) != 0xFFFE))
150 | |
151 | |
152 | static const gchar utf8_skip_data[256] = { |
153 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
154 | 1, 1, 1, 1, 1, 1, 1, |
155 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
156 | 1, 1, 1, 1, 1, 1, 1, |
157 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
158 | 1, 1, 1, 1, 1, 1, 1, |
159 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
160 | 1, 1, 1, 1, 1, 1, 1, |
161 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
162 | 1, 1, 1, 1, 1, 1, 1, |
163 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
164 | 1, 1, 1, 1, 1, 1, 1, |
165 | 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, |
166 | 2, 2, 2, 2, 2, 2, 2, |
167 | 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, |
168 | 5, 5, 5, 6, 6, 1, 1 |
169 | }; |
170 | |
171 | const gchar *const g_utf8_skip = utf8_skip_data; |
172 | |
173 | #define g_utf8_next_char(p) (char *)((p) + g_utf8_skip[*(guchar *)(p)]) |
174 | |
175 | /* |
176 | * g_utf8_strlen: |
177 | * @p: pointer to the start of a UTF-8 encoded string. |
178 | * @max: the maximum number of bytes to examine. If @max |
179 | * is less than 0, then the string is assumed to be |
180 | * nul-terminated. If @max is 0, @p will not be examined and |
181 | * may be %NULL. |
182 | * |
183 | * Returns the length of the string in characters. |
184 | * |
185 | * Return value: the length of the string in characters |
186 | **/ |
187 | static glong |
188 | g_utf8_strlen (const gchar * p, gssize max) |
189 | { |
190 | glong len = 0; |
191 | const gchar *start = p; |
192 | g_return_val_if_fail (p != NULL || max == 0, 0); |
193 | |
194 | if (max < 0) |
195 | { |
196 | while (*p) |
197 | { |
198 | p = g_utf8_next_char (p); |
199 | ++len; |
200 | } |
201 | } |
202 | else |
203 | { |
204 | if (max == 0 || !*p) |
205 | return 0; |
206 | |
207 | p = g_utf8_next_char (p); |
208 | |
209 | while (p - start < max && *p) |
210 | { |
211 | ++len; |
212 | p = g_utf8_next_char (p); |
213 | } |
214 | |
215 | /* only do the last len increment if we got a complete |
216 | * char (don't count partial chars) |
217 | */ |
218 | if (p - start == max) |
219 | ++len; |
220 | } |
221 | |
222 | return len; |
223 | } |
224 | |
225 | /* |
226 | * g_utf8_get_char: |
227 | * @p: a pointer to Unicode character encoded as UTF-8 |
228 | * |
229 | * Converts a sequence of bytes encoded as UTF-8 to a Unicode character. |
230 | * If @p does not point to a valid UTF-8 encoded character, results are |
231 | * undefined. If you are not sure that the bytes are complete |
232 | * valid Unicode characters, you should use g_utf8_get_char_validated() |
233 | * instead. |
234 | * |
235 | * Return value: the resulting character |
236 | **/ |
237 | static gunichar |
238 | g_utf8_get_char (const gchar * p) |
239 | { |
240 | int i, mask = 0, len; |
241 | gunichar result; |
242 | unsigned char c = (unsigned char) *p; |
243 | |
244 | UTF8_COMPUTE (c, mask, len); |
245 | if (len == -1) |
246 | return (gunichar) - 1; |
247 | UTF8_GET (result, p, i, mask, len); |
248 | |
249 | return result; |
250 | } |
251 | |
252 | /* |
253 | * g_unichar_to_utf8: |
254 | * @c: a ISO10646 character code |
255 | * @outbuf: output buffer, must have at least 6 bytes of space. |
256 | * If %NULL, the length will be computed and returned |
257 | * and nothing will be written to @outbuf. |
258 | * |
259 | * Converts a single character to UTF-8. |
260 | * |
261 | * Return value: number of bytes written |
262 | **/ |
263 | static int |
264 | g_unichar_to_utf8 (gunichar c, gchar * outbuf) |
265 | { |
266 | guint len = 0; |
267 | int first; |
268 | int i; |
269 | |
270 | if (c < 0x80) |
271 | { |
272 | first = 0; |
273 | len = 1; |
274 | } |
275 | else if (c < 0x800) |
276 | { |
277 | first = 0xc0; |
278 | len = 2; |
279 | } |
280 | else if (c < 0x10000) |
281 | { |
282 | first = 0xe0; |
283 | len = 3; |
284 | } |
285 | else if (c < 0x200000) |
286 | { |
287 | first = 0xf0; |
288 | len = 4; |
289 | } |
290 | else if (c < 0x4000000) |
291 | { |
292 | first = 0xf8; |
293 | len = 5; |
294 | } |
295 | else |
296 | { |
297 | first = 0xfc; |
298 | len = 6; |
299 | } |
300 | |
301 | if (outbuf) |
302 | { |
303 | for (i = len - 1; i > 0; --i) |
304 | { |
305 | outbuf[i] = (c & 0x3f) | 0x80; |
306 | c >>= 6; |
307 | } |
308 | outbuf[0] = c | first; |
309 | } |
310 | |
311 | return len; |
312 | } |
313 | |
314 | /* |
315 | * g_utf8_to_ucs4_fast: |
316 | * @str: a UTF-8 encoded string |
317 | * @len: the maximum length of @str to use. If @len < 0, then |
318 | * the string is nul-terminated. |
319 | * @items_written: location to store the number of characters in the |
320 | * result, or %NULL. |
321 | * |
322 | * Convert a string from UTF-8 to a 32-bit fixed width |
323 | * representation as UCS-4, assuming valid UTF-8 input. |
324 | * This function is roughly twice as fast as g_utf8_to_ucs4() |
325 | * but does no error checking on the input. |
326 | * |
327 | * Return value: a pointer to a newly allocated UCS-4 string. |
328 | * This value must be freed with g_free(). |
329 | **/ |
330 | static gunichar * |
331 | g_utf8_to_ucs4_fast (const gchar * str, glong len, glong * items_written) |
332 | { |
333 | gint j, charlen; |
334 | gunichar *result; |
335 | gint n_chars, i; |
336 | const gchar *p; |
337 | |
338 | g_return_val_if_fail (str != NULL, NULL); |
339 | |
340 | p = str; |
341 | n_chars = 0; |
342 | if (len < 0) |
343 | { |
344 | while (*p) |
345 | { |
346 | p = g_utf8_next_char (p); |
347 | ++n_chars; |
348 | } |
349 | } |
350 | else |
351 | { |
352 | while (p < str + len && *p) |
353 | { |
354 | p = g_utf8_next_char (p); |
355 | ++n_chars; |
356 | } |
357 | } |
358 | |
359 | result = g_new (gunichar, n_chars + 1); |
360 | if (!result) |
361 | return NULL; |
362 | |
363 | p = str; |
364 | for (i = 0; i < n_chars; i++) |
365 | { |
366 | gunichar wc = ((unsigned char *) p)[0]; |
367 | |
368 | if (wc < 0x80) |
369 | { |
370 | result[i] = wc; |
371 | p++; |
372 | } |
373 | else |
374 | { |
375 | if (wc < 0xe0) |
376 | { |
377 | charlen = 2; |
378 | wc &= 0x1f; |
379 | } |
380 | else if (wc < 0xf0) |
381 | { |
382 | charlen = 3; |
383 | wc &= 0x0f; |
384 | } |
385 | else if (wc < 0xf8) |
386 | { |
387 | charlen = 4; |
388 | wc &= 0x07; |
389 | } |
390 | else if (wc < 0xfc) |
391 | { |
392 | charlen = 5; |
393 | wc &= 0x03; |
394 | } |
395 | else |
396 | { |
397 | charlen = 6; |
398 | wc &= 0x01; |
399 | } |
400 | |
401 | for (j = 1; j < charlen; j++) |
402 | { |
403 | wc <<= 6; |
404 | wc |= ((unsigned char *) p)[j] & 0x3f; |
405 | } |
406 | |
407 | result[i] = wc; |
408 | p += charlen; |
409 | } |
410 | } |
411 | result[i] = 0; |
412 | |
413 | if (items_written) |
414 | *items_written = i; |
415 | |
416 | return result; |
417 | } |
418 | |
419 | /* |
420 | * g_ucs4_to_utf8: |
421 | * @str: a UCS-4 encoded string |
422 | * @len: the maximum length of @str to use. If @len < 0, then |
423 | * the string is terminated with a 0 character. |
424 | * @items_read: location to store number of characters read read, or %NULL. |
425 | * @items_written: location to store number of bytes written or %NULL. |
426 | * The value here stored does not include the trailing 0 |
427 | * byte. |
428 | * @error: location to store the error occuring, or %NULL to ignore |
429 | * errors. Any of the errors in #GConvertError other than |
430 | * %G_CONVERT_ERROR_NO_CONVERSION may occur. |
431 | * |
432 | * Convert a string from a 32-bit fixed width representation as UCS-4. |
433 | * to UTF-8. The result will be terminated with a 0 byte. |
434 | * |
435 | * Return value: a pointer to a newly allocated UTF-8 string. |
436 | * This value must be freed with g_free(). If an |
437 | * error occurs, %NULL will be returned and |
438 | * @error set. |
439 | **/ |
440 | static gchar * |
441 | g_ucs4_to_utf8 (const gunichar * str, |
442 | glong len, |
443 | glong * items_read, glong * items_written, GError ** error) |
444 | { |
445 | gint result_length; |
446 | gchar *result = NULL; |
447 | gchar *p; |
448 | gint i; |
449 | |
450 | result_length = 0; |
451 | for (i = 0; len < 0 || i < len; i++) |
452 | { |
453 | if (!str[i]) |
454 | break; |
455 | |
456 | if (str[i] >= 0x80000000) |
457 | { |
458 | if (items_read) |
459 | *items_read = i; |
460 | |
461 | g_set_error (error, G_CONVERT_ERROR, |
462 | G_CONVERT_ERROR_ILLEGAL_SEQUENCE, |
463 | _("Character out of range for UTF-8" )); |
464 | goto err_out; |
465 | } |
466 | |
467 | result_length += UTF8_LENGTH (str[i]); |
468 | } |
469 | |
470 | result = g_malloc (result_length + 1); |
471 | if (!result) |
472 | return NULL; |
473 | p = result; |
474 | |
475 | i = 0; |
476 | while (p < result + result_length) |
477 | p += g_unichar_to_utf8 (str[i++], p); |
478 | |
479 | *p = '\0'; |
480 | |
481 | if (items_written) |
482 | *items_written = p - result; |
483 | |
484 | err_out: |
485 | if (items_read) |
486 | *items_read = i; |
487 | |
488 | return result; |
489 | } |
490 | |
491 | /* Code from GLIB gunidecomp.c starts here. */ |
492 | |
493 | #include "gunidecomp.h" |
494 | #include "gunicomp.h" |
495 | |
/* Two-level lookup shared by both halves of the combining class
   table.  A page entry at or above G_UNICODE_MAX_TABLE_INDEX encodes
   one class for the whole page; any other entry selects a row of
   cclass_data that is indexed by the low byte of the character. */
#define CC_LOOKUP(Table, Page, Char) \
  (((Table)[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
   ? ((Table)[Page] - G_UNICODE_MAX_TABLE_INDEX) \
   : (cclass_data[(Table)[Page]][Char]))

#define CC_PART1(Page, Char) \
  CC_LOOKUP (combining_class_table_part1, Page, Char)

#define CC_PART2(Page, Char) \
  CC_LOOKUP (combining_class_table_part2, Page, Char)

/* Combining class of Char.  Part 1 covers characters up to
   G_UNICODE_LAST_CHAR_PART1, part 2 covers 0xe0000 up to
   G_UNICODE_LAST_CHAR; everything else has class 0. */
#define COMBINING_CLASS(Char) \
  (((Char) <= G_UNICODE_LAST_CHAR_PART1) \
   ? CC_PART1 ((Char) >> 8, (Char) & 0xff) \
   : (((Char) >= 0xe0000 && (Char) <= G_UNICODE_LAST_CHAR) \
      ? CC_PART2 (((Char) - 0xe0000) >> 8, (Char) & 0xff) \
      : 0))

/* Constants for Hangul syllable [de]composition: the first syllable,
   the bases of the leading (L), vowel (V) and trailing (T) jamo
   ranges, and the sizes of those ranges. */
#define SBase 0xAC00
#define LBase 0x1100
#define VBase 0x1161
#define TBase 0x11A7
#define LCount 19
#define VCount 21
#define TCount 28
#define NCount (VCount * TCount)   /* syllables per leading consonant */
#define SCount (LCount * NCount)   /* total precomposed syllables */
523 | |
524 | /* |
525 | * g_unicode_canonical_ordering: |
526 | * @string: a UCS-4 encoded string. |
527 | * @len: the maximum length of @string to use. |
528 | * |
529 | * Computes the canonical ordering of a string in-place. |
530 | * This rearranges decomposed characters in the string |
531 | * according to their combining classes. See the Unicode |
532 | * manual for more information. |
533 | **/ |
534 | static void |
535 | g_unicode_canonical_ordering (gunichar * string, gsize len) |
536 | { |
537 | gsize i; |
538 | int swap = 1; |
539 | |
540 | while (swap) |
541 | { |
542 | int last; |
543 | swap = 0; |
544 | last = COMBINING_CLASS (string[0]); |
545 | for (i = 0; i < len - 1; ++i) |
546 | { |
547 | int next = COMBINING_CLASS (string[i + 1]); |
548 | if (next != 0 && last > next) |
549 | { |
550 | gsize j; |
551 | /* Percolate item leftward through string. */ |
552 | for (j = i + 1; j > 0; --j) |
553 | { |
554 | gunichar t; |
555 | if (COMBINING_CLASS (string[j - 1]) <= next) |
556 | break; |
557 | t = string[j]; |
558 | string[j] = string[j - 1]; |
559 | string[j - 1] = t; |
560 | swap = 1; |
561 | } |
562 | /* We're re-entering the loop looking at the old |
563 | character again. */ |
564 | next = last; |
565 | } |
566 | last = next; |
567 | } |
568 | } |
569 | } |
570 | |
571 | /* http://www.unicode.org/unicode/reports/tr15/#Hangul |
572 | * r should be null or have sufficient space. Calling with r == NULL will |
573 | * only calculate the result_len; however, a buffer with space for three |
574 | * characters will always be big enough. */ |
575 | static void |
576 | decompose_hangul (gunichar s, gunichar * r, gsize * result_len) |
577 | { |
578 | gint SIndex = s - SBase; |
579 | |
580 | /* not a hangul syllable */ |
581 | if (SIndex < 0 || SIndex >= SCount) |
582 | { |
583 | if (r) |
584 | r[0] = s; |
585 | *result_len = 1; |
586 | } |
587 | else |
588 | { |
589 | gunichar L = LBase + SIndex / NCount; |
590 | gunichar V = VBase + (SIndex % NCount) / TCount; |
591 | gunichar T = TBase + SIndex % TCount; |
592 | |
593 | if (r) |
594 | { |
595 | r[0] = L; |
596 | r[1] = V; |
597 | } |
598 | |
599 | if (T != TBase) |
600 | { |
601 | if (r) |
602 | r[2] = T; |
603 | *result_len = 3; |
604 | } |
605 | else |
606 | *result_len = 2; |
607 | } |
608 | } |
609 | |
610 | /* returns a pointer to a null-terminated UTF-8 string */ |
611 | static const gchar * |
612 | find_decomposition (gunichar ch, gboolean compat) |
613 | { |
614 | int start = 0; |
615 | int end = G_N_ELEMENTS (decomp_table); |
616 | |
617 | if (ch >= decomp_table[start].ch && ch <= decomp_table[end - 1].ch) |
618 | { |
619 | while (TRUE) |
620 | { |
621 | int half = (start + end) / 2; |
622 | if (ch == decomp_table[half].ch) |
623 | { |
624 | int offset; |
625 | |
626 | if (compat) |
627 | { |
628 | offset = decomp_table[half].compat_offset; |
629 | if (offset == G_UNICODE_NOT_PRESENT_OFFSET) |
630 | offset = decomp_table[half].canon_offset; |
631 | } |
632 | else |
633 | { |
634 | offset = decomp_table[half].canon_offset; |
635 | if (offset == G_UNICODE_NOT_PRESENT_OFFSET) |
636 | return NULL; |
637 | } |
638 | |
639 | return &(decomp_expansion_string[offset]); |
640 | } |
641 | else if (half == start) |
642 | break; |
643 | else if (ch > decomp_table[half].ch) |
644 | start = half; |
645 | else |
646 | end = half; |
647 | } |
648 | } |
649 | |
650 | return NULL; |
651 | } |
652 | |
653 | /* L,V => LV and LV,T => LVT */ |
654 | static gboolean |
655 | combine_hangul (gunichar a, gunichar b, gunichar * result) |
656 | { |
657 | gint LIndex = a - LBase; |
658 | gint SIndex = a - SBase; |
659 | |
660 | gint VIndex = b - VBase; |
661 | gint TIndex = b - TBase; |
662 | |
663 | if (0 <= LIndex && LIndex < LCount && 0 <= VIndex && VIndex < VCount) |
664 | { |
665 | *result = SBase + (LIndex * VCount + VIndex) * TCount; |
666 | return TRUE; |
667 | } |
668 | else if (0 <= SIndex && SIndex < SCount && (SIndex % TCount) == 0 |
669 | && 0 <= TIndex && TIndex <= TCount) |
670 | { |
671 | *result = a + TIndex; |
672 | return TRUE; |
673 | } |
674 | |
675 | return FALSE; |
676 | } |
677 | |
678 | #define CI(Page, Char) \ |
679 | ((compose_table[Page] >= G_UNICODE_MAX_TABLE_INDEX) \ |
680 | ? (compose_table[Page] - G_UNICODE_MAX_TABLE_INDEX) \ |
681 | : (compose_data[compose_table[Page]][Char])) |
682 | |
683 | #define COMPOSE_INDEX(Char) \ |
684 | ((((Char) >> 8) > (COMPOSE_TABLE_LAST)) ? 0 : CI((Char) >> 8, (Char) & 0xff)) |
685 | |
686 | static gboolean |
687 | combine (gunichar a, gunichar b, gunichar * result) |
688 | { |
689 | gushort index_a, index_b; |
690 | |
691 | if (combine_hangul (a, b, result)) |
692 | return TRUE; |
693 | |
694 | index_a = COMPOSE_INDEX (a); |
695 | |
696 | if (index_a >= COMPOSE_FIRST_SINGLE_START && index_a < COMPOSE_SECOND_START) |
697 | { |
698 | if (b == compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][0]) |
699 | { |
700 | *result = |
701 | compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][1]; |
702 | return TRUE; |
703 | } |
704 | else |
705 | return FALSE; |
706 | } |
707 | |
708 | index_b = COMPOSE_INDEX (b); |
709 | |
710 | if (index_b >= COMPOSE_SECOND_SINGLE_START) |
711 | { |
712 | if (a == |
713 | compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][0]) |
714 | { |
715 | *result = |
716 | compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][1]; |
717 | return TRUE; |
718 | } |
719 | else |
720 | return FALSE; |
721 | } |
722 | |
723 | if (index_a >= COMPOSE_FIRST_START && index_a < COMPOSE_FIRST_SINGLE_START |
724 | && index_b >= COMPOSE_SECOND_START |
725 | && index_b < COMPOSE_SECOND_SINGLE_START) |
726 | { |
727 | gunichar res = |
728 | compose_array[index_a - COMPOSE_FIRST_START][index_b - |
729 | COMPOSE_SECOND_START]; |
730 | |
731 | if (res) |
732 | { |
733 | *result = res; |
734 | return TRUE; |
735 | } |
736 | } |
737 | |
738 | return FALSE; |
739 | } |
740 | |
741 | static gunichar * |
742 | _g_utf8_normalize_wc (const gchar * str, gssize max_len, GNormalizeMode mode) |
743 | { |
744 | gsize n_wc; |
745 | gunichar *wc_buffer; |
746 | const char *p; |
747 | gsize last_start; |
748 | gboolean do_compat = (mode == G_NORMALIZE_NFKC || mode == G_NORMALIZE_NFKD); |
749 | gboolean do_compose = (mode == G_NORMALIZE_NFC || mode == G_NORMALIZE_NFKC); |
750 | |
751 | n_wc = 0; |
752 | p = str; |
753 | while ((max_len < 0 || p < str + max_len) && *p) |
754 | { |
755 | const gchar *decomp; |
756 | gunichar wc = g_utf8_get_char (p); |
757 | |
758 | if (wc >= 0xac00 && wc <= 0xd7af) |
759 | { |
760 | gsize result_len; |
761 | decompose_hangul (wc, NULL, &result_len); |
762 | n_wc += result_len; |
763 | } |
764 | else |
765 | { |
766 | decomp = find_decomposition (wc, do_compat); |
767 | |
768 | if (decomp) |
769 | n_wc += g_utf8_strlen (decomp, -1); |
770 | else |
771 | n_wc++; |
772 | } |
773 | |
774 | p = g_utf8_next_char (p); |
775 | } |
776 | |
777 | wc_buffer = g_new (gunichar, n_wc + 1); |
778 | if (!wc_buffer) |
779 | return NULL; |
780 | |
781 | last_start = 0; |
782 | n_wc = 0; |
783 | p = str; |
784 | while ((max_len < 0 || p < str + max_len) && *p) |
785 | { |
786 | gunichar wc = g_utf8_get_char (p); |
787 | const gchar *decomp; |
788 | int cc; |
789 | gsize old_n_wc = n_wc; |
790 | |
791 | if (wc >= 0xac00 && wc <= 0xd7af) |
792 | { |
793 | gsize result_len; |
794 | decompose_hangul (wc, wc_buffer + n_wc, &result_len); |
795 | n_wc += result_len; |
796 | } |
797 | else |
798 | { |
799 | decomp = find_decomposition (wc, do_compat); |
800 | |
801 | if (decomp) |
802 | { |
803 | const char *pd; |
804 | for (pd = decomp; *pd != '\0'; pd = g_utf8_next_char (pd)) |
805 | wc_buffer[n_wc++] = g_utf8_get_char (pd); |
806 | } |
807 | else |
808 | wc_buffer[n_wc++] = wc; |
809 | } |
810 | |
811 | if (n_wc > 0) |
812 | { |
813 | cc = COMBINING_CLASS (wc_buffer[old_n_wc]); |
814 | |
815 | if (cc == 0) |
816 | { |
817 | g_unicode_canonical_ordering (wc_buffer + last_start, |
818 | n_wc - last_start); |
819 | last_start = old_n_wc; |
820 | } |
821 | } |
822 | |
823 | p = g_utf8_next_char (p); |
824 | } |
825 | |
826 | if (n_wc > 0) |
827 | { |
828 | g_unicode_canonical_ordering (wc_buffer + last_start, |
829 | n_wc - last_start); |
830 | last_start = n_wc; |
831 | } |
832 | |
833 | wc_buffer[n_wc] = 0; |
834 | |
835 | /* All decomposed and reordered */ |
836 | |
837 | if (do_compose && n_wc > 0) |
838 | { |
839 | gsize i, j; |
840 | int last_cc = 0; |
841 | last_start = 0; |
842 | |
843 | for (i = 0; i < n_wc; i++) |
844 | { |
845 | int cc = COMBINING_CLASS (wc_buffer[i]); |
846 | |
847 | if (i > 0 && |
848 | (last_cc == 0 || last_cc != cc) && |
849 | combine (wc_buffer[last_start], wc_buffer[i], |
850 | &wc_buffer[last_start])) |
851 | { |
852 | for (j = i + 1; j < n_wc; j++) |
853 | wc_buffer[j - 1] = wc_buffer[j]; |
854 | n_wc--; |
855 | i--; |
856 | |
857 | if (i == last_start) |
858 | last_cc = 0; |
859 | else |
860 | last_cc = COMBINING_CLASS (wc_buffer[i - 1]); |
861 | |
862 | continue; |
863 | } |
864 | |
865 | if (cc == 0) |
866 | last_start = i; |
867 | |
868 | last_cc = cc; |
869 | } |
870 | } |
871 | |
872 | wc_buffer[n_wc] = 0; |
873 | |
874 | return wc_buffer; |
875 | } |
876 | |
877 | /* |
878 | * g_utf8_normalize: |
879 | * @str: a UTF-8 encoded string. |
880 | * @len: length of @str, in bytes, or -1 if @str is nul-terminated. |
881 | * @mode: the type of normalization to perform. |
882 | * |
883 | * Converts a string into canonical form, standardizing |
884 | * such issues as whether a character with an accent |
885 | * is represented as a base character and combining |
886 | * accent or as a single precomposed character. You |
887 | * should generally call g_utf8_normalize() before |
888 | * comparing two Unicode strings. |
889 | * |
890 | * The normalization mode %G_NORMALIZE_DEFAULT only |
891 | * standardizes differences that do not affect the |
892 | * text content, such as the above-mentioned accent |
893 | * representation. %G_NORMALIZE_ALL also standardizes |
894 | * the "compatibility" characters in Unicode, such |
895 | * as SUPERSCRIPT THREE to the standard forms |
896 | * (in this case DIGIT THREE). Formatting information |
897 | * may be lost but for most text operations such |
898 | * characters should be considered the same. |
899 | * For example, g_utf8_collate() normalizes |
900 | * with %G_NORMALIZE_ALL as its first step. |
901 | * |
902 | * %G_NORMALIZE_DEFAULT_COMPOSE and %G_NORMALIZE_ALL_COMPOSE |
903 | * are like %G_NORMALIZE_DEFAULT and %G_NORMALIZE_ALL, |
904 | * but returned a result with composed forms rather |
905 | * than a maximally decomposed form. This is often |
906 | * useful if you intend to convert the string to |
907 | * a legacy encoding or pass it to a system with |
908 | * less capable Unicode handling. |
909 | * |
910 | * Return value: a newly allocated string, that is the |
911 | * normalized form of @str. |
912 | **/ |
913 | static gchar * |
914 | g_utf8_normalize (const gchar * str, gssize len, GNormalizeMode mode) |
915 | { |
916 | gunichar *result_wc = _g_utf8_normalize_wc (str, len, mode); |
917 | gchar *result; |
918 | |
919 | result = g_ucs4_to_utf8 (result_wc, -1, NULL, NULL, NULL); |
920 | g_free (result_wc); |
921 | |
922 | return result; |
923 | } |
924 | |
925 | /* Public Libidn API starts here. */ |
926 | |
/**
 * stringprep_utf8_to_unichar:
 * @p: a pointer to Unicode character encoded as UTF-8
 *
 * Decodes the single UTF-8 encoded character that starts at @p.
 * If @p does not point to a valid UTF-8 encoded character, results are
 * undefined.
 *
 * Return value: the resulting character.
 **/
uint32_t
stringprep_utf8_to_unichar (const char *p)
{
  /* Thin public wrapper around the GLib-derived decoder. */
  const uint32_t ucs4 = g_utf8_get_char (p);

  return ucs4;
}
942 | |
/**
 * stringprep_unichar_to_utf8:
 * @c: a ISO10646 character code
 * @outbuf: output buffer, must have at least 6 bytes of space.
 *       If %NULL, the length will be computed and returned
 *       and nothing will be written to @outbuf.
 *
 * Encodes the single character @c as UTF-8.
 *
 * Return value: number of bytes written.
 **/
int
stringprep_unichar_to_utf8 (uint32_t c, char *outbuf)
{
  /* Thin public wrapper around the GLib-derived encoder. */
  const int nbytes = g_unichar_to_utf8 (c, outbuf);

  return nbytes;
}
959 | |
960 | /** |
961 | * stringprep_utf8_to_ucs4: |
962 | * @str: a UTF-8 encoded string |
963 | * @len: the maximum length of @str to use. If @len < 0, then |
964 | * the string is nul-terminated. |
965 | * @items_written: location to store the number of characters in the |
966 | * result, or %NULL. |
967 | * |
968 | * Convert a string from UTF-8 to a 32-bit fixed width |
969 | * representation as UCS-4, assuming valid UTF-8 input. |
970 | * This function does no error checking on the input. |
971 | * |
972 | * Return value: a pointer to a newly allocated UCS-4 string. |
973 | * This value must be freed with free(). |
974 | **/ |
975 | uint32_t * |
976 | stringprep_utf8_to_ucs4 (const char *str, ssize_t len, size_t * items_written) |
977 | { |
978 | return g_utf8_to_ucs4_fast (str, (glong) len, (glong *) items_written); |
979 | } |
980 | |
981 | /** |
982 | * stringprep_ucs4_to_utf8: |
983 | * @str: a UCS-4 encoded string |
984 | * @len: the maximum length of @str to use. If @len < 0, then |
985 | * the string is terminated with a 0 character. |
986 | * @items_read: location to store number of characters read read, or %NULL. |
987 | * @items_written: location to store number of bytes written or %NULL. |
988 | * The value here stored does not include the trailing 0 |
989 | * byte. |
990 | * |
991 | * Convert a string from a 32-bit fixed width representation as UCS-4. |
992 | * to UTF-8. The result will be terminated with a 0 byte. |
993 | * |
994 | * Return value: a pointer to a newly allocated UTF-8 string. |
995 | * This value must be freed with free(). If an |
996 | * error occurs, %NULL will be returned and |
997 | * @error set. |
998 | **/ |
999 | char * |
1000 | stringprep_ucs4_to_utf8 (const uint32_t * str, ssize_t len, |
1001 | size_t * items_read, size_t * items_written) |
1002 | { |
1003 | return g_ucs4_to_utf8 (str, len, (glong *) items_read, |
1004 | (glong *) items_written, NULL); |
1005 | } |
1006 | |
1007 | /** |
1008 | * stringprep_utf8_nfkc_normalize: |
1009 | * @str: a UTF-8 encoded string. |
1010 | * @len: length of @str, in bytes, or -1 if @str is nul-terminated. |
1011 | * |
1012 | * Converts a string into canonical form, standardizing |
1013 | * such issues as whether a character with an accent |
1014 | * is represented as a base character and combining |
1015 | * accent or as a single precomposed character. |
1016 | * |
1017 | * The normalization mode is NFKC (ALL COMPOSE). It standardizes |
1018 | * differences that do not affect the text content, such as the |
1019 | * above-mentioned accent representation. It standardizes the |
1020 | * "compatibility" characters in Unicode, such as SUPERSCRIPT THREE to |
1021 | * the standard forms (in this case DIGIT THREE). Formatting |
1022 | * information may be lost but for most text operations such |
1023 | * characters should be considered the same. It returns a result with |
1024 | * composed forms rather than a maximally decomposed form. |
1025 | * |
1026 | * Return value: a newly allocated string, that is the |
1027 | * NFKC normalized form of @str. |
1028 | **/ |
1029 | char * |
1030 | stringprep_utf8_nfkc_normalize (const char *str, ssize_t len) |
1031 | { |
1032 | return g_utf8_normalize (str, len, G_NORMALIZE_NFKC); |
1033 | } |
1034 | |
1035 | /** |
1036 | * stringprep_ucs4_nfkc_normalize: |
1037 | * @str: a Unicode string. |
1038 | * @len: length of @str array, or -1 if @str is nul-terminated. |
1039 | * |
1040 | * Converts UCS4 string into UTF-8 and runs |
1041 | * stringprep_utf8_nfkc_normalize(). |
1042 | * |
1043 | * Return value: a newly allocated Unicode string, that is the NFKC |
1044 | * normalized form of @str. |
1045 | **/ |
1046 | uint32_t * |
1047 | stringprep_ucs4_nfkc_normalize (uint32_t * str, ssize_t len) |
1048 | { |
1049 | char *p; |
1050 | uint32_t *result_wc; |
1051 | |
1052 | p = stringprep_ucs4_to_utf8 (str, len, 0, 0); |
1053 | result_wc = _g_utf8_normalize_wc (p, -1, G_NORMALIZE_NFKC); |
1054 | free (p); |
1055 | |
1056 | return result_wc; |
1057 | } |
1058 | |