1/* nfkc.c Unicode normalization utilities.
2 * Copyright (C) 2002, 2003 Simon Josefsson
3 *
4 * This file is part of GNU Libidn.
5 *
6 * GNU Libidn is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
10 *
11 * GNU Libidn is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with GNU Libidn; if not, see <http://www.gnu.org/licenses/>.
18 */
19
20#if HAVE_CONFIG_H
21# include "config.h"
22#endif
23
24#include <stdlib.h>
25#include <string.h>
26#include <stdint.h>
27
28#include "stringprep.h"
29
/* This file contains functions from GLIB, including gutf8.c and
 * gunidecomp.c, all licensed under LGPL and copyright held by:
 *
 * Copyright (C) 1999, 2000 Tom Tromey
 * Copyright 2000 Red Hat, Inc.
 */
36
/* Hacks to make syncing with GLIB code easier.  The code imported from
 * GLib below keeps its original spelling; these macros map the GLib
 * type and helper names it uses onto plain C and libc equivalents. */
#define gboolean int
#define gchar char
#define guchar unsigned char
#define glong long
#define gint int
#define guint unsigned int
#define gushort unsigned short
#define gint16 int16_t
#define guint16 uint16_t
#define gunichar uint32_t
#define gsize size_t
#define gssize ssize_t
#define g_malloc malloc
#define g_free free
#define GError void
/* Error reporting is compiled out: all four arguments are discarded
 * without being evaluated. */
#define g_set_error(a,b,c,d) ((void) 0)
/* Allocate an uninitialized array of n_structs objects of struct_type
 * via g_malloc.  The size multiplication is not checked for overflow. */
#define g_new(struct_type, n_structs)					\
  ((struct_type *) g_malloc (((gsize) sizeof (struct_type)) * ((gsize) (n_structs))))
/* G_STMT_START / G_STMT_END wrap a braced block so that it can be used
 * where a single statement is expected; the spelling depends on the
 * compiler being used. */
# if defined (__GNUC__) && !defined (__STRICT_ANSI__) && !defined (__cplusplus)
#  define G_STMT_START (void)(
#  define G_STMT_END )
# else
#  if (defined (sun) || defined (__sun__))
#   define G_STMT_START if (1)
#   define G_STMT_END else (void)0
#  else
#   define G_STMT_START do
#   define G_STMT_END while (0)
#  endif
# endif
/* Precondition checks are compiled out: expr is never evaluated and
 * val is never returned, so callers get no argument validation. */
#define g_return_val_if_fail(expr,val) G_STMT_START{ (void)0; }G_STMT_END
/* Number of elements in a true array (not valid for pointers). */
#define G_N_ELEMENTS(arr) (sizeof (arr) / sizeof ((arr)[0]))
#define TRUE 1
#define FALSE 0
72
73/* Code from GLIB gunicode.h starts here. */
74
/* Normalization mode selector used by the normalization routines in
 * this file.  Each G_NORMALIZE_NF* name is an alias that has the same
 * value as the enumerator declared immediately before it.  The code
 * below treats the NFK* modes as requesting compatibility
 * decomposition and the NFC/NFKC modes as requesting composition. */
typedef enum
{
  G_NORMALIZE_DEFAULT,
  G_NORMALIZE_NFD = G_NORMALIZE_DEFAULT,
  G_NORMALIZE_DEFAULT_COMPOSE,
  G_NORMALIZE_NFC = G_NORMALIZE_DEFAULT_COMPOSE,
  G_NORMALIZE_ALL,
  G_NORMALIZE_NFKD = G_NORMALIZE_ALL,
  G_NORMALIZE_ALL_COMPOSE,
  G_NORMALIZE_NFKC = G_NORMALIZE_ALL_COMPOSE
}
GNormalizeMode;
87
88/* Code from GLIB gutf8.c starts here. */
89
/* Classify the lead byte Char of a UTF-8 sequence.  On return Len holds
 * the sequence length in bytes (1 to 6) and Mask the bit mask that
 * selects the payload bits of the lead byte.  If Char matches none of
 * the lead-byte patterns, Len is set to -1 and Mask is left untouched.
 *
 * This expands to an if/else-if chain (a statement, not an expression)
 * that already ends in a semicolon, and it evaluates Char several
 * times. */
#define UTF8_COMPUTE(Char, Mask, Len)					      \
  if (Char < 128)							      \
    {									      \
      Len = 1;								      \
      Mask = 0x7f;							      \
    }									      \
  else if ((Char & 0xe0) == 0xc0)					      \
    {									      \
      Len = 2;								      \
      Mask = 0x1f;							      \
    }									      \
  else if ((Char & 0xf0) == 0xe0)					      \
    {									      \
      Len = 3;								      \
      Mask = 0x0f;							      \
    }									      \
  else if ((Char & 0xf8) == 0xf0)					      \
    {									      \
      Len = 4;								      \
      Mask = 0x07;							      \
    }									      \
  else if ((Char & 0xfc) == 0xf8)					      \
    {									      \
      Len = 5;								      \
      Mask = 0x03;							      \
    }									      \
  else if ((Char & 0xfe) == 0xfc)					      \
    {									      \
      Len = 6;								      \
      Mask = 0x01;							      \
    }									      \
  else									      \
    Len = -1;
123
/* Number of bytes (1 to 6) used to encode the code point Char in
 * UTF-8, chosen purely from the magnitude of Char.  Char is evaluated
 * more than once, so it must not have side effects. */
#define UTF8_LENGTH(Char)              \
  ((Char) < 0x80 ? 1 :                 \
   ((Char) < 0x800 ? 2 :               \
    ((Char) < 0x10000 ? 3 :            \
     ((Char) < 0x200000 ? 4 :          \
      ((Char) < 0x4000000 ? 5 : 6)))))
130
131
/* Decode the Len-byte UTF-8 sequence starting at Chars into Result.
 * Mask selects the payload bits of the lead byte; Count is a caller
 * supplied loop variable.  Each following byte must have the form
 * 10xxxxxx; if one does not, Result is set to -1 and decoding stops
 * at that byte.
 *
 * This expands to two statements (an assignment followed by a for
 * loop), so it must not be used as the unbraced body of an if/else. */
#define UTF8_GET(Result, Chars, Count, Mask, Len)			      \
  (Result) = (Chars)[0] & (Mask);					      \
  for ((Count) = 1; (Count) < (Len); ++(Count))				      \
    {									      \
      if (((Chars)[(Count)] & 0xc0) != 0x80)				      \
	{								      \
	  (Result) = -1;						      \
	  break;							      \
	}								      \
      (Result) <<= 6;							      \
      (Result) |= ((Chars)[(Count)] & 0x3f);				      \
    }
144
/* Non-zero if Char is below 0x110000, is not in the range
 * 0xD800..0xDFFF, is not in the range 0xFDD0..0xFDEF, and does not
 * have 0xFFFE or 0xFFFF as its low 16 bits.  Char is evaluated several
 * times.  Not referenced elsewhere in the visible part of this file. */
#define UNICODE_VALID(Char)                   \
    ((Char) < 0x110000 &&                     \
     (((Char) & 0xFFFFF800) != 0xD800) &&     \
     ((Char) < 0xFDD0 || (Char) > 0xFDEF) &&  \
     ((Char) & 0xFFFE) != 0xFFFE)
150
151
/* Sequence length, in bytes, indexed by the value of the first byte.
 * Bytes 0x00..0xBF map to 1 (this includes the continuation-byte range
 * 0x80..0xBF), 0xC0..0xDF to 2, 0xE0..0xEF to 3, 0xF0..0xF7 to 4,
 * 0xF8..0xFB to 5, 0xFC..0xFD to 6, and 0xFE/0xFF to 1. */
static const gchar utf8_skip_data[256] = {
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  2, 2, 2, 2, 2, 2, 2,
  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5,
  5, 5, 5, 6, 6, 1, 1
};

/* NOTE(review): unlike the table it points to, this pointer is not
 * declared static, so the symbol g_utf8_skip has external linkage.
 * GLib exports a symbol of the same name; confirm whether anything
 * outside this file relies on it before narrowing the linkage. */
const gchar *const g_utf8_skip = utf8_skip_data;

/* Advance p past the UTF-8 sequence it points at, using only the first
 * byte to decide how far to move.  The bytes skipped are not examined,
 * so on truncated input the result can point beyond a terminating NUL.
 * The result is cast to plain char *, which drops any const on p. */
#define g_utf8_next_char(p) (char *)((p) + g_utf8_skip[*(guchar *)(p)])
174
175/*
176 * g_utf8_strlen:
177 * @p: pointer to the start of a UTF-8 encoded string.
178 * @max: the maximum number of bytes to examine. If @max
179 * is less than 0, then the string is assumed to be
180 * nul-terminated. If @max is 0, @p will not be examined and
181 * may be %NULL.
182 *
183 * Returns the length of the string in characters.
184 *
185 * Return value: the length of the string in characters
186 **/
187static glong
188g_utf8_strlen (const gchar * p, gssize max)
189{
190 glong len = 0;
191 const gchar *start = p;
192 g_return_val_if_fail (p != NULL || max == 0, 0);
193
194 if (max < 0)
195 {
196 while (*p)
197 {
198 p = g_utf8_next_char (p);
199 ++len;
200 }
201 }
202 else
203 {
204 if (max == 0 || !*p)
205 return 0;
206
207 p = g_utf8_next_char (p);
208
209 while (p - start < max && *p)
210 {
211 ++len;
212 p = g_utf8_next_char (p);
213 }
214
215 /* only do the last len increment if we got a complete
216 * char (don't count partial chars)
217 */
218 if (p - start == max)
219 ++len;
220 }
221
222 return len;
223}
224
225/*
226 * g_utf8_get_char:
227 * @p: a pointer to Unicode character encoded as UTF-8
228 *
229 * Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
230 * If @p does not point to a valid UTF-8 encoded character, results are
231 * undefined. If you are not sure that the bytes are complete
232 * valid Unicode characters, you should use g_utf8_get_char_validated()
233 * instead.
234 *
235 * Return value: the resulting character
236 **/
237static gunichar
238g_utf8_get_char (const gchar * p)
239{
240 int i, mask = 0, len;
241 gunichar result;
242 unsigned char c = (unsigned char) *p;
243
244 UTF8_COMPUTE (c, mask, len);
245 if (len == -1)
246 return (gunichar) - 1;
247 UTF8_GET (result, p, i, mask, len);
248
249 return result;
250}
251
252/*
253 * g_unichar_to_utf8:
254 * @c: a ISO10646 character code
255 * @outbuf: output buffer, must have at least 6 bytes of space.
256 * If %NULL, the length will be computed and returned
257 * and nothing will be written to @outbuf.
258 *
259 * Converts a single character to UTF-8.
260 *
261 * Return value: number of bytes written
262 **/
263static int
264g_unichar_to_utf8 (gunichar c, gchar * outbuf)
265{
266 guint len = 0;
267 int first;
268 int i;
269
270 if (c < 0x80)
271 {
272 first = 0;
273 len = 1;
274 }
275 else if (c < 0x800)
276 {
277 first = 0xc0;
278 len = 2;
279 }
280 else if (c < 0x10000)
281 {
282 first = 0xe0;
283 len = 3;
284 }
285 else if (c < 0x200000)
286 {
287 first = 0xf0;
288 len = 4;
289 }
290 else if (c < 0x4000000)
291 {
292 first = 0xf8;
293 len = 5;
294 }
295 else
296 {
297 first = 0xfc;
298 len = 6;
299 }
300
301 if (outbuf)
302 {
303 for (i = len - 1; i > 0; --i)
304 {
305 outbuf[i] = (c & 0x3f) | 0x80;
306 c >>= 6;
307 }
308 outbuf[0] = c | first;
309 }
310
311 return len;
312}
313
314/*
315 * g_utf8_to_ucs4_fast:
316 * @str: a UTF-8 encoded string
317 * @len: the maximum length of @str to use. If @len < 0, then
318 * the string is nul-terminated.
319 * @items_written: location to store the number of characters in the
320 * result, or %NULL.
321 *
322 * Convert a string from UTF-8 to a 32-bit fixed width
323 * representation as UCS-4, assuming valid UTF-8 input.
324 * This function is roughly twice as fast as g_utf8_to_ucs4()
325 * but does no error checking on the input.
326 *
327 * Return value: a pointer to a newly allocated UCS-4 string.
328 * This value must be freed with g_free().
329 **/
330static gunichar *
331g_utf8_to_ucs4_fast (const gchar * str, glong len, glong * items_written)
332{
333 gint j, charlen;
334 gunichar *result;
335 gint n_chars, i;
336 const gchar *p;
337
338 g_return_val_if_fail (str != NULL, NULL);
339
340 p = str;
341 n_chars = 0;
342 if (len < 0)
343 {
344 while (*p)
345 {
346 p = g_utf8_next_char (p);
347 ++n_chars;
348 }
349 }
350 else
351 {
352 while (p < str + len && *p)
353 {
354 p = g_utf8_next_char (p);
355 ++n_chars;
356 }
357 }
358
359 result = g_new (gunichar, n_chars + 1);
360 if (!result)
361 return NULL;
362
363 p = str;
364 for (i = 0; i < n_chars; i++)
365 {
366 gunichar wc = ((unsigned char *) p)[0];
367
368 if (wc < 0x80)
369 {
370 result[i] = wc;
371 p++;
372 }
373 else
374 {
375 if (wc < 0xe0)
376 {
377 charlen = 2;
378 wc &= 0x1f;
379 }
380 else if (wc < 0xf0)
381 {
382 charlen = 3;
383 wc &= 0x0f;
384 }
385 else if (wc < 0xf8)
386 {
387 charlen = 4;
388 wc &= 0x07;
389 }
390 else if (wc < 0xfc)
391 {
392 charlen = 5;
393 wc &= 0x03;
394 }
395 else
396 {
397 charlen = 6;
398 wc &= 0x01;
399 }
400
401 for (j = 1; j < charlen; j++)
402 {
403 wc <<= 6;
404 wc |= ((unsigned char *) p)[j] & 0x3f;
405 }
406
407 result[i] = wc;
408 p += charlen;
409 }
410 }
411 result[i] = 0;
412
413 if (items_written)
414 *items_written = i;
415
416 return result;
417}
418
419/*
420 * g_ucs4_to_utf8:
421 * @str: a UCS-4 encoded string
422 * @len: the maximum length of @str to use. If @len < 0, then
423 * the string is terminated with a 0 character.
424 * @items_read: location to store number of characters read read, or %NULL.
425 * @items_written: location to store number of bytes written or %NULL.
426 * The value here stored does not include the trailing 0
427 * byte.
428 * @error: location to store the error occuring, or %NULL to ignore
429 * errors. Any of the errors in #GConvertError other than
430 * %G_CONVERT_ERROR_NO_CONVERSION may occur.
431 *
432 * Convert a string from a 32-bit fixed width representation as UCS-4.
433 * to UTF-8. The result will be terminated with a 0 byte.
434 *
435 * Return value: a pointer to a newly allocated UTF-8 string.
436 * This value must be freed with g_free(). If an
437 * error occurs, %NULL will be returned and
438 * @error set.
439 **/
440static gchar *
441g_ucs4_to_utf8 (const gunichar * str,
442 glong len,
443 glong * items_read, glong * items_written, GError ** error)
444{
445 gint result_length;
446 gchar *result = NULL;
447 gchar *p;
448 gint i;
449
450 result_length = 0;
451 for (i = 0; len < 0 || i < len; i++)
452 {
453 if (!str[i])
454 break;
455
456 if (str[i] >= 0x80000000)
457 {
458 if (items_read)
459 *items_read = i;
460
461 g_set_error (error, G_CONVERT_ERROR,
462 G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
463 _("Character out of range for UTF-8"));
464 goto err_out;
465 }
466
467 result_length += UTF8_LENGTH (str[i]);
468 }
469
470 result = g_malloc (result_length + 1);
471 if (!result)
472 return NULL;
473 p = result;
474
475 i = 0;
476 while (p < result + result_length)
477 p += g_unichar_to_utf8 (str[i++], p);
478
479 *p = '\0';
480
481 if (items_written)
482 *items_written = p - result;
483
484err_out:
485 if (items_read)
486 *items_read = i;
487
488 return result;
489}
490
491/* Code from GLIB gunidecomp.c starts here. */
492
493#include "gunidecomp.h"
494#include "gunicomp.h"
495
/* Two-level lookups of a combining class.  Page selects an entry in a
 * per-page table; an entry that is >= G_UNICODE_MAX_TABLE_INDEX means
 * every character on that page shares one value (the entry minus
 * G_UNICODE_MAX_TABLE_INDEX), otherwise the entry selects a row of
 * cclass_data that is indexed by Char.  The tables are not defined in
 * this file (presumably they come from gunidecomp.h -- confirm). */
#define CC_PART1(Page, Char) \
  ((combining_class_table_part1[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
   ? (combining_class_table_part1[Page] - G_UNICODE_MAX_TABLE_INDEX) \
   : (cclass_data[combining_class_table_part1[Page]][Char]))

#define CC_PART2(Page, Char) \
  ((combining_class_table_part2[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
   ? (combining_class_table_part2[Page] - G_UNICODE_MAX_TABLE_INDEX) \
   : (cclass_data[combining_class_table_part2[Page]][Char]))

/* Combining class of Char.  Characters up to G_UNICODE_LAST_CHAR_PART1
 * use the part1 table, characters from 0xe0000 up to
 * G_UNICODE_LAST_CHAR use the part2 table, and everything else yields
 * 0.  Char is evaluated several times. */
#define COMBINING_CLASS(Char) \
  (((Char) <= G_UNICODE_LAST_CHAR_PART1) \
   ? CC_PART1 ((Char) >> 8, (Char) & 0xff) \
   : (((Char) >= 0xe0000 && (Char) <= G_UNICODE_LAST_CHAR) \
      ? CC_PART2 (((Char) - 0xe0000) >> 8, (Char) & 0xff) \
      : 0))
512
/* constants for hangul syllable [de]composition, used by
 * decompose_hangul() and combine_hangul() below.
 *
 * SBase, LBase, VBase and TBase are the code points that syllable,
 * leading, vowel and trailing indices are counted from.  A trailing
 * index of 0 (i.e. TBase itself) stands for "no trailing part":
 * decompose_hangul() emits only two characters in that case.
 * NCount is the number of syllables per leading index and SCount the
 * total number of syllables covered. */
#define SBase 0xAC00
#define LBase 0x1100
#define VBase 0x1161
#define TBase 0x11A7
#define LCount 19
#define VCount 21
#define TCount 28
#define NCount (VCount * TCount)
#define SCount (LCount * NCount)
523
524/*
525 * g_unicode_canonical_ordering:
526 * @string: a UCS-4 encoded string.
527 * @len: the maximum length of @string to use.
528 *
529 * Computes the canonical ordering of a string in-place.
530 * This rearranges decomposed characters in the string
531 * according to their combining classes. See the Unicode
532 * manual for more information.
533 **/
534static void
535g_unicode_canonical_ordering (gunichar * string, gsize len)
536{
537 gsize i;
538 int swap = 1;
539
540 while (swap)
541 {
542 int last;
543 swap = 0;
544 last = COMBINING_CLASS (string[0]);
545 for (i = 0; i < len - 1; ++i)
546 {
547 int next = COMBINING_CLASS (string[i + 1]);
548 if (next != 0 && last > next)
549 {
550 gsize j;
551 /* Percolate item leftward through string. */
552 for (j = i + 1; j > 0; --j)
553 {
554 gunichar t;
555 if (COMBINING_CLASS (string[j - 1]) <= next)
556 break;
557 t = string[j];
558 string[j] = string[j - 1];
559 string[j - 1] = t;
560 swap = 1;
561 }
562 /* We're re-entering the loop looking at the old
563 character again. */
564 next = last;
565 }
566 last = next;
567 }
568 }
569}
570
571/* http://www.unicode.org/unicode/reports/tr15/#Hangul
572 * r should be null or have sufficient space. Calling with r == NULL will
573 * only calculate the result_len; however, a buffer with space for three
574 * characters will always be big enough. */
575static void
576decompose_hangul (gunichar s, gunichar * r, gsize * result_len)
577{
578 gint SIndex = s - SBase;
579
580 /* not a hangul syllable */
581 if (SIndex < 0 || SIndex >= SCount)
582 {
583 if (r)
584 r[0] = s;
585 *result_len = 1;
586 }
587 else
588 {
589 gunichar L = LBase + SIndex / NCount;
590 gunichar V = VBase + (SIndex % NCount) / TCount;
591 gunichar T = TBase + SIndex % TCount;
592
593 if (r)
594 {
595 r[0] = L;
596 r[1] = V;
597 }
598
599 if (T != TBase)
600 {
601 if (r)
602 r[2] = T;
603 *result_len = 3;
604 }
605 else
606 *result_len = 2;
607 }
608}
609
610/* returns a pointer to a null-terminated UTF-8 string */
611static const gchar *
612find_decomposition (gunichar ch, gboolean compat)
613{
614 int start = 0;
615 int end = G_N_ELEMENTS (decomp_table);
616
617 if (ch >= decomp_table[start].ch && ch <= decomp_table[end - 1].ch)
618 {
619 while (TRUE)
620 {
621 int half = (start + end) / 2;
622 if (ch == decomp_table[half].ch)
623 {
624 int offset;
625
626 if (compat)
627 {
628 offset = decomp_table[half].compat_offset;
629 if (offset == G_UNICODE_NOT_PRESENT_OFFSET)
630 offset = decomp_table[half].canon_offset;
631 }
632 else
633 {
634 offset = decomp_table[half].canon_offset;
635 if (offset == G_UNICODE_NOT_PRESENT_OFFSET)
636 return NULL;
637 }
638
639 return &(decomp_expansion_string[offset]);
640 }
641 else if (half == start)
642 break;
643 else if (ch > decomp_table[half].ch)
644 start = half;
645 else
646 end = half;
647 }
648 }
649
650 return NULL;
651}
652
653/* L,V => LV and LV,T => LVT */
654static gboolean
655combine_hangul (gunichar a, gunichar b, gunichar * result)
656{
657 gint LIndex = a - LBase;
658 gint SIndex = a - SBase;
659
660 gint VIndex = b - VBase;
661 gint TIndex = b - TBase;
662
663 if (0 <= LIndex && LIndex < LCount && 0 <= VIndex && VIndex < VCount)
664 {
665 *result = SBase + (LIndex * VCount + VIndex) * TCount;
666 return TRUE;
667 }
668 else if (0 <= SIndex && SIndex < SCount && (SIndex % TCount) == 0
669 && 0 <= TIndex && TIndex <= TCount)
670 {
671 *result = a + TIndex;
672 return TRUE;
673 }
674
675 return FALSE;
676}
677
/* Two-level lookup of a composition index: an entry of compose_table
 * that is >= G_UNICODE_MAX_TABLE_INDEX gives one value for the whole
 * page (the entry minus G_UNICODE_MAX_TABLE_INDEX), otherwise it
 * selects a row of compose_data that is indexed by Char. */
#define CI(Page, Char) \
  ((compose_table[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
   ? (compose_table[Page] - G_UNICODE_MAX_TABLE_INDEX) \
   : (compose_data[compose_table[Page]][Char]))

/* Composition index of Char; 0 for characters on pages beyond
 * COMPOSE_TABLE_LAST.  Char is evaluated several times. */
#define COMPOSE_INDEX(Char) \
     ((((Char) >> 8) > (COMPOSE_TABLE_LAST)) ? 0 : CI((Char) >> 8, (Char) & 0xff))
685
686static gboolean
687combine (gunichar a, gunichar b, gunichar * result)
688{
689 gushort index_a, index_b;
690
691 if (combine_hangul (a, b, result))
692 return TRUE;
693
694 index_a = COMPOSE_INDEX (a);
695
696 if (index_a >= COMPOSE_FIRST_SINGLE_START && index_a < COMPOSE_SECOND_START)
697 {
698 if (b == compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][0])
699 {
700 *result =
701 compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][1];
702 return TRUE;
703 }
704 else
705 return FALSE;
706 }
707
708 index_b = COMPOSE_INDEX (b);
709
710 if (index_b >= COMPOSE_SECOND_SINGLE_START)
711 {
712 if (a ==
713 compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][0])
714 {
715 *result =
716 compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][1];
717 return TRUE;
718 }
719 else
720 return FALSE;
721 }
722
723 if (index_a >= COMPOSE_FIRST_START && index_a < COMPOSE_FIRST_SINGLE_START
724 && index_b >= COMPOSE_SECOND_START
725 && index_b < COMPOSE_SECOND_SINGLE_START)
726 {
727 gunichar res =
728 compose_array[index_a - COMPOSE_FIRST_START][index_b -
729 COMPOSE_SECOND_START];
730
731 if (res)
732 {
733 *result = res;
734 return TRUE;
735 }
736 }
737
738 return FALSE;
739}
740
741static gunichar *
742_g_utf8_normalize_wc (const gchar * str, gssize max_len, GNormalizeMode mode)
743{
744 gsize n_wc;
745 gunichar *wc_buffer;
746 const char *p;
747 gsize last_start;
748 gboolean do_compat = (mode == G_NORMALIZE_NFKC || mode == G_NORMALIZE_NFKD);
749 gboolean do_compose = (mode == G_NORMALIZE_NFC || mode == G_NORMALIZE_NFKC);
750
751 n_wc = 0;
752 p = str;
753 while ((max_len < 0 || p < str + max_len) && *p)
754 {
755 const gchar *decomp;
756 gunichar wc = g_utf8_get_char (p);
757
758 if (wc >= 0xac00 && wc <= 0xd7af)
759 {
760 gsize result_len;
761 decompose_hangul (wc, NULL, &result_len);
762 n_wc += result_len;
763 }
764 else
765 {
766 decomp = find_decomposition (wc, do_compat);
767
768 if (decomp)
769 n_wc += g_utf8_strlen (decomp, -1);
770 else
771 n_wc++;
772 }
773
774 p = g_utf8_next_char (p);
775 }
776
777 wc_buffer = g_new (gunichar, n_wc + 1);
778 if (!wc_buffer)
779 return NULL;
780
781 last_start = 0;
782 n_wc = 0;
783 p = str;
784 while ((max_len < 0 || p < str + max_len) && *p)
785 {
786 gunichar wc = g_utf8_get_char (p);
787 const gchar *decomp;
788 int cc;
789 gsize old_n_wc = n_wc;
790
791 if (wc >= 0xac00 && wc <= 0xd7af)
792 {
793 gsize result_len;
794 decompose_hangul (wc, wc_buffer + n_wc, &result_len);
795 n_wc += result_len;
796 }
797 else
798 {
799 decomp = find_decomposition (wc, do_compat);
800
801 if (decomp)
802 {
803 const char *pd;
804 for (pd = decomp; *pd != '\0'; pd = g_utf8_next_char (pd))
805 wc_buffer[n_wc++] = g_utf8_get_char (pd);
806 }
807 else
808 wc_buffer[n_wc++] = wc;
809 }
810
811 if (n_wc > 0)
812 {
813 cc = COMBINING_CLASS (wc_buffer[old_n_wc]);
814
815 if (cc == 0)
816 {
817 g_unicode_canonical_ordering (wc_buffer + last_start,
818 n_wc - last_start);
819 last_start = old_n_wc;
820 }
821 }
822
823 p = g_utf8_next_char (p);
824 }
825
826 if (n_wc > 0)
827 {
828 g_unicode_canonical_ordering (wc_buffer + last_start,
829 n_wc - last_start);
830 last_start = n_wc;
831 }
832
833 wc_buffer[n_wc] = 0;
834
835 /* All decomposed and reordered */
836
837 if (do_compose && n_wc > 0)
838 {
839 gsize i, j;
840 int last_cc = 0;
841 last_start = 0;
842
843 for (i = 0; i < n_wc; i++)
844 {
845 int cc = COMBINING_CLASS (wc_buffer[i]);
846
847 if (i > 0 &&
848 (last_cc == 0 || last_cc != cc) &&
849 combine (wc_buffer[last_start], wc_buffer[i],
850 &wc_buffer[last_start]))
851 {
852 for (j = i + 1; j < n_wc; j++)
853 wc_buffer[j - 1] = wc_buffer[j];
854 n_wc--;
855 i--;
856
857 if (i == last_start)
858 last_cc = 0;
859 else
860 last_cc = COMBINING_CLASS (wc_buffer[i - 1]);
861
862 continue;
863 }
864
865 if (cc == 0)
866 last_start = i;
867
868 last_cc = cc;
869 }
870 }
871
872 wc_buffer[n_wc] = 0;
873
874 return wc_buffer;
875}
876
877/*
878 * g_utf8_normalize:
879 * @str: a UTF-8 encoded string.
880 * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
881 * @mode: the type of normalization to perform.
882 *
883 * Converts a string into canonical form, standardizing
884 * such issues as whether a character with an accent
885 * is represented as a base character and combining
886 * accent or as a single precomposed character. You
887 * should generally call g_utf8_normalize() before
888 * comparing two Unicode strings.
889 *
890 * The normalization mode %G_NORMALIZE_DEFAULT only
891 * standardizes differences that do not affect the
892 * text content, such as the above-mentioned accent
893 * representation. %G_NORMALIZE_ALL also standardizes
894 * the "compatibility" characters in Unicode, such
895 * as SUPERSCRIPT THREE to the standard forms
896 * (in this case DIGIT THREE). Formatting information
897 * may be lost but for most text operations such
898 * characters should be considered the same.
899 * For example, g_utf8_collate() normalizes
900 * with %G_NORMALIZE_ALL as its first step.
901 *
902 * %G_NORMALIZE_DEFAULT_COMPOSE and %G_NORMALIZE_ALL_COMPOSE
903 * are like %G_NORMALIZE_DEFAULT and %G_NORMALIZE_ALL,
904 * but returned a result with composed forms rather
905 * than a maximally decomposed form. This is often
906 * useful if you intend to convert the string to
907 * a legacy encoding or pass it to a system with
908 * less capable Unicode handling.
909 *
910 * Return value: a newly allocated string, that is the
911 * normalized form of @str.
912 **/
913static gchar *
914g_utf8_normalize (const gchar * str, gssize len, GNormalizeMode mode)
915{
916 gunichar *result_wc = _g_utf8_normalize_wc (str, len, mode);
917 gchar *result;
918
919 result = g_ucs4_to_utf8 (result_wc, -1, NULL, NULL, NULL);
920 g_free (result_wc);
921
922 return result;
923}
924
925/* Public Libidn API starts here. */
926
/**
 * stringprep_utf8_to_unichar:
 * @p: a pointer to Unicode character encoded as UTF-8
 *
 * Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
 * This is a thin wrapper around the internal g_utf8_get_char().  The
 * input is expected to be valid UTF-8; the only checks made are on the
 * form of the lead byte and of the continuation bytes, in which case
 * (uint32_t) -1 is returned.  Callers should not rely on any other
 * kind of invalid input being detected.
 *
 * Return value: the resulting character.
 **/
uint32_t
stringprep_utf8_to_unichar (const char *p)
{
  return g_utf8_get_char (p);
}
942
/**
 * stringprep_unichar_to_utf8:
 * @c: a ISO10646 character code
 * @outbuf: output buffer, must have at least 6 bytes of space.
 *       If %NULL, the length will be computed and returned
 *       and nothing will be written to @outbuf.
 *
 * Converts a single character to UTF-8.  This is a thin wrapper
 * around the internal g_unichar_to_utf8().  No terminating zero byte
 * is written to @outbuf.
 *
 * Return value: number of bytes written (between 1 and 6).
 **/
int
stringprep_unichar_to_utf8 (uint32_t c, char *outbuf)
{
  return g_unichar_to_utf8 (c, outbuf);
}
959
960/**
961 * stringprep_utf8_to_ucs4:
962 * @str: a UTF-8 encoded string
963 * @len: the maximum length of @str to use. If @len < 0, then
964 * the string is nul-terminated.
965 * @items_written: location to store the number of characters in the
966 * result, or %NULL.
967 *
968 * Convert a string from UTF-8 to a 32-bit fixed width
969 * representation as UCS-4, assuming valid UTF-8 input.
970 * This function does no error checking on the input.
971 *
972 * Return value: a pointer to a newly allocated UCS-4 string.
973 * This value must be freed with free().
974 **/
975uint32_t *
976stringprep_utf8_to_ucs4 (const char *str, ssize_t len, size_t * items_written)
977{
978 return g_utf8_to_ucs4_fast (str, (glong) len, (glong *) items_written);
979}
980
981/**
982 * stringprep_ucs4_to_utf8:
983 * @str: a UCS-4 encoded string
984 * @len: the maximum length of @str to use. If @len < 0, then
985 * the string is terminated with a 0 character.
986 * @items_read: location to store number of characters read read, or %NULL.
987 * @items_written: location to store number of bytes written or %NULL.
988 * The value here stored does not include the trailing 0
989 * byte.
990 *
991 * Convert a string from a 32-bit fixed width representation as UCS-4.
992 * to UTF-8. The result will be terminated with a 0 byte.
993 *
994 * Return value: a pointer to a newly allocated UTF-8 string.
995 * This value must be freed with free(). If an
996 * error occurs, %NULL will be returned and
997 * @error set.
998 **/
999char *
1000stringprep_ucs4_to_utf8 (const uint32_t * str, ssize_t len,
1001 size_t * items_read, size_t * items_written)
1002{
1003 return g_ucs4_to_utf8 (str, len, (glong *) items_read,
1004 (glong *) items_written, NULL);
1005}
1006
/**
 * stringprep_utf8_nfkc_normalize:
 * @str: a UTF-8 encoded string.
 * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
 *
 * Converts a string into canonical form, standardizing
 * such issues as whether a character with an accent
 * is represented as a base character and combining
 * accent or as a single precomposed character.
 *
 * The normalization mode is NFKC (ALL COMPOSE).  It standardizes
 * differences that do not affect the text content, such as the
 * above-mentioned accent representation. It standardizes the
 * "compatibility" characters in Unicode, such as SUPERSCRIPT THREE to
 * the standard forms (in this case DIGIT THREE). Formatting
 * information may be lost but for most text operations such
 * characters should be considered the same. It returns a result with
 * composed forms rather than a maximally decomposed form.
 *
 * This is a thin wrapper around the internal g_utf8_normalize(),
 * called with the NFKC mode.
 *
 * Return value: a newly allocated string, that is the
 *   NFKC normalized form of @str.  The result is allocated with
 *   malloc() and is to be released with free().
 **/
char *
stringprep_utf8_nfkc_normalize (const char *str, ssize_t len)
{
  return g_utf8_normalize (str, len, G_NORMALIZE_NFKC);
}
1034
1035/**
1036 * stringprep_ucs4_nfkc_normalize:
1037 * @str: a Unicode string.
1038 * @len: length of @str array, or -1 if @str is nul-terminated.
1039 *
1040 * Converts UCS4 string into UTF-8 and runs
1041 * stringprep_utf8_nfkc_normalize().
1042 *
1043 * Return value: a newly allocated Unicode string, that is the NFKC
1044 * normalized form of @str.
1045 **/
1046uint32_t *
1047stringprep_ucs4_nfkc_normalize (uint32_t * str, ssize_t len)
1048{
1049 char *p;
1050 uint32_t *result_wc;
1051
1052 p = stringprep_ucs4_to_utf8 (str, len, 0, 0);
1053 result_wc = _g_utf8_normalize_wc (p, -1, G_NORMALIZE_NFKC);
1054 free (p);
1055
1056 return result_wc;
1057}
1058