/* nfkc.c source code [glibc_src_2.26/libidn/nfkc.c] */

/* nfkc.c	Unicode normalization utilities.
 * Copyright (C) 2002, 2003  Simon Josefsson
 *
 * This file is part of GNU Libidn.
 *
 * GNU Libidn is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * GNU Libidn is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GNU Libidn; if not, see <http://www.gnu.org/licenses/>.
 */

#if HAVE_CONFIG_H
# include "config.h"
#endif

#include <stdlib.h>
#include <string.h>
#include <stdint.h>

#include "stringprep.h"

/* This file contains functions from GLIB, including gutf8.c and
 * gunidecomp.c, all licensed under LGPL and copyright held by:
 *
 *  Copyright (C) 1999, 2000 Tom Tromey
 *  Copyright 2000 Red Hat, Inc.
 */

/* Hacks to make syncing with GLIB code easier.  The GLIB type and
   allocator names used by the imported code are mapped onto their
   plain C equivalents.  */
#define gboolean int
#define gchar char
#define guchar unsigned char
#define glong long
#define gint int
#define guint unsigned int
#define gushort unsigned short
#define gint16 int16_t
#define guint16 uint16_t
#define gunichar uint32_t
#define gsize size_t
#define gssize ssize_t
#define g_malloc malloc
#define g_free free
#define GError void
/* Error reporting is compiled out; the arguments are discarded
   without being evaluated.  */
#define g_set_error(a,b,c,d) ((void) 0)
/* Allocate an uninitialized array of N_STRUCTS objects of type
   STRUCT_TYPE.  Yields NULL when the underlying malloc fails.  */
#define g_new(struct_type, n_structs) \
  ((struct_type *) g_malloc (((gsize) sizeof (struct_type)) * ((gsize) (n_structs))))
# if defined (__GNUC__) && !defined (__STRICT_ANSI__) && !defined (__cplusplus)
#  define G_STMT_START (void)(
#  define G_STMT_END   )
# else
#  if (defined (sun) || defined (__sun__))
#   define G_STMT_START if (1)
#   define G_STMT_END   else (void)0
#  else
#   define G_STMT_START do
#   define G_STMT_END   while (0)
#  endif
# endif
/* Precondition checks are disabled: this expands to a no-op statement
   and never evaluates EXPR or VAL.  */
#define g_return_val_if_fail(expr,val) G_STMT_START{ (void)0; }G_STMT_END
#define G_N_ELEMENTS(arr) (sizeof (arr) / sizeof ((arr)[0]))
#define TRUE 1
#define FALSE 0

/* Code from GLIB gunicode.h starts here. */

/* Normalization forms.  Each Unicode name (NFD, NFC, NFKD, NFKC) is an
   alias of the GLIB name declared just before it, so the four forms
   take the values 0, 1, 2 and 3 respectively.  */
typedef enum
{
  G_NORMALIZE_DEFAULT,
  G_NORMALIZE_NFD = G_NORMALIZE_DEFAULT,
  G_NORMALIZE_DEFAULT_COMPOSE,
  G_NORMALIZE_NFC = G_NORMALIZE_DEFAULT_COMPOSE,
  G_NORMALIZE_ALL,
  G_NORMALIZE_NFKD = G_NORMALIZE_ALL,
  G_NORMALIZE_ALL_COMPOSE,
  G_NORMALIZE_NFKC = G_NORMALIZE_ALL_COMPOSE
}
GNormalizeMode;

/* Code from GLIB gutf8.c starts here. */

/* Classify the lead byte CHAR of a UTF-8 sequence.  Sets LEN to the
   sequence length (1..6) and MASK to the bit mask selecting the
   payload bits of the lead byte.  For a byte that cannot start a
   sequence LEN is set to -1 and MASK is left untouched.  This expands
   to an if/else chain, so it must be used as a statement.  */
#define UTF8_COMPUTE(Char, Mask, Len)					      \
  if (Char < 128)							      \
    {									      \
      Len = 1;								      \
      Mask = 0x7f;							      \
    }									      \
  else if ((Char & 0xe0) == 0xc0)					      \
    {									      \
      Len = 2;								      \
      Mask = 0x1f;							      \
    }									      \
  else if ((Char & 0xf0) == 0xe0)					      \
    {									      \
      Len = 3;								      \
      Mask = 0x0f;							      \
    }									      \
  else if ((Char & 0xf8) == 0xf0)					      \
    {									      \
      Len = 4;								      \
      Mask = 0x07;							      \
    }									      \
  else if ((Char & 0xfc) == 0xf8)					      \
    {									      \
      Len = 5;								      \
      Mask = 0x03;							      \
    }									      \
  else if ((Char & 0xfe) == 0xfc)					      \
    {									      \
      Len = 6;								      \
      Mask = 0x01;							      \
    }									      \
  else									      \
    Len = -1;

/* Number of bytes (1..6) needed to encode code point CHAR in UTF-8.  */
#define UTF8_LENGTH(Char)              \
  ((Char) < 0x80 ? 1 :                 \
   ((Char) < 0x800 ? 2 :               \
    ((Char) < 0x10000 ? 3 :            \
     ((Char) < 0x200000 ? 4 :          \
      ((Char) < 0x4000000 ? 5 : 6)))))


/* Decode the LEN-byte UTF-8 sequence at CHARS into RESULT, using MASK
   to extract the payload bits of the lead byte and COUNT as the loop
   counter.  If any trailing byte is not of the form 10xxxxxx, RESULT
   is set to -1 and decoding stops.  */
#define UTF8_GET(Result, Chars, Count, Mask, Len)			      \
  (Result) = (Chars)[0] & (Mask);					      \
  for ((Count) = 1; (Count) < (Len); ++(Count))				      \
    {									      \
      if (((Chars)[(Count)] & 0xc0) != 0x80)				      \
	{								      \
	  (Result) = -1;						      \
	  break;							      \
	}								      \
      (Result) <<= 6;							      \
      (Result) |= ((Chars)[(Count)] & 0x3f);				      \
    }

/* Nonzero if CHAR is below 0x110000, is not in the surrogate range
   0xD800..0xDFFF, is not in 0xFDD0..0xFDEF, and its low 16 bits are
   neither 0xFFFE nor 0xFFFF.  */
#define UNICODE_VALID(Char)                   \
  ((Char) < 0x110000 &&                       \
   (((Char) & 0xFFFFF800) != 0xD800) &&       \
   ((Char) < 0xFDD0 || (Char) > 0xFDEF) &&    \
   ((Char) & 0xFFFE) != 0xFFFE)


152	static const gchar utf8_skip_data[`256`] = {
153	`1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`,
154	`1`, `1`, `1`, `1`, `1`, `1`, `1`,
155	`1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`,
156	`1`, `1`, `1`, `1`, `1`, `1`, `1`,
157	`1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`,
158	`1`, `1`, `1`, `1`, `1`, `1`, `1`,
159	`1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`,
160	`1`, `1`, `1`, `1`, `1`, `1`, `1`,
161	`1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`,
162	`1`, `1`, `1`, `1`, `1`, `1`, `1`,
163	`1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`,
164	`1`, `1`, `1`, `1`, `1`, `1`, `1`,
165	`2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`,
166	`2`, `2`, `2`, `2`, `2`, `2`, `2`,
167	`3`, `3`, `3`, `3`, `3`, `3`, `3`, `3`, `3`, `3`, `3`, `3`, `3`, `3`, `3`, `3`, `4`, `4`, `4`, `4`, `4`, `4`, `4`, `4`, `5`,
168	`5`, `5`, `5`, `6`, `6`, `1`, `1`
169	};
170
171	const gchar *const g_utf8_skip = utf8_skip_data;
172
173	#define g_utf8_next_char(p) (char )((p) + g_utf8_skip[(guchar *)(p)])
174
175	/*
176	* g_utf8_strlen:
177	* @p: pointer to the start of a UTF-8 encoded string.
178	* @max: the maximum number of bytes to examine. If @max
179	* is less than 0, then the string is assumed to be
180	* nul-terminated. If @max is 0, @p will not be examined and
181	* may be %NULL.
182	*
183	* Returns the length of the string in characters.
184	*
185	* Return value: the length of the string in characters
186	**/
187	static glong
188	g_utf8_strlen (const gchar * p, gssize max)
189	{
190	glong len = `0`;
191	const gchar *start = p;
192	g_return_val_if_fail (p != NULL \|\| max == `0`, `0`);
193
194	if (max < `0`)
195	{
196	while (*p)
197	{
198	p = g_utf8_next_char (p);
199	++len;
200	}
201	}
202	else
203	{
204	if (max == `0` \|\| !*p)
205	return `0`;
206
207	p = g_utf8_next_char (p);
208
209	while (p - start < max && *p)
210	{
211	++len;
212	p = g_utf8_next_char (p);
213	}
214
215	/ only do the last len increment if we got a complete*
216	* char (don't count partial chars)
217	*/
218	if (p - start == max)
219	++len;
220	}
221
222	return len;
223	}
224
225	/*
226	* g_utf8_get_char:
227	* @p: a pointer to Unicode character encoded as UTF-8
228	*
229	* Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
230	* If @p does not point to a valid UTF-8 encoded character, results are
231	* undefined. If you are not sure that the bytes are complete
232	* valid Unicode characters, you should use g_utf8_get_char_validated()
233	* instead.
234	*
235	* Return value: the resulting character
236	**/
237	static gunichar
238	g_utf8_get_char (const gchar * p)
239	{
240	int i, mask = `0`, len;
241	gunichar result;
242	unsigned char c = (unsigned char) *p;
243
244	UTF8_COMPUTE (c, mask, len);
245	if (len == -`1`)
246	return (gunichar) - `1`;
247	UTF8_GET (result, p, i, mask, len);
248
249	return result;
250	}
251
252	/*
253	* g_unichar_to_utf8:
254	* @c: a ISO10646 character code
255	* @outbuf: output buffer, must have at least 6 bytes of space.
256	* If %NULL, the length will be computed and returned
257	* and nothing will be written to @outbuf.
258	*
259	* Converts a single character to UTF-8.
260	*
261	* Return value: number of bytes written
262	**/
263	static int
264	g_unichar_to_utf8 (gunichar c, gchar * outbuf)
265	{
266	guint len = `0`;
267	int first;
268	int i;
269
270	if (c < `0x80`)
271	{
272	first = `0`;
273	len = `1`;
274	}
275	else if (c < `0x800`)
276	{
277	first = `0xc0`;
278	len = `2`;
279	}
280	else if (c < `0x10000`)
281	{
282	first = `0xe0`;
283	len = `3`;
284	}
285	else if (c < `0x200000`)
286	{
287	first = `0xf0`;
288	len = `4`;
289	}
290	else if (c < `0x4000000`)
291	{
292	first = `0xf8`;
293	len = `5`;
294	}
295	else
296	{
297	first = `0xfc`;
298	len = `6`;
299	}
300
301	if (outbuf)
302	{
303	for (i = len - `1`; i > `0`; --i)
304	{
305	outbuf[i] = (c & `0x3f`) \| `0x80`;
306	c >>= `6`;
307	}
308	outbuf[`0`] = c \| first;
309	}
310
311	return len;
312	}
313
314	/*
315	* g_utf8_to_ucs4_fast:
316	* @str: a UTF-8 encoded string
317	* @len: the maximum length of @str to use. If @len < 0, then
318	* the string is nul-terminated.
319	* @items_written: location to store the number of characters in the
320	* result, or %NULL.
321	*
322	* Convert a string from UTF-8 to a 32-bit fixed width
323	* representation as UCS-4, assuming valid UTF-8 input.
324	* This function is roughly twice as fast as g_utf8_to_ucs4()
325	* but does no error checking on the input.
326	*
327	* Return value: a pointer to a newly allocated UCS-4 string.
328	* This value must be freed with g_free().
329	**/
330	static gunichar *
331	g_utf8_to_ucs4_fast (const gchar * str, glong len, glong * items_written)
332	{
333	gint j, charlen;
334	gunichar *result;
335	gint n_chars, i;
336	const gchar *p;
337
338	g_return_val_if_fail (str != NULL, NULL);
339
340	p = str;
341	n_chars = `0`;
342	if (len < `0`)
343	{
344	while (*p)
345	{
346	p = g_utf8_next_char (p);
347	++n_chars;
348	}
349	}
350	else
351	{
352	while (p < str + len && *p)
353	{
354	p = g_utf8_next_char (p);
355	++n_chars;
356	}
357	}
358
359	result = g_new (gunichar, n_chars + `1`);
360	if (!result)
361	return NULL;
362
363	p = str;
364	for (i = `0`; i < n_chars; i++)
365	{
366	gunichar wc = ((unsigned char *) p)[`0`];
367
368	if (wc < `0x80`)
369	{
370	result[i] = wc;
371	p++;
372	}
373	else
374	{
375	if (wc < `0xe0`)
376	{
377	charlen = `2`;
378	wc &= `0x1f`;
379	}
380	else if (wc < `0xf0`)
381	{
382	charlen = `3`;
383	wc &= `0x0f`;
384	}
385	else if (wc < `0xf8`)
386	{
387	charlen = `4`;
388	wc &= `0x07`;
389	}
390	else if (wc < `0xfc`)
391	{
392	charlen = `5`;
393	wc &= `0x03`;
394	}
395	else
396	{
397	charlen = `6`;
398	wc &= `0x01`;
399	}
400
401	for (j = `1`; j < charlen; j++)
402	{
403	wc <<= `6`;
404	wc \|= ((unsigned char *) p)[j] & `0x3f`;
405	}
406
407	result[i] = wc;
408	p += charlen;
409	}
410	}
411	result[i] = `0`;
412
413	if (items_written)
414	*items_written = i;
415
416	return result;
417	}
418
419	/*
420	* g_ucs4_to_utf8:
421	* @str: a UCS-4 encoded string
422	* @len: the maximum length of @str to use. If @len < 0, then
423	* the string is terminated with a 0 character.
424	* @items_read: location to store number of characters read read, or %NULL.
425	* @items_written: location to store number of bytes written or %NULL.
426	* The value here stored does not include the trailing 0
427	* byte.
428	* @error: location to store the error occuring, or %NULL to ignore
429	* errors. Any of the errors in #GConvertError other than
430	* %G_CONVERT_ERROR_NO_CONVERSION may occur.
431	*
432	* Convert a string from a 32-bit fixed width representation as UCS-4.
433	* to UTF-8. The result will be terminated with a 0 byte.
434	*
435	* Return value: a pointer to a newly allocated UTF-8 string.
436	* This value must be freed with g_free(). If an
437	* error occurs, %NULL will be returned and
438	* @error set.
439	**/
440	static gchar *
441	g_ucs4_to_utf8 (const gunichar * str,
442	glong len,
443	glong * items_read, glong * items_written, GError ** error)
444	{
445	gint result_length;
446	gchar *result = NULL;
447	gchar *p;
448	gint i;
449
450	result_length = `0`;
451	for (i = `0`; len < `0` \|\| i < len; i++)
452	{
453	if (!str[i])
454	break;
455
456	if (str[i] >= `0x80000000`)
457	{
458	if (items_read)
459	*items_read = i;
460
461	g_set_error (error, G_CONVERT_ERROR,
462	G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
463	_("Character out of range for UTF-8"));
464	goto err_out;
465	}
466
467	result_length += UTF8_LENGTH (str[i]);
468	}
469
470	result = g_malloc (result_length + `1`);
471	if (!result)
472	return NULL;
473	p = result;
474
475	i = `0`;
476	while (p < result + result_length)
477	p += g_unichar_to_utf8 (str[i++], p);
478
479	*p = `'\0'`;
480
481	if (items_written)
482	*items_written = p - result;
483
484	err_out:
485	if (items_read)
486	*items_read = i;
487
488	return result;
489	}
490
491	/ Code from GLIB gunidecomp.c starts here. /
492
493	#include "gunidecomp.h"
494	#include "gunicomp.h"
495
496	#define CC_PART1(Page, Char) \
497	((combining_class_table_part1[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
498	? (combining_class_table_part1[Page] - G_UNICODE_MAX_TABLE_INDEX) \
499	: (cclass_data[combining_class_table_part1[Page]][Char]))
500
501	#define CC_PART2(Page, Char) \
502	((combining_class_table_part2[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
503	? (combining_class_table_part2[Page] - G_UNICODE_MAX_TABLE_INDEX) \
504	: (cclass_data[combining_class_table_part2[Page]][Char]))
505
506	#define COMBINING_CLASS(Char) \
507	(((Char) <= G_UNICODE_LAST_CHAR_PART1) \
508	? CC_PART1 ((Char) >> 8, (Char) & 0xff) \
509	: (((Char) >= 0xe0000 && (Char) <= G_UNICODE_LAST_CHAR) \
510	? CC_PART2 (((Char) - 0xe0000) >> 8, (Char) & 0xff) \
511	: 0))
512
/* constants for hangul syllable [de]composition */
#define SBase 0xAC00
#define LBase 0x1100
#define VBase 0x1161
#define TBase 0x11A7
#define LCount 19
#define VCount 21
#define TCount 28
#define NCount (VCount * TCount)
#define SCount (LCount * NCount)

524	/*
525	* g_unicode_canonical_ordering:
526	* @string: a UCS-4 encoded string.
527	* @len: the maximum length of @string to use.
528	*
529	* Computes the canonical ordering of a string in-place.
530	* This rearranges decomposed characters in the string
531	* according to their combining classes. See the Unicode
532	* manual for more information.
533	**/
534	static void
535	g_unicode_canonical_ordering (gunichar * string, gsize len)
536	{
537	gsize i;
538	int swap = `1`;
539
540	while (swap)
541	{
542	int last;
543	swap = `0`;
544	last = COMBINING_CLASS (string[`0`]);
545	for (i = `0`; i < len - `1`; ++i)
546	{
547	int next = COMBINING_CLASS (string[i + `1`]);
548	if (next != `0` && last > next)
549	{
550	gsize j;
551	/ Percolate item leftward through string. /
552	for (j = i + `1`; j > `0`; --j)
553	{
554	gunichar t;
555	if (COMBINING_CLASS (string[j - `1`]) <= next)
556	break;
557	t = string[j];
558	string[j] = string[j - `1`];
559	string[j - `1`] = t;
560	swap = `1`;
561	}
562	/ We're re-entering the loop looking at the old*
563	character again. /*
564	next = last;
565	}
566	last = next;
567	}
568	}
569	}
570
571	/ http://www.unicode.org/unicode/reports/tr15/#Hangul*
572	* r should be null or have sufficient space. Calling with r == NULL will
573	* only calculate the result_len; however, a buffer with space for three
574	* characters will always be big enough. */
575	static void
576	decompose_hangul (gunichar s, gunichar * r, gsize * result_len)
577	{
578	gint SIndex = s - SBase;
579
580	/ not a hangul syllable /
581	if (SIndex < `0` \|\| SIndex >= SCount)
582	{
583	if (r)
584	r[`0`] = s;
585	*result_len = `1`;
586	}
587	else
588	{
589	gunichar L = LBase + SIndex / NCount;
590	gunichar V = VBase + (SIndex % NCount) / TCount;
591	gunichar T = TBase + SIndex % TCount;
592
593	if (r)
594	{
595	r[`0`] = L;
596	r[`1`] = V;
597	}
598
599	if (T != TBase)
600	{
601	if (r)
602	r[`2`] = T;
603	*result_len = `3`;
604	}
605	else
606	*result_len = `2`;
607	}
608	}
609
610	/ returns a pointer to a null-terminated UTF-8 string /
611	static const gchar *
612	find_decomposition (gunichar ch, gboolean compat)
613	{
614	int start = `0`;
615	int end = G_N_ELEMENTS (decomp_table);
616
617	if (ch >= decomp_table[start].ch && ch <= decomp_table[end - `1`].ch)
618	{
619	while (TRUE)
620	{
621	int half = (start + end) / `2`;
622	if (ch == decomp_table[half].ch)
623	{
624	int offset;
625
626	if (compat)
627	{
628	offset = decomp_table[half].compat_offset;
629	if (offset == G_UNICODE_NOT_PRESENT_OFFSET)
630	offset = decomp_table[half].canon_offset;
631	}
632	else
633	{
634	offset = decomp_table[half].canon_offset;
635	if (offset == G_UNICODE_NOT_PRESENT_OFFSET)
636	return NULL;
637	}
638
639	return &(decomp_expansion_string[offset]);
640	}
641	else if (half == start)
642	break;
643	else if (ch > decomp_table[half].ch)
644	start = half;
645	else
646	end = half;
647	}
648	}
649
650	return NULL;
651	}
652
653	/ L,V => LV and LV,T => LVT /
654	static gboolean
655	combine_hangul (gunichar a, gunichar b, gunichar * result)
656	{
657	gint LIndex = a - LBase;
658	gint SIndex = a - SBase;
659
660	gint VIndex = b - VBase;
661	gint TIndex = b - TBase;
662
663	if (`0` <= LIndex && LIndex < LCount && `0` <= VIndex && VIndex < VCount)
664	{
665	result = SBase + (LIndex VCount + VIndex) * TCount;
666	return TRUE;
667	}
668	else if (`0` <= SIndex && SIndex < SCount && (SIndex % TCount) == `0`
669	&& `0` <= TIndex && TIndex <= TCount)
670	{
671	*result = a + TIndex;
672	return TRUE;
673	}
674
675	return FALSE;
676	}
677
/* Two-level lookup in the composition tables.  A page entry that is
   at least G_UNICODE_MAX_TABLE_INDEX stands for the value (entry
   minus G_UNICODE_MAX_TABLE_INDEX) directly; any other entry selects
   the row of compose_data that is then indexed by CHAR.  */
#define CI(Page, Char) \
  ((compose_table[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
   ? (compose_table[Page] - G_UNICODE_MAX_TABLE_INDEX) \
   : (compose_data[compose_table[Page]][Char]))

/* Composition index of CHAR; 0 for code points whose page lies beyond
   COMPOSE_TABLE_LAST.  */
#define COMPOSE_INDEX(Char) \
     ((((Char) >> 8) > (COMPOSE_TABLE_LAST)) ? 0 : CI((Char) >> 8, (Char) & 0xff))

686	static gboolean
687	combine (gunichar a, gunichar b, gunichar * result)
688	{
689	gushort index_a, index_b;
690
691	if (combine_hangul (a, b, result))
692	return TRUE;
693
694	index_a = COMPOSE_INDEX (a);
695
696	if (index_a >= COMPOSE_FIRST_SINGLE_START && index_a < COMPOSE_SECOND_START)
697	{
698	if (b == compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][`0`])
699	{
700	*result =
701	compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][`1`];
702	return TRUE;
703	}
704	else
705	return FALSE;
706	}
707
708	index_b = COMPOSE_INDEX (b);
709
710	if (index_b >= COMPOSE_SECOND_SINGLE_START)
711	{
712	if (a ==
713	compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][`0`])
714	{
715	*result =
716	compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][`1`];
717	return TRUE;
718	}
719	else
720	return FALSE;
721	}
722
723	if (index_a >= COMPOSE_FIRST_START && index_a < COMPOSE_FIRST_SINGLE_START
724	&& index_b >= COMPOSE_SECOND_START
725	&& index_b < COMPOSE_SECOND_SINGLE_START)
726	{
727	gunichar res =
728	compose_array[index_a - COMPOSE_FIRST_START][index_b -
729	COMPOSE_SECOND_START];
730
731	if (res)
732	{
733	*result = res;
734	return TRUE;
735	}
736	}
737
738	return FALSE;
739	}
740
741	static gunichar *
742	_g_utf8_normalize_wc (const gchar * str, gssize max_len, GNormalizeMode mode)
743	{
744	gsize n_wc;
745	gunichar *wc_buffer;
746	const char *p;
747	gsize last_start;
748	gboolean do_compat = (mode == G_NORMALIZE_NFKC \|\| mode == G_NORMALIZE_NFKD);
749	gboolean do_compose = (mode == G_NORMALIZE_NFC \|\| mode == G_NORMALIZE_NFKC);
750
751	n_wc = `0`;
752	p = str;
753	while ((max_len < `0` \|\| p < str + max_len) && *p)
754	{
755	const gchar *decomp;
756	gunichar wc = g_utf8_get_char (p);
757
758	if (wc >= `0xac00` && wc <= `0xd7af`)
759	{
760	gsize result_len;
761	decompose_hangul (wc, NULL, &result_len);
762	n_wc += result_len;
763	}
764	else
765	{
766	decomp = find_decomposition (wc, do_compat);
767
768	if (decomp)
769	n_wc += g_utf8_strlen (decomp, -`1`);
770	else
771	n_wc++;
772	}
773
774	p = g_utf8_next_char (p);
775	}
776
777	wc_buffer = g_new (gunichar, n_wc + `1`);
778	if (!wc_buffer)
779	return NULL;
780
781	last_start = `0`;
782	n_wc = `0`;
783	p = str;
784	while ((max_len < `0` \|\| p < str + max_len) && *p)
785	{
786	gunichar wc = g_utf8_get_char (p);
787	const gchar *decomp;
788	int cc;
789	gsize old_n_wc = n_wc;
790
791	if (wc >= `0xac00` && wc <= `0xd7af`)
792	{
793	gsize result_len;
794	decompose_hangul (wc, wc_buffer + n_wc, &result_len);
795	n_wc += result_len;
796	}
797	else
798	{
799	decomp = find_decomposition (wc, do_compat);
800
801	if (decomp)
802	{
803	const char *pd;
804	for (pd = decomp; *pd != `'\0'`; pd = g_utf8_next_char (pd))
805	wc_buffer[n_wc++] = g_utf8_get_char (pd);
806	}
807	else
808	wc_buffer[n_wc++] = wc;
809	}
810
811	if (n_wc > `0`)
812	{
813	cc = COMBINING_CLASS (wc_buffer[old_n_wc]);
814
815	if (cc == `0`)
816	{
817	g_unicode_canonical_ordering (wc_buffer + last_start,
818	n_wc - last_start);
819	last_start = old_n_wc;
820	}
821	}
822
823	p = g_utf8_next_char (p);
824	}
825
826	if (n_wc > `0`)
827	{
828	g_unicode_canonical_ordering (wc_buffer + last_start,
829	n_wc - last_start);
830	last_start = n_wc;
831	}
832
833	wc_buffer[n_wc] = `0`;
834
835	/ All decomposed and reordered /
836
837	if (do_compose && n_wc > `0`)
838	{
839	gsize i, j;
840	int last_cc = `0`;
841	last_start = `0`;
842
843	for (i = `0`; i < n_wc; i++)
844	{
845	int cc = COMBINING_CLASS (wc_buffer[i]);
846
847	if (i > `0` &&
848	(last_cc == `0` \|\| last_cc != cc) &&
849	combine (wc_buffer[last_start], wc_buffer[i],
850	&wc_buffer[last_start]))
851	{
852	for (j = i + `1`; j < n_wc; j++)
853	wc_buffer[j - `1`] = wc_buffer[j];
854	n_wc--;
855	i--;
856
857	if (i == last_start)
858	last_cc = `0`;
859	else
860	last_cc = COMBINING_CLASS (wc_buffer[i - `1`]);
861
862	continue;
863	}
864
865	if (cc == `0`)
866	last_start = i;
867
868	last_cc = cc;
869	}
870	}
871
872	wc_buffer[n_wc] = `0`;
873
874	return wc_buffer;
875	}
876
877	/*
878	* g_utf8_normalize:
879	* @str: a UTF-8 encoded string.
880	* @len: length of @str, in bytes, or -1 if @str is nul-terminated.
881	* @mode: the type of normalization to perform.
882	*
883	* Converts a string into canonical form, standardizing
884	* such issues as whether a character with an accent
885	* is represented as a base character and combining
886	* accent or as a single precomposed character. You
887	* should generally call g_utf8_normalize() before
888	* comparing two Unicode strings.
889	*
890	* The normalization mode %G_NORMALIZE_DEFAULT only
891	* standardizes differences that do not affect the
892	* text content, such as the above-mentioned accent
893	* representation. %G_NORMALIZE_ALL also standardizes
894	* the "compatibility" characters in Unicode, such
895	* as SUPERSCRIPT THREE to the standard forms
896	* (in this case DIGIT THREE). Formatting information
897	* may be lost but for most text operations such
898	* characters should be considered the same.
899	* For example, g_utf8_collate() normalizes
900	* with %G_NORMALIZE_ALL as its first step.
901	*
902	* %G_NORMALIZE_DEFAULT_COMPOSE and %G_NORMALIZE_ALL_COMPOSE
903	* are like %G_NORMALIZE_DEFAULT and %G_NORMALIZE_ALL,
904	* but returned a result with composed forms rather
905	* than a maximally decomposed form. This is often
906	* useful if you intend to convert the string to
907	* a legacy encoding or pass it to a system with
908	* less capable Unicode handling.
909	*
910	* Return value: a newly allocated string, that is the
911	* normalized form of @str.
912	**/
913	static gchar *
914	g_utf8_normalize (const gchar * str, gssize len, GNormalizeMode mode)
915	{
916	gunichar *result_wc = _g_utf8_normalize_wc (str, len, mode);
917	gchar *result;
918
919	result = g_ucs4_to_utf8 (result_wc, -`1`, NULL, NULL, NULL);
920	g_free (result_wc);
921
922	return result;
923	}
924
/* Public Libidn API starts here. */

/**
 * stringprep_utf8_to_unichar:
 * @p: a pointer to Unicode character encoded as UTF-8
 *
 * Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
 * If @p does not point to a valid UTF-8 encoded character, results are
 * undefined.
 *
 * Return value: the resulting character.
 **/
uint32_t
stringprep_utf8_to_unichar (const char *p)
{
  return g_utf8_get_char (p);
}

/**
 * stringprep_unichar_to_utf8:
 * @c: a ISO10646 character code
 * @outbuf: output buffer, must have at least 6 bytes of space.
 *       If %NULL, the length will be computed and returned
 *       and nothing will be written to @outbuf.
 *
 * Converts a single character to UTF-8.
 *
 * Return value: number of bytes written.
 **/
int
stringprep_unichar_to_utf8 (uint32_t c, char *outbuf)
{
  return g_unichar_to_utf8 (c, outbuf);
}

960	/**
961	* stringprep_utf8_to_ucs4:
962	* @str: a UTF-8 encoded string
963	* @len: the maximum length of @str to use. If @len < 0, then
964	* the string is nul-terminated.
965	* @items_written: location to store the number of characters in the
966	* result, or %NULL.
967	*
968	* Convert a string from UTF-8 to a 32-bit fixed width
969	* representation as UCS-4, assuming valid UTF-8 input.
970	* This function does no error checking on the input.
971	*
972	* Return value: a pointer to a newly allocated UCS-4 string.
973	* This value must be freed with free().
974	**/
975	uint32_t *
976	stringprep_utf8_to_ucs4 (const char str, ssize_t len, size_t items_written)
977	{
978	return g_utf8_to_ucs4_fast (str, (glong) len, (glong *) items_written);
979	}
980
981	/**
982	* stringprep_ucs4_to_utf8:
983	* @str: a UCS-4 encoded string
984	* @len: the maximum length of @str to use. If @len < 0, then
985	* the string is terminated with a 0 character.
986	* @items_read: location to store number of characters read read, or %NULL.
987	* @items_written: location to store number of bytes written or %NULL.
988	* The value here stored does not include the trailing 0
989	* byte.
990	*
991	* Convert a string from a 32-bit fixed width representation as UCS-4.
992	* to UTF-8. The result will be terminated with a 0 byte.
993	*
994	* Return value: a pointer to a newly allocated UTF-8 string.
995	* This value must be freed with free(). If an
996	* error occurs, %NULL will be returned and
997	* @error set.
998	**/
999	char *
1000	stringprep_ucs4_to_utf8 (const uint32_t * str, ssize_t len,
1001	size_t * items_read, size_t * items_written)
1002	{
1003	return g_ucs4_to_utf8 (str, len, (glong *) items_read,
1004	(glong *) items_written, NULL);
1005	}
1006
1007	/**
1008	* stringprep_utf8_nfkc_normalize:
1009	* @str: a UTF-8 encoded string.
1010	* @len: length of @str, in bytes, or -1 if @str is nul-terminated.
1011	*
1012	* Converts a string into canonical form, standardizing
1013	* such issues as whether a character with an accent
1014	* is represented as a base character and combining
1015	* accent or as a single precomposed character.
1016	*
1017	* The normalization mode is NFKC (ALL COMPOSE). It standardizes
1018	* differences that do not affect the text content, such as the
1019	* above-mentioned accent representation. It standardizes the
1020	* "compatibility" characters in Unicode, such as SUPERSCRIPT THREE to
1021	* the standard forms (in this case DIGIT THREE). Formatting
1022	* information may be lost but for most text operations such
1023	* characters should be considered the same. It returns a result with
1024	* composed forms rather than a maximally decomposed form.
1025	*
1026	* Return value: a newly allocated string, that is the
1027	* NFKC normalized form of @str.
1028	**/
1029	char *
1030	stringprep_utf8_nfkc_normalize (const char *str, ssize_t len)
1031	{
1032	return g_utf8_normalize (str, len, G_NORMALIZE_NFKC);
1033	}
1034
1035	/**
1036	* stringprep_ucs4_nfkc_normalize:
1037	* @str: a Unicode string.
1038	* @len: length of @str array, or -1 if @str is nul-terminated.
1039	*
1040	* Converts UCS4 string into UTF-8 and runs
1041	* stringprep_utf8_nfkc_normalize().
1042	*
1043	* Return value: a newly allocated Unicode string, that is the NFKC
1044	* normalized form of @str.
1045	**/
1046	uint32_t *
1047	stringprep_ucs4_nfkc_normalize (uint32_t * str, ssize_t len)
1048	{
1049	char *p;
1050	uint32_t *result_wc;
1051
1052	p = stringprep_ucs4_to_utf8 (str, len, `0`, `0`);
1053	result_wc = _g_utf8_normalize_wc (p, -`1`, G_NORMALIZE_NFKC);
1054	free (p);
1055
1056	return result_wc;
1057	}
1058

/* Browse the source code of glibc_src_2.26/libidn/nfkc.c */