punycode.c source code [glibc_src_2.24/libidn/punycode.c]

/* punycode.c	Implementation of punycode used to ASCII encode IDN's.
 * Copyright (C) 2002, 2003  Simon Josefsson
 *
 * This file is part of GNU Libidn.
 *
 * GNU Libidn is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * GNU Libidn is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GNU Libidn; if not, see <http://www.gnu.org/licenses/>.
 */
19
20	/*
21	* This file is derived from RFC 3492bis written by Adam M. Costello.
22	*
23	* Disclaimer and license: Regarding this entire document or any
24	* portion of it (including the pseudocode and C code), the author
25	* makes no guarantees and is not responsible for any damage resulting
26	* from its use. The author grants irrevocable permission to anyone
27	* to use, modify, and distribute it in any way that does not diminish
28	* the rights of anyone else to use, modify, and distribute it,
29	* provided that redistributed derivative works do not contain
30	* misleading author or version information. Derivative works need
31	* not be licensed under similar terms.
32	*
33	* Copyright (C) The Internet Society (2003). All Rights Reserved.
34	*
35	* This document and translations of it may be copied and furnished to
36	* others, and derivative works that comment on or otherwise explain it
37	* or assist in its implementation may be prepared, copied, published
38	* and distributed, in whole or in part, without restriction of any
39	* kind, provided that the above copyright notice and this paragraph are
40	* included on all such copies and derivative works. However, this
41	* document itself may not be modified in any way, such as by removing
42	* the copyright notice or references to the Internet Society or other
43	* Internet organizations, except as needed for the purpose of
44	* developing Internet standards in which case the procedures for
45	* copyrights defined in the Internet Standards process must be
46	* followed, or as required to translate it into languages other than
47	* English.
48	*
49	* The limited permissions granted above are perpetual and will not be
50	* revoked by the Internet Society or its successors or assigns.
51	*
52	* This document and the information contained herein is provided on an
53	* "AS IS" basis and THE INTERNET SOCIETY AND THE INTERNET ENGINEERING
54	* TASK FORCE DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, INCLUDING
55	* BUT NOT LIMITED TO ANY WARRANTY THAT THE USE OF THE INFORMATION
56	* HEREIN WILL NOT INFRINGE ANY RIGHTS OR ANY IMPLIED WARRANTIES OF
57	* MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
58	*/
59
60	#include <string.h>
61
62	#include "punycode.h"
63
/*** Bootstring parameters for Punycode ***/

/* base         - number of distinct digits (a-z and 0-9).            */
/* tmin, tmax   - lower and upper clamp for the per-digit threshold.  */
/* skew, damp   - constants used by the bias adaptation function.     */
/* initial_bias - bias in effect before the first delta is coded.     */
/* initial_n    - first non-basic code point (0x80).                  */
/* delimiter    - separates the basic code points from the deltas;    */
/*                0x2D is ASCII '-'.                                  */
enum
{ base = 36, tmin = 1, tmax = 26, skew = 38, damp = 700,
  initial_bias = 72, initial_n = 0x80, delimiter = 0x2D
};
70
/* basic(cp) tests whether cp is a basic code point, i.e. below 0x80. */
/* The cast to the unsigned punycode_uint makes any negative argument */
/* compare as a large value, so it is reported as non-basic.          */
#define basic(cp) ((punycode_uint)(cp) < 0x80)
73
/* delim(cp) tests whether cp is the delimiter code point: */
#define delim(cp) ((cp) == delimiter)
76
77	/ decode_digit(cp) returns the numeric value of a basic code /
78	/ point (for use in representing integers) in the range 0 to /
79	/ base-1, or base if cp does not represent a value. /
80
81	static punycode_uint
82	decode_digit (punycode_uint cp)
83	{
84	return cp - `48` < `10` ? cp - `22` : cp - `65` < `26` ? cp - `65` :
85	cp - `97` < `26` ? cp - `97` : base;
86	}
87
88	/ encode_digit(d,flag) returns the basic code point whose value /
89	/ (when used for representing integers) is d, which needs to be in /
90	/ the range 0 to base-1. The lowercase form is used unless flag is /
91	/ nonzero, in which case the uppercase form is used. The behavior /
92	/ is undefined if flag is nonzero and digit d has no uppercase form. /
93
94	static char
95	encode_digit (punycode_uint d, int flag)
96	{
97	return d + `22` + `75` * (d < `26`) - ((flag != `0`) << `5`);
98	/ 0..25 map to ASCII a..z or A..Z /
99	/ 26..35 map to ASCII 0..9 /
100	}
101
/* flagged(bcp) tests whether a basic code point is flagged */
/* (uppercase, i.e. in 'A'..'Z').  The behavior is undefined */
/* if bcp is not a basic code point.                         */

#define flagged(bcp) ((punycode_uint)(bcp) - 65 < 26)
107
108	/ encode_basic(bcp,flag) forces a basic code point to lowercase /
109	/ if flag is zero, uppercase if flag is nonzero, and returns /
110	/ the resulting code point. The code point is unchanged if it /
111	/ is caseless. The behavior is undefined if bcp is not a basic /
112	/ code point. /
113
114	static char
115	encode_basic (punycode_uint bcp, int flag)
116	{
117	bcp -= (bcp - `97` < `26`) << `5`;
118	return bcp + ((!flag && (bcp - `65` < `26`)) << `5`);
119	}
120
121	/ Platform-specific constants /
122
123	/ maxint is the maximum value of a punycode_uint variable: /
124	static const punycode_uint maxint = -`1`;
125	/ Because maxint is unsigned, -1 becomes the maximum value. /
126
127	/ Bias adaptation function /
128
129	static punycode_uint
130	adapt (punycode_uint delta, punycode_uint numpoints, int firsttime)
131	{
132	punycode_uint k;
133
134	delta = firsttime ? delta / damp : delta >> `1`;
135	/ delta >> 1 is a faster way of doing delta / 2 /
136	delta += delta / numpoints;
137
138	for (k = `0`; delta > ((base - tmin) * tmax) / `2`; k += base)
139	{
140	delta /= base - tmin;
141	}
142
143	return k + (base - tmin + `1`) * delta / (delta + skew);
144	}
145
146	/ Main encode function /
147
148	/**
149	* punycode_encode:
150	* @input_length: The number of code points in the @input array and
151	* the number of flags in the @case_flags array.
152	* @input: An array of code points. They are presumed to be Unicode
153	* code points, but that is not strictly REQUIRED. The array
154	* contains code points, not code units. UTF-16 uses code units
155	* D800 through DFFF to refer to code points 10000..10FFFF. The
156	* code points D800..DFFF do not occur in any valid Unicode string.
157	* The code points that can occur in Unicode strings (0..D7FF and
158	* E000..10FFFF) are also called Unicode scalar values.
159	* @case_flags: A %NULL pointer or an array of boolean values parallel
160	* to the @input array. Nonzero (true, flagged) suggests that the
161	* corresponding Unicode character be forced to uppercase after
162	* being decoded (if possible), and zero (false, unflagged) suggests
163	* that it be forced to lowercase (if possible). ASCII code points
164	* (0..7F) are encoded literally, except that ASCII letters are
165	* forced to uppercase or lowercase according to the corresponding
166	* case flags. If @case_flags is a %NULL pointer then ASCII letters
167	* are left as they are, and other code points are treated as
168	* unflagged.
169	* @output_length: The caller passes in the maximum number of ASCII
170	* code points that it can receive. On successful return it will
171	* contain the number of ASCII code points actually output.
172	* @output: An array of ASCII code points. It is not
173	* null-terminated; it will contain zeros if and only if the @input
174	* contains zeros. (Of course the caller can leave room for a
175	* terminator and add one if needed.)
176	*
177	* Converts a sequence of code points (presumed to be Unicode code
178	* points) to Punycode.
179	*
180	* Return value: The return value can be any of the punycode_status
181	* values defined above except %punycode_bad_input. If not
182	* %punycode_success, then @output_size and @output might contain
183	* garbage.
184	**/
185	int
186	punycode_encode (size_t input_length,
187	const punycode_uint input[],
188	const unsigned char case_flags[],
189	size_t * output_length, char output[])
190	{
191	punycode_uint input_len, n, delta, h, b, bias, j, m, q, k, t;
192	size_t out, max_out;
193
194	/ The Punycode spec assumes that the input length is the same type /
195	/ of integer as a code point, so we need to convert the size_t to /
196	/ a punycode_uint, which could overflow. /
197
198	if (input_length > maxint)
199	return punycode_overflow;
200	input_len = (punycode_uint) input_length;
201
202	/ Initialize the state: /
203
204	n = initial_n;
205	delta = `0`;
206	out = `0`;
207	max_out = *output_length;
208	bias = initial_bias;
209
210	/ Handle the basic code points: /
211
212	for (j = `0`; j < input_len; ++j)
213	{
214	if (basic (input[j]))
215	{
216	if (max_out - out < `2`)
217	return punycode_big_output;
218	output[out++] = case_flags ?
219	encode_basic (input[j], case_flags[j]) : (char) input[j];
220	}
221	/ else if (input[j] < n) return punycode_bad_input; /
222	/ (not needed for Punycode with unsigned code points) /
223	}
224
225	h = b = (punycode_uint) out;
226	/ cannot overflow because out <= input_len <= maxint /
227
228	/ h is the number of code points that have been handled, b is the /
229	/ number of basic code points, and out is the number of ASCII code /
230	/ points that have been output. /
231
232	if (b > `0`)
233	output[out++] = delimiter;
234
235	/ Main encoding loop: /
236
237	while (h < input_len)
238	{
239	/ All non-basic code points < n have been /
240	/ handled already. Find the next larger one: /
241
242	for (m = maxint, j = `0`; j < input_len; ++j)
243	{
244	/ if (basic(input[j])) continue; /
245	/ (not needed for Punycode) /
246	if (input[j] >= n && input[j] < m)
247	m = input[j];
248	}
249
250	/ Increase delta enough to advance the decoder's /
251	/ <n,i> state to <m,0>, but guard against overflow: /
252
253	if (m - n > (maxint - delta) / (h + `1`))
254	return punycode_overflow;
255	delta += (m - n) * (h + `1`);
256	n = m;
257
258	for (j = `0`; j < input_len; ++j)
259	{
260	/ Punycode does not need to check whether input[j] is basic: /
261	if (input[j] < n / \|\| basic(input[j]) / )
262	{
263	if (++delta == `0`)
264	return punycode_overflow;
265	}
266
267	if (input[j] == n)
268	{
269	/ Represent delta as a generalized variable-length integer: /
270
271	for (q = delta, k = base;; k += base)
272	{
273	if (out >= max_out)
274	return punycode_big_output;
275	t = k <= bias / + tmin / ? tmin : / +tmin not needed /
276	k >= bias + tmax ? tmax : k - bias;
277	if (q < t)
278	break;
279	output[out++] = encode_digit (t + (q - t) % (base - t), `0`);
280	q = (q - t) / (base - t);
281	}
282
283	output[out++] = encode_digit (q, case_flags && case_flags[j]);
284	bias = adapt (delta, h + `1`, h == b);
285	delta = `0`;
286	++h;
287	}
288	}
289
290	++delta, ++n;
291	}
292
293	*output_length = out;
294	return punycode_success;
295	}
296
297	/ Main decode function /
298
299	/**
300	* punycode_decode:
301	* @input_length: The number of ASCII code points in the @input array.
302	* @input: An array of ASCII code points (0..7F).
303	* @output_length: The caller passes in the maximum number of code
304	* points that it can receive into the @output array (which is also
305	* the maximum number of flags that it can receive into the
306	* @case_flags array, if @case_flags is not a %NULL pointer). On
307	* successful return it will contain the number of code points
308	* actually output (which is also the number of flags actually
309	* output, if case_flags is not a null pointer). The decoder will
310	* never need to output more code points than the number of ASCII
311	* code points in the input, because of the way the encoding is
312	* defined. The number of code points output cannot exceed the
313	* maximum possible value of a punycode_uint, even if the supplied
314	* @output_length is greater than that.
315	* @output: An array of code points like the input argument of
316	* punycode_encode() (see above).
317	* @case_flags: A %NULL pointer (if the flags are not needed by the
318	* caller) or an array of boolean values parallel to the @output
319	* array. Nonzero (true, flagged) suggests that the corresponding
320	* Unicode character be forced to uppercase by the caller (if
321	* possible), and zero (false, unflagged) suggests that it be forced
322	* to lowercase (if possible). ASCII code points (0..7F) are output
323	* already in the proper case, but their flags will be set
324	* appropriately so that applying the flags would be harmless.
325	*
326	* Converts Punycode to a sequence of code points (presumed to be
327	* Unicode code points).
328	*
329	* Return value: The return value can be any of the punycode_status
330	* values defined above. If not %punycode_success, then
331	* @output_length, @output, and @case_flags might contain garbage.
332	*
333	**/
334	int
335	punycode_decode (size_t input_length,
336	const char input[],
337	size_t * output_length,
338	punycode_uint output[], unsigned char case_flags[])
339	{
340	punycode_uint n, out, i, max_out, bias, oldi, w, k, digit, t;
341	size_t b, j, in;
342
343	/ Initialize the state: /
344
345	n = initial_n;
346	out = i = `0`;
347	max_out = *output_length > maxint ? maxint
348	: (punycode_uint) * output_length;
349	bias = initial_bias;
350
351	/ Handle the basic code points: Let b be the number of input code /
352	/ points before the last delimiter, or 0 if there is none, then /
353	/ copy the first b code points to the output. /
354
355	for (b = j = `0`; j < input_length; ++j)
356	if (delim (input[j]))
357	b = j;
358	if (b > max_out)
359	return punycode_big_output;
360
361	for (j = `0`; j < b; ++j)
362	{
363	if (case_flags)
364	case_flags[out] = flagged (input[j]);
365	if (!basic (input[j]))
366	return punycode_bad_input;
367	output[out++] = input[j];
368	}
369
370	/ Main decoding loop: Start just after the last delimiter if any /
371	/ basic code points were copied; start at the beginning otherwise. /
372
373	for (in = b > `0` ? b + `1` : `0`; in < input_length; ++out)
374	{
375
376	/ in is the index of the next ASCII code point to be consumed, /
377	/ and out is the number of code points in the output array. /
378
379	/ Decode a generalized variable-length integer into delta, /
380	/ which gets added to i. The overflow checking is easier /
381	/ if we increase i as we go, then subtract off its starting /
382	/ value at the end to obtain delta. /
383
384	for (oldi = i, w = `1`, k = base;; k += base)
385	{
386	if (in >= input_length)
387	return punycode_bad_input;
388	digit = decode_digit (input[in++]);
389	if (digit >= base)
390	return punycode_bad_input;
391	if (digit > (maxint - i) / w)
392	return punycode_overflow;
393	i += digit * w;
394	t = k <= bias / + tmin / ? tmin : / +tmin not needed /
395	k >= bias + tmax ? tmax : k - bias;
396	if (digit < t)
397	break;
398	if (w > maxint / (base - t))
399	return punycode_overflow;
400	w *= (base - t);
401	}
402
403	bias = adapt (i - oldi, out + `1`, oldi == `0`);
404
405	/ i was supposed to wrap around from out+1 to 0, /
406	/ incrementing n each time, so we'll fix that now: /
407
408	if (i / (out + `1`) > maxint - n)
409	return punycode_overflow;
410	n += i / (out + `1`);
411	i %= (out + `1`);
412
413	/ Insert n at position i of the output: /
414
415	/ not needed for Punycode: /
416	/ if (basic(n)) return punycode_invalid_input; /
417	if (out >= max_out)
418	return punycode_big_output;
419
420	if (case_flags)
421	{
422	memmove (case_flags + i + `1`, case_flags + i, out - i);
423	/ Case of last ASCII code point determines case flag: /
424	case_flags[i] = flagged (input[in - `1`]);
425	}
426
427	memmove (output + i + `1`, output + i, (out - i) * sizeof *output);
428	output[i++] = n;
429	}
430
431	*output_length = (size_t) out;
432	/ cannot overflow because out <= old value of output_length /*
433	return punycode_success;
434	}
435
436	/**
437	* punycode_uint
438	*
439	* Unicode code point data type, this is always a 32 bit unsigned
440	* integer.
441	*/
442
443	/**
444	* Punycode_status
445	* @PUNYCODE_SUCCESS: Successful operation. This value is guaranteed
446	* to always be zero, the remaining ones are only guaranteed to hold
447	* non-zero values, for logical comparison purposes.
448	* @PUNYCODE_BAD_INPUT: Input is invalid.
449	* @PUNYCODE_BIG_OUTPUT: Output would exceed the space provided.
450	* @PUNYCODE_OVERFLOW: Input needs wider integers to process.
451	*
452	* Enumerated return codes of punycode_encode() and punycode_decode().
453	* The value 0 is guaranteed to always correspond to success.
454	*/
455

Browse the source code of glibc_src_2.24/libidn/punycode.c