00001 /* punycode.c --- Implementation of punycode used to ASCII encode IDN's. 00002 * Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010 00003 * Simon Josefsson 00004 * 00005 * This file is part of GNU Libidn. 00006 * 00007 * GNU Libidn is free software; you can redistribute it and/or 00008 * modify it under the terms of the GNU Lesser General Public 00009 * License as published by the Free Software Foundation; either 00010 * version 2.1 of the License, or (at your option) any later version. 00011 * 00012 * GNU Libidn is distributed in the hope that it will be useful, 00013 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00014 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00015 * Lesser General Public License for more details. 00016 * 00017 * You should have received a copy of the GNU Lesser General Public 00018 * License along with GNU Libidn; if not, write to the Free Software 00019 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA 00020 * 00021 */ 00022 00023 /* 00024 * This file is derived from RFC 3492bis written by Adam M. Costello. 00025 * 00026 * Disclaimer and license: Regarding this entire document or any 00027 * portion of it (including the pseudocode and C code), the author 00028 * makes no guarantees and is not responsible for any damage resulting 00029 * from its use. The author grants irrevocable permission to anyone 00030 * to use, modify, and distribute it in any way that does not diminish 00031 * the rights of anyone else to use, modify, and distribute it, 00032 * provided that redistributed derivative works do not contain 00033 * misleading author or version information. Derivative works need 00034 * not be licensed under similar terms. 00035 * 00036 * Copyright (C) The Internet Society (2003). All Rights Reserved. 00037 * 00038 * This document and translations of it may be copied and furnished to 00039 * others, and derivative works that comment on or otherwise explain it 00040 * or assist in its implementation may be prepared, copied, published 00041 * and distributed, in whole or in part, without restriction of any 00042 * kind, provided that the above copyright notice and this paragraph are 00043 * included on all such copies and derivative works. However, this 00044 * document itself may not be modified in any way, such as by removing 00045 * the copyright notice or references to the Internet Society or other 00046 * Internet organizations, except as needed for the purpose of 00047 * developing Internet standards in which case the procedures for 00048 * copyrights defined in the Internet Standards process must be 00049 * followed, or as required to translate it into languages other than 00050 * English. 00051 * 00052 * The limited permissions granted above are perpetual and will not be 00053 * revoked by the Internet Society or its successors or assigns. 00054 * 00055 * This document and the information contained herein is provided on an 00056 * "AS IS" basis and THE INTERNET SOCIETY AND THE INTERNET ENGINEERING 00057 * TASK FORCE DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, INCLUDING 00058 * BUT NOT LIMITED TO ANY WARRANTY THAT THE USE OF THE INFORMATION 00059 * HEREIN WILL NOT INFRINGE ANY RIGHTS OR ANY IMPLIED WARRANTIES OF 00060 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. 00061 */ 00062 00063 #include <config.h> 00064 #include <string.h> 00065 00066 #include "punycode.h" 00067 00068 /*** Bootstring parameters for Punycode ***/ 00069 00070 enum 00071 { base = 36, tmin = 1, tmax = 26, skew = 38, damp = 700, 00072 initial_bias = 72, initial_n = 0x80, delimiter = 0x2D 00073 }; 00074 00075 /* basic(cp) tests whether cp is a basic code point: */ 00076 #define basic(cp) ((punycode_uint)(cp) < 0x80) 00077 00078 /* delim(cp) tests whether cp is a delimiter: */ 00079 #define delim(cp) ((cp) == delimiter) 00080 00081 /* decode_digit(cp) returns the numeric value of a basic code */ 00082 /* point (for use in representing integers) in the range 0 to */ 00083 /* base-1, or base if cp does not represent a value. */ 00084 00085 static punycode_uint 00086 decode_digit (punycode_uint cp) 00087 { 00088 return cp - 48 < 10 ? cp - 22 : cp - 65 < 26 ? cp - 65 : 00089 cp - 97 < 26 ? cp - 97 : base; 00090 } 00091 00092 /* encode_digit(d,flag) returns the basic code point whose value */ 00093 /* (when used for representing integers) is d, which needs to be in */ 00094 /* the range 0 to base-1. The lowercase form is used unless flag is */ 00095 /* nonzero, in which case the uppercase form is used. The behavior */ 00096 /* is undefined if flag is nonzero and digit d has no uppercase form. */ 00097 00098 static char 00099 encode_digit (punycode_uint d, int flag) 00100 { 00101 return d + 22 + 75 * (d < 26) - ((flag != 0) << 5); 00102 /* 0..25 map to ASCII a..z or A..Z */ 00103 /* 26..35 map to ASCII 0..9 */ 00104 } 00105 00106 /* flagged(bcp) tests whether a basic code point is flagged */ 00107 /* (uppercase). The behavior is undefined if bcp is not a */ 00108 /* basic code point. */ 00109 00110 #define flagged(bcp) ((punycode_uint)(bcp) - 65 < 26) 00111 00112 /* encode_basic(bcp,flag) forces a basic code point to lowercase */ 00113 /* if flag is zero, uppercase if flag is nonzero, and returns */ 00114 /* the resulting code point. The code point is unchanged if it */ 00115 /* is caseless. The behavior is undefined if bcp is not a basic */ 00116 /* code point. */ 00117 00118 static char 00119 encode_basic (punycode_uint bcp, int flag) 00120 { 00121 bcp -= (bcp - 97 < 26) << 5; 00122 return bcp + ((!flag && (bcp - 65 < 26)) << 5); 00123 } 00124 00125 /*** Platform-specific constants ***/ 00126 00127 /* maxint is the maximum value of a punycode_uint variable: */ 00128 static const punycode_uint maxint = -1; 00129 /* Because maxint is unsigned, -1 becomes the maximum value. */ 00130 00131 /*** Bias adaptation function ***/ 00132 00133 static punycode_uint 00134 adapt (punycode_uint delta, punycode_uint numpoints, int firsttime) 00135 { 00136 punycode_uint k; 00137 00138 delta = firsttime ? delta / damp : delta >> 1; 00139 /* delta >> 1 is a faster way of doing delta / 2 */ 00140 delta += delta / numpoints; 00141 00142 for (k = 0; delta > ((base - tmin) * tmax) / 2; k += base) 00143 { 00144 delta /= base - tmin; 00145 } 00146 00147 return k + (base - tmin + 1) * delta / (delta + skew); 00148 } 00149 00150 /*** Main encode function ***/ 00151 00189 int 00190 punycode_encode (size_t input_length, 00191 const punycode_uint input[], 00192 const unsigned char case_flags[], 00193 size_t * output_length, char output[]) 00194 { 00195 punycode_uint input_len, n, delta, h, b, bias, j, m, q, k, t; 00196 size_t out, max_out; 00197 00198 /* The Punycode spec assumes that the input length is the same type */ 00199 /* of integer as a code point, so we need to convert the size_t to */ 00200 /* a punycode_uint, which could overflow. */ 00201 00202 if (input_length > maxint) 00203 return punycode_overflow; 00204 input_len = (punycode_uint) input_length; 00205 00206 /* Initialize the state: */ 00207 00208 n = initial_n; 00209 delta = 0; 00210 out = 0; 00211 max_out = *output_length; 00212 bias = initial_bias; 00213 00214 /* Handle the basic code points: */ 00215 00216 for (j = 0; j < input_len; ++j) 00217 { 00218 if (basic (input[j])) 00219 { 00220 if (max_out - out < 2) 00221 return punycode_big_output; 00222 output[out++] = case_flags ? 00223 encode_basic (input[j], case_flags[j]) : (char) input[j]; 00224 } 00225 /* else if (input[j] < n) return punycode_bad_input; */ 00226 /* (not needed for Punycode with unsigned code points) */ 00227 } 00228 00229 h = b = (punycode_uint) out; 00230 /* cannot overflow because out <= input_len <= maxint */ 00231 00232 /* h is the number of code points that have been handled, b is the */ 00233 /* number of basic code points, and out is the number of ASCII code */ 00234 /* points that have been output. */ 00235 00236 if (b > 0) 00237 output[out++] = delimiter; 00238 00239 /* Main encoding loop: */ 00240 00241 while (h < input_len) 00242 { 00243 /* All non-basic code points < n have been */ 00244 /* handled already. Find the next larger one: */ 00245 00246 for (m = maxint, j = 0; j < input_len; ++j) 00247 { 00248 /* if (basic(input[j])) continue; */ 00249 /* (not needed for Punycode) */ 00250 if (input[j] >= n && input[j] < m) 00251 m = input[j]; 00252 } 00253 00254 /* Increase delta enough to advance the decoder's */ 00255 /* <n,i> state to <m,0>, but guard against overflow: */ 00256 00257 if (m - n > (maxint - delta) / (h + 1)) 00258 return punycode_overflow; 00259 delta += (m - n) * (h + 1); 00260 n = m; 00261 00262 for (j = 0; j < input_len; ++j) 00263 { 00264 /* Punycode does not need to check whether input[j] is basic: */ 00265 if (input[j] < n /* || basic(input[j]) */ ) 00266 { 00267 if (++delta == 0) 00268 return punycode_overflow; 00269 } 00270 00271 if (input[j] == n) 00272 { 00273 /* Represent delta as a generalized variable-length integer: */ 00274 00275 for (q = delta, k = base;; k += base) 00276 { 00277 if (out >= max_out) 00278 return punycode_big_output; 00279 t = k <= bias /* + tmin */ ? tmin : /* +tmin not needed */ 00280 k >= bias + tmax ? tmax : k - bias; 00281 if (q < t) 00282 break; 00283 output[out++] = encode_digit (t + (q - t) % (base - t), 0); 00284 q = (q - t) / (base - t); 00285 } 00286 00287 output[out++] = encode_digit (q, case_flags && case_flags[j]); 00288 bias = adapt (delta, h + 1, h == b); 00289 delta = 0; 00290 ++h; 00291 } 00292 } 00293 00294 ++delta, ++n; 00295 } 00296 00297 *output_length = out; 00298 return punycode_success; 00299 } 00300 00301 /*** Main decode function ***/ 00302 00338 int 00339 punycode_decode (size_t input_length, 00340 const char input[], 00341 size_t * output_length, 00342 punycode_uint output[], unsigned char case_flags[]) 00343 { 00344 punycode_uint n, out, i, max_out, bias, oldi, w, k, digit, t; 00345 size_t b, j, in; 00346 00347 /* Initialize the state: */ 00348 00349 n = initial_n; 00350 out = i = 0; 00351 max_out = *output_length > maxint ? maxint 00352 : (punycode_uint) * output_length; 00353 bias = initial_bias; 00354 00355 /* Handle the basic code points: Let b be the number of input code */ 00356 /* points before the last delimiter, or 0 if there is none, then */ 00357 /* copy the first b code points to the output. */ 00358 00359 for (b = j = 0; j < input_length; ++j) 00360 if (delim (input[j])) 00361 b = j; 00362 if (b > max_out) 00363 return punycode_big_output; 00364 00365 for (j = 0; j < b; ++j) 00366 { 00367 if (case_flags) 00368 case_flags[out] = flagged (input[j]); 00369 if (!basic (input[j])) 00370 return punycode_bad_input; 00371 output[out++] = input[j]; 00372 } 00373 00374 /* Main decoding loop: Start just after the last delimiter if any */ 00375 /* basic code points were copied; start at the beginning otherwise. */ 00376 00377 for (in = b > 0 ? b + 1 : 0; in < input_length; ++out) 00378 { 00379 00380 /* in is the index of the next ASCII code point to be consumed, */ 00381 /* and out is the number of code points in the output array. */ 00382 00383 /* Decode a generalized variable-length integer into delta, */ 00384 /* which gets added to i. The overflow checking is easier */ 00385 /* if we increase i as we go, then subtract off its starting */ 00386 /* value at the end to obtain delta. */ 00387 00388 for (oldi = i, w = 1, k = base;; k += base) 00389 { 00390 if (in >= input_length) 00391 return punycode_bad_input; 00392 digit = decode_digit (input[in++]); 00393 if (digit >= base) 00394 return punycode_bad_input; 00395 if (digit > (maxint - i) / w) 00396 return punycode_overflow; 00397 i += digit * w; 00398 t = k <= bias /* + tmin */ ? tmin : /* +tmin not needed */ 00399 k >= bias + tmax ? tmax : k - bias; 00400 if (digit < t) 00401 break; 00402 if (w > maxint / (base - t)) 00403 return punycode_overflow; 00404 w *= (base - t); 00405 } 00406 00407 bias = adapt (i - oldi, out + 1, oldi == 0); 00408 00409 /* i was supposed to wrap around from out+1 to 0, */ 00410 /* incrementing n each time, so we'll fix that now: */ 00411 00412 if (i / (out + 1) > maxint - n) 00413 return punycode_overflow; 00414 n += i / (out + 1); 00415 i %= (out + 1); 00416 00417 /* Insert n at position i of the output: */ 00418 00419 /* not needed for Punycode: */ 00420 /* if (basic(n)) return punycode_invalid_input; */ 00421 if (out >= max_out) 00422 return punycode_big_output; 00423 00424 if (case_flags) 00425 { 00426 memmove (case_flags + i + 1, case_flags + i, out - i); 00427 /* Case of last ASCII code point determines case flag: */ 00428 case_flags[i] = flagged (input[in - 1]); 00429 } 00430 00431 memmove (output + i + 1, output + i, (out - i) * sizeof *output); 00432 output[i++] = n; 00433 } 00434 00435 *output_length = (size_t) out; 00436 /* cannot overflow because out <= old value of *output_length */ 00437 return punycode_success; 00438 } 00439