/* libuv-svace-build
 * 560 lines, 12.5 KB
 */
/* Copyright libuv contributors. All rights reserved.
 *
 * Permission to use, copy, modify, and/or distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

/* Derived from https://github.com/bnoordhuis/punycode
 * but updated to support IDNA 2008.
 */

#include "uv.h"
#include "uv-common.h"
#include "idna.h"
#include <assert.h>
#include <string.h>
#include <limits.h> /* UINT_MAX */

/* Decode one code point from the NUL-terminated WTF-8 string at |*input|.
 *
 * Returns the code point (0..0x10FFFF; surrogate code points are allowed,
 * which is what distinguishes WTF-8 from UTF-8) or -1 when the bytes do not
 * form a valid sequence.
 *
 * On success |*input| is left pointing at the LAST byte of the decoded
 * sequence (it is not moved at all for ASCII), so callers step to the next
 * character with one extra increment. On failure |*input| points at the
 * byte that was rejected.
 *
 * A NUL terminator never matches the continuation-byte pattern, so decoding
 * stops with -1 instead of reading past the end of the string.
 */
static int32_t uv__wtf8_decode1(const char** input) {
  uint32_t code_point;
  uint8_t b1;
  uint8_t b2;
  uint8_t b3;
  uint8_t b4;

  b1 = **input;
  if (b1 <= 0x7F)
    return b1; /* ASCII code point */
  if (b1 < 0xC2)
    return -1; /* invalid: continuation byte (or overlong two-byte lead) */
  code_point = b1;

  b2 = *++*input;
  if ((b2 & 0xC0) != 0x80)
    return -1; /* invalid: not a continuation byte */
  code_point = (code_point << 6) | (b2 & 0x3F);
  if (b1 <= 0xDF)
    return 0x7FF & code_point; /* two-byte character */

  b3 = *++*input;
  if ((b3 & 0xC0) != 0x80)
    return -1; /* invalid: not a continuation byte */
  code_point = (code_point << 6) | (b3 & 0x3F);
  if (b1 <= 0xEF) {
    code_point &= 0xFFFF;
    if (code_point < 0x800)
      return -1; /* invalid: overlong three-byte encoding */
    return code_point; /* three-byte character */
  }

  b4 = *++*input;
  if ((b4 & 0xC0) != 0x80)
    return -1; /* invalid: not a continuation byte */
  code_point = (code_point << 6) | (b4 & 0x3F);
  if (b1 <= 0xF4) {
    code_point &= 0x1FFFFF;
    if (code_point < 0x10000)
      return -1; /* invalid: overlong four-byte encoding */
    if (code_point <= 0x10FFFF)
      return code_point; /* four-byte character */
  }

  /* code point too large */
  return -1;
}

/* Decode the remainder of a multi-byte UTF-8 sequence.
 *
 * |a| is the lead byte, already consumed by the caller; |*p| points at the
 * byte after it and |pe| is one past the end of the input.
 *
 * Returns the decoded code point, or (unsigned) -1 (i.e. UINT_MAX) when the
 * sequence is malformed: invalid lead byte, truncated input, a byte that is
 * not a continuation byte, an overlong encoding, a value above U+10FFFF or
 * a surrogate. |*p| is advanced past the continuation bytes that were read;
 * nothing is read when the input is too short for the lead byte.
 */
static unsigned uv__utf8_decode1_slow(const char** p,
                                      const char* pe,
                                      unsigned a) {
  unsigned b;
  unsigned c;
  unsigned d;
  unsigned min;

  if (a > 0xF7)
    return -1;  /* Not a valid lead byte. */

  /* Shorter sequences are padded with synthetic 0x80 "continuation" bytes
   * so that all three lengths share the validation and assembly code below.
   * The number of bytes required is decided by the lead byte alone; a
   * sequence cut short by |pe| is an error, never a shorter character.
   */
  if (a > 0xEF) {
    if (pe - *p < 3)
      return -1;  /* Truncated four-byte sequence. */
    min = 0x10000;
    a = a & 7;
    b = (unsigned char) *(*p)++;
    c = (unsigned char) *(*p)++;
    d = (unsigned char) *(*p)++;
  } else if (a > 0xDF) {
    if (pe - *p < 2)
      return -1;  /* Truncated three-byte sequence. */
    min = 0x800;
    b = 0x80 | (a & 15);
    c = (unsigned char) *(*p)++;
    d = (unsigned char) *(*p)++;
    a = 0;
  } else if (a > 0xBF) {
    if (pe - *p < 1)
      return -1;  /* Truncated two-byte sequence. */
    min = 0x80;
    b = 0x80;
    c = 0x80 | (a & 31);
    d = (unsigned char) *(*p)++;
    a = 0;
  } else {
    return -1;  /* Invalid continuation byte. */
  }

  /* Every byte must individually look like 10xxxxxx. */
  if ((b & 0xC0) != 0x80 || (c & 0xC0) != 0x80 || (d & 0xC0) != 0x80)
    return -1;  /* Invalid sequence. */

  b &= 63;
  c &= 63;
  d &= 63;
  a = (a << 18) | (b << 12) | (c << 6) | d;

  if (a < min)
    return -1;  /* Overlong sequence. */

  if (a > 0x10FFFF)
    return -1;  /* Four-byte sequence > U+10FFFF. */

  if (a >= 0xD800 && a <= 0xDFFF)
    return -1;  /* Surrogate pair. */

  return a;
}

/* Decode one UTF-8 code point from the range [*p, pe) and advance |*p|.
 *
 * The range must not be empty (asserted). ASCII bytes are returned directly;
 * anything else is handed to uv__utf8_decode1_slow() together with the lead
 * byte that was just consumed. Returns the code point, or whatever error
 * value the slow path returns (callers in this file compare it against
 * UINT_MAX).
 */
unsigned uv__utf8_decode1(const char** p, const char* pe) {
  unsigned a;

  assert(*p < pe);

  a = (unsigned char) *(*p)++;

  if (a < 128)
    return a;  /* ASCII, common case. */

  return uv__utf8_decode1_slow(p, pe, a);
}

152static int uv__idna_toascii_label(const char* s, const char* se,153char** d, char* de) {154static const char alphabet[] = "abcdefghijklmnopqrstuvwxyz0123456789";155const char* ss;156unsigned c;157unsigned h;158unsigned k;159unsigned n;160unsigned m;161unsigned q;162unsigned t;163unsigned x;164unsigned y;165unsigned bias;166unsigned delta;167unsigned todo;168int first;169
170h = 0;171ss = s;172todo = 0;173
174/* Note: after this loop we've visited all UTF-8 characters and know175* they're legal so we no longer need to check for decode errors.
176*/
177while (s < se) {178c = uv__utf8_decode1(&s, se);179
180if (c == UINT_MAX)181return UV_EINVAL;182
183if (c < 128)184h++;185else186todo++;187}188
189/* Only write "xn--" when there are non-ASCII characters. */190if (todo > 0) {191if (*d < de) *(*d)++ = 'x';192if (*d < de) *(*d)++ = 'n';193if (*d < de) *(*d)++ = '-';194if (*d < de) *(*d)++ = '-';195}196
197/* Write ASCII characters. */198x = 0;199s = ss;200while (s < se) {201c = uv__utf8_decode1(&s, se);202assert(c != UINT_MAX);203
204if (c > 127)205continue;206
207if (*d < de)208*(*d)++ = c;209
210if (++x == h)211break; /* Visited all ASCII characters. */212}213
214if (todo == 0)215return h;216
217/* Only write separator when we've written ASCII characters first. */218if (h > 0)219if (*d < de)220*(*d)++ = '-';221
222n = 128;223bias = 72;224delta = 0;225first = 1;226
227while (todo > 0) {228m = -1;229s = ss;230
231while (s < se) {232c = uv__utf8_decode1(&s, se);233assert(c != UINT_MAX);234
235if (c >= n)236if (c < m)237m = c;238}239
240x = m - n;241y = h + 1;242
243if (x > ~delta / y)244return UV_E2BIG; /* Overflow. */245
246delta += x * y;247n = m;248
249s = ss;250while (s < se) {251c = uv__utf8_decode1(&s, se);252assert(c != UINT_MAX);253
254if (c < n)255if (++delta == 0)256return UV_E2BIG; /* Overflow. */257
258if (c != n)259continue;260
261for (k = 36, q = delta; /* empty */; k += 36) {262t = 1;263
264if (k > bias)265t = k - bias;266
267if (t > 26)268t = 26;269
270if (q < t)271break;272
273/* TODO(bnoordhuis) Since 1 <= t <= 26 and therefore274* 10 <= y <= 35, we can optimize the long division
275* into a table-based reciprocal multiplication.
276*/
277x = q - t;278y = 36 - t; /* 10 <= y <= 35 since 1 <= t <= 26. */279q = x / y;280t = t + x % y; /* 1 <= t <= 35 because of y. */281
282if (*d < de)283*(*d)++ = alphabet[t];284}285
286if (*d < de)287*(*d)++ = alphabet[q];288
289delta /= 2;290
291if (first) {292delta /= 350;293first = 0;294}295
296/* No overflow check is needed because |delta| was just297* divided by 2 and |delta+delta >= delta + delta/h|.
298*/
299h++;300delta += delta / h;301
302for (bias = 0; delta > 35 * 26 / 2; bias += 36)303delta /= 35;304
305bias += 36 * delta / (delta + 38);306delta = 0;307todo--;308}309
310delta++;311n++;312}313
314return 0;315}
316
317
318ssize_t uv__idna_toascii(const char* s, const char* se, char* d, char* de) {319const char* si;320const char* st;321unsigned c;322char* ds;323int rc;324
325if (s == se)326return UV_EINVAL;327
328ds = d;329
330si = s;331while (si < se) {332st = si;333c = uv__utf8_decode1(&si, se);334
335if (c == UINT_MAX)336return UV_EINVAL;337
338if (c != '.')339if (c != 0x3002) /* 。 */340if (c != 0xFF0E) /* . */341if (c != 0xFF61) /* 。 */342continue;343
344rc = uv__idna_toascii_label(s, st, &d, de);345
346if (rc < 0)347return rc;348
349if (d < de)350*d++ = '.';351
352s = si;353}354
355if (s < se) {356rc = uv__idna_toascii_label(s, se, &d, de);357
358if (rc < 0)359return rc;360}361
362if (d >= de)363return UV_EINVAL;364
365*d++ = '\0';366return d - ds; /* Number of bytes written. */367}
368
369
/* Compute how many UTF-16 code units are needed to hold the NUL-terminated
 * WTF-8 string |source_ptr|.
 *
 * The count INCLUDES the terminating NUL, because the loop also decodes and
 * counts the terminator before it stops. Code points above U+FFFF count as
 * two units (a surrogate pair). Returns -1 when the input is not valid
 * WTF-8.
 */
ssize_t uv_wtf8_length_as_utf16(const char* source_ptr) {
  size_t w_target_len = 0;
  int32_t code_point;

  do {
    code_point = uv__wtf8_decode1(&source_ptr);
    if (code_point < 0)
      return -1;
    if (code_point > 0xFFFF)
      w_target_len++;
    w_target_len++;
    /* The decoder leaves source_ptr on the last byte of the character;
     * test it for NUL, then step to the next character.
     */
  } while (*source_ptr++);

  return w_target_len;
}

/* Convert the NUL-terminated WTF-8 string |source_ptr| to UTF-16, writing
 * the result, including the terminating NUL, to |w_target|.
 *
 * |w_target_len| is the number of UTF-16 code units expected to be written
 * and must be the value returned by uv_wtf8_length_as_utf16() for the same
 * input; this is verified with assertions only. Invalid input is likewise
 * caught only by an assertion, so the input must have been validated by
 * uv_wtf8_length_as_utf16() first.
 */
void uv_wtf8_to_utf16(const char* source_ptr,
                      uint16_t* w_target,
                      size_t w_target_len) {
  int32_t code_point;

  do {
    code_point = uv__wtf8_decode1(&source_ptr);
    /* uv_wtf8_length_as_utf16 should have been called and checked first. */
    assert(code_point >= 0);
    if (code_point > 0xFFFF) {
      /* Supplementary plane (U+10000..U+10FFFF, both inclusive): emit a
       * high/low surrogate pair. The threshold must match the one used by
       * uv_wtf8_length_as_utf16() so that the unit counts agree.
       */
      assert(code_point <= 0x10FFFF);
      *w_target++ = (((code_point - 0x10000) >> 10) + 0xD800);
      *w_target++ = ((code_point - 0x10000) & 0x3FF) + 0xDC00;
      w_target_len -= 2;
    } else {
      *w_target++ = code_point;
      w_target_len -= 1;
    }
  } while (*source_ptr++);

  (void)w_target_len;
  assert(w_target_len == 0);
}

/* Read one code point from the UTF-16 sequence at |w_source_ptr|.
 *
 * |w_source_len| is the number of code units available. The second unit is
 * inspected only when the first is a high surrogate and |w_source_len| is
 * not exactly 1, so a negative length (used by callers for NUL-terminated
 * input) also permits the look-ahead.
 *
 * Returns the combined code point (>= 0x10000) when a high surrogate is
 * followed by a low surrogate, in which case the caller must skip two
 * units. Otherwise returns the first unit unchanged, including unpaired
 * surrogates.
 */
static int32_t uv__get_surrogate_value(const uint16_t* w_source_ptr,
                                       ssize_t w_source_len) {
  uint16_t u;
  uint16_t next;

  u = w_source_ptr[0];
  if (u >= 0xD800 && u <= 0xDBFF && w_source_len != 1) {
    next = w_source_ptr[1];
    if (next >= 0xDC00 && next <= 0xDFFF)
      return 0x10000 + ((u - 0xD800) << 10) + (next - 0xDC00);
  }
  return u;
}

/* Compute how many bytes the UTF-16 input needs when encoded as WTF-8.
 *
 * |w_source_len| is the number of UTF-16 code units to read. When it is
 * negative the input is treated as NUL-terminated: scanning stops at the
 * first NUL code unit and the length is never decremented.
 *
 * The returned byte count does not include a terminating NUL.
 */
size_t uv_utf16_length_as_wtf8(const uint16_t* w_source_ptr,
                               ssize_t w_source_len) {
  size_t target_len;
  int32_t code_point;

  target_len = 0;
  while (w_source_len) {
    code_point = uv__get_surrogate_value(w_source_ptr, w_source_len);
    /* Can be invalid UTF-8 but must be valid WTF-8. */
    assert(code_point >= 0);
    if (w_source_len < 0 && code_point == 0)
      break;
    if (code_point < 0x80)
      target_len += 1;
    else if (code_point < 0x800)
      target_len += 2;
    else if (code_point < 0x10000)
      target_len += 3;
    else {
      target_len += 4;
      /* A surrogate pair occupied two input units; skip the extra one. */
      w_source_ptr++;
      if (w_source_len > 0)
        w_source_len--;
    }
    w_source_ptr++;
    if (w_source_len > 0)
      w_source_len--;
  }

  return target_len;
}

460int uv_utf16_to_wtf8(const uint16_t* w_source_ptr,461ssize_t w_source_len,462char** target_ptr,463size_t* target_len_ptr) {464size_t target_len;465char* target;466char* target_end;467int32_t code_point;468
469/* If *target_ptr is provided, then *target_len_ptr must be its length470* (excluding space for NUL), otherwise we will compute the target_len_ptr
471* length and may return a new allocation in *target_ptr if target_ptr is
472* provided. */
473if (target_ptr == NULL || *target_ptr == NULL) {474target_len = uv_utf16_length_as_wtf8(w_source_ptr, w_source_len);475if (target_len_ptr != NULL)476*target_len_ptr = target_len;477} else {478target_len = *target_len_ptr;479}480
481if (target_ptr == NULL)482return 0;483
484if (*target_ptr == NULL) {485target = uv__malloc(target_len + 1);486if (target == NULL) {487return UV_ENOMEM;488}489*target_ptr = target;490} else {491target = *target_ptr;492}493
494target_end = target + target_len;495
496while (target != target_end && w_source_len) {497code_point = uv__get_surrogate_value(w_source_ptr, w_source_len);498/* Can be invalid UTF-8 but must be valid WTF-8. */499assert(code_point >= 0);500if (w_source_len < 0 && code_point == 0) {501w_source_len = 0;502break;503}504if (code_point < 0x80) {505*target++ = code_point;506} else if (code_point < 0x800) {507*target++ = 0xC0 | (code_point >> 6);508if (target == target_end)509break;510*target++ = 0x80 | (code_point & 0x3F);511} else if (code_point < 0x10000) {512*target++ = 0xE0 | (code_point >> 12);513if (target == target_end)514break;515*target++ = 0x80 | ((code_point >> 6) & 0x3F);516if (target == target_end)517break;518*target++ = 0x80 | (code_point & 0x3F);519} else {520*target++ = 0xF0 | (code_point >> 18);521if (target == target_end)522break;523*target++ = 0x80 | ((code_point >> 12) & 0x3F);524if (target == target_end)525break;526*target++ = 0x80 | ((code_point >> 6) & 0x3F);527if (target == target_end)528break;529*target++ = 0x80 | (code_point & 0x3F);530/* uv__get_surrogate_value consumed 2 input characters */531w_source_ptr++;532if (w_source_len > 0)533w_source_len--;534}535target_len = target - *target_ptr;536w_source_ptr++;537if (w_source_len > 0)538w_source_len--;539}540
541if (target != target_end && target_len_ptr != NULL)542/* Did not fill all of the provided buffer, so update the target_len_ptr543* output with the space used. */
544*target_len_ptr = target - *target_ptr;545
546/* Check if input fit into target exactly. */547if (w_source_len < 0 && target == target_end && w_source_ptr[0] == 0)548w_source_len = 0;549
550*target++ = '\0';551
552/* Characters remained after filling the buffer, compute the remaining length now. */553if (w_source_len) {554if (target_len_ptr != NULL)555*target_len_ptr = target_len + uv_utf16_length_as_wtf8(w_source_ptr, w_source_len);556return UV_ENOBUFS;557}558
559return 0;560}
561