libuv-svace-build

idna.c
560 строк · 12.5 Кб
Перенос по словам
1
/* Copyright libuv contributors. All rights reserved.
 *
 * Permission to use, copy, modify, and/or distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */
15

16
/* Derived from https://github.com/bnoordhuis/punycode
 * but updated to support IDNA 2008.
 */
19

20
#include "uv.h"
21
#include "uv-common.h"
22
#include "idna.h"
23
#include <assert.h>
24
#include <string.h>
25
#include <limits.h> /* UINT_MAX */
26

27

28
/* Decode a single WTF-8 encoded code point starting at |**input|.
 *
 * Surrogate code points (U+D800..U+DFFF) are deliberately not rejected;
 * that is what distinguishes this decoder from a strict UTF-8 one.
 *
 * On return |*input| points at the last byte that was examined, not one
 * past it; callers step over that byte themselves.
 *
 * The input must be NUL-terminated.  A NUL byte is never a continuation
 * byte, so decoding stops at the terminator instead of reading past it.
 *
 * Returns the code point (0..0x10FFFF), or -1 when the bytes do not form a
 * valid sequence: stray or missing continuation byte, overlong encoding,
 * or a value above U+10FFFF.
 */
static int32_t uv__wtf8_decode1(const char** input) {
  uint32_t code_point;
  uint8_t b1;
  uint8_t b2;
  uint8_t b3;
  uint8_t b4;

  b1 = **input;
  if (b1 <= 0x7F)
    return b1; /* ASCII code point */
  if (b1 < 0xC2)
    return -1; /* invalid: continuation byte or overlong two-byte lead */
  code_point = b1;

  b2 = *++*input;
  if ((b2 & 0xC0) != 0x80)
    return -1; /* invalid: not a continuation byte */
  code_point = (code_point << 6) | (b2 & 0x3F);
  if (b1 <= 0xDF)
    return 0x7FF & code_point; /* two-byte character */

  b3 = *++*input;
  if ((b3 & 0xC0) != 0x80)
    return -1; /* invalid: not a continuation byte */
  code_point = (code_point << 6) | (b3 & 0x3F);
  if (b1 <= 0xEF) {
    code_point &= 0xFFFF;
    if (code_point < 0x800)
      return -1; /* invalid: overlong three-byte encoding */
    return code_point; /* three-byte character */
  }

  b4 = *++*input;
  if ((b4 & 0xC0) != 0x80)
    return -1; /* invalid: not a continuation byte */
  code_point = (code_point << 6) | (b4 & 0x3F);
  if (b1 <= 0xF4) {
    code_point &= 0x1FFFFF;
    if (code_point < 0x10000)
      return -1; /* invalid: overlong four-byte encoding */
    if (code_point <= 0x10FFFF)
      return code_point; /* four-byte character */
  }

  /* code point too large */
  return -1;
}
69

70

71
/* Slow path of uv__utf8_decode1(): decode the rest of a multi-byte UTF-8
 * sequence whose lead byte |a| has already been consumed.
 *
 * |*p| points at the byte following the lead byte and |pe| is one past the
 * end of the input.  When a complete sequence is available |*p| is advanced
 * past it, even if the sequence is then rejected.
 *
 * Returns the decoded code point, or (unsigned) -1 when:
 *   - the lead byte is itself a continuation byte or is > 0xF7,
 *   - fewer bytes remain than the lead byte calls for,
 *   - a trailing byte is not a continuation byte (10xxxxxx),
 *   - the encoding is overlong,
 *   - the value is above U+10FFFF or is a surrogate (U+D800..U+DFFF).
 *
 * Shorter sequences are padded with synthetic continuation bytes so that
 * all three lengths share the same validation and assembly code below.
 */
static unsigned uv__utf8_decode1_slow(const char** p,
                                      const char* pe,
                                      unsigned a) {
  unsigned b;
  unsigned c;
  unsigned d;
  unsigned min;

  if (a > 0xF7)
    return -1;

  if (a > 0xEF) {
    /* Four-byte sequence: three trailing bytes required. */
    if (pe - *p < 3)
      return -1;  /* Truncated sequence. */
    min = 0x10000;
    a = a & 7;
    b = (unsigned char) *(*p)++;
    c = (unsigned char) *(*p)++;
    d = (unsigned char) *(*p)++;
  } else if (a > 0xDF) {
    /* Three-byte sequence: two trailing bytes required. */
    if (pe - *p < 2)
      return -1;  /* Truncated sequence. */
    min = 0x800;
    b = 0x80 | (a & 15);
    c = (unsigned char) *(*p)++;
    d = (unsigned char) *(*p)++;
    a = 0;
  } else if (a > 0xBF) {
    /* Two-byte sequence: one trailing byte required. */
    if (pe - *p < 1)
      return -1;  /* Truncated sequence. */
    min = 0x80;
    b = 0x80;
    c = 0x80 | (a & 31);
    d = (unsigned char) *(*p)++;
    a = 0;
  } else {
    return -1;  /* Invalid continuation byte. */
  }

  /* Each trailing byte must individually be of the form 10xxxxxx. */
  if (0x80 != (0xC0 & b) || 0x80 != (0xC0 & c) || 0x80 != (0xC0 & d))
    return -1;  /* Invalid sequence. */

  b &= 63;
  c &= 63;
  d &= 63;
  a = (a << 18) | (b << 12) | (c << 6) | d;

  if (a < min)
    return -1;  /* Overlong sequence. */

  if (a > 0x10FFFF)
    return -1;  /* Four-byte sequence > U+10FFFF. */

  if (a >= 0xD800 && a <= 0xDFFF)
    return -1;  /* Surrogate pair. */

  return a;
}
136

137

138
/* Decode one UTF-8 encoded code point from the range [*p, pe).
 *
 * The caller must guarantee that at least one byte is available
 * (|*p < pe|); this is asserted.  The lead byte is always consumed.
 * ASCII is handled inline; everything else is delegated to
 * uv__utf8_decode1_slow().
 *
 * Returns the code point, or whatever uv__utf8_decode1_slow() returns for
 * an invalid sequence; callers in this file compare that against UINT_MAX.
 */
unsigned uv__utf8_decode1(const char** p, const char* pe) {
  unsigned a;

  assert(*p < pe);

  a = (unsigned char) *(*p)++;

  if (a < 128)
    return a;  /* ASCII, common case. */

  return uv__utf8_decode1_slow(p, pe, a);
}
150

151

152
static int uv__idna_toascii_label(const char* s, const char* se,
153
                                  char** d, char* de) {
154
  static const char alphabet[] = "abcdefghijklmnopqrstuvwxyz0123456789";
155
  const char* ss;
156
  unsigned c;
157
  unsigned h;
158
  unsigned k;
159
  unsigned n;
160
  unsigned m;
161
  unsigned q;
162
  unsigned t;
163
  unsigned x;
164
  unsigned y;
165
  unsigned bias;
166
  unsigned delta;
167
  unsigned todo;
168
  int first;
169

170
  h = 0;
171
  ss = s;
172
  todo = 0;
173

174
  /* Note: after this loop we've visited all UTF-8 characters and know
175
   * they're legal so we no longer need to check for decode errors.
176
   */
177
  while (s < se) {
178
    c = uv__utf8_decode1(&s, se);
179

180
    if (c == UINT_MAX)
181
      return UV_EINVAL;
182

183
    if (c < 128)
184
      h++;
185
    else
186
      todo++;
187
  }
188

189
  /* Only write "xn--" when there are non-ASCII characters. */
190
  if (todo > 0) {
191
    if (*d < de) *(*d)++ = 'x';
192
    if (*d < de) *(*d)++ = 'n';
193
    if (*d < de) *(*d)++ = '-';
194
    if (*d < de) *(*d)++ = '-';
195
  }
196

197
  /* Write ASCII characters. */
198
  x = 0;
199
  s = ss;
200
  while (s < se) {
201
    c = uv__utf8_decode1(&s, se);
202
    assert(c != UINT_MAX);
203

204
    if (c > 127)
205
      continue;
206

207
    if (*d < de)
208
      *(*d)++ = c;
209

210
    if (++x == h)
211
      break;  /* Visited all ASCII characters. */
212
  }
213

214
  if (todo == 0)
215
    return h;
216

217
  /* Only write separator when we've written ASCII characters first. */
218
  if (h > 0)
219
    if (*d < de)
220
      *(*d)++ = '-';
221

222
  n = 128;
223
  bias = 72;
224
  delta = 0;
225
  first = 1;
226

227
  while (todo > 0) {
228
    m = -1;
229
    s = ss;
230

231
    while (s < se) {
232
      c = uv__utf8_decode1(&s, se);
233
      assert(c != UINT_MAX);
234

235
      if (c >= n)
236
        if (c < m)
237
          m = c;
238
    }
239

240
    x = m - n;
241
    y = h + 1;
242

243
    if (x > ~delta / y)
244
      return UV_E2BIG;  /* Overflow. */
245

246
    delta += x * y;
247
    n = m;
248

249
    s = ss;
250
    while (s < se) {
251
      c = uv__utf8_decode1(&s, se);
252
      assert(c != UINT_MAX);
253

254
      if (c < n)
255
        if (++delta == 0)
256
          return UV_E2BIG;  /* Overflow. */
257

258
      if (c != n)
259
        continue;
260

261
      for (k = 36, q = delta; /* empty */; k += 36) {
262
        t = 1;
263

264
        if (k > bias)
265
          t = k - bias;
266

267
        if (t > 26)
268
          t = 26;
269

270
        if (q < t)
271
          break;
272

273
        /* TODO(bnoordhuis) Since 1 <= t <= 26 and therefore
274
         * 10 <= y <= 35, we can optimize the long division
275
         * into a table-based reciprocal multiplication.
276
         */
277
        x = q - t;
278
        y = 36 - t;  /* 10 <= y <= 35 since 1 <= t <= 26. */
279
        q = x / y;
280
        t = t + x % y;  /* 1 <= t <= 35 because of y. */
281

282
        if (*d < de)
283
          *(*d)++ = alphabet[t];
284
      }
285

286
      if (*d < de)
287
        *(*d)++ = alphabet[q];
288

289
      delta /= 2;
290

291
      if (first) {
292
        delta /= 350;
293
        first = 0;
294
      }
295

296
      /* No overflow check is needed because |delta| was just
297
       * divided by 2 and |delta+delta >= delta + delta/h|.
298
       */
299
      h++;
300
      delta += delta / h;
301

302
      for (bias = 0; delta > 35 * 26 / 2; bias += 36)
303
        delta /= 35;
304

305
      bias += 36 * delta / (delta + 38);
306
      delta = 0;
307
      todo--;
308
    }
309

310
    delta++;
311
    n++;
312
  }
313

314
  return 0;
315
}
316

317

318
ssize_t uv__idna_toascii(const char* s, const char* se, char* d, char* de) {
319
  const char* si;
320
  const char* st;
321
  unsigned c;
322
  char* ds;
323
  int rc;
324

325
  if (s == se)
326
    return UV_EINVAL;
327

328
  ds = d;
329

330
  si = s;
331
  while (si < se) {
332
    st = si;
333
    c = uv__utf8_decode1(&si, se);
334

335
    if (c == UINT_MAX)
336
      return UV_EINVAL;
337

338
    if (c != '.')
339
      if (c != 0x3002)  /* 。 */
340
        if (c != 0xFF0E)  /* ． */
341
          if (c != 0xFF61)  /* ｡ */
342
            continue;
343

344
    rc = uv__idna_toascii_label(s, st, &d, de);
345

346
    if (rc < 0)
347
      return rc;
348

349
    if (d < de)
350
      *d++ = '.';
351

352
    s = si;
353
  }
354

355
  if (s < se) {
356
    rc = uv__idna_toascii_label(s, se, &d, de);
357

358
    if (rc < 0)
359
      return rc;
360
  }
361

362
  if (d >= de)
363
    return UV_EINVAL;
364

365
  *d++ = '\0';
366
  return d - ds;  /* Number of bytes written. */
367
}
368

369

370
/* Compute how many UTF-16 code units are needed to hold the NUL-terminated
 * WTF-8 string |source_ptr|, including one unit for the terminating NUL.
 *
 * Code points above U+FFFF count as two units (a surrogate pair).
 *
 * Returns the count, or -1 if the input is not valid WTF-8.
 */
ssize_t uv_wtf8_length_as_utf16(const char* source_ptr) {
  size_t w_target_len = 0;
  int32_t code_point;

  do {
    code_point = uv__wtf8_decode1(&source_ptr);
    if (code_point < 0)
      return -1;
    if (code_point > 0xFFFF)
      w_target_len++;
    w_target_len++;
    /* The decoder leaves |source_ptr| on the last byte of the sequence;
     * the loop condition tests that byte for NUL and then steps past it.
     */
  } while (*source_ptr++);

  return w_target_len;
}
385

386

387
/* Convert the NUL-terminated WTF-8 string |source_ptr| to UTF-16, writing
 * exactly |w_target_len| code units (terminating NUL included) to
 * |w_target|.
 *
 * |w_target_len| must be the value returned by uv_wtf8_length_as_utf16()
 * for the same input; invalid input or a mismatching length trips an
 * assertion rather than being reported to the caller.
 */
void uv_wtf8_to_utf16(const char* source_ptr,
                      uint16_t* w_target,
                      size_t w_target_len) {
  int32_t code_point;

  do {
    code_point = uv__wtf8_decode1(&source_ptr);
    /* uv_wtf8_length_as_utf16 should have been called and checked first. */
    assert(code_point >= 0);
    /* Everything above U+FFFF, starting with U+10000 itself, needs a
     * surrogate pair.  This must use the same threshold as
     * uv_wtf8_length_as_utf16() or the two disagree on the output size.
     */
    if (code_point > 0xFFFF) {
      assert(code_point <= 0x10FFFF);
      *w_target++ = (((code_point - 0x10000) >> 10) + 0xD800);
      *w_target++ = ((code_point - 0x10000) & 0x3FF) + 0xDC00;
      w_target_len -= 2;
    } else {
      *w_target++ = code_point;
      w_target_len -= 1;
    }
  } while (*source_ptr++);

  /* Keep |w_target_len| "used" when assertions are compiled out. */
  (void)w_target_len;
  assert(w_target_len == 0);
}
410

411

412
/* Read one code point from the UTF-16 sequence at |w_source_ptr|.
 *
 * If the first unit is a high surrogate, more than one unit is available
 * (|w_source_len| is anything but 1) and the next unit is a low surrogate,
 * the pair is combined into a supplementary code point (>= 0x10000) and
 * the caller must skip two units.  Otherwise the first unit is returned
 * as-is, which includes unpaired surrogates.
 *
 * The caller must ensure at least one unit is readable.  Callers in this
 * file pass a negative |w_source_len| for NUL-terminated input; in that
 * case the unit after a high surrogate is at worst the terminator.
 */
static int32_t uv__get_surrogate_value(const uint16_t* w_source_ptr,
                                       ssize_t w_source_len) {
  uint16_t u;
  uint16_t next;

  u = w_source_ptr[0];
  if (u >= 0xD800 && u <= 0xDBFF && w_source_len != 1) {
    next = w_source_ptr[1];
    if (next >= 0xDC00 && next <= 0xDFFF)
      return 0x10000 + ((u - 0xD800) << 10) + (next - 0xDC00);
  }
  return u;
}
425

426

427
/* Compute the number of bytes needed to encode a UTF-16 string as WTF-8,
 * not counting a terminating NUL.
 *
 * |w_source_len| is the number of UTF-16 code units to convert.  A negative
 * value means the input is NUL-terminated and is scanned up to, but not
 * including, the first NUL unit.
 */
size_t uv_utf16_length_as_wtf8(const uint16_t* w_source_ptr,
                               ssize_t w_source_len) {
  size_t target_len;
  int32_t code_point;

  target_len = 0;
  while (w_source_len) {
    code_point = uv__get_surrogate_value(w_source_ptr, w_source_len);
    /* Can be invalid UTF-8 but must be valid WTF-8. */
    assert(code_point >= 0);
    if (w_source_len < 0 && code_point == 0)
      break;
    if (code_point < 0x80)
      target_len += 1;
    else if (code_point < 0x800)
      target_len += 2;
    else if (code_point < 0x10000)
      target_len += 3;
    else {
      target_len += 4;
      /* A surrogate pair consumed two input units; skip the extra one. */
      w_source_ptr++;
      if (w_source_len > 0)
        w_source_len--;
    }
    w_source_ptr++;
    /* A negative length is a sentinel and must not be counted down. */
    if (w_source_len > 0)
      w_source_len--;
  }

  return target_len;
}
458

459

460
int uv_utf16_to_wtf8(const uint16_t* w_source_ptr,
461
                     ssize_t w_source_len,
462
                     char** target_ptr,
463
                     size_t* target_len_ptr) {
464
  size_t target_len;
465
  char* target;
466
  char* target_end;
467
  int32_t code_point;
468

469
  /* If *target_ptr is provided, then *target_len_ptr must be its length
470
   * (excluding space for NUL), otherwise we will compute the target_len_ptr
471
   * length and may return a new allocation in *target_ptr if target_ptr is
472
   * provided. */
473
  if (target_ptr == NULL || *target_ptr == NULL) {
474
    target_len = uv_utf16_length_as_wtf8(w_source_ptr, w_source_len);
475
    if (target_len_ptr != NULL)
476
      *target_len_ptr = target_len;
477
  } else {
478
    target_len = *target_len_ptr;
479
  }
480

481
  if (target_ptr == NULL)
482
    return 0;
483

484
  if (*target_ptr == NULL) {
485
    target = uv__malloc(target_len + 1);
486
    if (target == NULL) {
487
      return UV_ENOMEM;
488
    }
489
    *target_ptr = target;
490
  } else {
491
    target = *target_ptr;
492
  }
493

494
  target_end = target + target_len;
495

496
  while (target != target_end && w_source_len) {
497
    code_point = uv__get_surrogate_value(w_source_ptr, w_source_len);
498
    /* Can be invalid UTF-8 but must be valid WTF-8. */
499
    assert(code_point >= 0);
500
    if (w_source_len < 0 && code_point == 0) {
501
      w_source_len = 0;
502
      break;
503
    }
504
    if (code_point < 0x80) {
505
      *target++ = code_point;
506
    } else if (code_point < 0x800) {
507
      *target++ = 0xC0 | (code_point >> 6);
508
      if (target == target_end)
509
        break;
510
      *target++ = 0x80 | (code_point & 0x3F);
511
    } else if (code_point < 0x10000) {
512
      *target++ = 0xE0 | (code_point >> 12);
513
      if (target == target_end)
514
        break;
515
      *target++ = 0x80 | ((code_point >> 6) & 0x3F);
516
      if (target == target_end)
517
        break;
518
      *target++ = 0x80 | (code_point & 0x3F);
519
    } else {
520
      *target++ = 0xF0 | (code_point >> 18);
521
      if (target == target_end)
522
        break;
523
      *target++ = 0x80 | ((code_point >> 12) & 0x3F);
524
      if (target == target_end)
525
        break;
526
      *target++ = 0x80 | ((code_point >> 6) & 0x3F);
527
      if (target == target_end)
528
        break;
529
      *target++ = 0x80 | (code_point & 0x3F);
530
      /* uv__get_surrogate_value consumed 2 input characters */
531
      w_source_ptr++;
532
      if (w_source_len > 0)
533
        w_source_len--;
534
    }
535
    target_len = target - *target_ptr;
536
    w_source_ptr++;
537
    if (w_source_len > 0)
538
      w_source_len--;
539
  }
540

541
  if (target != target_end && target_len_ptr != NULL)
542
    /* Did not fill all of the provided buffer, so update the target_len_ptr
543
     * output with the space used. */
544
    *target_len_ptr = target - *target_ptr;
545

546
  /* Check if input fit into target exactly. */
547
  if (w_source_len < 0 && target == target_end && w_source_ptr[0] == 0)
548
    w_source_len = 0;
549

550
  *target++ = '\0';
551

552
  /* Characters remained after filling the buffer, compute the remaining length now. */
553
  if (w_source_len) {
554
    if (target_len_ptr != NULL)
555
      *target_len_ptr = target_len + uv_utf16_length_as_wtf8(w_source_ptr, w_source_len);
556
    return UV_ENOBUFS;
557
  }
558

559
  return 0;
560
}
561
libuv-svace-build

Использование cookies