llvm-project

LiteralSupport.cpp
2433 строки · 82.5 Кб
Перенос по словам
1
//===--- LiteralSupport.cpp - Code to parse and process literals ----------===//
2
//
3
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
// See https://llvm.org/LICENSE.txt for license information.
5
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
//
7
//===----------------------------------------------------------------------===//
8
//
9
// This file implements the NumericLiteralParser, CharLiteralParser, and
10
// StringLiteralParser interfaces.
11
//
12
//===----------------------------------------------------------------------===//
13

14
#include "clang/Lex/LiteralSupport.h"
15
#include "clang/Basic/CharInfo.h"
16
#include "clang/Basic/LangOptions.h"
17
#include "clang/Basic/SourceLocation.h"
18
#include "clang/Basic/TargetInfo.h"
19
#include "clang/Lex/LexDiagnostic.h"
20
#include "clang/Lex/Lexer.h"
21
#include "clang/Lex/Preprocessor.h"
22
#include "clang/Lex/Token.h"
23
#include "llvm/ADT/APInt.h"
24
#include "llvm/ADT/SmallVector.h"
25
#include "llvm/ADT/StringExtras.h"
26
#include "llvm/ADT/StringSwitch.h"
27
#include "llvm/Support/ConvertUTF.h"
28
#include "llvm/Support/Error.h"
29
#include "llvm/Support/ErrorHandling.h"
30
#include "llvm/Support/Unicode.h"
31
#include <algorithm>
32
#include <cassert>
33
#include <cstddef>
34
#include <cstdint>
35
#include <cstring>
36
#include <string>
37

38
using namespace clang;
39

40
/// Look up the character width that \p Target reports for the literal kind
/// \p kind.
///
/// Only character-constant and string-literal token kinds are accepted; any
/// other kind reaches llvm_unreachable.
static unsigned getCharWidth(tok::TokenKind kind, const TargetInfo &Target) {
  switch (kind) {
  // Unprefixed and UTF-8 literals share the plain character width.
  case tok::char_constant:
  case tok::utf8_char_constant:
  case tok::string_literal:
  case tok::utf8_string_literal:
    return Target.getCharWidth();

  // Wide literals.
  case tok::wide_char_constant:
  case tok::wide_string_literal:
    return Target.getWCharWidth();

  // UTF-16 literals.
  case tok::utf16_char_constant:
  case tok::utf16_string_literal:
    return Target.getChar16Width();

  // UTF-32 literals.
  case tok::utf32_char_constant:
  case tok::utf32_string_literal:
    return Target.getChar32Width();

  default:
    break;
  }
  llvm_unreachable("Unknown token type!");
}
59

60
/// Return the encoding-prefix length associated with the literal kind
/// \p kind: 0 for unprefixed literals, 2 for the UTF-8 kinds, and 1 for the
/// wide, UTF-16 and UTF-32 kinds.
///
/// Any token kind that is not a character constant or string literal reaches
/// llvm_unreachable.
static unsigned getEncodingPrefixLen(tok::TokenKind kind) {
  switch (kind) {
  case tok::char_constant:
  case tok::string_literal:
    return 0;

  case tok::wide_char_constant:
  case tok::wide_string_literal:
  case tok::utf16_char_constant:
  case tok::utf16_string_literal:
  case tok::utf32_char_constant:
  case tok::utf32_string_literal:
    return 1;

  case tok::utf8_char_constant:
  case tok::utf8_string_literal:
    return 2;

  default:
    break;
  }
  llvm_unreachable("Unknown token type!");
}
79

80
/// Build the character range that corresponds to the spelling-buffer span
/// from \p TokRangeBegin to \p TokRangeEnd of the token at \p TokLoc, whose
/// spelling buffer starts at \p TokBegin.
///
/// The range start is found by advancing \p TokLoc by the offset of
/// \p TokRangeBegin within the buffer; the range end is found by advancing
/// that start location by the length of the span.
static CharSourceRange MakeCharSourceRange(const LangOptions &Features,
                                           FullSourceLoc TokLoc,
                                           const char *TokBegin,
                                           const char *TokRangeBegin,
                                           const char *TokRangeEnd) {
  const auto &SM = TokLoc.getManager();
  const auto StartOffset = TokRangeBegin - TokBegin;
  const auto SpanLength = TokRangeEnd - TokRangeBegin;

  const SourceLocation RangeStart =
      Lexer::AdvanceToTokenCharacter(TokLoc, StartOffset, SM, Features);
  const SourceLocation RangeStop =
      Lexer::AdvanceToTokenCharacter(RangeStart, SpanLength, SM, Features);
  return CharSourceRange::getCharRange(RangeStart, RangeStop);
}
93

94
/// Produce a diagnostic highlighting some portion of a literal.
95
///
96
/// Emits the diagnostic \p DiagID, highlighting the range of characters from
97
/// \p TokRangeBegin (inclusive) to \p TokRangeEnd (exclusive), which must be
98
/// a substring of a spelling buffer for the token beginning at \p TokBegin.
99
static DiagnosticBuilder Diag(DiagnosticsEngine *Diags,
100
                              const LangOptions &Features, FullSourceLoc TokLoc,
101
                              const char *TokBegin, const char *TokRangeBegin,
102
                              const char *TokRangeEnd, unsigned DiagID) {
103
  SourceLocation Begin =
104
    Lexer::AdvanceToTokenCharacter(TokLoc, TokRangeBegin - TokBegin,
105
                                   TokLoc.getManager(), Features);
106
  return Diags->Report(Begin, DiagID) <<
107
    MakeCharSourceRange(Features, TokLoc, TokBegin, TokRangeBegin, TokRangeEnd);
108
}
109

110
/// Return true if \p Escape (the character following the backslash) is one of
/// the escapes accepted in an unevaluated string literal:
/// \' \" \? \\ \a \b \f \n \r \t \v.
static bool IsEscapeValidInUnevaluatedStringLiteral(char Escape) {
  static constexpr char AcceptedEscapes[] = {'\'', '"', '?', '\\', 'a', 'b',
                                             'f',  'n', 'r', 't',  'v'};
  for (const char Candidate : AcceptedEscapes)
    if (Candidate == Escape)
      return true;
  return false;
}
127

128
/// ProcessCharEscape - Parse a standard C escape sequence, which can occur in
129
/// either a character or a string literal.
130
static unsigned ProcessCharEscape(const char *ThisTokBegin,
131
                                  const char *&ThisTokBuf,
132
                                  const char *ThisTokEnd, bool &HadError,
133
                                  FullSourceLoc Loc, unsigned CharWidth,
134
                                  DiagnosticsEngine *Diags,
135
                                  const LangOptions &Features,
136
                                  StringLiteralEvalMethod EvalMethod) {
137
  const char *EscapeBegin = ThisTokBuf;
138
  bool Delimited = false;
139
  bool EndDelimiterFound = false;
140

141
  // Skip the '\' char.
142
  ++ThisTokBuf;
143

144
  // We know that this character can't be off the end of the buffer, because
145
  // that would have been \", which would not have been the end of string.
146
  unsigned ResultChar = *ThisTokBuf++;
147
  char Escape = ResultChar;
148
  switch (ResultChar) {
149
  // These map to themselves.
150
  case '\\': case '\'': case '"': case '?': break;
151

152
    // These have fixed mappings.
153
  case 'a':
154
    // TODO: K&R: the meaning of '\\a' is different in traditional C
155
    ResultChar = 7;
156
    break;
157
  case 'b':
158
    ResultChar = 8;
159
    break;
160
  case 'e':
161
    if (Diags)
162
      Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
163
           diag::ext_nonstandard_escape) << "e";
164
    ResultChar = 27;
165
    break;
166
  case 'E':
167
    if (Diags)
168
      Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
169
           diag::ext_nonstandard_escape) << "E";
170
    ResultChar = 27;
171
    break;
172
  case 'f':
173
    ResultChar = 12;
174
    break;
175
  case 'n':
176
    ResultChar = 10;
177
    break;
178
  case 'r':
179
    ResultChar = 13;
180
    break;
181
  case 't':
182
    ResultChar = 9;
183
    break;
184
  case 'v':
185
    ResultChar = 11;
186
    break;
187
  case 'x': { // Hex escape.
188
    ResultChar = 0;
189
    if (ThisTokBuf != ThisTokEnd && *ThisTokBuf == '{') {
190
      Delimited = true;
191
      ThisTokBuf++;
192
      if (*ThisTokBuf == '}') {
193
        Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
194
             diag::err_delimited_escape_empty);
195
        return ResultChar;
196
      }
197
    } else if (ThisTokBuf == ThisTokEnd || !isHexDigit(*ThisTokBuf)) {
198
      if (Diags)
199
        Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
200
             diag::err_hex_escape_no_digits) << "x";
201
      return ResultChar;
202
    }
203

204
    // Hex escapes are a maximal series of hex digits.
205
    bool Overflow = false;
206
    for (; ThisTokBuf != ThisTokEnd; ++ThisTokBuf) {
207
      if (Delimited && *ThisTokBuf == '}') {
208
        ThisTokBuf++;
209
        EndDelimiterFound = true;
210
        break;
211
      }
212
      int CharVal = llvm::hexDigitValue(*ThisTokBuf);
213
      if (CharVal == -1) {
214
        // Non delimited hex escape sequences stop at the first non-hex digit.
215
        if (!Delimited)
216
          break;
217
        HadError = true;
218
        if (Diags)
219
          Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
220
               diag::err_delimited_escape_invalid)
221
              << StringRef(ThisTokBuf, 1);
222
        continue;
223
      }
224
      // About to shift out a digit?
225
      if (ResultChar & 0xF0000000)
226
        Overflow = true;
227
      ResultChar <<= 4;
228
      ResultChar |= CharVal;
229
    }
230
    // See if any bits will be truncated when evaluated as a character.
231
    if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) {
232
      Overflow = true;
233
      ResultChar &= ~0U >> (32-CharWidth);
234
    }
235

236
    // Check for overflow.
237
    if (!HadError && Overflow) { // Too many digits to fit in
238
      HadError = true;
239
      if (Diags)
240
        Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
241
             diag::err_escape_too_large)
242
            << 0;
243
    }
244
    break;
245
  }
246
  case '0': case '1': case '2': case '3':
247
  case '4': case '5': case '6': case '7': {
248
    // Octal escapes.
249
    --ThisTokBuf;
250
    ResultChar = 0;
251

252
    // Octal escapes are a series of octal digits with maximum length 3.
253
    // "\0123" is a two digit sequence equal to "\012" "3".
254
    unsigned NumDigits = 0;
255
    do {
256
      ResultChar <<= 3;
257
      ResultChar |= *ThisTokBuf++ - '0';
258
      ++NumDigits;
259
    } while (ThisTokBuf != ThisTokEnd && NumDigits < 3 &&
260
             ThisTokBuf[0] >= '0' && ThisTokBuf[0] <= '7');
261

262
    // Check for overflow.  Reject '\777', but not L'\777'.
263
    if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) {
264
      if (Diags)
265
        Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
266
             diag::err_escape_too_large) << 1;
267
      ResultChar &= ~0U >> (32-CharWidth);
268
    }
269
    break;
270
  }
271
  case 'o': {
272
    bool Overflow = false;
273
    if (ThisTokBuf == ThisTokEnd || *ThisTokBuf != '{') {
274
      HadError = true;
275
      if (Diags)
276
        Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
277
             diag::err_delimited_escape_missing_brace)
278
            << "o";
279

280
      break;
281
    }
282
    ResultChar = 0;
283
    Delimited = true;
284
    ++ThisTokBuf;
285
    if (*ThisTokBuf == '}') {
286
      Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
287
           diag::err_delimited_escape_empty);
288
      return ResultChar;
289
    }
290

291
    while (ThisTokBuf != ThisTokEnd) {
292
      if (*ThisTokBuf == '}') {
293
        EndDelimiterFound = true;
294
        ThisTokBuf++;
295
        break;
296
      }
297
      if (*ThisTokBuf < '0' || *ThisTokBuf > '7') {
298
        HadError = true;
299
        if (Diags)
300
          Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
301
               diag::err_delimited_escape_invalid)
302
              << StringRef(ThisTokBuf, 1);
303
        ThisTokBuf++;
304
        continue;
305
      }
306
      // Check if one of the top three bits is set before shifting them out.
307
      if (ResultChar & 0xE0000000)
308
        Overflow = true;
309

310
      ResultChar <<= 3;
311
      ResultChar |= *ThisTokBuf++ - '0';
312
    }
313
    // Check for overflow.  Reject '\777', but not L'\777'.
314
    if (!HadError &&
315
        (Overflow || (CharWidth != 32 && (ResultChar >> CharWidth) != 0))) {
316
      HadError = true;
317
      if (Diags)
318
        Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
319
             diag::err_escape_too_large)
320
            << 1;
321
      ResultChar &= ~0U >> (32 - CharWidth);
322
    }
323
    break;
324
  }
325
    // Otherwise, these are not valid escapes.
326
  case '(': case '{': case '[': case '%':
327
    // GCC accepts these as extensions.  We warn about them as such though.
328
    if (Diags)
329
      Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
330
           diag::ext_nonstandard_escape)
331
        << std::string(1, ResultChar);
332
    break;
333
  default:
334
    if (!Diags)
335
      break;
336

337
    if (isPrintable(ResultChar))
338
      Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
339
           diag::ext_unknown_escape)
340
        << std::string(1, ResultChar);
341
    else
342
      Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
343
           diag::ext_unknown_escape)
344
        << "x" + llvm::utohexstr(ResultChar);
345
    break;
346
  }
347

348
  if (Delimited && Diags) {
349
    if (!EndDelimiterFound)
350
      Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
351
           diag::err_expected)
352
          << tok::r_brace;
353
    else if (!HadError) {
354
      Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
355
           Features.CPlusPlus23 ? diag::warn_cxx23_delimited_escape_sequence
356
                                : diag::ext_delimited_escape_sequence)
357
          << /*delimited*/ 0 << (Features.CPlusPlus ? 1 : 0);
358
    }
359
  }
360

361
  if (EvalMethod == StringLiteralEvalMethod::Unevaluated &&
362
      !IsEscapeValidInUnevaluatedStringLiteral(Escape)) {
363
    Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
364
         diag::err_unevaluated_string_invalid_escape_sequence)
365
        << StringRef(EscapeBegin, ThisTokBuf - EscapeBegin);
366
    HadError = true;
367
  }
368

369
  return ResultChar;
370
}
371

372
static void appendCodePoint(unsigned Codepoint,
373
                            llvm::SmallVectorImpl<char> &Str) {
374
  char ResultBuf[4];
375
  char *ResultPtr = ResultBuf;
376
  if (llvm::ConvertCodePointToUTF8(Codepoint, ResultPtr))
377
    Str.append(ResultBuf, ResultPtr);
378
}
379

380
/// Copy \p Input into \p Buf, replacing each universal character name with
/// its UTF-8 encoding (via appendCodePoint).
///
/// Every backslash in \p Input is expected to start one of: \uXXXX,
/// \UXXXXXXXX, \u{X...} or \N{name}. The input is assumed to be well formed
/// already; malformed input is only checked through assertions.
void clang::expandUCNs(SmallVectorImpl<char> &Buf, StringRef Input) {
  StringRef::iterator Cur = Input.begin();
  const StringRef::iterator End = Input.end();

  while (Cur != End) {
    // Ordinary characters are copied through unchanged.
    if (*Cur != '\\') {
      Buf.push_back(*Cur);
      ++Cur;
      continue;
    }

    // Step over the backslash and the escape introducer.
    ++Cur;
    const char Kind = *Cur;
    ++Cur;
    assert(Kind == 'u' || Kind == 'U' || Kind == 'N');

    // Named escape: \N{name}.
    if (Kind == 'N') {
      assert(*Cur == '{');
      ++Cur;
      auto Delim = std::find(Cur, End, '}');
      assert(Delim != End);
      StringRef Name(Cur, Delim - Cur);
      std::optional<llvm::sys::unicode::LooseMatchingResult> Res =
          llvm::sys::unicode::nameToCodepointLooseMatching(Name);
      assert(Res && "could not find a codepoint that was previously found");
      uint32_t NamedCodePoint = Res->CodePoint;
      assert(NamedCodePoint != 0xFFFFFFFF);
      appendCodePoint(NamedCodePoint, Buf);
      // Resume right after the closing brace.
      Cur = Delim + 1;
      continue;
    }

    uint32_t CodePoint = 0;

    // Delimited escape: \u{X...} with any number of hex digits.
    if (Kind == 'u' && *Cur == '{') {
      for (++Cur; *Cur != '}'; ++Cur) {
        unsigned Value = llvm::hexDigitValue(*Cur);
        assert(Value != -1U);
        CodePoint <<= 4;
        CodePoint += Value;
      }
      appendCodePoint(CodePoint, Buf);
      // Resume right after the closing brace.
      ++Cur;
      continue;
    }

    // Fixed-width escape: four digits after \u, eight after \U.
    const unsigned NumHexDigits = (Kind == 'u') ? 4 : 8;
    assert(Cur + NumHexDigits <= End);

    for (unsigned Remaining = NumHexDigits; Remaining != 0;
         ++Cur, --Remaining) {
      unsigned Value = llvm::hexDigitValue(*Cur);
      assert(Value != -1U);

      CodePoint <<= 4;
      CodePoint += Value;
    }

    appendCodePoint(CodePoint, Buf);
  }
}
441

442
bool clang::isFunctionLocalStringLiteralMacro(tok::TokenKind K,
443
                                              const LangOptions &LO) {
444
  return LO.MicrosoftExt &&
445
         (K == tok::kw___FUNCTION__ || K == tok::kw_L__FUNCTION__ ||
446
          K == tok::kw___FUNCSIG__ || K == tok::kw_L__FUNCSIG__ ||
447
          K == tok::kw___FUNCDNAME__);
448
}
449

450
bool clang::tokenIsLikeStringLiteral(const Token &Tok, const LangOptions &LO) {
451
  return tok::isStringLiteral(Tok.getKind()) ||
452
         isFunctionLocalStringLiteralMacro(Tok.getKind(), LO);
453
}
454

455
static bool ProcessNumericUCNEscape(const char *ThisTokBegin,
456
                                    const char *&ThisTokBuf,
457
                                    const char *ThisTokEnd, uint32_t &UcnVal,
458
                                    unsigned short &UcnLen, bool &Delimited,
459
                                    FullSourceLoc Loc, DiagnosticsEngine *Diags,
460
                                    const LangOptions &Features,
461
                                    bool in_char_string_literal = false) {
462
  const char *UcnBegin = ThisTokBuf;
463
  bool HasError = false;
464
  bool EndDelimiterFound = false;
465

466
  // Skip the '\u' char's.
467
  ThisTokBuf += 2;
468
  Delimited = false;
469
  if (UcnBegin[1] == 'u' && in_char_string_literal &&
470
      ThisTokBuf != ThisTokEnd && *ThisTokBuf == '{') {
471
    Delimited = true;
472
    ThisTokBuf++;
473
  } else if (ThisTokBuf == ThisTokEnd || !isHexDigit(*ThisTokBuf)) {
474
    if (Diags)
475
      Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
476
           diag::err_hex_escape_no_digits)
477
          << StringRef(&ThisTokBuf[-1], 1);
478
    return false;
479
  }
480
  UcnLen = (ThisTokBuf[-1] == 'u' ? 4 : 8);
481

482
  bool Overflow = false;
483
  unsigned short Count = 0;
484
  for (; ThisTokBuf != ThisTokEnd && (Delimited || Count != UcnLen);
485
       ++ThisTokBuf) {
486
    if (Delimited && *ThisTokBuf == '}') {
487
      ++ThisTokBuf;
488
      EndDelimiterFound = true;
489
      break;
490
    }
491
    int CharVal = llvm::hexDigitValue(*ThisTokBuf);
492
    if (CharVal == -1) {
493
      HasError = true;
494
      if (!Delimited)
495
        break;
496
      if (Diags) {
497
        Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
498
             diag::err_delimited_escape_invalid)
499
            << StringRef(ThisTokBuf, 1);
500
      }
501
      Count++;
502
      continue;
503
    }
504
    if (UcnVal & 0xF0000000) {
505
      Overflow = true;
506
      continue;
507
    }
508
    UcnVal <<= 4;
509
    UcnVal |= CharVal;
510
    Count++;
511
  }
512

513
  if (Overflow) {
514
    if (Diags)
515
      Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
516
           diag::err_escape_too_large)
517
          << 0;
518
    return false;
519
  }
520

521
  if (Delimited && !EndDelimiterFound) {
522
    if (Diags) {
523
      Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
524
           diag::err_expected)
525
          << tok::r_brace;
526
    }
527
    return false;
528
  }
529

530
  // If we didn't consume the proper number of digits, there is a problem.
531
  if (Count == 0 || (!Delimited && Count != UcnLen)) {
532
    if (Diags)
533
      Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
534
           Delimited ? diag::err_delimited_escape_empty
535
                     : diag::err_ucn_escape_incomplete);
536
    return false;
537
  }
538
  return !HasError;
539
}
540

541
static void DiagnoseInvalidUnicodeCharacterName(
542
    DiagnosticsEngine *Diags, const LangOptions &Features, FullSourceLoc Loc,
543
    const char *TokBegin, const char *TokRangeBegin, const char *TokRangeEnd,
544
    llvm::StringRef Name) {
545

546
  Diag(Diags, Features, Loc, TokBegin, TokRangeBegin, TokRangeEnd,
547
       diag::err_invalid_ucn_name)
548
      << Name;
549

550
  namespace u = llvm::sys::unicode;
551

552
  std::optional<u::LooseMatchingResult> Res =
553
      u::nameToCodepointLooseMatching(Name);
554
  if (Res) {
555
    Diag(Diags, Features, Loc, TokBegin, TokRangeBegin, TokRangeEnd,
556
         diag::note_invalid_ucn_name_loose_matching)
557
        << FixItHint::CreateReplacement(
558
               MakeCharSourceRange(Features, Loc, TokBegin, TokRangeBegin,
559
                                   TokRangeEnd),
560
               Res->Name);
561
    return;
562
  }
563

564
  unsigned Distance = 0;
565
  SmallVector<u::MatchForCodepointName> Matches =
566
      u::nearestMatchesForCodepointName(Name, 5);
567
  assert(!Matches.empty() && "No unicode characters found");
568

569
  for (const auto &Match : Matches) {
570
    if (Distance == 0)
571
      Distance = Match.Distance;
572
    if (std::max(Distance, Match.Distance) -
573
            std::min(Distance, Match.Distance) >
574
        3)
575
      break;
576
    Distance = Match.Distance;
577

578
    std::string Str;
579
    llvm::UTF32 V = Match.Value;
580
    bool Converted =
581
        llvm::convertUTF32ToUTF8String(llvm::ArrayRef<llvm::UTF32>(&V, 1), Str);
582
    (void)Converted;
583
    assert(Converted && "Found a match wich is not a unicode character");
584

585
    Diag(Diags, Features, Loc, TokBegin, TokRangeBegin, TokRangeEnd,
586
         diag::note_invalid_ucn_name_candidate)
587
        << Match.Name << llvm::utohexstr(Match.Value)
588
        << Str // FIXME: Fix the rendering of non printable characters
589
        << FixItHint::CreateReplacement(
590
               MakeCharSourceRange(Features, Loc, TokBegin, TokRangeBegin,
591
                                   TokRangeEnd),
592
               Match.Name);
593
  }
594
}
595

596
static bool ProcessNamedUCNEscape(const char *ThisTokBegin,
597
                                  const char *&ThisTokBuf,
598
                                  const char *ThisTokEnd, uint32_t &UcnVal,
599
                                  unsigned short &UcnLen, FullSourceLoc Loc,
600
                                  DiagnosticsEngine *Diags,
601
                                  const LangOptions &Features) {
602
  const char *UcnBegin = ThisTokBuf;
603
  assert(UcnBegin[0] == '\\' && UcnBegin[1] == 'N');
604
  ThisTokBuf += 2;
605
  if (ThisTokBuf == ThisTokEnd || *ThisTokBuf != '{') {
606
    if (Diags) {
607
      Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
608
           diag::err_delimited_escape_missing_brace)
609
          << StringRef(&ThisTokBuf[-1], 1);
610
    }
611
    return false;
612
  }
613
  ThisTokBuf++;
614
  const char *ClosingBrace = std::find_if(ThisTokBuf, ThisTokEnd, [](char C) {
615
    return C == '}' || isVerticalWhitespace(C);
616
  });
617
  bool Incomplete = ClosingBrace == ThisTokEnd;
618
  bool Empty = ClosingBrace == ThisTokBuf;
619
  if (Incomplete || Empty) {
620
    if (Diags) {
621
      Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
622
           Incomplete ? diag::err_ucn_escape_incomplete
623
                      : diag::err_delimited_escape_empty)
624
          << StringRef(&UcnBegin[1], 1);
625
    }
626
    ThisTokBuf = ClosingBrace == ThisTokEnd ? ClosingBrace : ClosingBrace + 1;
627
    return false;
628
  }
629
  StringRef Name(ThisTokBuf, ClosingBrace - ThisTokBuf);
630
  ThisTokBuf = ClosingBrace + 1;
631
  std::optional<char32_t> Res = llvm::sys::unicode::nameToCodepointStrict(Name);
632
  if (!Res) {
633
    if (Diags)
634
      DiagnoseInvalidUnicodeCharacterName(Diags, Features, Loc, ThisTokBegin,
635
                                          &UcnBegin[3], ClosingBrace, Name);
636
    return false;
637
  }
638
  UcnVal = *Res;
639
  UcnLen = UcnVal > 0xFFFF ? 8 : 4;
640
  return true;
641
}
642

643
/// ProcessUCNEscape - Read the Universal Character Name, check constraints and
644
/// return the UTF32.
645
static bool ProcessUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
646
                             const char *ThisTokEnd, uint32_t &UcnVal,
647
                             unsigned short &UcnLen, FullSourceLoc Loc,
648
                             DiagnosticsEngine *Diags,
649
                             const LangOptions &Features,
650
                             bool in_char_string_literal = false) {
651

652
  bool HasError;
653
  const char *UcnBegin = ThisTokBuf;
654
  bool IsDelimitedEscapeSequence = false;
655
  bool IsNamedEscapeSequence = false;
656
  if (ThisTokBuf[1] == 'N') {
657
    IsNamedEscapeSequence = true;
658
    HasError = !ProcessNamedUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd,
659
                                      UcnVal, UcnLen, Loc, Diags, Features);
660
  } else {
661
    HasError =
662
        !ProcessNumericUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal,
663
                                 UcnLen, IsDelimitedEscapeSequence, Loc, Diags,
664
                                 Features, in_char_string_literal);
665
  }
666
  if (HasError)
667
    return false;
668

669
  // Check UCN constraints (C99 6.4.3p2) [C++11 lex.charset p2]
670
  if ((0xD800 <= UcnVal && UcnVal <= 0xDFFF) || // surrogate codepoints
671
      UcnVal > 0x10FFFF) {                      // maximum legal UTF32 value
672
    if (Diags)
673
      Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
674
           diag::err_ucn_escape_invalid);
675
    return false;
676
  }
677

678
  // C23 and C++11 allow UCNs that refer to control characters
679
  // and basic source characters inside character and string literals
680
  if (UcnVal < 0xa0 &&
681
      // $, @, ` are allowed in all language modes
682
      (UcnVal != 0x24 && UcnVal != 0x40 && UcnVal != 0x60)) {
683
    bool IsError =
684
        (!(Features.CPlusPlus11 || Features.C23) || !in_char_string_literal);
685
    if (Diags) {
686
      char BasicSCSChar = UcnVal;
687
      if (UcnVal >= 0x20 && UcnVal < 0x7f)
688
        Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
689
             IsError ? diag::err_ucn_escape_basic_scs
690
             : Features.CPlusPlus
691
                 ? diag::warn_cxx98_compat_literal_ucn_escape_basic_scs
692
                 : diag::warn_c23_compat_literal_ucn_escape_basic_scs)
693
            << StringRef(&BasicSCSChar, 1);
694
      else
695
        Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
696
             IsError ? diag::err_ucn_control_character
697
             : Features.CPlusPlus
698
                 ? diag::warn_cxx98_compat_literal_ucn_control_character
699
                 : diag::warn_c23_compat_literal_ucn_control_character);
700
    }
701
    if (IsError)
702
      return false;
703
  }
704

705
  if (!Features.CPlusPlus && !Features.C99 && Diags)
706
    Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
707
         diag::warn_ucn_not_valid_in_c89_literal);
708

709
  if ((IsDelimitedEscapeSequence || IsNamedEscapeSequence) && Diags)
710
    Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
711
         Features.CPlusPlus23 ? diag::warn_cxx23_delimited_escape_sequence
712
                              : diag::ext_delimited_escape_sequence)
713
        << (IsNamedEscapeSequence ? 1 : 0) << (Features.CPlusPlus ? 1 : 0);
714

715
  return true;
716
}
717

718
/// MeasureUCNEscape - Determine the number of bytes within the resulting string
719
/// which this UCN will occupy.
720
static int MeasureUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
721
                            const char *ThisTokEnd, unsigned CharByteWidth,
722
                            const LangOptions &Features, bool &HadError) {
723
  // UTF-32: 4 bytes per escape.
724
  if (CharByteWidth == 4)
725
    return 4;
726

727
  uint32_t UcnVal = 0;
728
  unsigned short UcnLen = 0;
729
  FullSourceLoc Loc;
730

731
  if (!ProcessUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal,
732
                        UcnLen, Loc, nullptr, Features, true)) {
733
    HadError = true;
734
    return 0;
735
  }
736

737
  // UTF-16: 2 bytes for BMP, 4 bytes otherwise.
738
  if (CharByteWidth == 2)
739
    return UcnVal <= 0xFFFF ? 2 : 4;
740

741
  // UTF-8.
742
  if (UcnVal < 0x80)
743
    return 1;
744
  if (UcnVal < 0x800)
745
    return 2;
746
  if (UcnVal < 0x10000)
747
    return 3;
748
  return 4;
749
}
750

751
/// EncodeUCNEscape - Read the Universal Character Name, check constraints and
752
/// convert the UTF32 to UTF8 or UTF16. This is a subroutine of
753
/// StringLiteralParser. When we decide to implement UCN's for identifiers,
754
/// we will likely rework our support for UCN's.
755
static void EncodeUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
756
                            const char *ThisTokEnd,
757
                            char *&ResultBuf, bool &HadError,
758
                            FullSourceLoc Loc, unsigned CharByteWidth,
759
                            DiagnosticsEngine *Diags,
760
                            const LangOptions &Features) {
761
  typedef uint32_t UTF32;
762
  UTF32 UcnVal = 0;
763
  unsigned short UcnLen = 0;
764
  if (!ProcessUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal, UcnLen,
765
                        Loc, Diags, Features, true)) {
766
    HadError = true;
767
    return;
768
  }
769

770
  assert((CharByteWidth == 1 || CharByteWidth == 2 || CharByteWidth == 4) &&
771
         "only character widths of 1, 2, or 4 bytes supported");
772

773
  (void)UcnLen;
774
  assert((UcnLen== 4 || UcnLen== 8) && "only ucn length of 4 or 8 supported");
775

776
  if (CharByteWidth == 4) {
777
    // FIXME: Make the type of the result buffer correct instead of
778
    // using reinterpret_cast.
779
    llvm::UTF32 *ResultPtr = reinterpret_cast<llvm::UTF32*>(ResultBuf);
780
    *ResultPtr = UcnVal;
781
    ResultBuf += 4;
782
    return;
783
  }
784

785
  if (CharByteWidth == 2) {
786
    // FIXME: Make the type of the result buffer correct instead of
787
    // using reinterpret_cast.
788
    llvm::UTF16 *ResultPtr = reinterpret_cast<llvm::UTF16*>(ResultBuf);
789

790
    if (UcnVal <= (UTF32)0xFFFF) {
791
      *ResultPtr = UcnVal;
792
      ResultBuf += 2;
793
      return;
794
    }
795

796
    // Convert to UTF16.
797
    UcnVal -= 0x10000;
798
    *ResultPtr     = 0xD800 + (UcnVal >> 10);
799
    *(ResultPtr+1) = 0xDC00 + (UcnVal & 0x3FF);
800
    ResultBuf += 4;
801
    return;
802
  }
803

804
  assert(CharByteWidth == 1 && "UTF-8 encoding is only for 1 byte characters");
805

806
  // Now that we've parsed/checked the UCN, we convert from UTF32->UTF8.
807
  // The conversion below was inspired by:
808
  //   http://www.unicode.org/Public/PROGRAMS/CVTUTF/ConvertUTF.c
809
  // First, we determine how many bytes the result will require.
810
  typedef uint8_t UTF8;
811

812
  unsigned short bytesToWrite = 0;
813
  if (UcnVal < (UTF32)0x80)
814
    bytesToWrite = 1;
815
  else if (UcnVal < (UTF32)0x800)
816
    bytesToWrite = 2;
817
  else if (UcnVal < (UTF32)0x10000)
818
    bytesToWrite = 3;
819
  else
820
    bytesToWrite = 4;
821

822
  const unsigned byteMask = 0xBF;
823
  const unsigned byteMark = 0x80;
824

825
  // Once the bits are split out into bytes of UTF8, this is a mask OR-ed
826
  // into the first byte, depending on how many bytes follow.
827
  static const UTF8 firstByteMark[5] = {
828
    0x00, 0x00, 0xC0, 0xE0, 0xF0
829
  };
830
  // Finally, we write the bytes into ResultBuf.
831
  ResultBuf += bytesToWrite;
832
  switch (bytesToWrite) { // note: everything falls through.
833
  case 4:
834
    *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
835
    [[fallthrough]];
836
  case 3:
837
    *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
838
    [[fallthrough]];
839
  case 2:
840
    *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
841
    [[fallthrough]];
842
  case 1:
843
    *--ResultBuf = (UTF8) (UcnVal | firstByteMark[bytesToWrite]);
844
  }
845
  // Update the buffer.
846
  ResultBuf += bytesToWrite;
847
}
848

849
///       integer-constant: [C99 6.4.4.1]
850
///         decimal-constant integer-suffix
851
///         octal-constant integer-suffix
852
///         hexadecimal-constant integer-suffix
853
///         binary-literal integer-suffix [GNU, C++1y]
854
///       user-defined-integer-literal: [C++11 lex.ext]
855
///         decimal-literal ud-suffix
856
///         octal-literal ud-suffix
857
///         hexadecimal-literal ud-suffix
858
///         binary-literal ud-suffix [GNU, C++1y]
859
///       decimal-constant:
860
///         nonzero-digit
861
///         decimal-constant digit
862
///       octal-constant:
863
///         0
864
///         octal-constant octal-digit
865
///       hexadecimal-constant:
866
///         hexadecimal-prefix hexadecimal-digit
867
///         hexadecimal-constant hexadecimal-digit
868
///       hexadecimal-prefix: one of
869
///         0x 0X
870
///       binary-literal:
871
///         0b binary-digit
872
///         0B binary-digit
873
///         binary-literal binary-digit
874
///       integer-suffix:
875
///         unsigned-suffix [long-suffix]
876
///         unsigned-suffix [long-long-suffix]
877
///         long-suffix [unsigned-suffix]
878
///         long-long-suffix [unsigned-sufix]
879
///       nonzero-digit:
880
///         1 2 3 4 5 6 7 8 9
881
///       octal-digit:
882
///         0 1 2 3 4 5 6 7
883
///       hexadecimal-digit:
884
///         0 1 2 3 4 5 6 7 8 9
885
///         a b c d e f
886
///         A B C D E F
887
///       binary-digit:
888
///         0
889
///         1
890
///       unsigned-suffix: one of
891
///         u U
892
///       long-suffix: one of
893
///         l L
894
///       long-long-suffix: one of
895
///         ll LL
896
///
897
///       floating-constant: [C99 6.4.4.2]
898
///         TODO: add rules...
899
///
900
NumericLiteralParser::NumericLiteralParser(StringRef TokSpelling,
901
                                           SourceLocation TokLoc,
902
                                           const SourceManager &SM,
903
                                           const LangOptions &LangOpts,
904
                                           const TargetInfo &Target,
905
                                           DiagnosticsEngine &Diags)
906
    : SM(SM), LangOpts(LangOpts), Diags(Diags),
907
      ThisTokBegin(TokSpelling.begin()), ThisTokEnd(TokSpelling.end()) {
908

909
  s = DigitsBegin = ThisTokBegin;
910
  saw_exponent = false;
911
  saw_period = false;
912
  saw_ud_suffix = false;
913
  saw_fixed_point_suffix = false;
914
  isLong = false;
915
  isUnsigned = false;
916
  isLongLong = false;
917
  isSizeT = false;
918
  isHalf = false;
919
  isFloat = false;
920
  isImaginary = false;
921
  isFloat16 = false;
922
  isFloat128 = false;
923
  MicrosoftInteger = 0;
924
  isFract = false;
925
  isAccum = false;
926
  hadError = false;
927
  isBitInt = false;
928

929
  // This routine assumes that the range begin/end matches the regex for integer
930
  // and FP constants (specifically, the 'pp-number' regex), and assumes that
931
  // the byte at "*end" is both valid and not part of the regex.  Because of
932
  // this, it doesn't have to check for 'overscan' in various places.
933
  // Note: For HLSL, the end token is allowed to be '.' which would be in the
934
  // 'pp-number' regex. This is required to support vector swizzles on numeric
935
  // constants (i.e. 1.xx or 1.5f.rrr).
936
  if (isPreprocessingNumberBody(*ThisTokEnd) &&
937
      !(LangOpts.HLSL && *ThisTokEnd == '.')) {
938
    Diags.Report(TokLoc, diag::err_lexing_numeric);
939
    hadError = true;
940
    return;
941
  }
942

943
  if (*s == '0') { // parse radix
944
    ParseNumberStartingWithZero(TokLoc);
945
    if (hadError)
946
      return;
947
  } else { // the first digit is non-zero
948
    radix = 10;
949
    s = SkipDigits(s);
950
    if (s == ThisTokEnd) {
951
      // Done.
952
    } else {
953
      ParseDecimalOrOctalCommon(TokLoc);
954
      if (hadError)
955
        return;
956
    }
957
  }
958

959
  SuffixBegin = s;
960
  checkSeparator(TokLoc, s, CSK_AfterDigits);
961

962
  // Initial scan to lookahead for fixed point suffix.
963
  if (LangOpts.FixedPoint) {
964
    for (const char *c = s; c != ThisTokEnd; ++c) {
965
      if (*c == 'r' || *c == 'k' || *c == 'R' || *c == 'K') {
966
        saw_fixed_point_suffix = true;
967
        break;
968
      }
969
    }
970
  }
971

972
  // Parse the suffix.  At this point we can classify whether we have an FP or
973
  // integer constant.
974
  bool isFixedPointConstant = isFixedPointLiteral();
975
  bool isFPConstant = isFloatingLiteral();
976
  bool HasSize = false;
977
  bool DoubleUnderscore = false;
978

979
  // Loop over all of the characters of the suffix.  If we see something bad,
980
  // we break out of the loop.
981
  for (; s != ThisTokEnd; ++s) {
982
    switch (*s) {
983
    case 'R':
984
    case 'r':
985
      if (!LangOpts.FixedPoint)
986
        break;
987
      if (isFract || isAccum) break;
988
      if (!(saw_period || saw_exponent)) break;
989
      isFract = true;
990
      continue;
991
    case 'K':
992
    case 'k':
993
      if (!LangOpts.FixedPoint)
994
        break;
995
      if (isFract || isAccum) break;
996
      if (!(saw_period || saw_exponent)) break;
997
      isAccum = true;
998
      continue;
999
    case 'h':      // FP Suffix for "half".
1000
    case 'H':
1001
      // OpenCL Extension v1.2 s9.5 - h or H suffix for half type.
1002
      if (!(LangOpts.Half || LangOpts.FixedPoint))
1003
        break;
1004
      if (isIntegerLiteral()) break;  // Error for integer constant.
1005
      if (HasSize)
1006
        break;
1007
      HasSize = true;
1008
      isHalf = true;
1009
      continue;  // Success.
1010
    case 'f':      // FP Suffix for "float"
1011
    case 'F':
1012
      if (!isFPConstant) break;  // Error for integer constant.
1013
      if (HasSize)
1014
        break;
1015
      HasSize = true;
1016

1017
      // CUDA host and device may have different _Float16 support, therefore
1018
      // allows f16 literals to avoid false alarm.
1019
      // When we compile for OpenMP target offloading on NVPTX, f16 suffix
1020
      // should also be supported.
1021
      // ToDo: more precise check for CUDA.
1022
      // TODO: AMDGPU might also support it in the future.
1023
      if ((Target.hasFloat16Type() || LangOpts.CUDA ||
1024
           (LangOpts.OpenMPIsTargetDevice && Target.getTriple().isNVPTX())) &&
1025
          s + 2 < ThisTokEnd && s[1] == '1' && s[2] == '6') {
1026
        s += 2; // success, eat up 2 characters.
1027
        isFloat16 = true;
1028
        continue;
1029
      }
1030

1031
      isFloat = true;
1032
      continue;  // Success.
1033
    case 'q':    // FP Suffix for "__float128"
1034
    case 'Q':
1035
      if (!isFPConstant) break;  // Error for integer constant.
1036
      if (HasSize)
1037
        break;
1038
      HasSize = true;
1039
      isFloat128 = true;
1040
      continue;  // Success.
1041
    case 'u':
1042
    case 'U':
1043
      if (isFPConstant) break;  // Error for floating constant.
1044
      if (isUnsigned) break;    // Cannot be repeated.
1045
      isUnsigned = true;
1046
      continue;  // Success.
1047
    case 'l':
1048
    case 'L':
1049
      if (HasSize)
1050
        break;
1051
      HasSize = true;
1052

1053
      // Check for long long.  The L's need to be adjacent and the same case.
1054
      if (s[1] == s[0]) {
1055
        assert(s + 1 < ThisTokEnd && "didn't maximally munch?");
1056
        if (isFPConstant) break;        // long long invalid for floats.
1057
        isLongLong = true;
1058
        ++s;  // Eat both of them.
1059
      } else {
1060
        isLong = true;
1061
      }
1062
      continue; // Success.
1063
    case 'z':
1064
    case 'Z':
1065
      if (isFPConstant)
1066
        break; // Invalid for floats.
1067
      if (HasSize)
1068
        break;
1069
      HasSize = true;
1070
      isSizeT = true;
1071
      continue;
1072
    case 'i':
1073
    case 'I':
1074
      if (LangOpts.MicrosoftExt && !isFPConstant) {
1075
        // Allow i8, i16, i32, and i64. First, look ahead and check if
1076
        // suffixes are Microsoft integers and not the imaginary unit.
1077
        uint8_t Bits = 0;
1078
        size_t ToSkip = 0;
1079
        switch (s[1]) {
1080
        case '8': // i8 suffix
1081
          Bits = 8;
1082
          ToSkip = 2;
1083
          break;
1084
        case '1':
1085
          if (s[2] == '6') { // i16 suffix
1086
            Bits = 16;
1087
            ToSkip = 3;
1088
          }
1089
          break;
1090
        case '3':
1091
          if (s[2] == '2') { // i32 suffix
1092
            Bits = 32;
1093
            ToSkip = 3;
1094
          }
1095
          break;
1096
        case '6':
1097
          if (s[2] == '4') { // i64 suffix
1098
            Bits = 64;
1099
            ToSkip = 3;
1100
          }
1101
          break;
1102
        default:
1103
          break;
1104
        }
1105
        if (Bits) {
1106
          if (HasSize)
1107
            break;
1108
          HasSize = true;
1109
          MicrosoftInteger = Bits;
1110
          s += ToSkip;
1111
          assert(s <= ThisTokEnd && "didn't maximally munch?");
1112
          break;
1113
        }
1114
      }
1115
      [[fallthrough]];
1116
    case 'j':
1117
    case 'J':
1118
      if (isImaginary) break;   // Cannot be repeated.
1119
      isImaginary = true;
1120
      continue;  // Success.
1121
    case '_':
1122
      if (isFPConstant)
1123
        break; // Invalid for floats
1124
      if (HasSize)
1125
        break;
1126
      // There is currently no way to reach this with DoubleUnderscore set.
1127
      // If new double underscope literals are added handle it here as above.
1128
      assert(!DoubleUnderscore && "unhandled double underscore case");
1129
      if (LangOpts.CPlusPlus && s + 2 < ThisTokEnd &&
1130
          s[1] == '_') { // s + 2 < ThisTokEnd to ensure some character exists
1131
                         // after __
1132
        DoubleUnderscore = true;
1133
        s += 2; // Skip both '_'
1134
        if (s + 1 < ThisTokEnd &&
1135
            (*s == 'u' || *s == 'U')) { // Ensure some character after 'u'/'U'
1136
          isUnsigned = true;
1137
          ++s;
1138
        }
1139
        if (s + 1 < ThisTokEnd &&
1140
            ((*s == 'w' && *(++s) == 'b') || (*s == 'W' && *(++s) == 'B'))) {
1141
          isBitInt = true;
1142
          HasSize = true;
1143
          continue;
1144
        }
1145
      }
1146
      break;
1147
    case 'w':
1148
    case 'W':
1149
      if (isFPConstant)
1150
        break; // Invalid for floats.
1151
      if (HasSize)
1152
        break; // Invalid if we already have a size for the literal.
1153

1154
      // wb and WB are allowed, but a mixture of cases like Wb or wB is not. We
1155
      // explicitly do not support the suffix in C++ as an extension because a
1156
      // library-based UDL that resolves to a library type may be more
1157
      // appropriate there. The same rules apply for __wb/__WB.
1158
      if ((!LangOpts.CPlusPlus || DoubleUnderscore) && s + 1 < ThisTokEnd &&
1159
          ((s[0] == 'w' && s[1] == 'b') || (s[0] == 'W' && s[1] == 'B'))) {
1160
        isBitInt = true;
1161
        HasSize = true;
1162
        ++s; // Skip both characters (2nd char skipped on continue).
1163
        continue; // Success.
1164
      }
1165
    }
1166
    // If we reached here, there was an error or a ud-suffix.
1167
    break;
1168
  }
1169

1170
  // "i", "if", and "il" are user-defined suffixes in C++1y.
1171
  if (s != ThisTokEnd || isImaginary) {
1172
    // FIXME: Don't bother expanding UCNs if !tok.hasUCN().
1173
    expandUCNs(UDSuffixBuf, StringRef(SuffixBegin, ThisTokEnd - SuffixBegin));
1174
    if (isValidUDSuffix(LangOpts, UDSuffixBuf)) {
1175
      if (!isImaginary) {
1176
        // Any suffix pieces we might have parsed are actually part of the
1177
        // ud-suffix.
1178
        isLong = false;
1179
        isUnsigned = false;
1180
        isLongLong = false;
1181
        isSizeT = false;
1182
        isFloat = false;
1183
        isFloat16 = false;
1184
        isHalf = false;
1185
        isImaginary = false;
1186
        isBitInt = false;
1187
        MicrosoftInteger = 0;
1188
        saw_fixed_point_suffix = false;
1189
        isFract = false;
1190
        isAccum = false;
1191
      }
1192

1193
      saw_ud_suffix = true;
1194
      return;
1195
    }
1196

1197
    if (s != ThisTokEnd) {
1198
      // Report an error if there are any.
1199
      Diags.Report(Lexer::AdvanceToTokenCharacter(
1200
                       TokLoc, SuffixBegin - ThisTokBegin, SM, LangOpts),
1201
                   diag::err_invalid_suffix_constant)
1202
          << StringRef(SuffixBegin, ThisTokEnd - SuffixBegin)
1203
          << (isFixedPointConstant ? 2 : isFPConstant);
1204
      hadError = true;
1205
    }
1206
  }
1207

1208
  if (!hadError && saw_fixed_point_suffix) {
1209
    assert(isFract || isAccum);
1210
  }
1211
}
1212

1213
/// ParseDecimalOrOctalCommon - This method is called for decimal or octal
1214
/// numbers. It issues an error for illegal digits, and handles floating point
1215
/// parsing. If it detects a floating point number, the radix is set to 10.
1216
void NumericLiteralParser::ParseDecimalOrOctalCommon(SourceLocation TokLoc){
1217
  assert((radix == 8 || radix == 10) && "Unexpected radix");
1218

1219
  // If we have a hex digit other than 'e' (which denotes a FP exponent) then
1220
  // the code is using an incorrect base.
1221
  if (isHexDigit(*s) && *s != 'e' && *s != 'E' &&
1222
      !isValidUDSuffix(LangOpts, StringRef(s, ThisTokEnd - s))) {
1223
    Diags.Report(
1224
        Lexer::AdvanceToTokenCharacter(TokLoc, s - ThisTokBegin, SM, LangOpts),
1225
        diag::err_invalid_digit)
1226
        << StringRef(s, 1) << (radix == 8 ? 1 : 0);
1227
    hadError = true;
1228
    return;
1229
  }
1230

1231
  if (*s == '.') {
1232
    checkSeparator(TokLoc, s, CSK_AfterDigits);
1233
    s++;
1234
    radix = 10;
1235
    saw_period = true;
1236
    checkSeparator(TokLoc, s, CSK_BeforeDigits);
1237
    s = SkipDigits(s); // Skip suffix.
1238
  }
1239
  if (*s == 'e' || *s == 'E') { // exponent
1240
    checkSeparator(TokLoc, s, CSK_AfterDigits);
1241
    const char *Exponent = s;
1242
    s++;
1243
    radix = 10;
1244
    saw_exponent = true;
1245
    if (s != ThisTokEnd && (*s == '+' || *s == '-'))  s++; // sign
1246
    const char *first_non_digit = SkipDigits(s);
1247
    if (containsDigits(s, first_non_digit)) {
1248
      checkSeparator(TokLoc, s, CSK_BeforeDigits);
1249
      s = first_non_digit;
1250
    } else {
1251
      if (!hadError) {
1252
        Diags.Report(Lexer::AdvanceToTokenCharacter(
1253
                         TokLoc, Exponent - ThisTokBegin, SM, LangOpts),
1254
                     diag::err_exponent_has_no_digits);
1255
        hadError = true;
1256
      }
1257
      return;
1258
    }
1259
  }
1260
}
1261

1262
/// Determine whether a suffix is a valid ud-suffix. We avoid treating reserved
1263
/// suffixes as ud-suffixes, because the diagnostic experience is better if we
1264
/// treat it as an invalid suffix.
1265
bool NumericLiteralParser::isValidUDSuffix(const LangOptions &LangOpts,
1266
                                           StringRef Suffix) {
1267
  if (!LangOpts.CPlusPlus11 || Suffix.empty())
1268
    return false;
1269

1270
  // By C++11 [lex.ext]p10, ud-suffixes starting with an '_' are always valid.
1271
  // Suffixes starting with '__' (double underscore) are for use by
1272
  // the implementation.
1273
  if (Suffix.starts_with("_") && !Suffix.starts_with("__"))
1274
    return true;
1275

1276
  // In C++11, there are no library suffixes.
1277
  if (!LangOpts.CPlusPlus14)
1278
    return false;
1279

1280
  // In C++14, "s", "h", "min", "ms", "us", and "ns" are used in the library.
1281
  // Per tweaked N3660, "il", "i", and "if" are also used in the library.
1282
  // In C++2a "d" and "y" are used in the library.
1283
  return llvm::StringSwitch<bool>(Suffix)
1284
      .Cases("h", "min", "s", true)
1285
      .Cases("ms", "us", "ns", true)
1286
      .Cases("il", "i", "if", true)
1287
      .Cases("d", "y", LangOpts.CPlusPlus20)
1288
      .Default(false);
1289
}
1290

1291
void NumericLiteralParser::checkSeparator(SourceLocation TokLoc,
1292
                                          const char *Pos,
1293
                                          CheckSeparatorKind IsAfterDigits) {
1294
  if (IsAfterDigits == CSK_AfterDigits) {
1295
    if (Pos == ThisTokBegin)
1296
      return;
1297
    --Pos;
1298
  } else if (Pos == ThisTokEnd)
1299
    return;
1300

1301
  if (isDigitSeparator(*Pos)) {
1302
    Diags.Report(Lexer::AdvanceToTokenCharacter(TokLoc, Pos - ThisTokBegin, SM,
1303
                                                LangOpts),
1304
                 diag::err_digit_separator_not_between_digits)
1305
        << IsAfterDigits;
1306
    hadError = true;
1307
  }
1308
}
1309

1310
/// ParseNumberStartingWithZero - This method is called when the first character
1311
/// of the number is found to be a zero.  This means it is either an octal
1312
/// number (like '04') or a hex number ('0x123a') a binary number ('0b1010') or
1313
/// a floating point number (01239.123e4).  Eat the prefix, determining the
1314
/// radix etc.
1315
void NumericLiteralParser::ParseNumberStartingWithZero(SourceLocation TokLoc) {
1316
  assert(s[0] == '0' && "Invalid method call");
1317
  s++;
1318

1319
  int c1 = s[0];
1320

1321
  // Handle a hex number like 0x1234.
1322
  if ((c1 == 'x' || c1 == 'X') && (isHexDigit(s[1]) || s[1] == '.')) {
1323
    s++;
1324
    assert(s < ThisTokEnd && "didn't maximally munch?");
1325
    radix = 16;
1326
    DigitsBegin = s;
1327
    s = SkipHexDigits(s);
1328
    bool HasSignificandDigits = containsDigits(DigitsBegin, s);
1329
    if (s == ThisTokEnd) {
1330
      // Done.
1331
    } else if (*s == '.') {
1332
      s++;
1333
      saw_period = true;
1334
      const char *floatDigitsBegin = s;
1335
      s = SkipHexDigits(s);
1336
      if (containsDigits(floatDigitsBegin, s))
1337
        HasSignificandDigits = true;
1338
      if (HasSignificandDigits)
1339
        checkSeparator(TokLoc, floatDigitsBegin, CSK_BeforeDigits);
1340
    }
1341

1342
    if (!HasSignificandDigits) {
1343
      Diags.Report(Lexer::AdvanceToTokenCharacter(TokLoc, s - ThisTokBegin, SM,
1344
                                                  LangOpts),
1345
                   diag::err_hex_constant_requires)
1346
          << LangOpts.CPlusPlus << 1;
1347
      hadError = true;
1348
      return;
1349
    }
1350

1351
    // A binary exponent can appear with or with a '.'. If dotted, the
1352
    // binary exponent is required.
1353
    if (*s == 'p' || *s == 'P') {
1354
      checkSeparator(TokLoc, s, CSK_AfterDigits);
1355
      const char *Exponent = s;
1356
      s++;
1357
      saw_exponent = true;
1358
      if (s != ThisTokEnd && (*s == '+' || *s == '-'))  s++; // sign
1359
      const char *first_non_digit = SkipDigits(s);
1360
      if (!containsDigits(s, first_non_digit)) {
1361
        if (!hadError) {
1362
          Diags.Report(Lexer::AdvanceToTokenCharacter(
1363
                           TokLoc, Exponent - ThisTokBegin, SM, LangOpts),
1364
                       diag::err_exponent_has_no_digits);
1365
          hadError = true;
1366
        }
1367
        return;
1368
      }
1369
      checkSeparator(TokLoc, s, CSK_BeforeDigits);
1370
      s = first_non_digit;
1371

1372
      if (!LangOpts.HexFloats)
1373
        Diags.Report(TokLoc, LangOpts.CPlusPlus
1374
                                 ? diag::ext_hex_literal_invalid
1375
                                 : diag::ext_hex_constant_invalid);
1376
      else if (LangOpts.CPlusPlus17)
1377
        Diags.Report(TokLoc, diag::warn_cxx17_hex_literal);
1378
    } else if (saw_period) {
1379
      Diags.Report(Lexer::AdvanceToTokenCharacter(TokLoc, s - ThisTokBegin, SM,
1380
                                                  LangOpts),
1381
                   diag::err_hex_constant_requires)
1382
          << LangOpts.CPlusPlus << 0;
1383
      hadError = true;
1384
    }
1385
    return;
1386
  }
1387

1388
  // Handle simple binary numbers 0b01010
1389
  if ((c1 == 'b' || c1 == 'B') && (s[1] == '0' || s[1] == '1')) {
1390
    // 0b101010 is a C++14 and C23 extension.
1391
    unsigned DiagId;
1392
    if (LangOpts.CPlusPlus14)
1393
      DiagId = diag::warn_cxx11_compat_binary_literal;
1394
    else if (LangOpts.C23)
1395
      DiagId = diag::warn_c23_compat_binary_literal;
1396
    else if (LangOpts.CPlusPlus)
1397
      DiagId = diag::ext_binary_literal_cxx14;
1398
    else
1399
      DiagId = diag::ext_binary_literal;
1400
    Diags.Report(TokLoc, DiagId);
1401
    ++s;
1402
    assert(s < ThisTokEnd && "didn't maximally munch?");
1403
    radix = 2;
1404
    DigitsBegin = s;
1405
    s = SkipBinaryDigits(s);
1406
    if (s == ThisTokEnd) {
1407
      // Done.
1408
    } else if (isHexDigit(*s) &&
1409
               !isValidUDSuffix(LangOpts, StringRef(s, ThisTokEnd - s))) {
1410
      Diags.Report(Lexer::AdvanceToTokenCharacter(TokLoc, s - ThisTokBegin, SM,
1411
                                                  LangOpts),
1412
                   diag::err_invalid_digit)
1413
          << StringRef(s, 1) << 2;
1414
      hadError = true;
1415
    }
1416
    // Other suffixes will be diagnosed by the caller.
1417
    return;
1418
  }
1419

1420
  // For now, the radix is set to 8. If we discover that we have a
1421
  // floating point constant, the radix will change to 10. Octal floating
1422
  // point constants are not permitted (only decimal and hexadecimal).
1423
  radix = 8;
1424
  const char *PossibleNewDigitStart = s;
1425
  s = SkipOctalDigits(s);
1426
  // When the value is 0 followed by a suffix (like 0wb), we want to leave 0
1427
  // as the start of the digits. So if skipping octal digits does not skip
1428
  // anything, we leave the digit start where it was.
1429
  if (s != PossibleNewDigitStart)
1430
    DigitsBegin = PossibleNewDigitStart;
1431

1432
  if (s == ThisTokEnd)
1433
    return; // Done, simple octal number like 01234
1434

1435
  // If we have some other non-octal digit that *is* a decimal digit, see if
1436
  // this is part of a floating point number like 094.123 or 09e1.
1437
  if (isDigit(*s)) {
1438
    const char *EndDecimal = SkipDigits(s);
1439
    if (EndDecimal[0] == '.' || EndDecimal[0] == 'e' || EndDecimal[0] == 'E') {
1440
      s = EndDecimal;
1441
      radix = 10;
1442
    }
1443
  }
1444

1445
  ParseDecimalOrOctalCommon(TokLoc);
1446
}
1447

1448
/// Return true if any literal with \p NumDigits digits in the given \p Radix
/// is guaranteed to fit in 64 bits. The bound is conservative: a literal with
/// more digits may still fit. Only radices 2, 8, 10 and 16 are supported.
static bool alwaysFitsInto64Bits(unsigned Radix, unsigned NumDigits) {
  switch (Radix) {
  case 2:
    return NumDigits <= 64;
  case 8:
    return NumDigits <= 64 / 3; // Digits are groups of 3 bits.
  case 10:
    return NumDigits <= 19; // floor(log10(2^64))
  case 16:
    return NumDigits <= 64 / 4; // Digits are groups of 4 bits.
  default:
    llvm_unreachable("impossible Radix");
  }
}
1462

1463
/// GetIntegerValue - Convert this numeric literal value to an APInt that
1464
/// matches Val's input width.  If there is an overflow, set Val to the low bits
1465
/// of the result and return true.  Otherwise, return false.
1466
bool NumericLiteralParser::GetIntegerValue(llvm::APInt &Val) {
1467
  // Fast path: Compute a conservative bound on the maximum number of
1468
  // bits per digit in this radix. If we can't possibly overflow a
1469
  // uint64 based on that bound then do the simple conversion to
1470
  // integer. This avoids the expensive overflow checking below, and
1471
  // handles the common cases that matter (small decimal integers and
1472
  // hex/octal values which don't overflow).
1473
  const unsigned NumDigits = SuffixBegin - DigitsBegin;
1474
  if (alwaysFitsInto64Bits(radix, NumDigits)) {
1475
    uint64_t N = 0;
1476
    for (const char *Ptr = DigitsBegin; Ptr != SuffixBegin; ++Ptr)
1477
      if (!isDigitSeparator(*Ptr))
1478
        N = N * radix + llvm::hexDigitValue(*Ptr);
1479

1480
    // This will truncate the value to Val's input width. Simply check
1481
    // for overflow by comparing.
1482
    Val = N;
1483
    return Val.getZExtValue() != N;
1484
  }
1485

1486
  Val = 0;
1487
  const char *Ptr = DigitsBegin;
1488

1489
  llvm::APInt RadixVal(Val.getBitWidth(), radix);
1490
  llvm::APInt CharVal(Val.getBitWidth(), 0);
1491
  llvm::APInt OldVal = Val;
1492

1493
  bool OverflowOccurred = false;
1494
  while (Ptr < SuffixBegin) {
1495
    if (isDigitSeparator(*Ptr)) {
1496
      ++Ptr;
1497
      continue;
1498
    }
1499

1500
    unsigned C = llvm::hexDigitValue(*Ptr++);
1501

1502
    // If this letter is out of bound for this radix, reject it.
1503
    assert(C < radix && "NumericLiteralParser ctor should have rejected this");
1504

1505
    CharVal = C;
1506

1507
    // Add the digit to the value in the appropriate radix.  If adding in digits
1508
    // made the value smaller, then this overflowed.
1509
    OldVal = Val;
1510

1511
    // Multiply by radix, did overflow occur on the multiply?
1512
    Val *= RadixVal;
1513
    OverflowOccurred |= Val.udiv(RadixVal) != OldVal;
1514

1515
    // Add value, did overflow occur on the value?
1516
    //   (a + b) ult b  <=> overflow
1517
    Val += CharVal;
1518
    OverflowOccurred |= Val.ult(CharVal);
1519
  }
1520
  return OverflowOccurred;
1521
}
1522

1523
/// Convert this literal to a floating-point value.
///
/// The text from the start of the token up to the suffix is passed to
/// APFloat::convertFromString using rounding mode \p RM, with any digit
/// separators (') removed first. The converted value is stored in \p Result.
///
/// \returns the status reported by the conversion, or APFloat::opInvalidOp if
///          the conversion produced an error.
llvm::APFloat::opStatus
NumericLiteralParser::GetFloatValue(llvm::APFloat &Result,
                                    llvm::RoundingMode RM) {
  using llvm::APFloat;

  unsigned n = std::min(SuffixBegin - ThisTokBegin, ThisTokEnd - ThisTokBegin);

  llvm::SmallString<16> Buffer;
  StringRef Str(ThisTokBegin, n);
  if (Str.contains('\'')) {
    // Copy the text without digit separators; Str then refers to Buffer.
    Buffer.reserve(n);
    std::remove_copy_if(Str.begin(), Str.end(), std::back_inserter(Buffer),
                        &isDigitSeparator);
    Str = Buffer;
  }

  auto StatusOrErr = Result.convertFromString(Str, RM);
  assert(StatusOrErr && "Invalid floating point representation");
  return !errorToBool(StatusOrErr.takeError()) ? *StatusOrErr
                                               : APFloat::opInvalidOp;
}
1544

1545
/// Return true if \p c introduces an exponent: 'p'/'P' when \p isHex is true,
/// 'e'/'E' otherwise.
static inline bool IsExponentPart(char c, bool isHex) {
  if (isHex)
    return c == 'p' || c == 'P';
  return c == 'e' || c == 'E';
}
1550

1551
/// Convert this literal to a fixed-point value with \p Scale fractional bits.
///
/// The significand digits are accumulated as an integer, shifted left by
/// \p Scale, and then multiplied or divided by the base (10 for decimal
/// literals, 2 for hexadecimal ones) to account for the exponent and for the
/// digits after the radix point. The result is truncated or zero-extended to
/// StoreVal's bit width and stored in \p StoreVal.
///
/// \returns true if the value does not fit in StoreVal's width or the
///          exponent was too large to handle; false otherwise.
bool NumericLiteralParser::GetFixedPointValue(llvm::APInt &StoreVal, unsigned Scale) {
  assert(radix == 16 || radix == 10);

  // Find how many digits are needed to store the whole literal.
  unsigned NumDigits = SuffixBegin - DigitsBegin;
  if (saw_period) --NumDigits;

  // Initial scan of the exponent if it exists
  bool ExpOverflowOccurred = false;
  bool NegativeExponent = false;
  // Only set (and only read) when saw_exponent is true.
  const char *ExponentBegin = nullptr;
  uint64_t Exponent = 0;
  int64_t BaseShift = 0;
  if (saw_exponent) {
    const char *Ptr = DigitsBegin;

    while (!IsExponentPart(*Ptr, radix == 16))
      ++Ptr;
    ExponentBegin = Ptr;
    ++Ptr;
    NegativeExponent = *Ptr == '-';
    if (NegativeExponent) ++Ptr;

    unsigned NumExpDigits = SuffixBegin - Ptr;
    if (alwaysFitsInto64Bits(radix, NumExpDigits)) {
      llvm::StringRef ExpStr(Ptr, NumExpDigits);
      llvm::APInt ExpInt(/*numBits=*/64, ExpStr, /*radix=*/10);
      Exponent = ExpInt.getZExtValue();
    } else {
      ExpOverflowOccurred = true;
    }

    if (NegativeExponent) BaseShift -= Exponent;
    else BaseShift += Exponent;
  }

  // Number of bits needed for decimal literal is
  //   ceil(NumDigits * log2(10))       Integral part
  // + Scale                            Fractional part
  // + ceil(Exponent * log2(10))        Exponent
  // --------------------------------------------------
  //   ceil((NumDigits + Exponent) * log2(10)) + Scale
  //
  // But for simplicity in handling integers, we can round up log2(10) to 4,
  // making:
  // 4 * (NumDigits + Exponent) + Scale
  //
  // Number of digits needed for hexadecimal literal is
  //   4 * NumDigits                    Integral part
  // + Scale                            Fractional part
  // + Exponent                         Exponent
  // --------------------------------------------------
  //   (4 * NumDigits) + Scale + Exponent
  uint64_t NumBitsNeeded;
  if (radix == 10)
    NumBitsNeeded = 4 * (NumDigits + Exponent) + Scale;
  else
    NumBitsNeeded = 4 * NumDigits + Exponent + Scale;

  if (NumBitsNeeded > std::numeric_limits<unsigned>::max())
    ExpOverflowOccurred = true;
  llvm::APInt Val(static_cast<unsigned>(NumBitsNeeded), 0, /*isSigned=*/false);

  bool FoundDecimal = false;

  int64_t FractBaseShift = 0;
  const char *End = saw_exponent ? ExponentBegin : SuffixBegin;
  for (const char *Ptr = DigitsBegin; Ptr < End; ++Ptr) {
    if (*Ptr == '.') {
      FoundDecimal = true;
      continue;
    }

    // Normal reading of an integer
    unsigned C = llvm::hexDigitValue(*Ptr);
    assert(C < radix && "NumericLiteralParser ctor should have rejected this");

    Val *= radix;
    Val += C;

    if (FoundDecimal)
      // Keep track of how much we will need to adjust this value by from the
      // number of digits past the radix point.
      --FractBaseShift;
  }

  // For a radix of 16, we will be multiplying by 2 instead of 16.
  if (radix == 16) FractBaseShift *= 4;
  BaseShift += FractBaseShift;

  Val <<= Scale;

  uint64_t Base = (radix == 16) ? 2 : 10;
  if (BaseShift > 0) {
    for (int64_t i = 0; i < BaseShift; ++i) {
      Val *= Base;
    }
  } else if (BaseShift < 0) {
    // Stop early once the value reaches zero; further division cannot change
    // it.
    for (int64_t i = BaseShift; i < 0 && !Val.isZero(); ++i)
      Val = Val.udiv(Base);
  }

  // Fit the computed value into StoreVal's width, noting whether it exceeds
  // the largest value representable at that width.
  bool IntOverflowOccurred = false;
  auto MaxVal = llvm::APInt::getMaxValue(StoreVal.getBitWidth());
  if (Val.getBitWidth() > StoreVal.getBitWidth()) {
    IntOverflowOccurred |= Val.ugt(MaxVal.zext(Val.getBitWidth()));
    StoreVal = Val.trunc(StoreVal.getBitWidth());
  } else if (Val.getBitWidth() < StoreVal.getBitWidth()) {
    IntOverflowOccurred |= Val.zext(MaxVal.getBitWidth()).ugt(MaxVal);
    StoreVal = Val.zext(StoreVal.getBitWidth());
  } else {
    StoreVal = Val;
  }

  return IntOverflowOccurred || ExpOverflowOccurred;
}
1667

1668
/// \verbatim
1669
///       user-defined-character-literal: [C++11 lex.ext]
1670
///         character-literal ud-suffix
1671
///       ud-suffix:
1672
///         identifier
1673
///       character-literal: [C++11 lex.ccon]
1674
///         ' c-char-sequence '
1675
///         u' c-char-sequence '
1676
///         U' c-char-sequence '
1677
///         L' c-char-sequence '
1678
///         u8' c-char-sequence ' [C++1z lex.ccon]
1679
///       c-char-sequence:
1680
///         c-char
1681
///         c-char-sequence c-char
1682
///       c-char:
1683
///         any member of the source character set except the single-quote ',
1684
///           backslash \, or new-line character
1685
///         escape-sequence
1686
///         universal-character-name
1687
///       escape-sequence:
1688
///         simple-escape-sequence
1689
///         octal-escape-sequence
1690
///         hexadecimal-escape-sequence
1691
///       simple-escape-sequence:
1692
///         one of \' \" \? \\ \a \b \f \n \r \t \v
1693
///       octal-escape-sequence:
1694
///         \ octal-digit
1695
///         \ octal-digit octal-digit
1696
///         \ octal-digit octal-digit octal-digit
1697
///       hexadecimal-escape-sequence:
1698
///         \x hexadecimal-digit
1699
///         hexadecimal-escape-sequence hexadecimal-digit
1700
///       universal-character-name: [C++11 lex.charset]
1701
///         \u hex-quad
1702
///         \U hex-quad hex-quad
1703
///       hex-quad:
1704
///         hex-digit hex-digit hex-digit hex-digit
1705
/// \endverbatim
1706
///
1707
CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
1708
                                     SourceLocation Loc, Preprocessor &PP,
1709
                                     tok::TokenKind kind) {
1710
  // At this point we know that the character matches the regex "(L|u|U)?'.*'".
1711
  HadError = false;
1712

1713
  Kind = kind;
1714

1715
  const char *TokBegin = begin;
1716

1717
  // Skip over wide character determinant.
1718
  if (Kind != tok::char_constant)
1719
    ++begin;
1720
  if (Kind == tok::utf8_char_constant)
1721
    ++begin;
1722

1723
  // Skip over the entry quote.
1724
  if (begin[0] != '\'') {
1725
    PP.Diag(Loc, diag::err_lexing_char);
1726
    HadError = true;
1727
    return;
1728
  }
1729

1730
  ++begin;
1731

1732
  // Remove an optional ud-suffix.
1733
  if (end[-1] != '\'') {
1734
    const char *UDSuffixEnd = end;
1735
    do {
1736
      --end;
1737
    } while (end[-1] != '\'');
1738
    // FIXME: Don't bother with this if !tok.hasUCN().
1739
    expandUCNs(UDSuffixBuf, StringRef(end, UDSuffixEnd - end));
1740
    UDSuffixOffset = end - TokBegin;
1741
  }
1742

1743
  // Trim the ending quote.
1744
  assert(end != begin && "Invalid token lexed");
1745
  --end;
1746

1747
  // FIXME: The "Value" is an uint64_t so we can handle char literals of
1748
  // up to 64-bits.
1749
  // FIXME: This extensively assumes that 'char' is 8-bits.
1750
  assert(PP.getTargetInfo().getCharWidth() == 8 &&
1751
         "Assumes char is 8 bits");
1752
  assert(PP.getTargetInfo().getIntWidth() <= 64 &&
1753
         (PP.getTargetInfo().getIntWidth() & 7) == 0 &&
1754
         "Assumes sizeof(int) on target is <= 64 and a multiple of char");
1755
  assert(PP.getTargetInfo().getWCharWidth() <= 64 &&
1756
         "Assumes sizeof(wchar) on target is <= 64");
1757

1758
  SmallVector<uint32_t, 4> codepoint_buffer;
1759
  codepoint_buffer.resize(end - begin);
1760
  uint32_t *buffer_begin = &codepoint_buffer.front();
1761
  uint32_t *buffer_end = buffer_begin + codepoint_buffer.size();
1762

1763
  // Unicode escapes representing characters that cannot be correctly
1764
  // represented in a single code unit are disallowed in character literals
1765
  // by this implementation.
1766
  uint32_t largest_character_for_kind;
1767
  if (tok::wide_char_constant == Kind) {
1768
    largest_character_for_kind =
1769
        0xFFFFFFFFu >> (32-PP.getTargetInfo().getWCharWidth());
1770
  } else if (tok::utf8_char_constant == Kind) {
1771
    largest_character_for_kind = 0x7F;
1772
  } else if (tok::utf16_char_constant == Kind) {
1773
    largest_character_for_kind = 0xFFFF;
1774
  } else if (tok::utf32_char_constant == Kind) {
1775
    largest_character_for_kind = 0x10FFFF;
1776
  } else {
1777
    largest_character_for_kind = 0x7Fu;
1778
  }
1779

1780
  while (begin != end) {
1781
    // Is this a span of non-escape characters?
1782
    if (begin[0] != '\\') {
1783
      char const *start = begin;
1784
      do {
1785
        ++begin;
1786
      } while (begin != end && *begin != '\\');
1787

1788
      char const *tmp_in_start = start;
1789
      uint32_t *tmp_out_start = buffer_begin;
1790
      llvm::ConversionResult res =
1791
          llvm::ConvertUTF8toUTF32(reinterpret_cast<llvm::UTF8 const **>(&start),
1792
                             reinterpret_cast<llvm::UTF8 const *>(begin),
1793
                             &buffer_begin, buffer_end, llvm::strictConversion);
1794
      if (res != llvm::conversionOK) {
1795
        // If we see bad encoding for unprefixed character literals, warn and
1796
        // simply copy the byte values, for compatibility with gcc and
1797
        // older versions of clang.
1798
        bool NoErrorOnBadEncoding = isOrdinary();
1799
        unsigned Msg = diag::err_bad_character_encoding;
1800
        if (NoErrorOnBadEncoding)
1801
          Msg = diag::warn_bad_character_encoding;
1802
        PP.Diag(Loc, Msg);
1803
        if (NoErrorOnBadEncoding) {
1804
          start = tmp_in_start;
1805
          buffer_begin = tmp_out_start;
1806
          for (; start != begin; ++start, ++buffer_begin)
1807
            *buffer_begin = static_cast<uint8_t>(*start);
1808
        } else {
1809
          HadError = true;
1810
        }
1811
      } else {
1812
        for (; tmp_out_start < buffer_begin; ++tmp_out_start) {
1813
          if (*tmp_out_start > largest_character_for_kind) {
1814
            HadError = true;
1815
            PP.Diag(Loc, diag::err_character_too_large);
1816
          }
1817
        }
1818
      }
1819

1820
      continue;
1821
    }
1822
    // Is this a Universal Character Name escape?
1823
    if (begin[1] == 'u' || begin[1] == 'U' || begin[1] == 'N') {
1824
      unsigned short UcnLen = 0;
1825
      if (!ProcessUCNEscape(TokBegin, begin, end, *buffer_begin, UcnLen,
1826
                            FullSourceLoc(Loc, PP.getSourceManager()),
1827
                            &PP.getDiagnostics(), PP.getLangOpts(), true)) {
1828
        HadError = true;
1829
      } else if (*buffer_begin > largest_character_for_kind) {
1830
        HadError = true;
1831
        PP.Diag(Loc, diag::err_character_too_large);
1832
      }
1833

1834
      ++buffer_begin;
1835
      continue;
1836
    }
1837
    unsigned CharWidth = getCharWidth(Kind, PP.getTargetInfo());
1838
    uint64_t result =
1839
        ProcessCharEscape(TokBegin, begin, end, HadError,
1840
                          FullSourceLoc(Loc, PP.getSourceManager()), CharWidth,
1841
                          &PP.getDiagnostics(), PP.getLangOpts(),
1842
                          StringLiteralEvalMethod::Evaluated);
1843
    *buffer_begin++ = result;
1844
  }
1845

1846
  unsigned NumCharsSoFar = buffer_begin - &codepoint_buffer.front();
1847

1848
  if (NumCharsSoFar > 1) {
1849
    if (isOrdinary() && NumCharsSoFar == 4)
1850
      PP.Diag(Loc, diag::warn_four_char_character_literal);
1851
    else if (isOrdinary())
1852
      PP.Diag(Loc, diag::warn_multichar_character_literal);
1853
    else {
1854
      PP.Diag(Loc, diag::err_multichar_character_literal) << (isWide() ? 0 : 1);
1855
      HadError = true;
1856
    }
1857
    IsMultiChar = true;
1858
  } else {
1859
    IsMultiChar = false;
1860
  }
1861

1862
  llvm::APInt LitVal(PP.getTargetInfo().getIntWidth(), 0);
1863

1864
  // Narrow character literals act as though their value is concatenated
1865
  // in this implementation, but warn on overflow.
1866
  bool multi_char_too_long = false;
1867
  if (isOrdinary() && isMultiChar()) {
1868
    LitVal = 0;
1869
    for (size_t i = 0; i < NumCharsSoFar; ++i) {
1870
      // check for enough leading zeros to shift into
1871
      multi_char_too_long |= (LitVal.countl_zero() < 8);
1872
      LitVal <<= 8;
1873
      LitVal = LitVal + (codepoint_buffer[i] & 0xFF);
1874
    }
1875
  } else if (NumCharsSoFar > 0) {
1876
    // otherwise just take the last character
1877
    LitVal = buffer_begin[-1];
1878
  }
1879

1880
  if (!HadError && multi_char_too_long) {
1881
    PP.Diag(Loc, diag::warn_char_constant_too_large);
1882
  }
1883

1884
  // Transfer the value from APInt to uint64_t
1885
  Value = LitVal.getZExtValue();
1886

1887
  // If this is a single narrow character, sign extend it (e.g. '\xFF' is "-1")
1888
  // if 'char' is signed for this target (C99 6.4.4.4p10).  Note that multiple
1889
  // character constants are not sign extended in the this implementation:
1890
  // '\xFF\xFF' = 65536 and '\x0\xFF' = 255, which matches GCC.
1891
  if (isOrdinary() && NumCharsSoFar == 1 && (Value & 128) &&
1892
      PP.getLangOpts().CharIsSigned)
1893
    Value = (signed char)Value;
1894
}
1895

1896
/// \verbatim
1897
///       string-literal: [C++0x lex.string]
1898
///         encoding-prefix " [s-char-sequence] "
1899
///         encoding-prefix R raw-string
1900
///       encoding-prefix:
1901
///         u8
1902
///         u
1903
///         U
1904
///         L
1905
///       s-char-sequence:
1906
///         s-char
1907
///         s-char-sequence s-char
1908
///       s-char:
1909
///         any member of the source character set except the double-quote ",
1910
///           backslash \, or new-line character
1911
///         escape-sequence
1912
///         universal-character-name
1913
///       raw-string:
1914
///         " d-char-sequence ( r-char-sequence ) d-char-sequence "
1915
///       r-char-sequence:
1916
///         r-char
1917
///         r-char-sequence r-char
1918
///       r-char:
1919
///         any member of the source character set, except a right parenthesis )
1920
///           followed by the initial d-char-sequence (which may be empty)
1921
///           followed by a double quote ".
1922
///       d-char-sequence:
1923
///         d-char
1924
///         d-char-sequence d-char
1925
///       d-char:
1926
///         any member of the basic source character set except:
1927
///           space, the left parenthesis (, the right parenthesis ),
1928
///           the backslash \, and the control characters representing horizontal
1929
///           tab, vertical tab, form feed, and newline.
1930
///       escape-sequence: [C++0x lex.ccon]
1931
///         simple-escape-sequence
1932
///         octal-escape-sequence
1933
///         hexadecimal-escape-sequence
1934
///       simple-escape-sequence:
1935
///         one of \' \" \? \\ \a \b \f \n \r \t \v
1936
///       octal-escape-sequence:
1937
///         \ octal-digit
1938
///         \ octal-digit octal-digit
1939
///         \ octal-digit octal-digit octal-digit
1940
///       hexadecimal-escape-sequence:
1941
///         \x hexadecimal-digit
1942
///         hexadecimal-escape-sequence hexadecimal-digit
1943
///       universal-character-name:
1944
///         \u hex-quad
1945
///         \U hex-quad hex-quad
1946
///       hex-quad:
1947
///         hex-digit hex-digit hex-digit hex-digit
1948
/// \endverbatim
1949
///
1950
StringLiteralParser::StringLiteralParser(ArrayRef<Token> StringToks,
1951
                                         Preprocessor &PP,
1952
                                         StringLiteralEvalMethod EvalMethod)
1953
    : SM(PP.getSourceManager()), Features(PP.getLangOpts()),
1954
      Target(PP.getTargetInfo()), Diags(&PP.getDiagnostics()),
1955
      MaxTokenLength(0), SizeBound(0), CharByteWidth(0), Kind(tok::unknown),
1956
      ResultPtr(ResultBuf.data()), EvalMethod(EvalMethod), hadError(false),
1957
      Pascal(false) {
1958
  init(StringToks);
1959
}
1960

1961
/// Concatenate and decode the string-literal tokens in \p StringToks into
/// ResultBuf.
///
/// The first pass over the tokens computes an upper bound on the output size,
/// determines the literal kind from the encoding prefixes and diagnoses
/// illegal prefix combinations. The second pass spells each token, strips the
/// prefix, quotes, raw-string delimiters and any ud-suffix, and appends the
/// decoded characters at ResultPtr. Finally the Pascal length prefix is filled
/// in, or the literal length is checked against the language's limit.
///
/// Problems are reported through Diags (when non-null) and recorded in
/// hadError; lexing-level failures return early via DiagnoseLexingError().
void StringLiteralParser::init(ArrayRef<Token> StringToks) {
  // The literal token may have come from an invalid source location (e.g. due
  // to a PCH error), in which case the token length will be 0.
  if (StringToks.empty() || StringToks[0].getLength() < 2)
    return DiagnoseLexingError(SourceLocation());

  // Scan all of the string portions, remember the max individual token length,
  // computing a bound on the concatenated string length, and see whether any
  // piece is a wide-string.  If any of the string portions is a wide-string
  // literal, the result is a wide-string literal [C99 6.4.5p4].
  //
  // Both accumulators start at zero: the loop below visits every token,
  // including the first, so seeding them from StringToks[0] would count the
  // first token twice and over-allocate the result buffer.
  MaxTokenLength = 0;
  SizeBound = 0;
  hadError = false;

  // Determines the kind of string from the prefix
  Kind = tok::string_literal;

  // Implement Translation Phase #6: concatenation of string literals
  // (C99 5.1.1.2p1).  The common case is only one string fragment.
  for (const Token &Tok : StringToks) {
    if (Tok.getLength() < 2)
      return DiagnoseLexingError(Tok.getLocation());

    // The string could be shorter than this if it needs cleaning, but this is a
    // reasonable bound, which is all we need.
    SizeBound += Tok.getLength() - 2; // -2 for "".

    // Remember maximum string piece length.
    if (Tok.getLength() > MaxTokenLength)
      MaxTokenLength = Tok.getLength();

    // Remember if we see any wide or utf-8/16/32 strings.
    // Also check for illegal concatenations.
    if (isUnevaluated() && Tok.getKind() != tok::string_literal) {
      if (Diags) {
        SourceLocation PrefixEndLoc = Lexer::AdvanceToTokenCharacter(
            Tok.getLocation(), getEncodingPrefixLen(Tok.getKind()), SM,
            Features);
        CharSourceRange Range =
            CharSourceRange::getCharRange({Tok.getLocation(), PrefixEndLoc});
        StringRef Prefix(SM.getCharacterData(Tok.getLocation()),
                         getEncodingPrefixLen(Tok.getKind()));
        Diags->Report(Tok.getLocation(),
                      Features.CPlusPlus26
                          ? diag::err_unevaluated_string_prefix
                          : diag::warn_unevaluated_string_prefix)
            << Prefix << Features.CPlusPlus << FixItHint::CreateRemoval(Range);
      }
      if (Features.CPlusPlus26)
        hadError = true;
    } else if (Tok.isNot(Kind) && Tok.isNot(tok::string_literal)) {
      // The first prefixed token decides the kind; a later token with a
      // different prefix is an unsupported concatenation.
      if (isOrdinary()) {
        Kind = Tok.getKind();
      } else {
        if (Diags)
          Diags->Report(Tok.getLocation(), diag::err_unsupported_string_concat);
        hadError = true;
      }
    }
  }

  // Include space for the null terminator.
  ++SizeBound;

  // TODO: K&R warning: "traditional C rejects string constant concatenation"

  // Get the width in bytes of char/wchar_t/char16_t/char32_t
  CharByteWidth = getCharWidth(Kind, Target);
  assert((CharByteWidth & 7) == 0 && "Assumes character size is byte multiple");
  CharByteWidth /= 8;

  // The output buffer size needs to be large enough to hold wide characters.
  // This is a worst-case assumption which basically corresponds to L"" "long".
  SizeBound *= CharByteWidth;

  // Size the temporary buffer to hold the result string data.
  ResultBuf.resize(SizeBound);

  // Likewise, but for each string piece.
  SmallString<512> TokenBuf;
  TokenBuf.resize(MaxTokenLength);

  // Loop over all the strings, getting their spelling, and expanding them to
  // wide strings as appropriate.
  ResultPtr = &ResultBuf[0];   // Next byte to fill in.

  Pascal = false;

  // Location of the token that supplied the first ud-suffix, for diagnostics.
  SourceLocation UDSuffixTokLoc;

  for (unsigned i = 0, e = StringToks.size(); i != e; ++i) {
    const char *ThisTokBuf = &TokenBuf[0];
    // Get the spelling of the token, which eliminates trigraphs, etc.  We know
    // that ThisTokBuf points to a buffer that is big enough for the whole token
    // and 'spelled' tokens can only shrink.
    bool StringInvalid = false;
    unsigned ThisTokLen =
      Lexer::getSpelling(StringToks[i], ThisTokBuf, SM, Features,
                         &StringInvalid);
    if (StringInvalid)
      return DiagnoseLexingError(StringToks[i].getLocation());

    const char *ThisTokBegin = ThisTokBuf;
    const char *ThisTokEnd = ThisTokBuf+ThisTokLen;

    // Remove an optional ud-suffix.
    if (ThisTokEnd[-1] != '"') {
      const char *UDSuffixEnd = ThisTokEnd;
      do {
        --ThisTokEnd;
      } while (ThisTokEnd[-1] != '"');

      StringRef UDSuffix(ThisTokEnd, UDSuffixEnd - ThisTokEnd);

      if (UDSuffixBuf.empty()) {
        // First suffix seen: record it and where it came from.
        if (StringToks[i].hasUCN())
          expandUCNs(UDSuffixBuf, UDSuffix);
        else
          UDSuffixBuf.assign(UDSuffix);
        UDSuffixToken = i;
        UDSuffixOffset = ThisTokEnd - ThisTokBuf;
        UDSuffixTokLoc = StringToks[i].getLocation();
      } else {
        SmallString<32> ExpandedUDSuffix;
        if (StringToks[i].hasUCN()) {
          expandUCNs(ExpandedUDSuffix, UDSuffix);
          UDSuffix = ExpandedUDSuffix;
        }

        // C++11 [lex.ext]p8: At the end of phase 6, if a string literal is the
        // result of a concatenation involving at least one user-defined-string-
        // literal, all the participating user-defined-string-literals shall
        // have the same ud-suffix.
        bool UnevaluatedStringHasUDL = isUnevaluated() && !UDSuffix.empty();
        if (UDSuffixBuf != UDSuffix || UnevaluatedStringHasUDL) {
          if (Diags) {
            SourceLocation TokLoc = StringToks[i].getLocation();
            if (UnevaluatedStringHasUDL) {
              Diags->Report(TokLoc, diag::err_unevaluated_string_udl)
                  << SourceRange(TokLoc, TokLoc);
            } else {
              Diags->Report(TokLoc, diag::err_string_concat_mixed_suffix)
                  << UDSuffixBuf << UDSuffix
                  << SourceRange(UDSuffixTokLoc, UDSuffixTokLoc);
            }
          }
          hadError = true;
        }
      }
    }

    // Strip the end quote.
    --ThisTokEnd;

    // TODO: Input character set mapping support.

    // Skip marker for wide or unicode strings.
    if (ThisTokBuf[0] == 'L' || ThisTokBuf[0] == 'u' || ThisTokBuf[0] == 'U') {
      ++ThisTokBuf;
      // Skip 8 of u8 marker for utf8 strings.
      if (ThisTokBuf[0] == '8')
        ++ThisTokBuf;
    }

    // Check for raw string
    if (ThisTokBuf[0] == 'R') {
      if (ThisTokBuf[1] != '"') {
        // The file may have come from PCH and then changed after loading the
        // PCH; Fail gracefully.
        return DiagnoseLexingError(StringToks[i].getLocation());
      }
      ThisTokBuf += 2; // skip R"

      // C++11 [lex.string]p2: A `d-char-sequence` shall consist of at most 16
      // characters.
      constexpr unsigned MaxRawStrDelimLen = 16;

      const char *Prefix = ThisTokBuf;
      while (static_cast<unsigned>(ThisTokBuf - Prefix) < MaxRawStrDelimLen &&
             ThisTokBuf[0] != '(')
        ++ThisTokBuf;
      if (ThisTokBuf[0] != '(')
        return DiagnoseLexingError(StringToks[i].getLocation());
      ++ThisTokBuf; // skip '('

      // Remove same number of characters from the end
      ThisTokEnd -= ThisTokBuf - Prefix;
      if (ThisTokEnd < ThisTokBuf)
        return DiagnoseLexingError(StringToks[i].getLocation());

      // C++14 [lex.string]p4: A source-file new-line in a raw string literal
      // results in a new-line in the resulting execution string-literal.
      StringRef RemainingTokenSpan(ThisTokBuf, ThisTokEnd - ThisTokBuf);
      while (!RemainingTokenSpan.empty()) {
        // Split the string literal on \r\n boundaries.
        size_t CRLFPos = RemainingTokenSpan.find("\r\n");
        StringRef BeforeCRLF = RemainingTokenSpan.substr(0, CRLFPos);
        StringRef AfterCRLF = RemainingTokenSpan.substr(CRLFPos);

        // Copy everything before the \r\n sequence into the string literal.
        if (CopyStringFragment(StringToks[i], ThisTokBegin, BeforeCRLF))
          hadError = true;

        // Point into the \n inside the \r\n sequence and operate on the
        // remaining portion of the literal.
        RemainingTokenSpan = AfterCRLF.substr(1);
      }
    } else {
      if (ThisTokBuf[0] != '"') {
        // The file may have come from PCH and then changed after loading the
        // PCH; Fail gracefully.
        return DiagnoseLexingError(StringToks[i].getLocation());
      }
      ++ThisTokBuf; // skip "

      // Check if this is a pascal string
      if (!isUnevaluated() && Features.PascalStrings &&
          ThisTokBuf + 1 != ThisTokEnd && ThisTokBuf[0] == '\\' &&
          ThisTokBuf[1] == 'p') {

        // If the \p sequence is found in the first token, we have a pascal string
        // Otherwise, if we already have a pascal string, ignore the first \p
        if (i == 0) {
          // Skip only the backslash: the 'p' is copied into the output and is
          // overwritten with the length once the whole literal is decoded.
          ++ThisTokBuf;
          Pascal = true;
        } else if (Pascal)
          ThisTokBuf += 2;
      }

      while (ThisTokBuf != ThisTokEnd) {
        // Is this a span of non-escape characters?
        if (ThisTokBuf[0] != '\\') {
          const char *InStart = ThisTokBuf;
          do {
            ++ThisTokBuf;
          } while (ThisTokBuf != ThisTokEnd && ThisTokBuf[0] != '\\');

          // Copy the character span over.
          if (CopyStringFragment(StringToks[i], ThisTokBegin,
                                 StringRef(InStart, ThisTokBuf - InStart)))
            hadError = true;
          continue;
        }
        // Is this a Universal Character Name escape?
        if (ThisTokBuf[1] == 'u' || ThisTokBuf[1] == 'U' ||
            ThisTokBuf[1] == 'N') {
          EncodeUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd,
                          ResultPtr, hadError,
                          FullSourceLoc(StringToks[i].getLocation(), SM),
                          CharByteWidth, Diags, Features);
          continue;
        }
        // Otherwise, this is a non-UCN escape character.  Process it.
        unsigned ResultChar =
            ProcessCharEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, hadError,
                              FullSourceLoc(StringToks[i].getLocation(), SM),
                              CharByteWidth * 8, Diags, Features, EvalMethod);

        if (CharByteWidth == 4) {
          // FIXME: Make the type of the result buffer correct instead of
          // using reinterpret_cast.
          llvm::UTF32 *ResultWidePtr = reinterpret_cast<llvm::UTF32*>(ResultPtr);
          *ResultWidePtr = ResultChar;
          ResultPtr += 4;
        } else if (CharByteWidth == 2) {
          // FIXME: Make the type of the result buffer correct instead of
          // using reinterpret_cast.
          llvm::UTF16 *ResultWidePtr = reinterpret_cast<llvm::UTF16*>(ResultPtr);
          *ResultWidePtr = ResultChar & 0xFFFF;
          ResultPtr += 2;
        } else {
          assert(CharByteWidth == 1 && "Unexpected char width");
          *ResultPtr++ = ResultChar & 0xFF;
        }
      }
    }
  }

  assert((!Pascal || !isUnevaluated()) &&
         "Pascal string in unevaluated context");
  if (Pascal) {
    // Store the character count (excluding the length slot itself) in the
    // first code unit of the result.
    if (CharByteWidth == 4) {
      // FIXME: Make the type of the result buffer correct instead of
      // using reinterpret_cast.
      llvm::UTF32 *ResultWidePtr = reinterpret_cast<llvm::UTF32*>(ResultBuf.data());
      ResultWidePtr[0] = GetNumStringChars() - 1;
    } else if (CharByteWidth == 2) {
      // FIXME: Make the type of the result buffer correct instead of
      // using reinterpret_cast.
      llvm::UTF16 *ResultWidePtr = reinterpret_cast<llvm::UTF16*>(ResultBuf.data());
      ResultWidePtr[0] = GetNumStringChars() - 1;
    } else {
      assert(CharByteWidth == 1 && "Unexpected char width");
      ResultBuf[0] = GetNumStringChars() - 1;
    }

    // Verify that pascal strings aren't too large.
    if (GetStringLength() > 256) {
      if (Diags)
        Diags->Report(StringToks.front().getLocation(),
                      diag::err_pascal_string_too_long)
          << SourceRange(StringToks.front().getLocation(),
                         StringToks.back().getLocation());
      hadError = true;
      return;
    }
  } else if (Diags) {
    // Complain if this string literal has too many characters.
    unsigned MaxChars = Features.CPlusPlus? 65536 : Features.C99 ? 4095 : 509;

    if (GetNumStringChars() > MaxChars)
      Diags->Report(StringToks.front().getLocation(),
                    diag::ext_string_too_long)
        << GetNumStringChars() << MaxChars
        << (Features.CPlusPlus ? 2 : Features.C99 ? 1 : 0)
        << SourceRange(StringToks.front().getLocation(),
                       StringToks.back().getLocation());
  }
}
2282

2283
static const char *resyncUTF8(const char *Err, const char *End) {
2284
  if (Err == End)
2285
    return End;
2286
  End = Err + std::min<unsigned>(llvm::getNumBytesForUTF8(*Err), End-Err);
2287
  while (++Err != End && (*Err & 0xC0) == 0x80)
2288
    ;
2289
  return Err;
2290
}
2291

2292
/// This function copies from Fragment, which is a sequence of bytes
2293
/// within Tok's contents (which begin at TokBegin) into ResultPtr.
2294
/// Performs widening for multi-byte characters.
2295
bool StringLiteralParser::CopyStringFragment(const Token &Tok,
2296
                                             const char *TokBegin,
2297
                                             StringRef Fragment) {
2298
  const llvm::UTF8 *ErrorPtrTmp;
2299
  if (ConvertUTF8toWide(CharByteWidth, Fragment, ResultPtr, ErrorPtrTmp))
2300
    return false;
2301

2302
  // If we see bad encoding for unprefixed string literals, warn and
2303
  // simply copy the byte values, for compatibility with gcc and older
2304
  // versions of clang.
2305
  bool NoErrorOnBadEncoding = isOrdinary();
2306
  if (NoErrorOnBadEncoding) {
2307
    memcpy(ResultPtr, Fragment.data(), Fragment.size());
2308
    ResultPtr += Fragment.size();
2309
  }
2310

2311
  if (Diags) {
2312
    const char *ErrorPtr = reinterpret_cast<const char *>(ErrorPtrTmp);
2313

2314
    FullSourceLoc SourceLoc(Tok.getLocation(), SM);
2315
    const DiagnosticBuilder &Builder =
2316
      Diag(Diags, Features, SourceLoc, TokBegin,
2317
           ErrorPtr, resyncUTF8(ErrorPtr, Fragment.end()),
2318
           NoErrorOnBadEncoding ? diag::warn_bad_string_encoding
2319
                                : diag::err_bad_string_encoding);
2320

2321
    const char *NextStart = resyncUTF8(ErrorPtr, Fragment.end());
2322
    StringRef NextFragment(NextStart, Fragment.end()-NextStart);
2323

2324
    // Decode into a dummy buffer.
2325
    SmallString<512> Dummy;
2326
    Dummy.reserve(Fragment.size() * CharByteWidth);
2327
    char *Ptr = Dummy.data();
2328

2329
    while (!ConvertUTF8toWide(CharByteWidth, NextFragment, Ptr, ErrorPtrTmp)) {
2330
      const char *ErrorPtr = reinterpret_cast<const char *>(ErrorPtrTmp);
2331
      NextStart = resyncUTF8(ErrorPtr, Fragment.end());
2332
      Builder << MakeCharSourceRange(Features, SourceLoc, TokBegin,
2333
                                     ErrorPtr, NextStart);
2334
      NextFragment = StringRef(NextStart, Fragment.end()-NextStart);
2335
    }
2336
  }
2337
  return !NoErrorOnBadEncoding;
2338
}
2339

2340
/// Record that parsing failed and, when a diagnostics engine is available,
/// report err_lexing_string at \p Loc.
void StringLiteralParser::DiagnoseLexingError(SourceLocation Loc) {
  hadError = true;
  if (!Diags)
    return;
  Diags->Report(Loc, diag::err_lexing_string);
}
2345

2346
/// getOffsetOfStringByte - This function returns the offset of the
2347
/// specified byte of the string data represented by Token.  This handles
2348
/// advancing over escape sequences in the string.
2349
unsigned StringLiteralParser::getOffsetOfStringByte(const Token &Tok,
2350
                                                    unsigned ByteNo) const {
2351
  // Get the spelling of the token.
2352
  SmallString<32> SpellingBuffer;
2353
  SpellingBuffer.resize(Tok.getLength());
2354

2355
  bool StringInvalid = false;
2356
  const char *SpellingPtr = &SpellingBuffer[0];
2357
  unsigned TokLen = Lexer::getSpelling(Tok, SpellingPtr, SM, Features,
2358
                                       &StringInvalid);
2359
  if (StringInvalid)
2360
    return 0;
2361

2362
  const char *SpellingStart = SpellingPtr;
2363
  const char *SpellingEnd = SpellingPtr+TokLen;
2364

2365
  // Handle UTF-8 strings just like narrow strings.
2366
  if (SpellingPtr[0] == 'u' && SpellingPtr[1] == '8')
2367
    SpellingPtr += 2;
2368

2369
  assert(SpellingPtr[0] != 'L' && SpellingPtr[0] != 'u' &&
2370
         SpellingPtr[0] != 'U' && "Doesn't handle wide or utf strings yet");
2371

2372
  // For raw string literals, this is easy.
2373
  if (SpellingPtr[0] == 'R') {
2374
    assert(SpellingPtr[1] == '"' && "Should be a raw string literal!");
2375
    // Skip 'R"'.
2376
    SpellingPtr += 2;
2377
    while (*SpellingPtr != '(') {
2378
      ++SpellingPtr;
2379
      assert(SpellingPtr < SpellingEnd && "Missing ( for raw string literal");
2380
    }
2381
    // Skip '('.
2382
    ++SpellingPtr;
2383
    return SpellingPtr - SpellingStart + ByteNo;
2384
  }
2385

2386
  // Skip over the leading quote
2387
  assert(SpellingPtr[0] == '"' && "Should be a string literal!");
2388
  ++SpellingPtr;
2389

2390
  // Skip over bytes until we find the offset we're looking for.
2391
  while (ByteNo) {
2392
    assert(SpellingPtr < SpellingEnd && "Didn't find byte offset!");
2393

2394
    // Step over non-escapes simply.
2395
    if (*SpellingPtr != '\\') {
2396
      ++SpellingPtr;
2397
      --ByteNo;
2398
      continue;
2399
    }
2400

2401
    // Otherwise, this is an escape character.  Advance over it.
2402
    bool HadError = false;
2403
    if (SpellingPtr[1] == 'u' || SpellingPtr[1] == 'U' ||
2404
        SpellingPtr[1] == 'N') {
2405
      const char *EscapePtr = SpellingPtr;
2406
      unsigned Len = MeasureUCNEscape(SpellingStart, SpellingPtr, SpellingEnd,
2407
                                      1, Features, HadError);
2408
      if (Len > ByteNo) {
2409
        // ByteNo is somewhere within the escape sequence.
2410
        SpellingPtr = EscapePtr;
2411
        break;
2412
      }
2413
      ByteNo -= Len;
2414
    } else {
2415
      ProcessCharEscape(SpellingStart, SpellingPtr, SpellingEnd, HadError,
2416
                        FullSourceLoc(Tok.getLocation(), SM), CharByteWidth * 8,
2417
                        Diags, Features, StringLiteralEvalMethod::Evaluated);
2418
      --ByteNo;
2419
    }
2420
    assert(!HadError && "This method isn't valid on erroneous strings");
2421
  }
2422

2423
  return SpellingPtr-SpellingStart;
2424
}
2425

2426
/// Determine whether a suffix is a valid ud-suffix. We avoid treating reserved
2427
/// suffixes as ud-suffixes, because the diagnostic experience is better if we
2428
/// treat it as an invalid suffix.
2429
bool StringLiteralParser::isValidUDSuffix(const LangOptions &LangOpts,
2430
                                          StringRef Suffix) {
2431
  return NumericLiteralParser::isValidUDSuffix(LangOpts, Suffix) ||
2432
         Suffix == "sv";
2433
}
2434
llvm-project

Использование cookies