// llvm-project
// 2433 lines · 82.5 KB
1//===--- LiteralSupport.cpp - Code to parse and process literals ----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements the NumericLiteralParser, CharLiteralParser, and
10// StringLiteralParser interfaces.
11//
12//===----------------------------------------------------------------------===//
13
14#include "clang/Lex/LiteralSupport.h"
15#include "clang/Basic/CharInfo.h"
16#include "clang/Basic/LangOptions.h"
17#include "clang/Basic/SourceLocation.h"
18#include "clang/Basic/TargetInfo.h"
19#include "clang/Lex/LexDiagnostic.h"
20#include "clang/Lex/Lexer.h"
21#include "clang/Lex/Preprocessor.h"
22#include "clang/Lex/Token.h"
23#include "llvm/ADT/APInt.h"
24#include "llvm/ADT/SmallVector.h"
25#include "llvm/ADT/StringExtras.h"
26#include "llvm/ADT/StringSwitch.h"
27#include "llvm/Support/ConvertUTF.h"
28#include "llvm/Support/Error.h"
29#include "llvm/Support/ErrorHandling.h"
30#include "llvm/Support/Unicode.h"
31#include <algorithm>
32#include <cassert>
33#include <cstddef>
34#include <cstdint>
35#include <cstring>
36#include <string>
37
38using namespace clang;
39
40static unsigned getCharWidth(tok::TokenKind kind, const TargetInfo &Target) {
41switch (kind) {
42default: llvm_unreachable("Unknown token type!");
43case tok::char_constant:
44case tok::string_literal:
45case tok::utf8_char_constant:
46case tok::utf8_string_literal:
47return Target.getCharWidth();
48case tok::wide_char_constant:
49case tok::wide_string_literal:
50return Target.getWCharWidth();
51case tok::utf16_char_constant:
52case tok::utf16_string_literal:
53return Target.getChar16Width();
54case tok::utf32_char_constant:
55case tok::utf32_string_literal:
56return Target.getChar32Width();
57}
58}
59
60static unsigned getEncodingPrefixLen(tok::TokenKind kind) {
61switch (kind) {
62default:
63llvm_unreachable("Unknown token type!");
64case tok::char_constant:
65case tok::string_literal:
66return 0;
67case tok::utf8_char_constant:
68case tok::utf8_string_literal:
69return 2;
70case tok::wide_char_constant:
71case tok::wide_string_literal:
72case tok::utf16_char_constant:
73case tok::utf16_string_literal:
74case tok::utf32_char_constant:
75case tok::utf32_string_literal:
76return 1;
77}
78}
79
80static CharSourceRange MakeCharSourceRange(const LangOptions &Features,
81FullSourceLoc TokLoc,
82const char *TokBegin,
83const char *TokRangeBegin,
84const char *TokRangeEnd) {
85SourceLocation Begin =
86Lexer::AdvanceToTokenCharacter(TokLoc, TokRangeBegin - TokBegin,
87TokLoc.getManager(), Features);
88SourceLocation End =
89Lexer::AdvanceToTokenCharacter(Begin, TokRangeEnd - TokRangeBegin,
90TokLoc.getManager(), Features);
91return CharSourceRange::getCharRange(Begin, End);
92}
93
94/// Produce a diagnostic highlighting some portion of a literal.
95///
96/// Emits the diagnostic \p DiagID, highlighting the range of characters from
97/// \p TokRangeBegin (inclusive) to \p TokRangeEnd (exclusive), which must be
98/// a substring of a spelling buffer for the token beginning at \p TokBegin.
99static DiagnosticBuilder Diag(DiagnosticsEngine *Diags,
100const LangOptions &Features, FullSourceLoc TokLoc,
101const char *TokBegin, const char *TokRangeBegin,
102const char *TokRangeEnd, unsigned DiagID) {
103SourceLocation Begin =
104Lexer::AdvanceToTokenCharacter(TokLoc, TokRangeBegin - TokBegin,
105TokLoc.getManager(), Features);
106return Diags->Report(Begin, DiagID) <<
107MakeCharSourceRange(Features, TokLoc, TokBegin, TokRangeBegin, TokRangeEnd);
108}
109
/// Return true if \p Escape, the character following the backslash of an
/// escape sequence, is one of the simple escapes accepted in an unevaluated
/// string literal: \' \" \? \\ \a \b \f \n \r \t \v.
static bool IsEscapeValidInUnevaluatedStringLiteral(char Escape) {
  static constexpr char SimpleEscapes[] = {'\'', '"', '?', '\\', 'a', 'b',
                                           'f',  'n', 'r', 't',  'v'};
  return std::find(std::begin(SimpleEscapes), std::end(SimpleEscapes),
                   Escape) != std::end(SimpleEscapes);
}
127
128/// ProcessCharEscape - Parse a standard C escape sequence, which can occur in
129/// either a character or a string literal.
130static unsigned ProcessCharEscape(const char *ThisTokBegin,
131const char *&ThisTokBuf,
132const char *ThisTokEnd, bool &HadError,
133FullSourceLoc Loc, unsigned CharWidth,
134DiagnosticsEngine *Diags,
135const LangOptions &Features,
136StringLiteralEvalMethod EvalMethod) {
137const char *EscapeBegin = ThisTokBuf;
138bool Delimited = false;
139bool EndDelimiterFound = false;
140
141// Skip the '\' char.
142++ThisTokBuf;
143
144// We know that this character can't be off the end of the buffer, because
145// that would have been \", which would not have been the end of string.
146unsigned ResultChar = *ThisTokBuf++;
147char Escape = ResultChar;
148switch (ResultChar) {
149// These map to themselves.
150case '\\': case '\'': case '"': case '?': break;
151
152// These have fixed mappings.
153case 'a':
154// TODO: K&R: the meaning of '\\a' is different in traditional C
155ResultChar = 7;
156break;
157case 'b':
158ResultChar = 8;
159break;
160case 'e':
161if (Diags)
162Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
163diag::ext_nonstandard_escape) << "e";
164ResultChar = 27;
165break;
166case 'E':
167if (Diags)
168Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
169diag::ext_nonstandard_escape) << "E";
170ResultChar = 27;
171break;
172case 'f':
173ResultChar = 12;
174break;
175case 'n':
176ResultChar = 10;
177break;
178case 'r':
179ResultChar = 13;
180break;
181case 't':
182ResultChar = 9;
183break;
184case 'v':
185ResultChar = 11;
186break;
187case 'x': { // Hex escape.
188ResultChar = 0;
189if (ThisTokBuf != ThisTokEnd && *ThisTokBuf == '{') {
190Delimited = true;
191ThisTokBuf++;
192if (*ThisTokBuf == '}') {
193Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
194diag::err_delimited_escape_empty);
195return ResultChar;
196}
197} else if (ThisTokBuf == ThisTokEnd || !isHexDigit(*ThisTokBuf)) {
198if (Diags)
199Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
200diag::err_hex_escape_no_digits) << "x";
201return ResultChar;
202}
203
204// Hex escapes are a maximal series of hex digits.
205bool Overflow = false;
206for (; ThisTokBuf != ThisTokEnd; ++ThisTokBuf) {
207if (Delimited && *ThisTokBuf == '}') {
208ThisTokBuf++;
209EndDelimiterFound = true;
210break;
211}
212int CharVal = llvm::hexDigitValue(*ThisTokBuf);
213if (CharVal == -1) {
214// Non delimited hex escape sequences stop at the first non-hex digit.
215if (!Delimited)
216break;
217HadError = true;
218if (Diags)
219Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
220diag::err_delimited_escape_invalid)
221<< StringRef(ThisTokBuf, 1);
222continue;
223}
224// About to shift out a digit?
225if (ResultChar & 0xF0000000)
226Overflow = true;
227ResultChar <<= 4;
228ResultChar |= CharVal;
229}
230// See if any bits will be truncated when evaluated as a character.
231if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) {
232Overflow = true;
233ResultChar &= ~0U >> (32-CharWidth);
234}
235
236// Check for overflow.
237if (!HadError && Overflow) { // Too many digits to fit in
238HadError = true;
239if (Diags)
240Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
241diag::err_escape_too_large)
242<< 0;
243}
244break;
245}
246case '0': case '1': case '2': case '3':
247case '4': case '5': case '6': case '7': {
248// Octal escapes.
249--ThisTokBuf;
250ResultChar = 0;
251
252// Octal escapes are a series of octal digits with maximum length 3.
253// "\0123" is a two digit sequence equal to "\012" "3".
254unsigned NumDigits = 0;
255do {
256ResultChar <<= 3;
257ResultChar |= *ThisTokBuf++ - '0';
258++NumDigits;
259} while (ThisTokBuf != ThisTokEnd && NumDigits < 3 &&
260ThisTokBuf[0] >= '0' && ThisTokBuf[0] <= '7');
261
262// Check for overflow. Reject '\777', but not L'\777'.
263if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) {
264if (Diags)
265Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
266diag::err_escape_too_large) << 1;
267ResultChar &= ~0U >> (32-CharWidth);
268}
269break;
270}
271case 'o': {
272bool Overflow = false;
273if (ThisTokBuf == ThisTokEnd || *ThisTokBuf != '{') {
274HadError = true;
275if (Diags)
276Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
277diag::err_delimited_escape_missing_brace)
278<< "o";
279
280break;
281}
282ResultChar = 0;
283Delimited = true;
284++ThisTokBuf;
285if (*ThisTokBuf == '}') {
286Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
287diag::err_delimited_escape_empty);
288return ResultChar;
289}
290
291while (ThisTokBuf != ThisTokEnd) {
292if (*ThisTokBuf == '}') {
293EndDelimiterFound = true;
294ThisTokBuf++;
295break;
296}
297if (*ThisTokBuf < '0' || *ThisTokBuf > '7') {
298HadError = true;
299if (Diags)
300Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
301diag::err_delimited_escape_invalid)
302<< StringRef(ThisTokBuf, 1);
303ThisTokBuf++;
304continue;
305}
306// Check if one of the top three bits is set before shifting them out.
307if (ResultChar & 0xE0000000)
308Overflow = true;
309
310ResultChar <<= 3;
311ResultChar |= *ThisTokBuf++ - '0';
312}
313// Check for overflow. Reject '\777', but not L'\777'.
314if (!HadError &&
315(Overflow || (CharWidth != 32 && (ResultChar >> CharWidth) != 0))) {
316HadError = true;
317if (Diags)
318Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
319diag::err_escape_too_large)
320<< 1;
321ResultChar &= ~0U >> (32 - CharWidth);
322}
323break;
324}
325// Otherwise, these are not valid escapes.
326case '(': case '{': case '[': case '%':
327// GCC accepts these as extensions. We warn about them as such though.
328if (Diags)
329Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
330diag::ext_nonstandard_escape)
331<< std::string(1, ResultChar);
332break;
333default:
334if (!Diags)
335break;
336
337if (isPrintable(ResultChar))
338Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
339diag::ext_unknown_escape)
340<< std::string(1, ResultChar);
341else
342Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
343diag::ext_unknown_escape)
344<< "x" + llvm::utohexstr(ResultChar);
345break;
346}
347
348if (Delimited && Diags) {
349if (!EndDelimiterFound)
350Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
351diag::err_expected)
352<< tok::r_brace;
353else if (!HadError) {
354Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
355Features.CPlusPlus23 ? diag::warn_cxx23_delimited_escape_sequence
356: diag::ext_delimited_escape_sequence)
357<< /*delimited*/ 0 << (Features.CPlusPlus ? 1 : 0);
358}
359}
360
361if (EvalMethod == StringLiteralEvalMethod::Unevaluated &&
362!IsEscapeValidInUnevaluatedStringLiteral(Escape)) {
363Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
364diag::err_unevaluated_string_invalid_escape_sequence)
365<< StringRef(EscapeBegin, ThisTokBuf - EscapeBegin);
366HadError = true;
367}
368
369return ResultChar;
370}
371
372static void appendCodePoint(unsigned Codepoint,
373llvm::SmallVectorImpl<char> &Str) {
374char ResultBuf[4];
375char *ResultPtr = ResultBuf;
376if (llvm::ConvertCodePointToUTF8(Codepoint, ResultPtr))
377Str.append(ResultBuf, ResultPtr);
378}
379
380void clang::expandUCNs(SmallVectorImpl<char> &Buf, StringRef Input) {
381for (StringRef::iterator I = Input.begin(), E = Input.end(); I != E; ++I) {
382if (*I != '\\') {
383Buf.push_back(*I);
384continue;
385}
386
387++I;
388char Kind = *I;
389++I;
390
391assert(Kind == 'u' || Kind == 'U' || Kind == 'N');
392uint32_t CodePoint = 0;
393
394if (Kind == 'u' && *I == '{') {
395for (++I; *I != '}'; ++I) {
396unsigned Value = llvm::hexDigitValue(*I);
397assert(Value != -1U);
398CodePoint <<= 4;
399CodePoint += Value;
400}
401appendCodePoint(CodePoint, Buf);
402continue;
403}
404
405if (Kind == 'N') {
406assert(*I == '{');
407++I;
408auto Delim = std::find(I, Input.end(), '}');
409assert(Delim != Input.end());
410StringRef Name(I, std::distance(I, Delim));
411std::optional<llvm::sys::unicode::LooseMatchingResult> Res =
412llvm::sys::unicode::nameToCodepointLooseMatching(Name);
413assert(Res && "could not find a codepoint that was previously found");
414CodePoint = Res->CodePoint;
415assert(CodePoint != 0xFFFFFFFF);
416appendCodePoint(CodePoint, Buf);
417I = Delim;
418continue;
419}
420
421unsigned NumHexDigits;
422if (Kind == 'u')
423NumHexDigits = 4;
424else
425NumHexDigits = 8;
426
427assert(I + NumHexDigits <= E);
428
429for (; NumHexDigits != 0; ++I, --NumHexDigits) {
430unsigned Value = llvm::hexDigitValue(*I);
431assert(Value != -1U);
432
433CodePoint <<= 4;
434CodePoint += Value;
435}
436
437appendCodePoint(CodePoint, Buf);
438--I;
439}
440}
441
442bool clang::isFunctionLocalStringLiteralMacro(tok::TokenKind K,
443const LangOptions &LO) {
444return LO.MicrosoftExt &&
445(K == tok::kw___FUNCTION__ || K == tok::kw_L__FUNCTION__ ||
446K == tok::kw___FUNCSIG__ || K == tok::kw_L__FUNCSIG__ ||
447K == tok::kw___FUNCDNAME__);
448}
449
450bool clang::tokenIsLikeStringLiteral(const Token &Tok, const LangOptions &LO) {
451return tok::isStringLiteral(Tok.getKind()) ||
452isFunctionLocalStringLiteralMacro(Tok.getKind(), LO);
453}
454
455static bool ProcessNumericUCNEscape(const char *ThisTokBegin,
456const char *&ThisTokBuf,
457const char *ThisTokEnd, uint32_t &UcnVal,
458unsigned short &UcnLen, bool &Delimited,
459FullSourceLoc Loc, DiagnosticsEngine *Diags,
460const LangOptions &Features,
461bool in_char_string_literal = false) {
462const char *UcnBegin = ThisTokBuf;
463bool HasError = false;
464bool EndDelimiterFound = false;
465
466// Skip the '\u' char's.
467ThisTokBuf += 2;
468Delimited = false;
469if (UcnBegin[1] == 'u' && in_char_string_literal &&
470ThisTokBuf != ThisTokEnd && *ThisTokBuf == '{') {
471Delimited = true;
472ThisTokBuf++;
473} else if (ThisTokBuf == ThisTokEnd || !isHexDigit(*ThisTokBuf)) {
474if (Diags)
475Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
476diag::err_hex_escape_no_digits)
477<< StringRef(&ThisTokBuf[-1], 1);
478return false;
479}
480UcnLen = (ThisTokBuf[-1] == 'u' ? 4 : 8);
481
482bool Overflow = false;
483unsigned short Count = 0;
484for (; ThisTokBuf != ThisTokEnd && (Delimited || Count != UcnLen);
485++ThisTokBuf) {
486if (Delimited && *ThisTokBuf == '}') {
487++ThisTokBuf;
488EndDelimiterFound = true;
489break;
490}
491int CharVal = llvm::hexDigitValue(*ThisTokBuf);
492if (CharVal == -1) {
493HasError = true;
494if (!Delimited)
495break;
496if (Diags) {
497Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
498diag::err_delimited_escape_invalid)
499<< StringRef(ThisTokBuf, 1);
500}
501Count++;
502continue;
503}
504if (UcnVal & 0xF0000000) {
505Overflow = true;
506continue;
507}
508UcnVal <<= 4;
509UcnVal |= CharVal;
510Count++;
511}
512
513if (Overflow) {
514if (Diags)
515Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
516diag::err_escape_too_large)
517<< 0;
518return false;
519}
520
521if (Delimited && !EndDelimiterFound) {
522if (Diags) {
523Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
524diag::err_expected)
525<< tok::r_brace;
526}
527return false;
528}
529
530// If we didn't consume the proper number of digits, there is a problem.
531if (Count == 0 || (!Delimited && Count != UcnLen)) {
532if (Diags)
533Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
534Delimited ? diag::err_delimited_escape_empty
535: diag::err_ucn_escape_incomplete);
536return false;
537}
538return !HasError;
539}
540
541static void DiagnoseInvalidUnicodeCharacterName(
542DiagnosticsEngine *Diags, const LangOptions &Features, FullSourceLoc Loc,
543const char *TokBegin, const char *TokRangeBegin, const char *TokRangeEnd,
544llvm::StringRef Name) {
545
546Diag(Diags, Features, Loc, TokBegin, TokRangeBegin, TokRangeEnd,
547diag::err_invalid_ucn_name)
548<< Name;
549
550namespace u = llvm::sys::unicode;
551
552std::optional<u::LooseMatchingResult> Res =
553u::nameToCodepointLooseMatching(Name);
554if (Res) {
555Diag(Diags, Features, Loc, TokBegin, TokRangeBegin, TokRangeEnd,
556diag::note_invalid_ucn_name_loose_matching)
557<< FixItHint::CreateReplacement(
558MakeCharSourceRange(Features, Loc, TokBegin, TokRangeBegin,
559TokRangeEnd),
560Res->Name);
561return;
562}
563
564unsigned Distance = 0;
565SmallVector<u::MatchForCodepointName> Matches =
566u::nearestMatchesForCodepointName(Name, 5);
567assert(!Matches.empty() && "No unicode characters found");
568
569for (const auto &Match : Matches) {
570if (Distance == 0)
571Distance = Match.Distance;
572if (std::max(Distance, Match.Distance) -
573std::min(Distance, Match.Distance) >
5743)
575break;
576Distance = Match.Distance;
577
578std::string Str;
579llvm::UTF32 V = Match.Value;
580bool Converted =
581llvm::convertUTF32ToUTF8String(llvm::ArrayRef<llvm::UTF32>(&V, 1), Str);
582(void)Converted;
583assert(Converted && "Found a match wich is not a unicode character");
584
585Diag(Diags, Features, Loc, TokBegin, TokRangeBegin, TokRangeEnd,
586diag::note_invalid_ucn_name_candidate)
587<< Match.Name << llvm::utohexstr(Match.Value)
588<< Str // FIXME: Fix the rendering of non printable characters
589<< FixItHint::CreateReplacement(
590MakeCharSourceRange(Features, Loc, TokBegin, TokRangeBegin,
591TokRangeEnd),
592Match.Name);
593}
594}
595
596static bool ProcessNamedUCNEscape(const char *ThisTokBegin,
597const char *&ThisTokBuf,
598const char *ThisTokEnd, uint32_t &UcnVal,
599unsigned short &UcnLen, FullSourceLoc Loc,
600DiagnosticsEngine *Diags,
601const LangOptions &Features) {
602const char *UcnBegin = ThisTokBuf;
603assert(UcnBegin[0] == '\\' && UcnBegin[1] == 'N');
604ThisTokBuf += 2;
605if (ThisTokBuf == ThisTokEnd || *ThisTokBuf != '{') {
606if (Diags) {
607Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
608diag::err_delimited_escape_missing_brace)
609<< StringRef(&ThisTokBuf[-1], 1);
610}
611return false;
612}
613ThisTokBuf++;
614const char *ClosingBrace = std::find_if(ThisTokBuf, ThisTokEnd, [](char C) {
615return C == '}' || isVerticalWhitespace(C);
616});
617bool Incomplete = ClosingBrace == ThisTokEnd;
618bool Empty = ClosingBrace == ThisTokBuf;
619if (Incomplete || Empty) {
620if (Diags) {
621Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
622Incomplete ? diag::err_ucn_escape_incomplete
623: diag::err_delimited_escape_empty)
624<< StringRef(&UcnBegin[1], 1);
625}
626ThisTokBuf = ClosingBrace == ThisTokEnd ? ClosingBrace : ClosingBrace + 1;
627return false;
628}
629StringRef Name(ThisTokBuf, ClosingBrace - ThisTokBuf);
630ThisTokBuf = ClosingBrace + 1;
631std::optional<char32_t> Res = llvm::sys::unicode::nameToCodepointStrict(Name);
632if (!Res) {
633if (Diags)
634DiagnoseInvalidUnicodeCharacterName(Diags, Features, Loc, ThisTokBegin,
635&UcnBegin[3], ClosingBrace, Name);
636return false;
637}
638UcnVal = *Res;
639UcnLen = UcnVal > 0xFFFF ? 8 : 4;
640return true;
641}
642
643/// ProcessUCNEscape - Read the Universal Character Name, check constraints and
644/// return the UTF32.
645static bool ProcessUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
646const char *ThisTokEnd, uint32_t &UcnVal,
647unsigned short &UcnLen, FullSourceLoc Loc,
648DiagnosticsEngine *Diags,
649const LangOptions &Features,
650bool in_char_string_literal = false) {
651
652bool HasError;
653const char *UcnBegin = ThisTokBuf;
654bool IsDelimitedEscapeSequence = false;
655bool IsNamedEscapeSequence = false;
656if (ThisTokBuf[1] == 'N') {
657IsNamedEscapeSequence = true;
658HasError = !ProcessNamedUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd,
659UcnVal, UcnLen, Loc, Diags, Features);
660} else {
661HasError =
662!ProcessNumericUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal,
663UcnLen, IsDelimitedEscapeSequence, Loc, Diags,
664Features, in_char_string_literal);
665}
666if (HasError)
667return false;
668
669// Check UCN constraints (C99 6.4.3p2) [C++11 lex.charset p2]
670if ((0xD800 <= UcnVal && UcnVal <= 0xDFFF) || // surrogate codepoints
671UcnVal > 0x10FFFF) { // maximum legal UTF32 value
672if (Diags)
673Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
674diag::err_ucn_escape_invalid);
675return false;
676}
677
678// C23 and C++11 allow UCNs that refer to control characters
679// and basic source characters inside character and string literals
680if (UcnVal < 0xa0 &&
681// $, @, ` are allowed in all language modes
682(UcnVal != 0x24 && UcnVal != 0x40 && UcnVal != 0x60)) {
683bool IsError =
684(!(Features.CPlusPlus11 || Features.C23) || !in_char_string_literal);
685if (Diags) {
686char BasicSCSChar = UcnVal;
687if (UcnVal >= 0x20 && UcnVal < 0x7f)
688Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
689IsError ? diag::err_ucn_escape_basic_scs
690: Features.CPlusPlus
691? diag::warn_cxx98_compat_literal_ucn_escape_basic_scs
692: diag::warn_c23_compat_literal_ucn_escape_basic_scs)
693<< StringRef(&BasicSCSChar, 1);
694else
695Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
696IsError ? diag::err_ucn_control_character
697: Features.CPlusPlus
698? diag::warn_cxx98_compat_literal_ucn_control_character
699: diag::warn_c23_compat_literal_ucn_control_character);
700}
701if (IsError)
702return false;
703}
704
705if (!Features.CPlusPlus && !Features.C99 && Diags)
706Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
707diag::warn_ucn_not_valid_in_c89_literal);
708
709if ((IsDelimitedEscapeSequence || IsNamedEscapeSequence) && Diags)
710Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
711Features.CPlusPlus23 ? diag::warn_cxx23_delimited_escape_sequence
712: diag::ext_delimited_escape_sequence)
713<< (IsNamedEscapeSequence ? 1 : 0) << (Features.CPlusPlus ? 1 : 0);
714
715return true;
716}
717
718/// MeasureUCNEscape - Determine the number of bytes within the resulting string
719/// which this UCN will occupy.
720static int MeasureUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
721const char *ThisTokEnd, unsigned CharByteWidth,
722const LangOptions &Features, bool &HadError) {
723// UTF-32: 4 bytes per escape.
724if (CharByteWidth == 4)
725return 4;
726
727uint32_t UcnVal = 0;
728unsigned short UcnLen = 0;
729FullSourceLoc Loc;
730
731if (!ProcessUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal,
732UcnLen, Loc, nullptr, Features, true)) {
733HadError = true;
734return 0;
735}
736
737// UTF-16: 2 bytes for BMP, 4 bytes otherwise.
738if (CharByteWidth == 2)
739return UcnVal <= 0xFFFF ? 2 : 4;
740
741// UTF-8.
742if (UcnVal < 0x80)
743return 1;
744if (UcnVal < 0x800)
745return 2;
746if (UcnVal < 0x10000)
747return 3;
748return 4;
749}
750
751/// EncodeUCNEscape - Read the Universal Character Name, check constraints and
752/// convert the UTF32 to UTF8 or UTF16. This is a subroutine of
753/// StringLiteralParser. When we decide to implement UCN's for identifiers,
754/// we will likely rework our support for UCN's.
755static void EncodeUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
756const char *ThisTokEnd,
757char *&ResultBuf, bool &HadError,
758FullSourceLoc Loc, unsigned CharByteWidth,
759DiagnosticsEngine *Diags,
760const LangOptions &Features) {
761typedef uint32_t UTF32;
762UTF32 UcnVal = 0;
763unsigned short UcnLen = 0;
764if (!ProcessUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal, UcnLen,
765Loc, Diags, Features, true)) {
766HadError = true;
767return;
768}
769
770assert((CharByteWidth == 1 || CharByteWidth == 2 || CharByteWidth == 4) &&
771"only character widths of 1, 2, or 4 bytes supported");
772
773(void)UcnLen;
774assert((UcnLen== 4 || UcnLen== 8) && "only ucn length of 4 or 8 supported");
775
776if (CharByteWidth == 4) {
777// FIXME: Make the type of the result buffer correct instead of
778// using reinterpret_cast.
779llvm::UTF32 *ResultPtr = reinterpret_cast<llvm::UTF32*>(ResultBuf);
780*ResultPtr = UcnVal;
781ResultBuf += 4;
782return;
783}
784
785if (CharByteWidth == 2) {
786// FIXME: Make the type of the result buffer correct instead of
787// using reinterpret_cast.
788llvm::UTF16 *ResultPtr = reinterpret_cast<llvm::UTF16*>(ResultBuf);
789
790if (UcnVal <= (UTF32)0xFFFF) {
791*ResultPtr = UcnVal;
792ResultBuf += 2;
793return;
794}
795
796// Convert to UTF16.
797UcnVal -= 0x10000;
798*ResultPtr = 0xD800 + (UcnVal >> 10);
799*(ResultPtr+1) = 0xDC00 + (UcnVal & 0x3FF);
800ResultBuf += 4;
801return;
802}
803
804assert(CharByteWidth == 1 && "UTF-8 encoding is only for 1 byte characters");
805
806// Now that we've parsed/checked the UCN, we convert from UTF32->UTF8.
807// The conversion below was inspired by:
808// http://www.unicode.org/Public/PROGRAMS/CVTUTF/ConvertUTF.c
809// First, we determine how many bytes the result will require.
810typedef uint8_t UTF8;
811
812unsigned short bytesToWrite = 0;
813if (UcnVal < (UTF32)0x80)
814bytesToWrite = 1;
815else if (UcnVal < (UTF32)0x800)
816bytesToWrite = 2;
817else if (UcnVal < (UTF32)0x10000)
818bytesToWrite = 3;
819else
820bytesToWrite = 4;
821
822const unsigned byteMask = 0xBF;
823const unsigned byteMark = 0x80;
824
825// Once the bits are split out into bytes of UTF8, this is a mask OR-ed
826// into the first byte, depending on how many bytes follow.
827static const UTF8 firstByteMark[5] = {
8280x00, 0x00, 0xC0, 0xE0, 0xF0
829};
830// Finally, we write the bytes into ResultBuf.
831ResultBuf += bytesToWrite;
832switch (bytesToWrite) { // note: everything falls through.
833case 4:
834*--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
835[[fallthrough]];
836case 3:
837*--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
838[[fallthrough]];
839case 2:
840*--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
841[[fallthrough]];
842case 1:
843*--ResultBuf = (UTF8) (UcnVal | firstByteMark[bytesToWrite]);
844}
845// Update the buffer.
846ResultBuf += bytesToWrite;
847}
848
849/// integer-constant: [C99 6.4.4.1]
850/// decimal-constant integer-suffix
851/// octal-constant integer-suffix
852/// hexadecimal-constant integer-suffix
853/// binary-literal integer-suffix [GNU, C++1y]
854/// user-defined-integer-literal: [C++11 lex.ext]
855/// decimal-literal ud-suffix
856/// octal-literal ud-suffix
857/// hexadecimal-literal ud-suffix
858/// binary-literal ud-suffix [GNU, C++1y]
859/// decimal-constant:
860/// nonzero-digit
861/// decimal-constant digit
862/// octal-constant:
863/// 0
864/// octal-constant octal-digit
865/// hexadecimal-constant:
866/// hexadecimal-prefix hexadecimal-digit
867/// hexadecimal-constant hexadecimal-digit
868/// hexadecimal-prefix: one of
869/// 0x 0X
870/// binary-literal:
871/// 0b binary-digit
872/// 0B binary-digit
873/// binary-literal binary-digit
874/// integer-suffix:
875/// unsigned-suffix [long-suffix]
876/// unsigned-suffix [long-long-suffix]
877/// long-suffix [unsigned-suffix]
///       long-long-suffix [unsigned-suffix]
879/// nonzero-digit:
880/// 1 2 3 4 5 6 7 8 9
881/// octal-digit:
882/// 0 1 2 3 4 5 6 7
883/// hexadecimal-digit:
884/// 0 1 2 3 4 5 6 7 8 9
885/// a b c d e f
886/// A B C D E F
887/// binary-digit:
888/// 0
889/// 1
890/// unsigned-suffix: one of
891/// u U
892/// long-suffix: one of
893/// l L
894/// long-long-suffix: one of
895/// ll LL
896///
897/// floating-constant: [C99 6.4.4.2]
898/// TODO: add rules...
899///
900NumericLiteralParser::NumericLiteralParser(StringRef TokSpelling,
901SourceLocation TokLoc,
902const SourceManager &SM,
903const LangOptions &LangOpts,
904const TargetInfo &Target,
905DiagnosticsEngine &Diags)
906: SM(SM), LangOpts(LangOpts), Diags(Diags),
907ThisTokBegin(TokSpelling.begin()), ThisTokEnd(TokSpelling.end()) {
908
909s = DigitsBegin = ThisTokBegin;
910saw_exponent = false;
911saw_period = false;
912saw_ud_suffix = false;
913saw_fixed_point_suffix = false;
914isLong = false;
915isUnsigned = false;
916isLongLong = false;
917isSizeT = false;
918isHalf = false;
919isFloat = false;
920isImaginary = false;
921isFloat16 = false;
922isFloat128 = false;
923MicrosoftInteger = 0;
924isFract = false;
925isAccum = false;
926hadError = false;
927isBitInt = false;
928
929// This routine assumes that the range begin/end matches the regex for integer
930// and FP constants (specifically, the 'pp-number' regex), and assumes that
931// the byte at "*end" is both valid and not part of the regex. Because of
932// this, it doesn't have to check for 'overscan' in various places.
933// Note: For HLSL, the end token is allowed to be '.' which would be in the
934// 'pp-number' regex. This is required to support vector swizzles on numeric
935// constants (i.e. 1.xx or 1.5f.rrr).
936if (isPreprocessingNumberBody(*ThisTokEnd) &&
937!(LangOpts.HLSL && *ThisTokEnd == '.')) {
938Diags.Report(TokLoc, diag::err_lexing_numeric);
939hadError = true;
940return;
941}
942
943if (*s == '0') { // parse radix
944ParseNumberStartingWithZero(TokLoc);
945if (hadError)
946return;
947} else { // the first digit is non-zero
948radix = 10;
949s = SkipDigits(s);
950if (s == ThisTokEnd) {
951// Done.
952} else {
953ParseDecimalOrOctalCommon(TokLoc);
954if (hadError)
955return;
956}
957}
958
959SuffixBegin = s;
960checkSeparator(TokLoc, s, CSK_AfterDigits);
961
962// Initial scan to lookahead for fixed point suffix.
963if (LangOpts.FixedPoint) {
964for (const char *c = s; c != ThisTokEnd; ++c) {
965if (*c == 'r' || *c == 'k' || *c == 'R' || *c == 'K') {
966saw_fixed_point_suffix = true;
967break;
968}
969}
970}
971
972// Parse the suffix. At this point we can classify whether we have an FP or
973// integer constant.
974bool isFixedPointConstant = isFixedPointLiteral();
975bool isFPConstant = isFloatingLiteral();
976bool HasSize = false;
977bool DoubleUnderscore = false;
978
979// Loop over all of the characters of the suffix. If we see something bad,
980// we break out of the loop.
981for (; s != ThisTokEnd; ++s) {
982switch (*s) {
983case 'R':
984case 'r':
985if (!LangOpts.FixedPoint)
986break;
987if (isFract || isAccum) break;
988if (!(saw_period || saw_exponent)) break;
989isFract = true;
990continue;
991case 'K':
992case 'k':
993if (!LangOpts.FixedPoint)
994break;
995if (isFract || isAccum) break;
996if (!(saw_period || saw_exponent)) break;
997isAccum = true;
998continue;
999case 'h': // FP Suffix for "half".
1000case 'H':
1001// OpenCL Extension v1.2 s9.5 - h or H suffix for half type.
1002if (!(LangOpts.Half || LangOpts.FixedPoint))
1003break;
1004if (isIntegerLiteral()) break; // Error for integer constant.
1005if (HasSize)
1006break;
1007HasSize = true;
1008isHalf = true;
1009continue; // Success.
1010case 'f': // FP Suffix for "float"
1011case 'F':
1012if (!isFPConstant) break; // Error for integer constant.
1013if (HasSize)
1014break;
1015HasSize = true;
1016
1017// CUDA host and device may have different _Float16 support, therefore
1018// allows f16 literals to avoid false alarm.
1019// When we compile for OpenMP target offloading on NVPTX, f16 suffix
1020// should also be supported.
1021// ToDo: more precise check for CUDA.
1022// TODO: AMDGPU might also support it in the future.
1023if ((Target.hasFloat16Type() || LangOpts.CUDA ||
1024(LangOpts.OpenMPIsTargetDevice && Target.getTriple().isNVPTX())) &&
1025s + 2 < ThisTokEnd && s[1] == '1' && s[2] == '6') {
1026s += 2; // success, eat up 2 characters.
1027isFloat16 = true;
1028continue;
1029}
1030
1031isFloat = true;
1032continue; // Success.
1033case 'q': // FP Suffix for "__float128"
1034case 'Q':
1035if (!isFPConstant) break; // Error for integer constant.
1036if (HasSize)
1037break;
1038HasSize = true;
1039isFloat128 = true;
1040continue; // Success.
1041case 'u':
1042case 'U':
1043if (isFPConstant) break; // Error for floating constant.
1044if (isUnsigned) break; // Cannot be repeated.
1045isUnsigned = true;
1046continue; // Success.
1047case 'l':
1048case 'L':
1049if (HasSize)
1050break;
1051HasSize = true;
1052
1053// Check for long long. The L's need to be adjacent and the same case.
1054if (s[1] == s[0]) {
1055assert(s + 1 < ThisTokEnd && "didn't maximally munch?");
1056if (isFPConstant) break; // long long invalid for floats.
1057isLongLong = true;
1058++s; // Eat both of them.
1059} else {
1060isLong = true;
1061}
1062continue; // Success.
1063case 'z':
1064case 'Z':
1065if (isFPConstant)
1066break; // Invalid for floats.
1067if (HasSize)
1068break;
1069HasSize = true;
1070isSizeT = true;
1071continue;
1072case 'i':
1073case 'I':
1074if (LangOpts.MicrosoftExt && !isFPConstant) {
1075// Allow i8, i16, i32, and i64. First, look ahead and check if
1076// suffixes are Microsoft integers and not the imaginary unit.
1077uint8_t Bits = 0;
1078size_t ToSkip = 0;
1079switch (s[1]) {
1080case '8': // i8 suffix
1081Bits = 8;
1082ToSkip = 2;
1083break;
1084case '1':
1085if (s[2] == '6') { // i16 suffix
1086Bits = 16;
1087ToSkip = 3;
1088}
1089break;
1090case '3':
1091if (s[2] == '2') { // i32 suffix
1092Bits = 32;
1093ToSkip = 3;
1094}
1095break;
1096case '6':
1097if (s[2] == '4') { // i64 suffix
1098Bits = 64;
1099ToSkip = 3;
1100}
1101break;
1102default:
1103break;
1104}
1105if (Bits) {
1106if (HasSize)
1107break;
1108HasSize = true;
1109MicrosoftInteger = Bits;
1110s += ToSkip;
1111assert(s <= ThisTokEnd && "didn't maximally munch?");
1112break;
1113}
1114}
1115[[fallthrough]];
1116case 'j':
1117case 'J':
1118if (isImaginary) break; // Cannot be repeated.
1119isImaginary = true;
1120continue; // Success.
1121case '_':
1122if (isFPConstant)
1123break; // Invalid for floats
1124if (HasSize)
1125break;
1126// There is currently no way to reach this with DoubleUnderscore set.
1127// If new double underscope literals are added handle it here as above.
1128assert(!DoubleUnderscore && "unhandled double underscore case");
1129if (LangOpts.CPlusPlus && s + 2 < ThisTokEnd &&
1130s[1] == '_') { // s + 2 < ThisTokEnd to ensure some character exists
1131// after __
1132DoubleUnderscore = true;
1133s += 2; // Skip both '_'
1134if (s + 1 < ThisTokEnd &&
1135(*s == 'u' || *s == 'U')) { // Ensure some character after 'u'/'U'
1136isUnsigned = true;
1137++s;
1138}
1139if (s + 1 < ThisTokEnd &&
1140((*s == 'w' && *(++s) == 'b') || (*s == 'W' && *(++s) == 'B'))) {
1141isBitInt = true;
1142HasSize = true;
1143continue;
1144}
1145}
1146break;
1147case 'w':
1148case 'W':
1149if (isFPConstant)
1150break; // Invalid for floats.
1151if (HasSize)
1152break; // Invalid if we already have a size for the literal.
1153
1154// wb and WB are allowed, but a mixture of cases like Wb or wB is not. We
1155// explicitly do not support the suffix in C++ as an extension because a
1156// library-based UDL that resolves to a library type may be more
1157// appropriate there. The same rules apply for __wb/__WB.
1158if ((!LangOpts.CPlusPlus || DoubleUnderscore) && s + 1 < ThisTokEnd &&
1159((s[0] == 'w' && s[1] == 'b') || (s[0] == 'W' && s[1] == 'B'))) {
1160isBitInt = true;
1161HasSize = true;
1162++s; // Skip both characters (2nd char skipped on continue).
1163continue; // Success.
1164}
1165}
1166// If we reached here, there was an error or a ud-suffix.
1167break;
1168}
1169
1170// "i", "if", and "il" are user-defined suffixes in C++1y.
1171if (s != ThisTokEnd || isImaginary) {
1172// FIXME: Don't bother expanding UCNs if !tok.hasUCN().
1173expandUCNs(UDSuffixBuf, StringRef(SuffixBegin, ThisTokEnd - SuffixBegin));
1174if (isValidUDSuffix(LangOpts, UDSuffixBuf)) {
1175if (!isImaginary) {
1176// Any suffix pieces we might have parsed are actually part of the
1177// ud-suffix.
1178isLong = false;
1179isUnsigned = false;
1180isLongLong = false;
1181isSizeT = false;
1182isFloat = false;
1183isFloat16 = false;
1184isHalf = false;
1185isImaginary = false;
1186isBitInt = false;
1187MicrosoftInteger = 0;
1188saw_fixed_point_suffix = false;
1189isFract = false;
1190isAccum = false;
1191}
1192
1193saw_ud_suffix = true;
1194return;
1195}
1196
1197if (s != ThisTokEnd) {
1198// Report an error if there are any.
1199Diags.Report(Lexer::AdvanceToTokenCharacter(
1200TokLoc, SuffixBegin - ThisTokBegin, SM, LangOpts),
1201diag::err_invalid_suffix_constant)
1202<< StringRef(SuffixBegin, ThisTokEnd - SuffixBegin)
1203<< (isFixedPointConstant ? 2 : isFPConstant);
1204hadError = true;
1205}
1206}
1207
1208if (!hadError && saw_fixed_point_suffix) {
1209assert(isFract || isAccum);
1210}
1211}
1212
1213/// ParseDecimalOrOctalCommon - This method is called for decimal or octal
1214/// numbers. It issues an error for illegal digits, and handles floating point
1215/// parsing. If it detects a floating point number, the radix is set to 10.
1216void NumericLiteralParser::ParseDecimalOrOctalCommon(SourceLocation TokLoc){
1217assert((radix == 8 || radix == 10) && "Unexpected radix");
1218
1219// If we have a hex digit other than 'e' (which denotes a FP exponent) then
1220// the code is using an incorrect base.
1221if (isHexDigit(*s) && *s != 'e' && *s != 'E' &&
1222!isValidUDSuffix(LangOpts, StringRef(s, ThisTokEnd - s))) {
1223Diags.Report(
1224Lexer::AdvanceToTokenCharacter(TokLoc, s - ThisTokBegin, SM, LangOpts),
1225diag::err_invalid_digit)
1226<< StringRef(s, 1) << (radix == 8 ? 1 : 0);
1227hadError = true;
1228return;
1229}
1230
1231if (*s == '.') {
1232checkSeparator(TokLoc, s, CSK_AfterDigits);
1233s++;
1234radix = 10;
1235saw_period = true;
1236checkSeparator(TokLoc, s, CSK_BeforeDigits);
1237s = SkipDigits(s); // Skip suffix.
1238}
1239if (*s == 'e' || *s == 'E') { // exponent
1240checkSeparator(TokLoc, s, CSK_AfterDigits);
1241const char *Exponent = s;
1242s++;
1243radix = 10;
1244saw_exponent = true;
1245if (s != ThisTokEnd && (*s == '+' || *s == '-')) s++; // sign
1246const char *first_non_digit = SkipDigits(s);
1247if (containsDigits(s, first_non_digit)) {
1248checkSeparator(TokLoc, s, CSK_BeforeDigits);
1249s = first_non_digit;
1250} else {
1251if (!hadError) {
1252Diags.Report(Lexer::AdvanceToTokenCharacter(
1253TokLoc, Exponent - ThisTokBegin, SM, LangOpts),
1254diag::err_exponent_has_no_digits);
1255hadError = true;
1256}
1257return;
1258}
1259}
1260}
1261
1262/// Determine whether a suffix is a valid ud-suffix. We avoid treating reserved
1263/// suffixes as ud-suffixes, because the diagnostic experience is better if we
1264/// treat it as an invalid suffix.
1265bool NumericLiteralParser::isValidUDSuffix(const LangOptions &LangOpts,
1266StringRef Suffix) {
1267if (!LangOpts.CPlusPlus11 || Suffix.empty())
1268return false;
1269
1270// By C++11 [lex.ext]p10, ud-suffixes starting with an '_' are always valid.
1271// Suffixes starting with '__' (double underscore) are for use by
1272// the implementation.
1273if (Suffix.starts_with("_") && !Suffix.starts_with("__"))
1274return true;
1275
1276// In C++11, there are no library suffixes.
1277if (!LangOpts.CPlusPlus14)
1278return false;
1279
1280// In C++14, "s", "h", "min", "ms", "us", and "ns" are used in the library.
1281// Per tweaked N3660, "il", "i", and "if" are also used in the library.
1282// In C++2a "d" and "y" are used in the library.
1283return llvm::StringSwitch<bool>(Suffix)
1284.Cases("h", "min", "s", true)
1285.Cases("ms", "us", "ns", true)
1286.Cases("il", "i", "if", true)
1287.Cases("d", "y", LangOpts.CPlusPlus20)
1288.Default(false);
1289}
1290
1291void NumericLiteralParser::checkSeparator(SourceLocation TokLoc,
1292const char *Pos,
1293CheckSeparatorKind IsAfterDigits) {
1294if (IsAfterDigits == CSK_AfterDigits) {
1295if (Pos == ThisTokBegin)
1296return;
1297--Pos;
1298} else if (Pos == ThisTokEnd)
1299return;
1300
1301if (isDigitSeparator(*Pos)) {
1302Diags.Report(Lexer::AdvanceToTokenCharacter(TokLoc, Pos - ThisTokBegin, SM,
1303LangOpts),
1304diag::err_digit_separator_not_between_digits)
1305<< IsAfterDigits;
1306hadError = true;
1307}
1308}
1309
1310/// ParseNumberStartingWithZero - This method is called when the first character
1311/// of the number is found to be a zero. This means it is either an octal
1312/// number (like '04') or a hex number ('0x123a') a binary number ('0b1010') or
1313/// a floating point number (01239.123e4). Eat the prefix, determining the
1314/// radix etc.
1315void NumericLiteralParser::ParseNumberStartingWithZero(SourceLocation TokLoc) {
1316assert(s[0] == '0' && "Invalid method call");
1317s++;
1318
1319int c1 = s[0];
1320
1321// Handle a hex number like 0x1234.
1322if ((c1 == 'x' || c1 == 'X') && (isHexDigit(s[1]) || s[1] == '.')) {
1323s++;
1324assert(s < ThisTokEnd && "didn't maximally munch?");
1325radix = 16;
1326DigitsBegin = s;
1327s = SkipHexDigits(s);
1328bool HasSignificandDigits = containsDigits(DigitsBegin, s);
1329if (s == ThisTokEnd) {
1330// Done.
1331} else if (*s == '.') {
1332s++;
1333saw_period = true;
1334const char *floatDigitsBegin = s;
1335s = SkipHexDigits(s);
1336if (containsDigits(floatDigitsBegin, s))
1337HasSignificandDigits = true;
1338if (HasSignificandDigits)
1339checkSeparator(TokLoc, floatDigitsBegin, CSK_BeforeDigits);
1340}
1341
1342if (!HasSignificandDigits) {
1343Diags.Report(Lexer::AdvanceToTokenCharacter(TokLoc, s - ThisTokBegin, SM,
1344LangOpts),
1345diag::err_hex_constant_requires)
1346<< LangOpts.CPlusPlus << 1;
1347hadError = true;
1348return;
1349}
1350
1351// A binary exponent can appear with or with a '.'. If dotted, the
1352// binary exponent is required.
1353if (*s == 'p' || *s == 'P') {
1354checkSeparator(TokLoc, s, CSK_AfterDigits);
1355const char *Exponent = s;
1356s++;
1357saw_exponent = true;
1358if (s != ThisTokEnd && (*s == '+' || *s == '-')) s++; // sign
1359const char *first_non_digit = SkipDigits(s);
1360if (!containsDigits(s, first_non_digit)) {
1361if (!hadError) {
1362Diags.Report(Lexer::AdvanceToTokenCharacter(
1363TokLoc, Exponent - ThisTokBegin, SM, LangOpts),
1364diag::err_exponent_has_no_digits);
1365hadError = true;
1366}
1367return;
1368}
1369checkSeparator(TokLoc, s, CSK_BeforeDigits);
1370s = first_non_digit;
1371
1372if (!LangOpts.HexFloats)
1373Diags.Report(TokLoc, LangOpts.CPlusPlus
1374? diag::ext_hex_literal_invalid
1375: diag::ext_hex_constant_invalid);
1376else if (LangOpts.CPlusPlus17)
1377Diags.Report(TokLoc, diag::warn_cxx17_hex_literal);
1378} else if (saw_period) {
1379Diags.Report(Lexer::AdvanceToTokenCharacter(TokLoc, s - ThisTokBegin, SM,
1380LangOpts),
1381diag::err_hex_constant_requires)
1382<< LangOpts.CPlusPlus << 0;
1383hadError = true;
1384}
1385return;
1386}
1387
1388// Handle simple binary numbers 0b01010
1389if ((c1 == 'b' || c1 == 'B') && (s[1] == '0' || s[1] == '1')) {
1390// 0b101010 is a C++14 and C23 extension.
1391unsigned DiagId;
1392if (LangOpts.CPlusPlus14)
1393DiagId = diag::warn_cxx11_compat_binary_literal;
1394else if (LangOpts.C23)
1395DiagId = diag::warn_c23_compat_binary_literal;
1396else if (LangOpts.CPlusPlus)
1397DiagId = diag::ext_binary_literal_cxx14;
1398else
1399DiagId = diag::ext_binary_literal;
1400Diags.Report(TokLoc, DiagId);
1401++s;
1402assert(s < ThisTokEnd && "didn't maximally munch?");
1403radix = 2;
1404DigitsBegin = s;
1405s = SkipBinaryDigits(s);
1406if (s == ThisTokEnd) {
1407// Done.
1408} else if (isHexDigit(*s) &&
1409!isValidUDSuffix(LangOpts, StringRef(s, ThisTokEnd - s))) {
1410Diags.Report(Lexer::AdvanceToTokenCharacter(TokLoc, s - ThisTokBegin, SM,
1411LangOpts),
1412diag::err_invalid_digit)
1413<< StringRef(s, 1) << 2;
1414hadError = true;
1415}
1416// Other suffixes will be diagnosed by the caller.
1417return;
1418}
1419
1420// For now, the radix is set to 8. If we discover that we have a
1421// floating point constant, the radix will change to 10. Octal floating
1422// point constants are not permitted (only decimal and hexadecimal).
1423radix = 8;
1424const char *PossibleNewDigitStart = s;
1425s = SkipOctalDigits(s);
1426// When the value is 0 followed by a suffix (like 0wb), we want to leave 0
1427// as the start of the digits. So if skipping octal digits does not skip
1428// anything, we leave the digit start where it was.
1429if (s != PossibleNewDigitStart)
1430DigitsBegin = PossibleNewDigitStart;
1431
1432if (s == ThisTokEnd)
1433return; // Done, simple octal number like 01234
1434
1435// If we have some other non-octal digit that *is* a decimal digit, see if
1436// this is part of a floating point number like 094.123 or 09e1.
1437if (isDigit(*s)) {
1438const char *EndDecimal = SkipDigits(s);
1439if (EndDecimal[0] == '.' || EndDecimal[0] == 'e' || EndDecimal[0] == 'E') {
1440s = EndDecimal;
1441radix = 10;
1442}
1443}
1444
1445ParseDecimalOrOctalCommon(TokLoc);
1446}
1447
1448static bool alwaysFitsInto64Bits(unsigned Radix, unsigned NumDigits) {
1449switch (Radix) {
1450case 2:
1451return NumDigits <= 64;
1452case 8:
1453return NumDigits <= 64 / 3; // Digits are groups of 3 bits.
1454case 10:
1455return NumDigits <= 19; // floor(log10(2^64))
1456case 16:
1457return NumDigits <= 64 / 4; // Digits are groups of 4 bits.
1458default:
1459llvm_unreachable("impossible Radix");
1460}
1461}
1462
1463/// GetIntegerValue - Convert this numeric literal value to an APInt that
1464/// matches Val's input width. If there is an overflow, set Val to the low bits
1465/// of the result and return true. Otherwise, return false.
1466bool NumericLiteralParser::GetIntegerValue(llvm::APInt &Val) {
1467// Fast path: Compute a conservative bound on the maximum number of
1468// bits per digit in this radix. If we can't possibly overflow a
1469// uint64 based on that bound then do the simple conversion to
1470// integer. This avoids the expensive overflow checking below, and
1471// handles the common cases that matter (small decimal integers and
1472// hex/octal values which don't overflow).
1473const unsigned NumDigits = SuffixBegin - DigitsBegin;
1474if (alwaysFitsInto64Bits(radix, NumDigits)) {
1475uint64_t N = 0;
1476for (const char *Ptr = DigitsBegin; Ptr != SuffixBegin; ++Ptr)
1477if (!isDigitSeparator(*Ptr))
1478N = N * radix + llvm::hexDigitValue(*Ptr);
1479
1480// This will truncate the value to Val's input width. Simply check
1481// for overflow by comparing.
1482Val = N;
1483return Val.getZExtValue() != N;
1484}
1485
1486Val = 0;
1487const char *Ptr = DigitsBegin;
1488
1489llvm::APInt RadixVal(Val.getBitWidth(), radix);
1490llvm::APInt CharVal(Val.getBitWidth(), 0);
1491llvm::APInt OldVal = Val;
1492
1493bool OverflowOccurred = false;
1494while (Ptr < SuffixBegin) {
1495if (isDigitSeparator(*Ptr)) {
1496++Ptr;
1497continue;
1498}
1499
1500unsigned C = llvm::hexDigitValue(*Ptr++);
1501
1502// If this letter is out of bound for this radix, reject it.
1503assert(C < radix && "NumericLiteralParser ctor should have rejected this");
1504
1505CharVal = C;
1506
1507// Add the digit to the value in the appropriate radix. If adding in digits
1508// made the value smaller, then this overflowed.
1509OldVal = Val;
1510
1511// Multiply by radix, did overflow occur on the multiply?
1512Val *= RadixVal;
1513OverflowOccurred |= Val.udiv(RadixVal) != OldVal;
1514
1515// Add value, did overflow occur on the value?
1516// (a + b) ult b <=> overflow
1517Val += CharVal;
1518OverflowOccurred |= Val.ult(CharVal);
1519}
1520return OverflowOccurred;
1521}
1522
1523llvm::APFloat::opStatus
1524NumericLiteralParser::GetFloatValue(llvm::APFloat &Result,
1525llvm::RoundingMode RM) {
1526using llvm::APFloat;
1527
1528unsigned n = std::min(SuffixBegin - ThisTokBegin, ThisTokEnd - ThisTokBegin);
1529
1530llvm::SmallString<16> Buffer;
1531StringRef Str(ThisTokBegin, n);
1532if (Str.contains('\'')) {
1533Buffer.reserve(n);
1534std::remove_copy_if(Str.begin(), Str.end(), std::back_inserter(Buffer),
1535&isDigitSeparator);
1536Str = Buffer;
1537}
1538
1539auto StatusOrErr = Result.convertFromString(Str, RM);
1540assert(StatusOrErr && "Invalid floating point representation");
1541return !errorToBool(StatusOrErr.takeError()) ? *StatusOrErr
1542: APFloat::opInvalidOp;
1543}
1544
/// Returns true if \p c introduces an exponent: 'p'/'P' when \p isHex is
/// true, 'e'/'E' otherwise.
static inline bool IsExponentPart(char c, bool isHex) {
  const char LowerMarker = isHex ? 'p' : 'e';
  const char UpperMarker = isHex ? 'P' : 'E';
  return c == LowerMarker || c == UpperMarker;
}
1550
1551bool NumericLiteralParser::GetFixedPointValue(llvm::APInt &StoreVal, unsigned Scale) {
1552assert(radix == 16 || radix == 10);
1553
1554// Find how many digits are needed to store the whole literal.
1555unsigned NumDigits = SuffixBegin - DigitsBegin;
1556if (saw_period) --NumDigits;
1557
1558// Initial scan of the exponent if it exists
1559bool ExpOverflowOccurred = false;
1560bool NegativeExponent = false;
1561const char *ExponentBegin;
1562uint64_t Exponent = 0;
1563int64_t BaseShift = 0;
1564if (saw_exponent) {
1565const char *Ptr = DigitsBegin;
1566
1567while (!IsExponentPart(*Ptr, radix == 16))
1568++Ptr;
1569ExponentBegin = Ptr;
1570++Ptr;
1571NegativeExponent = *Ptr == '-';
1572if (NegativeExponent) ++Ptr;
1573
1574unsigned NumExpDigits = SuffixBegin - Ptr;
1575if (alwaysFitsInto64Bits(radix, NumExpDigits)) {
1576llvm::StringRef ExpStr(Ptr, NumExpDigits);
1577llvm::APInt ExpInt(/*numBits=*/64, ExpStr, /*radix=*/10);
1578Exponent = ExpInt.getZExtValue();
1579} else {
1580ExpOverflowOccurred = true;
1581}
1582
1583if (NegativeExponent) BaseShift -= Exponent;
1584else BaseShift += Exponent;
1585}
1586
1587// Number of bits needed for decimal literal is
1588// ceil(NumDigits * log2(10)) Integral part
1589// + Scale Fractional part
1590// + ceil(Exponent * log2(10)) Exponent
1591// --------------------------------------------------
1592// ceil((NumDigits + Exponent) * log2(10)) + Scale
1593//
1594// But for simplicity in handling integers, we can round up log2(10) to 4,
1595// making:
1596// 4 * (NumDigits + Exponent) + Scale
1597//
1598// Number of digits needed for hexadecimal literal is
1599// 4 * NumDigits Integral part
1600// + Scale Fractional part
1601// + Exponent Exponent
1602// --------------------------------------------------
1603// (4 * NumDigits) + Scale + Exponent
1604uint64_t NumBitsNeeded;
1605if (radix == 10)
1606NumBitsNeeded = 4 * (NumDigits + Exponent) + Scale;
1607else
1608NumBitsNeeded = 4 * NumDigits + Exponent + Scale;
1609
1610if (NumBitsNeeded > std::numeric_limits<unsigned>::max())
1611ExpOverflowOccurred = true;
1612llvm::APInt Val(static_cast<unsigned>(NumBitsNeeded), 0, /*isSigned=*/false);
1613
1614bool FoundDecimal = false;
1615
1616int64_t FractBaseShift = 0;
1617const char *End = saw_exponent ? ExponentBegin : SuffixBegin;
1618for (const char *Ptr = DigitsBegin; Ptr < End; ++Ptr) {
1619if (*Ptr == '.') {
1620FoundDecimal = true;
1621continue;
1622}
1623
1624// Normal reading of an integer
1625unsigned C = llvm::hexDigitValue(*Ptr);
1626assert(C < radix && "NumericLiteralParser ctor should have rejected this");
1627
1628Val *= radix;
1629Val += C;
1630
1631if (FoundDecimal)
1632// Keep track of how much we will need to adjust this value by from the
1633// number of digits past the radix point.
1634--FractBaseShift;
1635}
1636
1637// For a radix of 16, we will be multiplying by 2 instead of 16.
1638if (radix == 16) FractBaseShift *= 4;
1639BaseShift += FractBaseShift;
1640
1641Val <<= Scale;
1642
1643uint64_t Base = (radix == 16) ? 2 : 10;
1644if (BaseShift > 0) {
1645for (int64_t i = 0; i < BaseShift; ++i) {
1646Val *= Base;
1647}
1648} else if (BaseShift < 0) {
1649for (int64_t i = BaseShift; i < 0 && !Val.isZero(); ++i)
1650Val = Val.udiv(Base);
1651}
1652
1653bool IntOverflowOccurred = false;
1654auto MaxVal = llvm::APInt::getMaxValue(StoreVal.getBitWidth());
1655if (Val.getBitWidth() > StoreVal.getBitWidth()) {
1656IntOverflowOccurred |= Val.ugt(MaxVal.zext(Val.getBitWidth()));
1657StoreVal = Val.trunc(StoreVal.getBitWidth());
1658} else if (Val.getBitWidth() < StoreVal.getBitWidth()) {
1659IntOverflowOccurred |= Val.zext(MaxVal.getBitWidth()).ugt(MaxVal);
1660StoreVal = Val.zext(StoreVal.getBitWidth());
1661} else {
1662StoreVal = Val;
1663}
1664
1665return IntOverflowOccurred || ExpOverflowOccurred;
1666}
1667
1668/// \verbatim
1669/// user-defined-character-literal: [C++11 lex.ext]
1670/// character-literal ud-suffix
1671/// ud-suffix:
1672/// identifier
1673/// character-literal: [C++11 lex.ccon]
1674/// ' c-char-sequence '
1675/// u' c-char-sequence '
1676/// U' c-char-sequence '
1677/// L' c-char-sequence '
1678/// u8' c-char-sequence ' [C++1z lex.ccon]
1679/// c-char-sequence:
1680/// c-char
1681/// c-char-sequence c-char
1682/// c-char:
1683/// any member of the source character set except the single-quote ',
1684/// backslash \, or new-line character
1685/// escape-sequence
1686/// universal-character-name
1687/// escape-sequence:
1688/// simple-escape-sequence
1689/// octal-escape-sequence
1690/// hexadecimal-escape-sequence
1691/// simple-escape-sequence:
1692/// one of \' \" \? \\ \a \b \f \n \r \t \v
1693/// octal-escape-sequence:
1694/// \ octal-digit
1695/// \ octal-digit octal-digit
1696/// \ octal-digit octal-digit octal-digit
1697/// hexadecimal-escape-sequence:
1698/// \x hexadecimal-digit
1699/// hexadecimal-escape-sequence hexadecimal-digit
1700/// universal-character-name: [C++11 lex.charset]
1701/// \u hex-quad
1702/// \U hex-quad hex-quad
1703/// hex-quad:
1704/// hex-digit hex-digit hex-digit hex-digit
1705/// \endverbatim
1706///
1707CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
1708SourceLocation Loc, Preprocessor &PP,
1709tok::TokenKind kind) {
1710// At this point we know that the character matches the regex "(L|u|U)?'.*'".
1711HadError = false;
1712
1713Kind = kind;
1714
1715const char *TokBegin = begin;
1716
1717// Skip over wide character determinant.
1718if (Kind != tok::char_constant)
1719++begin;
1720if (Kind == tok::utf8_char_constant)
1721++begin;
1722
1723// Skip over the entry quote.
1724if (begin[0] != '\'') {
1725PP.Diag(Loc, diag::err_lexing_char);
1726HadError = true;
1727return;
1728}
1729
1730++begin;
1731
1732// Remove an optional ud-suffix.
1733if (end[-1] != '\'') {
1734const char *UDSuffixEnd = end;
1735do {
1736--end;
1737} while (end[-1] != '\'');
1738// FIXME: Don't bother with this if !tok.hasUCN().
1739expandUCNs(UDSuffixBuf, StringRef(end, UDSuffixEnd - end));
1740UDSuffixOffset = end - TokBegin;
1741}
1742
1743// Trim the ending quote.
1744assert(end != begin && "Invalid token lexed");
1745--end;
1746
1747// FIXME: The "Value" is an uint64_t so we can handle char literals of
1748// up to 64-bits.
1749// FIXME: This extensively assumes that 'char' is 8-bits.
1750assert(PP.getTargetInfo().getCharWidth() == 8 &&
1751"Assumes char is 8 bits");
1752assert(PP.getTargetInfo().getIntWidth() <= 64 &&
1753(PP.getTargetInfo().getIntWidth() & 7) == 0 &&
1754"Assumes sizeof(int) on target is <= 64 and a multiple of char");
1755assert(PP.getTargetInfo().getWCharWidth() <= 64 &&
1756"Assumes sizeof(wchar) on target is <= 64");
1757
1758SmallVector<uint32_t, 4> codepoint_buffer;
1759codepoint_buffer.resize(end - begin);
1760uint32_t *buffer_begin = &codepoint_buffer.front();
1761uint32_t *buffer_end = buffer_begin + codepoint_buffer.size();
1762
1763// Unicode escapes representing characters that cannot be correctly
1764// represented in a single code unit are disallowed in character literals
1765// by this implementation.
1766uint32_t largest_character_for_kind;
1767if (tok::wide_char_constant == Kind) {
1768largest_character_for_kind =
17690xFFFFFFFFu >> (32-PP.getTargetInfo().getWCharWidth());
1770} else if (tok::utf8_char_constant == Kind) {
1771largest_character_for_kind = 0x7F;
1772} else if (tok::utf16_char_constant == Kind) {
1773largest_character_for_kind = 0xFFFF;
1774} else if (tok::utf32_char_constant == Kind) {
1775largest_character_for_kind = 0x10FFFF;
1776} else {
1777largest_character_for_kind = 0x7Fu;
1778}
1779
1780while (begin != end) {
1781// Is this a span of non-escape characters?
1782if (begin[0] != '\\') {
1783char const *start = begin;
1784do {
1785++begin;
1786} while (begin != end && *begin != '\\');
1787
1788char const *tmp_in_start = start;
1789uint32_t *tmp_out_start = buffer_begin;
1790llvm::ConversionResult res =
1791llvm::ConvertUTF8toUTF32(reinterpret_cast<llvm::UTF8 const **>(&start),
1792reinterpret_cast<llvm::UTF8 const *>(begin),
1793&buffer_begin, buffer_end, llvm::strictConversion);
1794if (res != llvm::conversionOK) {
1795// If we see bad encoding for unprefixed character literals, warn and
1796// simply copy the byte values, for compatibility with gcc and
1797// older versions of clang.
1798bool NoErrorOnBadEncoding = isOrdinary();
1799unsigned Msg = diag::err_bad_character_encoding;
1800if (NoErrorOnBadEncoding)
1801Msg = diag::warn_bad_character_encoding;
1802PP.Diag(Loc, Msg);
1803if (NoErrorOnBadEncoding) {
1804start = tmp_in_start;
1805buffer_begin = tmp_out_start;
1806for (; start != begin; ++start, ++buffer_begin)
1807*buffer_begin = static_cast<uint8_t>(*start);
1808} else {
1809HadError = true;
1810}
1811} else {
1812for (; tmp_out_start < buffer_begin; ++tmp_out_start) {
1813if (*tmp_out_start > largest_character_for_kind) {
1814HadError = true;
1815PP.Diag(Loc, diag::err_character_too_large);
1816}
1817}
1818}
1819
1820continue;
1821}
1822// Is this a Universal Character Name escape?
1823if (begin[1] == 'u' || begin[1] == 'U' || begin[1] == 'N') {
1824unsigned short UcnLen = 0;
1825if (!ProcessUCNEscape(TokBegin, begin, end, *buffer_begin, UcnLen,
1826FullSourceLoc(Loc, PP.getSourceManager()),
1827&PP.getDiagnostics(), PP.getLangOpts(), true)) {
1828HadError = true;
1829} else if (*buffer_begin > largest_character_for_kind) {
1830HadError = true;
1831PP.Diag(Loc, diag::err_character_too_large);
1832}
1833
1834++buffer_begin;
1835continue;
1836}
1837unsigned CharWidth = getCharWidth(Kind, PP.getTargetInfo());
1838uint64_t result =
1839ProcessCharEscape(TokBegin, begin, end, HadError,
1840FullSourceLoc(Loc, PP.getSourceManager()), CharWidth,
1841&PP.getDiagnostics(), PP.getLangOpts(),
1842StringLiteralEvalMethod::Evaluated);
1843*buffer_begin++ = result;
1844}
1845
1846unsigned NumCharsSoFar = buffer_begin - &codepoint_buffer.front();
1847
1848if (NumCharsSoFar > 1) {
1849if (isOrdinary() && NumCharsSoFar == 4)
1850PP.Diag(Loc, diag::warn_four_char_character_literal);
1851else if (isOrdinary())
1852PP.Diag(Loc, diag::warn_multichar_character_literal);
1853else {
1854PP.Diag(Loc, diag::err_multichar_character_literal) << (isWide() ? 0 : 1);
1855HadError = true;
1856}
1857IsMultiChar = true;
1858} else {
1859IsMultiChar = false;
1860}
1861
1862llvm::APInt LitVal(PP.getTargetInfo().getIntWidth(), 0);
1863
1864// Narrow character literals act as though their value is concatenated
1865// in this implementation, but warn on overflow.
1866bool multi_char_too_long = false;
1867if (isOrdinary() && isMultiChar()) {
1868LitVal = 0;
1869for (size_t i = 0; i < NumCharsSoFar; ++i) {
1870// check for enough leading zeros to shift into
1871multi_char_too_long |= (LitVal.countl_zero() < 8);
1872LitVal <<= 8;
1873LitVal = LitVal + (codepoint_buffer[i] & 0xFF);
1874}
1875} else if (NumCharsSoFar > 0) {
1876// otherwise just take the last character
1877LitVal = buffer_begin[-1];
1878}
1879
1880if (!HadError && multi_char_too_long) {
1881PP.Diag(Loc, diag::warn_char_constant_too_large);
1882}
1883
1884// Transfer the value from APInt to uint64_t
1885Value = LitVal.getZExtValue();
1886
1887// If this is a single narrow character, sign extend it (e.g. '\xFF' is "-1")
1888// if 'char' is signed for this target (C99 6.4.4.4p10). Note that multiple
1889// character constants are not sign extended in the this implementation:
1890// '\xFF\xFF' = 65536 and '\x0\xFF' = 255, which matches GCC.
1891if (isOrdinary() && NumCharsSoFar == 1 && (Value & 128) &&
1892PP.getLangOpts().CharIsSigned)
1893Value = (signed char)Value;
1894}
1895
1896/// \verbatim
1897/// string-literal: [C++0x lex.string]
1898/// encoding-prefix " [s-char-sequence] "
1899/// encoding-prefix R raw-string
1900/// encoding-prefix:
1901/// u8
1902/// u
1903/// U
1904/// L
1905/// s-char-sequence:
1906/// s-char
1907/// s-char-sequence s-char
1908/// s-char:
1909/// any member of the source character set except the double-quote ",
1910/// backslash \, or new-line character
1911/// escape-sequence
1912/// universal-character-name
1913/// raw-string:
1914/// " d-char-sequence ( r-char-sequence ) d-char-sequence "
1915/// r-char-sequence:
1916/// r-char
1917/// r-char-sequence r-char
1918/// r-char:
1919/// any member of the source character set, except a right parenthesis )
1920/// followed by the initial d-char-sequence (which may be empty)
1921/// followed by a double quote ".
1922/// d-char-sequence:
1923/// d-char
1924/// d-char-sequence d-char
1925/// d-char:
1926/// any member of the basic source character set except:
1927/// space, the left parenthesis (, the right parenthesis ),
1928/// the backslash \, and the control characters representing horizontal
1929/// tab, vertical tab, form feed, and newline.
1930/// escape-sequence: [C++0x lex.ccon]
1931/// simple-escape-sequence
1932/// octal-escape-sequence
1933/// hexadecimal-escape-sequence
1934/// simple-escape-sequence:
1935/// one of \' \" \? \\ \a \b \f \n \r \t \v
1936/// octal-escape-sequence:
1937/// \ octal-digit
1938/// \ octal-digit octal-digit
1939/// \ octal-digit octal-digit octal-digit
1940/// hexadecimal-escape-sequence:
1941/// \x hexadecimal-digit
1942/// hexadecimal-escape-sequence hexadecimal-digit
1943/// universal-character-name:
1944/// \u hex-quad
1945/// \U hex-quad hex-quad
1946/// hex-quad:
1947/// hex-digit hex-digit hex-digit hex-digit
1948/// \endverbatim
1949///
1950StringLiteralParser::StringLiteralParser(ArrayRef<Token> StringToks,
1951Preprocessor &PP,
1952StringLiteralEvalMethod EvalMethod)
1953: SM(PP.getSourceManager()), Features(PP.getLangOpts()),
1954Target(PP.getTargetInfo()), Diags(&PP.getDiagnostics()),
1955MaxTokenLength(0), SizeBound(0), CharByteWidth(0), Kind(tok::unknown),
1956ResultPtr(ResultBuf.data()), EvalMethod(EvalMethod), hadError(false),
1957Pascal(false) {
1958init(StringToks);
1959}
1960
1961void StringLiteralParser::init(ArrayRef<Token> StringToks){
1962// The literal token may have come from an invalid source location (e.g. due
1963// to a PCH error), in which case the token length will be 0.
1964if (StringToks.empty() || StringToks[0].getLength() < 2)
1965return DiagnoseLexingError(SourceLocation());
1966
1967// Scan all of the string portions, remember the max individual token length,
1968// computing a bound on the concatenated string length, and see whether any
1969// piece is a wide-string. If any of the string portions is a wide-string
1970// literal, the result is a wide-string literal [C99 6.4.5p4].
1971assert(!StringToks.empty() && "expected at least one token");
1972MaxTokenLength = StringToks[0].getLength();
1973assert(StringToks[0].getLength() >= 2 && "literal token is invalid!");
1974SizeBound = StringToks[0].getLength() - 2; // -2 for "".
1975hadError = false;
1976
1977// Determines the kind of string from the prefix
1978Kind = tok::string_literal;
1979
1980/// (C99 5.1.1.2p1). The common case is only one string fragment.
1981for (const Token &Tok : StringToks) {
1982if (Tok.getLength() < 2)
1983return DiagnoseLexingError(Tok.getLocation());
1984
1985// The string could be shorter than this if it needs cleaning, but this is a
1986// reasonable bound, which is all we need.
1987assert(Tok.getLength() >= 2 && "literal token is invalid!");
1988SizeBound += Tok.getLength() - 2; // -2 for "".
1989
1990// Remember maximum string piece length.
1991if (Tok.getLength() > MaxTokenLength)
1992MaxTokenLength = Tok.getLength();
1993
1994// Remember if we see any wide or utf-8/16/32 strings.
1995// Also check for illegal concatenations.
1996if (isUnevaluated() && Tok.getKind() != tok::string_literal) {
1997if (Diags) {
1998SourceLocation PrefixEndLoc = Lexer::AdvanceToTokenCharacter(
1999Tok.getLocation(), getEncodingPrefixLen(Tok.getKind()), SM,
2000Features);
2001CharSourceRange Range =
2002CharSourceRange::getCharRange({Tok.getLocation(), PrefixEndLoc});
2003StringRef Prefix(SM.getCharacterData(Tok.getLocation()),
2004getEncodingPrefixLen(Tok.getKind()));
2005Diags->Report(Tok.getLocation(),
2006Features.CPlusPlus26
2007? diag::err_unevaluated_string_prefix
2008: diag::warn_unevaluated_string_prefix)
2009<< Prefix << Features.CPlusPlus << FixItHint::CreateRemoval(Range);
2010}
2011if (Features.CPlusPlus26)
2012hadError = true;
2013} else if (Tok.isNot(Kind) && Tok.isNot(tok::string_literal)) {
2014if (isOrdinary()) {
2015Kind = Tok.getKind();
2016} else {
2017if (Diags)
2018Diags->Report(Tok.getLocation(), diag::err_unsupported_string_concat);
2019hadError = true;
2020}
2021}
2022}
2023
2024// Include space for the null terminator.
2025++SizeBound;
2026
2027// TODO: K&R warning: "traditional C rejects string constant concatenation"
2028
2029// Get the width in bytes of char/wchar_t/char16_t/char32_t
2030CharByteWidth = getCharWidth(Kind, Target);
2031assert((CharByteWidth & 7) == 0 && "Assumes character size is byte multiple");
2032CharByteWidth /= 8;
2033
2034// The output buffer size needs to be large enough to hold wide characters.
2035// This is a worst-case assumption which basically corresponds to L"" "long".
2036SizeBound *= CharByteWidth;
2037
2038// Size the temporary buffer to hold the result string data.
2039ResultBuf.resize(SizeBound);
2040
2041// Likewise, but for each string piece.
2042SmallString<512> TokenBuf;
2043TokenBuf.resize(MaxTokenLength);
2044
2045// Loop over all the strings, getting their spelling, and expanding them to
2046// wide strings as appropriate.
2047ResultPtr = &ResultBuf[0]; // Next byte to fill in.
2048
2049Pascal = false;
2050
2051SourceLocation UDSuffixTokLoc;
2052
2053for (unsigned i = 0, e = StringToks.size(); i != e; ++i) {
2054const char *ThisTokBuf = &TokenBuf[0];
2055// Get the spelling of the token, which eliminates trigraphs, etc. We know
2056// that ThisTokBuf points to a buffer that is big enough for the whole token
2057// and 'spelled' tokens can only shrink.
2058bool StringInvalid = false;
2059unsigned ThisTokLen =
2060Lexer::getSpelling(StringToks[i], ThisTokBuf, SM, Features,
2061&StringInvalid);
2062if (StringInvalid)
2063return DiagnoseLexingError(StringToks[i].getLocation());
2064
2065const char *ThisTokBegin = ThisTokBuf;
2066const char *ThisTokEnd = ThisTokBuf+ThisTokLen;
2067
2068// Remove an optional ud-suffix.
2069if (ThisTokEnd[-1] != '"') {
2070const char *UDSuffixEnd = ThisTokEnd;
2071do {
2072--ThisTokEnd;
2073} while (ThisTokEnd[-1] != '"');
2074
2075StringRef UDSuffix(ThisTokEnd, UDSuffixEnd - ThisTokEnd);
2076
2077if (UDSuffixBuf.empty()) {
2078if (StringToks[i].hasUCN())
2079expandUCNs(UDSuffixBuf, UDSuffix);
2080else
2081UDSuffixBuf.assign(UDSuffix);
2082UDSuffixToken = i;
2083UDSuffixOffset = ThisTokEnd - ThisTokBuf;
2084UDSuffixTokLoc = StringToks[i].getLocation();
2085} else {
2086SmallString<32> ExpandedUDSuffix;
2087if (StringToks[i].hasUCN()) {
2088expandUCNs(ExpandedUDSuffix, UDSuffix);
2089UDSuffix = ExpandedUDSuffix;
2090}
2091
2092// C++11 [lex.ext]p8: At the end of phase 6, if a string literal is the
2093// result of a concatenation involving at least one user-defined-string-
2094// literal, all the participating user-defined-string-literals shall
2095// have the same ud-suffix.
2096bool UnevaluatedStringHasUDL = isUnevaluated() && !UDSuffix.empty();
2097if (UDSuffixBuf != UDSuffix || UnevaluatedStringHasUDL) {
2098if (Diags) {
2099SourceLocation TokLoc = StringToks[i].getLocation();
2100if (UnevaluatedStringHasUDL) {
2101Diags->Report(TokLoc, diag::err_unevaluated_string_udl)
2102<< SourceRange(TokLoc, TokLoc);
2103} else {
2104Diags->Report(TokLoc, diag::err_string_concat_mixed_suffix)
2105<< UDSuffixBuf << UDSuffix
2106<< SourceRange(UDSuffixTokLoc, UDSuffixTokLoc);
2107}
2108}
2109hadError = true;
2110}
2111}
2112}
2113
2114// Strip the end quote.
2115--ThisTokEnd;
2116
2117// TODO: Input character set mapping support.
2118
2119// Skip marker for wide or unicode strings.
2120if (ThisTokBuf[0] == 'L' || ThisTokBuf[0] == 'u' || ThisTokBuf[0] == 'U') {
2121++ThisTokBuf;
2122// Skip 8 of u8 marker for utf8 strings.
2123if (ThisTokBuf[0] == '8')
2124++ThisTokBuf;
2125}
2126
2127// Check for raw string
2128if (ThisTokBuf[0] == 'R') {
2129if (ThisTokBuf[1] != '"') {
2130// The file may have come from PCH and then changed after loading the
2131// PCH; Fail gracefully.
2132return DiagnoseLexingError(StringToks[i].getLocation());
2133}
2134ThisTokBuf += 2; // skip R"
2135
2136// C++11 [lex.string]p2: A `d-char-sequence` shall consist of at most 16
2137// characters.
2138constexpr unsigned MaxRawStrDelimLen = 16;
2139
2140const char *Prefix = ThisTokBuf;
2141while (static_cast<unsigned>(ThisTokBuf - Prefix) < MaxRawStrDelimLen &&
2142ThisTokBuf[0] != '(')
2143++ThisTokBuf;
2144if (ThisTokBuf[0] != '(')
2145return DiagnoseLexingError(StringToks[i].getLocation());
2146++ThisTokBuf; // skip '('
2147
2148// Remove same number of characters from the end
2149ThisTokEnd -= ThisTokBuf - Prefix;
2150if (ThisTokEnd < ThisTokBuf)
2151return DiagnoseLexingError(StringToks[i].getLocation());
2152
2153// C++14 [lex.string]p4: A source-file new-line in a raw string literal
2154// results in a new-line in the resulting execution string-literal.
2155StringRef RemainingTokenSpan(ThisTokBuf, ThisTokEnd - ThisTokBuf);
2156while (!RemainingTokenSpan.empty()) {
2157// Split the string literal on \r\n boundaries.
2158size_t CRLFPos = RemainingTokenSpan.find("\r\n");
2159StringRef BeforeCRLF = RemainingTokenSpan.substr(0, CRLFPos);
2160StringRef AfterCRLF = RemainingTokenSpan.substr(CRLFPos);
2161
2162// Copy everything before the \r\n sequence into the string literal.
2163if (CopyStringFragment(StringToks[i], ThisTokBegin, BeforeCRLF))
2164hadError = true;
2165
2166// Point into the \n inside the \r\n sequence and operate on the
2167// remaining portion of the literal.
2168RemainingTokenSpan = AfterCRLF.substr(1);
2169}
2170} else {
2171if (ThisTokBuf[0] != '"') {
2172// The file may have come from PCH and then changed after loading the
2173// PCH; Fail gracefully.
2174return DiagnoseLexingError(StringToks[i].getLocation());
2175}
2176++ThisTokBuf; // skip "
2177
2178// Check if this is a pascal string
2179if (!isUnevaluated() && Features.PascalStrings &&
2180ThisTokBuf + 1 != ThisTokEnd && ThisTokBuf[0] == '\\' &&
2181ThisTokBuf[1] == 'p') {
2182
2183// If the \p sequence is found in the first token, we have a pascal string
2184// Otherwise, if we already have a pascal string, ignore the first \p
2185if (i == 0) {
2186++ThisTokBuf;
2187Pascal = true;
2188} else if (Pascal)
2189ThisTokBuf += 2;
2190}
2191
2192while (ThisTokBuf != ThisTokEnd) {
2193// Is this a span of non-escape characters?
2194if (ThisTokBuf[0] != '\\') {
2195const char *InStart = ThisTokBuf;
2196do {
2197++ThisTokBuf;
2198} while (ThisTokBuf != ThisTokEnd && ThisTokBuf[0] != '\\');
2199
2200// Copy the character span over.
2201if (CopyStringFragment(StringToks[i], ThisTokBegin,
2202StringRef(InStart, ThisTokBuf - InStart)))
2203hadError = true;
2204continue;
2205}
2206// Is this a Universal Character Name escape?
2207if (ThisTokBuf[1] == 'u' || ThisTokBuf[1] == 'U' ||
2208ThisTokBuf[1] == 'N') {
2209EncodeUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd,
2210ResultPtr, hadError,
2211FullSourceLoc(StringToks[i].getLocation(), SM),
2212CharByteWidth, Diags, Features);
2213continue;
2214}
2215// Otherwise, this is a non-UCN escape character. Process it.
2216unsigned ResultChar =
2217ProcessCharEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, hadError,
2218FullSourceLoc(StringToks[i].getLocation(), SM),
2219CharByteWidth * 8, Diags, Features, EvalMethod);
2220
2221if (CharByteWidth == 4) {
2222// FIXME: Make the type of the result buffer correct instead of
2223// using reinterpret_cast.
2224llvm::UTF32 *ResultWidePtr = reinterpret_cast<llvm::UTF32*>(ResultPtr);
2225*ResultWidePtr = ResultChar;
2226ResultPtr += 4;
2227} else if (CharByteWidth == 2) {
2228// FIXME: Make the type of the result buffer correct instead of
2229// using reinterpret_cast.
2230llvm::UTF16 *ResultWidePtr = reinterpret_cast<llvm::UTF16*>(ResultPtr);
2231*ResultWidePtr = ResultChar & 0xFFFF;
2232ResultPtr += 2;
2233} else {
2234assert(CharByteWidth == 1 && "Unexpected char width");
2235*ResultPtr++ = ResultChar & 0xFF;
2236}
2237}
2238}
2239}
2240
2241assert((!Pascal || !isUnevaluated()) &&
2242"Pascal string in unevaluated context");
2243if (Pascal) {
2244if (CharByteWidth == 4) {
2245// FIXME: Make the type of the result buffer correct instead of
2246// using reinterpret_cast.
2247llvm::UTF32 *ResultWidePtr = reinterpret_cast<llvm::UTF32*>(ResultBuf.data());
2248ResultWidePtr[0] = GetNumStringChars() - 1;
2249} else if (CharByteWidth == 2) {
2250// FIXME: Make the type of the result buffer correct instead of
2251// using reinterpret_cast.
2252llvm::UTF16 *ResultWidePtr = reinterpret_cast<llvm::UTF16*>(ResultBuf.data());
2253ResultWidePtr[0] = GetNumStringChars() - 1;
2254} else {
2255assert(CharByteWidth == 1 && "Unexpected char width");
2256ResultBuf[0] = GetNumStringChars() - 1;
2257}
2258
2259// Verify that pascal strings aren't too large.
2260if (GetStringLength() > 256) {
2261if (Diags)
2262Diags->Report(StringToks.front().getLocation(),
2263diag::err_pascal_string_too_long)
2264<< SourceRange(StringToks.front().getLocation(),
2265StringToks.back().getLocation());
2266hadError = true;
2267return;
2268}
2269} else if (Diags) {
2270// Complain if this string literal has too many characters.
2271unsigned MaxChars = Features.CPlusPlus? 65536 : Features.C99 ? 4095 : 509;
2272
2273if (GetNumStringChars() > MaxChars)
2274Diags->Report(StringToks.front().getLocation(),
2275diag::ext_string_too_long)
2276<< GetNumStringChars() << MaxChars
2277<< (Features.CPlusPlus ? 2 : Features.C99 ? 1 : 0)
2278<< SourceRange(StringToks.front().getLocation(),
2279StringToks.back().getLocation());
2280}
2281}
2282
2283static const char *resyncUTF8(const char *Err, const char *End) {
2284if (Err == End)
2285return End;
2286End = Err + std::min<unsigned>(llvm::getNumBytesForUTF8(*Err), End-Err);
2287while (++Err != End && (*Err & 0xC0) == 0x80)
2288;
2289return Err;
2290}
2291
2292/// This function copies from Fragment, which is a sequence of bytes
2293/// within Tok's contents (which begin at TokBegin) into ResultPtr.
2294/// Performs widening for multi-byte characters.
2295bool StringLiteralParser::CopyStringFragment(const Token &Tok,
2296const char *TokBegin,
2297StringRef Fragment) {
2298const llvm::UTF8 *ErrorPtrTmp;
2299if (ConvertUTF8toWide(CharByteWidth, Fragment, ResultPtr, ErrorPtrTmp))
2300return false;
2301
2302// If we see bad encoding for unprefixed string literals, warn and
2303// simply copy the byte values, for compatibility with gcc and older
2304// versions of clang.
2305bool NoErrorOnBadEncoding = isOrdinary();
2306if (NoErrorOnBadEncoding) {
2307memcpy(ResultPtr, Fragment.data(), Fragment.size());
2308ResultPtr += Fragment.size();
2309}
2310
2311if (Diags) {
2312const char *ErrorPtr = reinterpret_cast<const char *>(ErrorPtrTmp);
2313
2314FullSourceLoc SourceLoc(Tok.getLocation(), SM);
2315const DiagnosticBuilder &Builder =
2316Diag(Diags, Features, SourceLoc, TokBegin,
2317ErrorPtr, resyncUTF8(ErrorPtr, Fragment.end()),
2318NoErrorOnBadEncoding ? diag::warn_bad_string_encoding
2319: diag::err_bad_string_encoding);
2320
2321const char *NextStart = resyncUTF8(ErrorPtr, Fragment.end());
2322StringRef NextFragment(NextStart, Fragment.end()-NextStart);
2323
2324// Decode into a dummy buffer.
2325SmallString<512> Dummy;
2326Dummy.reserve(Fragment.size() * CharByteWidth);
2327char *Ptr = Dummy.data();
2328
2329while (!ConvertUTF8toWide(CharByteWidth, NextFragment, Ptr, ErrorPtrTmp)) {
2330const char *ErrorPtr = reinterpret_cast<const char *>(ErrorPtrTmp);
2331NextStart = resyncUTF8(ErrorPtr, Fragment.end());
2332Builder << MakeCharSourceRange(Features, SourceLoc, TokBegin,
2333ErrorPtr, NextStart);
2334NextFragment = StringRef(NextStart, Fragment.end()-NextStart);
2335}
2336}
2337return !NoErrorOnBadEncoding;
2338}
2339
2340void StringLiteralParser::DiagnoseLexingError(SourceLocation Loc) {
2341hadError = true;
2342if (Diags)
2343Diags->Report(Loc, diag::err_lexing_string);
2344}
2345
2346/// getOffsetOfStringByte - This function returns the offset of the
2347/// specified byte of the string data represented by Token. This handles
2348/// advancing over escape sequences in the string.
2349unsigned StringLiteralParser::getOffsetOfStringByte(const Token &Tok,
2350unsigned ByteNo) const {
2351// Get the spelling of the token.
2352SmallString<32> SpellingBuffer;
2353SpellingBuffer.resize(Tok.getLength());
2354
2355bool StringInvalid = false;
2356const char *SpellingPtr = &SpellingBuffer[0];
2357unsigned TokLen = Lexer::getSpelling(Tok, SpellingPtr, SM, Features,
2358&StringInvalid);
2359if (StringInvalid)
2360return 0;
2361
2362const char *SpellingStart = SpellingPtr;
2363const char *SpellingEnd = SpellingPtr+TokLen;
2364
2365// Handle UTF-8 strings just like narrow strings.
2366if (SpellingPtr[0] == 'u' && SpellingPtr[1] == '8')
2367SpellingPtr += 2;
2368
2369assert(SpellingPtr[0] != 'L' && SpellingPtr[0] != 'u' &&
2370SpellingPtr[0] != 'U' && "Doesn't handle wide or utf strings yet");
2371
2372// For raw string literals, this is easy.
2373if (SpellingPtr[0] == 'R') {
2374assert(SpellingPtr[1] == '"' && "Should be a raw string literal!");
2375// Skip 'R"'.
2376SpellingPtr += 2;
2377while (*SpellingPtr != '(') {
2378++SpellingPtr;
2379assert(SpellingPtr < SpellingEnd && "Missing ( for raw string literal");
2380}
2381// Skip '('.
2382++SpellingPtr;
2383return SpellingPtr - SpellingStart + ByteNo;
2384}
2385
2386// Skip over the leading quote
2387assert(SpellingPtr[0] == '"' && "Should be a string literal!");
2388++SpellingPtr;
2389
2390// Skip over bytes until we find the offset we're looking for.
2391while (ByteNo) {
2392assert(SpellingPtr < SpellingEnd && "Didn't find byte offset!");
2393
2394// Step over non-escapes simply.
2395if (*SpellingPtr != '\\') {
2396++SpellingPtr;
2397--ByteNo;
2398continue;
2399}
2400
2401// Otherwise, this is an escape character. Advance over it.
2402bool HadError = false;
2403if (SpellingPtr[1] == 'u' || SpellingPtr[1] == 'U' ||
2404SpellingPtr[1] == 'N') {
2405const char *EscapePtr = SpellingPtr;
2406unsigned Len = MeasureUCNEscape(SpellingStart, SpellingPtr, SpellingEnd,
24071, Features, HadError);
2408if (Len > ByteNo) {
2409// ByteNo is somewhere within the escape sequence.
2410SpellingPtr = EscapePtr;
2411break;
2412}
2413ByteNo -= Len;
2414} else {
2415ProcessCharEscape(SpellingStart, SpellingPtr, SpellingEnd, HadError,
2416FullSourceLoc(Tok.getLocation(), SM), CharByteWidth * 8,
2417Diags, Features, StringLiteralEvalMethod::Evaluated);
2418--ByteNo;
2419}
2420assert(!HadError && "This method isn't valid on erroneous strings");
2421}
2422
2423return SpellingPtr-SpellingStart;
2424}
2425
2426/// Determine whether a suffix is a valid ud-suffix. We avoid treating reserved
2427/// suffixes as ud-suffixes, because the diagnostic experience is better if we
2428/// treat it as an invalid suffix.
2429bool StringLiteralParser::isValidUDSuffix(const LangOptions &LangOpts,
2430StringRef Suffix) {
2431return NumericLiteralParser::isValidUDSuffix(LangOpts, Suffix) ||
2432Suffix == "sv";
2433}
2434