llvm-project
862 строки · 25.6 Кб
1//===--- CommentLexer.cpp -------------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#include "clang/AST/CommentLexer.h"
10#include "clang/AST/CommentCommandTraits.h"
11#include "clang/AST/CommentDiagnostic.h"
12#include "clang/Basic/CharInfo.h"
13#include "llvm/ADT/StringExtras.h"
14#include "llvm/ADT/StringSwitch.h"
15#include "llvm/Support/ConvertUTF.h"
16#include "llvm/Support/ErrorHandling.h"
17
18namespace clang {
19namespace comments {
20
21void Token::dump(const Lexer &L, const SourceManager &SM) const {
22llvm::errs() << "comments::Token Kind=" << Kind << " ";
23Loc.print(llvm::errs(), SM);
24llvm::errs() << " " << Length << " \"" << L.getSpelling(*this, SM) << "\"\n";
25}
26
27static inline bool isHTMLNamedCharacterReferenceCharacter(char C) {
28return isLetter(C);
29}
30
31static inline bool isHTMLDecimalCharacterReferenceCharacter(char C) {
32return isDigit(C);
33}
34
35static inline bool isHTMLHexCharacterReferenceCharacter(char C) {
36return isHexDigit(C);
37}
38
39static inline StringRef convertCodePointToUTF8(
40llvm::BumpPtrAllocator &Allocator,
41unsigned CodePoint) {
42char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
43char *ResolvedPtr = Resolved;
44if (llvm::ConvertCodePointToUTF8(CodePoint, ResolvedPtr))
45return StringRef(Resolved, ResolvedPtr - Resolved);
46else
47return StringRef();
48}
49
50namespace {
51
52#include "clang/AST/CommentHTMLTags.inc"
53#include "clang/AST/CommentHTMLNamedCharacterReferences.inc"
54
55} // end anonymous namespace
56
57StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const {
58// Fast path, first check a few most widely used named character references.
59return llvm::StringSwitch<StringRef>(Name)
60.Case("amp", "&")
61.Case("lt", "<")
62.Case("gt", ">")
63.Case("quot", "\"")
64.Case("apos", "\'")
65// Slow path.
66.Default(translateHTMLNamedCharacterReferenceToUTF8(Name));
67}
68
69StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const {
70unsigned CodePoint = 0;
71for (unsigned i = 0, e = Name.size(); i != e; ++i) {
72assert(isHTMLDecimalCharacterReferenceCharacter(Name[i]));
73CodePoint *= 10;
74CodePoint += Name[i] - '0';
75}
76return convertCodePointToUTF8(Allocator, CodePoint);
77}
78
79StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const {
80unsigned CodePoint = 0;
81for (unsigned i = 0, e = Name.size(); i != e; ++i) {
82CodePoint *= 16;
83const char C = Name[i];
84assert(isHTMLHexCharacterReferenceCharacter(C));
85CodePoint += llvm::hexDigitValue(C);
86}
87return convertCodePointToUTF8(Allocator, CodePoint);
88}
89
90void Lexer::skipLineStartingDecorations() {
91// This function should be called only for C comments
92assert(CommentState == LCS_InsideCComment);
93
94if (BufferPtr == CommentEnd)
95return;
96
97const char *NewBufferPtr = BufferPtr;
98while (isHorizontalWhitespace(*NewBufferPtr))
99if (++NewBufferPtr == CommentEnd)
100return;
101if (*NewBufferPtr == '*')
102BufferPtr = NewBufferPtr + 1;
103}
104
105namespace {
106/// Returns pointer to the first newline character in the string.
107const char *findNewline(const char *BufferPtr, const char *BufferEnd) {
108for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
109if (isVerticalWhitespace(*BufferPtr))
110return BufferPtr;
111}
112return BufferEnd;
113}
114
/// Skips one newline sequence ("\n", "\r" or "\r\n") starting at
/// \p BufferPtr.
///
/// \returns the pointer just past the newline, or \p BufferPtr unchanged if
/// the range is empty. The first character must be '\n' or '\r'.
const char *skipNewline(const char *BufferPtr, const char *BufferEnd) {
  if (BufferPtr == BufferEnd)
    return BufferPtr;

  const char First = *BufferPtr;
  ++BufferPtr;
  if (First == '\n')
    return BufferPtr;

  assert(First == '\r');
  // Treat "\r\n" as a single newline.
  if (BufferPtr != BufferEnd && *BufferPtr == '\n')
    ++BufferPtr;
  return BufferPtr;
}
129
130const char *skipNamedCharacterReference(const char *BufferPtr,
131const char *BufferEnd) {
132for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
133if (!isHTMLNamedCharacterReferenceCharacter(*BufferPtr))
134return BufferPtr;
135}
136return BufferEnd;
137}
138
139const char *skipDecimalCharacterReference(const char *BufferPtr,
140const char *BufferEnd) {
141for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
142if (!isHTMLDecimalCharacterReferenceCharacter(*BufferPtr))
143return BufferPtr;
144}
145return BufferEnd;
146}
147
148const char *skipHexCharacterReference(const char *BufferPtr,
149const char *BufferEnd) {
150for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
151if (!isHTMLHexCharacterReferenceCharacter(*BufferPtr))
152return BufferPtr;
153}
154return BufferEnd;
155}
156
157bool isHTMLIdentifierStartingCharacter(char C) {
158return isLetter(C);
159}
160
161bool isHTMLIdentifierCharacter(char C) {
162return isAlphanumeric(C);
163}
164
165const char *skipHTMLIdentifier(const char *BufferPtr, const char *BufferEnd) {
166for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
167if (!isHTMLIdentifierCharacter(*BufferPtr))
168return BufferPtr;
169}
170return BufferEnd;
171}
172
/// Scans an HTML string delimited by single or double quotes, starting at the
/// opening quote. A quote character directly preceded by a backslash does not
/// terminate the string.
///
/// \param BufferPtr points at the opening quote ('"' or '\'').
/// \returns a pointer to the closing quote, or \p BufferEnd if the string is
/// not terminated.
const char *skipHTMLQuotedString(const char *BufferPtr, const char *BufferEnd) {
  const char Quote = *BufferPtr;
  assert(Quote == '\"' || Quote == '\'');

  // Start right after the opening quote so Cursor[-1] is always readable.
  const char *Cursor = BufferPtr + 1;
  while (Cursor != BufferEnd) {
    if (*Cursor == Quote && Cursor[-1] != '\\')
      return Cursor;
    ++Cursor;
  }
  return BufferEnd;
}
190
191const char *skipWhitespace(const char *BufferPtr, const char *BufferEnd) {
192for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
193if (!isWhitespace(*BufferPtr))
194return BufferPtr;
195}
196return BufferEnd;
197}
198
199bool isWhitespace(const char *BufferPtr, const char *BufferEnd) {
200return skipWhitespace(BufferPtr, BufferEnd) == BufferEnd;
201}
202
203bool isCommandNameStartCharacter(char C) {
204return isLetter(C);
205}
206
207bool isCommandNameCharacter(char C) {
208return isAlphanumeric(C);
209}
210
211const char *skipCommandName(const char *BufferPtr, const char *BufferEnd) {
212for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
213if (!isCommandNameCharacter(*BufferPtr))
214return BufferPtr;
215}
216return BufferEnd;
217}
218
219/// Return the one past end pointer for BCPL comments.
220/// Handles newlines escaped with backslash or trigraph for backslahs.
221const char *findBCPLCommentEnd(const char *BufferPtr, const char *BufferEnd) {
222const char *CurPtr = BufferPtr;
223while (CurPtr != BufferEnd) {
224while (!isVerticalWhitespace(*CurPtr)) {
225CurPtr++;
226if (CurPtr == BufferEnd)
227return BufferEnd;
228}
229// We found a newline, check if it is escaped.
230const char *EscapePtr = CurPtr - 1;
231while(isHorizontalWhitespace(*EscapePtr))
232EscapePtr--;
233
234if (*EscapePtr == '\\' ||
235(EscapePtr - 2 >= BufferPtr && EscapePtr[0] == '/' &&
236EscapePtr[-1] == '?' && EscapePtr[-2] == '?')) {
237// We found an escaped newline.
238CurPtr = skipNewline(CurPtr, BufferEnd);
239} else
240return CurPtr; // Not an escaped newline.
241}
242return BufferEnd;
243}
244
245/// Return the one past end pointer for C comments.
246/// Very dumb, does not handle escaped newlines or trigraphs.
247const char *findCCommentEnd(const char *BufferPtr, const char *BufferEnd) {
248for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
249if (*BufferPtr == '*') {
250assert(BufferPtr + 1 != BufferEnd);
251if (*(BufferPtr + 1) == '/')
252return BufferPtr;
253}
254}
255llvm_unreachable("buffer end hit before '*/' was seen");
256}
257
258} // end anonymous namespace
259
260void Lexer::formTokenWithChars(Token &Result, const char *TokEnd,
261tok::TokenKind Kind) {
262const unsigned TokLen = TokEnd - BufferPtr;
263Result.setLocation(getSourceLocation(BufferPtr));
264Result.setKind(Kind);
265Result.setLength(TokLen);
266#ifndef NDEBUG
267Result.TextPtr = "<UNSET>";
268Result.IntVal = 7;
269#endif
270BufferPtr = TokEnd;
271}
272
273const char *Lexer::skipTextToken() {
274const char *TokenPtr = BufferPtr;
275assert(TokenPtr < CommentEnd);
276StringRef TokStartSymbols = ParseCommands ? "\n\r\\@\"&<" : "\n\r";
277
278again:
279size_t End =
280StringRef(TokenPtr, CommentEnd - TokenPtr).find_first_of(TokStartSymbols);
281if (End == StringRef::npos)
282return CommentEnd;
283
284// Doxygen doesn't recognize any commands in a one-line double quotation.
285// If we don't find an ending quotation mark, we pretend it never began.
286if (*(TokenPtr + End) == '\"') {
287TokenPtr += End + 1;
288End = StringRef(TokenPtr, CommentEnd - TokenPtr).find_first_of("\n\r\"");
289if (End != StringRef::npos && *(TokenPtr + End) == '\"')
290TokenPtr += End + 1;
291goto again;
292}
293return TokenPtr + End;
294}
295
296void Lexer::lexCommentText(Token &T) {
297assert(CommentState == LCS_InsideBCPLComment ||
298CommentState == LCS_InsideCComment);
299
300// Handles lexing non-command text, i.e. text and newline.
301auto HandleNonCommandToken = [&]() -> void {
302assert(State == LS_Normal);
303
304const char *TokenPtr = BufferPtr;
305assert(TokenPtr < CommentEnd);
306switch (*TokenPtr) {
307case '\n':
308case '\r':
309TokenPtr = skipNewline(TokenPtr, CommentEnd);
310formTokenWithChars(T, TokenPtr, tok::newline);
311
312if (CommentState == LCS_InsideCComment)
313skipLineStartingDecorations();
314return;
315
316default:
317return formTextToken(T, skipTextToken());
318}
319};
320
321if (!ParseCommands)
322return HandleNonCommandToken();
323
324switch (State) {
325case LS_Normal:
326break;
327case LS_VerbatimBlockFirstLine:
328lexVerbatimBlockFirstLine(T);
329return;
330case LS_VerbatimBlockBody:
331lexVerbatimBlockBody(T);
332return;
333case LS_VerbatimLineText:
334lexVerbatimLineText(T);
335return;
336case LS_HTMLStartTag:
337lexHTMLStartTag(T);
338return;
339case LS_HTMLEndTag:
340lexHTMLEndTag(T);
341return;
342}
343
344assert(State == LS_Normal);
345const char *TokenPtr = BufferPtr;
346assert(TokenPtr < CommentEnd);
347switch(*TokenPtr) {
348case '\\':
349case '@': {
350// Commands that start with a backslash and commands that start with
351// 'at' have equivalent semantics. But we keep information about the
352// exact syntax in AST for comments.
353tok::TokenKind CommandKind =
354(*TokenPtr == '@') ? tok::at_command : tok::backslash_command;
355TokenPtr++;
356if (TokenPtr == CommentEnd) {
357formTextToken(T, TokenPtr);
358return;
359}
360char C = *TokenPtr;
361switch (C) {
362default:
363break;
364
365case '\\': case '@': case '&': case '$':
366case '#': case '<': case '>': case '%':
367case '\"': case '.': case ':':
368// This is one of \\ \@ \& \$ etc escape sequences.
369TokenPtr++;
370if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') {
371// This is the \:: escape sequence.
372TokenPtr++;
373}
374StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1));
375formTokenWithChars(T, TokenPtr, tok::text);
376T.setText(UnescapedText);
377return;
378}
379
380// Don't make zero-length commands.
381if (!isCommandNameStartCharacter(*TokenPtr)) {
382formTextToken(T, TokenPtr);
383return;
384}
385
386TokenPtr = skipCommandName(TokenPtr, CommentEnd);
387unsigned Length = TokenPtr - (BufferPtr + 1);
388
389// Hardcoded support for lexing LaTeX formula commands
390// \f$ \f( \f) \f[ \f] \f{ \f} as a single command.
391if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) {
392C = *TokenPtr;
393if (C == '$' || C == '(' || C == ')' || C == '[' || C == ']' ||
394C == '{' || C == '}') {
395TokenPtr++;
396Length++;
397}
398}
399
400StringRef CommandName(BufferPtr + 1, Length);
401
402const CommandInfo *Info = Traits.getCommandInfoOrNULL(CommandName);
403if (!Info) {
404if ((Info = Traits.getTypoCorrectCommandInfo(CommandName))) {
405StringRef CorrectedName = Info->Name;
406SourceLocation Loc = getSourceLocation(BufferPtr);
407SourceLocation EndLoc = getSourceLocation(TokenPtr);
408SourceRange FullRange = SourceRange(Loc, EndLoc);
409SourceRange CommandRange(Loc.getLocWithOffset(1), EndLoc);
410Diag(Loc, diag::warn_correct_comment_command_name)
411<< FullRange << CommandName << CorrectedName
412<< FixItHint::CreateReplacement(CommandRange, CorrectedName);
413} else {
414formTokenWithChars(T, TokenPtr, tok::unknown_command);
415T.setUnknownCommandName(CommandName);
416Diag(T.getLocation(), diag::warn_unknown_comment_command_name)
417<< SourceRange(T.getLocation(), T.getEndLocation());
418return;
419}
420}
421if (Info->IsVerbatimBlockCommand) {
422setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, Info);
423return;
424}
425if (Info->IsVerbatimLineCommand) {
426setupAndLexVerbatimLine(T, TokenPtr, Info);
427return;
428}
429formTokenWithChars(T, TokenPtr, CommandKind);
430T.setCommandID(Info->getID());
431return;
432}
433
434case '&':
435lexHTMLCharacterReference(T);
436return;
437
438case '<': {
439TokenPtr++;
440if (TokenPtr == CommentEnd) {
441formTextToken(T, TokenPtr);
442return;
443}
444const char C = *TokenPtr;
445if (isHTMLIdentifierStartingCharacter(C))
446setupAndLexHTMLStartTag(T);
447else if (C == '/')
448setupAndLexHTMLEndTag(T);
449else
450formTextToken(T, TokenPtr);
451return;
452}
453
454default:
455return HandleNonCommandToken();
456}
457}
458
459void Lexer::setupAndLexVerbatimBlock(Token &T,
460const char *TextBegin,
461char Marker, const CommandInfo *Info) {
462assert(Info->IsVerbatimBlockCommand);
463
464VerbatimBlockEndCommandName.clear();
465VerbatimBlockEndCommandName.append(Marker == '\\' ? "\\" : "@");
466VerbatimBlockEndCommandName.append(Info->EndCommandName);
467
468formTokenWithChars(T, TextBegin, tok::verbatim_block_begin);
469T.setVerbatimBlockID(Info->getID());
470
471// If there is a newline following the verbatim opening command, skip the
472// newline so that we don't create an tok::verbatim_block_line with empty
473// text content.
474if (BufferPtr != CommentEnd &&
475isVerticalWhitespace(*BufferPtr)) {
476BufferPtr = skipNewline(BufferPtr, CommentEnd);
477State = LS_VerbatimBlockBody;
478return;
479}
480
481State = LS_VerbatimBlockFirstLine;
482}
483
484void Lexer::lexVerbatimBlockFirstLine(Token &T) {
485again:
486assert(BufferPtr < CommentEnd);
487
488// FIXME: It would be better to scan the text once, finding either the block
489// end command or newline.
490//
491// Extract current line.
492const char *Newline = findNewline(BufferPtr, CommentEnd);
493StringRef Line(BufferPtr, Newline - BufferPtr);
494
495// Look for end command in current line.
496size_t Pos = Line.find(VerbatimBlockEndCommandName);
497const char *TextEnd;
498const char *NextLine;
499if (Pos == StringRef::npos) {
500// Current line is completely verbatim.
501TextEnd = Newline;
502NextLine = skipNewline(Newline, CommentEnd);
503} else if (Pos == 0) {
504// Current line contains just an end command.
505const char *End = BufferPtr + VerbatimBlockEndCommandName.size();
506StringRef Name(BufferPtr + 1, End - (BufferPtr + 1));
507formTokenWithChars(T, End, tok::verbatim_block_end);
508T.setVerbatimBlockID(Traits.getCommandInfo(Name)->getID());
509State = LS_Normal;
510return;
511} else {
512// There is some text, followed by end command. Extract text first.
513TextEnd = BufferPtr + Pos;
514NextLine = TextEnd;
515// If there is only whitespace before end command, skip whitespace.
516if (isWhitespace(BufferPtr, TextEnd)) {
517BufferPtr = TextEnd;
518goto again;
519}
520}
521
522StringRef Text(BufferPtr, TextEnd - BufferPtr);
523formTokenWithChars(T, NextLine, tok::verbatim_block_line);
524T.setVerbatimBlockText(Text);
525
526State = LS_VerbatimBlockBody;
527}
528
529void Lexer::lexVerbatimBlockBody(Token &T) {
530assert(State == LS_VerbatimBlockBody);
531
532if (CommentState == LCS_InsideCComment)
533skipLineStartingDecorations();
534
535if (BufferPtr == CommentEnd) {
536formTokenWithChars(T, BufferPtr, tok::verbatim_block_line);
537T.setVerbatimBlockText("");
538return;
539}
540
541lexVerbatimBlockFirstLine(T);
542}
543
544void Lexer::setupAndLexVerbatimLine(Token &T, const char *TextBegin,
545const CommandInfo *Info) {
546assert(Info->IsVerbatimLineCommand);
547formTokenWithChars(T, TextBegin, tok::verbatim_line_name);
548T.setVerbatimLineID(Info->getID());
549
550State = LS_VerbatimLineText;
551}
552
553void Lexer::lexVerbatimLineText(Token &T) {
554assert(State == LS_VerbatimLineText);
555
556// Extract current line.
557const char *Newline = findNewline(BufferPtr, CommentEnd);
558StringRef Text(BufferPtr, Newline - BufferPtr);
559formTokenWithChars(T, Newline, tok::verbatim_line_text);
560T.setVerbatimLineText(Text);
561
562State = LS_Normal;
563}
564
565void Lexer::lexHTMLCharacterReference(Token &T) {
566const char *TokenPtr = BufferPtr;
567assert(*TokenPtr == '&');
568TokenPtr++;
569if (TokenPtr == CommentEnd) {
570formTextToken(T, TokenPtr);
571return;
572}
573const char *NamePtr;
574bool isNamed = false;
575bool isDecimal = false;
576char C = *TokenPtr;
577if (isHTMLNamedCharacterReferenceCharacter(C)) {
578NamePtr = TokenPtr;
579TokenPtr = skipNamedCharacterReference(TokenPtr, CommentEnd);
580isNamed = true;
581} else if (C == '#') {
582TokenPtr++;
583if (TokenPtr == CommentEnd) {
584formTextToken(T, TokenPtr);
585return;
586}
587C = *TokenPtr;
588if (isHTMLDecimalCharacterReferenceCharacter(C)) {
589NamePtr = TokenPtr;
590TokenPtr = skipDecimalCharacterReference(TokenPtr, CommentEnd);
591isDecimal = true;
592} else if (C == 'x' || C == 'X') {
593TokenPtr++;
594NamePtr = TokenPtr;
595TokenPtr = skipHexCharacterReference(TokenPtr, CommentEnd);
596} else {
597formTextToken(T, TokenPtr);
598return;
599}
600} else {
601formTextToken(T, TokenPtr);
602return;
603}
604if (NamePtr == TokenPtr || TokenPtr == CommentEnd ||
605*TokenPtr != ';') {
606formTextToken(T, TokenPtr);
607return;
608}
609StringRef Name(NamePtr, TokenPtr - NamePtr);
610TokenPtr++; // Skip semicolon.
611StringRef Resolved;
612if (isNamed)
613Resolved = resolveHTMLNamedCharacterReference(Name);
614else if (isDecimal)
615Resolved = resolveHTMLDecimalCharacterReference(Name);
616else
617Resolved = resolveHTMLHexCharacterReference(Name);
618
619if (Resolved.empty()) {
620formTextToken(T, TokenPtr);
621return;
622}
623formTokenWithChars(T, TokenPtr, tok::text);
624T.setText(Resolved);
625}
626
627void Lexer::setupAndLexHTMLStartTag(Token &T) {
628assert(BufferPtr[0] == '<' &&
629isHTMLIdentifierStartingCharacter(BufferPtr[1]));
630const char *TagNameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd);
631StringRef Name(BufferPtr + 1, TagNameEnd - (BufferPtr + 1));
632if (!isHTMLTagName(Name)) {
633formTextToken(T, TagNameEnd);
634return;
635}
636
637formTokenWithChars(T, TagNameEnd, tok::html_start_tag);
638T.setHTMLTagStartName(Name);
639
640BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
641
642const char C = *BufferPtr;
643if (BufferPtr != CommentEnd &&
644(C == '>' || C == '/' || isHTMLIdentifierStartingCharacter(C)))
645State = LS_HTMLStartTag;
646}
647
648void Lexer::lexHTMLStartTag(Token &T) {
649assert(State == LS_HTMLStartTag);
650
651const char *TokenPtr = BufferPtr;
652char C = *TokenPtr;
653if (isHTMLIdentifierCharacter(C)) {
654TokenPtr = skipHTMLIdentifier(TokenPtr, CommentEnd);
655StringRef Ident(BufferPtr, TokenPtr - BufferPtr);
656formTokenWithChars(T, TokenPtr, tok::html_ident);
657T.setHTMLIdent(Ident);
658} else {
659switch (C) {
660case '=':
661TokenPtr++;
662formTokenWithChars(T, TokenPtr, tok::html_equals);
663break;
664case '\"':
665case '\'': {
666const char *OpenQuote = TokenPtr;
667TokenPtr = skipHTMLQuotedString(TokenPtr, CommentEnd);
668const char *ClosingQuote = TokenPtr;
669if (TokenPtr != CommentEnd) // Skip closing quote.
670TokenPtr++;
671formTokenWithChars(T, TokenPtr, tok::html_quoted_string);
672T.setHTMLQuotedString(StringRef(OpenQuote + 1,
673ClosingQuote - (OpenQuote + 1)));
674break;
675}
676case '>':
677TokenPtr++;
678formTokenWithChars(T, TokenPtr, tok::html_greater);
679State = LS_Normal;
680return;
681case '/':
682TokenPtr++;
683if (TokenPtr != CommentEnd && *TokenPtr == '>') {
684TokenPtr++;
685formTokenWithChars(T, TokenPtr, tok::html_slash_greater);
686} else
687formTextToken(T, TokenPtr);
688
689State = LS_Normal;
690return;
691}
692}
693
694// Now look ahead and return to normal state if we don't see any HTML tokens
695// ahead.
696BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
697if (BufferPtr == CommentEnd) {
698State = LS_Normal;
699return;
700}
701
702C = *BufferPtr;
703if (!isHTMLIdentifierStartingCharacter(C) &&
704C != '=' && C != '\"' && C != '\'' && C != '>' && C != '/') {
705State = LS_Normal;
706return;
707}
708}
709
710void Lexer::setupAndLexHTMLEndTag(Token &T) {
711assert(BufferPtr[0] == '<' && BufferPtr[1] == '/');
712
713const char *TagNameBegin = skipWhitespace(BufferPtr + 2, CommentEnd);
714const char *TagNameEnd = skipHTMLIdentifier(TagNameBegin, CommentEnd);
715StringRef Name(TagNameBegin, TagNameEnd - TagNameBegin);
716if (!isHTMLTagName(Name)) {
717formTextToken(T, TagNameEnd);
718return;
719}
720
721const char *End = skipWhitespace(TagNameEnd, CommentEnd);
722
723formTokenWithChars(T, End, tok::html_end_tag);
724T.setHTMLTagEndName(Name);
725
726if (BufferPtr != CommentEnd && *BufferPtr == '>')
727State = LS_HTMLEndTag;
728}
729
730void Lexer::lexHTMLEndTag(Token &T) {
731assert(BufferPtr != CommentEnd && *BufferPtr == '>');
732
733formTokenWithChars(T, BufferPtr + 1, tok::html_greater);
734State = LS_Normal;
735}
736
737Lexer::Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags,
738const CommandTraits &Traits, SourceLocation FileLoc,
739const char *BufferStart, const char *BufferEnd, bool ParseCommands)
740: Allocator(Allocator), Diags(Diags), Traits(Traits),
741BufferStart(BufferStart), BufferEnd(BufferEnd), BufferPtr(BufferStart),
742FileLoc(FileLoc), ParseCommands(ParseCommands),
743CommentState(LCS_BeforeComment), State(LS_Normal) {}
744
745void Lexer::lex(Token &T) {
746again:
747switch (CommentState) {
748case LCS_BeforeComment:
749if (BufferPtr == BufferEnd) {
750formTokenWithChars(T, BufferPtr, tok::eof);
751return;
752}
753
754assert(*BufferPtr == '/');
755BufferPtr++; // Skip first slash.
756switch(*BufferPtr) {
757case '/': { // BCPL comment.
758BufferPtr++; // Skip second slash.
759
760if (BufferPtr != BufferEnd) {
761// Skip Doxygen magic marker, if it is present.
762// It might be missing because of a typo //< or /*<, or because we
763// merged this non-Doxygen comment into a bunch of Doxygen comments
764// around it: /** ... */ /* ... */ /** ... */
765const char C = *BufferPtr;
766if (C == '/' || C == '!')
767BufferPtr++;
768}
769
770// Skip less-than symbol that marks trailing comments.
771// Skip it even if the comment is not a Doxygen one, because //< and /*<
772// are frequent typos.
773if (BufferPtr != BufferEnd && *BufferPtr == '<')
774BufferPtr++;
775
776CommentState = LCS_InsideBCPLComment;
777if (State != LS_VerbatimBlockBody && State != LS_VerbatimBlockFirstLine)
778State = LS_Normal;
779CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd);
780goto again;
781}
782case '*': { // C comment.
783BufferPtr++; // Skip star.
784
785// Skip Doxygen magic marker.
786const char C = *BufferPtr;
787if ((C == '*' && *(BufferPtr + 1) != '/') || C == '!')
788BufferPtr++;
789
790// Skip less-than symbol that marks trailing comments.
791if (BufferPtr != BufferEnd && *BufferPtr == '<')
792BufferPtr++;
793
794CommentState = LCS_InsideCComment;
795State = LS_Normal;
796CommentEnd = findCCommentEnd(BufferPtr, BufferEnd);
797goto again;
798}
799default:
800llvm_unreachable("second character of comment should be '/' or '*'");
801}
802
803case LCS_BetweenComments: {
804// Consecutive comments are extracted only if there is only whitespace
805// between them. So we can search for the start of the next comment.
806const char *EndWhitespace = BufferPtr;
807while(EndWhitespace != BufferEnd && *EndWhitespace != '/')
808EndWhitespace++;
809
810// Turn any whitespace between comments (and there is only whitespace
811// between them -- guaranteed by comment extraction) into a newline. We
812// have two newlines between C comments in total (first one was synthesized
813// after a comment).
814formTokenWithChars(T, EndWhitespace, tok::newline);
815
816CommentState = LCS_BeforeComment;
817break;
818}
819
820case LCS_InsideBCPLComment:
821case LCS_InsideCComment:
822if (BufferPtr != CommentEnd) {
823lexCommentText(T);
824break;
825} else {
826// Skip C comment closing sequence.
827if (CommentState == LCS_InsideCComment) {
828assert(BufferPtr[0] == '*' && BufferPtr[1] == '/');
829BufferPtr += 2;
830assert(BufferPtr <= BufferEnd);
831
832// Synthenize newline just after the C comment, regardless if there is
833// actually a newline.
834formTokenWithChars(T, BufferPtr, tok::newline);
835
836CommentState = LCS_BetweenComments;
837break;
838} else {
839// Don't synthesized a newline after BCPL comment.
840CommentState = LCS_BetweenComments;
841goto again;
842}
843}
844}
845}
846
847StringRef Lexer::getSpelling(const Token &Tok,
848const SourceManager &SourceMgr) const {
849SourceLocation Loc = Tok.getLocation();
850std::pair<FileID, unsigned> LocInfo = SourceMgr.getDecomposedLoc(Loc);
851
852bool InvalidTemp = false;
853StringRef File = SourceMgr.getBufferData(LocInfo.first, &InvalidTemp);
854if (InvalidTemp)
855return StringRef();
856
857const char *Begin = File.data() + LocInfo.second;
858return StringRef(Begin, Tok.getLength());
859}
860
861} // end namespace comments
862} // end namespace clang
863