podman
1283 строки · 30.2 Кб
1package toml
2
3import (
4"fmt"
5"reflect"
6"runtime"
7"strings"
8"unicode"
9"unicode/utf8"
10)
11
// itemType identifies the kind of token emitted by the lexer.
type itemType int

const (
	itemError itemType = iota
	itemNIL // used in the parser to indicate no type
	itemEOF
	itemText
	itemString
	itemRawString
	itemMultilineString
	itemRawMultilineString
	itemBool
	itemInteger
	itemFloat
	itemDatetime
	itemArray // the start of an array
	itemArrayEnd
	itemTableStart
	itemTableEnd
	itemArrayTableStart
	itemArrayTableEnd
	itemKeyStart
	itemKeyEnd
	itemCommentStart
	itemInlineTableStart
	itemInlineTableEnd
)

// eof is the sentinel rune returned by next() when the input is exhausted.
const eof = 0

// stateFn is one step of the lexer's state machine: it consumes some input
// and returns the next state to run (nil terminates lexing).
type stateFn func(lx *lexer) stateFn
43
44func (p Position) String() string {
45return fmt.Sprintf("at line %d; start %d; length %d", p.Line, p.Start, p.Len)
46}
47
// lexer holds the scanning state for one TOML document. State functions
// consume the input rune by rune and send completed tokens on items.
type lexer struct {
	input    string    // full TOML source being scanned
	start    int       // byte offset where the current item begins
	pos      int       // byte offset one past the last rune consumed
	line     int       // 1-based line number at pos
	state    stateFn   // next state function to run
	items    chan item // buffered stream of scanned tokens
	tomlNext bool      // enable TOML 1.1 ("next") features

	// Allow for backing up up to 4 runes. This is necessary because TOML
	// contains 3-rune tokens (""" and ''').
	prevWidths [4]int
	nprev      int  // how many of prevWidths are in use
	atEOF      bool // If we emit an eof, we can still back up, but it is not OK to call next again.

	// A stack of state functions used to maintain context.
	//
	// The idea is to reuse parts of the state machine in various places. For
	// example, values can appear at the top level or within arbitrarily nested
	// arrays. The last state on the stack is used after a value has been lexed.
	// Similarly for comments.
	stack []stateFn
}

// item is one token produced by the lexer: its type, raw text, position,
// and (for itemError) the error value.
type item struct {
	typ itemType
	val string
	err error
	pos Position
}
78
// nextItem returns the next scanned token, stepping the state machine as
// many times as needed until one is available on the items channel.
func (lx *lexer) nextItem() item {
	for {
		select {
		case item := <-lx.items:
			return item
		default:
			// Nothing buffered yet: advance the state machine one step.
			lx.state = lx.state(lx)
			//fmt.Printf("  STATE %-24s  current: %-10s  stack: %s\n", lx.state, lx.current(), lx.stack)
		}
	}
}
90
91func lex(input string, tomlNext bool) *lexer {
92lx := &lexer{
93input: input,
94state: lexTop,
95items: make(chan item, 10),
96stack: make([]stateFn, 0, 10),
97line: 1,
98tomlNext: tomlNext,
99}
100return lx
101}
102
103func (lx *lexer) push(state stateFn) {
104lx.stack = append(lx.stack, state)
105}
106
107func (lx *lexer) pop() stateFn {
108if len(lx.stack) == 0 {
109return lx.errorf("BUG in lexer: no states to pop")
110}
111last := lx.stack[len(lx.stack)-1]
112lx.stack = lx.stack[0 : len(lx.stack)-1]
113return last
114}
115
// current returns the text of the item scanned so far (start to pos).
func (lx *lexer) current() string {
	return lx.input[lx.start:lx.pos]
}

// getPos returns the position of the item currently being scanned. The
// length is clamped to at least 1 so error spans are never empty. Note the
// value receiver: the lexer itself is not modified.
func (lx lexer) getPos() Position {
	p := Position{
		Line:  lx.line,
		Start: lx.start,
		Len:   lx.pos - lx.start,
	}
	if p.Len <= 0 {
		p.Len = 1
	}
	return p
}
131
// emit sends the pending text as an item of the given type and marks it
// consumed.
func (lx *lexer) emit(typ itemType) {
	// Needed for multiline strings ending with an incomplete UTF-8 sequence.
	if lx.start > lx.pos {
		lx.error(errLexUTF8{lx.input[lx.pos]})
		return
	}
	lx.items <- item{typ: typ, pos: lx.getPos(), val: lx.current()}
	lx.start = lx.pos
}

// emitTrim is like emit but trims surrounding whitespace from the value
// (used by lexDatetime, which may consume a trailing space).
func (lx *lexer) emitTrim(typ itemType) {
	lx.items <- item{typ: typ, pos: lx.getPos(), val: strings.TrimSpace(lx.current())}
	lx.start = lx.pos
}
146
// next consumes and returns the next rune, returning the eof sentinel at
// end of input. It maintains the line counter and the 4-entry width
// history that backup() relies on, and rejects invalid UTF-8, bare control
// characters, and '\r' not followed by '\n'.
func (lx *lexer) next() (r rune) {
	if lx.atEOF {
		panic("BUG in lexer: next called after EOF")
	}
	if lx.pos >= len(lx.input) {
		lx.atEOF = true
		return eof
	}

	if lx.input[lx.pos] == '\n' {
		lx.line++
	}
	// Shift the rune-width history; the current width is filled in below.
	lx.prevWidths[3] = lx.prevWidths[2]
	lx.prevWidths[2] = lx.prevWidths[1]
	lx.prevWidths[1] = lx.prevWidths[0]
	if lx.nprev < 4 {
		lx.nprev++
	}

	r, w := utf8.DecodeRuneInString(lx.input[lx.pos:])
	if r == utf8.RuneError {
		lx.error(errLexUTF8{lx.input[lx.pos]})
		return utf8.RuneError
	}

	// Note: don't use peek() here, as this calls next().
	if isControl(r) || (r == '\r' && (len(lx.input)-1 == lx.pos || lx.input[lx.pos+1] != '\n')) {
		lx.errorControlChar(r)
		return utf8.RuneError
	}

	lx.prevWidths[0] = w
	lx.pos += w
	return r
}
182
// ignore skips over the pending input before this point.
func (lx *lexer) ignore() {
	lx.start = lx.pos
}

// backup steps back one rune. Can be called 4 times between calls to next.
func (lx *lexer) backup() {
	if lx.atEOF {
		// The eof sentinel consumed no input; just clear the flag.
		lx.atEOF = false
		return
	}
	if lx.nprev < 1 {
		panic("BUG in lexer: backed up too far")
	}
	w := lx.prevWidths[0]
	lx.prevWidths[0] = lx.prevWidths[1]
	lx.prevWidths[1] = lx.prevWidths[2]
	lx.prevWidths[2] = lx.prevWidths[3]
	lx.nprev--

	lx.pos -= w
	// Un-count the line increment if we backed over a newline.
	if lx.pos < len(lx.input) && lx.input[lx.pos] == '\n' {
		lx.line--
	}
}

// accept consumes the next rune if it's equal to `valid`.
func (lx *lexer) accept(valid rune) bool {
	if lx.next() == valid {
		return true
	}
	lx.backup()
	return false
}

// peek returns but does not consume the next rune in the input.
func (lx *lexer) peek() rune {
	r := lx.next()
	lx.backup()
	return r
}

// skip ignores all input that matches the given predicate.
func (lx *lexer) skip(pred func(rune) bool) {
	for {
		r := lx.next()
		if pred(r) {
			continue
		}
		lx.backup()
		lx.ignore()
		return
	}
}
237
// error stops all lexing by emitting an error and returning `nil`.
//
// Note that any value that is a character is escaped if it's a special
// character (newlines, tabs, etc.).
func (lx *lexer) error(err error) stateFn {
	// At EOF the position points past the input; report on the previous line.
	if lx.atEOF {
		return lx.errorPrevLine(err)
	}
	lx.items <- item{typ: itemError, pos: lx.getPos(), err: err}
	return nil
}

// errorPrevLine is like error(), but sets the position to the last column of
// the previous line.
//
// This is so that unexpected EOF or NL errors don't show on a new blank line.
func (lx *lexer) errorPrevLine(err error) stateFn {
	pos := lx.getPos()
	pos.Line--
	pos.Len = 1
	pos.Start = lx.pos - 1
	lx.items <- item{typ: itemError, pos: pos, err: err}
	return nil
}

// errorPos is like error(), but allows explicitly setting the position.
func (lx *lexer) errorPos(start, length int, err error) stateFn {
	pos := lx.getPos()
	pos.Start = start
	pos.Len = length
	lx.items <- item{typ: itemError, pos: pos, err: err}
	return nil
}
271
272// errorf is like error, and creates a new error.
273func (lx *lexer) errorf(format string, values ...interface{}) stateFn {
274if lx.atEOF {
275pos := lx.getPos()
276pos.Line--
277pos.Len = 1
278pos.Start = lx.pos - 1
279lx.items <- item{typ: itemError, pos: pos, err: fmt.Errorf(format, values...)}
280return nil
281}
282lx.items <- item{typ: itemError, pos: lx.getPos(), err: fmt.Errorf(format, values...)}
283return nil
284}
285
286func (lx *lexer) errorControlChar(cc rune) stateFn {
287return lx.errorPos(lx.pos-1, 1, errLexControl{cc})
288}
289
// lexTop consumes elements at the top level of TOML data.
func lexTop(lx *lexer) stateFn {
	r := lx.next()
	if isWhitespace(r) || isNL(r) {
		return lexSkip(lx, lexTop)
	}
	switch r {
	case '#':
		lx.push(lexTop)
		return lexCommentStart
	case '[':
		return lexTableStart
	case eof:
		// Pending unemitted input at EOF means a truncated document.
		if lx.pos > lx.start {
			return lx.errorf("unexpected EOF")
		}
		lx.emit(itemEOF)
		return nil
	}

	// At this point, the only valid item can be a key, so we back up
	// and let the key lexer do the rest.
	lx.backup()
	lx.push(lexTopEnd)
	return lexKeyStart
}
316
// lexTopEnd is entered whenever a top-level item has been consumed. (A value
// or a table.) It must see only whitespace, and will turn back to lexTop
// upon a newline. If it sees EOF, it will quit the lexer successfully.
func lexTopEnd(lx *lexer) stateFn {
	r := lx.next()
	switch {
	case r == '#':
		// a comment will read to a newline for us.
		lx.push(lexTop)
		return lexCommentStart
	case isWhitespace(r):
		return lexTopEnd
	case isNL(r):
		lx.ignore()
		return lexTop
	case r == eof:
		lx.emit(itemEOF)
		return nil
	}
	return lx.errorf(
		"expected a top-level item to end with a newline, comment, or EOF, but got %q instead",
		r)
}
340
// lexTableStart lexes the beginning of a table. Namely, it makes sure that
// it starts with a character other than '.' and ']'.
// It assumes that '[' has already been consumed.
// It also handles the case that this is an item in an array of tables.
// e.g., '[[name]]'.
func lexTableStart(lx *lexer) stateFn {
	// A second '[' distinguishes an array-of-tables header from a table.
	if lx.peek() == '[' {
		lx.next()
		lx.emit(itemArrayTableStart)
		lx.push(lexArrayTableEnd)
	} else {
		lx.emit(itemTableStart)
		lx.push(lexTableEnd)
	}
	return lexTableNameStart
}
357
358func lexTableEnd(lx *lexer) stateFn {
359lx.emit(itemTableEnd)
360return lexTopEnd
361}
362
363func lexArrayTableEnd(lx *lexer) stateFn {
364if r := lx.next(); r != ']' {
365return lx.errorf("expected end of table array name delimiter ']', but got %q instead", r)
366}
367lx.emit(itemArrayTableEnd)
368return lexTopEnd
369}
370
// lexTableNameStart lexes the beginning of one dotted part of a table name,
// dispatching to the quoted or bare name lexer.
func lexTableNameStart(lx *lexer) stateFn {
	lx.skip(isWhitespace)
	switch r := lx.peek(); {
	case r == ']' || r == eof:
		return lx.errorf("unexpected end of table name (table names cannot be empty)")
	case r == '.':
		return lx.errorf("unexpected table separator (table names cannot be empty)")
	case r == '"' || r == '\'':
		lx.ignore()
		lx.push(lexTableNameEnd)
		return lexQuotedName
	default:
		lx.push(lexTableNameEnd)
		return lexBareName
	}
}

// lexTableNameEnd reads the end of a piece of a table name, optionally
// consuming whitespace.
func lexTableNameEnd(lx *lexer) stateFn {
	lx.skip(isWhitespace)
	switch r := lx.next(); {
	case isWhitespace(r):
		return lexTableNameEnd
	case r == '.':
		// Dotted name: lex the next part.
		lx.ignore()
		return lexTableNameStart
	case r == ']':
		// End of the header; the pushed state emits itemTableEnd or
		// continues with lexArrayTableEnd.
		return lx.pop()
	default:
		return lx.errorf("expected '.' or ']' to end table name, but got %q instead", r)
	}
}
404
405// lexBareName lexes one part of a key or table.
406//
407// It assumes that at least one valid character for the table has already been
408// read.
409//
410// Lexes only one part, e.g. only 'a' inside 'a.b'.
411func lexBareName(lx *lexer) stateFn {
412r := lx.next()
413if isBareKeyChar(r, lx.tomlNext) {
414return lexBareName
415}
416lx.backup()
417lx.emit(itemText)
418return lx.pop()
419}
420
// lexQuotedName lexes one quoted part of a key or table.
//
// It assumes that at least one valid character for the table has already been
// read.
//
// Lexes only one part, e.g. only '"a"' inside '"a".b'.
func lexQuotedName(lx *lexer) stateFn {
	r := lx.next()
	switch {
	case isWhitespace(r):
		return lexSkip(lx, lexValue)
	case r == '"':
		lx.ignore() // ignore the '"'
		return lexString
	case r == '\'':
		lx.ignore() // ignore the "'"
		return lexRawString
	case r == eof:
		return lx.errorf("unexpected EOF; expected value")
	default:
		return lx.errorf("expected value but found %q instead", r)
	}
}
444
// lexKeyStart consumes all key parts until a '='.
func lexKeyStart(lx *lexer) stateFn {
	lx.skip(isWhitespace)
	switch r := lx.peek(); {
	case r == '=' || r == eof:
		return lx.errorf("unexpected '=': key name appears blank")
	case r == '.':
		return lx.errorf("unexpected '.': keys cannot start with a '.'")
	case r == '"' || r == '\'':
		lx.ignore()
		fallthrough
	default: // Bare key
		lx.emit(itemKeyStart)
		return lexKeyNameStart
	}
}

// lexKeyNameStart lexes one (possibly dotted) part of a key name.
func lexKeyNameStart(lx *lexer) stateFn {
	lx.skip(isWhitespace)
	switch r := lx.peek(); {
	case r == '=' || r == eof:
		return lx.errorf("unexpected '='")
	case r == '.':
		return lx.errorf("unexpected '.'")
	case r == '"' || r == '\'':
		lx.ignore()
		lx.push(lexKeyEnd)
		return lexQuotedName
	default:
		lx.push(lexKeyEnd)
		return lexBareName
	}
}

// lexKeyEnd consumes the end of a key and trims whitespace (up to the key
// separator).
func lexKeyEnd(lx *lexer) stateFn {
	lx.skip(isWhitespace)
	switch r := lx.next(); {
	case isWhitespace(r):
		return lexSkip(lx, lexKeyEnd)
	case r == eof:
		return lx.errorf("unexpected EOF; expected key separator '='")
	case r == '.':
		// Dotted key: lex the next part.
		lx.ignore()
		return lexKeyNameStart
	case r == '=':
		lx.emit(itemKeyEnd)
		return lexSkip(lx, lexValue)
	default:
		return lx.errorf("expected '.' or '=', but got %q instead", r)
	}
}
498
// lexValue starts the consumption of a value anywhere a value is expected.
// lexValue will ignore whitespace.
// After a value is lexed, the last state on the stack is popped and returned.
func lexValue(lx *lexer) stateFn {
	// We allow whitespace to precede a value, but NOT newlines.
	// In array syntax, the array states are responsible for ignoring newlines.
	r := lx.next()
	switch {
	case isWhitespace(r):
		return lexSkip(lx, lexValue)
	case isDigit(r):
		lx.backup() // avoid an extra state and use the same as above
		return lexNumberOrDateStart
	}
	switch r {
	case '[':
		lx.ignore()
		lx.emit(itemArray)
		return lexArrayValue
	case '{':
		lx.ignore()
		lx.emit(itemInlineTableStart)
		return lexInlineTableValue
	case '"':
		// Check for '"""' (multiline basic string); a lone '"' starts a
		// regular basic string.
		if lx.accept('"') {
			if lx.accept('"') {
				lx.ignore() // Ignore """
				return lexMultilineString
			}
			lx.backup()
		}
		lx.ignore() // ignore the '"'
		return lexString
	case '\'':
		// Check for "'''" (multiline literal string).
		if lx.accept('\'') {
			if lx.accept('\'') {
				lx.ignore() // Ignore '''
				return lexMultilineRawString
			}
			lx.backup()
		}
		lx.ignore() // ignore the "'"
		return lexRawString
	case '.': // special error case, be kind to users
		return lx.errorf("floats must start with a digit, not '.'")
	case 'i', 'n':
		// Bare 'inf' / 'nan' floats; if the match fails, r is still a
		// letter and falls through to lexBool below for a nice error.
		if (lx.accept('n') && lx.accept('f')) || (lx.accept('a') && lx.accept('n')) {
			lx.emit(itemFloat)
			return lx.pop()
		}
	case '-', '+':
		return lexDecimalNumberStart
	}
	if unicode.IsLetter(r) {
		// Be permissive here; lexBool will give a nice error if the
		// user wrote something like
		//     x = foo
		// (i.e. not 'true' or 'false' but is something else word-like.)
		lx.backup()
		return lexBool
	}
	if r == eof {
		return lx.errorf("unexpected EOF; expected value")
	}
	return lx.errorf("expected value but found %q instead", r)
}
565
// lexArrayValue consumes one value in an array. It assumes that '[' or ','
// have already been consumed. All whitespace and newlines are ignored.
func lexArrayValue(lx *lexer) stateFn {
	r := lx.next()
	switch {
	case isWhitespace(r) || isNL(r):
		return lexSkip(lx, lexArrayValue)
	case r == '#':
		lx.push(lexArrayValue)
		return lexCommentStart
	case r == ',':
		return lx.errorf("unexpected comma")
	case r == ']':
		// Empty array, or a trailing comma was consumed by lexArrayValueEnd.
		return lexArrayEnd
	}

	// Anything else must be a value.
	lx.backup()
	lx.push(lexArrayValueEnd)
	return lexValue
}

// lexArrayValueEnd consumes everything between the end of an array value and
// the next value (or the end of the array): it ignores whitespace and newlines
// and expects either a ',' or a ']'.
func lexArrayValueEnd(lx *lexer) stateFn {
	switch r := lx.next(); {
	case isWhitespace(r) || isNL(r):
		return lexSkip(lx, lexArrayValueEnd)
	case r == '#':
		lx.push(lexArrayValueEnd)
		return lexCommentStart
	case r == ',':
		lx.ignore()
		return lexArrayValue // move on to the next value
	case r == ']':
		return lexArrayEnd
	default:
		return lx.errorf("expected a comma (',') or array terminator (']'), but got %s", runeOrEOF(r))
	}
}

// lexArrayEnd finishes the lexing of an array.
// It assumes that a ']' has just been consumed.
func lexArrayEnd(lx *lexer) stateFn {
	lx.ignore()
	lx.emit(itemArrayEnd)
	return lx.pop()
}
614
// lexInlineTableValue consumes one key/value pair in an inline table.
// It assumes that '{' or ',' have already been consumed. Whitespace is ignored.
func lexInlineTableValue(lx *lexer) stateFn {
	r := lx.next()
	switch {
	case isWhitespace(r):
		return lexSkip(lx, lexInlineTableValue)
	case isNL(r):
		// TOML 1.1 allows newlines inside inline tables; 1.0 does not.
		if lx.tomlNext {
			return lexSkip(lx, lexInlineTableValue)
		}
		return lx.errorPrevLine(errLexInlineTableNL{})
	case r == '#':
		lx.push(lexInlineTableValue)
		return lexCommentStart
	case r == ',':
		return lx.errorf("unexpected comma")
	case r == '}':
		return lexInlineTableEnd
	}
	// Anything else must be a key.
	lx.backup()
	lx.push(lexInlineTableValueEnd)
	return lexKeyStart
}

// lexInlineTableValueEnd consumes everything between the end of an inline table
// key/value pair and the next pair (or the end of the table):
// it ignores whitespace and expects either a ',' or a '}'.
func lexInlineTableValueEnd(lx *lexer) stateFn {
	switch r := lx.next(); {
	case isWhitespace(r):
		return lexSkip(lx, lexInlineTableValueEnd)
	case isNL(r):
		if lx.tomlNext {
			return lexSkip(lx, lexInlineTableValueEnd)
		}
		return lx.errorPrevLine(errLexInlineTableNL{})
	case r == '#':
		lx.push(lexInlineTableValueEnd)
		return lexCommentStart
	case r == ',':
		lx.ignore()
		lx.skip(isWhitespace)
		// A comma directly before '}' is a trailing comma, legal only in
		// TOML 1.1.
		if lx.peek() == '}' {
			if lx.tomlNext {
				return lexInlineTableValueEnd
			}
			return lx.errorf("trailing comma not allowed in inline tables")
		}
		return lexInlineTableValue
	case r == '}':
		return lexInlineTableEnd
	default:
		return lx.errorf("expected a comma or an inline table terminator '}', but got %s instead", runeOrEOF(r))
	}
}
671
672func runeOrEOF(r rune) string {
673if r == eof {
674return "end of file"
675}
676return "'" + string(r) + "'"
677}
678
679// lexInlineTableEnd finishes the lexing of an inline table.
680// It assumes that a '}' has just been consumed.
681func lexInlineTableEnd(lx *lexer) stateFn {
682lx.ignore()
683lx.emit(itemInlineTableEnd)
684return lx.pop()
685}
686
// lexString consumes the inner contents of a string. It assumes that the
// beginning '"' has already been consumed and ignored.
func lexString(lx *lexer) stateFn {
	r := lx.next()
	switch {
	case r == eof:
		return lx.errorf(`unexpected EOF; expected '"'`)
	case isNL(r):
		// Literal newlines are not allowed in single-line strings.
		return lx.errorPrevLine(errLexStringNL{})
	case r == '\\':
		lx.push(lexString)
		return lexStringEscape
	case r == '"':
		// Emit everything before the closing quote, then discard it.
		lx.backup()
		lx.emit(itemString)
		lx.next()
		lx.ignore()
		return lx.pop()
	}
	return lexString
}
708
// lexMultilineString consumes the inner contents of a string. It assumes that
// the beginning '"""' has already been consumed and ignored.
func lexMultilineString(lx *lexer) stateFn {
	r := lx.next()
	switch r {
	default:
		return lexMultilineString
	case eof:
		return lx.errorf(`unexpected EOF; expected '"""'`)
	case '\\':
		return lexMultilineStringEscape
	case '"':
		/// Found " → try to read two more "".
		if lx.accept('"') {
			if lx.accept('"') {
				/// Peek ahead: the string can contain " and "", including at the
				/// end: """str"""""
				/// 6 or more at the end, however, is an error.
				if lx.peek() == '"' {
					/// Check if we already lexed 5 's; if so we have 6 now, and
					/// that's just too many man!
					///
					/// Second check is for the edge case:
					///
					///            two quotes allowed.
					///            vv
					///   """lol \""""""
					///          ^^  ^^^---- closing three
					///        escaped
					///
					/// But ugly, but it works
					if strings.HasSuffix(lx.current(), `"""""`) && !strings.HasSuffix(lx.current(), `\"""""`) {
						return lx.errorf(`unexpected '""""""'`)
					}
					lx.backup()
					lx.backup()
					return lexMultilineString
				}

				lx.backup() /// backup: don't include the """ in the item.
				lx.backup()
				lx.backup()
				lx.emit(itemMultilineString)
				lx.next() /// Read over """ again and discard it.
				lx.next()
				lx.next()
				lx.ignore()
				return lx.pop()
			}
			lx.backup()
		}
		return lexMultilineString
	}
}
763
// lexRawString consumes a raw string. Nothing can be escaped in such a string.
// It assumes that the beginning "'" has already been consumed and ignored.
func lexRawString(lx *lexer) stateFn {
	r := lx.next()
	switch {
	default:
		return lexRawString
	case r == eof:
		return lx.errorf(`unexpected EOF; expected "'"`)
	case isNL(r):
		// Literal newlines are not allowed in single-line strings.
		return lx.errorPrevLine(errLexStringNL{})
	case r == '\'':
		// Emit everything before the closing quote, then discard it.
		lx.backup()
		lx.emit(itemRawString)
		lx.next()
		lx.ignore()
		return lx.pop()
	}
}
783
// lexMultilineRawString consumes a raw string. Nothing can be escaped in such a
// string. It assumes that the beginning triple-' has already been consumed and
// ignored.
func lexMultilineRawString(lx *lexer) stateFn {
	r := lx.next()
	switch r {
	default:
		return lexMultilineRawString
	case eof:
		return lx.errorf(`unexpected EOF; expected "'''"`)
	case '\'':
		/// Found ' → try to read two more ''.
		if lx.accept('\'') {
			if lx.accept('\'') {
				/// Peek ahead: the string can contain ' and '', including at the
				/// end: '''str'''''
				/// 6 or more at the end, however, is an error.
				if lx.peek() == '\'' {
					/// Check if we already lexed 5 's; if so we have 6 now, and
					/// that's just too many man!
					if strings.HasSuffix(lx.current(), "'''''") {
						return lx.errorf(`unexpected "''''''"`)
					}
					lx.backup()
					lx.backup()
					return lexMultilineRawString
				}

				lx.backup() /// backup: don't include the ''' in the item.
				lx.backup()
				lx.backup()
				lx.emit(itemRawMultilineString)
				lx.next() /// Read over ''' again and discard it.
				lx.next()
				lx.next()
				lx.ignore()
				return lx.pop()
			}
			lx.backup()
		}
		return lexMultilineRawString
	}
}
827
// lexMultilineStringEscape consumes an escaped character. It assumes that the
// preceding '\' has already been consumed.
func lexMultilineStringEscape(lx *lexer) stateFn {
	if isNL(lx.next()) { /// \ escaping newline.
		return lexMultilineString
	}
	// Not a line continuation: treat as an ordinary escape and return to
	// the multiline state afterwards.
	lx.backup()
	lx.push(lexMultilineString)
	return lexStringEscape(lx)
}
838
839func lexStringEscape(lx *lexer) stateFn {
840r := lx.next()
841switch r {
842case 'e':
843if !lx.tomlNext {
844return lx.error(errLexEscape{r})
845}
846fallthrough
847case 'b':
848fallthrough
849case 't':
850fallthrough
851case 'n':
852fallthrough
853case 'f':
854fallthrough
855case 'r':
856fallthrough
857case '"':
858fallthrough
859case ' ', '\t':
860// Inside """ .. """ strings you can use \ to escape newlines, and any
861// amount of whitespace can be between the \ and \n.
862fallthrough
863case '\\':
864return lx.pop()
865case 'x':
866if !lx.tomlNext {
867return lx.error(errLexEscape{r})
868}
869return lexHexEscape
870case 'u':
871return lexShortUnicodeEscape
872case 'U':
873return lexLongUnicodeEscape
874}
875return lx.error(errLexEscape{r})
876}
877
878func lexHexEscape(lx *lexer) stateFn {
879var r rune
880for i := 0; i < 2; i++ {
881r = lx.next()
882if !isHexadecimal(r) {
883return lx.errorf(
884`expected two hexadecimal digits after '\x', but got %q instead`,
885lx.current())
886}
887}
888return lx.pop()
889}
890
891func lexShortUnicodeEscape(lx *lexer) stateFn {
892var r rune
893for i := 0; i < 4; i++ {
894r = lx.next()
895if !isHexadecimal(r) {
896return lx.errorf(
897`expected four hexadecimal digits after '\u', but got %q instead`,
898lx.current())
899}
900}
901return lx.pop()
902}
903
904func lexLongUnicodeEscape(lx *lexer) stateFn {
905var r rune
906for i := 0; i < 8; i++ {
907r = lx.next()
908if !isHexadecimal(r) {
909return lx.errorf(
910`expected eight hexadecimal digits after '\U', but got %q instead`,
911lx.current())
912}
913}
914return lx.pop()
915}
916
917// lexNumberOrDateStart processes the first character of a value which begins
918// with a digit. It exists to catch values starting with '0', so that
919// lexBaseNumberOrDate can differentiate base prefixed integers from other
920// types.
921func lexNumberOrDateStart(lx *lexer) stateFn {
922r := lx.next()
923switch r {
924case '0':
925return lexBaseNumberOrDate
926}
927
928if !isDigit(r) {
929// The only way to reach this state is if the value starts
930// with a digit, so specifically treat anything else as an
931// error.
932return lx.errorf("expected a digit but got %q", r)
933}
934
935return lexNumberOrDate
936}
937
// lexNumberOrDate consumes either an integer, float or datetime.
func lexNumberOrDate(lx *lexer) stateFn {
	r := lx.next()
	if isDigit(r) {
		return lexNumberOrDate
	}
	switch r {
	case '-', ':':
		// Date or time separator.
		return lexDatetime
	case '_':
		// Underscore digit separator: must be a decimal integer or float.
		return lexDecimalNumber
	case '.', 'e', 'E':
		return lexFloat
	}

	lx.backup()
	lx.emit(itemInteger)
	return lx.pop()
}
957
958// lexDatetime consumes a Datetime, to a first approximation.
959// The parser validates that it matches one of the accepted formats.
960func lexDatetime(lx *lexer) stateFn {
961r := lx.next()
962if isDigit(r) {
963return lexDatetime
964}
965switch r {
966case '-', ':', 'T', 't', ' ', '.', 'Z', 'z', '+':
967return lexDatetime
968}
969
970lx.backup()
971lx.emitTrim(itemDatetime)
972return lx.pop()
973}
974
975// lexHexInteger consumes a hexadecimal integer after seeing the '0x' prefix.
976func lexHexInteger(lx *lexer) stateFn {
977r := lx.next()
978if isHexadecimal(r) {
979return lexHexInteger
980}
981switch r {
982case '_':
983return lexHexInteger
984}
985
986lx.backup()
987lx.emit(itemInteger)
988return lx.pop()
989}
990
991// lexOctalInteger consumes an octal integer after seeing the '0o' prefix.
992func lexOctalInteger(lx *lexer) stateFn {
993r := lx.next()
994if isOctal(r) {
995return lexOctalInteger
996}
997switch r {
998case '_':
999return lexOctalInteger
1000}
1001
1002lx.backup()
1003lx.emit(itemInteger)
1004return lx.pop()
1005}
1006
1007// lexBinaryInteger consumes a binary integer after seeing the '0b' prefix.
1008func lexBinaryInteger(lx *lexer) stateFn {
1009r := lx.next()
1010if isBinary(r) {
1011return lexBinaryInteger
1012}
1013switch r {
1014case '_':
1015return lexBinaryInteger
1016}
1017
1018lx.backup()
1019lx.emit(itemInteger)
1020return lx.pop()
1021}
1022
1023// lexDecimalNumber consumes a decimal float or integer.
1024func lexDecimalNumber(lx *lexer) stateFn {
1025r := lx.next()
1026if isDigit(r) {
1027return lexDecimalNumber
1028}
1029switch r {
1030case '.', 'e', 'E':
1031return lexFloat
1032case '_':
1033return lexDecimalNumber
1034}
1035
1036lx.backup()
1037lx.emit(itemInteger)
1038return lx.pop()
1039}
1040
// lexDecimalNumberStart consumes the first digit of a number beginning with a
// sign. It assumes the sign has already been consumed. Values which start with
// a sign are only allowed to be decimal integers or floats.
//
// The special "nan" and "inf" values are also recognized.
func lexDecimalNumberStart(lx *lexer) stateFn {
	r := lx.next()

	// Special error cases to give users better error messages
	switch r {
	case 'i':
		// Signed infinity: +inf / -inf.
		if !lx.accept('n') || !lx.accept('f') {
			return lx.errorf("invalid float: '%s'", lx.current())
		}
		lx.emit(itemFloat)
		return lx.pop()
	case 'n':
		// Signed NaN: +nan / -nan.
		if !lx.accept('a') || !lx.accept('n') {
			return lx.errorf("invalid float: '%s'", lx.current())
		}
		lx.emit(itemFloat)
		return lx.pop()
	case '0':
		// A base prefix after a sign (e.g. '-0x1') is not allowed.
		p := lx.peek()
		switch p {
		case 'b', 'o', 'x':
			return lx.errorf("cannot use sign with non-decimal numbers: '%s%c'", lx.current(), p)
		}
	case '.':
		return lx.errorf("floats must start with a digit, not '.'")
	}

	if isDigit(r) {
		return lexDecimalNumber
	}

	return lx.errorf("expected a digit but got %q", r)
}
1079
1080// lexBaseNumberOrDate differentiates between the possible values which
1081// start with '0'. It assumes that before reaching this state, the initial '0'
1082// has been consumed.
1083func lexBaseNumberOrDate(lx *lexer) stateFn {
1084r := lx.next()
1085// Note: All datetimes start with at least two digits, so we don't
1086// handle date characters (':', '-', etc.) here.
1087if isDigit(r) {
1088return lexNumberOrDate
1089}
1090switch r {
1091case '_':
1092// Can only be decimal, because there can't be an underscore
1093// between the '0' and the base designator, and dates can't
1094// contain underscores.
1095return lexDecimalNumber
1096case '.', 'e', 'E':
1097return lexFloat
1098case 'b':
1099r = lx.peek()
1100if !isBinary(r) {
1101lx.errorf("not a binary number: '%s%c'", lx.current(), r)
1102}
1103return lexBinaryInteger
1104case 'o':
1105r = lx.peek()
1106if !isOctal(r) {
1107lx.errorf("not an octal number: '%s%c'", lx.current(), r)
1108}
1109return lexOctalInteger
1110case 'x':
1111r = lx.peek()
1112if !isHexadecimal(r) {
1113lx.errorf("not a hexidecimal number: '%s%c'", lx.current(), r)
1114}
1115return lexHexInteger
1116}
1117
1118lx.backup()
1119lx.emit(itemInteger)
1120return lx.pop()
1121}
1122
// lexFloat consumes the elements of a float. It allows any sequence of
// float-like characters, so floats emitted by the lexer are only a first
// approximation and must be validated by the parser.
func lexFloat(lx *lexer) stateFn {
	r := lx.next()
	if isDigit(r) {
		return lexFloat
	}
	switch r {
	case '_', '.', '-', '+', 'e', 'E':
		// Digit separator, decimal point, exponent, or exponent sign.
		return lexFloat
	}

	lx.backup()
	lx.emit(itemFloat)
	return lx.pop()
}
1140
1141// lexBool consumes a bool string: 'true' or 'false.
1142func lexBool(lx *lexer) stateFn {
1143var rs []rune
1144for {
1145r := lx.next()
1146if !unicode.IsLetter(r) {
1147lx.backup()
1148break
1149}
1150rs = append(rs, r)
1151}
1152s := string(rs)
1153switch s {
1154case "true", "false":
1155lx.emit(itemBool)
1156return lx.pop()
1157}
1158return lx.errorf("expected value but found %q instead", s)
1159}
1160
1161// lexCommentStart begins the lexing of a comment. It will emit
1162// itemCommentStart and consume no characters, passing control to lexComment.
1163func lexCommentStart(lx *lexer) stateFn {
1164lx.ignore()
1165lx.emit(itemCommentStart)
1166return lexComment
1167}
1168
1169// lexComment lexes an entire comment. It assumes that '#' has been consumed.
1170// It will consume *up to* the first newline character, and pass control
1171// back to the last state on the stack.
1172func lexComment(lx *lexer) stateFn {
1173switch r := lx.next(); {
1174case isNL(r) || r == eof:
1175lx.backup()
1176lx.emit(itemText)
1177return lx.pop()
1178default:
1179return lexComment
1180}
1181}
1182
1183// lexSkip ignores all slurped input and moves on to the next state.
1184func lexSkip(lx *lexer, nextState stateFn) stateFn {
1185lx.ignore()
1186return nextState
1187}
1188
1189func (s stateFn) String() string {
1190name := runtime.FuncForPC(reflect.ValueOf(s).Pointer()).Name()
1191if i := strings.LastIndexByte(name, '.'); i > -1 {
1192name = name[i+1:]
1193}
1194if s == nil {
1195name = "<nil>"
1196}
1197return name + "()"
1198}
1199
1200func (itype itemType) String() string {
1201switch itype {
1202case itemError:
1203return "Error"
1204case itemNIL:
1205return "NIL"
1206case itemEOF:
1207return "EOF"
1208case itemText:
1209return "Text"
1210case itemString, itemRawString, itemMultilineString, itemRawMultilineString:
1211return "String"
1212case itemBool:
1213return "Bool"
1214case itemInteger:
1215return "Integer"
1216case itemFloat:
1217return "Float"
1218case itemDatetime:
1219return "DateTime"
1220case itemTableStart:
1221return "TableStart"
1222case itemTableEnd:
1223return "TableEnd"
1224case itemKeyStart:
1225return "KeyStart"
1226case itemKeyEnd:
1227return "KeyEnd"
1228case itemArray:
1229return "Array"
1230case itemArrayEnd:
1231return "ArrayEnd"
1232case itemCommentStart:
1233return "CommentStart"
1234case itemInlineTableStart:
1235return "InlineTableStart"
1236case itemInlineTableEnd:
1237return "InlineTableEnd"
1238}
1239panic(fmt.Sprintf("BUG: Unknown type '%d'.", int(itype)))
1240}
1241
1242func (item item) String() string {
1243return fmt.Sprintf("(%s, %s)", item.typ.String(), item.val)
1244}
1245
// isWhitespace reports whether r is TOML whitespace (space or tab).
func isWhitespace(r rune) bool { return r == ' ' || r == '\t' }

// isNL reports whether r begins a newline ('\n', or '\r' of a "\r\n").
func isNL(r rune) bool { return r == '\r' || r == '\n' }

// isControl reports whether r is a control character that TOML forbids in
// documents; '\t', '\r', and '\n' are permitted and excluded here.
func isControl(r rune) bool {
	if r == '\t' || r == '\r' || r == '\n' {
		return false
	}
	return r == 0x7f || (r >= 0x00 && r <= 0x1f)
}

// isDigit reports whether r is an ASCII decimal digit.
func isDigit(r rune) bool { return '0' <= r && r <= '9' }

// isBinary reports whether r is a binary digit.
func isBinary(r rune) bool { return r == '0' || r == '1' }

// isOctal reports whether r is an octal digit.
func isOctal(r rune) bool { return '0' <= r && r <= '7' }

// isHexadecimal reports whether r is an ASCII hexadecimal digit.
func isHexadecimal(r rune) bool {
	return ('0' <= r && r <= '9') || ('a' <= r && r <= 'f') || ('A' <= r && r <= 'F')
}
1262
// isBareKeyChar reports whether r may appear in a bare (unquoted) key.
// TOML 1.0 permits only ASCII letters, digits, '_', and '-'; with tomlNext
// (TOML 1.1) an additional set of Unicode ranges is allowed.
func isBareKeyChar(r rune, tomlNext bool) bool {
	ascii := (r >= 'A' && r <= 'Z') ||
		(r >= 'a' && r <= 'z') ||
		(r >= '0' && r <= '9') ||
		r == '_' || r == '-'
	if !tomlNext {
		return ascii
	}
	return ascii ||
		r == 0xb2 || r == 0xb3 || r == 0xb9 || (r >= 0xbc && r <= 0xbe) ||
		(r >= 0xc0 && r <= 0xd6) || (r >= 0xd8 && r <= 0xf6) || (r >= 0xf8 && r <= 0x037d) ||
		(r >= 0x037f && r <= 0x1fff) ||
		(r >= 0x200c && r <= 0x200d) || (r >= 0x203f && r <= 0x2040) ||
		(r >= 0x2070 && r <= 0x218f) || (r >= 0x2460 && r <= 0x24ff) ||
		(r >= 0x2c00 && r <= 0x2fef) || (r >= 0x3001 && r <= 0xd7ff) ||
		(r >= 0xf900 && r <= 0xfdcf) || (r >= 0xfdf0 && r <= 0xfffd) ||
		(r >= 0x10000 && r <= 0xeffff)
}
1284