1
// Copyright 2018 The Go Authors. All rights reserved.
2
// Use of this source code is governed by a BSD-style
3
// license that can be found in the LICENSE file.
14
"google.golang.org/protobuf/internal/errors"
17
// Decoder is a token-based textproto decoder.
19
// lastCall is the last method called, either readCall or peekCall.
20
// Initial value is readCall.
23
// lastToken contains the last read token.
26
// lastErr contains the last read error.
29
// openStack is a stack containing the byte characters for MessageOpen and
30
// ListOpen kinds. The top of stack represents the message or the list that
31
// the current token is nested in. An empty stack means the current token is
32
// at the top level message. The characters '{' and '<' both represent the MessageOpen kind.
36
// orig is used in reporting line and column.
38
// in contains the unconsumed input.
42
// NewDecoder returns a Decoder to read the given []byte.
43
func NewDecoder(b []byte) *Decoder {
44
return &Decoder{orig: b, in: b}
47
// ErrUnexpectedEOF means that EOF was encountered in the middle of the input.
48
var ErrUnexpectedEOF = errors.New("%v", io.ErrUnexpectedEOF)
50
// call specifies which Decoder method was invoked.
58
// Peek looks ahead and returns the next token and error without advancing a read.
59
func (d *Decoder) Peek() (Token, error) {
60
defer func() { d.lastCall = peekCall }()
61
if d.lastCall == readCall {
62
d.lastToken, d.lastErr = d.Read()
64
return d.lastToken, d.lastErr
67
// Read returns the next token.
68
// It will return an error if there is no valid token.
69
func (d *Decoder) Read() (Token, error) {
70
defer func() { d.lastCall = readCall }()
71
if d.lastCall == peekCall {
72
return d.lastToken, d.lastErr
75
tok, err := d.parseNext(d.lastToken.kind)
81
case comma, semicolon:
82
tok, err = d.parseNext(tok.kind)
92
mismatchedFmt = "mismatched close character %q"
93
unexpectedFmt = "unexpected character %q"
96
// parseNext parses the next Token based on given last kind.
97
func (d *Decoder) parseNext(lastKind Kind) (Token, error) {
98
// Trim leading spaces.
107
return d.consumeToken(EOF, 0, 0), nil
110
// Start of top level message. Next token can be EOF or Name.
112
return d.consumeToken(EOF, 0, 0), nil
114
return d.parseFieldName()
117
// Next token can be MessageOpen, ListOpen or Scalar.
119
return Token{}, ErrUnexpectedEOF
121
switch ch := d.in[0]; ch {
124
return d.consumeToken(MessageOpen, 1, 0), nil
127
return d.consumeToken(ListOpen, 1, 0), nil
129
return d.parseScalar()
133
openKind, closeCh := d.currentOpenKind()
136
// Top level message.
137
// Next token can be EOF, comma, semicolon or Name.
139
return d.consumeToken(EOF, 0, 0), nil
143
return d.consumeToken(comma, 1, 0), nil
145
return d.consumeToken(semicolon, 1, 0), nil
147
return d.parseFieldName()
151
// Next token can be MessageClose, comma, semicolon or Name.
153
return Token{}, ErrUnexpectedEOF
155
switch ch := d.in[0]; ch {
158
return d.consumeToken(MessageClose, 1, 0), nil
159
case otherCloseChar[closeCh]:
160
return Token{}, d.newSyntaxError(mismatchedFmt, ch)
162
return d.consumeToken(comma, 1, 0), nil
164
return d.consumeToken(semicolon, 1, 0), nil
166
return d.parseFieldName()
170
// Next token can be ListClose or comma.
172
return Token{}, ErrUnexpectedEOF
174
switch ch := d.in[0]; ch {
177
return d.consumeToken(ListClose, 1, 0), nil
179
return d.consumeToken(comma, 1, 0), nil
181
return Token{}, d.newSyntaxError(unexpectedFmt, ch)
186
// Next token can be MessageClose or Name.
188
return Token{}, ErrUnexpectedEOF
190
_, closeCh := d.currentOpenKind()
191
switch ch := d.in[0]; ch {
194
return d.consumeToken(MessageClose, 1, 0), nil
195
case otherCloseChar[closeCh]:
196
return Token{}, d.newSyntaxError(mismatchedFmt, ch)
198
return d.parseFieldName()
202
openKind, closeCh := d.currentOpenKind()
205
// Top level message.
206
// Next token can be EOF, comma, semicolon or Name.
208
return d.consumeToken(EOF, 0, 0), nil
210
switch ch := d.in[0]; ch {
212
return d.consumeToken(comma, 1, 0), nil
214
return d.consumeToken(semicolon, 1, 0), nil
216
return d.parseFieldName()
220
// Next token can be MessageClose, comma, semicolon or Name.
222
return Token{}, ErrUnexpectedEOF
224
switch ch := d.in[0]; ch {
227
return d.consumeToken(MessageClose, 1, 0), nil
228
case otherCloseChar[closeCh]:
229
return Token{}, d.newSyntaxError(mismatchedFmt, ch)
231
return d.consumeToken(comma, 1, 0), nil
233
return d.consumeToken(semicolon, 1, 0), nil
235
return d.parseFieldName()
239
// Next token can be ListClose or comma.
241
return Token{}, ErrUnexpectedEOF
243
switch ch := d.in[0]; ch {
246
return d.consumeToken(ListClose, 1, 0), nil
248
return d.consumeToken(comma, 1, 0), nil
250
return Token{}, d.newSyntaxError(unexpectedFmt, ch)
255
// Next token can be ListClose, MessageOpen or Scalar.
257
return Token{}, ErrUnexpectedEOF
259
switch ch := d.in[0]; ch {
262
return d.consumeToken(ListClose, 1, 0), nil
265
return d.consumeToken(MessageOpen, 1, 0), nil
267
return d.parseScalar()
271
openKind, closeCh := d.currentOpenKind()
274
// Top level message.
275
// Next token can be EOF, comma, semicolon or Name.
277
return d.consumeToken(EOF, 0, 0), nil
279
switch ch := d.in[0]; ch {
281
return d.consumeToken(comma, 1, 0), nil
283
return d.consumeToken(semicolon, 1, 0), nil
285
return d.parseFieldName()
289
// Next token can be MessageClose, comma, semicolon or Name.
291
return Token{}, ErrUnexpectedEOF
293
switch ch := d.in[0]; ch {
296
return d.consumeToken(MessageClose, 1, 0), nil
297
case otherCloseChar[closeCh]:
298
return Token{}, d.newSyntaxError(mismatchedFmt, ch)
300
return d.consumeToken(comma, 1, 0), nil
302
return d.consumeToken(semicolon, 1, 0), nil
304
return d.parseFieldName()
308
// It is not possible to have this case. Let it panic below.
311
case comma, semicolon:
312
openKind, closeCh := d.currentOpenKind()
315
// Top level message. Next token can be EOF or Name.
317
return d.consumeToken(EOF, 0, 0), nil
319
return d.parseFieldName()
322
// Next token can be MessageClose or Name.
324
return Token{}, ErrUnexpectedEOF
326
switch ch := d.in[0]; ch {
329
return d.consumeToken(MessageClose, 1, 0), nil
330
case otherCloseChar[closeCh]:
331
return Token{}, d.newSyntaxError(mismatchedFmt, ch)
333
return d.parseFieldName()
337
if lastKind == semicolon {
338
// It is not possible to have this case as logic here
339
// should not have produced a semicolon Token when inside a
340
// list. Let it panic below.
343
// Next token can be MessageOpen or Scalar.
345
return Token{}, ErrUnexpectedEOF
347
switch ch := d.in[0]; ch {
350
return d.consumeToken(MessageOpen, 1, 0), nil
352
return d.parseScalar()
357
line, column := d.Position(len(d.orig) - len(d.in))
358
panic(fmt.Sprintf("Decoder.parseNext: bug at handling line %d:%d with lastKind=%v", line, column, lastKind))
361
var otherCloseChar = map[byte]byte{
366
// currentOpenKind indicates whether current position is inside a message, list
367
// or top-level message by returning MessageOpen, ListOpen or bof respectively.
368
// If the returned kind is either a MessageOpen or ListOpen, it also returns the
369
// corresponding closing character.
370
func (d *Decoder) currentOpenKind() (Kind, byte) {
371
if len(d.openStack) == 0 {
374
openCh := d.openStack[len(d.openStack)-1]
377
return MessageOpen, '}'
379
return MessageOpen, '>'
383
panic(fmt.Sprintf("Decoder: openStack contains invalid byte %c", openCh))
386
func (d *Decoder) pushOpenStack(ch byte) {
387
d.openStack = append(d.openStack, ch)
390
func (d *Decoder) popOpenStack() {
391
d.openStack = d.openStack[:len(d.openStack)-1]
394
// parseFieldName parses field name and separator.
395
func (d *Decoder) parseFieldName() (tok Token, err error) {
397
if err == nil && d.tryConsumeChar(':') {
398
tok.attrs |= hasSeparator
402
// Extension or Any type URL.
404
return d.parseTypeName()
408
if size := parseIdent(d.in, false); size > 0 {
409
return d.consumeToken(Name, size, uint8(IdentName)), nil
412
// Field number. Identify if input is a valid number that is not negative
413
// and is a decimal integer within 32-bit range.
414
if num := parseNumber(d.in); num.size > 0 {
415
str := num.string(d.in)
416
if !num.neg && num.kind == numDec {
417
if _, err := strconv.ParseInt(str, 10, 32); err == nil {
418
return d.consumeToken(Name, num.size, uint8(FieldNumber)), nil
421
return Token{}, d.newSyntaxError("invalid field number: %s", str)
424
return Token{}, d.newSyntaxError("invalid field name: %s", errId(d.in))
427
// parseTypeName parses Any type URL or extension field name. The name is
428
// enclosed in [ and ] characters. The C++ parser does not handle many legal URL
429
// strings. This implementation is more liberal and allows for the pattern
430
// `^[-_a-zA-Z0-9]+([./][-_a-zA-Z0-9]+)*`. Whitespace and comments are allowed
431
// in between [ ], '.', '/' and the sub names.
432
func (d *Decoder) parseTypeName() (Token, error) {
433
startPos := len(d.orig) - len(d.in)
434
// Use alias s to advance first in order to use d.in for error handling.
435
// Caller already checks for [ as first character.
436
s := consume(d.in[1:], 0)
438
return Token{}, ErrUnexpectedEOF
442
for len(s) > 0 && isTypeNameChar(s[0]) {
443
name = append(name, s[0])
449
for len(s) > 0 && !closed {
455
case s[0] == '/', s[0] == '.':
456
if len(name) > 0 && (name[len(name)-1] == '/' || name[len(name)-1] == '.') {
457
return Token{}, d.newSyntaxError("invalid type URL/extension field name: %s",
458
d.orig[startPos:len(d.orig)-len(s)+1])
460
name = append(name, s[0])
463
for len(s) > 0 && isTypeNameChar(s[0]) {
464
name = append(name, s[0])
470
return Token{}, d.newSyntaxError(
471
"invalid type URL/extension field name: %s", d.orig[startPos:len(d.orig)-len(s)+1])
476
return Token{}, ErrUnexpectedEOF
479
// First character cannot be '.'. Last character cannot be '.' or '/'.
481
if size == 0 || name[0] == '.' || name[size-1] == '.' || name[size-1] == '/' {
482
return Token{}, d.newSyntaxError("invalid type URL/extension field name: %s",
483
d.orig[startPos:len(d.orig)-len(s)])
487
endPos := len(d.orig) - len(d.in)
492
attrs: uint8(TypeName),
494
raw: d.orig[startPos:endPos],
499
// isTypeNameChar reports whether b may appear inside a sub name of a type URL
// or extension field name: '-', '_', an ASCII digit or an ASCII letter.
// The separators '.' and '/' are not type name characters.
func isTypeNameChar(b byte) bool {
	return (b == '-' || b == '_' ||
		('0' <= b && b <= '9') ||
		('a' <= b && b <= 'z') ||
		('A' <= b && b <= 'Z'))
}
506
func isWhiteSpace(b byte) bool {
508
case ' ', '\n', '\r', '\t':
515
// parseIdent parses an unquoted proto identifier and returns size.
516
// If allowNeg is true, it allows '-' to be the first character in the
517
// identifier. This is used when parsing literal values like -infinity, etc.
518
// Regular expression matches an identifier: `^[_a-zA-Z][_a-zA-Z0-9]*`
519
func parseIdent(input []byte, allowNeg bool) int {
527
if allowNeg && s[0] == '-' {
537
'a' <= s[0] && s[0] <= 'z',
538
'A' <= s[0] && s[0] <= 'Z':
545
for len(s) > 0 && (s[0] == '_' ||
546
'a' <= s[0] && s[0] <= 'z' ||
547
'A' <= s[0] && s[0] <= 'Z' ||
548
'0' <= s[0] && s[0] <= '9') {
553
if len(s) > 0 && !isDelim(s[0]) {
560
// parseScalar parses for a string, literal or number value.
561
func (d *Decoder) parseScalar() (Token, error) {
562
if d.in[0] == '"' || d.in[0] == '\'' {
563
return d.parseStringValue()
566
if tok, ok := d.parseLiteralValue(); ok {
570
if tok, ok := d.parseNumberValue(); ok {
574
return Token{}, d.newSyntaxError("invalid scalar value: %s", errId(d.in))
577
// parseLiteralValue parses a literal value. A literal value is used for
578
// bools, special floats and enums. This function simply identifies that the
579
// field value is a literal.
580
func (d *Decoder) parseLiteralValue() (Token, bool) {
581
size := parseIdent(d.in, true)
583
return Token{}, false
585
return d.consumeToken(Scalar, size, literalValue), true
588
// consumeToken constructs a Token for given Kind from d.in and consumes given
589
// size-length from it.
590
func (d *Decoder) consumeToken(kind Kind, size int, attrs uint8) Token {
591
// Important to compute raw and pos before consuming.
595
pos: len(d.orig) - len(d.in),
602
// newSyntaxError returns a syntax error with line and column information for
604
func (d *Decoder) newSyntaxError(f string, x ...interface{}) error {
605
e := errors.New(f, x...)
606
line, column := d.Position(len(d.orig) - len(d.in))
607
return errors.New("syntax error (line %d:%d): %v", line, column, e)
610
// Position returns line and column number of given index of the original input.
611
// It will panic if index is out of range.
612
func (d *Decoder) Position(idx int) (line int, column int) {
614
line = bytes.Count(b, []byte("\n")) + 1
615
if i := bytes.LastIndexByte(b, '\n'); i >= 0 {
618
column = utf8.RuneCount(b) + 1 // ignore multi-rune characters
622
func (d *Decoder) tryConsumeChar(c byte) bool {
623
if len(d.in) > 0 && d.in[0] == c {
630
// consume consumes n bytes of input and any subsequent whitespace or comments.
631
func (d *Decoder) consume(n int) {
632
d.in = consume(d.in, n)
636
// consume consumes n bytes of input and any subsequent whitespace or comments.
637
func consume(b []byte, n int) []byte {
641
case ' ', '\n', '\r', '\t':
644
if i := bytes.IndexByte(b, '\n'); i >= 0 {
656
// errId extracts a byte sequence that looks like an invalid ID
657
// (for the purposes of error reporting).
658
func errId(seq []byte) []byte {
660
for i := 0; i < len(seq); {
662
return append(seq[:i:i], "…"...)
664
r, size := utf8.DecodeRune(seq[i:])
665
if r > utf8.RuneSelf || (r != '/' && isDelim(byte(r))) {
667
// Either the first byte is invalid UTF-8 or a
668
// delimiter, or the first rune is non-ASCII.
676
// No delimiter found.
680
// isDelim returns true if given byte is a delimiter character.
//
// Every byte is a delimiter except '-', '+', '.', '_', ASCII letters and
// ASCII digits.
func isDelim(c byte) bool {
	return !(c == '-' || c == '+' || c == '.' || c == '_' ||
		('a' <= c && c <= 'z') ||
		('A' <= c && c <= 'Z') ||
		('0' <= c && c <= '9'))
}