v
Mirror of https://github.com/vlang/v
1// Copyright (c) 2019-2024 Alexander Medvednikov. All rights reserved.
2// Use of this source code is governed by an MIT license
3// that can be found in the LICENSE file.
4module json2
5
6import strconv
7
// Scanner tokenizes a JSON byte stream while tracking the current
// line/column for error reporting.
struct Scanner {
mut:
	text []u8 // the JSON input bytes being scanned
	pos  int // the position of the token in scanner text
	line int // current line number (0-based; bumped in move_pos)
	col  int // current column within the line (reset to 0 on newline)
}
15
// TokenKind enumerates the kinds of token the scanner can produce.
// The punctuation members are pinned to their ASCII codes so a raw
// byte can be cast directly to a TokenKind (see the char_list branch
// in Scanner.scan).
enum TokenKind {
	none_
	error
	str_
	float
	int_
	null
	bool_
	eof
	comma = 44 // ,
	colon = 58 // :
	lsbr  = 91 // [
	rsbr  = 93 // ]
	lcbr  = 123 // {
	rcbr  = 125 // }
}
32
// Token is a single lexical unit produced by the scanner.
pub struct Token {
	lit  []u8 // literal representation of the token
	kind TokenKind // the token number/enum; for quick comparisons
	line int // the line in the source where the token occurred
	col  int // the column in the source where the token occurred
}
39
// full_col returns the full column information which includes the
// length of the token's literal: the column just past the token.
pub fn (t Token) full_col() int {
	end_col := t.col + t.lit.len
	return end_col
}
44
// list of characters commonly used in JSON (structural punctuation).
const char_list = [`{`, `}`, `[`, `]`, `,`, `:`]!
// list of newlines to check when moving to a new position.
// NOTE(review): `\t` is not a newline, yet move_pos bumps s.line for it
// because it is in this list — confirm this is intentional upstream.
const newlines = [`\r`, `\n`, `\t`]!
// list of escapable characters that need to be escaped inside a JSON string.
// double quotes and forward slashes are excluded intentionally since
// they have their own separate checks for it in order to pass the
// JSON test suite (https://github.com/nst/JSONTestSuite/).
const important_escapable_chars = [`\b`, `\f`, `\n`, `\r`, `\t`]!
// list of valid unicode escapes aside from \u{4-hex digits}
const valid_unicode_escapes = [`b`, `f`, `n`, `r`, `t`, `\\`, `"`, `/`]!
// used for transforming escapes into valid unicode (eg. n => \n);
// keys are the ASCII codes of the escape letters above.
const unicode_transform_escapes = {
	98:  `\b` // b
	102: `\f` // f
	110: `\n` // n
	114: `\r` // r
	116: `\t` // t
	92:  `\\` // backslash
	34:  `"` // double quote
	47:  `/` // forward slash
}
// sign characters allowed after the exponent marker in a number.
const exp_signs = [u8(`-`), `+`]!
68
// move proceeds to the next position, skipping over both spaces and
// newline runs (full whitespace handling).
fn (mut s Scanner) move() {
	s.move_pos(true, true)
}
73
// move_pos_with_newlines is the same as move but only enables newline
// checking — spaces are NOT skipped (used while scanning numbers).
fn (mut s Scanner) move_pos_with_newlines() {
	s.move_pos(false, true)
}
78
// move_pos advances the scanner by one byte and optionally collapses
// runs of spaces (include_space) and/or "newline" bytes
// (include_newlines, see the newlines const) while keeping the
// line/col counters up to date.
fn (mut s Scanner) move_pos(include_space bool, include_newlines bool) {
	s.pos++
	if s.pos < s.text.len {
		if include_newlines && s.text[s.pos] in newlines {
			s.line++
			s.col = 0
			// treat a \r\n pair as a single line break
			if s.text[s.pos] == `\r` && s.pos + 1 < s.text.len && s.text[s.pos + 1] == `\n` {
				s.pos++
			}
			// collapse any further consecutive newline bytes
			// (recurses through move, which re-enables space skipping)
			for s.pos < s.text.len && s.text[s.pos] in newlines {
				s.move()
			}
		} else if include_space && s.text[s.pos] == ` ` {
			s.pos++
			s.col++
			// collapse a run of consecutive spaces
			for s.pos < s.text.len && s.text[s.pos] == ` ` {
				s.move()
			}
		}
	} else {
		// past the end of input: still advance the column so error
		// positions at EOF remain meaningful
		s.col++
	}
}
102
// error builds an error token whose literal is the given description,
// stamped with the scanner's current position.
fn (s Scanner) error(description string) Token {
	msg := description.bytes()
	return s.tokenize(msg, .error)
}
107
// tokenize wraps the given literal bytes and kind into a Token,
// recording the scanner's current line and column.
fn (s Scanner) tokenize(lit []u8, kind TokenKind) Token {
	return Token{
		line: s.line
		col:  s.col
		kind: kind
		lit:  lit
	}
}
117
// text_scan scans and returns a string token. It is entered with s.pos
// on the opening double quote; on success s.pos is moved past the
// closing quote. Escape sequences are decoded into `chrs` as they are
// encountered; \uXXXX escapes are converted to UTF-8 bytes.
@[manualfree]
fn (mut s Scanner) text_scan() Token {
	mut has_closed := false
	mut chrs := []u8{}
	for {
		s.pos++
		s.col++
		if s.pos >= s.text.len {
			// ran off the end without a closing quote
			break
		}
		ch := s.text[s.pos]
		if (s.pos - 1 >= 0 && s.text[s.pos - 1] != `\\`) && ch == `"` {
			// unescaped closing quote terminates the string
			has_closed = true
			break
		} else if (s.pos - 1 >= 0 && s.text[s.pos - 1] != `\\`) && ch in important_escapable_chars {
			// raw control characters are not allowed inside JSON strings
			return s.error('character must be escaped with a backslash')
		} else if (s.pos == s.text.len - 1 && ch == `\\`) || ch == u8(0) {
			// trailing backslash at EOF, or embedded NUL
			return s.error('invalid backslash escape')
		} else if s.pos + 1 < s.text.len && ch == `\\` {
			peek := s.text[s.pos + 1]
			if peek in valid_unicode_escapes {
				// single-letter escape: map it to its control char
				chrs << unicode_transform_escapes[int(peek)]
				s.pos++
				s.col++
				continue
			} else if peek == `u` {
				if s.pos + 5 < s.text.len {
					// skip the 'u'; then collect exactly 4 hex digits.
					// note the loop increments s.pos BEFORE reading, so it
					// reads codepoint_start+1 .. codepoint_start+4.
					s.pos++
					s.col++
					mut codepoint := []u8{}
					codepoint_start := s.pos
					for s.pos < s.text.len && s.pos < codepoint_start + 4 {
						s.pos++
						s.col++
						if s.text[s.pos] == `"` {
							// string ended inside the escape; the length
							// check below reports the error
							break
						} else if !s.text[s.pos].is_hex_digit() {
							x := s.text[s.pos].ascii_str()
							return s.error('`${x}` is not a hex digit')
						}
						codepoint << s.text[s.pos]
					}
					if codepoint.len != 4 {
						return s.error('unicode escape must have 4 hex digits')
					}
					// parse failure falls back to codepoint 0
					val := u32(strconv.parse_uint(codepoint.bytestr(), 16, 32) or { 0 })
					converted := utf32_to_str(val)
					converted_bytes := converted.bytes()
					chrs << converted_bytes
					// manualfree: release the temporaries we just copied out of
					unsafe {
						converted.free()
						converted_bytes.free()
						codepoint.free()
					}
					continue
				} else {
					return s.error('incomplete unicode escape')
				}
			} else if peek == `U` {
				return s.error('unicode endpoints must be in lowercase `u`')
			} else if peek == u8(229) {
				// NOTE(review): 229 is a raw non-ASCII byte after a
				// backslash — presumably a JSONTestSuite case; confirm.
				return s.error('unicode endpoint not allowed')
			} else {
				return s.error('invalid backslash escape')
			}
		}
		chrs << ch
	}
	tok := s.tokenize(chrs, .str_)
	// step past the closing quote (or the byte after EOF)
	s.move()
	if !has_closed {
		return s.error('missing double quotes in string closing')
	}
	return tok
}
194
// num_scan scans and returns an int/float token. It is entered with
// s.pos on the first character of the number (a digit or `-`).
fn (mut s Scanner) num_scan() Token {
	// analyze json number structure
	// -[digit][?[dot][digit]][?[E/e][?-/+][digit]]
	mut is_fl := false
	mut dot_index := -1 // index of the `.` within `digits`, or -1
	mut digits := []u8{}
	if s.text[s.pos] == `-` {
		digits << `-`
		// a lone minus sign is not a number
		if !s.text[s.pos + 1].is_digit() {
			return s.invalid_token()
		}
		s.move_pos_with_newlines()
	}
	if s.text[s.pos] == `0` && (s.pos + 1 < s.text.len && s.text[s.pos + 1].is_digit()) {
		return s.error('leading zeroes in a number are not allowed')
	}
	// consume the integer part and at most one dot
	for s.pos < s.text.len && (s.text[s.pos].is_digit() || (!is_fl && s.text[s.pos] == `.`)) {
		digits << s.text[s.pos]
		if s.text[s.pos] == `.` {
			is_fl = true
			dot_index = digits.len - 1
		}
		s.move_pos_with_newlines()
	}
	// reject a number that ends with a bare dot (no fraction digits).
	// NOTE(review): `dot_index + 1 < s.text.len` compares an index into
	// `digits` against the INPUT length — looks like it should be
	// `digits.len`; confirm against the JSON test suite before changing.
	if dot_index + 1 < s.text.len && digits[dot_index + 1..].len == 0 {
		return s.error('invalid float')
	}
	// optional exponent part: [eE][+-]?digit+
	if s.pos < s.text.len && (s.text[s.pos] == `e` || s.text[s.pos] == `E`) {
		digits << s.text[s.pos]
		s.move_pos_with_newlines()
		if s.pos < s.text.len && s.text[s.pos] in exp_signs {
			digits << s.text[s.pos]
			s.move_pos_with_newlines()
		}
		mut exp_digits_count := 0
		for s.pos < s.text.len && s.text[s.pos].is_digit() {
			digits << s.text[s.pos]
			exp_digits_count++
			s.move_pos_with_newlines()
		}
		// an exponent marker with no digits is malformed
		if exp_digits_count == 0 {
			return s.error('invalid exponent')
		}
	}
	kind := if is_fl { TokenKind.float } else { TokenKind.int_ }
	return s.tokenize(digits, kind)
}
243
// invalid_token returns an error token naming the unexpected byte at
// the current position; non-printable bytes are shown escaped.
fn (s Scanner) invalid_token() Token {
	ch := s.text[s.pos]
	if ch < 32 || ch > 126 {
		// outside printable ASCII: show an escaped representation
		x := ch.str_escaped()
		return s.error('invalid token `${x}`')
	}
	x := ch.ascii_str()
	return s.error('invalid token `${x}`')
}
254
// scan returns a token based on the scanner's current position.
// used to set the next token
@[manualfree]
fn (mut s Scanner) scan() Token {
	// skip leading whitespace before dispatching on the first byte
	if s.pos < s.text.len && (s.text[s.pos] == ` ` || s.text[s.pos] in newlines) {
		s.move()
	}
	if s.pos >= s.text.len {
		return s.tokenize([]u8{}, .eof)
	} else if s.pos + 3 < s.text.len && (s.text[s.pos] == `t` || s.text[s.pos] == `n`) {
		// 4-byte keywords: 'true' / 'null'
		ident := s.text[s.pos..s.pos + 4].bytestr()
		if ident == 'true' || ident == 'null' {
			mut kind := TokenKind.null
			if ident == 'true' {
				kind = .bool_
			}
			unsafe { ident.free() } // manualfree: temp string no longer needed
			val := s.text[s.pos..s.pos + 4]
			tok := s.tokenize(val, kind)
			s.move() // n / t
			s.move() // u / r
			s.move() // l / u
			s.move() // l / e
			return tok
		}
		unsafe { ident.free() }
		return s.invalid_token()
	} else if s.pos + 4 < s.text.len && s.text[s.pos] == `f` {
		// 5-byte keyword: 'false'
		ident := s.text[s.pos..s.pos + 5].bytestr()
		if ident == 'false' {
			unsafe { ident.free() }
			val := s.text[s.pos..s.pos + 5]
			tok := s.tokenize(val, .bool_)
			s.move() // f
			s.move() // a
			s.move() // l
			s.move() // s
			s.move() // e
			return tok
		}
		unsafe { ident.free() }
		return s.invalid_token()
	} else if s.text[s.pos] in char_list {
		// structural punctuation: the byte value IS the TokenKind
		// (enum members are pinned to ASCII codes)
		chr := s.text[s.pos]
		tok := s.tokenize([]u8{}, unsafe { TokenKind(int(chr)) })
		s.move()
		return tok
	} else if s.text[s.pos] == `"` {
		return s.text_scan()
	} else if s.text[s.pos].is_digit() || s.text[s.pos] == `-` {
		return s.num_scan()
	} else {
		return s.invalid_token()
	}
}
310