v
Mirror of https://github.com/vlang/v
1// Copyright (c) 2019-2024 Alexander Medvednikov. All rights reserved.
2// Use of this source code is governed by an MIT license
3// that can be found in the LICENSE file.
4module json2
5
6import strconv
7
// Scanner tokenizes a JSON byte stream while tracking the current
// line/column for error reporting.
struct Scanner {
mut:
	text []u8 // the JSON input bytes being scanned
	pos  int // the position of the token in scanner text
	line int // current line number (0-based; bumped in move_pos)
	col  int // current column within the line (reset to 0 on newline)
}
15
// TokenKind enumerates the kinds of token the scanner can produce.
// The punctuation members are pinned to their ASCII codes so a raw
// byte can be cast directly to a TokenKind (see the char_list branch
// in Scanner.scan).
enum TokenKind {
	none_
	error
	str_
	float
	int_
	null
	bool_
	eof
	comma = 44 // ,
	colon = 58 // :
	lsbr  = 91 // [
	rsbr  = 93 // ]
	lcbr  = 123 // {
	rcbr  = 125 // }
}
32
// Token is a single lexical unit produced by the scanner.
pub struct Token {
	lit  []u8 // literal representation of the token
	kind TokenKind // the token number/enum; for quick comparisons
	line int // the line in the source where the token occurred
	col  int // the column in the source where the token occurred
}
39
// full_col returns the full column information which includes the
// length of the token's literal: the column just past the token.
pub fn (t Token) full_col() int {
	end_col := t.col + t.lit.len
	return end_col
}
44
// list of characters commonly used in JSON (structural punctuation).
const char_list = [`{`, `}`, `[`, `]`, `,`, `:`]!
// list of newlines to check when moving to a new position.
// NOTE(review): `\t` is not a newline, yet move_pos bumps s.line for it
// because it is in this list — confirm this is intentional upstream.
const newlines = [`\r`, `\n`, `\t`]!
// list of escapable characters that need to be escaped inside a JSON string.
// double quotes and forward slashes are excluded intentionally since
// they have their own separate checks for it in order to pass the
// JSON test suite (https://github.com/nst/JSONTestSuite/).
const important_escapable_chars = [`\b`, `\f`, `\n`, `\r`, `\t`]!
// list of valid unicode escapes aside from \u{4-hex digits}
const valid_unicode_escapes = [`b`, `f`, `n`, `r`, `t`, `\\`, `"`, `/`]!
// used for transforming escapes into valid unicode (eg. n => \n);
// keys are the ASCII codes of the escape letters above.
const unicode_transform_escapes = {
	98:  `\b` // b
	102: `\f` // f
	110: `\n` // n
	114: `\r` // r
	116: `\t` // t
	92:  `\\` // backslash
	34:  `"` // double quote
	47:  `/` // forward slash
}
// sign characters allowed after the exponent marker in a number.
const exp_signs = [u8(`-`), `+`]!
68
// move proceeds to the next position, skipping over both spaces and
// newline runs (full whitespace handling).
fn (mut s Scanner) move() {
	s.move_pos(true, true)
}
73
// move_pos_with_newlines is the same as move but only enables newline
// checking — spaces are NOT skipped (used while scanning numbers).
fn (mut s Scanner) move_pos_with_newlines() {
	s.move_pos(false, true)
}
78
// move_pos advances the scanner by one byte and optionally collapses
// runs of spaces (include_space) and/or "newline" bytes
// (include_newlines, see the newlines const) while keeping the
// line/col counters up to date.
fn (mut s Scanner) move_pos(include_space bool, include_newlines bool) {
	s.pos++
	if s.pos < s.text.len {
		if include_newlines && s.text[s.pos] in newlines {
			s.line++
			s.col = 0
			// treat a \r\n pair as a single line break
			if s.text[s.pos] == `\r` && s.pos + 1 < s.text.len && s.text[s.pos + 1] == `\n` {
				s.pos++
			}
			// collapse any further consecutive newline bytes
			// (recurses through move, which re-enables space skipping)
			for s.pos < s.text.len && s.text[s.pos] in newlines {
				s.move()
			}
		} else if include_space && s.text[s.pos] == ` ` {
			s.pos++
			s.col++
			// collapse a run of consecutive spaces
			for s.pos < s.text.len && s.text[s.pos] == ` ` {
				s.move()
			}
		}
	} else {
		// past the end of input: still advance the column so error
		// positions at EOF remain meaningful
		s.col++
	}
}
102
// error builds an error token whose literal is the given description,
// stamped with the scanner's current position.
fn (s Scanner) error(description string) Token {
	msg := description.bytes()
	return s.tokenize(msg, .error)
}
107
// tokenize wraps the given literal bytes and kind into a Token,
// recording the scanner's current line and column.
fn (s Scanner) tokenize(lit []u8, kind TokenKind) Token {
	return Token{
		line: s.line
		col:  s.col
		kind: kind
		lit:  lit
	}
}
117
// text_scan scans and returns a string token. It is entered with s.pos
// on the opening double quote; on success s.pos is moved past the
// closing quote. Escape sequences are decoded into `chrs` as they are
// encountered; \uXXXX escapes are converted to UTF-8 bytes.
@[manualfree]
fn (mut s Scanner) text_scan() Token {
	mut has_closed := false
	mut chrs := []u8{}
	for {
		s.pos++
		s.col++
		if s.pos >= s.text.len {
			// ran off the end without a closing quote
			break
		}
		ch := s.text[s.pos]
		if (s.pos - 1 >= 0 && s.text[s.pos - 1] != `\\`) && ch == `"` {
			// unescaped closing quote terminates the string
			has_closed = true
			break
		} else if (s.pos - 1 >= 0 && s.text[s.pos - 1] != `\\`) && ch in important_escapable_chars {
			// raw control characters are not allowed inside JSON strings
			return s.error('character must be escaped with a backslash')
		} else if (s.pos == s.text.len - 1 && ch == `\\`) || ch == u8(0) {
			// trailing backslash at EOF, or embedded NUL
			return s.error('invalid backslash escape')
		} else if s.pos + 1 < s.text.len && ch == `\\` {
			peek := s.text[s.pos + 1]
			if peek in valid_unicode_escapes {
				// single-letter escape: map it to its control char
				chrs << unicode_transform_escapes[int(peek)]
				s.pos++
				s.col++
				continue
			} else if peek == `u` {
				if s.pos + 5 < s.text.len {
					// skip the 'u'; then collect exactly 4 hex digits.
					// note the loop increments s.pos BEFORE reading, so it
					// reads codepoint_start+1 .. codepoint_start+4.
					s.pos++
					s.col++
					mut codepoint := []u8{}
					codepoint_start := s.pos
					for s.pos < s.text.len && s.pos < codepoint_start + 4 {
						s.pos++
						s.col++
						if s.text[s.pos] == `"` {
							// string ended inside the escape; the length
							// check below reports the error
							break
						} else if !s.text[s.pos].is_hex_digit() {
							x := s.text[s.pos].ascii_str()
							return s.error('`${x}` is not a hex digit')
						}
						codepoint << s.text[s.pos]
					}
					if codepoint.len != 4 {
						return s.error('unicode escape must have 4 hex digits')
					}
					// parse failure falls back to codepoint 0
					val := u32(strconv.parse_uint(codepoint.bytestr(), 16, 32) or { 0 })
					converted := utf32_to_str(val)
					converted_bytes := converted.bytes()
					chrs << converted_bytes
					// manualfree: release the temporaries we just copied out of
					unsafe {
						converted.free()
						converted_bytes.free()
						codepoint.free()
					}
					continue
				} else {
					return s.error('incomplete unicode escape')
				}
			} else if peek == `U` {
				return s.error('unicode endpoints must be in lowercase `u`')
			} else if peek == u8(229) {
				// NOTE(review): 229 is a raw non-ASCII byte after a
				// backslash — presumably a JSONTestSuite case; confirm.
				return s.error('unicode endpoint not allowed')
			} else {
				return s.error('invalid backslash escape')
			}
		}
		chrs << ch
	}
	tok := s.tokenize(chrs, .str_)
	// step past the closing quote (or the byte after EOF)
	s.move()
	if !has_closed {
		return s.error('missing double quotes in string closing')
	}
	return tok
}
194
// num_scan scans and returns an int/float token. It is entered with
// s.pos on the first character of the number (a digit or `-`).
fn (mut s Scanner) num_scan() Token {
	// analyze json number structure
	// -[digit][?[dot][digit]][?[E/e][?-/+][digit]]
	mut is_fl := false
	mut dot_index := -1 // index of the `.` within `digits`, or -1
	mut digits := []u8{}
	if s.text[s.pos] == `-` {
		digits << `-`
		// a lone minus sign is not a number
		if !s.text[s.pos + 1].is_digit() {
			return s.invalid_token()
		}
		s.move_pos_with_newlines()
	}
	if s.text[s.pos] == `0` && (s.pos + 1 < s.text.len && s.text[s.pos + 1].is_digit()) {
		return s.error('leading zeroes in a number are not allowed')
	}
	// consume the integer part and at most one dot
	for s.pos < s.text.len && (s.text[s.pos].is_digit() || (!is_fl && s.text[s.pos] == `.`)) {
		digits << s.text[s.pos]
		if s.text[s.pos] == `.` {
			is_fl = true
			dot_index = digits.len - 1
		}
		s.move_pos_with_newlines()
	}
	// reject a number that ends with a bare dot (no fraction digits).
	// NOTE(review): `dot_index + 1 < s.text.len` compares an index into
	// `digits` against the INPUT length — looks like it should be
	// `digits.len`; confirm against the JSON test suite before changing.
	if dot_index + 1 < s.text.len && digits[dot_index + 1..].len == 0 {
		return s.error('invalid float')
	}
	// optional exponent part: [eE][+-]?digit+
	if s.pos < s.text.len && (s.text[s.pos] == `e` || s.text[s.pos] == `E`) {
		digits << s.text[s.pos]
		s.move_pos_with_newlines()
		if s.pos < s.text.len && s.text[s.pos] in exp_signs {
			digits << s.text[s.pos]
			s.move_pos_with_newlines()
		}
		mut exp_digits_count := 0
		for s.pos < s.text.len && s.text[s.pos].is_digit() {
			digits << s.text[s.pos]
			exp_digits_count++
			s.move_pos_with_newlines()
		}
		// an exponent marker with no digits is malformed
		if exp_digits_count == 0 {
			return s.error('invalid exponent')
		}
	}
	kind := if is_fl { TokenKind.float } else { TokenKind.int_ }
	return s.tokenize(digits, kind)
}
243
// invalid_token returns an error token naming the unexpected byte at
// the current position; non-printable bytes are shown escaped.
fn (s Scanner) invalid_token() Token {
	ch := s.text[s.pos]
	if ch < 32 || ch > 126 {
		// outside printable ASCII: show an escaped representation
		x := ch.str_escaped()
		return s.error('invalid token `${x}`')
	}
	x := ch.ascii_str()
	return s.error('invalid token `${x}`')
}
254
// scan returns a token based on the scanner's current position.
// used to set the next token
@[manualfree]
fn (mut s Scanner) scan() Token {
	// skip leading whitespace before dispatching on the first byte
	if s.pos < s.text.len && (s.text[s.pos] == ` ` || s.text[s.pos] in newlines) {
		s.move()
	}
	if s.pos >= s.text.len {
		return s.tokenize([]u8{}, .eof)
	} else if s.pos + 3 < s.text.len && (s.text[s.pos] == `t` || s.text[s.pos] == `n`) {
		// 4-byte keywords: 'true' / 'null'
		ident := s.text[s.pos..s.pos + 4].bytestr()
		if ident == 'true' || ident == 'null' {
			mut kind := TokenKind.null
			if ident == 'true' {
				kind = .bool_
			}
			unsafe { ident.free() } // manualfree: temp string no longer needed
			val := s.text[s.pos..s.pos + 4]
			tok := s.tokenize(val, kind)
			s.move() // n / t
			s.move() // u / r
			s.move() // l / u
			s.move() // l / e
			return tok
		}
		unsafe { ident.free() }
		return s.invalid_token()
	} else if s.pos + 4 < s.text.len && s.text[s.pos] == `f` {
		// 5-byte keyword: 'false'
		ident := s.text[s.pos..s.pos + 5].bytestr()
		if ident == 'false' {
			unsafe { ident.free() }
			val := s.text[s.pos..s.pos + 5]
			tok := s.tokenize(val, .bool_)
			s.move() // f
			s.move() // a
			s.move() // l
			s.move() // s
			s.move() // e
			return tok
		}
		unsafe { ident.free() }
		return s.invalid_token()
	} else if s.text[s.pos] in char_list {
		// structural punctuation: the byte value IS the TokenKind
		// (enum members are pinned to ASCII codes)
		chr := s.text[s.pos]
		tok := s.tokenize([]u8{}, unsafe { TokenKind(int(chr)) })
		s.move()
		return tok
	} else if s.text[s.pos] == `"` {
		return s.text_scan()
	} else if s.text[s.pos].is_digit() || s.text[s.pos] == `-` {
		return s.num_scan()
	} else {
		return s.invalid_token()
	}
}
310