// blitz_query_cpp — GraphQL tokenizer implementation (302 lines · 7.8 KB)
1#include <parser/tokenizer.hpp>
2
3using namespace blitz_query_cpp;
4
// True for characters the tokenizer skips between tokens.
// GraphQL treats commas as insignificant, so ',' counts as whitespace.
// The unsigned char cast is required: passing a negative char (any byte
// >= 0x80 on platforms with signed char) to std::isspace is undefined
// behavior.
inline bool is_whitespace(char ch)
{
    return std::isspace(static_cast<unsigned char>(ch)) != 0 || ch == ',';
}
9
// True for characters valid inside a name token: [A-Za-z0-9_].
// Cast to unsigned char: std::isalnum on a negative char is undefined
// behavior.
inline bool is_name_char(char ch)
{
    return std::isalnum(static_cast<unsigned char>(ch)) != 0 || ch == '_';
}
14
// True for characters that may start a name token: [A-Za-z_]
// (digits may continue a name but not begin one).
// Cast to unsigned char: std::isalpha on a negative char is undefined
// behavior.
inline bool is_name_first_char(char ch)
{
    return std::isalpha(static_cast<unsigned char>(ch)) != 0 || ch == '_';
}
19
// True for LF or CR line terminators.
inline bool is_newline_char(char ch)
{
    switch (ch)
    {
    case '\n':
    case '\r':
        return true;
    default:
        return false;
    }
}
24
25token_t tokenizer_t::next_token()
26{
27eat_whitespace();
28
29if (current_pos >= query.size())
30{
31return token_t(std::string_view(), query.size(), 0, token_type::End);
32}
33
34char ch = query[current_pos];
35switch (ch)
36{
37case '\0':
38current_pos = query.size();
39return token_t(token_type::End);
40case '{':
41return single_char_token(token_type::LBrace);
42case '}':
43return single_char_token(token_type::RBrace);
44case '(':
45return single_char_token(token_type::LParen);
46case ')':
47return single_char_token(token_type::RParen);
48case ':':
49return single_char_token(token_type::Colon);
50case '[':
51return single_char_token(token_type::LBracket);
52case ']':
53return single_char_token(token_type::RBracket);
54case '!':
55return single_char_token(token_type::NotNull);
56case '|':
57return single_char_token(token_type::Union);
58case '.':
59return handle_dot();
60case '=':
61return handle_eq();
62case '#':
63return read_comment();
64case '$':
65return read_name(1, token_type::ParameterLiteral);
66case '@':
67return read_name(1, token_type::Directive);
68case '"':
69return read_string();
70case '&':
71return single_char_token(token_type::And);
72default:
73if (std::isdigit(ch) || ch == '-' || ch == '+')
74{
75return read_number();
76}
77else
78{
79return read_name(0, token_type::Name);
80}
81}
82}
83
84void tokenizer_t::eat_whitespace()
85{
86if (current_pos >= query.size())
87return;
88do
89{
90char ch = query[current_pos];
91if (ch == '\n')
92{
93line_number++;
94line_start = current_pos;
95}
96if (!is_whitespace(ch))
97{
98break;
99}
100current_pos++;
101} while (current_pos < query.size());
102}
103
104token_t tokenizer_t::single_char_token(token_type type)
105{
106index_t current = current_pos;
107current_pos++;
108return token_t(query.substr(current, 1), current, 1, type);
109}
110
111token_t tokenizer_t::handle_dot()
112{
113if (query.size() - current_pos > 3)
114{
115// ...fragmentName
116if (query[current_pos + 1] == '.' && query[current_pos + 2] == '.')
117{
118index_t current = current_pos;
119current_pos += 3;
120return token_t(query.substr(current, 3), current, 3, token_type::FragmentSpread);
121}
122}
123return single_char_token(token_type::MemberAccess);
124}
125
126token_t tokenizer_t::read_name(index_t skipChars, token_type type)
127{
128index_t start_pos = current_pos;
129current_pos += skipChars;
130if (current_pos < query.size())
131{
132char ch = query[current_pos];
133if (!is_name_first_char(ch))
134{
135return token_t(query.substr(current_pos, 1), current_pos, 1, token_type::InvalidToken);
136}
137current_pos++;
138}
139
140while (current_pos < query.size())
141{
142char ch = query[current_pos];
143if (!is_name_char(ch))
144{
145break;
146}
147current_pos++;
148}
149index_t len = current_pos - start_pos;
150return token_t(query.substr(start_pos, len), start_pos, len, type);
151}
152
153token_t tokenizer_t::read_comment()
154{
155eat_whitespace();
156index_t start_pos = current_pos;
157while (current_pos < query.size())
158{
159char ch = query[current_pos];
160if (is_newline_char(ch))
161{
162line_number++;
163line_start = current_pos;
164break;
165}
166current_pos++;
167}
168index_t len = current_pos - start_pos;
169return token_t(query.substr(start_pos, len), start_pos, len, token_type::Comment);
170}
171
172token_t tokenizer_t::handle_eq()
173{
174index_t pos = current_pos++;
175return token_t(query.substr(pos, 1), pos, 1, token_type::Equal);
176}
177
// Reads a string literal starting at the opening '"' (the current char).
// Handles both regular strings ("...") and block strings ("""...""").
// The token value excludes the surrounding quotes; escape sequences are
// left encoded (unescape_string_value is currently a pass-through stub).
token_t tokenizer_t::read_string()
{
    current_pos++; // consume the opening quote
    index_t start_pos = current_pos;
    bool block_string = false;
    if (chars_left() >= 5) // could not be a block string """ """
    {
        // check block string: two more quotes right after the first
        if (query[current_pos] == '"' && query[current_pos + 1] == '"')
        {
            block_string = true;
            current_pos += 2;
            start_pos += 2; // value starts after the full """ opener
        }
    }

    while (current_pos < query.size())
    {
        char ch = query[current_pos];
        if (ch == '\\') // escape sequence will be decoded later
        {
            // TODO: unicode codepoints \u
            current_pos += 2; // skip backslash and escaped char
            if (current_pos >= query.size())
            {
                return invalid_token_before(1); // dangling escape at end of input
            }
            ch = query[current_pos]; // continue with the char after the escape pair
        }
        current_pos++;
        if (ch == '"')
        {
            if (!block_string)
            {
                break; // closing quote of a regular string
            }
            if (chars_left() < 2)
            {
                return invalid_token_before(0); // input too short for a """ terminator
            }
            if (query[current_pos] == '"' && query[current_pos + 1] == '"') // block string end
            {
                break;
            }
        }
    }
    // current_pos is one past the first closing quote, hence the -1.
    // NOTE(review): an unterminated string running to end-of-input still
    // yields a String token rather than InvalidToken — confirm this is
    // intentional (the parser may catch it later).
    index_t len = current_pos - start_pos - 1;
    if (block_string)
    {
        current_pos += 2; // skip the remaining two quotes of the """ terminator
    }
    auto value = unescape_string_value(query.substr(start_pos, len));
    return token_t(value, start_pos, len, block_string ? token_type::StringBlock : token_type::StringLiteral);
}
232
233token_t tokenizer_t::invalid_token_before(index_t offset)
234{
235return token_t(query.substr(current_pos - offset, 1), current_pos - offset, 1, token_type::InvalidToken);
236}
237
238token_t tokenizer_t::read_number()
239{
240index_t start_pos = current_pos;
241token_type type = token_type::IntLiteral;
242bool has_dot = false;
243bool has_exponent = false;
244
245if (query[current_pos] == '-')
246{
247current_pos++;
248}
249while (current_pos < query.size())
250{
251char ch = query[current_pos];
252if (ch == '.' && !has_dot) // float
253{
254if (has_exponent)
255{
256return invalid_token_before(0);
257}
258current_pos++;
259if (current_pos >= query.size())
260{
261return invalid_token_before(1);
262}
263current_pos++;
264ch = query[current_pos];
265has_dot = true;
266}
267if ((ch == 'e' || ch == 'E') && !has_exponent) // float exp
268{
269current_pos++;
270if (current_pos >= query.size())
271{
272return invalid_token_before(1);
273}
274if (ch == '-' || ch == '+') // exp sign
275{
276current_pos++;
277if (current_pos >= query.size())
278{
279return invalid_token_before(1);
280}
281}
282ch = query[current_pos];
283has_exponent = true;
284}
285
286if (!std::isdigit(ch))
287{
288break;
289}
290current_pos++;
291}
292if (has_dot || has_exponent)
293type = token_type::FloatLiteral;
294index_t len = current_pos - start_pos;
295return token_t(query.substr(start_pos, len), start_pos, len, type);
296}
297
// Decodes escape sequences in a raw string-literal value.
// TODO: implement (\n, \t, \", \\, and \uXXXX codepoints).
// NOTE(review): real decoding cannot return a view into `query`, since
// the decoded text is shorter than the source slice — this will need an
// owned buffer (e.g. a std::string member) when implemented. Currently a
// pass-through: callers receive the value with escapes still encoded.
std::string_view tokenizer_t::unescape_string_value(std::string_view value)
{
    // TODO: implement
    return value;
}