blitz_query_cpp

tokenizer.cpp
302 строки · 7.8 Кб
Перенос по словам
1
#include <parser/tokenizer.hpp>
2

3
using namespace blitz_query_cpp;
4

5
// Returns true for characters the tokenizer skips between tokens:
// anything std::isspace accepts, plus the comma.
inline bool is_whitespace(char ch)
{
    // <cctype> classifiers require a value representable as unsigned char;
    // passing a negative char (any non-ASCII byte) is undefined behaviour.
    return std::isspace(static_cast<unsigned char>(ch)) != 0 || ch == ',';
}
9

10
// Returns true if ch may appear inside a name after its first character:
// a letter or digit (per std::isalnum) or an underscore.
inline bool is_name_char(char ch)
{
    // <cctype> classifiers require a value representable as unsigned char;
    // passing a negative char (any non-ASCII byte) is undefined behaviour.
    return std::isalnum(static_cast<unsigned char>(ch)) != 0 || ch == '_';
}
14

15
// Returns true if ch may start a name: a letter (per std::isalpha) or an
// underscore. Digits are rejected here.
inline bool is_name_first_char(char ch)
{
    // <cctype> classifiers require a value representable as unsigned char;
    // passing a negative char (any non-ASCII byte) is undefined behaviour.
    return std::isalpha(static_cast<unsigned char>(ch)) != 0 || ch == '_';
}
19

20
// Returns true for the two line-terminator characters: line feed and
// carriage return.
inline bool is_newline_char(char ch)
{
    switch (ch)
    {
    case '\n':
    case '\r':
        return true;
    default:
        return false;
    }
}
24

25
// Skips whitespace and returns the next token of the query.
//
// Returns an End token once the input is exhausted or a NUL character is
// reached. Otherwise dispatches on the first character: punctuation becomes
// a single-character token, while '.', '=', '#', '$', '@', '"' and numeric
// prefixes are delegated to their dedicated readers. Any other character is
// handed to read_name(), which yields InvalidToken if it cannot start a name.
token_t tokenizer_t::next_token()
{
    eat_whitespace();

    if (current_pos >= query.size())
    {
        return token_t(std::string_view(), query.size(), 0, token_type::End);
    }

    char ch = query[current_pos];
    switch (ch)
    {
    case '\0':
        // An embedded NUL terminates tokenization.
        current_pos = query.size();
        return token_t(token_type::End);
    case '{':
        return single_char_token(token_type::LBrace);
    case '}':
        return single_char_token(token_type::RBrace);
    case '(':
        return single_char_token(token_type::LParen);
    case ')':
        return single_char_token(token_type::RParen);
    case ':':
        return single_char_token(token_type::Colon);
    case '[':
        return single_char_token(token_type::LBracket);
    case ']':
        return single_char_token(token_type::RBracket);
    case '!':
        return single_char_token(token_type::NotNull);
    case '|':
        return single_char_token(token_type::Union);
    case '.':
        return handle_dot();
    case '=':
        return handle_eq();
    case '#':
        return read_comment();
    case '$':
        // Skip the one-character '$' prefix before reading the name.
        return read_name(1, token_type::ParameterLiteral);
    case '@':
        // Skip the one-character '@' prefix before reading the name.
        return read_name(1, token_type::Directive);
    case '"':
        return read_string();
    case '&':
        return single_char_token(token_type::And);
    default:
        // std::isdigit requires a value representable as unsigned char;
        // a negative char (non-ASCII byte) would be undefined behaviour.
        if (std::isdigit(static_cast<unsigned char>(ch)) != 0 || ch == '-' || ch == '+')
        {
            return read_number();
        }
        return read_name(0, token_type::Name);
    }
}
83

84
void tokenizer_t::eat_whitespace()
85
{
86
    if (current_pos >= query.size())
87
        return;
88
    do
89
    {
90
        char ch = query[current_pos];
91
        if (ch == '\n')
92
        {
93
            line_number++;
94
            line_start = current_pos;
95
        }
96
        if (!is_whitespace(ch))
97
        {
98
            break;
99
        }
100
        current_pos++;
101
    } while (current_pos < query.size());
102
}
103

104
// Builds a one-character token of the given type from the character at
// current_pos and advances past it.
token_t tokenizer_t::single_char_token(token_type type)
{
    const index_t token_pos = current_pos;
    ++current_pos;
    return token_t(query.substr(token_pos, 1), token_pos, 1, type);
}
110

111
// Tokenizes input starting at a '.' character.
//
// Three consecutive dots form a FragmentSpread token ("...fragmentName");
// anything else yields a single-character MemberAccess token.
token_t tokenizer_t::handle_dot()
{
    // Reading the two following characters needs at least three characters
    // remaining. The check is ">= 3" so that a "..." at the very end of the
    // input is still recognised as a spread rather than three member accesses.
    if (query.size() - current_pos >= 3 &&
        query[current_pos + 1] == '.' && query[current_pos + 2] == '.')
    {
        index_t current = current_pos;
        current_pos += 3;
        return token_t(query.substr(current, 3), current, 3, token_type::FragmentSpread);
    }
    return single_char_token(token_type::MemberAccess);
}
125

126
// Reads a name token starting at current_pos.
//
// skipChars - number of prefix characters (e.g. '$' or '@') to skip before
//             the name proper; they are included in the returned token.
// type      - token type to assign on success.
//
// Returns a token of the requested type spanning the prefix and the name.
// Returns InvalidToken if the input ends right after the prefix, or if the
// first name character is not a letter or underscore.
token_t tokenizer_t::read_name(index_t skipChars, token_type type)
{
    index_t start_pos = current_pos;
    current_pos += skipChars;

    if (current_pos >= query.size())
    {
        // Nothing follows the prefix: a bare "$" or "@" is not a valid name.
        index_t prefix_len = current_pos - start_pos;
        return token_t(query.substr(start_pos, prefix_len), start_pos, prefix_len, token_type::InvalidToken);
    }

    if (!is_name_first_char(query[current_pos]))
    {
        return token_t(query.substr(current_pos, 1), current_pos, 1, token_type::InvalidToken);
    }
    current_pos++;

    // Consume the remaining name characters.
    while (current_pos < query.size() && is_name_char(query[current_pos]))
    {
        current_pos++;
    }
    index_t len = current_pos - start_pos;
    return token_t(query.substr(start_pos, len), start_pos, len, type);
}
152

153
// Reads a comment running from current_pos up to, but not including, the
// next line terminator or the end of the query.
//
// The terminating newline is deliberately left unconsumed and line_number is
// not touched here: eat_whitespace() skips that newline on the next call and
// is the single place that counts it. Counting it here as well would report
// every line after a comment one too high.
token_t tokenizer_t::read_comment()
{
    // No-op when entered from next_token(), which has already skipped
    // whitespace and is positioned on '#'.
    eat_whitespace();
    index_t start_pos = current_pos;
    while (current_pos < query.size() && !is_newline_char(query[current_pos]))
    {
        current_pos++;
    }
    index_t len = current_pos - start_pos;
    return token_t(query.substr(start_pos, len), start_pos, len, token_type::Comment);
}
171

172
// Emits an Equal token for the '=' at current_pos and steps past it.
token_t tokenizer_t::handle_eq()
{
    const index_t eq_pos = current_pos;
    ++current_pos;
    return token_t(query.substr(eq_pos, 1), eq_pos, 1, token_type::Equal);
}
177

178
// Reads a string literal whose opening '"' is at current_pos.
//
// Handles both ordinary strings ("...") and block strings ("""...""").
// Escape sequences are skipped here (backslash plus the following character)
// and left in the token text for unescape_string_value() to decode.
//
// Returns a StringLiteral or StringBlock token covering the text between the
// delimiters, or InvalidToken when the input ends inside an escape sequence
// or before the closing delimiter.
token_t tokenizer_t::read_string()
{
    current_pos++; // skip the opening quote
    index_t start_pos = current_pos;
    bool block_string = false;
    // A block string needs two more opening quotes plus three closing ones.
    // NOTE(review): assumes chars_left() counts characters from current_pos
    // to the end of the query - confirm against its definition.
    if (chars_left() >= 5)
    {
        if (query[current_pos] == '"' && query[current_pos + 1] == '"')
        {
            block_string = true;
            current_pos += 2;
            start_pos += 2;
        }
    }

    bool terminated = false;
    while (current_pos < query.size())
    {
        char ch = query[current_pos];
        if (ch == '\\') // escape sequence will be decoded later
        {
            // TODO: unicode codepoints \u
            current_pos += 2; // skip backslash and escaped char
            if (current_pos >= query.size())
            {
                return invalid_token_before(1);
            }
            // Re-examine the next character from the top of the loop so that
            // back-to-back escapes (e.g. \n\") are each treated as escapes
            // instead of the second one's quote closing the string.
            continue;
        }
        current_pos++;
        if (ch == '"')
        {
            if (!block_string)
            {
                terminated = true;
                break;
            }
            if (chars_left() < 2)
            {
                return invalid_token_before(0);
            }
            if (query[current_pos] == '"' && query[current_pos + 1] == '"') // block string end
            {
                terminated = true;
                break;
            }
        }
    }

    if (!terminated)
    {
        // Ran off the end of the query without a closing delimiter.
        // current_pos is at least 1 here, so the offset cannot underflow.
        return invalid_token_before(1);
    }

    // current_pos is one past the (first) closing quote; exclude that quote.
    index_t len = current_pos - start_pos - 1;
    if (block_string)
    {
        current_pos += 2; // skip the remaining two closing quotes
    }
    auto value = unescape_string_value(query.substr(start_pos, len));
    return token_t(value, start_pos, len, block_string ? token_type::StringBlock : token_type::StringLiteral);
}
232

233
// Builds a one-character InvalidToken located `offset` characters before
// current_pos. current_pos itself is not modified.
token_t tokenizer_t::invalid_token_before(index_t offset)
{
    const index_t bad_pos = current_pos - offset;
    return token_t(query.substr(bad_pos, 1), bad_pos, 1, token_type::InvalidToken);
}
237

238
// Reads an integer or floating-point literal starting at current_pos.
//
// Accepted shape: optional sign ('-' or '+'), one or more digits, an optional
// fraction ('.' followed by at least one digit) and an optional exponent
// ('e' or 'E', optional sign, at least one digit).
//
// Returns IntLiteral when neither fraction nor exponent is present and
// FloatLiteral otherwise. Returns InvalidToken when a sign, '.', or exponent
// marker is not followed by a digit, or when '.' appears after the exponent.
token_t tokenizer_t::read_number()
{
    // std::isdigit requires a value representable as unsigned char.
    const auto is_digit = [](char c)
    {
        return std::isdigit(static_cast<unsigned char>(c)) != 0;
    };

    index_t start_pos = current_pos;
    bool has_dot = false;
    bool has_exponent = false;

    // next_token() dispatches both '-' and '+' here, so both must be
    // consumed; otherwise a leading '+' yields a zero-length token and the
    // tokenizer never advances.
    if (current_pos < query.size() && (query[current_pos] == '-' || query[current_pos] == '+'))
    {
        current_pos++;
    }
    // A number needs at least one digit; a lone sign is not a literal.
    if (current_pos >= query.size() || !is_digit(query[current_pos]))
    {
        return token_t(query.substr(start_pos, 1), start_pos, 1, token_type::InvalidToken);
    }

    while (current_pos < query.size())
    {
        char ch = query[current_pos];
        if (ch == '.' && !has_dot) // float
        {
            if (has_exponent)
            {
                return invalid_token_before(0);
            }
            current_pos++; // step over '.'
            if (current_pos >= query.size())
            {
                return invalid_token_before(1);
            }
            // Look at (do not skip) the first fractional character so that
            // it is validated and never read past the end of the query.
            ch = query[current_pos];
            if (!is_digit(ch))
            {
                return invalid_token_before(0);
            }
            has_dot = true;
        }
        else if ((ch == 'e' || ch == 'E') && !has_exponent) // float exp
        {
            current_pos++; // step over the exponent marker
            if (current_pos >= query.size())
            {
                return invalid_token_before(1);
            }
            // Re-read the character after the marker before testing for a sign.
            ch = query[current_pos];
            if (ch == '-' || ch == '+') // exp sign
            {
                current_pos++;
                if (current_pos >= query.size())
                {
                    return invalid_token_before(1);
                }
                ch = query[current_pos];
            }
            if (!is_digit(ch))
            {
                return invalid_token_before(0);
            }
            has_exponent = true;
        }

        if (!is_digit(ch))
        {
            break;
        }
        current_pos++;
    }

    token_type type = (has_dot || has_exponent) ? token_type::FloatLiteral : token_type::IntLiteral;
    index_t len = current_pos - start_pos;
    return token_t(query.substr(start_pos, len), start_pos, len, type);
}
297

298
// Placeholder for escape-sequence decoding of a string literal's text.
// Currently returns its argument unchanged, so escape sequences remain
// in their raw, backslash-prefixed form.
std::string_view tokenizer_t::unescape_string_value(std::string_view value)
{
    // TODO: decode escape sequences; until then the text is passed through.
    return value;
}
blitz_query_cpp

Использование cookies