4
Copyright (c) 2021 МГТУ им. Н.Э. Баумана, кафедра ИУ-6, Михаил Фетисов,
6
https://bmstu.codes/lsx/simodo
9
#include "simodo/inout/token/Tokenizer.h"
16
// Expands the configured number masks (mask_set) into the tokenizer's internal
// table of _NumberMask entries, accumulated in `result`.
//
// A mask whose chars equal BUILDING_NUMBER is expanded into eight linked
// entries (see the /* 00 */ .. /* 07 */ markers). Every other mask is stored
// as one entry that keeps the mask's own chars.
//
// NOTE(review): this extract omits braces, the else branch and the return
// statement, and the _NumberMask declaration is not visible. The brace-enclosed
// list is presumably the `refs` field (indices of entries that may follow) and
// the two trailing booleans are presumably {is_starting, <second flag>} —
// confirm against the header.
std::vector<simodo::inout::Tokenizer::_NumberMask> makeInnerMask(std::vector<simodo::inout::NumberMask> mask_set)
18
std::vector<simodo::inout::Tokenizer::_NumberMask> result;
20
for (const simodo::inout::NumberMask & mask : mask_set) {
21
if (mask.chars == simodo::inout::BUILDING_NUMBER)
23
// Index of the first entry appended for this mask; the link lists below are
// written as sh + k so that they are absolute indices into `result`.
int sh = result.size();
25
// 00: leading digits; may be followed by '.', 'e' or 'E'.
/* 00 */ result.push_back({u"N", mask.type, simodo::inout::TokenQualification::Integer, mask.system, {sh+1,sh+3,sh+4}, true, true});
26
// 01: decimal point; may be followed by fraction digits, 'e' or 'E'.
/* 01 */ result.push_back({u".", mask.type, simodo::inout::TokenQualification::RealNumber, mask.system, {sh+2,sh+3,sh+4}, false, true});
27
// 02: fraction digits; may be followed by 'e' or 'E'.
/* 02 */ result.push_back({u"N", mask.type, simodo::inout::TokenQualification::RealNumber, mask.system, {sh+3,sh+4}, false, true});
28
// 03/04: exponent marker; may be followed by '+', '-' or exponent digits.
/* 03 */ result.push_back({u"e", mask.type, simodo::inout::TokenQualification::RealNumber, mask.system, {sh+5,sh+6,sh+7}, false, false});
29
/* 04 */ result.push_back({u"E", mask.type, simodo::inout::TokenQualification::RealNumber, mask.system, {sh+5,sh+6,sh+7}, false, false});
30
// 05/06: exponent sign; must be followed by exponent digits.
/* 05 */ result.push_back({u"+", mask.type, simodo::inout::TokenQualification::RealNumber, mask.system, {sh+7}, false, false});
31
/* 06 */ result.push_back({u"-", mask.type, simodo::inout::TokenQualification::RealNumber, mask.system, {sh+7}, false, false});
32
// 07: exponent digits; no further links.
/* 07 */ result.push_back({u"N", mask.type, simodo::inout::TokenQualification::RealNumber, mask.system, {}, false, true});
35
// Any other mask: a single entry with Integer qualification and no links
// (presumably the else branch of the check above — not visible here).
result.push_back({mask.chars, mask.type, simodo::inout::TokenQualification::Integer, mask.system, {}, true, true});
42
namespace simodo::inout
45
// Constructs a tokenizer reading from input_stream.
//
// uri_index, input_stream and context_no are forwarded to the scanner;
// parameters supplies the lexical settings and the number masks.
//
// NOTE(review): the initialisation of _param is not visible in this extract
// (presumably it is copied from `parameters`) — confirm.
Tokenizer::Tokenizer(uri_index_t uri_index,
46
InputStream_interface &input_stream,
47
const LexicalParameters & parameters,
48
context_index_t context_no)
49
: _scanner(uri_index,input_stream,context_no)
52
// Digits and the Latin alphabet are mandatory.
assert(!_param.digits.empty());
53
assert(!_param.latin_alphabet.empty());
54
// Alphabets must have even length: convertToUpper() maps position p in the
// first half of an alphabet to position p + size/2 in the second half.
assert(_param.latin_alphabet.size()%2 == 0);
55
assert(_param.national_alphabet.size()%2 == 0);
58
// parameters.masks.push_back({ u"0[1[2]][{3[4]|5[6]}[{{7}[8]}]9]", LexemeType::Number, 10 });
59
// parameters.masks.push_back({ u"123{4|5}", LexemeType::Number, 10 });
61
// Build the internal number-mask table used by scanNumber().
_numbers = makeInnerMask(parameters.masks);
64
// Returns a token obtained from getAnyToken(), with special handling for
// comment tokens.
// NOTE(review): the loop body and the return are not visible in this extract;
// presumably the loop keeps fetching tokens until a non-comment one is found.
Token Tokenizer::getToken()
66
Token t = getAnyToken();
68
while(t.type() == LexemeType::Comment)
74
// Scans and returns one token of any kind.
//
// The visible checks run in this order: continuation of an open markup
// context, newline substitution, markup start, word (extra symbol / Latin /
// national first character), number, multi-character punctuation,
// single-character punctuation, and finally an error token for an
// unrecognised character.
//
// NOTE(review): several conditions, declarations and braces are missing from
// this extract; comments marked "not visible" describe only what can be seen.
Token Tokenizer::getAnyToken()
78
// Empty token carrying the scanner's current context (guarding condition not
// visible; presumably end of input).
return { LexemeType::Empty, u"", _scanner.makeTokenLocation(), TokenQualification::None, _scanner.context() };
82
// The scanner is still inside a markup context: continue scanning that markup.
if (_scanner.context() != NO_TOKEN_CONTEXT_INDEX)
84
context_index_t context_index = _scanner.context();
86
_scanner.fixLocation(context_index);
87
return scanMarkup(context_index);
93
// Fix the coordinates of the token start
94
_scanner.fixLocation();
97
// Empty token without a context (guarding condition not visible).
return { LexemeType::Empty, u"", _scanner.makeTokenLocation() };
100
// A newline character may be replaced with a special symbol
101
if (!_param.nl_substitution.empty() && _scanner.getChar() == '\n') {
103
return { LexemeType::NewLine, _param.nl_substitution, _scanner.makeTokenLocation() };
106
// Marked-up text
107
for(size_t i=0; i < _param.markups.size(); ++i) {
108
const MarkupSymbol & mus = _param.markups[i];
110
// The index of the matching markup is passed on as the context.
if (_scanner.startsWith(mus.start))
111
return scanMarkup(static_cast<uint32_t>(i));
114
// Word (variable, identifier, etc.) starting with an extra identifier symbol
115
if (_scanner.startsWithAnyOf(_param.id_extra_symbols))
116
return scanWord(NationalCharAffiliation::Extra);
118
// Word (variable, identifier, etc.) starting with a Latin letter
119
if (_scanner.startsWithAnyOf(_param.latin_alphabet))
120
return scanWord(NationalCharAffiliation::Latin);
122
// Word (variable, identifier, etc.) starting with a national letter
123
if (_scanner.startsWithAnyOf(_param.national_alphabet))
124
return scanWord(NationalCharAffiliation::National);
129
// Number: scanNumber() fills type, qualification and lexeme_str through its
// reference parameters (the declaration of `type` is not visible here).
TokenQualification qualification;
130
std::u16string lexeme_str;
132
if (scanNumber(type,qualification,lexeme_str))
133
return { type, lexeme_str, _scanner.makeTokenLocation(), qualification };
136
// Multi-character punctuation (not a keyword)
137
for(const std::u16string & s : _param.punctuation_words)
138
if (_scanner.startsWith(s))
140
// The configured end-of-file symbol is treated specially (body not visible).
if (s == _param.eof_symbol)
143
_scanner.shift(s.size());
145
return { LexemeType::Punctuation, s, _scanner.makeTokenLocation() };
148
// Single-character punctuation
149
if (_scanner.startsWithAnyOf(_param.punctuation_chars))
152
s.assign(1, _scanner.getFirstChar());
154
// The configured end-of-file symbol is treated specially (body not visible).
if (s == _param.eof_symbol)
159
return { LexemeType::Punctuation, s, _scanner.makeTokenLocation() };
162
// Something unknown, i.e. an error
163
std::u16string lexeme_str;
165
lexeme_str += _scanner.getFirstChar();
168
return { LexemeType::Error, lexeme_str, _scanner.makeTokenLocation(), TokenQualification::UnknownCharacterSet };
171
// Loops over the input until end of input, testing each character for
// newline substitution and for blankness.
// NOTE(review): the bodies of both checks are not visible in this extract;
// presumably the loop stops at '\n' when newline substitution is configured
// and at the first non-blank character, and otherwise advances — confirm.
void Tokenizer::passBlanks()
173
while(!_scanner.eof())
175
// With a newline substitution configured, '\n' gets its own handling.
if (!_param.nl_substitution.empty() && _scanner.getChar() == '\n')
177
if (!isBlank(_scanner.getChar()))
184
// Scans marked-up text for the markup whose index in _param.markups is
// `context`, and returns it as a token of that markup's type.
//
// Two strings are accumulated: token_str also receives the start marker, the
// ignore sign and the end marker, while lexeme_str receives only the scanned
// characters.
//
// NOTE(review): the scanner-advancing calls, else branches and braces are
// missing from this extract; comments marked "not visible" cover those gaps.
Token Tokenizer::scanMarkup(context_index_t context)
186
// context is used as an index into _param.markups.
assert(context < _param.markups.size());
188
std::u16string lexeme_str;
189
std::u16string token_str;
191
const MarkupSymbol & mus = _param.markups[context];
193
// The scanner is not yet in this markup's context: consume the start marker.
if (_scanner.context() != context)
195
_scanner.shift(mus.start.size());
196
token_str += mus.start;
199
while(!_scanner.eof())
201
// Ignore sign: it goes into token_str only, and the character read next goes
// into both strings (presumably the character following the sign).
if (_scanner.startsWith(mus.ignore_sign))
203
token_str += mus.ignore_sign;
204
_scanner.shift(mus.ignore_sign.size());
209
token_str += _scanner.getFirstChar();
210
lexeme_str += _scanner.getFirstChar();
217
// Newline inside the markup (handling not visible).
if (_scanner.getFirstChar() == u'\n')
220
// End marker: consume it and leave the markup context.
else if (_scanner.startsWith(mus.end))
222
token_str += mus.end;
223
_scanner.shift(mus.end.size());
224
context = NO_TOKEN_CONTEXT_INDEX;
228
// Ordinary character: appended to both strings.
token_str += _scanner.getFirstChar();
229
lexeme_str += _scanner.getFirstChar();
235
// Context is also reset here (guarding condition not visible).
context = NO_TOKEN_CONTEXT_INDEX;
237
// Capture the token location before the scanner location is re-fixed with the
// resulting context.
TokenLocation loc = _scanner.makeTokenLocation();
239
_scanner.fixLocation(context);
241
return { {lexeme_str, mus.type}, token_str, loc, TokenQualification::None, context };
244
// Scans a word whose first character has the affiliation first_char and
// classifies it.
//
// Visible outcomes: Punctuation with Keyword qualification (the word matches a
// punctuation word, or is a single punctuation character), Error for
// disallowed national letters, Id or Error for mixed alphabets, or a plain Id.
//
// NOTE(review): loop bodies, several conditions and braces are missing from
// this extract; comments marked "not visible" cover those gaps.
Token Tokenizer::scanWord(Tokenizer::NationalCharAffiliation first_char)
246
// Which alphabets have been seen so far, seeded from the first character.
bool has_latin = (first_char == NationalCharAffiliation::Latin);
247
bool has_national = (first_char == NationalCharAffiliation::National);
249
std::u16string lexeme_str;
251
lexeme_str += _scanner.getFirstChar();
255
while(!_scanner.eof())
257
// Classify the next character (branch bodies not visible).
if (_scanner.startsWithAnyOf(_param.id_extra_symbols))
259
else if (_scanner.startsWithAnyOf(_param.latin_alphabet))
261
else if (_scanner.startsWithAnyOf(_param.national_alphabet))
263
// Not an extra symbol, a letter or a digit (presumably this ends the word).
else if (!_scanner.startsWithAnyOf(_param.digits))
266
lexeme_str += _scanner.getFirstChar();
270
// Multi-character punctuation
271
for(const std::u16string & s : _param.punctuation_words)
275
// A case-insensitive match compares against the upper-cased lexeme.
if (_param.is_case_sensitive)
276
is_find = (s == lexeme_str);
278
is_find = (s == convertToUpper(lexeme_str));
282
// The configured end-of-file symbol is treated specially (body not visible).
if (s == _param.eof_symbol)
285
if (_param.is_case_sensitive)
286
return { {lexeme_str, LexemeType::Punctuation}, s, _scanner.makeTokenLocation(), TokenQualification::Keyword };
288
return { LexemeType::Punctuation, lexeme_str, _scanner.makeTokenLocation(), TokenQualification::Keyword };
292
// Single-character punctuation
293
if (lexeme_str.size() == 1)
294
if (_param.punctuation_chars.find(*lexeme_str.c_str()) != std::u16string::npos)
295
return { LexemeType::Punctuation, lexeme_str, _scanner.makeTokenLocation(), TokenQualification::Keyword};
299
// National letters are not allowed at all (outer condition not visible).
if (!_param.may_national_letters_use)
300
return { LexemeType::Error, lexeme_str, _scanner.makeTokenLocation(), TokenQualification::NationalCharacterUse };
303
// Mixed alphabets: Id when mixing is allowed, Error otherwise (guarding
// condition not visible).
return { _param.may_national_letters_mix ? LexemeType::Id : LexemeType::Error,
304
lexeme_str, _scanner.makeTokenLocation(), TokenQualification::NationalCharacterMix };
307
// Ordinary identifier.
return { LexemeType::Id, lexeme_str, _scanner.makeTokenLocation() };
310
// Tries to match the input at the scanner position against the number masks
// in _numbers, starting from each mask flagged is_starting.
//
// Results go out through the reference parameters: `qualification` is taken
// from the mask on the visible success path, and `type`/`qualification` are
// set to Error/NotANumber on the visible failure path. lexeme_str is the
// matched text, and the scanner is shifted by its length.
//
// In a mask, 'N' and 'n' stand for a digit. A character counts as a digit when
// its position in _param.digits is below mask.system; find() returns npos when
// the character is absent, so that comparison is false. For systems above 10
// the character is upper-cased with convertLatinToUpper() first.
//
// NOTE(review): loop headers, several declarations (i_input, ch_upper, i_ref),
// branch bodies and the return statements are missing from this extract. The
// exact difference between 'N' and 'n' cannot be determined from here.
bool Tokenizer::scanNumber(LexemeType &type, TokenQualification &qualification, std::u16string &lexeme_str)
312
for(size_t i_starting=0; i_starting < _numbers.size(); ++i_starting)
314
if (_numbers[i_starting].is_starting)
318
// Current mask entry and position inside its chars.
size_t i_mask_index = i_starting;
319
size_t i_mask_char = 0;
321
int16_t N_count = -1;
325
const _NumberMask & mask = _numbers[i_mask_index];
326
char16_t ch = _scanner.getChar(i_input);
328
// All characters of the current mask entry have been consumed.
if (i_mask_char == mask.chars.size())
330
if (lexeme_str.empty())
333
// The entry ended with 'n': test whether the input character is a digit.
if (mask.chars[i_mask_char-1] == u'n')
336
if (mask.system > 10 )
337
ch_upper = convertLatinToUpper(ch);
341
if (_param.digits.find(ch_upper) < mask.system)
345
// Look for a linked entry whose first character accepts the input character.
if (!mask.refs.empty())
348
for(; i_ref < mask.refs.size(); ++i_ref)
350
// NOTE(review): narrowing to uint8_t. makeInnerMask() builds refs from int
// values, so an index above 255 would be truncated here, while the jump below
// uses the untruncated mask.refs[i_ref] — confirm the table cannot grow that large.
uint8_t ref_no = mask.refs[i_ref];
352
assert(static_cast<size_t>(ref_no) < _numbers.size());
354
const _NumberMask & ref_mask = _numbers[static_cast<size_t>(ref_no)];
356
assert(!ref_mask.chars.empty());
360
if (mask.system > 10 )
361
ch_upper = convertLatinToUpper(ch);
365
// Accept a literal match, or a digit when the linked entry starts with N/n.
if (ch == ref_mask.chars[0]
366
|| ((ref_mask.chars[0] == u'N' || ref_mask.chars[0] == u'n') && _param.digits.find(ch_upper) < mask.system))
370
// A link matched: continue with the linked entry.
if (i_ref < mask.refs.size())
372
i_mask_index = static_cast<size_t>(mask.refs[i_ref]);
382
// Visible success path: report the mask's qualification and consume the lexeme.
qualification = mask.qualification;
383
_scanner.shift(lexeme_str.size());
387
// The mask position expects a digit.
if (mask.chars[i_mask_char] == u'N' || mask.chars[i_mask_char] == u'n')
392
if (mask.system > 10 )
393
ch_upper = convertLatinToUpper(ch);
397
if (_param.digits.find(ch_upper) < mask.system)
400
if (mask.chars[i_mask_char] == u'n')
403
else if (N_count == 0)
405
else if (mask.chars[i_mask_char] == u'n')
413
// The mask position expects this exact character.
else if (mask.chars[i_mask_char] == ch)
430
// Visible failure path: something was consumed but no mask matched.
if (!lexeme_str.empty())
432
_scanner.shift(lexeme_str.size());
433
type = LexemeType::Error;
434
qualification = TokenQualification::NotANumber;
441
// String overload: appends the per-character convertToUpper() result to `res`.
// NOTE(review): the declaration of `res`, the loop over `s` and the return are
// not visible in this extract.
std::u16string Tokenizer::convertToUpper(std::u16string s) const
446
res += convertToUpper(c);
451
// Character overload: maps ch using the configured alphabets.
//
// A character found in the first half of an alphabet is replaced by the
// character at the same offset in the second half. The Latin alphabet is
// consulted first, then the national one.
//
// NOTE(review): the fall-through returns are not visible in this extract;
// presumably ch is returned unchanged when no mapping applies.
char16_t Tokenizer::convertToUpper(char16_t ch) const
453
// std::string::size_type / npos are used with a u16string here; they have the
// same type and value as the std::u16string ones.
std::string::size_type pos_latin = _param.latin_alphabet.find(ch);
454
size_t latin_size = _param.latin_alphabet.size();
455
size_t national_size = _param.national_alphabet.size();
457
if (pos_latin != std::string::npos)
459
// First half -> same offset in the second half.
if (pos_latin < latin_size/2)
460
return _param.latin_alphabet.at(latin_size/2+pos_latin);
465
std::string::size_type pos_national = _param.national_alphabet.find(ch);
467
if (pos_national != std::string::npos)
468
if (pos_national < national_size/2)
469
return _param.national_alphabet.at(national_size/2+pos_national);
474
// Maps ch using the Latin alphabet only: a character found in the first half
// of _param.latin_alphabet is replaced by the character at the same offset in
// the second half.
// NOTE(review): the fall-through return is not visible in this extract;
// presumably ch is returned unchanged otherwise.
char16_t Tokenizer::convertLatinToUpper(char16_t ch) const
476
std::string::size_type pos_latin = _param.latin_alphabet.find(ch);
477
size_t latin_size = _param.latin_alphabet.size();
479
if (pos_latin != std::string::npos)
480
if (pos_latin < latin_size/2)
481
return _param.latin_alphabet.at(latin_size/2+pos_latin);
486
// Returns true when ch is a space, tab, carriage return or line feed.
bool Tokenizer::isBlank(char16_t ch) const
488
return (std::u16string::npos != std::u16string(u" \t\r\n").find(ch));