4
Copyright (c) 2021 МГТУ им. Н.Э. Баумана, кафедра ИУ-6, Михаил Фетисов,
6
https://bmstu.codes/lsx/simodo
9
#include "simodo/parser/Grammar.h"
10
#include "simodo/ast/NodeFriend.h"
11
#include "simodo/inout/convert/functions.h"
16
#if __cplusplus >= __cpp_2017
18
namespace fs = std::filesystem;
20
#include <experimental/filesystem>
21
namespace fs = std::filesystem::experimental;
24
namespace simodo::parser
27
size_t Grammar::getColumnIndex(const inout::Lexeme &lexeme) const
29
/// \attention Этот метод используется до заполнения индексов terminal_symbol_index и compound_symbol_index!
31
assert(first_compound_index > 0 && first_compound_index < columns.size());
33
size_t i = (lexeme.type() != inout::LexemeType::Compound) ? 0 : first_compound_index;
34
size_t end = (lexeme.type() != inout::LexemeType::Compound) ? first_compound_index : columns.size();
37
if (lexeme == columns[i])
40
return columns.size();
43
size_t Grammar::getTerminalColumnIndex(const inout::Lexeme &lexeme) const
45
assert(first_compound_index > 0 && first_compound_index < columns.size());
47
for(size_t i=0; i < first_compound_index; ++i)
49
if (lexeme.type() == inout::LexemeType::Id && columns[i].type() == inout::LexemeType::Id)
51
if (lexeme.type() == inout::LexemeType::Number && columns[i].type() == inout::LexemeType::Number)
53
if (lexeme.type() == inout::LexemeType::Comment && columns[i].type() == inout::LexemeType::Comment)
55
if (lexeme.type() == inout::LexemeType::Annotation && columns[i].type() == inout::LexemeType::Annotation)
57
if (lexeme == columns[i])
61
return columns.size();
64
size_t Grammar::getCompoundColumnIndex(const std::u16string &str) const
66
assert(first_compound_index > 0 && first_compound_index < columns.size());
68
for(size_t i=first_compound_index; i < columns.size(); ++i)
69
if (str == columns[i].lexeme())
72
return columns.size();
75
std::u16string getFsmActionChar(FsmActionType action)
79
case FsmActionType::Error:
81
case FsmActionType::Shift:
83
case FsmActionType::Reduce:
85
case FsmActionType::Acceptance:
91
const char * getGrammarBuilderMethodName(TableBuildMethod method)
95
case TableBuildMethod::SLR:
97
case TableBuildMethod::LR1:
99
// case TableBuildMethod::LALR:
101
case TableBuildMethod::none:
108
bool fillLexemeParameters(parser::Grammar &g)
110
assert(!g.columns.empty());
111
assert(g.first_compound_index < g.columns.size());
113
std::u16string punct;
115
for(size_t i=0; i < g.first_compound_index; ++i)
117
if (g.columns[i].type() != inout::LexemeType::Punctuation)
120
const std::u16string & lexeme = g.columns[i].lexeme();
122
if (lexeme.size() == 1)
124
else if (!lexeme.empty())
125
g.lexical.punctuation_words.emplace_back(lexeme);
128
g.lexical.punctuation_chars = punct;
135
const std::u16string GRAMMAR_DUMP_SIGNATURE = u"SIMODO.loom.grammar.dump.v03";
136
const size_t REASONABLE_STRING_LENGTH = 2000;
137
const size_t MAX_STRING_LENGTH = REASONABLE_STRING_LENGTH * 2;
139
const std::string GRAMMAR_DUMP_DIRECTORY = "dump";
140
const std::string GRAMMAR_DUMP_EXTENTION = ".simodo-grammar-dump";
142
bool storeString(std::ostream &os, const std::u16string &s)
144
size_t len = s.size()*sizeof(s[0]);
146
assert(len < MAX_STRING_LENGTH);
148
os.write(reinterpret_cast<char *>(&len), sizeof(len));
151
os.write(reinterpret_cast<const char *>(s.c_str()), static_cast<std::streamsize>(len));
156
std::u16string loadString(std::istream &is)
159
is.read(reinterpret_cast<char *>(&len), sizeof(len));
161
assert(len <= MAX_STRING_LENGTH);
163
char16_t b[MAX_STRING_LENGTH+1];
166
is.read(reinterpret_cast<char *>(b), static_cast<std::streamsize>(len));
168
b[len/sizeof(b[0])] = 0;
170
return std::u16string(b);
173
bool storeLexeme(std::ostream & os, const inout::Lexeme & s)
175
storeString(os, s.lexeme());
177
inout::LexemeType type = s.type();
178
os.write(reinterpret_cast<char *>(&type), sizeof(type));
183
inout::Lexeme loadLexeme(std::istream &is)
185
std::u16string value(loadString(is));
186
inout::LexemeType type;
188
is.read(reinterpret_cast<char *>(&type), sizeof(type));
190
return {value.c_str(), type};
193
bool storeToken(std::ostream &os, const inout::Token &t)
196
storeString(os, t.token());
198
const inout::Range range = t.location().range();
199
os.write(reinterpret_cast<const char *>(&range), sizeof(range));
200
const inout::uri_index_t uri_index = t.location().uri_index();
201
os.write(reinterpret_cast<const char *>(&uri_index), sizeof(uri_index));
203
inout::TokenQualification q = t.qualification();
204
os.write(reinterpret_cast<const char *>(&q), sizeof(q));
209
inout::Token loadToken(std::istream & is)
211
inout::Lexeme lexeme = loadLexeme(is);
212
std::u16string token_string = loadString(is);
214
inout::uri_index_t uri_index;
216
is.read(reinterpret_cast<char *>(&range), sizeof(range));
217
is.read(reinterpret_cast<char *>(&uri_index), sizeof(uri_index));
219
inout::TokenLocation loc(uri_index,range);
221
inout::TokenQualification q;
222
is.read(reinterpret_cast<char *>(&q), sizeof(q));
224
return { lexeme, token_string, loc, q };
227
bool storeAstNode(std::ostream & os, const ast::Node & node)
229
storeString(os, node.host());
230
ast::OperationCode op = node.operation();
231
os.write(reinterpret_cast<const char *>(&op), sizeof(op));
233
storeToken(os, node.token());
234
storeToken(os, node.bound());
236
size_t branch_element_count = node.branches().size();
237
os.write(reinterpret_cast<const char *>(&branch_element_count), sizeof(branch_element_count));
238
for(const ast::Node & bn : node.branches())
239
storeAstNode(os, bn);
244
ast::Node loadAstNode(std::istream & is)
246
std::u16string host_name = loadString(is);
247
ast::OperationCode op;
248
is.read(reinterpret_cast<char *>(&op), sizeof(op));
250
inout::Token symbol = loadToken(is);
251
inout::Token bound = loadToken(is);
253
ast::Node node(host_name, op, symbol, bound);
255
size_t branch_element_count;
256
is.read(reinterpret_cast<char *>(&branch_element_count), sizeof(branch_element_count));
258
ast::NodeFriend::branches(node).reserve(branch_element_count);
259
for(size_t i=0; i < branch_element_count; ++i)
260
ast::NodeFriend::branches(node).emplace_back(loadAstNode(is));
265
bool storeMarkup(std::ostream & os, const inout::MarkupSymbol & ms)
267
storeString(os,ms.start);
268
storeString(os,ms.end);
269
storeString(os,ms.ignore_sign);
271
inout::LexemeType type = ms.type;
272
os.write(reinterpret_cast<char *>(&type), sizeof(type));
277
inout::MarkupSymbol loadMarkup(std::istream & is)
279
std::u16string start = loadString(is);
280
std::u16string end = loadString(is);
281
std::u16string ignore_sign = loadString(is);
283
inout::LexemeType type;
284
is.read(reinterpret_cast<char *>(&type), sizeof(type));
286
return {start, end, ignore_sign, type};
289
bool storeMask(std::ostream & os, const inout::NumberMask & mask)
291
storeString(os,mask.chars);
293
inout::LexemeType type = mask.type;
294
os.write(reinterpret_cast<const char *>(&type), sizeof(type));
296
inout::number_system_t system = mask.system;
297
os.write(reinterpret_cast<const char *>(&system), sizeof(system));
302
inout::NumberMask loadMask(std::istream &is)
304
std::u16string chars = loadString(is);
306
inout::LexemeType type;
307
is.read(reinterpret_cast<char *>(&type), sizeof(type));
309
inout::number_system_t system;
310
is.read(reinterpret_cast<char *>(&system), sizeof(system));
312
return {chars, type, system};
317
bool saveGrammarDump(const std::string & grammar_file, const Grammar & g)
319
fs::path grammar_dump_path { fs::path(grammar_file).parent_path() / GRAMMAR_DUMP_DIRECTORY };
321
if (!fs::exists(grammar_dump_path) && !fs::create_directory(grammar_dump_path))
324
grammar_dump_path /= fs::path(grammar_file).stem().string() + GRAMMAR_DUMP_EXTENTION;
326
std::ofstream os(grammar_dump_path, std::ios::binary);
331
// LSX_GRAMMAR_DUMP_SIGNATURE
332
storeString(os,GRAMMAR_DUMP_SIGNATURE);
334
os.write(reinterpret_cast<const char *>(&g.build_method), sizeof(g.build_method));
336
size_t markups_count = g.lexical.markups.size();
337
os.write(reinterpret_cast<char *>(&markups_count), sizeof(markups_count));
338
for(const inout::MarkupSymbol & ms : g.lexical.markups)
340
size_t masks_count = g.lexical.masks.size();
341
os.write(reinterpret_cast<char *>(&masks_count), sizeof(masks_count));
342
for(const inout::NumberMask & mask : g.lexical.masks)
344
storeString(os, g.lexical.national_alphabet);
345
storeString(os, g.lexical.id_extra_symbols);
346
uint16_t may_national_letters_use = g.lexical.may_national_letters_use;
347
os.write(reinterpret_cast<char *>(&may_national_letters_use), sizeof(may_national_letters_use));
348
uint16_t may_national_letters_mix = g.lexical.may_national_letters_mix;
349
os.write(reinterpret_cast<char *>(&may_national_letters_mix), sizeof(may_national_letters_mix));
350
uint16_t is_case_sensitive = g.lexical.is_case_sensitive;
351
os.write(reinterpret_cast<char *>(&is_case_sensitive), sizeof(is_case_sensitive));
352
storeString(os, g.lexical.nl_substitution);
354
size_t files_count = g.files.size();
355
os.write(reinterpret_cast<char *>(&files_count), sizeof(files_count));
356
for(const std::string & file_path : g.files)
357
storeString(os, inout::toU16(file_path));
359
size_t handlers_count = g.handlers.size();
360
os.write(reinterpret_cast<char *>(&handlers_count), sizeof(handlers_count));
361
for(const auto & [handler_name,handler_ast] : g.handlers)
363
storeString(os, handler_name);
364
storeAstNode(os, handler_ast);
367
size_t rulesCount = g.rules.size();
368
os.write(reinterpret_cast<char *>(&rulesCount), sizeof(rulesCount));
369
for(const GrammarRule & r : g.rules)
371
storeString(os, r.production);
372
size_t patternCount = r.pattern.size();
373
os.write(reinterpret_cast<char *>(&patternCount), sizeof(patternCount));
374
for(const inout::Lexeme & s:r.pattern)
377
storeAstNode(os, r.reduce_action);
379
RuleReduceDirection direction = r.reduce_direction;
380
os.write(reinterpret_cast<char *>(&direction), sizeof(direction));
383
size_t columnsCount = g.columns.size();
384
os.write(reinterpret_cast<char *>(&columnsCount), sizeof(columnsCount));
385
for(const inout::Lexeme & si : g.columns)
387
// first_compound_index
388
os.write(reinterpret_cast<const char *>(&g.first_compound_index), sizeof(g.first_compound_index));
390
size_t linesCount = g.states.size();
391
os.write(reinterpret_cast<char *>(&linesCount), sizeof(linesCount));
392
for(const FsmState_t & state : g.states)
394
size_t positionsCount = state.size();
395
os.write(reinterpret_cast<char *>(&positionsCount), sizeof(positionsCount));
396
for(const FsmStatePosition & p : state)
398
size_t main = p.is_main ? 1 : 0;
399
size_t rno = p.rule_no;
400
size_t pos = p.position;
402
int next = p.next_state_no;
404
os.write(reinterpret_cast<char *>(&rno), sizeof(rno));
405
os.write(reinterpret_cast<char *>(&pos), sizeof(pos));
406
os.write(reinterpret_cast<char *>(&next), sizeof(next));
407
os.write(reinterpret_cast<char *>(&main), sizeof(main));
411
size_t lookahead_count = p.lookahead.size();
413
os.write(reinterpret_cast<char *>(&lookahead_count), sizeof(lookahead_count));
414
for(const inout::Lexeme & lex : p.lookahead)
415
storeLexeme(os, lex);
419
size_t FSMtableCount = g.parse_table.size();
420
os.write(reinterpret_cast<char *>(&FSMtableCount), sizeof(FSMtableCount));
421
for (auto [key,value] : g.parse_table)
423
os.write(reinterpret_cast<const char *>(&key), sizeof(key));
424
os.write(reinterpret_cast<char *>(&value), sizeof(value));
427
os.write(reinterpret_cast<const char *>(&g.key_bound), sizeof(g.key_bound));
429
os.write(reinterpret_cast<const char *>(&g.value_bound), sizeof(g.value_bound));
431
storeString(os,u"end");
437
bool loadGrammarDump(const std::string & grammar_file, Grammar & g)
439
fs::path grammar_dump_path { fs::path(grammar_file).parent_path()
440
/ GRAMMAR_DUMP_DIRECTORY
441
/ (fs::path(grammar_file).stem().string() + GRAMMAR_DUMP_EXTENTION)
444
std::ifstream is(grammar_dump_path, std::ios::binary);
449
// GRAMMAR_DUMP_SIGNATURE
450
bool success = (loadString(is) == GRAMMAR_DUMP_SIGNATURE);
453
is.read(reinterpret_cast<char *>(&g.build_method), sizeof(g.build_method));
456
size_t markups_count;
457
is.read(reinterpret_cast<char *>(&markups_count), sizeof(markups_count));
458
g.lexical.markups.reserve(markups_count);
459
for(size_t i=0; i < markups_count; ++i)
461
inout::MarkupSymbol ms = loadMarkup(is);
463
g.lexical.markups.push_back(ms);
466
is.read(reinterpret_cast<char *>(&masks_count), sizeof(masks_count));
467
g.lexical.masks.reserve(masks_count);
468
for(size_t i=0; i < masks_count; ++i)
470
inout::NumberMask mask = loadMask(is);
472
g.lexical.masks.push_back(mask);
474
g.lexical.national_alphabet = loadString(is);
475
g.lexical.id_extra_symbols = loadString(is);
476
uint16_t may_national_letters_use;
477
is.read(reinterpret_cast<char *>(&may_national_letters_use), sizeof(may_national_letters_use));
478
g.lexical.may_national_letters_use = (may_national_letters_use != 0);
479
uint16_t may_national_letters_mix;
480
is.read(reinterpret_cast<char *>(&may_national_letters_mix), sizeof(may_national_letters_mix));
481
g.lexical.may_national_letters_mix = (may_national_letters_mix != 0);
482
uint16_t is_case_sensitive;
483
is.read(reinterpret_cast<char *>(&is_case_sensitive), sizeof(is_case_sensitive));
484
g.lexical.is_case_sensitive = (is_case_sensitive != 0);
485
g.lexical.nl_substitution = loadString(is);
489
is.read(reinterpret_cast<char *>(&files_count), sizeof(files_count));
490
g.files.reserve(files_count);
491
for(size_t i=0; i < files_count; ++i)
493
std::string file_path = inout::toU8(loadString(is));
495
g.files.push_back(file_path);
499
size_t handlers_count;
500
is.read(reinterpret_cast<char *>(&handlers_count), sizeof(handlers_count));
501
for(size_t i_handler=0; i_handler < handlers_count; ++i_handler)
503
std::u16string handler_name = loadString(is);
504
ast::Node handler_ast = loadAstNode(is);
506
g.handlers.insert({handler_name, handler_ast});
511
is.read(reinterpret_cast<char *>(&rulesCount), sizeof(rulesCount));
512
g.rules.reserve(rulesCount);
513
for(size_t iRule=0; iRule < rulesCount; ++iRule)
515
std::u16string production = loadString(is);
518
is.read(reinterpret_cast<char *>(&patternCount), sizeof(patternCount));
520
std::vector<inout::Lexeme> pattern;
521
pattern.reserve(patternCount);
522
for(size_t iPat=0; iPat < patternCount; ++iPat)
523
pattern.emplace_back(loadLexeme(is));
525
ast::Node node = loadAstNode(is);
527
RuleReduceDirection direction;
528
is.read(reinterpret_cast<char *>(&direction), sizeof(direction));
530
g.rules.emplace_back(production, pattern, node, direction);
535
is.read(reinterpret_cast<char *>(&columnsCount), sizeof(columnsCount));
536
g.columns.reserve(columnsCount);
537
for(size_t iCol=0; iCol < columnsCount; ++iCol)
538
g.columns.emplace_back(loadLexeme(is));
540
// first_compound_index
541
is.read(reinterpret_cast<char *>(&g.first_compound_index), sizeof(g.first_compound_index));
545
is.read(reinterpret_cast<char *>(&linesCount), sizeof(linesCount));
546
g.states.reserve(linesCount);
547
for(size_t iLine=0; iLine < linesCount; ++iLine)
551
size_t positionsCount;
552
is.read(reinterpret_cast<char *>(&positionsCount), sizeof(positionsCount));
553
line.reserve(positionsCount);
554
for(size_t iPos=0; iPos < positionsCount; ++iPos)
561
std::set<inout::Lexeme> lookahead;
563
is.read(reinterpret_cast<char *>(&rno), sizeof(rno));
564
is.read(reinterpret_cast<char *>(&pos), sizeof(pos));
565
is.read(reinterpret_cast<char *>(&next), sizeof(next));
566
is.read(reinterpret_cast<char *>(&main), sizeof(main));
568
size_t lookahead_count;
569
is.read(reinterpret_cast<char *>(&lookahead_count), sizeof(lookahead_count));
570
for(size_t i=0; i < lookahead_count; ++i)
572
inout::Lexeme lex = loadLexeme(is);
573
lookahead.insert(lex);
575
line.emplace_back(rno,pos,lookahead,main,next);
577
g.states.emplace_back(line);
581
size_t FSMtableCount;
582
is.read(reinterpret_cast<char *>(&FSMtableCount), sizeof(FSMtableCount));
583
for(size_t iCount=0; iCount < FSMtableCount; ++iCount)
588
is.read(reinterpret_cast<char *>(&key), sizeof(key));
589
is.read(reinterpret_cast<char *>(&value), sizeof(value));
591
g.parse_table.emplace(key,value);
595
is.read(reinterpret_cast<char *>(&g.key_bound), sizeof(g.key_bound));
598
is.read(reinterpret_cast<char *>(&g.value_bound), sizeof(g.value_bound));
601
std::u16string end = loadString(is);
607
fillLexemeParameters(g);