loom

Форк
0
/
Grammar.cpp 
612 строк · 21.3 Кб
1
/*
2
MIT License
3

4
Copyright (c) 2021 МГТУ им. Н.Э. Баумана, кафедра ИУ-6, Михаил Фетисов,
5

6
https://bmstu.codes/lsx/simodo
7
*/
8

9
#include "simodo/parser/Grammar.h"
10
#include "simodo/ast/NodeFriend.h"
11
#include "simodo/inout/convert/functions.h"
12

13
#include <fstream>
14
#include <cassert>
15

16
#if __cplusplus >= __cpp_2017
17
#include <filesystem>
18
namespace fs = std::filesystem;
19
#else
20
#include <experimental/filesystem>
21
namespace fs = std::filesystem::experimental;
22
#endif
23

24
namespace simodo::parser
25
{
26

27
    size_t Grammar::getColumnIndex(const inout::Lexeme &lexeme) const
28
    {
29
        /// \attention Этот метод используется до заполнения индексов terminal_symbol_index и compound_symbol_index!
30

31
        assert(first_compound_index > 0 && first_compound_index < columns.size());
32

33
        size_t i   = (lexeme.type() != inout::LexemeType::Compound) ? 0 : first_compound_index;
34
        size_t end = (lexeme.type() != inout::LexemeType::Compound) ? first_compound_index : columns.size();
35

36
        for(; i < end; ++i)
37
            if (lexeme == columns[i])
38
                return i;
39

40
        return columns.size();
41
    }
42

43
    size_t Grammar::getTerminalColumnIndex(const inout::Lexeme &lexeme) const
44
    {
45
        assert(first_compound_index > 0 && first_compound_index < columns.size());
46

47
        for(size_t i=0; i < first_compound_index; ++i)
48
        {
49
            if (lexeme.type() == inout::LexemeType::Id && columns[i].type() == inout::LexemeType::Id)
50
                return i;
51
            if (lexeme.type() == inout::LexemeType::Number && columns[i].type() == inout::LexemeType::Number)
52
                return i;
53
            if (lexeme.type() == inout::LexemeType::Comment && columns[i].type() == inout::LexemeType::Comment)
54
                return i;
55
            if (lexeme.type() == inout::LexemeType::Annotation && columns[i].type() == inout::LexemeType::Annotation)
56
                return i;
57
            if (lexeme == columns[i])
58
                return i;
59
        }
60

61
        return columns.size();
62
    }
63

64
    size_t Grammar::getCompoundColumnIndex(const std::u16string &str) const
65
    {
66
        assert(first_compound_index > 0 && first_compound_index < columns.size());
67

68
        for(size_t i=first_compound_index; i < columns.size(); ++i)
69
            if (str == columns[i].lexeme())
70
                return i;
71

72
        return columns.size();
73
    }
74

75
    std::u16string getFsmActionChar(FsmActionType action)
76
    {
77
        switch(action)
78
        {
79
        case FsmActionType::Error:
80
            return u"E";
81
        case FsmActionType::Shift:
82
            return u"S";
83
        case FsmActionType::Reduce:
84
            return u"R";
85
        case FsmActionType::Acceptance:
86
            return u"A";
87
        }
88
        return u"*";
89
    }
90

91
    const char * getGrammarBuilderMethodName(TableBuildMethod method)
92
    {
93
        switch(method)
94
        {
95
        case TableBuildMethod::SLR:
96
            return "SLR";
97
        case TableBuildMethod::LR1:
98
            return "LR1";
99
    //    case TableBuildMethod::LALR:
100
    //        return u"LALR";
101
        case TableBuildMethod::none:
102
            return "none";
103
        default:
104
            return "*****";
105
        }
106
    }
107

108
    bool fillLexemeParameters(parser::Grammar &g)
109
    {
110
        assert(!g.columns.empty());
111
        assert(g.first_compound_index < g.columns.size());
112

113
        std::u16string punct;
114

115
        for(size_t i=0; i < g.first_compound_index; ++i)
116
        {
117
            if (g.columns[i].type() != inout::LexemeType::Punctuation)
118
                continue;
119

120
            const std::u16string & lexeme = g.columns[i].lexeme();
121

122
            if (lexeme.size() == 1)
123
                punct += lexeme;
124
            else if (!lexeme.empty())
125
                g.lexical.punctuation_words.emplace_back(lexeme);
126
        }
127

128
        g.lexical.punctuation_chars = punct;
129

130
        return true;
131
    }
132

133
    namespace
134
    {
135
        const std::u16string GRAMMAR_DUMP_SIGNATURE   = u"SIMODO.loom.grammar.dump.v03";
136
        const size_t         REASONABLE_STRING_LENGTH = 2000;
137
        const size_t         MAX_STRING_LENGTH        = REASONABLE_STRING_LENGTH * 2;
138

139
        const std::string    GRAMMAR_DUMP_DIRECTORY   = "dump";
140
        const std::string    GRAMMAR_DUMP_EXTENTION   = ".simodo-grammar-dump";
141

142
        bool storeString(std::ostream &os, const std::u16string &s)
143
        {
144
            size_t  len = s.size()*sizeof(s[0]);
145

146
            assert(len < MAX_STRING_LENGTH);
147

148
            os.write(reinterpret_cast<char *>(&len), sizeof(len));
149

150
            if (len > 0)
151
                os.write(reinterpret_cast<const char *>(s.c_str()), static_cast<std::streamsize>(len));
152

153
            return true;
154
        }
155

156
        std::u16string loadString(std::istream &is)
157
        {
158
            size_t  len;
159
            is.read(reinterpret_cast<char *>(&len), sizeof(len));
160

161
            assert(len <= MAX_STRING_LENGTH);
162

163
            char16_t b[MAX_STRING_LENGTH+1];
164

165
            if (len > 0)
166
                is.read(reinterpret_cast<char *>(b), static_cast<std::streamsize>(len));
167

168
            b[len/sizeof(b[0])] = 0;
169

170
            return std::u16string(b);
171
        }
172

173
        bool storeLexeme(std::ostream & os, const inout::Lexeme & s)
174
        {
175
            storeString(os, s.lexeme());
176

177
            inout::LexemeType type = s.type();
178
            os.write(reinterpret_cast<char *>(&type), sizeof(type));
179

180
            return true;
181
        }
182

183
        inout::Lexeme loadLexeme(std::istream &is)
184
        {
185
            std::u16string    value(loadString(is));
186
            inout::LexemeType type;
187

188
            is.read(reinterpret_cast<char *>(&type), sizeof(type));
189

190
            return {value.c_str(), type};
191
        }
192

193
        bool storeToken(std::ostream &os, const inout::Token &t)
194
        {
195
            storeLexeme(os, t);
196
            storeString(os, t.token());
197

198
            const inout::Range range = t.location().range();
199
            os.write(reinterpret_cast<const char *>(&range), sizeof(range));
200
            const inout::uri_index_t uri_index = t.location().uri_index();
201
            os.write(reinterpret_cast<const char *>(&uri_index), sizeof(uri_index));
202

203
            inout::TokenQualification q = t.qualification();
204
            os.write(reinterpret_cast<const char *>(&q), sizeof(q));
205

206
            return true;
207
        }
208

209
        inout::Token loadToken(std::istream & is)
210
        {
211
            inout::Lexeme       lexeme       = loadLexeme(is);
212
            std::u16string      token_string = loadString(is);
213
            inout::Range        range;
214
            inout::uri_index_t  uri_index;
215

216
            is.read(reinterpret_cast<char *>(&range), sizeof(range));
217
            is.read(reinterpret_cast<char *>(&uri_index), sizeof(uri_index));
218

219
            inout::TokenLocation loc(uri_index,range);
220

221
            inout::TokenQualification q;
222
            is.read(reinterpret_cast<char *>(&q), sizeof(q));
223

224
            return { lexeme, token_string, loc, q };
225
        }
226

227
        bool storeAstNode(std::ostream & os, const ast::Node & node)
228
        {
229
            storeString(os, node.host());
230
            ast::OperationCode op = node.operation();
231
            os.write(reinterpret_cast<const char *>(&op), sizeof(op));
232

233
            storeToken(os, node.token());
234
            storeToken(os, node.bound());
235

236
            size_t branch_element_count = node.branches().size();
237
            os.write(reinterpret_cast<const char *>(&branch_element_count), sizeof(branch_element_count));
238
            for(const ast::Node & bn : node.branches())
239
                storeAstNode(os, bn);
240

241
            return true;
242
        }
243

244
        ast::Node loadAstNode(std::istream & is)
245
        {
246
            std::u16string host_name = loadString(is);
247
            ast::OperationCode op;
248
            is.read(reinterpret_cast<char *>(&op), sizeof(op));
249

250
            inout::Token symbol = loadToken(is);
251
            inout::Token bound = loadToken(is);
252

253
            ast::Node node(host_name, op, symbol, bound);
254

255
            size_t branch_element_count;
256
            is.read(reinterpret_cast<char *>(&branch_element_count), sizeof(branch_element_count));
257

258
            ast::NodeFriend::branches(node).reserve(branch_element_count);
259
            for(size_t i=0; i < branch_element_count; ++i)
260
                ast::NodeFriend::branches(node).emplace_back(loadAstNode(is));
261

262
            return node;
263
        }
264

265
        bool storeMarkup(std::ostream & os, const inout::MarkupSymbol & ms)
266
        {
267
            storeString(os,ms.start);
268
            storeString(os,ms.end);
269
            storeString(os,ms.ignore_sign);
270

271
            inout::LexemeType  type = ms.type;
272
            os.write(reinterpret_cast<char *>(&type), sizeof(type));
273

274
            return true;
275
        }
276

277
        inout::MarkupSymbol loadMarkup(std::istream & is)
278
        {
279
            std::u16string start        = loadString(is);
280
            std::u16string end          = loadString(is);
281
            std::u16string ignore_sign  = loadString(is);
282

283
            inout::LexemeType type;
284
            is.read(reinterpret_cast<char *>(&type), sizeof(type));
285

286
            return {start, end, ignore_sign, type};
287
        }
288

289
        bool storeMask(std::ostream & os, const inout::NumberMask & mask)
290
        {
291
            storeString(os,mask.chars);
292

293
            inout::LexemeType type = mask.type;
294
            os.write(reinterpret_cast<const char *>(&type), sizeof(type));
295

296
            inout::number_system_t system = mask.system;
297
            os.write(reinterpret_cast<const char *>(&system), sizeof(system));
298

299
            return true;
300
        }
301

302
        inout::NumberMask loadMask(std::istream &is)
303
        {
304
            std::u16string chars        = loadString(is);
305

306
            inout::LexemeType type;
307
            is.read(reinterpret_cast<char *>(&type), sizeof(type));
308

309
            inout::number_system_t system;
310
            is.read(reinterpret_cast<char *>(&system), sizeof(system));
311

312
            return {chars, type, system};
313
        }
314

315
    }
316

317
    bool saveGrammarDump(const std::string & grammar_file, const Grammar & g)
318
    {
319
        fs::path grammar_dump_path { fs::path(grammar_file).parent_path() / GRAMMAR_DUMP_DIRECTORY };
320

321
        if (!fs::exists(grammar_dump_path) && !fs::create_directory(grammar_dump_path)) 
322
            return false;
323

324
        grammar_dump_path /= fs::path(grammar_file).stem().string() + GRAMMAR_DUMP_EXTENTION;
325

326
        std::ofstream os(grammar_dump_path, std::ios::binary);
327

328
        if (!os)
329
            return false;
330

331
        // LSX_GRAMMAR_DUMP_SIGNATURE
332
        storeString(os,GRAMMAR_DUMP_SIGNATURE);
333
        // build_method
334
        os.write(reinterpret_cast<const char *>(&g.build_method), sizeof(g.build_method));
335
        // lexical
336
        size_t markups_count = g.lexical.markups.size();
337
        os.write(reinterpret_cast<char *>(&markups_count), sizeof(markups_count));
338
        for(const inout::MarkupSymbol & ms : g.lexical.markups)
339
            storeMarkup(os, ms);
340
        size_t masks_count = g.lexical.masks.size();
341
        os.write(reinterpret_cast<char *>(&masks_count), sizeof(masks_count));
342
        for(const inout::NumberMask & mask : g.lexical.masks)
343
            storeMask(os, mask);
344
        storeString(os, g.lexical.national_alphabet);
345
        storeString(os, g.lexical.id_extra_symbols);
346
        uint16_t may_national_letters_use = g.lexical.may_national_letters_use;
347
        os.write(reinterpret_cast<char *>(&may_national_letters_use), sizeof(may_national_letters_use));
348
        uint16_t may_national_letters_mix = g.lexical.may_national_letters_mix;
349
        os.write(reinterpret_cast<char *>(&may_national_letters_mix), sizeof(may_national_letters_mix));
350
        uint16_t is_case_sensitive = g.lexical.is_case_sensitive;
351
        os.write(reinterpret_cast<char *>(&is_case_sensitive), sizeof(is_case_sensitive));
352
        storeString(os, g.lexical.nl_substitution);
353
        // files
354
        size_t files_count = g.files.size();
355
        os.write(reinterpret_cast<char *>(&files_count), sizeof(files_count));
356
        for(const std::string & file_path : g.files)
357
            storeString(os, inout::toU16(file_path));
358
        // handlers
359
        size_t handlers_count = g.handlers.size();
360
        os.write(reinterpret_cast<char *>(&handlers_count), sizeof(handlers_count));
361
        for(const auto & [handler_name,handler_ast] : g.handlers)
362
        {
363
            storeString(os, handler_name);
364
            storeAstNode(os, handler_ast);
365
        }
366
        // rules
367
        size_t rulesCount = g.rules.size();
368
        os.write(reinterpret_cast<char *>(&rulesCount), sizeof(rulesCount));
369
        for(const GrammarRule & r : g.rules)
370
        {
371
            storeString(os, r.production);
372
            size_t patternCount = r.pattern.size();
373
            os.write(reinterpret_cast<char *>(&patternCount), sizeof(patternCount));
374
            for(const inout::Lexeme & s:r.pattern)
375
                storeLexeme(os, s);
376

377
            storeAstNode(os, r.reduce_action);
378

379
            RuleReduceDirection direction = r.reduce_direction;
380
            os.write(reinterpret_cast<char *>(&direction), sizeof(direction));
381
        }
382
        // columns
383
        size_t columnsCount = g.columns.size();
384
        os.write(reinterpret_cast<char *>(&columnsCount), sizeof(columnsCount));
385
        for(const inout::Lexeme & si : g.columns)
386
            storeLexeme(os, si);
387
        // first_compound_index
388
        os.write(reinterpret_cast<const char *>(&g.first_compound_index), sizeof(g.first_compound_index));
389
        // states
390
        size_t linesCount = g.states.size();
391
        os.write(reinterpret_cast<char *>(&linesCount), sizeof(linesCount));
392
        for(const FsmState_t & state : g.states)
393
        {
394
            size_t positionsCount = state.size();
395
            os.write(reinterpret_cast<char *>(&positionsCount), sizeof(positionsCount));
396
            for(const FsmStatePosition & p : state)
397
            {
398
                size_t main = p.is_main ? 1 : 0;
399
                size_t rno = p.rule_no;
400
                size_t pos = p.position;
401

402
                int next = p.next_state_no;
403

404
                os.write(reinterpret_cast<char *>(&rno), sizeof(rno));
405
                os.write(reinterpret_cast<char *>(&pos), sizeof(pos));
406
                os.write(reinterpret_cast<char *>(&next), sizeof(next));
407
                os.write(reinterpret_cast<char *>(&main), sizeof(main));
408

409
                // lookahead
410

411
                size_t lookahead_count = p.lookahead.size();
412

413
                os.write(reinterpret_cast<char *>(&lookahead_count), sizeof(lookahead_count));
414
                for(const inout::Lexeme & lex : p.lookahead)
415
                    storeLexeme(os, lex);
416
            }
417
        }
418
        // parse_table
419
        size_t FSMtableCount = g.parse_table.size();
420
        os.write(reinterpret_cast<char *>(&FSMtableCount), sizeof(FSMtableCount));
421
        for (auto [key,value] : g.parse_table)
422
        {
423
            os.write(reinterpret_cast<const char *>(&key), sizeof(key));
424
            os.write(reinterpret_cast<char *>(&value), sizeof(value));
425
        }
426
        // key_bound
427
        os.write(reinterpret_cast<const char *>(&g.key_bound), sizeof(g.key_bound));
428
        // value_bound
429
        os.write(reinterpret_cast<const char *>(&g.value_bound), sizeof(g.value_bound));
430
        // "end"
431
        storeString(os,u"end");
432

433
        os.close();
434
        return true;
435
    }
436

437
    bool loadGrammarDump(const std::string & grammar_file, Grammar & g)
438
    {
439
        fs::path grammar_dump_path  {   fs::path(grammar_file).parent_path() 
440
                                        / GRAMMAR_DUMP_DIRECTORY 
441
                                        / (fs::path(grammar_file).stem().string() + GRAMMAR_DUMP_EXTENTION)
442
                                    };
443

444
        std::ifstream is(grammar_dump_path, std::ios::binary);
445

446
        if (!is)
447
            return false;
448

449
        // GRAMMAR_DUMP_SIGNATURE
450
        bool success = (loadString(is) == GRAMMAR_DUMP_SIGNATURE);
451

452
        // build_method
453
        is.read(reinterpret_cast<char *>(&g.build_method), sizeof(g.build_method));
454

455
        // lexical
456
        size_t markups_count;
457
        is.read(reinterpret_cast<char *>(&markups_count), sizeof(markups_count));
458
        g.lexical.markups.reserve(markups_count);
459
        for(size_t i=0; i < markups_count; ++i)
460
        {
461
            inout::MarkupSymbol ms = loadMarkup(is);
462

463
            g.lexical.markups.push_back(ms);
464
        }
465
        size_t masks_count;
466
        is.read(reinterpret_cast<char *>(&masks_count), sizeof(masks_count));
467
        g.lexical.masks.reserve(masks_count);
468
        for(size_t i=0; i < masks_count; ++i)
469
        {
470
            inout::NumberMask mask = loadMask(is);
471

472
            g.lexical.masks.push_back(mask);
473
        }
474
        g.lexical.national_alphabet = loadString(is);
475
        g.lexical.id_extra_symbols = loadString(is);
476
        uint16_t may_national_letters_use;
477
        is.read(reinterpret_cast<char *>(&may_national_letters_use), sizeof(may_national_letters_use));
478
        g.lexical.may_national_letters_use = (may_national_letters_use != 0);
479
        uint16_t may_national_letters_mix;
480
        is.read(reinterpret_cast<char *>(&may_national_letters_mix), sizeof(may_national_letters_mix));
481
        g.lexical.may_national_letters_mix = (may_national_letters_mix != 0);
482
        uint16_t is_case_sensitive;
483
        is.read(reinterpret_cast<char *>(&is_case_sensitive), sizeof(is_case_sensitive));
484
        g.lexical.is_case_sensitive = (is_case_sensitive != 0);
485
        g.lexical.nl_substitution = loadString(is);
486

487
        // files
488
        size_t files_count;
489
        is.read(reinterpret_cast<char *>(&files_count), sizeof(files_count));
490
        g.files.reserve(files_count);
491
        for(size_t i=0; i < files_count; ++i)
492
        {
493
            std::string file_path = inout::toU8(loadString(is));
494

495
            g.files.push_back(file_path);
496
        }
497

498
        // handlers
499
        size_t handlers_count;
500
        is.read(reinterpret_cast<char *>(&handlers_count), sizeof(handlers_count));
501
        for(size_t i_handler=0; i_handler < handlers_count; ++i_handler)
502
        {
503
            std::u16string handler_name = loadString(is);
504
            ast::Node      handler_ast  = loadAstNode(is);
505

506
            g.handlers.insert({handler_name, handler_ast});
507
        }
508

509
        // rules
510
        size_t rulesCount;
511
        is.read(reinterpret_cast<char *>(&rulesCount), sizeof(rulesCount));
512
        g.rules.reserve(rulesCount);
513
        for(size_t iRule=0; iRule < rulesCount; ++iRule)
514
        {
515
            std::u16string production = loadString(is);
516

517
            size_t patternCount;
518
            is.read(reinterpret_cast<char *>(&patternCount), sizeof(patternCount));
519

520
            std::vector<inout::Lexeme> pattern;
521
            pattern.reserve(patternCount);
522
            for(size_t iPat=0; iPat < patternCount; ++iPat)
523
                pattern.emplace_back(loadLexeme(is));
524

525
            ast::Node node = loadAstNode(is);
526

527
            RuleReduceDirection direction;
528
            is.read(reinterpret_cast<char *>(&direction), sizeof(direction));
529

530
            g.rules.emplace_back(production, pattern, node, direction);
531
        }
532

533
        // columns
534
        size_t columnsCount;
535
        is.read(reinterpret_cast<char *>(&columnsCount), sizeof(columnsCount));
536
        g.columns.reserve(columnsCount);
537
        for(size_t iCol=0; iCol < columnsCount; ++iCol)
538
            g.columns.emplace_back(loadLexeme(is));
539

540
        // first_compound_index
541
        is.read(reinterpret_cast<char *>(&g.first_compound_index), sizeof(g.first_compound_index));
542

543
        // states
544
        size_t linesCount;
545
        is.read(reinterpret_cast<char *>(&linesCount), sizeof(linesCount));
546
        g.states.reserve(linesCount);
547
        for(size_t iLine=0; iLine < linesCount; ++iLine)
548
        {
549
            FsmState_t line;
550

551
            size_t positionsCount;
552
            is.read(reinterpret_cast<char *>(&positionsCount), sizeof(positionsCount));
553
            line.reserve(positionsCount);
554
            for(size_t iPos=0; iPos < positionsCount; ++iPos)
555
            {
556
                size_t rno;
557
                size_t pos;
558
                int    next;
559
                size_t main;
560

561
                std::set<inout::Lexeme> lookahead;
562

563
                is.read(reinterpret_cast<char *>(&rno), sizeof(rno));
564
                is.read(reinterpret_cast<char *>(&pos), sizeof(pos));
565
                is.read(reinterpret_cast<char *>(&next), sizeof(next));
566
                is.read(reinterpret_cast<char *>(&main), sizeof(main));
567
                // lookahead
568
                size_t lookahead_count;
569
                is.read(reinterpret_cast<char *>(&lookahead_count), sizeof(lookahead_count));
570
                for(size_t i=0; i < lookahead_count; ++i)
571
                {
572
                    inout::Lexeme lex = loadLexeme(is);
573
                    lookahead.insert(lex);
574
                }
575
                line.emplace_back(rno,pos,lookahead,main,next);
576
            }
577
            g.states.emplace_back(line);
578
        }
579

580
        // parse_table
581
        size_t FSMtableCount;
582
        is.read(reinterpret_cast<char *>(&FSMtableCount), sizeof(FSMtableCount));
583
        for(size_t iCount=0; iCount < FSMtableCount; ++iCount)
584
        {
585
            Fsm_key_t   key;
586
            Fsm_value_t value;
587

588
            is.read(reinterpret_cast<char *>(&key), sizeof(key));
589
            is.read(reinterpret_cast<char *>(&value), sizeof(value));
590

591
            g.parse_table.emplace(key,value);
592
        }
593

594
        // key_bound
595
        is.read(reinterpret_cast<char *>(&g.key_bound), sizeof(g.key_bound));
596

597
        // value_bound
598
        is.read(reinterpret_cast<char *>(&g.value_bound), sizeof(g.value_bound));
599

600
        // "end"
601
        std::u16string end = loadString(is);
602
        if (end == u"end")
603
            success = true;
604

605
        is.close();
606

607
        fillLexemeParameters(g);
608

609
        return success;
610
    }
611

612
}

Использование cookies

Мы используем файлы cookie в соответствии с Политикой конфиденциальности и Политикой использования cookies.

Нажимая кнопку «Принимаю», Вы даете АО «СберТех» согласие на обработку Ваших персональных данных в целях совершенствования нашего веб-сайта и Сервиса GitVerse, а также повышения удобства их использования.

Запретить использование cookies Вы можете самостоятельно в настройках Вашего браузера.