From a5e39d25125c503fa6f76571e4c62454876b9584 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 27 Dec 2013 17:31:08 -0800 Subject: [PATCH] Start work on lexing --- TreeSitter.xcodeproj/project.pbxproj | 32 +- include/runtime.h | 33 +- spec/compiler/generate_parsers.cpp | 6 +- spec/compiler/lr/parse_table_builder_spec.cpp | 43 --- spec/compiler/lr/table_builder_spec.cpp | 61 ++++ spec/compiler/spec_helper.cpp | 17 +- spec/compiler/spec_helper.h | 3 +- spec/fixtures/parsers/arithmetic.c | 313 ++++++++++-------- src/compiler/char_match.cpp | 4 +- src/compiler/char_match.h | 13 + src/compiler/code_gen/c_code.cpp | 140 ++++++-- src/compiler/code_gen/c_code.h | 3 +- src/compiler/lr/lex_table.cpp | 67 ++++ src/compiler/lr/lex_table.h | 72 ++++ src/compiler/lr/parse_table.cpp | 12 +- src/compiler/lr/parse_table.h | 33 +- src/compiler/lr/parse_table_builder.cpp | 78 ----- src/compiler/lr/table_builder.cpp | 115 +++++++ ...{parse_table_builder.h => table_builder.h} | 3 +- src/runtime/parser.c | 5 +- 20 files changed, 719 insertions(+), 334 deletions(-) delete mode 100644 spec/compiler/lr/parse_table_builder_spec.cpp create mode 100644 spec/compiler/lr/table_builder_spec.cpp create mode 100644 src/compiler/lr/lex_table.cpp create mode 100644 src/compiler/lr/lex_table.h delete mode 100644 src/compiler/lr/parse_table_builder.cpp create mode 100644 src/compiler/lr/table_builder.cpp rename src/compiler/lr/{parse_table_builder.h => table_builder.h} (66%) diff --git a/TreeSitter.xcodeproj/project.pbxproj b/TreeSitter.xcodeproj/project.pbxproj index da97714c..e0300fb3 100644 --- a/TreeSitter.xcodeproj/project.pbxproj +++ b/TreeSitter.xcodeproj/project.pbxproj @@ -19,14 +19,14 @@ 1251209B1830145300C9B56A /* rule.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1251209A1830145300C9B56A /* rule.cpp */; }; 125120A018307DEC00C9B56A /* parse_table.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1251209E18307DEC00C9B56A /* parse_table.cpp */; }; 125120A4183083BD00C9B56A /* arithmetic.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 125120A3183083BD00C9B56A /* arithmetic.cpp */; }; - 129D242C183EB1EB00FE9F71 /* parse_table_builder.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 129D242A183EB1EB00FE9F71 /* parse_table_builder.cpp */; }; + 129D242C183EB1EB00FE9F71 /* table_builder.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 129D242A183EB1EB00FE9F71 /* table_builder.cpp */; }; 12D136A4183678A2005F3369 /* repeat.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 12D136A2183678A2005F3369 /* repeat.cpp */; }; 12F9A64E182DD5FD00FAF50C /* spec_helper.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 12F9A64C182DD5FD00FAF50C /* spec_helper.cpp */; }; 12F9A651182DD6BC00FAF50C /* grammar.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 12F9A64F182DD6BC00FAF50C /* grammar.cpp */; }; 12FD4061185E68470041A84E /* c_code.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 12FD405F185E68470041A84E /* c_code.cpp */; }; 12FD4064185E75290041A84E /* generate_parsers.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 12FD4063185E75290041A84E /* generate_parsers.cpp */; }; 12FD40B3185EEB5E0041A84E /* seq.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 12130612182C3A1700FCF928 /* seq.cpp */; }; - 12FD40B4185EEB5E0041A84E /* parse_table_builder.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 129D242A183EB1EB00FE9F71 /* parse_table_builder.cpp */; }; + 12FD40B4185EEB5E0041A84E /* table_builder.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 129D242A183EB1EB00FE9F71 /* table_builder.cpp */; }; 12FD40B6185EEB5E0041A84E /* arithmetic.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 125120A3183083BD00C9B56A /* arithmetic.cpp */; }; 12FD40B8185EEB5E0041A84E /* item.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 12130619182C84DF00FCF928 /* item.cpp */; }; 12FD40B9185EEB5E0041A84E /* string.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 12130615182C3D2900FCF928 /* string.cpp */; }; @@ -44,7 +44,7 @@ 12FD40CB185EEB5E0041A84E /* pattern.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 27A340F3EEB184C040521323 /* pattern.cpp */; }; 12FD40D2185EEB970041A84E /* arithmetic.c in Sources */ = {isa = PBXBuildFile; fileRef = 12FD4065185E7C2F0041A84E /* arithmetic.c */; }; 12FD40D5185FEEDB0041A84E /* item_set_spec.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1213061D182C857100FCF928 /* item_set_spec.cpp */; }; - 12FD40D6185FEEDB0041A84E /* parse_table_builder_spec.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 12512092182F307C00C9B56A /* parse_table_builder_spec.cpp */; }; + 12FD40D6185FEEDB0041A84E /* table_builder_spec.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 12512092182F307C00C9B56A /* table_builder_spec.cpp */; }; 12FD40D7185FEEDB0041A84E /* item_spec.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 12D1369C18328C5A005F3369 /* item_spec.cpp */; }; 12FD40D8185FEEDF0041A84E /* rules_spec.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 121492EA181E200B008E9BDA /* rules_spec.cpp */; }; 12FD40D9185FEEDF0041A84E /* pattern_spec.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 12D136A0183570F5005F3369 /* pattern_spec.cpp */; }; @@ -59,6 +59,8 @@ 12FD40F01866415D0041A84E /* visitor.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 12FD40E618639B910041A84E /* visitor.cpp */; }; 12FD40F3186641C00041A84E /* char_match.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 12FD40F1186641C00041A84E /* char_match.cpp */; }; 12FD40F4186641C00041A84E /* char_match.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 12FD40F1186641C00041A84E /* char_match.cpp */; }; + 12FD40F7186A16020041A84E /* lex_table.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 12FD40F5186A16020041A84E /* lex_table.cpp */; }; + 12FD40F8186A16030041A84E /* lex_table.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 12FD40F5186A16020041A84E /* lex_table.cpp */; }; 27A343CA69E17E0F9EBEDF1C /* pattern.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 27A340F3EEB184C040521323 /* pattern.cpp */; }; /* End PBXBuildFile section */ @@ -104,14 +106,14 @@ 12130621182C85D300FCF928 /* item_set.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = item_set.h; sourceTree = ""; }; 121492E9181E200B008E9BDA /* main.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = main.cpp; path = spec/main.cpp; sourceTree = SOURCE_ROOT; }; 121492EA181E200B008E9BDA /* rules_spec.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = rules_spec.cpp; path = spec/compiler/rules/rules_spec.cpp; sourceTree = SOURCE_ROOT; }; - 12512092182F307C00C9B56A /* parse_table_builder_spec.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = parse_table_builder_spec.cpp; path = spec/compiler/lr/parse_table_builder_spec.cpp; sourceTree = SOURCE_ROOT; }; + 12512092182F307C00C9B56A /* table_builder_spec.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = table_builder_spec.cpp; path = spec/compiler/lr/table_builder_spec.cpp; sourceTree = SOURCE_ROOT; }; 1251209A1830145300C9B56A /* rule.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = rule.cpp; sourceTree = ""; }; 1251209E18307DEC00C9B56A /* parse_table.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = parse_table.cpp; sourceTree = ""; }; 1251209F18307DEC00C9B56A /* parse_table.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = parse_table.h; sourceTree = ""; }; 125120A218307FFD00C9B56A /* arithmetic.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = arithmetic.h; path = spec/fixtures/grammars/arithmetic.h; sourceTree = SOURCE_ROOT; }; 125120A3183083BD00C9B56A /* arithmetic.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = arithmetic.cpp; path = spec/fixtures/grammars/arithmetic.cpp; sourceTree = SOURCE_ROOT; }; - 129D242A183EB1EB00FE9F71 /* parse_table_builder.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = parse_table_builder.cpp; sourceTree = ""; }; - 129D242B183EB1EB00FE9F71 /* parse_table_builder.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = parse_table_builder.h; sourceTree = ""; }; + 129D242A183EB1EB00FE9F71 /* table_builder.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = table_builder.cpp; sourceTree = ""; }; + 129D242B183EB1EB00FE9F71 /* table_builder.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = table_builder.h; sourceTree = ""; }; 12C344421822F27700B07BE3 /* transition_map.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = transition_map.h; sourceTree = ""; }; 12D1369C18328C5A005F3369 /* item_spec.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = item_spec.cpp; path = spec/compiler/lr/item_spec.cpp; sourceTree = SOURCE_ROOT; }; 12D1369E18342088005F3369 /* todo.md */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = todo.md; sourceTree = ""; }; @@ -140,6 +142,8 @@ 12FD40E818641FB70041A84E /* rules.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = rules.cpp; sourceTree = ""; }; 12FD40F1186641C00041A84E /* char_match.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = char_match.cpp; sourceTree = ""; }; 12FD40F2186641C00041A84E /* char_match.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = char_match.h; sourceTree = ""; }; + 12FD40F5186A16020041A84E /* lex_table.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = lex_table.cpp; sourceTree = ""; }; + 12FD40F6186A16020041A84E /* lex_table.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = lex_table.h; sourceTree = ""; }; 27A340F3EEB184C040521323 /* pattern.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = pattern.cpp; sourceTree = ""; }; 27A3438C4FA59A3882E8493B /* pattern.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = pattern.h; sourceTree = ""; }; /* End PBXFileReference section */ @@ -200,10 +204,12 @@ 1213061A182C84DF00FCF928 /* item.h */, 12130620182C85D300FCF928 /* item_set.cpp */, 12130621182C85D300FCF928 /* item_set.h */, + 12FD40F5186A16020041A84E /* lex_table.cpp */, + 12FD40F6186A16020041A84E /* lex_table.h */, 1251209E18307DEC00C9B56A /* parse_table.cpp */, 1251209F18307DEC00C9B56A /* parse_table.h */, - 129D242A183EB1EB00FE9F71 /* parse_table_builder.cpp */, - 129D242B183EB1EB00FE9F71 /* parse_table_builder.h */, + 129D242A183EB1EB00FE9F71 /* table_builder.cpp */, + 129D242B183EB1EB00FE9F71 /* table_builder.h */, ); path = lr; sourceTree = ""; @@ -212,7 +218,7 @@ isa = PBXGroup; children = ( 1213061D182C857100FCF928 /* item_set_spec.cpp */, - 12512092182F307C00C9B56A /* parse_table_builder_spec.cpp */, + 12512092182F307C00C9B56A /* table_builder_spec.cpp */, 12D1369C18328C5A005F3369 /* item_spec.cpp */, ); name = lr; @@ -427,7 +433,7 @@ 12FD40D7185FEEDB0041A84E /* item_spec.cpp in Sources */, 12FD40D5185FEEDB0041A84E /* item_set_spec.cpp in Sources */, 12130614182C3A1700FCF928 /* seq.cpp in Sources */, - 129D242C183EB1EB00FE9F71 /* parse_table_builder.cpp in Sources */, + 129D242C183EB1EB00FE9F71 /* table_builder.cpp in Sources */, 125120A4183083BD00C9B56A /* arithmetic.cpp in Sources */, 1213061B182C84DF00FCF928 /* item.cpp in Sources */, 12FD40D9185FEEDF0041A84E /* pattern_spec.cpp in Sources */, @@ -436,6 +442,7 @@ 12130611182C3A1100FCF928 /* blank.cpp in Sources */, 1213060E182C398300FCF928 /* choice.cpp in Sources */, 12F9A64E182DD5FD00FAF50C /* spec_helper.cpp in Sources */, + 12FD40F7186A16020041A84E /* lex_table.cpp in Sources */, 12FD40E918641FB70041A84E /* rules.cpp in Sources */, 12FD4061185E68470041A84E /* c_code.cpp in Sources */, 12FD40D8185FEEDF0041A84E /* rules_spec.cpp in Sources */, @@ -443,7 +450,7 @@ 12FD4064185E75290041A84E /* generate_parsers.cpp in Sources */, 1214930E181E200B008E9BDA /* main.cpp in Sources */, 12F9A651182DD6BC00FAF50C /* grammar.cpp in Sources */, - 12FD40D6185FEEDB0041A84E /* parse_table_builder_spec.cpp in Sources */, + 12FD40D6185FEEDB0041A84E /* table_builder_spec.cpp in Sources */, 12D136A4183678A2005F3369 /* repeat.cpp in Sources */, 12FD40F3186641C00041A84E /* char_match.cpp in Sources */, 12FD40E718639B910041A84E /* visitor.cpp in Sources */, @@ -460,7 +467,7 @@ buildActionMask = 2147483647; files = ( 12FD40B3185EEB5E0041A84E /* seq.cpp in Sources */, - 12FD40B4185EEB5E0041A84E /* parse_table_builder.cpp in Sources */, + 12FD40B4185EEB5E0041A84E /* table_builder.cpp in Sources */, 12FD40B6185EEB5E0041A84E /* arithmetic.cpp in Sources */, 12FD40DD185FF12C0041A84E /* parser.c in Sources */, 12FD40B8185EEB5E0041A84E /* item.cpp in Sources */, @@ -471,6 +478,7 @@ 12FD40BD185EEB5E0041A84E /* choice.cpp in Sources */, 12FD40DF1860064C0041A84E /* tree.c in Sources */, 12FD40BF185EEB5E0041A84E /* c_code.cpp in Sources */, + 12FD40F8186A16030041A84E /* lex_table.cpp in Sources */, 12FD40D2185EEB970041A84E /* arithmetic.c in Sources */, 12FD40DB185FEF0D0041A84E /* arithmetic_spec.cpp in Sources */, 12FD40C0185EEB5E0041A84E /* parse_table.cpp in Sources */, diff --git a/include/runtime.h b/include/runtime.h index 4d160564..eccf8b17 100644 --- a/include/runtime.h +++ b/include/runtime.h @@ -33,37 +33,60 @@ TSParser TSParserMake(const char *input); void TSParserShift(TSParser *parser, TSState state); void TSParserReduce(TSParser *parser, TSSymbol symbol, int child_count); void TSParserError(TSParser *parser); -TSState TSParserState(const TSParser *parser); +TSState TSParserParseState(const TSParser *parser); +TSState TSParserLexState(const TSParser *parser); TSSymbol TSParserLookahead(const TSParser *parser); #pragma mark - DSL #define START_PARSER() \ TSParser parser = TSParserMake(input); \ + start: \ + ts_lex(&parser); + +#define START_LEXER() \ start: -#define LOOKAHEAD() \ +#define LOOKAHEAD_SYM() \ TSParserLookahead(&parser) +#define LOOKAHEAD_CHAR() \ + 'a' + #define PARSE_STATE() \ - TSParserState(&parser) + TSParserParseState(&parser) + +#define LEX_STATE() \ + TSParserLexState(parser) #define SHIFT(number) \ { TSParserShift(&parser, number); break; } -#define ACCEPT() \ +#define ADVANCE(state_index) \ + { break; } + +#define ACCEPT_INPUT() \ + { goto done; } + +#define ACCEPT_TOKEN(symbol) \ { goto done; } #define REDUCE(symbol, child_count) \ { TSParserReduce(&parser, symbol, child_count); break; } -#define ERROR() \ +#define PARSE_ERROR() \ TSParserError(&parser) +#define LEX_ERROR() \ + TSParserError(parser) + #define FINISH_PARSER() \ done: \ return parser.tree; +#define FINISH_LEXER() \ + done: + #endif #ifdef __cplusplus diff --git a/spec/compiler/generate_parsers.cpp b/spec/compiler/generate_parsers.cpp index 2c5fb8ff..60057104 100644 --- a/spec/compiler/generate_parsers.cpp +++ b/spec/compiler/generate_parsers.cpp @@ -1,5 +1,5 @@ #include "spec_helper.h" -#include "parse_table_builder.h" +#include "table_builder.h" #include "parse_table.h" #include "c_code.h" #include @@ -13,8 +13,8 @@ Describe(code_generation) { It(works_for_the_arithmetic_grammar) { Grammar grammar = test_grammars::arithmetic(); - ParseTable table = build_tables(grammar); + pair tables = build_tables(grammar); std::ofstream parser_file(test_parser_dir + "/arithmetic.c"); - parser_file << code_gen::c_code(grammar, table); + parser_file << code_gen::c_code(grammar, tables.first, tables.second); } }; \ No newline at end of file diff --git a/spec/compiler/lr/parse_table_builder_spec.cpp b/spec/compiler/lr/parse_table_builder_spec.cpp deleted file mode 100644 index f1596437..00000000 --- a/spec/compiler/lr/parse_table_builder_spec.cpp +++ /dev/null @@ -1,43 +0,0 @@ -#include "spec_helper.h" -#include "parse_table_builder.h" -#include "parse_table.h" - -using namespace tree_sitter::lr; - -typedef std::unordered_set actions; - -Describe(ParseTableBuilder_test) { - Grammar grammar = test_grammars::arithmetic(); - ParseTable table = build_tables(grammar); - - It(has_the_right_starting_state) { - AssertThat(table.states[0].actions, Equals(unordered_map({ - { "expression", actions({ ParseAction::Shift(1) }) }, - { "term", actions({ ParseAction::Shift(2) }) }, - { "factor", actions({ ParseAction::Shift(5) }) }, - { "variable", actions({ ParseAction::Shift(8) }) }, - { "number", actions({ ParseAction::Shift(8) }) }, - { "left_paren", actions({ ParseAction::Shift(9) }) } - }))); - } - - It(accepts_when_the_start_symbol_is_reduced) { - AssertThat(table.states[1].actions, Equals(unordered_map({ - { ParseTable::END_OF_INPUT, actions({ ParseAction::Accept() }) } - }))); - } - - It(has_the_right_next_states) { - AssertThat(table.states[2].actions, Equals(unordered_map({ - { "plus", actions({ ParseAction::Shift(3) }) }, - }))); - - AssertThat(table.states[3].actions, Equals(unordered_map({ - { "variable", actions({ ParseAction::Shift(8) }) }, - { "factor", actions({ ParseAction::Shift(5) }) }, - { "left_paren", actions({ ParseAction::Shift(9) }) }, - { "number", actions({ ParseAction::Shift(8) }) }, - { "term", actions({ ParseAction::Shift(4) }) }, - }))); - } -}; diff --git a/spec/compiler/lr/table_builder_spec.cpp b/spec/compiler/lr/table_builder_spec.cpp new file mode 100644 index 00000000..357201b3 --- /dev/null +++ b/spec/compiler/lr/table_builder_spec.cpp @@ -0,0 +1,61 @@ +#include "spec_helper.h" +#include "parse_table.h" +#include "lex_table.h" +#include "table_builder.h" + +using namespace tree_sitter::lr; + +typedef std::unordered_set parse_actions; +typedef std::unordered_set lex_actions; + +Describe(TableBuilderSpec) { + Grammar grammar = test_grammars::arithmetic(); + ParseTable table = build_tables(grammar).first; + LexTable lex_table = build_tables(grammar).second; + + ParseState parse_state(size_t index) { + return table.states[index]; + } + + LexState lex_state(size_t parse_state_index) { + size_t index = table.states[parse_state_index].lex_state_index; + return lex_table.states[index]; + } + + It(has_the_right_starting_state) { + AssertThat(parse_state(0).actions, Equals(unordered_map({ + { "expression", parse_actions({ ParseAction::Shift(1) }) }, + { "term", parse_actions({ ParseAction::Shift(2) }) }, + { "factor", parse_actions({ ParseAction::Shift(5) }) }, + { "variable", parse_actions({ ParseAction::Shift(8) }) }, + { "number", parse_actions({ ParseAction::Shift(8) }) }, + { "left_paren", parse_actions({ ParseAction::Shift(9) }) } + }))); + + AssertThat(lex_state(0).actions, Equals(unordered_map({ + { CharMatchClass(CharClassWord), lex_actions({ LexAction::Advance(1) }) }, + { CharMatchClass(CharClassDigit), lex_actions({ LexAction::Advance(4) }) }, + { CharMatchSpecific('('), lex_actions({ LexAction::Advance(11) }) } + }))); + } + + It(accepts_when_the_start_symbol_is_reduced) { + AssertThat(parse_state(1).actions, Equals(unordered_map({ + { ParseTable::END_OF_INPUT, parse_actions({ ParseAction::Accept() }) } + }))); + } + + It(has_the_right_next_states) { + AssertThat(parse_state(2).actions, Equals(unordered_map({ + { "plus", parse_actions({ ParseAction::Shift(3) }) }, + }))); + + AssertThat(parse_state(3).actions, Equals(unordered_map({ + { "variable", parse_actions({ ParseAction::Shift(8) }) }, + { "factor", parse_actions({ ParseAction::Shift(5) }) }, + { "left_paren", parse_actions({ ParseAction::Shift(9) }) }, + { "number", parse_actions({ ParseAction::Shift(8) }) }, + { "term", parse_actions({ ParseAction::Shift(4) }) }, + }))); + } +}; diff --git a/spec/compiler/spec_helper.cpp b/spec/compiler/spec_helper.cpp index d9d069aa..17d98c70 100644 --- a/spec/compiler/spec_helper.cpp +++ b/spec/compiler/spec_helper.cpp @@ -3,14 +3,16 @@ namespace tree_sitter { namespace lr { - std::ostream& operator<<(std::ostream &stream, const unordered_map> &map) { + template + std::ostream & stream_map_of_sets(std::ostream &stream, const unordered_map> &map) { stream << string("{"); bool started = false; for (auto pair : map) { if (started) stream << string(", "); - stream << string("{") << pair.first << string(", ["); + stream << pair.first; + stream << string(" => ["); bool started_set = false; - for (ParseAction action : pair.second) { + for (TValue action : pair.second) { if (started_set) stream << ", "; stream << action; started_set = true; @@ -18,9 +20,16 @@ namespace tree_sitter { stream << string("]}"); started = true; } - stream << string("}"); return stream; } + + std::ostream& operator<<(std::ostream &stream, const unordered_map> &map) { + return stream_map_of_sets(stream, map); + } + + std::ostream& operator<<(std::ostream &stream, const unordered_map> &map) { + return stream_map_of_sets(stream, map); + } } } diff --git a/spec/compiler/spec_helper.h b/spec/compiler/spec_helper.h index 0a054d1b..ddb09764 100644 --- a/spec/compiler/spec_helper.h +++ b/spec/compiler/spec_helper.h @@ -9,7 +9,7 @@ #include "item_set.h" #include "grammar.h" #include "parse_table.h" -#include "parse_table_builder.h" +#include "table_builder.h" #include "../fixtures/grammars/arithmetic.h" @@ -21,6 +21,7 @@ using namespace igloo; namespace tree_sitter { namespace lr { std::ostream& operator<<(std::ostream &stream, const unordered_map> &map); + std::ostream& operator<<(std::ostream &stream, const unordered_map> &map); } } diff --git a/spec/fixtures/parsers/arithmetic.c b/spec/fixtures/parsers/arithmetic.c index 53603021..28cc68ad 100644 --- a/spec/fixtures/parsers/arithmetic.c +++ b/spec/fixtures/parsers/arithmetic.c @@ -1,208 +1,251 @@ #include "runtime.h" #include - +#include typedef enum { - ts_symbol_type_expression, - ts_symbol_type_term, - ts_symbol_type_right_paren, - ts_symbol_type_number, - ts_symbol_type_factor, - ts_symbol_type_variable, - ts_symbol_type_plus, - ts_symbol_type_times, - ts_symbol_type_left_paren, - ts_symbol_type___END__ -} ts_symbol_type; + ts_symbol_expression, + ts_symbol_term, + ts_symbol_right_paren, + ts_symbol_number, + ts_symbol_factor, + ts_symbol_variable, + ts_symbol_plus, + ts_symbol_times, + ts_symbol_left_paren, + ts_symbol___END__ +} ts_symbol; +static void ts_lex(TSParser *parser) { + START_LEXER(); + switch (LEX_STATE()) { + case 0: + if (LOOKAHEAD_CHAR() == '(') + ADVANCE(11); + if (isdigit(LOOKAHEAD_CHAR())) + ADVANCE(4); + if (isalnum(LOOKAHEAD_CHAR())) + ADVANCE(1); + LEX_ERROR(); + case 1: + if (isalnum(LOOKAHEAD_CHAR())) + ADVANCE(2); + LEX_ERROR(); + case 2: + if (isalnum(LOOKAHEAD_CHAR())) + ADVANCE(3); + LEX_ERROR(); + case 3: + if (isalnum(LOOKAHEAD_CHAR())) + ADVANCE(1); + LEX_ERROR(); + case 4: + if (isdigit(LOOKAHEAD_CHAR())) + ADVANCE(5); + LEX_ERROR(); + case 5: + if (isdigit(LOOKAHEAD_CHAR())) + ADVANCE(6); + LEX_ERROR(); + case 6: + if (isdigit(LOOKAHEAD_CHAR())) + ADVANCE(7); + LEX_ERROR(); + case 7: + if (isdigit(LOOKAHEAD_CHAR())) + ADVANCE(8); + LEX_ERROR(); + case 8: + if (isdigit(LOOKAHEAD_CHAR())) + ADVANCE(9); + LEX_ERROR(); + case 9: + if (isdigit(LOOKAHEAD_CHAR())) + ADVANCE(10); + LEX_ERROR(); + case 10: + if (isdigit(LOOKAHEAD_CHAR())) + ADVANCE(6); + LEX_ERROR(); + case 11: + ACCEPT_TOKEN(ts_symbol_left_paren); + case 12: + ACCEPT_TOKEN(ts_symbol___START__); + case 13: + if (LOOKAHEAD_CHAR() == '+') + ADVANCE(14); + LEX_ERROR(); + case 14: + ACCEPT_TOKEN(ts_symbol_plus); + case 15: + if (LOOKAHEAD_CHAR() == '(') + ADVANCE(11); + if (isdigit(LOOKAHEAD_CHAR())) + ADVANCE(4); + if (isalnum(LOOKAHEAD_CHAR())) + ADVANCE(16); + LEX_ERROR(); + case 16: + if (isalnum(LOOKAHEAD_CHAR())) + ADVANCE(2); + LEX_ERROR(); + case 17: + ACCEPT_TOKEN(ts_symbol_expression); + case 18: + if (LOOKAHEAD_CHAR() == '*') + ADVANCE(19); + LEX_ERROR(); + case 19: + ACCEPT_TOKEN(ts_symbol_times); + case 20: + if (LOOKAHEAD_CHAR() == '(') + ADVANCE(11); + if (isdigit(LOOKAHEAD_CHAR())) + ADVANCE(4); + if (isalnum(LOOKAHEAD_CHAR())) + ADVANCE(16); + LEX_ERROR(); + case 21: + ACCEPT_TOKEN(ts_symbol_term); + case 22: + ACCEPT_TOKEN(ts_symbol_factor); + case 23: + if (LOOKAHEAD_CHAR() == '(') + ADVANCE(11); + if (isdigit(LOOKAHEAD_CHAR())) + ADVANCE(4); + if (isalnum(LOOKAHEAD_CHAR())) + ADVANCE(16); + LEX_ERROR(); + case 24: + if (LOOKAHEAD_CHAR() == ')') + ADVANCE(25); + LEX_ERROR(); + case 25: + ACCEPT_TOKEN(ts_symbol_right_paren); + case 26: + ACCEPT_TOKEN(ts_symbol_factor); + default: + LEX_ERROR(); + } + FINISH_LEXER(); +} + TSTree ts_parse_arithmetic(const char *input) { START_PARSER(); switch (PARSE_STATE()) { case 0: - switch (LOOKAHEAD()) { - case ts_symbol_type_left_paren: + switch (LOOKAHEAD_SYM()) { + case ts_symbol_left_paren: SHIFT(9); - case ts_symbol_type_variable: + case ts_symbol_variable: SHIFT(8); - case ts_symbol_type_factor: + case ts_symbol_factor: SHIFT(5); - case ts_symbol_type_number: + case ts_symbol_number: SHIFT(8); - case ts_symbol_type_term: + case ts_symbol_term: SHIFT(2); - case ts_symbol_type_expression: + case ts_symbol_expression: SHIFT(1); default: - ERROR(); + PARSE_ERROR(); } case 1: - switch (LOOKAHEAD()) { - case ts_symbol_type___END__: - ACCEPT(); + switch (LOOKAHEAD_SYM()) { + case ts_symbol___END__: + ACCEPT_INPUT(); default: - ERROR(); + PARSE_ERROR(); } case 2: - switch (LOOKAHEAD()) { - case ts_symbol_type_plus: + switch (LOOKAHEAD_SYM()) { + case ts_symbol_plus: SHIFT(3); default: - ERROR(); + PARSE_ERROR(); } case 3: - switch (LOOKAHEAD()) { - case ts_symbol_type_variable: + switch (LOOKAHEAD_SYM()) { + case ts_symbol_variable: SHIFT(8); - case ts_symbol_type_factor: + case ts_symbol_factor: SHIFT(5); - case ts_symbol_type_left_paren: + case ts_symbol_left_paren: SHIFT(9); - case ts_symbol_type_number: + case ts_symbol_number: SHIFT(8); - case ts_symbol_type_term: + case ts_symbol_term: SHIFT(4); default: - ERROR(); + PARSE_ERROR(); } case 4: - switch (LOOKAHEAD()) { - case ts_symbol_type_expression: - REDUCE(ts_symbol_type_expression, 3); - case ts_symbol_type_term: - REDUCE(ts_symbol_type_expression, 3); - case ts_symbol_type_right_paren: - REDUCE(ts_symbol_type_expression, 3); - case ts_symbol_type_number: - REDUCE(ts_symbol_type_expression, 3); - case ts_symbol_type_factor: - REDUCE(ts_symbol_type_expression, 3); - case ts_symbol_type_variable: - REDUCE(ts_symbol_type_expression, 3); - case ts_symbol_type_times: - REDUCE(ts_symbol_type_expression, 3); - case ts_symbol_type_plus: - REDUCE(ts_symbol_type_expression, 3); - case ts_symbol_type_left_paren: - REDUCE(ts_symbol_type_expression, 3); + switch (LOOKAHEAD_SYM()) { default: - ERROR(); + REDUCE(ts_symbol_expression, 3); } case 5: - switch (LOOKAHEAD()) { - case ts_symbol_type_times: + switch (LOOKAHEAD_SYM()) { + case ts_symbol_times: SHIFT(6); default: - ERROR(); + PARSE_ERROR(); } case 6: - switch (LOOKAHEAD()) { - case ts_symbol_type_left_paren: + switch (LOOKAHEAD_SYM()) { + case ts_symbol_left_paren: SHIFT(9); - case ts_symbol_type_number: + case ts_symbol_number: SHIFT(8); - case ts_symbol_type_variable: + case ts_symbol_variable: SHIFT(8); - case ts_symbol_type_factor: + case ts_symbol_factor: SHIFT(7); default: - ERROR(); + PARSE_ERROR(); } case 7: - switch (LOOKAHEAD()) { - case ts_symbol_type_expression: - REDUCE(ts_symbol_type_term, 3); - case ts_symbol_type_term: - REDUCE(ts_symbol_type_term, 3); - case ts_symbol_type_right_paren: - REDUCE(ts_symbol_type_term, 3); - case ts_symbol_type_number: - REDUCE(ts_symbol_type_term, 3); - case ts_symbol_type_factor: - REDUCE(ts_symbol_type_term, 3); - case ts_symbol_type_variable: - REDUCE(ts_symbol_type_term, 3); - case ts_symbol_type_times: - REDUCE(ts_symbol_type_term, 3); - case ts_symbol_type_plus: - REDUCE(ts_symbol_type_term, 3); - case ts_symbol_type_left_paren: - REDUCE(ts_symbol_type_term, 3); + switch (LOOKAHEAD_SYM()) { default: - ERROR(); + REDUCE(ts_symbol_term, 3); } case 8: - switch (LOOKAHEAD()) { - case ts_symbol_type_expression: - REDUCE(ts_symbol_type_factor, 1); - case ts_symbol_type_term: - REDUCE(ts_symbol_type_factor, 1); - case ts_symbol_type_right_paren: - REDUCE(ts_symbol_type_factor, 1); - case ts_symbol_type_number: - REDUCE(ts_symbol_type_factor, 1); - case ts_symbol_type_factor: - REDUCE(ts_symbol_type_factor, 1); - case ts_symbol_type_variable: - REDUCE(ts_symbol_type_factor, 1); - case ts_symbol_type_times: - REDUCE(ts_symbol_type_factor, 1); - case ts_symbol_type_plus: - REDUCE(ts_symbol_type_factor, 1); - case ts_symbol_type_left_paren: - REDUCE(ts_symbol_type_factor, 1); + switch (LOOKAHEAD_SYM()) { default: - ERROR(); + REDUCE(ts_symbol_factor, 1); } case 9: - switch (LOOKAHEAD()) { - case ts_symbol_type_left_paren: + switch (LOOKAHEAD_SYM()) { + case ts_symbol_left_paren: SHIFT(9); - case ts_symbol_type_variable: + case ts_symbol_variable: SHIFT(8); - case ts_symbol_type_factor: + case ts_symbol_factor: SHIFT(5); - case ts_symbol_type_number: + case ts_symbol_number: SHIFT(8); - case ts_symbol_type_term: + case ts_symbol_term: SHIFT(2); - case ts_symbol_type_expression: + case ts_symbol_expression: SHIFT(10); default: - ERROR(); + PARSE_ERROR(); } case 10: - switch (LOOKAHEAD()) { - case ts_symbol_type_right_paren: + switch (LOOKAHEAD_SYM()) { + case ts_symbol_right_paren: SHIFT(11); default: - ERROR(); + PARSE_ERROR(); } case 11: - switch (LOOKAHEAD()) { - case ts_symbol_type_expression: - REDUCE(ts_symbol_type_factor, 3); - case ts_symbol_type_term: - REDUCE(ts_symbol_type_factor, 3); - case ts_symbol_type_right_paren: - REDUCE(ts_symbol_type_factor, 3); - case ts_symbol_type_number: - REDUCE(ts_symbol_type_factor, 3); - case ts_symbol_type_factor: - REDUCE(ts_symbol_type_factor, 3); - case ts_symbol_type_variable: - REDUCE(ts_symbol_type_factor, 3); - case ts_symbol_type_times: - REDUCE(ts_symbol_type_factor, 3); - case ts_symbol_type_plus: - REDUCE(ts_symbol_type_factor, 3); - case ts_symbol_type_left_paren: - REDUCE(ts_symbol_type_factor, 3); + switch (LOOKAHEAD_SYM()) { default: - ERROR(); + REDUCE(ts_symbol_factor, 3); } default: - ERROR(); + PARSE_ERROR(); } FINISH_PARSER(); } diff --git a/src/compiler/char_match.cpp b/src/compiler/char_match.cpp index fac4af4a..02b8e2b6 100644 --- a/src/compiler/char_match.cpp +++ b/src/compiler/char_match.cpp @@ -10,13 +10,13 @@ namespace tree_sitter { } CharMatch CharMatchClass(CharClass value) { - CharMatch result = { .type = CharMatchTypeSpecific }; + CharMatch result = { .type = CharMatchTypeClass }; result.value.character = value; return result; } CharMatch CharMatchRange(char min, char max) { - CharMatch result = { .type = CharMatchTypeSpecific }; + CharMatch result = { .type = CharMatchTypeRange }; result.value.range.min_character = min; result.value.range.max_character = max; return result; diff --git a/src/compiler/char_match.h b/src/compiler/char_match.h index db03b125..4780d4c6 100644 --- a/src/compiler/char_match.h +++ b/src/compiler/char_match.h @@ -1,6 +1,7 @@ #ifndef __TreeSitter__char_match__ #define __TreeSitter__char_match__ +#include #include namespace tree_sitter { @@ -36,4 +37,16 @@ namespace tree_sitter { std::ostream& operator<<(std::ostream& stream, const CharMatch &rule); } +namespace std { + template<> + struct hash { + size_t operator()(const tree_sitter::CharMatch &match) const { + return ( + hash()(match.type) ^ + hash()(match.value.range.min_character) ^ + hash()(match.value.range.max_character)); + } + }; +} + #endif diff --git a/src/compiler/code_gen/c_code.cpp b/src/compiler/code_gen/c_code.cpp index 50aec939..6664eacc 100644 --- a/src/compiler/code_gen/c_code.cpp +++ b/src/compiler/code_gen/c_code.cpp @@ -4,6 +4,7 @@ #include using std::string; +using std::to_string; using std::unordered_map; using std::unordered_set; using std::vector; @@ -22,7 +23,7 @@ namespace tree_sitter { } } - string indent(std::string input) { + string indent(string input) { string tab = " "; str_replace(input, "\n", "\n" + tab); return tab + input; @@ -44,48 +45,108 @@ namespace tree_sitter { indent(body); } + string _if(string condition, string body) { + return string("if (") + condition + ")\n" + + indent(body) + "\n"; + } + class CCodeGenerator { const Grammar grammar; const ParseTable parse_table; + const LexTable lex_table; public: - CCodeGenerator(const Grammar &grammar, const ParseTable &parse_table) : + CCodeGenerator(const Grammar &grammar, const ParseTable &parse_table, const LexTable &lex_table) : grammar(grammar), - parse_table(parse_table) + parse_table(parse_table), + lex_table(lex_table) {} string symbol_id(string symbol_name) { - return "ts_symbol_type_" + symbol_name; + return "ts_symbol_" + symbol_name; } - string code_for_actions(const unordered_set &actions) { - auto action = *actions.begin(); - switch (action.type) { - case ParseActionTypeAccept: - return "ACCEPT();"; - case ParseActionTypeShift: - return "SHIFT(" + std::to_string(action.state_index) + ");"; - case ParseActionTypeReduce: - return "REDUCE(" + symbol_id(action.symbol_name) + ", " + std::to_string(action.child_symbol_count) + ");"; + string condition_for_char_match(const CharMatch &char_match) { + auto value = "LOOKAHEAD_CHAR()"; + switch (char_match.type) { + case CharMatchTypeClass: + switch (char_match.value.character_class) { + case CharClassDigit: + return string("isdigit(") + value + ")"; + case CharClassWord: + return string("isalnum(") + value + ")"; + } + case CharMatchTypeSpecific: + return string(value) + " == '" + char_match.value.character + "'"; default: return ""; } } - string switch_on_lookahead(const ParseState &parse_state) { + string code_for_parse_actions(const unordered_set &actions) { + auto action = actions.begin(); + if (action == actions.end()) { + return "PARSE_ERROR();"; + } else { + switch (action->type) { + case ParseActionTypeAccept: + return "ACCEPT_INPUT();"; + case ParseActionTypeShift: + return "SHIFT(" + to_string(action->state_index) + ");"; + case ParseActionTypeReduce: + return "REDUCE(" + symbol_id(action->symbol_name) + ", " + std::to_string(action->child_symbol_count) + ");"; + default: + return ""; + } + } + } + + string code_for_lex_actions(const unordered_set &actions) { + auto action = actions.begin(); + if (action == actions.end()) { + return "LEX_ERROR();"; + } else { + switch (action->type) { + case LexActionTypeAdvance: + return "ADVANCE(" + to_string(action->state_index) + ");"; + case LexActionTypeAccept: + return "ACCEPT_TOKEN(" + symbol_id(action->symbol_name) + ");"; + case LexActionTypeError: + return ""; + } + } + } + + string switch_on_lookahead_sym(const ParseState &parse_state) { string body = ""; for (auto pair : parse_state.actions) - body += _case(symbol_id(pair.first), code_for_actions(pair.second)); - body += _default("ERROR();"); - return _switch("LOOKAHEAD()", body); + body += _case(symbol_id(pair.first), code_for_parse_actions(pair.second)); + body += _default(code_for_parse_actions(parse_state.default_actions)); + return _switch("LOOKAHEAD_SYM()", body); } - - string switch_on_current_state(const ParseTable &parse_table) { + + string switch_on_lookahead_char(const LexState &parse_state) { + string result = ""; + for (auto pair : parse_state.actions) + result += _if(condition_for_char_match(pair.first), code_for_lex_actions(pair.second)); + result += code_for_lex_actions(parse_state.default_actions); + return result; + } + + string switch_on_parse_state() { string body = ""; for (int i = 0; i < parse_table.states.size(); i++) - body += _case(std::to_string(i), switch_on_lookahead(parse_table.states[i])); - body += _default("ERROR();"); + body += _case(std::to_string(i), switch_on_lookahead_sym(parse_table.states[i])); + body += _default("PARSE_ERROR();"); return _switch("PARSE_STATE()", body); } + + string switch_on_lex_state() { + string body = ""; + for (int i = 0; i < lex_table.states.size(); i++) + body += _case(std::to_string(i), switch_on_lookahead_char(lex_table.states[i])); + body += _default("LEX_ERROR();"); + return _switch("LEX_STATE()", body); + } string symbol_enum() { string result = "typedef enum {\n"; @@ -93,32 +154,45 @@ namespace tree_sitter { result += indent(symbol_id(rule_name)) + ",\n"; result += indent(symbol_id(ParseTable::END_OF_INPUT)); return result + "\n" - "} ts_symbol_type;\n"; + "} ts_symbol;\n"; + } + + string includes() { + return string( + "#include \"runtime.h\"\n" + "#include \n" + "#include "); } string parse_function() { return "TSTree ts_parse_arithmetic(const char *input) {\n" + indent("START_PARSER();") + "\n" + - indent(switch_on_current_state(parse_table)) + "\n" + + indent(switch_on_parse_state()) + "\n" + indent("FINISH_PARSER();") + "\n" "}"; } + + string lex_function() { + return + "static void ts_lex(TSParser *parser) {\n" + + indent("START_LEXER();") + "\n" + + indent(switch_on_lex_state()) + "\n" + + indent("FINISH_LEXER();") + "\n" + "}"; + } string code() { - return - "#include \"runtime.h\"\n" - "#include \n" - "\n\n" + - symbol_enum() + - "\n\n" + - parse_function() + - "\n"; + return + includes() + "\n\n" + + symbol_enum() + "\n\n" + + lex_function() + "\n\n" + + parse_function() + "\n"; } }; - string c_code(const Grammar &grammar, const ParseTable &parse_table) { - return CCodeGenerator(grammar, parse_table).code(); + string c_code(const Grammar &grammar, const ParseTable &parse_table, const LexTable &lex_table) { + return CCodeGenerator(grammar, parse_table, lex_table).code(); } } } \ No newline at end of file diff --git a/src/compiler/code_gen/c_code.h b/src/compiler/code_gen/c_code.h index c190f291..e4a44d51 100644 --- a/src/compiler/code_gen/c_code.h +++ b/src/compiler/code_gen/c_code.h @@ -3,10 +3,11 @@ #include "grammar.h" #include "parse_table.h" +#include "lex_table.h" namespace tree_sitter { namespace code_gen { - std::string c_code(const Grammar &grammar, const lr::ParseTable &parse_table); + std::string c_code(const Grammar &grammar, const lr::ParseTable &parse_table, const lr::LexTable &lex_table); } } diff --git a/src/compiler/lr/lex_table.cpp b/src/compiler/lr/lex_table.cpp new file mode 100644 index 00000000..f980de32 --- /dev/null +++ b/src/compiler/lr/lex_table.cpp @@ -0,0 +1,67 @@ +#include "lex_table.h" + +using std::string; +using std::to_string; +using std::unordered_map; +using std::unordered_set; +using std::vector; + +namespace tree_sitter { + namespace lr { + + // Action + LexAction::LexAction(LexActionType type, size_t state_index, std::string symbol_name) : + type(type), + state_index(state_index), + symbol_name(symbol_name) {} + + LexAction LexAction::Error() { + return LexAction(LexActionTypeError, -1, ""); + } + + LexAction LexAction::Advance(size_t state_index) { + return LexAction(LexActionTypeAdvance, state_index, ""); + } + + LexAction LexAction::Accept(std::string symbol_name) { + return LexAction(LexActionTypeAccept, -1, symbol_name); + } + + bool LexAction::operator==(const LexAction &other) const { + return + (type == other.type) && + (state_index == other.state_index) && + (symbol_name == other.symbol_name); + } + + std::ostream& operator<<(std::ostream &stream, const LexAction &action) { + switch (action.type) { + case LexActionTypeError: + return stream << string("(error)"); + case LexActionTypeAccept: + return stream << string("(accept ") + action.symbol_name + ")"; + case LexActionTypeAdvance: + return stream << string("(advance ") + to_string(action.state_index) + ")"; + } + } + + // State + LexState::LexState() : actions(unordered_map>()) {} + + // Table + LexTable::LexTable(vector rule_names) : symbol_names(rule_names) {} + + size_t LexTable::add_state() { + states.push_back(LexState()); + return states.size() - 1; + } + + void LexTable::add_action(size_t state_index, CharMatch match, LexAction action) { + states[state_index].actions[match].insert(action); + } + + void LexTable::add_default_action(size_t state_index, LexAction action) { + states[state_index].default_actions.insert(action); + } + } +} \ No newline at end of file diff --git a/src/compiler/lr/lex_table.h b/src/compiler/lr/lex_table.h new file mode 100644 index 00000000..d5444d45 --- /dev/null +++ b/src/compiler/lr/lex_table.h @@ -0,0 +1,72 @@ +#ifndef __TreeSitter__lex_table__ +#define __TreeSitter__lex_table__ + +#include +#include +#include +#include +#include "char_match.h" + +namespace tree_sitter { + namespace lr { + typedef enum { + LexActionTypeAccept, + LexActionTypeError, + LexActionTypeAdvance + } LexActionType; + + class LexAction { + LexAction(LexActionType type, size_t state_index, std::string symbol_name); + public: + static LexAction Accept(std::string symbol_name); + static LexAction Error(); + static LexAction Advance(size_t state_index); + bool operator==(const LexAction &action) const; + + LexActionType type; + std::string symbol_name; + size_t state_index; + }; + } +} + +namespace std { + template<> + struct hash { + size_t operator()(const tree_sitter::lr::LexAction &action) const { + return ( + hash()(action.type) ^ + hash()(action.symbol_name) ^ + hash()(action.state_index)); + } + }; +} + +namespace tree_sitter { + namespace lr { + std::ostream& operator<<(std::ostream &stream, const LexAction &item); + + class LexState { + public: + LexState(); + std::unordered_map> actions; + std::unordered_set default_actions; + }; + + class LexTable { + public: + LexTable(std::vector rule_names); + + size_t add_state(); + void add_action(size_t state_index, CharMatch match, LexAction action); + void add_default_action(size_t state_index, LexAction action); + + static const std::string START; + static const std::string END_OF_INPUT; + std::vector states; + const std::vector symbol_names; + }; + } +} + +#endif diff --git a/src/compiler/lr/parse_table.cpp b/src/compiler/lr/parse_table.cpp index 558126e2..6a4f19c6 100644 --- a/src/compiler/lr/parse_table.cpp +++ b/src/compiler/lr/parse_table.cpp @@ -48,7 +48,11 @@ namespace tree_sitter { } // State - ParseState::ParseState() : actions(unordered_map>()) {} + ParseState::ParseState() : + actions(unordered_map>()), + default_actions(unordered_set()), + lex_state_index(-1) + {} // Table ParseTable::ParseTable(vector symbol_names) : @@ -63,8 +67,12 @@ namespace tree_sitter { void ParseTable::add_action(size_t state_index, string sym_name, ParseAction action) { states[state_index].actions[sym_name].insert(action); } + + void ParseTable::add_default_action(size_t state_index, ParseAction action) { + states[state_index].default_actions.insert(action); + } const string ParseTable::START = "__START__"; const string ParseTable::END_OF_INPUT = "__END__"; } -} \ No newline at end of file +} diff --git a/src/compiler/lr/parse_table.h b/src/compiler/lr/parse_table.h index 90bcbeba..805800e8 100644 --- a/src/compiler/lr/parse_table.h +++ b/src/compiler/lr/parse_table.h @@ -29,13 +29,32 @@ namespace tree_sitter { std::string symbol_name; size_t state_index; }; + } +} +namespace std { + template<> + struct hash { + size_t operator()(const tree_sitter::lr::ParseAction &action) const { + return ( + hash()(action.type) ^ + hash()(action.symbol_name) ^ + hash()(action.state_index) ^ + hash()(action.child_symbol_count)); + } + }; +} + +namespace tree_sitter { + namespace lr { std::ostream& operator<<(std::ostream &stream, const ParseAction &item); class ParseState { public: ParseState(); std::unordered_map> actions; + std::unordered_set default_actions; + size_t lex_state_index; }; class ParseTable { @@ -44,6 +63,7 @@ namespace tree_sitter { size_t add_state(); void add_action(size_t state_index, std::string symbol_name, ParseAction action); + void add_default_action(size_t state_index, ParseAction action); static const std::string START; static const std::string END_OF_INPUT; @@ -53,17 +73,4 @@ namespace tree_sitter { } } -namespace std { - template<> - struct hash { - size_t operator()(const tree_sitter::lr::ParseAction &action) const { - return ( - hash()(action.type) ^ - hash()(action.symbol_name) ^ - hash()(action.state_index) ^ - hash()(action.child_symbol_count)); - } - }; -} - #endif diff --git a/src/compiler/lr/parse_table_builder.cpp b/src/compiler/lr/parse_table_builder.cpp deleted file mode 100644 index e0bb055b..00000000 --- a/src/compiler/lr/parse_table_builder.cpp +++ /dev/null @@ -1,78 +0,0 @@ -#include "parse_table_builder.h" -#include -#include "item_set.h" -#include "rules.h" -#include "item_set.h" -#include "grammar.h" - -using namespace std; - -namespace tree_sitter { - namespace lr { - static int NOT_FOUND = -1; - - class ParseTableBuilder { - const Grammar grammar; - std::unordered_map state_indices; - ParseTable table; - - long state_index_for_item_set(const ItemSet &item_set) const { - auto entry = state_indices.find(item_set); - return (entry == state_indices.end()) ? NOT_FOUND : entry->second; - } - - void add_shift_actions(const ItemSet &item_set, size_t state_index) { - for (auto transition : item_set.sym_transitions(grammar)) { - rules::Symbol symbol = *transition.first; - ItemSet item_set = *transition.second; - size_t new_state_index = add_item_set(item_set); - table.add_action(state_index, symbol.name, ParseAction::Shift(new_state_index)); - } - } - - void add_reduce_actions(const ItemSet &item_set, size_t state_index) { - for (Item item : item_set) { - if (item.is_done()) { - if (item.rule_name == ParseTable::START) { - table.add_action(state_index, ParseTable::END_OF_INPUT, ParseAction::Accept()); - } else { - for (string rule_name : table.symbol_names) - table.add_action(state_index, rule_name, ParseAction::Reduce(item.rule_name, item.consumed_sym_count)); - } - } - } - } - - size_t add_item_set(const ItemSet &item_set) { - auto state_index = state_index_for_item_set(item_set); - if (state_index == NOT_FOUND) { - state_index = table.add_state(); - state_indices[item_set] = state_index; - - add_shift_actions(item_set, state_index); - add_reduce_actions(item_set, state_index); - } - return state_index; - } - - public: - - ParseTableBuilder(const Grammar &grammar) : - grammar(grammar), - table(ParseTable(grammar.rule_names())), - state_indices(unordered_map()) - {}; - - ParseTable build() { - auto item = Item(ParseTable::START, rules::sym(grammar.start_rule_name), 0); - auto item_set = ItemSet(item, grammar); - add_item_set(item_set); - return table; - } - }; - - ParseTable build_tables(const tree_sitter::Grammar &grammar) { - return ParseTableBuilder(grammar).build(); - } - } -} \ No newline at end of file diff --git a/src/compiler/lr/table_builder.cpp b/src/compiler/lr/table_builder.cpp new file mode 100644 index 00000000..41485535 --- /dev/null +++ b/src/compiler/lr/table_builder.cpp @@ -0,0 +1,115 @@ +#include "table_builder.h" +#include +#include "item_set.h" +#include "rules.h" +#include "item_set.h" +#include "grammar.h" + +using namespace std; + +namespace tree_sitter { + namespace lr { + static int NOT_FOUND = -1; + + class ParseTableBuilder { + const Grammar grammar; + std::unordered_map parse_state_indices; + std::unordered_map lex_state_indices; + ParseTable parse_table; + LexTable lex_table; + + long parse_state_index_for_item_set(const ItemSet &item_set) const { + auto entry = parse_state_indices.find(item_set); + return (entry == parse_state_indices.end()) ? NOT_FOUND : entry->second; + } + + long lex_state_index_for_item_set(const ItemSet &item_set) const { + auto entry = lex_state_indices.find(item_set); + return (entry == lex_state_indices.end()) ? NOT_FOUND : entry->second; + } + + void add_shift_actions(const ItemSet &item_set, size_t state_index) { + for (auto transition : item_set.sym_transitions(grammar)) { + rules::Symbol symbol = *transition.first; + ItemSet item_set = *transition.second; + size_t new_state_index = add_parse_state(item_set); + parse_table.add_action(state_index, symbol.name, ParseAction::Shift(new_state_index)); + } + } + + void add_advance_actions(const ItemSet &item_set, size_t state_index) { + for (auto transition : item_set.char_transitions(grammar)) { + rules::Character rule = *transition.first; + ItemSet item_set = *transition.second; + size_t new_state_index = add_lex_state(item_set); + lex_table.add_action(state_index, rule.value, LexAction::Advance(new_state_index)); + } + } + + void add_accept_token_actions(const ItemSet &item_set, size_t state_index) { + for (Item item : item_set) { + if (item.is_done()) { + lex_table.add_default_action(state_index, LexAction::Accept(item.rule_name)); + } + } + } + + void add_reduce_actions(const ItemSet &item_set, size_t state_index) { + for (Item item : item_set) { + if (item.is_done()) { + if (item.rule_name == ParseTable::START) { + parse_table.add_action(state_index, ParseTable::END_OF_INPUT, ParseAction::Accept()); + } else { + parse_table.add_default_action(state_index, ParseAction::Reduce(item.rule_name, item.consumed_sym_count)); + } + } + } + } + + size_t add_lex_state(const ItemSet &item_set) { + auto state_index = lex_state_index_for_item_set(item_set); + if (state_index == NOT_FOUND) { + state_index = lex_table.add_state(); + lex_state_indices[item_set] = state_index; + add_advance_actions(item_set, state_index); + add_accept_token_actions(item_set, state_index); + } + return state_index; + } + + size_t add_parse_state(const ItemSet &item_set) { + auto state_index = parse_state_index_for_item_set(item_set); + if (state_index == NOT_FOUND) { + state_index = parse_table.add_state(); + parse_state_indices[item_set] = state_index; + + parse_table.states[state_index].lex_state_index = add_lex_state(item_set); + add_shift_actions(item_set, state_index); + add_reduce_actions(item_set, state_index); + } + return state_index; + } + + public: + + ParseTableBuilder(const Grammar &grammar) : + grammar(grammar), + parse_table(ParseTable(grammar.rule_names())), + lex_table(LexTable(grammar.rule_names())), + parse_state_indices(unordered_map()), + lex_state_indices(unordered_map()) + {}; + + std::pair build() { + auto item = Item(ParseTable::START, rules::sym(grammar.start_rule_name), 0); + auto item_set = ItemSet(item, grammar); + add_parse_state(item_set); + return std::pair(parse_table, lex_table); + } + }; + + std::pair build_tables(const tree_sitter::Grammar &grammar) { + return ParseTableBuilder(grammar).build(); + } + } +} \ No newline at end of file diff --git a/src/compiler/lr/parse_table_builder.h b/src/compiler/lr/table_builder.h similarity index 66% rename from src/compiler/lr/parse_table_builder.h rename to src/compiler/lr/table_builder.h index 9392838e..cdb934fb 100644 --- a/src/compiler/lr/parse_table_builder.h +++ b/src/compiler/lr/table_builder.h @@ -2,12 +2,13 @@ #define __TreeSitter__parse_table_builder__ #include "parse_table.h" +#include "lex_table.h" namespace tree_sitter { class Grammar; namespace lr { - ParseTable build_tables(const Grammar &grammar); + std::pair build_tables(const Grammar &grammar); } } diff --git a/src/runtime/parser.c b/src/runtime/parser.c index eb161a94..42991bb2 100644 --- a/src/runtime/parser.c +++ b/src/runtime/parser.c @@ -36,7 +36,10 @@ TSSymbol TSParserLookahead(const TSParser *parser) { return 1; } -TSState TSParserState(const TSParser *parser) { +TSState TSParserParseState(const TSParser *parser) { return 5; } +TSState TSParserLexState(const TSParser *parser) { + return 5; +}