From c966af041235e42d207e4150cc1ba8cb2ec85c78 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Wed, 30 Nov 2016 09:34:47 -0800 Subject: [PATCH] Start work on external tokens --- include/tree_sitter/parser.h | 45 ++-- project.gyp | 1 + .../build_tables/distinctive_tokens_spec.cc | 2 +- .../build_tables/lex_conflict_manager_spec.cc | 8 +- spec/compiler/build_tables/lex_item_spec.cc | 92 ++++----- .../parse_item_set_builder_spec.cc | 62 +++--- .../prepare_grammar/extract_tokens_spec.cc | 8 +- .../prepare_grammar/flatten_grammar_spec.cc | 28 +-- spec/compiler/rules/repeat_spec.cc | 2 +- .../external_scanners/external_scan.c | 13 ++ spec/helpers/load_language.cc | 9 +- spec/helpers/load_language.h | 3 +- spec/helpers/rule_helpers.cc | 5 +- spec/helpers/stream_methods.cc | 11 +- spec/integration/compile_grammar_spec.cc | 65 ++++++ spec/integration/corpus_specs.cc | 6 +- src/compiler/build_tables/build_lex_table.cc | 66 +++--- .../build_tables/build_parse_table.cc | 80 ++++---- src/compiler/build_tables/lookahead_set.cc | 12 +- src/compiler/build_tables/lookahead_set.h | 8 +- src/compiler/build_tables/parse_item.cc | 34 +-- src/compiler/build_tables/parse_item.h | 26 --- .../build_tables/parse_item_set_builder.cc | 22 +- src/compiler/build_tables/recovery_tokens.cc | 6 +- src/compiler/build_tables/recovery_tokens.h | 2 +- src/compiler/generate_code/c_code.cc | 194 +++++++++++++----- src/compiler/grammar.h | 1 + src/compiler/parse_grammar.cc | 31 ++- src/compiler/parse_table.cc | 54 ++--- src/compiler/parse_table.h | 9 +- .../prepare_grammar/expand_repeats.cc | 3 +- .../prepare_grammar/extract_tokens.cc | 31 +-- .../prepare_grammar/flatten_grammar.cc | 1 + .../prepare_grammar/initial_syntax_grammar.h | 6 +- .../prepare_grammar/intern_symbols.cc | 31 ++- .../prepare_grammar/interned_grammar.h | 1 + src/compiler/rules.h | 1 + src/compiler/rules/built_in_symbols.cc | 6 +- src/compiler/rules/external_token.cc | 39 ++++ src/compiler/rules/external_token.h | 27 +++ src/compiler/rules/rules.cc | 5 + src/compiler/rules/symbol.cc | 34 ++- src/compiler/rules/symbol.h | 13 +- src/compiler/rules/visitor.h | 16 ++ src/compiler/syntax_grammar.cc | 11 - src/compiler/syntax_grammar.h | 3 +- src/runtime/parser.c | 7 +- 47 files changed, 723 insertions(+), 417 deletions(-) create mode 100644 spec/fixtures/external_scanners/external_scan.c create mode 100644 src/compiler/rules/external_token.cc create mode 100644 src/compiler/rules/external_token.h diff --git a/include/tree_sitter/parser.h b/include/tree_sitter/parser.h index 3a5bab9a..a335dd6d 100644 --- a/include/tree_sitter/parser.h +++ b/include/tree_sitter/parser.h @@ -48,6 +48,11 @@ typedef struct { bool fragile : 1; } TSParseAction; +typedef struct { + uint16_t lex_state; + uint16_t external_tokens; +} TSLexMode; + typedef union { TSParseAction action; struct { @@ -64,8 +69,15 @@ typedef struct TSLanguage { const TSSymbolMetadata *symbol_metadata; const unsigned short *parse_table; const TSParseActionEntry *parse_actions; - const TSStateId *lex_states; + const TSLexMode *lex_modes; bool (*lex_fn)(TSLexer *, TSStateId); + const TSSymbol *external_token_symbol_map; + const bool *external_token_lists; + struct { + void * (*create)(); + bool (*scan)(TSLexer *, const bool *symbol_whitelist); + void (*destroy)(void *); + } external_scanner; } TSLanguage; /* @@ -146,21 +158,22 @@ typedef struct TSLanguage { { .type = TSParseActionTypeAccept } \ } -#define EXPORT_LANGUAGE(language_name) \ - static TSLanguage language = { \ - .symbol_count = SYMBOL_COUNT, \ - .token_count = TOKEN_COUNT, \ - .symbol_metadata = ts_symbol_metadata, \ - .parse_table = (const unsigned short *)ts_parse_table, \ - .parse_actions = ts_parse_actions, \ - .lex_states = ts_lex_states, \ - .symbol_names = ts_symbol_names, \ - .lex_fn = ts_lex, \ - }; \ - \ - const TSLanguage *language_name() { \ - return &language; \ - } + +#define GET_LANGUAGE(...) \ + static TSLanguage language = { \ + .symbol_count = SYMBOL_COUNT, \ + .token_count = TOKEN_COUNT, \ + .symbol_metadata = ts_symbol_metadata, \ + .parse_table = (const unsigned short *)ts_parse_table, \ + .parse_actions = ts_parse_actions, \ + .lex_modes = ts_lex_modes, \ + .symbol_names = ts_symbol_names, \ + .lex_fn = ts_lex, \ + .external_token_lists = (const bool *)ts_external_token_lists, \ + .external_token_symbol_map = ts_external_token_symbol_map, \ + .external_scanner = {__VA_ARGS__} \ + }; \ + return &language \ #ifdef __cplusplus } diff --git a/project.gyp b/project.gyp index 081a3a88..29b69787 100644 --- a/project.gyp +++ b/project.gyp @@ -47,6 +47,7 @@ 'src/compiler/rules/character_range.cc', 'src/compiler/rules/character_set.cc', 'src/compiler/rules/choice.cc', + 'src/compiler/rules/external_token.cc', 'src/compiler/rules/metadata.cc', 'src/compiler/rules/named_symbol.cc', 'src/compiler/rules/pattern.cc', diff --git a/spec/compiler/build_tables/distinctive_tokens_spec.cc b/spec/compiler/build_tables/distinctive_tokens_spec.cc index 104cd721..f01d76cb 100644 --- a/spec/compiler/build_tables/distinctive_tokens_spec.cc +++ b/spec/compiler/build_tables/distinctive_tokens_spec.cc @@ -27,7 +27,7 @@ describe("recovery_tokens(rule)", []() { })), }; - AssertThat(recovery_tokens(grammar), Equals>({ 1 })); + AssertThat(recovery_tokens(grammar), Equals>({ Symbol(1, Symbol::Terminal) })); }); }); diff --git a/spec/compiler/build_tables/lex_conflict_manager_spec.cc b/spec/compiler/build_tables/lex_conflict_manager_spec.cc index 7f43e175..3aa75a4c 100644 --- a/spec/compiler/build_tables/lex_conflict_manager_spec.cc +++ b/spec/compiler/build_tables/lex_conflict_manager_spec.cc @@ -14,10 +14,10 @@ START_TEST describe("LexConflictManager::resolve(new_action, old_action)", []() { LexConflictManager conflict_manager; bool update; - Symbol sym1(0, true); - Symbol sym2(1, true); - Symbol sym3(2, true); - Symbol sym4(3, true); + Symbol sym1(0, Symbol::Terminal); + Symbol sym2(1, Symbol::Terminal); + Symbol sym3(2, Symbol::Terminal); + Symbol sym4(3, Symbol::Terminal); LexItemSet item_set({ LexItem(sym4, blank() )}); it("favors advance actions over empty accept token actions", [&]() { diff --git a/spec/compiler/build_tables/lex_item_spec.cc b/spec/compiler/build_tables/lex_item_spec.cc index 94997956..7042922f 100644 --- a/spec/compiler/build_tables/lex_item_spec.cc +++ b/spec/compiler/build_tables/lex_item_spec.cc @@ -14,7 +14,7 @@ START_TEST describe("LexItem", []() { describe("completion_status()", [&]() { it("indicates whether the item is done, its precedence, and whether it is a string", [&]() { - LexItem item1(Symbol(0, true), character({ 'a', 'b', 'c' })); + LexItem item1(Symbol(0, Symbol::Terminal), character({ 'a', 'b', 'c' })); AssertThat(item1.completion_status().is_done, IsFalse()); AssertThat(item1.completion_status().precedence, Equals(PrecedenceRange())); AssertThat(item1.completion_status().is_string, IsFalse()); @@ -23,7 +23,7 @@ describe("LexItem", []() { params.precedence = 3; params.has_precedence = true; params.is_string = 1; - LexItem item2(Symbol(0, true), choice({ + LexItem item2(Symbol(0, Symbol::Terminal), choice({ metadata(blank(), params), character({ 'a', 'b', 'c' }) })); @@ -32,7 +32,7 @@ describe("LexItem", []() { AssertThat(item2.completion_status().precedence, Equals(PrecedenceRange(3))); AssertThat(item2.completion_status().is_string, IsTrue()); - LexItem item3(Symbol(0, true), repeat(character({ ' ', '\t' }))); + LexItem item3(Symbol(0, Symbol::Terminal), repeat(character({ ' ', '\t' }))); AssertThat(item3.completion_status().is_done, IsTrue()); AssertThat(item3.completion_status().precedence, Equals(PrecedenceRange())); AssertThat(item3.completion_status().is_string, IsFalse()); @@ -43,7 +43,7 @@ describe("LexItem", []() { describe("LexItemSet::transitions()", [&]() { it("handles single characters", [&]() { LexItemSet item_set({ - LexItem(Symbol(1), character({ 'x' })), + LexItem(Symbol(1, Symbol::NonTerminal), character({ 'x' })), }); AssertThat( @@ -53,7 +53,7 @@ describe("LexItemSet::transitions()", [&]() { CharacterSet().include('x'), Transition{ LexItemSet({ - LexItem(Symbol(1), blank()), + LexItem(Symbol(1, Symbol::NonTerminal), blank()), }), PrecedenceRange(), false @@ -67,7 +67,7 @@ describe("LexItemSet::transitions()", [&]() { params.is_main_token = true; LexItemSet item_set({ - LexItem(Symbol(1), metadata(character({ 'x' }), params)), + LexItem(Symbol(1, Symbol::NonTerminal), metadata(character({ 'x' }), params)), }); AssertThat( @@ -77,7 +77,7 @@ describe("LexItemSet::transitions()", [&]() { CharacterSet().include('x'), Transition{ LexItemSet({ - LexItem(Symbol(1), metadata(blank(), params)), + LexItem(Symbol(1, Symbol::NonTerminal), metadata(blank(), params)), }), PrecedenceRange(), true @@ -88,7 +88,7 @@ describe("LexItemSet::transitions()", [&]() { it("handles sequences", [&]() { LexItemSet item_set({ - LexItem(Symbol(1), seq({ + LexItem(Symbol(1, Symbol::NonTerminal), seq({ character({ 'w' }), character({ 'x' }), character({ 'y' }), @@ -103,7 +103,7 @@ describe("LexItemSet::transitions()", [&]() { CharacterSet().include('w'), Transition{ LexItemSet({ - LexItem(Symbol(1), seq({ + LexItem(Symbol(1, Symbol::NonTerminal), seq({ character({ 'x' }), character({ 'y' }), character({ 'z' }), @@ -118,7 +118,7 @@ describe("LexItemSet::transitions()", [&]() { it("handles sequences with nested precedence", [&]() { LexItemSet item_set({ - LexItem(Symbol(1), seq({ + LexItem(Symbol(1, Symbol::NonTerminal), seq({ prec(3, seq({ character({ 'v' }), prec(4, seq({ @@ -140,7 +140,7 @@ describe("LexItemSet::transitions()", [&]() { // The outer precedence is now 'active', because we are within its // contained rule. LexItemSet({ - LexItem(Symbol(1), seq({ + LexItem(Symbol(1, Symbol::NonTerminal), seq({ active_prec(3, seq({ prec(4, seq({ character({ 'w' }), @@ -168,7 +168,7 @@ describe("LexItemSet::transitions()", [&]() { Transition{ // The inner precedence is now 'active' LexItemSet({ - LexItem(Symbol(1), seq({ + LexItem(Symbol(1, Symbol::NonTerminal), seq({ active_prec(3, seq({ active_prec(4, character({ 'x' })), character({ 'y' }) })), @@ -193,7 +193,7 @@ describe("LexItemSet::transitions()", [&]() { CharacterSet().include('x'), Transition{ LexItemSet({ - LexItem(Symbol(1), seq({ + LexItem(Symbol(1, Symbol::NonTerminal), seq({ active_prec(3, character({ 'y' })), character({ 'z' }), })), @@ -216,7 +216,7 @@ describe("LexItemSet::transitions()", [&]() { CharacterSet().include('y'), Transition{ LexItemSet({ - LexItem(Symbol(1), character({ 'z' })), + LexItem(Symbol(1, Symbol::NonTerminal), character({ 'z' })), }), PrecedenceRange(3), false @@ -227,7 +227,7 @@ describe("LexItemSet::transitions()", [&]() { it("handles sequences where the left hand side can be blank", [&]() { LexItemSet item_set({ - LexItem(Symbol(1), seq({ + LexItem(Symbol(1, Symbol::NonTerminal), seq({ choice({ character({ 'x' }), blank(), @@ -244,7 +244,7 @@ describe("LexItemSet::transitions()", [&]() { CharacterSet().include('x'), Transition{ LexItemSet({ - LexItem(Symbol(1), seq({ + LexItem(Symbol(1, Symbol::NonTerminal), seq({ character({ 'y' }), character({ 'z' }), })), @@ -257,7 +257,7 @@ describe("LexItemSet::transitions()", [&]() { CharacterSet().include('y'), Transition{ LexItemSet({ - LexItem(Symbol(1), character({ 'z' })), + LexItem(Symbol(1, Symbol::NonTerminal), character({ 'z' })), }), PrecedenceRange(), false @@ -268,7 +268,7 @@ describe("LexItemSet::transitions()", [&]() { it("handles blanks", [&]() { LexItemSet item_set({ - LexItem(Symbol(1), blank()), + LexItem(Symbol(1, Symbol::NonTerminal), blank()), }); AssertThat(item_set.transitions(), IsEmpty()); @@ -276,11 +276,11 @@ describe("LexItemSet::transitions()", [&]() { it("handles repeats", [&]() { LexItemSet item_set({ - LexItem(Symbol(1), repeat1(seq({ + LexItem(Symbol(1, Symbol::NonTerminal), repeat1(seq({ character({ 'a' }), character({ 'b' }), }))), - LexItem(Symbol(2), repeat1(character({ 'c' }))), + LexItem(Symbol(2, Symbol::NonTerminal), repeat1(character({ 'c' }))), }); AssertThat( @@ -290,14 +290,14 @@ describe("LexItemSet::transitions()", [&]() { CharacterSet().include('a'), Transition{ LexItemSet({ - LexItem(Symbol(1), seq({ + LexItem(Symbol(1, Symbol::NonTerminal), seq({ character({ 'b' }), repeat1(seq({ character({ 'a' }), character({ 'b' }), })) })), - LexItem(Symbol(1), character({ 'b' })), + LexItem(Symbol(1, Symbol::NonTerminal), character({ 'b' })), }), PrecedenceRange(), false @@ -307,8 +307,8 @@ describe("LexItemSet::transitions()", [&]() { CharacterSet().include('c'), Transition{ LexItemSet({ - LexItem(Symbol(2), repeat1(character({ 'c' }))), - LexItem(Symbol(2), blank()), + LexItem(Symbol(2, Symbol::NonTerminal), repeat1(character({ 'c' }))), + LexItem(Symbol(2, Symbol::NonTerminal), blank()), }), PrecedenceRange(), false @@ -319,7 +319,7 @@ describe("LexItemSet::transitions()", [&]() { it("handles repeats with precedence", [&]() { LexItemSet item_set({ - LexItem(Symbol(1), active_prec(-1, repeat1(character({ 'a' })))) + LexItem(Symbol(1, Symbol::NonTerminal), active_prec(-1, repeat1(character({ 'a' })))) }); AssertThat( @@ -329,8 +329,8 @@ describe("LexItemSet::transitions()", [&]() { CharacterSet().include('a'), Transition{ LexItemSet({ - LexItem(Symbol(1), active_prec(-1, repeat1(character({ 'a' })))), - LexItem(Symbol(1), active_prec(-1, blank())), + LexItem(Symbol(1, Symbol::NonTerminal), active_prec(-1, repeat1(character({ 'a' })))), + LexItem(Symbol(1, Symbol::NonTerminal), active_prec(-1, blank())), }), PrecedenceRange(-1), false @@ -341,7 +341,7 @@ describe("LexItemSet::transitions()", [&]() { it("handles choices between overlapping character sets", [&]() { LexItemSet item_set({ - LexItem(Symbol(1), choice({ + LexItem(Symbol(1, Symbol::NonTerminal), choice({ active_prec(2, seq({ character({ 'a', 'b', 'c', 'd' }), character({ 'x' }), @@ -360,7 +360,7 @@ describe("LexItemSet::transitions()", [&]() { CharacterSet().include('a', 'b'), Transition{ LexItemSet({ - LexItem(Symbol(1), active_prec(2, character({ 'x' }))), + LexItem(Symbol(1, Symbol::NonTerminal), active_prec(2, character({ 'x' }))), }), PrecedenceRange(2), false @@ -370,8 +370,8 @@ describe("LexItemSet::transitions()", [&]() { CharacterSet().include('c', 'd'), Transition{ LexItemSet({ - LexItem(Symbol(1), active_prec(2, character({ 'x' }))), - LexItem(Symbol(1), active_prec(3, character({ 'y' }))), + LexItem(Symbol(1, Symbol::NonTerminal), active_prec(2, character({ 'x' }))), + LexItem(Symbol(1, Symbol::NonTerminal), active_prec(3, character({ 'y' }))), }), PrecedenceRange(2, 3), false @@ -381,7 +381,7 @@ describe("LexItemSet::transitions()", [&]() { CharacterSet().include('e', 'f'), Transition{ LexItemSet({ - LexItem(Symbol(1), active_prec(3, character({ 'y' }))), + LexItem(Symbol(1, Symbol::NonTerminal), active_prec(3, character({ 'y' }))), }), PrecedenceRange(3), false @@ -392,7 +392,7 @@ describe("LexItemSet::transitions()", [&]() { it("handles choices between a subset and a superset of characters", [&]() { LexItemSet item_set({ - LexItem(Symbol(1), choice({ + LexItem(Symbol(1, Symbol::NonTerminal), choice({ seq({ character({ 'b', 'c', 'd' }), character({ 'x' }), @@ -411,7 +411,7 @@ describe("LexItemSet::transitions()", [&]() { CharacterSet().include('a').include('e', 'f'), Transition{ LexItemSet({ - LexItem(Symbol(1), character({ 'y' })), + LexItem(Symbol(1, Symbol::NonTerminal), character({ 'y' })), }), PrecedenceRange(), false @@ -421,8 +421,8 @@ describe("LexItemSet::transitions()", [&]() { CharacterSet().include('b', 'd'), Transition{ LexItemSet({ - LexItem(Symbol(1), character({ 'x' })), - LexItem(Symbol(1), character({ 'y' })), + LexItem(Symbol(1, Symbol::NonTerminal), character({ 'x' })), + LexItem(Symbol(1, Symbol::NonTerminal), character({ 'y' })), }), PrecedenceRange(), false @@ -433,7 +433,7 @@ describe("LexItemSet::transitions()", [&]() { it("handles choices between whitelisted and blacklisted character sets", [&]() { LexItemSet item_set({ - LexItem(Symbol(1), seq({ + LexItem(Symbol(1, Symbol::NonTerminal), seq({ choice({ character({ '/' }, false), seq({ @@ -452,7 +452,7 @@ describe("LexItemSet::transitions()", [&]() { CharacterSet().include_all().exclude('/').exclude('\\'), Transition{ LexItemSet({ - LexItem(Symbol(1), character({ '/' })), + LexItem(Symbol(1, Symbol::NonTerminal), character({ '/' })), }), PrecedenceRange(), false @@ -462,8 +462,8 @@ describe("LexItemSet::transitions()", [&]() { CharacterSet().include('\\'), Transition{ LexItemSet({ - LexItem(Symbol(1), character({ '/' })), - LexItem(Symbol(1), seq({ character({ '/' }), character({ '/' }) })), + LexItem(Symbol(1, Symbol::NonTerminal), character({ '/' })), + LexItem(Symbol(1, Symbol::NonTerminal), seq({ character({ '/' }), character({ '/' }) })), }), PrecedenceRange(), false @@ -474,8 +474,8 @@ describe("LexItemSet::transitions()", [&]() { it("handles different items with overlapping character sets", [&]() { LexItemSet set1({ - LexItem(Symbol(1), character({ 'a', 'b', 'c', 'd', 'e', 'f' })), - LexItem(Symbol(2), character({ 'e', 'f', 'g', 'h', 'i' })) + LexItem(Symbol(1, Symbol::NonTerminal), character({ 'a', 'b', 'c', 'd', 'e', 'f' })), + LexItem(Symbol(2, Symbol::NonTerminal), character({ 'e', 'f', 'g', 'h', 'i' })) }); AssertThat(set1.transitions(), Equals(LexItemSet::TransitionMap({ @@ -483,7 +483,7 @@ describe("LexItemSet::transitions()", [&]() { CharacterSet().include('a', 'd'), Transition{ LexItemSet({ - LexItem(Symbol(1), blank()), + LexItem(Symbol(1, Symbol::NonTerminal), blank()), }), PrecedenceRange(), false @@ -493,8 +493,8 @@ describe("LexItemSet::transitions()", [&]() { CharacterSet().include('e', 'f'), Transition{ LexItemSet({ - LexItem(Symbol(1), blank()), - LexItem(Symbol(2), blank()), + LexItem(Symbol(1, Symbol::NonTerminal), blank()), + LexItem(Symbol(2, Symbol::NonTerminal), blank()), }), PrecedenceRange(), false @@ -504,7 +504,7 @@ describe("LexItemSet::transitions()", [&]() { CharacterSet().include('g', 'i'), Transition{ LexItemSet({ - LexItem(Symbol(2), blank()), + LexItem(Symbol(2, Symbol::NonTerminal), blank()), }), PrecedenceRange(), false diff --git a/spec/compiler/build_tables/parse_item_set_builder_spec.cc b/spec/compiler/build_tables/parse_item_set_builder_spec.cc index a1dd2231..dad0976b 100644 --- a/spec/compiler/build_tables/parse_item_set_builder_spec.cc +++ b/spec/compiler/build_tables/parse_item_set_builder_spec.cc @@ -27,23 +27,23 @@ describe("ParseItemSetBuilder", []() { SyntaxGrammar grammar{{ SyntaxVariable("rule0", VariableTypeNamed, { Production({ - {Symbol(1), 0, AssociativityNone}, - {Symbol(11, true), 0, AssociativityNone}, + {Symbol(1, Symbol::NonTerminal), 0, AssociativityNone}, + {Symbol(11, Symbol::Terminal), 0, AssociativityNone}, }), }), SyntaxVariable("rule1", VariableTypeNamed, { Production({ - {Symbol(12, true), 0, AssociativityNone}, - {Symbol(13, true), 0, AssociativityNone}, + {Symbol(12, Symbol::Terminal), 0, AssociativityNone}, + {Symbol(13, Symbol::Terminal), 0, AssociativityNone}, }), Production({ - {Symbol(2), 0, AssociativityNone}, + {Symbol(2, Symbol::NonTerminal), 0, AssociativityNone}, }) }), SyntaxVariable("rule2", VariableTypeNamed, { Production({ - {Symbol(14, true), 0, AssociativityNone}, - {Symbol(15, true), 0, AssociativityNone}, + {Symbol(14, Symbol::Terminal), 0, AssociativityNone}, + {Symbol(15, Symbol::Terminal), 0, AssociativityNone}, }) }), }, {}, {}}; @@ -54,8 +54,8 @@ describe("ParseItemSetBuilder", []() { ParseItemSet item_set({ { - ParseItem(Symbol(0), production(0, 0), 0), - LookaheadSet({ 10 }), + ParseItem(Symbol(0, Symbol::NonTerminal), production(0, 0), 0), + LookaheadSet({ Symbol(10, Symbol::Terminal) }), } }); @@ -64,20 +64,20 @@ describe("ParseItemSetBuilder", []() { AssertThat(item_set, Equals(ParseItemSet({ { - ParseItem(Symbol(0), production(0, 0), 0), - LookaheadSet({ 10 }) + ParseItem(Symbol(0, Symbol::NonTerminal), production(0, 0), 0), + LookaheadSet({ Symbol(10, Symbol::Terminal) }) + }, + { + ParseItem(Symbol(1, Symbol::NonTerminal), production(1, 0), 0), + LookaheadSet({ Symbol(11, Symbol::Terminal) }) }, { - ParseItem(Symbol(1), production(1, 0), 0), - LookaheadSet({ 11 }) + ParseItem(Symbol(1, Symbol::NonTerminal), production(1, 1), 0), + LookaheadSet({ Symbol(11, Symbol::Terminal) }) }, { - ParseItem(Symbol(1), production(1, 1), 0), - LookaheadSet({ 11 }) - }, - { - ParseItem(Symbol(2), production(2, 0), 0), - LookaheadSet({ 11 }) + ParseItem(Symbol(2, Symbol::NonTerminal), production(2, 0), 0), + LookaheadSet({ Symbol(11, Symbol::Terminal) }) }, }))); }); @@ -86,14 +86,14 @@ describe("ParseItemSetBuilder", []() { SyntaxGrammar grammar{{ SyntaxVariable("rule0", VariableTypeNamed, { Production({ - {Symbol(1), 0, AssociativityNone}, - {Symbol(11, true), 0, AssociativityNone}, + {Symbol(1, Symbol::NonTerminal), 0, AssociativityNone}, + {Symbol(11, Symbol::Terminal), 0, AssociativityNone}, }), }), SyntaxVariable("rule1", VariableTypeNamed, { Production({ - {Symbol(12, true), 0, AssociativityNone}, - {Symbol(13, true), 0, AssociativityNone}, + {Symbol(12, Symbol::Terminal), 0, AssociativityNone}, + {Symbol(13, Symbol::Terminal), 0, AssociativityNone}, }), Production({}) }), @@ -105,8 +105,8 @@ describe("ParseItemSetBuilder", []() { ParseItemSet item_set({ { - ParseItem(Symbol(0), production(0, 0), 0), - LookaheadSet({ 10 }), + ParseItem(Symbol(0, Symbol::NonTerminal), production(0, 0), 0), + LookaheadSet({ Symbol(10, Symbol::Terminal) }), } }); @@ -115,16 +115,16 @@ describe("ParseItemSetBuilder", []() { AssertThat(item_set, Equals(ParseItemSet({ { - ParseItem(Symbol(0), production(0, 0), 0), - LookaheadSet({ 10 }) + ParseItem(Symbol(0, Symbol::NonTerminal), production(0, 0), 0), + LookaheadSet({ Symbol(10, Symbol::Terminal) }) }, { - ParseItem(Symbol(1), production(1, 0), 0), - LookaheadSet({ 11 }) + ParseItem(Symbol(1, Symbol::NonTerminal), production(1, 0), 0), + LookaheadSet({ Symbol(11, Symbol::Terminal) }) }, { - ParseItem(Symbol(1), production(1, 1), 0), - LookaheadSet({ 11 }) + ParseItem(Symbol(1, Symbol::NonTerminal), production(1, 1), 0), + LookaheadSet({ Symbol(11, Symbol::Terminal) }) }, }))); }); diff --git a/spec/compiler/prepare_grammar/extract_tokens_spec.cc b/spec/compiler/prepare_grammar/extract_tokens_spec.cc index 9f871ec4..577dead1 100644 --- a/spec/compiler/prepare_grammar/extract_tokens_spec.cc +++ b/spec/compiler/prepare_grammar/extract_tokens_spec.cc @@ -133,13 +133,13 @@ describe("extract_tokens", []() { Variable("rule_A", VariableTypeNamed, str("ok")), Variable("rule_B", VariableTypeNamed, repeat(i_sym(0))), Variable("rule_C", VariableTypeNamed, repeat(seq({ i_sym(0), i_sym(0) }))), - }, { str(" ") }, { { Symbol(1), Symbol(2) } }}); + }, { str(" ") }, { { Symbol(1, Symbol::NonTerminal), Symbol(2, Symbol::NonTerminal) } }}); InitialSyntaxGrammar &syntax_grammar = get<0>(result); AssertThat(syntax_grammar.variables.size(), Equals(2)); AssertThat(syntax_grammar.expected_conflicts, Equals(set>({ - { Symbol(0), Symbol(1) }, + { Symbol(0, Symbol::NonTerminal), Symbol(1, Symbol::NonTerminal) }, }))); }); @@ -171,7 +171,7 @@ describe("extract_tokens", []() { AssertThat(get<2>(result), Equals(CompileError::none())); AssertThat(get<1>(result).separators.size(), Equals(0)); - AssertThat(get<0>(result).extra_tokens, Equals(set({ Symbol(1, true) }))); + AssertThat(get<0>(result).extra_tokens, Equals(set({ Symbol(1, Symbol::Terminal) }))); }); it("updates extra symbols according to the new symbol numbers", [&]() { @@ -186,7 +186,7 @@ describe("extract_tokens", []() { AssertThat(get<2>(result), Equals(CompileError::none())); AssertThat(get<0>(result).extra_tokens, Equals(set({ - { Symbol(3, true) }, + { Symbol(3, Symbol::Terminal) }, }))); AssertThat(get<1>(result).separators, IsEmpty()); diff --git a/spec/compiler/prepare_grammar/flatten_grammar_spec.cc b/spec/compiler/prepare_grammar/flatten_grammar_spec.cc index 3efd4e03..823da8e6 100644 --- a/spec/compiler/prepare_grammar/flatten_grammar_spec.cc +++ b/spec/compiler/prepare_grammar/flatten_grammar_spec.cc @@ -36,19 +36,19 @@ describe("flatten_grammar", []() { AssertThat(result.type, Equals(VariableTypeNamed)); AssertThat(result.productions, Equals(vector({ Production({ - {Symbol(1), 0, AssociativityNone}, - {Symbol(2), 101, AssociativityLeft}, - {Symbol(3), 102, AssociativityRight}, - {Symbol(4), 101, AssociativityLeft}, - {Symbol(6), 0, AssociativityNone}, - {Symbol(7), 0, AssociativityNone}, + {Symbol(1, Symbol::NonTerminal), 0, AssociativityNone}, + {Symbol(2, Symbol::NonTerminal), 101, AssociativityLeft}, + {Symbol(3, Symbol::NonTerminal), 102, AssociativityRight}, + {Symbol(4, Symbol::NonTerminal), 101, AssociativityLeft}, + {Symbol(6, Symbol::NonTerminal), 0, AssociativityNone}, + {Symbol(7, Symbol::NonTerminal), 0, AssociativityNone}, }), Production({ - {Symbol(1), 0, AssociativityNone}, - {Symbol(2), 101, AssociativityLeft}, - {Symbol(5), 101, AssociativityLeft}, - {Symbol(6), 0, AssociativityNone}, - {Symbol(7), 0, AssociativityNone}, + {Symbol(1, Symbol::NonTerminal), 0, AssociativityNone}, + {Symbol(2, Symbol::NonTerminal), 101, AssociativityLeft}, + {Symbol(5, Symbol::NonTerminal), 101, AssociativityLeft}, + {Symbol(6, Symbol::NonTerminal), 0, AssociativityNone}, + {Symbol(7, Symbol::NonTerminal), 0, AssociativityNone}, }) }))) }); @@ -65,8 +65,8 @@ describe("flatten_grammar", []() { AssertThat(result.productions, Equals(vector({ Production({ - {Symbol(1), 101, AssociativityLeft}, - {Symbol(2), 101, AssociativityLeft}, + {Symbol(1, Symbol::NonTerminal), 101, AssociativityLeft}, + {Symbol(2, Symbol::NonTerminal), 101, AssociativityLeft}, }) }))) @@ -80,7 +80,7 @@ describe("flatten_grammar", []() { AssertThat(result.productions, Equals(vector({ Production({ - {Symbol(1), 101, AssociativityLeft}, + {Symbol(1, Symbol::NonTerminal), 101, AssociativityLeft}, }) }))) }); diff --git a/spec/compiler/rules/repeat_spec.cc b/spec/compiler/rules/repeat_spec.cc index 63680563..9c84c8e5 100644 --- a/spec/compiler/rules/repeat_spec.cc +++ b/spec/compiler/rules/repeat_spec.cc @@ -9,7 +9,7 @@ START_TEST describe("Repeat", []() { describe("constructing repeats", [&]() { it("doesn't create redundant repeats", [&]() { - auto sym = make_shared(1); + auto sym = make_shared(1, Symbol::NonTerminal); auto repeat = Repeat::build(sym); auto outer_repeat = Repeat::build(repeat); diff --git a/spec/fixtures/external_scanners/external_scan.c b/spec/fixtures/external_scanners/external_scan.c new file mode 100644 index 00000000..7abab3ae --- /dev/null +++ b/spec/fixtures/external_scanners/external_scan.c @@ -0,0 +1,13 @@ +#include + +void *ts_language_external_scanner_example_external_scanner_create() { + puts("HELLO FROM EXTERNAL SCANNER"); + return 0; +} + +bool ts_language_external_scanner_example_external_scanner_scan() { + return true; +} + +void ts_language_external_scanner_example_external_scanner_destroy() { +} diff --git a/spec/helpers/load_language.cc b/spec/helpers/load_language.cc index a29aa240..2e85b762 100644 --- a/spec/helpers/load_language.cc +++ b/spec/helpers/load_language.cc @@ -67,7 +67,8 @@ static int get_modified_time(const string &path) { const TSLanguage *load_language(const string &source_filename, const string &lib_filename, - const string &language_name) { + const string &language_name, + string external_scanner_path = "") { string language_function_name = "ts_language_" + language_name; string header_dir = getenv("PWD") + string("/include"); int source_mtime = get_modified_time(source_filename); @@ -119,7 +120,9 @@ const TSLanguage *load_language(const string &source_filename, return language_fn(); } -const TSLanguage *load_compile_result(const string &name, const TSCompileResult &compile_result) { +const TSLanguage *load_compile_result(const string &name, + const TSCompileResult &compile_result, + string external_scanner_path) { if (compile_result.error_type != TSCompileErrorTypeNone) { Assert::Failure(string("Compilation failed ") + compile_result.error_message); return nullptr; @@ -135,7 +138,7 @@ const TSLanguage *load_compile_result(const string &name, const TSCompileResult source_file << compile_result.code; source_file.close(); - const TSLanguage *language = load_language(source_filename, lib_filename, name); + auto language = load_language(source_filename, lib_filename, name, external_scanner_path); free(compile_result.code); return language; } diff --git a/spec/helpers/load_language.h b/spec/helpers/load_language.h index 41b1458e..41d8b739 100644 --- a/spec/helpers/load_language.h +++ b/spec/helpers/load_language.h @@ -5,7 +5,8 @@ #include "tree_sitter/runtime.h" #include -const TSLanguage *load_compile_result(const std::string &, const TSCompileResult &); +const TSLanguage *load_compile_result(const std::string &, const TSCompileResult &, + std::string external_scanner_path = ""); const TSLanguage *get_test_language(const std::string &language_name); #endif // HELPERS_LOAD_LANGUAGE_H_ diff --git a/spec/helpers/rule_helpers.cc b/spec/helpers/rule_helpers.cc index 8bf32360..0b010d2e 100644 --- a/spec/helpers/rule_helpers.cc +++ b/spec/helpers/rule_helpers.cc @@ -9,6 +9,7 @@ namespace tree_sitter { using std::ostream; using std::string; using std::to_string; + using rules::Symbol; rule_ptr character(const set &ranges) { return character(ranges, true); @@ -28,11 +29,11 @@ namespace tree_sitter { } rule_ptr i_sym(size_t index) { - return make_shared(index); + return make_shared(index, Symbol::NonTerminal); } rule_ptr i_token(size_t index) { - return make_shared(index, true); + return make_shared(index, Symbol::Terminal); } rule_ptr metadata(rule_ptr rule, rules::MetadataParams params) { diff --git a/spec/helpers/stream_methods.cc b/spec/helpers/stream_methods.cc index 4d411d66..b47363a0 100644 --- a/spec/helpers/stream_methods.cc +++ b/spec/helpers/stream_methods.cc @@ -10,16 +10,7 @@ namespace tree_sitter { ostream &operator<<(ostream &stream, const Grammar &grammar) { stream << string("# "); - stream << pair.second; - started = true; - } + stream << " rules: " << grammar.rules; return stream << string("}>"); } diff --git a/spec/integration/compile_grammar_spec.cc b/spec/integration/compile_grammar_spec.cc index d41d76e4..21307c89 100644 --- a/spec/integration/compile_grammar_spec.cc +++ b/spec/integration/compile_grammar_spec.cc @@ -507,6 +507,71 @@ describe("compile_grammar", []() { }); }); + describe("external scanners", [&]() { + it("can call out to arbitrary scanner functions during parsing", [&]() { + string grammar = R"JSON({ + "name": "external_scanner_example", + + "externals": [ + "percent_string", + "percent_string_start", + "percent_string_end" + ], + + "rules": { + "string": { + "type": "CHOICE", + "members": [ + { + "type": "EXTERNAL_TOKEN", + "name": "percent_string" + }, + { + "type": "SEQ", + "members": [ + { + "type": "EXTERNAL_TOKEN", + "name": "percent_string_start" + }, + { + "type": "SYMBOL", + "name": "identifier" + }, + { + "type": "EXTERNAL_TOKEN", + "name": "percent_string_end" + } + ] + }, + ] + }, + + "identifier": { + "type": "PATTERN", + "value": "\\a+" + } + } + })JSON"; + + TSCompileResult result = ts_compile_grammar(grammar.c_str()); + AssertThat(result.error_message, IsNull()); + + ts_document_set_language(document, load_compile_result( + "external_scanner_example", + result, + "spec/fixtures/external_scanners/external_scan.c" + )); + + ts_document_set_input_string(document, "%|hi|"); + ts_document_parse(document); + assert_root_node("(string)"); + + ts_document_set_input_string(document, "%(1 #{two} three)"); + ts_document_parse(document); + assert_root_node("(string (identifier))"); + }); + }); + describe("when the grammar's start symbol is a token", [&]() { it("parses the token", [&]() { TSCompileResult result = ts_compile_grammar(R"JSON( diff --git a/spec/integration/corpus_specs.cc b/spec/integration/corpus_specs.cc index 9d716ed1..86a1dc47 100644 --- a/spec/integration/corpus_specs.cc +++ b/spec/integration/corpus_specs.cc @@ -80,10 +80,10 @@ START_TEST describe("The Corpus", []() { vector test_languages({ - "javascript", + // "javascript", "json", - "c", - "cpp", + // "c", + // "cpp", }); for (auto &language_name : test_languages) { diff --git a/src/compiler/build_tables/build_lex_table.cc b/src/compiler/build_tables/build_lex_table.cc index 151da7cf..29d8f4d0 100644 --- a/src/compiler/build_tables/build_lex_table.cc +++ b/src/compiler/build_tables/build_lex_table.cc @@ -64,7 +64,7 @@ class LexTableBuilder { private: void add_lex_state_for_parse_state(ParseState *parse_state) { parse_state->lex_state_id = - add_lex_state(item_set_for_tokens(parse_state->expected_inputs())); + add_lex_state(item_set_for_terminals(parse_state->terminal_entries)); } LexStateId add_lex_state(const LexItemSet &item_set) { @@ -112,24 +112,27 @@ class LexTableBuilder { void mark_fragile_tokens() { for (ParseState &state : parse_table->states) { for (auto &entry : state.terminal_entries) { - auto homonyms = conflict_manager.possible_homonyms.find(entry.first); - if (homonyms != conflict_manager.possible_homonyms.end()) - for (Symbol::Index homonym : homonyms->second) - if (state.terminal_entries.count(homonym)) { - entry.second.reusable = false; - break; - } + Symbol symbol = entry.first; + if (symbol.is_token()) { + auto homonyms = conflict_manager.possible_homonyms.find(symbol.index); + if (homonyms != conflict_manager.possible_homonyms.end()) + for (Symbol::Index homonym : homonyms->second) + if (state.terminal_entries.count(Symbol(homonym, Symbol::Terminal))) { + entry.second.reusable = false; + break; + } - if (!entry.second.reusable) - continue; + if (!entry.second.reusable) + continue; - auto extensions = conflict_manager.possible_extensions.find(entry.first); - if (extensions != conflict_manager.possible_extensions.end()) - for (Symbol::Index extension : extensions->second) - if (state.terminal_entries.count(extension)) { - entry.second.depends_on_lookahead = true; - break; - } + auto extensions = conflict_manager.possible_extensions.find(symbol.index); + if (extensions != conflict_manager.possible_extensions.end()) + for (Symbol::Index extension : extensions->second) + if (state.terminal_entries.count(Symbol(extension, Symbol::Terminal))) { + entry.second.depends_on_lookahead = true; + break; + } + } } } } @@ -150,24 +153,27 @@ class LexTableBuilder { } } - LexItemSet item_set_for_tokens(const set &symbols) { + LexItemSet item_set_for_terminals(const map &terminals) { LexItemSet result; - for (const Symbol &symbol : symbols) - for (const rule_ptr &rule : rules_for_symbol(symbol)) - for (const rule_ptr &separator_rule : separator_rules) - result.entries.insert(LexItem( - symbol, - Metadata::separator( - Seq::build({ - separator_rule, - Metadata::main_token(rule) })))); + for (const auto &pair : terminals) { + Symbol symbol = pair.first; + if (symbol.is_token()) { + for (const rule_ptr &rule : rules_for_symbol(symbol)) { + for (const rule_ptr &separator_rule : separator_rules) { + result.entries.insert(LexItem( + symbol, + Metadata::separator( + Seq::build({ + separator_rule, + Metadata::main_token(rule) })))); + } + } + } + } return result; } vector rules_for_symbol(const rules::Symbol &symbol) { - if (!symbol.is_token) - return {}; - if (symbol == rules::END_OF_INPUT()) return { CharacterSet().include(0).copy() }; diff --git a/src/compiler/build_tables/build_parse_table.cc b/src/compiler/build_tables/build_parse_table.cc index 91444310..819ce345 100644 --- a/src/compiler/build_tables/build_parse_table.cc +++ b/src/compiler/build_tables/build_parse_table.cc @@ -52,7 +52,10 @@ class ParseTableBuilder { allow_any_conflict(false) {} pair build() { - Symbol start_symbol = Symbol(0, grammar.variables.empty()); + Symbol start_symbol = grammar.variables.empty() ? + Symbol(0, Symbol::Terminal) : + Symbol(0, Symbol::NonTerminal); + Production start_production({ ProductionStep(start_symbol, 0, rules::AssociativityNone), }); @@ -63,7 +66,7 @@ class ParseTableBuilder { add_parse_state(ParseItemSet({ { ParseItem(rules::START(), start_production, 0), - LookaheadSet({ END_OF_INPUT().index }), + LookaheadSet({ END_OF_INPUT() }), }, })); @@ -107,21 +110,21 @@ class ParseTableBuilder { void build_error_parse_state() { ParseState error_state; - for (const Symbol::Index index : parse_table.mergeable_symbols) { - add_out_of_context_parse_state(&error_state, Symbol(index, true)); + for (const Symbol symbol : parse_table.mergeable_symbols) { + add_out_of_context_parse_state(&error_state, symbol); } for (const Symbol &symbol : grammar.extra_tokens) { - if (!error_state.terminal_entries.count(symbol.index)) { - error_state.terminal_entries[symbol.index].actions.push_back(ParseAction::ShiftExtra()); + if (!error_state.terminal_entries.count(symbol)) { + error_state.terminal_entries[symbol].actions.push_back(ParseAction::ShiftExtra()); } } for (size_t i = 0; i < grammar.variables.size(); i++) { - add_out_of_context_parse_state(&error_state, Symbol(i, false)); + add_out_of_context_parse_state(&error_state, Symbol(i, Symbol::NonTerminal)); } - error_state.terminal_entries[END_OF_INPUT().index].actions.push_back(ParseAction::Recover(0)); + error_state.terminal_entries[END_OF_INPUT()].actions.push_back(ParseAction::Recover(0)); parse_table.states[0] = error_state; } @@ -130,10 +133,10 @@ class ParseTableBuilder { const ParseItemSet &item_set = recovery_states[symbol]; if (!item_set.entries.empty()) { ParseStateId state = add_parse_state(item_set); - if (symbol.is_token) { - error_state->terminal_entries[symbol.index].actions.assign({ ParseAction::Recover(state) }); - } else { + if (symbol.is_non_terminal()) { error_state->nonterminal_entries[symbol.index] = state; + } else { + error_state->terminal_entries[symbol].actions.assign({ ParseAction::Recover(state) }); } } } @@ -152,9 +155,9 @@ class ParseTableBuilder { } string add_actions(const ParseItemSet &item_set, ParseStateId state_id) { - map terminal_successors; + map terminal_successors; map nonterminal_successors; - set lookaheads_with_conflicts; + set lookaheads_with_conflicts; for (const auto &pair : item_set.entries) { const ParseItem &item = pair.first; @@ -168,7 +171,7 @@ class ParseTableBuilder { ParseAction::Reduce(item.lhs(), item.step_index, *item.production); int precedence = item.precedence(); - for (const Symbol::Index lookahead : *lookahead_symbols.entries) { + for (Symbol lookahead : *lookahead_symbols.entries) { ParseTableEntry &entry = parse_table.states[state_id].terminal_entries[lookahead]; // Only add the highest-precedence Reduce actions to the parse table. @@ -203,10 +206,10 @@ class ParseTableBuilder { Symbol symbol = item.production->at(item.step_index).symbol; ParseItem new_item(item.lhs(), *item.production, item.step_index + 1); - if (symbol.is_token) { - terminal_successors[symbol.index].entries[new_item] = lookahead_symbols; - } else { + if (symbol.is_non_terminal()) { nonterminal_successors[symbol.index].entries[new_item] = lookahead_symbols; + } else { + terminal_successors[symbol].entries[new_item] = lookahead_symbols; } } } @@ -214,7 +217,7 @@ class ParseTableBuilder { // Add a Shift action for each possible successor state. Shift actions for // terminal lookaheads can conflict with Reduce actions added previously. for (auto &pair : terminal_successors) { - Symbol::Index lookahead = pair.first; + Symbol lookahead = pair.first; ParseItemSet &next_item_set = pair.second; ParseStateId next_state_id = add_parse_state(next_item_set); ParseState &state = parse_table.states[state_id]; @@ -223,7 +226,7 @@ class ParseTableBuilder { if (!allow_any_conflict) { if (had_existing_action) lookaheads_with_conflicts.insert(lookahead); - recovery_states[Symbol(lookahead, true)].add(next_item_set); + recovery_states[lookahead].add(next_item_set); } } @@ -234,10 +237,10 @@ class ParseTableBuilder { ParseStateId next_state = add_parse_state(next_item_set); parse_table.set_nonterminal_action(state_id, lookahead, next_state); if (!allow_any_conflict) - recovery_states[Symbol(lookahead, false)].add(next_item_set); + recovery_states[Symbol(lookahead, Symbol::NonTerminal)].add(next_item_set); } - for (Symbol::Index lookahead : lookaheads_with_conflicts) { + for (Symbol lookahead : lookaheads_with_conflicts) { string conflict = handle_conflict(item_set, state_id, lookahead); if (!conflict.empty()) return conflict; } @@ -245,9 +248,9 @@ class ParseTableBuilder { ParseAction shift_extra = ParseAction::ShiftExtra(); ParseState &state = parse_table.states[state_id]; for (const Symbol &extra_symbol : grammar.extra_tokens) { - if (!state.terminal_entries.count(extra_symbol.index) || + if (!state.terminal_entries.count(extra_symbol) || state.has_shift_action() || allow_any_conflict) { - parse_table.add_terminal_action(state_id, extra_symbol.index, shift_extra); + parse_table.add_terminal_action(state_id, extra_symbol, shift_extra); } } @@ -257,7 +260,6 @@ class ParseTableBuilder { void mark_fragile_actions() { for (ParseState &state : parse_table.states) { for (auto &entry : state.terminal_entries) { - const Symbol symbol(entry.first, true); auto &actions = entry.second.actions; for (ParseAction &action : actions) { @@ -359,7 +361,7 @@ class ParseTableBuilder { } string handle_conflict(const ParseItemSet &item_set, ParseStateId state_id, - Symbol::Index lookahead) { + Symbol lookahead) { ParseTableEntry &entry = parse_table.states[state_id].terminal_entries[lookahead]; int reduction_precedence = entry.actions.front().precedence(); set shift_items; @@ -468,7 +470,7 @@ class ParseTableBuilder { description += " " + symbol_name(earliest_starting_item.production->at(i).symbol); } - description += " \u2022 " + symbol_name(Symbol(lookahead, true)) + " \u2026"; + description += " \u2022 " + symbol_name(lookahead) + " \u2026"; description += "\n\n"; description += "Possible interpretations:\n\n"; @@ -487,7 +489,7 @@ class ParseTableBuilder { description += " " + symbol_name(step.symbol); } description += ")"; - description += " \u2022 " + symbol_name(Symbol(lookahead, true)) + " \u2026"; + description += " \u2022 " + symbol_name(lookahead) + " \u2026"; description += "\n"; } } @@ -564,14 +566,22 @@ class ParseTableBuilder { return "END_OF_INPUT"; else return ""; - } else if (symbol.is_token) { - const Variable &variable = lexical_grammar.variables[symbol.index]; - if (variable.type == VariableTypeNamed) - return variable.name; - else - return "'" + variable.name + "'"; - } else { - return grammar.variables[symbol.index].name; + } + + switch (symbol.type) { + case Symbol::Terminal: { + const Variable &variable = lexical_grammar.variables[symbol.index]; + if (variable.type == VariableTypeNamed) + return variable.name; + else + return "'" + variable.name + "'"; + } + case Symbol::NonTerminal: { + return grammar.variables[symbol.index].name; + } + case Symbol::External: { + return grammar.external_tokens[symbol.index]; + } } } diff --git a/src/compiler/build_tables/lookahead_set.cc b/src/compiler/build_tables/lookahead_set.cc index 1ecb0baf..239bc029 100644 --- a/src/compiler/build_tables/lookahead_set.cc +++ b/src/compiler/build_tables/lookahead_set.cc @@ -12,8 +12,8 @@ using rules::Symbol; LookaheadSet::LookaheadSet() : entries(nullptr) {} -LookaheadSet::LookaheadSet(const set &symbols) - : entries(make_shared>(symbols)) {} +LookaheadSet::LookaheadSet(const set &symbols) + : entries(make_shared>(symbols)) {} bool LookaheadSet::empty() const { return !entries.get() || entries->empty(); @@ -23,7 +23,7 @@ bool LookaheadSet::operator==(const LookaheadSet &other) const { return *entries == *other.entries; } -bool LookaheadSet::contains(const Symbol::Index &symbol) const { +bool LookaheadSet::contains(const Symbol &symbol) const { return entries->find(symbol) != entries->end(); } @@ -31,15 +31,15 @@ bool LookaheadSet::insert_all(const LookaheadSet &other) { if (!other.entries.get()) return false; if (!entries.get()) - entries = make_shared>(); + entries = make_shared>(); size_t previous_size = entries->size(); entries->insert(other.entries->begin(), other.entries->end()); return entries->size() > previous_size; } -bool LookaheadSet::insert(const Symbol::Index &symbol) { +bool LookaheadSet::insert(const Symbol &symbol) { if (!entries.get()) - entries = make_shared>(); + entries = make_shared>(); return entries->insert(symbol).second; } diff --git a/src/compiler/build_tables/lookahead_set.h b/src/compiler/build_tables/lookahead_set.h index fe99b4d5..e62ee34d 100644 --- a/src/compiler/build_tables/lookahead_set.h +++ b/src/compiler/build_tables/lookahead_set.h @@ -11,15 +11,15 @@ namespace build_tables { class LookaheadSet { public: LookaheadSet(); - explicit LookaheadSet(const std::set &); + explicit LookaheadSet(const std::set &); bool empty() const; bool operator==(const LookaheadSet &) const; - bool contains(const rules::Symbol::Index &) const; + bool contains(const rules::Symbol &) const; bool insert_all(const LookaheadSet &); - bool insert(const rules::Symbol::Index &); + bool insert(const rules::Symbol &); - std::shared_ptr> entries; + std::shared_ptr> entries; }; } // namespace build_tables diff --git a/src/compiler/build_tables/parse_item.cc b/src/compiler/build_tables/parse_item.cc index 39b131cb..b9c3831b 100644 --- a/src/compiler/build_tables/parse_item.cc +++ b/src/compiler/build_tables/parse_item.cc @@ -41,7 +41,7 @@ bool ParseItem::operator<(const ParseItem &other) const { } Symbol ParseItem::lhs() const { - return Symbol(variable_index); + return Symbol(variable_index, Symbol::NonTerminal); } bool ParseItem::is_done() const { @@ -105,38 +105,6 @@ size_t ParseItemSet::unfinished_item_signature() const { return result; } -ParseItemSet::ActionMap ParseItemSet::actions() const { - ParseItemSet::ActionMap result; - - for (const auto &pair : entries) { - const ParseItem &item = pair.first; - const LookaheadSet &lookahead_symbols = pair.second; - - if (item.step_index == item.production->size()) { - int precedence = item.precedence(); - for (const Symbol::Index lookahead : *lookahead_symbols.entries) { - Action &action = result.terminal_actions[lookahead]; - if (precedence > action.completion_precedence) { - action.completions.assign({ &item }); - } else if (precedence == action.completion_precedence) { - action.completions.push_back({ &item }); - } - } - } else { - Symbol symbol = item.production->at(item.step_index).symbol; - ParseItem new_item(item.lhs(), *item.production, item.step_index + 1); - - if (symbol.is_token) { - result.terminal_actions[symbol.index].continuation.entries[new_item] = lookahead_symbols; - } else { - result.nonterminal_continuations[symbol.index].entries[new_item] = lookahead_symbols; - } - } - } - - return result; -} - void ParseItemSet::add(const ParseItemSet &other) { for (const auto &pair : other.entries) entries[pair.first].insert_all(pair.second); diff --git a/src/compiler/build_tables/parse_item.h b/src/compiler/build_tables/parse_item.h index a091ac9d..a3785638 100644 --- a/src/compiler/build_tables/parse_item.h +++ b/src/compiler/build_tables/parse_item.h @@ -41,16 +41,6 @@ class ParseItemSet { ParseItemSet(); explicit ParseItemSet(const std::map &); - struct Completion; - struct Action; - - struct ActionMap { - std::map terminal_actions; - std::map nonterminal_continuations; - }; - - ActionMap actions() const; - bool operator==(const ParseItemSet &) const; void add(const ParseItemSet &); size_t unfinished_item_signature() const; @@ -58,22 +48,6 @@ class ParseItemSet { std::map entries; }; -struct ParseItemSet::Completion { - const ParseItem *item; - int precedence; - rules::Associativity associativity; - - bool operator<(const ParseItemSet::Completion &other) { - return precedence < other.precedence; - } -}; - -struct ParseItemSet::Action { - ParseItemSet continuation; - std::vector completions; - int completion_precedence; -}; - } // namespace build_tables } // namespace tree_sitter diff --git a/src/compiler/build_tables/parse_item_set_builder.cc b/src/compiler/build_tables/parse_item_set_builder.cc index 34b347fe..7e29efdf 100644 --- a/src/compiler/build_tables/parse_item_set_builder.cc +++ b/src/compiler/build_tables/parse_item_set_builder.cc @@ -27,12 +27,12 @@ ParseItemSetBuilder::ParseItemSetBuilder(const SyntaxGrammar &grammar, set processed_non_terminals; for (size_t i = 0, n = lexical_grammar.variables.size(); i < n; i++) { - Symbol symbol(i, true); - first_sets.insert({symbol, LookaheadSet({ static_cast(i) })}); + Symbol symbol(i, Symbol::Terminal); + first_sets.insert({symbol, LookaheadSet({ symbol })}); } for (size_t i = 0, n = grammar.variables.size(); i < n; i++) { - Symbol symbol(i); + Symbol symbol(i, Symbol::NonTerminal); LookaheadSet first_set; processed_non_terminals.clear(); @@ -42,10 +42,10 @@ ParseItemSetBuilder::ParseItemSetBuilder(const SyntaxGrammar &grammar, Symbol current_symbol = symbols_to_process.back(); symbols_to_process.pop_back(); - if (current_symbol.is_token) { - first_set.insert(current_symbol.index); + if (!current_symbol.is_non_terminal()) { + first_set.insert(current_symbol); } else if (processed_non_terminals.insert(current_symbol.index).second) { - for (const Production &production : grammar.productions(current_symbol)) { + for (const Production &production : grammar.variables[current_symbol.index].productions) { if (!production.empty()) { symbols_to_process.push_back(production[0].symbol); } @@ -59,11 +59,11 @@ ParseItemSetBuilder::ParseItemSetBuilder(const SyntaxGrammar &grammar, vector components_to_process; for (size_t i = 0, n = grammar.variables.size(); i < n; i++) { - Symbol symbol(i); + Symbol symbol(i, Symbol::NonTerminal); map> cache_entry; components_to_process.clear(); - for (const Production &production : grammar.productions(symbol)) { + for (const Production &production : grammar.variables[i].productions) { components_to_process.push_back(ParseItemSetComponent{ ParseItem(symbol, production, 0), LookaheadSet(), @@ -87,7 +87,7 @@ ParseItemSetBuilder::ParseItemSetBuilder(const SyntaxGrammar &grammar, if (component_is_new) { Symbol next_symbol = item.next_symbol(); - if (next_symbol.is_built_in() || next_symbol.is_token) + if (!next_symbol.is_non_terminal() || next_symbol.is_built_in()) continue; LookaheadSet next_lookaheads; @@ -102,7 +102,7 @@ ParseItemSetBuilder::ParseItemSetBuilder(const SyntaxGrammar &grammar, propagates_lookaheads = false; } - for (const Production &production : grammar.productions(next_symbol)) { + for (const Production &production : grammar.variables[next_symbol.index].productions) { components_to_process.push_back(ParseItemSetComponent{ ParseItem(next_symbol, production, 0), next_lookaheads, @@ -130,7 +130,7 @@ void ParseItemSetBuilder::apply_transitive_closure(ParseItemSet *item_set) { const LookaheadSet &lookaheads = pair.second; const Symbol &next_symbol = item.next_symbol(); - if (!next_symbol.is_token && !next_symbol.is_built_in()) { + if (next_symbol.is_non_terminal() && !next_symbol.is_built_in()) { LookaheadSet next_lookaheads; size_t next_step = item.step_index + 1; if (next_step == item.production->size()) { diff --git a/src/compiler/build_tables/recovery_tokens.cc b/src/compiler/build_tables/recovery_tokens.cc index 479de6b8..84b175bc 100644 --- a/src/compiler/build_tables/recovery_tokens.cc +++ b/src/compiler/build_tables/recovery_tokens.cc @@ -47,8 +47,8 @@ class FirstCharacters : public CharacterAggregator {}; class LastCharacters : public CharacterAggregator {}; class AllCharacters : public CharacterAggregator {}; -set recovery_tokens(const LexicalGrammar &grammar) { - set result; +set recovery_tokens(const LexicalGrammar &grammar) { + set result; AllCharacters all_separator_characters; for (const rule_ptr &separator : grammar.separators) @@ -79,7 +79,7 @@ set recovery_tokens(const LexicalGrammar &grammar) { !all_characters.result.intersects(all_separator_characters.result); if ((has_distinct_start && has_distinct_end) || has_no_separators) - result.insert(i); + result.insert(Symbol(i, Symbol::Terminal)); } return result; diff --git a/src/compiler/build_tables/recovery_tokens.h b/src/compiler/build_tables/recovery_tokens.h index 4873b5a9..c97a8cfd 100644 --- a/src/compiler/build_tables/recovery_tokens.h +++ b/src/compiler/build_tables/recovery_tokens.h @@ -11,7 +11,7 @@ struct LexicalGrammar; namespace build_tables { -std::set recovery_tokens(const LexicalGrammar &); +std::set recovery_tokens(const LexicalGrammar &); } // namespace build_tables } // namespace tree_sitter diff --git a/src/compiler/generate_code/c_code.cc b/src/compiler/generate_code/c_code.cc index b7058603..a5a9c17a 100644 --- a/src/compiler/generate_code/c_code.cc +++ b/src/compiler/generate_code/c_code.cc @@ -14,6 +14,7 @@ namespace tree_sitter { namespace generate_code { + using std::function; using std::map; using std::pair; @@ -22,6 +23,7 @@ using std::string; using std::to_string; using std::vector; using util::escape_char; +using rules::Symbol; static Variable EOF_ENTRY("end", VariableTypeNamed, rule_ptr()); @@ -73,9 +75,8 @@ class CCodeGenerator { const LexicalGrammar lexical_grammar; map sanitized_names; vector> parse_table_entries; - vector>> in_progress_symbols; + vector> external_token_id_sets; size_t next_parse_action_list_index; - size_t next_in_progress_symbol_list_index; public: CCodeGenerator(string name, const ParseTable &parse_table, @@ -87,19 +88,25 @@ class CCodeGenerator { lex_table(lex_table), syntax_grammar(syntax_grammar), lexical_grammar(lexical_grammar), - next_parse_action_list_index(0), - next_in_progress_symbol_list_index(0) {} + next_parse_action_list_index(0) {} string code() { buffer = ""; add_includes(); - add_state_and_symbol_counts(); + add_warning_pragma(); + add_stats(); add_symbol_enum(); add_symbol_names_list(); - add_symbol_node_types_list(); + add_symbol_metadata_list(); add_lex_function(); - add_lex_states_list(); + add_lex_modes_list(); + + if (!syntax_grammar.external_tokens.empty()) + add_external_token_enum(); + + add_external_token_symbol_map(); + add_external_scan_modes_list(); add_parse_table(); add_parser_export(); @@ -112,10 +119,17 @@ class CCodeGenerator { line(); } - void add_state_and_symbol_counts() { + void add_warning_pragma() { + line("#pragma GCC diagnostic push"); + line("#pragma GCC diagnostic ignored \"-Wmissing-field-initializers\""); + line(); + } + + void add_stats() { line("#define STATE_COUNT " + to_string(parse_table.states.size())); line("#define SYMBOL_COUNT " + to_string(parse_table.symbols.size())); line("#define TOKEN_COUNT " + to_string(lexical_grammar.variables.size() + 1)); + line("#define EXTERNAL_TOKEN_COUNT " + to_string(syntax_grammar.external_tokens.size())); line(); } @@ -124,7 +138,7 @@ class CCodeGenerator { indent([&]() { size_t i = 1; for (const auto &entry : parse_table.symbols) { - const rules::Symbol &symbol = entry.first; + const Symbol &symbol = entry.first; if (!symbol.is_built_in()) { line(symbol_id(symbol) + " = " + to_string(i) + ","); i++; @@ -146,11 +160,11 @@ class CCodeGenerator { line(); } - void add_symbol_node_types_list() { + void add_symbol_metadata_list() { line("static const TSSymbolMetadata ts_symbol_metadata[SYMBOL_COUNT] = {"); indent([&]() { for (const auto &entry : parse_table.symbols) { - const rules::Symbol &symbol = entry.first; + const Symbol &symbol = entry.first; line("[" + symbol_id(symbol) + "] = {"); indent([&]() { switch (symbol_type(symbol)) { @@ -198,13 +212,80 @@ class CCodeGenerator { line(); } - void add_lex_states_list() { - line("static TSStateId ts_lex_states[STATE_COUNT] = {"); + void add_lex_modes_list() { + add_external_tokens_id({}); + + line("static TSLexMode ts_lex_modes[STATE_COUNT] = {"); indent([&]() { size_t state_id = 0; - for (const auto &state : parse_table.states) - line("[" + to_string(state_id++) + "] = " + - to_string(state.lex_state_id) + ","); + + for (const auto &state : parse_table.states) { + line("[" + to_string(state_id++) + "] = {.lex_state = "); + add(to_string(state.lex_state_id)); + + set external_token_indices; + for (const auto &pair : state.terminal_entries) { + Symbol symbol = pair.first; + if (symbol.is_external()) + external_token_indices.insert(symbol.index); + } + + if (!external_token_indices.empty()) + add(", .external_tokens = " + add_external_tokens_id(external_token_indices)); + add("},"); + } + }); + line("};"); + line(); + } + + string add_external_tokens_id(set external_token_ids) { + for (size_t i = 0, n = external_token_id_sets.size(); i < n; i++) + if (external_token_id_sets[i] == external_token_ids) + return to_string(i); + external_token_id_sets.push_back(external_token_ids); + return to_string(external_token_id_sets.size() - 1); + } + + void add_external_token_enum() { + line("enum {"); + indent([&]() { + for (size_t i = 0; i < syntax_grammar.external_tokens.size(); i++) + line(external_token_id(i) + ","); + }); + line("};"); + line(); + } + + void add_external_token_symbol_map() { + line("TSSymbol ts_external_token_symbol_map[EXTERNAL_TOKEN_COUNT] = {"); + indent([&]() { + for (size_t i = 0; i < syntax_grammar.external_tokens.size(); i++) { + line("[" + external_token_id(i) + "] = " + symbol_id(Symbol(i, Symbol::External)) + ","); + } + }); + line("};"); + line(); + } + + void add_external_scan_modes_list() { + line("static bool ts_external_token_lists["); + add(to_string(external_token_id_sets.size())); + add("][EXTERNAL_TOKEN_COUNT] = {"); + indent([&]() { + size_t i = 0; + for (const auto &external_token_ids : external_token_id_sets) { + if (!external_token_ids.empty()) { + line("[" + to_string(i) + "] = {"); + indent([&]() { + for (Symbol::Index id : external_token_ids) { + line("[" + external_token_id(id) + "] = true,"); + } + }); + line("},"); + } + i++; + } }); line("};"); line(); @@ -214,9 +295,6 @@ class CCodeGenerator { add_parse_action_list_id(ParseTableEntry{ {}, false, false }); size_t state_id = 0; - line("#pragma GCC diagnostic push"); - line("#pragma GCC diagnostic ignored \"-Wmissing-field-initializers\""); - line(); line("static unsigned short ts_parse_table[STATE_COUNT][SYMBOL_COUNT] = {"); indent([&]() { @@ -224,12 +302,12 @@ class CCodeGenerator { line("[" + to_string(state_id++) + "] = {"); indent([&]() { for (const auto &entry : state.nonterminal_entries) { - line("[" + symbol_id(rules::Symbol(entry.first)) + "] = STATE("); + line("[" + symbol_id(Symbol(entry.first, Symbol::NonTerminal)) + "] = STATE("); add(to_string(entry.second)); add("),"); } for (const auto &entry : state.terminal_entries) { - line("[" + symbol_id(rules::Symbol(entry.first, true)) + "] = ACTIONS("); + line("[" + symbol_id(entry.first) + "] = ACTIONS("); add(to_string(add_parse_action_list_id(entry.second))); add("),"); } @@ -242,12 +320,37 @@ class CCodeGenerator { line(); add_parse_action_list(); line(); - line("#pragma GCC diagnostic pop"); - line(); } void add_parser_export() { - line("EXPORT_LANGUAGE(ts_language_" + name + ");"); + if (!syntax_grammar.external_tokens.empty()) { + string external_scanner_name = "ts_language_" + name + "_external_scanner"; + + line("void *" + external_scanner_name + "_create();"); + line("bool " + external_scanner_name + "_scan();"); + line("void " + external_scanner_name + "_destroy();"); + line(); + + line("const TSLanguage *ts_language_" + name + "() {"); + indent([&]() { + if (!syntax_grammar.external_tokens.empty()) { + line("GET_LANGUAGE("); + indent([&]() { + line(external_scanner_name + "_create,"); + line(external_scanner_name + "_scan,"); + line(external_scanner_name + "_destroy,"); + }); + line(");"); + } + }); + line("}"); + } else { + line("const TSLanguage *ts_language_" + name + "() {"); + indent([&]() { + line("GET_LANGUAGE();"); + }); + line("}"); + } line(); } @@ -379,22 +482,13 @@ class CCodeGenerator { return result; } - size_t add_in_progress_symbol_list_id(const set &symbols) { - for (const auto &pair : in_progress_symbols) { - if (pair.second == symbols) { - return pair.first; - } - } - - size_t result = next_in_progress_symbol_list_index; - in_progress_symbols.push_back({ result, symbols }); - next_in_progress_symbol_list_index += 1 + symbols.size(); - return result; - } - // Helper functions - string symbol_id(const rules::Symbol &symbol) { + string external_token_id(Symbol::Index index) { + return "ts_external_token_" + syntax_grammar.external_tokens[index]; + } + + string symbol_id(const Symbol &symbol) { if (symbol == rules::END_OF_INPUT()) return "ts_builtin_sym_end"; @@ -411,25 +505,31 @@ class CCodeGenerator { } } - string symbol_name(const rules::Symbol &symbol) { + string symbol_name(const Symbol &symbol) { if (symbol == rules::END_OF_INPUT()) return "END"; return entry_for_symbol(symbol).first; } - VariableType symbol_type(const rules::Symbol &symbol) { + VariableType symbol_type(const Symbol &symbol) { if (symbol == rules::END_OF_INPUT()) return VariableTypeHidden; return entry_for_symbol(symbol).second; } - pair entry_for_symbol(const rules::Symbol &symbol) { - if (symbol.is_token) { - const Variable &variable = lexical_grammar.variables[symbol.index]; - return { variable.name, variable.type }; - } else { - const SyntaxVariable &variable = syntax_grammar.variables[symbol.index]; - return { variable.name, variable.type }; + pair entry_for_symbol(const Symbol &symbol) { + switch (symbol.type) { + case Symbol::NonTerminal: { + const SyntaxVariable &variable = syntax_grammar.variables[symbol.index]; + return { variable.name, variable.type }; + } + case Symbol::Terminal: { + const Variable &variable = lexical_grammar.variables[symbol.index]; + return { variable.name, variable.type }; + } + case Symbol::External: { + return { syntax_grammar.external_tokens[symbol.index], VariableTypeAnonymous }; + } } } diff --git a/src/compiler/grammar.h b/src/compiler/grammar.h index a8955c02..0a07280c 100644 --- a/src/compiler/grammar.h +++ b/src/compiler/grammar.h @@ -12,6 +12,7 @@ struct Grammar { std::vector> rules; std::vector extra_tokens; std::vector> expected_conflicts; + std::vector external_tokens; }; } // namespace tree_sitter diff --git a/src/compiler/parse_grammar.cc b/src/compiler/parse_grammar.cc index 185d919b..cc5cff55 100644 --- a/src/compiler/parse_grammar.cc +++ b/src/compiler/parse_grammar.cc @@ -119,6 +119,16 @@ ParseRuleResult parse_rule(json_value *rule_json) { } } + if (type == "EXTERNAL_TOKEN") { + json_value token_name_json = rule_json->operator[]("name"); + if (token_name_json.type != json_string) { + error_message = "External token name must be a string"; + goto error; + } + + return { external_token(token_name_json.u.string.ptr), "" }; + } + if (type == "PATTERN") { json_value value_json = rule_json->operator[]("value"); if (value_json.type == json_string) { @@ -210,7 +220,7 @@ ParseGrammarResult parse_grammar(const string &input) { string error_message; string name; Grammar grammar; - json_value name_json, rules_json, extras_json, conflicts_json; + json_value name_json, rules_json, extras_json, conflicts_json, external_tokens_json; json_settings settings = { 0, json_enable_comments, 0, 0, 0, 0 }; char parse_error[json_error_max]; @@ -302,6 +312,25 @@ ParseGrammarResult parse_grammar(const string &input) { } } + external_tokens_json = grammar_json->operator[]("externals"); + if (external_tokens_json.type != json_none) { + if (external_tokens_json.type != json_array) { + error_message = "External tokens must be an array"; + goto error; + } + + for (size_t i = 0, length = external_tokens_json.u.array.length; i < length; i++) { + json_value *token_name_json = external_tokens_json.u.array.values[i]; + if (token_name_json->type != json_string) { + error_message = "External token values must be strings"; + goto error; + } + + string token_name = token_name_json->u.string.ptr; + grammar.external_tokens.push_back(token_name); + } + } + json_value_free(grammar_json); return { name, grammar, "" }; diff --git a/src/compiler/parse_table.cc b/src/compiler/parse_table.cc index e6e4badd..a04eec8c 100644 --- a/src/compiler/parse_table.cc +++ b/src/compiler/parse_table.cc @@ -1,6 +1,7 @@ #include "compiler/parse_table.h" #include #include "compiler/precedence_range.h" +#include "compiler/rules/built_in_symbols.h" namespace tree_sitter { @@ -28,7 +29,7 @@ ParseAction::ParseAction() extra(false), fragile(false), state_index(-1), - symbol(Symbol(-1)), + symbol(rules::NONE()), consumed_symbol_count(0), production(nullptr) {} @@ -43,11 +44,11 @@ ParseAction ParseAction::Accept() { } ParseAction ParseAction::Shift(ParseStateId state_index) { - return ParseAction(ParseActionTypeShift, state_index, Symbol(-1), 0, nullptr); + return ParseAction(ParseActionTypeShift, state_index, rules::NONE(), 0, nullptr); } ParseAction ParseAction::Recover(ParseStateId state_index) { - return ParseAction(ParseActionTypeRecover, state_index, Symbol(-1), 0, + return ParseAction(ParseActionTypeRecover, state_index, rules::NONE(), 0, nullptr); } @@ -150,9 +151,7 @@ bool ParseState::has_shift_action() const { set ParseState::expected_inputs() const { set result; for (auto &entry : terminal_entries) - result.insert(Symbol(entry.first, true)); - for (auto &entry : nonterminal_entries) - result.insert(Symbol(entry.first, false)); + result.insert(entry.first); return result; } @@ -182,33 +181,24 @@ ParseStateId ParseTable::add_state() { return states.size() - 1; } -ParseAction &ParseTable::set_terminal_action(ParseStateId state_id, - Symbol::Index index, - ParseAction action) { - states[state_id].terminal_entries[index].actions.clear(); - return add_terminal_action(state_id, index, action); -} - ParseAction &ParseTable::add_terminal_action(ParseStateId state_id, - Symbol::Index index, + Symbol lookahead, ParseAction action) { - Symbol symbol(index, true); if (action.type == ParseActionTypeShift && action.extra) - symbols[symbol].extra = true; + symbols[lookahead].extra = true; else - symbols[symbol].structural = true; + symbols[lookahead].structural = true; - ParseTableEntry &entry = states[state_id].terminal_entries[index]; + ParseTableEntry &entry = states[state_id].terminal_entries[lookahead]; entry.actions.push_back(action); return *entry.actions.rbegin(); } void ParseTable::set_nonterminal_action(ParseStateId state_id, - Symbol::Index index, + Symbol::Index lookahead, ParseStateId next_state_id) { - Symbol symbol(index, false); - symbols[symbol].structural = true; - states[state_id].nonterminal_entries[index] = next_state_id; + symbols[Symbol(lookahead, Symbol::NonTerminal)].structural = true; + states[state_id].nonterminal_entries[lookahead] = next_state_id; } static bool has_entry(const ParseState &state, const ParseTableEntry &entry) { @@ -226,12 +216,12 @@ bool ParseTable::merge_state(size_t i, size_t j) { return false; for (auto &entry : state.terminal_entries) { - Symbol::Index index = entry.first; + Symbol lookahead = entry.first; const vector &actions = entry.second.actions; - const auto &other_entry = other.terminal_entries.find(index); + const auto &other_entry = other.terminal_entries.find(lookahead); if (other_entry == other.terminal_entries.end()) { - if (mergeable_symbols.count(index) == 0 && !Symbol::is_built_in(index)) + if (mergeable_symbols.count(lookahead) == 0 && !lookahead.is_built_in()) return false; if (actions.back().type != ParseActionTypeReduce) return false; @@ -242,25 +232,25 @@ bool ParseTable::merge_state(size_t i, size_t j) { } } - set symbols_to_merge; + set symbols_to_merge; for (auto &entry : other.terminal_entries) { - Symbol::Index index = entry.first; + Symbol lookahead = entry.first; const vector &actions = entry.second.actions; - if (!state.terminal_entries.count(index)) { - if (mergeable_symbols.count(index) == 0 && !Symbol::is_built_in(index)) + if (!state.terminal_entries.count(lookahead)) { + if (mergeable_symbols.count(lookahead) == 0 && !lookahead.is_built_in()) return false; if (actions.back().type != ParseActionTypeReduce) return false; if (!has_entry(state, entry.second)) return false; - symbols_to_merge.insert(index); + symbols_to_merge.insert(lookahead); } } - for (const Symbol::Index &index : symbols_to_merge) - state.terminal_entries[index] = other.terminal_entries.find(index)->second; + for (const Symbol &lookahead : symbols_to_merge) + state.terminal_entries[lookahead] = other.terminal_entries.find(lookahead)->second; return true; } diff --git a/src/compiler/parse_table.h b/src/compiler/parse_table.h index 59eee4a8..79eec4fc 100644 --- a/src/compiler/parse_table.h +++ b/src/compiler/parse_table.h @@ -76,7 +76,7 @@ class ParseState { void each_referenced_state(std::function); bool has_shift_action() const; - std::map terminal_entries; + std::map terminal_entries; std::map nonterminal_entries; LexStateId lex_state_id; size_t shift_actions_signature; @@ -91,15 +91,14 @@ class ParseTable { public: std::set all_symbols() const; ParseStateId add_state(); - ParseAction &add_terminal_action(ParseStateId state_id, int, ParseAction); - ParseAction &set_terminal_action(ParseStateId state_id, int index, ParseAction); - void set_nonterminal_action(ParseStateId state_id, int index, ParseStateId); + ParseAction &add_terminal_action(ParseStateId state_id, rules::Symbol, ParseAction); + void set_nonterminal_action(ParseStateId, rules::Symbol::Index, ParseStateId); bool merge_state(size_t i, size_t j); std::vector states; std::map symbols; - std::set mergeable_symbols; + std::set mergeable_symbols; }; } // namespace tree_sitter diff --git a/src/compiler/prepare_grammar/expand_repeats.cc b/src/compiler/prepare_grammar/expand_repeats.cc index 7963e94b..331c9cea 100644 --- a/src/compiler/prepare_grammar/expand_repeats.cc +++ b/src/compiler/prepare_grammar/expand_repeats.cc @@ -39,7 +39,7 @@ class ExpandRepeats : public rules::IdentityRuleFn { rule_ptr inner_rule = apply(rule->content); size_t index = aux_rules.size(); string helper_rule_name = rule_name + "_repeat" + to_string(++repeat_count); - Symbol repeat_symbol(offset + index); + Symbol repeat_symbol(offset + index, Symbol::NonTerminal); existing_repeats.push_back({ rule->copy(), repeat_symbol }); aux_rules.push_back( Variable(helper_rule_name, VariableTypeAuxiliary, @@ -65,6 +65,7 @@ InitialSyntaxGrammar expand_repeats(const InitialSyntaxGrammar &grammar) { result.variables = grammar.variables; result.extra_tokens = grammar.extra_tokens; result.expected_conflicts = grammar.expected_conflicts; + result.external_tokens = grammar.external_tokens; ExpandRepeats expander(result.variables.size()); for (auto &variable : result.variables) diff --git a/src/compiler/prepare_grammar/extract_tokens.cc b/src/compiler/prepare_grammar/extract_tokens.cc index bf7ac514..dcf88e53 100644 --- a/src/compiler/prepare_grammar/extract_tokens.cc +++ b/src/compiler/prepare_grammar/extract_tokens.cc @@ -11,6 +11,7 @@ #include "compiler/rules/symbol.h" #include "compiler/rules/string.h" #include "compiler/rules/metadata.h" +#include "compiler/rules/external_token.h" #include "compiler/rules/pattern.h" #include "compiler/prepare_grammar/token_description.h" #include "compiler/prepare_grammar/is_token.h" @@ -38,7 +39,7 @@ class SymbolReplacer : public rules::IdentityRuleFn { map replacements; Symbol replace_symbol(const Symbol &symbol) { - if (symbol.is_built_in() || symbol.is_token) + if (!symbol.is_non_terminal()) return symbol; auto replacement_pair = replacements.find(symbol); @@ -49,7 +50,7 @@ class SymbolReplacer : public rules::IdentityRuleFn { for (const auto &pair : replacements) if (pair.first.index < symbol.index) new_index--; - return Symbol(new_index); + return Symbol(new_index, Symbol::NonTerminal); } }; @@ -60,14 +61,14 @@ class TokenExtractor : public rules::IdentityRuleFn { for (size_t i = 0; i < tokens.size(); i++) if (tokens[i].rule->operator==(*input)) { token_usage_counts[i]++; - return make_shared(i, true); + return make_shared(i, Symbol::Terminal); } rule_ptr rule = input->copy(); size_t index = tokens.size(); tokens.push_back(Variable(token_description(rule), entry_type, rule)); token_usage_counts.push_back(1); - return make_shared(index, true); + return make_shared(index, Symbol::Terminal); } rule_ptr apply_to(const rules::String *rule) { @@ -78,6 +79,10 @@ class TokenExtractor : public rules::IdentityRuleFn { return apply_to_token(rule, VariableTypeAuxiliary); } + rule_ptr apply_to(const rules::ExternalToken *rule) { + return apply_to_token(rule, VariableTypeAuxiliary); + } + rule_ptr apply_to(const rules::Metadata *rule) { if (rule->params.is_token) return apply_to_token(rule->rule.get(), VariableTypeAuxiliary); @@ -90,7 +95,7 @@ class TokenExtractor : public rules::IdentityRuleFn { vector tokens; }; -static CompileError ubiq_token_err(const string &message) { +static CompileError extra_token_error(const string &message) { return CompileError(TSCompileErrorTypeInvalidUbiquitousToken, "Not a token: " + message); } @@ -122,11 +127,10 @@ tuple extract_tokens( size_t i = 0; for (const Variable &variable : processed_variables) { auto symbol = variable.rule->as(); - if (symbol && symbol->is_token && !symbol->is_built_in() && - extractor.token_usage_counts[symbol->index] == 1) { + if (symbol && symbol->is_token() && extractor.token_usage_counts[symbol->index] == 1) { lexical_grammar.variables[symbol->index].type = variable.type; lexical_grammar.variables[symbol->index].name = variable.name; - symbol_replacer.replacements.insert({ Symbol(i), *symbol }); + symbol_replacer.replacements.insert({ Symbol(i, Symbol::NonTerminal), *symbol }); } else { syntax_grammar.variables.push_back(variable); } @@ -158,7 +162,7 @@ tuple extract_tokens( bool used_elsewhere_in_grammar = false; for (const Variable &variable : lexical_grammar.variables) { if (variable.rule->operator==(*rule)) { - syntax_grammar.extra_tokens.insert(Symbol(i, true)); + syntax_grammar.extra_tokens.insert(Symbol(i, Symbol::Terminal)); used_elsewhere_in_grammar = true; } i++; @@ -175,17 +179,20 @@ tuple extract_tokens( auto symbol = rule->as(); if (!symbol) return make_tuple(syntax_grammar, lexical_grammar, - ubiq_token_err(rule->to_string())); + extra_token_error(rule->to_string())); Symbol new_symbol = symbol_replacer.replace_symbol(*symbol); - if (!new_symbol.is_token) + if (!new_symbol.is_token()) { return make_tuple( syntax_grammar, lexical_grammar, - ubiq_token_err(syntax_grammar.variables[new_symbol.index].name)); + extra_token_error(syntax_grammar.variables[new_symbol.index].name)); + } syntax_grammar.extra_tokens.insert(new_symbol); } + syntax_grammar.external_tokens = grammar.external_tokens; + return make_tuple(syntax_grammar, lexical_grammar, CompileError::none()); } diff --git a/src/compiler/prepare_grammar/flatten_grammar.cc b/src/compiler/prepare_grammar/flatten_grammar.cc index ddba9a5f..8ac0e33c 100644 --- a/src/compiler/prepare_grammar/flatten_grammar.cc +++ b/src/compiler/prepare_grammar/flatten_grammar.cc @@ -92,6 +92,7 @@ pair flatten_grammar(const InitialSyntaxGrammar &gr SyntaxGrammar result; result.expected_conflicts = grammar.expected_conflicts; result.extra_tokens = grammar.extra_tokens; + result.external_tokens = grammar.external_tokens; bool is_start = true; for (const Variable &variable : grammar.variables) { diff --git a/src/compiler/prepare_grammar/initial_syntax_grammar.h b/src/compiler/prepare_grammar/initial_syntax_grammar.h index fe1ff37d..d4b1c8d5 100644 --- a/src/compiler/prepare_grammar/initial_syntax_grammar.h +++ b/src/compiler/prepare_grammar/initial_syntax_grammar.h @@ -1,13 +1,12 @@ #ifndef COMPILER_PREPARE_GRAMMAR_INITIAL_SYNTAX_GRAMMAR_H_ #define COMPILER_PREPARE_GRAMMAR_INITIAL_SYNTAX_GRAMMAR_H_ -#include -#include #include +#include #include "tree_sitter/compiler.h" #include "compiler/rules/symbol.h" -#include "compiler/variable.h" #include "compiler/syntax_grammar.h" +#include "compiler/variable.h" namespace tree_sitter { namespace prepare_grammar { @@ -16,6 +15,7 @@ struct InitialSyntaxGrammar { std::vector variables; std::set extra_tokens; std::set expected_conflicts; + std::vector external_tokens; }; } // namespace prepare_grammar diff --git a/src/compiler/prepare_grammar/intern_symbols.cc b/src/compiler/prepare_grammar/intern_symbols.cc index cd01719c..f08edf5e 100644 --- a/src/compiler/prepare_grammar/intern_symbols.cc +++ b/src/compiler/prepare_grammar/intern_symbols.cc @@ -7,6 +7,7 @@ #include "compiler/rules/visitor.h" #include "compiler/rules/blank.h" #include "compiler/rules/named_symbol.h" +#include "compiler/rules/external_token.h" #include "compiler/rules/symbol.h" namespace tree_sitter { @@ -17,6 +18,7 @@ using std::vector; using std::set; using std::pair; using std::make_shared; +using rules::Symbol; class InternSymbols : public rules::IdentityRuleFn { using rules::IdentityRuleFn::apply_to; @@ -30,17 +32,34 @@ class InternSymbols : public rules::IdentityRuleFn { return result; } + rule_ptr apply_to(const rules::ExternalToken *rule) { + auto result = symbol_for_external_token(rule->name); + if (!result.get()) { + missing_external_token_name = rule->name; + return rules::Blank::build(); + } + return result; + } + public: std::shared_ptr symbol_for_rule_name(string rule_name) { for (size_t i = 0; i < grammar.rules.size(); i++) if (grammar.rules[i].first == rule_name) - return make_shared(i); + return make_shared(i, Symbol::NonTerminal); + return nullptr; + } + + std::shared_ptr symbol_for_external_token(string name) { + for (size_t i = 0; i < grammar.external_tokens.size(); i++) + if (grammar.external_tokens[i] == name) + return make_shared(i, Symbol::External); return nullptr; } explicit InternSymbols(const Grammar &grammar) : grammar(grammar) {} const Grammar grammar; string missing_rule_name; + string missing_external_token_name; }; CompileError missing_rule_error(string rule_name) { @@ -48,14 +67,22 @@ CompileError missing_rule_error(string rule_name) { "Undefined rule '" + rule_name + "'"); } +CompileError missing_external_token_error(string token_name) { + return CompileError(TSCompileErrorTypeUndefinedSymbol, + "Undefined external token '" + token_name + "'"); +} + pair intern_symbols(const Grammar &grammar) { InternedGrammar result; + result.external_tokens = grammar.external_tokens; InternSymbols interner(grammar); for (auto &pair : grammar.rules) { auto new_rule = interner.apply(pair.second); if (!interner.missing_rule_name.empty()) return { result, missing_rule_error(interner.missing_rule_name) }; + if (!interner.missing_external_token_name.empty()) + return { result, missing_external_token_error(interner.missing_external_token_name) }; result.variables.push_back(Variable( pair.first, pair.first[0] == '_' ? VariableTypeHidden : VariableTypeNamed, @@ -66,6 +93,8 @@ pair intern_symbols(const Grammar &grammar) { auto new_rule = interner.apply(rule); if (!interner.missing_rule_name.empty()) return { result, missing_rule_error(interner.missing_rule_name) }; + if (!interner.missing_external_token_name.empty()) + return { result, missing_external_token_error(interner.missing_external_token_name) }; result.extra_tokens.push_back(new_rule); } diff --git a/src/compiler/prepare_grammar/interned_grammar.h b/src/compiler/prepare_grammar/interned_grammar.h index c08c07dd..7b425c3a 100644 --- a/src/compiler/prepare_grammar/interned_grammar.h +++ b/src/compiler/prepare_grammar/interned_grammar.h @@ -15,6 +15,7 @@ struct InternedGrammar { std::vector variables; std::vector extra_tokens; std::set expected_conflicts; + std::vector external_tokens; }; } // namespace prepare_grammar diff --git a/src/compiler/rules.h b/src/compiler/rules.h index d98a719a..8a3f4097 100644 --- a/src/compiler/rules.h +++ b/src/compiler/rules.h @@ -22,6 +22,7 @@ rule_ptr prec_left(int precedence, const rule_ptr &); rule_ptr prec_right(const rule_ptr &); rule_ptr prec_right(int precedence, const rule_ptr &); rule_ptr token(const rule_ptr &rule); +rule_ptr external_token(const std::string &); } // namespace std diff --git a/src/compiler/rules/built_in_symbols.cc b/src/compiler/rules/built_in_symbols.cc index a7a877ec..b3f7cd66 100644 --- a/src/compiler/rules/built_in_symbols.cc +++ b/src/compiler/rules/built_in_symbols.cc @@ -4,15 +4,15 @@ namespace tree_sitter { namespace rules { Symbol END_OF_INPUT() { - return Symbol(-1, true); + return Symbol(-1, Symbol::Terminal); } Symbol START() { - return Symbol(-2); + return Symbol(-2, Symbol::NonTerminal); } Symbol NONE() { - return Symbol(-3); + return Symbol(-3, Symbol::NonTerminal); } } // namespace rules diff --git a/src/compiler/rules/external_token.cc b/src/compiler/rules/external_token.cc new file mode 100644 index 00000000..d8487b0e --- /dev/null +++ b/src/compiler/rules/external_token.cc @@ -0,0 +1,39 @@ +#include "compiler/rules/external_token.h" +#include +#include "compiler/rules/visitor.h" + +namespace tree_sitter { +namespace rules { + +using std::string; +using std::hash; + +ExternalToken::ExternalToken(const string &name) : name(name) {} + +rule_ptr ExternalToken::build(const string &name) { + return std::make_shared(name); +} + +bool ExternalToken::operator==(const Rule &rule) const { + auto other = rule.as(); + return other && other->name == name; +} + +size_t ExternalToken::hash_code() const { + return hash()(name); +} + +rule_ptr ExternalToken::copy() const { + return std::make_shared(*this); +} + +string ExternalToken::to_string() const { + return string("(sym '") + name + "')"; +} + +void ExternalToken::accept(Visitor *visitor) const { + visitor->visit(this); +} + +} // namespace rules +} // namespace tree_sitter diff --git a/src/compiler/rules/external_token.h b/src/compiler/rules/external_token.h new file mode 100644 index 00000000..cec1a847 --- /dev/null +++ b/src/compiler/rules/external_token.h @@ -0,0 +1,27 @@ +#ifndef COMPILER_RULES_EXTERNAL_TOKEN_H_ +#define COMPILER_RULES_EXTERNAL_TOKEN_H_ + +#include +#include "compiler/rule.h" + +namespace tree_sitter { +namespace rules { + +class ExternalToken : public Rule { + public: + explicit ExternalToken(const std::string &); + static rule_ptr build(const std::string &); + + bool operator==(const Rule &other) const; + size_t hash_code() const; + rule_ptr copy() const; + std::string to_string() const; + void accept(Visitor *visitor) const; + + std::string name; +}; + +} // namespace rules +} // namespace tree_sitter + +#endif // COMPILER_RULES_EXTERNAL_TOKEN_H_ diff --git a/src/compiler/rules/rules.cc b/src/compiler/rules/rules.cc index fdb0ebdf..73c37284 100644 --- a/src/compiler/rules/rules.cc +++ b/src/compiler/rules/rules.cc @@ -13,6 +13,7 @@ #include "compiler/rules/pattern.h" #include "compiler/rules/character_set.h" #include "compiler/rules/repeat.h" +#include "compiler/rules/external_token.h" #include "compiler/rules/built_in_symbols.h" namespace tree_sitter { @@ -105,4 +106,8 @@ rule_ptr token(const rule_ptr &rule) { return metadata(rule, params); } +rule_ptr external_token(const string &name) { + return rules::ExternalToken::build(name); +} + } // namespace tree_sitter diff --git a/src/compiler/rules/symbol.cc b/src/compiler/rules/symbol.cc index f85b09c7..478de7cf 100644 --- a/src/compiler/rules/symbol.cc +++ b/src/compiler/rules/symbol.cc @@ -11,12 +11,10 @@ using std::string; using std::to_string; using util::hash_combine; -Symbol::Symbol(Symbol::Index index) : index(index), is_token(false) {} - -Symbol::Symbol(Symbol::Index index, bool is_token) : index(index), is_token(is_token) {} +Symbol::Symbol(Symbol::Index index, Symbol::Type type) : index(index), type(type) {} bool Symbol::operator==(const Symbol &other) const { - return (other.index == index) && (other.is_token == is_token); + return (other.index == index) && (other.type == type); } bool Symbol::operator==(const Rule &rule) const { @@ -27,7 +25,7 @@ bool Symbol::operator==(const Rule &rule) const { size_t Symbol::hash_code() const { size_t result = 0; hash_combine(&result, index); - hash_combine(&result, is_token); + hash_combine(&result, type); return result; } @@ -36,14 +34,20 @@ rule_ptr Symbol::copy() const { } string Symbol::to_string() const { - string name = is_token ? "token" : "sym"; - return "(" + name + " " + std::to_string(index) + ")"; + switch (type) { + case Symbol::Terminal: + return "(terminal " + std::to_string(index) + ")"; + case Symbol::NonTerminal: + return "(non-terminal " + std::to_string(index) + ")"; + case Symbol::External: + return "(external " + std::to_string(index) + ")"; + } } bool Symbol::operator<(const Symbol &other) const { - if (is_token && !other.is_token) + if (type < other.type) return true; - if (!is_token && other.is_token) + if (other.type < type) return false; return (index < other.index); } @@ -56,6 +60,18 @@ bool Symbol::is_built_in() const { return is_built_in(index); } +bool Symbol::is_token() const { + return type == Symbol::Terminal; +} + +bool Symbol::is_external() const { + return type == Symbol::External; +} + +bool Symbol::is_non_terminal() const { + return type == Symbol::NonTerminal; +} + void Symbol::accept(Visitor *visitor) const { visitor->visit(this); } diff --git a/src/compiler/rules/symbol.h b/src/compiler/rules/symbol.h index 4ae9ece3..46272dc5 100644 --- a/src/compiler/rules/symbol.h +++ b/src/compiler/rules/symbol.h @@ -11,9 +11,13 @@ class Symbol : public Rule { public: typedef int Index; + typedef enum { + Terminal, + NonTerminal, + External, + } Type; - explicit Symbol(Index index); - Symbol(Index index, bool is_token); + Symbol(Index index, Type type); bool operator==(const Symbol &other) const; bool operator==(const Rule &other) const; @@ -26,9 +30,12 @@ class Symbol : public Rule { bool operator<(const Symbol &other) const; static bool is_built_in(Index); bool is_built_in() const; + bool is_token() const; + bool is_external() const; + bool is_non_terminal() const; Index index; - bool is_token; + Type type; }; } // namespace rules diff --git a/src/compiler/rules/visitor.h b/src/compiler/rules/visitor.h index b8301183..c75e31dc 100644 --- a/src/compiler/rules/visitor.h +++ b/src/compiler/rules/visitor.h @@ -16,6 +16,7 @@ class String; class Symbol; class Pattern; class Metadata; +class ExternalToken; class Visitor { public: @@ -29,6 +30,7 @@ class Visitor { virtual void visit(const String *rule) = 0; virtual void visit(const NamedSymbol *rule) = 0; virtual void visit(const Symbol *rule) = 0; + virtual void visit(const ExternalToken *rule) = 0; virtual ~Visitor(); }; @@ -86,6 +88,10 @@ class RuleFn : private Visitor { return default_apply((const Rule *)rule); } + virtual T apply_to(const ExternalToken *rule) { + return default_apply((const Rule *)rule); + } + void visit(const Blank *rule) { value_ = apply_to(rule); } @@ -126,6 +132,10 @@ class RuleFn : private Visitor { value_ = apply_to(rule); } + void visit(const ExternalToken *rule) { + value_ = apply_to(rule); + } + private: T value_; }; @@ -170,6 +180,9 @@ class RuleFn : private Visitor { virtual void apply_to(const Symbol *rule) { return default_apply((const Rule *)rule); } + virtual void apply_to(const ExternalToken *rule) { + return default_apply((const Rule *)rule); + } void visit(const Blank *rule) { apply_to(rule); @@ -201,6 +214,9 @@ class RuleFn : private Visitor { void visit(const Symbol *rule) { apply_to(rule); } + void visit(const ExternalToken *rule) { + apply_to(rule); + } }; class IdentityRuleFn : public RuleFn { diff --git a/src/compiler/syntax_grammar.cc b/src/compiler/syntax_grammar.cc index 706ec828..535ddcda 100644 --- a/src/compiler/syntax_grammar.cc +++ b/src/compiler/syntax_grammar.cc @@ -13,8 +13,6 @@ using std::pair; using std::vector; using std::set; -static const vector NO_PRODUCTIONS; - SyntaxVariable::SyntaxVariable(const string &name, VariableType type, const vector &productions) : name(name), productions(productions), type(type) {} @@ -28,13 +26,4 @@ bool ProductionStep::operator==(const ProductionStep &other) const { associativity == other.associativity; } -const vector &SyntaxGrammar::productions( - const rules::Symbol &symbol) const { - if (symbol.is_built_in() || symbol.is_token) { - return NO_PRODUCTIONS; - } else { - return variables[symbol.index].productions; - } -} - } // namespace tree_sitter diff --git a/src/compiler/syntax_grammar.h b/src/compiler/syntax_grammar.h index 89745fa5..e34ddbbe 100644 --- a/src/compiler/syntax_grammar.h +++ b/src/compiler/syntax_grammar.h @@ -33,11 +33,10 @@ struct SyntaxVariable { typedef std::set ConflictSet; struct SyntaxGrammar { - const std::vector &productions(const rules::Symbol &) const; - std::vector variables; std::set extra_tokens; std::set expected_conflicts; + std::vector external_tokens; }; } // namespace tree_sitter diff --git a/src/runtime/parser.c b/src/runtime/parser.c index 2f5879a4..c37b7871 100644 --- a/src/runtime/parser.c +++ b/src/runtime/parser.c @@ -161,7 +161,7 @@ static void parser__pop_reusable_node_leaf(ReusableNode *reusable_node) { static bool parser__can_reuse(Parser *self, TSStateId state, Tree *tree, TableEntry *table_entry) { - if (tree->first_leaf.lex_state == self->language->lex_states[state]) + if (tree->first_leaf.lex_state == self->language->lex_modes[state].lex_state) return true; if (!table_entry->is_reusable) return false; @@ -209,7 +209,7 @@ static bool parser__condense_stack(Parser *self) { } static Tree *parser__lex(Parser *self, TSStateId parse_state) { - TSStateId start_state = self->language->lex_states[parse_state]; + TSStateId start_state = self->language->lex_modes[parse_state].lex_state; TSStateId current_state = start_state; Length start_position = self->lexer.current_position; LOG("lex state:%d", start_state); @@ -729,6 +729,9 @@ static void parser__start(Parser *self, TSInput input, Tree *previous_tree) { LOG("new_parse"); } + if (self->language->external_scanner.create) + self->language->external_scanner.create(); + ts_lexer_set_input(&self->lexer, input); ts_stack_clear(self->stack); self->reusable_node = (ReusableNode){ previous_tree, 0 };