From 46854cc2747474bdecb64051edfa773bbf47f22e Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Wed, 30 Nov 2016 09:33:31 -0800 Subject: [PATCH 01/50] Compile and link test grammars in one step --- spec/helpers/load_language.cc | 23 ++++++----------------- 1 file changed, 6 insertions(+), 17 deletions(-) diff --git a/spec/helpers/load_language.cc b/spec/helpers/load_language.cc index 9409da42..a29aa240 100644 --- a/spec/helpers/load_language.cc +++ b/spec/helpers/load_language.cc @@ -75,7 +75,6 @@ const TSLanguage *load_language(const string &source_filename, int lib_mtime = get_modified_time(lib_filename); if (!header_mtime || lib_mtime < header_mtime || lib_mtime < source_mtime) { - string obj_filename = lib_filename + ".o"; const char *compiler_name = getenv("CC"); if (!compiler_name) { compiler_name = "gcc"; @@ -83,32 +82,22 @@ const TSLanguage *load_language(const string &source_filename, const char *compile_argv[] = { compiler_name, + "-shared", "-x", "c", "-fPIC", "-g", "-I", header_dir.c_str(), - "-c", source_filename.c_str(), - "-o", obj_filename.c_str(), + "-o", lib_filename.c_str(), + source_filename.c_str(), + external_scanner_path.empty() ? 
NULL : external_scanner_path.c_str(), NULL }; - string compile_error = run_cmd("gcc", compile_argv); + + string compile_error = run_cmd(compiler_name, compile_argv); if (!compile_error.empty()) { AssertThat(string(compile_error), IsEmpty()); return nullptr; } - - const char *link_argv[] = { - compiler_name, - "-shared", - "-Wl", obj_filename.c_str(), - "-o", lib_filename.c_str(), - NULL - }; - string link_error = run_cmd("gcc", link_argv); - if (!link_error.empty()) { - AssertThat(link_error, IsEmpty()); - return nullptr; - } } void *parser_lib = dlopen(lib_filename.c_str(), RTLD_NOW); From c966af041235e42d207e4150cc1ba8cb2ec85c78 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Wed, 30 Nov 2016 09:34:47 -0800 Subject: [PATCH 02/50] Start work on external tokens --- include/tree_sitter/parser.h | 45 ++-- project.gyp | 1 + .../build_tables/distinctive_tokens_spec.cc | 2 +- .../build_tables/lex_conflict_manager_spec.cc | 8 +- spec/compiler/build_tables/lex_item_spec.cc | 92 ++++----- .../parse_item_set_builder_spec.cc | 62 +++--- .../prepare_grammar/extract_tokens_spec.cc | 8 +- .../prepare_grammar/flatten_grammar_spec.cc | 28 +-- spec/compiler/rules/repeat_spec.cc | 2 +- .../external_scanners/external_scan.c | 13 ++ spec/helpers/load_language.cc | 9 +- spec/helpers/load_language.h | 3 +- spec/helpers/rule_helpers.cc | 5 +- spec/helpers/stream_methods.cc | 11 +- spec/integration/compile_grammar_spec.cc | 65 ++++++ spec/integration/corpus_specs.cc | 6 +- src/compiler/build_tables/build_lex_table.cc | 66 +++--- .../build_tables/build_parse_table.cc | 80 ++++---- src/compiler/build_tables/lookahead_set.cc | 12 +- src/compiler/build_tables/lookahead_set.h | 8 +- src/compiler/build_tables/parse_item.cc | 34 +-- src/compiler/build_tables/parse_item.h | 26 --- .../build_tables/parse_item_set_builder.cc | 22 +- src/compiler/build_tables/recovery_tokens.cc | 6 +- src/compiler/build_tables/recovery_tokens.h | 2 +- src/compiler/generate_code/c_code.cc | 194 +++++++++++++----- 
src/compiler/grammar.h | 1 + src/compiler/parse_grammar.cc | 31 ++- src/compiler/parse_table.cc | 54 ++--- src/compiler/parse_table.h | 9 +- .../prepare_grammar/expand_repeats.cc | 3 +- .../prepare_grammar/extract_tokens.cc | 31 +-- .../prepare_grammar/flatten_grammar.cc | 1 + .../prepare_grammar/initial_syntax_grammar.h | 6 +- .../prepare_grammar/intern_symbols.cc | 31 ++- .../prepare_grammar/interned_grammar.h | 1 + src/compiler/rules.h | 1 + src/compiler/rules/built_in_symbols.cc | 6 +- src/compiler/rules/external_token.cc | 39 ++++ src/compiler/rules/external_token.h | 27 +++ src/compiler/rules/rules.cc | 5 + src/compiler/rules/symbol.cc | 34 ++- src/compiler/rules/symbol.h | 13 +- src/compiler/rules/visitor.h | 16 ++ src/compiler/syntax_grammar.cc | 11 - src/compiler/syntax_grammar.h | 3 +- src/runtime/parser.c | 7 +- 47 files changed, 723 insertions(+), 417 deletions(-) create mode 100644 spec/fixtures/external_scanners/external_scan.c create mode 100644 src/compiler/rules/external_token.cc create mode 100644 src/compiler/rules/external_token.h diff --git a/include/tree_sitter/parser.h b/include/tree_sitter/parser.h index 3a5bab9a..a335dd6d 100644 --- a/include/tree_sitter/parser.h +++ b/include/tree_sitter/parser.h @@ -48,6 +48,11 @@ typedef struct { bool fragile : 1; } TSParseAction; +typedef struct { + uint16_t lex_state; + uint16_t external_tokens; +} TSLexMode; + typedef union { TSParseAction action; struct { @@ -64,8 +69,15 @@ typedef struct TSLanguage { const TSSymbolMetadata *symbol_metadata; const unsigned short *parse_table; const TSParseActionEntry *parse_actions; - const TSStateId *lex_states; + const TSLexMode *lex_modes; bool (*lex_fn)(TSLexer *, TSStateId); + const TSSymbol *external_token_symbol_map; + const bool *external_token_lists; + struct { + void * (*create)(); + bool (*scan)(TSLexer *, const bool *symbol_whitelist); + void (*destroy)(void *); + } external_scanner; } TSLanguage; /* @@ -146,21 +158,22 @@ typedef struct TSLanguage { { 
.type = TSParseActionTypeAccept } \ } -#define EXPORT_LANGUAGE(language_name) \ - static TSLanguage language = { \ - .symbol_count = SYMBOL_COUNT, \ - .token_count = TOKEN_COUNT, \ - .symbol_metadata = ts_symbol_metadata, \ - .parse_table = (const unsigned short *)ts_parse_table, \ - .parse_actions = ts_parse_actions, \ - .lex_states = ts_lex_states, \ - .symbol_names = ts_symbol_names, \ - .lex_fn = ts_lex, \ - }; \ - \ - const TSLanguage *language_name() { \ - return &language; \ - } + +#define GET_LANGUAGE(...) \ + static TSLanguage language = { \ + .symbol_count = SYMBOL_COUNT, \ + .token_count = TOKEN_COUNT, \ + .symbol_metadata = ts_symbol_metadata, \ + .parse_table = (const unsigned short *)ts_parse_table, \ + .parse_actions = ts_parse_actions, \ + .lex_modes = ts_lex_modes, \ + .symbol_names = ts_symbol_names, \ + .lex_fn = ts_lex, \ + .external_token_lists = (const bool *)ts_external_token_lists, \ + .external_token_symbol_map = ts_external_token_symbol_map, \ + .external_scanner = {__VA_ARGS__} \ + }; \ + return &language \ #ifdef __cplusplus } diff --git a/project.gyp b/project.gyp index 081a3a88..29b69787 100644 --- a/project.gyp +++ b/project.gyp @@ -47,6 +47,7 @@ 'src/compiler/rules/character_range.cc', 'src/compiler/rules/character_set.cc', 'src/compiler/rules/choice.cc', + 'src/compiler/rules/external_token.cc', 'src/compiler/rules/metadata.cc', 'src/compiler/rules/named_symbol.cc', 'src/compiler/rules/pattern.cc', diff --git a/spec/compiler/build_tables/distinctive_tokens_spec.cc b/spec/compiler/build_tables/distinctive_tokens_spec.cc index 104cd721..f01d76cb 100644 --- a/spec/compiler/build_tables/distinctive_tokens_spec.cc +++ b/spec/compiler/build_tables/distinctive_tokens_spec.cc @@ -27,7 +27,7 @@ describe("recovery_tokens(rule)", []() { })), }; - AssertThat(recovery_tokens(grammar), Equals>({ 1 })); + AssertThat(recovery_tokens(grammar), Equals>({ Symbol(1, Symbol::Terminal) })); }); }); diff --git 
a/spec/compiler/build_tables/lex_conflict_manager_spec.cc b/spec/compiler/build_tables/lex_conflict_manager_spec.cc index 7f43e175..3aa75a4c 100644 --- a/spec/compiler/build_tables/lex_conflict_manager_spec.cc +++ b/spec/compiler/build_tables/lex_conflict_manager_spec.cc @@ -14,10 +14,10 @@ START_TEST describe("LexConflictManager::resolve(new_action, old_action)", []() { LexConflictManager conflict_manager; bool update; - Symbol sym1(0, true); - Symbol sym2(1, true); - Symbol sym3(2, true); - Symbol sym4(3, true); + Symbol sym1(0, Symbol::Terminal); + Symbol sym2(1, Symbol::Terminal); + Symbol sym3(2, Symbol::Terminal); + Symbol sym4(3, Symbol::Terminal); LexItemSet item_set({ LexItem(sym4, blank() )}); it("favors advance actions over empty accept token actions", [&]() { diff --git a/spec/compiler/build_tables/lex_item_spec.cc b/spec/compiler/build_tables/lex_item_spec.cc index 94997956..7042922f 100644 --- a/spec/compiler/build_tables/lex_item_spec.cc +++ b/spec/compiler/build_tables/lex_item_spec.cc @@ -14,7 +14,7 @@ START_TEST describe("LexItem", []() { describe("completion_status()", [&]() { it("indicates whether the item is done, its precedence, and whether it is a string", [&]() { - LexItem item1(Symbol(0, true), character({ 'a', 'b', 'c' })); + LexItem item1(Symbol(0, Symbol::Terminal), character({ 'a', 'b', 'c' })); AssertThat(item1.completion_status().is_done, IsFalse()); AssertThat(item1.completion_status().precedence, Equals(PrecedenceRange())); AssertThat(item1.completion_status().is_string, IsFalse()); @@ -23,7 +23,7 @@ describe("LexItem", []() { params.precedence = 3; params.has_precedence = true; params.is_string = 1; - LexItem item2(Symbol(0, true), choice({ + LexItem item2(Symbol(0, Symbol::Terminal), choice({ metadata(blank(), params), character({ 'a', 'b', 'c' }) })); @@ -32,7 +32,7 @@ describe("LexItem", []() { AssertThat(item2.completion_status().precedence, Equals(PrecedenceRange(3))); AssertThat(item2.completion_status().is_string, IsTrue()); 
- LexItem item3(Symbol(0, true), repeat(character({ ' ', '\t' }))); + LexItem item3(Symbol(0, Symbol::Terminal), repeat(character({ ' ', '\t' }))); AssertThat(item3.completion_status().is_done, IsTrue()); AssertThat(item3.completion_status().precedence, Equals(PrecedenceRange())); AssertThat(item3.completion_status().is_string, IsFalse()); @@ -43,7 +43,7 @@ describe("LexItem", []() { describe("LexItemSet::transitions()", [&]() { it("handles single characters", [&]() { LexItemSet item_set({ - LexItem(Symbol(1), character({ 'x' })), + LexItem(Symbol(1, Symbol::NonTerminal), character({ 'x' })), }); AssertThat( @@ -53,7 +53,7 @@ describe("LexItemSet::transitions()", [&]() { CharacterSet().include('x'), Transition{ LexItemSet({ - LexItem(Symbol(1), blank()), + LexItem(Symbol(1, Symbol::NonTerminal), blank()), }), PrecedenceRange(), false @@ -67,7 +67,7 @@ describe("LexItemSet::transitions()", [&]() { params.is_main_token = true; LexItemSet item_set({ - LexItem(Symbol(1), metadata(character({ 'x' }), params)), + LexItem(Symbol(1, Symbol::NonTerminal), metadata(character({ 'x' }), params)), }); AssertThat( @@ -77,7 +77,7 @@ describe("LexItemSet::transitions()", [&]() { CharacterSet().include('x'), Transition{ LexItemSet({ - LexItem(Symbol(1), metadata(blank(), params)), + LexItem(Symbol(1, Symbol::NonTerminal), metadata(blank(), params)), }), PrecedenceRange(), true @@ -88,7 +88,7 @@ describe("LexItemSet::transitions()", [&]() { it("handles sequences", [&]() { LexItemSet item_set({ - LexItem(Symbol(1), seq({ + LexItem(Symbol(1, Symbol::NonTerminal), seq({ character({ 'w' }), character({ 'x' }), character({ 'y' }), @@ -103,7 +103,7 @@ describe("LexItemSet::transitions()", [&]() { CharacterSet().include('w'), Transition{ LexItemSet({ - LexItem(Symbol(1), seq({ + LexItem(Symbol(1, Symbol::NonTerminal), seq({ character({ 'x' }), character({ 'y' }), character({ 'z' }), @@ -118,7 +118,7 @@ describe("LexItemSet::transitions()", [&]() { it("handles sequences with nested 
precedence", [&]() { LexItemSet item_set({ - LexItem(Symbol(1), seq({ + LexItem(Symbol(1, Symbol::NonTerminal), seq({ prec(3, seq({ character({ 'v' }), prec(4, seq({ @@ -140,7 +140,7 @@ describe("LexItemSet::transitions()", [&]() { // The outer precedence is now 'active', because we are within its // contained rule. LexItemSet({ - LexItem(Symbol(1), seq({ + LexItem(Symbol(1, Symbol::NonTerminal), seq({ active_prec(3, seq({ prec(4, seq({ character({ 'w' }), @@ -168,7 +168,7 @@ describe("LexItemSet::transitions()", [&]() { Transition{ // The inner precedence is now 'active' LexItemSet({ - LexItem(Symbol(1), seq({ + LexItem(Symbol(1, Symbol::NonTerminal), seq({ active_prec(3, seq({ active_prec(4, character({ 'x' })), character({ 'y' }) })), @@ -193,7 +193,7 @@ describe("LexItemSet::transitions()", [&]() { CharacterSet().include('x'), Transition{ LexItemSet({ - LexItem(Symbol(1), seq({ + LexItem(Symbol(1, Symbol::NonTerminal), seq({ active_prec(3, character({ 'y' })), character({ 'z' }), })), @@ -216,7 +216,7 @@ describe("LexItemSet::transitions()", [&]() { CharacterSet().include('y'), Transition{ LexItemSet({ - LexItem(Symbol(1), character({ 'z' })), + LexItem(Symbol(1, Symbol::NonTerminal), character({ 'z' })), }), PrecedenceRange(3), false @@ -227,7 +227,7 @@ describe("LexItemSet::transitions()", [&]() { it("handles sequences where the left hand side can be blank", [&]() { LexItemSet item_set({ - LexItem(Symbol(1), seq({ + LexItem(Symbol(1, Symbol::NonTerminal), seq({ choice({ character({ 'x' }), blank(), @@ -244,7 +244,7 @@ describe("LexItemSet::transitions()", [&]() { CharacterSet().include('x'), Transition{ LexItemSet({ - LexItem(Symbol(1), seq({ + LexItem(Symbol(1, Symbol::NonTerminal), seq({ character({ 'y' }), character({ 'z' }), })), @@ -257,7 +257,7 @@ describe("LexItemSet::transitions()", [&]() { CharacterSet().include('y'), Transition{ LexItemSet({ - LexItem(Symbol(1), character({ 'z' })), + LexItem(Symbol(1, Symbol::NonTerminal), character({ 'z' })), }), 
PrecedenceRange(), false @@ -268,7 +268,7 @@ describe("LexItemSet::transitions()", [&]() { it("handles blanks", [&]() { LexItemSet item_set({ - LexItem(Symbol(1), blank()), + LexItem(Symbol(1, Symbol::NonTerminal), blank()), }); AssertThat(item_set.transitions(), IsEmpty()); @@ -276,11 +276,11 @@ describe("LexItemSet::transitions()", [&]() { it("handles repeats", [&]() { LexItemSet item_set({ - LexItem(Symbol(1), repeat1(seq({ + LexItem(Symbol(1, Symbol::NonTerminal), repeat1(seq({ character({ 'a' }), character({ 'b' }), }))), - LexItem(Symbol(2), repeat1(character({ 'c' }))), + LexItem(Symbol(2, Symbol::NonTerminal), repeat1(character({ 'c' }))), }); AssertThat( @@ -290,14 +290,14 @@ describe("LexItemSet::transitions()", [&]() { CharacterSet().include('a'), Transition{ LexItemSet({ - LexItem(Symbol(1), seq({ + LexItem(Symbol(1, Symbol::NonTerminal), seq({ character({ 'b' }), repeat1(seq({ character({ 'a' }), character({ 'b' }), })) })), - LexItem(Symbol(1), character({ 'b' })), + LexItem(Symbol(1, Symbol::NonTerminal), character({ 'b' })), }), PrecedenceRange(), false @@ -307,8 +307,8 @@ describe("LexItemSet::transitions()", [&]() { CharacterSet().include('c'), Transition{ LexItemSet({ - LexItem(Symbol(2), repeat1(character({ 'c' }))), - LexItem(Symbol(2), blank()), + LexItem(Symbol(2, Symbol::NonTerminal), repeat1(character({ 'c' }))), + LexItem(Symbol(2, Symbol::NonTerminal), blank()), }), PrecedenceRange(), false @@ -319,7 +319,7 @@ describe("LexItemSet::transitions()", [&]() { it("handles repeats with precedence", [&]() { LexItemSet item_set({ - LexItem(Symbol(1), active_prec(-1, repeat1(character({ 'a' })))) + LexItem(Symbol(1, Symbol::NonTerminal), active_prec(-1, repeat1(character({ 'a' })))) }); AssertThat( @@ -329,8 +329,8 @@ describe("LexItemSet::transitions()", [&]() { CharacterSet().include('a'), Transition{ LexItemSet({ - LexItem(Symbol(1), active_prec(-1, repeat1(character({ 'a' })))), - LexItem(Symbol(1), active_prec(-1, blank())), + 
LexItem(Symbol(1, Symbol::NonTerminal), active_prec(-1, repeat1(character({ 'a' })))), + LexItem(Symbol(1, Symbol::NonTerminal), active_prec(-1, blank())), }), PrecedenceRange(-1), false @@ -341,7 +341,7 @@ describe("LexItemSet::transitions()", [&]() { it("handles choices between overlapping character sets", [&]() { LexItemSet item_set({ - LexItem(Symbol(1), choice({ + LexItem(Symbol(1, Symbol::NonTerminal), choice({ active_prec(2, seq({ character({ 'a', 'b', 'c', 'd' }), character({ 'x' }), @@ -360,7 +360,7 @@ describe("LexItemSet::transitions()", [&]() { CharacterSet().include('a', 'b'), Transition{ LexItemSet({ - LexItem(Symbol(1), active_prec(2, character({ 'x' }))), + LexItem(Symbol(1, Symbol::NonTerminal), active_prec(2, character({ 'x' }))), }), PrecedenceRange(2), false @@ -370,8 +370,8 @@ describe("LexItemSet::transitions()", [&]() { CharacterSet().include('c', 'd'), Transition{ LexItemSet({ - LexItem(Symbol(1), active_prec(2, character({ 'x' }))), - LexItem(Symbol(1), active_prec(3, character({ 'y' }))), + LexItem(Symbol(1, Symbol::NonTerminal), active_prec(2, character({ 'x' }))), + LexItem(Symbol(1, Symbol::NonTerminal), active_prec(3, character({ 'y' }))), }), PrecedenceRange(2, 3), false @@ -381,7 +381,7 @@ describe("LexItemSet::transitions()", [&]() { CharacterSet().include('e', 'f'), Transition{ LexItemSet({ - LexItem(Symbol(1), active_prec(3, character({ 'y' }))), + LexItem(Symbol(1, Symbol::NonTerminal), active_prec(3, character({ 'y' }))), }), PrecedenceRange(3), false @@ -392,7 +392,7 @@ describe("LexItemSet::transitions()", [&]() { it("handles choices between a subset and a superset of characters", [&]() { LexItemSet item_set({ - LexItem(Symbol(1), choice({ + LexItem(Symbol(1, Symbol::NonTerminal), choice({ seq({ character({ 'b', 'c', 'd' }), character({ 'x' }), @@ -411,7 +411,7 @@ describe("LexItemSet::transitions()", [&]() { CharacterSet().include('a').include('e', 'f'), Transition{ LexItemSet({ - LexItem(Symbol(1), character({ 'y' })), + 
LexItem(Symbol(1, Symbol::NonTerminal), character({ 'y' })), }), PrecedenceRange(), false @@ -421,8 +421,8 @@ describe("LexItemSet::transitions()", [&]() { CharacterSet().include('b', 'd'), Transition{ LexItemSet({ - LexItem(Symbol(1), character({ 'x' })), - LexItem(Symbol(1), character({ 'y' })), + LexItem(Symbol(1, Symbol::NonTerminal), character({ 'x' })), + LexItem(Symbol(1, Symbol::NonTerminal), character({ 'y' })), }), PrecedenceRange(), false @@ -433,7 +433,7 @@ describe("LexItemSet::transitions()", [&]() { it("handles choices between whitelisted and blacklisted character sets", [&]() { LexItemSet item_set({ - LexItem(Symbol(1), seq({ + LexItem(Symbol(1, Symbol::NonTerminal), seq({ choice({ character({ '/' }, false), seq({ @@ -452,7 +452,7 @@ describe("LexItemSet::transitions()", [&]() { CharacterSet().include_all().exclude('/').exclude('\\'), Transition{ LexItemSet({ - LexItem(Symbol(1), character({ '/' })), + LexItem(Symbol(1, Symbol::NonTerminal), character({ '/' })), }), PrecedenceRange(), false @@ -462,8 +462,8 @@ describe("LexItemSet::transitions()", [&]() { CharacterSet().include('\\'), Transition{ LexItemSet({ - LexItem(Symbol(1), character({ '/' })), - LexItem(Symbol(1), seq({ character({ '/' }), character({ '/' }) })), + LexItem(Symbol(1, Symbol::NonTerminal), character({ '/' })), + LexItem(Symbol(1, Symbol::NonTerminal), seq({ character({ '/' }), character({ '/' }) })), }), PrecedenceRange(), false @@ -474,8 +474,8 @@ describe("LexItemSet::transitions()", [&]() { it("handles different items with overlapping character sets", [&]() { LexItemSet set1({ - LexItem(Symbol(1), character({ 'a', 'b', 'c', 'd', 'e', 'f' })), - LexItem(Symbol(2), character({ 'e', 'f', 'g', 'h', 'i' })) + LexItem(Symbol(1, Symbol::NonTerminal), character({ 'a', 'b', 'c', 'd', 'e', 'f' })), + LexItem(Symbol(2, Symbol::NonTerminal), character({ 'e', 'f', 'g', 'h', 'i' })) }); AssertThat(set1.transitions(), Equals(LexItemSet::TransitionMap({ @@ -483,7 +483,7 @@ 
describe("LexItemSet::transitions()", [&]() { CharacterSet().include('a', 'd'), Transition{ LexItemSet({ - LexItem(Symbol(1), blank()), + LexItem(Symbol(1, Symbol::NonTerminal), blank()), }), PrecedenceRange(), false @@ -493,8 +493,8 @@ describe("LexItemSet::transitions()", [&]() { CharacterSet().include('e', 'f'), Transition{ LexItemSet({ - LexItem(Symbol(1), blank()), - LexItem(Symbol(2), blank()), + LexItem(Symbol(1, Symbol::NonTerminal), blank()), + LexItem(Symbol(2, Symbol::NonTerminal), blank()), }), PrecedenceRange(), false @@ -504,7 +504,7 @@ describe("LexItemSet::transitions()", [&]() { CharacterSet().include('g', 'i'), Transition{ LexItemSet({ - LexItem(Symbol(2), blank()), + LexItem(Symbol(2, Symbol::NonTerminal), blank()), }), PrecedenceRange(), false diff --git a/spec/compiler/build_tables/parse_item_set_builder_spec.cc b/spec/compiler/build_tables/parse_item_set_builder_spec.cc index a1dd2231..dad0976b 100644 --- a/spec/compiler/build_tables/parse_item_set_builder_spec.cc +++ b/spec/compiler/build_tables/parse_item_set_builder_spec.cc @@ -27,23 +27,23 @@ describe("ParseItemSetBuilder", []() { SyntaxGrammar grammar{{ SyntaxVariable("rule0", VariableTypeNamed, { Production({ - {Symbol(1), 0, AssociativityNone}, - {Symbol(11, true), 0, AssociativityNone}, + {Symbol(1, Symbol::NonTerminal), 0, AssociativityNone}, + {Symbol(11, Symbol::Terminal), 0, AssociativityNone}, }), }), SyntaxVariable("rule1", VariableTypeNamed, { Production({ - {Symbol(12, true), 0, AssociativityNone}, - {Symbol(13, true), 0, AssociativityNone}, + {Symbol(12, Symbol::Terminal), 0, AssociativityNone}, + {Symbol(13, Symbol::Terminal), 0, AssociativityNone}, }), Production({ - {Symbol(2), 0, AssociativityNone}, + {Symbol(2, Symbol::NonTerminal), 0, AssociativityNone}, }) }), SyntaxVariable("rule2", VariableTypeNamed, { Production({ - {Symbol(14, true), 0, AssociativityNone}, - {Symbol(15, true), 0, AssociativityNone}, + {Symbol(14, Symbol::Terminal), 0, AssociativityNone}, + 
{Symbol(15, Symbol::Terminal), 0, AssociativityNone}, }) }), }, {}, {}}; @@ -54,8 +54,8 @@ describe("ParseItemSetBuilder", []() { ParseItemSet item_set({ { - ParseItem(Symbol(0), production(0, 0), 0), - LookaheadSet({ 10 }), + ParseItem(Symbol(0, Symbol::NonTerminal), production(0, 0), 0), + LookaheadSet({ Symbol(10, Symbol::Terminal) }), } }); @@ -64,20 +64,20 @@ describe("ParseItemSetBuilder", []() { AssertThat(item_set, Equals(ParseItemSet({ { - ParseItem(Symbol(0), production(0, 0), 0), - LookaheadSet({ 10 }) + ParseItem(Symbol(0, Symbol::NonTerminal), production(0, 0), 0), + LookaheadSet({ Symbol(10, Symbol::Terminal) }) + }, + { + ParseItem(Symbol(1, Symbol::NonTerminal), production(1, 0), 0), + LookaheadSet({ Symbol(11, Symbol::Terminal) }) }, { - ParseItem(Symbol(1), production(1, 0), 0), - LookaheadSet({ 11 }) + ParseItem(Symbol(1, Symbol::NonTerminal), production(1, 1), 0), + LookaheadSet({ Symbol(11, Symbol::Terminal) }) }, { - ParseItem(Symbol(1), production(1, 1), 0), - LookaheadSet({ 11 }) - }, - { - ParseItem(Symbol(2), production(2, 0), 0), - LookaheadSet({ 11 }) + ParseItem(Symbol(2, Symbol::NonTerminal), production(2, 0), 0), + LookaheadSet({ Symbol(11, Symbol::Terminal) }) }, }))); }); @@ -86,14 +86,14 @@ describe("ParseItemSetBuilder", []() { SyntaxGrammar grammar{{ SyntaxVariable("rule0", VariableTypeNamed, { Production({ - {Symbol(1), 0, AssociativityNone}, - {Symbol(11, true), 0, AssociativityNone}, + {Symbol(1, Symbol::NonTerminal), 0, AssociativityNone}, + {Symbol(11, Symbol::Terminal), 0, AssociativityNone}, }), }), SyntaxVariable("rule1", VariableTypeNamed, { Production({ - {Symbol(12, true), 0, AssociativityNone}, - {Symbol(13, true), 0, AssociativityNone}, + {Symbol(12, Symbol::Terminal), 0, AssociativityNone}, + {Symbol(13, Symbol::Terminal), 0, AssociativityNone}, }), Production({}) }), @@ -105,8 +105,8 @@ describe("ParseItemSetBuilder", []() { ParseItemSet item_set({ { - ParseItem(Symbol(0), production(0, 0), 0), - LookaheadSet({ 10 
}), + ParseItem(Symbol(0, Symbol::NonTerminal), production(0, 0), 0), + LookaheadSet({ Symbol(10, Symbol::Terminal) }), } }); @@ -115,16 +115,16 @@ describe("ParseItemSetBuilder", []() { AssertThat(item_set, Equals(ParseItemSet({ { - ParseItem(Symbol(0), production(0, 0), 0), - LookaheadSet({ 10 }) + ParseItem(Symbol(0, Symbol::NonTerminal), production(0, 0), 0), + LookaheadSet({ Symbol(10, Symbol::Terminal) }) }, { - ParseItem(Symbol(1), production(1, 0), 0), - LookaheadSet({ 11 }) + ParseItem(Symbol(1, Symbol::NonTerminal), production(1, 0), 0), + LookaheadSet({ Symbol(11, Symbol::Terminal) }) }, { - ParseItem(Symbol(1), production(1, 1), 0), - LookaheadSet({ 11 }) + ParseItem(Symbol(1, Symbol::NonTerminal), production(1, 1), 0), + LookaheadSet({ Symbol(11, Symbol::Terminal) }) }, }))); }); diff --git a/spec/compiler/prepare_grammar/extract_tokens_spec.cc b/spec/compiler/prepare_grammar/extract_tokens_spec.cc index 9f871ec4..577dead1 100644 --- a/spec/compiler/prepare_grammar/extract_tokens_spec.cc +++ b/spec/compiler/prepare_grammar/extract_tokens_spec.cc @@ -133,13 +133,13 @@ describe("extract_tokens", []() { Variable("rule_A", VariableTypeNamed, str("ok")), Variable("rule_B", VariableTypeNamed, repeat(i_sym(0))), Variable("rule_C", VariableTypeNamed, repeat(seq({ i_sym(0), i_sym(0) }))), - }, { str(" ") }, { { Symbol(1), Symbol(2) } }}); + }, { str(" ") }, { { Symbol(1, Symbol::NonTerminal), Symbol(2, Symbol::NonTerminal) } }}); InitialSyntaxGrammar &syntax_grammar = get<0>(result); AssertThat(syntax_grammar.variables.size(), Equals(2)); AssertThat(syntax_grammar.expected_conflicts, Equals(set>({ - { Symbol(0), Symbol(1) }, + { Symbol(0, Symbol::NonTerminal), Symbol(1, Symbol::NonTerminal) }, }))); }); @@ -171,7 +171,7 @@ describe("extract_tokens", []() { AssertThat(get<2>(result), Equals(CompileError::none())); AssertThat(get<1>(result).separators.size(), Equals(0)); - AssertThat(get<0>(result).extra_tokens, Equals(set({ Symbol(1, true) }))); + 
AssertThat(get<0>(result).extra_tokens, Equals(set({ Symbol(1, Symbol::Terminal) }))); }); it("updates extra symbols according to the new symbol numbers", [&]() { @@ -186,7 +186,7 @@ describe("extract_tokens", []() { AssertThat(get<2>(result), Equals(CompileError::none())); AssertThat(get<0>(result).extra_tokens, Equals(set({ - { Symbol(3, true) }, + { Symbol(3, Symbol::Terminal) }, }))); AssertThat(get<1>(result).separators, IsEmpty()); diff --git a/spec/compiler/prepare_grammar/flatten_grammar_spec.cc b/spec/compiler/prepare_grammar/flatten_grammar_spec.cc index 3efd4e03..823da8e6 100644 --- a/spec/compiler/prepare_grammar/flatten_grammar_spec.cc +++ b/spec/compiler/prepare_grammar/flatten_grammar_spec.cc @@ -36,19 +36,19 @@ describe("flatten_grammar", []() { AssertThat(result.type, Equals(VariableTypeNamed)); AssertThat(result.productions, Equals(vector({ Production({ - {Symbol(1), 0, AssociativityNone}, - {Symbol(2), 101, AssociativityLeft}, - {Symbol(3), 102, AssociativityRight}, - {Symbol(4), 101, AssociativityLeft}, - {Symbol(6), 0, AssociativityNone}, - {Symbol(7), 0, AssociativityNone}, + {Symbol(1, Symbol::NonTerminal), 0, AssociativityNone}, + {Symbol(2, Symbol::NonTerminal), 101, AssociativityLeft}, + {Symbol(3, Symbol::NonTerminal), 102, AssociativityRight}, + {Symbol(4, Symbol::NonTerminal), 101, AssociativityLeft}, + {Symbol(6, Symbol::NonTerminal), 0, AssociativityNone}, + {Symbol(7, Symbol::NonTerminal), 0, AssociativityNone}, }), Production({ - {Symbol(1), 0, AssociativityNone}, - {Symbol(2), 101, AssociativityLeft}, - {Symbol(5), 101, AssociativityLeft}, - {Symbol(6), 0, AssociativityNone}, - {Symbol(7), 0, AssociativityNone}, + {Symbol(1, Symbol::NonTerminal), 0, AssociativityNone}, + {Symbol(2, Symbol::NonTerminal), 101, AssociativityLeft}, + {Symbol(5, Symbol::NonTerminal), 101, AssociativityLeft}, + {Symbol(6, Symbol::NonTerminal), 0, AssociativityNone}, + {Symbol(7, Symbol::NonTerminal), 0, AssociativityNone}, }) }))) }); @@ -65,8 +65,8 @@ 
describe("flatten_grammar", []() { AssertThat(result.productions, Equals(vector({ Production({ - {Symbol(1), 101, AssociativityLeft}, - {Symbol(2), 101, AssociativityLeft}, + {Symbol(1, Symbol::NonTerminal), 101, AssociativityLeft}, + {Symbol(2, Symbol::NonTerminal), 101, AssociativityLeft}, }) }))) @@ -80,7 +80,7 @@ describe("flatten_grammar", []() { AssertThat(result.productions, Equals(vector({ Production({ - {Symbol(1), 101, AssociativityLeft}, + {Symbol(1, Symbol::NonTerminal), 101, AssociativityLeft}, }) }))) }); diff --git a/spec/compiler/rules/repeat_spec.cc b/spec/compiler/rules/repeat_spec.cc index 63680563..9c84c8e5 100644 --- a/spec/compiler/rules/repeat_spec.cc +++ b/spec/compiler/rules/repeat_spec.cc @@ -9,7 +9,7 @@ START_TEST describe("Repeat", []() { describe("constructing repeats", [&]() { it("doesn't create redundant repeats", [&]() { - auto sym = make_shared(1); + auto sym = make_shared(1, Symbol::NonTerminal); auto repeat = Repeat::build(sym); auto outer_repeat = Repeat::build(repeat); diff --git a/spec/fixtures/external_scanners/external_scan.c b/spec/fixtures/external_scanners/external_scan.c new file mode 100644 index 00000000..7abab3ae --- /dev/null +++ b/spec/fixtures/external_scanners/external_scan.c @@ -0,0 +1,13 @@ +#include + +void *ts_language_external_scanner_example_external_scanner_create() { + puts("HELLO FROM EXTERNAL SCANNER"); + return 0; +} + +bool ts_language_external_scanner_example_external_scanner_scan() { + return true; +} + +void ts_language_external_scanner_example_external_scanner_destroy() { +} diff --git a/spec/helpers/load_language.cc b/spec/helpers/load_language.cc index a29aa240..2e85b762 100644 --- a/spec/helpers/load_language.cc +++ b/spec/helpers/load_language.cc @@ -67,7 +67,8 @@ static int get_modified_time(const string &path) { const TSLanguage *load_language(const string &source_filename, const string &lib_filename, - const string &language_name) { + const string &language_name, + string 
external_scanner_path = "") { string language_function_name = "ts_language_" + language_name; string header_dir = getenv("PWD") + string("/include"); int source_mtime = get_modified_time(source_filename); @@ -119,7 +120,9 @@ const TSLanguage *load_language(const string &source_filename, return language_fn(); } -const TSLanguage *load_compile_result(const string &name, const TSCompileResult &compile_result) { +const TSLanguage *load_compile_result(const string &name, + const TSCompileResult &compile_result, + string external_scanner_path) { if (compile_result.error_type != TSCompileErrorTypeNone) { Assert::Failure(string("Compilation failed ") + compile_result.error_message); return nullptr; @@ -135,7 +138,7 @@ const TSLanguage *load_compile_result(const string &name, const TSCompileResult source_file << compile_result.code; source_file.close(); - const TSLanguage *language = load_language(source_filename, lib_filename, name); + auto language = load_language(source_filename, lib_filename, name, external_scanner_path); free(compile_result.code); return language; } diff --git a/spec/helpers/load_language.h b/spec/helpers/load_language.h index 41b1458e..41d8b739 100644 --- a/spec/helpers/load_language.h +++ b/spec/helpers/load_language.h @@ -5,7 +5,8 @@ #include "tree_sitter/runtime.h" #include -const TSLanguage *load_compile_result(const std::string &, const TSCompileResult &); +const TSLanguage *load_compile_result(const std::string &, const TSCompileResult &, + std::string external_scanner_path = ""); const TSLanguage *get_test_language(const std::string &language_name); #endif // HELPERS_LOAD_LANGUAGE_H_ diff --git a/spec/helpers/rule_helpers.cc b/spec/helpers/rule_helpers.cc index 8bf32360..0b010d2e 100644 --- a/spec/helpers/rule_helpers.cc +++ b/spec/helpers/rule_helpers.cc @@ -9,6 +9,7 @@ namespace tree_sitter { using std::ostream; using std::string; using std::to_string; + using rules::Symbol; rule_ptr character(const set &ranges) { return character(ranges, 
true); @@ -28,11 +29,11 @@ namespace tree_sitter { } rule_ptr i_sym(size_t index) { - return make_shared(index); + return make_shared(index, Symbol::NonTerminal); } rule_ptr i_token(size_t index) { - return make_shared(index, true); + return make_shared(index, Symbol::Terminal); } rule_ptr metadata(rule_ptr rule, rules::MetadataParams params) { diff --git a/spec/helpers/stream_methods.cc b/spec/helpers/stream_methods.cc index 4d411d66..b47363a0 100644 --- a/spec/helpers/stream_methods.cc +++ b/spec/helpers/stream_methods.cc @@ -10,16 +10,7 @@ namespace tree_sitter { ostream &operator<<(ostream &stream, const Grammar &grammar) { stream << string("# "); - stream << pair.second; - started = true; - } + stream << " rules: " << grammar.rules; return stream << string("}>"); } diff --git a/spec/integration/compile_grammar_spec.cc b/spec/integration/compile_grammar_spec.cc index d41d76e4..21307c89 100644 --- a/spec/integration/compile_grammar_spec.cc +++ b/spec/integration/compile_grammar_spec.cc @@ -507,6 +507,71 @@ describe("compile_grammar", []() { }); }); + describe("external scanners", [&]() { + it("can call out to arbitrary scanner functions during parsing", [&]() { + string grammar = R"JSON({ + "name": "external_scanner_example", + + "externals": [ + "percent_string", + "percent_string_start", + "percent_string_end" + ], + + "rules": { + "string": { + "type": "CHOICE", + "members": [ + { + "type": "EXTERNAL_TOKEN", + "name": "percent_string" + }, + { + "type": "SEQ", + "members": [ + { + "type": "EXTERNAL_TOKEN", + "name": "percent_string_start" + }, + { + "type": "SYMBOL", + "name": "identifier" + }, + { + "type": "EXTERNAL_TOKEN", + "name": "percent_string_end" + } + ] + }, + ] + }, + + "identifier": { + "type": "PATTERN", + "value": "\\a+" + } + } + })JSON"; + + TSCompileResult result = ts_compile_grammar(grammar.c_str()); + AssertThat(result.error_message, IsNull()); + + ts_document_set_language(document, load_compile_result( + "external_scanner_example", + 
result, + "spec/fixtures/external_scanners/external_scan.c" + )); + + ts_document_set_input_string(document, "%|hi|"); + ts_document_parse(document); + assert_root_node("(string)"); + + ts_document_set_input_string(document, "%(1 #{two} three)"); + ts_document_parse(document); + assert_root_node("(string (identifier))"); + }); + }); + describe("when the grammar's start symbol is a token", [&]() { it("parses the token", [&]() { TSCompileResult result = ts_compile_grammar(R"JSON( diff --git a/spec/integration/corpus_specs.cc b/spec/integration/corpus_specs.cc index 9d716ed1..86a1dc47 100644 --- a/spec/integration/corpus_specs.cc +++ b/spec/integration/corpus_specs.cc @@ -80,10 +80,10 @@ START_TEST describe("The Corpus", []() { vector test_languages({ - "javascript", + // "javascript", "json", - "c", - "cpp", + // "c", + // "cpp", }); for (auto &language_name : test_languages) { diff --git a/src/compiler/build_tables/build_lex_table.cc b/src/compiler/build_tables/build_lex_table.cc index 151da7cf..29d8f4d0 100644 --- a/src/compiler/build_tables/build_lex_table.cc +++ b/src/compiler/build_tables/build_lex_table.cc @@ -64,7 +64,7 @@ class LexTableBuilder { private: void add_lex_state_for_parse_state(ParseState *parse_state) { parse_state->lex_state_id = - add_lex_state(item_set_for_tokens(parse_state->expected_inputs())); + add_lex_state(item_set_for_terminals(parse_state->terminal_entries)); } LexStateId add_lex_state(const LexItemSet &item_set) { @@ -112,24 +112,27 @@ class LexTableBuilder { void mark_fragile_tokens() { for (ParseState &state : parse_table->states) { for (auto &entry : state.terminal_entries) { - auto homonyms = conflict_manager.possible_homonyms.find(entry.first); - if (homonyms != conflict_manager.possible_homonyms.end()) - for (Symbol::Index homonym : homonyms->second) - if (state.terminal_entries.count(homonym)) { - entry.second.reusable = false; - break; - } + Symbol symbol = entry.first; + if (symbol.is_token()) { + auto homonyms = 
conflict_manager.possible_homonyms.find(symbol.index); + if (homonyms != conflict_manager.possible_homonyms.end()) + for (Symbol::Index homonym : homonyms->second) + if (state.terminal_entries.count(Symbol(homonym, Symbol::Terminal))) { + entry.second.reusable = false; + break; + } - if (!entry.second.reusable) - continue; + if (!entry.second.reusable) + continue; - auto extensions = conflict_manager.possible_extensions.find(entry.first); - if (extensions != conflict_manager.possible_extensions.end()) - for (Symbol::Index extension : extensions->second) - if (state.terminal_entries.count(extension)) { - entry.second.depends_on_lookahead = true; - break; - } + auto extensions = conflict_manager.possible_extensions.find(symbol.index); + if (extensions != conflict_manager.possible_extensions.end()) + for (Symbol::Index extension : extensions->second) + if (state.terminal_entries.count(Symbol(extension, Symbol::Terminal))) { + entry.second.depends_on_lookahead = true; + break; + } + } } } } @@ -150,24 +153,27 @@ class LexTableBuilder { } } - LexItemSet item_set_for_tokens(const set &symbols) { + LexItemSet item_set_for_terminals(const map &terminals) { LexItemSet result; - for (const Symbol &symbol : symbols) - for (const rule_ptr &rule : rules_for_symbol(symbol)) - for (const rule_ptr &separator_rule : separator_rules) - result.entries.insert(LexItem( - symbol, - Metadata::separator( - Seq::build({ - separator_rule, - Metadata::main_token(rule) })))); + for (const auto &pair : terminals) { + Symbol symbol = pair.first; + if (symbol.is_token()) { + for (const rule_ptr &rule : rules_for_symbol(symbol)) { + for (const rule_ptr &separator_rule : separator_rules) { + result.entries.insert(LexItem( + symbol, + Metadata::separator( + Seq::build({ + separator_rule, + Metadata::main_token(rule) })))); + } + } + } + } return result; } vector rules_for_symbol(const rules::Symbol &symbol) { - if (!symbol.is_token) - return {}; - if (symbol == rules::END_OF_INPUT()) return { 
CharacterSet().include(0).copy() }; diff --git a/src/compiler/build_tables/build_parse_table.cc b/src/compiler/build_tables/build_parse_table.cc index 91444310..819ce345 100644 --- a/src/compiler/build_tables/build_parse_table.cc +++ b/src/compiler/build_tables/build_parse_table.cc @@ -52,7 +52,10 @@ class ParseTableBuilder { allow_any_conflict(false) {} pair build() { - Symbol start_symbol = Symbol(0, grammar.variables.empty()); + Symbol start_symbol = grammar.variables.empty() ? + Symbol(0, Symbol::Terminal) : + Symbol(0, Symbol::NonTerminal); + Production start_production({ ProductionStep(start_symbol, 0, rules::AssociativityNone), }); @@ -63,7 +66,7 @@ class ParseTableBuilder { add_parse_state(ParseItemSet({ { ParseItem(rules::START(), start_production, 0), - LookaheadSet({ END_OF_INPUT().index }), + LookaheadSet({ END_OF_INPUT() }), }, })); @@ -107,21 +110,21 @@ class ParseTableBuilder { void build_error_parse_state() { ParseState error_state; - for (const Symbol::Index index : parse_table.mergeable_symbols) { - add_out_of_context_parse_state(&error_state, Symbol(index, true)); + for (const Symbol symbol : parse_table.mergeable_symbols) { + add_out_of_context_parse_state(&error_state, symbol); } for (const Symbol &symbol : grammar.extra_tokens) { - if (!error_state.terminal_entries.count(symbol.index)) { - error_state.terminal_entries[symbol.index].actions.push_back(ParseAction::ShiftExtra()); + if (!error_state.terminal_entries.count(symbol)) { + error_state.terminal_entries[symbol].actions.push_back(ParseAction::ShiftExtra()); } } for (size_t i = 0; i < grammar.variables.size(); i++) { - add_out_of_context_parse_state(&error_state, Symbol(i, false)); + add_out_of_context_parse_state(&error_state, Symbol(i, Symbol::NonTerminal)); } - error_state.terminal_entries[END_OF_INPUT().index].actions.push_back(ParseAction::Recover(0)); + error_state.terminal_entries[END_OF_INPUT()].actions.push_back(ParseAction::Recover(0)); parse_table.states[0] = error_state; } @@ 
-130,10 +133,10 @@ class ParseTableBuilder { const ParseItemSet &item_set = recovery_states[symbol]; if (!item_set.entries.empty()) { ParseStateId state = add_parse_state(item_set); - if (symbol.is_token) { - error_state->terminal_entries[symbol.index].actions.assign({ ParseAction::Recover(state) }); - } else { + if (symbol.is_non_terminal()) { error_state->nonterminal_entries[symbol.index] = state; + } else { + error_state->terminal_entries[symbol].actions.assign({ ParseAction::Recover(state) }); } } } @@ -152,9 +155,9 @@ class ParseTableBuilder { } string add_actions(const ParseItemSet &item_set, ParseStateId state_id) { - map terminal_successors; + map terminal_successors; map nonterminal_successors; - set lookaheads_with_conflicts; + set lookaheads_with_conflicts; for (const auto &pair : item_set.entries) { const ParseItem &item = pair.first; @@ -168,7 +171,7 @@ class ParseTableBuilder { ParseAction::Reduce(item.lhs(), item.step_index, *item.production); int precedence = item.precedence(); - for (const Symbol::Index lookahead : *lookahead_symbols.entries) { + for (Symbol lookahead : *lookahead_symbols.entries) { ParseTableEntry &entry = parse_table.states[state_id].terminal_entries[lookahead]; // Only add the highest-precedence Reduce actions to the parse table. @@ -203,10 +206,10 @@ class ParseTableBuilder { Symbol symbol = item.production->at(item.step_index).symbol; ParseItem new_item(item.lhs(), *item.production, item.step_index + 1); - if (symbol.is_token) { - terminal_successors[symbol.index].entries[new_item] = lookahead_symbols; - } else { + if (symbol.is_non_terminal()) { nonterminal_successors[symbol.index].entries[new_item] = lookahead_symbols; + } else { + terminal_successors[symbol].entries[new_item] = lookahead_symbols; } } } @@ -214,7 +217,7 @@ class ParseTableBuilder { // Add a Shift action for each possible successor state. Shift actions for // terminal lookaheads can conflict with Reduce actions added previously. 
for (auto &pair : terminal_successors) { - Symbol::Index lookahead = pair.first; + Symbol lookahead = pair.first; ParseItemSet &next_item_set = pair.second; ParseStateId next_state_id = add_parse_state(next_item_set); ParseState &state = parse_table.states[state_id]; @@ -223,7 +226,7 @@ class ParseTableBuilder { if (!allow_any_conflict) { if (had_existing_action) lookaheads_with_conflicts.insert(lookahead); - recovery_states[Symbol(lookahead, true)].add(next_item_set); + recovery_states[lookahead].add(next_item_set); } } @@ -234,10 +237,10 @@ class ParseTableBuilder { ParseStateId next_state = add_parse_state(next_item_set); parse_table.set_nonterminal_action(state_id, lookahead, next_state); if (!allow_any_conflict) - recovery_states[Symbol(lookahead, false)].add(next_item_set); + recovery_states[Symbol(lookahead, Symbol::NonTerminal)].add(next_item_set); } - for (Symbol::Index lookahead : lookaheads_with_conflicts) { + for (Symbol lookahead : lookaheads_with_conflicts) { string conflict = handle_conflict(item_set, state_id, lookahead); if (!conflict.empty()) return conflict; } @@ -245,9 +248,9 @@ class ParseTableBuilder { ParseAction shift_extra = ParseAction::ShiftExtra(); ParseState &state = parse_table.states[state_id]; for (const Symbol &extra_symbol : grammar.extra_tokens) { - if (!state.terminal_entries.count(extra_symbol.index) || + if (!state.terminal_entries.count(extra_symbol) || state.has_shift_action() || allow_any_conflict) { - parse_table.add_terminal_action(state_id, extra_symbol.index, shift_extra); + parse_table.add_terminal_action(state_id, extra_symbol, shift_extra); } } @@ -257,7 +260,6 @@ class ParseTableBuilder { void mark_fragile_actions() { for (ParseState &state : parse_table.states) { for (auto &entry : state.terminal_entries) { - const Symbol symbol(entry.first, true); auto &actions = entry.second.actions; for (ParseAction &action : actions) { @@ -359,7 +361,7 @@ class ParseTableBuilder { } string handle_conflict(const ParseItemSet 
&item_set, ParseStateId state_id, - Symbol::Index lookahead) { + Symbol lookahead) { ParseTableEntry &entry = parse_table.states[state_id].terminal_entries[lookahead]; int reduction_precedence = entry.actions.front().precedence(); set shift_items; @@ -468,7 +470,7 @@ class ParseTableBuilder { description += " " + symbol_name(earliest_starting_item.production->at(i).symbol); } - description += " \u2022 " + symbol_name(Symbol(lookahead, true)) + " \u2026"; + description += " \u2022 " + symbol_name(lookahead) + " \u2026"; description += "\n\n"; description += "Possible interpretations:\n\n"; @@ -487,7 +489,7 @@ class ParseTableBuilder { description += " " + symbol_name(step.symbol); } description += ")"; - description += " \u2022 " + symbol_name(Symbol(lookahead, true)) + " \u2026"; + description += " \u2022 " + symbol_name(lookahead) + " \u2026"; description += "\n"; } } @@ -564,14 +566,22 @@ class ParseTableBuilder { return "END_OF_INPUT"; else return ""; - } else if (symbol.is_token) { - const Variable &variable = lexical_grammar.variables[symbol.index]; - if (variable.type == VariableTypeNamed) - return variable.name; - else - return "'" + variable.name + "'"; - } else { - return grammar.variables[symbol.index].name; + } + + switch (symbol.type) { + case Symbol::Terminal: { + const Variable &variable = lexical_grammar.variables[symbol.index]; + if (variable.type == VariableTypeNamed) + return variable.name; + else + return "'" + variable.name + "'"; + } + case Symbol::NonTerminal: { + return grammar.variables[symbol.index].name; + } + case Symbol::External: { + return grammar.external_tokens[symbol.index]; + } } } diff --git a/src/compiler/build_tables/lookahead_set.cc b/src/compiler/build_tables/lookahead_set.cc index 1ecb0baf..239bc029 100644 --- a/src/compiler/build_tables/lookahead_set.cc +++ b/src/compiler/build_tables/lookahead_set.cc @@ -12,8 +12,8 @@ using rules::Symbol; LookaheadSet::LookaheadSet() : entries(nullptr) {} -LookaheadSet::LookaheadSet(const 
set &symbols) - : entries(make_shared>(symbols)) {} +LookaheadSet::LookaheadSet(const set &symbols) + : entries(make_shared>(symbols)) {} bool LookaheadSet::empty() const { return !entries.get() || entries->empty(); @@ -23,7 +23,7 @@ bool LookaheadSet::operator==(const LookaheadSet &other) const { return *entries == *other.entries; } -bool LookaheadSet::contains(const Symbol::Index &symbol) const { +bool LookaheadSet::contains(const Symbol &symbol) const { return entries->find(symbol) != entries->end(); } @@ -31,15 +31,15 @@ bool LookaheadSet::insert_all(const LookaheadSet &other) { if (!other.entries.get()) return false; if (!entries.get()) - entries = make_shared>(); + entries = make_shared>(); size_t previous_size = entries->size(); entries->insert(other.entries->begin(), other.entries->end()); return entries->size() > previous_size; } -bool LookaheadSet::insert(const Symbol::Index &symbol) { +bool LookaheadSet::insert(const Symbol &symbol) { if (!entries.get()) - entries = make_shared>(); + entries = make_shared>(); return entries->insert(symbol).second; } diff --git a/src/compiler/build_tables/lookahead_set.h b/src/compiler/build_tables/lookahead_set.h index fe99b4d5..e62ee34d 100644 --- a/src/compiler/build_tables/lookahead_set.h +++ b/src/compiler/build_tables/lookahead_set.h @@ -11,15 +11,15 @@ namespace build_tables { class LookaheadSet { public: LookaheadSet(); - explicit LookaheadSet(const std::set &); + explicit LookaheadSet(const std::set &); bool empty() const; bool operator==(const LookaheadSet &) const; - bool contains(const rules::Symbol::Index &) const; + bool contains(const rules::Symbol &) const; bool insert_all(const LookaheadSet &); - bool insert(const rules::Symbol::Index &); + bool insert(const rules::Symbol &); - std::shared_ptr> entries; + std::shared_ptr> entries; }; } // namespace build_tables diff --git a/src/compiler/build_tables/parse_item.cc b/src/compiler/build_tables/parse_item.cc index 39b131cb..b9c3831b 100644 --- 
a/src/compiler/build_tables/parse_item.cc +++ b/src/compiler/build_tables/parse_item.cc @@ -41,7 +41,7 @@ bool ParseItem::operator<(const ParseItem &other) const { } Symbol ParseItem::lhs() const { - return Symbol(variable_index); + return Symbol(variable_index, Symbol::NonTerminal); } bool ParseItem::is_done() const { @@ -105,38 +105,6 @@ size_t ParseItemSet::unfinished_item_signature() const { return result; } -ParseItemSet::ActionMap ParseItemSet::actions() const { - ParseItemSet::ActionMap result; - - for (const auto &pair : entries) { - const ParseItem &item = pair.first; - const LookaheadSet &lookahead_symbols = pair.second; - - if (item.step_index == item.production->size()) { - int precedence = item.precedence(); - for (const Symbol::Index lookahead : *lookahead_symbols.entries) { - Action &action = result.terminal_actions[lookahead]; - if (precedence > action.completion_precedence) { - action.completions.assign({ &item }); - } else if (precedence == action.completion_precedence) { - action.completions.push_back({ &item }); - } - } - } else { - Symbol symbol = item.production->at(item.step_index).symbol; - ParseItem new_item(item.lhs(), *item.production, item.step_index + 1); - - if (symbol.is_token) { - result.terminal_actions[symbol.index].continuation.entries[new_item] = lookahead_symbols; - } else { - result.nonterminal_continuations[symbol.index].entries[new_item] = lookahead_symbols; - } - } - } - - return result; -} - void ParseItemSet::add(const ParseItemSet &other) { for (const auto &pair : other.entries) entries[pair.first].insert_all(pair.second); diff --git a/src/compiler/build_tables/parse_item.h b/src/compiler/build_tables/parse_item.h index a091ac9d..a3785638 100644 --- a/src/compiler/build_tables/parse_item.h +++ b/src/compiler/build_tables/parse_item.h @@ -41,16 +41,6 @@ class ParseItemSet { ParseItemSet(); explicit ParseItemSet(const std::map &); - struct Completion; - struct Action; - - struct ActionMap { - std::map terminal_actions; - 
std::map nonterminal_continuations; - }; - - ActionMap actions() const; - bool operator==(const ParseItemSet &) const; void add(const ParseItemSet &); size_t unfinished_item_signature() const; @@ -58,22 +48,6 @@ class ParseItemSet { std::map entries; }; -struct ParseItemSet::Completion { - const ParseItem *item; - int precedence; - rules::Associativity associativity; - - bool operator<(const ParseItemSet::Completion &other) { - return precedence < other.precedence; - } -}; - -struct ParseItemSet::Action { - ParseItemSet continuation; - std::vector completions; - int completion_precedence; -}; - } // namespace build_tables } // namespace tree_sitter diff --git a/src/compiler/build_tables/parse_item_set_builder.cc b/src/compiler/build_tables/parse_item_set_builder.cc index 34b347fe..7e29efdf 100644 --- a/src/compiler/build_tables/parse_item_set_builder.cc +++ b/src/compiler/build_tables/parse_item_set_builder.cc @@ -27,12 +27,12 @@ ParseItemSetBuilder::ParseItemSetBuilder(const SyntaxGrammar &grammar, set processed_non_terminals; for (size_t i = 0, n = lexical_grammar.variables.size(); i < n; i++) { - Symbol symbol(i, true); - first_sets.insert({symbol, LookaheadSet({ static_cast(i) })}); + Symbol symbol(i, Symbol::Terminal); + first_sets.insert({symbol, LookaheadSet({ symbol })}); } for (size_t i = 0, n = grammar.variables.size(); i < n; i++) { - Symbol symbol(i); + Symbol symbol(i, Symbol::NonTerminal); LookaheadSet first_set; processed_non_terminals.clear(); @@ -42,10 +42,10 @@ ParseItemSetBuilder::ParseItemSetBuilder(const SyntaxGrammar &grammar, Symbol current_symbol = symbols_to_process.back(); symbols_to_process.pop_back(); - if (current_symbol.is_token) { - first_set.insert(current_symbol.index); + if (!current_symbol.is_non_terminal()) { + first_set.insert(current_symbol); } else if (processed_non_terminals.insert(current_symbol.index).second) { - for (const Production &production : grammar.productions(current_symbol)) { + for (const Production &production : 
grammar.variables[current_symbol.index].productions) { if (!production.empty()) { symbols_to_process.push_back(production[0].symbol); } @@ -59,11 +59,11 @@ ParseItemSetBuilder::ParseItemSetBuilder(const SyntaxGrammar &grammar, vector components_to_process; for (size_t i = 0, n = grammar.variables.size(); i < n; i++) { - Symbol symbol(i); + Symbol symbol(i, Symbol::NonTerminal); map> cache_entry; components_to_process.clear(); - for (const Production &production : grammar.productions(symbol)) { + for (const Production &production : grammar.variables[i].productions) { components_to_process.push_back(ParseItemSetComponent{ ParseItem(symbol, production, 0), LookaheadSet(), @@ -87,7 +87,7 @@ ParseItemSetBuilder::ParseItemSetBuilder(const SyntaxGrammar &grammar, if (component_is_new) { Symbol next_symbol = item.next_symbol(); - if (next_symbol.is_built_in() || next_symbol.is_token) + if (!next_symbol.is_non_terminal() || next_symbol.is_built_in()) continue; LookaheadSet next_lookaheads; @@ -102,7 +102,7 @@ ParseItemSetBuilder::ParseItemSetBuilder(const SyntaxGrammar &grammar, propagates_lookaheads = false; } - for (const Production &production : grammar.productions(next_symbol)) { + for (const Production &production : grammar.variables[next_symbol.index].productions) { components_to_process.push_back(ParseItemSetComponent{ ParseItem(next_symbol, production, 0), next_lookaheads, @@ -130,7 +130,7 @@ void ParseItemSetBuilder::apply_transitive_closure(ParseItemSet *item_set) { const LookaheadSet &lookaheads = pair.second; const Symbol &next_symbol = item.next_symbol(); - if (!next_symbol.is_token && !next_symbol.is_built_in()) { + if (next_symbol.is_non_terminal() && !next_symbol.is_built_in()) { LookaheadSet next_lookaheads; size_t next_step = item.step_index + 1; if (next_step == item.production->size()) { diff --git a/src/compiler/build_tables/recovery_tokens.cc b/src/compiler/build_tables/recovery_tokens.cc index 479de6b8..84b175bc 100644 --- 
a/src/compiler/build_tables/recovery_tokens.cc +++ b/src/compiler/build_tables/recovery_tokens.cc @@ -47,8 +47,8 @@ class FirstCharacters : public CharacterAggregator {}; class LastCharacters : public CharacterAggregator {}; class AllCharacters : public CharacterAggregator {}; -set recovery_tokens(const LexicalGrammar &grammar) { - set result; +set recovery_tokens(const LexicalGrammar &grammar) { + set result; AllCharacters all_separator_characters; for (const rule_ptr &separator : grammar.separators) @@ -79,7 +79,7 @@ set recovery_tokens(const LexicalGrammar &grammar) { !all_characters.result.intersects(all_separator_characters.result); if ((has_distinct_start && has_distinct_end) || has_no_separators) - result.insert(i); + result.insert(Symbol(i, Symbol::Terminal)); } return result; diff --git a/src/compiler/build_tables/recovery_tokens.h b/src/compiler/build_tables/recovery_tokens.h index 4873b5a9..c97a8cfd 100644 --- a/src/compiler/build_tables/recovery_tokens.h +++ b/src/compiler/build_tables/recovery_tokens.h @@ -11,7 +11,7 @@ struct LexicalGrammar; namespace build_tables { -std::set recovery_tokens(const LexicalGrammar &); +std::set recovery_tokens(const LexicalGrammar &); } // namespace build_tables } // namespace tree_sitter diff --git a/src/compiler/generate_code/c_code.cc b/src/compiler/generate_code/c_code.cc index b7058603..a5a9c17a 100644 --- a/src/compiler/generate_code/c_code.cc +++ b/src/compiler/generate_code/c_code.cc @@ -14,6 +14,7 @@ namespace tree_sitter { namespace generate_code { + using std::function; using std::map; using std::pair; @@ -22,6 +23,7 @@ using std::string; using std::to_string; using std::vector; using util::escape_char; +using rules::Symbol; static Variable EOF_ENTRY("end", VariableTypeNamed, rule_ptr()); @@ -73,9 +75,8 @@ class CCodeGenerator { const LexicalGrammar lexical_grammar; map sanitized_names; vector> parse_table_entries; - vector>> in_progress_symbols; + vector> external_token_id_sets; size_t 
next_parse_action_list_index; - size_t next_in_progress_symbol_list_index; public: CCodeGenerator(string name, const ParseTable &parse_table, @@ -87,19 +88,25 @@ class CCodeGenerator { lex_table(lex_table), syntax_grammar(syntax_grammar), lexical_grammar(lexical_grammar), - next_parse_action_list_index(0), - next_in_progress_symbol_list_index(0) {} + next_parse_action_list_index(0) {} string code() { buffer = ""; add_includes(); - add_state_and_symbol_counts(); + add_warning_pragma(); + add_stats(); add_symbol_enum(); add_symbol_names_list(); - add_symbol_node_types_list(); + add_symbol_metadata_list(); add_lex_function(); - add_lex_states_list(); + add_lex_modes_list(); + + if (!syntax_grammar.external_tokens.empty()) + add_external_token_enum(); + + add_external_token_symbol_map(); + add_external_scan_modes_list(); add_parse_table(); add_parser_export(); @@ -112,10 +119,17 @@ class CCodeGenerator { line(); } - void add_state_and_symbol_counts() { + void add_warning_pragma() { + line("#pragma GCC diagnostic push"); + line("#pragma GCC diagnostic ignored \"-Wmissing-field-initializers\""); + line(); + } + + void add_stats() { line("#define STATE_COUNT " + to_string(parse_table.states.size())); line("#define SYMBOL_COUNT " + to_string(parse_table.symbols.size())); line("#define TOKEN_COUNT " + to_string(lexical_grammar.variables.size() + 1)); + line("#define EXTERNAL_TOKEN_COUNT " + to_string(syntax_grammar.external_tokens.size())); line(); } @@ -124,7 +138,7 @@ class CCodeGenerator { indent([&]() { size_t i = 1; for (const auto &entry : parse_table.symbols) { - const rules::Symbol &symbol = entry.first; + const Symbol &symbol = entry.first; if (!symbol.is_built_in()) { line(symbol_id(symbol) + " = " + to_string(i) + ","); i++; @@ -146,11 +160,11 @@ class CCodeGenerator { line(); } - void add_symbol_node_types_list() { + void add_symbol_metadata_list() { line("static const TSSymbolMetadata ts_symbol_metadata[SYMBOL_COUNT] = {"); indent([&]() { for (const auto &entry 
: parse_table.symbols) { - const rules::Symbol &symbol = entry.first; + const Symbol &symbol = entry.first; line("[" + symbol_id(symbol) + "] = {"); indent([&]() { switch (symbol_type(symbol)) { @@ -198,13 +212,80 @@ class CCodeGenerator { line(); } - void add_lex_states_list() { - line("static TSStateId ts_lex_states[STATE_COUNT] = {"); + void add_lex_modes_list() { + add_external_tokens_id({}); + + line("static TSLexMode ts_lex_modes[STATE_COUNT] = {"); indent([&]() { size_t state_id = 0; - for (const auto &state : parse_table.states) - line("[" + to_string(state_id++) + "] = " + - to_string(state.lex_state_id) + ","); + + for (const auto &state : parse_table.states) { + line("[" + to_string(state_id++) + "] = {.lex_state = "); + add(to_string(state.lex_state_id)); + + set external_token_indices; + for (const auto &pair : state.terminal_entries) { + Symbol symbol = pair.first; + if (symbol.is_external()) + external_token_indices.insert(symbol.index); + } + + if (!external_token_indices.empty()) + add(", .external_tokens = " + add_external_tokens_id(external_token_indices)); + add("},"); + } + }); + line("};"); + line(); + } + + string add_external_tokens_id(set external_token_ids) { + for (size_t i = 0, n = external_token_id_sets.size(); i < n; i++) + if (external_token_id_sets[i] == external_token_ids) + return to_string(i); + external_token_id_sets.push_back(external_token_ids); + return to_string(external_token_id_sets.size() - 1); + } + + void add_external_token_enum() { + line("enum {"); + indent([&]() { + for (size_t i = 0; i < syntax_grammar.external_tokens.size(); i++) + line(external_token_id(i) + ","); + }); + line("};"); + line(); + } + + void add_external_token_symbol_map() { + line("TSSymbol ts_external_token_symbol_map[EXTERNAL_TOKEN_COUNT] = {"); + indent([&]() { + for (size_t i = 0; i < syntax_grammar.external_tokens.size(); i++) { + line("[" + external_token_id(i) + "] = " + symbol_id(Symbol(i, Symbol::External)) + ","); + } + }); + line("};"); + 
line(); + } + + void add_external_scan_modes_list() { + line("static bool ts_external_token_lists["); + add(to_string(external_token_id_sets.size())); + add("][EXTERNAL_TOKEN_COUNT] = {"); + indent([&]() { + size_t i = 0; + for (const auto &external_token_ids : external_token_id_sets) { + if (!external_token_ids.empty()) { + line("[" + to_string(i) + "] = {"); + indent([&]() { + for (Symbol::Index id : external_token_ids) { + line("[" + external_token_id(id) + "] = true,"); + } + }); + line("},"); + } + i++; + } }); line("};"); line(); @@ -214,9 +295,6 @@ class CCodeGenerator { add_parse_action_list_id(ParseTableEntry{ {}, false, false }); size_t state_id = 0; - line("#pragma GCC diagnostic push"); - line("#pragma GCC diagnostic ignored \"-Wmissing-field-initializers\""); - line(); line("static unsigned short ts_parse_table[STATE_COUNT][SYMBOL_COUNT] = {"); indent([&]() { @@ -224,12 +302,12 @@ class CCodeGenerator { line("[" + to_string(state_id++) + "] = {"); indent([&]() { for (const auto &entry : state.nonterminal_entries) { - line("[" + symbol_id(rules::Symbol(entry.first)) + "] = STATE("); + line("[" + symbol_id(Symbol(entry.first, Symbol::NonTerminal)) + "] = STATE("); add(to_string(entry.second)); add("),"); } for (const auto &entry : state.terminal_entries) { - line("[" + symbol_id(rules::Symbol(entry.first, true)) + "] = ACTIONS("); + line("[" + symbol_id(entry.first) + "] = ACTIONS("); add(to_string(add_parse_action_list_id(entry.second))); add("),"); } @@ -242,12 +320,37 @@ class CCodeGenerator { line(); add_parse_action_list(); line(); - line("#pragma GCC diagnostic pop"); - line(); } void add_parser_export() { - line("EXPORT_LANGUAGE(ts_language_" + name + ");"); + if (!syntax_grammar.external_tokens.empty()) { + string external_scanner_name = "ts_language_" + name + "_external_scanner"; + + line("void *" + external_scanner_name + "_create();"); + line("bool " + external_scanner_name + "_scan();"); + line("void " + external_scanner_name + 
"_destroy();"); + line(); + + line("const TSLanguage *ts_language_" + name + "() {"); + indent([&]() { + if (!syntax_grammar.external_tokens.empty()) { + line("GET_LANGUAGE("); + indent([&]() { + line(external_scanner_name + "_create,"); + line(external_scanner_name + "_scan,"); + line(external_scanner_name + "_destroy,"); + }); + line(");"); + } + }); + line("}"); + } else { + line("const TSLanguage *ts_language_" + name + "() {"); + indent([&]() { + line("GET_LANGUAGE();"); + }); + line("}"); + } line(); } @@ -379,22 +482,13 @@ class CCodeGenerator { return result; } - size_t add_in_progress_symbol_list_id(const set &symbols) { - for (const auto &pair : in_progress_symbols) { - if (pair.second == symbols) { - return pair.first; - } - } - - size_t result = next_in_progress_symbol_list_index; - in_progress_symbols.push_back({ result, symbols }); - next_in_progress_symbol_list_index += 1 + symbols.size(); - return result; - } - // Helper functions - string symbol_id(const rules::Symbol &symbol) { + string external_token_id(Symbol::Index index) { + return "ts_external_token_" + syntax_grammar.external_tokens[index]; + } + + string symbol_id(const Symbol &symbol) { if (symbol == rules::END_OF_INPUT()) return "ts_builtin_sym_end"; @@ -411,25 +505,31 @@ class CCodeGenerator { } } - string symbol_name(const rules::Symbol &symbol) { + string symbol_name(const Symbol &symbol) { if (symbol == rules::END_OF_INPUT()) return "END"; return entry_for_symbol(symbol).first; } - VariableType symbol_type(const rules::Symbol &symbol) { + VariableType symbol_type(const Symbol &symbol) { if (symbol == rules::END_OF_INPUT()) return VariableTypeHidden; return entry_for_symbol(symbol).second; } - pair entry_for_symbol(const rules::Symbol &symbol) { - if (symbol.is_token) { - const Variable &variable = lexical_grammar.variables[symbol.index]; - return { variable.name, variable.type }; - } else { - const SyntaxVariable &variable = syntax_grammar.variables[symbol.index]; - return { 
variable.name, variable.type }; + pair entry_for_symbol(const Symbol &symbol) { + switch (symbol.type) { + case Symbol::NonTerminal: { + const SyntaxVariable &variable = syntax_grammar.variables[symbol.index]; + return { variable.name, variable.type }; + } + case Symbol::Terminal: { + const Variable &variable = lexical_grammar.variables[symbol.index]; + return { variable.name, variable.type }; + } + case Symbol::External: { + return { syntax_grammar.external_tokens[symbol.index], VariableTypeAnonymous }; + } } } diff --git a/src/compiler/grammar.h b/src/compiler/grammar.h index a8955c02..0a07280c 100644 --- a/src/compiler/grammar.h +++ b/src/compiler/grammar.h @@ -12,6 +12,7 @@ struct Grammar { std::vector> rules; std::vector extra_tokens; std::vector> expected_conflicts; + std::vector external_tokens; }; } // namespace tree_sitter diff --git a/src/compiler/parse_grammar.cc b/src/compiler/parse_grammar.cc index 185d919b..cc5cff55 100644 --- a/src/compiler/parse_grammar.cc +++ b/src/compiler/parse_grammar.cc @@ -119,6 +119,16 @@ ParseRuleResult parse_rule(json_value *rule_json) { } } + if (type == "EXTERNAL_TOKEN") { + json_value token_name_json = rule_json->operator[]("name"); + if (token_name_json.type != json_string) { + error_message = "External token name must be a string"; + goto error; + } + + return { external_token(token_name_json.u.string.ptr), "" }; + } + if (type == "PATTERN") { json_value value_json = rule_json->operator[]("value"); if (value_json.type == json_string) { @@ -210,7 +220,7 @@ ParseGrammarResult parse_grammar(const string &input) { string error_message; string name; Grammar grammar; - json_value name_json, rules_json, extras_json, conflicts_json; + json_value name_json, rules_json, extras_json, conflicts_json, external_tokens_json; json_settings settings = { 0, json_enable_comments, 0, 0, 0, 0 }; char parse_error[json_error_max]; @@ -302,6 +312,25 @@ ParseGrammarResult parse_grammar(const string &input) { } } + external_tokens_json = 
grammar_json->operator[]("externals"); + if (external_tokens_json.type != json_none) { + if (external_tokens_json.type != json_array) { + error_message = "External tokens must be an array"; + goto error; + } + + for (size_t i = 0, length = external_tokens_json.u.array.length; i < length; i++) { + json_value *token_name_json = external_tokens_json.u.array.values[i]; + if (token_name_json->type != json_string) { + error_message = "External token values must be strings"; + goto error; + } + + string token_name = token_name_json->u.string.ptr; + grammar.external_tokens.push_back(token_name); + } + } + json_value_free(grammar_json); return { name, grammar, "" }; diff --git a/src/compiler/parse_table.cc b/src/compiler/parse_table.cc index e6e4badd..a04eec8c 100644 --- a/src/compiler/parse_table.cc +++ b/src/compiler/parse_table.cc @@ -1,6 +1,7 @@ #include "compiler/parse_table.h" #include #include "compiler/precedence_range.h" +#include "compiler/rules/built_in_symbols.h" namespace tree_sitter { @@ -28,7 +29,7 @@ ParseAction::ParseAction() extra(false), fragile(false), state_index(-1), - symbol(Symbol(-1)), + symbol(rules::NONE()), consumed_symbol_count(0), production(nullptr) {} @@ -43,11 +44,11 @@ ParseAction ParseAction::Accept() { } ParseAction ParseAction::Shift(ParseStateId state_index) { - return ParseAction(ParseActionTypeShift, state_index, Symbol(-1), 0, nullptr); + return ParseAction(ParseActionTypeShift, state_index, rules::NONE(), 0, nullptr); } ParseAction ParseAction::Recover(ParseStateId state_index) { - return ParseAction(ParseActionTypeRecover, state_index, Symbol(-1), 0, + return ParseAction(ParseActionTypeRecover, state_index, rules::NONE(), 0, nullptr); } @@ -150,9 +151,7 @@ bool ParseState::has_shift_action() const { set ParseState::expected_inputs() const { set result; for (auto &entry : terminal_entries) - result.insert(Symbol(entry.first, true)); - for (auto &entry : nonterminal_entries) - result.insert(Symbol(entry.first, false)); + 
result.insert(entry.first); return result; } @@ -182,33 +181,24 @@ ParseStateId ParseTable::add_state() { return states.size() - 1; } -ParseAction &ParseTable::set_terminal_action(ParseStateId state_id, - Symbol::Index index, - ParseAction action) { - states[state_id].terminal_entries[index].actions.clear(); - return add_terminal_action(state_id, index, action); -} - ParseAction &ParseTable::add_terminal_action(ParseStateId state_id, - Symbol::Index index, + Symbol lookahead, ParseAction action) { - Symbol symbol(index, true); if (action.type == ParseActionTypeShift && action.extra) - symbols[symbol].extra = true; + symbols[lookahead].extra = true; else - symbols[symbol].structural = true; + symbols[lookahead].structural = true; - ParseTableEntry &entry = states[state_id].terminal_entries[index]; + ParseTableEntry &entry = states[state_id].terminal_entries[lookahead]; entry.actions.push_back(action); return *entry.actions.rbegin(); } void ParseTable::set_nonterminal_action(ParseStateId state_id, - Symbol::Index index, + Symbol::Index lookahead, ParseStateId next_state_id) { - Symbol symbol(index, false); - symbols[symbol].structural = true; - states[state_id].nonterminal_entries[index] = next_state_id; + symbols[Symbol(lookahead, Symbol::NonTerminal)].structural = true; + states[state_id].nonterminal_entries[lookahead] = next_state_id; } static bool has_entry(const ParseState &state, const ParseTableEntry &entry) { @@ -226,12 +216,12 @@ bool ParseTable::merge_state(size_t i, size_t j) { return false; for (auto &entry : state.terminal_entries) { - Symbol::Index index = entry.first; + Symbol lookahead = entry.first; const vector &actions = entry.second.actions; - const auto &other_entry = other.terminal_entries.find(index); + const auto &other_entry = other.terminal_entries.find(lookahead); if (other_entry == other.terminal_entries.end()) { - if (mergeable_symbols.count(index) == 0 && !Symbol::is_built_in(index)) + if (mergeable_symbols.count(lookahead) == 0 && 
!lookahead.is_built_in()) return false; if (actions.back().type != ParseActionTypeReduce) return false; @@ -242,25 +232,25 @@ bool ParseTable::merge_state(size_t i, size_t j) { } } - set symbols_to_merge; + set symbols_to_merge; for (auto &entry : other.terminal_entries) { - Symbol::Index index = entry.first; + Symbol lookahead = entry.first; const vector &actions = entry.second.actions; - if (!state.terminal_entries.count(index)) { - if (mergeable_symbols.count(index) == 0 && !Symbol::is_built_in(index)) + if (!state.terminal_entries.count(lookahead)) { + if (mergeable_symbols.count(lookahead) == 0 && !lookahead.is_built_in()) return false; if (actions.back().type != ParseActionTypeReduce) return false; if (!has_entry(state, entry.second)) return false; - symbols_to_merge.insert(index); + symbols_to_merge.insert(lookahead); } } - for (const Symbol::Index &index : symbols_to_merge) - state.terminal_entries[index] = other.terminal_entries.find(index)->second; + for (const Symbol &lookahead : symbols_to_merge) + state.terminal_entries[lookahead] = other.terminal_entries.find(lookahead)->second; return true; } diff --git a/src/compiler/parse_table.h b/src/compiler/parse_table.h index 59eee4a8..79eec4fc 100644 --- a/src/compiler/parse_table.h +++ b/src/compiler/parse_table.h @@ -76,7 +76,7 @@ class ParseState { void each_referenced_state(std::function); bool has_shift_action() const; - std::map terminal_entries; + std::map terminal_entries; std::map nonterminal_entries; LexStateId lex_state_id; size_t shift_actions_signature; @@ -91,15 +91,14 @@ class ParseTable { public: std::set all_symbols() const; ParseStateId add_state(); - ParseAction &add_terminal_action(ParseStateId state_id, int, ParseAction); - ParseAction &set_terminal_action(ParseStateId state_id, int index, ParseAction); - void set_nonterminal_action(ParseStateId state_id, int index, ParseStateId); + ParseAction &add_terminal_action(ParseStateId state_id, rules::Symbol, ParseAction); + void 
set_nonterminal_action(ParseStateId, rules::Symbol::Index, ParseStateId); bool merge_state(size_t i, size_t j); std::vector states; std::map symbols; - std::set mergeable_symbols; + std::set mergeable_symbols; }; } // namespace tree_sitter diff --git a/src/compiler/prepare_grammar/expand_repeats.cc b/src/compiler/prepare_grammar/expand_repeats.cc index 7963e94b..331c9cea 100644 --- a/src/compiler/prepare_grammar/expand_repeats.cc +++ b/src/compiler/prepare_grammar/expand_repeats.cc @@ -39,7 +39,7 @@ class ExpandRepeats : public rules::IdentityRuleFn { rule_ptr inner_rule = apply(rule->content); size_t index = aux_rules.size(); string helper_rule_name = rule_name + "_repeat" + to_string(++repeat_count); - Symbol repeat_symbol(offset + index); + Symbol repeat_symbol(offset + index, Symbol::NonTerminal); existing_repeats.push_back({ rule->copy(), repeat_symbol }); aux_rules.push_back( Variable(helper_rule_name, VariableTypeAuxiliary, @@ -65,6 +65,7 @@ InitialSyntaxGrammar expand_repeats(const InitialSyntaxGrammar &grammar) { result.variables = grammar.variables; result.extra_tokens = grammar.extra_tokens; result.expected_conflicts = grammar.expected_conflicts; + result.external_tokens = grammar.external_tokens; ExpandRepeats expander(result.variables.size()); for (auto &variable : result.variables) diff --git a/src/compiler/prepare_grammar/extract_tokens.cc b/src/compiler/prepare_grammar/extract_tokens.cc index bf7ac514..dcf88e53 100644 --- a/src/compiler/prepare_grammar/extract_tokens.cc +++ b/src/compiler/prepare_grammar/extract_tokens.cc @@ -11,6 +11,7 @@ #include "compiler/rules/symbol.h" #include "compiler/rules/string.h" #include "compiler/rules/metadata.h" +#include "compiler/rules/external_token.h" #include "compiler/rules/pattern.h" #include "compiler/prepare_grammar/token_description.h" #include "compiler/prepare_grammar/is_token.h" @@ -38,7 +39,7 @@ class SymbolReplacer : public rules::IdentityRuleFn { map replacements; Symbol replace_symbol(const Symbol 
&symbol) { - if (symbol.is_built_in() || symbol.is_token) + if (!symbol.is_non_terminal()) return symbol; auto replacement_pair = replacements.find(symbol); @@ -49,7 +50,7 @@ class SymbolReplacer : public rules::IdentityRuleFn { for (const auto &pair : replacements) if (pair.first.index < symbol.index) new_index--; - return Symbol(new_index); + return Symbol(new_index, Symbol::NonTerminal); } }; @@ -60,14 +61,14 @@ class TokenExtractor : public rules::IdentityRuleFn { for (size_t i = 0; i < tokens.size(); i++) if (tokens[i].rule->operator==(*input)) { token_usage_counts[i]++; - return make_shared(i, true); + return make_shared(i, Symbol::Terminal); } rule_ptr rule = input->copy(); size_t index = tokens.size(); tokens.push_back(Variable(token_description(rule), entry_type, rule)); token_usage_counts.push_back(1); - return make_shared(index, true); + return make_shared(index, Symbol::Terminal); } rule_ptr apply_to(const rules::String *rule) { @@ -78,6 +79,10 @@ class TokenExtractor : public rules::IdentityRuleFn { return apply_to_token(rule, VariableTypeAuxiliary); } + rule_ptr apply_to(const rules::ExternalToken *rule) { + return apply_to_token(rule, VariableTypeAuxiliary); + } + rule_ptr apply_to(const rules::Metadata *rule) { if (rule->params.is_token) return apply_to_token(rule->rule.get(), VariableTypeAuxiliary); @@ -90,7 +95,7 @@ class TokenExtractor : public rules::IdentityRuleFn { vector tokens; }; -static CompileError ubiq_token_err(const string &message) { +static CompileError extra_token_error(const string &message) { return CompileError(TSCompileErrorTypeInvalidUbiquitousToken, "Not a token: " + message); } @@ -122,11 +127,10 @@ tuple extract_tokens( size_t i = 0; for (const Variable &variable : processed_variables) { auto symbol = variable.rule->as(); - if (symbol && symbol->is_token && !symbol->is_built_in() && - extractor.token_usage_counts[symbol->index] == 1) { + if (symbol && symbol->is_token() && extractor.token_usage_counts[symbol->index] == 1) { 
lexical_grammar.variables[symbol->index].type = variable.type; lexical_grammar.variables[symbol->index].name = variable.name; - symbol_replacer.replacements.insert({ Symbol(i), *symbol }); + symbol_replacer.replacements.insert({ Symbol(i, Symbol::NonTerminal), *symbol }); } else { syntax_grammar.variables.push_back(variable); } @@ -158,7 +162,7 @@ tuple extract_tokens( bool used_elsewhere_in_grammar = false; for (const Variable &variable : lexical_grammar.variables) { if (variable.rule->operator==(*rule)) { - syntax_grammar.extra_tokens.insert(Symbol(i, true)); + syntax_grammar.extra_tokens.insert(Symbol(i, Symbol::Terminal)); used_elsewhere_in_grammar = true; } i++; @@ -175,17 +179,20 @@ tuple extract_tokens( auto symbol = rule->as(); if (!symbol) return make_tuple(syntax_grammar, lexical_grammar, - ubiq_token_err(rule->to_string())); + extra_token_error(rule->to_string())); Symbol new_symbol = symbol_replacer.replace_symbol(*symbol); - if (!new_symbol.is_token) + if (!new_symbol.is_token()) { return make_tuple( syntax_grammar, lexical_grammar, - ubiq_token_err(syntax_grammar.variables[new_symbol.index].name)); + extra_token_error(syntax_grammar.variables[new_symbol.index].name)); + } syntax_grammar.extra_tokens.insert(new_symbol); } + syntax_grammar.external_tokens = grammar.external_tokens; + return make_tuple(syntax_grammar, lexical_grammar, CompileError::none()); } diff --git a/src/compiler/prepare_grammar/flatten_grammar.cc b/src/compiler/prepare_grammar/flatten_grammar.cc index ddba9a5f..8ac0e33c 100644 --- a/src/compiler/prepare_grammar/flatten_grammar.cc +++ b/src/compiler/prepare_grammar/flatten_grammar.cc @@ -92,6 +92,7 @@ pair flatten_grammar(const InitialSyntaxGrammar &gr SyntaxGrammar result; result.expected_conflicts = grammar.expected_conflicts; result.extra_tokens = grammar.extra_tokens; + result.external_tokens = grammar.external_tokens; bool is_start = true; for (const Variable &variable : grammar.variables) { diff --git 
a/src/compiler/prepare_grammar/initial_syntax_grammar.h b/src/compiler/prepare_grammar/initial_syntax_grammar.h index fe1ff37d..d4b1c8d5 100644 --- a/src/compiler/prepare_grammar/initial_syntax_grammar.h +++ b/src/compiler/prepare_grammar/initial_syntax_grammar.h @@ -1,13 +1,12 @@ #ifndef COMPILER_PREPARE_GRAMMAR_INITIAL_SYNTAX_GRAMMAR_H_ #define COMPILER_PREPARE_GRAMMAR_INITIAL_SYNTAX_GRAMMAR_H_ -#include -#include #include +#include #include "tree_sitter/compiler.h" #include "compiler/rules/symbol.h" -#include "compiler/variable.h" #include "compiler/syntax_grammar.h" +#include "compiler/variable.h" namespace tree_sitter { namespace prepare_grammar { @@ -16,6 +15,7 @@ struct InitialSyntaxGrammar { std::vector variables; std::set extra_tokens; std::set expected_conflicts; + std::vector external_tokens; }; } // namespace prepare_grammar diff --git a/src/compiler/prepare_grammar/intern_symbols.cc b/src/compiler/prepare_grammar/intern_symbols.cc index cd01719c..f08edf5e 100644 --- a/src/compiler/prepare_grammar/intern_symbols.cc +++ b/src/compiler/prepare_grammar/intern_symbols.cc @@ -7,6 +7,7 @@ #include "compiler/rules/visitor.h" #include "compiler/rules/blank.h" #include "compiler/rules/named_symbol.h" +#include "compiler/rules/external_token.h" #include "compiler/rules/symbol.h" namespace tree_sitter { @@ -17,6 +18,7 @@ using std::vector; using std::set; using std::pair; using std::make_shared; +using rules::Symbol; class InternSymbols : public rules::IdentityRuleFn { using rules::IdentityRuleFn::apply_to; @@ -30,17 +32,34 @@ class InternSymbols : public rules::IdentityRuleFn { return result; } + rule_ptr apply_to(const rules::ExternalToken *rule) { + auto result = symbol_for_external_token(rule->name); + if (!result.get()) { + missing_external_token_name = rule->name; + return rules::Blank::build(); + } + return result; + } + public: std::shared_ptr symbol_for_rule_name(string rule_name) { for (size_t i = 0; i < grammar.rules.size(); i++) if 
(grammar.rules[i].first == rule_name) - return make_shared(i); + return make_shared(i, Symbol::NonTerminal); + return nullptr; + } + + std::shared_ptr symbol_for_external_token(string name) { + for (size_t i = 0; i < grammar.external_tokens.size(); i++) + if (grammar.external_tokens[i] == name) + return make_shared(i, Symbol::External); return nullptr; } explicit InternSymbols(const Grammar &grammar) : grammar(grammar) {} const Grammar grammar; string missing_rule_name; + string missing_external_token_name; }; CompileError missing_rule_error(string rule_name) { @@ -48,14 +67,22 @@ CompileError missing_rule_error(string rule_name) { "Undefined rule '" + rule_name + "'"); } +CompileError missing_external_token_error(string token_name) { + return CompileError(TSCompileErrorTypeUndefinedSymbol, + "Undefined external token '" + token_name + "'"); +} + pair intern_symbols(const Grammar &grammar) { InternedGrammar result; + result.external_tokens = grammar.external_tokens; InternSymbols interner(grammar); for (auto &pair : grammar.rules) { auto new_rule = interner.apply(pair.second); if (!interner.missing_rule_name.empty()) return { result, missing_rule_error(interner.missing_rule_name) }; + if (!interner.missing_external_token_name.empty()) + return { result, missing_external_token_error(interner.missing_external_token_name) }; result.variables.push_back(Variable( pair.first, pair.first[0] == '_' ? 
VariableTypeHidden : VariableTypeNamed, @@ -66,6 +93,8 @@ pair intern_symbols(const Grammar &grammar) { auto new_rule = interner.apply(rule); if (!interner.missing_rule_name.empty()) return { result, missing_rule_error(interner.missing_rule_name) }; + if (!interner.missing_external_token_name.empty()) + return { result, missing_external_token_error(interner.missing_external_token_name) }; result.extra_tokens.push_back(new_rule); } diff --git a/src/compiler/prepare_grammar/interned_grammar.h b/src/compiler/prepare_grammar/interned_grammar.h index c08c07dd..7b425c3a 100644 --- a/src/compiler/prepare_grammar/interned_grammar.h +++ b/src/compiler/prepare_grammar/interned_grammar.h @@ -15,6 +15,7 @@ struct InternedGrammar { std::vector variables; std::vector extra_tokens; std::set expected_conflicts; + std::vector external_tokens; }; } // namespace prepare_grammar diff --git a/src/compiler/rules.h b/src/compiler/rules.h index d98a719a..8a3f4097 100644 --- a/src/compiler/rules.h +++ b/src/compiler/rules.h @@ -22,6 +22,7 @@ rule_ptr prec_left(int precedence, const rule_ptr &); rule_ptr prec_right(const rule_ptr &); rule_ptr prec_right(int precedence, const rule_ptr &); rule_ptr token(const rule_ptr &rule); +rule_ptr external_token(const std::string &); } // namespace std diff --git a/src/compiler/rules/built_in_symbols.cc b/src/compiler/rules/built_in_symbols.cc index a7a877ec..b3f7cd66 100644 --- a/src/compiler/rules/built_in_symbols.cc +++ b/src/compiler/rules/built_in_symbols.cc @@ -4,15 +4,15 @@ namespace tree_sitter { namespace rules { Symbol END_OF_INPUT() { - return Symbol(-1, true); + return Symbol(-1, Symbol::Terminal); } Symbol START() { - return Symbol(-2); + return Symbol(-2, Symbol::NonTerminal); } Symbol NONE() { - return Symbol(-3); + return Symbol(-3, Symbol::NonTerminal); } } // namespace rules diff --git a/src/compiler/rules/external_token.cc b/src/compiler/rules/external_token.cc new file mode 100644 index 00000000..d8487b0e --- /dev/null +++ 
b/src/compiler/rules/external_token.cc @@ -0,0 +1,39 @@ +#include "compiler/rules/external_token.h" +#include +#include "compiler/rules/visitor.h" + +namespace tree_sitter { +namespace rules { + +using std::string; +using std::hash; + +ExternalToken::ExternalToken(const string &name) : name(name) {} + +rule_ptr ExternalToken::build(const string &name) { + return std::make_shared(name); +} + +bool ExternalToken::operator==(const Rule &rule) const { + auto other = rule.as(); + return other && other->name == name; +} + +size_t ExternalToken::hash_code() const { + return hash()(name); +} + +rule_ptr ExternalToken::copy() const { + return std::make_shared(*this); +} + +string ExternalToken::to_string() const { + return string("(sym '") + name + "')"; +} + +void ExternalToken::accept(Visitor *visitor) const { + visitor->visit(this); +} + +} // namespace rules +} // namespace tree_sitter diff --git a/src/compiler/rules/external_token.h b/src/compiler/rules/external_token.h new file mode 100644 index 00000000..cec1a847 --- /dev/null +++ b/src/compiler/rules/external_token.h @@ -0,0 +1,27 @@ +#ifndef COMPILER_RULES_EXTERNAL_TOKEN_H_ +#define COMPILER_RULES_EXTERNAL_TOKEN_H_ + +#include +#include "compiler/rule.h" + +namespace tree_sitter { +namespace rules { + +class ExternalToken : public Rule { + public: + explicit ExternalToken(const std::string &); + static rule_ptr build(const std::string &); + + bool operator==(const Rule &other) const; + size_t hash_code() const; + rule_ptr copy() const; + std::string to_string() const; + void accept(Visitor *visitor) const; + + std::string name; +}; + +} // namespace rules +} // namespace tree_sitter + +#endif // COMPILER_RULES_EXTERNAL_TOKEN_H_ diff --git a/src/compiler/rules/rules.cc b/src/compiler/rules/rules.cc index fdb0ebdf..73c37284 100644 --- a/src/compiler/rules/rules.cc +++ b/src/compiler/rules/rules.cc @@ -13,6 +13,7 @@ #include "compiler/rules/pattern.h" #include "compiler/rules/character_set.h" #include 
"compiler/rules/repeat.h" +#include "compiler/rules/external_token.h" #include "compiler/rules/built_in_symbols.h" namespace tree_sitter { @@ -105,4 +106,8 @@ rule_ptr token(const rule_ptr &rule) { return metadata(rule, params); } +rule_ptr external_token(const string &name) { + return rules::ExternalToken::build(name); +} + } // namespace tree_sitter diff --git a/src/compiler/rules/symbol.cc b/src/compiler/rules/symbol.cc index f85b09c7..478de7cf 100644 --- a/src/compiler/rules/symbol.cc +++ b/src/compiler/rules/symbol.cc @@ -11,12 +11,10 @@ using std::string; using std::to_string; using util::hash_combine; -Symbol::Symbol(Symbol::Index index) : index(index), is_token(false) {} - -Symbol::Symbol(Symbol::Index index, bool is_token) : index(index), is_token(is_token) {} +Symbol::Symbol(Symbol::Index index, Symbol::Type type) : index(index), type(type) {} bool Symbol::operator==(const Symbol &other) const { - return (other.index == index) && (other.is_token == is_token); + return (other.index == index) && (other.type == type); } bool Symbol::operator==(const Rule &rule) const { @@ -27,7 +25,7 @@ bool Symbol::operator==(const Rule &rule) const { size_t Symbol::hash_code() const { size_t result = 0; hash_combine(&result, index); - hash_combine(&result, is_token); + hash_combine(&result, type); return result; } @@ -36,14 +34,20 @@ rule_ptr Symbol::copy() const { } string Symbol::to_string() const { - string name = is_token ? 
"token" : "sym"; - return "(" + name + " " + std::to_string(index) + ")"; + switch (type) { + case Symbol::Terminal: + return "(terminal " + std::to_string(index) + ")"; + case Symbol::NonTerminal: + return "(non-terminal " + std::to_string(index) + ")"; + case Symbol::External: + return "(external " + std::to_string(index) + ")"; + } } bool Symbol::operator<(const Symbol &other) const { - if (is_token && !other.is_token) + if (type < other.type) return true; - if (!is_token && other.is_token) + if (other.type < type) return false; return (index < other.index); } @@ -56,6 +60,18 @@ bool Symbol::is_built_in() const { return is_built_in(index); } +bool Symbol::is_token() const { + return type == Symbol::Terminal; +} + +bool Symbol::is_external() const { + return type == Symbol::External; +} + +bool Symbol::is_non_terminal() const { + return type == Symbol::NonTerminal; +} + void Symbol::accept(Visitor *visitor) const { visitor->visit(this); } diff --git a/src/compiler/rules/symbol.h b/src/compiler/rules/symbol.h index 4ae9ece3..46272dc5 100644 --- a/src/compiler/rules/symbol.h +++ b/src/compiler/rules/symbol.h @@ -11,9 +11,13 @@ class Symbol : public Rule { public: typedef int Index; + typedef enum { + Terminal, + NonTerminal, + External, + } Type; - explicit Symbol(Index index); - Symbol(Index index, bool is_token); + Symbol(Index index, Type type); bool operator==(const Symbol &other) const; bool operator==(const Rule &other) const; @@ -26,9 +30,12 @@ class Symbol : public Rule { bool operator<(const Symbol &other) const; static bool is_built_in(Index); bool is_built_in() const; + bool is_token() const; + bool is_external() const; + bool is_non_terminal() const; Index index; - bool is_token; + Type type; }; } // namespace rules diff --git a/src/compiler/rules/visitor.h b/src/compiler/rules/visitor.h index b8301183..c75e31dc 100644 --- a/src/compiler/rules/visitor.h +++ b/src/compiler/rules/visitor.h @@ -16,6 +16,7 @@ class String; class Symbol; class Pattern; class 
Metadata; +class ExternalToken; class Visitor { public: @@ -29,6 +30,7 @@ class Visitor { virtual void visit(const String *rule) = 0; virtual void visit(const NamedSymbol *rule) = 0; virtual void visit(const Symbol *rule) = 0; + virtual void visit(const ExternalToken *rule) = 0; virtual ~Visitor(); }; @@ -86,6 +88,10 @@ class RuleFn : private Visitor { return default_apply((const Rule *)rule); } + virtual T apply_to(const ExternalToken *rule) { + return default_apply((const Rule *)rule); + } + void visit(const Blank *rule) { value_ = apply_to(rule); } @@ -126,6 +132,10 @@ class RuleFn : private Visitor { value_ = apply_to(rule); } + void visit(const ExternalToken *rule) { + value_ = apply_to(rule); + } + private: T value_; }; @@ -170,6 +180,9 @@ class RuleFn : private Visitor { virtual void apply_to(const Symbol *rule) { return default_apply((const Rule *)rule); } + virtual void apply_to(const ExternalToken *rule) { + return default_apply((const Rule *)rule); + } void visit(const Blank *rule) { apply_to(rule); @@ -201,6 +214,9 @@ class RuleFn : private Visitor { void visit(const Symbol *rule) { apply_to(rule); } + void visit(const ExternalToken *rule) { + apply_to(rule); + } }; class IdentityRuleFn : public RuleFn { diff --git a/src/compiler/syntax_grammar.cc b/src/compiler/syntax_grammar.cc index 706ec828..535ddcda 100644 --- a/src/compiler/syntax_grammar.cc +++ b/src/compiler/syntax_grammar.cc @@ -13,8 +13,6 @@ using std::pair; using std::vector; using std::set; -static const vector NO_PRODUCTIONS; - SyntaxVariable::SyntaxVariable(const string &name, VariableType type, const vector &productions) : name(name), productions(productions), type(type) {} @@ -28,13 +26,4 @@ bool ProductionStep::operator==(const ProductionStep &other) const { associativity == other.associativity; } -const vector &SyntaxGrammar::productions( - const rules::Symbol &symbol) const { - if (symbol.is_built_in() || symbol.is_token) { - return NO_PRODUCTIONS; - } else { - return 
variables[symbol.index].productions; - } -} - } // namespace tree_sitter diff --git a/src/compiler/syntax_grammar.h b/src/compiler/syntax_grammar.h index 89745fa5..e34ddbbe 100644 --- a/src/compiler/syntax_grammar.h +++ b/src/compiler/syntax_grammar.h @@ -33,11 +33,10 @@ struct SyntaxVariable { typedef std::set ConflictSet; struct SyntaxGrammar { - const std::vector &productions(const rules::Symbol &) const; - std::vector variables; std::set extra_tokens; std::set expected_conflicts; + std::vector external_tokens; }; } // namespace tree_sitter diff --git a/src/runtime/parser.c b/src/runtime/parser.c index 2f5879a4..c37b7871 100644 --- a/src/runtime/parser.c +++ b/src/runtime/parser.c @@ -161,7 +161,7 @@ static void parser__pop_reusable_node_leaf(ReusableNode *reusable_node) { static bool parser__can_reuse(Parser *self, TSStateId state, Tree *tree, TableEntry *table_entry) { - if (tree->first_leaf.lex_state == self->language->lex_states[state]) + if (tree->first_leaf.lex_state == self->language->lex_modes[state].lex_state) return true; if (!table_entry->is_reusable) return false; @@ -209,7 +209,7 @@ static bool parser__condense_stack(Parser *self) { } static Tree *parser__lex(Parser *self, TSStateId parse_state) { - TSStateId start_state = self->language->lex_states[parse_state]; + TSStateId start_state = self->language->lex_modes[parse_state].lex_state; TSStateId current_state = start_state; Length start_position = self->lexer.current_position; LOG("lex state:%d", start_state); @@ -729,6 +729,9 @@ static void parser__start(Parser *self, TSInput input, Tree *previous_tree) { LOG("new_parse"); } + if (self->language->external_scanner.create) + self->language->external_scanner.create(); + ts_lexer_set_input(&self->lexer, input); ts_stack_clear(self->stack); self->reusable_node = (ReusableNode){ previous_tree, 0 }; From 0f8e130687bad442b4c50c2cbeec15c4977ea68a Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 2 Dec 2016 22:03:48 -0800 Subject: [PATCH 03/50] Call 
external scanner functions when lexing --- include/tree_sitter/parser.h | 5 +- .../prepare_grammar/extract_tokens_spec.cc | 8 +- .../external_scanners/external_scan.c | 105 +++++++++++++++++- spec/integration/compile_grammar_spec.cc | 6 +- src/compiler/generate_code/c_code.cc | 4 +- src/compiler/rules/symbol.h | 2 +- src/runtime/document.c | 2 +- src/runtime/language.h | 6 + src/runtime/lexer.c | 4 +- src/runtime/lexer.h | 2 +- src/runtime/parser.c | 52 ++++++--- src/runtime/parser.h | 2 + 12 files changed, 164 insertions(+), 34 deletions(-) diff --git a/include/tree_sitter/parser.h b/include/tree_sitter/parser.h index a335dd6d..e099fd7f 100644 --- a/include/tree_sitter/parser.h +++ b/include/tree_sitter/parser.h @@ -65,6 +65,7 @@ typedef union { typedef struct TSLanguage { uint32_t symbol_count; uint32_t token_count; + uint32_t external_token_count; const char **symbol_names; const TSSymbolMetadata *symbol_metadata; const unsigned short *parse_table; @@ -75,7 +76,7 @@ typedef struct TSLanguage { const bool *external_token_lists; struct { void * (*create)(); - bool (*scan)(TSLexer *, const bool *symbol_whitelist); + bool (*scan)(void *, TSLexer *, const bool *symbol_whitelist); void (*destroy)(void *); } external_scanner; } TSLanguage; @@ -158,7 +159,6 @@ typedef struct TSLanguage { { .type = TSParseActionTypeAccept } \ } - #define GET_LANGUAGE(...) 
\ static TSLanguage language = { \ .symbol_count = SYMBOL_COUNT, \ @@ -169,6 +169,7 @@ typedef struct TSLanguage { .lex_modes = ts_lex_modes, \ .symbol_names = ts_symbol_names, \ .lex_fn = ts_lex, \ + .external_token_count = EXTERNAL_TOKEN_COUNT, \ .external_token_lists = (const bool *)ts_external_token_lists, \ .external_token_symbol_map = ts_external_token_symbol_map, \ .external_scanner = {__VA_ARGS__} \ diff --git a/spec/compiler/prepare_grammar/extract_tokens_spec.cc b/spec/compiler/prepare_grammar/extract_tokens_spec.cc index 577dead1..613d31cc 100644 --- a/spec/compiler/prepare_grammar/extract_tokens_spec.cc +++ b/spec/compiler/prepare_grammar/extract_tokens_spec.cc @@ -5,6 +5,7 @@ #include "compiler/prepare_grammar/extract_tokens.h" #include "helpers/rule_helpers.h" #include "helpers/equals_pointer.h" +#include "helpers/stream_methods.h" START_TEST @@ -211,9 +212,10 @@ describe("extract_tokens", []() { }, { choice({ i_sym(1), blank() }) }, {}}); AssertThat(get<2>(result), !Equals(CompileError::none())); - AssertThat(get<2>(result), Equals( - CompileError(TSCompileErrorTypeInvalidUbiquitousToken, - "Not a token: (choice (sym 1) (blank))"))); + AssertThat(get<2>(result), Equals(CompileError( + TSCompileErrorTypeInvalidUbiquitousToken, + "Not a token: (choice (non-terminal 1) (blank))" + ))); }); }); }); diff --git a/spec/fixtures/external_scanners/external_scan.c b/spec/fixtures/external_scanners/external_scan.c index 7abab3ae..41ef3706 100644 --- a/spec/fixtures/external_scanners/external_scan.c +++ b/spec/fixtures/external_scanners/external_scan.c @@ -1,13 +1,108 @@ #include +#include + +enum { + percent_string, + percent_string_start, + percent_string_end +}; + +typedef struct { + int32_t open_delimiter; + int32_t close_delimiter; + uint32_t depth; +} Scanner; void *ts_language_external_scanner_example_external_scanner_create() { - puts("HELLO FROM EXTERNAL SCANNER"); - return 0; + Scanner *scanner = malloc(sizeof(Scanner)); + *scanner = (Scanner){ + 
.open_delimiter = 0, + .close_delimiter = 0, + .depth = 0 + }; + return scanner; } -bool ts_language_external_scanner_example_external_scanner_scan() { - return true; +bool ts_language_external_scanner_example_external_scanner_scan( + void *payload, TSLexer *lexer, const bool *whitelist) { + Scanner *scanner = payload; + + if (whitelist[percent_string]) { + while (lexer->lookahead == ' ' || + lexer->lookahead == '\t' || + lexer->lookahead == '\n') { + lexer->advance(lexer, 0, true); + } + + if (lexer->lookahead != '%') return false; + lexer->advance(lexer, 0, false); + + switch (lexer->lookahead) { + case '(': + scanner->open_delimiter = '('; + scanner->close_delimiter = ')'; + scanner->depth = 1; + break; + case '[': + scanner->open_delimiter = '['; + scanner->close_delimiter = ']'; + scanner->depth = 1; + break; + case '{': + scanner->open_delimiter = '{'; + scanner->close_delimiter = '}'; + scanner->depth = 1; + break; + default: + return false; + } + + lexer->advance(lexer, 0, false); + + for (;;) { + if (scanner->depth == 0) { + lexer->result_symbol = percent_string; + return true; + } + + if (lexer->lookahead == scanner->open_delimiter) { + scanner->depth++; + } else if (lexer->lookahead == scanner->close_delimiter) { + scanner->depth--; + } else if (lexer->lookahead == '#') { + lexer->advance(lexer, 0, false); + if (lexer->lookahead == '{') { + lexer->advance(lexer, 0, false); + lexer->result_symbol = percent_string_start; + return true; + } + } + + lexer->advance(lexer, 0, false); + } + } else if (whitelist[percent_string_end]) { + if (lexer->lookahead != '}') return false; + lexer->advance(lexer, 0, false); + + for (;;) { + if (scanner->depth == 0) { + lexer->result_symbol = percent_string_end; + return true; + } + + if (lexer->lookahead == scanner->open_delimiter) { + scanner->depth++; + } else if (lexer->lookahead == scanner->close_delimiter) { + scanner->depth--; + } + + lexer->advance(lexer, 0, false); + } + } + + return false; } -void 
ts_language_external_scanner_example_external_scanner_destroy() { +void ts_language_external_scanner_example_external_scanner_destroy(void *payload) { + free(payload); } diff --git a/spec/integration/compile_grammar_spec.cc b/spec/integration/compile_grammar_spec.cc index 21307c89..934b428c 100644 --- a/spec/integration/compile_grammar_spec.cc +++ b/spec/integration/compile_grammar_spec.cc @@ -562,7 +562,11 @@ describe("compile_grammar", []() { "spec/fixtures/external_scanners/external_scan.c" )); - ts_document_set_input_string(document, "%|hi|"); + ts_document_set_input_string(document, "%(sup (external) scanner?)"); + ts_document_parse(document); + assert_root_node("(string)"); + + ts_document_set_input_string(document, "%{sup {} external {} scanner?}"); ts_document_parse(document); assert_root_node("(string)"); diff --git a/src/compiler/generate_code/c_code.cc b/src/compiler/generate_code/c_code.cc index a5a9c17a..36bd7cab 100644 --- a/src/compiler/generate_code/c_code.cc +++ b/src/compiler/generate_code/c_code.cc @@ -128,7 +128,7 @@ class CCodeGenerator { void add_stats() { line("#define STATE_COUNT " + to_string(parse_table.states.size())); line("#define SYMBOL_COUNT " + to_string(parse_table.symbols.size())); - line("#define TOKEN_COUNT " + to_string(lexical_grammar.variables.size() + 1)); + line("#define TOKEN_COUNT " + to_string(lexical_grammar.variables.size() + 1 + syntax_grammar.external_tokens.size())); line("#define EXTERNAL_TOKEN_COUNT " + to_string(syntax_grammar.external_tokens.size())); line(); } @@ -327,7 +327,7 @@ class CCodeGenerator { string external_scanner_name = "ts_language_" + name + "_external_scanner"; line("void *" + external_scanner_name + "_create();"); - line("bool " + external_scanner_name + "_scan();"); + line("bool " + external_scanner_name + "_scan(void *, TSLexer *, const bool *);"); line("void " + external_scanner_name + "_destroy();"); line(); diff --git a/src/compiler/rules/symbol.h b/src/compiler/rules/symbol.h index 
46272dc5..4aacf1b2 100644 --- a/src/compiler/rules/symbol.h +++ b/src/compiler/rules/symbol.h @@ -13,8 +13,8 @@ class Symbol : public Rule { typedef enum { Terminal, - NonTerminal, External, + NonTerminal, } Type; Symbol(Index index, Type type); diff --git a/src/runtime/document.c b/src/runtime/document.c index 65f9e435..c68d8c62 100644 --- a/src/runtime/document.c +++ b/src/runtime/document.c @@ -37,7 +37,7 @@ const TSLanguage *ts_document_language(TSDocument *self) { void ts_document_set_language(TSDocument *self, const TSLanguage *language) { ts_document_invalidate(self); - self->parser.language = language; + parser_set_language(&self->parser, language); if (self->tree) { ts_tree_release(self->tree); self->tree = NULL; diff --git a/src/runtime/language.h b/src/runtime/language.h index a4f44b11..5a2693db 100644 --- a/src/runtime/language.h +++ b/src/runtime/language.h @@ -49,6 +49,12 @@ static inline TSStateId ts_language_next_state(const TSLanguage *self, } } +static inline const bool * +ts_language_enabled_external_tokens(const TSLanguage *self, + unsigned external_scanner_state) { + return self->external_token_lists + self->external_token_count * external_scanner_state; +} + #ifdef __cplusplus } #endif diff --git a/src/runtime/lexer.c b/src/runtime/lexer.c index 32910935..77d76ec6 100644 --- a/src/runtime/lexer.c +++ b/src/runtime/lexer.c @@ -123,9 +123,7 @@ void ts_lexer_reset(Lexer *self, Length position) { return; } -void ts_lexer_start(Lexer *self, TSStateId lex_state) { - LOG("start_lex state:%d, pos:%u", lex_state, self->current_position.chars); - +void ts_lexer_start(Lexer *self) { self->token_start_position = self->current_position; self->data.result_symbol = 0; diff --git a/src/runtime/lexer.h b/src/runtime/lexer.h index 1b047e5b..682c3f93 100644 --- a/src/runtime/lexer.h +++ b/src/runtime/lexer.h @@ -30,7 +30,7 @@ typedef struct { void ts_lexer_init(Lexer *); void ts_lexer_set_input(Lexer *, TSInput); void ts_lexer_reset(Lexer *, Length); -void 
ts_lexer_start(Lexer *, TSStateId); +void ts_lexer_start(Lexer *); #ifdef __cplusplus } diff --git a/src/runtime/parser.c b/src/runtime/parser.c index c37b7871..997103c8 100644 --- a/src/runtime/parser.c +++ b/src/runtime/parser.c @@ -209,23 +209,43 @@ static bool parser__condense_stack(Parser *self) { } static Tree *parser__lex(Parser *self, TSStateId parse_state) { + Length start_position = self->lexer.current_position; + ts_lexer_start(&self->lexer); + + TSLexMode lex_mode = self->language->lex_modes[parse_state]; + if (lex_mode.external_tokens) { + const bool *external_tokens = ts_language_enabled_external_tokens(self->language, lex_mode.external_tokens); + if (self->language->external_scanner.scan( + self->external_scanner_payload, + &self->lexer.data, + external_tokens + )) { + TSSymbol symbol = self->language->external_token_symbol_map[self->lexer.data.result_symbol]; + Length padding = length_sub(self->lexer.token_start_position, start_position); + Length size = length_sub(self->lexer.current_position, self->lexer.token_start_position); + TSSymbolMetadata metadata = ts_language_symbol_metadata(self->language, symbol); + Tree *result = ts_tree_make_leaf(symbol, padding, size, metadata); + result->parse_state = parse_state; + return result; + } else { + ts_lexer_reset(&self->lexer, start_position); + } + } + TSStateId start_state = self->language->lex_modes[parse_state].lex_state; TSStateId current_state = start_state; - Length start_position = self->lexer.current_position; LOG("lex state:%d", start_state); bool skipped_error = false; int32_t first_error_character = 0; Length error_start_position, error_end_position; - ts_lexer_start(&self->lexer, start_state); - while (!self->language->lex_fn(&self->lexer.data, current_state)) { if (current_state != ERROR_STATE) { LOG("retry_in_error_mode"); current_state = ERROR_STATE; ts_lexer_reset(&self->lexer, start_position); - ts_lexer_start(&self->lexer, current_state); + ts_lexer_start(&self->lexer); continue; } @@ 
-247,7 +267,6 @@ static Tree *parser__lex(Parser *self, TSStateId parse_state) { } Tree *result; - if (skipped_error) { Length padding = length_sub(error_start_position, start_position); Length size = length_sub(error_end_position, error_start_position); @@ -255,18 +274,12 @@ static Tree *parser__lex(Parser *self, TSStateId parse_state) { result = ts_tree_make_error(size, padding, first_error_character); } else { TSSymbol symbol = self->lexer.data.result_symbol; - Length padding = - length_sub(self->lexer.token_start_position, start_position); - Length size = length_sub(self->lexer.current_position, - self->lexer.token_start_position); - result = - ts_tree_make_leaf(symbol, padding, size, - ts_language_symbol_metadata(self->language, symbol)); + Length padding = length_sub(self->lexer.token_start_position, start_position); + Length size = length_sub(self->lexer.current_position, self->lexer.token_start_position); + TSSymbolMetadata metadata = ts_language_symbol_metadata(self->language, symbol); + result = ts_tree_make_leaf(symbol, padding, size, metadata); } - if (!result) - return NULL; - result->parse_state = parse_state; result->first_leaf.lex_state = start_state; return result; @@ -1106,6 +1119,15 @@ bool parser_init(Parser *self) { return true; } +void parser_set_language(Parser *self, const TSLanguage *language) { + self->language = language; + if (language->external_scanner.create) { + self->external_scanner_payload = language->external_scanner.create(); + } else { + self->external_scanner_payload = NULL; + } +} + void parser_destroy(Parser *self) { if (self->stack) ts_stack_delete(self->stack); diff --git a/src/runtime/parser.h b/src/runtime/parser.h index 41512e12..54c041b3 100644 --- a/src/runtime/parser.h +++ b/src/runtime/parser.h @@ -29,11 +29,13 @@ typedef struct { ReusableNode reusable_node; TreePath tree_path1; TreePath tree_path2; + void *external_scanner_payload; } Parser; bool parser_init(Parser *); void parser_destroy(Parser *); Tree 
*parser_parse(Parser *, TSInput, Tree *); +void parser_set_language(Parser *, const TSLanguage *); #ifdef __cplusplus } From d72b49316b6fb5a74b7c444cbc3b3dbb25e06c0c Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Sun, 4 Dec 2016 10:40:32 -0800 Subject: [PATCH 04/50] Handle external tokens in apply_transitive_closure --- spec/integration/compile_grammar_spec.cc | 58 +++++++++++-------- .../build_tables/parse_item_set_builder.cc | 5 ++ 2 files changed, 39 insertions(+), 24 deletions(-) diff --git a/spec/integration/compile_grammar_spec.cc b/spec/integration/compile_grammar_spec.cc index 934b428c..ea0696ec 100644 --- a/spec/integration/compile_grammar_spec.cc +++ b/spec/integration/compile_grammar_spec.cc @@ -518,29 +518,43 @@ describe("compile_grammar", []() { "percent_string_end" ], + "extras": [ + {"type": "PATTERN", "value": "\\s"} + ], + "rules": { + "expression": { + "type": "CHOICE", + "members": [ + {"type": "SYMBOL", "name": "string"}, + {"type": "SYMBOL", "name": "sum"}, + {"type": "SYMBOL", "name": "identifier"} + ] + }, + + "sum": { + "type": "PREC_LEFT", + "value": 0, + "content": { + "type": "SEQ", + "members": [ + {"type": "SYMBOL", "name": "expression"}, + {"type": "STRING", "value": "+"}, + {"type": "SYMBOL", "name": "expression"} + ] + } + }, + "string": { "type": "CHOICE", "members": [ - { - "type": "EXTERNAL_TOKEN", - "name": "percent_string" - }, + {"type": "EXTERNAL_TOKEN", "name": "percent_string"}, { "type": "SEQ", "members": [ - { - "type": "EXTERNAL_TOKEN", - "name": "percent_string_start" - }, - { - "type": "SYMBOL", - "name": "identifier" - }, - { - "type": "EXTERNAL_TOKEN", - "name": "percent_string_end" - } + {"type": "EXTERNAL_TOKEN", "name": "percent_string_start"}, + {"type": "SYMBOL", "name": "expression"}, + {"type": "EXTERNAL_TOKEN", "name": "percent_string_end"} ] }, ] @@ -562,17 +576,13 @@ describe("compile_grammar", []() { "spec/fixtures/external_scanners/external_scan.c" )); - ts_document_set_input_string(document, "%(sup 
(external) scanner?)"); + ts_document_set_input_string(document, "x + %(sup (external) scanner?)"); ts_document_parse(document); - assert_root_node("(string)"); + assert_root_node("(expression (sum (expression (identifier)) (expression (string))))"); - ts_document_set_input_string(document, "%{sup {} external {} scanner?}"); + ts_document_set_input_string(document, "%{sup {} #{x + y} {} scanner?}"); ts_document_parse(document); - assert_root_node("(string)"); - - ts_document_set_input_string(document, "%(1 #{two} three)"); - ts_document_parse(document); - assert_root_node("(string (identifier))"); + assert_root_node("(expression (string (expression (sum (expression (identifier)) (expression (identifier))))))"); }); }); diff --git a/src/compiler/build_tables/parse_item_set_builder.cc b/src/compiler/build_tables/parse_item_set_builder.cc index 7e29efdf..0a2039d3 100644 --- a/src/compiler/build_tables/parse_item_set_builder.cc +++ b/src/compiler/build_tables/parse_item_set_builder.cc @@ -31,6 +31,11 @@ ParseItemSetBuilder::ParseItemSetBuilder(const SyntaxGrammar &grammar, first_sets.insert({symbol, LookaheadSet({ symbol })}); } + for (size_t i = 0, n = grammar.external_tokens.size(); i < n; i++) { + Symbol symbol(i, Symbol::External); + first_sets.insert({symbol, LookaheadSet({ symbol })}); + } + for (size_t i = 0, n = grammar.variables.size(); i < n; i++) { Symbol symbol(i, Symbol::NonTerminal); LookaheadSet first_set; From cf0d8abea1f4c536647adce002727fd09d90a642 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Sun, 4 Dec 2016 14:18:30 -0800 Subject: [PATCH 05/50] Destroy external scanner when destroying Parser --- src/runtime/parser.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/src/runtime/parser.c b/src/runtime/parser.c index 997103c8..6a358448 100644 --- a/src/runtime/parser.c +++ b/src/runtime/parser.c @@ -1120,12 +1120,16 @@ bool parser_init(Parser *self) { } void parser_set_language(Parser *self, const TSLanguage 
*language) { - self->language = language; - if (language->external_scanner.create) { + if (self->external_scanner_payload) + if (self->language->external_scanner.destroy) + self->language->external_scanner.destroy(self->external_scanner_payload); + + if (language->external_scanner.create) self->external_scanner_payload = language->external_scanner.create(); - } else { + else self->external_scanner_payload = NULL; - } + + self->language = language; } void parser_destroy(Parser *self) { @@ -1137,6 +1141,9 @@ void parser_destroy(Parser *self) { array_delete(&self->tree_path1); if (self->tree_path2.contents) array_delete(&self->tree_path2); + if (self->external_scanner_payload) + if (self->language->external_scanner.destroy) + self->language->external_scanner.destroy(self->external_scanner_payload); } Tree *parser_parse(Parser *self, TSInput input, Tree *old_tree) { From 49d25bd0f8176819907fc243905870649e03d0c4 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Sun, 4 Dec 2016 14:48:41 -0800 Subject: [PATCH 06/50] Remove EXTERNAL_TOKEN grammar rule type --- project.gyp | 1 - spec/integration/compile_grammar_spec.cc | 6 +-- src/compiler/parse_grammar.cc | 10 ----- .../prepare_grammar/extract_tokens.cc | 5 --- .../prepare_grammar/intern_symbols.cc | 26 +------------ src/compiler/rules.h | 1 - src/compiler/rules/external_token.cc | 39 ------------------- src/compiler/rules/external_token.h | 27 ------------- src/compiler/rules/rules.cc | 5 --- 9 files changed, 4 insertions(+), 116 deletions(-) delete mode 100644 src/compiler/rules/external_token.cc delete mode 100644 src/compiler/rules/external_token.h diff --git a/project.gyp b/project.gyp index 29b69787..081a3a88 100644 --- a/project.gyp +++ b/project.gyp @@ -47,7 +47,6 @@ 'src/compiler/rules/character_range.cc', 'src/compiler/rules/character_set.cc', 'src/compiler/rules/choice.cc', - 'src/compiler/rules/external_token.cc', 'src/compiler/rules/metadata.cc', 'src/compiler/rules/named_symbol.cc', 
'src/compiler/rules/pattern.cc', diff --git a/spec/integration/compile_grammar_spec.cc b/spec/integration/compile_grammar_spec.cc index ea0696ec..14d026d0 100644 --- a/spec/integration/compile_grammar_spec.cc +++ b/spec/integration/compile_grammar_spec.cc @@ -548,13 +548,13 @@ describe("compile_grammar", []() { "string": { "type": "CHOICE", "members": [ - {"type": "EXTERNAL_TOKEN", "name": "percent_string"}, + {"type": "SYMBOL", "name": "percent_string"}, { "type": "SEQ", "members": [ - {"type": "EXTERNAL_TOKEN", "name": "percent_string_start"}, + {"type": "SYMBOL", "name": "percent_string_start"}, {"type": "SYMBOL", "name": "expression"}, - {"type": "EXTERNAL_TOKEN", "name": "percent_string_end"} + {"type": "SYMBOL", "name": "percent_string_end"} ] }, ] diff --git a/src/compiler/parse_grammar.cc b/src/compiler/parse_grammar.cc index cc5cff55..327c0f31 100644 --- a/src/compiler/parse_grammar.cc +++ b/src/compiler/parse_grammar.cc @@ -119,16 +119,6 @@ ParseRuleResult parse_rule(json_value *rule_json) { } } - if (type == "EXTERNAL_TOKEN") { - json_value token_name_json = rule_json->operator[]("name"); - if (token_name_json.type != json_string) { - error_message = "External token name must be a string"; - goto error; - } - - return { external_token(token_name_json.u.string.ptr), "" }; - } - if (type == "PATTERN") { json_value value_json = rule_json->operator[]("value"); if (value_json.type == json_string) { diff --git a/src/compiler/prepare_grammar/extract_tokens.cc b/src/compiler/prepare_grammar/extract_tokens.cc index dcf88e53..d2b32769 100644 --- a/src/compiler/prepare_grammar/extract_tokens.cc +++ b/src/compiler/prepare_grammar/extract_tokens.cc @@ -11,7 +11,6 @@ #include "compiler/rules/symbol.h" #include "compiler/rules/string.h" #include "compiler/rules/metadata.h" -#include "compiler/rules/external_token.h" #include "compiler/rules/pattern.h" #include "compiler/prepare_grammar/token_description.h" #include "compiler/prepare_grammar/is_token.h" @@ -79,10 +78,6 
@@ class TokenExtractor : public rules::IdentityRuleFn { return apply_to_token(rule, VariableTypeAuxiliary); } - rule_ptr apply_to(const rules::ExternalToken *rule) { - return apply_to_token(rule, VariableTypeAuxiliary); - } - rule_ptr apply_to(const rules::Metadata *rule) { if (rule->params.is_token) return apply_to_token(rule->rule.get(), VariableTypeAuxiliary); diff --git a/src/compiler/prepare_grammar/intern_symbols.cc b/src/compiler/prepare_grammar/intern_symbols.cc index f08edf5e..efe4b37f 100644 --- a/src/compiler/prepare_grammar/intern_symbols.cc +++ b/src/compiler/prepare_grammar/intern_symbols.cc @@ -7,7 +7,6 @@ #include "compiler/rules/visitor.h" #include "compiler/rules/blank.h" #include "compiler/rules/named_symbol.h" -#include "compiler/rules/external_token.h" #include "compiler/rules/symbol.h" namespace tree_sitter { @@ -32,26 +31,13 @@ class InternSymbols : public rules::IdentityRuleFn { return result; } - rule_ptr apply_to(const rules::ExternalToken *rule) { - auto result = symbol_for_external_token(rule->name); - if (!result.get()) { - missing_external_token_name = rule->name; - return rules::Blank::build(); - } - return result; - } - public: std::shared_ptr symbol_for_rule_name(string rule_name) { for (size_t i = 0; i < grammar.rules.size(); i++) if (grammar.rules[i].first == rule_name) return make_shared(i, Symbol::NonTerminal); - return nullptr; - } - - std::shared_ptr symbol_for_external_token(string name) { for (size_t i = 0; i < grammar.external_tokens.size(); i++) - if (grammar.external_tokens[i] == name) + if (grammar.external_tokens[i] == rule_name) return make_shared(i, Symbol::External); return nullptr; } @@ -59,7 +45,6 @@ class InternSymbols : public rules::IdentityRuleFn { explicit InternSymbols(const Grammar &grammar) : grammar(grammar) {} const Grammar grammar; string missing_rule_name; - string missing_external_token_name; }; CompileError missing_rule_error(string rule_name) { @@ -67,11 +52,6 @@ CompileError 
missing_rule_error(string rule_name) { "Undefined rule '" + rule_name + "'"); } -CompileError missing_external_token_error(string token_name) { - return CompileError(TSCompileErrorTypeUndefinedSymbol, - "Undefined external token '" + token_name + "'"); -} - pair intern_symbols(const Grammar &grammar) { InternedGrammar result; result.external_tokens = grammar.external_tokens; @@ -81,8 +61,6 @@ pair intern_symbols(const Grammar &grammar) { auto new_rule = interner.apply(pair.second); if (!interner.missing_rule_name.empty()) return { result, missing_rule_error(interner.missing_rule_name) }; - if (!interner.missing_external_token_name.empty()) - return { result, missing_external_token_error(interner.missing_external_token_name) }; result.variables.push_back(Variable( pair.first, pair.first[0] == '_' ? VariableTypeHidden : VariableTypeNamed, @@ -93,8 +71,6 @@ pair intern_symbols(const Grammar &grammar) { auto new_rule = interner.apply(rule); if (!interner.missing_rule_name.empty()) return { result, missing_rule_error(interner.missing_rule_name) }; - if (!interner.missing_external_token_name.empty()) - return { result, missing_external_token_error(interner.missing_external_token_name) }; result.extra_tokens.push_back(new_rule); } diff --git a/src/compiler/rules.h b/src/compiler/rules.h index 8a3f4097..d98a719a 100644 --- a/src/compiler/rules.h +++ b/src/compiler/rules.h @@ -22,7 +22,6 @@ rule_ptr prec_left(int precedence, const rule_ptr &); rule_ptr prec_right(const rule_ptr &); rule_ptr prec_right(int precedence, const rule_ptr &); rule_ptr token(const rule_ptr &rule); -rule_ptr external_token(const std::string &); } // namespace std diff --git a/src/compiler/rules/external_token.cc b/src/compiler/rules/external_token.cc deleted file mode 100644 index d8487b0e..00000000 --- a/src/compiler/rules/external_token.cc +++ /dev/null @@ -1,39 +0,0 @@ -#include "compiler/rules/external_token.h" -#include -#include "compiler/rules/visitor.h" - -namespace tree_sitter { -namespace 
rules { - -using std::string; -using std::hash; - -ExternalToken::ExternalToken(const string &name) : name(name) {} - -rule_ptr ExternalToken::build(const string &name) { - return std::make_shared(name); -} - -bool ExternalToken::operator==(const Rule &rule) const { - auto other = rule.as(); - return other && other->name == name; -} - -size_t ExternalToken::hash_code() const { - return hash()(name); -} - -rule_ptr ExternalToken::copy() const { - return std::make_shared(*this); -} - -string ExternalToken::to_string() const { - return string("(sym '") + name + "')"; -} - -void ExternalToken::accept(Visitor *visitor) const { - visitor->visit(this); -} - -} // namespace rules -} // namespace tree_sitter diff --git a/src/compiler/rules/external_token.h b/src/compiler/rules/external_token.h deleted file mode 100644 index cec1a847..00000000 --- a/src/compiler/rules/external_token.h +++ /dev/null @@ -1,27 +0,0 @@ -#ifndef COMPILER_RULES_EXTERNAL_TOKEN_H_ -#define COMPILER_RULES_EXTERNAL_TOKEN_H_ - -#include -#include "compiler/rule.h" - -namespace tree_sitter { -namespace rules { - -class ExternalToken : public Rule { - public: - explicit ExternalToken(const std::string &); - static rule_ptr build(const std::string &); - - bool operator==(const Rule &other) const; - size_t hash_code() const; - rule_ptr copy() const; - std::string to_string() const; - void accept(Visitor *visitor) const; - - std::string name; -}; - -} // namespace rules -} // namespace tree_sitter - -#endif // COMPILER_RULES_EXTERNAL_TOKEN_H_ diff --git a/src/compiler/rules/rules.cc b/src/compiler/rules/rules.cc index 73c37284..fdb0ebdf 100644 --- a/src/compiler/rules/rules.cc +++ b/src/compiler/rules/rules.cc @@ -13,7 +13,6 @@ #include "compiler/rules/pattern.h" #include "compiler/rules/character_set.h" #include "compiler/rules/repeat.h" -#include "compiler/rules/external_token.h" #include "compiler/rules/built_in_symbols.h" namespace tree_sitter { @@ -106,8 +105,4 @@ rule_ptr token(const rule_ptr &rule) { 
return metadata(rule, params); } -rule_ptr external_token(const string &name) { - return rules::ExternalToken::build(name); -} - } // namespace tree_sitter From c16b6b2059eb1f66c8eb9ab73acb19022b5c77b9 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Mon, 5 Dec 2016 11:50:24 -0800 Subject: [PATCH 07/50] Run external scanners during error recovery --- .../build_tables/build_parse_table.cc | 4 ++ src/runtime/parser.c | 44 ++++++++++++------- 2 files changed, 31 insertions(+), 17 deletions(-) diff --git a/src/compiler/build_tables/build_parse_table.cc b/src/compiler/build_tables/build_parse_table.cc index 819ce345..4c3ba8c6 100644 --- a/src/compiler/build_tables/build_parse_table.cc +++ b/src/compiler/build_tables/build_parse_table.cc @@ -120,6 +120,10 @@ class ParseTableBuilder { } } + for (size_t i = 0; i < grammar.external_tokens.size(); i++) { + add_out_of_context_parse_state(&error_state, Symbol(i, Symbol::External)); + } + for (size_t i = 0; i < grammar.variables.size(); i++) { add_out_of_context_parse_state(&error_state, Symbol(i, Symbol::NonTerminal)); } diff --git a/src/runtime/parser.c b/src/runtime/parser.c index 6a358448..a3f38730 100644 --- a/src/runtime/parser.c +++ b/src/runtime/parser.c @@ -208,44 +208,54 @@ static bool parser__condense_stack(Parser *self) { return result; } -static Tree *parser__lex(Parser *self, TSStateId parse_state) { +static bool parser__try_lex(Parser *self, TSLexMode lex_mode) { Length start_position = self->lexer.current_position; ts_lexer_start(&self->lexer); - TSLexMode lex_mode = self->language->lex_modes[parse_state]; if (lex_mode.external_tokens) { - const bool *external_tokens = ts_language_enabled_external_tokens(self->language, lex_mode.external_tokens); + const bool *external_tokens = ts_language_enabled_external_tokens( + self->language, + lex_mode.external_tokens + ); + + LOG("lex external:%d, pos:%u", + lex_mode.external_tokens, + self->lexer.current_position.chars + ); + if (self->language->external_scanner.scan( 
self->external_scanner_payload, &self->lexer.data, external_tokens )) { - TSSymbol symbol = self->language->external_token_symbol_map[self->lexer.data.result_symbol]; - Length padding = length_sub(self->lexer.token_start_position, start_position); - Length size = length_sub(self->lexer.current_position, self->lexer.token_start_position); - TSSymbolMetadata metadata = ts_language_symbol_metadata(self->language, symbol); - Tree *result = ts_tree_make_leaf(symbol, padding, size, metadata); - result->parse_state = parse_state; - return result; + self->lexer.data.result_symbol = self->language->external_token_symbol_map[self->lexer.data.result_symbol]; + return true; } else { ts_lexer_reset(&self->lexer, start_position); + ts_lexer_start(&self->lexer); } } - TSStateId start_state = self->language->lex_modes[parse_state].lex_state; - TSStateId current_state = start_state; - LOG("lex state:%d", start_state); + LOG("lex state:%d, pos:%u", lex_mode.lex_state, self->lexer.current_position.chars); + return self->language->lex_fn(&self->lexer.data, lex_mode.lex_state); +} +static Tree *parser__lex(Parser *self, TSStateId parse_state) { + TSLexMode lex_mode = self->language->lex_modes[parse_state]; + TSStateId start_state = lex_mode.lex_state; + Length start_position = self->lexer.current_position; + + bool found_error = false; bool skipped_error = false; int32_t first_error_character = 0; Length error_start_position, error_end_position; - while (!self->language->lex_fn(&self->lexer.data, current_state)) { - if (current_state != ERROR_STATE) { + while (!parser__try_lex(self, lex_mode)) { + if (!found_error) { LOG("retry_in_error_mode"); - current_state = ERROR_STATE; + found_error = true; + lex_mode = self->language->lex_modes[ERROR_STATE]; ts_lexer_reset(&self->lexer, start_position); - ts_lexer_start(&self->lexer); continue; } From c4fe8ded95afddd0ff0728370c0831461eba6b7e Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Mon, 5 Dec 2016 16:36:34 -0800 Subject: [PATCH 08/50] 
Remove state argument to Lexer advance method --- include/tree_sitter/parser.h | 6 +++--- spec/fixtures/external_scanners/external_scan.c | 16 ++++++++-------- src/runtime/lexer.c | 6 +++--- src/runtime/parser.c | 2 +- 4 files changed, 15 insertions(+), 15 deletions(-) diff --git a/include/tree_sitter/parser.h b/include/tree_sitter/parser.h index e099fd7f..27113e32 100644 --- a/include/tree_sitter/parser.h +++ b/include/tree_sitter/parser.h @@ -23,7 +23,7 @@ typedef struct { } TSSymbolMetadata; typedef struct { - void (*advance)(void *, TSStateId, bool); + void (*advance)(void *, bool); int32_t lookahead; TSSymbol result_symbol; } TSLexer; @@ -92,14 +92,14 @@ typedef struct TSLanguage { #define ADVANCE(state_value) \ { \ - lexer->advance(lexer, state_value, false); \ + lexer->advance(lexer, false); \ state = state_value; \ goto next_state; \ } #define SKIP(state_value) \ { \ - lexer->advance(lexer, state_value, true); \ + lexer->advance(lexer, true); \ state = state_value; \ goto next_state; \ } diff --git a/spec/fixtures/external_scanners/external_scan.c b/spec/fixtures/external_scanners/external_scan.c index 41ef3706..9e7a7d12 100644 --- a/spec/fixtures/external_scanners/external_scan.c +++ b/spec/fixtures/external_scanners/external_scan.c @@ -31,11 +31,11 @@ bool ts_language_external_scanner_example_external_scanner_scan( while (lexer->lookahead == ' ' || lexer->lookahead == '\t' || lexer->lookahead == '\n') { - lexer->advance(lexer, 0, true); + lexer->advance(lexer, true); } if (lexer->lookahead != '%') return false; - lexer->advance(lexer, 0, false); + lexer->advance(lexer, false); switch (lexer->lookahead) { case '(': @@ -57,7 +57,7 @@ bool ts_language_external_scanner_example_external_scanner_scan( return false; } - lexer->advance(lexer, 0, false); + lexer->advance(lexer, false); for (;;) { if (scanner->depth == 0) { @@ -70,19 +70,19 @@ bool ts_language_external_scanner_example_external_scanner_scan( } else if (lexer->lookahead == scanner->close_delimiter) { 
scanner->depth--; } else if (lexer->lookahead == '#') { - lexer->advance(lexer, 0, false); + lexer->advance(lexer, false); if (lexer->lookahead == '{') { - lexer->advance(lexer, 0, false); + lexer->advance(lexer, false); lexer->result_symbol = percent_string_start; return true; } } - lexer->advance(lexer, 0, false); + lexer->advance(lexer, false); } } else if (whitelist[percent_string_end]) { if (lexer->lookahead != '}') return false; - lexer->advance(lexer, 0, false); + lexer->advance(lexer, false); for (;;) { if (scanner->depth == 0) { @@ -96,7 +96,7 @@ bool ts_language_external_scanner_example_external_scanner_scan( scanner->depth--; } - lexer->advance(lexer, 0, false); + lexer->advance(lexer, false); } } diff --git a/src/runtime/lexer.c b/src/runtime/lexer.c index 77d76ec6..b39eb599 100644 --- a/src/runtime/lexer.c +++ b/src/runtime/lexer.c @@ -46,7 +46,7 @@ static void ts_lexer__get_lookahead(Lexer *self) { LOG_LOOKAHEAD(); } -static void ts_lexer__advance(void *payload, TSStateId state, bool skip) { +static void ts_lexer__advance(void *payload, bool skip) { Lexer *self = (Lexer *)payload; if (self->chunk == empty_chunk) return; @@ -63,10 +63,10 @@ static void ts_lexer__advance(void *payload, TSStateId state, bool skip) { } if (skip) { - LOG("skip_separator state:%d", state); + LOG("skip_separator"); self->token_start_position = self->current_position; } else { - LOG("advance state:%d", state); + LOG("advance"); } if (self->current_position.bytes >= self->chunk_start + self->chunk_size) diff --git a/src/runtime/parser.c b/src/runtime/parser.c index a3f38730..3ec4338d 100644 --- a/src/runtime/parser.c +++ b/src/runtime/parser.c @@ -269,7 +269,7 @@ static Tree *parser__lex(Parser *self, TSStateId parse_state) { self->lexer.data.result_symbol = ts_builtin_sym_error; break; } - self->lexer.data.advance(&self->lexer, ERROR_STATE, false); + self->lexer.data.advance(&self->lexer, false); } skipped_error = true; From 6073d9c0e82c302ae3980e4ac1b2a7336d55d72d Mon Sep 17 
00:00:00 2001 From: Max Brunsfeld Date: Mon, 5 Dec 2016 16:36:50 -0800 Subject: [PATCH 09/50] Restore all languages in corpus specs --- spec/integration/corpus_specs.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/spec/integration/corpus_specs.cc b/spec/integration/corpus_specs.cc index 86a1dc47..9d716ed1 100644 --- a/spec/integration/corpus_specs.cc +++ b/spec/integration/corpus_specs.cc @@ -80,10 +80,10 @@ START_TEST describe("The Corpus", []() { vector test_languages({ - // "javascript", + "javascript", "json", - // "c", - // "cpp", + "c", + "cpp", }); for (auto &language_name : test_languages) { From a09409900f0442610a0e4612a06d2b252eb373e9 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Mon, 5 Dec 2016 16:37:06 -0800 Subject: [PATCH 10/50] Silence missing initializer warnings in compiler unit tests --- .../build_tables/parse_item_set_builder_spec.cc | 4 ++-- .../prepare_grammar/expand_repeats_spec.cc | 12 ++++++------ .../prepare_grammar/extract_tokens_spec.cc | 16 ++++++++-------- .../prepare_grammar/intern_symbols_spec.cc | 6 +++--- 4 files changed, 19 insertions(+), 19 deletions(-) diff --git a/spec/compiler/build_tables/parse_item_set_builder_spec.cc b/spec/compiler/build_tables/parse_item_set_builder_spec.cc index dad0976b..6548f37a 100644 --- a/spec/compiler/build_tables/parse_item_set_builder_spec.cc +++ b/spec/compiler/build_tables/parse_item_set_builder_spec.cc @@ -46,7 +46,7 @@ describe("ParseItemSetBuilder", []() { {Symbol(15, Symbol::Terminal), 0, AssociativityNone}, }) }), - }, {}, {}}; + }, {}, {}, {}}; auto production = [&](int variable_index, int production_index) -> const Production & { return 
grammar.variables[variable_index].productions[production_index]; diff --git a/spec/compiler/prepare_grammar/expand_repeats_spec.cc b/spec/compiler/prepare_grammar/expand_repeats_spec.cc index c25ff47c..d8c93a41 100644 --- a/spec/compiler/prepare_grammar/expand_repeats_spec.cc +++ b/spec/compiler/prepare_grammar/expand_repeats_spec.cc @@ -13,7 +13,7 @@ describe("expand_repeats", []() { it("replaces repeat rules with pairs of recursive rules", [&]() { InitialSyntaxGrammar grammar{{ Variable("rule0", VariableTypeNamed, repeat1(i_token(0))), - }, {}, {}}; + }, {}, {}, {}}; auto result = expand_repeats(grammar); @@ -32,7 +32,7 @@ describe("expand_repeats", []() { i_token(10), repeat1(i_token(11)), })), - }, {}, {}}; + }, {}, {}, {}}; auto result = expand_repeats(grammar); @@ -54,7 +54,7 @@ describe("expand_repeats", []() { i_token(10), repeat1(i_token(11)) })), - }, {}, {}}; + }, {}, {}, {}}; auto result = expand_repeats(grammar); @@ -80,7 +80,7 @@ describe("expand_repeats", []() { i_token(3), repeat1(i_token(4)) })), - }, {}, {}}; + }, {}, {}, {}}; auto result = expand_repeats(grammar); @@ -106,7 +106,7 @@ describe("expand_repeats", []() { repeat1(i_token(10)), repeat1(i_token(11)), })), - }, {}, {}}; + }, {}, {}, {}}; auto result = expand_repeats(grammar); @@ -130,7 +130,7 @@ describe("expand_repeats", []() { InitialSyntaxGrammar grammar{{ Variable("rule0", VariableTypeNamed, repeat1(i_token(10))), Variable("rule1", VariableTypeNamed, repeat1(i_token(11))), - }, {}, {}}; + }, {}, {}, {}}; auto result = expand_repeats(grammar); diff --git a/spec/compiler/prepare_grammar/extract_tokens_spec.cc b/spec/compiler/prepare_grammar/extract_tokens_spec.cc index 613d31cc..30a731c8 100644 --- a/spec/compiler/prepare_grammar/extract_tokens_spec.cc +++ b/spec/compiler/prepare_grammar/extract_tokens_spec.cc @@ -29,7 +29,7 @@ describe("extract_tokens", []() { Variable("rule_B", VariableTypeNamed, pattern("ij+")), Variable("rule_C", VariableTypeNamed, choice({ str("kl"), blank() })), 
Variable("rule_D", VariableTypeNamed, repeat1(i_sym(3))) - }, {}, {}}); + }, {}, {}, {}}); InitialSyntaxGrammar &syntax_grammar = get<0>(result); LexicalGrammar &lexical_grammar = get<1>(result); @@ -92,7 +92,7 @@ describe("extract_tokens", []() { i_sym(0), str("ab"), })), - }, {}, {}}); + }, {}, {}, {}}); InitialSyntaxGrammar &syntax_grammar = get<0>(result); LexicalGrammar &lexical_grammar = get<1>(result); @@ -111,7 +111,7 @@ describe("extract_tokens", []() { Variable("rule_A", VariableTypeNamed, seq({ i_sym(1), str("ab") })), Variable("rule_B", VariableTypeNamed, str("cd")), Variable("rule_C", VariableTypeNamed, seq({ str("ef"), str("cd") })), - }, {}, {}}); + }, {}, {}, {}}); InitialSyntaxGrammar &syntax_grammar = get<0>(result); LexicalGrammar &lexical_grammar = get<1>(result); @@ -151,7 +151,7 @@ describe("extract_tokens", []() { }, { str("y"), pattern("\\s+"), - }, {}}); + }, {}, {}}); AssertThat(get<2>(result), Equals(CompileError::none())); @@ -168,7 +168,7 @@ describe("extract_tokens", []() { Variable("rule_B", VariableTypeNamed, str("y")), }, { str("y"), - }, {}}); + }, {}, {}}); AssertThat(get<2>(result), Equals(CompileError::none())); AssertThat(get<1>(result).separators.size(), Equals(0)); @@ -182,7 +182,7 @@ describe("extract_tokens", []() { Variable("rule_C", VariableTypeNamed, str("z")), }, { i_sym(2), - }, {}}); + }, {}, {}}); AssertThat(get<2>(result), Equals(CompileError::none())); @@ -197,7 +197,7 @@ describe("extract_tokens", []() { auto result = extract_tokens(InternedGrammar{{ Variable("rule_A", VariableTypeNamed, seq({ str("x"), i_sym(1) })), Variable("rule_B", VariableTypeNamed, seq({ str("y"), str("z") })), - }, { i_sym(1) }, {}}); + }, { i_sym(1) }, {}, {}}); AssertThat(get<2>(result), !Equals(CompileError::none())); AssertThat(get<2>(result), Equals( @@ -209,7 +209,7 @@ describe("extract_tokens", []() { auto result = extract_tokens(InternedGrammar{{ Variable("rule_A", VariableTypeNamed, str("x")), Variable("rule_B", VariableTypeNamed, 
str("y")), - }, { choice({ i_sym(1), blank() }) }, {}}); + }, { choice({ i_sym(1), blank() }) }, {}, {}}); AssertThat(get<2>(result), !Equals(CompileError::none())); AssertThat(get<2>(result), Equals(CompileError( diff --git a/spec/compiler/prepare_grammar/intern_symbols_spec.cc b/spec/compiler/prepare_grammar/intern_symbols_spec.cc index 4c417e57..4ec27149 100644 --- a/spec/compiler/prepare_grammar/intern_symbols_spec.cc +++ b/spec/compiler/prepare_grammar/intern_symbols_spec.cc @@ -17,7 +17,7 @@ describe("intern_symbols", []() { { "x", choice({ sym("y"), sym("_z") }) }, { "y", sym("_z") }, { "_z", str("stuff") } - }, {}, {}}; + }, {}, {}, {}}; auto result = intern_symbols(grammar); @@ -33,7 +33,7 @@ describe("intern_symbols", []() { it("returns an error", []() { Grammar grammar{{ { "x", sym("y") }, - }, {}, {}}; + }, {}, {}, {}}; auto result = intern_symbols(grammar); @@ -48,7 +48,7 @@ describe("intern_symbols", []() { { "z", str("stuff") } }, { sym("z") - }, {}}; + }, {}, {}}; auto result = intern_symbols(grammar); From 505fe6d382674f18c4133dc0f06b0b9507fdcedf Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Mon, 5 Dec 2016 17:09:09 -0800 Subject: [PATCH 11/50] Add externals to grammar JSON schema --- doc/grammar-schema.json | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/doc/grammar-schema.json b/doc/grammar-schema.json index 5f43b279..f37cd983 100644 --- a/doc/grammar-schema.json +++ b/doc/grammar-schema.json @@ -40,6 +40,14 @@ "pattern": "^[a-zA-Z_]\\w*$" } } + }, + + "externals": { + "type": "array", + "items": { + "type": "string", + "pattern": "^[a-zA-Z_]\\w*$" + } } }, From 1251ff2e305f12f51c45ccb0533b3a3a69cee7eb Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Mon, 5 Dec 2016 17:09:22 -0800 Subject: [PATCH 12/50] Consider externals to be named, not anonymous --- src/compiler/generate_code/c_code.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/compiler/generate_code/c_code.cc b/src/compiler/generate_code/c_code.cc 
index 36bd7cab..db8756e6 100644 --- a/src/compiler/generate_code/c_code.cc +++ b/src/compiler/generate_code/c_code.cc @@ -528,7 +528,7 @@ class CCodeGenerator { return { variable.name, variable.type }; } case Symbol::External: { - return { syntax_grammar.external_tokens[symbol.index], VariableTypeAnonymous }; + return { syntax_grammar.external_tokens[symbol.index], VariableTypeNamed }; } } } From 83514293b50971e766edb8cae2351824303399a7 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Mon, 5 Dec 2016 17:26:11 -0800 Subject: [PATCH 13/50] Allow external tokens to be either visible or hidden --- spec/integration/compile_grammar_spec.cc | 12 ++++++------ src/compiler/build_tables/build_parse_table.cc | 2 +- src/compiler/generate_code/c_code.cc | 5 +++-- .../prepare_grammar/initial_syntax_grammar.h | 2 +- src/compiler/prepare_grammar/intern_symbols.cc | 17 +++++++++++++---- src/compiler/prepare_grammar/interned_grammar.h | 2 +- src/compiler/syntax_grammar.h | 7 ++++++- 7 files changed, 31 insertions(+), 16 deletions(-) diff --git a/spec/integration/compile_grammar_spec.cc b/spec/integration/compile_grammar_spec.cc index 14d026d0..743ea286 100644 --- a/spec/integration/compile_grammar_spec.cc +++ b/spec/integration/compile_grammar_spec.cc @@ -513,9 +513,9 @@ describe("compile_grammar", []() { "name": "external_scanner_example", "externals": [ - "percent_string", - "percent_string_start", - "percent_string_end" + "_percent_string", + "_percent_string_start", + "_percent_string_end" ], "extras": [ @@ -548,13 +548,13 @@ describe("compile_grammar", []() { "string": { "type": "CHOICE", "members": [ - {"type": "SYMBOL", "name": "percent_string"}, + {"type": "SYMBOL", "name": "_percent_string"}, { "type": "SEQ", "members": [ - {"type": "SYMBOL", "name": "percent_string_start"}, + {"type": "SYMBOL", "name": "_percent_string_start"}, {"type": "SYMBOL", "name": "expression"}, - {"type": "SYMBOL", "name": "percent_string_end"} + {"type": "SYMBOL", "name": "_percent_string_end"} 
] }, ] diff --git a/src/compiler/build_tables/build_parse_table.cc b/src/compiler/build_tables/build_parse_table.cc index 4c3ba8c6..bdaac037 100644 --- a/src/compiler/build_tables/build_parse_table.cc +++ b/src/compiler/build_tables/build_parse_table.cc @@ -584,7 +584,7 @@ class ParseTableBuilder { return grammar.variables[symbol.index].name; } case Symbol::External: { - return grammar.external_tokens[symbol.index]; + return grammar.external_tokens[symbol.index].name; } } } diff --git a/src/compiler/generate_code/c_code.cc b/src/compiler/generate_code/c_code.cc index db8756e6..e5272595 100644 --- a/src/compiler/generate_code/c_code.cc +++ b/src/compiler/generate_code/c_code.cc @@ -485,7 +485,7 @@ class CCodeGenerator { // Helper functions string external_token_id(Symbol::Index index) { - return "ts_external_token_" + syntax_grammar.external_tokens[index]; + return "ts_external_token_" + syntax_grammar.external_tokens[index].name; } string symbol_id(const Symbol &symbol) { @@ -528,7 +528,8 @@ class CCodeGenerator { return { variable.name, variable.type }; } case Symbol::External: { - return { syntax_grammar.external_tokens[symbol.index], VariableTypeNamed }; + const ExternalToken &token = syntax_grammar.external_tokens[symbol.index]; + return { token.name, token.type }; } } } diff --git a/src/compiler/prepare_grammar/initial_syntax_grammar.h b/src/compiler/prepare_grammar/initial_syntax_grammar.h index d4b1c8d5..1ac319cb 100644 --- a/src/compiler/prepare_grammar/initial_syntax_grammar.h +++ b/src/compiler/prepare_grammar/initial_syntax_grammar.h @@ -15,7 +15,7 @@ struct InitialSyntaxGrammar { std::vector variables; std::set extra_tokens; std::set expected_conflicts; - std::vector external_tokens; + std::vector external_tokens; }; } // namespace prepare_grammar diff --git a/src/compiler/prepare_grammar/intern_symbols.cc b/src/compiler/prepare_grammar/intern_symbols.cc index efe4b37f..daad9d2e 100644 --- a/src/compiler/prepare_grammar/intern_symbols.cc +++ 
b/src/compiler/prepare_grammar/intern_symbols.cc @@ -54,7 +54,14 @@ CompileError missing_rule_error(string rule_name) { pair intern_symbols(const Grammar &grammar) { InternedGrammar result; - result.external_tokens = grammar.external_tokens; + + for (auto &external_token_name : grammar.external_tokens) { + result.external_tokens.push_back(ExternalToken{ + external_token_name, + external_token_name[0] == '_' ? VariableTypeHidden : VariableTypeNamed + }); + } + InternSymbols interner(grammar); for (auto &pair : grammar.rules) { @@ -62,9 +69,11 @@ pair intern_symbols(const Grammar &grammar) { if (!interner.missing_rule_name.empty()) return { result, missing_rule_error(interner.missing_rule_name) }; - result.variables.push_back(Variable( - pair.first, pair.first[0] == '_' ? VariableTypeHidden : VariableTypeNamed, - new_rule)); + result.variables.push_back(Variable{ + pair.first, + pair.first[0] == '_' ? VariableTypeHidden : VariableTypeNamed, + new_rule + }); } for (auto &rule : grammar.extra_tokens) { diff --git a/src/compiler/prepare_grammar/interned_grammar.h b/src/compiler/prepare_grammar/interned_grammar.h index 7b425c3a..c8a14647 100644 --- a/src/compiler/prepare_grammar/interned_grammar.h +++ b/src/compiler/prepare_grammar/interned_grammar.h @@ -15,7 +15,7 @@ struct InternedGrammar { std::vector variables; std::vector extra_tokens; std::set expected_conflicts; - std::vector external_tokens; + std::vector external_tokens; }; } // namespace prepare_grammar diff --git a/src/compiler/syntax_grammar.h b/src/compiler/syntax_grammar.h index e34ddbbe..3d001b61 100644 --- a/src/compiler/syntax_grammar.h +++ b/src/compiler/syntax_grammar.h @@ -10,6 +10,11 @@ namespace tree_sitter { +struct ExternalToken { + std::string name; + VariableType type; +}; + struct ProductionStep { ProductionStep(const rules::Symbol &, int, rules::Associativity); bool operator==(const ProductionStep &) const; @@ -36,7 +41,7 @@ struct SyntaxGrammar { std::vector variables; std::set extra_tokens; 
std::set expected_conflicts; - std::vector external_tokens; + std::vector external_tokens; }; } // namespace tree_sitter From 7f6ec0131d3164b02dcb8fefb80bded5235f2dcd Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Tue, 6 Dec 2016 10:12:49 -0800 Subject: [PATCH 14/50] Remove duplication between parser_destroy and parser_set_language --- src/runtime/parser.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/src/runtime/parser.c b/src/runtime/parser.c index 3ec4338d..be827250 100644 --- a/src/runtime/parser.c +++ b/src/runtime/parser.c @@ -1130,11 +1130,10 @@ bool parser_init(Parser *self) { } void parser_set_language(Parser *self, const TSLanguage *language) { - if (self->external_scanner_payload) - if (self->language->external_scanner.destroy) - self->language->external_scanner.destroy(self->external_scanner_payload); + if (self->external_scanner_payload && self->language->external_scanner.destroy) + self->language->external_scanner.destroy(self->external_scanner_payload); - if (language->external_scanner.create) + if (language && language->external_scanner.create) self->external_scanner_payload = language->external_scanner.create(); else self->external_scanner_payload = NULL; @@ -1151,9 +1150,7 @@ void parser_destroy(Parser *self) { array_delete(&self->tree_path1); if (self->tree_path2.contents) array_delete(&self->tree_path2); - if (self->external_scanner_payload) - if (self->language->external_scanner.destroy) - self->language->external_scanner.destroy(self->external_scanner_payload); + parser_set_language(self, NULL); } Tree *parser_parse(Parser *self, TSInput input, Tree *old_tree) { From 10b51a05a1dae660888e7bee7e7ad26ecff4fbe9 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 8 Dec 2016 22:35:16 -0800 Subject: [PATCH 15/50] Allow external scanners to refer to (and return) internally-defined tokens Tokens that are defined in the grammar's rules may now be included in the externals list also, so that external scanners can 
check if they are valid lookaheads or not, and if so, can return them to the parser if needed. --- spec/fixtures/external_scanners/line_breaks.c | 53 ++++++++++++++ .../{external_scan.c => percent_strings.c} | 0 spec/integration/compile_grammar_spec.cc | 70 ++++++++++++++++++- src/compiler/generate_code/c_code.cc | 42 +++++++++-- .../prepare_grammar/intern_symbols.cc | 6 +- 5 files changed, 159 insertions(+), 12 deletions(-) create mode 100644 spec/fixtures/external_scanners/line_breaks.c rename spec/fixtures/external_scanners/{external_scan.c => percent_strings.c} (100%) diff --git a/spec/fixtures/external_scanners/line_breaks.c b/spec/fixtures/external_scanners/line_breaks.c new file mode 100644 index 00000000..eb63a37f --- /dev/null +++ b/spec/fixtures/external_scanners/line_breaks.c @@ -0,0 +1,53 @@ +#include +#include + +enum { + STRING, + LINE_BREAK +}; + +void *ts_language_shared_external_tokens_external_scanner_create() { + return NULL; +} + +void ts_language_shared_external_tokens_external_scanner_destroy(void *payload) { +} + +bool ts_language_shared_external_tokens_external_scanner_scan( + void *payload, TSLexer *lexer, const bool *whitelist) { + + // If a line-break is a valid lookahead token, only skip spaces. + if (whitelist[LINE_BREAK]) { + while (lexer->lookahead == ' ') { + lexer->advance(lexer, true); + } + + if (lexer->lookahead == '\n') { + lexer->advance(lexer, false); + lexer->result_symbol = LINE_BREAK; + return true; + } + } + + // If a line-break is not a valid lookahead token, skip line breaks as well + // as spaces. 
+ if (whitelist[STRING]) { + while (lexer->lookahead == ' ' || lexer->lookahead == '\n') { + lexer->advance(lexer, true); + } + + if (lexer->lookahead == '\'') { + lexer->advance(lexer, false); + + while (lexer->lookahead != '\'') { + lexer->advance(lexer, false); + } + + lexer->advance(lexer, false); + lexer->result_symbol = STRING; + return true; + } + } + + return false; +} diff --git a/spec/fixtures/external_scanners/external_scan.c b/spec/fixtures/external_scanners/percent_strings.c similarity index 100% rename from spec/fixtures/external_scanners/external_scan.c rename to spec/fixtures/external_scanners/percent_strings.c diff --git a/spec/integration/compile_grammar_spec.cc b/spec/integration/compile_grammar_spec.cc index 743ea286..f26bbcc8 100644 --- a/spec/integration/compile_grammar_spec.cc +++ b/spec/integration/compile_grammar_spec.cc @@ -1,6 +1,7 @@ #include "spec_helper.h" #include "runtime/alloc.h" #include "helpers/load_language.h" +#include "helpers/stderr_logger.h" #include "compiler/util/string_helpers.h" #include @@ -508,7 +509,7 @@ describe("compile_grammar", []() { }); describe("external scanners", [&]() { - it("can call out to arbitrary scanner functions during parsing", [&]() { + it("can tokenize using arbitrary user-defined scanner functions", [&]() { string grammar = R"JSON({ "name": "external_scanner_example", @@ -573,7 +574,7 @@ describe("compile_grammar", []() { ts_document_set_language(document, load_compile_result( "external_scanner_example", result, - "spec/fixtures/external_scanners/external_scan.c" + "spec/fixtures/external_scanners/percent_strings.c" )); ts_document_set_input_string(document, "x + %(sup (external) scanner?)"); @@ -584,6 +585,71 @@ describe("compile_grammar", []() { ts_document_parse(document); assert_root_node("(expression (string (expression (sum (expression (identifier)) (expression (identifier))))))"); }); + + it("allows external scanners to refer to tokens that are defined internally", [&]() { + string grammar 
= R"JSON({ + "name": "shared_external_tokens", + + "externals": [ + "string", + "line_break" + ], + + "extras": [ + {"type": "PATTERN", "value": "\\s"} + ], + + "rules": { + "statement": { + "type": "SEQ", + "members": [ + {"type": "SYMBOL", "name": "_expression"}, + {"type": "SYMBOL", "name": "_expression"}, + {"type": "SYMBOL", "name": "line_break"} + ] + }, + + "_expression": { + "type": "CHOICE", + "members": [ + {"type": "SYMBOL", "name": "string"}, + {"type": "SYMBOL", "name": "variable"}, + {"type": "SYMBOL", "name": "number"} + ] + }, + + "variable": {"type": "PATTERN", "value": "\\a+"}, + "number": {"type": "PATTERN", "value": "\\d+"}, + "line_break": {"type": "STRING", "value": "\n"} + } + })JSON"; + + TSCompileResult result = ts_compile_grammar(grammar.c_str()); + AssertThat(result.error_message, IsNull()); + + ts_document_set_language(document, load_compile_result( + "shared_external_tokens", + result, + "spec/fixtures/external_scanners/line_breaks.c" + )); + + ts_document_set_input_string(document, "a b\n"); + ts_document_parse(document); + assert_root_node("(statement (variable) (variable) (line_break))"); + + ts_document_set_input_string(document, "a \nb\n"); + ts_document_parse(document); + assert_root_node("(statement (variable) (variable) (line_break))"); + + + ts_document_set_input_string(document, "'hello' 'world'\n"); + ts_document_parse(document); + assert_root_node("(statement (string) (string) (line_break))"); + + ts_document_set_input_string(document, "'hello' \n'world'\n"); + ts_document_parse(document); + assert_root_node("(statement (string) (string) (line_break))"); + }); }); describe("when the grammar's start symbol is a token", [&]() { diff --git a/src/compiler/generate_code/c_code.cc b/src/compiler/generate_code/c_code.cc index e5272595..5df39413 100644 --- a/src/compiler/generate_code/c_code.cc +++ b/src/compiler/generate_code/c_code.cc @@ -77,6 +77,7 @@ class CCodeGenerator { vector> parse_table_entries; vector> 
external_token_id_sets; size_t next_parse_action_list_index; + map shared_token_indices; public: CCodeGenerator(string name, const ParseTable &parse_table, @@ -93,6 +94,17 @@ class CCodeGenerator { string code() { buffer = ""; + for (size_t i = 0; i < lexical_grammar.variables.size(); i++) { + const Variable &variable = lexical_grammar.variables[i]; + for (size_t j = 0; j < syntax_grammar.external_tokens.size(); j++) { + const ExternalToken &external_token = syntax_grammar.external_tokens[j]; + if (external_token.name == variable.name) { + shared_token_indices.insert({i, j}); + break; + } + } + } + add_includes(); add_warning_pragma(); add_stats(); @@ -128,8 +140,14 @@ class CCodeGenerator { void add_stats() { line("#define STATE_COUNT " + to_string(parse_table.states.size())); line("#define SYMBOL_COUNT " + to_string(parse_table.symbols.size())); - line("#define TOKEN_COUNT " + to_string(lexical_grammar.variables.size() + 1 + syntax_grammar.external_tokens.size())); - line("#define EXTERNAL_TOKEN_COUNT " + to_string(syntax_grammar.external_tokens.size())); + line("#define TOKEN_COUNT " + to_string( + 1 + + lexical_grammar.variables.size() + + syntax_grammar.external_tokens.size() - shared_token_indices.size() + )); + line("#define EXTERNAL_TOKEN_COUNT " + to_string( + syntax_grammar.external_tokens.size() + )); line(); } @@ -213,7 +231,7 @@ class CCodeGenerator { } void add_lex_modes_list() { - add_external_tokens_id({}); + add_external_scanner_state({}); line("static TSLexMode ts_lex_modes[STATE_COUNT] = {"); indent([&]() { @@ -223,15 +241,25 @@ class CCodeGenerator { line("[" + to_string(state_id++) + "] = {.lex_state = "); add(to_string(state.lex_state_id)); + bool has_external_tokens = false; set external_token_indices; for (const auto &pair : state.terminal_entries) { Symbol symbol = pair.first; - if (symbol.is_external()) + if (symbol.is_external()) { + has_external_tokens = true; external_token_indices.insert(symbol.index); + } else if (symbol.is_token()) { 
+ auto shared_token_entry = shared_token_indices.find(symbol.index); + if (shared_token_entry != shared_token_indices.end()) { + external_token_indices.insert(shared_token_entry->second); + } + } + } + + if (has_external_tokens) { + add(", .external_tokens = " + add_external_scanner_state(external_token_indices)); } - if (!external_token_indices.empty()) - add(", .external_tokens = " + add_external_tokens_id(external_token_indices)); add("},"); } }); @@ -239,7 +267,7 @@ class CCodeGenerator { line(); } - string add_external_tokens_id(set external_token_ids) { + string add_external_scanner_state(set external_token_ids) { for (size_t i = 0, n = external_token_id_sets.size(); i < n; i++) if (external_token_id_sets[i] == external_token_ids) return to_string(i); diff --git a/src/compiler/prepare_grammar/intern_symbols.cc b/src/compiler/prepare_grammar/intern_symbols.cc index daad9d2e..06b4d430 100644 --- a/src/compiler/prepare_grammar/intern_symbols.cc +++ b/src/compiler/prepare_grammar/intern_symbols.cc @@ -19,7 +19,7 @@ using std::pair; using std::make_shared; using rules::Symbol; -class InternSymbols : public rules::IdentityRuleFn { +class SymbolInterner : public rules::IdentityRuleFn { using rules::IdentityRuleFn::apply_to; rule_ptr apply_to(const rules::NamedSymbol *rule) { @@ -42,7 +42,7 @@ class InternSymbols : public rules::IdentityRuleFn { return nullptr; } - explicit InternSymbols(const Grammar &grammar) : grammar(grammar) {} + explicit SymbolInterner(const Grammar &grammar) : grammar(grammar) {} const Grammar grammar; string missing_rule_name; }; @@ -62,7 +62,7 @@ pair intern_symbols(const Grammar &grammar) { }); } - InternSymbols interner(grammar); + SymbolInterner interner(grammar); for (auto &pair : grammar.rules) { auto new_rule = interner.apply(pair.second); From 0e595346bef5a670ac53829bf10bacb4333b5d08 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 9 Dec 2016 13:33:37 -0800 Subject: [PATCH 16/50] Make lexer log output easier to read --- 
spec/runtime/document_spec.cc | 12 +++--------- src/runtime/lexer.c | 13 ++++--------- 2 files changed, 7 insertions(+), 18 deletions(-) diff --git a/spec/runtime/document_spec.cc b/spec/runtime/document_spec.cc index 0fb7a640..a50fee1b 100644 --- a/spec/runtime/document_spec.cc +++ b/spec/runtime/document_spec.cc @@ -178,20 +178,14 @@ describe("Document", [&]() { delete logger; }); - it("calls the debugger with a message for each lex action", [&]() { - ts_document_set_logger(doc, logger->logger()); - ts_document_parse(doc); - - AssertThat(logger->messages, Contains("lookahead char:'1'")); - AssertThat(logger->messages, Contains("lookahead char:'['")); - }); - it("calls the debugger with a message for each parse action", [&]() { ts_document_set_logger(doc, logger->logger()); ts_document_parse(doc); AssertThat(logger->messages, Contains("new_parse")); - AssertThat(logger->messages, Contains("lookahead char:'['")); + AssertThat(logger->messages, Contains("skip character:' '")); + AssertThat(logger->messages, Contains("consume character:'['")); + AssertThat(logger->messages, Contains("consume character:'1'")); AssertThat(logger->messages, Contains("reduce sym:array, child_count:4")); AssertThat(logger->messages, Contains("accept")); }); diff --git a/src/runtime/lexer.c b/src/runtime/lexer.c index b39eb599..acf394bb 100644 --- a/src/runtime/lexer.c +++ b/src/runtime/lexer.c @@ -11,11 +11,8 @@ self->logger.log(self->logger.payload, TSLogTypeLex, self->debug_buffer); \ } -#define LOG_LOOKAHEAD() \ - LOG((0 < self->data.lookahead && self->data.lookahead < 256) \ - ? "lookahead char:'%c'" \ - : "lookahead char:%d", \ - self->data.lookahead); +#define LOG_CHARACTER(message, character) \ + LOG(character < 255 ? 
message " character:'%c'" : message " character:%d", character) static const char empty_chunk[2] = { 0, 0 }; @@ -42,8 +39,6 @@ static void ts_lexer__get_lookahead(Lexer *self) { utf8proc_iterate(chunk, size, &self->data.lookahead); else self->lookahead_size = utf16_iterate(chunk, size, &self->data.lookahead); - - LOG_LOOKAHEAD(); } static void ts_lexer__advance(void *payload, bool skip) { @@ -63,10 +58,10 @@ static void ts_lexer__advance(void *payload, bool skip) { } if (skip) { - LOG("skip_separator"); + LOG_CHARACTER("skip", self->data.lookahead); self->token_start_position = self->current_position; } else { - LOG("advance"); + LOG_CHARACTER("consume", self->data.lookahead); } if (self->current_position.bytes >= self->chunk_start + self->chunk_size) From a1770ce844da2165b97c1422c371822cf9d623fa Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Mon, 12 Dec 2016 22:06:01 -0800 Subject: [PATCH 17/50] Allow external tokens to be used as extras --- .../external_scanners/extra_external_tokens.c | 32 ++++++++++++++ ...line_breaks.c => shared_external_tokens.c} | 0 spec/integration/compile_grammar_spec.cc | 44 ++++++++++++++++++- .../prepare_grammar/extract_tokens.cc | 2 +- 4 files changed, 75 insertions(+), 3 deletions(-) create mode 100644 spec/fixtures/external_scanners/extra_external_tokens.c rename spec/fixtures/external_scanners/{line_breaks.c => shared_external_tokens.c} (100%) diff --git a/spec/fixtures/external_scanners/extra_external_tokens.c b/spec/fixtures/external_scanners/extra_external_tokens.c new file mode 100644 index 00000000..45803213 --- /dev/null +++ b/spec/fixtures/external_scanners/extra_external_tokens.c @@ -0,0 +1,32 @@ +#include + +enum { + COMMENT, +}; + +void *ts_language_extra_external_tokens_external_scanner_create() { + return NULL; +} + +void ts_language_extra_external_tokens_external_scanner_destroy(void *payload) { +} + +bool ts_language_extra_external_tokens_external_scanner_scan( + void *payload, TSLexer *lexer, const bool 
*whitelist) { + + while (lexer->lookahead == ' ') { + lexer->advance(lexer, true); + } + + if (lexer->lookahead == '#') { + lexer->advance(lexer, false); + while (lexer->lookahead != '\n') { + lexer->advance(lexer, false); + } + + lexer->result_symbol = COMMENT; + return true; + } + + return false; +} diff --git a/spec/fixtures/external_scanners/line_breaks.c b/spec/fixtures/external_scanners/shared_external_tokens.c similarity index 100% rename from spec/fixtures/external_scanners/line_breaks.c rename to spec/fixtures/external_scanners/shared_external_tokens.c diff --git a/spec/integration/compile_grammar_spec.cc b/spec/integration/compile_grammar_spec.cc index f26bbcc8..2c7560c0 100644 --- a/spec/integration/compile_grammar_spec.cc +++ b/spec/integration/compile_grammar_spec.cc @@ -630,7 +630,7 @@ describe("compile_grammar", []() { ts_document_set_language(document, load_compile_result( "shared_external_tokens", result, - "spec/fixtures/external_scanners/line_breaks.c" + "spec/fixtures/external_scanners/shared_external_tokens.c" )); ts_document_set_input_string(document, "a b\n"); @@ -641,7 +641,6 @@ describe("compile_grammar", []() { ts_document_parse(document); assert_root_node("(statement (variable) (variable) (line_break))"); - ts_document_set_input_string(document, "'hello' 'world'\n"); ts_document_parse(document); assert_root_node("(statement (string) (string) (line_break))"); @@ -650,6 +649,47 @@ describe("compile_grammar", []() { ts_document_parse(document); assert_root_node("(statement (string) (string) (line_break))"); }); + + it("allows external tokens to be used as extras", [&]() { + string grammar = R"JSON({ + "name": "extra_external_tokens", + + "externals": [ + "comment" + ], + + "extras": [ + {"type": "PATTERN", "value": "\\s"}, + {"type": "SYMBOL", "name": "comment"} + ], + + "rules": { + "assignment": { + "type": "SEQ", + "members": [ + {"type": "SYMBOL", "name": "variable"}, + {"type": "STRING", "value": "="}, + {"type": "SYMBOL", "name": 
"variable"} + ] + }, + + "variable": {"type": "PATTERN", "value": "\\a+"} + } + })JSON"; + + TSCompileResult result = ts_compile_grammar(grammar.c_str()); + AssertThat(result.error_message, IsNull()); + + ts_document_set_language(document, load_compile_result( + "extra_external_tokens", + result, + "spec/fixtures/external_scanners/extra_external_tokens.c" + )); + + ts_document_set_input_string(document, "x = # a comment\n y"); + ts_document_parse(document); + assert_root_node("(assignment (variable) (comment) (variable))"); + }); }); describe("when the grammar's start symbol is a token", [&]() { diff --git a/src/compiler/prepare_grammar/extract_tokens.cc b/src/compiler/prepare_grammar/extract_tokens.cc index d2b32769..ace6294a 100644 --- a/src/compiler/prepare_grammar/extract_tokens.cc +++ b/src/compiler/prepare_grammar/extract_tokens.cc @@ -177,7 +177,7 @@ tuple extract_tokens( extra_token_error(rule->to_string())); Symbol new_symbol = symbol_replacer.replace_symbol(*symbol); - if (!new_symbol.is_token()) { + if (new_symbol.is_non_terminal()) { return make_tuple( syntax_grammar, lexical_grammar, extra_token_error(syntax_grammar.variables[new_symbol.index].name)); From 5c72c1e28bb9617ee0f9ff60106bc79c09594580 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Tue, 20 Dec 2016 13:06:16 -0800 Subject: [PATCH 18/50] Fetch python grammar as part of CI --- script/fetch-fixtures | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/script/fetch-fixtures b/script/fetch-fixtures index bb727298..7009d70f 100755 --- a/script/fetch-fixtures +++ b/script/fetch-fixtures @@ -7,6 +7,7 @@ GRAMMARS=( json c cpp + python ) for grammar in ${GRAMMARS[@]}; do @@ -21,7 +22,7 @@ for grammar in ${GRAMMARS[@]}; do ( cd $grammar_dir; - git reset --hard; - git pull origin master; + git fetch origin + git reset --hard origin/master; ) done From 80b7affb74d82bce15a6e56b97a7d3fe27e39ff4 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Tue, 20 Dec 2016 13:07:45 -0800 Subject: 
[PATCH 19/50] Compile & link fixture grammars' external scanners in test suite --- spec/helpers/load_language.cc | 60 +++++++++++++++++++++-------------- 1 file changed, 36 insertions(+), 24 deletions(-) diff --git a/spec/helpers/load_language.cc b/spec/helpers/load_language.cc index 2e85b762..0fa2053c 100644 --- a/spec/helpers/load_language.cc +++ b/spec/helpers/load_language.cc @@ -28,10 +28,11 @@ const char *libcompiler_path = "out/Test/libcompiler.a"; #endif -static std::string run_cmd(const char *cmd, const char *args[]) { +static std::string run_command(const char *cmd, const char *args[]) { int child_pid = fork(); - if (child_pid < 0) + if (child_pid < 0) { return "fork failed"; + } if (child_pid == 0) { close(0); @@ -39,7 +40,6 @@ static std::string run_cmd(const char *cmd, const char *args[]) { dup2(2, 1); dup2(1, 2); execvp(cmd, (char * const * )args); - return ""; } int status; @@ -47,12 +47,16 @@ static std::string run_cmd(const char *cmd, const char *args[]) { waitpid(child_pid, &status, 0); } while (!WIFEXITED(status)); - if (WEXITSTATUS(status) == 0) + if (WEXITSTATUS(status) == 0) { return ""; - else + } else { return "command failed"; + } +} - return ""; +static bool file_exists(const string &path) { + struct stat file_stat; + return stat(path.c_str(), &file_stat) == 0; } static int get_modified_time(const string &path) { @@ -68,33 +72,39 @@ static int get_modified_time(const string &path) { const TSLanguage *load_language(const string &source_filename, const string &lib_filename, const string &language_name, - string external_scanner_path = "") { + string external_scanner_filename = "") { string language_function_name = "ts_language_" + language_name; string header_dir = getenv("PWD") + string("/include"); int source_mtime = get_modified_time(source_filename); int header_mtime = get_modified_time(header_dir + "/tree_sitter/parser.h"); int lib_mtime = get_modified_time(lib_filename); + int external_scanner_mtime = 
get_modified_time(external_scanner_filename); - if (!header_mtime || lib_mtime < header_mtime || lib_mtime < source_mtime) { - const char *compiler_name = getenv("CC"); - if (!compiler_name) { - compiler_name = "gcc"; - } + if (!header_mtime || lib_mtime < header_mtime || lib_mtime < source_mtime || + lib_mtime < external_scanner_mtime) { + const char *compiler_name = getenv("CXX"); + if (!compiler_name) compiler_name = "c++"; - const char *compile_argv[] = { + vector compile_args = { compiler_name, "-shared", - "-x", "c", "-fPIC", "-g", "-I", header_dir.c_str(), "-o", lib_filename.c_str(), - source_filename.c_str(), - external_scanner_path.empty() ? NULL : external_scanner_path.c_str(), - NULL + "-x", "c", + source_filename.c_str() }; - string compile_error = run_cmd(compiler_name, compile_argv); + if (!external_scanner_filename.empty()) { + string extension = external_scanner_filename.substr(external_scanner_filename.rfind(".")); + if (extension != ".c") compile_args.push_back("-xc++"); + compile_args.push_back(external_scanner_filename.c_str()); + } + + compile_args.push_back(nullptr); + + string compile_error = run_command(compiler_name, compile_args.data()); if (!compile_error.empty()) { AssertThat(string(compile_error), IsEmpty()); return nullptr; @@ -108,16 +118,14 @@ const TSLanguage *load_language(const string &source_filename, return nullptr; } - void *symbol_value = dlsym(parser_lib, language_function_name.c_str()); - if (!symbol_value) { + void *language_function = dlsym(parser_lib, language_function_name.c_str()); + if (!language_function) { std::string message(dlerror()); AssertThat(message, IsEmpty()); return nullptr; } - typedef TSLanguage * (* LanguageFunction)(); - LanguageFunction language_fn = reinterpret_cast(symbol_value); - return language_fn(); + return reinterpret_cast(language_function)(); } const TSLanguage *load_compile_result(const string &name, @@ -150,6 +158,10 @@ const TSLanguage *get_test_language(const string &language_name) { 
string language_dir = string("spec/fixtures/grammars/") + language_name; string grammar_filename = language_dir + "/src/grammar.json"; string parser_filename = language_dir + "/src/parser.c"; + string external_scanner_filename = language_dir + "/src/scanner.cc"; + if (!file_exists(external_scanner_filename)) { + external_scanner_filename = ""; + } int grammar_mtime = get_modified_time(grammar_filename); if (!grammar_mtime) @@ -184,7 +196,7 @@ const TSLanguage *get_test_language(const string &language_name) { mkdir("out/tmp", 0777); string lib_filename = "out/tmp/" + language_name + ".so"; - const TSLanguage *language = load_language(parser_filename, lib_filename, language_name); + const TSLanguage *language = load_language(parser_filename, lib_filename, language_name, external_scanner_filename); loaded_languages[language_name] = language; return language; }; From 727727623ab58fafe8829f2644005a9c51db38c5 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Tue, 20 Dec 2016 13:10:18 -0800 Subject: [PATCH 20/50] Start work on unit test that edits python code Signed-off-by: Nathan Sobo --- spec/helpers/dedent.h | 12 ++++++++++ spec/integration/compile_grammar_spec.cc | 11 +--------- spec/runtime/parser_spec.cc | 28 ++++++++++++++++++++++-- 3 files changed, 39 insertions(+), 12 deletions(-) create mode 100644 spec/helpers/dedent.h diff --git a/spec/helpers/dedent.h b/spec/helpers/dedent.h new file mode 100644 index 00000000..1387acf9 --- /dev/null +++ b/spec/helpers/dedent.h @@ -0,0 +1,12 @@ +#include "compiler/util/string_helpers.h" +#include + +static std::string dedent(std::string input) { + size_t indent_level = input.find_first_not_of("\n ") - input.find_first_not_of("\n"); + std::string whitespace = "\n" + std::string(indent_level, ' '); + tree_sitter::util::str_replace(&input, whitespace, "\n"); + return input.substr( + input.find_first_not_of("\n "), + input.find_last_not_of("\n ") + 1 + ); +} diff --git a/spec/integration/compile_grammar_spec.cc 
b/spec/integration/compile_grammar_spec.cc index 2c7560c0..ed2109c2 100644 --- a/spec/integration/compile_grammar_spec.cc +++ b/spec/integration/compile_grammar_spec.cc @@ -2,19 +2,10 @@ #include "runtime/alloc.h" #include "helpers/load_language.h" #include "helpers/stderr_logger.h" +#include "helpers/dedent.h" #include "compiler/util/string_helpers.h" #include -static string dedent(string input) { - size_t indent_level = input.find_first_not_of("\n ") - input.find_first_not_of("\n"); - string whitespace = "\n" + string(indent_level, ' '); - util::str_replace(&input, whitespace, "\n"); - return input.substr( - input.find_first_not_of("\n "), - input.find_last_not_of("\n ") + 1 - ); -} - static string fill_template(string input, map parameters) { string result = input; for (const auto &pair : parameters) { diff --git a/spec/runtime/parser_spec.cc b/spec/runtime/parser_spec.cc index a14fa68e..969ac078 100644 --- a/spec/runtime/parser_spec.cc +++ b/spec/runtime/parser_spec.cc @@ -4,6 +4,8 @@ #include "helpers/spy_input.h" #include "helpers/load_language.h" #include "helpers/record_alloc.h" +#include "helpers/stderr_logger.h" +#include "helpers/dedent.h" START_TEST @@ -33,13 +35,13 @@ describe("Parser", [&]() { AssertThat(record_alloc::outstanding_allocation_indices(), IsEmpty()); }); - auto set_text = [&](const char *text) { + auto set_text = [&](string text) { input = new SpyInput(text, chunk_size); ts_document_set_input(doc, input->input()); ts_document_parse(doc); root = ts_document_root_node(doc); - AssertThat(ts_node_end_byte(root), Equals(strlen(text))); + AssertThat(ts_node_end_byte(root), Equals(text.size())); input->clear(); }; @@ -404,6 +406,28 @@ describe("Parser", [&]() { }); }); + describe("with external tokens", [&]() { + before_each([&]() { + ts_document_set_language(doc, get_test_language("python")); + }); + + it("maintains the external scanner's state during incremental parsing", [&]() { + string text = dedent(R"PYTHON( + if a: + print b + + return c 
+ )PYTHON"); + + set_text(text); + + assert_root_node("(module " + "(if_statement (identifier) " + "(print_statement (identifier))) " + "(return_statement (expression_list (identifier))))"); + }); + }); + it("updates the document's parse count", [&]() { ts_document_set_language(doc, get_test_language("javascript")); AssertThat(ts_document_parse_count(doc), Equals(0)); From 2b3da512a4739015e5501925b8cf4216ec72d4f9 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Tue, 20 Dec 2016 13:12:01 -0800 Subject: [PATCH 21/50] Add serialize, deserialize and reset callbacks to external scanners Signed-off-by: Nathan Sobo --- include/tree_sitter/parser.h | 7 +++++- .../external_scanners/extra_external_tokens.c | 12 +++++++++- .../external_scanners/percent_strings.c | 10 ++++++++ .../shared_external_tokens.c | 12 +++++++++- src/compiler/generate_code/c_code.cc | 6 +++++ src/runtime/node.c | 24 ++++++++++++++++--- src/runtime/parser.c | 5 ++-- src/runtime/tree.h | 9 ++++--- 8 files changed, 74 insertions(+), 11 deletions(-) diff --git a/include/tree_sitter/parser.h b/include/tree_sitter/parser.h index 27113e32..90247719 100644 --- a/include/tree_sitter/parser.h +++ b/include/tree_sitter/parser.h @@ -12,6 +12,8 @@ extern "C" { typedef unsigned short TSSymbol; typedef unsigned short TSStateId; +typedef uint8_t TSExternalTokenState[16]; + #define ts_builtin_sym_error ((TSSymbol)-1) #define ts_builtin_sym_end 0 @@ -75,8 +77,11 @@ typedef struct TSLanguage { const TSSymbol *external_token_symbol_map; const bool *external_token_lists; struct { - void * (*create)(); + void *(*create)(); bool (*scan)(void *, TSLexer *, const bool *symbol_whitelist); + void (*reset)(void *); + bool (*serialize)(void *, TSExternalTokenState); + void (*deserialize)(void *, TSExternalTokenState); void (*destroy)(void *); } external_scanner; } TSLanguage; diff --git a/spec/fixtures/external_scanners/extra_external_tokens.c b/spec/fixtures/external_scanners/extra_external_tokens.c index 45803213..ba3338af 
100644 --- a/spec/fixtures/external_scanners/extra_external_tokens.c +++ b/spec/fixtures/external_scanners/extra_external_tokens.c @@ -8,7 +8,14 @@ void *ts_language_extra_external_tokens_external_scanner_create() { return NULL; } -void ts_language_extra_external_tokens_external_scanner_destroy(void *payload) { +void ts_language_extra_external_tokens_external_scanner_reset(void *payload) { +} + +bool ts_language_extra_external_tokens_external_scanner_serialize(void *payload, TSExternalTokenState state) { + return true; +} + +void ts_language_extra_external_tokens_external_scanner_deserialize(void *payload, TSExternalTokenState state) { } bool ts_language_extra_external_tokens_external_scanner_scan( @@ -30,3 +37,6 @@ bool ts_language_extra_external_tokens_external_scanner_scan( return false; } + +void ts_language_extra_external_tokens_external_scanner_destroy(void *payload) { +} diff --git a/spec/fixtures/external_scanners/percent_strings.c b/spec/fixtures/external_scanners/percent_strings.c index 9e7a7d12..56c12e81 100644 --- a/spec/fixtures/external_scanners/percent_strings.c +++ b/spec/fixtures/external_scanners/percent_strings.c @@ -103,6 +103,16 @@ bool ts_language_external_scanner_example_external_scanner_scan( return false; } +void ts_language_external_scanner_example_external_scanner_reset(void *payload) { +} + +bool ts_language_external_scanner_example_external_scanner_serialize(void *payload, TSExternalTokenState state) { + return true; +} + +void ts_language_external_scanner_example_external_scanner_deserialize(void *payload, TSExternalTokenState state) { +} + void ts_language_external_scanner_example_external_scanner_destroy(void *payload) { free(payload); } diff --git a/spec/fixtures/external_scanners/shared_external_tokens.c b/spec/fixtures/external_scanners/shared_external_tokens.c index eb63a37f..3be1a848 100644 --- a/spec/fixtures/external_scanners/shared_external_tokens.c +++ b/spec/fixtures/external_scanners/shared_external_tokens.c @@ -10,7 
+10,14 @@ void *ts_language_shared_external_tokens_external_scanner_create() { return NULL; } -void ts_language_shared_external_tokens_external_scanner_destroy(void *payload) { +void ts_language_shared_external_tokens_external_scanner_reset(void *payload) { +} + +bool ts_language_shared_external_tokens_external_scanner_serialize(void *payload, TSExternalTokenState state) { + return true; +} + +void ts_language_shared_external_tokens_external_scanner_deserialize(void *payload, TSExternalTokenState state) { } bool ts_language_shared_external_tokens_external_scanner_scan( @@ -51,3 +58,6 @@ bool ts_language_shared_external_tokens_external_scanner_scan( return false; } + +void ts_language_shared_external_tokens_external_scanner_destroy(void *payload) { +} diff --git a/src/compiler/generate_code/c_code.cc b/src/compiler/generate_code/c_code.cc index 5df39413..ed034c1b 100644 --- a/src/compiler/generate_code/c_code.cc +++ b/src/compiler/generate_code/c_code.cc @@ -356,6 +356,9 @@ class CCodeGenerator { line("void *" + external_scanner_name + "_create();"); line("bool " + external_scanner_name + "_scan(void *, TSLexer *, const bool *);"); + line("void " + external_scanner_name + "_reset(void *);"); + line("bool " + external_scanner_name + "_serialize(void *, TSExternalTokenState);"); + line("void " + external_scanner_name + "_deserialize(void *, TSExternalTokenState);"); line("void " + external_scanner_name + "_destroy();"); line(); @@ -366,6 +369,9 @@ class CCodeGenerator { indent([&]() { line(external_scanner_name + "_create,"); line(external_scanner_name + "_scan,"); + line(external_scanner_name + "_reset,"); + line(external_scanner_name + "_serialize,"); + line(external_scanner_name + "_deserialize,"); line(external_scanner_name + "_destroy,"); }); line(");"); diff --git a/src/runtime/node.c b/src/runtime/node.c index d5bcb1a0..15e2d5cf 100644 --- a/src/runtime/node.c +++ b/src/runtime/node.c @@ -39,7 +39,15 @@ static inline bool ts_node__is_relevant(TSNode self, bool 
include_anonymous) { static inline uint32_t ts_node__relevant_child_count(TSNode self, bool include_anonymous) { const Tree *tree = ts_node__tree(self); - return include_anonymous ? tree->visible_child_count : tree->named_child_count; + if (tree->child_count > 0) { + if (include_anonymous) { + return tree->visible_child_count; + } else { + return tree->named_child_count; + } + } else { + return 0; + } } static inline TSNode ts_node__direct_parent(TSNode self, uint32_t *index) { @@ -324,11 +332,21 @@ TSNode ts_node_named_child(TSNode self, uint32_t child_index) { } uint32_t ts_node_child_count(TSNode self) { - return ts_node__tree(self)->visible_child_count; + const Tree *tree = ts_node__tree(self); + if (tree->child_count > 0) { + return tree->visible_child_count; + } else { + return 0; + } } uint32_t ts_node_named_child_count(TSNode self) { - return ts_node__tree(self)->named_child_count; + const Tree *tree = ts_node__tree(self); + if (tree->child_count > 0) { + return tree->named_child_count; + } else { + return 0; + } } TSNode ts_node_next_sibling(TSNode self) { diff --git a/src/runtime/parser.c b/src/runtime/parser.c index be827250..e81b73a6 100644 --- a/src/runtime/parser.c +++ b/src/runtime/parser.c @@ -752,8 +752,9 @@ static void parser__start(Parser *self, TSInput input, Tree *previous_tree) { LOG("new_parse"); } - if (self->language->external_scanner.create) - self->language->external_scanner.create(); + if (self->language->external_scanner.reset) { + self->language->external_scanner.reset(self->external_scanner_payload); + } ts_lexer_set_input(&self->lexer, input); ts_stack_clear(self->stack); diff --git a/src/runtime/tree.h b/src/runtime/tree.h index c37d61ab..7aea708f 100644 --- a/src/runtime/tree.h +++ b/src/runtime/tree.h @@ -22,10 +22,13 @@ typedef struct Tree { } context; uint32_t child_count; - uint32_t visible_child_count; - uint32_t named_child_count; union { - struct Tree **children; + struct { + uint32_t visible_child_count; + uint32_t 
named_child_count; + struct Tree **children; + }; + TSExternalTokenState external_token_state; int32_t lookahead_char; }; From e6c82ead2cdc3a9c441cca6296524bcbe80d5472 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Tue, 20 Dec 2016 17:06:20 -0800 Subject: [PATCH 22/50] Start work toward maintaining external scanner's state during incremental parses --- spec/helpers/load_language.cc | 1 - spec/runtime/document_spec.cc | 3 +- spec/runtime/parser_spec.cc | 12 ++- src/compiler/rules/symbol.h | 2 +- src/runtime/language.h | 10 ++- src/runtime/parser.c | 158 ++++++++++++++++++++++------------ src/runtime/parser.h | 1 + src/runtime/tree.c | 10 ++- src/runtime/tree.h | 4 +- 9 files changed, 131 insertions(+), 70 deletions(-) diff --git a/spec/helpers/load_language.cc b/spec/helpers/load_language.cc index 0fa2053c..f7b61dc3 100644 --- a/spec/helpers/load_language.cc +++ b/spec/helpers/load_language.cc @@ -89,7 +89,6 @@ const TSLanguage *load_language(const string &source_filename, compiler_name, "-shared", "-fPIC", - "-g", "-I", header_dir.c_str(), "-o", lib_filename.c_str(), "-x", "c", diff --git a/spec/runtime/document_spec.cc b/spec/runtime/document_spec.cc index a50fee1b..de5b5f36 100644 --- a/spec/runtime/document_spec.cc +++ b/spec/runtime/document_spec.cc @@ -5,6 +5,7 @@ #include "helpers/tree_helpers.h" #include "helpers/point_helpers.h" #include "helpers/spy_logger.h" +#include "helpers/stderr_logger.h" #include "helpers/spy_input.h" #include "helpers/load_language.h" @@ -112,7 +113,7 @@ describe("Document", [&]() { assert_node_string_equals( new_root, "(object (pair (string) (array (null) (number))))"); - AssertThat(spy_input->strings_read, Equals(vector({" [null, 2"}))); + AssertThat(spy_input->strings_read, Equals(vector({" [null, 2", ""}))); }); it("reads from the new input correctly when the old input was blank", [&]() { diff --git a/spec/runtime/parser_spec.cc b/spec/runtime/parser_spec.cc index 969ac078..bb296e7d 100644 --- a/spec/runtime/parser_spec.cc 
+++ b/spec/runtime/parser_spec.cc @@ -253,7 +253,7 @@ describe("Parser", [&]() { "(identifier) " "(math_op (number) (member_access (identifier) (identifier))))))"); - AssertThat(input->strings_read, Equals(vector({ " + abc.d)" }))); + AssertThat(input->strings_read, Equals(vector({ " + abc.d)", "" }))); }); }); @@ -277,7 +277,7 @@ describe("Parser", [&]() { "(number) " "(math_op (number) (math_op (number) (identifier)))))))"); - AssertThat(input->strings_read, Equals(vector({ "123 || 5 +" }))); + AssertThat(input->strings_read, Equals(vector({ "123 || 5 +", "" }))); }); }); @@ -415,16 +415,20 @@ describe("Parser", [&]() { string text = dedent(R"PYTHON( if a: print b - return c )PYTHON"); set_text(text); - assert_root_node("(module " "(if_statement (identifier) " "(print_statement (identifier))) " "(return_statement (expression_list (identifier))))"); + + replace_text(text.find("return"), 0, " "); + assert_root_node("(module " + "(if_statement (identifier) " + "(print_statement (identifier)) " + "(return_statement (expression_list (identifier)))))"); }); }); diff --git a/src/compiler/rules/symbol.h b/src/compiler/rules/symbol.h index 4aacf1b2..a963433c 100644 --- a/src/compiler/rules/symbol.h +++ b/src/compiler/rules/symbol.h @@ -12,8 +12,8 @@ class Symbol : public Rule { typedef int Index; typedef enum { - Terminal, External, + Terminal, NonTerminal, } Type; diff --git a/src/runtime/language.h b/src/runtime/language.h index 5a2693db..56e275bd 100644 --- a/src/runtime/language.h +++ b/src/runtime/language.h @@ -19,6 +19,10 @@ void ts_language_table_entry(const TSLanguage *, TSStateId, TSSymbol, TableEntry TSSymbolMetadata ts_language_symbol_metadata(const TSLanguage *, TSSymbol); +static inline bool ts_language_is_symbol_external(const TSLanguage *self, TSSymbol symbol) { + return 0 < symbol && symbol < self->external_token_count + 1; +} + static inline const TSParseAction *ts_language_actions(const TSLanguage *self, TSStateId state, TSSymbol symbol, @@ -52,7 +56,11 
@@ static inline TSStateId ts_language_next_state(const TSLanguage *self, static inline const bool * ts_language_enabled_external_tokens(const TSLanguage *self, unsigned external_scanner_state) { - return self->external_token_lists + self->external_token_count * external_scanner_state; + if (external_scanner_state == 0) { + return NULL; + } else { + return self->external_token_lists + self->external_token_count * external_scanner_state; + } } #ifdef __cplusplus diff --git a/src/runtime/parser.c b/src/runtime/parser.c index e81b73a6..6787e1ac 100644 --- a/src/runtime/parser.c +++ b/src/runtime/parser.c @@ -145,7 +145,6 @@ static bool parser__breakdown_lookahead(Parser *self, Tree **lookahead, } if (result) { - LOG("lookahead sym:%s", SYM_NAME(reusable_node->tree->symbol)); ts_tree_release(*lookahead); ts_tree_retain(*lookahead = reusable_node->tree); } @@ -161,7 +160,11 @@ static void parser__pop_reusable_node_leaf(ReusableNode *reusable_node) { static bool parser__can_reuse(Parser *self, TSStateId state, Tree *tree, TableEntry *table_entry) { - if (tree->first_leaf.lex_state == self->language->lex_modes[state].lex_state) + TSLexMode current_lex_mode = self->language->lex_modes[state]; + if (ts_language_is_symbol_external(self->language, tree->first_leaf.symbol)) return false; + if (tree->size.bytes == 0) return false; + if (tree->first_leaf.lex_mode.lex_state == current_lex_mode.lex_state && + tree->first_leaf.lex_mode.external_tokens == current_lex_mode.external_tokens) return true; if (!table_entry->is_reusable) return false; @@ -208,58 +211,92 @@ static bool parser__condense_stack(Parser *self) { return result; } -static bool parser__try_lex(Parser *self, TSLexMode lex_mode) { - Length start_position = self->lexer.current_position; - ts_lexer_start(&self->lexer); - - if (lex_mode.external_tokens) { - const bool *external_tokens = ts_language_enabled_external_tokens( - self->language, - lex_mode.external_tokens - ); - - LOG("lex external:%d, pos:%u", - 
lex_mode.external_tokens, - self->lexer.current_position.chars - ); - - if (self->language->external_scanner.scan( - self->external_scanner_payload, - &self->lexer.data, - external_tokens - )) { - self->lexer.data.result_symbol = self->language->external_token_symbol_map[self->lexer.data.result_symbol]; - return true; - } else { - ts_lexer_reset(&self->lexer, start_position); - ts_lexer_start(&self->lexer); +static StackIterateAction parser__restore_external_scanner_callback( + void *payload, TSStateId state, TreeArray *trees, uint32_t tree_count, + bool is_done, bool is_pending) { + Parser *self = payload; + if (tree_count > 0) { + Tree *tree = *array_back(trees); + if (tree->has_external_token_state && tree->child_count == 0) { + self->language->external_scanner.deserialize(self->external_scanner_payload, tree->external_token_state); + return StackIterateStop; } + } else if (is_done) { + self->language->external_scanner.reset(self->external_scanner_payload); + return StackIterateStop; } - LOG("lex state:%d, pos:%u", lex_mode.lex_state, self->lexer.current_position.chars); - return self->language->lex_fn(&self->lexer.data, lex_mode.lex_state); + return StackIterateNone; } -static Tree *parser__lex(Parser *self, TSStateId parse_state) { - TSLexMode lex_mode = self->language->lex_modes[parse_state]; - TSStateId start_state = lex_mode.lex_state; - Length start_position = self->lexer.current_position; +static void parser__restore_external_scanner(Parser *self, StackVersion version) { + StackPopResult pop = ts_stack_iterate(self->stack, version, parser__restore_external_scanner_callback, self); + if (pop.slices.size > 0) { + StackSlice slice = pop.slices.contents[0]; + for (size_t i = 1; i < slice.trees.size; i++) { + Tree *tree = slice.trees.contents[i]; + if (tree->has_external_tokens) { + printf("RE-SCANNING TREE: %s\n", ts_tree_string(tree, self->language, true)); + } + } + ts_tree_array_delete(&slice.trees); + } +} +static Tree *parser__lex(Parser *self, 
StackVersion version) { + TSStateId parse_state = ts_stack_top_state(self->stack, version); + Length start_position = ts_stack_top_position(self->stack, version); + TSLexMode lex_mode = self->language->lex_modes[parse_state]; + const bool *external_tokens = ts_language_enabled_external_tokens( + self->language, + lex_mode.external_tokens + ); + + bool found_external_token = false; bool found_error = false; bool skipped_error = false; int32_t first_error_character = 0; Length error_start_position, error_end_position; + ts_lexer_reset(&self->lexer, start_position); + + for (;;) { + Length current_position = self->lexer.current_position; + + if (external_tokens) { + LOG("lex_external state:%d, row:%u, column:%u", lex_mode.external_tokens, + current_position.extent.row, current_position.extent.column); + parser__restore_external_scanner(self, version); + ts_lexer_start(&self->lexer); + if (self->language->external_scanner.scan(self->external_scanner_payload, + &self->lexer.data, external_tokens)) { + found_external_token = true; + break; + } + ts_lexer_reset(&self->lexer, current_position); + } + + LOG("lex_internal state:%d, row:%u, column:%u", lex_mode.lex_state, + current_position.extent.row, current_position.extent.column); + ts_lexer_start(&self->lexer); + if (self->language->lex_fn(&self->lexer.data, lex_mode.lex_state)) { + break; + } - while (!parser__try_lex(self, lex_mode)) { if (!found_error) { LOG("retry_in_error_mode"); found_error = true; lex_mode = self->language->lex_modes[ERROR_STATE]; + external_tokens = ts_language_enabled_external_tokens( + self->language, + lex_mode.external_tokens + ); ts_lexer_reset(&self->lexer, start_position); continue; } if (!skipped_error) { + LOG("skip_unrecognized_character"); + skipped_error = true; error_start_position = self->lexer.token_start_position; first_error_character = self->lexer.data.lookahead; } @@ -272,7 +309,6 @@ static Tree *parser__lex(Parser *self, TSStateId parse_state) { 
self->lexer.data.advance(&self->lexer, false); } - skipped_error = true; error_end_position = self->lexer.current_position; } @@ -284,14 +320,26 @@ static Tree *parser__lex(Parser *self, TSStateId parse_state) { result = ts_tree_make_error(size, padding, first_error_character); } else { TSSymbol symbol = self->lexer.data.result_symbol; + if (found_external_token) symbol = self->language->external_token_symbol_map[symbol]; + Length padding = length_sub(self->lexer.token_start_position, start_position); Length size = length_sub(self->lexer.current_position, self->lexer.token_start_position); TSSymbolMetadata metadata = ts_language_symbol_metadata(self->language, symbol); result = ts_tree_make_leaf(symbol, padding, size, metadata); + + if (found_external_token) { + result->has_external_tokens = true; + if (self->language->external_scanner.serialize(self->external_scanner_payload, result->external_token_state)) { + result->has_external_token_state = true; + self->last_external_token = result; + } + } } result->parse_state = parse_state; - result->first_leaf.lex_state = start_state; + result->first_leaf.lex_mode = lex_mode; + + LOG("lexed_lookahead sym:%s, size:%u", SYM_NAME(result->symbol), result->size.bytes); return result; } @@ -301,19 +349,18 @@ static void parser__clear_cached_token(Parser *self) { } static Tree *parser__get_lookahead(Parser *self, StackVersion version, - ReusableNode *reusable_node) { + ReusableNode *reusable_node, + bool *is_fresh) { Length position = ts_stack_top_position(self->stack, version); while (reusable_node->tree) { if (reusable_node->byte_index > position.bytes) { - LOG("before_reusable sym:%s, pos:%u", - SYM_NAME(reusable_node->tree->symbol), reusable_node->byte_index); + LOG("before_reusable_node sym:%s", SYM_NAME(reusable_node->tree->symbol)); break; } if (reusable_node->byte_index < position.bytes) { - LOG("past_reusable sym:%s, pos:%u", - SYM_NAME(reusable_node->tree->symbol), reusable_node->byte_index); + LOG("past_reusable 
sym:%s", SYM_NAME(reusable_node->tree->symbol)); parser__pop_reusable_node(reusable_node); continue; } @@ -350,9 +397,8 @@ static Tree *parser__get_lookahead(Parser *self, StackVersion version, return self->cached_token; } - ts_lexer_reset(&self->lexer, position); - TSStateId parse_state = ts_stack_top_state(self->stack, version); - return parser__lex(self, parse_state); + *is_fresh = true; + return parser__lex(self, version); } static bool parser__select_tree(Parser *self, Tree *left, Tree *right) { @@ -977,30 +1023,29 @@ static void parser__recover(Parser *self, StackVersion version, TSStateId state, static void parser__advance(Parser *self, StackVersion version, ReusableNode *reusable_node) { bool validated_lookahead = false; - Tree *lookahead = parser__get_lookahead(self, version, reusable_node); + Tree *lookahead = parser__get_lookahead(self, version, reusable_node, &validated_lookahead); for (;;) { TSStateId state = ts_stack_top_state(self->stack, version); TableEntry table_entry; - ts_language_table_entry(self->language, state, lookahead->first_leaf.symbol, - &table_entry); + ts_language_table_entry(self->language, state, lookahead->first_leaf.symbol, &table_entry); if (!validated_lookahead) { if (!parser__can_reuse(self, state, lookahead, &table_entry)) { - if (lookahead == reusable_node->tree) + if (lookahead == reusable_node->tree) { parser__pop_reusable_node_leaf(reusable_node); - else + } else { parser__clear_cached_token(self); + } ts_tree_release(lookahead); - lookahead = parser__get_lookahead(self, version, reusable_node); + lookahead = parser__get_lookahead(self, version, reusable_node, &validated_lookahead); continue; } validated_lookahead = true; - LOG("lookahead sym:%s, size:%u", SYM_NAME(lookahead->symbol), - lookahead->size.bytes); + LOG("reused_lookahead sym:%s, size:%u", SYM_NAME(lookahead->symbol), lookahead->size.bytes); } bool reduction_stopped_at_error = false; @@ -1023,12 +1068,11 @@ static void parser__advance(Parser *self, StackVersion 
version, } if (lookahead->child_count > 0) { - if (parser__breakdown_lookahead(self, &lookahead, state, - reusable_node)) { + if (parser__breakdown_lookahead(self, &lookahead, state, reusable_node)) { if (!parser__can_reuse(self, state, lookahead, &table_entry)) { parser__pop_reusable_node(reusable_node); ts_tree_release(lookahead); - lookahead = parser__get_lookahead(self, version, reusable_node); + lookahead = parser__get_lookahead(self, version, reusable_node, &validated_lookahead); } } @@ -1175,8 +1219,8 @@ Tree *parser_parse(Parser *self, TSInput input, Tree *old_tree) { LOG("process version:%d, version_count:%u, state:%d, row:%u, col:%u", version, ts_stack_version_count(self->stack), ts_stack_top_state(self->stack, version), - ts_stack_top_position(self->stack, version).extent.row + 1, - ts_stack_top_position(self->stack, version).extent.column + 1); + ts_stack_top_position(self->stack, version).extent.row, + ts_stack_top_position(self->stack, version).extent.column); parser__advance(self, version, &reusable_node); LOG_STACK(); diff --git a/src/runtime/parser.h b/src/runtime/parser.h index 54c041b3..2d9381f8 100644 --- a/src/runtime/parser.h +++ b/src/runtime/parser.h @@ -30,6 +30,7 @@ typedef struct { TreePath tree_path1; TreePath tree_path2; void *external_scanner_payload; + Tree *last_external_token; } Parser; bool parser_init(Parser *); diff --git a/src/runtime/tree.c b/src/runtime/tree.c index c94b1f9f..e788cb02 100644 --- a/src/runtime/tree.c +++ b/src/runtime/tree.c @@ -25,10 +25,7 @@ Tree *ts_tree_make_leaf(TSSymbol sym, Length padding, Length size, .visible = metadata.visible, .named = metadata.named, .has_changes = false, - .first_leaf = { - .symbol = sym, - .lex_state = 0 - } + .first_leaf.symbol = sym, }; return result; } @@ -111,6 +108,8 @@ void ts_tree_set_children(Tree *self, uint32_t child_count, Tree **children) { self->named_child_count = 0; self->visible_child_count = 0; self->error_cost = 0; + self->has_external_tokens = false; + 
self->has_external_token_state = false; for (uint32_t i = 0; i < child_count; i++) { Tree *child = children[i]; @@ -133,6 +132,9 @@ void ts_tree_set_children(Tree *self, uint32_t child_count, Tree **children) { self->named_child_count += child->named_child_count; } + if (child->has_external_tokens) self->has_external_tokens = true; + if (child->has_external_token_state) self->has_external_token_state = true; + if (child->symbol == ts_builtin_sym_error) { self->fragile_left = self->fragile_right = true; self->parse_state = TS_TREE_STATE_NONE; diff --git a/src/runtime/tree.h b/src/runtime/tree.h index 7aea708f..425fac51 100644 --- a/src/runtime/tree.h +++ b/src/runtime/tree.h @@ -41,7 +41,7 @@ typedef struct Tree { struct { TSSymbol symbol; - TSStateId lex_state; + TSLexMode lex_mode; } first_leaf; unsigned short ref_count; @@ -51,6 +51,8 @@ typedef struct Tree { bool fragile_left : 1; bool fragile_right : 1; bool has_changes : 1; + bool has_external_tokens : 1; + bool has_external_token_state : 1; } Tree; typedef struct { From 006e9fe4a6f12a639d9f9f80eaa6e268532d5945 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Tue, 20 Dec 2016 17:28:28 -0800 Subject: [PATCH 23/50] Specify c language explicitly when compiling test parsers --- spec/helpers/load_language.cc | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/spec/helpers/load_language.cc b/spec/helpers/load_language.cc index f7b61dc3..a2b89027 100644 --- a/spec/helpers/load_language.cc +++ b/spec/helpers/load_language.cc @@ -97,7 +97,11 @@ const TSLanguage *load_language(const string &source_filename, if (!external_scanner_filename.empty()) { string extension = external_scanner_filename.substr(external_scanner_filename.rfind(".")); - if (extension != ".c") compile_args.push_back("-xc++"); + if (extension == ".c") { + compile_args.push_back("-xc"); + } else { + compile_args.push_back("-xc++"); + } compile_args.push_back(external_scanner_filename.c_str()); } From 
42c41c158c72a4904ed953751bb48f54f0b11056 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Wed, 21 Dec 2016 10:49:55 -0800 Subject: [PATCH 24/50] Refactor logic for handling shared internal/external tokens --- .../prepare_grammar/intern_symbols_spec.cc | 28 ++++++++++ spec/helpers/stream_methods.cc | 5 ++ spec/helpers/stream_methods.h | 2 + src/compiler/generate_code/c_code.cc | 53 ++++++++++--------- .../prepare_grammar/extract_tokens.cc | 8 ++- .../prepare_grammar/intern_symbols.cc | 12 ++++- src/compiler/syntax_grammar.cc | 5 ++ src/compiler/syntax_grammar.h | 3 ++ 8 files changed, 88 insertions(+), 28 deletions(-) diff --git a/spec/compiler/prepare_grammar/intern_symbols_spec.cc b/spec/compiler/prepare_grammar/intern_symbols_spec.cc index 4ec27149..9142eab6 100644 --- a/spec/compiler/prepare_grammar/intern_symbols_spec.cc +++ b/spec/compiler/prepare_grammar/intern_symbols_spec.cc @@ -3,8 +3,10 @@ #include "compiler/grammar.h" #include "compiler/rules/named_symbol.h" #include "compiler/rules/symbol.h" +#include "compiler/rules/built_in_symbols.h" #include "helpers/equals_pointer.h" #include "helpers/rule_helpers.h" +#include "helpers/stream_methods.h" START_TEST @@ -56,6 +58,32 @@ describe("intern_symbols", []() { AssertThat(result.first.extra_tokens.size(), Equals(1)); AssertThat(*result.first.extra_tokens.begin(), EqualsPointer(i_sym(2))); }); + + it("records any rule names that match external token names", [&]() { + Grammar grammar{{ + { "x", choice({ sym("y"), sym("z") }) }, + { "y", sym("z") }, + { "z", str("stuff") } + }, {}, {}, { + "w", + "z" + }}; + + auto result = intern_symbols(grammar); + + AssertThat(result.first.external_tokens, Equals(vector({ + { + "w", + VariableTypeNamed, + rules::NONE() + }, + { + "z", + VariableTypeNamed, + Symbol(2, Symbol::NonTerminal) + } + }))) + }); }); END_TEST diff --git a/spec/helpers/stream_methods.cc b/spec/helpers/stream_methods.cc index b47363a0..a4b275ea 100644 --- a/spec/helpers/stream_methods.cc +++ 
b/spec/helpers/stream_methods.cc @@ -76,6 +76,11 @@ ostream &operator<<(ostream &stream, const ParseState &state) { return stream << string(">"); } +ostream &operator<<(ostream &stream, const ExternalToken &external_token) { + return stream << "{" << external_token.name << ", " << external_token.type << + "," << external_token.corresponding_internal_token << "}"; +} + ostream &operator<<(ostream &stream, const ProductionStep &step) { stream << "(symbol: " << step.symbol << ", precedence:" << to_string(step.precedence); stream << ", associativity: "; diff --git a/spec/helpers/stream_methods.h b/spec/helpers/stream_methods.h index 515060eb..28b201c3 100644 --- a/spec/helpers/stream_methods.h +++ b/spec/helpers/stream_methods.h @@ -97,6 +97,7 @@ struct AdvanceAction; struct AcceptTokenAction; class ParseAction; class ParseState; +struct ExternalToken; struct ProductionStep; struct PrecedenceRange; @@ -110,6 +111,7 @@ ostream &operator<<(ostream &, const AdvanceAction &); ostream &operator<<(ostream &, const AcceptTokenAction &); ostream &operator<<(ostream &, const ParseAction &); ostream &operator<<(ostream &, const ParseState &); +ostream &operator<<(ostream &, const ExternalToken &); ostream &operator<<(ostream &, const ProductionStep &); ostream &operator<<(ostream &, const PrecedenceRange &); diff --git a/src/compiler/generate_code/c_code.cc b/src/compiler/generate_code/c_code.cc index ed034c1b..7c3601a3 100644 --- a/src/compiler/generate_code/c_code.cc +++ b/src/compiler/generate_code/c_code.cc @@ -77,7 +77,6 @@ class CCodeGenerator { vector> parse_table_entries; vector> external_token_id_sets; size_t next_parse_action_list_index; - map shared_token_indices; public: CCodeGenerator(string name, const ParseTable &parse_table, @@ -94,17 +93,6 @@ class CCodeGenerator { string code() { buffer = ""; - for (size_t i = 0; i < lexical_grammar.variables.size(); i++) { - const Variable &variable = lexical_grammar.variables[i]; - for (size_t j = 0; j < 
syntax_grammar.external_tokens.size(); j++) { - const ExternalToken &external_token = syntax_grammar.external_tokens[j]; - if (external_token.name == variable.name) { - shared_token_indices.insert({i, j}); - break; - } - } - } - add_includes(); add_warning_pragma(); add_stats(); @@ -138,16 +126,17 @@ class CCodeGenerator { } void add_stats() { + size_t token_count = 1 + lexical_grammar.variables.size(); + for (const ExternalToken &external_token : syntax_grammar.external_tokens) { + if (external_token.corresponding_internal_token == rules::NONE()) { + token_count++; + } + } + line("#define STATE_COUNT " + to_string(parse_table.states.size())); line("#define SYMBOL_COUNT " + to_string(parse_table.symbols.size())); - line("#define TOKEN_COUNT " + to_string( - 1 + - lexical_grammar.variables.size() + - syntax_grammar.external_tokens.size() - shared_token_indices.size() - )); - line("#define EXTERNAL_TOKEN_COUNT " + to_string( - syntax_grammar.external_tokens.size() - )); + line("#define TOKEN_COUNT " + to_string(token_count)); + line("#define EXTERNAL_TOKEN_COUNT " + to_string(syntax_grammar.external_tokens.size())); line(); } @@ -233,6 +222,17 @@ class CCodeGenerator { void add_lex_modes_list() { add_external_scanner_state({}); + map external_tokens_by_corresponding_internal_token; + for (size_t i = 0, n = lexical_grammar.variables.size(); i < n; i++) { + for (size_t j = 0; j < syntax_grammar.external_tokens.size(); j++) { + const ExternalToken &external_token = syntax_grammar.external_tokens[j]; + if (external_token.corresponding_internal_token.index == i) { + external_tokens_by_corresponding_internal_token.insert({i, j}); + break; + } + } + } + line("static TSLexMode ts_lex_modes[STATE_COUNT] = {"); indent([&]() { size_t state_id = 0; @@ -241,22 +241,23 @@ class CCodeGenerator { line("[" + to_string(state_id++) + "] = {.lex_state = "); add(to_string(state.lex_state_id)); - bool has_external_tokens = false; + bool needs_external_scanner = false; set 
external_token_indices; for (const auto &pair : state.terminal_entries) { Symbol symbol = pair.first; if (symbol.is_external()) { - has_external_tokens = true; + needs_external_scanner = true; external_token_indices.insert(symbol.index); } else if (symbol.is_token()) { - auto shared_token_entry = shared_token_indices.find(symbol.index); - if (shared_token_entry != shared_token_indices.end()) { - external_token_indices.insert(shared_token_entry->second); + auto corresponding_external_token = + external_tokens_by_corresponding_internal_token.find(symbol.index); + if (corresponding_external_token != external_tokens_by_corresponding_internal_token.end()) { + external_token_indices.insert(corresponding_external_token->second); } } } - if (has_external_tokens) { + if (needs_external_scanner) { add(", .external_tokens = " + add_external_scanner_state(external_token_indices)); } diff --git a/src/compiler/prepare_grammar/extract_tokens.cc b/src/compiler/prepare_grammar/extract_tokens.cc index ace6294a..e84d028d 100644 --- a/src/compiler/prepare_grammar/extract_tokens.cc +++ b/src/compiler/prepare_grammar/extract_tokens.cc @@ -186,7 +186,13 @@ tuple extract_tokens( syntax_grammar.extra_tokens.insert(new_symbol); } - syntax_grammar.external_tokens = grammar.external_tokens; + for (const ExternalToken &external_token : grammar.external_tokens) { + syntax_grammar.external_tokens.push_back({ + external_token.name, + external_token.type, + symbol_replacer.replace_symbol(external_token.corresponding_internal_token) + }); + } return make_tuple(syntax_grammar, lexical_grammar, CompileError::none()); } diff --git a/src/compiler/prepare_grammar/intern_symbols.cc b/src/compiler/prepare_grammar/intern_symbols.cc index 06b4d430..0786982b 100644 --- a/src/compiler/prepare_grammar/intern_symbols.cc +++ b/src/compiler/prepare_grammar/intern_symbols.cc @@ -8,6 +8,7 @@ #include "compiler/rules/blank.h" #include "compiler/rules/named_symbol.h" #include "compiler/rules/symbol.h" +#include 
"compiler/rules/built_in_symbols.h" namespace tree_sitter { namespace prepare_grammar { @@ -56,9 +57,18 @@ pair intern_symbols(const Grammar &grammar) { InternedGrammar result; for (auto &external_token_name : grammar.external_tokens) { + Symbol corresponding_internal_token = rules::NONE(); + for (size_t i = 0, n = grammar.rules.size(); i < n; i++) { + if (grammar.rules[i].first == external_token_name) { + corresponding_internal_token = Symbol(i, Symbol::NonTerminal); + break; + } + } + result.external_tokens.push_back(ExternalToken{ external_token_name, - external_token_name[0] == '_' ? VariableTypeHidden : VariableTypeNamed + external_token_name[0] == '_' ? VariableTypeHidden : VariableTypeNamed, + corresponding_internal_token }); } diff --git a/src/compiler/syntax_grammar.cc b/src/compiler/syntax_grammar.cc index 535ddcda..aa3074e8 100644 --- a/src/compiler/syntax_grammar.cc +++ b/src/compiler/syntax_grammar.cc @@ -21,6 +21,11 @@ ProductionStep::ProductionStep(const rules::Symbol &symbol, int precedence, rules::Associativity associativity) : symbol(symbol), precedence(precedence), associativity(associativity) {} +bool ExternalToken::operator==(const ExternalToken &other) const { + return name == other.name && type == other.type && + corresponding_internal_token == other.corresponding_internal_token; +} + bool ProductionStep::operator==(const ProductionStep &other) const { return symbol == other.symbol && precedence == other.precedence && associativity == other.associativity; diff --git a/src/compiler/syntax_grammar.h b/src/compiler/syntax_grammar.h index 3d001b61..e3af8f28 100644 --- a/src/compiler/syntax_grammar.h +++ b/src/compiler/syntax_grammar.h @@ -13,6 +13,9 @@ namespace tree_sitter { struct ExternalToken { std::string name; VariableType type; + rules::Symbol corresponding_internal_token; + + bool operator==(const ExternalToken &) const; }; struct ProductionStep { From 34a65f588d5e352d656df727bf6adafa2eca6894 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld 
Date: Wed, 21 Dec 2016 11:24:41 -0800 Subject: [PATCH 25/50] Tweak naming and organization of external-scanner related language fields --- include/tree_sitter/parser.h | 12 ++-- src/compiler/generate_code/c_code.cc | 87 ++++++++++++++-------------- src/runtime/language.h | 2 +- src/runtime/parser.c | 10 ++-- 4 files changed, 54 insertions(+), 57 deletions(-) diff --git a/include/tree_sitter/parser.h b/include/tree_sitter/parser.h index 90247719..eea5f76f 100644 --- a/include/tree_sitter/parser.h +++ b/include/tree_sitter/parser.h @@ -52,7 +52,7 @@ typedef struct { typedef struct { uint16_t lex_state; - uint16_t external_tokens; + uint16_t external_lex_state; } TSLexMode; typedef union { @@ -74,15 +74,15 @@ typedef struct TSLanguage { const TSParseActionEntry *parse_actions; const TSLexMode *lex_modes; bool (*lex_fn)(TSLexer *, TSStateId); - const TSSymbol *external_token_symbol_map; - const bool *external_token_lists; struct { + const bool *states; + const TSSymbol *symbol_map; void *(*create)(); - bool (*scan)(void *, TSLexer *, const bool *symbol_whitelist); + void (*destroy)(void *); void (*reset)(void *); + bool (*scan)(void *, TSLexer *, const bool *symbol_whitelist); bool (*serialize)(void *, TSExternalTokenState); void (*deserialize)(void *, TSExternalTokenState); - void (*destroy)(void *); } external_scanner; } TSLanguage; @@ -175,8 +175,6 @@ typedef struct TSLanguage { .symbol_names = ts_symbol_names, \ .lex_fn = ts_lex, \ .external_token_count = EXTERNAL_TOKEN_COUNT, \ - .external_token_lists = (const bool *)ts_external_token_lists, \ - .external_token_symbol_map = ts_external_token_symbol_map, \ .external_scanner = {__VA_ARGS__} \ }; \ return &language \ diff --git a/src/compiler/generate_code/c_code.cc b/src/compiler/generate_code/c_code.cc index 7c3601a3..d5eab8b0 100644 --- a/src/compiler/generate_code/c_code.cc +++ b/src/compiler/generate_code/c_code.cc @@ -75,7 +75,7 @@ class CCodeGenerator { const LexicalGrammar lexical_grammar; map 
sanitized_names; vector> parse_table_entries; - vector> external_token_id_sets; + vector> external_scanner_states; size_t next_parse_action_list_index; public: @@ -102,11 +102,12 @@ class CCodeGenerator { add_lex_function(); add_lex_modes_list(); - if (!syntax_grammar.external_tokens.empty()) + if (!syntax_grammar.external_tokens.empty()) { add_external_token_enum(); + add_external_scanner_symbol_map(); + add_external_scanner_states_list(); + } - add_external_token_symbol_map(); - add_external_scan_modes_list(); add_parse_table(); add_parser_export(); @@ -258,7 +259,7 @@ class CCodeGenerator { } if (needs_external_scanner) { - add(", .external_tokens = " + add_external_scanner_state(external_token_indices)); + add(", .external_lex_state = " + add_external_scanner_state(external_token_indices)); } add("},"); @@ -269,11 +270,11 @@ class CCodeGenerator { } string add_external_scanner_state(set external_token_ids) { - for (size_t i = 0, n = external_token_id_sets.size(); i < n; i++) - if (external_token_id_sets[i] == external_token_ids) + for (size_t i = 0, n = external_scanner_states.size(); i < n; i++) + if (external_scanner_states[i] == external_token_ids) return to_string(i); - external_token_id_sets.push_back(external_token_ids); - return to_string(external_token_id_sets.size() - 1); + external_scanner_states.push_back(external_token_ids); + return to_string(external_scanner_states.size() - 1); } void add_external_token_enum() { @@ -286,8 +287,8 @@ class CCodeGenerator { line(); } - void add_external_token_symbol_map() { - line("TSSymbol ts_external_token_symbol_map[EXTERNAL_TOKEN_COUNT] = {"); + void add_external_scanner_symbol_map() { + line("TSSymbol ts_external_scanner_symbol_map[EXTERNAL_TOKEN_COUNT] = {"); indent([&]() { for (size_t i = 0; i < syntax_grammar.external_tokens.size(); i++) { line("[" + external_token_id(i) + "] = " + symbol_id(Symbol(i, Symbol::External)) + ","); @@ -297,17 +298,17 @@ class CCodeGenerator { line(); } - void 
add_external_scan_modes_list() { - line("static bool ts_external_token_lists["); - add(to_string(external_token_id_sets.size())); + void add_external_scanner_states_list() { + line("static bool ts_external_scanner_states["); + add(to_string(external_scanner_states.size())); add("][EXTERNAL_TOKEN_COUNT] = {"); indent([&]() { size_t i = 0; - for (const auto &external_token_ids : external_token_id_sets) { - if (!external_token_ids.empty()) { + for (const auto &valid_external_lookaheads : external_scanner_states) { + if (!valid_external_lookaheads.empty()) { line("[" + to_string(i) + "] = {"); indent([&]() { - for (Symbol::Index id : external_token_ids) { + for (Symbol::Index id : valid_external_lookaheads) { line("[" + external_token_id(id) + "] = true,"); } }); @@ -352,40 +353,38 @@ class CCodeGenerator { } void add_parser_export() { - if (!syntax_grammar.external_tokens.empty()) { - string external_scanner_name = "ts_language_" + name + "_external_scanner"; + string external_scanner_name = "ts_language_" + name + "_external_scanner"; + if (!syntax_grammar.external_tokens.empty()) { line("void *" + external_scanner_name + "_create();"); - line("bool " + external_scanner_name + "_scan(void *, TSLexer *, const bool *);"); + line("void " + external_scanner_name + "_destroy();"); line("void " + external_scanner_name + "_reset(void *);"); + line("bool " + external_scanner_name + "_scan(void *, TSLexer *, const bool *);"); line("bool " + external_scanner_name + "_serialize(void *, TSExternalTokenState);"); line("void " + external_scanner_name + "_deserialize(void *, TSExternalTokenState);"); - line("void " + external_scanner_name + "_destroy();"); line(); - - line("const TSLanguage *ts_language_" + name + "() {"); - indent([&]() { - if (!syntax_grammar.external_tokens.empty()) { - line("GET_LANGUAGE("); - indent([&]() { - line(external_scanner_name + "_create,"); - line(external_scanner_name + "_scan,"); - line(external_scanner_name + "_reset,"); - 
line(external_scanner_name + "_serialize,"); - line(external_scanner_name + "_deserialize,"); - line(external_scanner_name + "_destroy,"); - }); - line(");"); - } - }); - line("}"); - } else { - line("const TSLanguage *ts_language_" + name + "() {"); - indent([&]() { - line("GET_LANGUAGE();"); - }); - line("}"); } + + line("const TSLanguage *ts_language_" + name + "() {"); + indent([&]() { + line("GET_LANGUAGE("); + if (syntax_grammar.external_tokens.empty()) { + add(");"); + } else { + indent([&]() { + line("(const bool *)ts_external_scanner_states,"); + line("ts_external_scanner_symbol_map,"); + line(external_scanner_name + "_create,"); + line(external_scanner_name + "_destroy,"); + line(external_scanner_name + "_reset,"); + line(external_scanner_name + "_scan,"); + line(external_scanner_name + "_serialize,"); + line(external_scanner_name + "_deserialize,"); + }); + line(");"); + } + }); + line("}"); line(); } diff --git a/src/runtime/language.h b/src/runtime/language.h index 56e275bd..20e6ec5d 100644 --- a/src/runtime/language.h +++ b/src/runtime/language.h @@ -59,7 +59,7 @@ ts_language_enabled_external_tokens(const TSLanguage *self, if (external_scanner_state == 0) { return NULL; } else { - return self->external_token_lists + self->external_token_count * external_scanner_state; + return self->external_scanner.states + self->external_token_count * external_scanner_state; } } diff --git a/src/runtime/parser.c b/src/runtime/parser.c index 6787e1ac..f5b08f82 100644 --- a/src/runtime/parser.c +++ b/src/runtime/parser.c @@ -164,7 +164,7 @@ static bool parser__can_reuse(Parser *self, TSStateId state, Tree *tree, if (ts_language_is_symbol_external(self->language, tree->first_leaf.symbol)) return false; if (tree->size.bytes == 0) return false; if (tree->first_leaf.lex_mode.lex_state == current_lex_mode.lex_state && - tree->first_leaf.lex_mode.external_tokens == current_lex_mode.external_tokens) + tree->first_leaf.lex_mode.external_lex_state == 
current_lex_mode.external_lex_state) return true; if (!table_entry->is_reusable) return false; @@ -249,7 +249,7 @@ static Tree *parser__lex(Parser *self, StackVersion version) { TSLexMode lex_mode = self->language->lex_modes[parse_state]; const bool *external_tokens = ts_language_enabled_external_tokens( self->language, - lex_mode.external_tokens + lex_mode.external_lex_state ); bool found_external_token = false; @@ -263,7 +263,7 @@ static Tree *parser__lex(Parser *self, StackVersion version) { Length current_position = self->lexer.current_position; if (external_tokens) { - LOG("lex_external state:%d, row:%u, column:%u", lex_mode.external_tokens, + LOG("lex_external state:%d, row:%u, column:%u", lex_mode.external_lex_state, current_position.extent.row, current_position.extent.column); parser__restore_external_scanner(self, version); ts_lexer_start(&self->lexer); @@ -288,7 +288,7 @@ static Tree *parser__lex(Parser *self, StackVersion version) { lex_mode = self->language->lex_modes[ERROR_STATE]; external_tokens = ts_language_enabled_external_tokens( self->language, - lex_mode.external_tokens + lex_mode.external_lex_state ); ts_lexer_reset(&self->lexer, start_position); continue; @@ -320,7 +320,7 @@ static Tree *parser__lex(Parser *self, StackVersion version) { result = ts_tree_make_error(size, padding, first_error_character); } else { TSSymbol symbol = self->lexer.data.result_symbol; - if (found_external_token) symbol = self->language->external_token_symbol_map[symbol]; + if (found_external_token) symbol = self->language->external_scanner.symbol_map[symbol]; Length padding = length_sub(self->lexer.token_start_position, start_position); Length size = length_sub(self->lexer.current_position, self->lexer.token_start_position); From 4fd7b0e55d9e7457d19eca56a2a7f3a187169b43 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Wed, 21 Dec 2016 11:31:28 -0800 Subject: [PATCH 26/50] Remove nested before_each's in Parser spec --- spec/runtime/parser_spec.cc | 40 
+++++++++++++++---------------------- 1 file changed, 16 insertions(+), 24 deletions(-) diff --git a/spec/runtime/parser_spec.cc b/spec/runtime/parser_spec.cc index bb296e7d..a955c7d6 100644 --- a/spec/runtime/parser_spec.cc +++ b/spec/runtime/parser_spec.cc @@ -85,10 +85,6 @@ describe("Parser", [&]() { }; describe("handling errors", [&]() { - before_each([&]() { - ts_document_set_language(doc, get_test_language("json")); - }); - auto get_node_text = [&](TSNode node) { size_t start = ts_node_start_byte(node); size_t end = ts_node_end_byte(node); @@ -97,6 +93,7 @@ describe("Parser", [&]() { describe("when there is an invalid substring right before a valid token", [&]() { it("computes the error node's size and position correctly", [&]() { + ts_document_set_language(doc, get_test_language("json")); set_text(" [123, @@@@@, true]"); assert_root_node( @@ -121,6 +118,7 @@ describe("Parser", [&]() { describe("when there is an unexpected string in the middle of a token", [&]() { it("computes the error node's size and position correctly", [&]() { + ts_document_set_language(doc, get_test_language("json")); set_text(" [123, faaaaalse, true]"); assert_root_node( @@ -146,6 +144,7 @@ describe("Parser", [&]() { describe("when there is one unexpected token between two valid tokens", [&]() { it("computes the error node's size and position correctly", [&]() { + ts_document_set_language(doc, get_test_language("json")); set_text(" [123, true false, true]"); assert_root_node( @@ -164,6 +163,7 @@ describe("Parser", [&]() { describe("when there is an unexpected string at the end of a token", [&]() { it("computes the error's size and position correctly", [&]() { + ts_document_set_language(doc, get_test_language("json")); set_text(" [123, \"hi\n, true]"); assert_root_node( @@ -182,14 +182,9 @@ describe("Parser", [&]() { }); describe("handling extra tokens", [&]() { - // In the javascript example grammar, ASI works by using newlines as - // terminators in statements, but also as extra 
tokens. - before_each([&]() { - ts_document_set_language(doc, get_test_language("javascript")); - }); - describe("when the token appears as part of a grammar rule", [&]() { it("is incorporated into the tree", [&]() { + ts_document_set_language(doc, get_test_language("javascript")); set_text("fn()\n"); assert_root_node( @@ -199,6 +194,7 @@ describe("Parser", [&]() { describe("when the token appears somewhere else", [&]() { it("is incorporated into the tree", [&]() { + ts_document_set_language(doc, get_test_language("javascript")); set_text( "fn()\n" " .otherFn();"); @@ -214,6 +210,7 @@ describe("Parser", [&]() { describe("when several extra tokens appear in a row", [&]() { it("is incorporated into the tree", [&]() { + ts_document_set_language(doc, get_test_language("javascript")); set_text( "fn()\n\n" "// This is a comment" @@ -232,13 +229,10 @@ describe("Parser", [&]() { }); describe("editing", [&]() { - before_each([&]() { - ts_document_set_language(doc, get_test_language("javascript")); - }); - describe("inserting text", [&]() { describe("creating new tokens near the end of the input", [&]() { it("updates the parse tree and re-reads only the changed portion of the text", [&]() { + ts_document_set_language(doc, get_test_language("javascript")); set_text("x * (100 + abc);"); assert_root_node( @@ -261,6 +255,7 @@ describe("Parser", [&]() { it("updates the parse tree and re-reads only the changed portion of the input", [&]() { chunk_size = 2; + ts_document_set_language(doc, get_test_language("javascript")); set_text("123 + 456 * (10 + x);"); assert_root_node( @@ -284,7 +279,6 @@ describe("Parser", [&]() { describe("introducing an error", [&]() { it("gives the error the right size", [&]() { ts_document_set_language(doc, get_test_language("javascript")); - set_text("var x = y;"); assert_root_node( @@ -307,6 +301,7 @@ describe("Parser", [&]() { describe("into the middle of an existing token", [&]() { it("updates the parse tree", [&]() { + ts_document_set_language(doc, 
get_test_language("javascript")); set_text("abc * 123;"); assert_root_node( @@ -325,6 +320,7 @@ describe("Parser", [&]() { describe("at the end of an existing token", [&]() { it("updates the parse tree", [&]() { + ts_document_set_language(doc, get_test_language("javascript")); set_text("abc * 123;"); assert_root_node( @@ -343,6 +339,7 @@ describe("Parser", [&]() { describe("into a node containing a extra token", [&]() { it("updates the parse tree", [&]() { + ts_document_set_language(doc, get_test_language("javascript")); set_text("123 *\n" "// a-comment\n" "abc;"); @@ -371,6 +368,7 @@ describe("Parser", [&]() { describe("deleting text", [&]() { describe("when a critical token is removed", [&]() { it("updates the parse tree, creating an error", [&]() { + ts_document_set_language(doc, get_test_language("javascript")); set_text("123 * 456; 789 * 123;"); assert_root_node( @@ -391,7 +389,6 @@ describe("Parser", [&]() { describe("replacing text", [&]() { it("does not try to re-use nodes that are within the edited region", [&]() { ts_document_set_language(doc, get_test_language("javascript")); - set_text("{ x: (b.c) };"); assert_root_node( @@ -407,11 +404,8 @@ describe("Parser", [&]() { }); describe("with external tokens", [&]() { - before_each([&]() { - ts_document_set_language(doc, get_test_language("python")); - }); - it("maintains the external scanner's state during incremental parsing", [&]() { + ts_document_set_language(doc, get_test_language("python")); string text = dedent(R"PYTHON( if a: print b @@ -445,12 +439,9 @@ describe("Parser", [&]() { }); describe("lexing", [&]() { - before_each([&]() { - ts_document_set_language(doc, get_test_language("javascript")); - }); - describe("handling tokens containing wildcard patterns (e.g. 
comments)", [&]() { it("terminates them at the end of the document", [&]() { + ts_document_set_language(doc, get_test_language("javascript")); set_text("x; // this is a comment"); assert_root_node( @@ -465,6 +456,7 @@ describe("Parser", [&]() { it("recognizes UTF8 characters as single characters", [&]() { // 'ΩΩΩ — ΔΔ'; + ts_document_set_language(doc, get_test_language("javascript")); set_text("'\u03A9\u03A9\u03A9 \u2014 \u0394\u0394';"); assert_root_node( From 75bcfed24770531fdad254436169639b4cf846b3 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Wed, 21 Dec 2016 11:32:27 -0800 Subject: [PATCH 27/50] Rename doc variable -> document in Parser spec --- spec/runtime/parser_spec.cc | 102 ++++++++++++++++++------------------ 1 file changed, 51 insertions(+), 51 deletions(-) diff --git a/spec/runtime/parser_spec.cc b/spec/runtime/parser_spec.cc index a955c7d6..2e4c9d20 100644 --- a/spec/runtime/parser_spec.cc +++ b/spec/runtime/parser_spec.cc @@ -10,7 +10,7 @@ START_TEST describe("Parser", [&]() { - TSDocument *doc; + TSDocument *document; SpyInput *input; TSNode root; size_t chunk_size; @@ -21,12 +21,12 @@ describe("Parser", [&]() { chunk_size = 3; input = nullptr; - doc = ts_document_new(); + document = ts_document_new(); }); after_each([&]() { - if (doc) - ts_document_free(doc); + if (document) + ts_document_free(document); if (input) delete input; @@ -37,30 +37,30 @@ describe("Parser", [&]() { auto set_text = [&](string text) { input = new SpyInput(text, chunk_size); - ts_document_set_input(doc, input->input()); - ts_document_parse(doc); + ts_document_set_input(document, input->input()); + ts_document_parse(document); - root = ts_document_root_node(doc); + root = ts_document_root_node(document); AssertThat(ts_node_end_byte(root), Equals(text.size())); input->clear(); }; auto insert_text = [&](size_t position, string text) { size_t prev_size = ts_node_end_byte(root); - ts_document_edit(doc, input->replace(position, 0, text)); - ts_document_parse(doc); + 
ts_document_edit(document, input->replace(position, 0, text)); + ts_document_parse(document); - root = ts_document_root_node(doc); + root = ts_document_root_node(document); size_t new_size = ts_node_end_byte(root); AssertThat(new_size, Equals(prev_size + text.size())); }; auto delete_text = [&](size_t position, size_t length) { size_t prev_size = ts_node_end_byte(root); - ts_document_edit(doc, input->replace(position, length, "")); - ts_document_parse(doc); + ts_document_edit(document, input->replace(position, length, "")); + ts_document_parse(document); - root = ts_document_root_node(doc); + root = ts_document_root_node(document); size_t new_size = ts_node_end_byte(root); AssertThat(new_size, Equals(prev_size - length)); }; @@ -68,17 +68,17 @@ describe("Parser", [&]() { auto replace_text = [&](size_t position, size_t length, string new_text) { size_t prev_size = ts_node_end_byte(root); - ts_document_edit(doc, input->replace(position, length, new_text)); - ts_document_parse(doc); + ts_document_edit(document, input->replace(position, length, new_text)); + ts_document_parse(document); - root = ts_document_root_node(doc); + root = ts_document_root_node(document); size_t new_size = ts_node_end_byte(root); AssertThat(new_size, Equals(prev_size - length + new_text.size())); }; auto assert_root_node = [&](const string &expected) { - TSNode node = ts_document_root_node(doc); - char *str = ts_node_string(node, doc); + TSNode node = ts_document_root_node(document); + char *str = ts_node_string(node, document); string actual(str); ts_free(str); AssertThat(actual, Equals(expected)); @@ -93,14 +93,14 @@ describe("Parser", [&]() { describe("when there is an invalid substring right before a valid token", [&]() { it("computes the error node's size and position correctly", [&]() { - ts_document_set_language(doc, get_test_language("json")); + ts_document_set_language(document, get_test_language("json")); set_text(" [123, @@@@@, true]"); assert_root_node( "(array (number) (ERROR 
(UNEXPECTED '@')) (true))"); TSNode error = ts_node_named_child(root, 1); - AssertThat(ts_node_type(error, doc), Equals("ERROR")); + AssertThat(ts_node_type(error, document), Equals("ERROR")); AssertThat(get_node_text(error), Equals(", @@@@@")); AssertThat(ts_node_child_count(error), Equals(2)); @@ -111,59 +111,59 @@ describe("Parser", [&]() { AssertThat(get_node_text(garbage), Equals("@@@@@")); TSNode node_after_error = ts_node_named_child(root, 2); - AssertThat(ts_node_type(node_after_error, doc), Equals("true")); + AssertThat(ts_node_type(node_after_error, document), Equals("true")); AssertThat(get_node_text(node_after_error), Equals("true")); }); }); describe("when there is an unexpected string in the middle of a token", [&]() { it("computes the error node's size and position correctly", [&]() { - ts_document_set_language(doc, get_test_language("json")); + ts_document_set_language(document, get_test_language("json")); set_text(" [123, faaaaalse, true]"); assert_root_node( "(array (number) (ERROR (UNEXPECTED 'a')) (true))"); TSNode error = ts_node_named_child(root, 1); - AssertThat(ts_node_type(error, doc), Equals("ERROR")); + AssertThat(ts_node_type(error, document), Equals("ERROR")); AssertThat(ts_node_child_count(error), Equals(2)); TSNode comma = ts_node_child(error, 0); - AssertThat(ts_node_type(comma, doc), Equals(",")); + AssertThat(ts_node_type(comma, document), Equals(",")); AssertThat(get_node_text(comma), Equals(",")); TSNode garbage = ts_node_child(error, 1); - AssertThat(ts_node_type(garbage, doc), Equals("ERROR")); + AssertThat(ts_node_type(garbage, document), Equals("ERROR")); AssertThat(get_node_text(garbage), Equals("faaaaalse")); TSNode last = ts_node_named_child(root, 2); - AssertThat(ts_node_type(last, doc), Equals("true")); + AssertThat(ts_node_type(last, document), Equals("true")); AssertThat(ts_node_start_byte(last), Equals(strlen(" [123, faaaaalse, "))); }); }); describe("when there is one unexpected token between two valid tokens", [&]() 
{ it("computes the error node's size and position correctly", [&]() { - ts_document_set_language(doc, get_test_language("json")); + ts_document_set_language(document, get_test_language("json")); set_text(" [123, true false, true]"); assert_root_node( "(array (number) (true) (ERROR (false)) (true))"); TSNode error = ts_node_named_child(root, 2); - AssertThat(ts_node_type(error, doc), Equals("ERROR")); + AssertThat(ts_node_type(error, document), Equals("ERROR")); AssertThat(get_node_text(error), Equals("false")); AssertThat(ts_node_child_count(error), Equals(1)); TSNode last = ts_node_named_child(root, 1); - AssertThat(ts_node_type(last, doc), Equals("true")); + AssertThat(ts_node_type(last, document), Equals("true")); AssertThat(get_node_text(last), Equals("true")); }); }); describe("when there is an unexpected string at the end of a token", [&]() { it("computes the error's size and position correctly", [&]() { - ts_document_set_language(doc, get_test_language("json")); + ts_document_set_language(document, get_test_language("json")); set_text(" [123, \"hi\n, true]"); assert_root_node( @@ -173,7 +173,7 @@ describe("Parser", [&]() { describe("when there is an unterminated error", [&]() { it("maintains a consistent tree", [&]() { - ts_document_set_language(doc, get_test_language("javascript")); + ts_document_set_language(document, get_test_language("javascript")); set_text("a; /* b"); assert_root_node( "(ERROR (program (expression_statement (identifier))) (UNEXPECTED EOF))"); @@ -184,7 +184,7 @@ describe("Parser", [&]() { describe("handling extra tokens", [&]() { describe("when the token appears as part of a grammar rule", [&]() { it("is incorporated into the tree", [&]() { - ts_document_set_language(doc, get_test_language("javascript")); + ts_document_set_language(document, get_test_language("javascript")); set_text("fn()\n"); assert_root_node( @@ -194,7 +194,7 @@ describe("Parser", [&]() { describe("when the token appears somewhere else", [&]() { it("is incorporated 
into the tree", [&]() { - ts_document_set_language(doc, get_test_language("javascript")); + ts_document_set_language(document, get_test_language("javascript")); set_text( "fn()\n" " .otherFn();"); @@ -210,7 +210,7 @@ describe("Parser", [&]() { describe("when several extra tokens appear in a row", [&]() { it("is incorporated into the tree", [&]() { - ts_document_set_language(doc, get_test_language("javascript")); + ts_document_set_language(document, get_test_language("javascript")); set_text( "fn()\n\n" "// This is a comment" @@ -232,7 +232,7 @@ describe("Parser", [&]() { describe("inserting text", [&]() { describe("creating new tokens near the end of the input", [&]() { it("updates the parse tree and re-reads only the changed portion of the text", [&]() { - ts_document_set_language(doc, get_test_language("javascript")); + ts_document_set_language(document, get_test_language("javascript")); set_text("x * (100 + abc);"); assert_root_node( @@ -255,7 +255,7 @@ describe("Parser", [&]() { it("updates the parse tree and re-reads only the changed portion of the input", [&]() { chunk_size = 2; - ts_document_set_language(doc, get_test_language("javascript")); + ts_document_set_language(document, get_test_language("javascript")); set_text("123 + 456 * (10 + x);"); assert_root_node( @@ -278,7 +278,7 @@ describe("Parser", [&]() { describe("introducing an error", [&]() { it("gives the error the right size", [&]() { - ts_document_set_language(doc, get_test_language("javascript")); + ts_document_set_language(document, get_test_language("javascript")); set_text("var x = y;"); assert_root_node( @@ -301,7 +301,7 @@ describe("Parser", [&]() { describe("into the middle of an existing token", [&]() { it("updates the parse tree", [&]() { - ts_document_set_language(doc, get_test_language("javascript")); + ts_document_set_language(document, get_test_language("javascript")); set_text("abc * 123;"); assert_root_node( @@ -313,14 +313,14 @@ describe("Parser", [&]() { "(program 
(expression_statement (math_op (identifier) (number))))"); TSNode node = ts_node_named_descendant_for_char_range(root, 1, 1); - AssertThat(ts_node_type(node, doc), Equals("identifier")); + AssertThat(ts_node_type(node, document), Equals("identifier")); AssertThat(ts_node_end_byte(node), Equals(strlen("abXYZc"))); }); }); describe("at the end of an existing token", [&]() { it("updates the parse tree", [&]() { - ts_document_set_language(doc, get_test_language("javascript")); + ts_document_set_language(document, get_test_language("javascript")); set_text("abc * 123;"); assert_root_node( @@ -332,14 +332,14 @@ describe("Parser", [&]() { "(program (expression_statement (math_op (identifier) (number))))"); TSNode node = ts_node_named_descendant_for_char_range(root, 1, 1); - AssertThat(ts_node_type(node, doc), Equals("identifier")); + AssertThat(ts_node_type(node, document), Equals("identifier")); AssertThat(ts_node_end_byte(node), Equals(strlen("abcXYZ"))); }); }); describe("into a node containing a extra token", [&]() { it("updates the parse tree", [&]() { - ts_document_set_language(doc, get_test_language("javascript")); + ts_document_set_language(document, get_test_language("javascript")); set_text("123 *\n" "// a-comment\n" "abc;"); @@ -368,7 +368,7 @@ describe("Parser", [&]() { describe("deleting text", [&]() { describe("when a critical token is removed", [&]() { it("updates the parse tree, creating an error", [&]() { - ts_document_set_language(doc, get_test_language("javascript")); + ts_document_set_language(document, get_test_language("javascript")); set_text("123 * 456; 789 * 123;"); assert_root_node( @@ -388,7 +388,7 @@ describe("Parser", [&]() { describe("replacing text", [&]() { it("does not try to re-use nodes that are within the edited region", [&]() { - ts_document_set_language(doc, get_test_language("javascript")); + ts_document_set_language(document, get_test_language("javascript")); set_text("{ x: (b.c) };"); assert_root_node( @@ -405,7 +405,7 @@ 
describe("Parser", [&]() { describe("with external tokens", [&]() { it("maintains the external scanner's state during incremental parsing", [&]() { - ts_document_set_language(doc, get_test_language("python")); + ts_document_set_language(document, get_test_language("python")); string text = dedent(R"PYTHON( if a: print b @@ -427,21 +427,21 @@ describe("Parser", [&]() { }); it("updates the document's parse count", [&]() { - ts_document_set_language(doc, get_test_language("javascript")); - AssertThat(ts_document_parse_count(doc), Equals(0)); + ts_document_set_language(document, get_test_language("javascript")); + AssertThat(ts_document_parse_count(document), Equals(0)); set_text("{ x: (b.c) };"); - AssertThat(ts_document_parse_count(doc), Equals(1)); + AssertThat(ts_document_parse_count(document), Equals(1)); insert_text(strlen("{ x"), "yz"); - AssertThat(ts_document_parse_count(doc), Equals(2)); + AssertThat(ts_document_parse_count(document), Equals(2)); }); }); describe("lexing", [&]() { describe("handling tokens containing wildcard patterns (e.g. 
comments)", [&]() { it("terminates them at the end of the document", [&]() { - ts_document_set_language(doc, get_test_language("javascript")); + ts_document_set_language(document, get_test_language("javascript")); set_text("x; // this is a comment"); assert_root_node( @@ -456,7 +456,7 @@ describe("Parser", [&]() { it("recognizes UTF8 characters as single characters", [&]() { // 'ΩΩΩ — ΔΔ'; - ts_document_set_language(doc, get_test_language("javascript")); + ts_document_set_language(document, get_test_language("javascript")); set_text("'\u03A9\u03A9\u03A9 \u2014 \u0394\u0394';"); assert_root_node( From 83c4c03a70d4a24840bc5d93d406ca6ddfaa99fd Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Wed, 21 Dec 2016 11:37:08 -0800 Subject: [PATCH 28/50] Remove unnecessary describes in Parser spec --- spec/runtime/parser_spec.cc | 384 +++++++++++++++++------------------- 1 file changed, 181 insertions(+), 203 deletions(-) diff --git a/spec/runtime/parser_spec.cc b/spec/runtime/parser_spec.cc index 2e4c9d20..c06f5322 100644 --- a/spec/runtime/parser_spec.cc +++ b/spec/runtime/parser_spec.cc @@ -20,16 +20,12 @@ describe("Parser", [&]() { chunk_size = 3; input = nullptr; - document = ts_document_new(); }); after_each([&]() { - if (document) - ts_document_free(document); - - if (input) - delete input; + if (document) ts_document_free(document); + if (input) delete input; record_alloc::stop(); AssertThat(record_alloc::outstanding_allocation_indices(), IsEmpty()); @@ -45,26 +41,6 @@ describe("Parser", [&]() { input->clear(); }; - auto insert_text = [&](size_t position, string text) { - size_t prev_size = ts_node_end_byte(root); - ts_document_edit(document, input->replace(position, 0, text)); - ts_document_parse(document); - - root = ts_document_root_node(document); - size_t new_size = ts_node_end_byte(root); - AssertThat(new_size, Equals(prev_size + text.size())); - }; - - auto delete_text = [&](size_t position, size_t length) { - size_t prev_size = ts_node_end_byte(root); - 
ts_document_edit(document, input->replace(position, length, "")); - ts_document_parse(document); - - root = ts_document_root_node(document); - size_t new_size = ts_node_end_byte(root); - AssertThat(new_size, Equals(prev_size - length)); - }; - auto replace_text = [&](size_t position, size_t length, string new_text) { size_t prev_size = ts_node_end_byte(root); @@ -76,21 +52,29 @@ describe("Parser", [&]() { AssertThat(new_size, Equals(prev_size - length + new_text.size())); }; + auto insert_text = [&](size_t position, string text) { + replace_text(position, 0, text); + }; + + auto delete_text = [&](size_t position, size_t length) { + replace_text(position, length, ""); + }; + auto assert_root_node = [&](const string &expected) { TSNode node = ts_document_root_node(document); - char *str = ts_node_string(node, document); - string actual(str); - ts_free(str); + char *node_string = ts_node_string(node, document); + string actual(node_string); + ts_free(node_string); AssertThat(actual, Equals(expected)); }; - describe("handling errors", [&]() { - auto get_node_text = [&](TSNode node) { - size_t start = ts_node_start_byte(node); - size_t end = ts_node_end_byte(node); - return input->content.substr(start, end - start); - }; + auto get_node_text = [&](TSNode node) { + size_t start = ts_node_start_byte(node); + size_t end = ts_node_end_byte(node); + return input->content.substr(start, end - start); + }; + describe("handling errors", [&]() { describe("when there is an invalid substring right before a valid token", [&]() { it("computes the error node's size and position correctly", [&]() { ts_document_set_language(document, get_test_language("json")); @@ -183,7 +167,7 @@ describe("Parser", [&]() { describe("handling extra tokens", [&]() { describe("when the token appears as part of a grammar rule", [&]() { - it("is incorporated into the tree", [&]() { + it("incorporates it into the tree", [&]() { ts_document_set_language(document, get_test_language("javascript")); 
set_text("fn()\n"); @@ -193,7 +177,7 @@ describe("Parser", [&]() { }); describe("when the token appears somewhere else", [&]() { - it("is incorporated into the tree", [&]() { + it("incorporates it into the tree", [&]() { ts_document_set_language(document, get_test_language("javascript")); set_text( "fn()\n" @@ -209,7 +193,7 @@ describe("Parser", [&]() { }); describe("when several extra tokens appear in a row", [&]() { - it("is incorporated into the tree", [&]() { + it("incorporates them into the tree", [&]() { ts_document_set_language(document, get_test_language("javascript")); set_text( "fn()\n\n" @@ -229,177 +213,156 @@ describe("Parser", [&]() { }); describe("editing", [&]() { - describe("inserting text", [&]() { - describe("creating new tokens near the end of the input", [&]() { - it("updates the parse tree and re-reads only the changed portion of the text", [&]() { - ts_document_set_language(document, get_test_language("javascript")); - set_text("x * (100 + abc);"); - - assert_root_node( - "(program (expression_statement (math_op " - "(identifier) " - "(math_op (number) (identifier)))))"); - - insert_text(strlen("x * (100 + abc"), ".d"); - - assert_root_node( - "(program (expression_statement (math_op " - "(identifier) " - "(math_op (number) (member_access (identifier) (identifier))))))"); - - AssertThat(input->strings_read, Equals(vector({ " + abc.d)", "" }))); - }); - }); - - describe("creating new tokens near the beginning of the input", [&]() { - it("updates the parse tree and re-reads only the changed portion of the input", [&]() { - chunk_size = 2; - - ts_document_set_language(document, get_test_language("javascript")); - set_text("123 + 456 * (10 + x);"); - - assert_root_node( - "(program (expression_statement (math_op " - "(number) " - "(math_op (number) (math_op (number) (identifier))))))"); - - insert_text(strlen("123"), " || 5"); - - assert_root_node( - "(program (expression_statement (bool_op " - "(number) " - "(math_op " - "(number) " - "(math_op 
(number) (math_op (number) (identifier)))))))"); - - AssertThat(input->strings_read, Equals(vector({ "123 || 5 +", "" }))); - }); - }); - - describe("introducing an error", [&]() { - it("gives the error the right size", [&]() { - ts_document_set_language(document, get_test_language("javascript")); - set_text("var x = y;"); - - assert_root_node( - "(program (var_declaration (var_assignment " - "(identifier) (identifier))))"); - - insert_text(strlen("var x = y"), " *"); - - assert_root_node( - "(program (var_declaration (var_assignment " - "(identifier) (identifier)) (ERROR)))"); - - insert_text(strlen("var x = y *"), " z"); - - assert_root_node( - "(program (var_declaration (var_assignment " - "(identifier) (math_op (identifier) (identifier)))))"); - }); - }); - - describe("into the middle of an existing token", [&]() { - it("updates the parse tree", [&]() { - ts_document_set_language(document, get_test_language("javascript")); - set_text("abc * 123;"); - - assert_root_node( - "(program (expression_statement (math_op (identifier) (number))))"); - - insert_text(strlen("ab"), "XYZ"); - - assert_root_node( - "(program (expression_statement (math_op (identifier) (number))))"); - - TSNode node = ts_node_named_descendant_for_char_range(root, 1, 1); - AssertThat(ts_node_type(node, document), Equals("identifier")); - AssertThat(ts_node_end_byte(node), Equals(strlen("abXYZc"))); - }); - }); - - describe("at the end of an existing token", [&]() { - it("updates the parse tree", [&]() { - ts_document_set_language(document, get_test_language("javascript")); - set_text("abc * 123;"); - - assert_root_node( - "(program (expression_statement (math_op (identifier) (number))))"); - - insert_text(strlen("abc"), "XYZ"); - - assert_root_node( - "(program (expression_statement (math_op (identifier) (number))))"); - - TSNode node = ts_node_named_descendant_for_char_range(root, 1, 1); - AssertThat(ts_node_type(node, document), Equals("identifier")); - AssertThat(ts_node_end_byte(node), 
Equals(strlen("abcXYZ"))); - }); - }); - - describe("into a node containing a extra token", [&]() { - it("updates the parse tree", [&]() { - ts_document_set_language(document, get_test_language("javascript")); - set_text("123 *\n" - "// a-comment\n" - "abc;"); - - assert_root_node( - "(program (expression_statement (math_op " - "(number) " - "(comment) " - "(identifier))))"); - - insert_text( - strlen("123 *\n" - "// a-comment\n" - "abc"), - "XYZ"); - - assert_root_node( - "(program (expression_statement (math_op " - "(number) " - "(comment) " - "(identifier))))"); - }); - }); - }); - - describe("deleting text", [&]() { - describe("when a critical token is removed", [&]() { - it("updates the parse tree, creating an error", [&]() { - ts_document_set_language(document, get_test_language("javascript")); - set_text("123 * 456; 789 * 123;"); - - assert_root_node( - "(program " - "(expression_statement (math_op (number) (number))) " - "(expression_statement (math_op (number) (number))))"); - - delete_text(strlen("123 "), 2); - - assert_root_node( - "(program " - "(expression_statement (number) (ERROR (number))) " - "(expression_statement (math_op (number) (number))))"); - }); - }); - }); - - describe("replacing text", [&]() { - it("does not try to re-use nodes that are within the edited region", [&]() { + describe("creating new tokens near the end of the input", [&]() { + it("updates the parse tree and re-reads only the changed portion of the text", [&]() { ts_document_set_language(document, get_test_language("javascript")); - set_text("{ x: (b.c) };"); + set_text("x * (100 + abc);"); assert_root_node( - "(program (expression_statement (object (pair " - "(identifier) (member_access (identifier) (identifier))))))"); + "(program (expression_statement (math_op " + "(identifier) " + "(math_op (number) (identifier)))))"); - replace_text(strlen("{ x: "), strlen("(b.c)"), "b.c"); + insert_text(strlen("x * (100 + abc"), ".d"); assert_root_node( - "(program (expression_statement 
(object (pair " - "(identifier) (member_access (identifier) (identifier))))))"); + "(program (expression_statement (math_op " + "(identifier) " + "(math_op (number) (member_access (identifier) (identifier))))))"); + + AssertThat(input->strings_read, Equals(vector({ " + abc.d)", "" }))); + }); + }); + + describe("creating new tokens near the beginning of the input", [&]() { + it("updates the parse tree and re-reads only the changed portion of the input", [&]() { + chunk_size = 2; + + ts_document_set_language(document, get_test_language("javascript")); + set_text("123 + 456 * (10 + x);"); + + assert_root_node( + "(program (expression_statement (math_op " + "(number) " + "(math_op (number) (math_op (number) (identifier))))))"); + + insert_text(strlen("123"), " || 5"); + + assert_root_node( + "(program (expression_statement (bool_op " + "(number) " + "(math_op " + "(number) " + "(math_op (number) (math_op (number) (identifier)))))))"); + + AssertThat(input->strings_read, Equals(vector({ "123 || 5 +", "" }))); + }); + }); + + describe("introducing an error", [&]() { + it("gives the error the right size", [&]() { + ts_document_set_language(document, get_test_language("javascript")); + set_text("var x = y;"); + + assert_root_node( + "(program (var_declaration (var_assignment " + "(identifier) (identifier))))"); + + insert_text(strlen("var x = y"), " *"); + + assert_root_node( + "(program (var_declaration (var_assignment " + "(identifier) (identifier)) (ERROR)))"); + + insert_text(strlen("var x = y *"), " z"); + + assert_root_node( + "(program (var_declaration (var_assignment " + "(identifier) (math_op (identifier) (identifier)))))"); + }); + }); + + describe("into the middle of an existing token", [&]() { + it("updates the parse tree", [&]() { + ts_document_set_language(document, get_test_language("javascript")); + set_text("abc * 123;"); + + assert_root_node( + "(program (expression_statement (math_op (identifier) (number))))"); + + insert_text(strlen("ab"), "XYZ"); + + 
assert_root_node( + "(program (expression_statement (math_op (identifier) (number))))"); + + TSNode node = ts_node_named_descendant_for_char_range(root, 1, 1); + AssertThat(ts_node_type(node, document), Equals("identifier")); + AssertThat(ts_node_end_byte(node), Equals(strlen("abXYZc"))); + }); + }); + + describe("at the end of an existing token", [&]() { + it("updates the parse tree", [&]() { + ts_document_set_language(document, get_test_language("javascript")); + set_text("abc * 123;"); + + assert_root_node( + "(program (expression_statement (math_op (identifier) (number))))"); + + insert_text(strlen("abc"), "XYZ"); + + assert_root_node( + "(program (expression_statement (math_op (identifier) (number))))"); + + TSNode node = ts_node_named_descendant_for_char_range(root, 1, 1); + AssertThat(ts_node_type(node, document), Equals("identifier")); + AssertThat(ts_node_end_byte(node), Equals(strlen("abcXYZ"))); + }); + }); + + describe("inserting text into a node containing a extra token", [&]() { + it("updates the parse tree", [&]() { + ts_document_set_language(document, get_test_language("javascript")); + set_text("123 *\n" + "// a-comment\n" + "abc;"); + + assert_root_node( + "(program (expression_statement (math_op " + "(number) " + "(comment) " + "(identifier))))"); + + insert_text( + strlen("123 *\n" + "// a-comment\n" + "abc"), + "XYZ"); + + assert_root_node( + "(program (expression_statement (math_op " + "(number) " + "(comment) " + "(identifier))))"); + }); + }); + + describe("when a critical token is removed", [&]() { + it("updates the parse tree, creating an error", [&]() { + ts_document_set_language(document, get_test_language("javascript")); + set_text("123 * 456; 789 * 123;"); + + assert_root_node( + "(program " + "(expression_statement (math_op (number) (number))) " + "(expression_statement (math_op (number) (number))))"); + + delete_text(strlen("123 "), 2); + + assert_root_node( + "(program " + "(expression_statement (number) (ERROR (number))) " + 
"(expression_statement (math_op (number) (number))))"); }); }); @@ -426,6 +389,21 @@ describe("Parser", [&]() { }); }); + it("does not try to re-use nodes that are within the edited region", [&]() { + ts_document_set_language(document, get_test_language("javascript")); + set_text("{ x: (b.c) };"); + + assert_root_node( + "(program (expression_statement (object (pair " + "(identifier) (member_access (identifier) (identifier))))))"); + + replace_text(strlen("{ x: "), strlen("(b.c)"), "b.c"); + + assert_root_node( + "(program (expression_statement (object (pair " + "(identifier) (member_access (identifier) (identifier))))))"); + }); + it("updates the document's parse count", [&]() { ts_document_set_language(document, get_test_language("javascript")); AssertThat(ts_document_parse_count(document), Equals(0)); From b833942bb85c270071e7cf3b1ba4cb1430fa5990 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Wed, 21 Dec 2016 11:42:32 -0800 Subject: [PATCH 29/50] Clean up Document spec --- spec/runtime/document_spec.cc | 147 +++++++++++++++++----------------- 1 file changed, 72 insertions(+), 75 deletions(-) diff --git a/spec/runtime/document_spec.cc b/spec/runtime/document_spec.cc index de5b5f36..cc2efa60 100644 --- a/spec/runtime/document_spec.cc +++ b/spec/runtime/document_spec.cc @@ -16,22 +16,22 @@ TSPoint point(size_t row, size_t column) { START_TEST describe("Document", [&]() { - TSDocument *doc; + TSDocument *document; TSNode root; before_each([&]() { record_alloc::start(); - doc = ts_document_new(); + document = ts_document_new(); }); after_each([&]() { - ts_document_free(doc); + ts_document_free(document); record_alloc::stop(); AssertThat(record_alloc::outstanding_allocation_indices(), IsEmpty()); }); auto assert_node_string_equals = [&](TSNode node, const string &expected) { - char *str = ts_node_string(node, doc); + char *str = ts_node_string(node, document); string actual(str); ts_free(str); AssertThat(actual, Equals(expected)); @@ -43,11 +43,11 @@ 
describe("Document", [&]() { before_each([&]() { spy_input = new SpyInput("{\"key\": [null, 2]}", 3); - ts_document_set_language(doc, get_test_language("json")); - ts_document_set_input_string(doc, "{\"key\": [1, 2]}"); - ts_document_parse(doc); + ts_document_set_language(document, get_test_language("json")); + ts_document_set_input_string(document, "{\"key\": [1, 2]}"); + ts_document_parse(document); - root = ts_document_root_node(doc); + root = ts_document_root_node(document); assert_node_string_equals( root, "(object (pair (string) (array (number) (number))))"); @@ -62,11 +62,11 @@ describe("Document", [&]() { spy_input->content = string((const char *)content, sizeof(content)); spy_input->encoding = TSInputEncodingUTF16; - ts_document_set_input(doc, spy_input->input()); - ts_document_invalidate(doc); - ts_document_parse(doc); + ts_document_set_input(document, spy_input->input()); + ts_document_invalidate(document); + ts_document_parse(document); - root = ts_document_root_node(doc); + root = ts_document_root_node(document); assert_node_string_equals( root, "(array (true) (false))"); @@ -78,27 +78,27 @@ describe("Document", [&]() { spy_input->encoding = TSInputEncodingUTF16; // spy_input->measure_columns_in_bytes - ts_document_set_input(doc, spy_input->input()); - ts_document_invalidate(doc); - ts_document_parse(doc); + ts_document_set_input(document, spy_input->input()); + ts_document_invalidate(document); + ts_document_parse(document); }); it("allows the input to be retrieved later", [&]() { - ts_document_set_input(doc, spy_input->input()); - AssertThat(ts_document_input(doc).payload, Equals(spy_input)); - AssertThat(ts_document_input(doc).read, Equals(spy_input->input().read)); - AssertThat(ts_document_input(doc).seek, Equals(spy_input->input().seek)); + ts_document_set_input(document, spy_input->input()); + AssertThat(ts_document_input(document).payload, Equals(spy_input)); + AssertThat(ts_document_input(document).read, Equals(spy_input->input().read)); + 
AssertThat(ts_document_input(document).seek, Equals(spy_input->input().seek)); }); it("does not assume that the document's text has changed", [&]() { - ts_document_set_input(doc, spy_input->input()); - AssertThat(ts_document_root_node(doc), Equals(root)); + ts_document_set_input(document, spy_input->input()); + AssertThat(ts_document_root_node(document), Equals(root)); AssertThat(ts_node_has_changes(root), IsFalse()); AssertThat(spy_input->strings_read, Equals(vector({ "" }))); }); it("reads text from the new input for future parses", [&]() { - ts_document_set_input(doc, spy_input->input()); + ts_document_set_input(document, spy_input->input()); // Insert 'null', delete '1'. TSInputEdit edit = {}; @@ -106,10 +106,10 @@ describe("Document", [&]() { edit.extent_added.column = edit.bytes_added = 4; edit.extent_removed.column = edit.bytes_removed = 1; - ts_document_edit(doc, edit); - ts_document_parse(doc); + ts_document_edit(document, edit); + ts_document_parse(document); - TSNode new_root = ts_document_root_node(doc); + TSNode new_root = ts_document_root_node(document); assert_node_string_equals( new_root, "(object (pair (string) (array (null) (number))))"); @@ -117,17 +117,17 @@ describe("Document", [&]() { }); it("reads from the new input correctly when the old input was blank", [&]() { - ts_document_set_input_string(doc, ""); - ts_document_parse(doc); - TSNode new_root = ts_document_root_node(doc); + ts_document_set_input_string(document, ""); + ts_document_parse(document); + TSNode new_root = ts_document_root_node(document); AssertThat(ts_node_end_char(new_root), Equals(0)); assert_node_string_equals( new_root, "(ERROR)"); - ts_document_set_input_string(doc, "1"); - ts_document_parse(doc); - new_root = ts_document_root_node(doc); + ts_document_set_input_string(document, "1"); + ts_document_parse(document); + new_root = ts_document_root_node(document); AssertThat(ts_node_end_char(new_root), Equals(1)); assert_node_string_equals( new_root, @@ -137,28 +137,28 @@ 
describe("Document", [&]() { describe("set_language(language)", [&]() { before_each([&]() { - ts_document_set_input_string(doc, "{\"key\": [1, 2]}\n"); + ts_document_set_input_string(document, "{\"key\": [1, 2]}\n"); }); it("uses the given language for future parses", [&]() { - ts_document_set_language(doc, get_test_language("json")); - ts_document_parse(doc); + ts_document_set_language(document, get_test_language("json")); + ts_document_parse(document); - root = ts_document_root_node(doc); + root = ts_document_root_node(document); assert_node_string_equals( root, "(object (pair (string) (array (number) (number))))"); }); it("clears out any previous tree", [&]() { - ts_document_set_language(doc, get_test_language("json")); - ts_document_parse(doc); + ts_document_set_language(document, get_test_language("json")); + ts_document_parse(document); - ts_document_set_language(doc, get_test_language("javascript")); - AssertThat(ts_document_root_node(doc).data, Equals(nullptr)); + ts_document_set_language(document, get_test_language("javascript")); + AssertThat(ts_document_root_node(document).data, Equals(nullptr)); - ts_document_parse(doc); - root = ts_document_root_node(doc); + ts_document_parse(document); + root = ts_document_root_node(document); assert_node_string_equals( root, "(program (expression_statement " @@ -171,8 +171,8 @@ describe("Document", [&]() { before_each([&]() { logger = new SpyLogger(); - ts_document_set_language(doc, get_test_language("json")); - ts_document_set_input_string(doc, "[1, 2]"); + ts_document_set_language(document, get_test_language("json")); + ts_document_set_input_string(document, "[1, 2]"); }); after_each([&]() { @@ -180,8 +180,8 @@ describe("Document", [&]() { }); it("calls the debugger with a message for each parse action", [&]() { - ts_document_set_logger(doc, logger->logger()); - ts_document_parse(doc); + ts_document_set_logger(document, logger->logger()); + ts_document_parse(document); AssertThat(logger->messages, 
Contains("new_parse")); AssertThat(logger->messages, Contains("skip character:' '")); @@ -192,18 +192,18 @@ describe("Document", [&]() { }); it("allows the debugger to be retrieved later", [&]() { - ts_document_set_logger(doc, logger->logger()); - AssertThat(ts_document_logger(doc).payload, Equals(logger)); + ts_document_set_logger(document, logger->logger()); + AssertThat(ts_document_logger(document).payload, Equals(logger)); }); describe("disabling debugging", [&]() { before_each([&]() { - ts_document_set_logger(doc, logger->logger()); - ts_document_set_logger(doc, {NULL, NULL}); + ts_document_set_logger(document, logger->logger()); + ts_document_set_logger(document, {NULL, NULL}); }); it("does not call the debugger any more", [&]() { - ts_document_parse(doc); + ts_document_parse(document); AssertThat(logger->messages, IsEmpty()); }); }); @@ -213,12 +213,12 @@ describe("Document", [&]() { SpyInput *input; before_each([&]() { - ts_document_set_language(doc, get_test_language("javascript")); + ts_document_set_language(document, get_test_language("javascript")); input = new SpyInput("{a: null};", 3); - ts_document_set_input(doc, input->input()); - ts_document_parse(doc); + ts_document_set_input(document, input->input()); + ts_document_parse(document); assert_node_string_equals( - ts_document_root_node(doc), + ts_document_root_node(document), "(program (expression_statement (object (pair (identifier) (null)))))"); }); @@ -226,26 +226,25 @@ describe("Document", [&]() { delete input; }); - auto get_ranges = [&](std::function callback) -> vector { + auto get_invalidated_ranges_for_edit = [&](std::function callback) -> vector { TSInputEdit edit = callback(); - ts_document_edit(doc, edit); + ts_document_edit(document, edit); TSRange *ranges; uint32_t range_count = 0; - - ts_document_parse_and_get_changed_ranges(doc, &ranges, &range_count); + ts_document_parse_and_get_changed_ranges(document, &ranges, &range_count); vector result; - for (size_t i = 0; i < range_count; i++) 
+ for (size_t i = 0; i < range_count; i++) { result.push_back(ranges[i]); + } ts_free(ranges); - return result; }; it("reports changes when one token has been updated", [&]() { // Replace `null` with `nothing` - auto ranges = get_ranges([&]() { + auto ranges = get_invalidated_ranges_for_edit([&]() { return input->replace(input->content.find("ull"), 1, "othing"); }); @@ -257,7 +256,7 @@ describe("Document", [&]() { }))); // Replace `nothing` with `null` again - ranges = get_ranges([&]() { + ranges = get_invalidated_ranges_for_edit([&]() { return input->undo(); }); @@ -271,7 +270,7 @@ describe("Document", [&]() { it("reports changes when tokens have been appended", [&]() { // Add a second key-value pair - auto ranges = get_ranges([&]() { + auto ranges = get_invalidated_ranges_for_edit([&]() { return input->replace(input->content.find("}"), 0, ", b: false"); }); @@ -283,12 +282,12 @@ describe("Document", [&]() { }))); // Add a third key-value pair in between the first two - ranges = get_ranges([&]() { + ranges = get_invalidated_ranges_for_edit([&]() { return input->replace(input->content.find(", b"), 0, ", c: 1"); }); assert_node_string_equals( - ts_document_root_node(doc), + ts_document_root_node(document), "(program (expression_statement (object " "(pair (identifier) (null)) " "(pair (identifier) (number)) " @@ -302,41 +301,39 @@ describe("Document", [&]() { }))); // Delete the middle pair. - ranges = get_ranges([&]() { + ranges = get_invalidated_ranges_for_edit([&]() { return input->undo(); }); assert_node_string_equals( - ts_document_root_node(doc), + ts_document_root_node(document), "(program (expression_statement (object " "(pair (identifier) (null)) " "(pair (identifier) (false)))))"); - AssertThat(ranges, Equals(vector({ - }))); + AssertThat(ranges, IsEmpty()); // Delete the second pair. 
- ranges = get_ranges([&]() { + ranges = get_invalidated_ranges_for_edit([&]() { return input->undo(); }); assert_node_string_equals( - ts_document_root_node(doc), + ts_document_root_node(document), "(program (expression_statement (object " "(pair (identifier) (null)))))"); - AssertThat(ranges, Equals(vector({ - }))); + AssertThat(ranges, IsEmpty()); }); it("reports changes when trees have been wrapped", [&]() { // Wrap the object in an assignment expression. - auto ranges = get_ranges([&]() { + auto ranges = get_invalidated_ranges_for_edit([&]() { return input->replace(input->content.find("null"), 0, "b === "); }); assert_node_string_equals( - ts_document_root_node(doc), + ts_document_root_node(document), "(program (expression_statement (object " "(pair (identifier) (rel_op (identifier) (null))))))"); From 1595a02692ff195d9c7fc0d48d9ee4c410047f30 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Wed, 21 Dec 2016 12:23:24 -0800 Subject: [PATCH 30/50] Avoid referencing invalid union member in tree_set_children --- src/runtime/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/runtime/tree.c b/src/runtime/tree.c index e788cb02..dfc45c0c 100644 --- a/src/runtime/tree.c +++ b/src/runtime/tree.c @@ -127,7 +127,7 @@ void ts_tree_set_children(Tree *self, uint32_t child_count, Tree **children) { self->visible_child_count++; if (child->named) self->named_child_count++; - } else { + } else if (child->child_count > 0) { self->visible_child_count += child->visible_child_count; self->named_child_count += child->named_child_count; } From 4136dad5de28b540281a251cb7703d4a94864085 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Wed, 21 Dec 2016 13:21:21 -0800 Subject: [PATCH 31/50] Avoid referencing invalid union member in tree_path_descend --- src/runtime/tree_path.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/runtime/tree_path.h b/src/runtime/tree_path.h index bba32718..6fd4ef97 100644 --- a/src/runtime/tree_path.h +++ 
b/src/runtime/tree_path.h @@ -37,7 +37,7 @@ static bool tree_path_descend(TreePath *path, TSPoint position) { if (child->visible) { array_push(path, child_entry); return true; - } else if (child->visible_child_count > 0) { + } else if (child->child_count > 0 && child->visible_child_count > 0) { array_push(path, child_entry); did_descend = true; break; From 3706678b8971892ff18e4bf7410940720781b04a Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Wed, 21 Dec 2016 13:58:18 -0800 Subject: [PATCH 32/50] Pass const TSExternalTokenState to external scanner deserialize hook --- include/tree_sitter/parser.h | 2 +- src/compiler/generate_code/c_code.cc | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/tree_sitter/parser.h b/include/tree_sitter/parser.h index eea5f76f..8e5658f4 100644 --- a/include/tree_sitter/parser.h +++ b/include/tree_sitter/parser.h @@ -82,7 +82,7 @@ typedef struct TSLanguage { void (*reset)(void *); bool (*scan)(void *, TSLexer *, const bool *symbol_whitelist); bool (*serialize)(void *, TSExternalTokenState); - void (*deserialize)(void *, TSExternalTokenState); + void (*deserialize)(void *, const TSExternalTokenState); } external_scanner; } TSLanguage; diff --git a/src/compiler/generate_code/c_code.cc b/src/compiler/generate_code/c_code.cc index d5eab8b0..a28648c8 100644 --- a/src/compiler/generate_code/c_code.cc +++ b/src/compiler/generate_code/c_code.cc @@ -361,7 +361,7 @@ class CCodeGenerator { line("void " + external_scanner_name + "_reset(void *);"); line("bool " + external_scanner_name + "_scan(void *, TSLexer *, const bool *);"); line("bool " + external_scanner_name + "_serialize(void *, TSExternalTokenState);"); - line("void " + external_scanner_name + "_deserialize(void *, TSExternalTokenState);"); + line("void " + external_scanner_name + "_deserialize(void *, const TSExternalTokenState);"); line(); } From 2fa7b453c8ff6ed3de528f2e5c2a79b985922f75 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Wed, 21 Dec 2016 
13:59:56 -0800 Subject: [PATCH 33/50] Restore external scanner's state only after repositioning lexer Also, properly identify the leaf node with the external token state --- src/runtime/lexer.c | 16 ++++++++++++++-- src/runtime/lexer.h | 2 ++ src/runtime/parser.c | 10 ++++++++-- src/runtime/tree.c | 13 +++++++++++++ src/runtime/tree.h | 1 + 5 files changed, 38 insertions(+), 4 deletions(-) diff --git a/src/runtime/lexer.c b/src/runtime/lexer.c index acf394bb..f7ebf042 100644 --- a/src/runtime/lexer.c +++ b/src/runtime/lexer.c @@ -88,6 +88,8 @@ void ts_lexer_init(Lexer *self) { .payload = NULL, .log = NULL }, + .needs_to_restore_external_scanner = false, + .last_external_token_end_byte = 0, }; ts_lexer_reset(self, length_zero()); } @@ -110,12 +112,22 @@ static inline void ts_lexer__reset(Lexer *self, Length position) { void ts_lexer_set_input(Lexer *self, TSInput input) { self->input = input; ts_lexer__reset(self, length_zero()); + self->needs_to_restore_external_scanner = false; + self->last_external_token_end_byte = 0; } void ts_lexer_reset(Lexer *self, Length position) { - if (!length_eq(position, self->current_position)) + if (position.bytes > self->current_position.bytes) { + self->needs_to_restore_external_scanner = true; + self->last_external_token_end_byte = 0; ts_lexer__reset(self, position); - return; + } else if (position.bytes < self->current_position.bytes) { + if (position.bytes < self->last_external_token_end_byte) { + self->needs_to_restore_external_scanner = true; + self->last_external_token_end_byte = 0; + } + ts_lexer__reset(self, position); + } } void ts_lexer_start(Lexer *self) { diff --git a/src/runtime/lexer.h b/src/runtime/lexer.h index 682c3f93..76d863c4 100644 --- a/src/runtime/lexer.h +++ b/src/runtime/lexer.h @@ -25,6 +25,8 @@ typedef struct { TSInput input; TSLogger logger; char debug_buffer[TS_DEBUG_BUFFER_SIZE]; + bool needs_to_restore_external_scanner; + uint32_t last_external_token_end_byte; } Lexer; void ts_lexer_init(Lexer *); diff 
--git a/src/runtime/parser.c b/src/runtime/parser.c index f5b08f82..b503bea0 100644 --- a/src/runtime/parser.c +++ b/src/runtime/parser.c @@ -217,8 +217,11 @@ static StackIterateAction parser__restore_external_scanner_callback( Parser *self = payload; if (tree_count > 0) { Tree *tree = *array_back(trees); - if (tree->has_external_token_state && tree->child_count == 0) { - self->language->external_scanner.deserialize(self->external_scanner_payload, tree->external_token_state); + if (tree->has_external_token_state) { + self->language->external_scanner.deserialize( + self->external_scanner_payload, + *ts_tree_last_external_token_state(tree) + ); return StackIterateStop; } } else if (is_done) { @@ -230,6 +233,7 @@ static StackIterateAction parser__restore_external_scanner_callback( } static void parser__restore_external_scanner(Parser *self, StackVersion version) { + if (!self->lexer.needs_to_restore_external_scanner) return; StackPopResult pop = ts_stack_iterate(self->stack, version, parser__restore_external_scanner_callback, self); if (pop.slices.size > 0) { StackSlice slice = pop.slices.contents[0]; @@ -269,6 +273,8 @@ static Tree *parser__lex(Parser *self, StackVersion version) { ts_lexer_start(&self->lexer); if (self->language->external_scanner.scan(self->external_scanner_payload, &self->lexer.data, external_tokens)) { + self->lexer.last_external_token_end_byte = self->lexer.current_position.bytes; + self->lexer.needs_to_restore_external_scanner = false; found_external_token = true; break; } diff --git a/src/runtime/tree.c b/src/runtime/tree.c index dfc45c0c..ebcca441 100644 --- a/src/runtime/tree.c +++ b/src/runtime/tree.c @@ -379,6 +379,19 @@ void ts_tree_edit(Tree *self, const TSInputEdit *edit) { } } +const TSExternalTokenState *ts_tree_last_external_token_state(const Tree *tree) { + while (tree->child_count > 0) { + for (uint32_t i = tree->child_count - 1; i + 1 > 0; i--) { + Tree *child = tree->children[i]; + if (child->has_external_token_state) { + tree = 
child; + break; + } + } + } + return &tree->external_token_state; +} + static size_t ts_tree__write_char_to_string(char *s, size_t n, int32_t c) { if (c == 0) return snprintf(s, n, "EOF"); diff --git a/src/runtime/tree.h b/src/runtime/tree.h index 425fac51..d5916e31 100644 --- a/src/runtime/tree.h +++ b/src/runtime/tree.h @@ -86,6 +86,7 @@ void ts_tree_assign_parents(Tree *, TreePath *); void ts_tree_edit(Tree *, const TSInputEdit *edit); char *ts_tree_string(const Tree *, const TSLanguage *, bool include_all); void ts_tree_print_dot_graph(const Tree *, const TSLanguage *, FILE *); +const TSExternalTokenState *ts_tree_last_external_token_state(const Tree *); static inline uint32_t ts_tree_total_bytes(const Tree *self) { return self->padding.bytes + self->size.bytes; From 0f039721dd316bd205d956ccc362a8107e596248 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Mon, 26 Dec 2016 11:08:35 -0800 Subject: [PATCH 34/50] Compile external scanners w/ debug symbols in test suite --- spec/helpers/load_language.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/spec/helpers/load_language.cc b/spec/helpers/load_language.cc index a2b89027..84873d1a 100644 --- a/spec/helpers/load_language.cc +++ b/spec/helpers/load_language.cc @@ -96,6 +96,7 @@ const TSLanguage *load_language(const string &source_filename, }; if (!external_scanner_filename.empty()) { + compile_args.push_back("-g"); string extension = external_scanner_filename.substr(external_scanner_filename.rfind(".")); if (extension == ".c") { compile_args.push_back("-xc"); From adae1b16a61a305566af20443b1691bb8fd62f4b Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Mon, 26 Dec 2016 13:34:05 -0800 Subject: [PATCH 35/50] Remove shared setup in tree spec --- spec/runtime/tree_spec.cc | 198 +++++++++++++++++++++----------------- 1 file changed, 110 insertions(+), 88 deletions(-) diff --git a/spec/runtime/tree_spec.cc b/spec/runtime/tree_spec.cc index 9f451829..b67cd0a8 100644 --- a/spec/runtime/tree_spec.cc +++ 
b/spec/runtime/tree_spec.cc @@ -22,47 +22,29 @@ void assert_consistent(const Tree *tree) { START_TEST -enum { - cat = 1, - dog, - eel, - fox, - goat, - hog, -}; - describe("Tree", []() { - Tree *tree1, *tree2, *parent1; + enum { + symbol1 = 1, + symbol2, + symbol3, + symbol4, + symbol5, + symbol6, + }; + TSSymbolMetadata visible = {true, true, false, true}; TSSymbolMetadata invisible = {false, false, false, true}; - before_each([&]() { - tree1 = ts_tree_make_leaf(cat, {2, 1, {0, 1}}, {5, 4, {0, 4}}, visible); - tree2 = ts_tree_make_leaf(cat, {1, 1, {0, 1}}, {3, 3, {0, 3}}, visible); - - ts_tree_retain(tree1); - ts_tree_retain(tree2); - parent1 = ts_tree_make_node(dog, 2, tree_array({ - tree1, - tree2, - }), visible); - }); - - after_each([&]() { - ts_tree_release(tree1); - ts_tree_release(tree2); - ts_tree_release(parent1); - }); - - describe("make_leaf(sym, size, padding, is_hidden)", [&]() { - it("does not record that it is fragile", [&]() { - AssertThat(tree1->fragile_left, IsFalse()); - AssertThat(tree1->fragile_right, IsFalse()); + describe("make_leaf", [&]() { + it("does not mark the tree as fragile", [&]() { + Tree *tree = ts_tree_make_leaf(symbol1, {2, 1, {0, 1}}, {5, 4, {0, 4}}, visible); + AssertThat(tree->fragile_left, IsFalse()); + AssertThat(tree->fragile_right, IsFalse()); }); }); - describe("make_error(size, padding, lookahead_char)", [&]() { - it("records that it is fragile", [&]() { + describe("make_error", [&]() { + it("marks the tree as fragile", [&]() { Tree *error_tree = ts_tree_make_error( length_zero(), length_zero(), @@ -75,15 +57,33 @@ describe("Tree", []() { }); }); - describe("make_node(symbol, child_count, children, is_hidden)", [&]() { - it("computes its size based on its child nodes", [&]() { - AssertThat(parent1->size.bytes, Equals( - tree1->size.bytes + + tree2->padding.bytes + tree2->size.bytes)); - AssertThat(parent1->size.chars, Equals( - tree1->size.chars + + tree2->padding.chars + tree2->size.chars)); + describe("make_node", 
[&]() { + Tree *tree1, *tree2, *parent1; + + before_each([&]() { + tree1 = ts_tree_make_leaf(symbol1, {2, 1, {0, 1}}, {5, 4, {0, 4}}, visible); + tree2 = ts_tree_make_leaf(symbol2, {1, 1, {0, 1}}, {3, 3, {0, 3}}, visible); + + ts_tree_retain(tree1); + ts_tree_retain(tree2); + parent1 = ts_tree_make_node(symbol3, 2, tree_array({ + tree1, + tree2, + }), visible); }); - it("computes its padding based on its first child", [&]() { + after_each([&]() { + ts_tree_release(tree1); + ts_tree_release(tree2); + ts_tree_release(parent1); + }); + + it("computes its size and padding based on its child nodes", [&]() { + AssertThat(parent1->size.bytes, Equals( + tree1->size.bytes + tree2->padding.bytes + tree2->size.bytes)); + AssertThat(parent1->size.chars, Equals( + tree1->size.chars + tree2->padding.chars + tree2->size.chars)); + AssertThat(parent1->padding.bytes, Equals(tree1->padding.bytes)); AssertThat(parent1->padding.chars, Equals(tree1->padding.chars)); }); @@ -97,7 +97,7 @@ describe("Tree", []() { ts_tree_retain(tree1); ts_tree_retain(tree2); - parent = ts_tree_make_node(eel, 2, tree_array({ + parent = ts_tree_make_node(symbol3, 2, tree_array({ tree1, tree2, }), visible); @@ -121,7 +121,7 @@ describe("Tree", []() { ts_tree_retain(tree1); ts_tree_retain(tree2); - parent = ts_tree_make_node(eel, 2, tree_array({ + parent = ts_tree_make_node(symbol3, 2, tree_array({ tree1, tree2, }), visible); @@ -145,7 +145,7 @@ describe("Tree", []() { ts_tree_retain(tree1); ts_tree_retain(tree2); - parent = ts_tree_make_node(eel, 2, tree_array({ + parent = ts_tree_make_node(symbol3, 2, tree_array({ tree1, tree2, }), visible); @@ -162,14 +162,14 @@ describe("Tree", []() { }); }); - describe("edit(InputEdit)", [&]() { + describe("edit", [&]() { Tree *tree = nullptr; before_each([&]() { - tree = ts_tree_make_node(cat, 3, tree_array({ - ts_tree_make_leaf(dog, {2, 2, {0, 2}}, {3, 3, {0, 3}}, visible), - ts_tree_make_leaf(eel, {2, 2, {0, 2}}, {3, 3, {0, 3}}, visible), - ts_tree_make_leaf(fox, {2, 
2, {0, 2}}, {3, 3, {0, 3}}, visible), + tree = ts_tree_make_node(symbol1, 3, tree_array({ + ts_tree_make_leaf(symbol2, {2, 2, {0, 2}}, {3, 3, {0, 3}}, visible), + ts_tree_make_leaf(symbol3, {2, 2, {0, 2}}, {3, 3, {0, 3}}, visible), + ts_tree_make_leaf(symbol4, {2, 2, {0, 2}}, {3, 3, {0, 3}}, visible), }), visible); AssertThat(tree->padding, Equals({2, 2, {0, 2}})); @@ -180,7 +180,6 @@ describe("Tree", []() { ts_tree_release(tree); }); - describe("edits within a tree's padding", [&]() { it("resizes the padding of the tree and its leftmost descendants", [&]() { TSInputEdit edit; @@ -312,68 +311,91 @@ describe("Tree", []() { }); }); - describe("equality", [&]() { + describe("eq", [&]() { + Tree *leaf; + + before_each([&]() { + leaf = ts_tree_make_leaf(symbol1, {2, 1, {0, 1}}, {5, 4, {0, 4}}, visible); + }); + + after_each([&]() { + ts_tree_release(leaf); + }); + it("returns true for identical trees", [&]() { - Tree *tree1_copy = ts_tree_make_leaf(cat, {2, 1, {1, 1}}, {5, 4, {1, 4}}, visible); - AssertThat(ts_tree_eq(tree1, tree1_copy), IsTrue()); + Tree *leaf_copy = ts_tree_make_leaf(symbol1, {2, 1, {1, 1}}, {5, 4, {1, 4}}, visible); + AssertThat(ts_tree_eq(leaf, leaf_copy), IsTrue()); - Tree *tree2_copy = ts_tree_make_leaf(cat, {1, 1, {0, 1}}, {3, 3, {0, 3}}, visible); - AssertThat(ts_tree_eq(tree2, tree2_copy), IsTrue()); - - Tree *parent2 = ts_tree_make_node(dog, 2, tree_array({ - tree1_copy, - tree2_copy, + Tree *parent = ts_tree_make_node(symbol2, 2, tree_array({ + leaf, + leaf_copy, }), visible); + ts_tree_retain(leaf); + ts_tree_retain(leaf_copy); - AssertThat(ts_tree_eq(parent1, parent2), IsTrue()); + Tree *parent_copy = ts_tree_make_node(symbol2, 2, tree_array({ + leaf, + leaf_copy, + }), visible); + ts_tree_retain(leaf); + ts_tree_retain(leaf_copy); - ts_tree_release(parent2); + AssertThat(ts_tree_eq(parent, parent_copy), IsTrue()); + + ts_tree_release(leaf_copy); + ts_tree_release(parent); + ts_tree_release(parent_copy); }); it("returns false for trees with 
different symbols", [&]() { - Tree *different_tree = ts_tree_make_leaf( - tree1->symbol + 1, - tree1->padding, - tree1->size, + Tree *different_leaf = ts_tree_make_leaf( + leaf->symbol + 1, + leaf->padding, + leaf->size, visible); - AssertThat(ts_tree_eq(tree1, different_tree), IsFalse()); - ts_tree_release(different_tree); + AssertThat(ts_tree_eq(leaf, different_leaf), IsFalse()); + ts_tree_release(different_leaf); }); it("returns false for trees with different options", [&]() { - Tree *tree1_copy = ts_tree_make_leaf(cat, tree1->padding, tree1->size, invisible); - AssertThat(ts_tree_eq(tree1, tree1_copy), IsFalse()); - ts_tree_release(tree1_copy); + Tree *different_leaf = ts_tree_make_leaf(symbol1, leaf->padding, leaf->size, invisible); + AssertThat(ts_tree_eq(leaf, different_leaf), IsFalse()); + ts_tree_release(different_leaf); }); it("returns false for trees with different sizes", [&]() { - Tree *tree1_copy = ts_tree_make_leaf(cat, {2, 1, {0, 1}}, tree1->size, invisible); - AssertThat(ts_tree_eq(tree1, tree1_copy), IsFalse()); - ts_tree_release(tree1_copy); + Tree *different_leaf = ts_tree_make_leaf(symbol1, {2, 1, {0, 1}}, leaf->size, invisible); + AssertThat(ts_tree_eq(leaf, different_leaf), IsFalse()); + ts_tree_release(different_leaf); - tree1_copy = ts_tree_make_leaf(cat, tree1->padding, {5, 4, {1, 10}}, invisible); - AssertThat(ts_tree_eq(tree1, tree1_copy), IsFalse()); - ts_tree_release(tree1_copy); + different_leaf = ts_tree_make_leaf(symbol1, leaf->padding, {5, 4, {1, 10}}, invisible); + AssertThat(ts_tree_eq(leaf, different_leaf), IsFalse()); + ts_tree_release(different_leaf); }); it("returns false for trees with different children", [&]() { - Tree *different_tree = ts_tree_make_leaf( - tree1->symbol + 1, - tree1->padding, - tree1->size, - visible); + Tree *leaf2 = ts_tree_make_leaf(symbol2, {1, 1, {0, 1}}, {3, 3, {0, 3}}, visible); - ts_tree_retain(different_tree); - ts_tree_retain(tree2); - Tree *different_parent = ts_tree_make_node(dog, 2, 
tree_array({ - different_tree, tree2, + Tree *parent = ts_tree_make_node(symbol2, 2, tree_array({ + leaf, + leaf2, }), visible); + ts_tree_retain(leaf); + ts_tree_retain(leaf2); - AssertThat(ts_tree_eq(different_parent, parent1), IsFalse()); - AssertThat(ts_tree_eq(parent1, different_parent), IsFalse()); + Tree *different_parent = ts_tree_make_node(symbol2, 2, tree_array({ + leaf2, + leaf, + }), visible); + ts_tree_retain(leaf2); + ts_tree_retain(leaf); - ts_tree_release(different_tree); + AssertThat(ts_tree_eq(different_parent, parent), IsFalse()); + AssertThat(ts_tree_eq(parent, different_parent), IsFalse()); + + ts_tree_release(leaf2); + ts_tree_release(parent); ts_tree_release(different_parent); }); }); From d57043b66555b631236e54c7ac47aa1fdd20d627 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Wed, 4 Jan 2017 21:22:23 -0800 Subject: [PATCH 36/50] Add ability to store external token state per stack version --- spec/runtime/stack_spec.cc | 25 +++++++++++++++++++++++++ src/runtime/stack.c | 28 ++++++++++++++++++++++++---- src/runtime/stack.h | 6 +++++- 3 files changed, 54 insertions(+), 5 deletions(-) diff --git a/spec/runtime/stack_spec.cc b/spec/runtime/stack_spec.cc index 4d4b01fd..20180843 100644 --- a/spec/runtime/stack_spec.cc +++ b/spec/runtime/stack_spec.cc @@ -521,6 +521,31 @@ describe("Stack", [&]() { free_slice_array(&pop.slices); }); }); + + describe("setting external token state", [&]() { + TSExternalTokenState external_token_state1, external_token_state2; + + it("allows the state to be retrieved", [&]() { + AssertThat(ts_stack_external_token_state(stack, 0), Equals(nullptr)); + + ts_stack_set_external_token_state(stack, 0, &external_token_state1); + AssertThat(ts_stack_external_token_state(stack, 0), Equals(&external_token_state1)); + + ts_stack_copy_version(stack, 0); + AssertThat(ts_stack_external_token_state(stack, 0), Equals(&external_token_state1)); + }); + + it("does not merge stack versions with different external token states", [&]() { + 
ts_stack_copy_version(stack, 0); + ts_stack_push(stack, 0, trees[0], false, 5); + ts_stack_push(stack, 1, trees[0], false, 5); + + ts_stack_set_external_token_state(stack, 0, &external_token_state1); + ts_stack_set_external_token_state(stack, 1, &external_token_state2); + + AssertThat(ts_stack_merge(stack, 0, 1), IsFalse()); + }); + }); }); END_TEST diff --git a/src/runtime/stack.c b/src/runtime/stack.c index bdc5945c..198cce4d 100644 --- a/src/runtime/stack.c +++ b/src/runtime/stack.c @@ -50,6 +50,7 @@ typedef struct { StackNode *node; bool is_halted; unsigned push_count; + const TSExternalTokenState *external_token_state; } StackHead; struct Stack { @@ -288,7 +289,12 @@ Stack *ts_stack_new() { self->base_node = stack_node_new(NULL, NULL, false, 1, length_zero(), &self->node_pool); stack_node_retain(self->base_node); - array_push(&self->heads, ((StackHead){ self->base_node, false, 0 })); + array_push(&self->heads, ((StackHead){ + self->base_node, + false, + 0, + NULL + })); return self; } @@ -327,11 +333,19 @@ unsigned ts_stack_push_count(const Stack *self, StackVersion version) { return array_get(&self->heads, version)->push_count; } -void ts_stack_decrease_push_count(const Stack *self, StackVersion version, +void ts_stack_decrease_push_count(Stack *self, StackVersion version, unsigned decrement) { array_get(&self->heads, version)->push_count -= decrement; } +const TSExternalTokenState *ts_stack_external_token_state(const Stack *self, StackVersion version) { + return array_get(&self->heads, version)->external_token_state; +} + +void ts_stack_set_external_token_state(Stack *self, StackVersion version, const TSExternalTokenState *state) { + array_get(&self->heads, version)->external_token_state = state; +} + ErrorStatus ts_stack_error_status(const Stack *self, StackVersion version) { StackHead *head = array_get(&self->heads, version); return (ErrorStatus){ @@ -480,7 +494,8 @@ bool ts_stack_merge(Stack *self, StackVersion version, StackVersion new_version) if
(new_node->state == node->state && new_node->position.chars == node->position.chars && new_node->error_count == node->error_count && - new_node->error_cost == node->error_cost) { + new_node->error_cost == node->error_cost && + new_head->external_token_state == head->external_token_state) { for (uint32_t j = 0; j < new_node->link_count; j++) stack_node_add_link(node, new_node->links[j]); if (new_head->push_count > head->push_count) @@ -505,7 +520,12 @@ void ts_stack_clear(Stack *self) { for (uint32_t i = 0; i < self->heads.size; i++) stack_node_release(self->heads.contents[i].node, &self->node_pool); array_clear(&self->heads); - array_push(&self->heads, ((StackHead){ self->base_node, false, 0 })); + array_push(&self->heads, ((StackHead){ + self->base_node, + false, + 0, + NULL + })); } bool ts_stack_print_dot_graph(Stack *self, const char **symbol_names, FILE *f) { diff --git a/src/runtime/stack.h b/src/runtime/stack.h index 64d9842b..2e88d72a 100644 --- a/src/runtime/stack.h +++ b/src/runtime/stack.h @@ -65,7 +65,11 @@ TSStateId ts_stack_top_state(const Stack *, StackVersion); unsigned ts_stack_push_count(const Stack *, StackVersion); -void ts_stack_decrease_push_count(const Stack *, StackVersion, unsigned); +void ts_stack_decrease_push_count(Stack *, StackVersion, unsigned); + +const TSExternalTokenState *ts_stack_external_token_state(const Stack *, StackVersion); + +void ts_stack_set_external_token_state(Stack *, StackVersion, const TSExternalTokenState *); /* * Get the position at the top of the given version of the stack. 
If the stack From 12cd2132ff428e81c5c5d18b3ac76cfd39149f74 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Wed, 4 Jan 2017 21:23:04 -0800 Subject: [PATCH 37/50] Add test for retrieving last external token state in a Tree --- spec/runtime/tree_spec.cc | 35 +++++++++++++++++++++++++++++++++++ src/runtime/tree.c | 2 ++ 2 files changed, 37 insertions(+) diff --git a/spec/runtime/tree_spec.cc b/spec/runtime/tree_spec.cc index b67cd0a8..bdc8145f 100644 --- a/spec/runtime/tree_spec.cc +++ b/spec/runtime/tree_spec.cc @@ -30,6 +30,9 @@ describe("Tree", []() { symbol4, symbol5, symbol6, + symbol7, + symbol8, + symbol9, }; TSSymbolMetadata visible = {true, true, false, true}; @@ -399,6 +402,38 @@ describe("Tree", []() { ts_tree_release(different_parent); }); }); + + describe("last_external_token_state", [&]() { + Length padding = {1, 1, {0, 1}}; + Length size = {2, 2, {0, 2}}; + + auto make_external = [](Tree *tree) { + tree->has_external_tokens = true; + tree->has_external_token_state = true; + return tree; + }; + + it("returns the last serialized external token state in the given tree", [&]() { + Tree *tree1, *tree2, *tree3, *tree4, *tree5, *tree6, *tree7, *tree8, *tree9; + + tree1 = ts_tree_make_node(symbol1, 2, tree_array({ + (tree2 = ts_tree_make_node(symbol2, 3, tree_array({ + (tree3 = make_external(ts_tree_make_leaf(symbol3, padding, size, visible))), + (tree4 = ts_tree_make_leaf(symbol4, padding, size, visible)), + (tree5 = ts_tree_make_leaf(symbol5, padding, size, visible)), + }), visible)), + (tree6 = ts_tree_make_node(symbol6, 2, tree_array({ + (tree7 = ts_tree_make_node(symbol7, 1, tree_array({ + (tree8 = ts_tree_make_leaf(symbol8, padding, size, visible)), + }), visible)), + (tree9 = ts_tree_make_leaf(symbol9, padding, size, visible)), + }), visible)), + }), visible); + + auto state = ts_tree_last_external_token_state(tree1); + AssertThat(state, Equals(&tree3->external_token_state)); + }); + }); }); END_TEST diff --git a/src/runtime/tree.c b/src/runtime/tree.c 
index ebcca441..858ad90e 100644 --- a/src/runtime/tree.c +++ b/src/runtime/tree.c @@ -386,6 +386,8 @@ const TSExternalTokenState *ts_tree_last_external_token_state(const Tree *tree) if (child->has_external_token_state) { tree = child; break; + } else if (child->has_external_tokens) { + return NULL; } } } From 3a4daace26ea22b57da5b964a16ab85c9c57619a Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 5 Jan 2017 10:06:43 -0800 Subject: [PATCH 38/50] Move reusable node functions to their own file --- spec/runtime/document_spec.cc | 2 +- spec/runtime/parser_spec.cc | 4 +-- src/runtime/parser.c | 68 ++++++++++++----------------------- src/runtime/parser.h | 6 +--- src/runtime/reusable_node.h | 50 ++++++++++++++++++++++++++ 5 files changed, 76 insertions(+), 54 deletions(-) create mode 100644 src/runtime/reusable_node.h diff --git a/spec/runtime/document_spec.cc b/spec/runtime/document_spec.cc index cc2efa60..2694acc6 100644 --- a/spec/runtime/document_spec.cc +++ b/spec/runtime/document_spec.cc @@ -113,7 +113,7 @@ describe("Document", [&]() { assert_node_string_equals( new_root, "(object (pair (string) (array (null) (number))))"); - AssertThat(spy_input->strings_read, Equals(vector({" [null, 2", ""}))); + AssertThat(spy_input->strings_read, Equals(vector({" [null, 2" }))); }); it("reads from the new input correctly when the old input was blank", [&]() { diff --git a/spec/runtime/parser_spec.cc b/spec/runtime/parser_spec.cc index c06f5322..6ac36991 100644 --- a/spec/runtime/parser_spec.cc +++ b/spec/runtime/parser_spec.cc @@ -230,7 +230,7 @@ describe("Parser", [&]() { "(identifier) " "(math_op (number) (member_access (identifier) (identifier))))))"); - AssertThat(input->strings_read, Equals(vector({ " + abc.d)", "" }))); + AssertThat(input->strings_read, Equals(vector({ " + abc.d)" }))); }); }); @@ -255,7 +255,7 @@ describe("Parser", [&]() { "(number) " "(math_op (number) (math_op (number) (identifier)))))))"); - AssertThat(input->strings_read, Equals(vector({ "123 
|| 5 +", "" }))); + AssertThat(input->strings_read, Equals(vector({ "123 || 5 +" }))); }); }); diff --git a/src/runtime/parser.c b/src/runtime/parser.c index b503bea0..2184c2d5 100644 --- a/src/runtime/parser.c +++ b/src/runtime/parser.c @@ -109,28 +109,6 @@ static bool parser__breakdown_top_of_stack(Parser *self, StackVersion version) { return did_break_down; } -static void parser__pop_reusable_node(ReusableNode *reusable_node) { - reusable_node->byte_index += ts_tree_total_bytes(reusable_node->tree); - while (reusable_node->tree) { - Tree *parent = reusable_node->tree->context.parent; - uint32_t next_index = reusable_node->tree->context.index + 1; - if (parent && parent->child_count > next_index) { - reusable_node->tree = parent->children[next_index]; - return; - } - reusable_node->tree = parent; - } -} - -static bool parser__breakdown_reusable_node(ReusableNode *reusable_node) { - if (reusable_node->tree->child_count == 0) { - return false; - } else { - reusable_node->tree = reusable_node->tree->children[0]; - return true; - } -} - static bool parser__breakdown_lookahead(Parser *self, Tree **lookahead, TSStateId state, ReusableNode *reusable_node) { @@ -140,7 +118,7 @@ static bool parser__breakdown_lookahead(Parser *self, Tree **lookahead, reusable_node->tree->fragile_left || reusable_node->tree->fragile_right)) { LOG("state_mismatch sym:%s", SYM_NAME(reusable_node->tree->symbol)); - parser__breakdown_reusable_node(reusable_node); + reusable_node_breakdown(reusable_node); result = true; } @@ -152,20 +130,13 @@ static bool parser__breakdown_lookahead(Parser *self, Tree **lookahead, return result; } -static void parser__pop_reusable_node_leaf(ReusableNode *reusable_node) { - while (reusable_node->tree->child_count > 0) - reusable_node->tree = reusable_node->tree->children[0]; - parser__pop_reusable_node(reusable_node); -} - static bool parser__can_reuse(Parser *self, TSStateId state, Tree *tree, TableEntry *table_entry) { TSLexMode current_lex_mode = 
self->language->lex_modes[state]; - if (ts_language_is_symbol_external(self->language, tree->first_leaf.symbol)) return false; - if (tree->size.bytes == 0) return false; if (tree->first_leaf.lex_mode.lex_state == current_lex_mode.lex_state && tree->first_leaf.lex_mode.external_lex_state == current_lex_mode.external_lex_state) return true; + if (tree->size.bytes == 0) return false; if (!table_entry->is_reusable) return false; if (!table_entry->depends_on_lookahead) @@ -218,13 +189,18 @@ static StackIterateAction parser__restore_external_scanner_callback( if (tree_count > 0) { Tree *tree = *array_back(trees); if (tree->has_external_token_state) { + const TSExternalTokenState *state = ts_tree_last_external_token_state(tree); self->language->external_scanner.deserialize( self->external_scanner_payload, - *ts_tree_last_external_token_state(tree) + *state ); + LOG("deserialized_external_scanner"); return StackIterateStop; } - } else if (is_done) { + } + + if (is_done) { + LOG("no_previous_external_token"); self->language->external_scanner.reset(self->external_scanner_payload); return StackIterateStop; } @@ -234,6 +210,7 @@ static StackIterateAction parser__restore_external_scanner_callback( static void parser__restore_external_scanner(Parser *self, StackVersion version) { if (!self->lexer.needs_to_restore_external_scanner) return; + LOG("restore_external_scanner"); StackPopResult pop = ts_stack_iterate(self->stack, version, parser__restore_external_scanner_callback, self); if (pop.slices.size > 0) { StackSlice slice = pop.slices.contents[0]; @@ -367,7 +344,7 @@ static Tree *parser__get_lookahead(Parser *self, StackVersion version, if (reusable_node->byte_index < position.bytes) { LOG("past_reusable sym:%s", SYM_NAME(reusable_node->tree->symbol)); - parser__pop_reusable_node(reusable_node); + reusable_node_pop(reusable_node); continue; } @@ -375,8 +352,8 @@ static Tree *parser__get_lookahead(Parser *self, StackVersion version, LOG("cant_reuse_changed tree:%s, size:%u", 
SYM_NAME(reusable_node->tree->symbol), reusable_node->tree->size.bytes); - if (!parser__breakdown_reusable_node(reusable_node)) { - parser__pop_reusable_node(reusable_node); + if (!reusable_node_breakdown(reusable_node)) { + reusable_node_pop(reusable_node); parser__breakdown_top_of_stack(self, version); } continue; @@ -386,8 +363,8 @@ static Tree *parser__get_lookahead(Parser *self, StackVersion version, LOG("cant_reuse_error tree:%s, size:%u", SYM_NAME(reusable_node->tree->symbol), reusable_node->tree->size.bytes); - if (!parser__breakdown_reusable_node(reusable_node)) { - parser__pop_reusable_node(reusable_node); + if (!reusable_node_breakdown(reusable_node)) { + reusable_node_pop(reusable_node); parser__breakdown_top_of_stack(self, version); } continue; @@ -810,7 +787,7 @@ static void parser__start(Parser *self, TSInput input, Tree *previous_tree) { ts_lexer_set_input(&self->lexer, input); ts_stack_clear(self->stack); - self->reusable_node = (ReusableNode){ previous_tree, 0 }; + self->reusable_node = reusable_node_new(previous_tree); self->cached_token = NULL; self->finished_tree = NULL; } @@ -1040,7 +1017,7 @@ static void parser__advance(Parser *self, StackVersion version, if (!validated_lookahead) { if (!parser__can_reuse(self, state, lookahead, &table_entry)) { if (lookahead == reusable_node->tree) { - parser__pop_reusable_node_leaf(reusable_node); + reusable_node_pop_leaf(reusable_node); } else { parser__clear_cached_token(self); } @@ -1076,7 +1053,7 @@ static void parser__advance(Parser *self, StackVersion version, if (lookahead->child_count > 0) { if (parser__breakdown_lookahead(self, &lookahead, state, reusable_node)) { if (!parser__can_reuse(self, state, lookahead, &table_entry)) { - parser__pop_reusable_node(reusable_node); + reusable_node_pop(reusable_node); ts_tree_release(lookahead); lookahead = parser__get_lookahead(self, version, reusable_node, &validated_lookahead); } @@ -1088,7 +1065,7 @@ static void parser__advance(Parser *self, StackVersion 
version, parser__shift(self, version, next_state, lookahead, extra); if (lookahead == reusable_node->tree) - parser__pop_reusable_node(reusable_node); + reusable_node_pop(reusable_node); ts_tree_release(lookahead); return; @@ -1130,7 +1107,7 @@ static void parser__advance(Parser *self, StackVersion version, case TSParseActionTypeRecover: { while (lookahead->child_count > 0) { - parser__breakdown_reusable_node(reusable_node); + reusable_node_breakdown(reusable_node); ts_tree_release(lookahead); lookahead = reusable_node->tree; ts_tree_retain(lookahead); @@ -1138,7 +1115,7 @@ static void parser__advance(Parser *self, StackVersion version, parser__recover(self, version, action.params.to_state, lookahead); if (lookahead == reusable_node->tree) - parser__pop_reusable_node(reusable_node); + reusable_node_pop(reusable_node); ts_tree_release(lookahead); return; } @@ -1218,8 +1195,7 @@ Tree *parser_parse(Parser *self, TSInput input, Tree *old_tree) { while (!ts_stack_is_halted(self->stack, version)) { position = ts_stack_top_position(self->stack, version).chars; - if (position > last_position || - (version > 0 && position == last_position)) + if (position > last_position || (version > 0 && position == last_position)) break; LOG("process version:%d, version_count:%u, state:%d, row:%u, col:%u", diff --git a/src/runtime/parser.h b/src/runtime/parser.h index 2d9381f8..a7b8dde3 100644 --- a/src/runtime/parser.h +++ b/src/runtime/parser.h @@ -8,13 +8,9 @@ extern "C" { #include "runtime/stack.h" #include "runtime/array.h" #include "runtime/lexer.h" +#include "runtime/reusable_node.h" #include "runtime/reduce_action.h" -typedef struct { - Tree *tree; - uint32_t byte_index; -} ReusableNode; - typedef struct { Lexer lexer; Stack *stack; diff --git a/src/runtime/reusable_node.h b/src/runtime/reusable_node.h new file mode 100644 index 00000000..b9777638 --- /dev/null +++ b/src/runtime/reusable_node.h @@ -0,0 +1,50 @@ +#include "runtime/tree.h" + +typedef struct { + Tree *tree; + 
uint32_t byte_index; + bool has_preceding_external_token; + const TSExternalTokenState *preceding_external_token_state; +} ReusableNode; + +static inline ReusableNode reusable_node_new(Tree *tree) { + return (ReusableNode){ + .tree = tree, + .byte_index = 0, + .has_preceding_external_token = false, + .preceding_external_token_state = NULL, + }; +} + +static inline void reusable_node_pop(ReusableNode *self) { + self->byte_index += ts_tree_total_bytes(self->tree); + if (self->tree->has_external_tokens) { + self->has_preceding_external_token = true; + self->preceding_external_token_state = ts_tree_last_external_token_state(self->tree); + } + + while (self->tree) { + Tree *parent = self->tree->context.parent; + uint32_t next_index = self->tree->context.index + 1; + if (parent && parent->child_count > next_index) { + self->tree = parent->children[next_index]; + return; + } + self->tree = parent; + } +} + +static inline void reusable_node_pop_leaf(ReusableNode *self) { + while (self->tree->child_count > 0) + self->tree = self->tree->children[0]; + reusable_node_pop(self); +} + +static inline bool reusable_node_breakdown(ReusableNode *self) { + if (self->tree->child_count == 0) { + return false; + } else { + self->tree = self->tree->children[0]; + return true; + } +} From 36608180d243163a15705cdfb0b837c6784ed1ac Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Sat, 7 Jan 2017 21:45:28 -0800 Subject: [PATCH 39/50] Store external token states in the parse stack --- spec/runtime/parser_spec.cc | 11 ++++ src/runtime/lexer.c | 16 ++---- src/runtime/lexer.h | 3 +- src/runtime/parser.c | 104 +++++++++++++++++++----------------- src/runtime/stack.c | 13 +++-- 5 files changed, 80 insertions(+), 67 deletions(-) diff --git a/spec/runtime/parser_spec.cc b/spec/runtime/parser_spec.cc index 6ac36991..88633f1f 100644 --- a/spec/runtime/parser_spec.cc +++ b/spec/runtime/parser_spec.cc @@ -60,6 +60,11 @@ describe("Parser", [&]() { replace_text(position, length, ""); }; + auto undo = 
[&]() { + ts_document_edit(document, input->undo()); + ts_document_parse(document); + }; + auto assert_root_node = [&](const string &expected) { TSNode node = ts_document_root_node(document); char *node_string = ts_node_string(node, document); @@ -386,6 +391,12 @@ describe("Parser", [&]() { "(if_statement (identifier) " "(print_statement (identifier)) " "(return_statement (expression_list (identifier)))))"); + + undo(); + assert_root_node("(module " + "(if_statement (identifier) " + "(print_statement (identifier))) " + "(return_statement (expression_list (identifier))))"); }); }); diff --git a/src/runtime/lexer.c b/src/runtime/lexer.c index f7ebf042..902c2d3b 100644 --- a/src/runtime/lexer.c +++ b/src/runtime/lexer.c @@ -88,8 +88,7 @@ void ts_lexer_init(Lexer *self) { .payload = NULL, .log = NULL }, - .needs_to_restore_external_scanner = false, - .last_external_token_end_byte = 0, + .last_external_token_state = NULL, }; ts_lexer_reset(self, length_zero()); } @@ -112,20 +111,11 @@ static inline void ts_lexer__reset(Lexer *self, Length position) { void ts_lexer_set_input(Lexer *self, TSInput input) { self->input = input; ts_lexer__reset(self, length_zero()); - self->needs_to_restore_external_scanner = false; - self->last_external_token_end_byte = 0; + self->last_external_token_state = NULL; } void ts_lexer_reset(Lexer *self, Length position) { - if (position.bytes > self->current_position.bytes) { - self->needs_to_restore_external_scanner = true; - self->last_external_token_end_byte = 0; - ts_lexer__reset(self, position); - } else if (position.bytes < self->current_position.bytes) { - if (position.bytes < self->last_external_token_end_byte) { - self->needs_to_restore_external_scanner = true; - self->last_external_token_end_byte = 0; - } + if (position.bytes != self->current_position.bytes) { ts_lexer__reset(self, position); } } diff --git a/src/runtime/lexer.h b/src/runtime/lexer.h index 76d863c4..67470f6f 100644 --- a/src/runtime/lexer.h +++ b/src/runtime/lexer.h @@ 
-25,8 +25,7 @@ typedef struct { TSInput input; TSLogger logger; char debug_buffer[TS_DEBUG_BUFFER_SIZE]; - bool needs_to_restore_external_scanner; - uint32_t last_external_token_end_byte; + const TSExternalTokenState *last_external_token_state; } Lexer; void ts_lexer_init(Lexer *); diff --git a/src/runtime/parser.c b/src/runtime/parser.c index 2184c2d5..a29810c4 100644 --- a/src/runtime/parser.c +++ b/src/runtime/parser.c @@ -130,13 +130,20 @@ static bool parser__breakdown_lookahead(Parser *self, Tree **lookahead, return result; } +static inline bool ts_lex_mode_eq(TSLexMode self, TSLexMode other) { + return self.lex_state == other.lex_state && + self.external_lex_state == other.external_lex_state; +} + static bool parser__can_reuse(Parser *self, TSStateId state, Tree *tree, TableEntry *table_entry) { TSLexMode current_lex_mode = self->language->lex_modes[state]; - if (tree->first_leaf.lex_mode.lex_state == current_lex_mode.lex_state && - tree->first_leaf.lex_mode.external_lex_state == current_lex_mode.external_lex_state) + if (ts_lex_mode_eq(tree->first_leaf.lex_mode, current_lex_mode)) return true; - if (tree->size.bytes == 0) return false; + if (current_lex_mode.external_lex_state != 0) + return false; + if (tree->size.bytes == 0) + return false; if (!table_entry->is_reusable) return false; if (!table_entry->depends_on_lookahead) @@ -182,53 +189,26 @@ static bool parser__condense_stack(Parser *self) { return result; } -static StackIterateAction parser__restore_external_scanner_callback( - void *payload, TSStateId state, TreeArray *trees, uint32_t tree_count, - bool is_done, bool is_pending) { - Parser *self = payload; - if (tree_count > 0) { - Tree *tree = *array_back(trees); - if (tree->has_external_token_state) { - const TSExternalTokenState *state = ts_tree_last_external_token_state(tree); +static void parser__restore_external_scanner(Parser *self, StackVersion version) { + const TSExternalTokenState *state = ts_stack_external_token_state(self->stack, 
version); + if (self->lexer.last_external_token_state != state) { + LOG("restore_external_scanner"); + if (state) { self->language->external_scanner.deserialize( self->external_scanner_payload, *state ); - LOG("deserialized_external_scanner"); - return StackIterateStop; + } else { + self->language->external_scanner.reset(self->external_scanner_payload); } } - - if (is_done) { - LOG("no_previous_external_token"); - self->language->external_scanner.reset(self->external_scanner_payload); - return StackIterateStop; - } - - return StackIterateNone; -} - -static void parser__restore_external_scanner(Parser *self, StackVersion version) { - if (!self->lexer.needs_to_restore_external_scanner) return; - LOG("restore_external_scanner"); - StackPopResult pop = ts_stack_iterate(self->stack, version, parser__restore_external_scanner_callback, self); - if (pop.slices.size > 0) { - StackSlice slice = pop.slices.contents[0]; - for (size_t i = 1; i < slice.trees.size; i++) { - Tree *tree = slice.trees.contents[i]; - if (tree->has_external_tokens) { - printf("RE-SCANNING TREE: %s\n", ts_tree_string(tree, self->language, true)); - } - } - ts_tree_array_delete(&slice.trees); - } } static Tree *parser__lex(Parser *self, StackVersion version) { TSStateId parse_state = ts_stack_top_state(self->stack, version); Length start_position = ts_stack_top_position(self->stack, version); TSLexMode lex_mode = self->language->lex_modes[parse_state]; - const bool *external_tokens = ts_language_enabled_external_tokens( + const bool *valid_external_tokens = ts_language_enabled_external_tokens( self->language, lex_mode.external_lex_state ); @@ -243,15 +223,13 @@ static Tree *parser__lex(Parser *self, StackVersion version) { for (;;) { Length current_position = self->lexer.current_position; - if (external_tokens) { + if (valid_external_tokens) { LOG("lex_external state:%d, row:%u, column:%u", lex_mode.external_lex_state, current_position.extent.row, current_position.extent.column); 
parser__restore_external_scanner(self, version); ts_lexer_start(&self->lexer); if (self->language->external_scanner.scan(self->external_scanner_payload, - &self->lexer.data, external_tokens)) { - self->lexer.last_external_token_end_byte = self->lexer.current_position.bytes; - self->lexer.needs_to_restore_external_scanner = false; + &self->lexer.data, valid_external_tokens)) { found_external_token = true; break; } @@ -269,7 +247,7 @@ static Tree *parser__lex(Parser *self, StackVersion version) { LOG("retry_in_error_mode"); found_error = true; lex_mode = self->language->lex_modes[ERROR_STATE]; - external_tokens = ts_language_enabled_external_tokens( + valid_external_tokens = ts_language_enabled_external_tokens( self->language, lex_mode.external_lex_state ); @@ -303,7 +281,9 @@ static Tree *parser__lex(Parser *self, StackVersion version) { result = ts_tree_make_error(size, padding, first_error_character); } else { TSSymbol symbol = self->lexer.data.result_symbol; - if (found_external_token) symbol = self->language->external_scanner.symbol_map[symbol]; + if (found_external_token) { + symbol = self->language->external_scanner.symbol_map[symbol]; + } Length padding = length_sub(self->lexer.token_start_position, start_position); Length size = length_sub(self->lexer.current_position, self->lexer.token_start_position); @@ -312,10 +292,9 @@ static Tree *parser__lex(Parser *self, StackVersion version) { if (found_external_token) { result->has_external_tokens = true; - if (self->language->external_scanner.serialize(self->external_scanner_payload, result->external_token_state)) { - result->has_external_token_state = true; - self->last_external_token = result; - } + result->has_external_token_state = true; + self->language->external_scanner.serialize(self->external_scanner_payload, result->external_token_state); + self->lexer.last_external_token_state = &result->external_token_state; } } @@ -331,6 +310,17 @@ static void parser__clear_cached_token(Parser *self) { 
self->cached_token = NULL; } +static inline bool ts_external_token_state_eq(const TSExternalTokenState *self, + const TSExternalTokenState *other) { + if (self == other) { + return true; + } else if (!self || !other) { + return false; + } else { + return memcmp(self, other, sizeof(TSExternalTokenState)) == 0; + } +} + static Tree *parser__get_lookahead(Parser *self, StackVersion version, ReusableNode *reusable_node, bool *is_fresh) { @@ -370,6 +360,20 @@ static Tree *parser__get_lookahead(Parser *self, StackVersion version, continue; } + if (reusable_node->tree->first_leaf.lex_mode.external_lex_state != 0 && + !ts_external_token_state_eq( + reusable_node->preceding_external_token_state, + ts_stack_external_token_state(self->stack, version))) { + LOG("cant_reuse_external_tokens tree:%s, size:%u", + SYM_NAME(reusable_node->tree->symbol), + reusable_node->tree->size.bytes); + if (!reusable_node_breakdown(reusable_node)) { + reusable_node_pop(reusable_node); + parser__breakdown_top_of_stack(self, version); + } + continue; + } + Tree *result = reusable_node->tree; ts_tree_retain(result); return result; @@ -459,6 +463,10 @@ static void parser__shift(Parser *self, StackVersion version, TSStateId state, bool is_pending = lookahead->child_count > 0; ts_stack_push(self->stack, version, lookahead, is_pending, state); + if (lookahead->has_external_token_state) { + ts_stack_set_external_token_state( + self->stack, version, ts_tree_last_external_token_state(lookahead)); + } ts_tree_release(lookahead); } diff --git a/src/runtime/stack.c b/src/runtime/stack.c index 198cce4d..934f70bb 100644 --- a/src/runtime/stack.c +++ b/src/runtime/stack.c @@ -169,11 +169,13 @@ static void stack_node_add_link(StackNode *self, StackLink link) { } static StackVersion ts_stack__add_version(Stack *self, StackNode *node, - unsigned push_count) { + unsigned push_count, + const TSExternalTokenState *external_token_state) { StackHead head = { .node = node, .is_halted = false, .push_count = push_count, + 
.external_token_state = external_token_state, }; array_push(&self->heads, head); stack_node_retain(node); @@ -181,7 +183,8 @@ static StackVersion ts_stack__add_version(Stack *self, StackNode *node, } static void ts_stack__add_slice(Stack *self, StackNode *node, TreeArray *trees, - unsigned push_count) { + unsigned push_count, + const TSExternalTokenState *external_token_state) { for (uint32_t i = self->slices.size - 1; i + 1 > 0; i--) { StackVersion version = self->slices.contents[i].version; if (self->heads.contents[version].node == node) { @@ -191,7 +194,7 @@ static void ts_stack__add_slice(Stack *self, StackNode *node, TreeArray *trees, } } - StackVersion version = ts_stack__add_version(self, node, push_count); + StackVersion version = ts_stack__add_version(self, node, push_count, external_token_state); StackSlice slice = { *trees, version }; array_push(&self->slices, slice); } @@ -203,6 +206,7 @@ INLINE StackPopResult stack__iter(Stack *self, StackVersion version, StackHead *head = array_get(&self->heads, version); unsigned push_count = head->push_count; + const TSExternalTokenState *external_token_state = head->external_token_state; Iterator iterator = { .node = head->node, .trees = array_new(), @@ -230,7 +234,8 @@ INLINE StackPopResult stack__iter(Stack *self, StackVersion version, if (!should_stop) ts_tree_array_copy(trees, &trees); array_reverse(&trees); - ts_stack__add_slice(self, node, &trees, push_count + iterator->push_count); + ts_stack__add_slice(self, node, &trees, push_count + iterator->push_count, + external_token_state); } if (should_stop) { From 0a286d41f330ee47c0d8a5facbb0aff14956a798 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Sun, 8 Jan 2017 22:06:36 -0800 Subject: [PATCH 40/50] Add python error recovery tests --- spec/fixtures/error_corpus/python_errors.txt | 29 ++++++++++++++++++++ spec/integration/corpus_specs.cc | 3 ++ 2 files changed, 32 insertions(+) create mode 100644 spec/fixtures/error_corpus/python_errors.txt diff --git 
a/spec/fixtures/error_corpus/python_errors.txt b/spec/fixtures/error_corpus/python_errors.txt new file mode 100644 index 00000000..7ff9f240 --- /dev/null +++ b/spec/fixtures/error_corpus/python_errors.txt @@ -0,0 +1,29 @@ +========================================== +errors in if statements +========================================== + +if a is: + print b + print c + +--- + +(module + (if_statement (identifier) (ERROR) + (print_statement (identifier)) + (print_statement (identifier)))) + +========================================== +errors in function definitions +========================================== + +def a():: + b + c + +--- + +(module + (function_definition (identifier) (parameters) (ERROR) + (expression_statement (identifier)) + (expression_statement (identifier)))) diff --git a/spec/integration/corpus_specs.cc b/spec/integration/corpus_specs.cc index 9d716ed1..b70ec5fc 100644 --- a/spec/integration/corpus_specs.cc +++ b/spec/integration/corpus_specs.cc @@ -84,6 +84,7 @@ describe("The Corpus", []() { "json", "c", "cpp", + "python", }); for (auto &language_name : test_languages) { @@ -130,6 +131,8 @@ describe("The Corpus", []() { size_t deletion_size = random() % (utf8_char_count(entry.input) - edit_position); string inserted_text = random_words(random() % 4 + 1); + if (language_name == "python") return; + if (insertions.insert({edit_position, inserted_text}).second) { string description = "\"" + inserted_text + "\" at " + to_string(edit_position); From 896254eea52cffef340c630d7c640ec3563a5ce8 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Tue, 24 Jan 2017 12:48:47 -0800 Subject: [PATCH 41/50] Fix error in changed ranges calculation There was an error in the way that we calculate the reference scope sequences that are used as the basis for assertions about changed ranges in randomized tests. The error caused some characters' scopes to not be checked. 
This corrects the reference implementation and fixes a previously uncaught bug in the implementation of `tree_path_get_changed_ranges`. Previously, when iterating over the old and new trees, we would only perform comparisons of visible nodes. This resulted in a failure to do any comparison for portions of the text in which there were trailing invisible child nodes (e.g. trailing `_line_break` nodes inside `statement` nodes in the JavaScript grammar). Now, we additionally perform comparisons at invisible leaf nodes, based on their lowest visible ancestor. --- spec/helpers/point_helpers.cc | 4 +- spec/helpers/scope_sequence.cc | 25 +++---- src/runtime/length.h | 9 +-- src/runtime/tree_path.h | 122 +++++++++++++++++++-------------- 4 files changed, 86 insertions(+), 74 deletions(-) diff --git a/spec/helpers/point_helpers.cc b/spec/helpers/point_helpers.cc index e9c99259..60f4f9a7 100644 --- a/spec/helpers/point_helpers.cc +++ b/spec/helpers/point_helpers.cc @@ -15,7 +15,9 @@ bool operator==(const TSRange &left, const TSRange &right) { } bool operator==(const Length &left, const Length &right) { - return length_eq(left, right); + return left.bytes == right.bytes && + left.chars == right.chars && + left.extent == right.extent; } bool operator<(const TSPoint &left, const TSPoint &right) { diff --git a/spec/helpers/scope_sequence.cc b/spec/helpers/scope_sequence.cc index 87e059dc..d6e2e3b1 100644 --- a/spec/helpers/scope_sequence.cc +++ b/spec/helpers/scope_sequence.cc @@ -23,20 +23,21 @@ static void append_to_scope_sequence(ScopeSequence *sequence, ScopeStack *current_scopes, TSNode node, TSDocument *document, const std::string &text) { - append_text_to_scope_sequence(sequence, current_scopes, text, ts_node_start_byte(node) - sequence->size()); + append_text_to_scope_sequence( + sequence, current_scopes, text, ts_node_start_byte(node) - sequence->size() + ); - string scope = ts_node_type(node, document); - current_scopes->push_back(scope); - size_t child_count = 
ts_node_child_count(node); - if (child_count > 0) { - for (size_t i = 0; i < child_count; i++) { - TSNode child = ts_node_child(node, i); - append_to_scope_sequence(sequence, current_scopes, child, document, text); - } - } else { - size_t length = ts_node_end_byte(node) - ts_node_start_byte(node); - append_text_to_scope_sequence(sequence, current_scopes, text, length); + current_scopes->push_back(ts_node_type(node, document)); + + for (size_t i = 0, n = ts_node_child_count(node); i < n; i++) { + TSNode child = ts_node_child(node, i); + append_to_scope_sequence(sequence, current_scopes, child, document, text); } + + append_text_to_scope_sequence( + sequence, current_scopes, text, ts_node_end_byte(node) - sequence->size() + ); + current_scopes->pop_back(); } diff --git a/src/runtime/length.h b/src/runtime/length.h index 2477bbe1..352215d2 100644 --- a/src/runtime/length.h +++ b/src/runtime/length.h @@ -21,12 +21,11 @@ static inline void length_set_unknown_chars(Length *self) { } static inline Length length_min(Length len1, Length len2) { - return (len1.chars < len2.chars) ? len1 : len2; + return (len1.bytes < len2.bytes) ? 
len1 : len2; } static inline Length length_add(Length len1, Length len2) { Length result; - result.chars = len1.chars + len2.chars; result.bytes = len1.bytes + len2.bytes; result.extent = point_add(len1.extent, len2.extent); @@ -57,10 +56,4 @@ static inline Length length_zero() { return (Length){ 0, 0, {0, 0} }; } -static inline bool length_eq(Length self, Length other) { - return self.bytes == other.bytes && self.chars == other.chars && - self.extent.row == other.extent.row && - self.extent.column == other.extent.column; -} - #endif diff --git a/src/runtime/tree_path.h b/src/runtime/tree_path.h index 6fd4ef97..f64dd02f 100644 --- a/src/runtime/tree_path.h +++ b/src/runtime/tree_path.h @@ -21,61 +21,66 @@ static void range_array_add(RangeArray *results, TSPoint start, TSPoint end) { } } -static bool tree_path_descend(TreePath *path, TSPoint position) { +static bool tree_path_descend(TreePath *path, Length position) { uint32_t original_size = path->size; + bool did_descend; do { did_descend = false; TreePathEntry entry = *array_back(path); - Length child_position = entry.position; + Length child_left = entry.position; for (uint32_t i = 0; i < entry.tree->child_count; i++) { Tree *child = entry.tree->children[i]; - Length child_right_position = - length_add(child_position, ts_tree_total_size(child)); - if (point_lt(position, child_right_position.extent)) { - TreePathEntry child_entry = { child, child_position, i }; - if (child->visible) { + Length child_right = length_add(child_left, ts_tree_total_size(child)); + if (position.bytes < child_right.bytes) { + TreePathEntry child_entry = { child, child_left, i }; + if (child->visible || child->child_count == 0) { array_push(path, child_entry); return true; - } else if (child->child_count > 0 && child->visible_child_count > 0) { + } else if (child->visible_child_count > 0) { array_push(path, child_entry); did_descend = true; break; } } - child_position = child_right_position; + child_left = child_right; } } while 
(did_descend); + path->size = original_size; return false; } static uint32_t tree_path_advance(TreePath *path) { uint32_t ascend_count = 0; + while (path->size > 0) { TreePathEntry entry = array_pop(path); - if (path->size == 0) - break; + if (path->size == 0) break; TreePathEntry parent_entry = *array_back(path); if (parent_entry.tree->visible) ascend_count++; - Length position = - length_add(entry.position, ts_tree_total_size(entry.tree)); + + Length position = length_add(entry.position, ts_tree_total_size(entry.tree)); for (uint32_t i = entry.child_index + 1; i < parent_entry.tree->child_count; i++) { Tree *next_child = parent_entry.tree->children[i]; - if (next_child->visible || next_child->visible_child_count > 0) { + if (next_child->visible || + next_child->child_count == 0 || + next_child->visible_child_count > 0) { if (parent_entry.tree->visible) ascend_count--; array_push(path, ((TreePathEntry){ .tree = next_child, .child_index = i, .position = position, })); - if (!next_child->visible) - tree_path_descend(path, (TSPoint){ 0, 0 }); + if (!next_child->visible) { + tree_path_descend(path, length_zero()); + } return ascend_count; } position = length_add(position, ts_tree_total_size(next_child)); } } + return ascend_count; } @@ -94,8 +99,27 @@ static void tree_path_init(TreePath *path, Tree *tree) { .position = { 0, 0, { 0, 0 } }, .child_index = 0, })); - if (!tree->visible) - tree_path_descend(path, (TSPoint){ 0, 0 }); + if (!tree->visible) { + tree_path_descend(path, length_zero()); + } +} + +Tree *tree_path_visible_tree(TreePath *self) { + for (uint32_t i = self->size - 1; i + 1 > 0; i--) { + Tree *tree = self->contents[i].tree; + if (tree->visible) return tree; + } + return NULL; +} + +Length tree_path_start_position(TreePath *self) { + TreePathEntry entry = *array_back(self); + return length_add(entry.position, entry.tree->padding); +} + +Length tree_path_end_position(TreePath *self) { + TreePathEntry entry = *array_back(self); + return 
length_add(length_add(entry.position, entry.tree->padding), entry.tree->size); } static bool tree_must_eq(Tree *old_tree, Tree *new_tree) { @@ -112,67 +136,59 @@ static bool tree_must_eq(Tree *old_tree, Tree *new_tree) { static void tree_path_get_changes(TreePath *old_path, TreePath *new_path, TSRange **ranges, uint32_t *range_count) { - TSPoint position = { 0, 0 }; + Length position = length_zero(); RangeArray results = array_new(); while (old_path->size && new_path->size) { bool is_changed = false; - TSPoint next_position = position; + Length next_position = position; - TreePathEntry old_entry = *array_back(old_path); - TreePathEntry new_entry = *array_back(new_path); - Tree *old_tree = old_entry.tree; - Tree *new_tree = new_entry.tree; - uint32_t old_start_byte = old_entry.position.bytes + old_tree->padding.bytes; - uint32_t new_start_byte = new_entry.position.bytes + new_tree->padding.bytes; - TSPoint old_start_point = - point_add(old_entry.position.extent, old_tree->padding.extent); - TSPoint new_start_point = - point_add(new_entry.position.extent, new_tree->padding.extent); - TSPoint old_end_point = point_add(old_start_point, old_tree->size.extent); - TSPoint new_end_point = point_add(new_start_point, new_tree->size.extent); + Tree *old_tree = tree_path_visible_tree(old_path); + Tree *new_tree = tree_path_visible_tree(new_path); + Length old_start = tree_path_start_position(old_path); + Length new_start = tree_path_start_position(new_path); + Length old_end = tree_path_end_position(old_path); + Length new_end = tree_path_end_position(new_path); // #define NAME(t) (ts_language_symbol_name(language, ((Tree *)(t))->symbol)) - // printf("At [%-2lu, %-2lu] Compare (%-20s\t [%-2lu, %-2lu] - [%lu, %lu])\tvs\t(%-20s\t [%lu, %lu] - [%lu, %lu])\n", - // position.row, position.column, NAME(old_tree), old_start_point.row, - // old_start_point.column, old_end_point.row, old_end_point.column, - // NAME(new_tree), new_start_point.row, new_start_point.column, - // 
new_end_point.row, new_end_point.column); + // printf("At [%-2u, %-2u] Compare (%-20s\t [%-2u, %-2u] - [%u, %u])\tvs\t(%-20s\t [%u, %u] - [%u, %u])\n", + // position.extent.row, position.extent.column, + // NAME(old_tree), old_start.extent.row, old_start.extent.column, old_end.extent.row, old_end.extent.column, + // NAME(new_tree), new_start.extent.row, new_start.extent.column, new_end.extent.row, new_end.extent.column); - if (point_lt(position, old_start_point)) { - if (point_lt(position, new_start_point)) { - next_position = point_min(old_start_point, new_start_point); + if (position.bytes < old_start.bytes) { + if (position.bytes < new_start.bytes) { + next_position = length_min(old_start, new_start); } else { is_changed = true; - next_position = old_start_point; + next_position = old_start; } - } else if (point_lt(position, new_start_point)) { + } else if (position.bytes < new_start.bytes) { is_changed = true; - next_position = new_start_point; - } else if (old_start_byte == new_start_byte && - tree_must_eq(old_tree, new_tree)) { - next_position = old_end_point; + next_position = new_start; + } else if (old_start.bytes == new_start.bytes && tree_must_eq(old_tree, new_tree)) { + next_position = old_end; } else if (old_tree->symbol == new_tree->symbol) { if (tree_path_descend(old_path, position)) { if (!tree_path_descend(new_path, position)) { tree_path_ascend(old_path, 1); is_changed = true; - next_position = new_end_point; + next_position = new_end; } } else if (tree_path_descend(new_path, position)) { tree_path_ascend(new_path, 1); is_changed = true; - next_position = old_end_point; + next_position = old_end; } else { - next_position = point_min(old_end_point, new_end_point); + next_position = length_min(old_end, new_end); } } else { is_changed = true; - next_position = point_min(old_end_point, new_end_point); + next_position = length_min(old_end, new_end); } - bool at_old_end = point_lte(old_end_point, next_position); - bool at_new_end = 
point_lte(new_end_point, next_position); + bool at_old_end = old_end.bytes <= next_position.bytes; + bool at_new_end = new_end.bytes <= next_position.bytes; if (at_new_end && at_old_end) { uint32_t old_ascend_count = tree_path_advance(old_path); @@ -190,7 +206,7 @@ static void tree_path_get_changes(TreePath *old_path, TreePath *new_path, tree_path_ascend(new_path, ascend_count); } - if (is_changed) range_array_add(&results, position, next_position); + if (is_changed) range_array_add(&results, position.extent, next_position.extent); position = next_position; } From 5ee7cbda2c66d7b8a10616101a1759529ac2fcac Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Mon, 30 Jan 2017 21:58:02 -0800 Subject: [PATCH 42/50] Enable randomized incremental parsing tests for Python --- spec/integration/corpus_specs.cc | 2 -- 1 file changed, 2 deletions(-) diff --git a/spec/integration/corpus_specs.cc b/spec/integration/corpus_specs.cc index b70ec5fc..c399e8f9 100644 --- a/spec/integration/corpus_specs.cc +++ b/spec/integration/corpus_specs.cc @@ -131,8 +131,6 @@ describe("The Corpus", []() { size_t deletion_size = random() % (utf8_char_count(entry.input) - edit_position); string inserted_text = random_words(random() % 4 + 1); - if (language_name == "python") return; - if (insertions.insert({edit_position, inserted_text}).second) { string description = "\"" + inserted_text + "\" at " + to_string(edit_position); From dc6598e07e9bcb839933559bb8eb36cd84f6844d Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Mon, 30 Jan 2017 21:58:27 -0800 Subject: [PATCH 43/50] Include external token states in stack debug graphs --- src/runtime/stack.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/src/runtime/stack.c b/src/runtime/stack.c index 934f70bb..fc875396 100644 --- a/src/runtime/stack.c +++ b/src/runtime/stack.c @@ -553,8 +553,20 @@ bool ts_stack_print_dot_graph(Stack *self, const char **symbol_names, FILE *f) { fprintf( f, "node_head_%u -> node_%p [label=%u, 
fontcolor=blue, weight=10000, " - "labeltooltip=\"push_count: %u\"]\n", + "labeltooltip=\"push_count: %u", i, head->node, i, head->push_count); + + if (head->external_token_state) { + const TSExternalTokenState *s = head->external_token_state; + fprintf(f, + "\nexternal_token_state: " + "%2X %2X %2X %2X %2X %2X %2X %2X %2X %2X %2X %2X %2X %2X %2X %2X", + (*s)[0], (*s)[1], (*s)[2], (*s)[3], (*s)[4], (*s)[5], (*s)[6], (*s)[7], + (*s)[8], (*s)[9], (*s)[10], (*s)[11], (*s)[12], (*s)[13], (*s)[14], (*s)[15] + ); + } + + fprintf(f, "\"]\n"); array_push(&self->iterators, ((Iterator){.node = head->node })); } From 672d491775421eea143f89039fe266dedaa5dbb6 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Mon, 30 Jan 2017 22:04:46 -0800 Subject: [PATCH 44/50] Fix errors in management of external scanner's most recent state --- src/runtime/parser.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/runtime/parser.c b/src/runtime/parser.c index a29810c4..191354a3 100644 --- a/src/runtime/parser.c +++ b/src/runtime/parser.c @@ -193,6 +193,7 @@ static void parser__restore_external_scanner(Parser *self, StackVersion version) const TSExternalTokenState *state = ts_stack_external_token_state(self->stack, version); if (self->lexer.last_external_token_state != state) { LOG("restore_external_scanner"); + self->lexer.last_external_token_state = state; if (state) { self->language->external_scanner.deserialize( self->external_scanner_payload, @@ -293,6 +294,7 @@ static Tree *parser__lex(Parser *self, StackVersion version) { if (found_external_token) { result->has_external_tokens = true; result->has_external_token_state = true; + memset(result->external_token_state, 0, sizeof(TSExternalTokenState)); self->language->external_scanner.serialize(self->external_scanner_payload, result->external_token_state); self->lexer.last_external_token_state = &result->external_token_state; } @@ -360,8 +362,7 @@ static Tree *parser__get_lookahead(Parser *self, StackVersion version, 
continue; } - if (reusable_node->tree->first_leaf.lex_mode.external_lex_state != 0 && - !ts_external_token_state_eq( + if (!ts_external_token_state_eq( reusable_node->preceding_external_token_state, ts_stack_external_token_state(self->stack, version))) { LOG("cant_reuse_external_tokens tree:%s, size:%u", From d853b6504d342f2c55fe1737490beb236a13334a Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Tue, 31 Jan 2017 10:21:47 -0800 Subject: [PATCH 45/50] Add version number to TSLanguage structs --- include/tree_sitter/parser.h | 2 ++ include/tree_sitter/runtime.h | 3 +++ spec/runtime/document_spec.cc | 11 +++++++++++ src/compiler/generate_code/c_code.cc | 4 +++- src/runtime/document.c | 1 + src/runtime/language.c | 4 ++++ 6 files changed, 24 insertions(+), 1 deletion(-) diff --git a/include/tree_sitter/parser.h b/include/tree_sitter/parser.h index 8e5658f4..197015f4 100644 --- a/include/tree_sitter/parser.h +++ b/include/tree_sitter/parser.h @@ -65,6 +65,7 @@ typedef union { } TSParseActionEntry; typedef struct TSLanguage { + uint32_t version; uint32_t symbol_count; uint32_t token_count; uint32_t external_token_count; @@ -166,6 +167,7 @@ typedef struct TSLanguage { #define GET_LANGUAGE(...) 
\ static TSLanguage language = { \ + .version = LANGUAGE_VERSION, \ .symbol_count = SYMBOL_COUNT, \ .token_count = TOKEN_COUNT, \ .symbol_metadata = ts_symbol_metadata, \ diff --git a/include/tree_sitter/runtime.h b/include/tree_sitter/runtime.h index 68e804f1..00d8e7c4 100644 --- a/include/tree_sitter/runtime.h +++ b/include/tree_sitter/runtime.h @@ -9,6 +9,8 @@ extern "C" { #include #include +#define TREE_SITTER_LANGUAGE_VERSION 1 + typedef unsigned short TSSymbol; typedef struct TSLanguage TSLanguage; typedef struct TSDocument TSDocument; @@ -114,6 +116,7 @@ uint32_t ts_document_parse_count(const TSDocument *); uint32_t ts_language_symbol_count(const TSLanguage *); const char *ts_language_symbol_name(const TSLanguage *, TSSymbol); +uint32_t ts_language_version(const TSLanguage *); #ifdef __cplusplus } diff --git a/spec/runtime/document_spec.cc b/spec/runtime/document_spec.cc index 2694acc6..52e65ffb 100644 --- a/spec/runtime/document_spec.cc +++ b/spec/runtime/document_spec.cc @@ -164,6 +164,17 @@ describe("Document", [&]() { "(program (expression_statement " "(object (pair (string) (array (number) (number))))))"); }); + + it("does not allow setting a language with a different version number", [&]() { + TSLanguage language = *get_test_language("json"); + AssertThat(ts_language_version(&language), Equals(TREE_SITTER_LANGUAGE_VERSION)); + + language.version++; + AssertThat(ts_language_version(&language), !Equals(TREE_SITTER_LANGUAGE_VERSION)); + + ts_document_set_language(document, &language); + AssertThat(ts_document_language(document), IsNull()); + }); }); describe("set_logger(TSLogger)", [&]() { diff --git a/src/compiler/generate_code/c_code.cc b/src/compiler/generate_code/c_code.cc index a28648c8..755a7402 100644 --- a/src/compiler/generate_code/c_code.cc +++ b/src/compiler/generate_code/c_code.cc @@ -11,6 +11,7 @@ #include "compiler/lexical_grammar.h" #include "compiler/rules/built_in_symbols.h" #include "compiler/util/string_helpers.h" +#include 
"tree_sitter/runtime.h" namespace tree_sitter { namespace generate_code { @@ -134,6 +135,7 @@ class CCodeGenerator { } } + line("#define LANGUAGE_VERSION " + to_string(TREE_SITTER_LANGUAGE_VERSION)); line("#define STATE_COUNT " + to_string(parse_table.states.size())); line("#define SYMBOL_COUNT " + to_string(parse_table.symbols.size())); line("#define TOKEN_COUNT " + to_string(token_count)); @@ -227,7 +229,7 @@ class CCodeGenerator { for (size_t i = 0, n = lexical_grammar.variables.size(); i < n; i++) { for (size_t j = 0; j < syntax_grammar.external_tokens.size(); j++) { const ExternalToken &external_token = syntax_grammar.external_tokens[j]; - if (external_token.corresponding_internal_token.index == i) { + if (external_token.corresponding_internal_token.index == Symbol::Index(i)) { external_tokens_by_corresponding_internal_token.insert({i, j}); break; } diff --git a/src/runtime/document.c b/src/runtime/document.c index c68d8c62..8c1eb779 100644 --- a/src/runtime/document.c +++ b/src/runtime/document.c @@ -36,6 +36,7 @@ const TSLanguage *ts_document_language(TSDocument *self) { } void ts_document_set_language(TSDocument *self, const TSLanguage *language) { + if (language->version != TREE_SITTER_LANGUAGE_VERSION) return; ts_document_invalidate(self); parser_set_language(&self->parser, language); if (self->tree) { diff --git a/src/runtime/language.c b/src/runtime/language.c index af08bb38..7f1bdefa 100644 --- a/src/runtime/language.c +++ b/src/runtime/language.c @@ -34,6 +34,10 @@ uint32_t ts_language_symbol_count(const TSLanguage *language) { return language->symbol_count; } +uint32_t ts_language_version(const TSLanguage *language) { + return language->version; +} + TSSymbolMetadata ts_language_symbol_metadata(const TSLanguage *language, TSSymbol symbol) { if (symbol == ts_builtin_sym_error) From 60f6998485e30640f496fad5a7b47e51b418f07a Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Tue, 31 Jan 2017 10:29:25 -0800 Subject: [PATCH 46/50] Rename generated language 
functions to e.g. `tree_sitter_python` They used to be called e.g. `ts_language_python`. Now that there are APIs that deal with the `TSLanguage` objects themselves, such as `ts_language_symbol_count`, the old names were a little confusing. --- .../external_scanners/extra_external_tokens.c | 12 ++++++------ spec/fixtures/external_scanners/percent_strings.c | 12 ++++++------ .../external_scanners/shared_external_tokens.c | 12 ++++++------ spec/helpers/load_language.cc | 2 +- src/compiler/generate_code/c_code.cc | 5 +++-- 5 files changed, 22 insertions(+), 21 deletions(-) diff --git a/spec/fixtures/external_scanners/extra_external_tokens.c b/spec/fixtures/external_scanners/extra_external_tokens.c index ba3338af..5c409639 100644 --- a/spec/fixtures/external_scanners/extra_external_tokens.c +++ b/spec/fixtures/external_scanners/extra_external_tokens.c @@ -4,21 +4,21 @@ enum { COMMENT, }; -void *ts_language_extra_external_tokens_external_scanner_create() { +void *tree_sitter_extra_external_tokens_external_scanner_create() { return NULL; } -void ts_language_extra_external_tokens_external_scanner_reset(void *payload) { +void tree_sitter_extra_external_tokens_external_scanner_reset(void *payload) { } -bool ts_language_extra_external_tokens_external_scanner_serialize(void *payload, TSExternalTokenState state) { +bool tree_sitter_extra_external_tokens_external_scanner_serialize(void *payload, TSExternalTokenState state) { return true; } -void ts_language_extra_external_tokens_external_scanner_deserialize(void *payload, TSExternalTokenState state) { +void tree_sitter_extra_external_tokens_external_scanner_deserialize(void *payload, TSExternalTokenState state) { } -bool ts_language_extra_external_tokens_external_scanner_scan( +bool tree_sitter_extra_external_tokens_external_scanner_scan( void *payload, TSLexer *lexer, const bool *whitelist) { while (lexer->lookahead == ' ') { @@ -38,5 +38,5 @@ bool ts_language_extra_external_tokens_external_scanner_scan( return false; } -void 
ts_language_extra_external_tokens_external_scanner_destroy(void *payload) { +void tree_sitter_extra_external_tokens_external_scanner_destroy(void *payload) { } diff --git a/spec/fixtures/external_scanners/percent_strings.c b/spec/fixtures/external_scanners/percent_strings.c index 56c12e81..9f68696e 100644 --- a/spec/fixtures/external_scanners/percent_strings.c +++ b/spec/fixtures/external_scanners/percent_strings.c @@ -13,7 +13,7 @@ typedef struct { uint32_t depth; } Scanner; -void *ts_language_external_scanner_example_external_scanner_create() { +void *tree_sitter_external_scanner_example_external_scanner_create() { Scanner *scanner = malloc(sizeof(Scanner)); *scanner = (Scanner){ .open_delimiter = 0, @@ -23,7 +23,7 @@ void *ts_language_external_scanner_example_external_scanner_create() { return scanner; } -bool ts_language_external_scanner_example_external_scanner_scan( +bool tree_sitter_external_scanner_example_external_scanner_scan( void *payload, TSLexer *lexer, const bool *whitelist) { Scanner *scanner = payload; @@ -103,16 +103,16 @@ bool ts_language_external_scanner_example_external_scanner_scan( return false; } -void ts_language_external_scanner_example_external_scanner_reset(void *payload) { +void tree_sitter_external_scanner_example_external_scanner_reset(void *payload) { } -bool ts_language_external_scanner_example_external_scanner_serialize(void *payload, TSExternalTokenState state) { +bool tree_sitter_external_scanner_example_external_scanner_serialize(void *payload, TSExternalTokenState state) { return true; } -void ts_language_external_scanner_example_external_scanner_deserialize(void *payload, TSExternalTokenState state) { +void tree_sitter_external_scanner_example_external_scanner_deserialize(void *payload, TSExternalTokenState state) { } -void ts_language_external_scanner_example_external_scanner_destroy(void *payload) { +void tree_sitter_external_scanner_example_external_scanner_destroy(void *payload) { free(payload); } diff --git 
a/spec/fixtures/external_scanners/shared_external_tokens.c b/spec/fixtures/external_scanners/shared_external_tokens.c index 3be1a848..0bee00d8 100644 --- a/spec/fixtures/external_scanners/shared_external_tokens.c +++ b/spec/fixtures/external_scanners/shared_external_tokens.c @@ -6,21 +6,21 @@ enum { LINE_BREAK }; -void *ts_language_shared_external_tokens_external_scanner_create() { +void *tree_sitter_shared_external_tokens_external_scanner_create() { return NULL; } -void ts_language_shared_external_tokens_external_scanner_reset(void *payload) { +void tree_sitter_shared_external_tokens_external_scanner_reset(void *payload) { } -bool ts_language_shared_external_tokens_external_scanner_serialize(void *payload, TSExternalTokenState state) { +bool tree_sitter_shared_external_tokens_external_scanner_serialize(void *payload, TSExternalTokenState state) { return true; } -void ts_language_shared_external_tokens_external_scanner_deserialize(void *payload, TSExternalTokenState state) { +void tree_sitter_shared_external_tokens_external_scanner_deserialize(void *payload, TSExternalTokenState state) { } -bool ts_language_shared_external_tokens_external_scanner_scan( +bool tree_sitter_shared_external_tokens_external_scanner_scan( void *payload, TSLexer *lexer, const bool *whitelist) { // If a line-break is a valid lookahead token, only skip spaces. 
@@ -59,5 +59,5 @@ bool ts_language_shared_external_tokens_external_scanner_scan( return false; } -void ts_language_shared_external_tokens_external_scanner_destroy(void *payload) { +void tree_sitter_shared_external_tokens_external_scanner_destroy(void *payload) { } diff --git a/spec/helpers/load_language.cc b/spec/helpers/load_language.cc index 84873d1a..c59eca95 100644 --- a/spec/helpers/load_language.cc +++ b/spec/helpers/load_language.cc @@ -73,7 +73,7 @@ const TSLanguage *load_language(const string &source_filename, const string &lib_filename, const string &language_name, string external_scanner_filename = "") { - string language_function_name = "ts_language_" + language_name; + string language_function_name = "tree_sitter_" + language_name; string header_dir = getenv("PWD") + string("/include"); int source_mtime = get_modified_time(source_filename); int header_mtime = get_modified_time(header_dir + "/tree_sitter/parser.h"); diff --git a/src/compiler/generate_code/c_code.cc b/src/compiler/generate_code/c_code.cc index 755a7402..2127078e 100644 --- a/src/compiler/generate_code/c_code.cc +++ b/src/compiler/generate_code/c_code.cc @@ -355,7 +355,8 @@ class CCodeGenerator { } void add_parser_export() { - string external_scanner_name = "ts_language_" + name + "_external_scanner"; + string language_function_name = "tree_sitter_" + name; + string external_scanner_name = language_function_name + "_external_scanner"; if (!syntax_grammar.external_tokens.empty()) { line("void *" + external_scanner_name + "_create();"); @@ -367,7 +368,7 @@ class CCodeGenerator { line(); } - line("const TSLanguage *ts_language_" + name + "() {"); + line("const TSLanguage *" + language_function_name + "() {"); indent([&]() { line("GET_LANGUAGE("); if (syntax_grammar.external_tokens.empty()) { From 4131e1c16e9d70d9190663fc7645acf82ac97649 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Tue, 31 Jan 2017 11:36:51 -0800 Subject: [PATCH 47/50] Return an error when external token name matches 
non-terminal rule --- include/tree_sitter/compiler.h | 3 +- .../prepare_grammar/extract_tokens_spec.cc | 42 +++++++++++++++---- .../prepare_grammar/extract_tokens.cc | 18 ++++++-- src/compiler/rules/built_in_symbols.cc | 2 +- 4 files changed, 53 insertions(+), 12 deletions(-) diff --git a/include/tree_sitter/compiler.h b/include/tree_sitter/compiler.h index b362e535..1c287fd5 100644 --- a/include/tree_sitter/compiler.h +++ b/include/tree_sitter/compiler.h @@ -10,7 +10,8 @@ typedef enum { TSCompileErrorTypeInvalidGrammar, TSCompileErrorTypeInvalidRegex, TSCompileErrorTypeUndefinedSymbol, - TSCompileErrorTypeInvalidUbiquitousToken, + TSCompileErrorTypeInvalidExtraToken, + TSCompileErrorTypeInvalidExternalToken, TSCompileErrorTypeLexConflict, TSCompileErrorTypeParseConflict, TSCompileErrorTypeEpsilonRule, diff --git a/spec/compiler/prepare_grammar/extract_tokens_spec.cc b/spec/compiler/prepare_grammar/extract_tokens_spec.cc index 30a731c8..3aa576df 100644 --- a/spec/compiler/prepare_grammar/extract_tokens_spec.cc +++ b/spec/compiler/prepare_grammar/extract_tokens_spec.cc @@ -130,11 +130,20 @@ describe("extract_tokens", []() { }); it("renumbers the grammar's expected conflict symbols based on any moved rules", [&]() { - auto result = extract_tokens(InternedGrammar{{ - Variable("rule_A", VariableTypeNamed, str("ok")), - Variable("rule_B", VariableTypeNamed, repeat(i_sym(0))), - Variable("rule_C", VariableTypeNamed, repeat(seq({ i_sym(0), i_sym(0) }))), - }, { str(" ") }, { { Symbol(1, Symbol::NonTerminal), Symbol(2, Symbol::NonTerminal) } }}); + auto result = extract_tokens(InternedGrammar{ + { + Variable("rule_A", VariableTypeNamed, str("ok")), + Variable("rule_B", VariableTypeNamed, repeat(i_sym(0))), + Variable("rule_C", VariableTypeNamed, repeat(seq({ i_sym(0), i_sym(0) }))), + }, + { + str(" ") + }, + { + { Symbol(1, Symbol::NonTerminal), Symbol(2, Symbol::NonTerminal) } + }, + {} + }); InitialSyntaxGrammar &syntax_grammar = get<0>(result); @@ -201,7 +210,7 @@ 
describe("extract_tokens", []() { AssertThat(get<2>(result), !Equals(CompileError::none())); AssertThat(get<2>(result), Equals( - CompileError(TSCompileErrorTypeInvalidUbiquitousToken, + CompileError(TSCompileErrorTypeInvalidExtraToken, "Not a token: rule_B"))); }); @@ -213,11 +222,30 @@ describe("extract_tokens", []() { AssertThat(get<2>(result), !Equals(CompileError::none())); AssertThat(get<2>(result), Equals(CompileError( - TSCompileErrorTypeInvalidUbiquitousToken, + TSCompileErrorTypeInvalidExtraToken, "Not a token: (choice (non-terminal 1) (blank))" ))); }); }); + + it("returns an error if an external token has the same name as a non-terminal rule", [&]() { + auto result = extract_tokens(InternedGrammar{ + { + Variable("rule_A", VariableTypeNamed, seq({ str("x"), i_sym(1) })), + Variable("rule_B", VariableTypeNamed, seq({ str("y"), str("z") })), + }, + {}, + {}, + { + ExternalToken {"rule_A", VariableTypeNamed, Symbol(0, Symbol::NonTerminal)} + } + }); + + AssertThat(get<2>(result), Equals(CompileError( + TSCompileErrorTypeInvalidExternalToken, + "Name 'rule_A' cannot be used for both an external token and a non-terminal rule" + ))); + }); }); END_TEST diff --git a/src/compiler/prepare_grammar/extract_tokens.cc b/src/compiler/prepare_grammar/extract_tokens.cc index e84d028d..9d161ca8 100644 --- a/src/compiler/prepare_grammar/extract_tokens.cc +++ b/src/compiler/prepare_grammar/extract_tokens.cc @@ -91,8 +91,7 @@ class TokenExtractor : public rules::IdentityRuleFn { }; static CompileError extra_token_error(const string &message) { - return CompileError(TSCompileErrorTypeInvalidUbiquitousToken, - "Not a token: " + message); + return CompileError(TSCompileErrorTypeInvalidExtraToken, "Not a token: " + message); } tuple extract_tokens( @@ -187,10 +186,23 @@ tuple extract_tokens( } for (const ExternalToken &external_token : grammar.external_tokens) { + Symbol internal_token = symbol_replacer.replace_symbol(external_token.corresponding_internal_token); + + if 
(internal_token.is_non_terminal()) { + return make_tuple( + syntax_grammar, + lexical_grammar, + CompileError( + TSCompileErrorTypeInvalidExternalToken, + "Name '" + external_token.name + "' cannot be used for both an external token and a non-terminal rule" + ) + ); + } + syntax_grammar.external_tokens.push_back({ external_token.name, external_token.type, - symbol_replacer.replace_symbol(external_token.corresponding_internal_token) + internal_token }); } diff --git a/src/compiler/rules/built_in_symbols.cc b/src/compiler/rules/built_in_symbols.cc index b3f7cd66..0fe45f68 100644 --- a/src/compiler/rules/built_in_symbols.cc +++ b/src/compiler/rules/built_in_symbols.cc @@ -12,7 +12,7 @@ Symbol START() { } Symbol NONE() { - return Symbol(-3, Symbol::NonTerminal); + return Symbol(-3, Symbol::Type(-1)); } } // namespace rules From 005308f4c9baedd4433fec792de68fdb4cd3812c Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Tue, 31 Jan 2017 11:39:30 -0800 Subject: [PATCH 48/50] :fire: todo.md --- todo.md | 32 -------------------------------- 1 file changed, 32 deletions(-) delete mode 100644 todo.md diff --git a/todo.md b/todo.md deleted file mode 100644 index 0fd7f7b0..00000000 --- a/todo.md +++ /dev/null @@ -1,32 +0,0 @@ -TODO -==== - -### Handling ambiguity (GLR) -* Add a simple way to specify syntactic ambiguity resolutions in the Grammar (e.g. 'prefer declarations to statements' in C), similar to bison's `dprec` -construct. - -### Runtime System -* Refactoring: make separate symbol for unexpected characters than for interior error nodes. - -### Testing / Quality -* Start running the clang-analyzer on the codebase on Travis-CI. -* Use the Valgrind leak checker to fix the memory leaks in the runtime library. -* Randomize the editing in the language tests, using a seed that can be specified in order to reproduce failures. - -### Ubiquitous token handling -* Fix the unintuitive tree that results when ubiquitous tokens are last child of their parent node. 
- -### Error handling -* Use information about nesting depth of tokens like '(' and ')' to make error recovery more accurate. - -### Grammar Features -* Regexp assertions - - [ ] '^' - - [ ] '$' - - [ ] '\b' -* Composing languages - - [ ] Rule for referencing named grammar - - [ ] Grammar registry object in runtime - - [ ] Parsing returns control to parent language -* Indentation tokens - From d73534e97d28747aa689a081834d8a366d040b65 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Tue, 31 Jan 2017 11:41:46 -0800 Subject: [PATCH 49/50] Update language function name in README --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 472827a8..15def40e 100644 --- a/README.md +++ b/README.md @@ -176,11 +176,11 @@ tokens, like `(` and `+`. This is useful when analyzing the meaning of a documen #include "tree_sitter/runtime.h" // Declare the language function that was generated from your grammar. -TSLanguage *ts_language_arithmetic(); +TSLanguage *tree_sitter_arithmetic(); int main() { TSDocument *document = ts_document_new(); - ts_document_set_language(document, ts_language_arithmetic()); + ts_document_set_language(document, tree_sitter_arithmetic()); ts_document_set_input_string(document, "a + b * 5"); ts_document_parse(document); From 0a6e5f9ee6e1a581a7254a6ceb93dde320261e6e Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Tue, 31 Jan 2017 11:46:28 -0800 Subject: [PATCH 50/50] Fix some build warnings on gcc --- src/compiler/build_tables/build_parse_table.cc | 3 ++- src/compiler/generate_code/c_code.cc | 3 ++- src/compiler/rules/symbol.cc | 2 ++ 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/src/compiler/build_tables/build_parse_table.cc b/src/compiler/build_tables/build_parse_table.cc index bdaac037..9fb6859f 100644 --- a/src/compiler/build_tables/build_parse_table.cc +++ b/src/compiler/build_tables/build_parse_table.cc @@ -583,7 +583,8 @@ class ParseTableBuilder { case Symbol::NonTerminal: { 
return grammar.variables[symbol.index].name; } - case Symbol::External: { + case Symbol::External: + default: { return grammar.external_tokens[symbol.index].name; } } diff --git a/src/compiler/generate_code/c_code.cc b/src/compiler/generate_code/c_code.cc index 2127078e..bc84e557 100644 --- a/src/compiler/generate_code/c_code.cc +++ b/src/compiler/generate_code/c_code.cc @@ -564,7 +564,8 @@ class CCodeGenerator { const Variable &variable = lexical_grammar.variables[symbol.index]; return { variable.name, variable.type }; } - case Symbol::External: { + case Symbol::External: + default: { const ExternalToken &token = syntax_grammar.external_tokens[symbol.index]; return { token.name, token.type }; } diff --git a/src/compiler/rules/symbol.cc b/src/compiler/rules/symbol.cc index 478de7cf..e826cb0f 100644 --- a/src/compiler/rules/symbol.cc +++ b/src/compiler/rules/symbol.cc @@ -41,6 +41,8 @@ string Symbol::to_string() const { return "(non-terminal " + std::to_string(index) + ")"; case Symbol::External: return "(external " + std::to_string(index) + ")"; + default: + return "(none)"; } }