diff --git a/README.md b/README.md
index 472827a8..15def40e 100644
--- a/README.md
+++ b/README.md
@@ -176,11 +176,11 @@ tokens, like `(` and `+`. This is useful when analyzing the meaning of a documen
 #include "tree_sitter/runtime.h"
 
 // Declare the language function that was generated from your grammar.
-TSLanguage *ts_language_arithmetic();
+TSLanguage *tree_sitter_arithmetic();
 
 int main() {
   TSDocument *document = ts_document_new();
-  ts_document_set_language(document, ts_language_arithmetic());
+  ts_document_set_language(document, tree_sitter_arithmetic());
   ts_document_set_input_string(document, "a + b * 5");
   ts_document_parse(document);
 
diff --git a/doc/grammar-schema.json b/doc/grammar-schema.json
index 5f43b279..f37cd983 100644
--- a/doc/grammar-schema.json
+++ b/doc/grammar-schema.json
@@ -40,6 +40,14 @@
         "pattern": "^[a-zA-Z_]\\w*$"
       }
     }
+  },
+
+  "externals": {
+    "type": "array",
+    "items": {
+      "type": "string",
+      "pattern": "^[a-zA-Z_]\\w*$"
+    }
   }
 },
diff --git a/include/tree_sitter/compiler.h b/include/tree_sitter/compiler.h
index b362e535..1c287fd5 100644
--- a/include/tree_sitter/compiler.h
+++ b/include/tree_sitter/compiler.h
@@ -10,7 +10,8 @@ typedef enum {
   TSCompileErrorTypeInvalidGrammar,
   TSCompileErrorTypeInvalidRegex,
   TSCompileErrorTypeUndefinedSymbol,
-  TSCompileErrorTypeInvalidUbiquitousToken,
+  TSCompileErrorTypeInvalidExtraToken,
+  TSCompileErrorTypeInvalidExternalToken,
   TSCompileErrorTypeLexConflict,
   TSCompileErrorTypeParseConflict,
   TSCompileErrorTypeEpsilonRule,
diff --git a/include/tree_sitter/parser.h b/include/tree_sitter/parser.h
index 3a5bab9a..197015f4 100644
--- a/include/tree_sitter/parser.h
+++ b/include/tree_sitter/parser.h
@@ -12,6 +12,8 @@ extern "C" {
 typedef unsigned short TSSymbol;
 typedef unsigned short TSStateId;
 
+typedef uint8_t TSExternalTokenState[16];
+
 #define ts_builtin_sym_error ((TSSymbol)-1)
 #define ts_builtin_sym_end 0
@@ -23,7 +25,7 @@ typedef struct {
 } TSSymbolMetadata;
 
 typedef struct {
-  void (*advance)(void *, TSStateId, bool);
+  void (*advance)(void *, bool);
   int32_t lookahead;
   TSSymbol result_symbol;
 } TSLexer;
@@ -48,6 +50,11 @@ typedef struct {
   bool fragile : 1;
 } TSParseAction;
 
+typedef struct {
+  uint16_t lex_state;
+  uint16_t external_lex_state;
+} TSLexMode;
+
 typedef union {
   TSParseAction action;
   struct {
@@ -58,14 +65,26 @@ typedef union {
 } TSParseActionEntry;
 
 typedef struct TSLanguage {
+  uint32_t version;
   uint32_t symbol_count;
   uint32_t token_count;
+  uint32_t external_token_count;
   const char **symbol_names;
   const TSSymbolMetadata *symbol_metadata;
   const unsigned short *parse_table;
   const TSParseActionEntry *parse_actions;
-  const TSStateId *lex_states;
+  const TSLexMode *lex_modes;
   bool (*lex_fn)(TSLexer *, TSStateId);
+  struct {
+    const bool *states;
+    const TSSymbol *symbol_map;
+    void *(*create)();
+    void (*destroy)(void *);
+    void (*reset)(void *);
+    bool (*scan)(void *, TSLexer *, const bool *symbol_whitelist);
+    bool (*serialize)(void *, TSExternalTokenState);
+    void (*deserialize)(void *, const TSExternalTokenState);
+  } external_scanner;
 } TSLanguage;
 
 /*
@@ -79,14 +98,14 @@ typedef struct TSLanguage {
 
 #define ADVANCE(state_value)                        \
   {                                                 \
-    lexer->advance(lexer, state_value, false);      \
+    lexer->advance(lexer, false);                   \
     state = state_value;                            \
    goto next_state;                                 \
   }
 
 #define SKIP(state_value)                           \
   {                                                 \
-    lexer->advance(lexer, state_value, true);       \
+    lexer->advance(lexer, true);                    \
     state = state_value;                            \
     goto next_state;                                \
   }
@@ -146,21 +165,21 @@ typedef struct TSLanguage {
     { .type = TSParseActionTypeAccept }             \
   }
 
-#define EXPORT_LANGUAGE(language_name)                       \
-  static TSLanguage language = {                             \
-    .symbol_count = SYMBOL_COUNT,                            \
-    .token_count = TOKEN_COUNT,                              \
-    .symbol_metadata = ts_symbol_metadata,                   \
-    .parse_table = (const unsigned short *)ts_parse_table,   \
-    .parse_actions = ts_parse_actions,                       \
-    .lex_states = ts_lex_states,                             \
-    .symbol_names = ts_symbol_names,                         \
-    .lex_fn = ts_lex,                                        \
-  };                                                         \
-                                                             \
-  const TSLanguage *language_name() {                        \
-    return &language;                                        \
-  }
+#define GET_LANGUAGE(...)                                    \
+  static TSLanguage language = {                             \
+    .version = LANGUAGE_VERSION,                             \
+    .symbol_count = SYMBOL_COUNT,                            \
+    .token_count = TOKEN_COUNT,                              \
+    .symbol_metadata = ts_symbol_metadata,                   \
+    .parse_table = (const unsigned short *)ts_parse_table,   \
+    .parse_actions = ts_parse_actions,                       \
+    .lex_modes = ts_lex_modes,                               \
+    .symbol_names = ts_symbol_names,                         \
+    .lex_fn = ts_lex,                                        \
+    .external_token_count = EXTERNAL_TOKEN_COUNT,            \
+    .external_scanner = {__VA_ARGS__}                        \
+  };                                                         \
+  return &language
 
 #ifdef __cplusplus
 }
diff --git a/include/tree_sitter/runtime.h b/include/tree_sitter/runtime.h
index 68e804f1..00d8e7c4 100644
--- a/include/tree_sitter/runtime.h
+++ b/include/tree_sitter/runtime.h
@@ -9,6 +9,8 @@ extern "C" {
 #include <stdbool.h>
 #include <stdint.h>
 
+#define TREE_SITTER_LANGUAGE_VERSION 1
+
 typedef unsigned short TSSymbol;
 typedef struct TSLanguage TSLanguage;
 typedef struct TSDocument TSDocument;
@@ -114,6 +116,7 @@ uint32_t ts_document_parse_count(const TSDocument *);
 
 uint32_t ts_language_symbol_count(const TSLanguage *);
 const char *ts_language_symbol_name(const TSLanguage *, TSSymbol);
+uint32_t ts_language_version(const TSLanguage *);
 
 #ifdef __cplusplus
 }
diff --git a/script/fetch-fixtures b/script/fetch-fixtures
index bb727298..7009d70f 100755
--- a/script/fetch-fixtures
+++ b/script/fetch-fixtures
@@ -7,6 +7,7 @@ GRAMMARS=(
   json
   c
   cpp
+  python
 )
 
 for grammar in ${GRAMMARS[@]}; do
@@ -21,7 +22,7 @@ for grammar in ${GRAMMARS[@]}; do
 
   (
     cd $grammar_dir;
-    git reset --hard;
-    git pull origin master;
+    git fetch origin
+    git reset --hard origin/master;
   )
 done
diff --git a/spec/compiler/build_tables/distinctive_tokens_spec.cc b/spec/compiler/build_tables/distinctive_tokens_spec.cc
index 104cd721..f01d76cb 100644
--- a/spec/compiler/build_tables/distinctive_tokens_spec.cc
+++ b/spec/compiler/build_tables/distinctive_tokens_spec.cc
@@ -27,7 +27,7 @@ describe("recovery_tokens(rule)", []() {
     })),
   };
 
-  AssertThat(recovery_tokens(grammar), Equals<set<Symbol>>({ 1 }));
+  AssertThat(recovery_tokens(grammar), Equals<set<Symbol>>({ Symbol(1, Symbol::Terminal) }));
 });
});
diff --git a/spec/compiler/build_tables/lex_conflict_manager_spec.cc b/spec/compiler/build_tables/lex_conflict_manager_spec.cc
index 7f43e175..3aa75a4c 100644
--- a/spec/compiler/build_tables/lex_conflict_manager_spec.cc
+++ b/spec/compiler/build_tables/lex_conflict_manager_spec.cc
@@ -14,10 +14,10 @@ START_TEST
 describe("LexConflictManager::resolve(new_action, old_action)", []() {
   LexConflictManager conflict_manager;
   bool update;
-  Symbol sym1(0, true);
-  Symbol sym2(1, true);
-  Symbol sym3(2, true);
-  Symbol sym4(3, true);
+  Symbol sym1(0, Symbol::Terminal);
+  Symbol sym2(1, Symbol::Terminal);
+  Symbol sym3(2, Symbol::Terminal);
+  Symbol sym4(3, Symbol::Terminal);
   LexItemSet item_set({ LexItem(sym4, blank() )});
 
   it("favors advance actions over empty accept token actions", [&]() {
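For orientation, here is a sketch of how generated parser code would use the new GET_LANGUAGE macro above. Because the macro's expansion ends with `return &language`, it must be invoked (with a trailing semicolon) inside the language function itself, passing the external scanner hooks positionally through `__VA_ARGS__` in struct-field order: states, symbol_map, create, destroy, reset, scan, serialize, deserialize. The `ts_external_scanner_*` table names below are assumptions for illustration, not names taken from this diff; the scanner function names follow the convention the fixture files later in the diff use.

    // Hypothetical generated code (table names assumed).
    extern "C" const TSLanguage *tree_sitter_arithmetic() {
      GET_LANGUAGE(
        ts_external_scanner_states,      // assumed name for the `states` table
        ts_external_scanner_symbol_map,  // assumed name for the `symbol_map` table
        tree_sitter_arithmetic_external_scanner_create,
        tree_sitter_arithmetic_external_scanner_destroy,
        tree_sitter_arithmetic_external_scanner_reset,
        tree_sitter_arithmetic_external_scanner_scan,
        tree_sitter_arithmetic_external_scanner_serialize,
        tree_sitter_arithmetic_external_scanner_deserialize
      );
    }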
diff --git a/spec/compiler/build_tables/lex_item_spec.cc b/spec/compiler/build_tables/lex_item_spec.cc
index 94997956..7042922f 100644
--- a/spec/compiler/build_tables/lex_item_spec.cc
+++ b/spec/compiler/build_tables/lex_item_spec.cc
@@ -14,7 +14,7 @@ START_TEST
 describe("LexItem", []() {
   describe("completion_status()", [&]() {
     it("indicates whether the item is done, its precedence, and whether it is a string", [&]() {
-      LexItem item1(Symbol(0, true), character({ 'a', 'b', 'c' }));
+      LexItem item1(Symbol(0, Symbol::Terminal), character({ 'a', 'b', 'c' }));
       AssertThat(item1.completion_status().is_done, IsFalse());
       AssertThat(item1.completion_status().precedence, Equals(PrecedenceRange()));
       AssertThat(item1.completion_status().is_string, IsFalse());
@@ -23,7 +23,7 @@ describe("LexItem", []() {
       params.precedence = 3;
       params.has_precedence = true;
       params.is_string = 1;
-      LexItem item2(Symbol(0, true), choice({
+      LexItem item2(Symbol(0, Symbol::Terminal), choice({
         metadata(blank(), params),
         character({ 'a', 'b', 'c' })
       }));
@@ -32,7 +32,7 @@ describe("LexItem", []() {
       AssertThat(item2.completion_status().precedence, Equals(PrecedenceRange(3)));
       AssertThat(item2.completion_status().is_string, IsTrue());
 
-      LexItem item3(Symbol(0, true), repeat(character({ ' ', '\t' })));
+      LexItem item3(Symbol(0, Symbol::Terminal), repeat(character({ ' ', '\t' })));
       AssertThat(item3.completion_status().is_done, IsTrue());
       AssertThat(item3.completion_status().precedence, Equals(PrecedenceRange()));
       AssertThat(item3.completion_status().is_string, IsFalse());
@@ -43,7 +43,7 @@
 
 describe("LexItemSet::transitions()", [&]() {
   it("handles single characters", [&]() {
     LexItemSet item_set({
-      LexItem(Symbol(1), character({ 'x' })),
+      LexItem(Symbol(1, Symbol::NonTerminal), character({ 'x' })),
     });
 
     AssertThat(
@@ -53,7 +53,7 @@ describe("LexItemSet::transitions()", [&]() {
         CharacterSet().include('x'),
         Transition{
           LexItemSet({
-            LexItem(Symbol(1), blank()),
+            LexItem(Symbol(1, Symbol::NonTerminal), blank()),
           }),
           PrecedenceRange(),
           false
@@ -67,7 +67,7 @@ describe("LexItemSet::transitions()", [&]() {
     params.is_main_token = true;
 
     LexItemSet item_set({
-      LexItem(Symbol(1), metadata(character({ 'x' }), params)),
+      LexItem(Symbol(1, Symbol::NonTerminal), metadata(character({ 'x' }), params)),
     });
 
     AssertThat(
@@ -77,7 +77,7 @@ describe("LexItemSet::transitions()", [&]() {
         CharacterSet().include('x'),
         Transition{
           LexItemSet({
-            LexItem(Symbol(1), metadata(blank(), params)),
+            LexItem(Symbol(1, Symbol::NonTerminal), metadata(blank(), params)),
           }),
           PrecedenceRange(),
           true
@@ -88,7 +88,7 @@ describe("LexItemSet::transitions()", [&]() {
 
   it("handles sequences", [&]() {
     LexItemSet item_set({
-      LexItem(Symbol(1), seq({
+      LexItem(Symbol(1, Symbol::NonTerminal), seq({
         character({ 'w' }),
         character({ 'x' }),
         character({ 'y' }),
@@ -103,7 +103,7 @@ describe("LexItemSet::transitions()", [&]() {
         CharacterSet().include('w'),
         Transition{
           LexItemSet({
-            LexItem(Symbol(1), seq({
+            LexItem(Symbol(1, Symbol::NonTerminal), seq({
               character({ 'x' }),
               character({ 'y' }),
               character({ 'z' }),
@@ -118,7 +118,7 @@ describe("LexItemSet::transitions()", [&]() {
 
   it("handles sequences with nested precedence", [&]() {
     LexItemSet item_set({
-      LexItem(Symbol(1), seq({
+      LexItem(Symbol(1, Symbol::NonTerminal), seq({
         prec(3, seq({
           character({ 'v' }),
           prec(4, seq({
@@ -140,7 +140,7 @@ describe("LexItemSet::transitions()", [&]() {
           // The outer precedence is now 'active', because we are within its
           // contained rule.
           LexItemSet({
-            LexItem(Symbol(1), seq({
+            LexItem(Symbol(1, Symbol::NonTerminal), seq({
               active_prec(3, seq({
                 prec(4, seq({
                   character({ 'w' }),
@@ -168,7 +168,7 @@ describe("LexItemSet::transitions()", [&]() {
         Transition{
           // The inner precedence is now 'active'
           LexItemSet({
-            LexItem(Symbol(1), seq({
+            LexItem(Symbol(1, Symbol::NonTerminal), seq({
              active_prec(3, seq({ active_prec(4, character({ 'x' })), character({ 'y' }) })),
@@ -193,7 +193,7 @@ describe("LexItemSet::transitions()", [&]() {
         CharacterSet().include('x'),
         Transition{
           LexItemSet({
-            LexItem(Symbol(1), seq({
+            LexItem(Symbol(1, Symbol::NonTerminal), seq({
               active_prec(3, character({ 'y' })),
               character({ 'z' }),
             })),
@@ -216,7 +216,7 @@ describe("LexItemSet::transitions()", [&]() {
         CharacterSet().include('y'),
         Transition{
           LexItemSet({
-            LexItem(Symbol(1), character({ 'z' })),
+            LexItem(Symbol(1, Symbol::NonTerminal), character({ 'z' })),
           }),
           PrecedenceRange(3),
           false
@@ -227,7 +227,7 @@ describe("LexItemSet::transitions()", [&]() {
 
   it("handles sequences where the left hand side can be blank", [&]() {
     LexItemSet item_set({
-      LexItem(Symbol(1), seq({
+      LexItem(Symbol(1, Symbol::NonTerminal), seq({
         choice({
           character({ 'x' }),
           blank(),
@@ -244,7 +244,7 @@ describe("LexItemSet::transitions()", [&]() {
         CharacterSet().include('x'),
         Transition{
           LexItemSet({
-            LexItem(Symbol(1), seq({
+            LexItem(Symbol(1, Symbol::NonTerminal), seq({
               character({ 'y' }),
               character({ 'z' }),
             })),
@@ -257,7 +257,7 @@ describe("LexItemSet::transitions()", [&]() {
         CharacterSet().include('y'),
         Transition{
           LexItemSet({
-            LexItem(Symbol(1), character({ 'z' })),
+            LexItem(Symbol(1, Symbol::NonTerminal), character({ 'z' })),
           }),
           PrecedenceRange(),
           false
@@ -268,7 +268,7 @@ describe("LexItemSet::transitions()", [&]() {
 
   it("handles blanks", [&]() {
     LexItemSet item_set({
-      LexItem(Symbol(1), blank()),
+      LexItem(Symbol(1, Symbol::NonTerminal), blank()),
     });
 
     AssertThat(item_set.transitions(), IsEmpty());
@@ -276,11 +276,11 @@ describe("LexItemSet::transitions()", [&]() {
 
   it("handles repeats", [&]() {
     LexItemSet item_set({
-      LexItem(Symbol(1), repeat1(seq({
+      LexItem(Symbol(1, Symbol::NonTerminal), repeat1(seq({
         character({ 'a' }),
         character({ 'b' }),
       }))),
-      LexItem(Symbol(2), repeat1(character({ 'c' }))),
+      LexItem(Symbol(2, Symbol::NonTerminal), repeat1(character({ 'c' }))),
     });
 
     AssertThat(
@@ -290,14 +290,14 @@ describe("LexItemSet::transitions()", [&]() {
         CharacterSet().include('a'),
         Transition{
           LexItemSet({
-            LexItem(Symbol(1), seq({
+            LexItem(Symbol(1, Symbol::NonTerminal), seq({
               character({ 'b' }),
               repeat1(seq({
                 character({ 'a' }),
                 character({ 'b' }),
               }))
             })),
-            LexItem(Symbol(1), character({ 'b' })),
+            LexItem(Symbol(1, Symbol::NonTerminal), character({ 'b' })),
           }),
           PrecedenceRange(),
           false
@@ -307,8 +307,8 @@ describe("LexItemSet::transitions()", [&]() {
         CharacterSet().include('c'),
         Transition{
           LexItemSet({
-            LexItem(Symbol(2), repeat1(character({ 'c' }))),
-            LexItem(Symbol(2), blank()),
+            LexItem(Symbol(2, Symbol::NonTerminal), repeat1(character({ 'c' }))),
+            LexItem(Symbol(2, Symbol::NonTerminal), blank()),
           }),
           PrecedenceRange(),
           false
@@ -319,7 +319,7 @@ describe("LexItemSet::transitions()", [&]() {
 
   it("handles repeats with precedence", [&]() {
     LexItemSet item_set({
-      LexItem(Symbol(1), active_prec(-1, repeat1(character({ 'a' }))))
+      LexItem(Symbol(1, Symbol::NonTerminal), active_prec(-1, repeat1(character({ 'a' }))))
     });
 
     AssertThat(
@@ -329,8 +329,8 @@ describe("LexItemSet::transitions()", [&]() {
         CharacterSet().include('a'),
         Transition{
           LexItemSet({
-            LexItem(Symbol(1), active_prec(-1, repeat1(character({ 'a' })))),
-            LexItem(Symbol(1), active_prec(-1, blank())),
+            LexItem(Symbol(1, Symbol::NonTerminal), active_prec(-1, repeat1(character({ 'a' })))),
+            LexItem(Symbol(1, Symbol::NonTerminal), active_prec(-1, blank())),
           }),
           PrecedenceRange(-1),
           false
@@ -341,7 +341,7 @@ describe("LexItemSet::transitions()", [&]() {
 
   it("handles choices between overlapping character sets", [&]() {
     LexItemSet item_set({
-      LexItem(Symbol(1), choice({
+      LexItem(Symbol(1, Symbol::NonTerminal), choice({
         active_prec(2, seq({
           character({ 'a', 'b', 'c', 'd' }),
           character({ 'x' }),
@@ -360,7 +360,7 @@ describe("LexItemSet::transitions()", [&]() {
         CharacterSet().include('a', 'b'),
         Transition{
           LexItemSet({
-            LexItem(Symbol(1), active_prec(2, character({ 'x' }))),
+            LexItem(Symbol(1, Symbol::NonTerminal), active_prec(2, character({ 'x' }))),
           }),
           PrecedenceRange(2),
           false
@@ -370,8 +370,8 @@ describe("LexItemSet::transitions()", [&]() {
         CharacterSet().include('c', 'd'),
         Transition{
           LexItemSet({
-            LexItem(Symbol(1), active_prec(2, character({ 'x' }))),
-            LexItem(Symbol(1), active_prec(3, character({ 'y' }))),
+            LexItem(Symbol(1, Symbol::NonTerminal), active_prec(2, character({ 'x' }))),
+            LexItem(Symbol(1, Symbol::NonTerminal), active_prec(3, character({ 'y' }))),
           }),
           PrecedenceRange(2, 3),
           false
@@ -381,7 +381,7 @@ describe("LexItemSet::transitions()", [&]() {
         CharacterSet().include('e', 'f'),
         Transition{
           LexItemSet({
-            LexItem(Symbol(1), active_prec(3, character({ 'y' }))),
+            LexItem(Symbol(1, Symbol::NonTerminal), active_prec(3, character({ 'y' }))),
           }),
           PrecedenceRange(3),
           false
@@ -392,7 +392,7 @@ describe("LexItemSet::transitions()", [&]() {
 
   it("handles choices between a subset and a superset of characters", [&]() {
     LexItemSet item_set({
-      LexItem(Symbol(1), choice({
+      LexItem(Symbol(1, Symbol::NonTerminal), choice({
        seq({
          character({ 'b', 'c', 'd' }),
          character({ 'x' }),
@@ -411,7 +411,7 @@ describe("LexItemSet::transitions()", [&]() {
         CharacterSet().include('a').include('e', 'f'),
         Transition{
           LexItemSet({
-            LexItem(Symbol(1), character({ 'y' })),
+            LexItem(Symbol(1, Symbol::NonTerminal), character({ 'y' })),
           }),
           PrecedenceRange(),
           false
@@ -421,8 +421,8 @@ describe("LexItemSet::transitions()", [&]() {
         CharacterSet().include('b', 'd'),
         Transition{
           LexItemSet({
-            LexItem(Symbol(1), character({ 'x' })),
-            LexItem(Symbol(1), character({ 'y' })),
+            LexItem(Symbol(1, Symbol::NonTerminal), character({ 'x' })),
+            LexItem(Symbol(1, Symbol::NonTerminal), character({ 'y' })),
           }),
           PrecedenceRange(),
           false
@@ -433,7 +433,7 @@ describe("LexItemSet::transitions()", [&]() {
 
   it("handles choices between whitelisted and blacklisted character sets", [&]() {
     LexItemSet item_set({
-      LexItem(Symbol(1), seq({
+      LexItem(Symbol(1, Symbol::NonTerminal), seq({
         choice({
           character({ '/' }, false),
           seq({
@@ -452,7 +452,7 @@ describe("LexItemSet::transitions()", [&]() {
         CharacterSet().include_all().exclude('/').exclude('\\'),
         Transition{
           LexItemSet({
-            LexItem(Symbol(1), character({ '/' })),
+            LexItem(Symbol(1, Symbol::NonTerminal), character({ '/' })),
           }),
           PrecedenceRange(),
           false
@@ -462,8 +462,8 @@ describe("LexItemSet::transitions()", [&]() {
         CharacterSet().include('\\'),
         Transition{
           LexItemSet({
-            LexItem(Symbol(1), character({ '/' })),
-            LexItem(Symbol(1), seq({ character({ '/' }), character({ '/' }) })),
+            LexItem(Symbol(1, Symbol::NonTerminal), character({ '/' })),
+            LexItem(Symbol(1, Symbol::NonTerminal), seq({ character({ '/' }), character({ '/' }) })),
           }),
           PrecedenceRange(),
           false
@@ -474,8 +474,8 @@ describe("LexItemSet::transitions()", [&]() {
 
   it("handles different items with overlapping character sets", [&]() {
     LexItemSet set1({
-      LexItem(Symbol(1), character({ 'a', 'b', 'c', 'd', 'e', 'f' })),
-      LexItem(Symbol(2), character({ 'e', 'f', 'g', 'h', 'i' }))
+      LexItem(Symbol(1, Symbol::NonTerminal), character({ 'a', 'b', 'c', 'd', 'e', 'f' })),
+      LexItem(Symbol(2, Symbol::NonTerminal), character({ 'e', 'f', 'g', 'h', 'i' }))
     });
 
     AssertThat(set1.transitions(), Equals(LexItemSet::TransitionMap({
@@ -483,7 +483,7 @@ describe("LexItemSet::transitions()", [&]() {
         CharacterSet().include('a', 'd'),
         Transition{
           LexItemSet({
-            LexItem(Symbol(1), blank()),
+            LexItem(Symbol(1, Symbol::NonTerminal), blank()),
           }),
           PrecedenceRange(),
           false
@@ -493,8 +493,8 @@ describe("LexItemSet::transitions()", [&]() {
         CharacterSet().include('e', 'f'),
         Transition{
           LexItemSet({
-            LexItem(Symbol(1), blank()),
-            LexItem(Symbol(2), blank()),
+            LexItem(Symbol(1, Symbol::NonTerminal), blank()),
+            LexItem(Symbol(2, Symbol::NonTerminal), blank()),
           }),
           PrecedenceRange(),
           false
@@ -504,7 +504,7 @@ describe("LexItemSet::transitions()", [&]() {
         CharacterSet().include('g', 'i'),
         Transition{
           LexItemSet({
-            LexItem(Symbol(2), blank()),
+            LexItem(Symbol(2, Symbol::NonTerminal), blank()),
           }),
           PrecedenceRange(),
           false
diff --git a/spec/compiler/build_tables/parse_item_set_builder_spec.cc b/spec/compiler/build_tables/parse_item_set_builder_spec.cc
index a1dd2231..6548f37a 100644
--- a/spec/compiler/build_tables/parse_item_set_builder_spec.cc
+++ b/spec/compiler/build_tables/parse_item_set_builder_spec.cc
@@ -27,26 +27,26 @@ describe("ParseItemSetBuilder", []() {
   SyntaxGrammar grammar{{
     SyntaxVariable("rule0", VariableTypeNamed, {
       Production({
-        {Symbol(1), 0, AssociativityNone},
-        {Symbol(11, true), 0, AssociativityNone},
+        {Symbol(1, Symbol::NonTerminal), 0, AssociativityNone},
+        {Symbol(11, Symbol::Terminal), 0, AssociativityNone},
       }),
     }),
    SyntaxVariable("rule1", VariableTypeNamed, {
      Production({
-        {Symbol(12, true), 0, AssociativityNone},
-        {Symbol(13, true), 0, AssociativityNone},
+        {Symbol(12, Symbol::Terminal), 0, AssociativityNone},
+        {Symbol(13, Symbol::Terminal), 0, AssociativityNone},
      }),
      Production({
-        {Symbol(2), 0, AssociativityNone},
+        {Symbol(2, Symbol::NonTerminal), 0, AssociativityNone},
      })
    }),
    SyntaxVariable("rule2", VariableTypeNamed, {
      Production({
-        {Symbol(14, true), 0, AssociativityNone},
-        {Symbol(15, true), 0, AssociativityNone},
+        {Symbol(14, Symbol::Terminal), 0, AssociativityNone},
+        {Symbol(15, Symbol::Terminal), 0, AssociativityNone},
      })
    }),
-  }, {}, {}};
+  }, {}, {}, {}};
 
   auto production = [&](int variable_index, int production_index) -> const Production & {
     return grammar.variables[variable_index].productions[production_index];
@@ -54,8 +54,8 @@ describe("ParseItemSetBuilder", []() {
 
   ParseItemSet item_set({
     {
-      ParseItem(Symbol(0), production(0, 0), 0),
-      LookaheadSet({ 10 }),
+      ParseItem(Symbol(0, Symbol::NonTerminal), production(0, 0), 0),
+      LookaheadSet({ Symbol(10, Symbol::Terminal) }),
     }
   });
 
@@ -64,20 +64,20 @@ describe("ParseItemSetBuilder", []() {
 
   AssertThat(item_set, Equals(ParseItemSet({
     {
-      ParseItem(Symbol(0), production(0, 0), 0),
-      LookaheadSet({ 10 })
+      ParseItem(Symbol(0, Symbol::NonTerminal), production(0, 0), 0),
+      LookaheadSet({ Symbol(10, Symbol::Terminal) })
+    },
+    {
+      ParseItem(Symbol(1, Symbol::NonTerminal), production(1, 0), 0),
+      LookaheadSet({ Symbol(11, Symbol::Terminal) })
     },
     {
-      ParseItem(Symbol(1), production(1, 0), 0),
-      LookaheadSet({ 11 })
+      ParseItem(Symbol(1, Symbol::NonTerminal), production(1, 1), 0),
+      LookaheadSet({ Symbol(11, Symbol::Terminal) })
     },
     {
-      ParseItem(Symbol(1), production(1, 1), 0),
-      LookaheadSet({ 11 })
-    },
-    {
-      ParseItem(Symbol(2), production(2, 0), 0),
-      LookaheadSet({ 11 })
+      ParseItem(Symbol(2, Symbol::NonTerminal), production(2, 0), 0),
+      LookaheadSet({ Symbol(11, Symbol::Terminal) })
     },
   })));
 });
@@ -86,18 +86,18 @@ describe("ParseItemSetBuilder", []() {
   SyntaxGrammar grammar{{
     SyntaxVariable("rule0", VariableTypeNamed, {
       Production({
-        {Symbol(1), 0, AssociativityNone},
-        {Symbol(11, true), 0, AssociativityNone},
+        {Symbol(1, Symbol::NonTerminal), 0, AssociativityNone},
+        {Symbol(11, Symbol::Terminal), 0, AssociativityNone},
       }),
     }),
     SyntaxVariable("rule1", VariableTypeNamed, {
      Production({
-        {Symbol(12, true), 0, AssociativityNone},
-        {Symbol(13, true), 0, AssociativityNone},
+        {Symbol(12, Symbol::Terminal), 0, AssociativityNone},
+        {Symbol(13, Symbol::Terminal), 0, AssociativityNone},
      }),
      Production({})
    }),
-  }, {}, {}};
+  }, {}, {}, {}};
 
   auto production = [&](int variable_index, int production_index) -> const Production & {
     return grammar.variables[variable_index].productions[production_index];
@@ -105,8 +105,8 @@ describe("ParseItemSetBuilder", []() {
 
   ParseItemSet item_set({
     {
-      ParseItem(Symbol(0), production(0, 0), 0),
-      LookaheadSet({ 10 }),
+      ParseItem(Symbol(0, Symbol::NonTerminal), production(0, 0), 0),
+      LookaheadSet({ Symbol(10, Symbol::Terminal) }),
     }
   });
 
@@ -115,16 +115,16 @@ describe("ParseItemSetBuilder", []() {
 
   AssertThat(item_set, Equals(ParseItemSet({
     {
-      ParseItem(Symbol(0), production(0, 0), 0),
-      LookaheadSet({ 10 })
+      ParseItem(Symbol(0, Symbol::NonTerminal), production(0, 0), 0),
+      LookaheadSet({ Symbol(10, Symbol::Terminal) })
     },
     {
-      ParseItem(Symbol(1), production(1, 0), 0),
-      LookaheadSet({ 11 })
+      ParseItem(Symbol(1, Symbol::NonTerminal), production(1, 0), 0),
+      LookaheadSet({ Symbol(11, Symbol::Terminal) })
     },
     {
-      ParseItem(Symbol(1), production(1, 1), 0),
-      LookaheadSet({ 11 })
+      ParseItem(Symbol(1, Symbol::NonTerminal), production(1, 1), 0),
+      LookaheadSet({ Symbol(11, Symbol::Terminal) })
     },
   })));
 });
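A recurring change throughout these specs is the replacement of the old `Symbol(index, bool is_token)` constructor with an explicit symbol kind (`Symbol::NonTerminal` / `Symbol::Terminal`). The shape this implies for the compiler's Symbol type is roughly the following; this is a sketch reconstructed from the call sites above, with field names assumed, not the actual compiler source (which may also define further kinds, e.g. for external tokens):

    // Sketch of the Symbol rule type implied by Symbol(1, Symbol::NonTerminal)
    // and Symbol(11, Symbol::Terminal); member names are assumptions.
    class Symbol {
     public:
      enum Type {
        NonTerminal,
        Terminal,
      };

      Symbol(int index, Type type) : index(index), type(type) {}

      int index;
      Type type;
    };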
diff --git a/spec/compiler/prepare_grammar/expand_repeats_spec.cc b/spec/compiler/prepare_grammar/expand_repeats_spec.cc
index c25ff47c..d8c93a41 100644
--- a/spec/compiler/prepare_grammar/expand_repeats_spec.cc
+++ b/spec/compiler/prepare_grammar/expand_repeats_spec.cc
@@ -13,7 +13,7 @@ describe("expand_repeats", []() {
   it("replaces repeat rules with pairs of recursive rules", [&]() {
     InitialSyntaxGrammar grammar{{
       Variable("rule0", VariableTypeNamed, repeat1(i_token(0))),
-    }, {}, {}};
+    }, {}, {}, {}};
 
     auto result = expand_repeats(grammar);
 
@@ -32,7 +32,7 @@ describe("expand_repeats", []() {
       i_token(10),
       repeat1(i_token(11)),
     })),
-    }, {}, {}};
+    }, {}, {}, {}};
 
     auto result = expand_repeats(grammar);
 
@@ -54,7 +54,7 @@ describe("expand_repeats", []() {
       i_token(10),
       repeat1(i_token(11))
     })),
-    }, {}, {}};
+    }, {}, {}, {}};
 
     auto result = expand_repeats(grammar);
 
@@ -80,7 +80,7 @@ describe("expand_repeats", []() {
       i_token(3),
       repeat1(i_token(4))
    })),
-    }, {}, {}};
+    }, {}, {}, {}};
 
     auto result = expand_repeats(grammar);
 
@@ -106,7 +106,7 @@ describe("expand_repeats", []() {
       repeat1(i_token(10)),
       repeat1(i_token(11)),
     })),
-    }, {}, {}};
+    }, {}, {}, {}};
 
     auto result = expand_repeats(grammar);
 
@@ -130,7 +130,7 @@ describe("expand_repeats", []() {
     InitialSyntaxGrammar grammar{{
       Variable("rule0", VariableTypeNamed, repeat1(i_token(10))),
       Variable("rule1", VariableTypeNamed, repeat1(i_token(11))),
-    }, {}, {}};
+    }, {}, {}, {}};
 
     auto result = expand_repeats(grammar);
diff --git a/spec/compiler/prepare_grammar/extract_tokens_spec.cc b/spec/compiler/prepare_grammar/extract_tokens_spec.cc
index 9f871ec4..3aa576df 100644
--- a/spec/compiler/prepare_grammar/extract_tokens_spec.cc
+++ b/spec/compiler/prepare_grammar/extract_tokens_spec.cc
@@ -5,6 +5,7 @@
 #include "compiler/prepare_grammar/extract_tokens.h"
 #include "helpers/rule_helpers.h"
 #include "helpers/equals_pointer.h"
+#include "helpers/stream_methods.h"
 
 START_TEST
 
@@ -28,7 +29,7 @@ describe("extract_tokens", []() {
     Variable("rule_B", VariableTypeNamed, pattern("ij+")),
     Variable("rule_C", VariableTypeNamed, choice({ str("kl"), blank() })),
     Variable("rule_D", VariableTypeNamed, repeat1(i_sym(3)))
-  }, {}, {}});
+  }, {}, {}, {}});
 
   InitialSyntaxGrammar &syntax_grammar = get<0>(result);
   LexicalGrammar &lexical_grammar = get<1>(result);
@@ -91,7 +92,7 @@ describe("extract_tokens", []() {
       i_sym(0),
       str("ab"),
     })),
-  }, {}, {}});
+  }, {}, {}, {}});
 
   InitialSyntaxGrammar &syntax_grammar = get<0>(result);
   LexicalGrammar &lexical_grammar = get<1>(result);
@@ -110,7 +111,7 @@ describe("extract_tokens", []() {
     Variable("rule_A", VariableTypeNamed, seq({ i_sym(1), str("ab") })),
     Variable("rule_B", VariableTypeNamed, str("cd")),
     Variable("rule_C", VariableTypeNamed, seq({ str("ef"), str("cd") })),
-  }, {}, {}});
+  }, {}, {}, {}});
 
   InitialSyntaxGrammar &syntax_grammar = get<0>(result);
   LexicalGrammar &lexical_grammar = get<1>(result);
@@ -129,17 +130,26 @@ describe("extract_tokens", []() {
   });
 
   it("renumbers the grammar's expected conflict symbols based on any moved rules", [&]() {
-    auto result = extract_tokens(InternedGrammar{{
-      Variable("rule_A", VariableTypeNamed, str("ok")),
-      Variable("rule_B", VariableTypeNamed, repeat(i_sym(0))),
-      Variable("rule_C", VariableTypeNamed, repeat(seq({ i_sym(0), i_sym(0) }))),
-    }, { str(" ") }, { { Symbol(1), Symbol(2) } }});
+    auto result = extract_tokens(InternedGrammar{
+      {
+        Variable("rule_A", VariableTypeNamed, str("ok")),
+        Variable("rule_B", VariableTypeNamed, repeat(i_sym(0))),
+        Variable("rule_C", VariableTypeNamed, repeat(seq({ i_sym(0), i_sym(0) }))),
+      },
+      {
+        str(" ")
+      },
+      {
+        { Symbol(1, Symbol::NonTerminal), Symbol(2, Symbol::NonTerminal) }
+      },
+      {}
+    });
 
     InitialSyntaxGrammar &syntax_grammar = get<0>(result);
 
     AssertThat(syntax_grammar.variables.size(), Equals(2));
     AssertThat(syntax_grammar.expected_conflicts, Equals(set<set<Symbol>>({
-      { Symbol(0), Symbol(1) },
+      { Symbol(0, Symbol::NonTerminal), Symbol(1, Symbol::NonTerminal) },
     })));
   });
 
@@ -150,7 +160,7 @@ describe("extract_tokens", []() {
   }, {
     str("y"),
     pattern("\\s+"),
-  }, {}});
+  }, {}, {}});
 
   AssertThat(get<2>(result), Equals(CompileError::none()));
 
@@ -167,11 +177,11 @@ describe("extract_tokens", []() {
     Variable("rule_B", VariableTypeNamed, str("y")),
   }, {
     str("y"),
-  }, {}});
+  }, {}, {}});
 
   AssertThat(get<2>(result), Equals(CompileError::none()));
   AssertThat(get<1>(result).separators.size(), Equals(0));
-  AssertThat(get<0>(result).extra_tokens, Equals(set<Symbol>({ Symbol(1, true) })));
+  AssertThat(get<0>(result).extra_tokens, Equals(set<Symbol>({ Symbol(1, Symbol::Terminal) })));
 });
 
 it("updates extra symbols according to the new symbol numbers", [&]() {
@@ -181,12 +191,12 @@ describe("extract_tokens", []() {
     Variable("rule_C", VariableTypeNamed, str("z")),
   }, {
     i_sym(2),
-  }, {}});
+  }, {}, {}});
 
   AssertThat(get<2>(result), Equals(CompileError::none()));
 
   AssertThat(get<0>(result).extra_tokens, Equals(set<Symbol>({
-    { Symbol(3, true) },
+    { Symbol(3, Symbol::Terminal) },
   })));
 
   AssertThat(get<1>(result).separators, IsEmpty());
@@ -196,11 +206,11 @@ describe("extract_tokens", []() {
   auto result = extract_tokens(InternedGrammar{{
     Variable("rule_A", VariableTypeNamed, seq({ str("x"), i_sym(1) })),
     Variable("rule_B", VariableTypeNamed, seq({ str("y"), str("z") })),
-  }, { i_sym(1) }, {}});
+  }, { i_sym(1) }, {}, {}});
 
   AssertThat(get<2>(result), !Equals(CompileError::none()));
   AssertThat(get<2>(result), Equals(
-    CompileError(TSCompileErrorTypeInvalidUbiquitousToken,
+    CompileError(TSCompileErrorTypeInvalidExtraToken,
       "Not a token: rule_B")));
 });
 
@@ -208,14 +218,34 @@ describe("extract_tokens", []() {
   auto result = extract_tokens(InternedGrammar{{
     Variable("rule_A", VariableTypeNamed, str("x")),
     Variable("rule_B", VariableTypeNamed, str("y")),
-  }, { choice({ i_sym(1), blank() }) }, {}});
+  }, { choice({ i_sym(1), blank() }) }, {}, {}});
 
   AssertThat(get<2>(result), !Equals(CompileError::none()));
-  AssertThat(get<2>(result), Equals(
-    CompileError(TSCompileErrorTypeInvalidUbiquitousToken,
-      "Not a token: (choice (sym 1) (blank))")));
+  AssertThat(get<2>(result), Equals(CompileError(
+    TSCompileErrorTypeInvalidExtraToken,
+    "Not a token: (choice (non-terminal 1) (blank))"
+  )));
   });
 });
+
+it("returns an error if an external token has the same name as a non-terminal rule", [&]() {
+  auto result = extract_tokens(InternedGrammar{
+    {
+      Variable("rule_A", VariableTypeNamed, seq({ str("x"), i_sym(1) })),
+      Variable("rule_B", VariableTypeNamed, seq({ str("y"), str("z") })),
+    },
+    {},
+    {},
+    {
+      ExternalToken {"rule_A", VariableTypeNamed, Symbol(0, Symbol::NonTerminal)}
+    }
+  });
+
+  AssertThat(get<2>(result), Equals(CompileError(
+    TSCompileErrorTypeInvalidExternalToken,
+    "Name 'rule_A' cannot be used for both an external token and a non-terminal rule"
+  )));
+});
 });
 
 END_TEST
diff --git a/spec/compiler/prepare_grammar/flatten_grammar_spec.cc b/spec/compiler/prepare_grammar/flatten_grammar_spec.cc
index 3efd4e03..823da8e6 100644
--- a/spec/compiler/prepare_grammar/flatten_grammar_spec.cc
+++ b/spec/compiler/prepare_grammar/flatten_grammar_spec.cc
@@ -36,19 +36,19 @@ describe("flatten_grammar", []() {
   AssertThat(result.type, Equals(VariableTypeNamed));
   AssertThat(result.productions, Equals(vector<Production>({
     Production({
-      {Symbol(1), 0, AssociativityNone},
-      {Symbol(2), 101, AssociativityLeft},
-      {Symbol(3), 102, AssociativityRight},
-      {Symbol(4), 101, AssociativityLeft},
-      {Symbol(6), 0, AssociativityNone},
-      {Symbol(7), 0, AssociativityNone},
+      {Symbol(1, Symbol::NonTerminal), 0, AssociativityNone},
+      {Symbol(2, Symbol::NonTerminal), 101, AssociativityLeft},
+      {Symbol(3, Symbol::NonTerminal), 102, AssociativityRight},
+      {Symbol(4, Symbol::NonTerminal), 101, AssociativityLeft},
+      {Symbol(6, Symbol::NonTerminal), 0, AssociativityNone},
+      {Symbol(7, Symbol::NonTerminal), 0, AssociativityNone},
     }),
     Production({
-      {Symbol(1), 0, AssociativityNone},
-      {Symbol(2), 101, AssociativityLeft},
-      {Symbol(5), 101, AssociativityLeft},
-      {Symbol(6), 0, AssociativityNone},
-      {Symbol(7), 0, AssociativityNone},
+      {Symbol(1, Symbol::NonTerminal), 0, AssociativityNone},
+      {Symbol(2, Symbol::NonTerminal), 101, AssociativityLeft},
+      {Symbol(5, Symbol::NonTerminal), 101, AssociativityLeft},
+      {Symbol(6, Symbol::NonTerminal), 0, AssociativityNone},
+      {Symbol(7, Symbol::NonTerminal), 0, AssociativityNone},
     })
   })));
 });
@@ -65,8 +65,8 @@ describe("flatten_grammar", []() {
 
   AssertThat(result.productions, Equals(vector<Production>({
     Production({
-      {Symbol(1), 101, AssociativityLeft},
-      {Symbol(2), 101, AssociativityLeft},
+      {Symbol(1, Symbol::NonTerminal), 101, AssociativityLeft},
+      {Symbol(2, Symbol::NonTerminal), 101, AssociativityLeft},
     })
   })));
 
@@ -80,7 +80,7 @@ describe("flatten_grammar", []() {
 
   AssertThat(result.productions, Equals(vector<Production>({
     Production({
-      {Symbol(1), 101, AssociativityLeft},
+      {Symbol(1, Symbol::NonTerminal), 101, AssociativityLeft},
     })
   })));
 });
diff --git a/spec/compiler/prepare_grammar/intern_symbols_spec.cc b/spec/compiler/prepare_grammar/intern_symbols_spec.cc
index 4c417e57..9142eab6 100644
--- a/spec/compiler/prepare_grammar/intern_symbols_spec.cc
+++ b/spec/compiler/prepare_grammar/intern_symbols_spec.cc
@@ -3,8 +3,10 @@
 #include "compiler/grammar.h"
 #include "compiler/rules/named_symbol.h"
 #include "compiler/rules/symbol.h"
+#include "compiler/rules/built_in_symbols.h"
 #include "helpers/equals_pointer.h"
 #include "helpers/rule_helpers.h"
+#include "helpers/stream_methods.h"
 
 START_TEST
 
@@ -17,7 +19,7 @@ describe("intern_symbols", []() {
       { "x", choice({ sym("y"), sym("_z") }) },
      { "y", sym("_z") },
      { "_z", str("stuff") }
-    }, {}, {}};
+    }, {}, {}, {}};
 
     auto result = intern_symbols(grammar);
 
@@ -33,7 +35,7 @@ describe("intern_symbols", []() {
     it("returns an error", []() {
       Grammar grammar{{
         { "x", sym("y") },
-      }, {}, {}};
+      }, {}, {}, {}};
 
       auto result = intern_symbols(grammar);
 
@@ -48,7 +50,7 @@ describe("intern_symbols", []() {
       { "z", str("stuff") }
     }, {
       sym("z")
-    }, {}};
+    }, {}, {}};
 
     auto result = intern_symbols(grammar);
 
@@ -56,6 +58,32 @@ describe("intern_symbols", []() {
     AssertThat(result.first.extra_tokens.size(), Equals(1));
     AssertThat(*result.first.extra_tokens.begin(), EqualsPointer(i_sym(2)));
   });
+
+  it("records any rule names that match external token names", [&]() {
+    Grammar grammar{{
+      { "x", choice({ sym("y"), sym("z") }) },
+      { "y", sym("z") },
+      { "z", str("stuff") }
+    }, {}, {}, {
+      "w",
+      "z"
+    }};
+
+    auto result = intern_symbols(grammar);
+
+    AssertThat(result.first.external_tokens, Equals(vector<ExternalToken>({
+      {
+        "w",
+        VariableTypeNamed,
+        rules::NONE()
+      },
+      {
+        "z",
+        VariableTypeNamed,
+        Symbol(2, Symbol::NonTerminal)
+      }
+    })));
+  });
 });
 
 END_TEST
diff --git a/spec/compiler/rules/repeat_spec.cc b/spec/compiler/rules/repeat_spec.cc
index 63680563..9c84c8e5 100644
--- a/spec/compiler/rules/repeat_spec.cc
+++ b/spec/compiler/rules/repeat_spec.cc
@@ -9,7 +9,7 @@ START_TEST
 describe("Repeat", []() {
   describe("constructing repeats", [&]() {
     it("doesn't create redundant repeats", [&]() {
-      auto sym = make_shared<Symbol>(1);
+      auto sym = make_shared<Symbol>(1, Symbol::NonTerminal);
 
       auto repeat = Repeat::build(sym);
       auto outer_repeat = Repeat::build(repeat);
diff --git a/spec/fixtures/error_corpus/python_errors.txt b/spec/fixtures/error_corpus/python_errors.txt
new file mode 100644
index 00000000..7ff9f240
--- /dev/null
+++ b/spec/fixtures/error_corpus/python_errors.txt
@@ -0,0 +1,29 @@
+==========================================
+errors in if statements
+==========================================
+
+if a is:
+  print b
+  print c
+
+---
+
+(module
+  (if_statement (identifier) (ERROR)
+    (print_statement (identifier))
+    (print_statement (identifier))))
+
+==========================================
+errors in function definitions
+==========================================
+
+def a()::
+  b
+  c
+
+---
+
+(module
+  (function_definition (identifier) (parameters) (ERROR)
+    (expression_statement (identifier))
+    (expression_statement (identifier))))
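The next three fixture files implement external scanners in C. Judging by these fixtures, a scanner for a grammar named `<name>` is a set of six functions, `tree_sitter_<name>_external_scanner_{create,destroy,reset,serialize,deserialize,scan}`, matching the function-pointer slots of the `external_scanner` struct added to `TSLanguage` above. A bare-bones stateless skeleton in that shape ("example" standing in for the grammar name; this summarizes the fixtures below rather than adding anything new):

    #include <tree_sitter/parser.h>

    // Minimal stateless scanner skeleton following the fixture convention.
    extern "C" {

    void *tree_sitter_example_external_scanner_create() { return NULL; }
    void tree_sitter_example_external_scanner_destroy(void *payload) {}
    void tree_sitter_example_external_scanner_reset(void *payload) {}

    bool tree_sitter_example_external_scanner_serialize(void *payload, TSExternalTokenState state) {
      return true;  // no state to persist
    }

    void tree_sitter_example_external_scanner_deserialize(void *payload, TSExternalTokenState state) {}

    bool tree_sitter_example_external_scanner_scan(void *payload, TSLexer *lexer,
                                                   const bool *whitelist) {
      return false;  // never produces a token
    }

    }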
diff --git a/spec/fixtures/external_scanners/extra_external_tokens.c b/spec/fixtures/external_scanners/extra_external_tokens.c
new file mode 100644
index 00000000..5c409639
--- /dev/null
+++ b/spec/fixtures/external_scanners/extra_external_tokens.c
@@ -0,0 +1,42 @@
+#include <tree_sitter/parser.h>
+
+enum {
+  COMMENT,
+};
+
+void *tree_sitter_extra_external_tokens_external_scanner_create() {
+  return NULL;
+}
+
+void tree_sitter_extra_external_tokens_external_scanner_reset(void *payload) {
+}
+
+bool tree_sitter_extra_external_tokens_external_scanner_serialize(void *payload, TSExternalTokenState state) {
+  return true;
+}
+
+void tree_sitter_extra_external_tokens_external_scanner_deserialize(void *payload, TSExternalTokenState state) {
+}
+
+bool tree_sitter_extra_external_tokens_external_scanner_scan(
+  void *payload, TSLexer *lexer, const bool *whitelist) {
+
+  while (lexer->lookahead == ' ') {
+    lexer->advance(lexer, true);
+  }
+
+  if (lexer->lookahead == '#') {
+    lexer->advance(lexer, false);
+    while (lexer->lookahead != '\n') {
+      lexer->advance(lexer, false);
+    }
+
+    lexer->result_symbol = COMMENT;
+    return true;
+  }
+
+  return false;
+}
+
+void tree_sitter_extra_external_tokens_external_scanner_destroy(void *payload) {
+}
diff --git a/spec/fixtures/external_scanners/percent_strings.c b/spec/fixtures/external_scanners/percent_strings.c
new file mode 100644
index 00000000..9f68696e
--- /dev/null
+++ b/spec/fixtures/external_scanners/percent_strings.c
@@ -0,0 +1,118 @@
+#include <tree_sitter/parser.h>
+#include <stdlib.h>
+
+enum {
+  percent_string,
+  percent_string_start,
+  percent_string_end
+};
+
+typedef struct {
+  int32_t open_delimiter;
+  int32_t close_delimiter;
+  uint32_t depth;
+} Scanner;
+
+void *tree_sitter_external_scanner_example_external_scanner_create() {
+  Scanner *scanner = malloc(sizeof(Scanner));
+  *scanner = (Scanner){
+    .open_delimiter = 0,
+    .close_delimiter = 0,
+    .depth = 0
+  };
+  return scanner;
+}
+
+bool tree_sitter_external_scanner_example_external_scanner_scan(
+  void *payload, TSLexer *lexer, const bool *whitelist) {
+  Scanner *scanner = payload;
+
+  if (whitelist[percent_string]) {
+    while (lexer->lookahead == ' ' ||
+           lexer->lookahead == '\t' ||
+           lexer->lookahead == '\n') {
+      lexer->advance(lexer, true);
+    }
+
+    if (lexer->lookahead != '%') return false;
+    lexer->advance(lexer, false);
+
+    switch (lexer->lookahead) {
+      case '(':
+        scanner->open_delimiter = '(';
+        scanner->close_delimiter = ')';
+        scanner->depth = 1;
+        break;
+      case '[':
+        scanner->open_delimiter = '[';
+        scanner->close_delimiter = ']';
+        scanner->depth = 1;
+        break;
+      case '{':
+        scanner->open_delimiter = '{';
+        scanner->close_delimiter = '}';
+        scanner->depth = 1;
+        break;
+      default:
+        return false;
+    }
+
+    lexer->advance(lexer, false);
+
+    for (;;) {
+      if (scanner->depth == 0) {
+        lexer->result_symbol = percent_string;
+        return true;
+      }
+
+      if (lexer->lookahead == scanner->open_delimiter) {
+        scanner->depth++;
+      } else if (lexer->lookahead == scanner->close_delimiter) {
+        scanner->depth--;
+      } else if (lexer->lookahead == '#') {
+        lexer->advance(lexer, false);
+        if (lexer->lookahead == '{') {
+          lexer->advance(lexer, false);
+          lexer->result_symbol = percent_string_start;
+          return true;
+        }
+      }
+
+      lexer->advance(lexer, false);
+    }
+  } else if (whitelist[percent_string_end]) {
+    if (lexer->lookahead != '}') return false;
+    lexer->advance(lexer, false);
+
+    for (;;) {
+      if (scanner->depth == 0) {
+        lexer->result_symbol = percent_string_end;
+        return true;
+      }
+
+      if (lexer->lookahead == scanner->open_delimiter) {
+        scanner->depth++;
+      } else if (lexer->lookahead == scanner->close_delimiter) {
+        scanner->depth--;
+      }
+
+      lexer->advance(lexer, false);
+    }
+  }
+
+  return false;
+}
+
+void tree_sitter_external_scanner_example_external_scanner_reset(void *payload) {
+}
+
+bool tree_sitter_external_scanner_example_external_scanner_serialize(void *payload, TSExternalTokenState state) {
+  return true;
+}
+
+void tree_sitter_external_scanner_example_external_scanner_deserialize(void *payload, TSExternalTokenState state) {
+}
+
+void tree_sitter_external_scanner_example_external_scanner_destroy(void *payload) {
+  free(payload);
+}
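Note that percent_strings.c carries state across separate scan calls (the delimiters and `depth` survive between `percent_string_start` and `percent_string_end`), yet its serialize/deserialize hooks are stubs. If that state needed to survive re-parsing, one would expect it to round-trip through the 16-byte TSExternalTokenState buffer, roughly as follows; this is a sketch under the assumption that the Scanner struct above (12 bytes) fits in the buffer, not code from this diff:

    #include <string.h>

    // Hypothetical stateful serialization for the Scanner struct above.
    bool scanner_serialize(void *payload, TSExternalTokenState state) {
      Scanner *scanner = (Scanner *)payload;
      memcpy(state, scanner, sizeof(Scanner));  // 4 + 4 + 4 bytes <= 16
      return true;
    }

    void scanner_deserialize(void *payload, TSExternalTokenState state) {
      Scanner *scanner = (Scanner *)payload;
      memcpy(scanner, state, sizeof(Scanner));
    }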
diff --git a/spec/fixtures/external_scanners/shared_external_tokens.c b/spec/fixtures/external_scanners/shared_external_tokens.c
new file mode 100644
index 00000000..0bee00d8
--- /dev/null
+++ b/spec/fixtures/external_scanners/shared_external_tokens.c
@@ -0,0 +1,63 @@
+#include <tree_sitter/parser.h>
+#include <stdlib.h>
+
+enum {
+  STRING,
+  LINE_BREAK
+};
+
+void *tree_sitter_shared_external_tokens_external_scanner_create() {
+  return NULL;
+}
+
+void tree_sitter_shared_external_tokens_external_scanner_reset(void *payload) {
+}
+
+bool tree_sitter_shared_external_tokens_external_scanner_serialize(void *payload, TSExternalTokenState state) {
+  return true;
+}
+
+void tree_sitter_shared_external_tokens_external_scanner_deserialize(void *payload, TSExternalTokenState state) {
+}
+
+bool tree_sitter_shared_external_tokens_external_scanner_scan(
+  void *payload, TSLexer *lexer, const bool *whitelist) {
+
+  // If a line-break is a valid lookahead token, only skip spaces.
+  if (whitelist[LINE_BREAK]) {
+    while (lexer->lookahead == ' ') {
+      lexer->advance(lexer, true);
+    }
+
+    if (lexer->lookahead == '\n') {
+      lexer->advance(lexer, false);
+      lexer->result_symbol = LINE_BREAK;
+      return true;
+    }
+  }
+
+  // If a line-break is not a valid lookahead token, skip line breaks as well
+  // as spaces.
+  if (whitelist[STRING]) {
+    while (lexer->lookahead == ' ' || lexer->lookahead == '\n') {
+      lexer->advance(lexer, true);
+    }
+
+    if (lexer->lookahead == '\'') {
+      lexer->advance(lexer, false);
+
+      while (lexer->lookahead != '\'') {
+        lexer->advance(lexer, false);
+      }
+
+      lexer->advance(lexer, false);
+      lexer->result_symbol = STRING;
+      return true;
+    }
+  }
+
+  return false;
+}
+
+void tree_sitter_shared_external_tokens_external_scanner_destroy(void *payload) {
+}
diff --git a/spec/helpers/dedent.h b/spec/helpers/dedent.h
new file mode 100644
index 00000000..1387acf9
--- /dev/null
+++ b/spec/helpers/dedent.h
@@ -0,0 +1,12 @@
+#include "compiler/util/string_helpers.h"
+#include <string>
+
+static std::string dedent(std::string input) {
+  size_t indent_level = input.find_first_not_of("\n ") - input.find_first_not_of("\n");
+  std::string whitespace = "\n" + std::string(indent_level, ' ');
+  tree_sitter::util::str_replace(&input, whitespace, "\n");
+  return input.substr(
+    input.find_first_not_of("\n "),
+    input.find_last_not_of("\n ") + 1
+  );
+}
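The new dedent helper strips the common leading indentation that raw string literals pick up inside the specs. An illustrative usage, assuming `util::str_replace` replaces every occurrence (a trailing newline survives because the final `substr` length is computed from the last non-blank index):

    #include <cassert>
    #include <string>
    #include "spec/helpers/dedent.h"  // path assumed relative to the repo root

    int main() {
      std::string raw = R"(
        (module
          (comment))
    )";
      // The common four-space indent is removed from every line.
      assert(dedent(raw) == "(module\n  (comment))\n");
    }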
diff --git a/spec/helpers/load_language.cc b/spec/helpers/load_language.cc
index 9409da42..c59eca95 100644
--- a/spec/helpers/load_language.cc
+++ b/spec/helpers/load_language.cc
@@ -28,10 +28,11 @@
 const char *libcompiler_path = "out/Test/libcompiler.a";
 #endif
 
-static std::string run_cmd(const char *cmd, const char *args[]) {
+static std::string run_command(const char *cmd, const char *args[]) {
   int child_pid = fork();
-  if (child_pid < 0)
+  if (child_pid < 0) {
     return "fork failed";
+  }
 
   if (child_pid == 0) {
     close(0);
@@ -39,7 +40,6 @@ static std::string run_command(const char *cmd, const char *args[]) {
     dup2(2, 1);
     dup2(1, 2);
     execvp(cmd, (char * const * )args);
-    return "";
   }
 
   int status;
@@ -47,12 +47,16 @@ static std::string run_command(const char *cmd, const char *args[]) {
     waitpid(child_pid, &status, 0);
   } while (!WIFEXITED(status));
 
-  if (WEXITSTATUS(status) == 0)
+  if (WEXITSTATUS(status) == 0) {
     return "";
-  else
+  } else {
     return "command failed";
+  }
+}
 
-  return "";
+static bool file_exists(const string &path) {
+  struct stat file_stat;
+  return stat(path.c_str(), &file_stat) == 0;
 }
 
 static int get_modified_time(const string &path) {
@@ -67,46 +71,46 @@
 const TSLanguage *load_language(const string &source_filename,
                                 const string &lib_filename,
-                                const string &language_name) {
-  string language_function_name = "ts_language_" + language_name;
+                                const string &language_name,
+                                string external_scanner_filename = "") {
+  string language_function_name = "tree_sitter_" + language_name;
   string header_dir = getenv("PWD") + string("/include");
   int source_mtime = get_modified_time(source_filename);
   int header_mtime = get_modified_time(header_dir + "/tree_sitter/parser.h");
   int lib_mtime = get_modified_time(lib_filename);
+  int external_scanner_mtime = get_modified_time(external_scanner_filename);
 
-  if (!header_mtime || lib_mtime < header_mtime || lib_mtime < source_mtime) {
-    string obj_filename = lib_filename + ".o";
-    const char *compiler_name = getenv("CC");
-    if (!compiler_name) {
-      compiler_name = "gcc";
-    }
+  if (!header_mtime || lib_mtime < header_mtime || lib_mtime < source_mtime ||
+      lib_mtime < external_scanner_mtime) {
+    const char *compiler_name = getenv("CXX");
+    if (!compiler_name) compiler_name = "c++";
 
-    const char *compile_argv[] = {
-      compiler_name,
-      "-x", "c",
-      "-fPIC",
-      "-g",
-      "-I", header_dir.c_str(),
-      "-c", source_filename.c_str(),
-      "-o", obj_filename.c_str(),
-      NULL
-    };
-    string compile_error = run_cmd("gcc", compile_argv);
-    if (!compile_error.empty()) {
-      AssertThat(string(compile_error), IsEmpty());
-      return nullptr;
-    }
-
-    const char *link_argv[] = {
+    vector<const char *> compile_args = {
       compiler_name,
       "-shared",
-      "-Wl", obj_filename.c_str(),
+      "-fPIC",
+      "-I", header_dir.c_str(),
       "-o", lib_filename.c_str(),
-      NULL
+      "-x", "c",
+      source_filename.c_str()
     };
-    string link_error = run_cmd("gcc", link_argv);
-    if (!link_error.empty()) {
-      AssertThat(link_error, IsEmpty());
+
+    if (!external_scanner_filename.empty()) {
+      compile_args.push_back("-g");
+      string extension = external_scanner_filename.substr(external_scanner_filename.rfind("."));
+      if (extension == ".c") {
+        compile_args.push_back("-xc");
+      } else {
+        compile_args.push_back("-xc++");
+      }
+      compile_args.push_back(external_scanner_filename.c_str());
+    }
+
+    compile_args.push_back(nullptr);
+
+    string compile_error = run_command(compiler_name, compile_args.data());
+    if (!compile_error.empty()) {
+      AssertThat(string(compile_error), IsEmpty());
       return nullptr;
     }
   }
@@ -118,19 +122,19 @@ const TSLanguage *load_language(const string &source_filename,
     return nullptr;
   }
 
-  void *symbol_value = dlsym(parser_lib, language_function_name.c_str());
-  if (!symbol_value) {
+  void *language_function = dlsym(parser_lib, language_function_name.c_str());
+  if (!language_function) {
     std::string message(dlerror());
     AssertThat(message, IsEmpty());
     return nullptr;
   }
 
-  typedef TSLanguage * (* LanguageFunction)();
-  LanguageFunction language_fn = reinterpret_cast<LanguageFunction>(symbol_value);
-  return language_fn();
+  return reinterpret_cast<TSLanguage *(*)()>(language_function)();
 }
 
-const TSLanguage *load_compile_result(const string &name, const TSCompileResult &compile_result) {
+const TSLanguage *load_compile_result(const string &name,
+                                      const TSCompileResult &compile_result,
+                                      string external_scanner_path) {
   if (compile_result.error_type != TSCompileErrorTypeNone) {
     Assert::Failure(string("Compilation failed ") + compile_result.error_message);
     return nullptr;
   }
@@ -146,7 +150,7 @@ const TSLanguage *load_compile_result(const string &name,
   source_file << compile_result.code;
   source_file.close();
 
-  const TSLanguage *language = load_language(source_filename, lib_filename, name);
+  auto language = load_language(source_filename, lib_filename, name, external_scanner_path);
   free(compile_result.code);
   return language;
 }
@@ -158,6 +162,10 @@ const TSLanguage *get_test_language(const string &language_name) {
   string language_dir = string("spec/fixtures/grammars/") + language_name;
   string grammar_filename = language_dir + "/src/grammar.json";
   string parser_filename = language_dir + "/src/parser.c";
+  string external_scanner_filename = language_dir + "/src/scanner.cc";
+  if (!file_exists(external_scanner_filename)) {
+    external_scanner_filename = "";
+  }
 
   int grammar_mtime = get_modified_time(grammar_filename);
   if (!grammar_mtime)
@@ -192,7 +200,7 @@ const TSLanguage *get_test_language(const string &language_name) {
 
   mkdir("out/tmp", 0777);
   string lib_filename = "out/tmp/" + language_name + ".so";
-  const TSLanguage *language = load_language(parser_filename, lib_filename, language_name);
+  const TSLanguage *language = load_language(parser_filename, lib_filename, language_name, external_scanner_filename);
   loaded_languages[language_name] = language;
   return language;
 };
diff --git a/spec/helpers/load_language.h b/spec/helpers/load_language.h
index 41b1458e..41d8b739 100644
--- a/spec/helpers/load_language.h
+++ b/spec/helpers/load_language.h
@@ -5,7 +5,8 @@
 #include "tree_sitter/runtime.h"
 #include <string>
 
-const TSLanguage *load_compile_result(const std::string &, const TSCompileResult &);
+const TSLanguage *load_compile_result(const std::string &, const TSCompileResult &,
+                                      std::string external_scanner_path = "");
 const TSLanguage *get_test_language(const std::string &language_name);
 
 #endif  // HELPERS_LOAD_LANGUAGE_H_
diff --git a/spec/helpers/point_helpers.cc b/spec/helpers/point_helpers.cc
index e9c99259..60f4f9a7 100644
--- a/spec/helpers/point_helpers.cc
+++ b/spec/helpers/point_helpers.cc
@@ -15,7 +15,9 @@ bool operator==(const TSRange &left, const TSRange &right) {
 }
 
 bool operator==(const Length &left, const Length &right) {
-  return length_eq(left, right);
+  return left.bytes == right.bytes &&
+    left.chars == right.chars &&
+    left.extent == right.extent;
 }
 
 bool operator<(const TSPoint &left, const TSPoint &right) {
diff --git a/spec/helpers/rule_helpers.cc b/spec/helpers/rule_helpers.cc
index 8bf32360..0b010d2e 100644
--- a/spec/helpers/rule_helpers.cc
+++ b/spec/helpers/rule_helpers.cc
@@ -9,6 +9,7 @@ namespace tree_sitter {
   using std::ostream;
   using std::string;
   using std::to_string;
+  using rules::Symbol;
 
   rule_ptr character(const set<uint32_t> &ranges) {
     return character(ranges, true);
   }
@@ -28,11 +29,11 @@ namespace tree_sitter {
   }
 
   rule_ptr i_sym(size_t index) {
-    return make_shared<Symbol>(index);
+    return make_shared<Symbol>(index, Symbol::NonTerminal);
   }
 
   rule_ptr i_token(size_t index) {
-    return make_shared<Symbol>(index, true);
+    return make_shared<Symbol>(index, Symbol::Terminal);
   }
 
   rule_ptr metadata(rule_ptr rule, rules::MetadataParams params) {
diff --git a/spec/helpers/scope_sequence.cc b/spec/helpers/scope_sequence.cc
index 87e059dc..d6e2e3b1 100644
--- a/spec/helpers/scope_sequence.cc
+++ b/spec/helpers/scope_sequence.cc
@@ -23,20 +23,21 @@ static void append_to_scope_sequence(ScopeSequence *sequence, ScopeStack *current_scopes,
                                      TSNode node, TSDocument *document,
                                      const std::string &text) {
-  append_text_to_scope_sequence(sequence, current_scopes, text, ts_node_start_byte(node) - sequence->size());
+  append_text_to_scope_sequence(
+    sequence, current_scopes, text, ts_node_start_byte(node) - sequence->size()
+  );
 
-  string scope = ts_node_type(node, document);
-  current_scopes->push_back(scope);
-  size_t child_count = ts_node_child_count(node);
-  if (child_count > 0) {
-    for (size_t i = 0; i < child_count; i++) {
-      TSNode child = ts_node_child(node, i);
-      append_to_scope_sequence(sequence, current_scopes, child, document, text);
-    }
-  } else {
-    size_t length = ts_node_end_byte(node) - ts_node_start_byte(node);
-    append_text_to_scope_sequence(sequence, current_scopes, text, length);
+  current_scopes->push_back(ts_node_type(node, document));
+
+  for (size_t i = 0, n = ts_node_child_count(node); i < n; i++) {
+    TSNode child = ts_node_child(node, i);
+    append_to_scope_sequence(sequence, current_scopes, child, document, text);
   }
+
+  append_text_to_scope_sequence(
+    sequence, current_scopes, text, ts_node_end_byte(node) - sequence->size()
+  );
+
   current_scopes->pop_back();
 }
diff --git a/spec/helpers/stream_methods.cc b/spec/helpers/stream_methods.cc
index 4d411d66..a4b275ea 100644
--- a/spec/helpers/stream_methods.cc
+++ b/spec/helpers/stream_methods.cc
@@ -10,16 +10,7 @@ namespace tree_sitter {
 
 ostream &operator<<(ostream &stream, const Grammar &grammar) {
   stream << string("# ");
-    stream << pair.second;
-    started = true;
-  }
+  stream << " rules: " << grammar.rules;
   return stream << string("}>");
 }
@@ -85,6 +76,11 @@ ostream &operator<<(ostream &stream, const ParseState &state) {
   return stream << string(">");
 }
 
+ostream &operator<<(ostream &stream, const ExternalToken &external_token) {
+  return stream << "{" << external_token.name << ", " << external_token.type <<
+    ", " << external_token.corresponding_internal_token << "}";
+}
+
 ostream &operator<<(ostream &stream, const ProductionStep &step) {
   stream << "(symbol: " << step.symbol << ", precedence:" << to_string(step.precedence);
   stream << ", associativity: ";
diff --git a/spec/helpers/stream_methods.h b/spec/helpers/stream_methods.h
index 515060eb..28b201c3 100644
--- a/spec/helpers/stream_methods.h
+++ b/spec/helpers/stream_methods.h
@@ -97,6 +97,7 @@ struct AdvanceAction;
 struct AcceptTokenAction;
 class ParseAction;
 class ParseState;
+struct ExternalToken;
 struct ProductionStep;
 struct PrecedenceRange;
 
@@ -110,6 +111,7 @@ ostream &operator<<(ostream &, const AdvanceAction &);
 ostream &operator<<(ostream &, const AcceptTokenAction &);
 ostream &operator<<(ostream &, const ParseAction &);
 ostream &operator<<(ostream &, const ParseState &);
+ostream &operator<<(ostream &, const ExternalToken &);
 ostream &operator<<(ostream &, const ProductionStep &);
 ostream &operator<<(ostream &, const PrecedenceRange &);
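');">
Because TSLanguage now carries a `version` field and the runtime exposes `ts_language_version` (see runtime.h above), an embedder can refuse a parser that was generated against an incompatible parser.h. This is the check the document_spec test at the end of this diff exercises; a minimal sketch of the guard, using only APIs introduced in this diff:

    #include <tree_sitter/runtime.h>

    // Reject languages built against a different TREE_SITTER_LANGUAGE_VERSION.
    bool language_is_compatible(const TSLanguage *language) {
      return ts_language_version(language) == TREE_SITTER_LANGUAGE_VERSION;
    }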
"spec/fixtures/external_scanners/percent_strings.c" + )); + + ts_document_set_input_string(document, "x + %(sup (external) scanner?)"); + ts_document_parse(document); + assert_root_node("(expression (sum (expression (identifier)) (expression (string))))"); + + ts_document_set_input_string(document, "%{sup {} #{x + y} {} scanner?}"); + ts_document_parse(document); + assert_root_node("(expression (string (expression (sum (expression (identifier)) (expression (identifier))))))"); + }); + + it("allows external scanners to refer to tokens that are defined internally", [&]() { + string grammar = R"JSON({ + "name": "shared_external_tokens", + + "externals": [ + "string", + "line_break" + ], + + "extras": [ + {"type": "PATTERN", "value": "\\s"} + ], + + "rules": { + "statement": { + "type": "SEQ", + "members": [ + {"type": "SYMBOL", "name": "_expression"}, + {"type": "SYMBOL", "name": "_expression"}, + {"type": "SYMBOL", "name": "line_break"} + ] + }, + + "_expression": { + "type": "CHOICE", + "members": [ + {"type": "SYMBOL", "name": "string"}, + {"type": "SYMBOL", "name": "variable"}, + {"type": "SYMBOL", "name": "number"} + ] + }, + + "variable": {"type": "PATTERN", "value": "\\a+"}, + "number": {"type": "PATTERN", "value": "\\d+"}, + "line_break": {"type": "STRING", "value": "\n"} + } + })JSON"; + + TSCompileResult result = ts_compile_grammar(grammar.c_str()); + AssertThat(result.error_message, IsNull()); + + ts_document_set_language(document, load_compile_result( + "shared_external_tokens", + result, + "spec/fixtures/external_scanners/shared_external_tokens.c" + )); + + ts_document_set_input_string(document, "a b\n"); + ts_document_parse(document); + assert_root_node("(statement (variable) (variable) (line_break))"); + + ts_document_set_input_string(document, "a \nb\n"); + ts_document_parse(document); + assert_root_node("(statement (variable) (variable) (line_break))"); + + ts_document_set_input_string(document, "'hello' 'world'\n"); + ts_document_parse(document); + assert_root_node("(statement (string) (string) (line_break))"); + + ts_document_set_input_string(document, "'hello' \n'world'\n"); + ts_document_parse(document); + assert_root_node("(statement (string) (string) (line_break))"); + }); + + it("allows external tokens to be used as extras", [&]() { + string grammar = R"JSON({ + "name": "extra_external_tokens", + + "externals": [ + "comment" + ], + + "extras": [ + {"type": "PATTERN", "value": "\\s"}, + {"type": "SYMBOL", "name": "comment"} + ], + + "rules": { + "assignment": { + "type": "SEQ", + "members": [ + {"type": "SYMBOL", "name": "variable"}, + {"type": "STRING", "value": "="}, + {"type": "SYMBOL", "name": "variable"} + ] + }, + + "variable": {"type": "PATTERN", "value": "\\a+"} + } + })JSON"; + + TSCompileResult result = ts_compile_grammar(grammar.c_str()); + AssertThat(result.error_message, IsNull()); + + ts_document_set_language(document, load_compile_result( + "extra_external_tokens", + result, + "spec/fixtures/external_scanners/extra_external_tokens.c" + )); + + ts_document_set_input_string(document, "x = # a comment\n y"); + ts_document_parse(document); + assert_root_node("(assignment (variable) (comment) (variable))"); + }); + }); + describe("when the grammar's start symbol is a token", [&]() { it("parses the token", [&]() { TSCompileResult result = ts_compile_grammar(R"JSON( diff --git a/spec/integration/corpus_specs.cc b/spec/integration/corpus_specs.cc index 9d716ed1..c399e8f9 100644 --- a/spec/integration/corpus_specs.cc +++ b/spec/integration/corpus_specs.cc @@ -84,6 
+84,7 @@ describe("The Corpus", []() { "json", "c", "cpp", + "python", }); for (auto &language_name : test_languages) { diff --git a/spec/runtime/document_spec.cc b/spec/runtime/document_spec.cc index 0fb7a640..52e65ffb 100644 --- a/spec/runtime/document_spec.cc +++ b/spec/runtime/document_spec.cc @@ -5,6 +5,7 @@ #include "helpers/tree_helpers.h" #include "helpers/point_helpers.h" #include "helpers/spy_logger.h" +#include "helpers/stderr_logger.h" #include "helpers/spy_input.h" #include "helpers/load_language.h" @@ -15,22 +16,22 @@ TSPoint point(size_t row, size_t column) { START_TEST describe("Document", [&]() { - TSDocument *doc; + TSDocument *document; TSNode root; before_each([&]() { record_alloc::start(); - doc = ts_document_new(); + document = ts_document_new(); }); after_each([&]() { - ts_document_free(doc); + ts_document_free(document); record_alloc::stop(); AssertThat(record_alloc::outstanding_allocation_indices(), IsEmpty()); }); auto assert_node_string_equals = [&](TSNode node, const string &expected) { - char *str = ts_node_string(node, doc); + char *str = ts_node_string(node, document); string actual(str); ts_free(str); AssertThat(actual, Equals(expected)); @@ -42,11 +43,11 @@ describe("Document", [&]() { before_each([&]() { spy_input = new SpyInput("{\"key\": [null, 2]}", 3); - ts_document_set_language(doc, get_test_language("json")); - ts_document_set_input_string(doc, "{\"key\": [1, 2]}"); - ts_document_parse(doc); + ts_document_set_language(document, get_test_language("json")); + ts_document_set_input_string(document, "{\"key\": [1, 2]}"); + ts_document_parse(document); - root = ts_document_root_node(doc); + root = ts_document_root_node(document); assert_node_string_equals( root, "(object (pair (string) (array (number) (number))))"); @@ -61,11 +62,11 @@ describe("Document", [&]() { spy_input->content = string((const char *)content, sizeof(content)); spy_input->encoding = TSInputEncodingUTF16; - ts_document_set_input(doc, spy_input->input()); - ts_document_invalidate(doc); - ts_document_parse(doc); + ts_document_set_input(document, spy_input->input()); + ts_document_invalidate(document); + ts_document_parse(document); - root = ts_document_root_node(doc); + root = ts_document_root_node(document); assert_node_string_equals( root, "(array (true) (false))"); @@ -77,27 +78,27 @@ describe("Document", [&]() { spy_input->encoding = TSInputEncodingUTF16; // spy_input->measure_columns_in_bytes - ts_document_set_input(doc, spy_input->input()); - ts_document_invalidate(doc); - ts_document_parse(doc); + ts_document_set_input(document, spy_input->input()); + ts_document_invalidate(document); + ts_document_parse(document); }); it("allows the input to be retrieved later", [&]() { - ts_document_set_input(doc, spy_input->input()); - AssertThat(ts_document_input(doc).payload, Equals(spy_input)); - AssertThat(ts_document_input(doc).read, Equals(spy_input->input().read)); - AssertThat(ts_document_input(doc).seek, Equals(spy_input->input().seek)); + ts_document_set_input(document, spy_input->input()); + AssertThat(ts_document_input(document).payload, Equals(spy_input)); + AssertThat(ts_document_input(document).read, Equals(spy_input->input().read)); + AssertThat(ts_document_input(document).seek, Equals(spy_input->input().seek)); }); it("does not assume that the document's text has changed", [&]() { - ts_document_set_input(doc, spy_input->input()); - AssertThat(ts_document_root_node(doc), Equals(root)); + ts_document_set_input(document, spy_input->input()); + 
+      AssertThat(ts_document_root_node(document), Equals(root));
       AssertThat(ts_node_has_changes(root), IsFalse());
       AssertThat(spy_input->strings_read, Equals(vector<string>({ "" })));
     });
 
     it("reads text from the new input for future parses", [&]() {
-      ts_document_set_input(doc, spy_input->input());
+      ts_document_set_input(document, spy_input->input());
 
       // Insert 'null', delete '1'.
       TSInputEdit edit = {};
@@ -105,28 +106,28 @@ describe("Document", [&]() {
       edit.extent_added.column = edit.bytes_added = 4;
       edit.extent_removed.column = edit.bytes_removed = 1;
 
-      ts_document_edit(doc, edit);
-      ts_document_parse(doc);
+      ts_document_edit(document, edit);
+      ts_document_parse(document);
 
-      TSNode new_root = ts_document_root_node(doc);
+      TSNode new_root = ts_document_root_node(document);
       assert_node_string_equals(
         new_root,
         "(object (pair (string) (array (null) (number))))");
 
-      AssertThat(spy_input->strings_read, Equals(vector<string>({" [null, 2"})));
+      AssertThat(spy_input->strings_read, Equals(vector<string>({" [null, 2" })));
     });
 
     it("reads from the new input correctly when the old input was blank", [&]() {
-      ts_document_set_input_string(doc, "");
-      ts_document_parse(doc);
-      TSNode new_root = ts_document_root_node(doc);
+      ts_document_set_input_string(document, "");
+      ts_document_parse(document);
+      TSNode new_root = ts_document_root_node(document);
       AssertThat(ts_node_end_char(new_root), Equals(0));
       assert_node_string_equals(
         new_root,
         "(ERROR)");
 
-      ts_document_set_input_string(doc, "1");
-      ts_document_parse(doc);
-      new_root = ts_document_root_node(doc);
+      ts_document_set_input_string(document, "1");
+      ts_document_parse(document);
+      new_root = ts_document_root_node(document);
       AssertThat(ts_node_end_char(new_root), Equals(1));
       assert_node_string_equals(
         new_root,
@@ -136,33 +137,44 @@ describe("Document", [&]() {
 
   describe("set_language(language)", [&]() {
     before_each([&]() {
-      ts_document_set_input_string(doc, "{\"key\": [1, 2]}\n");
+      ts_document_set_input_string(document, "{\"key\": [1, 2]}\n");
     });
 
     it("uses the given language for future parses", [&]() {
-      ts_document_set_language(doc, get_test_language("json"));
-      ts_document_parse(doc);
+      ts_document_set_language(document, get_test_language("json"));
+      ts_document_parse(document);
 
-      root = ts_document_root_node(doc);
+      root = ts_document_root_node(document);
       assert_node_string_equals(
         root,
         "(object (pair (string) (array (number) (number))))");
     });
 
     it("clears out any previous tree", [&]() {
-      ts_document_set_language(doc, get_test_language("json"));
-      ts_document_parse(doc);
+      ts_document_set_language(document, get_test_language("json"));
+      ts_document_parse(document);
 
-      ts_document_set_language(doc, get_test_language("javascript"));
-      AssertThat(ts_document_root_node(doc).data, Equals(nullptr));
+      ts_document_set_language(document, get_test_language("javascript"));
+      AssertThat(ts_document_root_node(document).data, Equals(nullptr));
 
-      ts_document_parse(doc);
-      root = ts_document_root_node(doc);
+      ts_document_parse(document);
+      root = ts_document_root_node(document);
       assert_node_string_equals(
         root,
         "(program (expression_statement "
         "(object (pair (string) (array (number) (number))))))");
     });
+
+    it("does not allow setting a language with a different version number", [&]() {
+      TSLanguage language = *get_test_language("json");
+      AssertThat(ts_language_version(&language), Equals(TREE_SITTER_LANGUAGE_VERSION));
+
+      language.version++;
+      AssertThat(ts_language_version(&language), !Equals(TREE_SITTER_LANGUAGE_VERSION));
+
+      ts_document_set_language(document, &language);
+      AssertThat(ts_document_language(document), IsNull());
+    });
   });
 
   describe("set_logger(TSLogger)", [&]() {
@@ -170,45 +182,39 @@ describe("Document", [&]() {
 
     before_each([&]() {
       logger = new SpyLogger();
-      ts_document_set_language(doc, get_test_language("json"));
-      ts_document_set_input_string(doc, "[1, 2]");
+      ts_document_set_language(document, get_test_language("json"));
+      ts_document_set_input_string(document, "[1, 2]");
     });
 
     after_each([&]() {
       delete logger;
     });
 
-    it("calls the debugger with a message for each lex action", [&]() {
-      ts_document_set_logger(doc, logger->logger());
-      ts_document_parse(doc);
-
-      AssertThat(logger->messages, Contains("lookahead char:'1'"));
-      AssertThat(logger->messages, Contains("lookahead char:'['"));
-    });
-
     it("calls the debugger with a message for each parse action", [&]() {
-      ts_document_set_logger(doc, logger->logger());
-      ts_document_parse(doc);
+      ts_document_set_logger(document, logger->logger());
+      ts_document_parse(document);
 
       AssertThat(logger->messages, Contains("new_parse"));
-      AssertThat(logger->messages, Contains("lookahead char:'['"));
+      AssertThat(logger->messages, Contains("skip character:' '"));
+      AssertThat(logger->messages, Contains("consume character:'['"));
+      AssertThat(logger->messages, Contains("consume character:'1'"));
       AssertThat(logger->messages, Contains("reduce sym:array, child_count:4"));
       AssertThat(logger->messages, Contains("accept"));
     });
 
     it("allows the debugger to be retrieved later", [&]() {
-      ts_document_set_logger(doc, logger->logger());
-      AssertThat(ts_document_logger(doc).payload, Equals(logger));
+      ts_document_set_logger(document, logger->logger());
+      AssertThat(ts_document_logger(document).payload, Equals(logger));
    });
 
     describe("disabling debugging", [&]() {
       before_each([&]() {
-        ts_document_set_logger(doc, logger->logger());
-        ts_document_set_logger(doc, {NULL, NULL});
+        ts_document_set_logger(document, logger->logger());
+        ts_document_set_logger(document, {NULL, NULL});
       });
 
       it("does not call the debugger any more", [&]() {
-        ts_document_parse(doc);
+        ts_document_parse(document);
         AssertThat(logger->messages, IsEmpty());
       });
     });
@@ -218,12 +224,12 @@ describe("Document", [&]() {
     SpyInput *input;
 
     before_each([&]() {
-      ts_document_set_language(doc, get_test_language("javascript"));
+      ts_document_set_language(document, get_test_language("javascript"));
       input = new SpyInput("{a: null};", 3);
-      ts_document_set_input(doc, input->input());
-      ts_document_parse(doc);
+      ts_document_set_input(document, input->input());
+      ts_document_parse(document);
       assert_node_string_equals(
-        ts_document_root_node(doc),
+        ts_document_root_node(document),
         "(program (expression_statement (object (pair (identifier) (null)))))");
     });
 
@@ -231,26 +237,25 @@ describe("Document", [&]() {
       delete input;
     });
 
-    auto get_ranges = [&](std::function<TSInputEdit()> callback) -> vector<TSRange> {
+    auto get_invalidated_ranges_for_edit = [&](std::function<TSInputEdit()> callback) -> vector<TSRange> {
       TSInputEdit edit = callback();
-      ts_document_edit(doc, edit);
+      ts_document_edit(document, edit);
 
       TSRange *ranges;
       uint32_t range_count = 0;
-
-      ts_document_parse_and_get_changed_ranges(doc, &ranges, &range_count);
+      ts_document_parse_and_get_changed_ranges(document, &ranges, &range_count);
 
       vector<TSRange> result;
-      for (size_t i = 0; i < range_count; i++)
+      for (size_t i = 0; i < range_count; i++) {
         result.push_back(ranges[i]);
+      }
 
       ts_free(ranges);
-
       return result;
     };
 
     it("reports changes when one token has been updated", [&]() {
       // Replace `null` with `nothing`
-      auto ranges = get_ranges([&]() {
+      auto ranges = get_invalidated_ranges_for_edit([&]() {
         return input->replace(input->content.find("ull"), 1, "othing");
       });
 
@@ -262,7 +267,7 @@ describe("Document", [&]() {
       })));
 
       // Replace `nothing` with `null` again
-      ranges = get_ranges([&]() {
+      ranges = get_invalidated_ranges_for_edit([&]() {
         return input->undo();
       });
 
@@ -276,7 +281,7 @@ describe("Document", [&]() {
 
     it("reports changes when tokens have been appended", [&]() {
       // Add a second key-value pair
-      auto ranges = get_ranges([&]() {
+      auto ranges = get_invalidated_ranges_for_edit([&]() {
         return input->replace(input->content.find("}"), 0, ", b: false");
       });
 
@@ -288,12 +293,12 @@ describe("Document", [&]() {
       })));
 
       // Add a third key-value pair in between the first two
-      ranges = get_ranges([&]() {
+      ranges = get_invalidated_ranges_for_edit([&]() {
         return input->replace(input->content.find(", b"), 0, ", c: 1");
       });
 
       assert_node_string_equals(
-        ts_document_root_node(doc),
+        ts_document_root_node(document),
         "(program (expression_statement (object "
         "(pair (identifier) (null)) "
         "(pair (identifier) (number)) "
@@ -307,41 +312,39 @@ describe("Document", [&]() {
       })));
 
       // Delete the middle pair.
-      ranges = get_ranges([&]() {
+      ranges = get_invalidated_ranges_for_edit([&]() {
        return input->undo();
       });
 
       assert_node_string_equals(
-        ts_document_root_node(doc),
+        ts_document_root_node(document),
         "(program (expression_statement (object "
         "(pair (identifier) (null)) "
         "(pair (identifier) (false)))))");
 
-      AssertThat(ranges, Equals(vector<TSRange>({
-      })));
+      AssertThat(ranges, IsEmpty());
 
       // Delete the second pair.
-      ranges = get_ranges([&]() {
+      ranges = get_invalidated_ranges_for_edit([&]() {
         return input->undo();
       });
 
      assert_node_string_equals(
-        ts_document_root_node(doc),
+        ts_document_root_node(document),
         "(program (expression_statement (object "
         "(pair (identifier) (null)))))");
 
-      AssertThat(ranges, Equals(vector<TSRange>({
-      })));
+      AssertThat(ranges, IsEmpty());
     });
 
     it("reports changes when trees have been wrapped", [&]() {
       // Wrap the object in an assignment expression.
-      auto ranges = get_ranges([&]() {
+      auto ranges = get_invalidated_ranges_for_edit([&]() {
         return input->replace(input->content.find("null"), 0, "b === ");
       });
 
       assert_node_string_equals(
-        ts_document_root_node(doc),
+        ts_document_root_node(document),
         "(program (expression_statement (object "
         "(pair (identifier) (rel_op (identifier) (null))))))");
 
diff --git a/spec/runtime/parser_spec.cc b/spec/runtime/parser_spec.cc
index a14fa68e..88633f1f 100644
--- a/spec/runtime/parser_spec.cc
+++ b/spec/runtime/parser_spec.cc
@@ -4,11 +4,13 @@
 #include "helpers/spy_input.h"
 #include "helpers/load_language.h"
 #include "helpers/record_alloc.h"
+#include "helpers/stderr_logger.h"
+#include "helpers/dedent.h"
 
 START_TEST
 
 describe("Parser", [&]() {
-  TSDocument *doc;
+  TSDocument *document;
   SpyInput *input;
   TSNode root;
   size_t chunk_size;
@@ -18,90 +20,76 @@ describe("Parser", [&]() {
     chunk_size = 3;
     input = nullptr;
-
-    doc = ts_document_new();
+    document = ts_document_new();
   });
 
   after_each([&]() {
-    if (doc)
-      ts_document_free(doc);
-
-    if (input)
-      delete input;
+    if (document) ts_document_free(document);
+    if (input) delete input;
 
     record_alloc::stop();
     AssertThat(record_alloc::outstanding_allocation_indices(), IsEmpty());
   });
 
-  auto set_text = [&](const char *text) {
+  auto set_text = [&](string text) {
     input = new SpyInput(text, chunk_size);
-    ts_document_set_input(doc, input->input());
-    ts_document_parse(doc);
+    ts_document_set_input(document, input->input());
+    ts_document_parse(document);
 
-    root = ts_document_root_node(doc);
-    AssertThat(ts_node_end_byte(root), Equals(strlen(text)));
+    root = ts_document_root_node(document);
+    AssertThat(ts_node_end_byte(root), Equals(text.size()));
     input->clear();
   };
 
-  auto insert_text = [&](size_t position, string text) {
-    size_t prev_size = ts_node_end_byte(root);
-    ts_document_edit(doc, input->replace(position, 0, text));
-    ts_document_parse(doc);
-
-    root = ts_document_root_node(doc);
-    size_t new_size = ts_node_end_byte(root);
-    AssertThat(new_size, Equals(prev_size + text.size()));
-  };
-
-  auto delete_text = [&](size_t position, size_t length) {
-    size_t prev_size = ts_node_end_byte(root);
-    ts_document_edit(doc, input->replace(position, length, ""));
-    ts_document_parse(doc);
-
-    root = ts_document_root_node(doc);
-    size_t new_size = ts_node_end_byte(root);
-    AssertThat(new_size, Equals(prev_size - length));
-  };
-
   auto replace_text = [&](size_t position, size_t length, string new_text) {
     size_t prev_size = ts_node_end_byte(root);
-    ts_document_edit(doc, input->replace(position, length, new_text));
-    ts_document_parse(doc);
+    ts_document_edit(document, input->replace(position, length, new_text));
+    ts_document_parse(document);
 
-    root = ts_document_root_node(doc);
+    root = ts_document_root_node(document);
     size_t new_size = ts_node_end_byte(root);
     AssertThat(new_size, Equals(prev_size - length + new_text.size()));
   };
 
+  auto insert_text = [&](size_t position, string text) {
+    replace_text(position, 0, text);
+  };
+
+  auto delete_text = [&](size_t position, size_t length) {
+    replace_text(position, length, "");
+  };
+
+  auto undo = [&]() {
+    ts_document_edit(document, input->undo());
+    ts_document_parse(document);
+  };
+
   auto assert_root_node = [&](const string &expected) {
-    TSNode node = ts_document_root_node(doc);
-    char *str = ts_node_string(node, doc);
-    string actual(str);
-    ts_free(str);
+    TSNode node = ts_document_root_node(document);
+    char *node_string = ts_node_string(node, document);
+    string actual(node_string);
+    ts_free(node_string);
     AssertThat(actual, Equals(expected));
   };
 
+  auto get_node_text = [&](TSNode node) {
+    size_t start = ts_node_start_byte(node);
+    size_t end = ts_node_end_byte(node);
+    return input->content.substr(start, end - start);
+  };
+
   describe("handling errors", [&]() {
-    before_each([&]() {
-      ts_document_set_language(doc, get_test_language("json"));
-    });
-
-    auto get_node_text = [&](TSNode node) {
-      size_t start = ts_node_start_byte(node);
-      size_t end = ts_node_end_byte(node);
-      return input->content.substr(start, end - start);
-    };
-
     describe("when there is an invalid substring right before a valid token", [&]() {
       it("computes the error node's size and position correctly", [&]() {
+        ts_document_set_language(document, get_test_language("json"));
         set_text(" [123, @@@@@, true]");
 
         assert_root_node(
          "(array (number) (ERROR (UNEXPECTED '@')) (true))");
 
         TSNode error = ts_node_named_child(root, 1);
-        AssertThat(ts_node_type(error, doc), Equals("ERROR"));
+        AssertThat(ts_node_type(error, document), Equals("ERROR"));
         AssertThat(get_node_text(error), Equals(", @@@@@"));
         AssertThat(ts_node_child_count(error), Equals(2));
 
@@ -112,56 +100,59 @@ describe("Parser", [&]() {
         AssertThat(get_node_text(garbage), Equals("@@@@@"));
 
         TSNode node_after_error = ts_node_named_child(root, 2);
-        AssertThat(ts_node_type(node_after_error, doc), Equals("true"));
+        AssertThat(ts_node_type(node_after_error, document), Equals("true"));
         AssertThat(get_node_text(node_after_error), Equals("true"));
       });
     });
 
     describe("when there is an unexpected string in the middle of a token", [&]() {
       it("computes the error node's size and position correctly", [&]() {
+        ts_document_set_language(document, get_test_language("json"));
        set_text(" [123, faaaaalse, true]");
 
         assert_root_node(
           "(array (number) (ERROR (UNEXPECTED 'a')) (true))");
 
         TSNode error = ts_node_named_child(root, 1);
-        AssertThat(ts_node_type(error, doc), Equals("ERROR"));
+        AssertThat(ts_node_type(error, document), Equals("ERROR"));
         AssertThat(ts_node_child_count(error), Equals(2));
 
         TSNode comma = ts_node_child(error, 0);
-        AssertThat(ts_node_type(comma, doc), Equals(","));
+        AssertThat(ts_node_type(comma, document), Equals(","));
         AssertThat(get_node_text(comma), Equals(","));
 
         TSNode garbage = ts_node_child(error, 1);
-        AssertThat(ts_node_type(garbage, doc), Equals("ERROR"));
+        AssertThat(ts_node_type(garbage, document), Equals("ERROR"));
         AssertThat(get_node_text(garbage), Equals("faaaaalse"));
 
         TSNode last = ts_node_named_child(root, 2);
-        AssertThat(ts_node_type(last, doc), Equals("true"));
+        AssertThat(ts_node_type(last, document), Equals("true"));
         AssertThat(ts_node_start_byte(last), Equals(strlen(" [123, faaaaalse, ")));
       });
     });
 
     describe("when there is one unexpected token between two valid tokens", [&]() {
       it("computes the error node's size and position correctly", [&]() {
+        ts_document_set_language(document, get_test_language("json"));
         set_text(" [123, true false, true]");
 
         assert_root_node(
           "(array (number) (true) (ERROR (false)) (true))");
 
         TSNode error = ts_node_named_child(root, 2);
-        AssertThat(ts_node_type(error, doc), Equals("ERROR"));
+        AssertThat(ts_node_type(error, document), Equals("ERROR"));
         AssertThat(get_node_text(error), Equals("false"));
         AssertThat(ts_node_child_count(error), Equals(1));
 
         TSNode last = ts_node_named_child(root, 1);
-        AssertThat(ts_node_type(last, doc), Equals("true"));
+        AssertThat(ts_node_type(last, document), Equals("true"));
         AssertThat(get_node_text(last), Equals("true"));
       });
     });
 
     describe("when there is an unexpected string at the end of a token", [&]() {
token", [&]() { it("computes the error's size and position correctly", [&]() { + ts_document_set_language(document, get_test_language("json")); set_text(" [123, \"hi\n, true]"); assert_root_node( @@ -171,7 +162,7 @@ describe("Parser", [&]() { describe("when there is an unterminated error", [&]() { it("maintains a consistent tree", [&]() { - ts_document_set_language(doc, get_test_language("javascript")); + ts_document_set_language(document, get_test_language("javascript")); set_text("a; /* b"); assert_root_node( "(ERROR (program (expression_statement (identifier))) (UNEXPECTED EOF))"); @@ -180,14 +171,9 @@ describe("Parser", [&]() { }); describe("handling extra tokens", [&]() { - // In the javascript example grammar, ASI works by using newlines as - // terminators in statements, but also as extra tokens. - before_each([&]() { - ts_document_set_language(doc, get_test_language("javascript")); - }); - describe("when the token appears as part of a grammar rule", [&]() { - it("is incorporated into the tree", [&]() { + it("incorporates it into the tree", [&]() { + ts_document_set_language(document, get_test_language("javascript")); set_text("fn()\n"); assert_root_node( @@ -196,7 +182,8 @@ describe("Parser", [&]() { }); describe("when the token appears somewhere else", [&]() { - it("is incorporated into the tree", [&]() { + it("incorporates it into the tree", [&]() { + ts_document_set_language(document, get_test_language("javascript")); set_text( "fn()\n" " .otherFn();"); @@ -211,7 +198,8 @@ describe("Parser", [&]() { }); describe("when several extra tokens appear in a row", [&]() { - it("is incorporated into the tree", [&]() { + it("incorporates them into the tree", [&]() { + ts_document_set_language(document, get_test_language("javascript")); set_text( "fn()\n\n" "// This is a comment" @@ -230,199 +218,219 @@ describe("Parser", [&]() { }); describe("editing", [&]() { - before_each([&]() { - ts_document_set_language(doc, get_test_language("javascript")); + describe("creating new tokens near the end of the input", [&]() { + it("updates the parse tree and re-reads only the changed portion of the text", [&]() { + ts_document_set_language(document, get_test_language("javascript")); + set_text("x * (100 + abc);"); + + assert_root_node( + "(program (expression_statement (math_op " + "(identifier) " + "(math_op (number) (identifier)))))"); + + insert_text(strlen("x * (100 + abc"), ".d"); + + assert_root_node( + "(program (expression_statement (math_op " + "(identifier) " + "(math_op (number) (member_access (identifier) (identifier))))))"); + + AssertThat(input->strings_read, Equals(vector({ " + abc.d)" }))); + }); }); - describe("inserting text", [&]() { - describe("creating new tokens near the end of the input", [&]() { - it("updates the parse tree and re-reads only the changed portion of the text", [&]() { - set_text("x * (100 + abc);"); + describe("creating new tokens near the beginning of the input", [&]() { + it("updates the parse tree and re-reads only the changed portion of the input", [&]() { + chunk_size = 2; - assert_root_node( - "(program (expression_statement (math_op " - "(identifier) " - "(math_op (number) (identifier)))))"); + ts_document_set_language(document, get_test_language("javascript")); + set_text("123 + 456 * (10 + x);"); - insert_text(strlen("x * (100 + abc"), ".d"); + assert_root_node( + "(program (expression_statement (math_op " + "(number) " + "(math_op (number) (math_op (number) (identifier))))))"); - assert_root_node( - "(program (expression_statement (math_op " - 
"(identifier) " - "(math_op (number) (member_access (identifier) (identifier))))))"); + insert_text(strlen("123"), " || 5"); - AssertThat(input->strings_read, Equals(vector({ " + abc.d)" }))); - }); - }); - - describe("creating new tokens near the beginning of the input", [&]() { - it("updates the parse tree and re-reads only the changed portion of the input", [&]() { - chunk_size = 2; - - set_text("123 + 456 * (10 + x);"); - - assert_root_node( - "(program (expression_statement (math_op " + assert_root_node( + "(program (expression_statement (bool_op " + "(number) " + "(math_op " "(number) " - "(math_op (number) (math_op (number) (identifier))))))"); + "(math_op (number) (math_op (number) (identifier)))))))"); - insert_text(strlen("123"), " || 5"); - - assert_root_node( - "(program (expression_statement (bool_op " - "(number) " - "(math_op " - "(number) " - "(math_op (number) (math_op (number) (identifier)))))))"); - - AssertThat(input->strings_read, Equals(vector({ "123 || 5 +" }))); - }); + AssertThat(input->strings_read, Equals(vector({ "123 || 5 +" }))); }); + }); - describe("introducing an error", [&]() { - it("gives the error the right size", [&]() { - ts_document_set_language(doc, get_test_language("javascript")); + describe("introducing an error", [&]() { + it("gives the error the right size", [&]() { + ts_document_set_language(document, get_test_language("javascript")); + set_text("var x = y;"); - set_text("var x = y;"); + assert_root_node( + "(program (var_declaration (var_assignment " + "(identifier) (identifier))))"); - assert_root_node( - "(program (var_declaration (var_assignment " - "(identifier) (identifier))))"); + insert_text(strlen("var x = y"), " *"); - insert_text(strlen("var x = y"), " *"); + assert_root_node( + "(program (var_declaration (var_assignment " + "(identifier) (identifier)) (ERROR)))"); - assert_root_node( - "(program (var_declaration (var_assignment " - "(identifier) (identifier)) (ERROR)))"); + insert_text(strlen("var x = y *"), " z"); - insert_text(strlen("var x = y *"), " z"); - - assert_root_node( - "(program (var_declaration (var_assignment " - "(identifier) (math_op (identifier) (identifier)))))"); - }); + assert_root_node( + "(program (var_declaration (var_assignment " + "(identifier) (math_op (identifier) (identifier)))))"); }); + }); - describe("into the middle of an existing token", [&]() { - it("updates the parse tree", [&]() { - set_text("abc * 123;"); + describe("into the middle of an existing token", [&]() { + it("updates the parse tree", [&]() { + ts_document_set_language(document, get_test_language("javascript")); + set_text("abc * 123;"); - assert_root_node( - "(program (expression_statement (math_op (identifier) (number))))"); + assert_root_node( + "(program (expression_statement (math_op (identifier) (number))))"); - insert_text(strlen("ab"), "XYZ"); + insert_text(strlen("ab"), "XYZ"); - assert_root_node( - "(program (expression_statement (math_op (identifier) (number))))"); + assert_root_node( + "(program (expression_statement (math_op (identifier) (number))))"); - TSNode node = ts_node_named_descendant_for_char_range(root, 1, 1); - AssertThat(ts_node_type(node, doc), Equals("identifier")); - AssertThat(ts_node_end_byte(node), Equals(strlen("abXYZc"))); - }); + TSNode node = ts_node_named_descendant_for_char_range(root, 1, 1); + AssertThat(ts_node_type(node, document), Equals("identifier")); + AssertThat(ts_node_end_byte(node), Equals(strlen("abXYZc"))); }); + }); - describe("at the end of an existing token", [&]() { - it("updates the 
parse tree", [&]() { - set_text("abc * 123;"); + describe("at the end of an existing token", [&]() { + it("updates the parse tree", [&]() { + ts_document_set_language(document, get_test_language("javascript")); + set_text("abc * 123;"); - assert_root_node( - "(program (expression_statement (math_op (identifier) (number))))"); + assert_root_node( + "(program (expression_statement (math_op (identifier) (number))))"); - insert_text(strlen("abc"), "XYZ"); + insert_text(strlen("abc"), "XYZ"); - assert_root_node( - "(program (expression_statement (math_op (identifier) (number))))"); + assert_root_node( + "(program (expression_statement (math_op (identifier) (number))))"); - TSNode node = ts_node_named_descendant_for_char_range(root, 1, 1); - AssertThat(ts_node_type(node, doc), Equals("identifier")); - AssertThat(ts_node_end_byte(node), Equals(strlen("abcXYZ"))); - }); + TSNode node = ts_node_named_descendant_for_char_range(root, 1, 1); + AssertThat(ts_node_type(node, document), Equals("identifier")); + AssertThat(ts_node_end_byte(node), Equals(strlen("abcXYZ"))); }); + }); - describe("into a node containing a extra token", [&]() { - it("updates the parse tree", [&]() { - set_text("123 *\n" + describe("inserting text into a node containing a extra token", [&]() { + it("updates the parse tree", [&]() { + ts_document_set_language(document, get_test_language("javascript")); + set_text("123 *\n" + "// a-comment\n" + "abc;"); + + assert_root_node( + "(program (expression_statement (math_op " + "(number) " + "(comment) " + "(identifier))))"); + + insert_text( + strlen("123 *\n" "// a-comment\n" - "abc;"); + "abc"), + "XYZ"); - assert_root_node( - "(program (expression_statement (math_op " - "(number) " - "(comment) " - "(identifier))))"); - - insert_text( - strlen("123 *\n" - "// a-comment\n" - "abc"), - "XYZ"); - - assert_root_node( - "(program (expression_statement (math_op " - "(number) " - "(comment) " - "(identifier))))"); - }); + assert_root_node( + "(program (expression_statement (math_op " + "(number) " + "(comment) " + "(identifier))))"); }); }); - describe("deleting text", [&]() { - describe("when a critical token is removed", [&]() { - it("updates the parse tree, creating an error", [&]() { - set_text("123 * 456; 789 * 123;"); + describe("when a critical token is removed", [&]() { + it("updates the parse tree, creating an error", [&]() { + ts_document_set_language(document, get_test_language("javascript")); + set_text("123 * 456; 789 * 123;"); - assert_root_node( - "(program " - "(expression_statement (math_op (number) (number))) " - "(expression_statement (math_op (number) (number))))"); + assert_root_node( + "(program " + "(expression_statement (math_op (number) (number))) " + "(expression_statement (math_op (number) (number))))"); - delete_text(strlen("123 "), 2); + delete_text(strlen("123 "), 2); - assert_root_node( - "(program " - "(expression_statement (number) (ERROR (number))) " - "(expression_statement (math_op (number) (number))))"); - }); + assert_root_node( + "(program " + "(expression_statement (number) (ERROR (number))) " + "(expression_statement (math_op (number) (number))))"); }); }); - describe("replacing text", [&]() { - it("does not try to re-use nodes that are within the edited region", [&]() { - ts_document_set_language(doc, get_test_language("javascript")); + describe("with external tokens", [&]() { + it("maintains the external scanner's state during incremental parsing", [&]() { + ts_document_set_language(document, get_test_language("python")); + string text = 
dedent(R"PYTHON( + if a: + print b + return c + )PYTHON"); - set_text("{ x: (b.c) };"); + set_text(text); + assert_root_node("(module " + "(if_statement (identifier) " + "(print_statement (identifier))) " + "(return_statement (expression_list (identifier))))"); - assert_root_node( - "(program (expression_statement (object (pair " - "(identifier) (member_access (identifier) (identifier))))))"); + replace_text(text.find("return"), 0, " "); + assert_root_node("(module " + "(if_statement (identifier) " + "(print_statement (identifier)) " + "(return_statement (expression_list (identifier)))))"); - replace_text(strlen("{ x: "), strlen("(b.c)"), "b.c"); - - assert_root_node( - "(program (expression_statement (object (pair " - "(identifier) (member_access (identifier) (identifier))))))"); + undo(); + assert_root_node("(module " + "(if_statement (identifier) " + "(print_statement (identifier))) " + "(return_statement (expression_list (identifier))))"); }); }); + it("does not try to re-use nodes that are within the edited region", [&]() { + ts_document_set_language(document, get_test_language("javascript")); + set_text("{ x: (b.c) };"); + + assert_root_node( + "(program (expression_statement (object (pair " + "(identifier) (member_access (identifier) (identifier))))))"); + + replace_text(strlen("{ x: "), strlen("(b.c)"), "b.c"); + + assert_root_node( + "(program (expression_statement (object (pair " + "(identifier) (member_access (identifier) (identifier))))))"); + }); + it("updates the document's parse count", [&]() { - ts_document_set_language(doc, get_test_language("javascript")); - AssertThat(ts_document_parse_count(doc), Equals(0)); + ts_document_set_language(document, get_test_language("javascript")); + AssertThat(ts_document_parse_count(document), Equals(0)); set_text("{ x: (b.c) };"); - AssertThat(ts_document_parse_count(doc), Equals(1)); + AssertThat(ts_document_parse_count(document), Equals(1)); insert_text(strlen("{ x"), "yz"); - AssertThat(ts_document_parse_count(doc), Equals(2)); + AssertThat(ts_document_parse_count(document), Equals(2)); }); }); describe("lexing", [&]() { - before_each([&]() { - ts_document_set_language(doc, get_test_language("javascript")); - }); - describe("handling tokens containing wildcard patterns (e.g. 
comments)", [&]() { it("terminates them at the end of the document", [&]() { + ts_document_set_language(document, get_test_language("javascript")); set_text("x; // this is a comment"); assert_root_node( @@ -437,6 +445,7 @@ describe("Parser", [&]() { it("recognizes UTF8 characters as single characters", [&]() { // 'ΩΩΩ — ΔΔ'; + ts_document_set_language(document, get_test_language("javascript")); set_text("'\u03A9\u03A9\u03A9 \u2014 \u0394\u0394';"); assert_root_node( diff --git a/spec/runtime/stack_spec.cc b/spec/runtime/stack_spec.cc index 4d4b01fd..20180843 100644 --- a/spec/runtime/stack_spec.cc +++ b/spec/runtime/stack_spec.cc @@ -521,6 +521,31 @@ describe("Stack", [&]() { free_slice_array(&pop.slices); }); }); + + describe("setting external token state", [&]() { + TSExternalTokenState external_token_state1, external_token_state2; + + it("allows the state to be retrieved", [&]() { + AssertThat(ts_stack_external_token_state(stack, 0), Equals(nullptr)); + + ts_stack_set_external_token_state(stack, 0, &external_token_state1); + AssertThat(ts_stack_external_token_state(stack, 0), Equals(&external_token_state1)); + + ts_stack_copy_version(stack, 0); + AssertThat(ts_stack_external_token_state(stack, 0), Equals(&external_token_state1)); + }); + + it("does not merge stack versions with different external token states", [&]() { + ts_stack_copy_version(stack, 0); + ts_stack_push(stack, 0, trees[0], false, 5); + ts_stack_push(stack, 1, trees[0], false, 5); + + ts_stack_set_external_token_state(stack, 0, &external_token_state1); + ts_stack_set_external_token_state(stack, 0, &external_token_state2); + + AssertThat(ts_stack_merge(stack, 0, 1), IsFalse()); + }); + }); }); END_TEST diff --git a/spec/runtime/tree_spec.cc b/spec/runtime/tree_spec.cc index 9f451829..bdc8145f 100644 --- a/spec/runtime/tree_spec.cc +++ b/spec/runtime/tree_spec.cc @@ -22,47 +22,32 @@ void assert_consistent(const Tree *tree) { START_TEST -enum { - cat = 1, - dog, - eel, - fox, - goat, - hog, -}; - describe("Tree", []() { - Tree *tree1, *tree2, *parent1; + enum { + symbol1 = 1, + symbol2, + symbol3, + symbol4, + symbol5, + symbol6, + symbol7, + symbol8, + symbol9, + }; + TSSymbolMetadata visible = {true, true, false, true}; TSSymbolMetadata invisible = {false, false, false, true}; - before_each([&]() { - tree1 = ts_tree_make_leaf(cat, {2, 1, {0, 1}}, {5, 4, {0, 4}}, visible); - tree2 = ts_tree_make_leaf(cat, {1, 1, {0, 1}}, {3, 3, {0, 3}}, visible); - - ts_tree_retain(tree1); - ts_tree_retain(tree2); - parent1 = ts_tree_make_node(dog, 2, tree_array({ - tree1, - tree2, - }), visible); - }); - - after_each([&]() { - ts_tree_release(tree1); - ts_tree_release(tree2); - ts_tree_release(parent1); - }); - - describe("make_leaf(sym, size, padding, is_hidden)", [&]() { - it("does not record that it is fragile", [&]() { - AssertThat(tree1->fragile_left, IsFalse()); - AssertThat(tree1->fragile_right, IsFalse()); + describe("make_leaf", [&]() { + it("does not mark the tree as fragile", [&]() { + Tree *tree = ts_tree_make_leaf(symbol1, {2, 1, {0, 1}}, {5, 4, {0, 4}}, visible); + AssertThat(tree->fragile_left, IsFalse()); + AssertThat(tree->fragile_right, IsFalse()); }); }); - describe("make_error(size, padding, lookahead_char)", [&]() { - it("records that it is fragile", [&]() { + describe("make_error", [&]() { + it("marks the tree as fragile", [&]() { Tree *error_tree = ts_tree_make_error( length_zero(), length_zero(), @@ -75,15 +60,33 @@ describe("Tree", []() { }); }); - describe("make_node(symbol, child_count, children, is_hidden)", 
-    it("computes its size based on its child nodes", [&]() {
-      AssertThat(parent1->size.bytes, Equals(
-        tree1->size.bytes +
-        tree2->padding.bytes + tree2->size.bytes));
-      AssertThat(parent1->size.chars, Equals(
-        tree1->size.chars +
-        tree2->padding.chars + tree2->size.chars));
+  describe("make_node", [&]() {
+    Tree *tree1, *tree2, *parent1;
+
+    before_each([&]() {
+      tree1 = ts_tree_make_leaf(symbol1, {2, 1, {0, 1}}, {5, 4, {0, 4}}, visible);
+      tree2 = ts_tree_make_leaf(symbol2, {1, 1, {0, 1}}, {3, 3, {0, 3}}, visible);
+
+      ts_tree_retain(tree1);
+      ts_tree_retain(tree2);
+      parent1 = ts_tree_make_node(symbol3, 2, tree_array({
+        tree1,
+        tree2,
+      }), visible);
     });
 
-    it("computes its padding based on its first child", [&]() {
+    after_each([&]() {
+      ts_tree_release(tree1);
+      ts_tree_release(tree2);
+      ts_tree_release(parent1);
+    });
+
+    it("computes its size and padding based on its child nodes", [&]() {
+      AssertThat(parent1->size.bytes, Equals(
+        tree1->size.bytes + tree2->padding.bytes + tree2->size.bytes));
+      AssertThat(parent1->size.chars, Equals(
+        tree1->size.chars + tree2->padding.chars + tree2->size.chars));
+
       AssertThat(parent1->padding.bytes, Equals(tree1->padding.bytes));
       AssertThat(parent1->padding.chars, Equals(tree1->padding.chars));
     });
@@ -97,7 +100,7 @@ describe("Tree", []() {
       ts_tree_retain(tree1);
       ts_tree_retain(tree2);
-      parent = ts_tree_make_node(eel, 2, tree_array({
+      parent = ts_tree_make_node(symbol3, 2, tree_array({
         tree1,
         tree2,
       }), visible);
@@ -121,7 +124,7 @@ describe("Tree", []() {
       ts_tree_retain(tree1);
       ts_tree_retain(tree2);
-      parent = ts_tree_make_node(eel, 2, tree_array({
+      parent = ts_tree_make_node(symbol3, 2, tree_array({
         tree1,
         tree2,
       }), visible);
@@ -145,7 +148,7 @@ describe("Tree", []() {
       ts_tree_retain(tree1);
       ts_tree_retain(tree2);
-      parent = ts_tree_make_node(eel, 2, tree_array({
+      parent = ts_tree_make_node(symbol3, 2, tree_array({
         tree1,
         tree2,
       }), visible);
@@ -162,14 +165,14 @@ describe("Tree", []() {
     });
   });
 
-  describe("edit(InputEdit)", [&]() {
+  describe("edit", [&]() {
     Tree *tree = nullptr;
 
     before_each([&]() {
-      tree = ts_tree_make_node(cat, 3, tree_array({
-        ts_tree_make_leaf(dog, {2, 2, {0, 2}}, {3, 3, {0, 3}}, visible),
-        ts_tree_make_leaf(eel, {2, 2, {0, 2}}, {3, 3, {0, 3}}, visible),
-        ts_tree_make_leaf(fox, {2, 2, {0, 2}}, {3, 3, {0, 3}}, visible),
+      tree = ts_tree_make_node(symbol1, 3, tree_array({
+        ts_tree_make_leaf(symbol2, {2, 2, {0, 2}}, {3, 3, {0, 3}}, visible),
+        ts_tree_make_leaf(symbol3, {2, 2, {0, 2}}, {3, 3, {0, 3}}, visible),
+        ts_tree_make_leaf(symbol4, {2, 2, {0, 2}}, {3, 3, {0, 3}}, visible),
       }), visible);
 
       AssertThat(tree->padding, Equals({2, 2, {0, 2}}));
@@ -180,7 +183,6 @@ describe("Tree", []() {
       ts_tree_release(tree);
     });
 
-
     describe("edits within a tree's padding", [&]() {
       it("resizes the padding of the tree and its leftmost descendants", [&]() {
         TSInputEdit edit;
@@ -312,69 +314,124 @@ describe("Tree", []() {
     });
   });
 
-  describe("equality", [&]() {
+  describe("eq", [&]() {
+    Tree *leaf;
+
+    before_each([&]() {
+      leaf = ts_tree_make_leaf(symbol1, {2, 1, {0, 1}}, {5, 4, {0, 4}}, visible);
+    });
+
+    after_each([&]() {
+      ts_tree_release(leaf);
+    });
+
     it("returns true for identical trees", [&]() {
-      Tree *tree1_copy = ts_tree_make_leaf(cat, {2, 1, {1, 1}}, {5, 4, {1, 4}}, visible);
-      AssertThat(ts_tree_eq(tree1, tree1_copy), IsTrue());
+      Tree *leaf_copy = ts_tree_make_leaf(symbol1, {2, 1, {1, 1}}, {5, 4, {1, 4}}, visible);
+      AssertThat(ts_tree_eq(leaf, leaf_copy), IsTrue());
 
-      Tree *tree2_copy = ts_tree_make_leaf(cat, {1, 1, {0, 1}}, {3, 3, {0, 3}}, visible);
-      AssertThat(ts_tree_eq(tree2, tree2_copy), IsTrue());
-
-      Tree *parent2 = ts_tree_make_node(dog, 2, tree_array({
-        tree1_copy,
-        tree2_copy,
+      Tree *parent = ts_tree_make_node(symbol2, 2, tree_array({
+        leaf,
+        leaf_copy,
       }), visible);
+      ts_tree_retain(leaf);
+      ts_tree_retain(leaf_copy);
 
-      AssertThat(ts_tree_eq(parent1, parent2), IsTrue());
+      Tree *parent_copy = ts_tree_make_node(symbol2, 2, tree_array({
+        leaf,
+        leaf_copy,
+      }), visible);
+      ts_tree_retain(leaf);
+      ts_tree_retain(leaf_copy);
 
-      ts_tree_release(parent2);
+      AssertThat(ts_tree_eq(parent, parent_copy), IsTrue());
+
+      ts_tree_release(leaf_copy);
+      ts_tree_release(parent);
+      ts_tree_release(parent_copy);
     });
 
     it("returns false for trees with different symbols", [&]() {
-      Tree *different_tree = ts_tree_make_leaf(
-        tree1->symbol + 1,
-        tree1->padding,
-        tree1->size,
+      Tree *different_leaf = ts_tree_make_leaf(
+        leaf->symbol + 1,
+        leaf->padding,
+        leaf->size,
         visible);
 
-      AssertThat(ts_tree_eq(tree1, different_tree), IsFalse());
-      ts_tree_release(different_tree);
+      AssertThat(ts_tree_eq(leaf, different_leaf), IsFalse());
+      ts_tree_release(different_leaf);
     });
 
     it("returns false for trees with different options", [&]() {
-      Tree *tree1_copy = ts_tree_make_leaf(cat, tree1->padding, tree1->size, invisible);
-      AssertThat(ts_tree_eq(tree1, tree1_copy), IsFalse());
-      ts_tree_release(tree1_copy);
+      Tree *different_leaf = ts_tree_make_leaf(symbol1, leaf->padding, leaf->size, invisible);
+      AssertThat(ts_tree_eq(leaf, different_leaf), IsFalse());
+      ts_tree_release(different_leaf);
    });
 
     it("returns false for trees with different sizes", [&]() {
-      Tree *tree1_copy = ts_tree_make_leaf(cat, {2, 1, {0, 1}}, tree1->size, invisible);
-      AssertThat(ts_tree_eq(tree1, tree1_copy), IsFalse());
-      ts_tree_release(tree1_copy);
+      Tree *different_leaf = ts_tree_make_leaf(symbol1, {2, 1, {0, 1}}, leaf->size, invisible);
+      AssertThat(ts_tree_eq(leaf, different_leaf), IsFalse());
+      ts_tree_release(different_leaf);
 
-      tree1_copy = ts_tree_make_leaf(cat, tree1->padding, {5, 4, {1, 10}}, invisible);
-      AssertThat(ts_tree_eq(tree1, tree1_copy), IsFalse());
-      ts_tree_release(tree1_copy);
+      different_leaf = ts_tree_make_leaf(symbol1, leaf->padding, {5, 4, {1, 10}}, invisible);
+      AssertThat(ts_tree_eq(leaf, different_leaf), IsFalse());
+      ts_tree_release(different_leaf);
     });
 
     it("returns false for trees with different children", [&]() {
-      Tree *different_tree = ts_tree_make_leaf(
-        tree1->symbol + 1,
-        tree1->padding,
-        tree1->size,
-        visible);
+      Tree *leaf2 = ts_tree_make_leaf(symbol2, {1, 1, {0, 1}}, {3, 3, {0, 3}}, visible);
 
-      ts_tree_retain(different_tree);
-      ts_tree_retain(tree2);
-      Tree *different_parent = ts_tree_make_node(dog, 2, tree_array({
-        different_tree, tree2,
+      Tree *parent = ts_tree_make_node(symbol2, 2, tree_array({
+        leaf,
+        leaf2,
+      }), visible);
+      ts_tree_retain(leaf);
+      ts_tree_retain(leaf2);
+
+      Tree *different_parent = ts_tree_make_node(symbol2, 2, tree_array({
+        leaf2,
+        leaf,
+      }), visible);
+      ts_tree_retain(leaf2);
+      ts_tree_retain(leaf);
+
+      AssertThat(ts_tree_eq(different_parent, parent), IsFalse());
+      AssertThat(ts_tree_eq(parent, different_parent), IsFalse());
+
+      ts_tree_release(leaf2);
+      ts_tree_release(parent);
+      ts_tree_release(different_parent);
+    });
+  });
+
+  describe("last_external_token_state", [&]() {
+    Length padding = {1, 1, {0, 1}};
+    Length size = {2, 2, {0, 2}};
+
+    auto make_external = [](Tree *tree) {
+      tree->has_external_tokens = true;
tree->has_external_token_state = true; + return tree; + }; + + it("returns the last serialized external token state in the given tree", [&]() { + Tree *tree1, *tree2, *tree3, *tree4, *tree5, *tree6, *tree7, *tree8, *tree9; + + tree1 = ts_tree_make_node(symbol1, 2, tree_array({ + (tree2 = ts_tree_make_node(symbol2, 3, tree_array({ + (tree3 = make_external(ts_tree_make_leaf(symbol3, padding, size, visible))), + (tree4 = ts_tree_make_leaf(symbol4, padding, size, visible)), + (tree5 = ts_tree_make_leaf(symbol5, padding, size, visible)), + }), visible)), + (tree6 = ts_tree_make_node(symbol6, 2, tree_array({ + (tree7 = ts_tree_make_node(symbol7, 1, tree_array({ + (tree8 = ts_tree_make_leaf(symbol8, padding, size, visible)), + }), visible)), + (tree9 = ts_tree_make_leaf(symbol9, padding, size, visible)), + }), visible)), }), visible); - AssertThat(ts_tree_eq(different_parent, parent1), IsFalse()); - AssertThat(ts_tree_eq(parent1, different_parent), IsFalse()); - - ts_tree_release(different_tree); - ts_tree_release(different_parent); + auto state = ts_tree_last_external_token_state(tree1); + AssertThat(state, Equals(&tree3->external_token_state)); }); }); }); diff --git a/src/compiler/build_tables/build_lex_table.cc b/src/compiler/build_tables/build_lex_table.cc index 151da7cf..29d8f4d0 100644 --- a/src/compiler/build_tables/build_lex_table.cc +++ b/src/compiler/build_tables/build_lex_table.cc @@ -64,7 +64,7 @@ class LexTableBuilder { private: void add_lex_state_for_parse_state(ParseState *parse_state) { parse_state->lex_state_id = - add_lex_state(item_set_for_tokens(parse_state->expected_inputs())); + add_lex_state(item_set_for_terminals(parse_state->terminal_entries)); } LexStateId add_lex_state(const LexItemSet &item_set) { @@ -112,24 +112,27 @@ class LexTableBuilder { void mark_fragile_tokens() { for (ParseState &state : parse_table->states) { for (auto &entry : state.terminal_entries) { - auto homonyms = conflict_manager.possible_homonyms.find(entry.first); - if (homonyms != conflict_manager.possible_homonyms.end()) - for (Symbol::Index homonym : homonyms->second) - if (state.terminal_entries.count(homonym)) { - entry.second.reusable = false; - break; - } + Symbol symbol = entry.first; + if (symbol.is_token()) { + auto homonyms = conflict_manager.possible_homonyms.find(symbol.index); + if (homonyms != conflict_manager.possible_homonyms.end()) + for (Symbol::Index homonym : homonyms->second) + if (state.terminal_entries.count(Symbol(homonym, Symbol::Terminal))) { + entry.second.reusable = false; + break; + } - if (!entry.second.reusable) - continue; + if (!entry.second.reusable) + continue; - auto extensions = conflict_manager.possible_extensions.find(entry.first); - if (extensions != conflict_manager.possible_extensions.end()) - for (Symbol::Index extension : extensions->second) - if (state.terminal_entries.count(extension)) { - entry.second.depends_on_lookahead = true; - break; - } + auto extensions = conflict_manager.possible_extensions.find(symbol.index); + if (extensions != conflict_manager.possible_extensions.end()) + for (Symbol::Index extension : extensions->second) + if (state.terminal_entries.count(Symbol(extension, Symbol::Terminal))) { + entry.second.depends_on_lookahead = true; + break; + } + } } } } @@ -150,24 +153,27 @@ class LexTableBuilder { } } - LexItemSet item_set_for_tokens(const set &symbols) { + LexItemSet item_set_for_terminals(const map &terminals) { LexItemSet result; - for (const Symbol &symbol : symbols) - for (const rule_ptr &rule : rules_for_symbol(symbol)) - for 
(const rule_ptr &separator_rule : separator_rules) - result.entries.insert(LexItem( - symbol, - Metadata::separator( - Seq::build({ - separator_rule, - Metadata::main_token(rule) })))); + for (const auto &pair : terminals) { + Symbol symbol = pair.first; + if (symbol.is_token()) { + for (const rule_ptr &rule : rules_for_symbol(symbol)) { + for (const rule_ptr &separator_rule : separator_rules) { + result.entries.insert(LexItem( + symbol, + Metadata::separator( + Seq::build({ + separator_rule, + Metadata::main_token(rule) })))); + } + } + } + } return result; } vector rules_for_symbol(const rules::Symbol &symbol) { - if (!symbol.is_token) - return {}; - if (symbol == rules::END_OF_INPUT()) return { CharacterSet().include(0).copy() }; diff --git a/src/compiler/build_tables/build_parse_table.cc b/src/compiler/build_tables/build_parse_table.cc index 91444310..9fb6859f 100644 --- a/src/compiler/build_tables/build_parse_table.cc +++ b/src/compiler/build_tables/build_parse_table.cc @@ -52,7 +52,10 @@ class ParseTableBuilder { allow_any_conflict(false) {} pair build() { - Symbol start_symbol = Symbol(0, grammar.variables.empty()); + Symbol start_symbol = grammar.variables.empty() ? + Symbol(0, Symbol::Terminal) : + Symbol(0, Symbol::NonTerminal); + Production start_production({ ProductionStep(start_symbol, 0, rules::AssociativityNone), }); @@ -63,7 +66,7 @@ class ParseTableBuilder { add_parse_state(ParseItemSet({ { ParseItem(rules::START(), start_production, 0), - LookaheadSet({ END_OF_INPUT().index }), + LookaheadSet({ END_OF_INPUT() }), }, })); @@ -107,21 +110,25 @@ class ParseTableBuilder { void build_error_parse_state() { ParseState error_state; - for (const Symbol::Index index : parse_table.mergeable_symbols) { - add_out_of_context_parse_state(&error_state, Symbol(index, true)); + for (const Symbol symbol : parse_table.mergeable_symbols) { + add_out_of_context_parse_state(&error_state, symbol); } for (const Symbol &symbol : grammar.extra_tokens) { - if (!error_state.terminal_entries.count(symbol.index)) { - error_state.terminal_entries[symbol.index].actions.push_back(ParseAction::ShiftExtra()); + if (!error_state.terminal_entries.count(symbol)) { + error_state.terminal_entries[symbol].actions.push_back(ParseAction::ShiftExtra()); } } - for (size_t i = 0; i < grammar.variables.size(); i++) { - add_out_of_context_parse_state(&error_state, Symbol(i, false)); + for (size_t i = 0; i < grammar.external_tokens.size(); i++) { + add_out_of_context_parse_state(&error_state, Symbol(i, Symbol::External)); } - error_state.terminal_entries[END_OF_INPUT().index].actions.push_back(ParseAction::Recover(0)); + for (size_t i = 0; i < grammar.variables.size(); i++) { + add_out_of_context_parse_state(&error_state, Symbol(i, Symbol::NonTerminal)); + } + + error_state.terminal_entries[END_OF_INPUT()].actions.push_back(ParseAction::Recover(0)); parse_table.states[0] = error_state; } @@ -130,10 +137,10 @@ class ParseTableBuilder { const ParseItemSet &item_set = recovery_states[symbol]; if (!item_set.entries.empty()) { ParseStateId state = add_parse_state(item_set); - if (symbol.is_token) { - error_state->terminal_entries[symbol.index].actions.assign({ ParseAction::Recover(state) }); - } else { + if (symbol.is_non_terminal()) { error_state->nonterminal_entries[symbol.index] = state; + } else { + error_state->terminal_entries[symbol].actions.assign({ ParseAction::Recover(state) }); } } } @@ -152,9 +159,9 @@ class ParseTableBuilder { } string add_actions(const ParseItemSet &item_set, ParseStateId state_id) { - map 
terminal_successors; + map terminal_successors; map nonterminal_successors; - set lookaheads_with_conflicts; + set lookaheads_with_conflicts; for (const auto &pair : item_set.entries) { const ParseItem &item = pair.first; @@ -168,7 +175,7 @@ class ParseTableBuilder { ParseAction::Reduce(item.lhs(), item.step_index, *item.production); int precedence = item.precedence(); - for (const Symbol::Index lookahead : *lookahead_symbols.entries) { + for (Symbol lookahead : *lookahead_symbols.entries) { ParseTableEntry &entry = parse_table.states[state_id].terminal_entries[lookahead]; // Only add the highest-precedence Reduce actions to the parse table. @@ -203,10 +210,10 @@ class ParseTableBuilder { Symbol symbol = item.production->at(item.step_index).symbol; ParseItem new_item(item.lhs(), *item.production, item.step_index + 1); - if (symbol.is_token) { - terminal_successors[symbol.index].entries[new_item] = lookahead_symbols; - } else { + if (symbol.is_non_terminal()) { nonterminal_successors[symbol.index].entries[new_item] = lookahead_symbols; + } else { + terminal_successors[symbol].entries[new_item] = lookahead_symbols; } } } @@ -214,7 +221,7 @@ class ParseTableBuilder { // Add a Shift action for each possible successor state. Shift actions for // terminal lookaheads can conflict with Reduce actions added previously. for (auto &pair : terminal_successors) { - Symbol::Index lookahead = pair.first; + Symbol lookahead = pair.first; ParseItemSet &next_item_set = pair.second; ParseStateId next_state_id = add_parse_state(next_item_set); ParseState &state = parse_table.states[state_id]; @@ -223,7 +230,7 @@ class ParseTableBuilder { if (!allow_any_conflict) { if (had_existing_action) lookaheads_with_conflicts.insert(lookahead); - recovery_states[Symbol(lookahead, true)].add(next_item_set); + recovery_states[lookahead].add(next_item_set); } } @@ -234,10 +241,10 @@ class ParseTableBuilder { ParseStateId next_state = add_parse_state(next_item_set); parse_table.set_nonterminal_action(state_id, lookahead, next_state); if (!allow_any_conflict) - recovery_states[Symbol(lookahead, false)].add(next_item_set); + recovery_states[Symbol(lookahead, Symbol::NonTerminal)].add(next_item_set); } - for (Symbol::Index lookahead : lookaheads_with_conflicts) { + for (Symbol lookahead : lookaheads_with_conflicts) { string conflict = handle_conflict(item_set, state_id, lookahead); if (!conflict.empty()) return conflict; } @@ -245,9 +252,9 @@ class ParseTableBuilder { ParseAction shift_extra = ParseAction::ShiftExtra(); ParseState &state = parse_table.states[state_id]; for (const Symbol &extra_symbol : grammar.extra_tokens) { - if (!state.terminal_entries.count(extra_symbol.index) || + if (!state.terminal_entries.count(extra_symbol) || state.has_shift_action() || allow_any_conflict) { - parse_table.add_terminal_action(state_id, extra_symbol.index, shift_extra); + parse_table.add_terminal_action(state_id, extra_symbol, shift_extra); } } @@ -257,7 +264,6 @@ class ParseTableBuilder { void mark_fragile_actions() { for (ParseState &state : parse_table.states) { for (auto &entry : state.terminal_entries) { - const Symbol symbol(entry.first, true); auto &actions = entry.second.actions; for (ParseAction &action : actions) { @@ -359,7 +365,7 @@ class ParseTableBuilder { } string handle_conflict(const ParseItemSet &item_set, ParseStateId state_id, - Symbol::Index lookahead) { + Symbol lookahead) { ParseTableEntry &entry = parse_table.states[state_id].terminal_entries[lookahead]; int reduction_precedence = 
entry.actions.front().precedence(); set shift_items; @@ -468,7 +474,7 @@ class ParseTableBuilder { description += " " + symbol_name(earliest_starting_item.production->at(i).symbol); } - description += " \u2022 " + symbol_name(Symbol(lookahead, true)) + " \u2026"; + description += " \u2022 " + symbol_name(lookahead) + " \u2026"; description += "\n\n"; description += "Possible interpretations:\n\n"; @@ -487,7 +493,7 @@ class ParseTableBuilder { description += " " + symbol_name(step.symbol); } description += ")"; - description += " \u2022 " + symbol_name(Symbol(lookahead, true)) + " \u2026"; + description += " \u2022 " + symbol_name(lookahead) + " \u2026"; description += "\n"; } } @@ -564,14 +570,23 @@ class ParseTableBuilder { return "END_OF_INPUT"; else return ""; - } else if (symbol.is_token) { - const Variable &variable = lexical_grammar.variables[symbol.index]; - if (variable.type == VariableTypeNamed) - return variable.name; - else - return "'" + variable.name + "'"; - } else { - return grammar.variables[symbol.index].name; + } + + switch (symbol.type) { + case Symbol::Terminal: { + const Variable &variable = lexical_grammar.variables[symbol.index]; + if (variable.type == VariableTypeNamed) + return variable.name; + else + return "'" + variable.name + "'"; + } + case Symbol::NonTerminal: { + return grammar.variables[symbol.index].name; + } + case Symbol::External: + default: { + return grammar.external_tokens[symbol.index].name; + } } } diff --git a/src/compiler/build_tables/lookahead_set.cc b/src/compiler/build_tables/lookahead_set.cc index 1ecb0baf..239bc029 100644 --- a/src/compiler/build_tables/lookahead_set.cc +++ b/src/compiler/build_tables/lookahead_set.cc @@ -12,8 +12,8 @@ using rules::Symbol; LookaheadSet::LookaheadSet() : entries(nullptr) {} -LookaheadSet::LookaheadSet(const set &symbols) - : entries(make_shared>(symbols)) {} +LookaheadSet::LookaheadSet(const set &symbols) + : entries(make_shared>(symbols)) {} bool LookaheadSet::empty() const { return !entries.get() || entries->empty(); @@ -23,7 +23,7 @@ bool LookaheadSet::operator==(const LookaheadSet &other) const { return *entries == *other.entries; } -bool LookaheadSet::contains(const Symbol::Index &symbol) const { +bool LookaheadSet::contains(const Symbol &symbol) const { return entries->find(symbol) != entries->end(); } @@ -31,15 +31,15 @@ bool LookaheadSet::insert_all(const LookaheadSet &other) { if (!other.entries.get()) return false; if (!entries.get()) - entries = make_shared>(); + entries = make_shared>(); size_t previous_size = entries->size(); entries->insert(other.entries->begin(), other.entries->end()); return entries->size() > previous_size; } -bool LookaheadSet::insert(const Symbol::Index &symbol) { +bool LookaheadSet::insert(const Symbol &symbol) { if (!entries.get()) - entries = make_shared>(); + entries = make_shared>(); return entries->insert(symbol).second; } diff --git a/src/compiler/build_tables/lookahead_set.h b/src/compiler/build_tables/lookahead_set.h index fe99b4d5..e62ee34d 100644 --- a/src/compiler/build_tables/lookahead_set.h +++ b/src/compiler/build_tables/lookahead_set.h @@ -11,15 +11,15 @@ namespace build_tables { class LookaheadSet { public: LookaheadSet(); - explicit LookaheadSet(const std::set &); + explicit LookaheadSet(const std::set &); bool empty() const; bool operator==(const LookaheadSet &) const; - bool contains(const rules::Symbol::Index &) const; + bool contains(const rules::Symbol &) const; bool insert_all(const LookaheadSet &); - bool insert(const rules::Symbol::Index &); + bool 
diff --git a/src/compiler/build_tables/parse_item.cc b/src/compiler/build_tables/parse_item.cc
index 39b131cb..b9c3831b 100644
--- a/src/compiler/build_tables/parse_item.cc
+++ b/src/compiler/build_tables/parse_item.cc
@@ -41,7 +41,7 @@ bool ParseItem::operator<(const ParseItem &other) const {
 }
 
 Symbol ParseItem::lhs() const {
-  return Symbol(variable_index);
+  return Symbol(variable_index, Symbol::NonTerminal);
 }
 
 bool ParseItem::is_done() const {
@@ -105,38 +105,6 @@ size_t ParseItemSet::unfinished_item_signature() const {
   return result;
 }
 
-ParseItemSet::ActionMap ParseItemSet::actions() const {
-  ParseItemSet::ActionMap result;
-
-  for (const auto &pair : entries) {
-    const ParseItem &item = pair.first;
-    const LookaheadSet &lookahead_symbols = pair.second;
-
-    if (item.step_index == item.production->size()) {
-      int precedence = item.precedence();
-      for (const Symbol::Index lookahead : *lookahead_symbols.entries) {
-        Action &action = result.terminal_actions[lookahead];
-        if (precedence > action.completion_precedence) {
-          action.completions.assign({ &item });
-        } else if (precedence == action.completion_precedence) {
-          action.completions.push_back({ &item });
-        }
-      }
-    } else {
-      Symbol symbol = item.production->at(item.step_index).symbol;
-      ParseItem new_item(item.lhs(), *item.production, item.step_index + 1);
-
-      if (symbol.is_token) {
-        result.terminal_actions[symbol.index].continuation.entries[new_item] = lookahead_symbols;
-      } else {
-        result.nonterminal_continuations[symbol.index].entries[new_item] = lookahead_symbols;
-      }
-    }
-  }
-
-  return result;
-}
-
 void ParseItemSet::add(const ParseItemSet &other) {
   for (const auto &pair : other.entries)
     entries[pair.first].insert_all(pair.second);
diff --git a/src/compiler/build_tables/parse_item.h b/src/compiler/build_tables/parse_item.h
index a091ac9d..a3785638 100644
--- a/src/compiler/build_tables/parse_item.h
+++ b/src/compiler/build_tables/parse_item.h
@@ -41,16 +41,6 @@ class ParseItemSet {
   ParseItemSet();
   explicit ParseItemSet(const std::map<ParseItem, LookaheadSet> &);
 
-  struct Completion;
-  struct Action;
-
-  struct ActionMap {
-    std::map<rules::Symbol::Index, Action> terminal_actions;
-    std::map<rules::Symbol::Index, ParseItemSet> nonterminal_continuations;
-  };
-
-  ActionMap actions() const;
-
   bool operator==(const ParseItemSet &) const;
   void add(const ParseItemSet &);
   size_t unfinished_item_signature() const;
@@ -58,22 +48,6 @@ class ParseItemSet {
   std::map<ParseItem, LookaheadSet> entries;
 };
 
-struct ParseItemSet::Completion {
-  const ParseItem *item;
-  int precedence;
-  rules::Associativity associativity;
-
-  bool operator<(const ParseItemSet::Completion &other) {
-    return precedence < other.precedence;
-  }
-};
-
-struct ParseItemSet::Action {
-  ParseItemSet continuation;
-  std::vector<Completion> completions;
-  int completion_precedence;
-};
-
 }  // namespace build_tables
}  // namespace tree_sitter
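For orientation: a `ParseItemSet` pairs each dotted production (an LR item) with the lookahead symbols that may follow it. A compressed sketch of that shape, with simplified stand-in types rather than the compiler's real `ParseItem`/`LookaheadSet` classes:

```cpp
#include <iostream>
#include <map>
#include <set>
#include <string>
#include <tuple>
#include <vector>

struct Item {
  std::string lhs;
  std::vector<std::string> rhs;
  size_t dot;  // like ParseItem::step_index: how much has been consumed
  bool operator<(const Item &o) const {
    return std::tie(lhs, rhs, dot) < std::tie(o.lhs, o.rhs, o.dot);
  }
};

int main() {
  // entries: item -> lookahead symbols, mirroring ParseItemSet::entries.
  std::map<Item, std::set<std::string>> entries;
  entries[{"sum", {"expr", "+", "expr"}, 1}] = {"+", "END"};
  for (auto &pair : entries)
    std::cout << pair.first.lhs << " has " << pair.second.size()
              << " lookaheads\n";
}
```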
diff --git a/src/compiler/build_tables/parse_item_set_builder.cc b/src/compiler/build_tables/parse_item_set_builder.cc
index 34b347fe..0a2039d3 100644
--- a/src/compiler/build_tables/parse_item_set_builder.cc
+++ b/src/compiler/build_tables/parse_item_set_builder.cc
@@ -27,12 +27,17 @@ ParseItemSetBuilder::ParseItemSetBuilder(const SyntaxGrammar &grammar,
   set<Symbol::Index> processed_non_terminals;
 
   for (size_t i = 0, n = lexical_grammar.variables.size(); i < n; i++) {
-    Symbol symbol(i, true);
-    first_sets.insert({symbol, LookaheadSet({ static_cast<Symbol::Index>(i) })});
+    Symbol symbol(i, Symbol::Terminal);
+    first_sets.insert({symbol, LookaheadSet({ symbol })});
+  }
+
+  for (size_t i = 0, n = grammar.external_tokens.size(); i < n; i++) {
+    Symbol symbol(i, Symbol::External);
+    first_sets.insert({symbol, LookaheadSet({ symbol })});
   }
 
   for (size_t i = 0, n = grammar.variables.size(); i < n; i++) {
-    Symbol symbol(i);
+    Symbol symbol(i, Symbol::NonTerminal);
     LookaheadSet first_set;
     processed_non_terminals.clear();
@@ -42,10 +47,10 @@ ParseItemSetBuilder::ParseItemSetBuilder(const SyntaxGrammar &grammar,
       Symbol current_symbol = symbols_to_process.back();
       symbols_to_process.pop_back();
 
-      if (current_symbol.is_token) {
-        first_set.insert(current_symbol.index);
+      if (!current_symbol.is_non_terminal()) {
+        first_set.insert(current_symbol);
       } else if (processed_non_terminals.insert(current_symbol.index).second) {
-        for (const Production &production : grammar.productions(current_symbol)) {
+        for (const Production &production : grammar.variables[current_symbol.index].productions) {
           if (!production.empty()) {
             symbols_to_process.push_back(production[0].symbol);
           }
@@ -59,11 +64,11 @@ ParseItemSetBuilder::ParseItemSetBuilder(const SyntaxGrammar &grammar,
   vector<ParseItemSetComponent> components_to_process;
 
   for (size_t i = 0, n = grammar.variables.size(); i < n; i++) {
-    Symbol symbol(i);
+    Symbol symbol(i, Symbol::NonTerminal);
     map<ParseItem, pair<LookaheadSet, bool>> cache_entry;
 
     components_to_process.clear();
-    for (const Production &production : grammar.productions(symbol)) {
+    for (const Production &production : grammar.variables[i].productions) {
       components_to_process.push_back(ParseItemSetComponent{
         ParseItem(symbol, production, 0),
         LookaheadSet(),
@@ -87,7 +92,7 @@ ParseItemSetBuilder::ParseItemSetBuilder(const SyntaxGrammar &grammar,
       if (component_is_new) {
         Symbol next_symbol = item.next_symbol();
 
-        if (next_symbol.is_built_in() || next_symbol.is_token)
+        if (!next_symbol.is_non_terminal() || next_symbol.is_built_in())
           continue;
 
         LookaheadSet next_lookaheads;
@@ -102,7 +107,7 @@ ParseItemSetBuilder::ParseItemSetBuilder(const SyntaxGrammar &grammar,
           propagates_lookaheads = false;
         }
 
-        for (const Production &production : grammar.productions(next_symbol)) {
+        for (const Production &production : grammar.variables[next_symbol.index].productions) {
           components_to_process.push_back(ParseItemSetComponent{
             ParseItem(next_symbol, production, 0),
             next_lookaheads,
@@ -130,7 +135,7 @@ void ParseItemSetBuilder::apply_transitive_closure(ParseItemSet *item_set) {
     const LookaheadSet &lookaheads = pair.second;
     const Symbol &next_symbol = item.next_symbol();
 
-    if (!next_symbol.is_token && !next_symbol.is_built_in()) {
+    if (next_symbol.is_non_terminal() && !next_symbol.is_built_in()) {
       LookaheadSet next_lookaheads;
       size_t next_step = item.step_index + 1;
       if (next_step == item.production->size()) {
diff --git a/src/compiler/build_tables/recovery_tokens.cc b/src/compiler/build_tables/recovery_tokens.cc
index 479de6b8..84b175bc 100644
--- a/src/compiler/build_tables/recovery_tokens.cc
+++ b/src/compiler/build_tables/recovery_tokens.cc
@@ -47,8 +47,8 @@ class FirstCharacters : public CharacterAggregator {};
 class LastCharacters : public CharacterAggregator {};
 class AllCharacters : public CharacterAggregator {};
 
-set<Symbol::Index> recovery_tokens(const LexicalGrammar &grammar) {
-  set<Symbol::Index> result;
+set<Symbol> recovery_tokens(const LexicalGrammar &grammar) {
+  set<Symbol> result;
   AllCharacters all_separator_characters;
 
   for (const rule_ptr &separator : grammar.separators)
@@ -79,7 +79,7 @@ set<Symbol::Index> recovery_tokens(const LexicalGrammar &grammar) {
       !all_characters.result.intersects(all_separator_characters.result);
 
     if ((has_distinct_start && has_distinct_end) || has_no_separators)
-      result.insert(i);
+      result.insert(Symbol(i, Symbol::Terminal));
   }
 
   return result;
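For context: `recovery_tokens` selects the tokens that are safe to synchronize on during error recovery. A sketch of the criterion applied above, as a simplified model over plain character sets (not the compiler's real `CharacterAggregator` machinery):

```cpp
#include <iostream>
#include <set>

// A token is usable for recovery when its first and last characters cannot
// be confused with separator (e.g. whitespace) characters, or when the
// grammar has no separators at all.
static bool is_recovery_token(const std::set<char> &first_chars,
                              const std::set<char> &last_chars,
                              const std::set<char> &separator_chars) {
  auto disjoint = [](const std::set<char> &a, const std::set<char> &b) {
    for (char c : a)
      if (b.count(c)) return false;
    return true;
  };
  if (separator_chars.empty()) return true;
  return disjoint(first_chars, separator_chars) &&
         disjoint(last_chars, separator_chars);
}

int main() {
  std::set<char> separators = {' ', '\t', '\n'};
  std::cout << is_recovery_token({'{'}, {'{'}, separators) << "\n";  // 1
  std::cout << is_recovery_token({' '}, {'"'}, separators) << "\n";  // 0
}
```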
diff --git a/src/compiler/build_tables/recovery_tokens.h b/src/compiler/build_tables/recovery_tokens.h
index 4873b5a9..c97a8cfd 100644
--- a/src/compiler/build_tables/recovery_tokens.h
+++ b/src/compiler/build_tables/recovery_tokens.h
@@ -11,7 +11,7 @@ struct LexicalGrammar;
 
 namespace build_tables {
 
-std::set<rules::Symbol::Index> recovery_tokens(const LexicalGrammar &);
+std::set<rules::Symbol> recovery_tokens(const LexicalGrammar &);
 
 }  // namespace build_tables
 }  // namespace tree_sitter
diff --git a/src/compiler/generate_code/c_code.cc b/src/compiler/generate_code/c_code.cc
index b7058603..bc84e557 100644
--- a/src/compiler/generate_code/c_code.cc
+++ b/src/compiler/generate_code/c_code.cc
@@ -11,9 +11,11 @@
 #include "compiler/lexical_grammar.h"
 #include "compiler/rules/built_in_symbols.h"
 #include "compiler/util/string_helpers.h"
+#include "tree_sitter/runtime.h"
 
 namespace tree_sitter {
 namespace generate_code {
+
 using std::function;
 using std::map;
 using std::pair;
@@ -22,6 +24,7 @@ using std::string;
 using std::to_string;
 using std::vector;
 using util::escape_char;
+using rules::Symbol;
 
 static Variable EOF_ENTRY("end", VariableTypeNamed, rule_ptr());
 
@@ -73,9 +76,8 @@ class CCodeGenerator {
   const LexicalGrammar lexical_grammar;
   map<string, string> sanitized_names;
   vector<pair<size_t, ParseTableEntry>> parse_table_entries;
-  vector<pair<size_t, set<Symbol::Index>>> in_progress_symbols;
+  vector<set<Symbol::Index>> external_scanner_states;
   size_t next_parse_action_list_index;
-  size_t next_in_progress_symbol_list_index;
 
  public:
   CCodeGenerator(string name, const ParseTable &parse_table,
@@ -87,19 +89,26 @@ class CCodeGenerator {
       lex_table(lex_table),
       syntax_grammar(syntax_grammar),
       lexical_grammar(lexical_grammar),
-      next_parse_action_list_index(0),
-      next_in_progress_symbol_list_index(0) {}
+      next_parse_action_list_index(0) {}
 
   string code() {
     buffer = "";
 
     add_includes();
-    add_state_and_symbol_counts();
+    add_warning_pragma();
+    add_stats();
     add_symbol_enum();
     add_symbol_names_list();
-    add_symbol_node_types_list();
+    add_symbol_metadata_list();
     add_lex_function();
-    add_lex_states_list();
+    add_lex_modes_list();
+
+    if (!syntax_grammar.external_tokens.empty()) {
+      add_external_token_enum();
+      add_external_scanner_symbol_map();
+      add_external_scanner_states_list();
+    }
+
     add_parse_table();
     add_parser_export();
 
@@ -112,10 +121,25 @@ class CCodeGenerator {
     line();
   }
 
-  void add_state_and_symbol_counts() {
+  void add_warning_pragma() {
+    line("#pragma GCC diagnostic push");
+    line("#pragma GCC diagnostic ignored \"-Wmissing-field-initializers\"");
+    line();
+  }
+
+  void add_stats() {
+    size_t token_count = 1 + lexical_grammar.variables.size();
+    for (const ExternalToken &external_token : syntax_grammar.external_tokens) {
+      if (external_token.corresponding_internal_token == rules::NONE()) {
+        token_count++;
+      }
+    }
+
+    line("#define LANGUAGE_VERSION " + to_string(TREE_SITTER_LANGUAGE_VERSION));
     line("#define STATE_COUNT " + to_string(parse_table.states.size()));
     line("#define SYMBOL_COUNT " + to_string(parse_table.symbols.size()));
-    line("#define TOKEN_COUNT " + to_string(lexical_grammar.variables.size() + 1));
+    line("#define TOKEN_COUNT " + to_string(token_count));
+    line("#define EXTERNAL_TOKEN_COUNT " + to_string(syntax_grammar.external_tokens.size()));
     line();
   }
+ ","); i++; @@ -146,11 +170,11 @@ class CCodeGenerator { line(); } - void add_symbol_node_types_list() { + void add_symbol_metadata_list() { line("static const TSSymbolMetadata ts_symbol_metadata[SYMBOL_COUNT] = {"); indent([&]() { for (const auto &entry : parse_table.symbols) { - const rules::Symbol &symbol = entry.first; + const Symbol &symbol = entry.first; line("[" + symbol_id(symbol) + "] = {"); indent([&]() { switch (symbol_type(symbol)) { @@ -198,13 +222,102 @@ class CCodeGenerator { line(); } - void add_lex_states_list() { - line("static TSStateId ts_lex_states[STATE_COUNT] = {"); + void add_lex_modes_list() { + add_external_scanner_state({}); + + map external_tokens_by_corresponding_internal_token; + for (size_t i = 0, n = lexical_grammar.variables.size(); i < n; i++) { + for (size_t j = 0; j < syntax_grammar.external_tokens.size(); j++) { + const ExternalToken &external_token = syntax_grammar.external_tokens[j]; + if (external_token.corresponding_internal_token.index == Symbol::Index(i)) { + external_tokens_by_corresponding_internal_token.insert({i, j}); + break; + } + } + } + + line("static TSLexMode ts_lex_modes[STATE_COUNT] = {"); indent([&]() { size_t state_id = 0; - for (const auto &state : parse_table.states) - line("[" + to_string(state_id++) + "] = " + - to_string(state.lex_state_id) + ","); + + for (const auto &state : parse_table.states) { + line("[" + to_string(state_id++) + "] = {.lex_state = "); + add(to_string(state.lex_state_id)); + + bool needs_external_scanner = false; + set external_token_indices; + for (const auto &pair : state.terminal_entries) { + Symbol symbol = pair.first; + if (symbol.is_external()) { + needs_external_scanner = true; + external_token_indices.insert(symbol.index); + } else if (symbol.is_token()) { + auto corresponding_external_token = + external_tokens_by_corresponding_internal_token.find(symbol.index); + if (corresponding_external_token != external_tokens_by_corresponding_internal_token.end()) { + external_token_indices.insert(corresponding_external_token->second); + } + } + } + + if (needs_external_scanner) { + add(", .external_lex_state = " + add_external_scanner_state(external_token_indices)); + } + + add("},"); + } + }); + line("};"); + line(); + } + + string add_external_scanner_state(set external_token_ids) { + for (size_t i = 0, n = external_scanner_states.size(); i < n; i++) + if (external_scanner_states[i] == external_token_ids) + return to_string(i); + external_scanner_states.push_back(external_token_ids); + return to_string(external_scanner_states.size() - 1); + } + + void add_external_token_enum() { + line("enum {"); + indent([&]() { + for (size_t i = 0; i < syntax_grammar.external_tokens.size(); i++) + line(external_token_id(i) + ","); + }); + line("};"); + line(); + } + + void add_external_scanner_symbol_map() { + line("TSSymbol ts_external_scanner_symbol_map[EXTERNAL_TOKEN_COUNT] = {"); + indent([&]() { + for (size_t i = 0; i < syntax_grammar.external_tokens.size(); i++) { + line("[" + external_token_id(i) + "] = " + symbol_id(Symbol(i, Symbol::External)) + ","); + } + }); + line("};"); + line(); + } + + void add_external_scanner_states_list() { + line("static bool ts_external_scanner_states["); + add(to_string(external_scanner_states.size())); + add("][EXTERNAL_TOKEN_COUNT] = {"); + indent([&]() { + size_t i = 0; + for (const auto &valid_external_lookaheads : external_scanner_states) { + if (!valid_external_lookaheads.empty()) { + line("[" + to_string(i) + "] = {"); + indent([&]() { + for (Symbol::Index id : 
+
+  void add_external_scanner_states_list() {
+    line("static bool ts_external_scanner_states[");
+    add(to_string(external_scanner_states.size()));
+    add("][EXTERNAL_TOKEN_COUNT] = {");
+    indent([&]() {
+      size_t i = 0;
+      for (const auto &valid_external_lookaheads : external_scanner_states) {
+        if (!valid_external_lookaheads.empty()) {
+          line("[" + to_string(i) + "] = {");
+          indent([&]() {
+            for (Symbol::Index id : valid_external_lookaheads) {
+              line("[" + external_token_id(id) + "] = true,");
+            }
+          });
+          line("},");
+        }
+        i++;
+      }
    });
     line("};");
     line();
@@ -214,9 +327,6 @@ class CCodeGenerator {
     add_parse_action_list_id(ParseTableEntry{ {}, false, false });
     size_t state_id = 0;
 
-    line("#pragma GCC diagnostic push");
-    line("#pragma GCC diagnostic ignored \"-Wmissing-field-initializers\"");
-    line();
     line("static unsigned short ts_parse_table[STATE_COUNT][SYMBOL_COUNT] = {");
     indent([&]() {
@@ -224,12 +334,12 @@ class CCodeGenerator {
         line("[" + to_string(state_id++) + "] = {");
         indent([&]() {
           for (const auto &entry : state.nonterminal_entries) {
-            line("[" + symbol_id(rules::Symbol(entry.first)) + "] = STATE(");
+            line("[" + symbol_id(Symbol(entry.first, Symbol::NonTerminal)) + "] = STATE(");
             add(to_string(entry.second));
             add("),");
           }
           for (const auto &entry : state.terminal_entries) {
-            line("[" + symbol_id(rules::Symbol(entry.first, true)) + "] = ACTIONS(");
+            line("[" + symbol_id(entry.first) + "] = ACTIONS(");
             add(to_string(add_parse_action_list_id(entry.second)));
             add("),");
           }
@@ -242,12 +352,42 @@ class CCodeGenerator {
     line();
     add_parse_action_list();
     line();
-    line("#pragma GCC diagnostic pop");
-    line();
   }
 
   void add_parser_export() {
-    line("EXPORT_LANGUAGE(ts_language_" + name + ");");
+    string language_function_name = "tree_sitter_" + name;
+    string external_scanner_name = language_function_name + "_external_scanner";
+
+    if (!syntax_grammar.external_tokens.empty()) {
+      line("void *" + external_scanner_name + "_create();");
+      line("void " + external_scanner_name + "_destroy();");
+      line("void " + external_scanner_name + "_reset(void *);");
+      line("bool " + external_scanner_name + "_scan(void *, TSLexer *, const bool *);");
+      line("bool " + external_scanner_name + "_serialize(void *, TSExternalTokenState);");
+      line("void " + external_scanner_name + "_deserialize(void *, const TSExternalTokenState);");
+      line();
+    }
+
+    line("const TSLanguage *" + language_function_name + "() {");
+    indent([&]() {
+      line("GET_LANGUAGE(");
+      if (syntax_grammar.external_tokens.empty()) {
+        add(");");
+      } else {
+        indent([&]() {
+          line("(const bool *)ts_external_scanner_states,");
+          line("ts_external_scanner_symbol_map,");
+          line(external_scanner_name + "_create,");
+          line(external_scanner_name + "_destroy,");
+          line(external_scanner_name + "_reset,");
+          line(external_scanner_name + "_scan,");
+          line(external_scanner_name + "_serialize,");
+          line(external_scanner_name + "_deserialize,");
+        });
+        line(");");
+      }
+    });
+    line("}");
     line();
   }
@@ -379,22 +519,13 @@ class CCodeGenerator {
     return result;
   }
 
-  size_t add_in_progress_symbol_list_id(const set<Symbol::Index> &symbols) {
-    for (const auto &pair : in_progress_symbols) {
-      if (pair.second == symbols) {
-        return pair.first;
-      }
-    }
-
-    size_t result = next_in_progress_symbol_list_index;
-    in_progress_symbols.push_back({ result, symbols });
-    next_in_progress_symbol_list_index += 1 + symbols.size();
-    return result;
-  }
-
   // Helper functions
 
-  string symbol_id(const rules::Symbol &symbol) {
+  string external_token_id(Symbol::Index index) {
+    return "ts_external_token_" + syntax_grammar.external_tokens[index].name;
+  }
+
+  string symbol_id(const Symbol &symbol) {
     if (symbol == rules::END_OF_INPUT())
       return "ts_builtin_sym_end";
 
@@ -411,25 +542,33 @@ class CCodeGenerator {
     }
   }
 
-  string symbol_name(const rules::Symbol &symbol) {
+  string symbol_name(const Symbol &symbol) {
     if (symbol == rules::END_OF_INPUT())
       return "END";
 
     return entry_for_symbol(symbol).first;
   }
 
-  VariableType symbol_type(const rules::Symbol &symbol) {
+  VariableType symbol_type(const Symbol &symbol) {
     if (symbol == rules::END_OF_INPUT())
       return VariableTypeHidden;
 
     return entry_for_symbol(symbol).second;
   }
 
-  pair<string, VariableType> entry_for_symbol(const rules::Symbol &symbol) {
-    if (symbol.is_token) {
-      const Variable &variable = lexical_grammar.variables[symbol.index];
-      return { variable.name, variable.type };
-    } else {
-      const SyntaxVariable &variable = syntax_grammar.variables[symbol.index];
-      return { variable.name, variable.type };
+  pair<string, VariableType> entry_for_symbol(const Symbol &symbol) {
+    switch (symbol.type) {
+      case Symbol::NonTerminal: {
+        const SyntaxVariable &variable = syntax_grammar.variables[symbol.index];
+        return { variable.name, variable.type };
+      }
+      case Symbol::Terminal: {
+        const Variable &variable = lexical_grammar.variables[symbol.index];
+        return { variable.name, variable.type };
+      }
+      case Symbol::External:
+      default: {
+        const ExternalToken &token = syntax_grammar.external_tokens[symbol.index];
+        return { token.name, token.type };
+      }
     }
   }
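For context: `add_parser_export()` above emits forward declarations for six external-scanner functions that the grammar author must supply by hand. A sketch of what such a hand-written scanner could look like for a hypothetical grammar named "mylang" with a single external token (the grammar name and token are assumptions; the function signatures follow the generated declarations and `tree_sitter/parser.h`):

```cpp
#include <string.h>
#include "tree_sitter/parser.h"

enum { MY_EXTERNAL_TOKEN };  // must match the order of the "externals" array

void *tree_sitter_mylang_external_scanner_create() { return NULL; }
void tree_sitter_mylang_external_scanner_destroy(void *payload) {}
void tree_sitter_mylang_external_scanner_reset(void *payload) {}

bool tree_sitter_mylang_external_scanner_scan(void *payload, TSLexer *lexer,
                                              const bool *whitelist) {
  // Only produce a token the parse table considers valid in this state.
  if (!whitelist[MY_EXTERNAL_TOKEN]) return false;
  if (lexer->lookahead == '$') {
    lexer->advance(lexer, false);
    lexer->result_symbol = MY_EXTERNAL_TOKEN;
    return true;
  }
  return false;
}

bool tree_sitter_mylang_external_scanner_serialize(void *payload,
                                                   TSExternalTokenState state) {
  // This sketch keeps no state; real scanners pack their state into the
  // 16-byte TSExternalTokenState buffer here.
  memset(state, 0, sizeof(TSExternalTokenState));
  return true;
}

void tree_sitter_mylang_external_scanner_deserialize(
    void *payload, const TSExternalTokenState state) {}
```

The `GET_LANGUAGE(...)` call emitted above then wires these functions, together with `ts_external_scanner_states` and `ts_external_scanner_symbol_map`, into the `TSLanguage::external_scanner` struct.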
diff --git a/src/compiler/grammar.h b/src/compiler/grammar.h
index a8955c02..0a07280c 100644
--- a/src/compiler/grammar.h
+++ b/src/compiler/grammar.h
@@ -12,6 +12,7 @@ struct Grammar {
   std::vector<std::pair<std::string, rule_ptr>> rules;
   std::vector<rule_ptr> extra_tokens;
   std::vector<std::set<std::string>> expected_conflicts;
+  std::vector<std::string> external_tokens;
 };
 
 }  // namespace tree_sitter
diff --git a/src/compiler/parse_grammar.cc b/src/compiler/parse_grammar.cc
index 185d919b..327c0f31 100644
--- a/src/compiler/parse_grammar.cc
+++ b/src/compiler/parse_grammar.cc
@@ -210,7 +210,7 @@ ParseGrammarResult parse_grammar(const string &input) {
   string error_message;
   string name;
   Grammar grammar;
-  json_value name_json, rules_json, extras_json, conflicts_json;
+  json_value name_json, rules_json, extras_json, conflicts_json, external_tokens_json;
 
   json_settings settings = { 0, json_enable_comments, 0, 0, 0, 0 };
   char parse_error[json_error_max];
@@ -302,6 +302,25 @@ ParseGrammarResult parse_grammar(const string &input) {
     }
   }
 
+  external_tokens_json = grammar_json->operator[]("externals");
+  if (external_tokens_json.type != json_none) {
+    if (external_tokens_json.type != json_array) {
+      error_message = "External tokens must be an array";
+      goto error;
+    }
+
+    for (size_t i = 0, length = external_tokens_json.u.array.length; i < length; i++) {
+      json_value *token_name_json = external_tokens_json.u.array.values[i];
+      if (token_name_json->type != json_string) {
+        error_message = "External token values must be strings";
+        goto error;
+      }
+
+      string token_name = token_name_json->u.string.ptr;
+      grammar.external_tokens.push_back(token_name);
+    }
+  }
+
   json_value_free(grammar_json);
   return { name, grammar, "" };
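A hypothetical usage sketch of `parse_grammar()` with an "externals" array, assuming the `ParseGrammarResult` fields shown above (`name`, `grammar`, `error_message`) and an assumed header path:

```cpp
#include <cassert>
#include <string>
#include "compiler/parse_grammar.h"  // assumed header location

int main() {
  std::string input = R"JSON({
    "name": "mylang",
    "externals": ["raw_string"],
    "rules": { "program": { "type": "SYMBOL", "name": "raw_string" } }
  })JSON";

  auto result = tree_sitter::parse_grammar(input);
  assert(result.error_message.empty());
  // Each entry in "externals" becomes a name in Grammar::external_tokens;
  // later passes resolve those names against the grammar's rules.
  assert(result.grammar.external_tokens.size() == 1);
  return 0;
}
```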
diff --git a/src/compiler/parse_table.cc b/src/compiler/parse_table.cc
index e6e4badd..a04eec8c 100644
--- a/src/compiler/parse_table.cc
+++ b/src/compiler/parse_table.cc
@@ -1,6 +1,7 @@
 #include "compiler/parse_table.h"
 #include 
 #include "compiler/precedence_range.h"
+#include "compiler/rules/built_in_symbols.h"
 
 namespace tree_sitter {
 
@@ -28,7 +29,7 @@ ParseAction::ParseAction()
     extra(false),
     fragile(false),
     state_index(-1),
-    symbol(Symbol(-1)),
+    symbol(rules::NONE()),
     consumed_symbol_count(0),
     production(nullptr) {}
 
@@ -43,11 +44,11 @@ ParseAction ParseAction::Accept() {
 }
 
 ParseAction ParseAction::Shift(ParseStateId state_index) {
-  return ParseAction(ParseActionTypeShift, state_index, Symbol(-1), 0, nullptr);
+  return ParseAction(ParseActionTypeShift, state_index, rules::NONE(), 0, nullptr);
 }
 
 ParseAction ParseAction::Recover(ParseStateId state_index) {
-  return ParseAction(ParseActionTypeRecover, state_index, Symbol(-1), 0,
+  return ParseAction(ParseActionTypeRecover, state_index, rules::NONE(), 0,
                      nullptr);
 }
 
@@ -150,9 +151,7 @@ bool ParseState::has_shift_action() const {
 set<Symbol> ParseState::expected_inputs() const {
   set<Symbol> result;
   for (auto &entry : terminal_entries)
-    result.insert(Symbol(entry.first, true));
-  for (auto &entry : nonterminal_entries)
-    result.insert(Symbol(entry.first, false));
+    result.insert(entry.first);
   return result;
 }
 
@@ -182,33 +181,24 @@ ParseStateId ParseTable::add_state() {
   return states.size() - 1;
 }
 
-ParseAction &ParseTable::set_terminal_action(ParseStateId state_id,
-                                             Symbol::Index index,
-                                             ParseAction action) {
-  states[state_id].terminal_entries[index].actions.clear();
-  return add_terminal_action(state_id, index, action);
-}
-
 ParseAction &ParseTable::add_terminal_action(ParseStateId state_id,
-                                             Symbol::Index index,
+                                             Symbol lookahead,
                                              ParseAction action) {
-  Symbol symbol(index, true);
   if (action.type == ParseActionTypeShift && action.extra)
-    symbols[symbol].extra = true;
+    symbols[lookahead].extra = true;
   else
-    symbols[symbol].structural = true;
+    symbols[lookahead].structural = true;
 
-  ParseTableEntry &entry = states[state_id].terminal_entries[index];
+  ParseTableEntry &entry = states[state_id].terminal_entries[lookahead];
   entry.actions.push_back(action);
   return *entry.actions.rbegin();
 }
 
 void ParseTable::set_nonterminal_action(ParseStateId state_id,
-                                        Symbol::Index index,
+                                        Symbol::Index lookahead,
                                         ParseStateId next_state_id) {
-  Symbol symbol(index, false);
-  symbols[symbol].structural = true;
-  states[state_id].nonterminal_entries[index] = next_state_id;
+  symbols[Symbol(lookahead, Symbol::NonTerminal)].structural = true;
+  states[state_id].nonterminal_entries[lookahead] = next_state_id;
 }
 
 static bool has_entry(const ParseState &state, const ParseTableEntry &entry) {
@@ -226,12 +216,12 @@ bool ParseTable::merge_state(size_t i, size_t j) {
     return false;
 
   for (auto &entry : state.terminal_entries) {
-    Symbol::Index index = entry.first;
+    Symbol lookahead = entry.first;
     const vector<ParseAction> &actions = entry.second.actions;
 
-    const auto &other_entry = other.terminal_entries.find(index);
+    const auto &other_entry = other.terminal_entries.find(lookahead);
     if (other_entry == other.terminal_entries.end()) {
-      if (mergeable_symbols.count(index) == 0 && !Symbol::is_built_in(index))
+      if (mergeable_symbols.count(lookahead) == 0 && !lookahead.is_built_in())
         return false;
       if (actions.back().type != ParseActionTypeReduce)
         return false;
@@ -242,25 +232,25 @@ bool ParseTable::merge_state(size_t i, size_t j) {
     }
   }
 
-  set<Symbol::Index> symbols_to_merge;
+  set<Symbol> symbols_to_merge;
 
   for (auto &entry : other.terminal_entries) {
-    Symbol::Index index = entry.first;
+    Symbol lookahead = entry.first;
     const vector<ParseAction> &actions = entry.second.actions;
 
-    if (!state.terminal_entries.count(index)) {
-      if (mergeable_symbols.count(index) == 0 && !Symbol::is_built_in(index))
+    if (!state.terminal_entries.count(lookahead)) {
+      if (mergeable_symbols.count(lookahead) == 0 && !lookahead.is_built_in())
         return false;
       if (actions.back().type != ParseActionTypeReduce)
         return false;
       if (!has_entry(state, entry.second))
         return false;
-      symbols_to_merge.insert(index);
+      symbols_to_merge.insert(lookahead);
     }
   }
 
-  for (const Symbol::Index &index : symbols_to_merge)
-    state.terminal_entries[index] = other.terminal_entries.find(index)->second;
+  for (const Symbol &lookahead : symbols_to_merge)
+    state.terminal_entries[lookahead] = other.terminal_entries.find(lookahead)->second;
 
   return true;
 }
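For context: keying `terminal_entries` by the full `Symbol` rather than a bare index is what lets external and internal tokens coexist in a single action map. A sketch of the consequence (a simplified `Symbol` stand-in, not the compiler's class):

```cpp
#include <iostream>
#include <map>

struct Symbol {
  enum Type { External, Terminal, NonTerminal };
  int index;
  Type type;
  bool operator<(const Symbol &o) const {
    return type != o.type ? type < o.type : index < o.index;
  }
};

int main() {
  std::map<Symbol, const char *> terminal_entries;
  // An external token and an internal token may share index 0, but they
  // occupy distinct columns because the key includes the symbol type.
  terminal_entries[{0, Symbol::Terminal}] = "SHIFT 5";
  terminal_entries[{0, Symbol::External}] = "SHIFT 9";
  std::cout << terminal_entries.size() << "\n";  // 2
}
```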
diff --git a/src/compiler/parse_table.h b/src/compiler/parse_table.h
index 59eee4a8..79eec4fc 100644
--- a/src/compiler/parse_table.h
+++ b/src/compiler/parse_table.h
@@ -76,7 +76,7 @@ class ParseState {
   void each_referenced_state(std::function<void(ParseStateId *)>);
   bool has_shift_action() const;
 
-  std::map<rules::Symbol::Index, ParseTableEntry> terminal_entries;
+  std::map<rules::Symbol, ParseTableEntry> terminal_entries;
   std::map<rules::Symbol::Index, ParseStateId> nonterminal_entries;
   LexStateId lex_state_id;
   size_t shift_actions_signature;
@@ -91,15 +91,14 @@ class ParseTable {
  public:
   std::set<rules::Symbol> all_symbols() const;
   ParseStateId add_state();
-  ParseAction &add_terminal_action(ParseStateId state_id, int, ParseAction);
-  ParseAction &set_terminal_action(ParseStateId state_id, int index, ParseAction);
-  void set_nonterminal_action(ParseStateId state_id, int index, ParseStateId);
+  ParseAction &add_terminal_action(ParseStateId state_id, rules::Symbol, ParseAction);
+  void set_nonterminal_action(ParseStateId, rules::Symbol::Index, ParseStateId);
   bool merge_state(size_t i, size_t j);
 
   std::vector<ParseState> states;
   std::map symbols;
-  std::set<rules::Symbol::Index> mergeable_symbols;
+  std::set<rules::Symbol> mergeable_symbols;
 };
 
 }  // namespace tree_sitter
diff --git a/src/compiler/prepare_grammar/expand_repeats.cc b/src/compiler/prepare_grammar/expand_repeats.cc
index 7963e94b..331c9cea 100644
--- a/src/compiler/prepare_grammar/expand_repeats.cc
+++ b/src/compiler/prepare_grammar/expand_repeats.cc
@@ -39,7 +39,7 @@ class ExpandRepeats : public rules::IdentityRuleFn {
     rule_ptr inner_rule = apply(rule->content);
     size_t index = aux_rules.size();
     string helper_rule_name = rule_name + "_repeat" + to_string(++repeat_count);
-    Symbol repeat_symbol(offset + index);
+    Symbol repeat_symbol(offset + index, Symbol::NonTerminal);
     existing_repeats.push_back({ rule->copy(), repeat_symbol });
     aux_rules.push_back(
       Variable(helper_rule_name, VariableTypeAuxiliary,
@@ -65,6 +65,7 @@ InitialSyntaxGrammar expand_repeats(const InitialSyntaxGrammar &grammar) {
   result.variables = grammar.variables;
   result.extra_tokens = grammar.extra_tokens;
   result.expected_conflicts = grammar.expected_conflicts;
+  result.external_tokens = grammar.external_tokens;
 
   ExpandRepeats expander(result.variables.size());
   for (auto &variable : result.variables)
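For context on the `extract_tokens.cc` change that follows: when some rules are converted to tokens and removed from the syntax grammar, `SymbolReplacer` shifts every surviving non-terminal's index down by the number of removed variables that preceded it. A sketch of that rule in isolation (plain ints, not the real classes):

```cpp
#include <iostream>
#include <set>

static int replace_index(int index, const std::set<int> &removed_indices) {
  int new_index = index;
  for (int removed : removed_indices)
    if (removed < index) new_index--;
  return new_index;
}

int main() {
  // Variables 1 and 3 were extracted as tokens, so variable 4 becomes 2.
  std::set<int> removed = {1, 3};
  std::cout << replace_index(4, removed) << "\n";  // 2
}
```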
diff --git a/src/compiler/prepare_grammar/extract_tokens.cc b/src/compiler/prepare_grammar/extract_tokens.cc
index bf7ac514..9d161ca8 100644
--- a/src/compiler/prepare_grammar/extract_tokens.cc
+++ b/src/compiler/prepare_grammar/extract_tokens.cc
@@ -38,7 +38,7 @@ class SymbolReplacer : public rules::IdentityRuleFn {
   map<Symbol, Symbol> replacements;
 
   Symbol replace_symbol(const Symbol &symbol) {
-    if (symbol.is_built_in() || symbol.is_token)
+    if (!symbol.is_non_terminal())
       return symbol;
 
     auto replacement_pair = replacements.find(symbol);
@@ -49,7 +49,7 @@ class SymbolReplacer : public rules::IdentityRuleFn {
     for (const auto &pair : replacements)
       if (pair.first.index < symbol.index)
         new_index--;
-    return Symbol(new_index);
+    return Symbol(new_index, Symbol::NonTerminal);
   }
 };
 
@@ -60,14 +60,14 @@ class TokenExtractor : public rules::IdentityRuleFn {
     for (size_t i = 0; i < tokens.size(); i++)
       if (tokens[i].rule->operator==(*input)) {
         token_usage_counts[i]++;
-        return make_shared<Symbol>(i, true);
+        return make_shared<Symbol>(i, Symbol::Terminal);
       }
 
     rule_ptr rule = input->copy();
     size_t index = tokens.size();
     tokens.push_back(Variable(token_description(rule), entry_type, rule));
     token_usage_counts.push_back(1);
-    return make_shared<Symbol>(index, true);
+    return make_shared<Symbol>(index, Symbol::Terminal);
   }
 
   rule_ptr apply_to(const rules::String *rule) {
@@ -90,9 +90,8 @@ class TokenExtractor : public rules::IdentityRuleFn {
   vector<Variable> tokens;
 };
 
-static CompileError ubiq_token_err(const string &message) {
-  return CompileError(TSCompileErrorTypeInvalidUbiquitousToken,
-                      "Not a token: " + message);
+static CompileError extra_token_error(const string &message) {
+  return CompileError(TSCompileErrorTypeInvalidExtraToken, "Not a token: " + message);
 }
 
 tuple<InitialSyntaxGrammar, LexicalGrammar, CompileError> extract_tokens(
@@ -122,11 +121,10 @@ tuple<InitialSyntaxGrammar, LexicalGrammar, CompileError> extract_tokens(
   size_t i = 0;
   for (const Variable &variable : processed_variables) {
     auto symbol = variable.rule->as<Symbol>();
-    if (symbol && symbol->is_token && !symbol->is_built_in() &&
-        extractor.token_usage_counts[symbol->index] == 1) {
+    if (symbol && symbol->is_token() && extractor.token_usage_counts[symbol->index] == 1) {
       lexical_grammar.variables[symbol->index].type = variable.type;
       lexical_grammar.variables[symbol->index].name = variable.name;
-      symbol_replacer.replacements.insert({ Symbol(i), *symbol });
+      symbol_replacer.replacements.insert({ Symbol(i, Symbol::NonTerminal), *symbol });
     } else {
       syntax_grammar.variables.push_back(variable);
     }
@@ -158,7 +156,7 @@ tuple<InitialSyntaxGrammar, LexicalGrammar, CompileError> extract_tokens(
     bool used_elsewhere_in_grammar = false;
     for (const Variable &variable : lexical_grammar.variables) {
       if (variable.rule->operator==(*rule)) {
-        syntax_grammar.extra_tokens.insert(Symbol(i, true));
+        syntax_grammar.extra_tokens.insert(Symbol(i, Symbol::Terminal));
        used_elsewhere_in_grammar = true;
       }
       i++;
@@ -175,17 +173,39 @@ tuple<InitialSyntaxGrammar, LexicalGrammar, CompileError> extract_tokens(
     auto symbol = rule->as<Symbol>();
     if (!symbol)
       return make_tuple(syntax_grammar, lexical_grammar,
-                        ubiq_token_err(rule->to_string()));
+                        extra_token_error(rule->to_string()));
 
     Symbol new_symbol = symbol_replacer.replace_symbol(*symbol);
-    if (!new_symbol.is_token)
+    if (new_symbol.is_non_terminal()) {
       return make_tuple(
         syntax_grammar, lexical_grammar,
-        ubiq_token_err(syntax_grammar.variables[new_symbol.index].name));
+        extra_token_error(syntax_grammar.variables[new_symbol.index].name));
+    }
 
     syntax_grammar.extra_tokens.insert(new_symbol);
   }
 
+  for (const ExternalToken &external_token : grammar.external_tokens) {
+    Symbol internal_token = symbol_replacer.replace_symbol(external_token.corresponding_internal_token);
+
+    if (internal_token.is_non_terminal()) {
+      return make_tuple(
+        syntax_grammar,
+        lexical_grammar,
+        CompileError(
+          TSCompileErrorTypeInvalidExternalToken,
+          "Name '" + external_token.name + "' cannot be used for both an external token and a non-terminal rule"
+        )
+      );
+    }
+
+    syntax_grammar.external_tokens.push_back({
+      external_token.name,
+      external_token.type,
+      internal_token
+    });
+  }
+
   return make_tuple(syntax_grammar, lexical_grammar, CompileError::none());
 }
diff --git a/src/compiler/prepare_grammar/flatten_grammar.cc b/src/compiler/prepare_grammar/flatten_grammar.cc
index ddba9a5f..8ac0e33c 100644
--- a/src/compiler/prepare_grammar/flatten_grammar.cc
+++ b/src/compiler/prepare_grammar/flatten_grammar.cc
@@ -92,6 +92,7 @@ pair<SyntaxGrammar, CompileError> flatten_grammar(const InitialSyntaxGrammar &gr
   SyntaxGrammar result;
   result.expected_conflicts = grammar.expected_conflicts;
   result.extra_tokens = grammar.extra_tokens;
+  result.external_tokens = grammar.external_tokens;
 
   bool is_start = true;
   for (const Variable &variable : grammar.variables) {
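For context: the new loop in `extract_tokens` enforces that an external token's corresponding internal rule must have become a token (or be absent); a name that still refers to a surviving non-terminal is a compile error. A simplified sketch of just that rule:

```cpp
#include <iostream>
#include <string>

enum SymbolType { External, Terminal, NonTerminal, None };

// Returns an error message, or "" when the external token is acceptable
// (mirroring TSCompileErrorTypeInvalidExternalToken above).
static std::string check_external(const std::string &name, SymbolType resolved) {
  if (resolved == NonTerminal)
    return "Name '" + name + "' cannot be used for both an external token "
           "and a non-terminal rule";
  return "";
}

int main() {
  std::cout << check_external("raw_string", Terminal).empty() << "\n";   // 1
  std::cout << check_external("expression", NonTerminal) << "\n";        // error
}
```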
diff --git a/src/compiler/prepare_grammar/initial_syntax_grammar.h b/src/compiler/prepare_grammar/initial_syntax_grammar.h
index fe1ff37d..1ac319cb 100644
--- a/src/compiler/prepare_grammar/initial_syntax_grammar.h
+++ b/src/compiler/prepare_grammar/initial_syntax_grammar.h
@@ -1,13 +1,12 @@
 #ifndef COMPILER_PREPARE_GRAMMAR_INITIAL_SYNTAX_GRAMMAR_H_
 #define COMPILER_PREPARE_GRAMMAR_INITIAL_SYNTAX_GRAMMAR_H_
 
-#include 
-#include 
 #include 
+#include 
 #include "tree_sitter/compiler.h"
 #include "compiler/rules/symbol.h"
-#include "compiler/variable.h"
 #include "compiler/syntax_grammar.h"
+#include "compiler/variable.h"
 
 namespace tree_sitter {
 namespace prepare_grammar {
@@ -16,6 +15,7 @@ struct InitialSyntaxGrammar {
   std::vector<Variable> variables;
   std::set<rules::Symbol> extra_tokens;
   std::set<ConflictSet> expected_conflicts;
+  std::vector<ExternalToken> external_tokens;
 };
 
 }  // namespace prepare_grammar
diff --git a/src/compiler/prepare_grammar/intern_symbols.cc b/src/compiler/prepare_grammar/intern_symbols.cc
index cd01719c..0786982b 100644
--- a/src/compiler/prepare_grammar/intern_symbols.cc
+++ b/src/compiler/prepare_grammar/intern_symbols.cc
@@ -8,6 +8,7 @@
 #include "compiler/rules/blank.h"
 #include "compiler/rules/named_symbol.h"
 #include "compiler/rules/symbol.h"
+#include "compiler/rules/built_in_symbols.h"
 
 namespace tree_sitter {
 namespace prepare_grammar {
@@ -17,8 +18,9 @@ using std::vector;
 using std::set;
 using std::pair;
 using std::make_shared;
+using rules::Symbol;
 
-class InternSymbols : public rules::IdentityRuleFn {
+class SymbolInterner : public rules::IdentityRuleFn {
   using rules::IdentityRuleFn::apply_to;
 
   rule_ptr apply_to(const rules::NamedSymbol *rule) {
@@ -34,11 +36,14 @@ class SymbolInterner : public rules::IdentityRuleFn {
   std::shared_ptr<Symbol> symbol_for_rule_name(string rule_name) {
     for (size_t i = 0; i < grammar.rules.size(); i++)
       if (grammar.rules[i].first == rule_name)
-        return make_shared<Symbol>(i);
+        return make_shared<Symbol>(i, Symbol::NonTerminal);
+    for (size_t i = 0; i < grammar.external_tokens.size(); i++)
+      if (grammar.external_tokens[i] == rule_name)
+        return make_shared<Symbol>(i, Symbol::External);
     return nullptr;
   }
 
-  explicit InternSymbols(const Grammar &grammar) : grammar(grammar) {}
+  explicit SymbolInterner(const Grammar &grammar) : grammar(grammar) {}
   const Grammar grammar;
   string missing_rule_name;
 };
@@ -50,16 +55,35 @@ CompileError missing_rule_error(string rule_name) {
 
 pair<InternedGrammar, CompileError> intern_symbols(const Grammar &grammar) {
   InternedGrammar result;
-  InternSymbols interner(grammar);
+
+  for (auto &external_token_name : grammar.external_tokens) {
+    Symbol corresponding_internal_token = rules::NONE();
+    for (size_t i = 0, n = grammar.rules.size(); i < n; i++) {
+      if (grammar.rules[i].first == external_token_name) {
+        corresponding_internal_token = Symbol(i, Symbol::NonTerminal);
+        break;
+      }
+    }
+
+    result.external_tokens.push_back(ExternalToken{
+      external_token_name,
+      external_token_name[0] == '_' ? VariableTypeHidden : VariableTypeNamed,
+      corresponding_internal_token
+    });
+  }
+
+  SymbolInterner interner(grammar);
 
   for (auto &pair : grammar.rules) {
     auto new_rule = interner.apply(pair.second);
     if (!interner.missing_rule_name.empty())
       return { result, missing_rule_error(interner.missing_rule_name) };
 
-    result.variables.push_back(Variable(
-      pair.first, pair.first[0] == '_' ? VariableTypeHidden : VariableTypeNamed,
-      new_rule));
+    result.variables.push_back(Variable{
+      pair.first,
+      pair.first[0] == '_' ? VariableTypeHidden : VariableTypeNamed,
+      new_rule
+    });
   }
 
   for (auto &rule : grammar.extra_tokens) {
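For context: `SymbolInterner::symbol_for_rule_name` above resolves names in two passes, with grammar rules taking precedence over external tokens. A sketch of that resolution order (simplified stand-ins, not the real classes):

```cpp
#include <iostream>
#include <string>
#include <vector>

enum SymbolType { External, Terminal, NonTerminal };

static int resolve(const std::string &name,
                   const std::vector<std::string> &rule_names,
                   const std::vector<std::string> &external_names,
                   SymbolType *type) {
  for (size_t i = 0; i < rule_names.size(); i++)
    if (rule_names[i] == name) { *type = NonTerminal; return (int)i; }
  for (size_t i = 0; i < external_names.size(); i++)
    if (external_names[i] == name) { *type = External; return (int)i; }
  return -1;  // missing rule: reported as a compile error
}

int main() {
  SymbolType type;
  // "string" is declared both as a rule and as an external token: the rule
  // wins here, and the external token separately records that rule as its
  // corresponding internal token (see the loop in intern_symbols above).
  int index = resolve("string", {"program", "string"}, {"string"}, &type);
  std::cout << index << " " << (type == NonTerminal) << "\n";  // 1 1
}
```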
diff --git a/src/compiler/prepare_grammar/interned_grammar.h b/src/compiler/prepare_grammar/interned_grammar.h
index c08c07dd..c8a14647 100644
--- a/src/compiler/prepare_grammar/interned_grammar.h
+++ b/src/compiler/prepare_grammar/interned_grammar.h
@@ -15,6 +15,7 @@ struct InternedGrammar {
   std::vector<Variable> variables;
   std::vector<rule_ptr> extra_tokens;
   std::set<ConflictSet> expected_conflicts;
+  std::vector<ExternalToken> external_tokens;
 };
 
 }  // namespace prepare_grammar
diff --git a/src/compiler/rules/built_in_symbols.cc b/src/compiler/rules/built_in_symbols.cc
index a7a877ec..0fe45f68 100644
--- a/src/compiler/rules/built_in_symbols.cc
+++ b/src/compiler/rules/built_in_symbols.cc
@@ -4,15 +4,15 @@ namespace tree_sitter {
 namespace rules {
 
 Symbol END_OF_INPUT() {
-  return Symbol(-1, true);
+  return Symbol(-1, Symbol::Terminal);
 }
 
 Symbol START() {
-  return Symbol(-2);
+  return Symbol(-2, Symbol::NonTerminal);
 }
 
 Symbol NONE() {
-  return Symbol(-3);
+  return Symbol(-3, Symbol::Type(-1));
 }
 
 }  // namespace rules
"token" : "sym"; - return "(" + name + " " + std::to_string(index) + ")"; + switch (type) { + case Symbol::Terminal: + return "(terminal " + std::to_string(index) + ")"; + case Symbol::NonTerminal: + return "(non-terminal " + std::to_string(index) + ")"; + case Symbol::External: + return "(external " + std::to_string(index) + ")"; + default: + return "(none)"; + } } bool Symbol::operator<(const Symbol &other) const { - if (is_token && !other.is_token) + if (type < other.type) return true; - if (!is_token && other.is_token) + if (other.type < type) return false; return (index < other.index); } @@ -56,6 +62,18 @@ bool Symbol::is_built_in() const { return is_built_in(index); } +bool Symbol::is_token() const { + return type == Symbol::Terminal; +} + +bool Symbol::is_external() const { + return type == Symbol::External; +} + +bool Symbol::is_non_terminal() const { + return type == Symbol::NonTerminal; +} + void Symbol::accept(Visitor *visitor) const { visitor->visit(this); } diff --git a/src/compiler/rules/symbol.h b/src/compiler/rules/symbol.h index 4ae9ece3..a963433c 100644 --- a/src/compiler/rules/symbol.h +++ b/src/compiler/rules/symbol.h @@ -11,9 +11,13 @@ class Symbol : public Rule { public: typedef int Index; + typedef enum { + External, + Terminal, + NonTerminal, + } Type; - explicit Symbol(Index index); - Symbol(Index index, bool is_token); + Symbol(Index index, Type type); bool operator==(const Symbol &other) const; bool operator==(const Rule &other) const; @@ -26,9 +30,12 @@ class Symbol : public Rule { bool operator<(const Symbol &other) const; static bool is_built_in(Index); bool is_built_in() const; + bool is_token() const; + bool is_external() const; + bool is_non_terminal() const; Index index; - bool is_token; + Type type; }; } // namespace rules diff --git a/src/compiler/rules/visitor.h b/src/compiler/rules/visitor.h index b8301183..c75e31dc 100644 --- a/src/compiler/rules/visitor.h +++ b/src/compiler/rules/visitor.h @@ -16,6 +16,7 @@ class String; class Symbol; class Pattern; class Metadata; +class ExternalToken; class Visitor { public: @@ -29,6 +30,7 @@ class Visitor { virtual void visit(const String *rule) = 0; virtual void visit(const NamedSymbol *rule) = 0; virtual void visit(const Symbol *rule) = 0; + virtual void visit(const ExternalToken *rule) = 0; virtual ~Visitor(); }; @@ -86,6 +88,10 @@ class RuleFn : private Visitor { return default_apply((const Rule *)rule); } + virtual T apply_to(const ExternalToken *rule) { + return default_apply((const Rule *)rule); + } + void visit(const Blank *rule) { value_ = apply_to(rule); } @@ -126,6 +132,10 @@ class RuleFn : private Visitor { value_ = apply_to(rule); } + void visit(const ExternalToken *rule) { + value_ = apply_to(rule); + } + private: T value_; }; @@ -170,6 +180,9 @@ class RuleFn : private Visitor { virtual void apply_to(const Symbol *rule) { return default_apply((const Rule *)rule); } + virtual void apply_to(const ExternalToken *rule) { + return default_apply((const Rule *)rule); + } void visit(const Blank *rule) { apply_to(rule); @@ -201,6 +214,9 @@ class RuleFn : private Visitor { void visit(const Symbol *rule) { apply_to(rule); } + void visit(const ExternalToken *rule) { + apply_to(rule); + } }; class IdentityRuleFn : public RuleFn { diff --git a/src/compiler/syntax_grammar.cc b/src/compiler/syntax_grammar.cc index 706ec828..aa3074e8 100644 --- a/src/compiler/syntax_grammar.cc +++ b/src/compiler/syntax_grammar.cc @@ -13,8 +13,6 @@ using std::pair; using std::vector; using std::set; -static const vector 
diff --git a/src/compiler/syntax_grammar.cc b/src/compiler/syntax_grammar.cc
index 706ec828..aa3074e8 100644
--- a/src/compiler/syntax_grammar.cc
+++ b/src/compiler/syntax_grammar.cc
@@ -13,8 +13,6 @@ using std::pair;
 using std::vector;
 using std::set;
 
-static const vector<Production> NO_PRODUCTIONS;
-
 SyntaxVariable::SyntaxVariable(const string &name, VariableType type,
                                const vector<Production> &productions)
     : name(name), productions(productions), type(type) {}
@@ -23,18 +21,14 @@ ProductionStep::ProductionStep(const rules::Symbol &symbol, int precedence,
                                rules::Associativity associativity)
     : symbol(symbol), precedence(precedence), associativity(associativity) {}
 
+bool ExternalToken::operator==(const ExternalToken &other) const {
+  return name == other.name && type == other.type &&
+    corresponding_internal_token == other.corresponding_internal_token;
+}
+
 bool ProductionStep::operator==(const ProductionStep &other) const {
   return symbol == other.symbol && precedence == other.precedence &&
          associativity == other.associativity;
 }
 
-const vector<Production> &SyntaxGrammar::productions(
-  const rules::Symbol &symbol) const {
-  if (symbol.is_built_in() || symbol.is_token) {
-    return NO_PRODUCTIONS;
-  } else {
-    return variables[symbol.index].productions;
-  }
-}
-
 }  // namespace tree_sitter
diff --git a/src/compiler/syntax_grammar.h b/src/compiler/syntax_grammar.h
index 89745fa5..e3af8f28 100644
--- a/src/compiler/syntax_grammar.h
+++ b/src/compiler/syntax_grammar.h
@@ -10,6 +10,14 @@
 
 namespace tree_sitter {
 
+struct ExternalToken {
+  std::string name;
+  VariableType type;
+  rules::Symbol corresponding_internal_token;
+
+  bool operator==(const ExternalToken &) const;
+};
+
 struct ProductionStep {
   ProductionStep(const rules::Symbol &, int, rules::Associativity);
   bool operator==(const ProductionStep &) const;
@@ -33,11 +41,10 @@ struct SyntaxVariable {
 typedef std::set<rules::Symbol> ConflictSet;
 
 struct SyntaxGrammar {
-  const std::vector<Production> &productions(const rules::Symbol &) const;
-
   std::vector<SyntaxVariable> variables;
   std::set<rules::Symbol> extra_tokens;
   std::set<ConflictSet> expected_conflicts;
+  std::vector<ExternalToken> external_tokens;
 };
 
 }  // namespace tree_sitter
diff --git a/src/runtime/document.c b/src/runtime/document.c
index 65f9e435..8c1eb779 100644
--- a/src/runtime/document.c
+++ b/src/runtime/document.c
@@ -36,8 +36,9 @@ const TSLanguage *ts_document_language(TSDocument *self) {
 }
 
 void ts_document_set_language(TSDocument *self, const TSLanguage *language) {
+  if (language->version != TREE_SITTER_LANGUAGE_VERSION) return;
   ts_document_invalidate(self);
-  self->parser.language = language;
+  parser_set_language(&self->parser, language);
   if (self->tree) {
     ts_tree_release(self->tree);
     self->tree = NULL;
diff --git a/src/runtime/language.c b/src/runtime/language.c
index af08bb38..7f1bdefa 100644
--- a/src/runtime/language.c
+++ b/src/runtime/language.c
@@ -34,6 +34,10 @@ uint32_t ts_language_symbol_count(const TSLanguage *language) {
   return language->symbol_count;
 }
 
+uint32_t ts_language_version(const TSLanguage *language) {
+  return language->version;
+}
+
 TSSymbolMetadata ts_language_symbol_metadata(const TSLanguage *language,
                                              TSSymbol symbol) {
   if (symbol == ts_builtin_sym_error)
diff --git a/src/runtime/language.h b/src/runtime/language.h
index a4f44b11..20e6ec5d 100644
--- a/src/runtime/language.h
+++ b/src/runtime/language.h
@@ -19,6 +19,10 @@ void ts_language_table_entry(const TSLanguage *, TSStateId, TSSymbol, TableEntry
 
 TSSymbolMetadata ts_language_symbol_metadata(const TSLanguage *, TSSymbol);
 
+static inline bool ts_language_is_symbol_external(const TSLanguage *self, TSSymbol symbol) {
+  return 0 < symbol && symbol < self->external_token_count + 1;
+}
+
 static inline const TSParseAction *ts_language_actions(const TSLanguage *self,
                                                        TSStateId state,
                                                        TSSymbol symbol,
@@ -49,6 +53,16 @@ static inline TSStateId ts_language_next_state(const TSLanguage *self,
} } +static inline const bool * +ts_language_enabled_external_tokens(const TSLanguage *self, + unsigned external_scanner_state) { + if (external_scanner_state == 0) { + return NULL; + } else { + return self->external_scanner.states + self->external_token_count * external_scanner_state; + } +} + #ifdef __cplusplus } #endif diff --git a/src/runtime/length.h b/src/runtime/length.h index 2477bbe1..352215d2 100644 --- a/src/runtime/length.h +++ b/src/runtime/length.h @@ -21,12 +21,11 @@ static inline void length_set_unknown_chars(Length *self) { } static inline Length length_min(Length len1, Length len2) { - return (len1.chars < len2.chars) ? len1 : len2; + return (len1.bytes < len2.bytes) ? len1 : len2; } static inline Length length_add(Length len1, Length len2) { Length result; - result.chars = len1.chars + len2.chars; result.bytes = len1.bytes + len2.bytes; result.extent = point_add(len1.extent, len2.extent); @@ -57,10 +56,4 @@ static inline Length length_zero() { return (Length){ 0, 0, {0, 0} }; } -static inline bool length_eq(Length self, Length other) { - return self.bytes == other.bytes && self.chars == other.chars && - self.extent.row == other.extent.row && - self.extent.column == other.extent.column; -} - #endif diff --git a/src/runtime/lexer.c b/src/runtime/lexer.c index 32910935..902c2d3b 100644 --- a/src/runtime/lexer.c +++ b/src/runtime/lexer.c @@ -11,11 +11,8 @@ self->logger.log(self->logger.payload, TSLogTypeLex, self->debug_buffer); \ } -#define LOG_LOOKAHEAD() \ - LOG((0 < self->data.lookahead && self->data.lookahead < 256) \ - ? "lookahead char:'%c'" \ - : "lookahead char:%d", \ - self->data.lookahead); +#define LOG_CHARACTER(message, character) \ + LOG(character < 255 ? message " character:'%c'" : message " character:%d", character) static const char empty_chunk[2] = { 0, 0 }; @@ -42,11 +39,9 @@ static void ts_lexer__get_lookahead(Lexer *self) { utf8proc_iterate(chunk, size, &self->data.lookahead); else self->lookahead_size = utf16_iterate(chunk, size, &self->data.lookahead); - - LOG_LOOKAHEAD(); } -static void ts_lexer__advance(void *payload, TSStateId state, bool skip) { +static void ts_lexer__advance(void *payload, bool skip) { Lexer *self = (Lexer *)payload; if (self->chunk == empty_chunk) return; @@ -63,10 +58,10 @@ static void ts_lexer__advance(void *payload, TSStateId state, bool skip) { } if (skip) { - LOG("skip_separator state:%d", state); + LOG_CHARACTER("skip", self->data.lookahead); self->token_start_position = self->current_position; } else { - LOG("advance state:%d", state); + LOG_CHARACTER("consume", self->data.lookahead); } if (self->current_position.bytes >= self->chunk_start + self->chunk_size) @@ -93,6 +88,7 @@ void ts_lexer_init(Lexer *self) { .payload = NULL, .log = NULL }, + .last_external_token_state = NULL, }; ts_lexer_reset(self, length_zero()); } @@ -115,17 +111,16 @@ static inline void ts_lexer__reset(Lexer *self, Length position) { void ts_lexer_set_input(Lexer *self, TSInput input) { self->input = input; ts_lexer__reset(self, length_zero()); + self->last_external_token_state = NULL; } void ts_lexer_reset(Lexer *self, Length position) { - if (!length_eq(position, self->current_position)) + if (position.bytes != self->current_position.bytes) { ts_lexer__reset(self, position); - return; + } } -void ts_lexer_start(Lexer *self, TSStateId lex_state) { - LOG("start_lex state:%d, pos:%u", lex_state, self->current_position.chars); - +void ts_lexer_start(Lexer *self) { self->token_start_position = self->current_position; self->data.result_symbol = 0; diff 
--git a/src/runtime/lexer.h b/src/runtime/lexer.h index 1b047e5b..67470f6f 100644 --- a/src/runtime/lexer.h +++ b/src/runtime/lexer.h @@ -25,12 +25,13 @@ typedef struct { TSInput input; TSLogger logger; char debug_buffer[TS_DEBUG_BUFFER_SIZE]; + const TSExternalTokenState *last_external_token_state; } Lexer; void ts_lexer_init(Lexer *); void ts_lexer_set_input(Lexer *, TSInput); void ts_lexer_reset(Lexer *, Length); -void ts_lexer_start(Lexer *, TSStateId); +void ts_lexer_start(Lexer *); #ifdef __cplusplus } diff --git a/src/runtime/node.c b/src/runtime/node.c index d5bcb1a0..15e2d5cf 100644 --- a/src/runtime/node.c +++ b/src/runtime/node.c @@ -39,7 +39,15 @@ static inline bool ts_node__is_relevant(TSNode self, bool include_anonymous) { static inline uint32_t ts_node__relevant_child_count(TSNode self, bool include_anonymous) { const Tree *tree = ts_node__tree(self); - return include_anonymous ? tree->visible_child_count : tree->named_child_count; + if (tree->child_count > 0) { + if (include_anonymous) { + return tree->visible_child_count; + } else { + return tree->named_child_count; + } + } else { + return 0; + } } static inline TSNode ts_node__direct_parent(TSNode self, uint32_t *index) { @@ -324,11 +332,21 @@ TSNode ts_node_named_child(TSNode self, uint32_t child_index) { } uint32_t ts_node_child_count(TSNode self) { - return ts_node__tree(self)->visible_child_count; + const Tree *tree = ts_node__tree(self); + if (tree->child_count > 0) { + return tree->visible_child_count; + } else { + return 0; + } } uint32_t ts_node_named_child_count(TSNode self) { - return ts_node__tree(self)->named_child_count; + const Tree *tree = ts_node__tree(self); + if (tree->child_count > 0) { + return tree->named_child_count; + } else { + return 0; + } } TSNode ts_node_next_sibling(TSNode self) { diff --git a/src/runtime/parser.c b/src/runtime/parser.c index 2f5879a4..191354a3 100644 --- a/src/runtime/parser.c +++ b/src/runtime/parser.c @@ -109,28 +109,6 @@ static bool parser__breakdown_top_of_stack(Parser *self, StackVersion version) { return did_break_down; } -static void parser__pop_reusable_node(ReusableNode *reusable_node) { - reusable_node->byte_index += ts_tree_total_bytes(reusable_node->tree); - while (reusable_node->tree) { - Tree *parent = reusable_node->tree->context.parent; - uint32_t next_index = reusable_node->tree->context.index + 1; - if (parent && parent->child_count > next_index) { - reusable_node->tree = parent->children[next_index]; - return; - } - reusable_node->tree = parent; - } -} - -static bool parser__breakdown_reusable_node(ReusableNode *reusable_node) { - if (reusable_node->tree->child_count == 0) { - return false; - } else { - reusable_node->tree = reusable_node->tree->children[0]; - return true; - } -} - static bool parser__breakdown_lookahead(Parser *self, Tree **lookahead, TSStateId state, ReusableNode *reusable_node) { @@ -140,12 +118,11 @@ static bool parser__breakdown_lookahead(Parser *self, Tree **lookahead, reusable_node->tree->fragile_left || reusable_node->tree->fragile_right)) { LOG("state_mismatch sym:%s", SYM_NAME(reusable_node->tree->symbol)); - parser__breakdown_reusable_node(reusable_node); + reusable_node_breakdown(reusable_node); result = true; } if (result) { - LOG("lookahead sym:%s", SYM_NAME(reusable_node->tree->symbol)); ts_tree_release(*lookahead); ts_tree_retain(*lookahead = reusable_node->tree); } @@ -153,16 +130,20 @@ static bool parser__breakdown_lookahead(Parser *self, Tree **lookahead, return result; } -static void 
parser__pop_reusable_node_leaf(ReusableNode *reusable_node) { - while (reusable_node->tree->child_count > 0) - reusable_node->tree = reusable_node->tree->children[0]; - parser__pop_reusable_node(reusable_node); +static inline bool ts_lex_mode_eq(TSLexMode self, TSLexMode other) { + return self.lex_state == other.lex_state && + self.external_lex_state == other.external_lex_state; } static bool parser__can_reuse(Parser *self, TSStateId state, Tree *tree, TableEntry *table_entry) { - if (tree->first_leaf.lex_state == self->language->lex_states[state]) + TSLexMode current_lex_mode = self->language->lex_modes[state]; + if (ts_lex_mode_eq(tree->first_leaf.lex_mode, current_lex_mode)) return true; + if (current_lex_mode.external_lex_state != 0) + return false; + if (tree->size.bytes == 0) + return false; if (!table_entry->is_reusable) return false; if (!table_entry->depends_on_lookahead) @@ -208,28 +189,76 @@ static bool parser__condense_stack(Parser *self) { return result; } -static Tree *parser__lex(Parser *self, TSStateId parse_state) { - TSStateId start_state = self->language->lex_states[parse_state]; - TSStateId current_state = start_state; - Length start_position = self->lexer.current_position; - LOG("lex state:%d", start_state); +static void parser__restore_external_scanner(Parser *self, StackVersion version) { + const TSExternalTokenState *state = ts_stack_external_token_state(self->stack, version); + if (self->lexer.last_external_token_state != state) { + LOG("restore_external_scanner"); + self->lexer.last_external_token_state = state; + if (state) { + self->language->external_scanner.deserialize( + self->external_scanner_payload, + *state + ); + } else { + self->language->external_scanner.reset(self->external_scanner_payload); + } + } +} +static Tree *parser__lex(Parser *self, StackVersion version) { + TSStateId parse_state = ts_stack_top_state(self->stack, version); + Length start_position = ts_stack_top_position(self->stack, version); + TSLexMode lex_mode = self->language->lex_modes[parse_state]; + const bool *valid_external_tokens = ts_language_enabled_external_tokens( + self->language, + lex_mode.external_lex_state + ); + + bool found_external_token = false; + bool found_error = false; bool skipped_error = false; int32_t first_error_character = 0; Length error_start_position, error_end_position; + ts_lexer_reset(&self->lexer, start_position); - ts_lexer_start(&self->lexer, start_state); + for (;;) { + Length current_position = self->lexer.current_position; - while (!self->language->lex_fn(&self->lexer.data, current_state)) { - if (current_state != ERROR_STATE) { + if (valid_external_tokens) { + LOG("lex_external state:%d, row:%u, column:%u", lex_mode.external_lex_state, + current_position.extent.row, current_position.extent.column); + parser__restore_external_scanner(self, version); + ts_lexer_start(&self->lexer); + if (self->language->external_scanner.scan(self->external_scanner_payload, + &self->lexer.data, valid_external_tokens)) { + found_external_token = true; + break; + } + ts_lexer_reset(&self->lexer, current_position); + } + + LOG("lex_internal state:%d, row:%u, column:%u", lex_mode.lex_state, + current_position.extent.row, current_position.extent.column); + ts_lexer_start(&self->lexer); + if (self->language->lex_fn(&self->lexer.data, lex_mode.lex_state)) { + break; + } + + if (!found_error) { LOG("retry_in_error_mode"); - current_state = ERROR_STATE; + found_error = true; + lex_mode = self->language->lex_modes[ERROR_STATE]; + valid_external_tokens = 
ts_language_enabled_external_tokens( + self->language, + lex_mode.external_lex_state + ); ts_lexer_reset(&self->lexer, start_position); - ts_lexer_start(&self->lexer, current_state); continue; } if (!skipped_error) { + LOG("skip_unrecognized_character"); + skipped_error = true; error_start_position = self->lexer.token_start_position; first_error_character = self->lexer.data.lookahead; } @@ -239,15 +268,13 @@ static Tree *parser__lex(Parser *self, TSStateId parse_state) { self->lexer.data.result_symbol = ts_builtin_sym_error; break; } - self->lexer.data.advance(&self->lexer, ERROR_STATE, false); + self->lexer.data.advance(&self->lexer, false); } - skipped_error = true; error_end_position = self->lexer.current_position; } Tree *result; - if (skipped_error) { Length padding = length_sub(error_start_position, start_position); Length size = length_sub(error_end_position, error_start_position); @@ -255,20 +282,28 @@ static Tree *parser__lex(Parser *self, TSStateId parse_state) { result = ts_tree_make_error(size, padding, first_error_character); } else { TSSymbol symbol = self->lexer.data.result_symbol; - Length padding = - length_sub(self->lexer.token_start_position, start_position); - Length size = length_sub(self->lexer.current_position, - self->lexer.token_start_position); - result = - ts_tree_make_leaf(symbol, padding, size, - ts_language_symbol_metadata(self->language, symbol)); + if (found_external_token) { + symbol = self->language->external_scanner.symbol_map[symbol]; + } + + Length padding = length_sub(self->lexer.token_start_position, start_position); + Length size = length_sub(self->lexer.current_position, self->lexer.token_start_position); + TSSymbolMetadata metadata = ts_language_symbol_metadata(self->language, symbol); + result = ts_tree_make_leaf(symbol, padding, size, metadata); + + if (found_external_token) { + result->has_external_tokens = true; + result->has_external_token_state = true; + memset(result->external_token_state, 0, sizeof(TSExternalTokenState)); + self->language->external_scanner.serialize(self->external_scanner_payload, result->external_token_state); + self->lexer.last_external_token_state = &result->external_token_state; + } } - if (!result) - return NULL; - result->parse_state = parse_state; - result->first_leaf.lex_state = start_state; + result->first_leaf.lex_mode = lex_mode; + + LOG("lexed_lookahead sym:%s, size:%u", SYM_NAME(result->symbol), result->size.bytes); return result; } @@ -277,21 +312,31 @@ static void parser__clear_cached_token(Parser *self) { self->cached_token = NULL; } +static inline bool ts_external_token_state_eq(const TSExternalTokenState *self, + const TSExternalTokenState *other) { + if (self == other) { + return true; + } else if (!self || !other) { + return false; + } else { + return memcmp(self, other, sizeof(TSExternalTokenState)) == 0; + } +} + static Tree *parser__get_lookahead(Parser *self, StackVersion version, - ReusableNode *reusable_node) { + ReusableNode *reusable_node, + bool *is_fresh) { Length position = ts_stack_top_position(self->stack, version); while (reusable_node->tree) { if (reusable_node->byte_index > position.bytes) { - LOG("before_reusable sym:%s, pos:%u", - SYM_NAME(reusable_node->tree->symbol), reusable_node->byte_index); + LOG("before_reusable_node sym:%s", SYM_NAME(reusable_node->tree->symbol)); break; } if (reusable_node->byte_index < position.bytes) { - LOG("past_reusable sym:%s, pos:%u", - SYM_NAME(reusable_node->tree->symbol), reusable_node->byte_index); - parser__pop_reusable_node(reusable_node); + 
LOG("past_reusable sym:%s", SYM_NAME(reusable_node->tree->symbol)); + reusable_node_pop(reusable_node); continue; } @@ -299,8 +344,8 @@ static Tree *parser__get_lookahead(Parser *self, StackVersion version, LOG("cant_reuse_changed tree:%s, size:%u", SYM_NAME(reusable_node->tree->symbol), reusable_node->tree->size.bytes); - if (!parser__breakdown_reusable_node(reusable_node)) { - parser__pop_reusable_node(reusable_node); + if (!reusable_node_breakdown(reusable_node)) { + reusable_node_pop(reusable_node); parser__breakdown_top_of_stack(self, version); } continue; @@ -310,8 +355,21 @@ static Tree *parser__get_lookahead(Parser *self, StackVersion version, LOG("cant_reuse_error tree:%s, size:%u", SYM_NAME(reusable_node->tree->symbol), reusable_node->tree->size.bytes); - if (!parser__breakdown_reusable_node(reusable_node)) { - parser__pop_reusable_node(reusable_node); + if (!reusable_node_breakdown(reusable_node)) { + reusable_node_pop(reusable_node); + parser__breakdown_top_of_stack(self, version); + } + continue; + } + + if (!ts_external_token_state_eq( + reusable_node->preceding_external_token_state, + ts_stack_external_token_state(self->stack, version))) { + LOG("cant_reuse_external_tokens tree:%s, size:%u", + SYM_NAME(reusable_node->tree->symbol), + reusable_node->tree->size.bytes); + if (!reusable_node_breakdown(reusable_node)) { + reusable_node_pop(reusable_node); parser__breakdown_top_of_stack(self, version); } continue; @@ -327,9 +385,8 @@ static Tree *parser__get_lookahead(Parser *self, StackVersion version, return self->cached_token; } - ts_lexer_reset(&self->lexer, position); - TSStateId parse_state = ts_stack_top_state(self->stack, version); - return parser__lex(self, parse_state); + *is_fresh = true; + return parser__lex(self, version); } static bool parser__select_tree(Parser *self, Tree *left, Tree *right) { @@ -407,6 +464,10 @@ static void parser__shift(Parser *self, StackVersion version, TSStateId state, bool is_pending = lookahead->child_count > 0; ts_stack_push(self->stack, version, lookahead, is_pending, state); + if (lookahead->has_external_token_state) { + ts_stack_set_external_token_state( + self->stack, version, ts_tree_last_external_token_state(lookahead)); + } ts_tree_release(lookahead); } @@ -729,9 +790,13 @@ static void parser__start(Parser *self, TSInput input, Tree *previous_tree) { LOG("new_parse"); } + if (self->language->external_scanner.reset) { + self->language->external_scanner.reset(self->external_scanner_payload); + } + ts_lexer_set_input(&self->lexer, input); ts_stack_clear(self->stack); - self->reusable_node = (ReusableNode){ previous_tree, 0 }; + self->reusable_node = reusable_node_new(previous_tree); self->cached_token = NULL; self->finished_tree = NULL; } @@ -950,30 +1015,29 @@ static void parser__recover(Parser *self, StackVersion version, TSStateId state, static void parser__advance(Parser *self, StackVersion version, ReusableNode *reusable_node) { bool validated_lookahead = false; - Tree *lookahead = parser__get_lookahead(self, version, reusable_node); + Tree *lookahead = parser__get_lookahead(self, version, reusable_node, &validated_lookahead); for (;;) { TSStateId state = ts_stack_top_state(self->stack, version); TableEntry table_entry; - ts_language_table_entry(self->language, state, lookahead->first_leaf.symbol, - &table_entry); + ts_language_table_entry(self->language, state, lookahead->first_leaf.symbol, &table_entry); if (!validated_lookahead) { if (!parser__can_reuse(self, state, lookahead, &table_entry)) { - if (lookahead == 
@@ -729,9 +790,13 @@ static void parser__start(Parser *self, TSInput input, Tree *previous_tree) {
     LOG("new_parse");
   }
 
+  if (self->language->external_scanner.reset) {
+    self->language->external_scanner.reset(self->external_scanner_payload);
+  }
+
   ts_lexer_set_input(&self->lexer, input);
   ts_stack_clear(self->stack);
-  self->reusable_node = (ReusableNode){ previous_tree, 0 };
+  self->reusable_node = reusable_node_new(previous_tree);
   self->cached_token = NULL;
   self->finished_tree = NULL;
 }
@@ -950,30 +1015,29 @@ static void parser__recover(Parser *self, StackVersion version, TSStateId state,
 
 static void parser__advance(Parser *self, StackVersion version,
                             ReusableNode *reusable_node) {
   bool validated_lookahead = false;
-  Tree *lookahead = parser__get_lookahead(self, version, reusable_node);
+  Tree *lookahead = parser__get_lookahead(self, version, reusable_node, &validated_lookahead);
 
   for (;;) {
     TSStateId state = ts_stack_top_state(self->stack, version);
 
     TableEntry table_entry;
-    ts_language_table_entry(self->language, state, lookahead->first_leaf.symbol,
-                            &table_entry);
+    ts_language_table_entry(self->language, state, lookahead->first_leaf.symbol, &table_entry);
 
     if (!validated_lookahead) {
       if (!parser__can_reuse(self, state, lookahead, &table_entry)) {
-        if (lookahead == reusable_node->tree)
-          parser__pop_reusable_node_leaf(reusable_node);
-        else
+        if (lookahead == reusable_node->tree) {
+          reusable_node_pop_leaf(reusable_node);
+        } else {
           parser__clear_cached_token(self);
+        }
 
         ts_tree_release(lookahead);
-        lookahead = parser__get_lookahead(self, version, reusable_node);
+        lookahead = parser__get_lookahead(self, version, reusable_node, &validated_lookahead);
         continue;
       }
 
       validated_lookahead = true;
-      LOG("lookahead sym:%s, size:%u", SYM_NAME(lookahead->symbol),
-          lookahead->size.bytes);
+      LOG("reused_lookahead sym:%s, size:%u", SYM_NAME(lookahead->symbol), lookahead->size.bytes);
     }
 
     bool reduction_stopped_at_error = false;
@@ -996,12 +1060,11 @@ static void parser__advance(Parser *self, StackVersion version,
     }
 
     if (lookahead->child_count > 0) {
-      if (parser__breakdown_lookahead(self, &lookahead, state,
-                                      reusable_node)) {
+      if (parser__breakdown_lookahead(self, &lookahead, state, reusable_node)) {
         if (!parser__can_reuse(self, state, lookahead, &table_entry)) {
-          parser__pop_reusable_node(reusable_node);
+          reusable_node_pop(reusable_node);
           ts_tree_release(lookahead);
-          lookahead = parser__get_lookahead(self, version, reusable_node);
+          lookahead = parser__get_lookahead(self, version, reusable_node, &validated_lookahead);
         }
       }
 
@@ -1011,7 +1074,7 @@
       parser__shift(self, version, next_state, lookahead, extra);
 
       if (lookahead == reusable_node->tree)
-        parser__pop_reusable_node(reusable_node);
+        reusable_node_pop(reusable_node);
 
       ts_tree_release(lookahead);
       return;
@@ -1053,7 +1116,7 @@
       case TSParseActionTypeRecover: {
         while (lookahead->child_count > 0) {
-          parser__breakdown_reusable_node(reusable_node);
+          reusable_node_breakdown(reusable_node);
           ts_tree_release(lookahead);
           lookahead = reusable_node->tree;
           ts_tree_retain(lookahead);
@@ -1061,7 +1124,7 @@
         parser__recover(self, version, action.params.to_state, lookahead);
 
         if (lookahead == reusable_node->tree)
-          reusable_node_pop(reusable_node);
+          reusable_node_pop(reusable_node);
 
         ts_tree_release(lookahead);
         return;
       }
@@ -1103,6 +1166,18 @@ bool parser_init(Parser *self) {
   return true;
 }
 
+void parser_set_language(Parser *self, const TSLanguage *language) {
+  if (self->external_scanner_payload && self->language->external_scanner.destroy)
+    self->language->external_scanner.destroy(self->external_scanner_payload);
+
+  if (language && language->external_scanner.create)
+    self->external_scanner_payload = language->external_scanner.create();
+  else
+    self->external_scanner_payload = NULL;
+
+  self->language = language;
+}
+
 void parser_destroy(Parser *self) {
   if (self->stack)
     ts_stack_delete(self->stack);
@@ -1112,6 +1187,7 @@ void parser_destroy(Parser *self) {
     array_delete(&self->tree_path1);
   if (self->tree_path2.contents)
     array_delete(&self->tree_path2);
+  parser_set_language(self, NULL);
 }
 
 Tree *parser_parse(Parser *self, TSInput input, Tree *old_tree) {
@@ -1128,15 +1204,14 @@ Tree *parser_parse(Parser *self, TSInput input, Tree *old_tree) {
 
     while (!ts_stack_is_halted(self->stack, version)) {
       position = ts_stack_top_position(self->stack, version).chars;
-      if (position > last_position ||
-          (version > 0 && position == last_position))
+      if (position > last_position || (version > 0 && position == last_position))
         break;
 
       LOG("process version:%d, version_count:%u, state:%d, row:%u, col:%u",
          version, ts_stack_version_count(self->stack),
          ts_stack_top_state(self->stack, version),
-          ts_stack_top_position(self->stack, version).extent.row + 1,
-          ts_stack_top_position(self->stack, version).extent.column + 1);
+          ts_stack_top_position(self->stack, version).extent.row,
+          ts_stack_top_position(self->stack, version).extent.column);
 
       parser__advance(self, version, &reusable_node);
       LOG_STACK();
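The new parser_set_language pairs every language switch with teardown of the previous external scanner payload, and parser_destroy releases the final payload by passing NULL. A usage sketch, assuming hypothetical language constructors (following this repo's naming convention) and a zero-initialized Parser:

    #include <string.h>
    #include "runtime/parser.h"

    // Hypothetical language constructors; not part of this patch.
    const TSLanguage *tree_sitter_python();
    const TSLanguage *tree_sitter_c();

    int main() {
      Parser parser;
      memset(&parser, 0, sizeof(parser));  // assumes the runtime hands parser_init zeroed memory
      parser_init(&parser);
      parser_set_language(&parser, tree_sitter_python());  // external_scanner.create runs here
      parser_set_language(&parser, tree_sitter_c());       // destroys the Python payload first
      parser_destroy(&parser);  // ends by calling parser_set_language(&parser, NULL)
      return 0;
    }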
diff --git a/src/runtime/parser.h b/src/runtime/parser.h
index 41512e12..a7b8dde3 100644
--- a/src/runtime/parser.h
+++ b/src/runtime/parser.h
@@ -8,13 +8,9 @@ extern "C" {
 #include "runtime/stack.h"
 #include "runtime/array.h"
 #include "runtime/lexer.h"
+#include "runtime/reusable_node.h"
 #include "runtime/reduce_action.h"
 
-typedef struct {
-  Tree *tree;
-  uint32_t byte_index;
-} ReusableNode;
-
 typedef struct {
   Lexer lexer;
   Stack *stack;
@@ -29,11 +25,14 @@ typedef struct {
   ReusableNode reusable_node;
   TreePath tree_path1;
   TreePath tree_path2;
+  void *external_scanner_payload;
+  Tree *last_external_token;
 } Parser;
 
 bool parser_init(Parser *);
 void parser_destroy(Parser *);
 Tree *parser_parse(Parser *, TSInput, Tree *);
+void parser_set_language(Parser *, const TSLanguage *);
 
 #ifdef __cplusplus
 }
diff --git a/src/runtime/reusable_node.h b/src/runtime/reusable_node.h
new file mode 100644
index 00000000..b9777638
--- /dev/null
+++ b/src/runtime/reusable_node.h
@@ -0,0 +1,50 @@
+#include "runtime/tree.h"
+
+typedef struct {
+  Tree *tree;
+  uint32_t byte_index;
+  bool has_preceding_external_token;
+  const TSExternalTokenState *preceding_external_token_state;
+} ReusableNode;
+
+static inline ReusableNode reusable_node_new(Tree *tree) {
+  return (ReusableNode){
+    .tree = tree,
+    .byte_index = 0,
+    .has_preceding_external_token = false,
+    .preceding_external_token_state = NULL,
+  };
+}
+
+static inline void reusable_node_pop(ReusableNode *self) {
+  self->byte_index += ts_tree_total_bytes(self->tree);
+  if (self->tree->has_external_tokens) {
+    self->has_preceding_external_token = true;
+    self->preceding_external_token_state = ts_tree_last_external_token_state(self->tree);
+  }
+
+  while (self->tree) {
+    Tree *parent = self->tree->context.parent;
+    uint32_t next_index = self->tree->context.index + 1;
+    if (parent && parent->child_count > next_index) {
+      self->tree = parent->children[next_index];
+      return;
+    }
+    self->tree = parent;
+  }
+}
+
+static inline void reusable_node_pop_leaf(ReusableNode *self) {
+  while (self->tree->child_count > 0)
+    self->tree = self->tree->children[0];
+  reusable_node_pop(self);
+}
+
+static inline bool reusable_node_breakdown(ReusableNode *self) {
+  if (self->tree->child_count == 0) {
+    return false;
+  } else {
+    self->tree = self->tree->children[0];
+    return true;
+  }
+}
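ReusableNode is a preorder cursor over the previous tree: reusable_node_pop advances byte_index past the current subtree and moves to the next sibling (or an ancestor's sibling), while reusable_node_pop_leaf first descends to the leftmost leaf. As an illustration only, the cursor can enumerate a tree's leaves:

    #include <stdio.h>
    #include "runtime/reusable_node.h"

    // Prints the byte offset at which each leaf (including its padding) begins.
    // Assumes parent pointers have been assigned (see ts_tree_assign_parents).
    static void print_leaf_offsets(Tree *root) {
      ReusableNode cursor = reusable_node_new(root);
      while (cursor.tree) {
        printf("leaf at byte %u\n", cursor.byte_index);
        reusable_node_pop_leaf(&cursor);  // descend to the leftmost leaf, then step past it
      }
    }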
diff --git a/src/runtime/stack.c b/src/runtime/stack.c
index bdc5945c..fc875396 100644
--- a/src/runtime/stack.c
+++ b/src/runtime/stack.c
@@ -50,6 +50,7 @@ typedef struct {
   StackNode *node;
   bool is_halted;
   unsigned push_count;
+  const TSExternalTokenState *external_token_state;
 } StackHead;
 
 struct Stack {
@@ -168,11 +169,13 @@ static void stack_node_add_link(StackNode *self, StackLink link) {
 }
 
 static StackVersion ts_stack__add_version(Stack *self, StackNode *node,
-                                          unsigned push_count) {
+                                          unsigned push_count,
+                                          const TSExternalTokenState *external_token_state) {
   StackHead head = {
     .node = node,
     .is_halted = false,
     .push_count = push_count,
+    .external_token_state = external_token_state,
   };
   array_push(&self->heads, head);
   stack_node_retain(node);
@@ -180,7 +183,8 @@ static StackVersion ts_stack__add_version(Stack *self, StackNode *node,
 }
 
 static void ts_stack__add_slice(Stack *self, StackNode *node, TreeArray *trees,
-                                unsigned push_count) {
+                                unsigned push_count,
+                                const TSExternalTokenState *external_token_state) {
   for (uint32_t i = self->slices.size - 1; i + 1 > 0; i--) {
     StackVersion version = self->slices.contents[i].version;
     if (self->heads.contents[version].node == node) {
@@ -190,7 +194,7 @@ static void ts_stack__add_slice(Stack *self, StackNode *node, TreeArray *trees,
     }
   }
 
-  StackVersion version = ts_stack__add_version(self, node, push_count);
+  StackVersion version = ts_stack__add_version(self, node, push_count, external_token_state);
   StackSlice slice = { *trees, version };
   array_push(&self->slices, slice);
 }
@@ -202,6 +206,7 @@ INLINE StackPopResult stack__iter(Stack *self, StackVersion version,
   StackHead *head = array_get(&self->heads, version);
 
   unsigned push_count = head->push_count;
+  const TSExternalTokenState *external_token_state = head->external_token_state;
   Iterator iterator = {
     .node = head->node,
     .trees = array_new(),
@@ -229,7 +234,8 @@ INLINE StackPopResult stack__iter(Stack *self, StackVersion version,
       if (!should_stop)
         ts_tree_array_copy(trees, &trees);
       array_reverse(&trees);
-      ts_stack__add_slice(self, node, &trees, push_count + iterator->push_count);
+      ts_stack__add_slice(self, node, &trees, push_count + iterator->push_count,
+                          external_token_state);
     }
 
     if (should_stop) {
@@ -288,7 +294,12 @@ Stack *ts_stack_new() {
   self->base_node =
     stack_node_new(NULL, NULL, false, 1, length_zero(), &self->node_pool);
   stack_node_retain(self->base_node);
-  array_push(&self->heads, ((StackHead){ self->base_node, false, 0 }));
+  array_push(&self->heads, ((StackHead){
+    self->base_node,
+    false,
+    0,
+    NULL
+  }));
 
   return self;
 }
@@ -327,11 +338,19 @@ unsigned ts_stack_push_count(const Stack *self, StackVersion version) {
   return array_get(&self->heads, version)->push_count;
 }
 
-void ts_stack_decrease_push_count(const Stack *self, StackVersion version,
+void ts_stack_decrease_push_count(Stack *self, StackVersion version,
                                   unsigned decrement) {
   array_get(&self->heads, version)->push_count -= decrement;
 }
 
+const TSExternalTokenState *ts_stack_external_token_state(const Stack *self, StackVersion version) {
+  return array_get(&self->heads, version)->external_token_state;
+}
+
+void ts_stack_set_external_token_state(Stack *self, StackVersion version, const TSExternalTokenState *state) {
+  array_get(&self->heads, version)->external_token_state = state;
+}
+
 ErrorStatus ts_stack_error_status(const Stack *self, StackVersion version) {
   StackHead *head = array_get(&self->heads, version);
   return (ErrorStatus){
@@ -480,7 +499,8 @@ bool ts_stack_merge(Stack *self, StackVersion version, StackVersion new_version)
     if (new_node->state == node->state &&
         new_node->position.chars == node->position.chars &&
         new_node->error_count == node->error_count &&
-        new_node->error_cost == node->error_cost) {
+        new_node->error_cost == node->error_cost &&
+        new_head->external_token_state == head->external_token_state) {
       for (uint32_t j = 0; j < new_node->link_count; j++)
         stack_node_add_link(node, new_node->links[j]);
       if (new_head->push_count > head->push_count)
@@ -505,7 +525,12 @@ void ts_stack_clear(Stack *self) {
   for (uint32_t i = 0; i < self->heads.size; i++)
     stack_node_release(self->heads.contents[i].node, &self->node_pool);
   array_clear(&self->heads);
-  array_push(&self->heads, ((StackHead){ self->base_node, false, 0 }));
+  array_push(&self->heads, ((StackHead){
+    self->base_node,
+    false,
+    0,
+    NULL
+  }));
 }
 
 bool ts_stack_print_dot_graph(Stack *self, const char **symbol_names, FILE *f) {
@@ -528,8 +553,20 @@ bool ts_stack_print_dot_graph(Stack *self, const char **symbol_names, FILE *f) {
     fprintf(
       f,
       "node_head_%u -> node_%p [label=%u, fontcolor=blue, weight=10000, "
-      "labeltooltip=\"push_count: %u\"]\n",
+      "labeltooltip=\"push_count: %u",
      i, head->node, i, head->push_count);
+
+    if (head->external_token_state) {
+      const TSExternalTokenState *s = head->external_token_state;
+      fprintf(f,
+        "\nexternal_token_state: "
+        "%2X %2X %2X %2X %2X %2X %2X %2X %2X %2X %2X %2X %2X %2X %2X %2X",
+        (*s)[0], (*s)[1], (*s)[2], (*s)[3], (*s)[4], (*s)[5], (*s)[6], (*s)[7],
+        (*s)[8], (*s)[9], (*s)[10], (*s)[11], (*s)[12], (*s)[13], (*s)[14], (*s)[15]
+      );
+    }
+
+    fprintf(f, "\"]\n");
 
     array_push(&self->iterators, ((Iterator){.node = head->node }));
   }
diff --git a/src/runtime/stack.h b/src/runtime/stack.h
index 64d9842b..2e88d72a 100644
--- a/src/runtime/stack.h
+++ b/src/runtime/stack.h
@@ -65,7 +65,11 @@ TSStateId ts_stack_top_state(const Stack *, StackVersion);
 
 unsigned ts_stack_push_count(const Stack *, StackVersion);
 
-void ts_stack_decrease_push_count(const Stack *, StackVersion, unsigned);
+void ts_stack_decrease_push_count(Stack *, StackVersion, unsigned);
+
+const TSExternalTokenState *ts_stack_external_token_state(const Stack *, StackVersion);
+
+void ts_stack_set_external_token_state(Stack *, StackVersion, const TSExternalTokenState *);
 
 /*
  * Get the position at the top of the given version of the stack. If the stack
diff --git a/src/runtime/tree.c b/src/runtime/tree.c
index c94b1f9f..858ad90e 100644
--- a/src/runtime/tree.c
+++ b/src/runtime/tree.c
@@ -25,10 +25,7 @@ Tree *ts_tree_make_leaf(TSSymbol sym, Length padding, Length size,
     .visible = metadata.visible,
     .named = metadata.named,
     .has_changes = false,
-    .first_leaf = {
-      .symbol = sym,
-      .lex_state = 0
-    }
+    .first_leaf.symbol = sym,
   };
   return result;
 }
@@ -111,6 +108,8 @@ void ts_tree_set_children(Tree *self, uint32_t child_count, Tree **children) {
   self->named_child_count = 0;
   self->visible_child_count = 0;
   self->error_cost = 0;
+  self->has_external_tokens = false;
+  self->has_external_token_state = false;
 
   for (uint32_t i = 0; i < child_count; i++) {
     Tree *child = children[i];
@@ -128,11 +127,14 @@ void ts_tree_set_children(Tree *self, uint32_t child_count, Tree **children) {
         self->visible_child_count++;
       if (child->named)
         self->named_child_count++;
-    } else {
+    } else if (child->child_count > 0) {
       self->visible_child_count += child->visible_child_count;
       self->named_child_count += child->named_child_count;
     }
 
+    if (child->has_external_tokens) self->has_external_tokens = true;
+    if (child->has_external_token_state) self->has_external_token_state = true;
+
     if (child->symbol == ts_builtin_sym_error) {
       self->fragile_left = self->fragile_right = true;
       self->parse_state = TS_TREE_STATE_NONE;
@@ -377,6 +379,21 @@ void ts_tree_edit(Tree *self, const TSInputEdit *edit) {
   }
 }
 
+const TSExternalTokenState *ts_tree_last_external_token_state(const Tree *tree) {
+  while (tree->child_count > 0) {
+    for (uint32_t i = tree->child_count - 1; i + 1 > 0; i--) {
+      Tree *child = tree->children[i];
+      if (child->has_external_token_state) {
+        tree = child;
+        break;
+      } else if (child->has_external_tokens) {
+        return NULL;
+      }
+    }
+  }
+  return &tree->external_token_state;
+}
+
 static size_t ts_tree__write_char_to_string(char *s, size_t n, int32_t c) {
   if (c == 0)
     return snprintf(s, n, "EOF");
diff --git a/src/runtime/tree.h b/src/runtime/tree.h
index c37d61ab..d5916e31 100644
--- a/src/runtime/tree.h
+++ b/src/runtime/tree.h
@@ -22,10 +22,13 @@ typedef struct Tree {
   } context;
 
   uint32_t child_count;
-  uint32_t visible_child_count;
-  uint32_t named_child_count;
   union {
-    struct Tree **children;
+    struct {
+      uint32_t visible_child_count;
+      uint32_t named_child_count;
+      struct Tree **children;
+    };
+    TSExternalTokenState external_token_state;
     int32_t lookahead_char;
   };
 
@@ -38,7 +41,7 @@ typedef struct Tree {
 
   struct {
     TSSymbol symbol;
-    TSStateId lex_state;
+    TSLexMode lex_mode;
   } first_leaf;
 
   unsigned short ref_count;
@@ -48,6 +51,8 @@ typedef struct Tree {
   bool fragile_left : 1;
   bool fragile_right : 1;
   bool has_changes : 1;
+  bool has_external_tokens : 1;
+  bool has_external_token_state : 1;
 } Tree;
 
 typedef struct {
@@ -81,6 +86,7 @@ void ts_tree_assign_parents(Tree *, TreePath *);
 void ts_tree_edit(Tree *, const TSInputEdit *edit);
 char *ts_tree_string(const Tree *, const TSLanguage *, bool include_all);
 void ts_tree_print_dot_graph(const Tree *, const TSLanguage *, FILE *);
+const TSExternalTokenState *ts_tree_last_external_token_state(const Tree *);
 
 static inline uint32_t ts_tree_total_bytes(const Tree *self) {
   return self->padding.bytes + self->size.bytes;
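One note on the union above: a leaf has no children, so its two child counts and its children pointer are dead storage, and the serialized external token state is overlaid onto exactly that space. An illustrative compile-time check, resting on a layout assumption about common LP64 targets rather than anything guaranteed by the patch:

    #include <assert.h>
    #include "tree_sitter/parser.h"

    // Two uint32_t counts plus a 64-bit pointer occupy 16 bytes, the same as
    // the serialized external-token state, so leaves pay no extra memory.
    static_assert(sizeof(TSExternalTokenState) == 16,
                  "serialized external token state fits the leaf's unused child fields");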
diff --git a/src/runtime/tree_path.h b/src/runtime/tree_path.h
index bba32718..f64dd02f 100644
--- a/src/runtime/tree_path.h
+++ b/src/runtime/tree_path.h
@@ -21,20 +21,20 @@ static void range_array_add(RangeArray *results, TSPoint start, TSPoint end) {
   }
 }
 
-static bool tree_path_descend(TreePath *path, TSPoint position) {
+static bool tree_path_descend(TreePath *path, Length position) {
   uint32_t original_size = path->size;
+  bool did_descend;
 
   do {
     did_descend = false;
     TreePathEntry entry = *array_back(path);
-    Length child_position = entry.position;
+    Length child_left = entry.position;
     for (uint32_t i = 0; i < entry.tree->child_count; i++) {
       Tree *child = entry.tree->children[i];
-      Length child_right_position =
-        length_add(child_position, ts_tree_total_size(child));
-      if (point_lt(position, child_right_position.extent)) {
-        TreePathEntry child_entry = { child, child_position, i };
-        if (child->visible) {
+      Length child_right = length_add(child_left, ts_tree_total_size(child));
+      if (position.bytes < child_right.bytes) {
+        TreePathEntry child_entry = { child, child_left, i };
+        if (child->visible || child->child_count == 0) {
           array_push(path, child_entry);
           return true;
         } else if (child->visible_child_count > 0) {
@@ -43,39 +43,44 @@ static bool tree_path_descend(TreePath *path, TSPoint position) {
           break;
         }
       }
-      child_position = child_right_position;
+      child_left = child_right;
     }
   } while (did_descend);
 
+  path->size = original_size;
   return false;
 }
 
 static uint32_t tree_path_advance(TreePath *path) {
   uint32_t ascend_count = 0;
+
   while (path->size > 0) {
     TreePathEntry entry = array_pop(path);
-    if (path->size == 0)
-      break;
+    if (path->size == 0) break;
 
     TreePathEntry parent_entry = *array_back(path);
     if (parent_entry.tree->visible)
       ascend_count++;
-    Length position =
-      length_add(entry.position, ts_tree_total_size(entry.tree));
+
+    Length position = length_add(entry.position, ts_tree_total_size(entry.tree));
     for (uint32_t i = entry.child_index + 1; i < parent_entry.tree->child_count;
          i++) {
       Tree *next_child = parent_entry.tree->children[i];
-      if (next_child->visible || next_child->visible_child_count > 0) {
+      if (next_child->visible ||
+          next_child->child_count == 0 ||
+          next_child->visible_child_count > 0) {
         if (parent_entry.tree->visible)
           ascend_count--;
         array_push(path, ((TreePathEntry){
                            .tree = next_child,
                            .child_index = i,
                            .position = position,
                          }));
-        if (!next_child->visible)
-          tree_path_descend(path, (TSPoint){ 0, 0 });
+        if (!next_child->visible) {
+          tree_path_descend(path, length_zero());
+        }
         return ascend_count;
       }
       position = length_add(position, ts_tree_total_size(next_child));
     }
   }
+
   return ascend_count;
 }
@@ -94,8 +99,27 @@ static void tree_path_init(TreePath *path, Tree *tree) {
     .position = { 0, 0, { 0, 0 } },
     .child_index = 0,
   }));
-  if (!tree->visible)
-    tree_path_descend(path, (TSPoint){ 0, 0 });
+  if (!tree->visible) {
+    tree_path_descend(path, length_zero());
+  }
+}
+
+Tree *tree_path_visible_tree(TreePath *self) {
+  for (uint32_t i = self->size - 1; i + 1 > 0; i--) {
+    Tree *tree = self->contents[i].tree;
+    if (tree->visible) return tree;
+  }
+  return NULL;
+}
+
+Length tree_path_start_position(TreePath *self) {
+  TreePathEntry entry = *array_back(self);
+  return length_add(entry.position, entry.tree->padding);
+}
+
+Length tree_path_end_position(TreePath *self) {
+  TreePathEntry entry = *array_back(self);
+  return length_add(length_add(entry.position, entry.tree->padding), entry.tree->size);
 }
 
 static bool tree_must_eq(Tree *old_tree, Tree *new_tree) {
@@ -112,67 +136,59 @@ static void tree_path_get_changes(TreePath *old_path, TreePath *new_path,
                                   TSRange **ranges, uint32_t *range_count) {
-  TSPoint position = { 0, 0 };
+  Length position = length_zero();
   RangeArray results = array_new();
 
   while (old_path->size && new_path->size) {
     bool is_changed = false;
-    TSPoint next_position = position;
+    Length next_position = position;
 
-    TreePathEntry old_entry = *array_back(old_path);
-    TreePathEntry new_entry = *array_back(new_path);
-    Tree *old_tree = old_entry.tree;
-    Tree *new_tree = new_entry.tree;
-    uint32_t old_start_byte = old_entry.position.bytes + old_tree->padding.bytes;
-    uint32_t new_start_byte = new_entry.position.bytes + new_tree->padding.bytes;
-    TSPoint old_start_point =
-      point_add(old_entry.position.extent, old_tree->padding.extent);
-    TSPoint new_start_point =
-      point_add(new_entry.position.extent, new_tree->padding.extent);
-    TSPoint old_end_point = point_add(old_start_point, old_tree->size.extent);
-    TSPoint new_end_point = point_add(new_start_point, new_tree->size.extent);
+    Tree *old_tree = tree_path_visible_tree(old_path);
+    Tree *new_tree = tree_path_visible_tree(new_path);
+    Length old_start = tree_path_start_position(old_path);
+    Length new_start = tree_path_start_position(new_path);
+    Length old_end = tree_path_end_position(old_path);
+    Length new_end = tree_path_end_position(new_path);
 
     // #define NAME(t) (ts_language_symbol_name(language, ((Tree *)(t))->symbol))
-    // printf("At [%-2lu, %-2lu] Compare (%-20s\t [%-2lu, %-2lu] - [%lu, %lu])\tvs\t(%-20s\t [%lu, %lu] - [%lu, %lu])\n",
-    //        position.row, position.column, NAME(old_tree), old_start_point.row,
-    //        old_start_point.column, old_end_point.row, old_end_point.column,
-    //        NAME(new_tree), new_start_point.row, new_start_point.column,
-    //        new_end_point.row, new_end_point.column);
+    // printf("At [%-2u, %-2u] Compare (%-20s\t [%-2u, %-2u] - [%u, %u])\tvs\t(%-20s\t [%u, %u] - [%u, %u])\n",
+    //        position.extent.row, position.extent.column,
+    //        NAME(old_tree), old_start.extent.row, old_start.extent.column, old_end.extent.row, old_end.extent.column,
+    //        NAME(new_tree), new_start.extent.row, new_start.extent.column, new_end.extent.row, new_end.extent.column);
 
-    if (point_lt(position, old_start_point)) {
-      if (point_lt(position, new_start_point)) {
-        next_position = point_min(old_start_point, new_start_point);
+    if (position.bytes < old_start.bytes) {
+      if (position.bytes < new_start.bytes) {
+        next_position = length_min(old_start, new_start);
       } else {
         is_changed = true;
-        next_position = old_start_point;
+        next_position = old_start;
       }
-    } else if (point_lt(position, new_start_point)) {
+    } else if (position.bytes < new_start.bytes) {
       is_changed = true;
-      next_position = new_start_point;
-    } else if (old_start_byte == new_start_byte &&
-               tree_must_eq(old_tree, new_tree)) {
-      next_position = old_end_point;
+      next_position = new_start;
+    } else if (old_start.bytes == new_start.bytes && tree_must_eq(old_tree, new_tree)) {
+      next_position = old_end;
     } else if (old_tree->symbol == new_tree->symbol) {
       if (tree_path_descend(old_path, position)) {
         if (!tree_path_descend(new_path, position)) {
           tree_path_ascend(old_path, 1);
           is_changed = true;
-          next_position = new_end_point;
+          next_position = new_end;
         }
       } else if (tree_path_descend(new_path, position)) {
         tree_path_ascend(new_path, 1);
         is_changed = true;
-        next_position = old_end_point;
+        next_position = old_end;
       } else {
-        next_position = point_min(old_end_point, new_end_point);
+        next_position = length_min(old_end, new_end);
       }
     } else {
       is_changed = true;
-      next_position = point_min(old_end_point, new_end_point);
+      next_position = length_min(old_end, new_end);
     }
 
-    bool at_old_end = point_lte(old_end_point, next_position);
-    bool at_new_end = point_lte(new_end_point, next_position);
+    bool at_old_end = old_end.bytes <= next_position.bytes;
+    bool at_new_end = new_end.bytes <= next_position.bytes;
 
     if (at_new_end && at_old_end) {
       uint32_t old_ascend_count = tree_path_advance(old_path);
@@ -190,7 +206,7 @@ static void tree_path_get_changes(TreePath *old_path, TreePath *new_path,
       tree_path_ascend(new_path, ascend_count);
     }
 
-    if (is_changed) range_array_add(&results, position, next_position);
+    if (is_changed) range_array_add(&results, position.extent, next_position.extent);
 
     position = next_position;
   }
diff --git a/todo.md b/todo.md
deleted file mode 100644
index 0fd7f7b0..00000000
--- a/todo.md
+++ /dev/null
@@ -1,32 +0,0 @@
-TODO
-====
-
-### Handling ambiguity (GLR)
-* Add a simple way to specify syntactic ambiguity resolutions in the Grammar (e.g. 'prefer declarations to statements' in C), similar to bison's `dprec`
-construct.
-
-### Runtime System
-* Refactoring: make separate symbol for unexpected characters than for interior error nodes.
-
-### Testing / Quality
-* Start running the clang-analyzer on the codebase on Travis-CI.
-* Use the Valgrind leak checker to fix the memory leaks in the runtime library.
-* Randomize the editing in the language tests, using a seed that can be specified in order to reproduce failures.
-
-### Ubiquitous token handling
-* Fix the unintuitive tree that results when ubiquitous tokens are last child of their parent node.
-
-### Error handling
-* Use information about nesting depth of tokens like '(' and ')' to make error recovery more accurate.
-
-### Grammar Features
-* Regexp assertions
-  - [ ] '^'
-  - [ ] '$'
-  - [ ] '\b'
-* Composing languages
-  - [ ] Rule for referencing named grammar
-  - [ ] Grammar registry object in runtime
-  - [ ] Parsing returns control to parent language
-* Indentation tokens