// Patch summary: carries the separator characters of the input grammar through extract_tokens into the
// LexicalGrammar, makes build_lex_table read them via separator_set() instead of a hard-coded whitespace
// set, gives Grammar a default separator list, and moves add_ubiquitous_token_actions after
// add_reduce_actions in build_parse_table.
diff --git a/spec/compiler/prepare_grammar/extract_tokens_spec.cc b/spec/compiler/prepare_grammar/extract_tokens_spec.cc
index 14bd7e5c..3025154d 100644
--- a/spec/compiler/prepare_grammar/extract_tokens_spec.cc
+++ b/spec/compiler/prepare_grammar/extract_tokens_spec.cc
@@ -107,6 +107,18 @@ describe("extracting tokens from a grammar", []() {
       })))
   });
 
+  it("preserves the separator characters in the lexical grammar", [&]() { // Spec: the third InternedGrammar initializer must come back unchanged as result.second.separators.
+    pair result = extract_tokens(InternedGrammar{ // NOTE(review): template arguments look stripped here and on the vector below - confirm against the original patch.
+      {
+        { "rule_A", str("ab") },
+      },
+      {},
+      { 'x', 'y', 'z' }
+    });
+
+    AssertThat(result.second.separators, Equals(vector({ 'x', 'y', 'z' })));
+  });
+
   describe("when an entire rule can be extracted", [&]() {
     it("moves the rule the lexical grammar when possible and updates referencing symbols", [&]() {
       auto result = extract_tokens(InternedGrammar{
diff --git a/src/compiler/build_tables/build_lex_table.cc b/src/compiler/build_tables/build_lex_table.cc
index a88bf185..63387cfa 100644
--- a/src/compiler/build_tables/build_lex_table.cc
+++ b/src/compiler/build_tables/build_lex_table.cc
@@ -93,11 +93,17 @@ namespace tree_sitter {
         lex_table.state(state_id).is_token_start = true;
       }
 
+      CharacterSet separator_set() const { // Builds a CharacterSet holding every character listed in lex_grammar.separators.
+        set ranges; // NOTE(review): element type is not visible (template arguments appear stripped) - confirm.
+        for (char c : lex_grammar.separators)
+          ranges.insert(c);
+        return CharacterSet(ranges);
+      }
+
       rules::rule_ptr after_separators(rules::rule_ptr rule) {
         return rules::Seq::Build({
           make_shared(
-            make_shared(
-              CharacterSet({ ' ', '\t', '\n', '\r' }).copy()),
+            make_shared(separator_set().copy()), // Uses the separators from lex_grammar in place of the hard-coded space/tab/LF/CR set removed above.
             map({
               {rules::START_TOKEN, 1},
               {rules::PRECEDENCE, -1},
diff --git a/src/compiler/build_tables/build_parse_table.cc b/src/compiler/build_tables/build_parse_table.cc
index 4296da29..03a013e2 100644
--- a/src/compiler/build_tables/build_parse_table.cc
+++ b/src/compiler/build_tables/build_parse_table.cc
@@ -34,8 +34,8 @@ namespace tree_sitter {
           ParseStateId state_id = parse_table.add_state();
           parse_state_ids[item_set] = state_id;
           add_shift_actions(item_set, state_id);
-          add_ubiquitous_token_actions(item_set, state_id);
           add_reduce_actions(item_set, state_id);
+          add_ubiquitous_token_actions(item_set, state_id); // Now runs after add_reduce_actions (it previously ran before). NOTE(review): the reason for the reordering is not shown in this patch - confirm.
           return state_id;
         } else {
           return pair->second;
diff --git a/src/compiler/grammar.cc b/src/compiler/grammar.cc
index a8aa7b7b..84b2b5ae 100644
--- a/src/compiler/grammar.cc
+++ b/src/compiler/grammar.cc
@@ -10,7 +10,8 @@ namespace tree_sitter {
 
   Grammar::Grammar(const std::vector> &rules) :
     rules_(rules),
-    ubiquitous_tokens_({}) {}
+    ubiquitous_tokens_({}),
+    separators_({ ' ', '\r', '\t', '\n' }) {} // Default separators: space, CR, tab, LF - the same four characters build_lex_table.cc hard-coded before this patch.
 
   bool Grammar::operator==(const Grammar &other) const {
     if (other.rules_.size() != rules_.size()) return false;
diff --git a/src/compiler/prepare_grammar/extract_tokens.cc b/src/compiler/prepare_grammar/extract_tokens.cc
index 274f9a54..ae5b391b 100644
--- a/src/compiler/prepare_grammar/extract_tokens.cc
+++ b/src/compiler/prepare_grammar/extract_tokens.cc
@@ -124,7 +124,7 @@ namespace tree_sitter {
 
       return {
         SyntaxGrammar(rules, aux_rules, ubiquitous_tokens),
-        LexicalGrammar(tokens, aux_tokens, {}),
+        LexicalGrammar(tokens, aux_tokens, input_grammar.separators), // Forwards the separators of the input grammar instead of passing an empty list.
       };
     }
   }