Make grammars' separator characters configurable

2014-06-26 07:31:08 -07:00 · 2014-06-26 07:31:08 -07:00 · a9dff20658
commit a9dff20658
parent 8aea89750d
5 changed files with 24 additions and 5 deletions
--- a/spec/compiler/prepare_grammar/extract_tokens_spec.cc
+++ b/spec/compiler/prepare_grammar/extract_tokens_spec.cc
@ -107,6 +107,18 @@ describe("extracting tokens from a grammar", []() {
        })))
    });

+    it("preserves the separator characters in the lexical grammar", [&]() {  // spec: separators given to extract_tokens reach result.second
+        pair<SyntaxGrammar, LexicalGrammar> result = extract_tokens(InternedGrammar{
+            {
+                { "rule_A", str("ab") },
+            },
+            {},
+            { 'x', 'y', 'z' }  // presumably the separators member of InternedGrammar -- confirm against its declaration
+        });
+
+        AssertThat(result.second.separators, Equals(vector<char>({ 'x', 'y', 'z' })));  // result.second is the LexicalGrammar half of the pair
+    });
+
    describe("when an entire rule can be extracted", [&]() {
        it("moves the rule the lexical grammar when possible and updates referencing symbols", [&]() {
            auto result = extract_tokens(InternedGrammar{
--- a/src/compiler/build_tables/build_lex_table.cc
+++ b/src/compiler/build_tables/build_lex_table.cc
@ -93,11 +93,17 @@ namespace tree_sitter {
                        lex_table.state(state_id).is_token_start = true;
            }

+            CharacterSet separator_set() const {  // builds one CharacterSet from the characters in lex_grammar.separators
+                set<rules::CharacterRange> ranges;
+                for (char c : lex_grammar.separators)
+                    ranges.insert(c);  // each char becomes an element of the CharacterRange set (relies on a char -> CharacterRange conversion)
+                return CharacterSet(ranges);
+            }
+
            rules::rule_ptr after_separators(rules::rule_ptr rule) {
                return rules::Seq::Build({
                    make_shared<rules::Metadata>(
-                        make_shared<rules::Repeat>(
-                            CharacterSet({ ' ', '\t', '\n', '\r' }).copy()),
+                        make_shared<rules::Repeat>(separator_set().copy()),
                        map<rules::MetadataKey, int>({
                            {rules::START_TOKEN, 1},
                            {rules::PRECEDENCE, -1},
--- a/src/compiler/build_tables/build_parse_table.cc
+++ b/src/compiler/build_tables/build_parse_table.cc
@ -34,8 +34,8 @@ namespace tree_sitter {
                    ParseStateId state_id = parse_table.add_state();
                    parse_state_ids[item_set] = state_id;
                    add_shift_actions(item_set, state_id);
-                    add_ubiquitous_token_actions(item_set, state_id);
                    add_reduce_actions(item_set, state_id);
+                    add_ubiquitous_token_actions(item_set, state_id);
                    return state_id;
                } else {
                    return pair->second;
--- a/src/compiler/grammar.cc
+++ b/src/compiler/grammar.cc
@ -10,7 +10,8 @@ namespace tree_sitter {

    Grammar::Grammar(const std::vector<std::pair<std::string, rules::rule_ptr>> &rules) :
        rules_(rules),
-        ubiquitous_tokens_({}) {}
+        ubiquitous_tokens_({}),
+        separators_({ ' ', '\r', '\t', '\n' }) {}  // initial separators: space, carriage return, tab, newline

    bool Grammar::operator==(const Grammar &other) const {
        if (other.rules_.size() != rules_.size()) return false;
--- a/src/compiler/prepare_grammar/extract_tokens.cc
+++ b/src/compiler/prepare_grammar/extract_tokens.cc
@ -124,7 +124,7 @@ namespace tree_sitter {

            return {
                SyntaxGrammar(rules, aux_rules, ubiquitous_tokens),
-                LexicalGrammar(tokens, aux_tokens, {}),
+                LexicalGrammar(tokens, aux_tokens, input_grammar.separators),
            };
        }
    }