Fix handling of ubiquitous tokens used in grammar rules

This commit is contained in:
Max Brunsfeld 2014-07-01 20:47:35 -07:00
parent 59cc65c2e3
commit 83a1b9439e
30 changed files with 39086 additions and 32890 deletions

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -2,6 +2,7 @@
#define TREE_SITTER_COMPILER_H_
#include <vector>
#include <set>
#include <string>
#include <memory>
@ -29,8 +30,8 @@ namespace tree_sitter {
class Grammar {
protected:
const std::vector<std::pair<std::string, rules::rule_ptr>> rules_;
std::vector<std::string> ubiquitous_tokens_;
std::vector<char> separators_;
std::set<std::string> ubiquitous_tokens_;
std::set<char> separators_;
public:
Grammar(const std::vector<std::pair<std::string, rules::rule_ptr>> &rules);
@ -39,10 +40,10 @@ namespace tree_sitter {
const rules::rule_ptr rule(const std::string &name) const;
const std::vector<std::pair<std::string, rules::rule_ptr>> & rules() const;
const std::vector<std::string> & ubiquitous_tokens() const;
Grammar & ubiquitous_tokens(const std::vector<std::string> &ubiquitous_tokens);
const std::vector<char> & separators() const;
Grammar & separators(const std::vector<char> &separators);
const std::set<std::string> & ubiquitous_tokens() const;
Grammar & ubiquitous_tokens(const std::set<std::string> &ubiquitous_tokens);
const std::set<char> & separators() const;
Grammar & separators(const std::set<char> &separators);
};
struct Conflict {

View file

@ -5,7 +5,6 @@
extern "C" {
#endif
//#define TS_DEBUG_PARSE
// #define TS_DEBUG_LEX
#include "tree_sitter/runtime.h"

View file

@ -13,6 +13,7 @@ typedef enum {
TSParseActionTypeShift,
TSParseActionTypeShiftExtra,
TSParseActionTypeReduce,
TSParseActionTypeReduceExtra,
TSParseActionTypeAccept,
} TSParseActionType;
@ -33,6 +34,9 @@ typedef struct {
#define SHIFT_EXTRA() \
{ .type = TSParseActionTypeShiftExtra }
#define REDUCE_EXTRA(symbol_val) \
{ .type = TSParseActionTypeReduceExtra, .data = { .symbol = symbol_val } }
#define REDUCE(symbol_val, child_count_val) \
{ .type = TSParseActionTypeReduce, .data = { .symbol = symbol_val, .child_count = child_count_val } }

View file

@ -19,7 +19,7 @@ describe("building parse tables", []() {
LexicalGrammar lex_grammar({
{ "token0", pattern("[a-c]") },
{ "token1", pattern("[b-d]") },
}, {}, {});
}, {});
it("first looks for the start rule and its item set closure", [&]() {
auto result = build_parse_table(parse_grammar, lex_grammar);

View file

@ -14,13 +14,13 @@ describe("resolving parse conflicts", []() {
SyntaxGrammar parse_grammar({
{ "rule1", seq({ sym("rule2"), sym("token2") }) },
{ "rule2", sym("token1") },
}, {}, {});
}, {}, set<rules::Symbol>());
LexicalGrammar lex_grammar({
{ "token1", pattern("[a-c]") },
{ "token2", pattern("[b-d]") },
{ "token3", keyword("stuff") },
}, {}, {});
}, {}, set<char>());
describe("lexical conflicts", [&]() {
Symbol sym1(0, SymbolOptionToken);

View file

@ -46,7 +46,7 @@ describe("computing FIRST sets", []() {
i_token(2),
i_token(3),
i_token(4) }) }
}, {}, {});
}, {});
AssertThat(first_set(rule, grammar), Equals(set<Symbol>({
Symbol(0, SymbolOptionToken),
@ -63,7 +63,7 @@ describe("computing FIRST sets", []() {
{ "rule0", choice({
i_token(0),
blank() }) }
}, {}, {});
}, {});
AssertThat(first_set(rule, grammar), Equals(set<Symbol>({
Symbol(0, SymbolOptionToken),
@ -79,7 +79,7 @@ describe("computing FIRST sets", []() {
seq({ i_sym(0), i_token(10) }),
i_token(11),
}) },
}, {}, {});
}, {});
auto rule = i_sym(0);

View file

@ -16,7 +16,7 @@ describe("computing closures of item sets", []() {
{ "T", seq({
i_token(12),
i_token(13) }) },
}, {}, {});
}, {});
it("adds items at the beginnings of referenced rules", [&]() {
ParseItemSet item_set = item_set_closure(ParseItem(Symbol(0), grammar.rule(Symbol(0)), 0),

View file

@ -31,7 +31,7 @@ describe("syntactic item set transitions", [&]() {
SyntaxGrammar grammar({
{ "A", blank() },
{ "B", i_token(21) },
}, {}, {});
}, {}, set<Symbol>());
it("computes the closure of the new item sets", [&]() {
ParseItemSet set1({

View file

@ -63,7 +63,7 @@ describe("checking if rules can be blank", [&]() {
{ "B", choice({
seq({ i_sym(1), i_token(12) }),
i_token(13) }) },
}, {}, {});
}, {}, set<Symbol>());
it("terminates for left-recursive rules that can be blank", [&]() {
rule = i_sym(0);

View file

@ -5,7 +5,7 @@ int main(int argc, char *argv[]) {
"",
"--no-color",
"--only="
// "compiles the javascript"
""
};
return bandit::run(4, const_cast<char **>(args));
}

View file

@ -12,7 +12,7 @@ describe("expanding repeat rules in a grammar", []() {
it("replaces repeat rules with pairs of recursive rules", [&]() {
SyntaxGrammar grammar({
{ "rule0", repeat(i_token(0)) },
}, {}, {});
}, {}, set<Symbol>());
auto match = expand_repeats(grammar);
@ -28,7 +28,7 @@ describe("expanding repeat rules in a grammar", []() {
it("replaces repeats inside of sequences", [&]() {
SyntaxGrammar grammar({
{ "rule0", seq({ i_token(10), repeat(i_token(11)) }) },
}, {}, {});
}, {}, set<Symbol>());
auto match = expand_repeats(grammar);
@ -46,7 +46,7 @@ describe("expanding repeat rules in a grammar", []() {
it("replaces repeats inside of choices", [&]() {
SyntaxGrammar grammar({
{ "rule0", choice({ i_token(10), repeat(i_token(11)) }) },
}, {}, {});
}, {}, set<Symbol>());
auto match = expand_repeats(grammar);
@ -64,7 +64,7 @@ describe("expanding repeat rules in a grammar", []() {
it("can replace multiple repeats in the same rule", [&]() {
SyntaxGrammar grammar({
{ "rule0", seq({ repeat(i_token(10)), repeat(i_token(11)) }) },
}, {}, {});
}, {}, set<Symbol>());
auto match = expand_repeats(grammar);
@ -90,7 +90,7 @@ describe("expanding repeat rules in a grammar", []() {
SyntaxGrammar grammar({
{ "rule0", repeat(i_token(10)) },
{ "rule1", repeat(i_token(11)) },
}, {}, {});
}, {}, set<Symbol>());
auto match = expand_repeats(grammar);

View file

@ -15,7 +15,7 @@ describe("expanding token rules", []() {
i_sym(10),
pattern("x*"),
i_sym(11) }) },
}, {}, {});
}, {});
auto result = expand_tokens(grammar);
@ -34,7 +34,7 @@ describe("expanding token rules", []() {
i_sym(10),
str("xyz"),
i_sym(11) }) },
}, {}, {});
}, {});
auto result = expand_tokens(grammar);
@ -53,7 +53,7 @@ describe("expanding token rules", []() {
pattern("("),
str("xyz"),
pattern("[") }) },
}, {}, {});
}, {});
auto result = expand_tokens(grammar);

View file

@ -17,8 +17,8 @@ describe("extracting tokens from a grammar", []() {
{
{ "rule_A", seq({ str("ab"), i_sym(0) }) }
},
{},
{}
set<Symbol>(),
set<char>()
});
AssertThat(result.first.rules, Equals(rule_list({
@ -36,8 +36,8 @@ describe("extracting tokens from a grammar", []() {
{
{ "rule_A", seq({ pattern("a+"), i_sym(0) }) }
},
{},
{}
set<Symbol>(),
set<char>()
});
AssertThat(result.first.rules, Equals(rule_list({
@ -57,8 +57,8 @@ describe("extracting tokens from a grammar", []() {
token(seq({ pattern("."), choice({ str("a"), str("b") }) })),
i_sym(0) }) }
},
{},
{}
set<Symbol>(),
set<char>()
});
AssertThat(result.first.rules, Equals(rule_list({
@ -76,8 +76,8 @@ describe("extracting tokens from a grammar", []() {
{
{ "rule_A", choice({ i_sym(0), blank() }) },
},
{},
{}
set<Symbol>(),
set<char>()
});
AssertThat(result.first.rules, Equals(rule_list({
@ -93,8 +93,8 @@ describe("extracting tokens from a grammar", []() {
{
{ "rule_A", seq({ str("ab"), i_sym(0), str("ab") }) },
},
{},
{}
set<Symbol>(),
set<char>()
});
AssertThat(result.first.rules, Equals(rule_list({
@ -112,11 +112,11 @@ describe("extracting tokens from a grammar", []() {
{
{ "rule_A", str("ab") },
},
{},
set<Symbol>(),
{ 'x', 'y', 'z' }
});
AssertThat(result.second.separators, Equals(vector<char>({ 'x', 'y', 'z' })));
AssertThat(result.second.separators, Equals(set<char>({ 'x', 'y', 'z' })));
});
describe("when an entire rule can be extracted", [&]() {
@ -127,8 +127,8 @@ describe("extracting tokens from a grammar", []() {
{ "rule_B", pattern("a|b") },
{ "rule_C", token(seq({ str("a"), str("b") })) },
},
{},
{}
set<Symbol>(),
set<char>()
});
AssertThat(result.first.rules, Equals(rule_list({
@ -149,8 +149,8 @@ describe("extracting tokens from a grammar", []() {
{ "rule_B", i_sym(0) },
{ "rule_C", i_sym(1) },
},
{},
{}
set<Symbol>(),
set<char>()
});
AssertThat(result.first.rules, Equals(rule_list({
@ -172,10 +172,10 @@ describe("extracting tokens from a grammar", []() {
{ "rule_C", i_sym(1) },
},
{ Symbol(0) },
{}
set<char>()
});
AssertThat(result.first.ubiquitous_tokens, Equals(vector<Symbol>({
AssertThat(result.first.ubiquitous_tokens, Equals(set<Symbol>({
{ Symbol(0, SymbolOptionToken) }
})));
});

View file

@ -50,7 +50,7 @@ describe("interning symbols in a grammar", []() {
auto result = intern_symbols(grammar);
AssertThat(result.second, Equals((GrammarError *)nullptr));
AssertThat(result.first.ubiquitous_tokens, Equals(vector<Symbol>({
AssertThat(result.first.ubiquitous_tokens, Equals(set<Symbol>({
Symbol(2)
})));
});
@ -62,7 +62,7 @@ describe("interning symbols in a grammar", []() {
auto result = intern_symbols(grammar);
AssertThat(result.first.separators, Equals(vector<char>({ 'x', 'y' })))
AssertThat(result.first.separators, Equals(set<char>({ 'x', 'y' })))
});
});

View file

@ -24,7 +24,7 @@ moreStuff();
recovers from errors in for loops
==========================================
stuff();
for (var i = 0; *nonsense*; *what*) {
for (var i = 0; *nonsense*; i++) {
*more-nonsense*;
}
moreStuff();
@ -34,7 +34,7 @@ moreStuff();
(for_statement
(var_declaration (assignment (identifier) (number)))
(expression_statement (ERROR '*'))
(ERROR '*')
(math_op (identifier))
(statement_block (expression_statement (ERROR '*'))))
(expression_statement (function_call (identifier))))

View file

@ -34,6 +34,18 @@ print(object.property);
(identifier)
(property_access (identifier) (identifier)))))
==========================================
parses property access across lines
==========================================
object
.someProperty
.otherProperty
---
(program (expression_statement
(property_access
(property_access (identifier) (identifier))
(identifier))))
==========================================
parses dynamic property access
==========================================

View file

@ -11,12 +11,16 @@
#include "compiler/build_tables/parse_item.h"
#include "compiler/build_tables/item_set_closure.h"
#include "compiler/build_tables/item_set_transitions.h"
#include "compiler/build_tables/first_set.h"
#include <iostream>
namespace tree_sitter {
using std::pair;
using std::string;
using std::vector;
using std::set;
using std::map;
using std::unordered_map;
using std::make_shared;
using rules::Symbol;
@ -33,8 +37,8 @@ namespace tree_sitter {
if (pair == parse_state_ids.end()) {
ParseStateId state_id = parse_table.add_state();
parse_state_ids[item_set] = state_id;
add_shift_actions(item_set, state_id);
add_reduce_actions(item_set, state_id);
add_shift_actions(item_set, state_id);
add_ubiquitous_token_actions(item_set, state_id);
return state_id;
} else {
@ -43,26 +47,34 @@ namespace tree_sitter {
}
void add_shift_actions(const ParseItemSet &item_set, ParseStateId state_id) {
map<Symbol, size_t> shifts;
for (const auto &transition : sym_transitions(item_set, grammar)) {
const Symbol &symbol = transition.first;
const ParseItemSet &next_item_set = transition.second;
auto &actions = parse_table.states[state_id].actions;
auto current_action = actions.find(symbol);
set<int> precedence_values = precedence_values_for_item_set(next_item_set);
if (current_action == actions.end() ||
conflict_manager.resolve_parse_action(symbol, current_action->second, ParseAction::Shift(0, precedence_values))) {
ParseAction new_action = ParseAction::Shift(0, precedence_values_for_item_set(next_item_set));
if (should_add_action(state_id, symbol, new_action)) {
ParseStateId new_state_id = add_parse_state(next_item_set);
parse_table.add_action(state_id, symbol, ParseAction::Shift(new_state_id, precedence_values));
new_action.state_index = new_state_id;
parse_table.add_action(state_id, symbol, new_action);
shifts.insert({ symbol, new_state_id });
}
}
}
void add_ubiquitous_token_actions(const ParseItemSet &item_set, ParseStateId state_id) {
for (const Symbol &symbol : grammar.ubiquitous_tokens) {
auto &actions = parse_table.states[state_id].actions;
if (actions.find(symbol) == actions.end())
parse_table.add_action(state_id, symbol, ParseAction::ShiftExtra());
for (auto &pair : shifts) {
const Symbol &shift_symbol = pair.first;
size_t new_state_id = pair.second;
if (grammar.ubiquitous_tokens.find(shift_symbol) != grammar.ubiquitous_tokens.end()) {
for (const auto &pair : parse_table.states[state_id].actions) {
const Symbol &lookahead_sym = pair.first;
ParseAction action = ParseAction::ReduceExtra(shift_symbol);
if (should_add_action(new_state_id, lookahead_sym, action))
parse_table.add_action(new_state_id, lookahead_sym, action);
}
}
}
}
@ -75,19 +87,39 @@ namespace tree_sitter {
ParseAction action = (item.lhs == rules::START()) ?
ParseAction::Accept() :
ParseAction::Reduce(item.lhs, item.consumed_symbol_count, item.precedence());
for (auto &lookahead_sym : lookahead_symbols) {
auto current_actions = parse_table.states[state_id].actions;
auto current_action = current_actions.find(lookahead_sym);
if (current_action == current_actions.end() ||
conflict_manager.resolve_parse_action(lookahead_sym, current_action->second, action)) {
for (auto &lookahead_sym : lookahead_symbols)
if (should_add_action(state_id, lookahead_sym, action))
parse_table.add_action(state_id, lookahead_sym, action);
}
}
}
}
}
void add_ubiquitous_token_actions(const ParseItemSet &item_set, ParseStateId state_id) {
for (const Symbol &symbol : grammar.ubiquitous_tokens) {
auto &actions = parse_table.states[state_id].actions;
if (actions.find(symbol) == actions.end())
parse_table.add_action(state_id, symbol, ParseAction::ShiftExtra());
}
}
set<Symbol> first_set_for_item_set(const ParseItemSet &item_set) {
set<Symbol> result;
for (const auto &pair : item_set) {
auto new_set = first_set(pair.first.rule, grammar);
result.insert(new_set.begin(), new_set.end());
}
return result;
}
bool should_add_action(size_t state_id, const Symbol &symbol, const ParseAction &action) {
auto current_actions = parse_table.states[state_id].actions;
auto current_action = current_actions.find(symbol);
return (
current_action == current_actions.end() ||
conflict_manager.resolve_parse_action(symbol, current_action->second, action)
);
}
set<int> precedence_values_for_item_set(const ParseItemSet &item_set) {
set<int> result;
for (const auto &pair : item_set) {

View file

@ -290,6 +290,9 @@ namespace tree_sitter {
symbol_id(action.symbol) + ", " +
to_string(action.consumed_symbol_count) + ")");
break;
case ParseActionTypeReduceExtra:
add("REDUCE_EXTRA(" + symbol_id(action.symbol) + ")");
break;
default:;
}
}

View file

@ -2,9 +2,10 @@
#include "compiler/rules/rule.h"
namespace tree_sitter {
using std::string;
using std::ostream;
using std::pair;
using std::set;
using std::string;
using std::vector;
using rules::rule_ptr;
@ -59,20 +60,20 @@ namespace tree_sitter {
return stream << string("#<null>");
}
const vector<string> & Grammar::ubiquitous_tokens() const {
const set<string> & Grammar::ubiquitous_tokens() const {
return ubiquitous_tokens_;
}
Grammar & Grammar::ubiquitous_tokens(const vector<string> &ubiquitous_tokens) {
Grammar & Grammar::ubiquitous_tokens(const set<string> &ubiquitous_tokens) {
ubiquitous_tokens_ = ubiquitous_tokens;
return *this;
}
const vector<char> & Grammar::separators() const {
const set<char> & Grammar::separators() const {
return separators_;
}
Grammar & Grammar::separators(const vector<char> &separators) {
Grammar & Grammar::separators(const set<char> &separators) {
separators_ = separators;
return *this;
}

View file

@ -39,7 +39,11 @@ namespace tree_sitter {
}
ParseAction ParseAction::ShiftExtra() {
return ParseAction(ParseActionTypeShiftExtra, -1, Symbol(-1), 0, set<int>({}));
return ParseAction(ParseActionTypeShiftExtra, -1, Symbol(-1), 0, { 0 });
}
ParseAction ParseAction::ReduceExtra(Symbol symbol) {
return ParseAction(ParseActionTypeReduceExtra, -1, symbol, 0, { 0 });
}
ParseAction ParseAction::Reduce(Symbol symbol, size_t consumed_symbol_count, int precedence) {

View file

@ -14,6 +14,7 @@ namespace tree_sitter {
ParseActionTypeShift,
ParseActionTypeShiftExtra,
ParseActionTypeReduce,
ParseActionTypeReduceExtra,
ParseActionTypeAccept,
} ParseActionType;
@ -28,8 +29,9 @@ namespace tree_sitter {
static ParseAction Accept();
static ParseAction Error();
static ParseAction Shift(size_t state_index, std::set<int> precedence_values);
static ParseAction ShiftExtra();
static ParseAction Reduce(rules::Symbol symbol, size_t consumed_symbol_count, int precedence);
static ParseAction ShiftExtra();
static ParseAction ReduceExtra(rules::Symbol symbol);
bool operator==(const ParseAction &action) const;
ParseActionType type;

View file

@ -51,14 +51,14 @@ namespace tree_sitter {
for (auto &pair : grammar.rules) {
auto rule = expander.apply(pair.second);
if (expander.error)
return { LexicalGrammar({}, {}, {}), expander.error };
return { LexicalGrammar(), expander.error };
rules.push_back({ pair.first, rule });
}
for (auto &pair : grammar.aux_rules) {
auto rule = expander.apply(pair.second);
if (expander.error)
return { LexicalGrammar({}, {}, {}), expander.error };
return { LexicalGrammar(), expander.error };
aux_rules.push_back({ pair.first, rule });
}

View file

@ -1,6 +1,7 @@
#include "compiler/prepare_grammar/extract_tokens.h"
#include <map>
#include <vector>
#include <set>
#include <string>
#include "tree_sitter/compiler.h"
#include "compiler/prepared_grammar.h"
@ -18,6 +19,7 @@ namespace tree_sitter {
using std::map;
using std::to_string;
using std::vector;
using std::set;
using std::make_shared;
using rules::rule_ptr;
using rules::Symbol;
@ -96,7 +98,7 @@ namespace tree_sitter {
pair<SyntaxGrammar, LexicalGrammar> extract_tokens(const InternedGrammar &input_grammar) {
vector<pair<string, rule_ptr>> rules, tokens, aux_rules, aux_tokens;
vector<Symbol> ubiquitous_tokens;
set<Symbol> ubiquitous_tokens;
TokenExtractor extractor;
map<Symbol, Symbol> symbol_replacements;
@ -120,7 +122,7 @@ namespace tree_sitter {
for (auto &pair : rules)
pair.second = inliner.apply(pair.second);
for (auto &symbol : input_grammar.ubiquitous_tokens)
ubiquitous_tokens.push_back(inliner.replace_symbol(symbol));
ubiquitous_tokens.insert(inliner.replace_symbol(symbol));
return {
SyntaxGrammar(rules, aux_rules, ubiquitous_tokens),

View file

@ -1,6 +1,7 @@
#include "compiler/prepare_grammar/intern_symbols.h"
#include <memory>
#include <vector>
#include <set>
#include "tree_sitter/compiler.h"
#include "compiler/prepare_grammar/interned_grammar.h"
#include "compiler/prepared_grammar.h"
@ -12,6 +13,7 @@ namespace tree_sitter {
using std::string;
using rules::rule_ptr;
using std::vector;
using std::set;
using std::pair;
using std::make_shared;
@ -58,12 +60,12 @@ namespace tree_sitter {
rules.push_back({ pair.first, new_rule });
}
vector<rules::Symbol> ubiquitous_tokens;
set<rules::Symbol> ubiquitous_tokens;
for (auto &name : grammar.ubiquitous_tokens()) {
auto token = interner.symbol_for_rule_name(name);
if (!token.get())
return missing_rule_error(name);
ubiquitous_tokens.push_back(*token);
ubiquitous_tokens.insert(*token);
}
InternedGrammar result;

View file

@ -3,6 +3,7 @@
#include <utility>
#include <vector>
#include <set>
#include <string>
#include "tree_sitter/compiler.h"
#include "compiler/rules/symbol.h"
@ -12,8 +13,8 @@ namespace tree_sitter {
class InternedGrammar {
public:
std::vector<std::pair<std::string, rules::rule_ptr>> rules;
std::vector<rules::Symbol> ubiquitous_tokens;
std::vector<char> separators;
std::set<rules::Symbol> ubiquitous_tokens;
std::set<char> separators;
};
}
}

View file

@ -8,6 +8,7 @@ namespace tree_sitter {
using std::string;
using std::pair;
using std::vector;
using std::set;
const rules::rule_ptr & PreparedGrammar::rule(const rules::Symbol &symbol) const {
return symbol.is_auxiliary() ?
@ -25,6 +26,16 @@ namespace tree_sitter {
SyntaxGrammar::SyntaxGrammar() {}
LexicalGrammar::LexicalGrammar() {}
SyntaxGrammar::SyntaxGrammar(
const vector<pair<string, rules::rule_ptr>> &rules,
const vector<pair<string, rules::rule_ptr>> &aux_rules) :
PreparedGrammar(rules, aux_rules) {}
LexicalGrammar::LexicalGrammar(
const vector<pair<string, rules::rule_ptr>> &rules,
const vector<pair<string, rules::rule_ptr>> &aux_rules) :
PreparedGrammar(rules, aux_rules) {}
PreparedGrammar::PreparedGrammar(
const vector<pair<string, rules::rule_ptr>> &rules,
const vector<pair<string, rules::rule_ptr>> &aux_rules) :
@ -34,14 +45,14 @@ namespace tree_sitter {
SyntaxGrammar::SyntaxGrammar(
const vector<pair<string, rules::rule_ptr>> &rules,
const vector<pair<string, rules::rule_ptr>> &aux_rules,
const vector<rules::Symbol> &ubiquitous_tokens) :
const set<rules::Symbol> &ubiquitous_tokens) :
PreparedGrammar(rules, aux_rules),
ubiquitous_tokens(ubiquitous_tokens) {}
LexicalGrammar::LexicalGrammar(
const vector<pair<string, rules::rule_ptr>> &rules,
const vector<pair<string, rules::rule_ptr>> &aux_rules,
const vector<char> &separators) :
const set<char> &separators) :
PreparedGrammar(rules, aux_rules),
separators(separators) {}
}

View file

@ -3,6 +3,7 @@
#include <vector>
#include <string>
#include <set>
#include <utility>
#include "tree_sitter/compiler.h"
#include "compiler/rules/symbol.h"
@ -25,23 +26,29 @@ namespace tree_sitter {
class SyntaxGrammar : public PreparedGrammar {
public:
SyntaxGrammar();
SyntaxGrammar(
const std::vector<std::pair<std::string, rules::rule_ptr>> &rules,
const std::vector<std::pair<std::string, rules::rule_ptr>> &aux_rules);
SyntaxGrammar(
const std::vector<std::pair<std::string, rules::rule_ptr>> &rules,
const std::vector<std::pair<std::string, rules::rule_ptr>> &aux_rules,
const std::vector<rules::Symbol> &ubiquitous_tokens);
const std::set<rules::Symbol> &ubiquitous_tokens);
std::vector<rules::Symbol> ubiquitous_tokens;
std::set<rules::Symbol> ubiquitous_tokens;
};
class LexicalGrammar : public PreparedGrammar {
public:
LexicalGrammar();
LexicalGrammar(
const std::vector<std::pair<std::string, rules::rule_ptr>> &rules,
const std::vector<std::pair<std::string, rules::rule_ptr>> &aux_rules);
LexicalGrammar(
const std::vector<std::pair<std::string, rules::rule_ptr>> &rules,
const std::vector<std::pair<std::string, rules::rule_ptr>> &aux_rules,
const std::vector<char> &separators);
const std::set<char> &separators);
std::vector<char> separators;
std::set<char> separators;
};
}

View file

@ -9,20 +9,37 @@ static const TSParseAction * actions_for_state(TSStateMachine *machine, TSStateI
return machine->config.parse_table + (state * machine->config.symbol_count);
}
void shift(TSStateMachine *machine, TSStateId parse_state, int is_extra) {
machine->lookahead->is_extra = is_extra;
void shift(TSStateMachine *machine, TSStateId parse_state) {
if (machine->lookahead->is_extra)
parse_state = ts_stack_top_state(&machine->stack);
ts_stack_push(&machine->stack, parse_state, machine->lookahead);
machine->lookahead = machine->next_lookahead;
machine->next_lookahead = NULL;
}
void shift_extra(TSStateMachine *machine) {
machine->lookahead->is_extra = 1;
shift(machine, 0);
}
void reduce(TSStateMachine *machine, TSSymbol symbol, size_t child_count) {
machine->next_lookahead = machine->lookahead;
machine->lookahead = ts_stack_reduce(&machine->stack,
symbol,
child_count,
machine->config.hidden_symbol_flags,
1);
machine->lookahead = ts_stack_reduce(
&machine->stack,
symbol,
child_count,
machine->config.hidden_symbol_flags, 1);
}
int reduce_extra(TSStateMachine *machine, TSSymbol symbol) {
TSTree *top_node = ts_stack_top_node(&machine->stack);
if (top_node->symbol == symbol && !top_node->is_extra) {
reduce(machine, symbol, 1);
machine->lookahead->is_extra = 1;
return 1;
} else {
return 0;
}
}
static size_t breakdown_stack(TSStateMachine *machine, TSInputEdit *edit) {
@ -199,6 +216,8 @@ void ts_state_machine_initialize(TSStateMachine *machine, TSInput input, TSInput
ts_lexer_advance(&machine->lexer);
}
// #define TS_DEBUG_PARSE
#ifdef TS_DEBUG_PARSE
#include <stdio.h>
#define DEBUG_PARSE(...) fprintf(stderr, "\n" __VA_ARGS__)
@ -212,26 +231,33 @@ TSTree * ts_state_machine_parse(TSStateMachine *machine, const char **symbol_nam
switch (action.type) {
case TSParseActionTypeShift:
DEBUG_PARSE("SHIFT %d", action.data.to_state);
shift(machine, action.data.to_state, 0);
shift(machine, action.data.to_state);
return NULL;
case TSParseActionTypeShiftExtra:
DEBUG_PARSE("SHIFT EXTRA");
shift(machine, ts_stack_top_state(&machine->stack), 1);
shift_extra(machine);
return NULL;
case TSParseActionTypeReduce:
DEBUG_PARSE("REDUCE %s %d", symbol_names[action.data.symbol], action.data.child_count);
reduce(machine, action.data.symbol, action.data.child_count);
return NULL;
case TSParseActionTypeReduceExtra:
if (!reduce_extra(machine, action.data.symbol))
goto error;
DEBUG_PARSE("REDUCE EXTRA");
return NULL;
case TSParseActionTypeAccept:
DEBUG_PARSE("ACCEPT");
return get_tree_root(machine);
case TSParseActionTypeError:
DEBUG_PARSE("ERROR");
if (handle_error(machine))
return NULL;
else
return get_tree_root(machine);
goto error;
default:
return NULL;
}
error:
DEBUG_PARSE("ERROR");
if (handle_error(machine))
return NULL;
else
return get_tree_root(machine);
}