Ensure that the word token has a low numerical index

Fixes https://github.com/tree-sitter/tree-sitter/issues/258
2019-01-17 12:44:14 -08:00 · 2019-01-17 12:44:14 -08:00 · 9f7079c9c5
commit 9f7079c9c5
parent 3d11388cd1
2 changed files with 15 additions and 5 deletions
--- a/cli/src/generate/build_tables/mod.rs
+++ b/cli/src/generate/build_tables/mod.rs
@ -172,17 +172,17 @@ fn populate_used_symbols(
            non_terminal_usages[symbol.index] = true;
        }
    }
-    for (i, value) in external_usages.into_iter().enumerate() {
-        if value {
-            parse_table.symbols.push(Symbol::external(i));
-        }
-    }
    parse_table.symbols.push(Symbol::end());
    for (i, value) in terminal_usages.into_iter().enumerate() {
        if value {
            parse_table.symbols.push(Symbol::terminal(i));
        }
    }
+    for (i, value) in external_usages.into_iter().enumerate() {
+        if value {
+            parse_table.symbols.push(Symbol::external(i));
+        }
+    }
    for (i, value) in non_terminal_usages.into_iter().enumerate() {
        if value {
            parse_table.symbols.push(Symbol::non_terminal(i));
--- a/cli/src/generate/prepare_grammar/extract_tokens.rs
+++ b/cli/src/generate/prepare_grammar/extract_tokens.rs
@ -15,6 +15,16 @@ pub(super) fn extract_tokens(
        extracted_usage_counts: Vec::new(),
    };

+    // Extract the word token first to give it a low numerical index. This ensures that
+    // it can be stored in a subtree with no heap allocations, even for grammars with
+    // very large numbers of tokens. This is an optimization, but also important to
+    // ensure that a subtree's symbol can be successfully reassigned to the word token
+    // without having to move the subtree to the heap.
+    // See https://github.com/tree-sitter/tree-sitter/issues/258
+    if let Some(token) = grammar.word_token {
+        extractor.extract_tokens_in_variable(&mut grammar.variables[token.index]);
+    }
+
    for mut variable in grammar.variables.iter_mut() {
        extractor.extract_tokens_in_variable(&mut variable);
    }