Merge branch 'master' into HEAD

This commit is contained in:
Max Brunsfeld 2020-12-03 09:44:33 -08:00
commit 026231e93d
173 changed files with 22878 additions and 6961 deletions

View file

@ -2,7 +2,7 @@ use super::coincident_tokens::CoincidentTokenIndex;
use super::token_conflicts::TokenConflictMap;
use crate::generate::dedup::split_state_id_groups;
use crate::generate::grammars::{LexicalGrammar, SyntaxGrammar};
use crate::generate::nfa::{CharacterSet, NfaCursor};
use crate::generate::nfa::NfaCursor;
use crate::generate::rules::{Symbol, TokenSet};
use crate::generate::tables::{AdvanceAction, LexState, LexTable, ParseStateId, ParseTable};
use log::info;
@ -189,13 +189,10 @@ impl<'a> LexTableBuilder<'a> {
// character that leads to the empty set of NFA states.
if eof_valid {
let (next_state_id, _) = self.add_state(Vec::new(), false);
self.table.states[state_id].advance_actions.push((
CharacterSet::empty().add_char('\0'),
AdvanceAction {
state: next_state_id,
in_main_token: true,
},
));
self.table.states[state_id].eof_action = Some(AdvanceAction {
state: next_state_id,
in_main_token: true,
});
}
for transition in transitions {
@ -273,6 +270,7 @@ fn minimize_lex_table(table: &mut LexTable, parse_table: &mut ParseTable) {
let signature = (
i == 0,
state.accept_action,
state.eof_action.is_some(),
state
.advance_actions
.iter()
@ -320,6 +318,9 @@ fn minimize_lex_table(table: &mut LexTable, parse_table: &mut ParseTable) {
for (_, advance_action) in new_state.advance_actions.iter_mut() {
advance_action.state = group_ids_by_state_id[advance_action.state];
}
if let Some(eof_action) = &mut new_state.eof_action {
eof_action.state = group_ids_by_state_id[eof_action.state];
}
new_states.push(new_state);
}
@ -364,6 +365,9 @@ fn sort_states(table: &mut LexTable, parse_table: &mut ParseTable) {
for (_, advance_action) in state.advance_actions.iter_mut() {
advance_action.state = new_ids_by_old_id[advance_action.state];
}
if let Some(eof_action) = &mut state.eof_action {
eof_action.state = new_ids_by_old_id[eof_action.state];
}
state
})
.collect();

View file

@ -7,7 +7,7 @@ use crate::generate::grammars::{
use crate::generate::node_types::VariableInfo;
use crate::generate::rules::{Associativity, Symbol, SymbolType, TokenSet};
use crate::generate::tables::{
FieldLocation, ParseAction, ParseState, ParseStateId, ParseTable, ParseTableEntry,
FieldLocation, GotoAction, ParseAction, ParseState, ParseStateId, ParseTable, ParseTableEntry,
ProductionInfo, ProductionInfoId,
};
use core::ops::Range;
@ -16,17 +16,19 @@ use std::collections::{BTreeMap, HashMap, HashSet, VecDeque};
use std::fmt::Write;
use std::u32;
// For conflict reporting, each parse state is associated with an example
// sequence of symbols that could lead to that parse state.
type SymbolSequence = Vec<Symbol>;
type AuxiliarySymbolSequence = Vec<AuxiliarySymbolInfo>;
pub(crate) type ParseStateInfo<'a> = (SymbolSequence, ParseItemSet<'a>);
#[derive(Clone)]
struct AuxiliarySymbolInfo {
auxiliary_symbol: Symbol,
parent_symbols: Vec<Symbol>,
}
type SymbolSequence = Vec<Symbol>;
type AuxiliarySymbolSequence = Vec<AuxiliarySymbolInfo>;
pub(crate) type ParseStateInfo<'a> = (SymbolSequence, ParseItemSet<'a>);
struct ParseStateQueueEntry {
state_id: ParseStateId,
preceding_auxiliary_symbols: AuxiliarySymbolSequence,
@ -41,6 +43,7 @@ struct ParseTableBuilder<'a> {
state_ids_by_item_set: HashMap<ParseItemSet<'a>, ParseStateId>,
parse_state_info_by_id: Vec<ParseStateInfo<'a>>,
parse_state_queue: VecDeque<ParseStateQueueEntry>,
non_terminal_extra_states: Vec<(Symbol, usize)>,
parse_table: ParseTable,
}
@ -52,7 +55,7 @@ impl<'a> ParseTableBuilder<'a> {
.push(ProductionInfo::default());
// Add the error state at index 0.
self.add_parse_state(&Vec::new(), &Vec::new(), ParseItemSet::default());
self.add_parse_state(&Vec::new(), &Vec::new(), ParseItemSet::default(), false);
// Add the starting state at index 1.
self.add_parse_state(
@ -66,8 +69,40 @@ impl<'a> ParseTableBuilder<'a> {
.iter()
.cloned(),
),
false,
);
// Compute the possible item sets for non-terminal extras.
let mut non_terminal_extra_item_sets_by_first_terminal = BTreeMap::new();
for extra_non_terminal in self
.syntax_grammar
.extra_symbols
.iter()
.filter(|s| s.is_non_terminal())
{
let variable = &self.syntax_grammar.variables[extra_non_terminal.index];
for production in &variable.productions {
non_terminal_extra_item_sets_by_first_terminal
.entry(production.first_symbol().unwrap())
.or_insert(ParseItemSet::default())
.insert(
ParseItem {
variable_index: extra_non_terminal.index as u32,
production,
step_index: 1,
},
&[Symbol::end()].iter().cloned().collect(),
);
}
}
// Add a state for each starting terminal of a non-terminal extra rule.
for (terminal, item_set) in non_terminal_extra_item_sets_by_first_terminal {
self.non_terminal_extra_states
.push((terminal, self.parse_table.states.len()));
self.add_parse_state(&Vec::new(), &Vec::new(), item_set, true);
}
while let Some(entry) = self.parse_state_queue.pop_front() {
let item_set = self
.item_set_builder
@ -91,9 +126,15 @@ impl<'a> ParseTableBuilder<'a> {
preceding_symbols: &SymbolSequence,
preceding_auxiliary_symbols: &AuxiliarySymbolSequence,
item_set: ParseItemSet<'a>,
is_non_terminal_extra: bool,
) -> ParseStateId {
match self.state_ids_by_item_set.entry(item_set) {
// If an equivalent item set has already been processed, then return
// the existing parse state index.
Entry::Occupied(o) => *o.get(),
// Otherwise, insert a new parse state and add it to the queue of
// parse states to populate.
Entry::Vacant(v) => {
let core = v.key().core();
let core_count = self.core_ids_by_core.len();
@ -116,6 +157,7 @@ impl<'a> ParseTableBuilder<'a> {
terminal_entries: HashMap::new(),
nonterminal_entries: HashMap::new(),
core_id,
is_non_terminal_extra,
});
self.parse_state_queue.push_back(ParseStateQueueEntry {
state_id,
@ -138,7 +180,12 @@ impl<'a> ParseTableBuilder<'a> {
let mut non_terminal_successors = BTreeMap::new();
let mut lookaheads_with_conflicts = TokenSet::new();
// Each item in the item set contributes to either a Shift action or a Reduce
// action in this state.
for (item, lookaheads) in &item_set.entries {
// If the item is unfinished, then this state has a transition for the item's
// next symbol. Advance the item to its next step and insert the resulting
// item into the successor item set.
if let Some(next_symbol) = item.symbol() {
let successor = item.successor();
if next_symbol.is_non_terminal() {
@ -160,7 +207,10 @@ impl<'a> ParseTableBuilder<'a> {
.or_insert_with(|| ParseItemSet::default())
.insert(successor, lookaheads);
}
} else {
}
// If the item is finished, then add a Reduce action to this state based
// on this item.
else {
let action = if item.is_augmented() {
ParseAction::Accept
} else {
@ -179,6 +229,10 @@ impl<'a> ParseTableBuilder<'a> {
.terminal_entries
.entry(lookahead);
let entry = entry.or_insert_with(|| ParseTableEntry::new());
// While inserting Reduce actions, eagerly resolve conflicts related
// to precedence: avoid inserting lower-precedence reductions, and
// clear the action list when inserting higher-precedence reductions.
if entry.actions.is_empty() {
entry.actions.push(action);
} else if action.precedence() > entry.actions[0].precedence() {
@ -193,12 +247,16 @@ impl<'a> ParseTableBuilder<'a> {
}
}
// Having computed the successor item sets for each symbol, add a new
// parse state for each of these item sets, and add a corresponding Shift
// action to this state.
for (symbol, next_item_set) in terminal_successors {
preceding_symbols.push(symbol);
let next_state_id = self.add_parse_state(
&preceding_symbols,
&preceding_auxiliary_symbols,
next_item_set,
self.parse_table.states[state_id].is_non_terminal_extra,
);
preceding_symbols.pop();
@ -226,13 +284,19 @@ impl<'a> ParseTableBuilder<'a> {
&preceding_symbols,
&preceding_auxiliary_symbols,
next_item_set,
self.parse_table.states[state_id].is_non_terminal_extra,
);
preceding_symbols.pop();
self.parse_table.states[state_id]
.nonterminal_entries
.insert(symbol, next_state_id);
.insert(symbol, GotoAction::Goto(next_state_id));
}
// For any symbol with multiple actions, perform conflict resolution.
// This will either
// * choose one action over the others using precedence or associativity
// * keep multiple actions if this conflict has been whitelisted in the grammar
// * fail, terminating the parser generation process
for symbol in lookaheads_with_conflicts.iter() {
self.handle_conflict(
&item_set,
@ -243,15 +307,50 @@ impl<'a> ParseTableBuilder<'a> {
)?;
}
// Finally, add actions for the grammar's `extra` symbols.
let state = &mut self.parse_table.states[state_id];
for extra_token in &self.syntax_grammar.extra_tokens {
state
.terminal_entries
.entry(*extra_token)
.or_insert(ParseTableEntry {
reusable: true,
actions: vec![ParseAction::ShiftExtra],
});
let is_non_terminal_extra = state.is_non_terminal_extra;
let is_end_of_non_terminal_extra =
is_non_terminal_extra && state.terminal_entries.len() == 1;
// Add actions for the start tokens of each non-terminal extra rule.
// These actions are added to every state except for the states that are
// already within non-terminal extras. Non-terminal extras are not allowed
// to nest within each other.
if !is_non_terminal_extra {
for (terminal, state_id) in &self.non_terminal_extra_states {
state
.terminal_entries
.entry(*terminal)
.or_insert(ParseTableEntry {
reusable: true,
actions: vec![ParseAction::Shift {
state: *state_id,
is_repetition: false,
}],
});
}
}
// Add ShiftExtra actions for the terminal extra tokens. These actions
// are added to every state except for those at the ends of non-terminal
// extras.
if !is_end_of_non_terminal_extra {
for extra_token in &self.syntax_grammar.extra_symbols {
if extra_token.is_non_terminal() {
state
.nonterminal_entries
.insert(*extra_token, GotoAction::ShiftExtra);
} else {
state
.terminal_entries
.entry(*extra_token)
.or_insert(ParseTableEntry {
reusable: true,
actions: vec![ParseAction::ShiftExtra],
});
}
}
}
Ok(())
@ -362,8 +461,8 @@ impl<'a> ParseTableBuilder<'a> {
}
}
// If all reduce actions are left associative, remove the SHIFT action.
// If all reduce actions are right associative, remove the REDUCE actions.
// If all Reduce actions are left associative, remove the SHIFT action.
// If all Reduce actions are right associative, remove the REDUCE actions.
match (has_left, has_non, has_right) {
(true, false, false) => {
entry.actions.pop();
@ -744,7 +843,7 @@ fn populate_following_tokens(
}
}
}
for extra in &grammar.extra_tokens {
for extra in &grammar.extra_symbols {
if extra.is_terminal() {
for entry in result.iter_mut() {
entry.insert(*extra);
@ -774,6 +873,7 @@ pub(crate) fn build_parse_table<'a>(
lexical_grammar,
item_set_builder,
variable_info,
non_terminal_extra_states: Vec::new(),
state_ids_by_item_set: HashMap::new(),
core_ids_by_core: HashMap::new(),
parse_state_info_by_id: Vec::new(),

View file

@ -2,7 +2,9 @@ use super::token_conflicts::TokenConflictMap;
use crate::generate::dedup::split_state_id_groups;
use crate::generate::grammars::{LexicalGrammar, SyntaxGrammar, VariableType};
use crate::generate::rules::{AliasMap, Symbol, TokenSet};
use crate::generate::tables::{ParseAction, ParseState, ParseStateId, ParseTable, ParseTableEntry};
use crate::generate::tables::{
GotoAction, ParseAction, ParseState, ParseStateId, ParseTable, ParseTableEntry,
};
use log::info;
use std::collections::{HashMap, HashSet};
use std::mem;
@ -66,6 +68,7 @@ impl<'a> Minimizer<'a> {
..
} => {
if !self.simple_aliases.contains_key(&symbol)
&& !self.syntax_grammar.supertype_symbols.contains(&symbol)
&& !aliased_symbols.contains(&symbol)
&& self.syntax_grammar.variables[symbol.index].kind
!= VariableType::Named
@ -101,7 +104,10 @@ impl<'a> Minimizer<'a> {
state.update_referenced_states(|other_state_id, state| {
if let Some(symbol) = unit_reduction_symbols_by_state.get(&other_state_id) {
done = false;
state.nonterminal_entries[symbol]
match state.nonterminal_entries.get(symbol) {
Some(GotoAction::Goto(state_id)) => *state_id,
_ => other_state_id,
}
} else {
other_state_id
}
@ -194,6 +200,9 @@ impl<'a> Minimizer<'a> {
right_state: &ParseState,
group_ids_by_state_id: &Vec<ParseStateId>,
) -> bool {
if left_state.is_non_terminal_extra != right_state.is_non_terminal_extra {
return true;
}
for (token, left_entry) in &left_state.terminal_entries {
if let Some(right_entry) = right_state.terminal_entries.get(token) {
if self.entries_conflict(
@ -262,18 +271,24 @@ impl<'a> Minimizer<'a> {
for (symbol, s1) in &state1.nonterminal_entries {
if let Some(s2) = state2.nonterminal_entries.get(symbol) {
let group1 = group_ids_by_state_id[*s1];
let group2 = group_ids_by_state_id[*s2];
if group1 != group2 {
info!(
"split states {} {} - successors for {} are split: {} {}",
state1.id,
state2.id,
self.symbol_name(symbol),
s1,
s2,
);
return true;
match (s1, s2) {
(GotoAction::ShiftExtra, GotoAction::ShiftExtra) => continue,
(GotoAction::Goto(s1), GotoAction::Goto(s2)) => {
let group1 = group_ids_by_state_id[*s1];
let group2 = group_ids_by_state_id[*s2];
if group1 != group2 {
info!(
"split states {} {} - successors for {} are split: {} {}",
state1.id,
state2.id,
self.symbol_name(symbol),
s1,
s2,
);
return true;
}
}
_ => return true,
}
}
}

View file

@ -271,6 +271,7 @@ fn identify_keywords(
cursor.reset(vec![variable.start_state]);
if all_chars_are_alphabetical(&cursor)
&& token_conflict_map.does_match_same_string(i, word_token.index)
&& !token_conflict_map.does_match_different_string(i, word_token.index)
{
info!(
"Keywords - add candidate {}",

View file

@ -1,9 +1,9 @@
use crate::generate::build_tables::item::{TokenSetDisplay};
use crate::generate::build_tables::item::TokenSetDisplay;
use crate::generate::grammars::{LexicalGrammar, SyntaxGrammar};
use crate::generate::nfa::{CharacterSet, NfaCursor, NfaTransition};
use crate::generate::rules::TokenSet;
use std::collections::HashSet;
use std::cmp::Ordering;
use std::collections::HashSet;
use std::fmt;
#[derive(Clone, Debug, Default, PartialEq, Eq)]
@ -13,6 +13,7 @@ struct TokenConflictStatus {
does_match_valid_continuation: bool,
does_match_separators: bool,
matches_same_string: bool,
matches_different_string: bool,
}
pub(crate) struct TokenConflictMap<'a> {
@ -25,6 +26,12 @@ pub(crate) struct TokenConflictMap<'a> {
}
impl<'a> TokenConflictMap<'a> {
/// Create a token conflict map based on a lexical grammar, which describes the structure
/// of each token, and a `following_tokens` map, which indicates which tokens may appear
/// immediately after each other token.
///
/// This analyzes the possible kinds of overlap between each pair of tokens and stores
/// them in a matrix.
pub fn new(grammar: &'a LexicalGrammar, following_tokens: Vec<TokenSet>) -> Self {
let mut cursor = NfaCursor::new(&grammar.nfa, Vec::new());
let starting_chars = get_starting_chars(&mut cursor, grammar);
@ -50,12 +57,21 @@ impl<'a> TokenConflictMap<'a> {
}
}
/// Does token `i` match any strings that token `j` also matches, such that token `i`
/// is preferred over token `j`?
pub fn has_same_conflict_status(&self, a: usize, b: usize, other: usize) -> bool {
let left = &self.status_matrix[matrix_index(self.n, a, other)];
let right = &self.status_matrix[matrix_index(self.n, b, other)];
left == right
}
/// Does token `i` match any strings that token `j` does *not* match?
pub fn does_match_different_string(&self, i: usize, j: usize) -> bool {
self.status_matrix[matrix_index(self.n, i, j)].matches_different_string
}
/// Does token `i` match any strings that token `j` also matches, where
/// token `i` is preferred over token `j`?
pub fn does_match_same_string(&self, i: usize, j: usize) -> bool {
self.status_matrix[matrix_index(self.n, i, j)].matches_same_string
}
@ -67,6 +83,7 @@ impl<'a> TokenConflictMap<'a> {
|| entry.matches_same_string
}
/// Does token `i` match any strings that are *prefixes* of strings matched by `j`?
pub fn does_match_prefix(&self, i: usize, j: usize) -> bool {
self.status_matrix[matrix_index(self.n, i, j)].matches_prefix
}
@ -239,19 +256,29 @@ fn compute_conflict_status(
);
while let Some(state_set) = state_set_queue.pop() {
// Don't pursue states where there's no potential for conflict.
if grammar.variable_indices_for_nfa_states(&state_set).count() > 1 {
cursor.reset(state_set);
} else {
let mut live_variable_indices = grammar.variable_indices_for_nfa_states(&state_set);
// If only one of the two tokens could possibly match from this state, then
// there is no reason to analyze any of its successors. Just record the fact
// that the token matches a string that the other token does not match.
let first_live_variable_index = live_variable_indices.next().unwrap();
if live_variable_indices.count() == 0 {
if first_live_variable_index == i {
result.0.matches_different_string = true;
} else {
result.1.matches_different_string = true;
}
continue;
}
let has_sep = cursor.transition_chars().any(|(_, sep)| sep);
// Don't pursue states where there's no potential for conflict.
cursor.reset(state_set);
let within_separator = cursor.transition_chars().any(|(_, sep)| sep);
// Examine each possible completed token in this state.
let mut completion = None;
for (id, precedence) in cursor.completions() {
if has_sep {
if within_separator {
if id == i {
result.0.does_match_separators = true;
} else {
@ -316,7 +343,7 @@ fn compute_conflict_status(
&transition,
completed_id,
completed_precedence,
has_sep,
within_separator,
) {
can_advance = true;
if advanced_id == i {

View file

@ -292,7 +292,12 @@ function grammar(baseGrammar, options) {
extras = options.extras
.call(ruleBuilder, ruleBuilder, baseGrammar.extras)
.map(normalize);
if (!Array.isArray(extras)) {
throw new Error("Grammar's 'extras' function must return an array.")
}
extras = extras.map(normalize);
}
let word = baseGrammar.word;

View file

@ -1,15 +1,15 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"title": "tree-sitter grammar specification",
"type": "object",
"required": [
"name",
"rules"
],
"required": ["name", "rules"],
"additionalProperties": false,
"properties": {
"name": {
"description": "the name of the grammar",
"type": "string",
"pattern": "^[a-zA-Z_]\\w*"
},
@ -60,6 +60,15 @@
"word": {
"type": "string",
"pattern": "^[a-zA-Z_]\\w*"
},
"supertypes": {
"description": "A list of hidden rule names that should be considered supertypes in the generated node types file. See http://tree-sitter.github.io/tree-sitter/using-parsers#static-node-types.",
"type": "array",
"items": {
"description": "the name of a rule in `rules` or `extras`",
"type": "string"
}
}
},
@ -96,20 +105,19 @@
"type": "string",
"pattern": "^PATTERN$"
},
"value": {"type": "string"}
"value": { "type": "string" }
},
"required": ["type", "value"]
},
"symbol-rule": {
"required": ["name"],
"type": "object",
"properties": {
"type": {
"type": "string",
"pattern": "^SYMBOL$"
},
"name": {"type": "string"}
"name": { "type": "string" }
},
"required": ["type", "name"]
},
@ -210,6 +218,20 @@
"required": ["type", "content"]
},
"field-rule": {
"properties": {
"name": { "type": "string" },
"type": {
"type": "string",
"pattern": "^FIELD$"
},
"content": {
"$ref": "#/definitions/rule"
}
},
"required": ["name", "type", "content"]
},
"prec-rule": {
"type": "object",
"properties": {
@ -239,6 +261,7 @@
{ "$ref": "#/definitions/repeat1-rule" },
{ "$ref": "#/definitions/repeat-rule" },
{ "$ref": "#/definitions/token-rule" },
{ "$ref": "#/definitions/field-rule" },
{ "$ref": "#/definitions/prec-rule" }
]
}

View file

@ -23,7 +23,7 @@ pub(crate) struct Variable {
pub(crate) struct InputGrammar {
pub name: String,
pub variables: Vec<Variable>,
pub extra_tokens: Vec<Rule>,
pub extra_symbols: Vec<Rule>,
pub expected_conflicts: Vec<Vec<String>>,
pub external_tokens: Vec<Rule>,
pub variables_to_inline: Vec<String>,
@ -87,7 +87,7 @@ pub(crate) struct ExternalToken {
#[derive(Debug, Default)]
pub(crate) struct SyntaxGrammar {
pub variables: Vec<SyntaxVariable>,
pub extra_tokens: Vec<Symbol>,
pub extra_symbols: Vec<Symbol>,
pub expected_conflicts: Vec<Vec<Symbol>>,
pub external_tokens: Vec<ExternalToken>,
pub supertype_symbols: Vec<Symbol>,

View file

@ -6,13 +6,12 @@ mod node_types;
mod npm_files;
pub mod parse_grammar;
mod prepare_grammar;
pub mod properties;
mod render;
mod rules;
mod tables;
use self::build_tables::build_tables;
use self::grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar, VariableType};
use self::grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar};
use self::parse_grammar::parse_grammar;
use self::prepare_grammar::prepare_grammar;
use self::render::render_c_code;
@ -20,9 +19,8 @@ use self::rules::AliasMap;
use crate::error::{Error, Result};
use lazy_static::lazy_static;
use regex::{Regex, RegexBuilder};
use std::collections::HashSet;
use std::fs::{self, File};
use std::io::{BufWriter, Write};
use std::fs;
use std::io::Write;
use std::path::{Path, PathBuf};
use std::process::{Command, Stdio};
@ -33,15 +31,9 @@ lazy_static! {
.unwrap();
}
const NEW_HEADER_PARTS: [&'static str; 2] = [
"
uint32_t large_state_count;
const uint16_t *small_parse_table;
const uint32_t *small_parse_table_map;",
"
#define SMALL_STATE(id) id - LARGE_STATE_COUNT
",
];
const NEW_HEADER_PARTS: &[&'static str] = &["
const uint16_t *alias_map;
uint32_t state_count;"];
struct GeneratedParser {
c_code: String,
@ -51,13 +43,11 @@ struct GeneratedParser {
pub fn generate_parser_in_directory(
repo_path: &PathBuf,
grammar_path: Option<&str>,
properties_only: bool,
next_abi: bool,
report_symbol_name: Option<&str>,
) -> Result<()> {
let src_path = repo_path.join("src");
let header_path = src_path.join("tree_sitter");
let properties_dir_path = repo_path.join("properties");
// Ensure that the output directories exist.
fs::create_dir_all(&src_path)?;
@ -82,71 +72,48 @@ pub fn generate_parser_in_directory(
prepare_grammar(&input_grammar)?;
let language_name = input_grammar.name;
// If run with no arguments, read all of the property sheets and compile them to JSON.
if grammar_path.is_none() {
let token_names = get_token_names(&syntax_grammar, &lexical_grammar);
if let Ok(entries) = fs::read_dir(properties_dir_path) {
for entry in entries {
let css_path = entry?.path();
let css = fs::read_to_string(&css_path)?;
let sheet = properties::generate_property_sheet(&css_path, &css, &token_names)?;
let property_sheet_json_path = src_path
.join(css_path.file_name().unwrap())
.with_extension("json");
let property_sheet_json_file =
File::create(&property_sheet_json_path).map_err(Error::wrap(|| {
format!("Failed to create {:?}", property_sheet_json_path)
}))?;
let mut writer = BufWriter::new(property_sheet_json_file);
serde_json::to_writer_pretty(&mut writer, &sheet)?;
}
}
}
// Generate the parser and related files.
if !properties_only {
let GeneratedParser {
c_code,
node_types_json,
} = generate_parser_for_grammar_with_opts(
&language_name,
syntax_grammar,
lexical_grammar,
inlines,
simple_aliases,
next_abi,
report_symbol_name,
)?;
let GeneratedParser {
c_code,
node_types_json,
} = generate_parser_for_grammar_with_opts(
&language_name,
syntax_grammar,
lexical_grammar,
inlines,
simple_aliases,
next_abi,
report_symbol_name,
)?;
write_file(&src_path.join("parser.c"), c_code)?;
write_file(&src_path.join("node-types.json"), node_types_json)?;
write_file(&src_path.join("parser.c"), c_code)?;
write_file(&src_path.join("node-types.json"), node_types_json)?;
if next_abi {
write_file(&header_path.join("parser.h"), tree_sitter::PARSER_HEADER)?;
} else {
let mut header = tree_sitter::PARSER_HEADER.to_string();
if next_abi {
write_file(&header_path.join("parser.h"), tree_sitter::PARSER_HEADER)?;
} else {
let mut header = tree_sitter::PARSER_HEADER.to_string();
for part in &NEW_HEADER_PARTS {
let pos = header
.find(part)
.expect("Missing expected part of parser.h header");
header.replace_range(pos..(pos + part.len()), "");
}
write_file(&header_path.join("parser.h"), header)?;
for part in NEW_HEADER_PARTS.iter() {
let pos = header
.find(part)
.expect("Missing expected part of parser.h header");
header.replace_range(pos..(pos + part.len()), "");
}
ensure_file(&repo_path.join("index.js"), || {
npm_files::index_js(&language_name)
})?;
ensure_file(&src_path.join("binding.cc"), || {
npm_files::binding_cc(&language_name)
})?;
ensure_file(&repo_path.join("binding.gyp"), || {
npm_files::binding_gyp(&language_name)
})?;
write_file(&header_path.join("parser.h"), header)?;
}
ensure_file(&repo_path.join("index.js"), || {
npm_files::index_js(&language_name)
})?;
ensure_file(&src_path.join("binding.cc"), || {
npm_files::binding_cc(&language_name)
})?;
ensure_file(&repo_path.join("binding.gyp"), || {
npm_files::binding_gyp(&language_name)
})?;
Ok(())
}
@ -176,7 +143,8 @@ fn generate_parser_for_grammar_with_opts(
next_abi: bool,
report_symbol_name: Option<&str>,
) -> Result<GeneratedParser> {
let variable_info = node_types::get_variable_info(&syntax_grammar, &lexical_grammar, &inlines)?;
let variable_info =
node_types::get_variable_info(&syntax_grammar, &lexical_grammar, &simple_aliases)?;
let node_types_json = node_types::generate_node_types_json(
&syntax_grammar,
&lexical_grammar,
@ -208,35 +176,6 @@ fn generate_parser_for_grammar_with_opts(
})
}
fn get_token_names(
syntax_grammar: &SyntaxGrammar,
lexical_grammar: &LexicalGrammar,
) -> HashSet<String> {
let mut result = HashSet::new();
for variable in &lexical_grammar.variables {
if variable.kind == VariableType::Named {
result.insert(variable.name.clone());
}
}
for token in &syntax_grammar.external_tokens {
if token.kind == VariableType::Named {
result.insert(token.name.clone());
}
}
for variable in &syntax_grammar.variables {
for production in &variable.productions {
for step in &production.steps {
if let Some(alias) = &step.alias {
if !step.symbol.is_non_terminal() && alias.is_named {
result.insert(alias.value.clone());
}
}
}
}
}
result
}
fn load_grammar_file(grammar_path: &Path) -> Result<String> {
match grammar_path.extension().and_then(|e| e.to_str()) {
Some("js") => Ok(load_js_grammar_file(grammar_path)?),

View file

@ -1,8 +1,10 @@
use std::char;
use std::cmp::max;
use std::cmp::Ordering;
use std::collections::HashSet;
use std::fmt;
use std::mem::swap;
use std::ops::Range;
#[derive(Clone, Debug, PartialEq, Eq, Hash)]
pub enum CharacterSet {
@ -178,6 +180,40 @@ impl CharacterSet {
}
}
pub fn ranges<'a>(
chars: &'a Vec<char>,
ruled_out_characters: &'a HashSet<u32>,
) -> impl Iterator<Item = Range<char>> + 'a {
let mut prev_range: Option<Range<char>> = None;
chars
.iter()
.map(|c| (*c, false))
.chain(Some(('\0', true)))
.filter_map(move |(c, done)| {
if done {
return prev_range.clone();
}
if ruled_out_characters.contains(&(c as u32)) {
return None;
}
if let Some(range) = prev_range.clone() {
let mut prev_range_successor = range.end as u32 + 1;
while prev_range_successor < c as u32 {
if !ruled_out_characters.contains(&prev_range_successor) {
prev_range = Some(c..c);
return Some(range);
}
prev_range_successor += 1;
}
prev_range = Some(range.start..c);
None
} else {
prev_range = Some(c..c);
None
}
})
}
#[cfg(test)]
pub fn contains(&self, c: char) -> bool {
match self {
@ -266,6 +302,13 @@ fn compare_chars(left: &Vec<char>, right: &Vec<char>) -> SetComparision {
result.common = true;
}
}
match (i, j) {
(Some(_), _) => result.left_only = true,
(_, Some(_)) => result.right_only = true,
_ => {}
}
result
}
@ -718,7 +761,7 @@ mod tests {
.add_range('d', 'e')
);
// A whitelist and an intersecting blacklist.
// An inclusion and an intersecting exclusion.
// Both sets contain 'e', 'f', and 'm'
let mut a = CharacterSet::empty()
.add_range('c', 'h')
@ -748,7 +791,7 @@ mod tests {
assert_eq!(a, CharacterSet::Include(vec!['c', 'd', 'g', 'h', 'k', 'l']));
assert_eq!(b, CharacterSet::empty().add_range('a', 'm').negate());
// A blacklist and an overlapping blacklist.
// An exclusion and an overlapping inclusion.
// Both sets exclude 'c', 'd', and 'e'
let mut a = CharacterSet::empty().add_range('a', 'e').negate();
let mut b = CharacterSet::empty().add_range('c', 'h').negate();
@ -759,7 +802,7 @@ mod tests {
assert_eq!(a, CharacterSet::Include(vec!['f', 'g', 'h']));
assert_eq!(b, CharacterSet::Include(vec!['a', 'b']));
// A blacklist and a larger blacklist.
// An exclusion and a larger exclusion.
let mut a = CharacterSet::empty().add_range('b', 'c').negate();
let mut b = CharacterSet::empty().add_range('a', 'd').negate();
assert_eq!(
@ -810,5 +853,53 @@ mod tests {
);
assert!(a.does_intersect(&b));
assert!(b.does_intersect(&a));
let (a, b) = (
CharacterSet::Include(vec!['c']),
CharacterSet::Exclude(vec!['a']),
);
assert!(a.does_intersect(&b));
assert!(b.does_intersect(&a));
}
#[test]
fn test_character_set_get_ranges() {
struct Row {
chars: Vec<char>,
ruled_out_chars: Vec<char>,
expected_ranges: Vec<Range<char>>,
}
let table = [
Row {
chars: vec!['a'],
ruled_out_chars: vec![],
expected_ranges: vec!['a'..'a'],
},
Row {
chars: vec!['a', 'b', 'c', 'e', 'z'],
ruled_out_chars: vec![],
expected_ranges: vec!['a'..'c', 'e'..'e', 'z'..'z'],
},
Row {
chars: vec!['a', 'b', 'c', 'e', 'h', 'z'],
ruled_out_chars: vec!['d', 'f', 'g'],
expected_ranges: vec!['a'..'h', 'z'..'z'],
},
];
for Row {
chars,
ruled_out_chars,
expected_ranges,
} in table.iter()
{
let ruled_out_chars = ruled_out_chars
.into_iter()
.map(|c: &char| *c as u32)
.collect();
let ranges = CharacterSet::ranges(chars, &ruled_out_chars).collect::<Vec<_>>();
assert_eq!(ranges, *expected_ranges);
}
}
}

File diff suppressed because it is too large Load diff

View file

@ -87,7 +87,7 @@ pub(crate) fn parse_grammar(input: &str) -> Result<InputGrammar> {
})
}
let extra_tokens = grammar_json
let extra_symbols = grammar_json
.extras
.unwrap_or(Vec::new())
.into_iter()
@ -107,7 +107,7 @@ pub(crate) fn parse_grammar(input: &str) -> Result<InputGrammar> {
name: grammar_json.name,
word_token: grammar_json.word,
variables,
extra_tokens,
extra_symbols,
expected_conflicts,
external_tokens,
supertype_symbols,

View file

@ -283,7 +283,7 @@ mod tests {
fn build_grammar(variables: Vec<Variable>) -> ExtractedSyntaxGrammar {
ExtractedSyntaxGrammar {
variables,
extra_tokens: Vec::new(),
extra_symbols: Vec::new(),
external_tokens: Vec::new(),
expected_conflicts: Vec::new(),
variables_to_inline: Vec::new(),

View file

@ -0,0 +1,293 @@
use crate::generate::grammars::{LexicalGrammar, SyntaxGrammar};
use crate::generate::rules::{Alias, AliasMap, Symbol, SymbolType};
#[derive(Clone, Default)]
struct SymbolStatus {
aliases: Vec<(Alias, usize)>,
appears_unaliased: bool,
}
// Update the grammar by finding symbols that always are aliased, and for each such symbol,
// promoting one of its aliases to a "default alias", which is applied globally instead
// of in a context-specific way.
//
// This has two benefits:
// * It reduces the overhead of storing production-specific alias info in the parse table.
// * Within an `ERROR` node, no context-specific aliases will be applied. This transformation
// ensures that the children of an `ERROR` node have symbols that are consistent with the
// way that they would appear in a valid syntax tree.
/// Find symbols that are *always* aliased and promote one alias per such symbol
/// to a "default alias", applied globally instead of in a context-specific way.
///
/// Returns a map from each such symbol to its chosen default alias. As a side
/// effect, production steps whose alias matches the symbol's new default alias
/// have that (now redundant) alias removed.
pub(super) fn extract_default_aliases(
    syntax_grammar: &mut SyntaxGrammar,
    lexical_grammar: &LexicalGrammar,
) -> AliasMap {
    let mut terminal_status_list = vec![SymbolStatus::default(); lexical_grammar.variables.len()];
    let mut non_terminal_status_list =
        vec![SymbolStatus::default(); syntax_grammar.variables.len()];
    let mut external_status_list =
        vec![SymbolStatus::default(); syntax_grammar.external_tokens.len()];

    // For each grammar symbol, find all of the aliases under which the symbol appears,
    // and determine whether or not the symbol ever appears *unaliased*.
    for variable in syntax_grammar.variables.iter() {
        for production in variable.productions.iter() {
            for step in production.steps.iter() {
                // `mut` is unnecessary on this binding — the value is already a
                // mutable reference (clippy: unused_mut).
                let status = match step.symbol.kind {
                    SymbolType::External => &mut external_status_list[step.symbol.index],
                    SymbolType::NonTerminal => &mut non_terminal_status_list[step.symbol.index],
                    SymbolType::Terminal => &mut terminal_status_list[step.symbol.index],
                    SymbolType::End => panic!("Unexpected end token"),
                };

                // Default aliases don't work for inlined variables.
                if syntax_grammar.variables_to_inline.contains(&step.symbol) {
                    continue;
                }

                if let Some(alias) = &step.alias {
                    // Bump the count for this alias if it has been seen before,
                    // otherwise record it with a count of one.
                    if let Some(count_for_alias) = status
                        .aliases
                        .iter_mut()
                        .find_map(|(a, count)| if a == alias { Some(count) } else { None })
                    {
                        *count_for_alias += 1;
                    } else {
                        status.aliases.push((alias.clone(), 1));
                    }
                } else {
                    status.appears_unaliased = true;
                }
            }
        }
    }

    let symbols_with_statuses = (terminal_status_list
        .iter_mut()
        .enumerate()
        .map(|(i, status)| (Symbol::terminal(i), status)))
    .chain(
        non_terminal_status_list
            .iter_mut()
            .enumerate()
            .map(|(i, status)| (Symbol::non_terminal(i), status)),
    )
    .chain(
        external_status_list
            .iter_mut()
            .enumerate()
            .map(|(i, status)| (Symbol::external(i), status)),
    );

    // For each symbol that always appears aliased, find the alias that occurs most
    // often (ties broken in favor of the alias that was recorded first), and
    // designate that alias as the symbol's "default alias". Store all of these
    // default aliases in a map that will be returned.
    let mut result = AliasMap::new();
    for (symbol, status) in symbols_with_statuses {
        if status.appears_unaliased {
            status.aliases.clear();
        } else if let Some(default_entry) = status
            .aliases
            .iter()
            .enumerate()
            .max_by_key(|(i, (_, count))| (count, -(*i as i64)))
            .map(|(_, entry)| entry.clone())
        {
            status.aliases.clear();
            status.aliases.push(default_entry.clone());
            result.insert(symbol, default_entry.0);
        }
    }

    // Wherever a symbol is aliased as its default alias, remove the usage of the alias,
    // because it will now be redundant.
    let mut alias_positions_to_clear = Vec::new();
    for variable in syntax_grammar.variables.iter_mut() {
        alias_positions_to_clear.clear();

        for (i, production) in variable.productions.iter().enumerate() {
            for (j, step) in production.steps.iter().enumerate() {
                let status = match step.symbol.kind {
                    SymbolType::External => &mut external_status_list[step.symbol.index],
                    SymbolType::NonTerminal => &mut non_terminal_status_list[step.symbol.index],
                    SymbolType::Terminal => &mut terminal_status_list[step.symbol.index],
                    SymbolType::End => panic!("Unexpected end token"),
                };

                // If this step is aliased as the symbol's default alias, then remove that alias —
                // unless a sibling production still needs an explicit alias at this step index.
                if step.alias.is_some()
                    && step.alias.as_ref() == status.aliases.get(0).map(|t| &t.0)
                {
                    let mut other_productions_must_use_this_alias_at_this_index = false;
                    for (other_i, other_production) in variable.productions.iter().enumerate() {
                        if other_i != i
                            && other_production.steps.len() > j
                            && other_production.steps[j].alias == step.alias
                            && result.get(&other_production.steps[j].symbol) != step.alias.as_ref()
                        {
                            other_productions_must_use_this_alias_at_this_index = true;
                            break;
                        }
                    }

                    if !other_productions_must_use_this_alias_at_this_index {
                        alias_positions_to_clear.push((i, j));
                    }
                }
            }
        }

        for (production_index, step_index) in &alias_positions_to_clear {
            variable.productions[*production_index].steps[*step_index].alias = None;
        }
    }

    result
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::generate::grammars::{
        LexicalVariable, Production, ProductionStep, SyntaxVariable, VariableType,
    };
    use crate::generate::nfa::Nfa;

    // Renamed from `test_extract_simple_aliases`: this module tests
    // `extract_default_aliases`, and the old name referred to the removed
    // predecessor function.
    #[test]
    fn test_extract_default_aliases() {
        let mut syntax_grammar = SyntaxGrammar {
            variables: vec![
                SyntaxVariable {
                    name: "v1".to_owned(),
                    kind: VariableType::Named,
                    productions: vec![Production {
                        dynamic_precedence: 0,
                        steps: vec![
                            ProductionStep::new(Symbol::terminal(0)).with_alias("a1", true),
                            ProductionStep::new(Symbol::terminal(1)).with_alias("a2", true),
                            ProductionStep::new(Symbol::terminal(2)).with_alias("a3", true),
                            ProductionStep::new(Symbol::terminal(3)).with_alias("a4", true),
                        ],
                    }],
                },
                SyntaxVariable {
                    name: "v2".to_owned(),
                    kind: VariableType::Named,
                    productions: vec![Production {
                        dynamic_precedence: 0,
                        steps: vec![
                            // Token 0 is always aliased as "a1".
                            ProductionStep::new(Symbol::terminal(0)).with_alias("a1", true),
                            // Token 1 is aliased within rule `v1` above, but not here.
                            ProductionStep::new(Symbol::terminal(1)),
                            // Token 2 is aliased differently here than in `v1`. The alias from
                            // `v1` should be promoted to the default alias, because `v1` appears
                            // first in the grammar.
                            ProductionStep::new(Symbol::terminal(2)).with_alias("a5", true),
                            // Token 3 is also aliased differently here than in `v1`. In this case,
                            // this alias should be promoted to the default alias, because it is
                            // used a greater number of times (twice).
                            ProductionStep::new(Symbol::terminal(3)).with_alias("a6", true),
                            ProductionStep::new(Symbol::terminal(3)).with_alias("a6", true),
                        ],
                    }],
                },
            ],
            extra_symbols: Vec::new(),
            expected_conflicts: Vec::new(),
            variables_to_inline: Vec::new(),
            supertype_symbols: Vec::new(),
            external_tokens: Vec::new(),
            word_token: None,
        };
        let lexical_grammar = LexicalGrammar {
            nfa: Nfa::new(),
            variables: vec![
                LexicalVariable {
                    name: "t0".to_string(),
                    kind: VariableType::Anonymous,
                    implicit_precedence: 0,
                    start_state: 0,
                },
                LexicalVariable {
                    name: "t1".to_string(),
                    kind: VariableType::Anonymous,
                    implicit_precedence: 0,
                    start_state: 0,
                },
                LexicalVariable {
                    name: "t2".to_string(),
                    kind: VariableType::Anonymous,
                    implicit_precedence: 0,
                    start_state: 0,
                },
                LexicalVariable {
                    name: "t3".to_string(),
                    kind: VariableType::Anonymous,
                    implicit_precedence: 0,
                    start_state: 0,
                },
            ],
        };

        let default_aliases = extract_default_aliases(&mut syntax_grammar, &lexical_grammar);

        // Tokens 0, 2, and 3 get default aliases; token 1 appears unaliased in
        // `v2`, so it gets none.
        assert_eq!(default_aliases.len(), 3);
        assert_eq!(
            default_aliases.get(&Symbol::terminal(0)),
            Some(&Alias {
                value: "a1".to_string(),
                is_named: true,
            })
        );
        assert_eq!(
            default_aliases.get(&Symbol::terminal(2)),
            Some(&Alias {
                value: "a3".to_string(),
                is_named: true,
            })
        );
        assert_eq!(
            default_aliases.get(&Symbol::terminal(3)),
            Some(&Alias {
                value: "a6".to_string(),
                is_named: true,
            })
        );
        assert_eq!(default_aliases.get(&Symbol::terminal(1)), None);

        // Steps that used a symbol's default alias have the redundant alias
        // removed; all other aliases are left in place.
        assert_eq!(
            syntax_grammar.variables,
            vec![
                SyntaxVariable {
                    name: "v1".to_owned(),
                    kind: VariableType::Named,
                    productions: vec![Production {
                        dynamic_precedence: 0,
                        steps: vec![
                            ProductionStep::new(Symbol::terminal(0)),
                            ProductionStep::new(Symbol::terminal(1)).with_alias("a2", true),
                            ProductionStep::new(Symbol::terminal(2)),
                            ProductionStep::new(Symbol::terminal(3)).with_alias("a4", true),
                        ],
                    },],
                },
                SyntaxVariable {
                    name: "v2".to_owned(),
                    kind: VariableType::Named,
                    productions: vec![Production {
                        dynamic_precedence: 0,
                        steps: vec![
                            ProductionStep::new(Symbol::terminal(0)),
                            ProductionStep::new(Symbol::terminal(1)),
                            ProductionStep::new(Symbol::terminal(2)).with_alias("a5", true),
                            ProductionStep::new(Symbol::terminal(3)),
                            ProductionStep::new(Symbol::terminal(3)),
                        ],
                    },],
                },
            ]
        );
    }
}

View file

@ -1,223 +0,0 @@
use crate::generate::grammars::{LexicalGrammar, SyntaxGrammar};
use crate::generate::rules::{Alias, AliasMap, Symbol, SymbolType};
// Per-symbol bookkeeping used while scanning the grammar for simple aliases.
#[derive(Clone, Default)]
struct SymbolStatus {
    // The single alias seen so far for this symbol, if still consistent.
    alias: Option<Alias>,
    // Set once the symbol has been seen unaliased, or with two different aliases.
    conflicting: bool,
}
/// Find symbols that are *always* aliased to a single name, strip those aliases
/// from the productions, and return a map from each such symbol to its alias.
pub(super) fn extract_simple_aliases(
    syntax_grammar: &mut SyntaxGrammar,
    lexical_grammar: &LexicalGrammar,
) -> AliasMap {
    // Determine which symbols in the grammars are *always* aliased to a single name.
    let mut terminal_status_list = vec![SymbolStatus::default(); lexical_grammar.variables.len()];
    let mut non_terminal_status_list =
        vec![SymbolStatus::default(); syntax_grammar.variables.len()];
    let mut external_status_list =
        vec![SymbolStatus::default(); syntax_grammar.external_tokens.len()];
    for variable in syntax_grammar.variables.iter() {
        for production in variable.productions.iter() {
            for step in production.steps.iter() {
                let mut status = match step.symbol {
                    Symbol {
                        kind: SymbolType::External,
                        index,
                    } => &mut external_status_list[index],
                    Symbol {
                        kind: SymbolType::NonTerminal,
                        index,
                    } => &mut non_terminal_status_list[index],
                    Symbol {
                        kind: SymbolType::Terminal,
                        index,
                    } => &mut terminal_status_list[index],
                    Symbol {
                        kind: SymbolType::End,
                        ..
                    } => panic!("Unexpected end token"),
                };
                // An unaliased occurrence immediately disqualifies the symbol.
                if step.alias.is_none() {
                    status.alias = None;
                    status.conflicting = true;
                }
                // Otherwise, record the first alias seen, and mark the symbol
                // as conflicting if a different alias appears later.
                if !status.conflicting {
                    if status.alias.is_none() {
                        status.alias = step.alias.clone();
                    } else if status.alias != step.alias {
                        status.alias = None;
                        status.conflicting = true;
                    }
                }
            }
        }
    }
    // Remove the aliases for those symbols.
    for variable in syntax_grammar.variables.iter_mut() {
        for production in variable.productions.iter_mut() {
            for step in production.steps.iter_mut() {
                let status = match step.symbol {
                    Symbol {
                        kind: SymbolType::External,
                        index,
                    } => &external_status_list[index],
                    Symbol {
                        kind: SymbolType::NonTerminal,
                        index,
                    } => &non_terminal_status_list[index],
                    Symbol {
                        kind: SymbolType::Terminal,
                        index,
                    } => &terminal_status_list[index],
                    Symbol {
                        kind: SymbolType::End,
                        ..
                    } => panic!("Unexpected end token"),
                };
                // The alias is redundant for a simply-aliased symbol, so drop it.
                if status.alias.is_some() {
                    step.alias = None;
                }
            }
        }
    }
    // Populate a map of the symbols to their aliases.
    let mut result = AliasMap::new();
    for (i, status) in terminal_status_list.into_iter().enumerate() {
        if let Some(alias) = status.alias {
            result.insert(Symbol::terminal(i), alias);
        }
    }
    for (i, status) in non_terminal_status_list.into_iter().enumerate() {
        if let Some(alias) = status.alias {
            result.insert(Symbol::non_terminal(i), alias);
        }
    }
    for (i, status) in external_status_list.into_iter().enumerate() {
        if let Some(alias) = status.alias {
            result.insert(Symbol::external(i), alias);
        }
    }
    result
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::generate::grammars::{
        LexicalVariable, Production, ProductionStep, SyntaxVariable, VariableType,
    };
    use crate::generate::nfa::Nfa;

    #[test]
    fn test_extract_simple_aliases() {
        let mut syntax_grammar = SyntaxGrammar {
            variables: vec![
                SyntaxVariable {
                    name: "v1".to_owned(),
                    kind: VariableType::Named,
                    productions: vec![Production {
                        dynamic_precedence: 0,
                        steps: vec![
                            ProductionStep::new(Symbol::terminal(0)).with_alias("a1", true),
                            ProductionStep::new(Symbol::terminal(1)).with_alias("a2", true),
                            ProductionStep::new(Symbol::terminal(2)).with_alias("a3", true),
                        ],
                    }],
                },
                SyntaxVariable {
                    name: "v2".to_owned(),
                    kind: VariableType::Named,
                    productions: vec![Production {
                        dynamic_precedence: 0,
                        steps: vec![
                            // Token 0 is always aliased as "a1".
                            ProductionStep::new(Symbol::terminal(0)).with_alias("a1", true),
                            // Token 1 is aliased above, but not here.
                            ProductionStep::new(Symbol::terminal(1)),
                            // Token 2 is aliased differently than above.
                            ProductionStep::new(Symbol::terminal(2)).with_alias("a4", true),
                        ],
                    }],
                },
            ],
            extra_tokens: Vec::new(),
            expected_conflicts: Vec::new(),
            variables_to_inline: Vec::new(),
            supertype_symbols: Vec::new(),
            external_tokens: Vec::new(),
            word_token: None,
        };
        let lexical_grammar = LexicalGrammar {
            nfa: Nfa::new(),
            variables: vec![
                LexicalVariable {
                    name: "t1".to_string(),
                    kind: VariableType::Anonymous,
                    implicit_precedence: 0,
                    start_state: 0,
                },
                LexicalVariable {
                    name: "t2".to_string(),
                    kind: VariableType::Anonymous,
                    implicit_precedence: 0,
                    start_state: 0,
                },
                LexicalVariable {
                    name: "t3".to_string(),
                    kind: VariableType::Anonymous,
                    implicit_precedence: 0,
                    start_state: 0,
                },
            ],
        };
        let simple_aliases = extract_simple_aliases(&mut syntax_grammar, &lexical_grammar);
        // Only token 0 is aliased identically everywhere it appears.
        assert_eq!(simple_aliases.len(), 1);
        assert_eq!(
            simple_aliases[&Symbol::terminal(0)],
            Alias {
                value: "a1".to_string(),
                is_named: true,
            }
        );
        assert_eq!(
            syntax_grammar.variables,
            vec![
                SyntaxVariable {
                    name: "v1".to_owned(),
                    kind: VariableType::Named,
                    productions: vec![Production {
                        dynamic_precedence: 0,
                        steps: vec![
                            // 'Simple' alias removed
                            ProductionStep::new(Symbol::terminal(0)),
                            // Other aliases unchanged
                            ProductionStep::new(Symbol::terminal(1)).with_alias("a2", true),
                            ProductionStep::new(Symbol::terminal(2)).with_alias("a3", true),
                        ],
                    },],
                },
                SyntaxVariable {
                    name: "v2".to_owned(),
                    kind: VariableType::Named,
                    productions: vec![Production {
                        dynamic_precedence: 0,
                        steps: vec![
                            ProductionStep::new(Symbol::terminal(0)),
                            ProductionStep::new(Symbol::terminal(1)),
                            ProductionStep::new(Symbol::terminal(2)).with_alias("a4", true),
                        ],
                    },],
                },
            ]
        );
    }
}

View file

@ -90,21 +90,13 @@ pub(super) fn extract_tokens(
.collect();
let mut separators = Vec::new();
let mut extra_tokens = Vec::new();
for rule in grammar.extra_tokens {
let mut extra_symbols = Vec::new();
for rule in grammar.extra_symbols {
if let Rule::Symbol(symbol) = rule {
let new_symbol = symbol_replacer.replace_symbol(symbol);
if new_symbol.is_non_terminal() {
return Error::err(format!(
"Non-token symbol '{}' cannot be used as an extra token",
&variables[new_symbol.index].name
));
} else {
extra_tokens.push(new_symbol);
}
extra_symbols.push(symbol_replacer.replace_symbol(symbol));
} else {
if let Some(index) = lexical_variables.iter().position(|v| v.rule == rule) {
extra_tokens.push(Symbol::terminal(index));
extra_symbols.push(Symbol::terminal(index));
} else {
separators.push(rule);
}
@ -158,7 +150,7 @@ pub(super) fn extract_tokens(
ExtractedSyntaxGrammar {
variables,
expected_conflicts,
extra_tokens,
extra_symbols,
variables_to_inline,
supertype_symbols,
external_tokens,
@ -415,15 +407,15 @@ mod test {
}
#[test]
fn test_extracting_extra_tokens() {
fn test_extracting_extra_symbols() {
let mut grammar = build_grammar(vec![
Variable::named("rule_0", Rule::string("x")),
Variable::named("comment", Rule::pattern("//.*")),
]);
grammar.extra_tokens = vec![Rule::string(" "), Rule::non_terminal(1)];
grammar.extra_symbols = vec![Rule::string(" "), Rule::non_terminal(1)];
let (syntax_grammar, lexical_grammar) = extract_tokens(grammar).unwrap();
assert_eq!(syntax_grammar.extra_tokens, vec![Symbol::terminal(1),]);
assert_eq!(syntax_grammar.extra_symbols, vec![Symbol::terminal(1),]);
assert_eq!(lexical_grammar.separators, vec![Rule::string(" "),]);
}
@ -472,28 +464,6 @@ mod test {
);
}
#[test]
fn test_error_on_non_terminal_symbol_extras() {
let mut grammar = build_grammar(vec![
Variable::named("rule_0", Rule::non_terminal(1)),
Variable::named("rule_1", Rule::non_terminal(2)),
Variable::named("rule_2", Rule::string("x")),
]);
grammar.extra_tokens = vec![Rule::non_terminal(1)];
match extract_tokens(grammar) {
Err(e) => {
assert_eq!(
e.message(),
"Non-token symbol 'rule_1' cannot be used as an extra token"
);
}
_ => {
panic!("Expected an error but got no error");
}
}
}
#[test]
fn test_error_on_external_with_same_name_as_non_terminal() {
let mut grammar = build_grammar(vec![
@ -522,7 +492,7 @@ mod test {
fn build_grammar(variables: Vec<Variable>) -> InternedGrammar {
InternedGrammar {
variables,
extra_tokens: Vec::new(),
extra_symbols: Vec::new(),
external_tokens: Vec::new(),
expected_conflicts: Vec::new(),
variables_to_inline: Vec::new(),

View file

@ -199,7 +199,7 @@ unless they are used only as the grammar's start rule.
}
}
Ok(SyntaxGrammar {
extra_tokens: grammar.extra_tokens,
extra_symbols: grammar.extra_symbols,
expected_conflicts: grammar.expected_conflicts,
variables_to_inline: grammar.variables_to_inline,
external_tokens: grammar.external_tokens,

View file

@ -30,9 +30,9 @@ pub(super) fn intern_symbols(grammar: &InputGrammar) -> Result<InternedGrammar>
external_tokens.push(Variable { name, kind, rule });
}
let mut extra_tokens = Vec::with_capacity(grammar.extra_tokens.len());
for extra_token in grammar.extra_tokens.iter() {
extra_tokens.push(interner.intern_rule(extra_token)?);
let mut extra_symbols = Vec::with_capacity(grammar.extra_symbols.len());
for extra_token in grammar.extra_symbols.iter() {
extra_symbols.push(interner.intern_rule(extra_token)?);
}
let mut supertype_symbols = Vec::with_capacity(grammar.supertype_symbols.len());
@ -73,10 +73,16 @@ pub(super) fn intern_symbols(grammar: &InputGrammar) -> Result<InternedGrammar>
);
}
for (i, variable) in variables.iter_mut().enumerate() {
if supertype_symbols.contains(&Symbol::non_terminal(i)) {
variable.kind = VariableType::Hidden;
}
}
Ok(InternedGrammar {
variables,
external_tokens,
extra_tokens,
extra_symbols,
expected_conflicts,
variables_to_inline,
supertype_symbols,
@ -236,7 +242,7 @@ mod tests {
InputGrammar {
variables,
name: "the_language".to_string(),
extra_tokens: Vec::new(),
extra_symbols: Vec::new(),
external_tokens: Vec::new(),
expected_conflicts: Vec::new(),
variables_to_inline: Vec::new(),

View file

@ -1,6 +1,6 @@
mod expand_repeats;
mod expand_tokens;
mod extract_simple_aliases;
mod extract_default_aliases;
mod extract_tokens;
mod flatten_grammar;
mod intern_symbols;
@ -8,7 +8,7 @@ mod process_inlines;
use self::expand_repeats::expand_repeats;
pub(crate) use self::expand_tokens::expand_tokens;
use self::extract_simple_aliases::extract_simple_aliases;
use self::extract_default_aliases::extract_default_aliases;
use self::extract_tokens::extract_tokens;
use self::flatten_grammar::flatten_grammar;
use self::intern_symbols::intern_symbols;
@ -21,7 +21,7 @@ use crate::generate::rules::{AliasMap, Rule, Symbol};
pub(crate) struct IntermediateGrammar<T, U> {
variables: Vec<Variable>,
extra_tokens: Vec<T>,
extra_symbols: Vec<T>,
expected_conflicts: Vec<Vec<Symbol>>,
external_tokens: Vec<U>,
variables_to_inline: Vec<Symbol>,
@ -52,7 +52,7 @@ pub(crate) fn prepare_grammar(
let syntax_grammar = expand_repeats(syntax_grammar);
let mut syntax_grammar = flatten_grammar(syntax_grammar)?;
let lexical_grammar = expand_tokens(lexical_grammar)?;
let simple_aliases = extract_simple_aliases(&mut syntax_grammar, &lexical_grammar);
let default_aliases = extract_default_aliases(&mut syntax_grammar, &lexical_grammar);
let inlines = process_inlines(&syntax_grammar);
Ok((syntax_grammar, lexical_grammar, inlines, simple_aliases))
Ok((syntax_grammar, lexical_grammar, inlines, default_aliases))
}

View file

@ -127,6 +127,9 @@ impl InlinedProductionMapBuilder {
last_inserted_step.associativity = removed_step.associativity;
}
}
if p.dynamic_precedence.abs() > production.dynamic_precedence.abs() {
production.dynamic_precedence = p.dynamic_precedence;
}
production
}),
);
@ -196,7 +199,7 @@ mod tests {
fn test_basic_inlining() {
let grammar = SyntaxGrammar {
expected_conflicts: Vec::new(),
extra_tokens: Vec::new(),
extra_symbols: Vec::new(),
external_tokens: Vec::new(),
supertype_symbols: Vec::new(),
word_token: None,
@ -226,7 +229,7 @@ mod tests {
],
},
Production {
dynamic_precedence: 0,
dynamic_precedence: -2,
steps: vec![ProductionStep::new(Symbol::terminal(14))],
},
],
@ -258,7 +261,7 @@ mod tests {
],
},
Production {
dynamic_precedence: 0,
dynamic_precedence: -2,
steps: vec![
ProductionStep::new(Symbol::terminal(10)),
ProductionStep::new(Symbol::terminal(14)),
@ -327,7 +330,7 @@ mod tests {
Symbol::non_terminal(3),
],
expected_conflicts: Vec::new(),
extra_tokens: Vec::new(),
extra_symbols: Vec::new(),
external_tokens: Vec::new(),
supertype_symbols: Vec::new(),
word_token: None,
@ -429,7 +432,7 @@ mod tests {
},
],
expected_conflicts: Vec::new(),
extra_tokens: Vec::new(),
extra_symbols: Vec::new(),
external_tokens: Vec::new(),
supertype_symbols: Vec::new(),
word_token: None,

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -1,3 +1,4 @@
use super::grammars::VariableType;
use smallbitvec::SmallBitVec;
use std::collections::HashMap;
use std::iter::FromIterator;
@ -139,6 +140,16 @@ impl Rule {
}
}
impl Alias {
    /// The variable type this alias produces: named aliases yield named nodes,
    /// all others yield anonymous nodes.
    pub fn kind(&self) -> VariableType {
        match self.is_named {
            true => VariableType::Named,
            false => VariableType::Anonymous,
        }
    }
}
#[cfg(test)]
impl Rule {
pub fn terminal(index: usize) -> Self {
@ -366,7 +377,7 @@ impl FromIterator<Symbol> for TokenSet {
fn add_metadata<T: FnOnce(&mut MetadataParams)>(input: Rule, f: T) -> Rule {
match input {
Rule::Metadata { rule, mut params } => {
Rule::Metadata { rule, mut params } if !params.is_token => {
f(&mut params);
Rule::Metadata { rule, params }
}

View file

@ -24,6 +24,12 @@ pub(crate) enum ParseAction {
},
}
// The action taken on a non-terminal symbol in a parse state.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub(crate) enum GotoAction {
    // Transition to the given parse state.
    Goto(ParseStateId),
    // NOTE(review): name suggests the symbol is consumed as an "extra" without
    // changing state — confirm against the parser's handling of this variant.
    ShiftExtra,
}
#[derive(Clone, Debug, PartialEq, Eq)]
pub(crate) struct ParseTableEntry {
pub actions: Vec<ParseAction>,
@ -34,10 +40,11 @@ pub(crate) struct ParseTableEntry {
pub(crate) struct ParseState {
    // This state's index in the parse table.
    pub id: ParseStateId,
    // Actions keyed by terminal symbol.
    pub terminal_entries: HashMap<Symbol, ParseTableEntry>,
    // Goto actions keyed by non-terminal symbol.
    pub nonterminal_entries: HashMap<Symbol, GotoAction>,
    // Index of the lex state used while lexing in this parse state.
    pub lex_state_id: usize,
    // Index of the external-scanner lex state for this parse state.
    pub external_lex_state_id: usize,
    // NOTE(review): presumably identifies the item-set core shared by states
    // that were merged or grouped — confirm against the table builder.
    pub core_id: usize,
    // True if this state belongs to a non-terminal "extra" rule.
    pub is_non_terminal_extra: bool,
}
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
@ -70,6 +77,7 @@ pub(crate) struct AdvanceAction {
#[derive(Clone, Debug, Default, PartialEq, Eq, PartialOrd, Ord)]
pub(crate) struct LexState {
    // The token to accept if lexing ends in this state.
    pub accept_action: Option<Symbol>,
    // Transition taken when end-of-file is reached while in this state
    // (replaces the previous representation of EOF as a '\0' advance entry).
    pub eof_action: Option<AdvanceAction>,
    // Transitions keyed by the set of characters that trigger them.
    pub advance_actions: Vec<(CharacterSet, AdvanceAction)>,
}
@ -103,7 +111,13 @@ impl ParseState {
_ => None,
})
})
.chain(self.nonterminal_entries.iter().map(|(_, state)| *state))
.chain(self.nonterminal_entries.iter().filter_map(|(_, action)| {
if let GotoAction::Goto(state) = action {
Some(*state)
} else {
None
}
}))
}
pub fn update_referenced_states<F>(&mut self, mut f: F)
@ -121,15 +135,18 @@ impl ParseState {
}
}
}
for (symbol, other_state) in &self.nonterminal_entries {
let result = f(*other_state, self);
if result != *other_state {
updates.push((*symbol, 0, result));
for (symbol, action) in &self.nonterminal_entries {
if let GotoAction::Goto(other_state) = action {
let result = f(*other_state, self);
if result != *other_state {
updates.push((*symbol, 0, result));
}
}
}
for (symbol, action_index, new_state) in updates {
if symbol.is_non_terminal() {
self.nonterminal_entries.insert(symbol, new_state);
self.nonterminal_entries
.insert(symbol, GotoAction::Goto(new_state));
} else {
let entry = self.terminal_entries.get_mut(&symbol).unwrap();
if let ParseAction::Shift { is_repetition, .. } = entry.actions[action_index] {