Merge pull request #469 from tree-sitter/non-terminal-extras
Allow non-terminal extras
commit 6cd82574a3
17 changed files with 304 additions and 113 deletions
@@ -7,7 +7,7 @@ use crate::generate::grammars::{
 use crate::generate::node_types::VariableInfo;
 use crate::generate::rules::{Associativity, Symbol, SymbolType, TokenSet};
 use crate::generate::tables::{
-    FieldLocation, ParseAction, ParseState, ParseStateId, ParseTable, ParseTableEntry,
+    FieldLocation, GotoAction, ParseAction, ParseState, ParseStateId, ParseTable, ParseTableEntry,
     ProductionInfo, ProductionInfoId,
 };
 use core::ops::Range;
@@ -16,17 +16,19 @@ use std::collections::{BTreeMap, HashMap, HashSet, VecDeque};
 use std::fmt::Write;
 use std::u32;
 
+// For conflict reporting, each parse state is associated with an example
+// sequence of symbols that could lead to that parse state.
+type SymbolSequence = Vec<Symbol>;
+
+type AuxiliarySymbolSequence = Vec<AuxiliarySymbolInfo>;
+pub(crate) type ParseStateInfo<'a> = (SymbolSequence, ParseItemSet<'a>);
+
 #[derive(Clone)]
 struct AuxiliarySymbolInfo {
     auxiliary_symbol: Symbol,
     parent_symbols: Vec<Symbol>,
 }
 
-type SymbolSequence = Vec<Symbol>;
-type AuxiliarySymbolSequence = Vec<AuxiliarySymbolInfo>;
-
-pub(crate) type ParseStateInfo<'a> = (SymbolSequence, ParseItemSet<'a>);
-
 struct ParseStateQueueEntry {
     state_id: ParseStateId,
     preceding_auxiliary_symbols: AuxiliarySymbolSequence,
@@ -41,6 +43,7 @@ struct ParseTableBuilder<'a> {
     state_ids_by_item_set: HashMap<ParseItemSet<'a>, ParseStateId>,
     parse_state_info_by_id: Vec<ParseStateInfo<'a>>,
     parse_state_queue: VecDeque<ParseStateQueueEntry>,
+    non_terminal_extra_states: Vec<(Symbol, usize)>,
     parse_table: ParseTable,
 }
 
@@ -52,7 +55,7 @@ impl<'a> ParseTableBuilder<'a> {
             .push(ProductionInfo::default());
 
         // Add the error state at index 0.
-        self.add_parse_state(&Vec::new(), &Vec::new(), ParseItemSet::default());
+        self.add_parse_state(&Vec::new(), &Vec::new(), ParseItemSet::default(), false);
 
         // Add the starting state at index 1.
         self.add_parse_state(
@@ -66,8 +69,40 @@ impl<'a> ParseTableBuilder<'a> {
                     .iter()
                     .cloned(),
             ),
+            false,
         );
 
+        // Compute the possible item sets for non-terminal extras.
+        let mut non_terminal_extra_item_sets_by_first_terminal = BTreeMap::new();
+        for extra_non_terminal in self
+            .syntax_grammar
+            .extra_symbols
+            .iter()
+            .filter(|s| s.is_non_terminal())
+        {
+            let variable = &self.syntax_grammar.variables[extra_non_terminal.index];
+            for production in &variable.productions {
+                non_terminal_extra_item_sets_by_first_terminal
+                    .entry(production.first_symbol().unwrap())
+                    .or_insert(ParseItemSet::default())
+                    .insert(
+                        ParseItem {
+                            variable_index: extra_non_terminal.index as u32,
+                            production,
+                            step_index: 1,
+                        },
+                        &[Symbol::end()].iter().cloned().collect(),
+                    );
+            }
+        }
+
+        // Add a state for each starting terminal of a non-terminal extra rule.
+        for (terminal, item_set) in non_terminal_extra_item_sets_by_first_terminal {
+            self.non_terminal_extra_states
+                .push((terminal, self.parse_table.states.len()));
+            self.add_parse_state(&Vec::new(), &Vec::new(), item_set, true);
+        }
+
         while let Some(entry) = self.parse_state_queue.pop_front() {
            let item_set = self
                .item_set_builder
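The builder above precomputes, for every non-terminal listed in the grammar's extras, one item set per distinct starting terminal, so that a dedicated parse state can be created for each entry point. A minimal, self-contained sketch of that grouping pattern, using simplified stand-in types rather than the generator's real `ParseItem`/`ParseItemSet`:

    use std::collections::BTreeMap;

    // Simplified stand-ins for the generator's types (hypothetical).
    #[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Debug)]
    struct Symbol(usize);

    struct Production {
        steps: Vec<Symbol>,
    }

    fn main() {
        // Two productions of a non-terminal extra such as `comment`; both
        // begin with the same "(" terminal, so they share one entry state.
        let productions = vec![
            Production { steps: vec![Symbol(10), Symbol(11), Symbol(12)] },
            Production { steps: vec![Symbol(10), Symbol(12)] },
        ];

        // Mirrors `non_terminal_extra_item_sets_by_first_terminal` above:
        // bucket each production under the terminal that can begin it.
        let mut by_first_terminal: BTreeMap<Symbol, Vec<&Production>> = BTreeMap::new();
        for production in &productions {
            let first = *production.steps.first().unwrap();
            by_first_terminal.entry(first).or_insert_with(Vec::new).push(production);
        }

        // One dedicated parse state would be added per key of this map.
        assert_eq!(by_first_terminal.len(), 1);
    }

One plausible reason the real code uses a `BTreeMap` rather than a `HashMap` is its sorted, deterministic iteration order, which keeps the numbering of the generated states stable across runs.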
@@ -91,9 +126,15 @@ impl<'a> ParseTableBuilder<'a> {
         preceding_symbols: &SymbolSequence,
         preceding_auxiliary_symbols: &AuxiliarySymbolSequence,
         item_set: ParseItemSet<'a>,
+        is_non_terminal_extra: bool,
     ) -> ParseStateId {
         match self.state_ids_by_item_set.entry(item_set) {
+            // If an equivalent item set has already been processed, then return
+            // the existing parse state index.
             Entry::Occupied(o) => *o.get(),
+
+            // Otherwise, insert a new parse state and add it to the queue of
+            // parse states to populate.
             Entry::Vacant(v) => {
                 let core = v.key().core();
                 let core_count = self.core_ids_by_core.len();
@@ -116,6 +157,7 @@ impl<'a> ParseTableBuilder<'a> {
                     terminal_entries: HashMap::new(),
                     nonterminal_entries: HashMap::new(),
                     core_id,
+                    is_non_terminal_extra,
                 });
                 self.parse_state_queue.push_back(ParseStateQueueEntry {
                     state_id,
@@ -138,7 +180,12 @@ impl<'a> ParseTableBuilder<'a> {
         let mut non_terminal_successors = BTreeMap::new();
         let mut lookaheads_with_conflicts = TokenSet::new();
 
+        // Each item in the item set contributes to either a Shift action or a
+        // Reduce action in this state.
         for (item, lookaheads) in &item_set.entries {
+            // If the item is unfinished, then this state has a transition for the item's
+            // next symbol. Advance the item to its next step and insert the resulting
+            // item into the successor item set.
             if let Some(next_symbol) = item.symbol() {
                 let successor = item.successor();
                 if next_symbol.is_non_terminal() {
@@ -160,7 +207,10 @@ impl<'a> ParseTableBuilder<'a> {
                         .or_insert_with(|| ParseItemSet::default())
                         .insert(successor, lookaheads);
                 }
-            } else {
+            }
+            // If the item is finished, then add a Reduce action to this state based
+            // on this item.
+            else {
                 let action = if item.is_augmented() {
                     ParseAction::Accept
                 } else {
@@ -179,6 +229,10 @@ impl<'a> ParseTableBuilder<'a> {
                         .terminal_entries
                         .entry(lookahead);
                     let entry = entry.or_insert_with(|| ParseTableEntry::new());
+
+                    // While inserting Reduce actions, eagerly resolve conflicts related
+                    // to precedence: avoid inserting lower-precedence reductions, and
+                    // clear the action list when inserting higher-precedence reductions.
                     if entry.actions.is_empty() {
                         entry.actions.push(action);
                     } else if action.precedence() > entry.actions[0].precedence() {
@@ -193,12 +247,16 @@ impl<'a> ParseTableBuilder<'a> {
                 }
             }
 
+            // Having computed the successor item sets for each symbol, add a new
+            // parse state for each of these item sets, and add a corresponding Shift
+            // action to this state.
             for (symbol, next_item_set) in terminal_successors {
                 preceding_symbols.push(symbol);
                 let next_state_id = self.add_parse_state(
                     &preceding_symbols,
                     &preceding_auxiliary_symbols,
                     next_item_set,
+                    self.parse_table.states[state_id].is_non_terminal_extra,
                 );
                 preceding_symbols.pop();
 
@@ -226,13 +284,19 @@ impl<'a> ParseTableBuilder<'a> {
                     &preceding_symbols,
                     &preceding_auxiliary_symbols,
                     next_item_set,
+                    self.parse_table.states[state_id].is_non_terminal_extra,
                 );
                 preceding_symbols.pop();
                 self.parse_table.states[state_id]
                     .nonterminal_entries
-                    .insert(symbol, next_state_id);
+                    .insert(symbol, GotoAction::Goto(next_state_id));
             }
 
+            // For any symbol with multiple actions, perform conflict resolution.
+            // This will either
+            // * choose one action over the others using precedence or associativity
+            // * keep multiple actions if this conflict has been whitelisted in the grammar
+            // * fail, terminating the parser generation process
             for symbol in lookaheads_with_conflicts.iter() {
                 self.handle_conflict(
                     &item_set,
@@ -243,15 +307,50 @@ impl<'a> ParseTableBuilder<'a> {
             )?;
         }
 
         // Finally, add actions for the grammar's `extra` symbols.
         let state = &mut self.parse_table.states[state_id];
-        for extra_token in &self.syntax_grammar.extra_tokens {
-            state
-                .terminal_entries
-                .entry(*extra_token)
-                .or_insert(ParseTableEntry {
-                    reusable: true,
-                    actions: vec![ParseAction::ShiftExtra],
-                });
-        }
+        let is_non_terminal_extra = state.is_non_terminal_extra;
+        let is_end_of_non_terminal_extra =
+            is_non_terminal_extra && state.terminal_entries.len() == 1;
+
+        // Add actions for the start tokens of each non-terminal extra rule.
+        // These actions are added to every state except for the states that are
+        // already within non-terminal extras. Non-terminal extras are not allowed
+        // to nest within each other.
+        if !is_non_terminal_extra {
+            for (terminal, state_id) in &self.non_terminal_extra_states {
+                state
+                    .terminal_entries
+                    .entry(*terminal)
+                    .or_insert(ParseTableEntry {
+                        reusable: true,
+                        actions: vec![ParseAction::Shift {
+                            state: *state_id,
+                            is_repetition: false,
+                        }],
+                    });
+            }
+        }
+
+        // Add ShiftExtra actions for the terminal extra tokens. These actions
+        // are added to every state except for those at the ends of non-terminal
+        // extras.
+        if !is_end_of_non_terminal_extra {
+            for extra_token in &self.syntax_grammar.extra_symbols {
+                if extra_token.is_non_terminal() {
+                    state
+                        .nonterminal_entries
+                        .insert(*extra_token, GotoAction::ShiftExtra);
+                } else {
+                    state
+                        .terminal_entries
+                        .entry(*extra_token)
+                        .or_insert(ParseTableEntry {
+                            reusable: true,
+                            actions: vec![ParseAction::ShiftExtra],
+                        });
+                }
+            }
+        }
 
         Ok(())
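Two details in the block above are worth unpacking. A state "at the end of a non-terminal extra" is detected by `state.terminal_entries.len() == 1`: the only lookahead left there is the end-of-rule marker, so neither extra tokens nor the start tokens of other extras may be shifted, which is also how nesting of extras is ruled out. And the extras split two ways: a terminal extra becomes a `ShiftExtra` parse action keyed by its token, while a non-terminal extra becomes a `GotoAction::ShiftExtra` in the goto section; `entry(...).or_insert(...)` ensures an action the grammar itself defined for a token is never overwritten. A toy restatement of that dispatch over simplified stand-in types (not the generator's own):

    use std::collections::HashMap;

    #[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
    enum Symbol { Terminal(usize), NonTerminal(usize) }

    #[derive(Debug, PartialEq)]
    enum GotoAction { Goto(usize), ShiftExtra }

    #[derive(Default, Debug)]
    struct State {
        terminal_entries: HashMap<Symbol, &'static str>, // action lists, simplified
        nonterminal_entries: HashMap<Symbol, GotoAction>,
    }

    // Mirror of the dispatch above: terminal extras become ShiftExtra parse
    // actions; non-terminal extras become ShiftExtra goto actions.
    fn add_extras(state: &mut State, extras: &[Symbol]) {
        for extra in extras {
            match extra {
                Symbol::Terminal(_) => {
                    state.terminal_entries.entry(*extra).or_insert("ShiftExtra");
                }
                Symbol::NonTerminal(_) => {
                    state.nonterminal_entries.insert(*extra, GotoAction::ShiftExtra);
                }
            }
        }
    }

    fn main() {
        let mut state = State::default();
        add_extras(&mut state, &[Symbol::Terminal(3), Symbol::NonTerminal(7)]);
        assert_eq!(state.terminal_entries[&Symbol::Terminal(3)], "ShiftExtra");
        assert_eq!(state.nonterminal_entries[&Symbol::NonTerminal(7)], GotoAction::ShiftExtra);
    }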
@@ -362,8 +461,8 @@ impl<'a> ParseTableBuilder<'a> {
             }
         }
 
-        // If all reduce actions are left associative, remove the SHIFT action.
-        // If all reduce actions are right associative, remove the REDUCE actions.
+        // If all Reduce actions are left associative, remove the SHIFT action.
+        // If all Reduce actions are right associative, remove the REDUCE actions.
         match (has_left, has_non, has_right) {
             (true, false, false) => {
                 entry.actions.pop();
@@ -744,7 +843,7 @@ fn populate_following_tokens(
             }
         }
     }
-    for extra in &grammar.extra_tokens {
+    for extra in &grammar.extra_symbols {
        if extra.is_terminal() {
            for entry in result.iter_mut() {
                entry.insert(*extra);
@@ -774,6 +873,7 @@ pub(crate) fn build_parse_table<'a>(
         lexical_grammar,
         item_set_builder,
         variable_info,
+        non_terminal_extra_states: Vec::new(),
         state_ids_by_item_set: HashMap::new(),
         core_ids_by_core: HashMap::new(),
         parse_state_info_by_id: Vec::new(),
@@ -2,7 +2,9 @@ use super::token_conflicts::TokenConflictMap;
 use crate::generate::dedup::split_state_id_groups;
 use crate::generate::grammars::{LexicalGrammar, SyntaxGrammar, VariableType};
 use crate::generate::rules::{AliasMap, Symbol, TokenSet};
-use crate::generate::tables::{ParseAction, ParseState, ParseStateId, ParseTable, ParseTableEntry};
+use crate::generate::tables::{
+    GotoAction, ParseAction, ParseState, ParseStateId, ParseTable, ParseTableEntry,
+};
 use log::info;
 use std::collections::{HashMap, HashSet};
 use std::mem;
@@ -101,7 +103,10 @@ impl<'a> Minimizer<'a> {
             state.update_referenced_states(|other_state_id, state| {
                 if let Some(symbol) = unit_reduction_symbols_by_state.get(&other_state_id) {
                     done = false;
-                    state.nonterminal_entries[symbol]
+                    match state.nonterminal_entries.get(symbol) {
+                        Some(GotoAction::Goto(state_id)) => *state_id,
+                        _ => other_state_id,
+                    }
                 } else {
                     other_state_id
                 }
@@ -262,18 +267,24 @@ impl<'a> Minimizer<'a> {
 
         for (symbol, s1) in &state1.nonterminal_entries {
             if let Some(s2) = state2.nonterminal_entries.get(symbol) {
-                let group1 = group_ids_by_state_id[*s1];
-                let group2 = group_ids_by_state_id[*s2];
-                if group1 != group2 {
-                    info!(
-                        "split states {} {} - successors for {} are split: {} {}",
-                        state1.id,
-                        state2.id,
-                        self.symbol_name(symbol),
-                        s1,
-                        s2,
-                    );
-                    return true;
-                }
+                match (s1, s2) {
+                    (GotoAction::ShiftExtra, GotoAction::ShiftExtra) => continue,
+                    (GotoAction::Goto(s1), GotoAction::Goto(s2)) => {
+                        let group1 = group_ids_by_state_id[*s1];
+                        let group2 = group_ids_by_state_id[*s2];
+                        if group1 != group2 {
+                            info!(
+                                "split states {} {} - successors for {} are split: {} {}",
+                                state1.id,
+                                state2.id,
+                                self.symbol_name(symbol),
+                                s1,
+                                s2,
+                            );
+                            return true;
+                        }
+                    }
+                    _ => return true,
+                }
             }
         }
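The rewritten comparison above encodes when two states' goto entries are compatible during table minimization: two `ShiftExtra` entries always agree, two `Goto` entries agree only if their target states currently belong to the same group, and a mixed pair forces a split. A compact sketch of that rule, where the local `GotoAction` is a stand-in and `group_of` plays the role of `group_ids_by_state_id`:

    #[derive(Clone, Copy, PartialEq, Eq, Debug)]
    enum GotoAction {
        Goto(usize),
        ShiftExtra,
    }

    // Sketch of the minimizer's compatibility rule for one non-terminal entry.
    fn entries_compatible(a: GotoAction, b: GotoAction, group_of: &[usize]) -> bool {
        match (a, b) {
            // Both states shift the extra and stay put: always compatible.
            (GotoAction::ShiftExtra, GotoAction::ShiftExtra) => true,
            // Two gotos are compatible only if their targets share a group.
            (GotoAction::Goto(s1), GotoAction::Goto(s2)) => group_of[s1] == group_of[s2],
            // A goto paired with a ShiftExtra can never be merged.
            _ => false,
        }
    }

    fn main() {
        let group_of = vec![0, 1, 1];
        assert!(entries_compatible(GotoAction::Goto(1), GotoAction::Goto(2), &group_of));
        assert!(!entries_compatible(GotoAction::Goto(0), GotoAction::ShiftExtra, &group_of));
    }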
@@ -23,7 +23,7 @@ pub(crate) struct Variable {
 pub(crate) struct InputGrammar {
     pub name: String,
     pub variables: Vec<Variable>,
-    pub extra_tokens: Vec<Rule>,
+    pub extra_symbols: Vec<Rule>,
     pub expected_conflicts: Vec<Vec<String>>,
     pub external_tokens: Vec<Rule>,
     pub variables_to_inline: Vec<String>,
@@ -87,7 +87,7 @@ pub(crate) struct ExternalToken {
 #[derive(Debug, Default)]
 pub(crate) struct SyntaxGrammar {
     pub variables: Vec<SyntaxVariable>,
-    pub extra_tokens: Vec<Symbol>,
+    pub extra_symbols: Vec<Symbol>,
     pub expected_conflicts: Vec<Vec<Symbol>>,
     pub external_tokens: Vec<ExternalToken>,
     pub supertype_symbols: Vec<Symbol>,
@@ -689,7 +689,7 @@ mod tests {
     fn test_node_types_simple() {
         let node_types = get_node_types(InputGrammar {
             name: String::new(),
-            extra_tokens: Vec::new(),
+            extra_symbols: Vec::new(),
             external_tokens: Vec::new(),
             expected_conflicts: Vec::new(),
             variables_to_inline: Vec::new(),
@@ -775,7 +775,7 @@ mod tests {
     fn test_node_types_with_supertypes() {
         let node_types = get_node_types(InputGrammar {
             name: String::new(),
-            extra_tokens: Vec::new(),
+            extra_symbols: Vec::new(),
             external_tokens: Vec::new(),
             expected_conflicts: Vec::new(),
             variables_to_inline: Vec::new(),
@@ -862,7 +862,7 @@ mod tests {
     fn test_node_types_for_children_without_fields() {
         let node_types = get_node_types(InputGrammar {
             name: String::new(),
-            extra_tokens: Vec::new(),
+            extra_symbols: Vec::new(),
             external_tokens: Vec::new(),
             expected_conflicts: Vec::new(),
             variables_to_inline: Vec::new(),
@@ -960,7 +960,7 @@ mod tests {
     fn test_node_types_for_aliased_nodes() {
         let node_types = get_node_types(InputGrammar {
             name: String::new(),
-            extra_tokens: Vec::new(),
+            extra_symbols: Vec::new(),
             external_tokens: Vec::new(),
             expected_conflicts: Vec::new(),
             variables_to_inline: Vec::new(),
@@ -1036,7 +1036,7 @@ mod tests {
     fn test_node_types_with_multiple_valued_fields() {
         let node_types = get_node_types(InputGrammar {
             name: String::new(),
-            extra_tokens: Vec::new(),
+            extra_symbols: Vec::new(),
             external_tokens: Vec::new(),
             expected_conflicts: Vec::new(),
             variables_to_inline: Vec::new(),
@@ -87,7 +87,7 @@ pub(crate) fn parse_grammar(input: &str) -> Result<InputGrammar> {
         })
     }
 
-    let extra_tokens = grammar_json
+    let extra_symbols = grammar_json
         .extras
         .unwrap_or(Vec::new())
         .into_iter()
@@ -107,7 +107,7 @@ pub(crate) fn parse_grammar(input: &str) -> Result<InputGrammar> {
         name: grammar_json.name,
         word_token: grammar_json.word,
         variables,
-        extra_tokens,
+        extra_symbols,
         expected_conflicts,
         external_tokens,
         supertype_symbols,
@@ -283,7 +283,7 @@ mod tests {
     fn build_grammar(variables: Vec<Variable>) -> ExtractedSyntaxGrammar {
         ExtractedSyntaxGrammar {
             variables,
-            extra_tokens: Vec::new(),
+            extra_symbols: Vec::new(),
             external_tokens: Vec::new(),
             expected_conflicts: Vec::new(),
             variables_to_inline: Vec::new(),
@@ -146,7 +146,7 @@ mod tests {
                 }],
             },
         ],
-        extra_tokens: Vec::new(),
+        extra_symbols: Vec::new(),
         expected_conflicts: Vec::new(),
         variables_to_inline: Vec::new(),
         supertype_symbols: Vec::new(),
@@ -90,21 +90,13 @@ pub(super) fn extract_tokens(
         .collect();
 
     let mut separators = Vec::new();
-    let mut extra_tokens = Vec::new();
-    for rule in grammar.extra_tokens {
+    let mut extra_symbols = Vec::new();
+    for rule in grammar.extra_symbols {
         if let Rule::Symbol(symbol) = rule {
-            let new_symbol = symbol_replacer.replace_symbol(symbol);
-            if new_symbol.is_non_terminal() {
-                return Error::err(format!(
-                    "Non-token symbol '{}' cannot be used as an extra token",
-                    &variables[new_symbol.index].name
-                ));
-            } else {
-                extra_tokens.push(new_symbol);
-            }
+            extra_symbols.push(symbol_replacer.replace_symbol(symbol));
         } else {
             if let Some(index) = lexical_variables.iter().position(|v| v.rule == rule) {
-                extra_tokens.push(Symbol::terminal(index));
+                extra_symbols.push(Symbol::terminal(index));
             } else {
                 separators.push(rule);
             }
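The simplification above is the heart of the grammar-side change: previously a non-terminal symbol in `extras` was rejected here with an error, whereas now any symbol rule passes straight through, and only non-symbol rules are either matched against existing lexical variables or demoted to separators. A hedged sketch of that classification, with illustrative types that are not the real `Rule`/`Symbol`:

    // Illustrative, simplified types (not the generator's own).
    #[derive(Debug, PartialEq)]
    enum Rule {
        Symbol(usize),
        String(&'static str),
    }

    #[derive(Debug, PartialEq)]
    enum Extra {
        Symbol(usize),           // passed through, terminal or non-terminal
        Separator(&'static str), // anonymous rule, skipped between tokens
    }

    // Mirror of the loop above, minus the lexical-variable lookup: symbol
    // rules are kept as extra symbols, anything else becomes a separator.
    fn classify(rule: Rule) -> Extra {
        match rule {
            Rule::Symbol(index) => Extra::Symbol(index),
            Rule::String(text) => Extra::Separator(text),
        }
    }

    fn main() {
        assert_eq!(classify(Rule::Symbol(1)), Extra::Symbol(1));
        assert_eq!(classify(Rule::String(" ")), Extra::Separator(" "));
    }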
@@ -158,7 +150,7 @@ pub(super) fn extract_tokens(
     ExtractedSyntaxGrammar {
         variables,
         expected_conflicts,
-        extra_tokens,
+        extra_symbols,
         variables_to_inline,
         supertype_symbols,
         external_tokens,
@@ -415,15 +407,15 @@ mod test {
     }
 
     #[test]
-    fn test_extracting_extra_tokens() {
+    fn test_extracting_extra_symbols() {
         let mut grammar = build_grammar(vec![
             Variable::named("rule_0", Rule::string("x")),
             Variable::named("comment", Rule::pattern("//.*")),
         ]);
-        grammar.extra_tokens = vec![Rule::string(" "), Rule::non_terminal(1)];
+        grammar.extra_symbols = vec![Rule::string(" "), Rule::non_terminal(1)];
 
         let (syntax_grammar, lexical_grammar) = extract_tokens(grammar).unwrap();
-        assert_eq!(syntax_grammar.extra_tokens, vec![Symbol::terminal(1),]);
+        assert_eq!(syntax_grammar.extra_symbols, vec![Symbol::terminal(1),]);
         assert_eq!(lexical_grammar.separators, vec![Rule::string(" "),]);
     }
 
@@ -472,28 +464,6 @@ mod test {
         );
     }
 
-    #[test]
-    fn test_error_on_non_terminal_symbol_extras() {
-        let mut grammar = build_grammar(vec![
-            Variable::named("rule_0", Rule::non_terminal(1)),
-            Variable::named("rule_1", Rule::non_terminal(2)),
-            Variable::named("rule_2", Rule::string("x")),
-        ]);
-        grammar.extra_tokens = vec![Rule::non_terminal(1)];
-
-        match extract_tokens(grammar) {
-            Err(e) => {
-                assert_eq!(
-                    e.message(),
-                    "Non-token symbol 'rule_1' cannot be used as an extra token"
-                );
-            }
-            _ => {
-                panic!("Expected an error but got no error");
-            }
-        }
-    }
-
     #[test]
     fn test_error_on_external_with_same_name_as_non_terminal() {
         let mut grammar = build_grammar(vec![
@@ -522,7 +492,7 @@ mod test {
     fn build_grammar(variables: Vec<Variable>) -> InternedGrammar {
         InternedGrammar {
             variables,
-            extra_tokens: Vec::new(),
+            extra_symbols: Vec::new(),
             external_tokens: Vec::new(),
             expected_conflicts: Vec::new(),
             variables_to_inline: Vec::new(),
@@ -199,7 +199,7 @@ unless they are used only as the grammar's start rule.
         }
     }
     Ok(SyntaxGrammar {
-        extra_tokens: grammar.extra_tokens,
+        extra_symbols: grammar.extra_symbols,
         expected_conflicts: grammar.expected_conflicts,
         variables_to_inline: grammar.variables_to_inline,
         external_tokens: grammar.external_tokens,
@@ -30,9 +30,9 @@ pub(super) fn intern_symbols(grammar: &InputGrammar) -> Result<InternedGrammar>
         external_tokens.push(Variable { name, kind, rule });
     }
 
-    let mut extra_tokens = Vec::with_capacity(grammar.extra_tokens.len());
-    for extra_token in grammar.extra_tokens.iter() {
-        extra_tokens.push(interner.intern_rule(extra_token)?);
+    let mut extra_symbols = Vec::with_capacity(grammar.extra_symbols.len());
+    for extra_token in grammar.extra_symbols.iter() {
+        extra_symbols.push(interner.intern_rule(extra_token)?);
     }
 
     let mut supertype_symbols = Vec::with_capacity(grammar.supertype_symbols.len());
@@ -76,7 +76,7 @@ pub(super) fn intern_symbols(grammar: &InputGrammar) -> Result<InternedGrammar>
     Ok(InternedGrammar {
         variables,
         external_tokens,
-        extra_tokens,
+        extra_symbols,
         expected_conflicts,
         variables_to_inline,
         supertype_symbols,
@@ -236,7 +236,7 @@ mod tests {
         InputGrammar {
             variables,
             name: "the_language".to_string(),
-            extra_tokens: Vec::new(),
+            extra_symbols: Vec::new(),
             external_tokens: Vec::new(),
             expected_conflicts: Vec::new(),
             variables_to_inline: Vec::new(),
@@ -21,7 +21,7 @@ use crate::generate::rules::{AliasMap, Rule, Symbol};
 
 pub(crate) struct IntermediateGrammar<T, U> {
     variables: Vec<Variable>,
-    extra_tokens: Vec<T>,
+    extra_symbols: Vec<T>,
     expected_conflicts: Vec<Vec<Symbol>>,
     external_tokens: Vec<U>,
     variables_to_inline: Vec<Symbol>,
@@ -196,7 +196,7 @@ mod tests {
     fn test_basic_inlining() {
         let grammar = SyntaxGrammar {
             expected_conflicts: Vec::new(),
-            extra_tokens: Vec::new(),
+            extra_symbols: Vec::new(),
             external_tokens: Vec::new(),
             supertype_symbols: Vec::new(),
             word_token: None,
@@ -327,7 +327,7 @@ mod tests {
                 Symbol::non_terminal(3),
             ],
             expected_conflicts: Vec::new(),
-            extra_tokens: Vec::new(),
+            extra_symbols: Vec::new(),
             external_tokens: Vec::new(),
             supertype_symbols: Vec::new(),
             word_token: None,
@@ -429,7 +429,7 @@ mod tests {
                 },
             ],
             expected_conflicts: Vec::new(),
-            extra_tokens: Vec::new(),
+            extra_symbols: Vec::new(),
             external_tokens: Vec::new(),
             supertype_symbols: Vec::new(),
             word_token: None,
@@ -2,7 +2,8 @@ use super::grammars::{ExternalToken, LexicalGrammar, SyntaxGrammar, VariableType
 use super::nfa::CharacterSet;
 use super::rules::{Alias, AliasMap, Symbol, SymbolType};
 use super::tables::{
-    AdvanceAction, FieldLocation, LexState, LexTable, ParseAction, ParseTable, ParseTableEntry,
+    AdvanceAction, FieldLocation, GotoAction, LexState, LexTable, ParseAction, ParseTable,
+    ParseTableEntry,
 };
 use core::ops::Range;
 use std::cmp;
@@ -678,7 +679,12 @@ impl Generator {
         add_line!(self, "static TSLexMode ts_lex_modes[STATE_COUNT] = {{");
         indent!(self);
         for (i, state) in self.parse_table.states.iter().enumerate() {
-            if state.external_lex_state_id > 0 {
+            if state.is_non_terminal_extra
+                && state.terminal_entries.len() == 1
+                && *state.terminal_entries.iter().next().unwrap().0 == Symbol::end()
+            {
+                add_line!(self, "[{}] = {{-1}},", i,);
+            } else if state.external_lex_state_id > 0 {
                 add_line!(
                     self,
                     "[{}] = {{.lex_state = {}, .external_lex_state = {}}},",
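The new first branch above emits a lex mode of `{-1}` for a state that is inside a non-terminal extra and whose only remaining terminal entry is the end-of-input symbol: in such a state there is nothing left to lex, only a fixed reduction to perform. The runtime side of this contract appears later in this diff, where `ts_parser__lex` returns `NULL_SUBTREE` for lex state `(uint16_t)-1`. A toy model of both sides of the contract, with assumed names and simplified types:

    // Rendered as {-1} in the generated C tables (u16::MAX here).
    const NO_LEX_STATE: u16 = u16::MAX;

    // Generator side: pick a lex mode for a state (sketch).
    fn lex_mode(is_non_terminal_extra: bool, only_lookahead_is_eof: bool, normal: u16) -> u16 {
        if is_non_terminal_extra && only_lookahead_is_eof {
            NO_LEX_STATE // the parser must reduce here, not read a token
        } else {
            normal
        }
    }

    // Runtime side: mirrors `if (lex_mode.lex_state == (uint16_t)-1) return NULL_SUBTREE;`
    fn lex(lex_state: u16) -> Option<&'static str> {
        if lex_state == NO_LEX_STATE {
            None // null subtree: signals the end of a non-terminal extra
        } else {
            Some("token")
        }
    }

    fn main() {
        assert_eq!(lex(lex_mode(true, true, 7)), None);
        assert_eq!(lex(lex_mode(false, false, 7)), Some("token"));
    }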
@@ -807,12 +813,15 @@ impl Generator {
             terminal_entries.sort_unstable_by_key(|e| self.symbol_order.get(e.0));
             nonterminal_entries.sort_unstable_by_key(|k| k.0);
 
-            for (symbol, state_id) in &nonterminal_entries {
+            for (symbol, action) in &nonterminal_entries {
                 add_line!(
                     self,
                     "[{}] = STATE({}),",
                     self.symbol_ids[symbol],
-                    *state_id
+                    match action {
+                        GotoAction::Goto(state) => *state,
+                        GotoAction::ShiftExtra => i,
+                    }
                 );
             }
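One design choice above deserves a note: a `ShiftExtra` goto is rendered as `STATE(i)`, a transition back to the very state being emitted. That is what makes a non-terminal extra transparent to the surrounding parse: once the extra is reduced, the goto leaves the parser exactly where it stood. A minimal sketch of the resolution, with a local stand-in enum:

    #[derive(Clone, Copy)]
    enum GotoAction {
        Goto(usize),
        ShiftExtra,
    }

    // Mirrors the match in the generator: ShiftExtra entries point back at
    // the current state, so reducing a non-terminal extra changes nothing.
    fn rendered_goto_target(action: GotoAction, current_state: usize) -> usize {
        match action {
            GotoAction::Goto(state) => state,
            GotoAction::ShiftExtra => current_state,
        }
    }

    fn main() {
        assert_eq!(rendered_goto_target(GotoAction::Goto(9), 4), 9);
        assert_eq!(rendered_goto_target(GotoAction::ShiftExtra, 4), 4);
    }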
@@ -865,9 +874,15 @@ impl Generator {
                     .or_default()
                     .push(**symbol);
             }
-            for (symbol, state_id) in &state.nonterminal_entries {
+            for (symbol, action) in &state.nonterminal_entries {
+                let state_id = match action {
+                    GotoAction::Goto(i) => *i,
+                    GotoAction::ShiftExtra => {
+                        self.large_state_count + small_state_indices.len() - 1
+                    }
+                };
                 symbols_by_value
-                    .entry((*state_id, SymbolType::NonTerminal))
+                    .entry((state_id, SymbolType::NonTerminal))
                     .or_default()
                     .push(*symbol);
             }
@@ -24,6 +24,12 @@ pub(crate) enum ParseAction {
     },
 }
 
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+pub(crate) enum GotoAction {
+    Goto(ParseStateId),
+    ShiftExtra,
+}
+
 #[derive(Clone, Debug, PartialEq, Eq)]
 pub(crate) struct ParseTableEntry {
     pub actions: Vec<ParseAction>,
@@ -34,10 +40,11 @@ pub(crate) struct ParseTableEntry {
 pub(crate) struct ParseState {
     pub id: ParseStateId,
     pub terminal_entries: HashMap<Symbol, ParseTableEntry>,
-    pub nonterminal_entries: HashMap<Symbol, ParseStateId>,
+    pub nonterminal_entries: HashMap<Symbol, GotoAction>,
     pub lex_state_id: usize,
     pub external_lex_state_id: usize,
     pub core_id: usize,
+    pub is_non_terminal_extra: bool,
 }
 
 #[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
@@ -103,7 +110,13 @@ impl ParseState {
                     _ => None,
                 })
             })
-            .chain(self.nonterminal_entries.iter().map(|(_, state)| *state))
+            .chain(self.nonterminal_entries.iter().filter_map(|(_, action)| {
+                if let GotoAction::Goto(state) = action {
+                    Some(*state)
+                } else {
+                    None
+                }
+            }))
     }
 
     pub fn update_referenced_states<F>(&mut self, mut f: F)
@@ -121,15 +134,18 @@ impl ParseState {
                 }
             }
         }
-        for (symbol, other_state) in &self.nonterminal_entries {
-            let result = f(*other_state, self);
-            if result != *other_state {
-                updates.push((*symbol, 0, result));
+        for (symbol, action) in &self.nonterminal_entries {
+            if let GotoAction::Goto(other_state) = action {
+                let result = f(*other_state, self);
+                if result != *other_state {
+                    updates.push((*symbol, 0, result));
+                }
             }
         }
         for (symbol, action_index, new_state) in updates {
             if symbol.is_non_terminal() {
-                self.nonterminal_entries.insert(symbol, new_state);
+                self.nonterminal_entries
+                    .insert(symbol, GotoAction::Goto(new_state));
             } else {
                 let entry = self.terminal_entries.get_mut(&symbol).unwrap();
                 if let ParseAction::Shift { is_repetition, .. } = entry.actions[action_index] {
@@ -351,6 +351,7 @@ static Subtree ts_parser__lex(
   Length start_position = ts_stack_position(self->stack, version);
   Subtree external_token = ts_stack_last_external_token(self->stack, version);
   TSLexMode lex_mode = self->language->lex_modes[parse_state];
+  if (lex_mode.lex_state == (uint16_t)-1) return NULL_SUBTREE;
   const bool *valid_external_tokens = ts_language_enabled_external_tokens(
     self->language,
     lex_mode.external_lex_state
@@ -748,7 +749,8 @@ static StackVersion ts_parser__reduce(
   uint32_t count,
   int dynamic_precedence,
   uint16_t production_id,
-  bool fragile
+  bool is_fragile,
+  bool is_extra
 ) {
   uint32_t initial_version_count = ts_stack_version_count(self->stack);
   uint32_t removed_version_count = 0;
@@ -813,7 +815,8 @@ static StackVersion ts_parser__reduce(
 
     TSStateId state = ts_stack_state(self->stack, slice_version);
     TSStateId next_state = ts_language_next_state(self->language, state, symbol);
-    if (fragile || pop.size > 1 || initial_version_count > 1) {
+    if (is_extra) parent.ptr->extra = true;
+    if (is_fragile || pop.size > 1 || initial_version_count > 1) {
       parent.ptr->fragile_left = true;
       parent.ptr->fragile_right = true;
       parent.ptr->parse_state = TS_TREE_STATE_NONE;
@@ -962,7 +965,7 @@ static bool ts_parser__do_all_potential_reductions(
       reduction_version = ts_parser__reduce(
         self, version, action.symbol, action.count,
         action.dynamic_precedence, action.production_id,
-        true
+        true, false
       );
     }
 
@@ -1366,8 +1369,17 @@ static bool ts_parser__advance(
     // Otherwise, re-run the lexer.
     if (!lookahead.ptr) {
       lookahead = ts_parser__lex(self, version, state);
-      ts_parser__set_cached_token(self, position, last_external_token, lookahead);
-      ts_language_table_entry(self->language, state, ts_subtree_symbol(lookahead), &table_entry);
+      if (lookahead.ptr) {
+        ts_parser__set_cached_token(self, position, last_external_token, lookahead);
+        ts_language_table_entry(self->language, state, ts_subtree_symbol(lookahead), &table_entry);
+      }
+
+      // When parsing a non-terminal extra, a null lookahead indicates the
+      // end of the rule. The reduction is stored in the EOF table entry.
+      // After the reduction, the lexer needs to be run again.
+      else {
+        ts_language_table_entry(self->language, state, ts_builtin_sym_end, &table_entry);
+      }
     }
 
     for (;;) {
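The comment above describes a small lexer-parser protocol: inside a non-terminal extra, a null lookahead means the rule is finished, and the fixed reduction to perform is stored under the built-in end-of-input symbol. A schematic model of that dispatch, with illustrative names rather than the C implementation:

    #[derive(Debug, PartialEq)]
    enum TableLookup {
        BySymbol(u16), // normal case: use the lookahead token's symbol
        ByEndOfInput,  // null lookahead: the EOF entry holds a fixed reduction
    }

    fn choose_lookup(lookahead: Option<u16>) -> TableLookup {
        match lookahead {
            Some(symbol) => TableLookup::BySymbol(symbol),
            None => TableLookup::ByEndOfInput,
        }
    }

    fn main() {
        assert_eq!(choose_lookup(Some(42)), TableLookup::BySymbol(42));
        assert_eq!(choose_lookup(None), TableLookup::ByEndOfInput);
    }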
@@ -1422,11 +1434,12 @@ static bool ts_parser__advance(
 
       case TSParseActionTypeReduce: {
         bool is_fragile = table_entry.action_count > 1;
+        bool is_extra = lookahead.ptr == NULL;
         LOG("reduce sym:%s, child_count:%u", SYM_NAME(action.params.symbol), action.params.child_count);
         StackVersion reduction_version = ts_parser__reduce(
           self, version, action.params.symbol, action.params.child_count,
           action.params.dynamic_precedence, action.params.production_id,
-          is_fragile
+          is_fragile, is_extra
         );
         if (reduction_version != STACK_VERSION_NONE) {
           last_reduction_version = reduction_version;
@@ -1459,6 +1472,15 @@ static bool ts_parser__advance(
       ts_stack_renumber_version(self->stack, last_reduction_version, version);
       LOG_STACK();
       state = ts_stack_state(self->stack, version);
+
+      // At the end of a non-terminal extra rule, the lexer will return a
+      // null subtree, because the parser needs to perform a fixed reduction
+      // regardless of the lookahead node. After performing that reduction
+      // (and completing the non-terminal extra rule), run the lexer again
+      // based on the current parse state.
+      if (!lookahead.ptr) {
+        lookahead = ts_parser__lex(self, version, state);
+      }
       ts_language_table_entry(
         self->language,
         state,
test/fixtures/test_grammars/extra_non_terminals/corpus.txt (new file, 22 lines)

@@ -0,0 +1,22 @@
+==============
+No extras
+==============
+
+a b c d
+
+---
+
+(module)
+
+==============
+Extras
+==============
+
+a (one) b (two) (three) c d
+
+---
+
+(module
+  (comment)
+  (comment)
+  (comment))
test/fixtures/test_grammars/extra_non_terminals/grammar.json (new file, 35 lines)

@@ -0,0 +1,35 @@
+{
+  "name": "extra_non_terminals",
+
+  "extras": [
+    {"type": "PATTERN", "value": "\\s"},
+    {"type": "SYMBOL", "name": "comment"}
+  ],
+
+  "rules": {
+    "module": {
+      "type": "SEQ",
+      "members": [
+        {"type": "STRING", "value": "a"},
+        {"type": "STRING", "value": "b"},
+        {"type": "STRING", "value": "c"},
+        {"type": "STRING", "value": "d"}
+      ]
+    },
+
+    "comment": {
+      "type": "SEQ",
+      "members": [
+        {"type": "STRING", "value": "("},
+        {
+          "type": "REPEAT",
+          "content": {
+            "type": "PATTERN",
+            "value": "[a-z]+"
+          }
+        },
+        {"type": "STRING", "value": ")"}
+      ]
+    }
+  }
+}