diff --git a/cli/src/generate/build_tables/build_lex_table.rs b/cli/src/generate/build_tables/build_lex_table.rs index 21594253..b365feb1 100644 --- a/cli/src/generate/build_tables/build_lex_table.rs +++ b/cli/src/generate/build_tables/build_lex_table.rs @@ -2,7 +2,7 @@ use super::coincident_tokens::CoincidentTokenIndex; use super::token_conflicts::TokenConflictMap; use crate::generate::dedup::split_state_id_groups; use crate::generate::grammars::{LexicalGrammar, SyntaxGrammar}; -use crate::generate::nfa::{CharacterSet, NfaCursor}; +use crate::generate::nfa::NfaCursor; use crate::generate::rules::{Symbol, TokenSet}; use crate::generate::tables::{AdvanceAction, LexState, LexTable, ParseStateId, ParseTable}; use log::info; @@ -189,13 +189,10 @@ impl<'a> LexTableBuilder<'a> { // character that leads to the empty set of NFA states. if eof_valid { let (next_state_id, _) = self.add_state(Vec::new(), false); - self.table.states[state_id].advance_actions.push(( - CharacterSet::empty().add_char('\0'), - AdvanceAction { - state: next_state_id, - in_main_token: true, - }, - )); + self.table.states[state_id].eof_action = Some(AdvanceAction { + state: next_state_id, + in_main_token: true, + }); } for transition in transitions { @@ -273,6 +270,7 @@ fn minimize_lex_table(table: &mut LexTable, parse_table: &mut ParseTable) { let signature = ( i == 0, state.accept_action, + state.eof_action.is_some(), state .advance_actions .iter() @@ -320,6 +318,9 @@ fn minimize_lex_table(table: &mut LexTable, parse_table: &mut ParseTable) { for (_, advance_action) in new_state.advance_actions.iter_mut() { advance_action.state = group_ids_by_state_id[advance_action.state]; } + if let Some(eof_action) = &mut new_state.eof_action { + eof_action.state = group_ids_by_state_id[eof_action.state]; + } new_states.push(new_state); } @@ -364,6 +365,9 @@ fn sort_states(table: &mut LexTable, parse_table: &mut ParseTable) { for (_, advance_action) in state.advance_actions.iter_mut() { advance_action.state = 
new_ids_by_old_id[advance_action.state]; } + if let Some(eof_action) = &mut state.eof_action { + eof_action.state = new_ids_by_old_id[eof_action.state]; + } state }) .collect(); diff --git a/cli/src/generate/render.rs b/cli/src/generate/render.rs index e2afa893..be4b5681 100644 --- a/cli/src/generate/render.rs +++ b/cli/src/generate/render.rs @@ -540,6 +540,10 @@ impl Generator { add_line!(self, "ACCEPT_TOKEN({});", self.symbol_ids[&accept_action]); } + if let Some(eof_action) = state.eof_action { + add_line!(self, "if (eof) ADVANCE({});", eof_action.state); + } + let mut ruled_out_characters = HashSet::new(); for (characters, action) in state.advance_actions { let previous_length = self.buffer.len(); diff --git a/cli/src/generate/tables.rs b/cli/src/generate/tables.rs index 6f4c34a6..15b18a97 100644 --- a/cli/src/generate/tables.rs +++ b/cli/src/generate/tables.rs @@ -77,6 +77,7 @@ pub(crate) struct AdvanceAction { #[derive(Clone, Debug, Default, PartialEq, Eq, PartialOrd, Ord)] pub(crate) struct LexState { pub accept_action: Option<Symbol>, + pub eof_action: Option<AdvanceAction>, pub advance_actions: Vec<(CharacterSet, AdvanceAction)>, } diff --git a/lib/include/tree_sitter/parser.h b/lib/include/tree_sitter/parser.h index 974a7ca5..73e7889c 100644 --- a/lib/include/tree_sitter/parser.h +++ b/lib/include/tree_sitter/parser.h @@ -45,7 +45,8 @@ struct TSLexer { void (*advance)(TSLexer *, bool); void (*mark_end)(TSLexer *); uint32_t (*get_column)(TSLexer *); - bool (*is_at_included_range_start)(TSLexer *); + bool (*is_at_included_range_start)(const TSLexer *); + bool (*eof)(const TSLexer *); }; typedef enum { @@ -126,13 +127,15 @@ struct TSLanguage { #define START_LEXER() \ bool result = false; \ bool skip = false; \ + bool eof = false; \ int32_t lookahead; \ goto start; \ next_state: \ lexer->advance(lexer, skip); \ start: \ skip = false; \ - lookahead = lexer->lookahead; + lookahead = lexer->lookahead; \ + eof = lexer->eof(lexer); #define ADVANCE(state_value) \ { \ diff --git 
a/lib/src/lexer.c b/lib/src/lexer.c index 8257ff4e..e2ca8519 100644 --- a/lib/src/lexer.c +++ b/lib/src/lexer.c @@ -224,6 +224,7 @@ void ts_lexer_init(Lexer *self) { .mark_end = ts_lexer__mark_end, .get_column = ts_lexer__get_column, .is_at_included_range_start = ts_lexer__is_at_included_range_start, + .eof = ts_lexer__eof, .lookahead = 0, .result_symbol = 0, }, diff --git a/lib/src/lexer.h b/lib/src/lexer.h index f523d88f..8cd9c267 100644 --- a/lib/src/lexer.h +++ b/lib/src/lexer.h @@ -16,7 +16,7 @@ typedef struct { Length token_start_position; Length token_end_position; - TSRange * included_ranges; + TSRange *included_ranges; size_t included_range_count; size_t current_included_range_index;