tree-sitter/src/build_tables/mod.rs

mod build_lex_table;
mod build_parse_table;
mod coincident_tokens;
mod item;
mod item_set_builder;
mod minimize_parse_table;
mod token_conflicts;

use self::build_lex_table::build_lex_table;
use self::build_parse_table::build_parse_table;
use self::coincident_tokens::CoincidentTokenIndex;
use self::item::LookaheadSet;
use self::minimize_parse_table::minimize_parse_table;
use self::token_conflicts::TokenConflictMap;
use crate::error::Result;
use crate::grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar};
use crate::nfa::{CharacterSet, NfaCursor};
use crate::rules::{AliasMap, Symbol};
use crate::tables::{LexTable, ParseAction, ParseTable, ParseTableEntry};
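
/// Build all of the tables needed by the generated parser: the parse table,
/// the main lex table, a separate lex table for keywords, and the grammar's
/// word token (if any), which is used for the keyword-extraction optimization.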
pub(crate) fn build_tables(
    syntax_grammar: &SyntaxGrammar,
    lexical_grammar: &LexicalGrammar,
    simple_aliases: &AliasMap,
    inlines: &InlinedProductionMap,
    minimize: bool,
) -> Result<(ParseTable, LexTable, LexTable, Option<Symbol>)> {
    let (mut parse_table, following_tokens) =
        build_parse_table(syntax_grammar, lexical_grammar, inlines)?;
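
    // Precompute which tokens conflict with one another, and which tokens can
    // appear together in the same parse states.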
    let token_conflict_map = TokenConflictMap::new(lexical_grammar, following_tokens);
    let coincident_token_index = CoincidentTokenIndex::new(&parse_table, lexical_grammar);

    let keywords = identify_keywords(
        lexical_grammar,
        &parse_table,
        syntax_grammar.word_token,
        &token_conflict_map,
        &coincident_token_index,
    );

    populate_error_state(
        &mut parse_table,
        syntax_grammar,
        lexical_grammar,
        &coincident_token_index,
        &token_conflict_map,
    );
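
    // Optionally minimize the parse table by merging compatible states.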
    if minimize {
        minimize_parse_table(
            &mut parse_table,
            syntax_grammar,
            simple_aliases,
            &token_conflict_map,
            &keywords,
        );
    }

    let (main_lex_table, keyword_lex_table) = build_lex_table(
        &mut parse_table,
        syntax_grammar,
        lexical_grammar,
        &keywords,
        minimize,
    );

    Ok((
        parse_table,
        main_lex_table,
        keyword_lex_table,
        syntax_grammar.word_token,
    ))
}
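
/// Populate the parse table's error state (state 0) with `Recover` actions
/// for every token that can safely be recognized during error recovery.
/// Tokens that conflict with an otherwise conflict-free token are excluded,
/// so that recovery does not have to resolve lexical ambiguities.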
fn populate_error_state(
    parse_table: &mut ParseTable,
    syntax_grammar: &SyntaxGrammar,
    lexical_grammar: &LexicalGrammar,
    coincident_token_index: &CoincidentTokenIndex,
    token_conflict_map: &TokenConflictMap,
) {
    let state = &mut parse_table.states[0];
    let n = lexical_grammar.variables.len();

    // First identify the *conflict-free tokens*: tokens that do not overlap with
    // any other token in any way.
    let conflict_free_tokens = LookaheadSet::with((0..n).filter_map(|i| {
        let conflicts_with_other_tokens = (0..n).any(|j| {
            j != i
                && !coincident_token_index.contains(Symbol::terminal(i), Symbol::terminal(j))
                && token_conflict_map.does_conflict(i, j)
        });
        if conflicts_with_other_tokens {
            None
        } else {
            info!(
                "error recovery - token {} has no conflicts",
                lexical_grammar.variables[i].name
            );
            Some(Symbol::terminal(i))
        }
    }));
    let recover_entry = ParseTableEntry {
        reusable: false,
        actions: vec![ParseAction::Recover],
    };

    // Exclude from the error-recovery state any token that conflicts with one
    // of the *conflict-free tokens* identified above.
    for i in 0..n {
        let symbol = Symbol::terminal(i);
        if !conflict_free_tokens.contains(&symbol) && syntax_grammar.word_token != Some(symbol) {
            if let Some(t) = conflict_free_tokens.iter().find(|t| {
                !coincident_token_index.contains(symbol, *t)
                    && token_conflict_map.does_conflict(symbol.index, t.index)
            }) {
                info!(
                    "error recovery - exclude token {} because of conflict with {}",
                    lexical_grammar.variables[i].name, lexical_grammar.variables[t.index].name
                );
                continue;
            }
        }

        info!(
            "error recovery - include token {}",
            lexical_grammar.variables[i].name
        );
        state
            .terminal_entries
            .entry(symbol)
            .or_insert_with(|| recover_entry.clone());
    }
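
    // External tokens with no corresponding internal token are also valid
    // during error recovery.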
    for (i, external_token) in syntax_grammar.external_tokens.iter().enumerate() {
        if external_token.corresponding_internal_token.is_none() {
            state
                .terminal_entries
                .entry(Symbol::external(i))
                .or_insert_with(|| recover_entry.clone());
        }
    }
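
    // The end-of-input token can always trigger error recovery.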
    state.terminal_entries.insert(Symbol::end(), recover_entry);
}
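
/// Find the tokens that can be treated as *keywords*: tokens that look like
/// words and match the same strings as the grammar's word token, so they can
/// be recognized by the separate keyword lex table. Candidates are rejected
/// if substituting the word token for them could change how any other token
/// is lexed.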
fn identify_keywords(
    lexical_grammar: &LexicalGrammar,
    parse_table: &ParseTable,
    word_token: Option<Symbol>,
    token_conflict_map: &TokenConflictMap,
    coincident_token_index: &CoincidentTokenIndex,
) -> LookaheadSet {
    if word_token.is_none() {
        return LookaheadSet::new();
    }
    let word_token = word_token.unwrap();
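
    // A cursor for walking the lexical NFA, used to inspect the characters
    // that each token can start with.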
    let mut cursor = NfaCursor::new(&lexical_grammar.nfa, Vec::new());

    // First find all of the candidate keyword tokens: tokens that start with
    // letters or underscores and can match the same string as the word token.
    let keywords = LookaheadSet::with(lexical_grammar.variables.iter().enumerate().filter_map(
        |(i, variable)| {
            cursor.reset(vec![variable.start_state]);
            if all_chars_are_alphabetical(&cursor)
                && token_conflict_map.does_match_same_string(i, word_token.index)
            {
                info!(
                    "Keywords - add candidate {}",
                    lexical_grammar.variables[i].name
                );
                Some(Symbol::terminal(i))
            } else {
                None
            }
        },
    ));

    // Exclude keyword candidates that shadow another keyword candidate.
    let keywords = LookaheadSet::with(keywords.iter().filter(|token| {
        for other_token in keywords.iter() {
            if other_token != *token
                && token_conflict_map.does_match_same_string(token.index, other_token.index)
            {
                info!(
                    "Keywords - exclude {} because it matches the same string as {}",
                    lexical_grammar.variables[token.index].name,
                    lexical_grammar.variables[other_token.index].name
                );
                return false;
            }
        }
        true
    }));

    // Exclude keyword candidates for which substituting the keyword capture
    // token would introduce new lexical conflicts with other tokens.
    let keywords = LookaheadSet::with(keywords.iter().filter(|token| {
        for other_index in 0..lexical_grammar.variables.len() {
            if keywords.contains(&Symbol::terminal(other_index)) {
                continue;
            }

            // If the word token was already valid in every state containing
            // this keyword candidate, then substituting the word token won't
            // introduce any new lexical conflicts.
            if coincident_token_index
                .states_with(*token, Symbol::terminal(other_index))
                .iter()
                .all(|state_id| {
                    parse_table.states[*state_id]
                        .terminal_entries
                        .contains_key(&word_token)
                })
            {
                continue;
            }

            if !token_conflict_map.has_same_conflict_status(
                token.index,
                word_token.index,
                other_index,
            ) {
                info!(
                    "Keywords - exclude {} because of conflict with {}",
                    lexical_grammar.variables[token.index].name,
                    lexical_grammar.variables[other_index].name
                );
                return false;
            }
        }

        info!(
            "Keywords - include {}",
            lexical_grammar.variables[token.index].name,
        );
        true
    }));

    keywords
}
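
/// Check whether every character that can currently be consumed from the
/// cursor's NFA states is alphabetic or an underscore, ignoring separator
/// transitions.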
fn all_chars_are_alphabetical(cursor: &NfaCursor) -> bool {
    cursor.successors().all(|(chars, _, _, is_sep)| {
        if is_sep {
            true
        } else if let CharacterSet::Include(chars) = chars {
            chars.iter().all(|c| c.is_alphabetic() || *c == '_')
        } else {
            false
        }
    })
}