Merge branch 'master' into HEAD

This commit is contained in:
Max Brunsfeld 2020-12-03 09:44:33 -08:00
commit 026231e93d
173 changed files with 22878 additions and 6961 deletions

View file

@ -2,7 +2,7 @@ use super::coincident_tokens::CoincidentTokenIndex;
use super::token_conflicts::TokenConflictMap;
use crate::generate::dedup::split_state_id_groups;
use crate::generate::grammars::{LexicalGrammar, SyntaxGrammar};
use crate::generate::nfa::{CharacterSet, NfaCursor};
use crate::generate::nfa::NfaCursor;
use crate::generate::rules::{Symbol, TokenSet};
use crate::generate::tables::{AdvanceAction, LexState, LexTable, ParseStateId, ParseTable};
use log::info;
@ -189,13 +189,10 @@ impl<'a> LexTableBuilder<'a> {
// character that leads to the empty set of NFA states.
if eof_valid {
let (next_state_id, _) = self.add_state(Vec::new(), false);
self.table.states[state_id].advance_actions.push((
CharacterSet::empty().add_char('\0'),
AdvanceAction {
state: next_state_id,
in_main_token: true,
},
));
self.table.states[state_id].eof_action = Some(AdvanceAction {
state: next_state_id,
in_main_token: true,
});
}
for transition in transitions {
@ -273,6 +270,7 @@ fn minimize_lex_table(table: &mut LexTable, parse_table: &mut ParseTable) {
let signature = (
i == 0,
state.accept_action,
state.eof_action.is_some(),
state
.advance_actions
.iter()
@ -320,6 +318,9 @@ fn minimize_lex_table(table: &mut LexTable, parse_table: &mut ParseTable) {
for (_, advance_action) in new_state.advance_actions.iter_mut() {
advance_action.state = group_ids_by_state_id[advance_action.state];
}
if let Some(eof_action) = &mut new_state.eof_action {
eof_action.state = group_ids_by_state_id[eof_action.state];
}
new_states.push(new_state);
}
@ -364,6 +365,9 @@ fn sort_states(table: &mut LexTable, parse_table: &mut ParseTable) {
for (_, advance_action) in state.advance_actions.iter_mut() {
advance_action.state = new_ids_by_old_id[advance_action.state];
}
if let Some(eof_action) = &mut state.eof_action {
eof_action.state = new_ids_by_old_id[eof_action.state];
}
state
})
.collect();

View file

@ -7,7 +7,7 @@ use crate::generate::grammars::{
use crate::generate::node_types::VariableInfo;
use crate::generate::rules::{Associativity, Symbol, SymbolType, TokenSet};
use crate::generate::tables::{
FieldLocation, ParseAction, ParseState, ParseStateId, ParseTable, ParseTableEntry,
FieldLocation, GotoAction, ParseAction, ParseState, ParseStateId, ParseTable, ParseTableEntry,
ProductionInfo, ProductionInfoId,
};
use core::ops::Range;
@ -16,17 +16,19 @@ use std::collections::{BTreeMap, HashMap, HashSet, VecDeque};
use std::fmt::Write;
use std::u32;
// For conflict reporting, each parse state is associated with an example
// sequence of symbols that could lead to that parse state.
type SymbolSequence = Vec<Symbol>;
type AuxiliarySymbolSequence = Vec<AuxiliarySymbolInfo>;
pub(crate) type ParseStateInfo<'a> = (SymbolSequence, ParseItemSet<'a>);
#[derive(Clone)]
struct AuxiliarySymbolInfo {
auxiliary_symbol: Symbol,
parent_symbols: Vec<Symbol>,
}
type SymbolSequence = Vec<Symbol>;
type AuxiliarySymbolSequence = Vec<AuxiliarySymbolInfo>;
pub(crate) type ParseStateInfo<'a> = (SymbolSequence, ParseItemSet<'a>);
struct ParseStateQueueEntry {
state_id: ParseStateId,
preceding_auxiliary_symbols: AuxiliarySymbolSequence,
@ -41,6 +43,7 @@ struct ParseTableBuilder<'a> {
state_ids_by_item_set: HashMap<ParseItemSet<'a>, ParseStateId>,
parse_state_info_by_id: Vec<ParseStateInfo<'a>>,
parse_state_queue: VecDeque<ParseStateQueueEntry>,
non_terminal_extra_states: Vec<(Symbol, usize)>,
parse_table: ParseTable,
}
@ -52,7 +55,7 @@ impl<'a> ParseTableBuilder<'a> {
.push(ProductionInfo::default());
// Add the error state at index 0.
self.add_parse_state(&Vec::new(), &Vec::new(), ParseItemSet::default());
self.add_parse_state(&Vec::new(), &Vec::new(), ParseItemSet::default(), false);
// Add the starting state at index 1.
self.add_parse_state(
@ -66,8 +69,40 @@ impl<'a> ParseTableBuilder<'a> {
.iter()
.cloned(),
),
false,
);
// Compute the possible item sets for non-terminal extras.
let mut non_terminal_extra_item_sets_by_first_terminal = BTreeMap::new();
for extra_non_terminal in self
.syntax_grammar
.extra_symbols
.iter()
.filter(|s| s.is_non_terminal())
{
let variable = &self.syntax_grammar.variables[extra_non_terminal.index];
for production in &variable.productions {
non_terminal_extra_item_sets_by_first_terminal
.entry(production.first_symbol().unwrap())
.or_insert(ParseItemSet::default())
.insert(
ParseItem {
variable_index: extra_non_terminal.index as u32,
production,
step_index: 1,
},
&[Symbol::end()].iter().cloned().collect(),
);
}
}
// Add a state for each starting terminal of a non-terminal extra rule.
for (terminal, item_set) in non_terminal_extra_item_sets_by_first_terminal {
self.non_terminal_extra_states
.push((terminal, self.parse_table.states.len()));
self.add_parse_state(&Vec::new(), &Vec::new(), item_set, true);
}
while let Some(entry) = self.parse_state_queue.pop_front() {
let item_set = self
.item_set_builder
@ -91,9 +126,15 @@ impl<'a> ParseTableBuilder<'a> {
preceding_symbols: &SymbolSequence,
preceding_auxiliary_symbols: &AuxiliarySymbolSequence,
item_set: ParseItemSet<'a>,
is_non_terminal_extra: bool,
) -> ParseStateId {
match self.state_ids_by_item_set.entry(item_set) {
// If an equivalent item set has already been processed, then return
// the existing parse state index.
Entry::Occupied(o) => *o.get(),
// Otherwise, insert a new parse state and add it to the queue of
// parse states to populate.
Entry::Vacant(v) => {
let core = v.key().core();
let core_count = self.core_ids_by_core.len();
@ -116,6 +157,7 @@ impl<'a> ParseTableBuilder<'a> {
terminal_entries: HashMap::new(),
nonterminal_entries: HashMap::new(),
core_id,
is_non_terminal_extra,
});
self.parse_state_queue.push_back(ParseStateQueueEntry {
state_id,
@ -138,7 +180,12 @@ impl<'a> ParseTableBuilder<'a> {
let mut non_terminal_successors = BTreeMap::new();
let mut lookaheads_with_conflicts = TokenSet::new();
// Each item in the item set contributes to either a Shift action or a Reduce
// action in this state.
for (item, lookaheads) in &item_set.entries {
// If the item is unfinished, then this state has a transition for the item's
// next symbol. Advance the item to its next step and insert the resulting
// item into the successor item set.
if let Some(next_symbol) = item.symbol() {
let successor = item.successor();
if next_symbol.is_non_terminal() {
@ -160,7 +207,10 @@ impl<'a> ParseTableBuilder<'a> {
.or_insert_with(|| ParseItemSet::default())
.insert(successor, lookaheads);
}
} else {
}
// If the item is finished, then add a Reduce action to this state based
// on this item.
else {
let action = if item.is_augmented() {
ParseAction::Accept
} else {
@ -179,6 +229,10 @@ impl<'a> ParseTableBuilder<'a> {
.terminal_entries
.entry(lookahead);
let entry = entry.or_insert_with(|| ParseTableEntry::new());
// While inserting Reduce actions, eagerly resolve conflicts related
// to precedence: avoid inserting lower-precedence reductions, and
// clear the action list when inserting higher-precedence reductions.
if entry.actions.is_empty() {
entry.actions.push(action);
} else if action.precedence() > entry.actions[0].precedence() {
@ -193,12 +247,16 @@ impl<'a> ParseTableBuilder<'a> {
}
}
// Having computed the successor item sets for each symbol, add a new
// parse state for each of these item sets, and add a corresponding Shift
// action to this state.
for (symbol, next_item_set) in terminal_successors {
preceding_symbols.push(symbol);
let next_state_id = self.add_parse_state(
&preceding_symbols,
&preceding_auxiliary_symbols,
next_item_set,
self.parse_table.states[state_id].is_non_terminal_extra,
);
preceding_symbols.pop();
@ -226,13 +284,19 @@ impl<'a> ParseTableBuilder<'a> {
&preceding_symbols,
&preceding_auxiliary_symbols,
next_item_set,
self.parse_table.states[state_id].is_non_terminal_extra,
);
preceding_symbols.pop();
self.parse_table.states[state_id]
.nonterminal_entries
.insert(symbol, next_state_id);
.insert(symbol, GotoAction::Goto(next_state_id));
}
// For any symbol with multiple actions, perform conflict resolution.
// This will either
// * choose one action over the others using precedence or associativity
// * keep multiple actions if this conflict has been whitelisted in the grammar
// * fail, terminating the parser generation process
for symbol in lookaheads_with_conflicts.iter() {
self.handle_conflict(
&item_set,
@ -243,15 +307,50 @@ impl<'a> ParseTableBuilder<'a> {
)?;
}
// Finally, add actions for the grammar's `extra` symbols.
let state = &mut self.parse_table.states[state_id];
for extra_token in &self.syntax_grammar.extra_tokens {
state
.terminal_entries
.entry(*extra_token)
.or_insert(ParseTableEntry {
reusable: true,
actions: vec![ParseAction::ShiftExtra],
});
let is_non_terminal_extra = state.is_non_terminal_extra;
let is_end_of_non_terminal_extra =
is_non_terminal_extra && state.terminal_entries.len() == 1;
// Add actions for the start tokens of each non-terminal extra rule.
// These actions are added to every state except for the states that are
// already within non-terminal extras. Non-terminal extras are not allowed
// to nest within each other.
if !is_non_terminal_extra {
for (terminal, state_id) in &self.non_terminal_extra_states {
state
.terminal_entries
.entry(*terminal)
.or_insert(ParseTableEntry {
reusable: true,
actions: vec![ParseAction::Shift {
state: *state_id,
is_repetition: false,
}],
});
}
}
// Add ShiftExtra actions for the terminal extra tokens. These actions
// are added to every state except for those at the ends of non-terminal
// extras.
if !is_end_of_non_terminal_extra {
for extra_token in &self.syntax_grammar.extra_symbols {
if extra_token.is_non_terminal() {
state
.nonterminal_entries
.insert(*extra_token, GotoAction::ShiftExtra);
} else {
state
.terminal_entries
.entry(*extra_token)
.or_insert(ParseTableEntry {
reusable: true,
actions: vec![ParseAction::ShiftExtra],
});
}
}
}
Ok(())
@ -362,8 +461,8 @@ impl<'a> ParseTableBuilder<'a> {
}
}
// If all reduce actions are left associative, remove the SHIFT action.
// If all reduce actions are right associative, remove the REDUCE actions.
// If all Reduce actions are left associative, remove the SHIFT action.
// If all Reduce actions are right associative, remove the REDUCE actions.
match (has_left, has_non, has_right) {
(true, false, false) => {
entry.actions.pop();
@ -744,7 +843,7 @@ fn populate_following_tokens(
}
}
}
for extra in &grammar.extra_tokens {
for extra in &grammar.extra_symbols {
if extra.is_terminal() {
for entry in result.iter_mut() {
entry.insert(*extra);
@ -774,6 +873,7 @@ pub(crate) fn build_parse_table<'a>(
lexical_grammar,
item_set_builder,
variable_info,
non_terminal_extra_states: Vec::new(),
state_ids_by_item_set: HashMap::new(),
core_ids_by_core: HashMap::new(),
parse_state_info_by_id: Vec::new(),

View file

@ -2,7 +2,9 @@ use super::token_conflicts::TokenConflictMap;
use crate::generate::dedup::split_state_id_groups;
use crate::generate::grammars::{LexicalGrammar, SyntaxGrammar, VariableType};
use crate::generate::rules::{AliasMap, Symbol, TokenSet};
use crate::generate::tables::{ParseAction, ParseState, ParseStateId, ParseTable, ParseTableEntry};
use crate::generate::tables::{
GotoAction, ParseAction, ParseState, ParseStateId, ParseTable, ParseTableEntry,
};
use log::info;
use std::collections::{HashMap, HashSet};
use std::mem;
@ -66,6 +68,7 @@ impl<'a> Minimizer<'a> {
..
} => {
if !self.simple_aliases.contains_key(&symbol)
&& !self.syntax_grammar.supertype_symbols.contains(&symbol)
&& !aliased_symbols.contains(&symbol)
&& self.syntax_grammar.variables[symbol.index].kind
!= VariableType::Named
@ -101,7 +104,10 @@ impl<'a> Minimizer<'a> {
state.update_referenced_states(|other_state_id, state| {
if let Some(symbol) = unit_reduction_symbols_by_state.get(&other_state_id) {
done = false;
state.nonterminal_entries[symbol]
match state.nonterminal_entries.get(symbol) {
Some(GotoAction::Goto(state_id)) => *state_id,
_ => other_state_id,
}
} else {
other_state_id
}
@ -194,6 +200,9 @@ impl<'a> Minimizer<'a> {
right_state: &ParseState,
group_ids_by_state_id: &Vec<ParseStateId>,
) -> bool {
if left_state.is_non_terminal_extra != right_state.is_non_terminal_extra {
return true;
}
for (token, left_entry) in &left_state.terminal_entries {
if let Some(right_entry) = right_state.terminal_entries.get(token) {
if self.entries_conflict(
@ -262,18 +271,24 @@ impl<'a> Minimizer<'a> {
for (symbol, s1) in &state1.nonterminal_entries {
if let Some(s2) = state2.nonterminal_entries.get(symbol) {
let group1 = group_ids_by_state_id[*s1];
let group2 = group_ids_by_state_id[*s2];
if group1 != group2 {
info!(
"split states {} {} - successors for {} are split: {} {}",
state1.id,
state2.id,
self.symbol_name(symbol),
s1,
s2,
);
return true;
match (s1, s2) {
(GotoAction::ShiftExtra, GotoAction::ShiftExtra) => continue,
(GotoAction::Goto(s1), GotoAction::Goto(s2)) => {
let group1 = group_ids_by_state_id[*s1];
let group2 = group_ids_by_state_id[*s2];
if group1 != group2 {
info!(
"split states {} {} - successors for {} are split: {} {}",
state1.id,
state2.id,
self.symbol_name(symbol),
s1,
s2,
);
return true;
}
}
_ => return true,
}
}
}

View file

@ -271,6 +271,7 @@ fn identify_keywords(
cursor.reset(vec![variable.start_state]);
if all_chars_are_alphabetical(&cursor)
&& token_conflict_map.does_match_same_string(i, word_token.index)
&& !token_conflict_map.does_match_different_string(i, word_token.index)
{
info!(
"Keywords - add candidate {}",

View file

@ -1,9 +1,9 @@
use crate::generate::build_tables::item::{TokenSetDisplay};
use crate::generate::build_tables::item::TokenSetDisplay;
use crate::generate::grammars::{LexicalGrammar, SyntaxGrammar};
use crate::generate::nfa::{CharacterSet, NfaCursor, NfaTransition};
use crate::generate::rules::TokenSet;
use std::collections::HashSet;
use std::cmp::Ordering;
use std::collections::HashSet;
use std::fmt;
#[derive(Clone, Debug, Default, PartialEq, Eq)]
@ -13,6 +13,7 @@ struct TokenConflictStatus {
does_match_valid_continuation: bool,
does_match_separators: bool,
matches_same_string: bool,
matches_different_string: bool,
}
pub(crate) struct TokenConflictMap<'a> {
@ -25,6 +26,12 @@ pub(crate) struct TokenConflictMap<'a> {
}
impl<'a> TokenConflictMap<'a> {
/// Create a token conflict map based on a lexical grammar, which describes the structure
/// of each token, and a `following_tokens` map, which indicates which tokens may appear
/// immediately after each other token.
///
/// This analyzes the possible kinds of overlap between each pair of tokens and stores
/// them in a matrix.
pub fn new(grammar: &'a LexicalGrammar, following_tokens: Vec<TokenSet>) -> Self {
let mut cursor = NfaCursor::new(&grammar.nfa, Vec::new());
let starting_chars = get_starting_chars(&mut cursor, grammar);
@ -50,12 +57,21 @@ impl<'a> TokenConflictMap<'a> {
}
}
/// Does token `i` match any strings that token `j` also matches, such that token `i`
/// is preferred over token `j`?
pub fn has_same_conflict_status(&self, a: usize, b: usize, other: usize) -> bool {
let left = &self.status_matrix[matrix_index(self.n, a, other)];
let right = &self.status_matrix[matrix_index(self.n, b, other)];
left == right
}
/// Does token `i` match any strings that token `j` does *not* match?
pub fn does_match_different_string(&self, i: usize, j: usize) -> bool {
self.status_matrix[matrix_index(self.n, i, j)].matches_different_string
}
/// Does token `i` match any strings that token `j` also matches, where
/// token `i` is preferred over token `j`?
pub fn does_match_same_string(&self, i: usize, j: usize) -> bool {
self.status_matrix[matrix_index(self.n, i, j)].matches_same_string
}
@ -67,6 +83,7 @@ impl<'a> TokenConflictMap<'a> {
|| entry.matches_same_string
}
/// Does token `i` match any strings that are *prefixes* of strings matched by `j`?
pub fn does_match_prefix(&self, i: usize, j: usize) -> bool {
self.status_matrix[matrix_index(self.n, i, j)].matches_prefix
}
@ -239,19 +256,29 @@ fn compute_conflict_status(
);
while let Some(state_set) = state_set_queue.pop() {
// Don't pursue states where there's no potential for conflict.
if grammar.variable_indices_for_nfa_states(&state_set).count() > 1 {
cursor.reset(state_set);
} else {
let mut live_variable_indices = grammar.variable_indices_for_nfa_states(&state_set);
// If only one of the two tokens could possibly match from this state, then
// there is no reason to analyze any of its successors. Just record the fact
// that the token matches a string that the other token does not match.
let first_live_variable_index = live_variable_indices.next().unwrap();
if live_variable_indices.count() == 0 {
if first_live_variable_index == i {
result.0.matches_different_string = true;
} else {
result.1.matches_different_string = true;
}
continue;
}
let has_sep = cursor.transition_chars().any(|(_, sep)| sep);
// Don't pursue states where there's no potential for conflict.
cursor.reset(state_set);
let within_separator = cursor.transition_chars().any(|(_, sep)| sep);
// Examine each possible completed token in this state.
let mut completion = None;
for (id, precedence) in cursor.completions() {
if has_sep {
if within_separator {
if id == i {
result.0.does_match_separators = true;
} else {
@ -316,7 +343,7 @@ fn compute_conflict_status(
&transition,
completed_id,
completed_precedence,
has_sep,
within_separator,
) {
can_advance = true;
if advanced_id == i {

View file

@ -292,7 +292,12 @@ function grammar(baseGrammar, options) {
extras = options.extras
.call(ruleBuilder, ruleBuilder, baseGrammar.extras)
.map(normalize);
if (!Array.isArray(extras)) {
throw new Error("Grammar's 'extras' function must return an array.")
}
extras = extras.map(normalize);
}
let word = baseGrammar.word;

View file

@ -1,15 +1,15 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"title": "tree-sitter grammar specification",
"type": "object",
"required": [
"name",
"rules"
],
"required": ["name", "rules"],
"additionalProperties": false,
"properties": {
"name": {
"description": "the name of the grammar",
"type": "string",
"pattern": "^[a-zA-Z_]\\w*"
},
@ -60,6 +60,15 @@
"word": {
"type": "string",
"pattern": "^[a-zA-Z_]\\w*"
},
"supertypes": {
"description": "A list of hidden rule names that should be considered supertypes in the generated node types file. See http://tree-sitter.github.io/tree-sitter/using-parsers#static-node-types.",
"type": "array",
"items": {
"description": "the name of a rule in `rules` or `extras`",
"type": "string"
}
}
},
@ -96,20 +105,19 @@
"type": "string",
"pattern": "^PATTERN$"
},
"value": {"type": "string"}
"value": { "type": "string" }
},
"required": ["type", "value"]
},
"symbol-rule": {
"required": ["name"],
"type": "object",
"properties": {
"type": {
"type": "string",
"pattern": "^SYMBOL$"
},
"name": {"type": "string"}
"name": { "type": "string" }
},
"required": ["type", "name"]
},
@ -210,6 +218,20 @@
"required": ["type", "content"]
},
"field-rule": {
"properties": {
"name": { "type": "string" },
"type": {
"type": "string",
"pattern": "^FIELD$"
},
"content": {
"$ref": "#/definitions/rule"
}
},
"required": ["name", "type", "content"]
},
"prec-rule": {
"type": "object",
"properties": {
@ -239,6 +261,7 @@
{ "$ref": "#/definitions/repeat1-rule" },
{ "$ref": "#/definitions/repeat-rule" },
{ "$ref": "#/definitions/token-rule" },
{ "$ref": "#/definitions/field-rule" },
{ "$ref": "#/definitions/prec-rule" }
]
}

View file

@ -23,7 +23,7 @@ pub(crate) struct Variable {
pub(crate) struct InputGrammar {
pub name: String,
pub variables: Vec<Variable>,
pub extra_tokens: Vec<Rule>,
pub extra_symbols: Vec<Rule>,
pub expected_conflicts: Vec<Vec<String>>,
pub external_tokens: Vec<Rule>,
pub variables_to_inline: Vec<String>,
@ -87,7 +87,7 @@ pub(crate) struct ExternalToken {
#[derive(Debug, Default)]
pub(crate) struct SyntaxGrammar {
pub variables: Vec<SyntaxVariable>,
pub extra_tokens: Vec<Symbol>,
pub extra_symbols: Vec<Symbol>,
pub expected_conflicts: Vec<Vec<Symbol>>,
pub external_tokens: Vec<ExternalToken>,
pub supertype_symbols: Vec<Symbol>,

View file

@ -6,13 +6,12 @@ mod node_types;
mod npm_files;
pub mod parse_grammar;
mod prepare_grammar;
pub mod properties;
mod render;
mod rules;
mod tables;
use self::build_tables::build_tables;
use self::grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar, VariableType};
use self::grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar};
use self::parse_grammar::parse_grammar;
use self::prepare_grammar::prepare_grammar;
use self::render::render_c_code;
@ -20,9 +19,8 @@ use self::rules::AliasMap;
use crate::error::{Error, Result};
use lazy_static::lazy_static;
use regex::{Regex, RegexBuilder};
use std::collections::HashSet;
use std::fs::{self, File};
use std::io::{BufWriter, Write};
use std::fs;
use std::io::Write;
use std::path::{Path, PathBuf};
use std::process::{Command, Stdio};
@ -33,15 +31,9 @@ lazy_static! {
.unwrap();
}
const NEW_HEADER_PARTS: [&'static str; 2] = [
"
uint32_t large_state_count;
const uint16_t *small_parse_table;
const uint32_t *small_parse_table_map;",
"
#define SMALL_STATE(id) id - LARGE_STATE_COUNT
",
];
const NEW_HEADER_PARTS: &[&'static str] = &["
const uint16_t *alias_map;
uint32_t state_count;"];
struct GeneratedParser {
c_code: String,
@ -51,13 +43,11 @@ struct GeneratedParser {
pub fn generate_parser_in_directory(
repo_path: &PathBuf,
grammar_path: Option<&str>,
properties_only: bool,
next_abi: bool,
report_symbol_name: Option<&str>,
) -> Result<()> {
let src_path = repo_path.join("src");
let header_path = src_path.join("tree_sitter");
let properties_dir_path = repo_path.join("properties");
// Ensure that the output directories exist.
fs::create_dir_all(&src_path)?;
@ -82,71 +72,48 @@ pub fn generate_parser_in_directory(
prepare_grammar(&input_grammar)?;
let language_name = input_grammar.name;
// If run with no arguments, read all of the property sheets and compile them to JSON.
if grammar_path.is_none() {
let token_names = get_token_names(&syntax_grammar, &lexical_grammar);
if let Ok(entries) = fs::read_dir(properties_dir_path) {
for entry in entries {
let css_path = entry?.path();
let css = fs::read_to_string(&css_path)?;
let sheet = properties::generate_property_sheet(&css_path, &css, &token_names)?;
let property_sheet_json_path = src_path
.join(css_path.file_name().unwrap())
.with_extension("json");
let property_sheet_json_file =
File::create(&property_sheet_json_path).map_err(Error::wrap(|| {
format!("Failed to create {:?}", property_sheet_json_path)
}))?;
let mut writer = BufWriter::new(property_sheet_json_file);
serde_json::to_writer_pretty(&mut writer, &sheet)?;
}
}
}
// Generate the parser and related files.
if !properties_only {
let GeneratedParser {
c_code,
node_types_json,
} = generate_parser_for_grammar_with_opts(
&language_name,
syntax_grammar,
lexical_grammar,
inlines,
simple_aliases,
next_abi,
report_symbol_name,
)?;
let GeneratedParser {
c_code,
node_types_json,
} = generate_parser_for_grammar_with_opts(
&language_name,
syntax_grammar,
lexical_grammar,
inlines,
simple_aliases,
next_abi,
report_symbol_name,
)?;
write_file(&src_path.join("parser.c"), c_code)?;
write_file(&src_path.join("node-types.json"), node_types_json)?;
write_file(&src_path.join("parser.c"), c_code)?;
write_file(&src_path.join("node-types.json"), node_types_json)?;
if next_abi {
write_file(&header_path.join("parser.h"), tree_sitter::PARSER_HEADER)?;
} else {
let mut header = tree_sitter::PARSER_HEADER.to_string();
if next_abi {
write_file(&header_path.join("parser.h"), tree_sitter::PARSER_HEADER)?;
} else {
let mut header = tree_sitter::PARSER_HEADER.to_string();
for part in &NEW_HEADER_PARTS {
let pos = header
.find(part)
.expect("Missing expected part of parser.h header");
header.replace_range(pos..(pos + part.len()), "");
}
write_file(&header_path.join("parser.h"), header)?;
for part in NEW_HEADER_PARTS.iter() {
let pos = header
.find(part)
.expect("Missing expected part of parser.h header");
header.replace_range(pos..(pos + part.len()), "");
}
ensure_file(&repo_path.join("index.js"), || {
npm_files::index_js(&language_name)
})?;
ensure_file(&src_path.join("binding.cc"), || {
npm_files::binding_cc(&language_name)
})?;
ensure_file(&repo_path.join("binding.gyp"), || {
npm_files::binding_gyp(&language_name)
})?;
write_file(&header_path.join("parser.h"), header)?;
}
ensure_file(&repo_path.join("index.js"), || {
npm_files::index_js(&language_name)
})?;
ensure_file(&src_path.join("binding.cc"), || {
npm_files::binding_cc(&language_name)
})?;
ensure_file(&repo_path.join("binding.gyp"), || {
npm_files::binding_gyp(&language_name)
})?;
Ok(())
}
@ -176,7 +143,8 @@ fn generate_parser_for_grammar_with_opts(
next_abi: bool,
report_symbol_name: Option<&str>,
) -> Result<GeneratedParser> {
let variable_info = node_types::get_variable_info(&syntax_grammar, &lexical_grammar, &inlines)?;
let variable_info =
node_types::get_variable_info(&syntax_grammar, &lexical_grammar, &simple_aliases)?;
let node_types_json = node_types::generate_node_types_json(
&syntax_grammar,
&lexical_grammar,
@ -208,35 +176,6 @@ fn generate_parser_for_grammar_with_opts(
})
}
fn get_token_names(
syntax_grammar: &SyntaxGrammar,
lexical_grammar: &LexicalGrammar,
) -> HashSet<String> {
let mut result = HashSet::new();
for variable in &lexical_grammar.variables {
if variable.kind == VariableType::Named {
result.insert(variable.name.clone());
}
}
for token in &syntax_grammar.external_tokens {
if token.kind == VariableType::Named {
result.insert(token.name.clone());
}
}
for variable in &syntax_grammar.variables {
for production in &variable.productions {
for step in &production.steps {
if let Some(alias) = &step.alias {
if !step.symbol.is_non_terminal() && alias.is_named {
result.insert(alias.value.clone());
}
}
}
}
}
result
}
fn load_grammar_file(grammar_path: &Path) -> Result<String> {
match grammar_path.extension().and_then(|e| e.to_str()) {
Some("js") => Ok(load_js_grammar_file(grammar_path)?),

View file

@ -1,8 +1,10 @@
use std::char;
use std::cmp::max;
use std::cmp::Ordering;
use std::collections::HashSet;
use std::fmt;
use std::mem::swap;
use std::ops::Range;
#[derive(Clone, Debug, PartialEq, Eq, Hash)]
pub enum CharacterSet {
@ -178,6 +180,40 @@ impl CharacterSet {
}
}
pub fn ranges<'a>(
chars: &'a Vec<char>,
ruled_out_characters: &'a HashSet<u32>,
) -> impl Iterator<Item = Range<char>> + 'a {
let mut prev_range: Option<Range<char>> = None;
chars
.iter()
.map(|c| (*c, false))
.chain(Some(('\0', true)))
.filter_map(move |(c, done)| {
if done {
return prev_range.clone();
}
if ruled_out_characters.contains(&(c as u32)) {
return None;
}
if let Some(range) = prev_range.clone() {
let mut prev_range_successor = range.end as u32 + 1;
while prev_range_successor < c as u32 {
if !ruled_out_characters.contains(&prev_range_successor) {
prev_range = Some(c..c);
return Some(range);
}
prev_range_successor += 1;
}
prev_range = Some(range.start..c);
None
} else {
prev_range = Some(c..c);
None
}
})
}
#[cfg(test)]
pub fn contains(&self, c: char) -> bool {
match self {
@ -266,6 +302,13 @@ fn compare_chars(left: &Vec<char>, right: &Vec<char>) -> SetComparision {
result.common = true;
}
}
match (i, j) {
(Some(_), _) => result.left_only = true,
(_, Some(_)) => result.right_only = true,
_ => {}
}
result
}
@ -718,7 +761,7 @@ mod tests {
.add_range('d', 'e')
);
// A whitelist and an intersecting blacklist.
// An inclusion and an intersecting exclusion.
// Both sets contain 'e', 'f', and 'm'
let mut a = CharacterSet::empty()
.add_range('c', 'h')
@ -748,7 +791,7 @@ mod tests {
assert_eq!(a, CharacterSet::Include(vec!['c', 'd', 'g', 'h', 'k', 'l']));
assert_eq!(b, CharacterSet::empty().add_range('a', 'm').negate());
// A blacklist and an overlapping blacklist.
// An exclusion and an overlapping inclusion.
// Both sets exclude 'c', 'd', and 'e'
let mut a = CharacterSet::empty().add_range('a', 'e').negate();
let mut b = CharacterSet::empty().add_range('c', 'h').negate();
@ -759,7 +802,7 @@ mod tests {
assert_eq!(a, CharacterSet::Include(vec!['f', 'g', 'h']));
assert_eq!(b, CharacterSet::Include(vec!['a', 'b']));
// A blacklist and a larger blacklist.
// An exclusion and a larger exclusion.
let mut a = CharacterSet::empty().add_range('b', 'c').negate();
let mut b = CharacterSet::empty().add_range('a', 'd').negate();
assert_eq!(
@ -810,5 +853,53 @@ mod tests {
);
assert!(a.does_intersect(&b));
assert!(b.does_intersect(&a));
let (a, b) = (
CharacterSet::Include(vec!['c']),
CharacterSet::Exclude(vec!['a']),
);
assert!(a.does_intersect(&b));
assert!(b.does_intersect(&a));
}
#[test]
fn test_character_set_get_ranges() {
struct Row {
chars: Vec<char>,
ruled_out_chars: Vec<char>,
expected_ranges: Vec<Range<char>>,
}
let table = [
Row {
chars: vec!['a'],
ruled_out_chars: vec![],
expected_ranges: vec!['a'..'a'],
},
Row {
chars: vec!['a', 'b', 'c', 'e', 'z'],
ruled_out_chars: vec![],
expected_ranges: vec!['a'..'c', 'e'..'e', 'z'..'z'],
},
Row {
chars: vec!['a', 'b', 'c', 'e', 'h', 'z'],
ruled_out_chars: vec!['d', 'f', 'g'],
expected_ranges: vec!['a'..'h', 'z'..'z'],
},
];
for Row {
chars,
ruled_out_chars,
expected_ranges,
} in table.iter()
{
let ruled_out_chars = ruled_out_chars
.into_iter()
.map(|c: &char| *c as u32)
.collect();
let ranges = CharacterSet::ranges(chars, &ruled_out_chars).collect::<Vec<_>>();
assert_eq!(ranges, *expected_ranges);
}
}
}

File diff suppressed because it is too large Load diff

View file

@ -87,7 +87,7 @@ pub(crate) fn parse_grammar(input: &str) -> Result<InputGrammar> {
})
}
let extra_tokens = grammar_json
let extra_symbols = grammar_json
.extras
.unwrap_or(Vec::new())
.into_iter()
@ -107,7 +107,7 @@ pub(crate) fn parse_grammar(input: &str) -> Result<InputGrammar> {
name: grammar_json.name,
word_token: grammar_json.word,
variables,
extra_tokens,
extra_symbols,
expected_conflicts,
external_tokens,
supertype_symbols,

View file

@ -283,7 +283,7 @@ mod tests {
fn build_grammar(variables: Vec<Variable>) -> ExtractedSyntaxGrammar {
ExtractedSyntaxGrammar {
variables,
extra_tokens: Vec::new(),
extra_symbols: Vec::new(),
external_tokens: Vec::new(),
expected_conflicts: Vec::new(),
variables_to_inline: Vec::new(),

View file

@ -0,0 +1,293 @@
use crate::generate::grammars::{LexicalGrammar, SyntaxGrammar};
use crate::generate::rules::{Alias, AliasMap, Symbol, SymbolType};
#[derive(Clone, Default)]
struct SymbolStatus {
aliases: Vec<(Alias, usize)>,
appears_unaliased: bool,
}
// Update the grammar by finding symbols that always are aliased, and for each such symbol,
// promoting one of its aliases to a "default alias", which is applied globally instead
// of in a context-specific way.
//
// This has two benefits:
// * It reduces the overhead of storing production-specific alias info in the parse table.
// * Within an `ERROR` node, no context-specific aliases will be applied. This transformation
// ensures that the children of an `ERROR` node have symbols that are consistent with the
// way that they would appear in a valid syntax tree.
/// Find symbols that are *always* aliased and promote one alias per such symbol
/// to a "default alias", applied globally instead of in a context-specific way.
///
/// Returns a map from each such symbol to its chosen default alias. As a side
/// effect, production steps whose alias matches the symbol's new default alias
/// have that (now redundant) alias removed.
pub(super) fn extract_default_aliases(
    syntax_grammar: &mut SyntaxGrammar,
    lexical_grammar: &LexicalGrammar,
) -> AliasMap {
    let mut terminal_status_list = vec![SymbolStatus::default(); lexical_grammar.variables.len()];
    let mut non_terminal_status_list =
        vec![SymbolStatus::default(); syntax_grammar.variables.len()];
    let mut external_status_list =
        vec![SymbolStatus::default(); syntax_grammar.external_tokens.len()];

    // For each grammar symbol, find all of the aliases under which the symbol appears,
    // and determine whether or not the symbol ever appears *unaliased*.
    for variable in syntax_grammar.variables.iter() {
        for production in variable.productions.iter() {
            for step in production.steps.iter() {
                // `mut` is unnecessary on this binding — the value is already a
                // mutable reference (clippy: unused_mut).
                let status = match step.symbol.kind {
                    SymbolType::External => &mut external_status_list[step.symbol.index],
                    SymbolType::NonTerminal => &mut non_terminal_status_list[step.symbol.index],
                    SymbolType::Terminal => &mut terminal_status_list[step.symbol.index],
                    SymbolType::End => panic!("Unexpected end token"),
                };

                // Default aliases don't work for inlined variables.
                if syntax_grammar.variables_to_inline.contains(&step.symbol) {
                    continue;
                }

                if let Some(alias) = &step.alias {
                    // Bump the count for this alias if it has been seen before,
                    // otherwise record it with a count of one.
                    if let Some(count_for_alias) = status
                        .aliases
                        .iter_mut()
                        .find_map(|(a, count)| if a == alias { Some(count) } else { None })
                    {
                        *count_for_alias += 1;
                    } else {
                        status.aliases.push((alias.clone(), 1));
                    }
                } else {
                    status.appears_unaliased = true;
                }
            }
        }
    }

    let symbols_with_statuses = (terminal_status_list
        .iter_mut()
        .enumerate()
        .map(|(i, status)| (Symbol::terminal(i), status)))
    .chain(
        non_terminal_status_list
            .iter_mut()
            .enumerate()
            .map(|(i, status)| (Symbol::non_terminal(i), status)),
    )
    .chain(
        external_status_list
            .iter_mut()
            .enumerate()
            .map(|(i, status)| (Symbol::external(i), status)),
    );

    // For each symbol that always appears aliased, find the alias that occurs most
    // often (ties broken in favor of the alias that was recorded first), and
    // designate that alias as the symbol's "default alias". Store all of these
    // default aliases in a map that will be returned.
    let mut result = AliasMap::new();
    for (symbol, status) in symbols_with_statuses {
        if status.appears_unaliased {
            status.aliases.clear();
        } else if let Some(default_entry) = status
            .aliases
            .iter()
            .enumerate()
            .max_by_key(|(i, (_, count))| (count, -(*i as i64)))
            .map(|(_, entry)| entry.clone())
        {
            status.aliases.clear();
            status.aliases.push(default_entry.clone());
            result.insert(symbol, default_entry.0);
        }
    }

    // Wherever a symbol is aliased as its default alias, remove the usage of the alias,
    // because it will now be redundant.
    let mut alias_positions_to_clear = Vec::new();
    for variable in syntax_grammar.variables.iter_mut() {
        alias_positions_to_clear.clear();

        for (i, production) in variable.productions.iter().enumerate() {
            for (j, step) in production.steps.iter().enumerate() {
                let status = match step.symbol.kind {
                    SymbolType::External => &mut external_status_list[step.symbol.index],
                    SymbolType::NonTerminal => &mut non_terminal_status_list[step.symbol.index],
                    SymbolType::Terminal => &mut terminal_status_list[step.symbol.index],
                    SymbolType::End => panic!("Unexpected end token"),
                };

                // If this step is aliased as the symbol's default alias, then remove that alias —
                // unless a sibling production still needs an explicit alias at this step index.
                if step.alias.is_some()
                    && step.alias.as_ref() == status.aliases.get(0).map(|t| &t.0)
                {
                    let mut other_productions_must_use_this_alias_at_this_index = false;
                    for (other_i, other_production) in variable.productions.iter().enumerate() {
                        if other_i != i
                            && other_production.steps.len() > j
                            && other_production.steps[j].alias == step.alias
                            && result.get(&other_production.steps[j].symbol) != step.alias.as_ref()
                        {
                            other_productions_must_use_this_alias_at_this_index = true;
                            break;
                        }
                    }

                    if !other_productions_must_use_this_alias_at_this_index {
                        alias_positions_to_clear.push((i, j));
                    }
                }
            }
        }

        for (production_index, step_index) in &alias_positions_to_clear {
            variable.productions[*production_index].steps[*step_index].alias = None;
        }
    }

    result
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::generate::grammars::{
        LexicalVariable, Production, ProductionStep, SyntaxVariable, VariableType,
    };
    use crate::generate::nfa::Nfa;

    // Renamed from `test_extract_simple_aliases`: this module tests
    // `extract_default_aliases`, and the old name referred to the removed
    // predecessor function.
    #[test]
    fn test_extract_default_aliases() {
        let mut syntax_grammar = SyntaxGrammar {
            variables: vec![
                SyntaxVariable {
                    name: "v1".to_owned(),
                    kind: VariableType::Named,
                    productions: vec![Production {
                        dynamic_precedence: 0,
                        steps: vec![
                            ProductionStep::new(Symbol::terminal(0)).with_alias("a1", true),
                            ProductionStep::new(Symbol::terminal(1)).with_alias("a2", true),
                            ProductionStep::new(Symbol::terminal(2)).with_alias("a3", true),
                            ProductionStep::new(Symbol::terminal(3)).with_alias("a4", true),
                        ],
                    }],
                },
                SyntaxVariable {
                    name: "v2".to_owned(),
                    kind: VariableType::Named,
                    productions: vec![Production {
                        dynamic_precedence: 0,
                        steps: vec![
                            // Token 0 is always aliased as "a1".
                            ProductionStep::new(Symbol::terminal(0)).with_alias("a1", true),
                            // Token 1 is aliased within rule `v1` above, but not here.
                            ProductionStep::new(Symbol::terminal(1)),
                            // Token 2 is aliased differently here than in `v1`. The alias from
                            // `v1` should be promoted to the default alias, because `v1` appears
                            // first in the grammar.
                            ProductionStep::new(Symbol::terminal(2)).with_alias("a5", true),
                            // Token 3 is also aliased differently here than in `v1`. In this case,
                            // this alias should be promoted to the default alias, because it is
                            // used a greater number of times (twice).
                            ProductionStep::new(Symbol::terminal(3)).with_alias("a6", true),
                            ProductionStep::new(Symbol::terminal(3)).with_alias("a6", true),
                        ],
                    }],
                },
            ],
            extra_symbols: Vec::new(),
            expected_conflicts: Vec::new(),
            variables_to_inline: Vec::new(),
            supertype_symbols: Vec::new(),
            external_tokens: Vec::new(),
            word_token: None,
        };
        let lexical_grammar = LexicalGrammar {
            nfa: Nfa::new(),
            variables: vec![
                LexicalVariable {
                    name: "t0".to_string(),
                    kind: VariableType::Anonymous,
                    implicit_precedence: 0,
                    start_state: 0,
                },
                LexicalVariable {
                    name: "t1".to_string(),
                    kind: VariableType::Anonymous,
                    implicit_precedence: 0,
                    start_state: 0,
                },
                LexicalVariable {
                    name: "t2".to_string(),
                    kind: VariableType::Anonymous,
                    implicit_precedence: 0,
                    start_state: 0,
                },
                LexicalVariable {
                    name: "t3".to_string(),
                    kind: VariableType::Anonymous,
                    implicit_precedence: 0,
                    start_state: 0,
                },
            ],
        };

        let default_aliases = extract_default_aliases(&mut syntax_grammar, &lexical_grammar);

        // Tokens 0, 2, and 3 get default aliases; token 1 appears unaliased in
        // `v2`, so it gets none.
        assert_eq!(default_aliases.len(), 3);
        assert_eq!(
            default_aliases.get(&Symbol::terminal(0)),
            Some(&Alias {
                value: "a1".to_string(),
                is_named: true,
            })
        );
        assert_eq!(
            default_aliases.get(&Symbol::terminal(2)),
            Some(&Alias {
                value: "a3".to_string(),
                is_named: true,
            })
        );
        assert_eq!(
            default_aliases.get(&Symbol::terminal(3)),
            Some(&Alias {
                value: "a6".to_string(),
                is_named: true,
            })
        );
        assert_eq!(default_aliases.get(&Symbol::terminal(1)), None);

        // Steps that used a symbol's default alias have the redundant alias
        // removed; all other aliases are left in place.
        assert_eq!(
            syntax_grammar.variables,
            vec![
                SyntaxVariable {
                    name: "v1".to_owned(),
                    kind: VariableType::Named,
                    productions: vec![Production {
                        dynamic_precedence: 0,
                        steps: vec![
                            ProductionStep::new(Symbol::terminal(0)),
                            ProductionStep::new(Symbol::terminal(1)).with_alias("a2", true),
                            ProductionStep::new(Symbol::terminal(2)),
                            ProductionStep::new(Symbol::terminal(3)).with_alias("a4", true),
                        ],
                    },],
                },
                SyntaxVariable {
                    name: "v2".to_owned(),
                    kind: VariableType::Named,
                    productions: vec![Production {
                        dynamic_precedence: 0,
                        steps: vec![
                            ProductionStep::new(Symbol::terminal(0)),
                            ProductionStep::new(Symbol::terminal(1)),
                            ProductionStep::new(Symbol::terminal(2)).with_alias("a5", true),
                            ProductionStep::new(Symbol::terminal(3)),
                            ProductionStep::new(Symbol::terminal(3)),
                        ],
                    },],
                },
            ]
        );
    }
}

View file

@ -1,223 +0,0 @@
use crate::generate::grammars::{LexicalGrammar, SyntaxGrammar};
use crate::generate::rules::{Alias, AliasMap, Symbol, SymbolType};
// Per-symbol bookkeeping used while scanning the grammar for simple aliases.
#[derive(Clone, Default)]
struct SymbolStatus {
    // The single alias seen so far for this symbol, if still consistent.
    alias: Option<Alias>,
    // Set once the symbol has been seen unaliased, or with two different aliases.
    conflicting: bool,
}
/// Find symbols that are *always* aliased to a single name, strip those aliases
/// from the productions, and return a map from each such symbol to its alias.
pub(super) fn extract_simple_aliases(
    syntax_grammar: &mut SyntaxGrammar,
    lexical_grammar: &LexicalGrammar,
) -> AliasMap {
    // Determine which symbols in the grammars are *always* aliased to a single name.
    let mut terminal_status_list = vec![SymbolStatus::default(); lexical_grammar.variables.len()];
    let mut non_terminal_status_list =
        vec![SymbolStatus::default(); syntax_grammar.variables.len()];
    let mut external_status_list =
        vec![SymbolStatus::default(); syntax_grammar.external_tokens.len()];
    for variable in syntax_grammar.variables.iter() {
        for production in variable.productions.iter() {
            for step in production.steps.iter() {
                let mut status = match step.symbol {
                    Symbol {
                        kind: SymbolType::External,
                        index,
                    } => &mut external_status_list[index],
                    Symbol {
                        kind: SymbolType::NonTerminal,
                        index,
                    } => &mut non_terminal_status_list[index],
                    Symbol {
                        kind: SymbolType::Terminal,
                        index,
                    } => &mut terminal_status_list[index],
                    Symbol {
                        kind: SymbolType::End,
                        ..
                    } => panic!("Unexpected end token"),
                };
                // An unaliased occurrence immediately disqualifies the symbol.
                if step.alias.is_none() {
                    status.alias = None;
                    status.conflicting = true;
                }
                // Otherwise, record the first alias seen, and mark the symbol
                // as conflicting if a different alias appears later.
                if !status.conflicting {
                    if status.alias.is_none() {
                        status.alias = step.alias.clone();
                    } else if status.alias != step.alias {
                        status.alias = None;
                        status.conflicting = true;
                    }
                }
            }
        }
    }
    // Remove the aliases for those symbols.
    for variable in syntax_grammar.variables.iter_mut() {
        for production in variable.productions.iter_mut() {
            for step in production.steps.iter_mut() {
                let status = match step.symbol {
                    Symbol {
                        kind: SymbolType::External,
                        index,
                    } => &external_status_list[index],
                    Symbol {
                        kind: SymbolType::NonTerminal,
                        index,
                    } => &non_terminal_status_list[index],
                    Symbol {
                        kind: SymbolType::Terminal,
                        index,
                    } => &terminal_status_list[index],
                    Symbol {
                        kind: SymbolType::End,
                        ..
                    } => panic!("Unexpected end token"),
                };
                // The alias is redundant for a simply-aliased symbol, so drop it.
                if status.alias.is_some() {
                    step.alias = None;
                }
            }
        }
    }
    // Populate a map of the symbols to their aliases.
    let mut result = AliasMap::new();
    for (i, status) in terminal_status_list.into_iter().enumerate() {
        if let Some(alias) = status.alias {
            result.insert(Symbol::terminal(i), alias);
        }
    }
    for (i, status) in non_terminal_status_list.into_iter().enumerate() {
        if let Some(alias) = status.alias {
            result.insert(Symbol::non_terminal(i), alias);
        }
    }
    for (i, status) in external_status_list.into_iter().enumerate() {
        if let Some(alias) = status.alias {
            result.insert(Symbol::external(i), alias);
        }
    }
    result
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::generate::grammars::{
        LexicalVariable, Production, ProductionStep, SyntaxVariable, VariableType,
    };
    use crate::generate::nfa::Nfa;

    #[test]
    fn test_extract_simple_aliases() {
        let mut syntax_grammar = SyntaxGrammar {
            variables: vec![
                SyntaxVariable {
                    name: "v1".to_owned(),
                    kind: VariableType::Named,
                    productions: vec![Production {
                        dynamic_precedence: 0,
                        steps: vec![
                            ProductionStep::new(Symbol::terminal(0)).with_alias("a1", true),
                            ProductionStep::new(Symbol::terminal(1)).with_alias("a2", true),
                            ProductionStep::new(Symbol::terminal(2)).with_alias("a3", true),
                        ],
                    }],
                },
                SyntaxVariable {
                    name: "v2".to_owned(),
                    kind: VariableType::Named,
                    productions: vec![Production {
                        dynamic_precedence: 0,
                        steps: vec![
                            // Token 0 is always aliased as "a1".
                            ProductionStep::new(Symbol::terminal(0)).with_alias("a1", true),
                            // Token 1 is aliased above, but not here.
                            ProductionStep::new(Symbol::terminal(1)),
                            // Token 2 is aliased differently than above.
                            ProductionStep::new(Symbol::terminal(2)).with_alias("a4", true),
                        ],
                    }],
                },
            ],
            extra_tokens: Vec::new(),
            expected_conflicts: Vec::new(),
            variables_to_inline: Vec::new(),
            supertype_symbols: Vec::new(),
            external_tokens: Vec::new(),
            word_token: None,
        };
        let lexical_grammar = LexicalGrammar {
            nfa: Nfa::new(),
            variables: vec![
                LexicalVariable {
                    name: "t1".to_string(),
                    kind: VariableType::Anonymous,
                    implicit_precedence: 0,
                    start_state: 0,
                },
                LexicalVariable {
                    name: "t2".to_string(),
                    kind: VariableType::Anonymous,
                    implicit_precedence: 0,
                    start_state: 0,
                },
                LexicalVariable {
                    name: "t3".to_string(),
                    kind: VariableType::Anonymous,
                    implicit_precedence: 0,
                    start_state: 0,
                },
            ],
        };
        let simple_aliases = extract_simple_aliases(&mut syntax_grammar, &lexical_grammar);
        // Only token 0 is aliased identically everywhere it appears.
        assert_eq!(simple_aliases.len(), 1);
        assert_eq!(
            simple_aliases[&Symbol::terminal(0)],
            Alias {
                value: "a1".to_string(),
                is_named: true,
            }
        );
        assert_eq!(
            syntax_grammar.variables,
            vec![
                SyntaxVariable {
                    name: "v1".to_owned(),
                    kind: VariableType::Named,
                    productions: vec![Production {
                        dynamic_precedence: 0,
                        steps: vec![
                            // 'Simple' alias removed
                            ProductionStep::new(Symbol::terminal(0)),
                            // Other aliases unchanged
                            ProductionStep::new(Symbol::terminal(1)).with_alias("a2", true),
                            ProductionStep::new(Symbol::terminal(2)).with_alias("a3", true),
                        ],
                    },],
                },
                SyntaxVariable {
                    name: "v2".to_owned(),
                    kind: VariableType::Named,
                    productions: vec![Production {
                        dynamic_precedence: 0,
                        steps: vec![
                            ProductionStep::new(Symbol::terminal(0)),
                            ProductionStep::new(Symbol::terminal(1)),
                            ProductionStep::new(Symbol::terminal(2)).with_alias("a4", true),
                        ],
                    },],
                },
            ]
        );
    }
}

View file

@ -90,21 +90,13 @@ pub(super) fn extract_tokens(
.collect();
let mut separators = Vec::new();
let mut extra_tokens = Vec::new();
for rule in grammar.extra_tokens {
let mut extra_symbols = Vec::new();
for rule in grammar.extra_symbols {
if let Rule::Symbol(symbol) = rule {
let new_symbol = symbol_replacer.replace_symbol(symbol);
if new_symbol.is_non_terminal() {
return Error::err(format!(
"Non-token symbol '{}' cannot be used as an extra token",
&variables[new_symbol.index].name
));
} else {
extra_tokens.push(new_symbol);
}
extra_symbols.push(symbol_replacer.replace_symbol(symbol));
} else {
if let Some(index) = lexical_variables.iter().position(|v| v.rule == rule) {
extra_tokens.push(Symbol::terminal(index));
extra_symbols.push(Symbol::terminal(index));
} else {
separators.push(rule);
}
@ -158,7 +150,7 @@ pub(super) fn extract_tokens(
ExtractedSyntaxGrammar {
variables,
expected_conflicts,
extra_tokens,
extra_symbols,
variables_to_inline,
supertype_symbols,
external_tokens,
@ -415,15 +407,15 @@ mod test {
}
#[test]
fn test_extracting_extra_tokens() {
fn test_extracting_extra_symbols() {
let mut grammar = build_grammar(vec![
Variable::named("rule_0", Rule::string("x")),
Variable::named("comment", Rule::pattern("//.*")),
]);
grammar.extra_tokens = vec![Rule::string(" "), Rule::non_terminal(1)];
grammar.extra_symbols = vec![Rule::string(" "), Rule::non_terminal(1)];
let (syntax_grammar, lexical_grammar) = extract_tokens(grammar).unwrap();
assert_eq!(syntax_grammar.extra_tokens, vec![Symbol::terminal(1),]);
assert_eq!(syntax_grammar.extra_symbols, vec![Symbol::terminal(1),]);
assert_eq!(lexical_grammar.separators, vec![Rule::string(" "),]);
}
@ -472,28 +464,6 @@ mod test {
);
}
#[test]
fn test_error_on_non_terminal_symbol_extras() {
let mut grammar = build_grammar(vec![
Variable::named("rule_0", Rule::non_terminal(1)),
Variable::named("rule_1", Rule::non_terminal(2)),
Variable::named("rule_2", Rule::string("x")),
]);
grammar.extra_tokens = vec![Rule::non_terminal(1)];
match extract_tokens(grammar) {
Err(e) => {
assert_eq!(
e.message(),
"Non-token symbol 'rule_1' cannot be used as an extra token"
);
}
_ => {
panic!("Expected an error but got no error");
}
}
}
#[test]
fn test_error_on_external_with_same_name_as_non_terminal() {
let mut grammar = build_grammar(vec![
@ -522,7 +492,7 @@ mod test {
fn build_grammar(variables: Vec<Variable>) -> InternedGrammar {
InternedGrammar {
variables,
extra_tokens: Vec::new(),
extra_symbols: Vec::new(),
external_tokens: Vec::new(),
expected_conflicts: Vec::new(),
variables_to_inline: Vec::new(),

View file

@ -199,7 +199,7 @@ unless they are used only as the grammar's start rule.
}
}
Ok(SyntaxGrammar {
extra_tokens: grammar.extra_tokens,
extra_symbols: grammar.extra_symbols,
expected_conflicts: grammar.expected_conflicts,
variables_to_inline: grammar.variables_to_inline,
external_tokens: grammar.external_tokens,

View file

@ -30,9 +30,9 @@ pub(super) fn intern_symbols(grammar: &InputGrammar) -> Result<InternedGrammar>
external_tokens.push(Variable { name, kind, rule });
}
let mut extra_tokens = Vec::with_capacity(grammar.extra_tokens.len());
for extra_token in grammar.extra_tokens.iter() {
extra_tokens.push(interner.intern_rule(extra_token)?);
let mut extra_symbols = Vec::with_capacity(grammar.extra_symbols.len());
for extra_token in grammar.extra_symbols.iter() {
extra_symbols.push(interner.intern_rule(extra_token)?);
}
let mut supertype_symbols = Vec::with_capacity(grammar.supertype_symbols.len());
@ -73,10 +73,16 @@ pub(super) fn intern_symbols(grammar: &InputGrammar) -> Result<InternedGrammar>
);
}
for (i, variable) in variables.iter_mut().enumerate() {
if supertype_symbols.contains(&Symbol::non_terminal(i)) {
variable.kind = VariableType::Hidden;
}
}
Ok(InternedGrammar {
variables,
external_tokens,
extra_tokens,
extra_symbols,
expected_conflicts,
variables_to_inline,
supertype_symbols,
@ -236,7 +242,7 @@ mod tests {
InputGrammar {
variables,
name: "the_language".to_string(),
extra_tokens: Vec::new(),
extra_symbols: Vec::new(),
external_tokens: Vec::new(),
expected_conflicts: Vec::new(),
variables_to_inline: Vec::new(),

View file

@ -1,6 +1,6 @@
mod expand_repeats;
mod expand_tokens;
mod extract_simple_aliases;
mod extract_default_aliases;
mod extract_tokens;
mod flatten_grammar;
mod intern_symbols;
@ -8,7 +8,7 @@ mod process_inlines;
use self::expand_repeats::expand_repeats;
pub(crate) use self::expand_tokens::expand_tokens;
use self::extract_simple_aliases::extract_simple_aliases;
use self::extract_default_aliases::extract_default_aliases;
use self::extract_tokens::extract_tokens;
use self::flatten_grammar::flatten_grammar;
use self::intern_symbols::intern_symbols;
@ -21,7 +21,7 @@ use crate::generate::rules::{AliasMap, Rule, Symbol};
pub(crate) struct IntermediateGrammar<T, U> {
variables: Vec<Variable>,
extra_tokens: Vec<T>,
extra_symbols: Vec<T>,
expected_conflicts: Vec<Vec<Symbol>>,
external_tokens: Vec<U>,
variables_to_inline: Vec<Symbol>,
@ -52,7 +52,7 @@ pub(crate) fn prepare_grammar(
let syntax_grammar = expand_repeats(syntax_grammar);
let mut syntax_grammar = flatten_grammar(syntax_grammar)?;
let lexical_grammar = expand_tokens(lexical_grammar)?;
let simple_aliases = extract_simple_aliases(&mut syntax_grammar, &lexical_grammar);
let default_aliases = extract_default_aliases(&mut syntax_grammar, &lexical_grammar);
let inlines = process_inlines(&syntax_grammar);
Ok((syntax_grammar, lexical_grammar, inlines, simple_aliases))
Ok((syntax_grammar, lexical_grammar, inlines, default_aliases))
}

View file

@ -127,6 +127,9 @@ impl InlinedProductionMapBuilder {
last_inserted_step.associativity = removed_step.associativity;
}
}
if p.dynamic_precedence.abs() > production.dynamic_precedence.abs() {
production.dynamic_precedence = p.dynamic_precedence;
}
production
}),
);
@ -196,7 +199,7 @@ mod tests {
fn test_basic_inlining() {
let grammar = SyntaxGrammar {
expected_conflicts: Vec::new(),
extra_tokens: Vec::new(),
extra_symbols: Vec::new(),
external_tokens: Vec::new(),
supertype_symbols: Vec::new(),
word_token: None,
@ -226,7 +229,7 @@ mod tests {
],
},
Production {
dynamic_precedence: 0,
dynamic_precedence: -2,
steps: vec![ProductionStep::new(Symbol::terminal(14))],
},
],
@ -258,7 +261,7 @@ mod tests {
],
},
Production {
dynamic_precedence: 0,
dynamic_precedence: -2,
steps: vec![
ProductionStep::new(Symbol::terminal(10)),
ProductionStep::new(Symbol::terminal(14)),
@ -327,7 +330,7 @@ mod tests {
Symbol::non_terminal(3),
],
expected_conflicts: Vec::new(),
extra_tokens: Vec::new(),
extra_symbols: Vec::new(),
external_tokens: Vec::new(),
supertype_symbols: Vec::new(),
word_token: None,
@ -429,7 +432,7 @@ mod tests {
},
],
expected_conflicts: Vec::new(),
extra_tokens: Vec::new(),
extra_symbols: Vec::new(),
external_tokens: Vec::new(),
supertype_symbols: Vec::new(),
word_token: None,

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -1,3 +1,4 @@
use super::grammars::VariableType;
use smallbitvec::SmallBitVec;
use std::collections::HashMap;
use std::iter::FromIterator;
@ -139,6 +140,16 @@ impl Rule {
}
}
impl Alias {
    /// The variable type this alias produces: named aliases yield named nodes,
    /// all others yield anonymous nodes.
    pub fn kind(&self) -> VariableType {
        match self.is_named {
            true => VariableType::Named,
            false => VariableType::Anonymous,
        }
    }
}
#[cfg(test)]
impl Rule {
pub fn terminal(index: usize) -> Self {
@ -366,7 +377,7 @@ impl FromIterator<Symbol> for TokenSet {
fn add_metadata<T: FnOnce(&mut MetadataParams)>(input: Rule, f: T) -> Rule {
match input {
Rule::Metadata { rule, mut params } => {
Rule::Metadata { rule, mut params } if !params.is_token => {
f(&mut params);
Rule::Metadata { rule, params }
}

View file

@ -24,6 +24,12 @@ pub(crate) enum ParseAction {
},
}
// The action taken on a non-terminal symbol in a parse state.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub(crate) enum GotoAction {
    // Transition to the given parse state.
    Goto(ParseStateId),
    // NOTE(review): name suggests the symbol is consumed as an "extra" without
    // changing state — confirm against the parser's handling of this variant.
    ShiftExtra,
}
#[derive(Clone, Debug, PartialEq, Eq)]
pub(crate) struct ParseTableEntry {
pub actions: Vec<ParseAction>,
@ -34,10 +40,11 @@ pub(crate) struct ParseTableEntry {
pub(crate) struct ParseState {
    // This state's index in the parse table.
    pub id: ParseStateId,
    // Actions keyed by terminal symbol.
    pub terminal_entries: HashMap<Symbol, ParseTableEntry>,
    // Goto actions keyed by non-terminal symbol.
    pub nonterminal_entries: HashMap<Symbol, GotoAction>,
    // Index of the lex state used while lexing in this parse state.
    pub lex_state_id: usize,
    // Index of the external-scanner lex state for this parse state.
    pub external_lex_state_id: usize,
    // NOTE(review): presumably identifies the item-set core shared by states
    // that were merged or grouped — confirm against the table builder.
    pub core_id: usize,
    // True if this state belongs to a non-terminal "extra" rule.
    pub is_non_terminal_extra: bool,
}
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
@ -70,6 +77,7 @@ pub(crate) struct AdvanceAction {
#[derive(Clone, Debug, Default, PartialEq, Eq, PartialOrd, Ord)]
pub(crate) struct LexState {
    // The token to accept if lexing ends in this state.
    pub accept_action: Option<Symbol>,
    // Transition taken when end-of-file is reached while in this state
    // (replaces the previous representation of EOF as a '\0' advance entry).
    pub eof_action: Option<AdvanceAction>,
    // Transitions keyed by the set of characters that trigger them.
    pub advance_actions: Vec<(CharacterSet, AdvanceAction)>,
}
@ -103,7 +111,13 @@ impl ParseState {
_ => None,
})
})
.chain(self.nonterminal_entries.iter().map(|(_, state)| *state))
.chain(self.nonterminal_entries.iter().filter_map(|(_, action)| {
if let GotoAction::Goto(state) = action {
Some(*state)
} else {
None
}
}))
}
pub fn update_referenced_states<F>(&mut self, mut f: F)
@ -121,15 +135,18 @@ impl ParseState {
}
}
}
for (symbol, other_state) in &self.nonterminal_entries {
let result = f(*other_state, self);
if result != *other_state {
updates.push((*symbol, 0, result));
for (symbol, action) in &self.nonterminal_entries {
if let GotoAction::Goto(other_state) = action {
let result = f(*other_state, self);
if result != *other_state {
updates.push((*symbol, 0, result));
}
}
}
for (symbol, action_index, new_state) in updates {
if symbol.is_non_terminal() {
self.nonterminal_entries.insert(symbol, new_state);
self.nonterminal_entries
.insert(symbol, GotoAction::Goto(new_state));
} else {
let entry = self.terminal_entries.get_mut(&symbol).unwrap();
if let ParseAction::Shift { is_repetition, .. } = entry.actions[action_index] {