Refine logic for deciding when tokens need to be re-lexed

* While generating the lex table, note which tokens can match the
  same string. A token needs to be relexed when it has possible
  homonyms in the current state.
* Also note which tokens can match substrings of other tokens.
  A token needs to be relexed when there are viable tokens that
  could match longer strings in the current state and the next
  token has been edited.
* Remove the logic for marking tokens as fragile on creation.
* Move the reusability/non-reusability of symbols off of individual
  actions and onto the entire entry for the state & symbol.
This commit is contained in:
Max Brunsfeld 2016-06-21 07:28:04 -07:00
parent 45f7cee0c8
commit 38c144b4a3
19 changed files with 337 additions and 257 deletions

View file

@ -92,7 +92,8 @@ class LexTableBuilder {
AdvanceAction action(-1, transition.precedence, transition.in_main_token);
auto current_action = lex_table.state(state_id).accept_action;
if (conflict_manager.resolve(action, current_action)) {
if (conflict_manager.resolve(transition.destination, action,
current_action)) {
action.state_index = add_lex_state(transition.destination);
lex_table.state(state_id).advance_actions[characters] = action;
}
@ -114,10 +115,31 @@ class LexTableBuilder {
}
// Flags parse-table entries whose token may need to be re-lexed, based on
// the homonym/extension relations collected by the lex conflict manager.
// NOTE(review): this is a rendered diff without +/- markers; the LexState
// loop appears to be the pre-change body and the ParseState loop its
// replacement — confirm against the real file.
void mark_fragile_tokens() {
for (LexState &state : lex_table.states)
if (state.accept_action.is_present())
if (conflict_manager.fragile_tokens.count(state.accept_action.symbol))
state.accept_action.is_fragile = true;
for (ParseState &state : parse_table->states) {
for (auto &entry : state.entries) {
// Only token symbols are examined.
if (!entry.first.is_token)
continue;
// If any recorded homonym of this token also has an entry in the same
// parse state, the entry is marked as not reusable.
auto homonyms = conflict_manager.possible_homonyms.find(entry.first);
if (homonyms != conflict_manager.possible_homonyms.end())
for (const Symbol &homonym : homonyms->second)
if (state.entries.count(homonym)) {
entry.second.reusable = false;
break;
}
// Non-reusable entries skip the extension check below.
if (!entry.second.reusable)
continue;
// If any recorded extension of this token also has an entry in the same
// parse state, the entry is marked as depending on the lookahead.
auto extensions = conflict_manager.possible_extensions.find(entry.first);
if (extensions != conflict_manager.possible_extensions.end())
for (const Symbol &extension : extensions->second)
if (state.entries.count(extension)) {
entry.second.depends_on_lookahead = true;
break;
}
}
}
}
void remove_duplicate_lex_states() {

View file

@ -110,7 +110,7 @@ class ParseTableBuilder {
}
for (const Symbol &symbol : grammar.extra_tokens) {
parse_table.error_state.actions[symbol].push_back(
parse_table.error_state.entries[symbol].actions.push_back(
ParseAction::ShiftExtra());
}
@ -119,7 +119,7 @@ class ParseTableBuilder {
add_out_of_context_parse_state(symbol);
}
parse_table.error_state.actions[rules::END_OF_INPUT()].push_back(
parse_table.error_state.entries[rules::END_OF_INPUT()].actions.push_back(
ParseAction::Shift(0, PrecedenceRange()));
}
@ -127,7 +127,7 @@ class ParseTableBuilder {
const ParseItemSet &item_set = recovery_states[symbol];
if (!item_set.entries.empty()) {
ParseStateId state = add_parse_state(item_set);
parse_table.error_state.actions[symbol].push_back(
parse_table.error_state.entries[symbol].actions.push_back(
ParseAction::Shift(state, PrecedenceRange()));
}
}
@ -198,15 +198,15 @@ class ParseTableBuilder {
const ParseState &state = parse_table.states[state_id];
for (const Symbol &extra_symbol : grammar.extra_tokens) {
const auto &actions_for_symbol = state.actions.find(extra_symbol);
if (actions_for_symbol == state.actions.end())
const auto &entry_for_symbol = state.entries.find(extra_symbol);
if (entry_for_symbol == state.entries.end())
continue;
for (const ParseAction &action : actions_for_symbol->second)
for (const ParseAction &action : entry_for_symbol->second.actions)
if (action.type == ParseActionTypeShift && !action.extra) {
size_t dest_state_id = action.state_index;
ParseAction reduce_extra = ParseAction::ReduceExtra(extra_symbol);
for (const auto &pair : state.actions)
for (const auto &pair : state.entries)
add_action(dest_state_id, pair.first, reduce_extra, null_item_set);
}
}
@ -216,11 +216,14 @@ class ParseTableBuilder {
for (ParseState &state : parse_table.states) {
set<Symbol> symbols_with_multiple_actions;
for (auto &entry : state.actions) {
if (entry.second.size() > 1)
symbols_with_multiple_actions.insert(entry.first);
for (auto &entry : state.entries) {
const Symbol &symbol = entry.first;
auto &actions = entry.second.actions;
for (ParseAction &action : entry.second) {
if (actions.size() > 1)
symbols_with_multiple_actions.insert(symbol);
for (ParseAction &action : actions) {
if (action.type == ParseActionTypeReduce && !action.extra) {
if (has_fragile_production(action.production))
action.fragile = true;
@ -231,11 +234,11 @@ class ParseTableBuilder {
}
}
for (auto i = entry.second.begin(); i != entry.second.end();) {
for (auto i = actions.begin(); i != actions.end();) {
bool erased = false;
for (auto j = entry.second.begin(); j != i; j++) {
for (auto j = actions.begin(); j != i; j++) {
if (*j == *i) {
entry.second.erase(i);
actions.erase(i);
erased = true;
break;
}
@ -246,12 +249,12 @@ class ParseTableBuilder {
}
if (!symbols_with_multiple_actions.empty()) {
for (auto &entry : state.actions) {
for (auto &entry : state.entries) {
if (!entry.first.is_token) {
set<Symbol> first_set = get_first_set(entry.first);
for (const Symbol &symbol : symbols_with_multiple_actions) {
if (first_set.count(symbol)) {
entry.second[0].can_hide_split = true;
entry.second.reusable = false;
break;
}
}
@ -276,14 +279,14 @@ class ParseTableBuilder {
ParseAction *add_action(ParseStateId state_id, Symbol lookahead,
const ParseAction &new_action,
const ParseItemSet &item_set) {
const auto &current_actions = parse_table.states[state_id].actions;
const auto &current_entry = current_actions.find(lookahead);
if (current_entry == current_actions.end())
const ParseState &state = parse_table.states[state_id];
const auto &current_entry = state.entries.find(lookahead);
if (current_entry == state.entries.end())
return &parse_table.set_action(state_id, lookahead, new_action);
if (allow_any_conflict)
return &parse_table.add_action(state_id, lookahead, new_action);
const ParseAction old_action = current_entry->second[0];
const ParseAction old_action = current_entry->second.actions[0];
auto resolution = conflict_manager.resolve(new_action, old_action);
switch (resolution.second) {

View file

@ -2,15 +2,23 @@
#include <utility>
#include "compiler/parse_table.h"
#include "compiler/rules/built_in_symbols.h"
#include "compiler/build_tables/lex_item.h"
namespace tree_sitter {
namespace build_tables {
// Resolves an advance action against the accept action already recorded for
// a lex state. Returns true when there is no existing accept action, or when
// the advance action's maximum precedence is at least the accept action's
// precedence; in the latter case the lhs of every item in item_set is
// recorded as a possible extension of the accepted symbol. Returns false
// otherwise.
// NOTE(review): rendered diff without +/- markers — the one-line signature
// and the bare `return` comparison look like the pre-change variants; confirm.
bool LexConflictManager::resolve(const AdvanceAction &new_action,
bool LexConflictManager::resolve(const LexItemSet &item_set,
const AdvanceAction &new_action,
const AcceptTokenAction &old_action) {
// Nothing to conflict with: the advance action is always kept.
if (!old_action.is_present())
return true;
return new_action.precedence_range.max >= old_action.precedence;
if (new_action.precedence_range.max >= old_action.precedence) {
// Every token that is still viable in the destination item set is noted as
// a possible extension of the token accepted here.
for (const LexItem &item : item_set.entries)
possible_extensions[old_action.symbol].insert(item.lhs);
return true;
} else {
return false;
}
}
bool LexConflictManager::resolve(const AcceptTokenAction &new_action,
@ -36,9 +44,9 @@ bool LexConflictManager::resolve(const AcceptTokenAction &new_action,
result = false;
if (result)
fragile_tokens.insert(old_action.symbol);
possible_homonyms[old_action.symbol].insert(new_action.symbol);
else
fragile_tokens.insert(new_action.symbol);
possible_homonyms[new_action.symbol].insert(old_action.symbol);
return result;
}

View file

@ -1,6 +1,7 @@
#ifndef COMPILER_BUILD_TABLES_LEX_CONFLICT_MANAGER_H_
#define COMPILER_BUILD_TABLES_LEX_CONFLICT_MANAGER_H_
#include <map>
#include <set>
#include "compiler/lexical_grammar.h"
#include "compiler/rules/symbol.h"
@ -12,12 +13,16 @@ struct AcceptTokenAction;
namespace build_tables {
class LexItemSet;
// Resolves conflicts between lex actions and records, as a side effect,
// which tokens are related to each other.
// NOTE(review): rendered diff without +/- markers — the two-argument
// advance/accept overload and `fragile_tokens` look like pre-change
// declarations; confirm against the real header.
class LexConflictManager {
public:
bool resolve(const AdvanceAction &, const AcceptTokenAction &);
// Advance-vs-accept resolution; the item set supplies the symbols recorded
// in possible_extensions.
bool resolve(const LexItemSet &, const AdvanceAction &,
const AcceptTokenAction &);
// Accept-vs-accept resolution; records symbols in possible_homonyms.
bool resolve(const AcceptTokenAction &, const AcceptTokenAction &);
std::set<rules::Symbol> fragile_tokens;
// Per token symbol, the other tokens noted as able to match the same string.
std::map<rules::Symbol, std::set<rules::Symbol>> possible_homonyms;
// Per token symbol, the tokens noted as able to match a longer string.
std::map<rules::Symbol, std::set<rules::Symbol>> possible_extensions;
};
} // namespace build_tables

View file

@ -72,7 +72,7 @@ class CCodeGenerator {
const SyntaxGrammar syntax_grammar;
const LexicalGrammar lexical_grammar;
map<string, string> sanitized_names;
vector<pair<size_t, vector<ParseAction>>> parse_actions;
vector<pair<size_t, ParseTableEntry>> parse_table_entries;
vector<pair<size_t, set<rules::Symbol>>> in_progress_symbols;
size_t next_parse_action_list_index;
size_t next_in_progress_symbol_list_index;
@ -155,35 +155,28 @@ class CCodeGenerator {
for (const auto &entry : parse_table.symbols) {
const rules::Symbol &symbol = entry.first;
line("[" + symbol_id(symbol) + "] = {");
indent([&]() {
switch (symbol_type(symbol)) {
case VariableTypeNamed:
line(".visible = true,");
line(".named = true,");
break;
case VariableTypeAnonymous:
line(".visible = true,");
line(".named = false,");
break;
case VariableTypeHidden:
case VariableTypeAuxiliary:
line(".visible = false,");
line(".named = false,");
break;
}
switch (symbol_type(symbol)) {
case VariableTypeNamed:
add(".visible = true, .named = true");
break;
case VariableTypeAnonymous:
add(".visible = true, .named = false");
break;
case VariableTypeHidden:
case VariableTypeAuxiliary:
add(".visible = false, .named = false");
break;
}
line(".structural = " + _boolean(entry.second.structural) + ",");
line(".extra = " + _boolean(entry.second.extra) + ",");
});
add(", ");
if (entry.second.structural)
add(".structural = true");
else
add(".structural = false");
add(", ");
if (syntax_grammar.extra_tokens.count(symbol))
add(".extra = true");
else
add(".extra = false");
add("},");
line("},");
}
});
line("};");
@ -221,11 +214,10 @@ class CCodeGenerator {
void add_recovery_parse_states_list() {
line("static TSParseAction ts_recovery_actions[SYMBOL_COUNT] = {");
indent([&]() {
for (const auto &entry : parse_table.error_state.actions) {
const rules::Symbol &symbol = entry.first;
if (!entry.second.empty()) {
line("[" + symbol_id(symbol) + "] = ");
ParseAction action = entry.second[0];
for (const auto &entry : parse_table.error_state.entries) {
if (!entry.second.actions.empty()) {
line("[" + symbol_id(entry.first) + "] = ");
ParseAction action = entry.second.actions[0];
if (action.extra) {
add("RECOVER_EXTRA(),");
} else {
@ -239,7 +231,8 @@ class CCodeGenerator {
}
void add_parse_table() {
add_parse_action_list_id({ ParseAction::Error() });
add_parse_action_list_id(
ParseTableEntry{ { ParseAction::Error() }, true, false });
size_t state_id = 0;
line("#pragma GCC diagnostic push");
@ -251,9 +244,9 @@ class CCodeGenerator {
for (const auto &state : parse_table.states) {
line("[" + to_string(state_id++) + "] = {");
indent([&]() {
for (const auto &pair : state.actions) {
line("[" + symbol_id(pair.first) + "] = ");
add(to_string(add_parse_action_list_id(pair.second)));
for (const auto &entry : state.entries) {
line("[" + symbol_id(entry.first) + "] = ");
add(to_string(add_parse_action_list_id(entry.second)));
add(",");
}
});
@ -338,22 +331,21 @@ class CCodeGenerator {
}
// Emits the generated-code line that accepts the action's token symbol.
// NOTE(review): rendered diff without +/- markers — the is_fragile branch
// looks like the pre-change body and the final unconditional ACCEPT_TOKEN
// line its replacement; confirm.
void add_accept_token_action(const AcceptTokenAction &action) {
if (action.is_fragile)
line("ACCEPT_FRAGILE_TOKEN(" + symbol_id(action.symbol) + ");");
else
line("ACCEPT_TOKEN(" + symbol_id(action.symbol) + ");");
line("ACCEPT_TOKEN(" + symbol_id(action.symbol) + ");");
}
void add_parse_action_list() {
line("static TSParseActionEntry ts_parse_actions[] = {");
indent([&]() {
for (const auto &pair : parse_actions) {
for (const auto &pair : parse_table_entries) {
size_t index = pair.first;
line("[" + to_string(index) + "] = {.count = " +
to_string(pair.second.size()) + "},");
to_string(pair.second.actions.size()) + ", .reusable = " +
_boolean(pair.second.reusable) + ", .depends_on_lookahead = " +
_boolean(pair.second.depends_on_lookahead) + "},");
for (const ParseAction &action : pair.second) {
for (const ParseAction &action : pair.second.actions) {
add(" ");
switch (action.type) {
case ParseActionTypeError:
@ -366,19 +358,18 @@ class CCodeGenerator {
if (action.extra) {
add("SHIFT_EXTRA()");
} else {
add("SHIFT(" + to_string(action.state_index) + ", ");
add_action_flags(action);
add(")");
add("SHIFT(" + to_string(action.state_index) + ")");
}
break;
case ParseActionTypeReduce:
if (action.extra) {
add("REDUCE_EXTRA(" + symbol_id(action.symbol) + ")");
} else if (action.fragile) {
add("REDUCE_FRAGILE(" + symbol_id(action.symbol) + ", " +
to_string(action.consumed_symbol_count) + ")");
} else {
add("REDUCE(" + symbol_id(action.symbol) + ", " +
to_string(action.consumed_symbol_count) + ", ");
add_action_flags(action);
add(")");
to_string(action.consumed_symbol_count) + ")");
}
break;
default: {}
@ -391,16 +382,16 @@ class CCodeGenerator {
line("};");
}
// Returns the list index assigned to the given entry. An entry equal to one
// already registered reuses that entry's index; otherwise the entry is
// appended at next_parse_action_list_index, which then advances by one slot
// plus one slot per action.
// NOTE(review): rendered diff without +/- markers — the vector<ParseAction>
// signature and the parse_actions lines look like the pre-change variants.
size_t add_parse_action_list_id(const vector<ParseAction> &actions) {
for (const auto &pair : parse_actions) {
if (pair.second == actions) {
size_t add_parse_action_list_id(const ParseTableEntry &entry) {
for (const auto &pair : parse_table_entries) {
if (pair.second == entry) {
return pair.first;
}
}
size_t result = next_parse_action_list_index;
parse_actions.push_back({ next_parse_action_list_index, actions });
next_parse_action_list_index += 1 + actions.size();
parse_table_entries.push_back({ next_parse_action_list_index, entry });
next_parse_action_list_index += 1 + entry.actions.size();
return result;
}
@ -417,17 +408,6 @@ class CCodeGenerator {
return result;
}
// Emits the flag expression for a parse action: the names of the set flags
// joined with '|', or "0" when neither flag is set.
void add_action_flags(const ParseAction &action) {
  const char *flags = "0";
  if (action.fragile)
    flags = action.can_hide_split ? "FRAGILE|CAN_HIDE_SPLIT" : "FRAGILE";
  else if (action.can_hide_split)
    flags = "CAN_HIDE_SPLIT";
  add(flags);
}
// Helper functions
string symbol_id(const rules::Symbol &symbol) {

View file

@ -27,14 +27,11 @@ bool AdvanceAction::operator==(const AdvanceAction &other) const {
}
// Default constructor: symbol is rules::NONE(), precedence 0, not a string.
// NOTE(review): rendered diff without +/- markers — initializer lists that
// mention is_fragile look like the pre-change variants; confirm.
AcceptTokenAction::AcceptTokenAction()
: symbol(rules::NONE()), precedence(0), is_string(false), is_fragile(false) {}
: symbol(rules::NONE()), precedence(0), is_string(false) {}
// Constructs an accept action for the given symbol, precedence and
// is_string flag.
AcceptTokenAction::AcceptTokenAction(Symbol symbol, int precedence,
bool is_string)
: symbol(symbol),
precedence(precedence),
is_string(is_string),
is_fragile(false) {}
: symbol(symbol), precedence(precedence), is_string(is_string) {}
bool AcceptTokenAction::is_present() const {
return symbol != rules::NONE();
@ -42,7 +39,7 @@ bool AcceptTokenAction::is_present() const {
// Two accept actions are equal when symbol, precedence and is_string match.
// NOTE(review): rendered diff without +/- markers — the line that also
// compares is_fragile looks like the pre-change variant; confirm.
bool AcceptTokenAction::operator==(const AcceptTokenAction &other) const {
return (symbol == other.symbol) && (precedence == other.precedence) &&
(is_string == other.is_string) && (is_fragile == other.is_fragile);
(is_string == other.is_string);
}
LexState::LexState() : is_token_start(false) {}

View file

@ -39,7 +39,6 @@ struct AcceptTokenAction {
rules::Symbol symbol;
int precedence;
bool is_string;
bool is_fragile;
};
} // namespace tree_sitter

View file

@ -20,7 +20,6 @@ ParseAction::ParseAction(ParseActionType type, ParseStateId state_index,
: type(type),
extra(false),
fragile(false),
can_hide_split(false),
symbol(symbol),
state_index(state_index),
consumed_symbol_count(consumed_symbol_count),
@ -32,7 +31,6 @@ ParseAction::ParseAction()
: type(ParseActionTypeError),
extra(false),
fragile(false),
can_hide_split(false),
symbol(Symbol(-1)),
state_index(-1),
consumed_symbol_count(0),
@ -81,9 +79,8 @@ ParseAction ParseAction::Reduce(Symbol symbol, size_t consumed_symbol_count,
// Member-wise equality over type, extra, fragile, symbol, state_index,
// production and consumed_symbol_count.
// NOTE(review): rendered diff without +/- markers — the lines comparing
// can_hide_split look like the pre-change variant; confirm.
bool ParseAction::operator==(const ParseAction &other) const {
return (type == other.type && extra == other.extra &&
fragile == other.fragile && can_hide_split == other.can_hide_split &&
symbol == other.symbol && state_index == other.state_index &&
production == other.production &&
fragile == other.fragile && symbol == other.symbol &&
state_index == other.state_index && production == other.production &&
consumed_symbol_count == other.consumed_symbol_count);
}
@ -100,10 +97,6 @@ bool ParseAction::operator<(const ParseAction &other) const {
return true;
if (other.fragile && !fragile)
return false;
if (can_hide_split && !other.can_hide_split)
return true;
if (other.can_hide_split && !can_hide_split)
return false;
if (symbol < other.symbol)
return true;
if (other.symbol < symbol)
@ -119,24 +112,38 @@ bool ParseAction::operator<(const ParseAction &other) const {
return consumed_symbol_count < other.consumed_symbol_count;
}
// A default entry holds no actions, is reusable, and does not depend on the
// lookahead.
ParseTableEntry::ParseTableEntry()
    : actions(), reusable(true), depends_on_lookahead(false) {}

// Builds an entry from an explicit action list and the two flags.
ParseTableEntry::ParseTableEntry(const vector<ParseAction> &actions,
                                 bool reusable, bool depends_on_lookahead)
    : actions(actions),
      reusable(reusable),
      depends_on_lookahead(depends_on_lookahead) {}

// Entries are equal when both flags and the action lists match.
bool ParseTableEntry::operator==(const ParseTableEntry &other) const {
  if (reusable != other.reusable)
    return false;
  if (depends_on_lookahead != other.depends_on_lookahead)
    return false;
  return actions == other.actions;
}
ParseState::ParseState() : lex_state_id(-1) {}
// Returns the set of symbols that have an entry in this parse state.
// NOTE(review): rendered diff without +/- markers — the loop over `actions`
// looks like the pre-change variant of the loop over `entries`; confirm.
set<Symbol> ParseState::expected_inputs() const {
set<Symbol> result;
for (auto &pair : actions)
result.insert(pair.first);
for (auto &entry : entries)
result.insert(entry.first);
return result;
}
// Calls fn with a pointer to every shift action stored in this state, so the
// callback may modify the action in place.
// NOTE(review): rendered diff without +/- markers — the loops over `actions`
// look like the pre-change variants of the loops over `entries`; confirm.
void ParseState::each_advance_action(function<void(ParseAction *)> fn) {
for (auto &entry : actions)
for (ParseAction &action : entry.second)
for (auto &entry : entries)
for (ParseAction &action : entry.second.actions)
if (action.type == ParseActionTypeShift)
fn(&action);
}
// Parse states compare equal when their per-symbol tables are equal;
// lex_state_id is not part of the comparison.
// NOTE(review): rendered diff without +/- markers — the `actions` comparison
// looks like the pre-change variant of the `entries` comparison; confirm.
bool ParseState::operator==(const ParseState &other) const {
return actions == other.actions;
return entries == other.entries;
}
set<Symbol> ParseTable::all_symbols() const {
@ -154,26 +161,28 @@ ParseStateId ParseTable::add_state() {
// Replaces whatever actions state `id` holds for `symbol` with the single
// given action and returns a reference to the stored copy. Also updates the
// symbol's metadata: `extra` for an extra action, `structural` otherwise.
// NOTE(review): rendered diff without +/- markers — the bare
// `symbols[symbol];` and the `.actions[symbol]` lines look like the
// pre-change variants; confirm against the real file.
ParseAction &ParseTable::set_action(ParseStateId id, Symbol symbol,
ParseAction action) {
if (action.extra)
symbols[symbol];
symbols[symbol].extra = true;
else
symbols[symbol].structural = true;
states[id].actions[symbol] = vector<ParseAction>({ action });
return *states[id].actions[symbol].begin();
states[id].entries[symbol].actions = { action };
return *states[id].entries[symbol].actions.begin();
}
// Adds the action to the list state `id` holds for `symbol`, unless an equal
// action is already present. Returns a reference to the existing or newly
// appended action. Also updates the symbol's metadata: `extra` for an extra
// action, `structural` otherwise.
// NOTE(review): rendered diff without +/- markers — the bare
// `symbols[symbol];` and the `states[id].actions[symbol]` lines look like
// the pre-change variants; confirm against the real file.
ParseAction &ParseTable::add_action(ParseStateId id, Symbol symbol,
ParseAction action) {
if (action.extra)
symbols[symbol];
symbols[symbol].extra = true;
else
symbols[symbol].structural = true;
for (ParseAction &existing_action : states[id].actions[symbol])
ParseState &state = states[id];
// Avoid storing duplicates: hand back the matching action if there is one.
for (ParseAction &existing_action : state.entries[symbol].actions)
if (existing_action == action)
return existing_action;
states[id].actions[symbol].push_back(action);
return *states[id].actions[symbol].rbegin();
state.entries[symbol].actions.push_back(action);
return *state.entries[symbol].actions.rbegin();
}
} // namespace tree_sitter

View file

@ -15,12 +15,12 @@ namespace tree_sitter {
typedef uint64_t ParseStateId;
// The kinds of action a parse table entry can hold.
// NOTE(review): rendered diff without +/- markers — the `typedef enum {` /
// `} ParseActionType;` pair looks like the pre-change spelling of the plain
// `enum ParseActionType { ... };` form; confirm.
typedef enum {
enum ParseActionType {
ParseActionTypeError,
ParseActionTypeShift,
ParseActionTypeReduce,
ParseActionTypeAccept,
} ParseActionType;
};
class ParseAction {
ParseAction(ParseActionType type, ParseStateId state_index,
@ -43,7 +43,6 @@ class ParseAction {
ParseActionType type;
bool extra;
bool fragile;
bool can_hide_split;
rules::Symbol symbol;
ParseStateId state_index;
size_t consumed_symbol_count;
@ -52,30 +51,16 @@ class ParseAction {
const Production *production;
};
} // namespace tree_sitter
struct ParseTableEntry {
std::vector<ParseAction> actions;
bool reusable;
bool depends_on_lookahead;
namespace std {
template <>
struct hash<tree_sitter::ParseAction> {
size_t operator()(const tree_sitter::ParseAction &action) const {
return (hash<int>()(action.type) ^
hash<tree_sitter::rules::Symbol>()(action.symbol) ^
hash<size_t>()(action.state_index) ^
hash<size_t>()(action.consumed_symbol_count) ^
hash<bool>()(action.extra) ^ hash<bool>()(action.fragile) ^
hash<bool>()(action.can_hide_split) ^
hash<int>()(action.associativity) ^
hash<int>()(action.precedence_range.min) ^
hash<int>()(action.precedence_range.max) ^
hash<const void *>()(&action.production));
}
ParseTableEntry();
ParseTableEntry(const std::vector<ParseAction> &, bool, bool);
bool operator==(const ParseTableEntry &other) const;
};
} // namespace std
namespace tree_sitter {
class ParseState {
public:
ParseState();
@ -83,11 +68,12 @@ class ParseState {
bool operator==(const ParseState &) const;
void each_advance_action(std::function<void(ParseAction *)>);
std::map<rules::Symbol, std::vector<ParseAction>> actions;
std::map<rules::Symbol, ParseTableEntry> entries;
LexStateId lex_state_id;
};
// Per-symbol flags kept alongside the parse table.
// NOTE(review): presumably the value type of ParseTable::symbols, whose
// entries are set by set_action/add_action — confirm against the full header.
struct ParseTableSymbolMetadata {
// Set when an action with `extra` set is stored for the symbol.
bool extra;
// Set when an action without `extra` set is stored for the symbol.
bool structural;
};