From 1b5ae380ce57a4e65a9a5f7255dfe070cc3f3147 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Wed, 11 Dec 2019 17:18:15 -0800 Subject: [PATCH] Don't attempt to extract keywords that don't entirely match word token (#505) --- cli/src/generate/build_tables/mod.rs | 1 + .../generate/build_tables/token_conflicts.rs | 41 +++++++++++++++---- 2 files changed, 35 insertions(+), 7 deletions(-) diff --git a/cli/src/generate/build_tables/mod.rs b/cli/src/generate/build_tables/mod.rs index e0f84244..2e5d2f57 100644 --- a/cli/src/generate/build_tables/mod.rs +++ b/cli/src/generate/build_tables/mod.rs @@ -271,6 +271,7 @@ fn identify_keywords( cursor.reset(vec![variable.start_state]); if all_chars_are_alphabetical(&cursor) && token_conflict_map.does_match_same_string(i, word_token.index) + && !token_conflict_map.does_match_different_string(i, word_token.index) { info!( "Keywords - add candidate {}", diff --git a/cli/src/generate/build_tables/token_conflicts.rs b/cli/src/generate/build_tables/token_conflicts.rs index a71dcca6..64e7564b 100644 --- a/cli/src/generate/build_tables/token_conflicts.rs +++ b/cli/src/generate/build_tables/token_conflicts.rs @@ -13,6 +13,7 @@ struct TokenConflictStatus { does_match_valid_continuation: bool, does_match_separators: bool, matches_same_string: bool, + matches_different_string: bool, } pub(crate) struct TokenConflictMap<'a> { @@ -25,6 +26,12 @@ pub(crate) struct TokenConflictMap<'a> { } impl<'a> TokenConflictMap<'a> { + /// Create a token conflict map based on a lexical grammar, which describes the structure + /// each token, and a `following_token` map, which indicates which tokens may be appear + /// immediately after each other token. + /// + /// This analyzes the possible kinds of overlap between each pair of tokens and stores + /// them in a matrix. pub fn new(grammar: &'a LexicalGrammar, following_tokens: Vec) -> Self { let mut cursor = NfaCursor::new(&grammar.nfa, Vec::new()); let starting_chars = get_starting_chars(&mut cursor, grammar); @@ -50,12 +57,21 @@ impl<'a> TokenConflictMap<'a> { } } + /// Does token `i` match any strings that token `j` also matches, such that token `i` + /// is preferred over token `j`? pub fn has_same_conflict_status(&self, a: usize, b: usize, other: usize) -> bool { let left = &self.status_matrix[matrix_index(self.n, a, other)]; let right = &self.status_matrix[matrix_index(self.n, b, other)]; left == right } + /// Does token `i` match any strings that token `j` does *not* match? + pub fn does_match_different_string(&self, i: usize, j: usize) -> bool { + self.status_matrix[matrix_index(self.n, i, j)].matches_different_string + } + + /// Does token `i` match any strings that token `j` also matches, where + /// token `i` is preferred over token `j`? pub fn does_match_same_string(&self, i: usize, j: usize) -> bool { self.status_matrix[matrix_index(self.n, i, j)].matches_same_string } @@ -67,6 +83,7 @@ impl<'a> TokenConflictMap<'a> { || entry.matches_same_string } + /// Does token `i` match any strings that are *prefixes* of strings matched by `j`? pub fn does_match_prefix(&self, i: usize, j: usize) -> bool { self.status_matrix[matrix_index(self.n, i, j)].matches_prefix } @@ -239,19 +256,29 @@ fn compute_conflict_status( ); while let Some(state_set) = state_set_queue.pop() { - // Don't pursue states where there's no potential for conflict. - if grammar.variable_indices_for_nfa_states(&state_set).count() > 1 { - cursor.reset(state_set); - } else { + let mut live_variable_indices = grammar.variable_indices_for_nfa_states(&state_set); + + // If only one of the two tokens could possibly match from this state, then + // there is no reason to analyze any of its successors. Just record the fact + // that the token matches a string that the other token does not match. + let first_live_variable_index = live_variable_indices.next().unwrap(); + if live_variable_indices.count() == 0 { + if first_live_variable_index == i { + result.0.matches_different_string = true; + } else { + result.1.matches_different_string = true; + } continue; } - let has_sep = cursor.transition_chars().any(|(_, sep)| sep); + // Don't pursue states where there's no potential for conflict. + cursor.reset(state_set); + let within_separator = cursor.transition_chars().any(|(_, sep)| sep); // Examine each possible completed token in this state. let mut completion = None; for (id, precedence) in cursor.completions() { - if has_sep { + if within_separator { if id == i { result.0.does_match_separators = true; } else { @@ -316,7 +343,7 @@ fn compute_conflict_status( &transition, completed_id, completed_precedence, - has_sep, + within_separator, ) { can_advance = true; if advanced_id == i {