Move code into cli directory

2019-01-04 16:50:52 -08:00 · 2019-01-04 16:50:52 -08:00 · 5b0e12ea33
commit 5b0e12ea33
parent b8dd5d2640
29 changed files with 32 additions and 26 deletions
--- a/cli/src/build_tables/token_conflicts.rs
+++ b/cli/src/build_tables/token_conflicts.rs
@ -0,0 +1,382 @@
+use crate::build_tables::item::LookaheadSet;
+use crate::grammars::LexicalGrammar;
+use crate::nfa::{CharacterSet, NfaCursor, NfaTransition};
+use hashbrown::HashSet;
+use std::cmp::Ordering;
+use std::fmt;
+
+#[derive(Clone, Debug, Default, PartialEq, Eq)]
+struct TokenConflictStatus {
+    does_overlap: bool,
+    does_match_valid_continuation: bool,
+    does_match_separators: bool,
+    matches_same_string: bool,
+}
+
+pub(crate) struct TokenConflictMap<'a> {
+    n: usize,
+    status_matrix: Vec<TokenConflictStatus>,
+    starting_chars_by_index: Vec<CharacterSet>,
+    following_chars_by_index: Vec<CharacterSet>,
+    grammar: &'a LexicalGrammar,
+}
+
+impl<'a> TokenConflictMap<'a> {
+    pub fn new(grammar: &'a LexicalGrammar, following_tokens: Vec<LookaheadSet>) -> Self {
+        let mut cursor = NfaCursor::new(&grammar.nfa, Vec::new());
+        let starting_chars = get_starting_chars(&mut cursor, grammar);
+        let following_chars = get_following_chars(&starting_chars, following_tokens);
+
+        let n = grammar.variables.len();
+        let mut status_matrix = vec![TokenConflictStatus::default(); n * n];
+        for i in 0..grammar.variables.len() {
+            for j in 0..i {
+                let status = compute_conflict_status(&mut cursor, grammar, &following_chars, i, j);
+                status_matrix[matrix_index(n, i, j)] = status.0;
+                status_matrix[matrix_index(n, j, i)] = status.1;
+            }
+        }
+
+        TokenConflictMap {
+            n,
+            status_matrix,
+            starting_chars_by_index: starting_chars,
+            following_chars_by_index: following_chars,
+            grammar,
+        }
+    }
+
+    pub fn has_same_conflict_status(&self, a: usize, b: usize, other: usize) -> bool {
+        let left = &self.status_matrix[matrix_index(self.n, a, other)];
+        let right = &self.status_matrix[matrix_index(self.n, b, other)];
+        left == right
+    }
+
+    pub fn does_match_same_string(&self, i: usize, j: usize) -> bool {
+        self.status_matrix[matrix_index(self.n, i, j)].matches_same_string
+    }
+
+    pub fn does_conflict(&self, i: usize, j: usize) -> bool {
+        let entry = &self.status_matrix[matrix_index(self.n, i, j)];
+        entry.does_match_valid_continuation || entry.does_match_separators
+    }
+
+    pub fn does_overlap(&self, i: usize, j: usize) -> bool {
+        self.status_matrix[matrix_index(self.n, i, j)].does_overlap
+    }
+
+    pub fn prefer_token(grammar: &LexicalGrammar, left: (i32, usize), right: (i32, usize)) -> bool {
+        if left.0 > right.0 {
+            return true;
+        } else if left.0 < right.0 {
+            return false;
+        }
+
+        match grammar.variables[left.1]
+            .implicit_precedence
+            .cmp(&grammar.variables[right.1].implicit_precedence)
+        {
+            Ordering::Less => false,
+            Ordering::Greater => true,
+            Ordering::Equal => left.1 < right.1,
+        }
+    }
+}
+
+impl<'a> fmt::Debug for TokenConflictMap<'a> {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        write!(f, "TokenConflictMap {{\n")?;
+
+        write!(f, "  starting_characters: {{\n")?;
+        for i in 0..self.n {
+            write!(f, "    {}: {:?},\n", i, self.starting_chars_by_index[i])?;
+        }
+        write!(f, "  }},\n")?;
+
+        write!(f, "  following_characters: {{\n")?;
+        for i in 0..self.n {
+            write!(
+                f,
+                "    {}: {:?},\n",
+                self.grammar.variables[i].name, self.following_chars_by_index[i]
+            )?;
+        }
+        write!(f, "  }},\n")?;
+
+        write!(f, "  status_matrix: {{\n")?;
+        for i in 0..self.n {
+            write!(f, "    {}: {{\n", self.grammar.variables[i].name)?;
+            for j in 0..self.n {
+                write!(
+                    f,
+                    "      {}: {:?},\n",
+                    self.grammar.variables[j].name,
+                    self.status_matrix[matrix_index(self.n, i, j)]
+                )?;
+            }
+            write!(f, "    }},\n")?;
+        }
+        write!(f, "  }},")?;
+        write!(f, "}}")?;
+        Ok(())
+    }
+}
+
+fn matrix_index(variable_count: usize, i: usize, j: usize) -> usize {
+    variable_count * i + j
+}
+
+fn get_starting_chars(cursor: &mut NfaCursor, grammar: &LexicalGrammar) -> Vec<CharacterSet> {
+    let mut result = Vec::with_capacity(grammar.variables.len());
+    for variable in &grammar.variables {
+        cursor.reset(vec![variable.start_state]);
+        let mut all_chars = CharacterSet::empty();
+        for (chars, _) in cursor.transition_chars() {
+            all_chars = all_chars.add(chars);
+        }
+        result.push(all_chars);
+    }
+    result
+}
+
+fn get_following_chars(
+    starting_chars: &Vec<CharacterSet>,
+    following_tokens: Vec<LookaheadSet>,
+) -> Vec<CharacterSet> {
+    following_tokens
+        .into_iter()
+        .map(|following_tokens| {
+            let mut chars = CharacterSet::empty();
+            for token in following_tokens.iter() {
+                if token.is_terminal() {
+                    chars = chars.add(&starting_chars[token.index]);
+                }
+            }
+            chars
+        })
+        .collect()
+}
+
+fn compute_conflict_status(
+    cursor: &mut NfaCursor,
+    grammar: &LexicalGrammar,
+    following_chars: &Vec<CharacterSet>,
+    i: usize,
+    j: usize,
+) -> (TokenConflictStatus, TokenConflictStatus) {
+    let mut visited_state_sets = HashSet::new();
+    let mut state_set_queue = vec![vec![
+        grammar.variables[i].start_state,
+        grammar.variables[j].start_state,
+    ]];
+    let mut result = (
+        TokenConflictStatus::default(),
+        TokenConflictStatus::default(),
+    );
+
+    while let Some(state_set) = state_set_queue.pop() {
+        // Don't pursue states where there's no potential for conflict.
+        if variable_ids_for_states(&state_set, grammar).count() > 1 {
+            cursor.reset(state_set);
+        } else {
+            continue;
+        }
+
+        let mut completion = None;
+        for (id, precedence) in cursor.completions() {
+            if let Some((prev_id, prev_precedence)) = completion {
+                if id == prev_id {
+                    continue;
+                }
+
+                // Prefer tokens with higher precedence. For tokens with equal precedence,
+                // prefer those listed earlier in the grammar.
+                let winning_id;
+                if TokenConflictMap::prefer_token(
+                    grammar,
+                    (prev_precedence, prev_id),
+                    (precedence, id),
+                ) {
+                    winning_id = prev_id;
+                } else {
+                    winning_id = id;
+                    completion = Some((id, precedence));
+                }
+
+                if winning_id == i {
+                    result.0.matches_same_string = true;
+                    result.0.does_overlap = true;
+                } else {
+                    result.1.matches_same_string = true;
+                    result.1.does_overlap = true;
+                }
+            } else {
+                completion = Some((id, precedence));
+            }
+        }
+
+        for NfaTransition {
+            characters,
+            precedence,
+            states,
+            is_separator,
+        } in cursor.transitions()
+        {
+            let mut can_advance = true;
+            if let Some((completed_id, completed_precedence)) = completion {
+                let mut other_id = None;
+                let mut successor_contains_completed_id = false;
+                for variable_id in variable_ids_for_states(&states, grammar) {
+                    if variable_id == completed_id {
+                        successor_contains_completed_id = true;
+                        break;
+                    } else {
+                        other_id = Some(variable_id);
+                    }
+                }
+
+                if let (Some(other_id), false) = (other_id, successor_contains_completed_id) {
+                    let winning_id;
+                    if precedence < completed_precedence {
+                        winning_id = completed_id;
+                        can_advance = false;
+                    } else {
+                        winning_id = other_id;
+                    }
+
+                    if winning_id == i {
+                        result.0.does_overlap = true;
+                        if characters.does_intersect(&following_chars[j]) {
+                            result.0.does_match_valid_continuation = true;
+                        }
+                        if is_separator {
+                            result.0.does_match_separators = true;
+                        }
+                    } else {
+                        result.1.does_overlap = true;
+                        if characters.does_intersect(&following_chars[i]) {
+                            result.1.does_match_valid_continuation = true;
+                        }
+                    }
+                }
+            }
+
+            if can_advance && visited_state_sets.insert(states.clone()) {
+                state_set_queue.push(states);
+            }
+        }
+    }
+    result
+}
+
+fn variable_ids_for_states<'a>(
+    state_ids: &'a Vec<u32>,
+    grammar: &'a LexicalGrammar,
+) -> impl Iterator<Item = usize> + 'a {
+    let mut prev = None;
+    state_ids.iter().filter_map(move |state_id| {
+        let variable_id = grammar.variable_index_for_nfa_state(*state_id);
+        if prev != Some(variable_id) {
+            prev = Some(variable_id);
+            prev
+        } else {
+            None
+        }
+    })
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::grammars::{Variable, VariableType};
+    use crate::prepare_grammar::{expand_tokens, ExtractedLexicalGrammar};
+    use crate::rules::{Rule, Symbol};
+
+    #[test]
+    fn test_starting_characters() {
+        let grammar = expand_tokens(ExtractedLexicalGrammar {
+            separators: Vec::new(),
+            variables: vec![
+                Variable {
+                    name: "token_0".to_string(),
+                    kind: VariableType::Named,
+                    rule: Rule::pattern("[a-f]1|0x\\d"),
+                },
+                Variable {
+                    name: "token_1".to_string(),
+                    kind: VariableType::Named,
+                    rule: Rule::pattern("d*ef"),
+                },
+            ],
+        })
+        .unwrap();
+
+        let token_map = TokenConflictMap::new(&grammar, Vec::new());
+
+        assert_eq!(
+            token_map.starting_chars_by_index[0],
+            CharacterSet::empty().add_range('a', 'f').add_char('0')
+        );
+        assert_eq!(
+            token_map.starting_chars_by_index[1],
+            CharacterSet::empty().add_range('d', 'e')
+        );
+    }
+
+    #[test]
+    fn test_token_conflicts() {
+        let grammar = expand_tokens(ExtractedLexicalGrammar {
+            separators: Vec::new(),
+            variables: vec![
+                Variable {
+                    name: "in".to_string(),
+                    kind: VariableType::Named,
+                    rule: Rule::string("in"),
+                },
+                Variable {
+                    name: "identifier".to_string(),
+                    kind: VariableType::Named,
+                    rule: Rule::pattern("\\w+"),
+                },
+                Variable {
+                    name: "instanceof".to_string(),
+                    kind: VariableType::Named,
+                    rule: Rule::string("instanceof"),
+                },
+            ],
+        })
+        .unwrap();
+
+        let var = |name| index_of_var(&grammar, name);
+
+        let token_map = TokenConflictMap::new(
+            &grammar,
+            vec![
+                LookaheadSet::with([Symbol::terminal(var("identifier"))].iter().cloned()),
+                LookaheadSet::with([Symbol::terminal(var("in"))].iter().cloned()),
+                LookaheadSet::with([Symbol::terminal(var("identifier"))].iter().cloned()),
+            ],
+        );
+
+        // Given the string "in", the `in` token is preferred over the `identifier` token
+        assert!(token_map.does_match_same_string(var("in"), var("identifier")));
+        assert!(!token_map.does_match_same_string(var("identifier"), var("in")));
+
+        // Depending on what character follows, the string "in" may be treated as part of an
+        // `identifier` token.
+        assert!(token_map.does_conflict(var("identifier"), var("in")));
+
+        // Depending on what character follows, the string "instanceof" may be treated as part of
+        // an `identifier` token.
+        assert!(token_map.does_conflict(var("identifier"), var("instanceof")));
+        assert!(token_map.does_conflict(var("instanceof"), var("in")));
+    }
+
+    fn index_of_var(grammar: &LexicalGrammar, name: &str) -> usize {
+        grammar
+            .variables
+            .iter()
+            .position(|v| v.name == name)
+            .unwrap()
+    }
+}