Add handling of precedence within tokens

2018-12-29 13:56:00 -08:00 · 2018-12-29 13:56:00 -08:00 · 479400e5d3
commit 479400e5d3
parent 5258ee2e6a
3 changed files with 670 additions and 267 deletions
--- a/src/nfa.rs
+++ b/src/nfa.rs
@ -1,5 +1,8 @@
-use std::fmt;
 use std::char;
+use std::cmp::max;
+use std::cmp::Ordering;
+use std::fmt;
+use std::mem::swap;

 #[derive(Clone, Debug, PartialEq, Eq, Hash)]
 pub enum CharacterSet {
@ -13,14 +16,18 @@ pub enum NfaState {
        chars: CharacterSet,
        state_id: u32,
        is_sep: bool,
+        precedence: i32,
    },
    Split(u32, u32),
-    Accept(usize),
+    Accept {
+        variable_index: usize,
+        precedence: i32,
+    },
 }

+/// A nondeterministic finite automaton stored as a flat list of states.
+/// States refer to one another by their `u32` index into `states`.
 #[derive(PartialEq, Eq)]
 pub struct Nfa {
-    pub states: Vec<NfaState>
+    pub states: Vec<NfaState>,
 }

 impl Default for Nfa {
@ -78,14 +85,57 @@ impl CharacterSet {
        }
    }

-    pub fn add(self, other: CharacterSet) -> Self {
-        if let (CharacterSet::Include(mut chars), CharacterSet::Include(other_chars)) = (self, other) {
-            chars.extend(other_chars);
-            chars.sort_unstable();
-            chars.dedup();
-            CharacterSet::Include(chars)
+    /// Merges the characters of `other` into this set and returns the result.
+    ///
+    /// The combined character list is sorted and de-duplicated.
+    ///
+    /// # Panics
+    ///
+    /// Panics if either `self` or `other` is not a `CharacterSet::Include`.
+    pub fn add(self, other: &CharacterSet) -> Self {
+        let extra = match other {
+            CharacterSet::Include(other_chars) => other_chars,
+            _ => panic!("Called add with a negated character set"),
+        };
+        let mut merged = match self {
+            CharacterSet::Include(chars) => chars,
+            _ => panic!("Called add with a negated character set"),
+        };
+        merged.extend(extra);
+        merged.sort_unstable();
+        merged.dedup();
+        CharacterSet::Include(merged)
+    }
+
+    /// Splits `self` and `other` into three disjoint pieces.
+    ///
+    /// Returns the set of characters matched by both operands and removes
+    /// those characters from `self` and from `other`, so that afterwards the
+    /// two operands and the returned set do not overlap.
+    ///
+    /// An operand may change representation: when both operands are
+    /// `Exclude` sets, each remainder is finite and is rewritten as an
+    /// `Include` set.
+    pub fn remove_intersection(&mut self, other: &mut CharacterSet) -> CharacterSet {
+        match self {
+            CharacterSet::Include(chars) => match other {
+                CharacterSet::Include(other_chars) => {
+                    CharacterSet::Include(remove_chars(chars, other_chars, true))
+                }
+                CharacterSet::Exclude(other_chars) => {
+                    // After this call `chars` holds the intersection (the characters
+                    // that `other` does not exclude) and `removed` holds the rest.
+                    let mut removed = remove_chars(chars, other_chars, false);
+                    // `other` must stop matching the intersection.
+                    add_chars(other_chars, chars);
+                    // Leave the non-shared characters in `self`; return the intersection.
+                    swap(&mut removed, chars);
+                    CharacterSet::Include(removed)
+                }
+            },
+            CharacterSet::Exclude(chars) => match other {
+                CharacterSet::Include(other_chars) => {
+                    // Mirror image of the Include/Exclude case above.
+                    let mut removed = remove_chars(other_chars, chars, false);
+                    add_chars(chars, other_chars);
+                    swap(&mut removed, other_chars);
+                    CharacterSet::Include(removed)
+                }
+                CharacterSet::Exclude(other_chars) => {
+                    // A character is matched by both negated sets only if neither
+                    // excludes it, so the intersection excludes the *union* of the
+                    // two exclusion lists (not merely the characters they share).
+                    let mut excluded = chars.clone();
+                    excluded.extend(other_chars.iter().cloned());
+                    excluded.sort_unstable();
+                    excluded.dedup();
+
+                    // Drop the shared exclusions; what is left in each list is
+                    // exactly what the *other* operand still matches.
+                    remove_chars(chars, other_chars, true);
+                    let mut included_characters = Vec::new();
+                    let mut other_included_characters = Vec::new();
+                    swap(&mut included_characters, other_chars);
+                    swap(&mut other_included_characters, chars);
+                    *self = CharacterSet::Include(included_characters);
+                    *other = CharacterSet::Include(other_included_characters);
+                    CharacterSet::Exclude(excluded)
+                }
+            },
+        }
+    }
+
+    /// Returns `true` only for an `Include` set that lists no characters.
+    /// Any other set (in particular every `Exclude` set) is reported as
+    /// non-empty.
+    pub fn is_empty(&self) -> bool {
+        if let CharacterSet::Include(c) = self {
+            c.is_empty()
        } else {
-            panic!("Called add with a negated character set");
+            false
        }
    }

@ -97,6 +147,84 @@ impl CharacterSet {
    }
 }

+/// Total order on character sets: every `Include` set sorts before every
+/// `Exclude` set, and two sets of the same kind are ordered by
+/// `compare_chars` applied to their character lists.
+impl Ord for CharacterSet {
+    fn cmp(&self, other: &CharacterSet) -> Ordering {
+        match (self, other) {
+            (CharacterSet::Include(a), CharacterSet::Include(b)) => compare_chars(a, b),
+            (CharacterSet::Exclude(a), CharacterSet::Exclude(b)) => compare_chars(a, b),
+            (CharacterSet::Include(_), CharacterSet::Exclude(_)) => Ordering::Less,
+            (CharacterSet::Exclude(_), CharacterSet::Include(_)) => Ordering::Greater,
+        }
+    }
+}
+
+/// Delegates to the total order defined by `Ord`, so `partial_cmp` always
+/// returns `Some`.
+impl PartialOrd for CharacterSet {
+    fn partial_cmp(&self, other: &CharacterSet) -> Option<Ordering> {
+        Some(self.cmp(other))
+    }
+}
+
+/// Inserts every character of `right` that is missing from `left`.
+///
+/// `left` must already be sorted in ascending order: insertion points are
+/// located by binary search, which also keeps `left` sorted.
+fn add_chars(left: &mut Vec<char>, right: &[char]) {
+    for c in right {
+        if let Err(i) = left.binary_search(c) {
+            left.insert(i, *c);
+        }
+    }
+}
+
+/// Removes from `left` every character that also occurs in `right` and
+/// returns the removed characters, in the order they appear in `right`.
+///
+/// When `mutate_right` is `true` the shared characters are removed from
+/// `right` as well; otherwise `right` is left unchanged.
+fn remove_chars(left: &mut Vec<char>, right: &mut Vec<char>, mutate_right: bool) -> Vec<char> {
+    let mut result = Vec::new();
+    right.retain(|right_char| {
+        match left.iter().position(|left_char| left_char == right_char) {
+            Some(index) => {
+                left.remove(index);
+                result.push(*right_char);
+                // Keep the shared character in `right` unless asked to strip it.
+                !mutate_right
+            }
+            None => true,
+        }
+    });
+    result
+}
+
+/// Orders two character lists lexicographically: the first differing
+/// character decides, and when one list is a prefix of the other the
+/// shorter list sorts first.
+fn compare_chars(chars: &[char], other_chars: &[char]) -> Ordering {
+    // Slices already implement exactly this lexicographic ordering.
+    chars.cmp(other_chars)
+}
+
 impl Nfa {
    pub fn new() -> Self {
        Nfa { states: Vec::new() }
@ -124,17 +252,32 @@ impl fmt::Debug for Nfa {

 impl<'a> NfaCursor<'a> {
+    /// Creates a cursor over `nfa` with `in_sep` initially `true`, then seeds
+    /// it by passing `states` to `add_states`.
    pub fn new(nfa: &'a Nfa, mut states: Vec<u32>) -> Self {
-        let mut result = Self { nfa, state_ids: Vec::new(), in_sep: true };
+        let mut result = Self {
+            nfa,
+            state_ids: Vec::new(),
+            in_sep: true,
+        };
        result.add_states(&mut states);
        result
    }

+    /// Discards the cursor's current state ids and re-seeds it by passing
+    /// `states` to `add_states`.
+    ///
+    /// NOTE(review): `in_sep` is not assigned directly here, whereas `new`
+    /// starts it at `true` — confirm that carrying the old value over is
+    /// intended.
+    pub fn reset(&mut self, mut states: Vec<u32>) {
+        self.state_ids.clear();
+        self.add_states(&mut states);
+    }
+
    pub fn advance(&mut self, c: char) -> bool {
        let mut result = false;
        let mut new_state_ids = Vec::new();
        let mut any_sep_transitions = false;
        for current_state_id in &self.state_ids {
-            if let NfaState::Advance { chars, state_id, is_sep } = &self.nfa.states[*current_state_id as usize] {
+            if let NfaState::Advance {
+                chars,
+                state_id,
+                is_sep,
+                ..
+            } = &self.nfa.states[*current_state_id as usize]
+            {
                if chars.contains(c) {
                    if *is_sep {
                        any_sep_transitions = true;
@ -152,16 +295,68 @@ impl<'a> NfaCursor<'a> {
        result
    }

-    pub fn finished_id(&self) -> Option<usize> {
+    /// Iterates over the outgoing transitions of the cursor's current states.
+    ///
+    /// Only `NfaState::Advance` states contribute; each one yields its
+    /// character set, its precedence and the id of the state it leads to.
+    pub fn successors(&self) -> impl Iterator<Item = (&CharacterSet, i32, u32)> {
+        self.state_ids
+            .iter()
+            .filter_map(move |id| match &self.nfa.states[*id as usize] {
+                NfaState::Advance {
+                    chars,
+                    state_id,
+                    precedence,
+                    ..
+                } => Some((chars, *precedence, *state_id)),
+                _ => None,
+            })
+    }
+
+    /// Returns the result of `group_successors` applied to this cursor's
+    /// `successors()`: one `(character set, precedence, target state ids)`
+    /// entry per group.
+    pub fn grouped_successors(&self) -> Vec<(CharacterSet, i32, Vec<u32>)> {
+        Self::group_successors(self.successors())
+    }
+
+    /// Partitions `(characters, precedence, target state)` transitions into
+    /// groups whose character sets do not overlap.
+    ///
+    /// Each group lists every target state reachable on its characters and
+    /// carries the highest precedence among the transitions that contributed
+    /// to it. The groups are returned sorted by character set.
+    fn group_successors<'b>(
+        iter: impl Iterator<Item = (&'b CharacterSet, i32, u32)>,
+    ) -> Vec<(CharacterSet, i32, Vec<u32>)> {
+        let mut result: Vec<(CharacterSet, i32, Vec<u32>)> = Vec::new();
+        for (chars, prec, state) in iter {
+            let mut chars = chars.clone();
+            let mut i = 0;
+            while i < result.len() {
+                let intersection = result[i].0.remove_intersection(&mut chars);
+                if !intersection.is_empty() {
+                    if result[i].0.is_empty() {
+                        // The existing group was entirely covered by the new
+                        // transition: reuse its entry rather than leaving a
+                        // group with an empty character set behind.
+                        result[i].0 = intersection;
+                        result[i].1 = max(result[i].1, prec);
+                        result[i].2.push(state);
+                    } else {
+                        let precedence = max(result[i].1, prec);
+                        let mut states = result[i].2.clone();
+                        states.push(state);
+                        result.insert(i, (intersection, precedence, states));
+                        // Step over the group that was just inserted.
+                        i += 1;
+                    }
+                }
+                i += 1;
+            }
+            // Whatever no existing group claimed becomes a group of its own.
+            if !chars.is_empty() {
+                result.push((chars, prec, vec![state]));
+            }
+        }
+        result.sort_unstable_by(|a, b| a.0.cmp(&b.0));
+        result
+    }
+
+    pub fn finished_id(&self) -> Option<(usize, i32)> {
        let mut result = None;
        for state_id in self.state_ids.iter() {
-            if let NfaState::Accept(id) = self.nfa.states[*state_id as usize] {
+            if let NfaState::Accept {
+                variable_index,
+                precedence,
+            } = self.nfa.states[*state_id as usize]
+            {
                match result {
-                    None => {
-                        result = Some(id)
-                    },
-                    Some(existing_id) => if id < existing_id {
-                        result = Some(id)
+                    None => result = Some((variable_index, precedence)),
+                    Some((existing_id, existing_precedence)) => {
+                        if precedence > existing_precedence
+                            || (precedence == existing_precedence && variable_index < existing_id)
+                        {
+                            result = Some((variable_index, precedence))
+                        }
                    }
                }
            }
@ -202,3 +397,136 @@ impl<'a> NfaCursor<'a> {
        }
    }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_group_successors() {
+        // Each row is (input transitions, expected disjoint groups).
+        let table = [
+            (
+                vec![
+                    (CharacterSet::empty().add_range('a', 'f'), 0, 1),
+                    (CharacterSet::empty().add_range('d', 'i'), 1, 2),
+                ],
+                vec![
+                    (CharacterSet::empty().add_range('a', 'c'), 0, vec![1]),
+                    (CharacterSet::empty().add_range('d', 'f'), 1, vec![1, 2]),
+                    (CharacterSet::empty().add_range('g', 'i'), 1, vec![2]),
+                ],
+            ),
+            (
+                vec![
+                    (CharacterSet::empty().add_range('a', 'z'), 0, 1),
+                    (CharacterSet::empty().add_char('d'), 0, 2),
+                    (CharacterSet::empty().add_char('i'), 0, 3),
+                    (CharacterSet::empty().add_char('f'), 0, 4),
+                ],
+                vec![
+                    (
+                        CharacterSet::empty()
+                            .add_range('a', 'c')
+                            .add_char('e')
+                            .add_range('g', 'h')
+                            .add_range('j', 'z'),
+                        0,
+                        vec![1],
+                    ),
+                    (CharacterSet::empty().add_char('d'), 0, vec![1, 2]),
+                    (CharacterSet::empty().add_char('f'), 0, vec![1, 4]),
+                    (CharacterSet::empty().add_char('i'), 0, vec![1, 3]),
+                ],
+            ),
+        ];
+
+        for row in table.iter() {
+            assert_eq!(
+                NfaCursor::group_successors(row.0.iter().map(|(c, p, s)| (c, *p, *s))),
+                row.1
+            );
+        }
+    }
+
+    #[test]
+    fn test_character_set_intersection() {
+        // whitelist - whitelist
+        // both sets contain 'c', 'd', 'e', and 'f'
+        let mut a = CharacterSet::empty().add_range('a', 'f');
+        let mut b = CharacterSet::empty().add_range('c', 'h');
+        assert_eq!(
+            a.remove_intersection(&mut b),
+            CharacterSet::empty().add_range('c', 'f')
+        );
+        assert_eq!(a, CharacterSet::empty().add_range('a', 'b'));
+        assert_eq!(b, CharacterSet::empty().add_range('g', 'h'));
+
+        let mut a = CharacterSet::empty().add_range('a', 'f');
+        let mut b = CharacterSet::empty().add_range('c', 'h');
+        assert_eq!(
+            b.remove_intersection(&mut a),
+            CharacterSet::empty().add_range('c', 'f')
+        );
+        assert_eq!(a, CharacterSet::empty().add_range('a', 'b'));
+        assert_eq!(b, CharacterSet::empty().add_range('g', 'h'));
+
+        // whitelist - blacklist
+        // both sets contain 'e', 'f', and 'm'
+        let mut a = CharacterSet::empty()
+            .add_range('c', 'h')
+            .add_range('k', 'm');
+        let mut b = CharacterSet::empty()
+            .add_range('a', 'd')
+            .add_range('g', 'l')
+            .negate();
+        assert_eq!(
+            a.remove_intersection(&mut b),
+            CharacterSet::Include(vec!['e', 'f', 'm'])
+        );
+        assert_eq!(a, CharacterSet::Include(vec!['c', 'd', 'g', 'h', 'k', 'l']));
+        assert_eq!(b, CharacterSet::empty().add_range('a', 'm').negate());
+
+        let mut a = CharacterSet::empty()
+            .add_range('c', 'h')
+            .add_range('k', 'm');
+        let mut b = CharacterSet::empty()
+            .add_range('a', 'd')
+            .add_range('g', 'l')
+            .negate();
+        assert_eq!(
+            b.remove_intersection(&mut a),
+            CharacterSet::Include(vec!['e', 'f', 'm'])
+        );
+        assert_eq!(a, CharacterSet::Include(vec!['c', 'd', 'g', 'h', 'k', 'l']));
+        assert_eq!(b, CharacterSet::empty().add_range('a', 'm').negate());
+
+        // blacklist - blacklist
+        // both sets exclude 'c', 'd', and 'e'. A character is matched by both
+        // sets only if neither excludes it, so the intersection excludes the
+        // union of the two lists ('a' through 'h').
+        let mut a = CharacterSet::empty().add_range('a', 'e').negate();
+        let mut b = CharacterSet::empty().add_range('c', 'h').negate();
+        assert_eq!(
+            a.remove_intersection(&mut b),
+            CharacterSet::Exclude(vec!['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'])
+        );
+        assert_eq!(a, CharacterSet::Include(vec!['f', 'g', 'h']));
+        assert_eq!(b, CharacterSet::Include(vec!['a', 'b']));
+    }
+}
--- a/src/prepare_grammar/expand_tokens.rs
+++ b/src/prepare_grammar/expand_tokens.rs
@ -7,8 +7,18 @@ use regex_syntax::ast::{
    parse, Ast, Class, ClassPerlKind, ClassSet, ClassSetItem, RepetitionKind, RepetitionRange,
 };

-pub(super) fn expand_tokens(mut grammar: ExtractedLexicalGrammar) -> Result<LexicalGrammar> {
-    let mut nfa = Nfa::new();
+/// Mutable state threaded through the rule/regex expansion that builds the
+/// lexical NFA.
+struct NfaBuilder {
+    // The automaton under construction; `expand_tokens` returns it inside the
+    // resulting `LexicalGrammar`.
+    nfa: Nfa,
+    // Set to `false` by `expand_tokens` before expanding a token's own rule and
+    // to `true` before expanding the separator rule.
+    // NOTE(review): presumably copied into each `NfaState::Advance` that is
+    // pushed — confirm against `push_advance`.
+    is_sep: bool,
+    // Starts as `[0]`. A `Rule::Metadata` that carries a precedence pushes it
+    // while its inner rule is expanded and pops it afterwards.
+    precedence_stack: Vec<i32>,
+}
+
+pub(crate) fn expand_tokens(mut grammar: ExtractedLexicalGrammar) -> Result<LexicalGrammar> {
+    let mut builder = NfaBuilder {
+        nfa: Nfa::new(),
+        is_sep: true,
+        precedence_stack: vec![0],
+    };

    let separator_rule = if grammar.separators.len() > 0 {
        grammar.separators.push(Rule::Blank);
@ -24,281 +34,325 @@ pub(super) fn expand_tokens(mut grammar: ExtractedLexicalGrammar) -> Result<Lexi
            _ => false,
        };

-        nfa.states.push(NfaState::Accept(i));
-        let last_state_id = nfa.last_state_id();
-        expand_rule(&variable.rule, &mut nfa, last_state_id, false).map_err(|e| match e {
-            Error::RegexError(msg) => Error::RegexError(format!("Rule {} {}", variable.name, msg)),
-            _ => e,
-        })?;
+        builder.is_sep = false;
+        builder.nfa.states.push(NfaState::Accept {
+            variable_index: i,
+            precedence: 0,
+        });
+        let last_state_id = builder.nfa.last_state_id();
+        builder
+            .expand_rule(&variable.rule, last_state_id)
+            .map_err(|e| match e {
+                Error::RegexError(msg) => {
+                    Error::RegexError(format!("Rule {} {}", variable.name, msg))
+                }
+                _ => e,
+            })?;

        if !is_immediate_token {
-            let last_state_id = nfa.last_state_id();
-            expand_rule(&separator_rule, &mut nfa, last_state_id, true)?;
+            builder.is_sep = true;
+            let last_state_id = builder.nfa.last_state_id();
+            builder.expand_rule(&separator_rule, last_state_id)?;
        }

        variables.push(LexicalVariable {
            name: variable.name,
            kind: variable.kind,
-            start_state: nfa.last_state_id(),
+            start_state: builder.nfa.last_state_id(),
        });
    }

-    Ok(LexicalGrammar { nfa, variables })
+    Ok(LexicalGrammar {
+        nfa: builder.nfa,
+        variables,
+    })
 }

-fn expand_rule(rule: &Rule, nfa: &mut Nfa, mut next_state_id: u32, is_sep: bool) -> Result<bool> {
-    match rule {
-        Rule::Pattern(s) => {
-            let ast = parse::Parser::new()
-                .parse(&s)
-                .map_err(|e| Error::GrammarError(e.to_string()))?;
-            expand_regex(&ast, nfa, next_state_id, is_sep)
-        }
-        Rule::String(s) => {
-            for c in s.chars().rev() {
-                nfa.prepend(|last_state_id| NfaState::Advance {
-                    chars: CharacterSet::empty().add_char(c),
-                    state_id: last_state_id,
-                    is_sep,
-                });
+impl NfaBuilder {
+    fn expand_rule(&mut self, rule: &Rule, mut next_state_id: u32) -> Result<bool> {
+        match rule {
+            Rule::Pattern(s) => {
+                let ast = parse::Parser::new()
+                    .parse(&s)
+                    .map_err(|e| Error::GrammarError(e.to_string()))?;
+                self.expand_regex(&ast, next_state_id)
            }
-            Ok(s.len() > 0)
-        }
-        Rule::Choice(elements) => {
-            let mut alternative_state_ids = Vec::new();
-            for element in elements {
-                if expand_rule(element, nfa, next_state_id, is_sep)? {
-                    alternative_state_ids.push(nfa.last_state_id());
-                } else {
-                    alternative_state_ids.push(next_state_id);
+            Rule::String(s) => {
+                for c in s.chars().rev() {
+                    self.push_advance(CharacterSet::empty().add_char(c), self.nfa.last_state_id());
                }
+                Ok(s.len() > 0)
            }
-            alternative_state_ids.retain(|i| *i != nfa.last_state_id());
-            for alternative_state_id in alternative_state_ids {
-                nfa.prepend(|last_state_id| NfaState::Split(last_state_id, alternative_state_id));
-            }
-            Ok(true)
-        }
-        Rule::Seq(elements) => {
-            let mut result = false;
-            for element in elements.into_iter().rev() {
-                if expand_rule(element, nfa, next_state_id, is_sep)? {
-                    result = true;
+            Rule::Choice(elements) => {
+                let mut alternative_state_ids = Vec::new();
+                for element in elements {
+                    if self.expand_rule(element, next_state_id)? {
+                        alternative_state_ids.push(self.nfa.last_state_id());
+                    } else {
+                        alternative_state_ids.push(next_state_id);
+                    }
                }
-                next_state_id = nfa.last_state_id();
-            }
-            Ok(result)
-        }
-        Rule::Repeat(rule) => {
-            nfa.states.push(NfaState::Accept(0)); // Placeholder for split
-            let split_state_id = nfa.last_state_id();
-            if expand_rule(rule, nfa, split_state_id, is_sep)? {
-                nfa.states[split_state_id as usize] =
-                    NfaState::Split(nfa.last_state_id(), next_state_id);
-                Ok(true)
-            } else {
-                Ok(false)
-            }
-        }
-        Rule::Metadata { rule, .. } => {
-            // TODO - implement precedence
-            expand_rule(rule, nfa, next_state_id, is_sep)
-        }
-        Rule::Blank => Ok(false),
-        _ => Err(Error::grammar(&format!("Unexpected rule {:?}", rule))),
-    }
-}
-
-fn expand_one_or_more(ast: &Ast, nfa: &mut Nfa, next_state_id: u32, is_sep: bool) -> Result<bool> {
-    nfa.states.push(NfaState::Accept(0)); // Placeholder for split
-    let split_state_id = nfa.last_state_id();
-    if expand_regex(&ast, nfa, split_state_id, is_sep)? {
-        nfa.states[split_state_id as usize] = NfaState::Split(nfa.last_state_id(), next_state_id);
-        Ok(true)
-    } else {
-        nfa.states.pop();
-        Ok(false)
-    }
-}
-
-fn expand_zero_or_one(ast: &Ast, nfa: &mut Nfa, next_state_id: u32, is_sep: bool) -> Result<bool> {
-    if expand_regex(ast, nfa, next_state_id, is_sep)? {
-        nfa.prepend(|last_state_id| NfaState::Split(next_state_id, last_state_id));
-        Ok(true)
-    } else {
-        Ok(false)
-    }
-}
-
-fn expand_zero_or_more(ast: &Ast, nfa: &mut Nfa, next_state_id: u32, is_sep: bool) -> Result<bool> {
-    if expand_one_or_more(&ast, nfa, next_state_id, is_sep)? {
-        nfa.prepend(|last_state_id| NfaState::Split(last_state_id, next_state_id));
-        Ok(true)
-    } else {
-        Ok(false)
-    }
-}
-
-fn expand_count(
-    ast: &Ast,
-    count: u32,
-    nfa: &mut Nfa,
-    mut next_state_id: u32,
-    is_sep: bool,
-) -> Result<bool> {
-    let mut result = false;
-    for _ in 0..count {
-        if expand_regex(ast, nfa, next_state_id, is_sep)? {
-            result = true;
-            next_state_id = nfa.last_state_id();
-        }
-    }
-    Ok(result)
-}
-
-fn expand_regex(ast: &Ast, nfa: &mut Nfa, mut next_state_id: u32, is_sep: bool) -> Result<bool> {
-    match ast {
-        Ast::Empty(_) => Ok(false),
-        Ast::Flags(_) => Err(Error::regex("Flags are not supported")),
-        Ast::Literal(literal) => {
-            nfa.states.push(NfaState::Advance {
-                chars: CharacterSet::Include(vec![literal.c]),
-                state_id: next_state_id,
-                is_sep,
-            });
-            Ok(true)
-        }
-        Ast::Dot(_) => {
-            nfa.states.push(NfaState::Advance {
-                chars: CharacterSet::Exclude(vec!['\n']),
-                state_id: next_state_id,
-                is_sep,
-            });
-            Ok(true)
-        }
-        Ast::Assertion(_) => Err(Error::regex("Assertions are not supported")),
-        Ast::Class(class) => match class {
-            Class::Unicode(_) => Err(Error::regex("Unicode character classes are not supported")),
-            Class::Perl(class) => {
-                nfa.states.push(NfaState::Advance {
-                    chars: expand_perl_character_class(&class.kind),
-                    state_id: next_state_id,
-                    is_sep,
-                });
-                Ok(true)
-            }
-            Class::Bracketed(class) => match &class.kind {
-                ClassSet::Item(item) => {
-                    let character_set = expand_character_class(&item)?;
-                    nfa.states.push(NfaState::Advance {
-                        chars: character_set,
-                        state_id: next_state_id,
-                        is_sep,
+                alternative_state_ids.retain(|i| *i != self.nfa.last_state_id());
+                for alternative_state_id in alternative_state_ids {
+                    self.nfa.prepend(|last_state_id| {
+                        NfaState::Split(last_state_id, alternative_state_id)
                    });
-                    Ok(true)
                }
-                ClassSet::BinaryOp(_) => Err(Error::regex(
-                    "Binary operators in character classes aren't supported",
-                )),
-            },
-        },
-        Ast::Repetition(repetition) => match repetition.op.kind {
-            RepetitionKind::ZeroOrOne => {
-                expand_zero_or_one(&repetition.ast, nfa, next_state_id, is_sep)
+                Ok(true)
            }
-            RepetitionKind::OneOrMore => {
-                expand_one_or_more(&repetition.ast, nfa, next_state_id, is_sep)
+            Rule::Seq(elements) => {
+                let mut result = false;
+                for element in elements.into_iter().rev() {
+                    if self.expand_rule(element, next_state_id)? {
+                        result = true;
+                    }
+                    next_state_id = self.nfa.last_state_id();
+                }
+                Ok(result)
            }
-            RepetitionKind::ZeroOrMore => {
-                expand_zero_or_more(&repetition.ast, nfa, next_state_id, is_sep)
-            }
-            RepetitionKind::Range(RepetitionRange::Exactly(count)) => {
-                expand_count(&repetition.ast, count, nfa, next_state_id, is_sep)
-            }
-            RepetitionKind::Range(RepetitionRange::AtLeast(min)) => {
-                if expand_zero_or_more(&repetition.ast, nfa, next_state_id, is_sep)? {
-                    expand_count(&repetition.ast, min, nfa, next_state_id, is_sep)
+            Rule::Repeat(rule) => {
+                self.nfa.states.push(NfaState::Accept {
+                    variable_index: 0,
+                    precedence: 0,
+                }); // Placeholder for split
+                let split_state_id = self.nfa.last_state_id();
+                if self.expand_rule(rule, split_state_id)? {
+                    self.nfa.states[split_state_id as usize] =
+                        NfaState::Split(self.nfa.last_state_id(), next_state_id);
+                    Ok(true)
                } else {
                    Ok(false)
                }
            }
-            RepetitionKind::Range(RepetitionRange::Bounded(min, max)) => {
-                let mut result = expand_count(&repetition.ast, min, nfa, next_state_id, is_sep)?;
-                for _ in min..max {
-                    if result {
-                        next_state_id = nfa.last_state_id();
+            Rule::Metadata { rule, params } => {
+                if let Some(precedence) = params.precedence {
+                    self.precedence_stack.push(precedence);
+                }
+                let result = self.expand_rule(rule, next_state_id);
+                if params.precedence.is_some() {
+                    self.precedence_stack.pop();
+                }
+                result
+            }
+            Rule::Blank => Ok(false),
+            _ => Err(Error::grammar(&format!("Unexpected rule {:?}", rule))),
+        }
+    }
+
+    fn expand_regex(&mut self, ast: &Ast, mut next_state_id: u32) -> Result<bool> {
+        match ast {
+            Ast::Empty(_) => Ok(false),
+            Ast::Flags(_) => Err(Error::regex("Flags are not supported")),
+            Ast::Literal(literal) => {
+                self.push_advance(CharacterSet::Include(vec![literal.c]), next_state_id);
+                Ok(true)
+            }
+            Ast::Dot(_) => {
+                self.push_advance(CharacterSet::Exclude(vec!['\n']), next_state_id);
+                Ok(true)
+            }
+            Ast::Assertion(_) => Err(Error::regex("Assertions are not supported")),
+            Ast::Class(class) => match class {
+                Class::Unicode(_) => {
+                    Err(Error::regex("Unicode character classes are not supported"))
+                }
+                Class::Perl(class) => {
+                    self.push_advance(self.expand_perl_character_class(&class.kind), next_state_id);
+                    Ok(true)
+                }
+                Class::Bracketed(class) => match &class.kind {
+                    ClassSet::Item(item) => {
+                        self.push_advance(self.expand_character_class(&item)?, next_state_id);
+                        Ok(true)
                    }
-                    if expand_zero_or_one(&repetition.ast, nfa, next_state_id, is_sep)? {
+                    ClassSet::BinaryOp(_) => Err(Error::regex(
+                        "Binary operators in character classes aren't supported",
+                    )),
+                },
+            },
+            Ast::Repetition(repetition) => match repetition.op.kind {
+                RepetitionKind::ZeroOrOne => {
+                    self.expand_zero_or_one(&repetition.ast, next_state_id)
+                }
+                RepetitionKind::OneOrMore => {
+                    self.expand_one_or_more(&repetition.ast, next_state_id)
+                }
+                RepetitionKind::ZeroOrMore => {
+                    self.expand_zero_or_more(&repetition.ast, next_state_id)
+                }
+                RepetitionKind::Range(RepetitionRange::Exactly(count)) => {
+                    self.expand_count(&repetition.ast, count, next_state_id)
+                }
+                RepetitionKind::Range(RepetitionRange::AtLeast(min)) => {
+                    if self.expand_zero_or_more(&repetition.ast, next_state_id)? {
+                        self.expand_count(&repetition.ast, min, next_state_id)
+                    } else {
+                        Ok(false)
+                    }
+                }
+                RepetitionKind::Range(RepetitionRange::Bounded(min, max)) => {
+                    let mut result = self.expand_count(&repetition.ast, min, next_state_id)?;
+                    for _ in min..max {
+                        if result {
+                            next_state_id = self.nfa.last_state_id();
+                        }
+                        if self.expand_zero_or_one(&repetition.ast, next_state_id)? {
+                            result = true;
+                        }
+                    }
+                    Ok(result)
+                }
+            },
+            Ast::Group(group) => self.expand_regex(&group.ast, self.nfa.last_state_id()),
+            Ast::Alternation(alternation) => {
+                let mut alternative_state_ids = Vec::new();
+                for ast in alternation.asts.iter() {
+                    if self.expand_regex(&ast, next_state_id)? {
+                        alternative_state_ids.push(self.nfa.last_state_id());
+                    } else {
+                        alternative_state_ids.push(next_state_id);
+                    }
+                }
+                alternative_state_ids.sort_unstable();
+                alternative_state_ids.dedup();
+                alternative_state_ids.retain(|i| *i != self.nfa.last_state_id());
+
+                for alternative_state_id in alternative_state_ids {
+                    self.nfa.prepend(|last_state_id| {
+                        NfaState::Split(last_state_id, alternative_state_id)
+                    });
+                }
+                Ok(true)
+            }
+            Ast::Concat(concat) => {
+                let mut result = false;
+                for ast in concat.asts.iter().rev() {
+                    if self.expand_regex(&ast, next_state_id)? {
                        result = true;
+                        next_state_id = self.nfa.last_state_id();
                    }
                }
                Ok(result)
            }
-        },
-        Ast::Group(group) => expand_regex(&group.ast, nfa, nfa.last_state_id(), is_sep),
-        Ast::Alternation(alternation) => {
-            let mut alternative_state_ids = Vec::new();
-            for ast in alternation.asts.iter() {
-                if expand_regex(&ast, nfa, next_state_id, is_sep)? {
-                    alternative_state_ids.push(nfa.last_state_id());
-                } else {
-                    alternative_state_ids.push(next_state_id);
-                }
-            }
-            alternative_state_ids.retain(|i| *i != nfa.last_state_id());
-            for alternative_state_id in alternative_state_ids {
-                nfa.prepend(|last_state_id| NfaState::Split(last_state_id, alternative_state_id));
-            }
+        }
+    }
+
+    fn expand_one_or_more(&mut self, ast: &Ast, next_state_id: u32) -> Result<bool> { // `ast+`: Ok(true) if states were added, Ok(false) otherwise
+        self.nfa.states.push(NfaState::Accept {
+            variable_index: 0,
+            precedence: 0,
+        }); // Placeholder for split: reserves the slot so the body can be expanded with this id as its successor
+        let split_state_id = self.nfa.last_state_id();
+        if self.expand_regex(&ast, split_state_id)? { // the expanded body continues into the reserved slot
+            self.nfa.states[split_state_id as usize] =
+                NfaState::Split(self.nfa.last_state_id(), next_state_id); // overwrite the placeholder: go to the current last state (presumably the body's entry - confirm) or on to next_state_id
            Ok(true)
+        } else {
+            self.nfa.states.pop(); // nothing was expanded: drop the placeholder again
+            Ok(false)
        }
-        Ast::Concat(concat) => {
-            let mut result = false;
-            for ast in concat.asts.iter().rev() {
-                if expand_regex(&ast, nfa, next_state_id, is_sep)? {
-                    result = true;
-                    next_state_id = nfa.last_state_id();
+    }
+
+    /// Expands the optional form of `ast` (`ast?`) in front of `next_state_id`.
+    ///
+    /// Returns `Ok(true)` when `expand_regex` reported that it expanded
+    /// something for `ast`, in which case a split state is prepended as well;
+    /// returns `Ok(false)` (adding nothing here) otherwise. Errors from
+    /// `expand_regex` are propagated unchanged.
+    fn expand_zero_or_one(&mut self, ast: &Ast, next_state_id: u32) -> Result<bool> {
+        let expanded = self.expand_regex(ast, next_state_id)?;
+        if expanded {
+            // Two ways forward from the new split: skip straight to
+            // `next_state_id`, or go to the state id handed to the closure by
+            // `prepend` (presumably the entry of the states just built).
+            self.nfa
+                .prepend(|body_state_id| NfaState::Split(next_state_id, body_state_id));
+        }
+        Ok(expanded)
+    }
+
+    /// Expands the zero-or-more repetition of `ast` (`ast*`) in front of
+    /// `next_state_id`.
+    ///
+    /// Built on top of `expand_one_or_more`: when that reports an expansion, a
+    /// split state is prepended whose second branch is `next_state_id`, so the
+    /// repetition can be bypassed. Returns `Ok(false)` when nothing was
+    /// expanded; errors are propagated unchanged.
+    fn expand_zero_or_more(&mut self, ast: &Ast, next_state_id: u32) -> Result<bool> {
+        let expanded = self.expand_one_or_more(ast, next_state_id)?;
+        if expanded {
+            // First branch: the state id supplied by `prepend` (presumably the
+            // loop built above); second branch: skip it entirely.
+            self.nfa
+                .prepend(|loop_state_id| NfaState::Split(loop_state_id, next_state_id));
+        }
+        Ok(expanded)
+    }
+
+    /// Expands `ast` `count` times in sequence, ending at `next_state_id`.
+    ///
+    /// Each successful expansion becomes the successor of the copy expanded
+    /// after it. Returns `Ok(true)` if at least one of the expansions reported
+    /// that it expanded something, `Ok(false)` otherwise (including when
+    /// `count` is zero). The first error from `expand_regex` aborts the loop
+    /// and is returned.
+    fn expand_count(&mut self, ast: &Ast, count: u32, mut next_state_id: u32) -> Result<bool> {
+        let mut expanded_any = false;
+        for _ in 0..count {
+            if !self.expand_regex(ast, next_state_id)? {
+                continue;
+            }
+            // Chain the copies: the next one expanded must lead into the
+            // state that is now last in the NFA.
+            next_state_id = self.nfa.last_state_id();
+            expanded_any = true;
+        }
+        Ok(expanded_any)
+    }
+
+    fn expand_character_class(&self, item: &ClassSetItem) -> Result<CharacterSet> { // turns one bracketed-class item into a CharacterSet; Err for unsupported item kinds
+        match item {
+            ClassSetItem::Empty(_) => Ok(CharacterSet::Include(Vec::new())), // empty item: a set including no characters
+            ClassSetItem::Literal(literal) => Ok(CharacterSet::Include(vec![literal.c])), // single character
+            ClassSetItem::Range(range) => {
+                Ok(CharacterSet::empty().add_range(range.start.c, range.end.c)) // range built from its start and end characters
+            }
+            ClassSetItem::Union(union) => {
+                let mut result = CharacterSet::empty();
+                for item in &union.items {
+                    result = result.add(&self.expand_character_class(&item)?); // recurse per member; NOTE(review): `add` panics on a negated set - presumably members never expand to one, confirm
                }
+                Ok(result)
            }
-            Ok(result)
+            ClassSetItem::Perl(class) => Ok(self.expand_perl_character_class(&class.kind)), // NOTE(review): only `class.kind` is used; a negated Perl class (e.g. `\D`) presumably expands like its positive form - confirm
+            _ => Err(Error::regex(&format!( // every other item kind is reported as unsupported
+                "Unsupported character class syntax {:?}",
+                item
+            ))),
        }
    }
-}

-fn expand_character_class(item: &ClassSetItem) -> Result<CharacterSet> {
-    match item {
-        ClassSetItem::Empty(_) => Ok(CharacterSet::Include(Vec::new())),
-        ClassSetItem::Literal(literal) => Ok(CharacterSet::Include(vec![literal.c])),
-        ClassSetItem::Range(range) => {
-            Ok(CharacterSet::empty().add_range(range.start.c, range.end.c))
+    fn expand_perl_character_class(&self, item: &ClassPerlKind) -> CharacterSet { // maps a Perl class kind (digit / space / word) to a fixed character set
+        match item {
+            ClassPerlKind::Digit => CharacterSet::empty().add_range('0', '9'), // digits: only '0' through '9'
+            ClassPerlKind::Space => CharacterSet::empty() // whitespace: exactly the four characters listed below
+                .add_char(' ')
+                .add_char('\t')
+                .add_char('\r')
+                .add_char('\n'),
+            ClassPerlKind::Word => CharacterSet::empty() // word characters: underscore plus the letter and digit ranges listed below
+                .add_char('_')
+                .add_range('A', 'Z')
+                .add_range('a', 'z')
+                .add_range('0', '9'),
        }
-        ClassSetItem::Union(union) => {
-            let mut result = CharacterSet::empty();
-            for item in &union.items {
-                result = result.add(expand_character_class(&item)?);
-            }
-            Ok(result)
-        }
-        ClassSetItem::Perl(class) => Ok(expand_perl_character_class(&class.kind)),
-        _ => Err(Error::regex(&format!(
-            "Unsupported character class syntax {:?}",
-            item
-        ))),
    }
-}

-fn expand_perl_character_class(item: &ClassPerlKind) -> CharacterSet {
-    match item {
-        ClassPerlKind::Digit => CharacterSet::empty().add_range('0', '9'),
-        ClassPerlKind::Space => CharacterSet::empty()
-            .add_char(' ')
-            .add_char('\t')
-            .add_char('\r')
-            .add_char('\n'),
-        ClassPerlKind::Word => CharacterSet::empty()
-            .add_char('_')
-            .add_range('A', 'Z')
-            .add_range('a', 'z')
-            .add_range('0', '9'),
+    /// Appends an `Advance` state that consumes `chars` and then moves to
+    /// `state_id`.
+    ///
+    /// The state is tagged with the precedence currently on top of
+    /// `precedence_stack` and with the builder's `is_sep` flag. Before the
+    /// state is pushed, the same precedence is handed to `add_precedence`,
+    /// starting from `state_id`.
+    ///
+    /// Panics if `precedence_stack` is empty.
+    fn push_advance(&mut self, chars: CharacterSet, state_id: u32) {
+        let precedence = *self.precedence_stack.last().unwrap();
+        let is_sep = self.is_sep;
+        self.add_precedence(precedence, vec![state_id]);
+        let advance = NfaState::Advance {
+            chars,
+            state_id,
+            is_sep,
+            precedence,
+        };
+        self.nfa.states.push(advance);
+    }
+
+    /// Assigns `prec` to the first `Accept` state found by walking forward
+    /// from `state_ids` through `Split` states only.
+    ///
+    /// `state_ids` doubles as the worklist and the visited set: both branches
+    /// of every `Split` encountered are appended unless already present, so
+    /// each state is examined at most once even if the splits form a cycle.
+    /// The walk stops as soon as an `Accept` state has been updated.
+    ///
+    /// States that are neither `Accept` nor `Split` are not followed, but they
+    /// no longer end the walk. Previously the first such state aborted the
+    /// whole search, so for `Split(advance, accept)` the accept state was
+    /// never updated while for `Split(accept, advance)` it was.
+    fn add_precedence(&mut self, prec: i32, mut state_ids: Vec<u32>) {
+        let mut i = 0;
+        while i < state_ids.len() {
+            let state_id = state_ids[i];
+            i += 1;
+            let (left, right) = match &mut self.nfa.states[state_id as usize] {
+                NfaState::Accept { precedence, .. } => {
+                    *precedence = prec;
+                    return;
+                }
+                NfaState::Split(left, right) => (*left, *right),
+                // Dead end for this walk: skip it, but keep examining the
+                // states still queued in `state_ids`.
+                _ => continue,
+            };
+            for successor in &[left, right] {
+                if !state_ids.contains(successor) {
+                    state_ids.push(*successor);
+                }
+            }
+        }
    }
 }

@ -313,11 +367,15 @@ mod tests {
        let mut cursor = NfaCursor::new(&grammar.nfa, start_states);

        let mut result = None;
+        let mut result_precedence = 0;
        let mut start_char = 0;
        let mut end_char = 0;
        for c in s.chars() {
-            if let Some(id) = cursor.finished_id() {
-                result = Some((id, &s[start_char..end_char]));
+            if let Some((id, finished_precedence)) = cursor.finished_id() {
+                if result.is_none() || result_precedence <= finished_precedence {
+                    result = Some((id, &s[start_char..end_char]));
+                    result_precedence = finished_precedence;
+                }
            }
            if cursor.advance(c) {
                end_char += 1;
@ -329,8 +387,11 @@ mod tests {
            }
        }

-        if let Some(id) = cursor.finished_id() {
-            result = Some((id, &s[start_char..end_char]));
+        if let Some((id, finished_precedence)) = cursor.finished_id() {
+            if result.is_none() || result_precedence <= finished_precedence {
+                result = Some((id, &s[start_char..end_char]));
+                result_precedence = finished_precedence;
+            }
        }

        result
@ -443,6 +504,20 @@ mod tests {
                    ("  \\\na", Some((0, "a"))),
                ],
            },
+            // shorter tokens with higher precedence
+            Row {
+                rules: vec![
+                    Rule::prec(2, Rule::pattern("abc")),
+                    Rule::prec(1, Rule::pattern("ab[cd]e")),
+                    Rule::pattern("[a-e]+"),
+                ],
+                separators: vec![Rule::string("\\\n"), Rule::pattern("\\s")],
+                examples: vec![
+                    ("abceef", Some((0, "abc"))),
+                    ("abdeef", Some((1, "abde"))),
+                    ("aeeeef", Some((2, "aeeee"))),
+                ],
+            },
        ];

        for Row {
--- a/src/prepare_grammar/mod.rs
+++ b/src/prepare_grammar/mod.rs
@ -7,7 +7,7 @@ mod intern_symbols;
 mod process_inlines;

 use self::expand_repeats::expand_repeats;
-use self::expand_tokens::expand_tokens;
+pub(crate) use self::expand_tokens::expand_tokens;
 use self::extract_simple_aliases::extract_simple_aliases;
 use self::extract_tokens::extract_tokens;
 use self::flatten_grammar::flatten_grammar;
@ -19,7 +19,7 @@ use crate::grammars::{
 };
 use crate::rules::{AliasMap, Rule, Symbol};

-pub(self) struct IntermediateGrammar<T, U> {
+pub(crate) struct IntermediateGrammar<T, U> {
    variables: Vec<Variable>,
    extra_tokens: Vec<T>,
    expected_conflicts: Vec<Vec<Symbol>>,
@ -28,14 +28,14 @@ pub(self) struct IntermediateGrammar<T, U> {
    word_token: Option<Symbol>,
 }

-pub(self) type InternedGrammar = IntermediateGrammar<Rule, Variable>;
+pub(crate) type InternedGrammar = IntermediateGrammar<Rule, Variable>;

-pub(self) type ExtractedSyntaxGrammar = IntermediateGrammar<Symbol, ExternalToken>;
+pub(crate) type ExtractedSyntaxGrammar = IntermediateGrammar<Symbol, ExternalToken>;

 #[derive(Debug, PartialEq, Eq)]
-pub(self) struct ExtractedLexicalGrammar {
-    variables: Vec<Variable>,
-    separators: Vec<Rule>,
+pub(crate) struct ExtractedLexicalGrammar {
+    pub variables: Vec<Variable>, // presumably the token (lexical) variables - confirm against extract_tokens
+    pub separators: Vec<Rule>, // presumably rules for text allowed between tokens - confirm against expand_tokens
 }

 pub(crate) fn prepare_grammar(