From 6e4115548c1982a764ca22f819544455ca9f7807 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Sun, 10 Jul 2016 14:03:00 -0700 Subject: [PATCH 001/102] Initial commit --- .gitignore | 2 + .gitmodules | 3 + Cargo.toml | 15 ++ build.rs | 35 +++++ script/bindgen.sh | 16 +++ src/ffi.rs | 333 +++++++++++++++++++++++++++++++++++++++++++++ src/lib.rs | 329 ++++++++++++++++++++++++++++++++++++++++++++ vendor/tree-sitter | 1 + 8 files changed, 734 insertions(+) create mode 100644 .gitignore create mode 100644 .gitmodules create mode 100644 Cargo.toml create mode 100644 build.rs create mode 100755 script/bindgen.sh create mode 100644 src/ffi.rs create mode 100644 src/lib.rs create mode 160000 vendor/tree-sitter diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..a9d37c56 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +target +Cargo.lock diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 00000000..eef86f94 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "vendor/tree-sitter"] + path = vendor/tree-sitter + url = https://github.com/tree-sitter/tree-sitter diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 00000000..0a93febe --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,15 @@ +[package] +name = "tree-sitter" +version = "0.1.0" +authors = ["Max Brunsfeld "] +build = "build.rs" +exclude = ["vendor/tree-sitter/**/*"] +include = [ + "vendor/tree-sitter/src/runtime/*", + "vendor/tree-sitter/externals/utf8proc/utf8proc*" +] + +[dependencies] + +[build-dependencies] +cc = "1.0" diff --git a/build.rs b/build.rs new file mode 100644 index 00000000..3427ed5f --- /dev/null +++ b/build.rs @@ -0,0 +1,35 @@ +extern crate cc; + +use std::path::Path; + + +fn main() { + let dir_path = Path::new("vendor/tree-sitter/src/runtime"); + + let source_filenames = [ + "get_changed_ranges.c", + "language.c", + "lexer.c", + "node.c", + "parser.c", + "parser.c", + "stack.c", + "subtree.c", + "tree_cursor.c", + "tree.c", + "utf16.c", + ]; + + let mut config = cc::Build::new(); + config.include("vendor/tree-sitter/src"); + config.include("vendor/tree-sitter/include"); + config.include("vendor/tree-sitter/externals/utf8proc"); + config.flag_if_supported("-Wno-unused-parameter"); + + for source_filename in source_filenames.iter() { + let source_path = dir_path.join(Path::new(&source_filename)); + config.file(&source_path.to_str().unwrap()); + } + + config.compile("libruntime.a") +} diff --git a/script/bindgen.sh b/script/bindgen.sh new file mode 100755 index 00000000..190e7a4f --- /dev/null +++ b/script/bindgen.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +output_path=src/ffi.rs +header_path='vendor/tree-sitter/include/tree_sitter/runtime.h' + +bindgen \ + --no-layout-tests \ + --whitelist-type '^TS.*' \ + --whitelist-function '^ts_.*' \ + --opaque-type FILE \ + $header_path > $output_path + +echo "" >> $output_path +version_constant='TREE_SITTER_LANGUAGE_VERSION' +version_number=$(egrep "#define $version_constant (.*)" $header_path | cut -d' ' -f3) +echo "pub const $version_constant: usize = $version_number;" >> $output_path diff --git a/src/ffi.rs b/src/ffi.rs new file mode 100644 index 00000000..7d1c06e8 --- /dev/null +++ b/src/ffi.rs @@ -0,0 +1,333 @@ +/* automatically generated by rust-bindgen */ + +pub type FILE = [u64; 19usize]; +pub type TSSymbol = ::std::os::raw::c_ushort; +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct TSLanguage { + _unused: [u8; 0], +} +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct TSParser { + _unused: [u8; 0], +} +#[repr(C)] +#[derive(Debug, 
Copy, Clone)] +pub struct TSTree { + _unused: [u8; 0], +} +pub const TSInputEncoding_TSInputEncodingUTF8: TSInputEncoding = 0; +pub const TSInputEncoding_TSInputEncodingUTF16: TSInputEncoding = 1; +pub type TSInputEncoding = u32; +pub const TSSymbolType_TSSymbolTypeRegular: TSSymbolType = 0; +pub const TSSymbolType_TSSymbolTypeAnonymous: TSSymbolType = 1; +pub const TSSymbolType_TSSymbolTypeAuxiliary: TSSymbolType = 2; +pub type TSSymbolType = u32; +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct TSPoint { + pub row: u32, + pub column: u32, +} +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct TSRange { + pub start: TSPoint, + pub end: TSPoint, +} +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct TSInput { + pub payload: *mut ::std::os::raw::c_void, + pub read: ::std::option::Option< + unsafe extern "C" fn(payload: *mut ::std::os::raw::c_void, bytes_read: *mut u32) + -> *const ::std::os::raw::c_char, + >, + pub seek: ::std::option::Option< + unsafe extern "C" fn( + payload: *mut ::std::os::raw::c_void, + byte_index: u32, + position: TSPoint, + ) -> ::std::os::raw::c_int, + >, + pub encoding: TSInputEncoding, +} +pub const TSLogType_TSLogTypeParse: TSLogType = 0; +pub const TSLogType_TSLogTypeLex: TSLogType = 1; +pub type TSLogType = u32; +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct TSLogger { + pub payload: *mut ::std::os::raw::c_void, + pub log: ::std::option::Option< + unsafe extern "C" fn( + payload: *mut ::std::os::raw::c_void, + arg1: TSLogType, + arg2: *const ::std::os::raw::c_char, + ), + >, +} +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct TSInputEdit { + pub start_byte: u32, + pub old_end_byte: u32, + pub new_end_byte: u32, + pub start_point: TSPoint, + pub old_end_point: TSPoint, + pub new_end_point: TSPoint, +} +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct TSNode { + pub context: [u32; 4usize], + pub id: *const ::std::os::raw::c_void, + pub tree: *const ::std::os::raw::c_void, +} +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct TSTreeCursor { + pub context: [u32; 2usize], + pub id: *const ::std::os::raw::c_void, + pub tree: *const ::std::os::raw::c_void, +} +extern "C" { + #[link_name = "\u{1}_ts_parser_new"] + pub fn ts_parser_new() -> *mut TSParser; +} +extern "C" { + #[link_name = "\u{1}_ts_parser_delete"] + pub fn ts_parser_delete(arg1: *mut TSParser); +} +extern "C" { + #[link_name = "\u{1}_ts_parser_language"] + pub fn ts_parser_language(arg1: *const TSParser) -> *const TSLanguage; +} +extern "C" { + #[link_name = "\u{1}_ts_parser_set_language"] + pub fn ts_parser_set_language(arg1: *mut TSParser, arg2: *const TSLanguage) -> bool; +} +extern "C" { + #[link_name = "\u{1}_ts_parser_logger"] + pub fn ts_parser_logger(arg1: *const TSParser) -> TSLogger; +} +extern "C" { + #[link_name = "\u{1}_ts_parser_set_logger"] + pub fn ts_parser_set_logger(arg1: *mut TSParser, arg2: TSLogger); +} +extern "C" { + #[link_name = "\u{1}_ts_parser_print_dot_graphs"] + pub fn ts_parser_print_dot_graphs(arg1: *mut TSParser, arg2: *mut FILE); +} +extern "C" { + #[link_name = "\u{1}_ts_parser_halt_on_error"] + pub fn ts_parser_halt_on_error(arg1: *mut TSParser, arg2: bool); +} +extern "C" { + #[link_name = "\u{1}_ts_parser_parse"] + pub fn ts_parser_parse(arg1: *mut TSParser, arg2: *const TSTree, arg3: TSInput) -> *mut TSTree; +} +extern "C" { + #[link_name = "\u{1}_ts_parser_parse_string"] + pub fn ts_parser_parse_string( + arg1: *mut TSParser, + arg2: *const TSTree, + arg3: *const ::std::os::raw::c_char, + arg4: u32, + ) -> *mut TSTree; +} +extern 
"C" { + #[link_name = "\u{1}_ts_tree_copy"] + pub fn ts_tree_copy(arg1: *const TSTree) -> *mut TSTree; +} +extern "C" { + #[link_name = "\u{1}_ts_tree_delete"] + pub fn ts_tree_delete(arg1: *mut TSTree); +} +extern "C" { + #[link_name = "\u{1}_ts_tree_root_node"] + pub fn ts_tree_root_node(arg1: *const TSTree) -> TSNode; +} +extern "C" { + #[link_name = "\u{1}_ts_tree_edit"] + pub fn ts_tree_edit(arg1: *mut TSTree, arg2: *const TSInputEdit); +} +extern "C" { + #[link_name = "\u{1}_ts_tree_get_changed_ranges"] + pub fn ts_tree_get_changed_ranges( + arg1: *const TSTree, + arg2: *const TSTree, + arg3: *mut u32, + ) -> *mut TSRange; +} +extern "C" { + #[link_name = "\u{1}_ts_tree_print_dot_graph"] + pub fn ts_tree_print_dot_graph(arg1: *const TSTree, arg2: *mut FILE); +} +extern "C" { + #[link_name = "\u{1}_ts_node_start_byte"] + pub fn ts_node_start_byte(arg1: TSNode) -> u32; +} +extern "C" { + #[link_name = "\u{1}_ts_node_start_point"] + pub fn ts_node_start_point(arg1: TSNode) -> TSPoint; +} +extern "C" { + #[link_name = "\u{1}_ts_node_end_byte"] + pub fn ts_node_end_byte(arg1: TSNode) -> u32; +} +extern "C" { + #[link_name = "\u{1}_ts_node_end_point"] + pub fn ts_node_end_point(arg1: TSNode) -> TSPoint; +} +extern "C" { + #[link_name = "\u{1}_ts_node_symbol"] + pub fn ts_node_symbol(arg1: TSNode) -> TSSymbol; +} +extern "C" { + #[link_name = "\u{1}_ts_node_type"] + pub fn ts_node_type(arg1: TSNode) -> *const ::std::os::raw::c_char; +} +extern "C" { + #[link_name = "\u{1}_ts_node_string"] + pub fn ts_node_string(arg1: TSNode) -> *mut ::std::os::raw::c_char; +} +extern "C" { + #[link_name = "\u{1}_ts_node_eq"] + pub fn ts_node_eq(arg1: TSNode, arg2: TSNode) -> bool; +} +extern "C" { + #[link_name = "\u{1}_ts_node_is_null"] + pub fn ts_node_is_null(arg1: TSNode) -> bool; +} +extern "C" { + #[link_name = "\u{1}_ts_node_is_named"] + pub fn ts_node_is_named(arg1: TSNode) -> bool; +} +extern "C" { + #[link_name = "\u{1}_ts_node_is_missing"] + pub fn ts_node_is_missing(arg1: TSNode) -> bool; +} +extern "C" { + #[link_name = "\u{1}_ts_node_has_changes"] + pub fn ts_node_has_changes(arg1: TSNode) -> bool; +} +extern "C" { + #[link_name = "\u{1}_ts_node_has_error"] + pub fn ts_node_has_error(arg1: TSNode) -> bool; +} +extern "C" { + #[link_name = "\u{1}_ts_node_parent"] + pub fn ts_node_parent(arg1: TSNode) -> TSNode; +} +extern "C" { + #[link_name = "\u{1}_ts_node_child"] + pub fn ts_node_child(arg1: TSNode, arg2: u32) -> TSNode; +} +extern "C" { + #[link_name = "\u{1}_ts_node_named_child"] + pub fn ts_node_named_child(arg1: TSNode, arg2: u32) -> TSNode; +} +extern "C" { + #[link_name = "\u{1}_ts_node_child_count"] + pub fn ts_node_child_count(arg1: TSNode) -> u32; +} +extern "C" { + #[link_name = "\u{1}_ts_node_named_child_count"] + pub fn ts_node_named_child_count(arg1: TSNode) -> u32; +} +extern "C" { + #[link_name = "\u{1}_ts_node_next_sibling"] + pub fn ts_node_next_sibling(arg1: TSNode) -> TSNode; +} +extern "C" { + #[link_name = "\u{1}_ts_node_next_named_sibling"] + pub fn ts_node_next_named_sibling(arg1: TSNode) -> TSNode; +} +extern "C" { + #[link_name = "\u{1}_ts_node_prev_sibling"] + pub fn ts_node_prev_sibling(arg1: TSNode) -> TSNode; +} +extern "C" { + #[link_name = "\u{1}_ts_node_prev_named_sibling"] + pub fn ts_node_prev_named_sibling(arg1: TSNode) -> TSNode; +} +extern "C" { + #[link_name = "\u{1}_ts_node_first_child_for_byte"] + pub fn ts_node_first_child_for_byte(arg1: TSNode, arg2: u32) -> TSNode; +} +extern "C" { + #[link_name = "\u{1}_ts_node_first_named_child_for_byte"] + pub 
fn ts_node_first_named_child_for_byte(arg1: TSNode, arg2: u32) -> TSNode; +} +extern "C" { + #[link_name = "\u{1}_ts_node_descendant_for_byte_range"] + pub fn ts_node_descendant_for_byte_range(arg1: TSNode, arg2: u32, arg3: u32) -> TSNode; +} +extern "C" { + #[link_name = "\u{1}_ts_node_named_descendant_for_byte_range"] + pub fn ts_node_named_descendant_for_byte_range(arg1: TSNode, arg2: u32, arg3: u32) -> TSNode; +} +extern "C" { + #[link_name = "\u{1}_ts_node_descendant_for_point_range"] + pub fn ts_node_descendant_for_point_range(arg1: TSNode, arg2: TSPoint, arg3: TSPoint) + -> TSNode; +} +extern "C" { + #[link_name = "\u{1}_ts_node_named_descendant_for_point_range"] + pub fn ts_node_named_descendant_for_point_range( + arg1: TSNode, + arg2: TSPoint, + arg3: TSPoint, + ) -> TSNode; +} +extern "C" { + #[link_name = "\u{1}_ts_tree_cursor_new"] + pub fn ts_tree_cursor_new(arg1: *const TSTree) -> TSTreeCursor; +} +extern "C" { + #[link_name = "\u{1}_ts_tree_cursor_delete"] + pub fn ts_tree_cursor_delete(arg1: *mut TSTreeCursor); +} +extern "C" { + #[link_name = "\u{1}_ts_tree_cursor_goto_first_child"] + pub fn ts_tree_cursor_goto_first_child(arg1: *mut TSTreeCursor) -> bool; +} +extern "C" { + #[link_name = "\u{1}_ts_tree_cursor_goto_first_child_for_byte"] + pub fn ts_tree_cursor_goto_first_child_for_byte(arg1: *mut TSTreeCursor, arg2: u32) -> i64; +} +extern "C" { + #[link_name = "\u{1}_ts_tree_cursor_goto_next_sibling"] + pub fn ts_tree_cursor_goto_next_sibling(arg1: *mut TSTreeCursor) -> bool; +} +extern "C" { + #[link_name = "\u{1}_ts_tree_cursor_goto_parent"] + pub fn ts_tree_cursor_goto_parent(arg1: *mut TSTreeCursor) -> bool; +} +extern "C" { + #[link_name = "\u{1}_ts_tree_cursor_current_node"] + pub fn ts_tree_cursor_current_node(arg1: *const TSTreeCursor) -> TSNode; +} +extern "C" { + #[link_name = "\u{1}_ts_language_symbol_count"] + pub fn ts_language_symbol_count(arg1: *const TSLanguage) -> u32; +} +extern "C" { + #[link_name = "\u{1}_ts_language_symbol_name"] + pub fn ts_language_symbol_name( + arg1: *const TSLanguage, + arg2: TSSymbol, + ) -> *const ::std::os::raw::c_char; +} +extern "C" { + #[link_name = "\u{1}_ts_language_symbol_type"] + pub fn ts_language_symbol_type(arg1: *const TSLanguage, arg2: TSSymbol) -> TSSymbolType; +} +extern "C" { + #[link_name = "\u{1}_ts_language_version"] + pub fn ts_language_version(arg1: *const TSLanguage) -> u32; +} + +pub const TREE_SITTER_LANGUAGE_VERSION: usize = 8; diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 00000000..ef11757a --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,329 @@ +mod ffi; + +use std::ffi::CStr; +use std::marker::PhantomData; +use std::os::raw::{c_char, c_int, c_void}; +use std::ptr; + +#[derive(Clone, Copy)] +pub struct Symbol(ffi::TSSymbol); + +#[derive(Clone, Copy)] +pub struct Language(*const ffi::TSLanguage); + +pub trait Utf16Input { + fn read(&self) -> &[u16]; + fn seek(&self, u32, Point); +} + +pub trait Utf8Input { + fn read(&self) -> &[u8]; + fn seek(&self, u32, Point); +} + +pub enum LogType { + Parse, + Lex, +} + +#[derive(Clone, Copy, PartialEq, Eq)] +pub struct Point { + pub row: u32, + pub column: u32, +} + +#[derive(Clone, Copy, PartialEq, Eq)] +pub struct InputEdit { + pub start_byte: u32, + pub old_end_byte: u32, + pub new_end_byte: u32, + pub start_position: Point, + pub old_end_position: Point, + pub new_end_position: Point, +} + +pub struct Node<'a>(ffi::TSNode, PhantomData<&'a ()>); + +pub struct Parser(*mut ffi::TSParser); + +pub struct Tree(*mut ffi::TSTree, ffi::TSInputEncoding); + 
+pub struct TreeCursor<'a>(ffi::TSTreeCursor, PhantomData<&'a ()>);
+
+impl Parser {
+    pub fn new() -> Parser {
+        unsafe {
+            let parser = ffi::ts_parser_new();
+            Parser(parser)
+        }
+    }
+
+    pub fn set_language(&mut self, language: Language) {
+        unsafe {
+            ffi::ts_parser_set_language(self.0, language.0);
+        }
+    }
+
+    pub fn set_logger<F: FnMut(LogType, &str) -> ()>(&mut self, logger: &mut F) {
+        unsafe extern "C" fn log<F: FnMut(LogType, &str) -> ()>(
+            payload: *mut c_void,
+            c_log_type: ffi::TSLogType,
+            c_message: *const c_char,
+        ) {
+            let callback = (payload as *mut F).as_mut().unwrap();
+            if let Ok(message) = CStr::from_ptr(c_message).to_str() {
+                let log_type = if c_log_type == ffi::TSLogType_TSLogTypeParse {
+                    LogType::Parse
+                } else {
+                    LogType::Lex
+                };
+                callback(log_type, message);
+            }
+        };
+
+        let c_logger = ffi::TSLogger {
+            payload: logger as *mut F as *mut c_void,
+            log: Some(log::<F>),
+        };
+
+        unsafe { ffi::ts_parser_set_logger(self.0, c_logger) };
+    }
+
+    pub fn parse_utf8<T: Utf8Input>(
+        &mut self,
+        input: &mut T,
+        old_tree: Option<Tree>,
+    ) -> Option<Tree> {
+        unsafe extern "C" fn read<T: Utf8Input>(
+            payload: *mut c_void,
+            bytes_read: *mut u32,
+        ) -> *const c_char {
+            let input = (payload as *mut T).as_mut().unwrap();
+            let result = input.read();
+            *bytes_read = result.len() as u32;
+            return result.as_ptr() as *const c_char;
+        };
+
+        unsafe extern "C" fn seek<T: Utf8Input>(
+            payload: *mut c_void,
+            byte: u32,
+            position: ffi::TSPoint,
+        ) -> c_int {
+            let input = (payload as *mut T).as_mut().unwrap();
+            input.seek(
+                byte,
+                Point {
+                    row: position.row,
+                    column: position.column,
+                },
+            );
+            return 1;
+        };
+
+        let c_input = ffi::TSInput {
+            payload: input as *mut T as *mut c_void,
+            read: Some(read::<T>),
+            seek: Some(seek::<T>),
+            encoding: ffi::TSInputEncoding_TSInputEncodingUTF8,
+        };
+
+        let old_tree_ptr = old_tree.map_or(ptr::null_mut(), |t| t.0);
+
+        let new_tree_ptr = unsafe { ffi::ts_parser_parse(self.0, old_tree_ptr, c_input) };
+        if new_tree_ptr.is_null() {
+            None
+        } else {
+            Some(Tree(new_tree_ptr, ffi::TSInputEncoding_TSInputEncodingUTF8))
+        }
+    }
+
+    pub fn parse_utf16<T: Utf16Input>(
+        &mut self,
+        input: &mut T,
+        old_tree: Option<Tree>,
+    ) -> Option<Tree> {
+        unsafe extern "C" fn read<T: Utf16Input>(
+            payload: *mut c_void,
+            bytes_read: *mut u32,
+        ) -> *const c_char {
+            let input = (payload as *mut T).as_mut().unwrap();
+            let result = input.read();
+            *bytes_read = result.len() as u32 * 2;
+            return result.as_ptr() as *const c_char;
+        };
+
+        unsafe extern "C" fn seek<T: Utf16Input>(
+            payload: *mut c_void,
+            byte: u32,
+            position: ffi::TSPoint,
+        ) -> c_int {
+            let input = (payload as *mut T).as_mut().unwrap();
+            input.seek(
+                byte / 2,
+                Point {
+                    row: position.row,
+                    column: position.column / 2,
+                },
+            );
+            return 1;
+        };
+
+        let c_input = ffi::TSInput {
+            payload: input as *mut T as *mut c_void,
+            read: Some(read::<T>),
+            seek: Some(seek::<T>),
+            encoding: ffi::TSInputEncoding_TSInputEncodingUTF8,
+        };
+
+        let old_tree_ptr = old_tree.map_or(ptr::null_mut(), |t| t.0);
+
+        let new_tree_ptr = unsafe { ffi::ts_parser_parse(self.0, old_tree_ptr, c_input) };
+        if new_tree_ptr.is_null() {
+            None
+        } else {
+            Some(Tree(
+                new_tree_ptr,
+                ffi::TSInputEncoding_TSInputEncodingUTF16,
+            ))
+        }
+    }
+}
+
+impl Drop for Parser {
+    fn drop(&mut self) {
+        unsafe { ffi::ts_parser_delete(self.0) }
+    }
+}
+
+impl Tree {
+    pub fn root_node(&self) -> Node {
+        Node::new(unsafe { ffi::ts_tree_root_node(self.0) }).unwrap()
+    }
+
+    pub fn edit(&mut self, edit: &InputEdit) {
+        let edit = ffi::TSInputEdit {
+            start_byte: edit.start_byte,
+            old_end_byte: edit.old_end_byte,
+            new_end_byte: edit.new_end_byte,
+            start_point: edit.start_position.into(),
+            old_end_point: edit.old_end_position.into(),
+            new_end_point: edit.new_end_position.into(),
+        };
+        unsafe { ffi::ts_tree_edit(self.0, &edit) };
+    }
+
+    pub fn walk(&self) -> TreeCursor {
+        TreeCursor(unsafe { ffi::ts_tree_cursor_new(self.0) }, PhantomData)
+    }
+}
+
+impl Drop for Tree {
+    fn drop(&mut self) {
+        unsafe { ffi::ts_tree_delete(self.0) }
+    }
+}
+
+impl Clone for Tree {
+    fn clone(&self) -> Tree {
+        unsafe { Tree(ffi::ts_tree_copy(self.0), self.1) }
+    }
+}
+
+impl<'a> Node<'a> {
+    fn new(node: ffi::TSNode) -> Option<Self> {
+        if node.id.is_null() {
+            None
+        } else {
+            Some(Node(node, PhantomData))
+        }
+    }
+
+    pub fn name(&self) -> &'static str {
+        unsafe { CStr::from_ptr(ffi::ts_node_type(self.0)) }
+            .to_str()
+            .unwrap()
+    }
+
+    pub fn start_index(&self) -> u32 {
+        unsafe { ffi::ts_node_start_byte(self.0) }
+    }
+
+    pub fn end_index(&self) -> u32 {
+        unsafe { ffi::ts_node_end_byte(self.0) }
+    }
+
+    pub fn start_position(&self) -> Point {
+        let result = unsafe { ffi::ts_node_start_point(self.0) };
+        Point {
+            row: result.row,
+            column: result.column,
+        }
+    }
+
+    pub fn end_position(&self) -> Point {
+        let result = unsafe { ffi::ts_node_end_point(self.0) };
+        Point {
+            row: result.row,
+            column: result.column,
+        }
+    }
+
+    pub fn child(&self, i: u32) -> Option<Self> {
+        Self::new(unsafe { ffi::ts_node_child(self.0, i) })
+    }
+
+    pub fn parent(&self) -> Option<Self> {
+        Self::new(unsafe { ffi::ts_node_parent(self.0) })
+    }
+}
+
+impl<'a> TreeCursor<'a> {
+    fn node(&'a self) -> Node<'a> {
+        Node(
+            unsafe { ffi::ts_tree_cursor_current_node(&self.0) },
+            PhantomData,
+        )
+    }
+
+    fn goto_first_child(&mut self) -> bool {
+        return unsafe { ffi::ts_tree_cursor_goto_first_child(&mut self.0) };
+    }
+
+    fn goto_parent(&mut self) -> bool {
+        return unsafe { ffi::ts_tree_cursor_goto_parent(&mut self.0) };
+    }
+
+    fn goto_next_sibling(&mut self) -> bool {
+        return unsafe { ffi::ts_tree_cursor_goto_next_sibling(&mut self.0) };
+    }
+
+    fn goto_first_child_for_index(&mut self, index: u32) -> Option<u32> {
+        let result = unsafe { ffi::ts_tree_cursor_goto_first_child_for_byte(&mut self.0, index) };
+        if result < 0 {
+            None
+        } else {
+            Some(result as u32)
+        }
+    }
+}
+
+impl<'a> Drop for TreeCursor<'a> {
+    fn drop(&mut self) {
+        unsafe { ffi::ts_tree_cursor_delete(&mut self.0) }
+    }
+}
+
+impl Into<ffi::TSPoint> for Point {
+    fn into(self) -> ffi::TSPoint {
+        ffi::TSPoint {
+            row: self.row,
+            column: self.column,
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    #[test]
+    fn it_works() {}
+}
diff --git a/vendor/tree-sitter b/vendor/tree-sitter
new file mode 160000
index 00000000..5ec3769c
--- /dev/null
+++ b/vendor/tree-sitter
@@ -0,0 +1 @@
+Subproject commit 5ec3769cb4c9acfda64f80d7c14abce939e8b4c5

From 8918d1a5b14f9a54ef23dcb4b29d8bf2bccd6384 Mon Sep 17 00:00:00 2001
From: Max Brunsfeld
Date: Thu, 17 May 2018 14:35:31 -0700
Subject: [PATCH 002/102] Add boilerplate

---
 .travis.yml | 8 ++++++++
 LICENSE     | 21 +++++++++++++++++++++
 README.md   | 8 ++++++++
 3 files changed, 37 insertions(+)
 create mode 100644 .travis.yml
 create mode 100644 LICENSE
 create mode 100644 README.md

diff --git a/.travis.yml b/.travis.yml
new file mode 100644
index 00000000..32e3a71f
--- /dev/null
+++ b/.travis.yml
@@ -0,0 +1,8 @@
+language: rust
+
+rust:
+  - stable
+
+branches:
+  only:
+    - master
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 00000000..971b81f9
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,21 @@
+The MIT License (MIT)
+
+Copyright (c) 2018 Max Brunsfeld
+
+Permission is hereby granted, free of charge, to any person
obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md new file mode 100644 index 00000000..08df0e4e --- /dev/null +++ b/README.md @@ -0,0 +1,8 @@ +Rust Tree-sitter +=========================== + +[![Build Status](https://travis-ci.org/tree-sitter/rust-tree-sitter.svg)](https://travis-ci.org/tree-sitter/rust-tree-sitter) + +Rust bindings to the [Tree-sitter][] parsing library. + +[tree-sitter]: https://github.com/tree-sitter/tree-sitter From f07f710db7633dc26d86163972512799ae407540 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 17 May 2018 14:40:31 -0700 Subject: [PATCH 003/102] Compile tree-sitter sources in c99 mode --- build.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/build.rs b/build.rs index 3427ed5f..53265655 100644 --- a/build.rs +++ b/build.rs @@ -24,6 +24,7 @@ fn main() { config.include("vendor/tree-sitter/src"); config.include("vendor/tree-sitter/include"); config.include("vendor/tree-sitter/externals/utf8proc"); + config.flag_if_supported("-std=c99"); config.flag_if_supported("-Wno-unused-parameter"); for source_filename in source_filenames.iter() { From ead0e312624a4e20a312875c073be4dc51a2f29b Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 17 May 2018 14:43:30 -0700 Subject: [PATCH 004/102] Fix duplicated compile of parser.c --- build.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/build.rs b/build.rs index 53265655..ad62f3a1 100644 --- a/build.rs +++ b/build.rs @@ -12,7 +12,6 @@ fn main() { "lexer.c", "node.c", "parser.c", - "parser.c", "stack.c", "subtree.c", "tree_cursor.c", From 08217fff8dfc7a80b2348679144ff44344d63008 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 17 May 2018 17:16:35 -0700 Subject: [PATCH 005/102] Get basic parsing working, add some unit tests --- .gitignore | 1 + .travis.yml | 6 ++ Cargo.toml | 2 - build.rs | 31 ++++++---- fixtures/.gitkeep | 0 script/fetch-test-fixtures.sh | 14 +++++ src/lib.rs | 113 +++++++++++++++++++++++++++++----- 7 files changed, 138 insertions(+), 29 deletions(-) create mode 100644 fixtures/.gitkeep create mode 100755 script/fetch-test-fixtures.sh diff --git a/.gitignore b/.gitignore index a9d37c56..fbd4fda0 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ target Cargo.lock +fixtures/tree-sitter-rust diff --git a/.travis.yml b/.travis.yml index 32e3a71f..10fcfe94 100644 --- a/.travis.yml +++ b/.travis.yml @@ -3,6 +3,12 @@ language: rust rust: - stable +env: + - RUST_TREE_SITTER_TEST=1 + +before_install: + - ./script/fetch-test-fixtures.sh + branches: only: - master diff --git a/Cargo.toml b/Cargo.toml index 0a93febe..e20d40aa 100644 --- 
a/Cargo.toml +++ b/Cargo.toml @@ -9,7 +9,5 @@ include = [ "vendor/tree-sitter/externals/utf8proc/utf8proc*" ] -[dependencies] - [build-dependencies] cc = "1.0" diff --git a/build.rs b/build.rs index ad62f3a1..fa8b41ea 100644 --- a/build.rs +++ b/build.rs @@ -1,10 +1,17 @@ extern crate cc; +use std::env; use std::path::Path; - fn main() { - let dir_path = Path::new("vendor/tree-sitter/src/runtime"); + let root_path = Path::new("vendor/tree-sitter"); + + let mut config = cc::Build::new(); + config.flag_if_supported("-std=c99"); + config.flag_if_supported("-Wno-unused-parameter"); + config.include(root_path.join(Path::new("src"))); + config.include(root_path.join(Path::new("include"))); + config.include(root_path.join(Path::new("externals/utf8proc"))); let source_filenames = [ "get_changed_ranges.c", @@ -19,16 +26,18 @@ fn main() { "utf16.c", ]; - let mut config = cc::Build::new(); - config.include("vendor/tree-sitter/src"); - config.include("vendor/tree-sitter/include"); - config.include("vendor/tree-sitter/externals/utf8proc"); - config.flag_if_supported("-std=c99"); - config.flag_if_supported("-Wno-unused-parameter"); + config.files(source_filenames.iter().map(|source_filename| { + root_path + .join(Path::new(&"src/runtime")) + .join(Path::new(&source_filename)) + })); - for source_filename in source_filenames.iter() { - let source_path = dir_path.join(Path::new(&source_filename)); - config.file(&source_path.to_str().unwrap()); + config.file(root_path.join(Path::new("externals/utf8proc/utf8proc.c"))); + + if env::var("RUST_TREE_SITTER_TEST").is_ok() { + let parser_dir = Path::new("fixtures/tree-sitter-rust/src"); + config.file(parser_dir.join("parser.c")); + config.file(parser_dir.join("scanner.c")); } config.compile("libruntime.a") diff --git a/fixtures/.gitkeep b/fixtures/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/script/fetch-test-fixtures.sh b/script/fetch-test-fixtures.sh new file mode 100755 index 00000000..24cc316a --- /dev/null +++ b/script/fetch-test-fixtures.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +grammar_dir='fixtures/tree-sitter-rust' +grammar_url='https://github.com/tree-sitter/tree-sitter-rust' + +if [ ! 
-d $grammar_dir ]; then
+  git clone $grammar_url $grammar_dir --depth=1
+fi
+
+(
+  cd $grammar_dir;
+  git fetch origin master --depth=1
+  git reset --hard origin/master;
+)
diff --git a/src/lib.rs b/src/lib.rs
index ef11757a..fa1db0f9 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -8,19 +8,19 @@ use std::ptr;
 #[derive(Clone, Copy)]
 pub struct Symbol(ffi::TSSymbol);
 
-#[derive(Clone, Copy)]
-pub struct Language(*const ffi::TSLanguage);
+pub type Language = *const ffi::TSLanguage;
 
 pub trait Utf16Input {
-    fn read(&self) -> &[u16];
-    fn seek(&self, u32, Point);
+    fn read(&mut self) -> &[u16];
+    fn seek(&mut self, u32, Point);
 }
 
 pub trait Utf8Input {
-    fn read(&self) -> &[u8];
-    fn seek(&self, u32, Point);
+    fn read(&mut self) -> &[u8];
+    fn seek(&mut self, u32, Point);
 }
 
+#[derive(Debug, PartialEq, Eq)]
 pub enum LogType {
     Parse,
     Lex,
@@ -50,6 +50,11 @@ pub struct Tree(*mut ffi::TSTree, ffi::TSInputEncoding);
 
 pub struct TreeCursor<'a>(ffi::TSTreeCursor, PhantomData<&'a ()>);
 
+struct FlatInput<'a> {
+    bytes: &'a [u8],
+    offset: usize,
+}
+
 impl Parser {
     pub fn new() -> Parser {
         unsafe {
@@ -60,11 +65,11 @@ impl Parser {
 
     pub fn set_language(&mut self, language: Language) {
         unsafe {
-            ffi::ts_parser_set_language(self.0, language.0);
+            ffi::ts_parser_set_language(self.0, language);
         }
     }
 
-    pub fn set_logger<F: FnMut(LogType, &str) -> ()>(&mut self, logger: &mut F) {
+    pub fn set_logger<F: FnMut(LogType, &str) -> ()>(&mut self, logger: Option<&mut F>) {
         unsafe extern "C" fn log<F: FnMut(LogType, &str) -> ()>(
             payload: *mut c_void,
             c_log_type: ffi::TSLogType,
@@ -81,14 +86,24 @@ impl Parser {
             }
         };
 
-        let c_logger = ffi::TSLogger {
-            payload: logger as *mut F as *mut c_void,
-            log: Some(log::<F>),
-        };
+        let c_logger;
+        if let Some(logger) = logger {
+            c_logger = ffi::TSLogger {
+                payload: logger as *mut F as *mut c_void,
+                log: Some(log::<F>),
+            };
+        } else {
+            c_logger = ffi::TSLogger { payload: ptr::null_mut(), log: None };
+        }
 
         unsafe { ffi::ts_parser_set_logger(self.0, c_logger) };
     }
 
+    pub fn parse_str(&mut self, input: &str, old_tree: Option<Tree>) -> Option<Tree> {
+        let mut input = FlatInput { bytes: input.as_bytes(), offset: 0};
+        self.parse_utf8(&mut input, old_tree)
+    }
+
     pub fn parse_utf8<T: Utf8Input>(
         &mut self,
         input: &mut T,
@@ -239,9 +254,7 @@ impl<'a> Node<'a> {
     }
 
     pub fn name(&self) -> &'static str {
-        unsafe { CStr::from_ptr(ffi::ts_node_type(self.0)) }
-            .to_str()
-            .unwrap()
+        unsafe { CStr::from_ptr(ffi::ts_node_type(self.0)) }.to_str().unwrap()
     }
 
     pub fn start_index(&self) -> u32 {
@@ -272,11 +285,24 @@ impl<'a> Node<'a> {
         Self::new(unsafe { ffi::ts_node_child(self.0, i) })
     }
 
+    pub fn child_count(&self) -> u32 {
+        unsafe { ffi::ts_node_child_count(self.0) }
+    }
+
     pub fn parent(&self) -> Option<Self> {
         Self::new(unsafe { ffi::ts_node_parent(self.0) })
     }
+
+    pub fn to_sexp(&self) -> String {
+        let c_string = unsafe { ffi::ts_node_string(self.0) };
+        let result = unsafe { CStr::from_ptr(c_string) }.to_str().unwrap().to_string();
+        unsafe { free(c_string as *mut c_void) };
+        result
+    }
 }
 
+extern "C" { fn free(pointer: *mut c_void); }
+
 impl<'a> TreeCursor<'a> {
     fn node(&'a self) -> Node<'a> {
         Node(
@@ -322,8 +348,63 @@ impl Into<ffi::TSPoint> for Point {
     }
 }
 
+impl<'a> Utf8Input for FlatInput<'a> {
+    fn read(&mut self) -> &[u8] {
+        let result = &self.bytes[self.offset..];
+        self.offset = self.bytes.len();
+        result
+    }
+
+    fn seek(&mut self, offset: u32, _position: Point) {
+        self.offset = offset as usize;
+    }
+}
+
 #[cfg(test)]
 mod tests {
+    use super::*;
+
+    fn rust() -> Language { unsafe { tree_sitter_rust() } }
+    extern "C" { fn tree_sitter_rust() -> Language; }
+
     #[test]
-    fn it_works() {}
+    fn test_basic_parsing() {
let mut parser = Parser::new(); + parser.set_language(rust()); + + let tree = parser.parse_str(" + struct Stuff {} + fn main() {} + ", None).unwrap(); + + let root_node = tree.root_node(); + assert_eq!(root_node.name(), "source_file"); + + assert_eq!( + root_node.to_sexp(), + "(source_file (struct_item (type_identifier) (field_declaration_list)) (function_item (identifier) (parameters) (block)))" + ); + + let struct_node = root_node.child(0).unwrap(); + assert_eq!(struct_node.name(), "struct_item"); + } + + #[test] + fn test_logging() { + let mut parser = Parser::new(); + parser.set_language(rust()); + + let mut messages = Vec::new(); + parser.set_logger(Some(&mut |log_type, message| { + messages.push((log_type, message.to_string())); + })); + + parser.parse_str(" + struct Stuff {} + fn main() {} + ", None).unwrap(); + + assert!(messages.contains(&(LogType::Parse, "reduce sym:struct_item, child_count:3".to_string()))); + assert!(messages.contains(&(LogType::Lex, "skip character:' '".to_string()))); + } } From 7e6675d56effa6177eaf387b13942c8219107ae1 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 17 May 2018 17:23:35 -0700 Subject: [PATCH 006/102] Use a more unique library name when building C sources --- build.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build.rs b/build.rs index fa8b41ea..4e2c3b8f 100644 --- a/build.rs +++ b/build.rs @@ -40,5 +40,5 @@ fn main() { config.file(parser_dir.join("scanner.c")); } - config.compile("libruntime.a") + config.compile("treesitter") } From 572a60183c86920b0c1bc83941d70b3772534e3a Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 17 May 2018 17:29:23 -0700 Subject: [PATCH 007/102] Suppress warnings associated w/ generated bindings --- build.rs | 1 + script/bindgen.sh | 2 +- src/bindings.rs | 333 +++++++++++++++++++++++++++++++++++++++++++++ src/ffi.rs | 335 +--------------------------------------------- 4 files changed, 338 insertions(+), 333 deletions(-) create mode 100644 src/bindings.rs diff --git a/build.rs b/build.rs index 4e2c3b8f..5fa5d408 100644 --- a/build.rs +++ b/build.rs @@ -36,6 +36,7 @@ fn main() { if env::var("RUST_TREE_SITTER_TEST").is_ok() { let parser_dir = Path::new("fixtures/tree-sitter-rust/src"); + config.flag_if_supported("-Wno-typedef-redefinition"); config.file(parser_dir.join("parser.c")); config.file(parser_dir.join("scanner.c")); } diff --git a/script/bindgen.sh b/script/bindgen.sh index 190e7a4f..1b9008b2 100755 --- a/script/bindgen.sh +++ b/script/bindgen.sh @@ -1,6 +1,6 @@ #!/bin/bash -output_path=src/ffi.rs +output_path=src/bindings.rs header_path='vendor/tree-sitter/include/tree_sitter/runtime.h' bindgen \ diff --git a/src/bindings.rs b/src/bindings.rs new file mode 100644 index 00000000..7d1c06e8 --- /dev/null +++ b/src/bindings.rs @@ -0,0 +1,333 @@ +/* automatically generated by rust-bindgen */ + +pub type FILE = [u64; 19usize]; +pub type TSSymbol = ::std::os::raw::c_ushort; +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct TSLanguage { + _unused: [u8; 0], +} +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct TSParser { + _unused: [u8; 0], +} +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct TSTree { + _unused: [u8; 0], +} +pub const TSInputEncoding_TSInputEncodingUTF8: TSInputEncoding = 0; +pub const TSInputEncoding_TSInputEncodingUTF16: TSInputEncoding = 1; +pub type TSInputEncoding = u32; +pub const TSSymbolType_TSSymbolTypeRegular: TSSymbolType = 0; +pub const TSSymbolType_TSSymbolTypeAnonymous: TSSymbolType = 1; +pub const TSSymbolType_TSSymbolTypeAuxiliary: 
TSSymbolType = 2; +pub type TSSymbolType = u32; +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct TSPoint { + pub row: u32, + pub column: u32, +} +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct TSRange { + pub start: TSPoint, + pub end: TSPoint, +} +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct TSInput { + pub payload: *mut ::std::os::raw::c_void, + pub read: ::std::option::Option< + unsafe extern "C" fn(payload: *mut ::std::os::raw::c_void, bytes_read: *mut u32) + -> *const ::std::os::raw::c_char, + >, + pub seek: ::std::option::Option< + unsafe extern "C" fn( + payload: *mut ::std::os::raw::c_void, + byte_index: u32, + position: TSPoint, + ) -> ::std::os::raw::c_int, + >, + pub encoding: TSInputEncoding, +} +pub const TSLogType_TSLogTypeParse: TSLogType = 0; +pub const TSLogType_TSLogTypeLex: TSLogType = 1; +pub type TSLogType = u32; +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct TSLogger { + pub payload: *mut ::std::os::raw::c_void, + pub log: ::std::option::Option< + unsafe extern "C" fn( + payload: *mut ::std::os::raw::c_void, + arg1: TSLogType, + arg2: *const ::std::os::raw::c_char, + ), + >, +} +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct TSInputEdit { + pub start_byte: u32, + pub old_end_byte: u32, + pub new_end_byte: u32, + pub start_point: TSPoint, + pub old_end_point: TSPoint, + pub new_end_point: TSPoint, +} +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct TSNode { + pub context: [u32; 4usize], + pub id: *const ::std::os::raw::c_void, + pub tree: *const ::std::os::raw::c_void, +} +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct TSTreeCursor { + pub context: [u32; 2usize], + pub id: *const ::std::os::raw::c_void, + pub tree: *const ::std::os::raw::c_void, +} +extern "C" { + #[link_name = "\u{1}_ts_parser_new"] + pub fn ts_parser_new() -> *mut TSParser; +} +extern "C" { + #[link_name = "\u{1}_ts_parser_delete"] + pub fn ts_parser_delete(arg1: *mut TSParser); +} +extern "C" { + #[link_name = "\u{1}_ts_parser_language"] + pub fn ts_parser_language(arg1: *const TSParser) -> *const TSLanguage; +} +extern "C" { + #[link_name = "\u{1}_ts_parser_set_language"] + pub fn ts_parser_set_language(arg1: *mut TSParser, arg2: *const TSLanguage) -> bool; +} +extern "C" { + #[link_name = "\u{1}_ts_parser_logger"] + pub fn ts_parser_logger(arg1: *const TSParser) -> TSLogger; +} +extern "C" { + #[link_name = "\u{1}_ts_parser_set_logger"] + pub fn ts_parser_set_logger(arg1: *mut TSParser, arg2: TSLogger); +} +extern "C" { + #[link_name = "\u{1}_ts_parser_print_dot_graphs"] + pub fn ts_parser_print_dot_graphs(arg1: *mut TSParser, arg2: *mut FILE); +} +extern "C" { + #[link_name = "\u{1}_ts_parser_halt_on_error"] + pub fn ts_parser_halt_on_error(arg1: *mut TSParser, arg2: bool); +} +extern "C" { + #[link_name = "\u{1}_ts_parser_parse"] + pub fn ts_parser_parse(arg1: *mut TSParser, arg2: *const TSTree, arg3: TSInput) -> *mut TSTree; +} +extern "C" { + #[link_name = "\u{1}_ts_parser_parse_string"] + pub fn ts_parser_parse_string( + arg1: *mut TSParser, + arg2: *const TSTree, + arg3: *const ::std::os::raw::c_char, + arg4: u32, + ) -> *mut TSTree; +} +extern "C" { + #[link_name = "\u{1}_ts_tree_copy"] + pub fn ts_tree_copy(arg1: *const TSTree) -> *mut TSTree; +} +extern "C" { + #[link_name = "\u{1}_ts_tree_delete"] + pub fn ts_tree_delete(arg1: *mut TSTree); +} +extern "C" { + #[link_name = "\u{1}_ts_tree_root_node"] + pub fn ts_tree_root_node(arg1: *const TSTree) -> TSNode; +} +extern "C" { + #[link_name = "\u{1}_ts_tree_edit"] + pub fn ts_tree_edit(arg1: 
*mut TSTree, arg2: *const TSInputEdit); +} +extern "C" { + #[link_name = "\u{1}_ts_tree_get_changed_ranges"] + pub fn ts_tree_get_changed_ranges( + arg1: *const TSTree, + arg2: *const TSTree, + arg3: *mut u32, + ) -> *mut TSRange; +} +extern "C" { + #[link_name = "\u{1}_ts_tree_print_dot_graph"] + pub fn ts_tree_print_dot_graph(arg1: *const TSTree, arg2: *mut FILE); +} +extern "C" { + #[link_name = "\u{1}_ts_node_start_byte"] + pub fn ts_node_start_byte(arg1: TSNode) -> u32; +} +extern "C" { + #[link_name = "\u{1}_ts_node_start_point"] + pub fn ts_node_start_point(arg1: TSNode) -> TSPoint; +} +extern "C" { + #[link_name = "\u{1}_ts_node_end_byte"] + pub fn ts_node_end_byte(arg1: TSNode) -> u32; +} +extern "C" { + #[link_name = "\u{1}_ts_node_end_point"] + pub fn ts_node_end_point(arg1: TSNode) -> TSPoint; +} +extern "C" { + #[link_name = "\u{1}_ts_node_symbol"] + pub fn ts_node_symbol(arg1: TSNode) -> TSSymbol; +} +extern "C" { + #[link_name = "\u{1}_ts_node_type"] + pub fn ts_node_type(arg1: TSNode) -> *const ::std::os::raw::c_char; +} +extern "C" { + #[link_name = "\u{1}_ts_node_string"] + pub fn ts_node_string(arg1: TSNode) -> *mut ::std::os::raw::c_char; +} +extern "C" { + #[link_name = "\u{1}_ts_node_eq"] + pub fn ts_node_eq(arg1: TSNode, arg2: TSNode) -> bool; +} +extern "C" { + #[link_name = "\u{1}_ts_node_is_null"] + pub fn ts_node_is_null(arg1: TSNode) -> bool; +} +extern "C" { + #[link_name = "\u{1}_ts_node_is_named"] + pub fn ts_node_is_named(arg1: TSNode) -> bool; +} +extern "C" { + #[link_name = "\u{1}_ts_node_is_missing"] + pub fn ts_node_is_missing(arg1: TSNode) -> bool; +} +extern "C" { + #[link_name = "\u{1}_ts_node_has_changes"] + pub fn ts_node_has_changes(arg1: TSNode) -> bool; +} +extern "C" { + #[link_name = "\u{1}_ts_node_has_error"] + pub fn ts_node_has_error(arg1: TSNode) -> bool; +} +extern "C" { + #[link_name = "\u{1}_ts_node_parent"] + pub fn ts_node_parent(arg1: TSNode) -> TSNode; +} +extern "C" { + #[link_name = "\u{1}_ts_node_child"] + pub fn ts_node_child(arg1: TSNode, arg2: u32) -> TSNode; +} +extern "C" { + #[link_name = "\u{1}_ts_node_named_child"] + pub fn ts_node_named_child(arg1: TSNode, arg2: u32) -> TSNode; +} +extern "C" { + #[link_name = "\u{1}_ts_node_child_count"] + pub fn ts_node_child_count(arg1: TSNode) -> u32; +} +extern "C" { + #[link_name = "\u{1}_ts_node_named_child_count"] + pub fn ts_node_named_child_count(arg1: TSNode) -> u32; +} +extern "C" { + #[link_name = "\u{1}_ts_node_next_sibling"] + pub fn ts_node_next_sibling(arg1: TSNode) -> TSNode; +} +extern "C" { + #[link_name = "\u{1}_ts_node_next_named_sibling"] + pub fn ts_node_next_named_sibling(arg1: TSNode) -> TSNode; +} +extern "C" { + #[link_name = "\u{1}_ts_node_prev_sibling"] + pub fn ts_node_prev_sibling(arg1: TSNode) -> TSNode; +} +extern "C" { + #[link_name = "\u{1}_ts_node_prev_named_sibling"] + pub fn ts_node_prev_named_sibling(arg1: TSNode) -> TSNode; +} +extern "C" { + #[link_name = "\u{1}_ts_node_first_child_for_byte"] + pub fn ts_node_first_child_for_byte(arg1: TSNode, arg2: u32) -> TSNode; +} +extern "C" { + #[link_name = "\u{1}_ts_node_first_named_child_for_byte"] + pub fn ts_node_first_named_child_for_byte(arg1: TSNode, arg2: u32) -> TSNode; +} +extern "C" { + #[link_name = "\u{1}_ts_node_descendant_for_byte_range"] + pub fn ts_node_descendant_for_byte_range(arg1: TSNode, arg2: u32, arg3: u32) -> TSNode; +} +extern "C" { + #[link_name = "\u{1}_ts_node_named_descendant_for_byte_range"] + pub fn ts_node_named_descendant_for_byte_range(arg1: TSNode, arg2: u32, arg3: u32) 
-> TSNode; +} +extern "C" { + #[link_name = "\u{1}_ts_node_descendant_for_point_range"] + pub fn ts_node_descendant_for_point_range(arg1: TSNode, arg2: TSPoint, arg3: TSPoint) + -> TSNode; +} +extern "C" { + #[link_name = "\u{1}_ts_node_named_descendant_for_point_range"] + pub fn ts_node_named_descendant_for_point_range( + arg1: TSNode, + arg2: TSPoint, + arg3: TSPoint, + ) -> TSNode; +} +extern "C" { + #[link_name = "\u{1}_ts_tree_cursor_new"] + pub fn ts_tree_cursor_new(arg1: *const TSTree) -> TSTreeCursor; +} +extern "C" { + #[link_name = "\u{1}_ts_tree_cursor_delete"] + pub fn ts_tree_cursor_delete(arg1: *mut TSTreeCursor); +} +extern "C" { + #[link_name = "\u{1}_ts_tree_cursor_goto_first_child"] + pub fn ts_tree_cursor_goto_first_child(arg1: *mut TSTreeCursor) -> bool; +} +extern "C" { + #[link_name = "\u{1}_ts_tree_cursor_goto_first_child_for_byte"] + pub fn ts_tree_cursor_goto_first_child_for_byte(arg1: *mut TSTreeCursor, arg2: u32) -> i64; +} +extern "C" { + #[link_name = "\u{1}_ts_tree_cursor_goto_next_sibling"] + pub fn ts_tree_cursor_goto_next_sibling(arg1: *mut TSTreeCursor) -> bool; +} +extern "C" { + #[link_name = "\u{1}_ts_tree_cursor_goto_parent"] + pub fn ts_tree_cursor_goto_parent(arg1: *mut TSTreeCursor) -> bool; +} +extern "C" { + #[link_name = "\u{1}_ts_tree_cursor_current_node"] + pub fn ts_tree_cursor_current_node(arg1: *const TSTreeCursor) -> TSNode; +} +extern "C" { + #[link_name = "\u{1}_ts_language_symbol_count"] + pub fn ts_language_symbol_count(arg1: *const TSLanguage) -> u32; +} +extern "C" { + #[link_name = "\u{1}_ts_language_symbol_name"] + pub fn ts_language_symbol_name( + arg1: *const TSLanguage, + arg2: TSSymbol, + ) -> *const ::std::os::raw::c_char; +} +extern "C" { + #[link_name = "\u{1}_ts_language_symbol_type"] + pub fn ts_language_symbol_type(arg1: *const TSLanguage, arg2: TSSymbol) -> TSSymbolType; +} +extern "C" { + #[link_name = "\u{1}_ts_language_version"] + pub fn ts_language_version(arg1: *const TSLanguage) -> u32; +} + +pub const TREE_SITTER_LANGUAGE_VERSION: usize = 8; diff --git a/src/ffi.rs b/src/ffi.rs index 7d1c06e8..323609e0 100644 --- a/src/ffi.rs +++ b/src/ffi.rs @@ -1,333 +1,4 @@ -/* automatically generated by rust-bindgen */ +#![allow(dead_code)] +#![allow(non_upper_case_globals)] -pub type FILE = [u64; 19usize]; -pub type TSSymbol = ::std::os::raw::c_ushort; -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct TSLanguage { - _unused: [u8; 0], -} -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct TSParser { - _unused: [u8; 0], -} -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct TSTree { - _unused: [u8; 0], -} -pub const TSInputEncoding_TSInputEncodingUTF8: TSInputEncoding = 0; -pub const TSInputEncoding_TSInputEncodingUTF16: TSInputEncoding = 1; -pub type TSInputEncoding = u32; -pub const TSSymbolType_TSSymbolTypeRegular: TSSymbolType = 0; -pub const TSSymbolType_TSSymbolTypeAnonymous: TSSymbolType = 1; -pub const TSSymbolType_TSSymbolTypeAuxiliary: TSSymbolType = 2; -pub type TSSymbolType = u32; -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct TSPoint { - pub row: u32, - pub column: u32, -} -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct TSRange { - pub start: TSPoint, - pub end: TSPoint, -} -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct TSInput { - pub payload: *mut ::std::os::raw::c_void, - pub read: ::std::option::Option< - unsafe extern "C" fn(payload: *mut ::std::os::raw::c_void, bytes_read: *mut u32) - -> *const ::std::os::raw::c_char, - >, - pub seek: ::std::option::Option< - unsafe extern "C" 
fn( - payload: *mut ::std::os::raw::c_void, - byte_index: u32, - position: TSPoint, - ) -> ::std::os::raw::c_int, - >, - pub encoding: TSInputEncoding, -} -pub const TSLogType_TSLogTypeParse: TSLogType = 0; -pub const TSLogType_TSLogTypeLex: TSLogType = 1; -pub type TSLogType = u32; -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct TSLogger { - pub payload: *mut ::std::os::raw::c_void, - pub log: ::std::option::Option< - unsafe extern "C" fn( - payload: *mut ::std::os::raw::c_void, - arg1: TSLogType, - arg2: *const ::std::os::raw::c_char, - ), - >, -} -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct TSInputEdit { - pub start_byte: u32, - pub old_end_byte: u32, - pub new_end_byte: u32, - pub start_point: TSPoint, - pub old_end_point: TSPoint, - pub new_end_point: TSPoint, -} -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct TSNode { - pub context: [u32; 4usize], - pub id: *const ::std::os::raw::c_void, - pub tree: *const ::std::os::raw::c_void, -} -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct TSTreeCursor { - pub context: [u32; 2usize], - pub id: *const ::std::os::raw::c_void, - pub tree: *const ::std::os::raw::c_void, -} -extern "C" { - #[link_name = "\u{1}_ts_parser_new"] - pub fn ts_parser_new() -> *mut TSParser; -} -extern "C" { - #[link_name = "\u{1}_ts_parser_delete"] - pub fn ts_parser_delete(arg1: *mut TSParser); -} -extern "C" { - #[link_name = "\u{1}_ts_parser_language"] - pub fn ts_parser_language(arg1: *const TSParser) -> *const TSLanguage; -} -extern "C" { - #[link_name = "\u{1}_ts_parser_set_language"] - pub fn ts_parser_set_language(arg1: *mut TSParser, arg2: *const TSLanguage) -> bool; -} -extern "C" { - #[link_name = "\u{1}_ts_parser_logger"] - pub fn ts_parser_logger(arg1: *const TSParser) -> TSLogger; -} -extern "C" { - #[link_name = "\u{1}_ts_parser_set_logger"] - pub fn ts_parser_set_logger(arg1: *mut TSParser, arg2: TSLogger); -} -extern "C" { - #[link_name = "\u{1}_ts_parser_print_dot_graphs"] - pub fn ts_parser_print_dot_graphs(arg1: *mut TSParser, arg2: *mut FILE); -} -extern "C" { - #[link_name = "\u{1}_ts_parser_halt_on_error"] - pub fn ts_parser_halt_on_error(arg1: *mut TSParser, arg2: bool); -} -extern "C" { - #[link_name = "\u{1}_ts_parser_parse"] - pub fn ts_parser_parse(arg1: *mut TSParser, arg2: *const TSTree, arg3: TSInput) -> *mut TSTree; -} -extern "C" { - #[link_name = "\u{1}_ts_parser_parse_string"] - pub fn ts_parser_parse_string( - arg1: *mut TSParser, - arg2: *const TSTree, - arg3: *const ::std::os::raw::c_char, - arg4: u32, - ) -> *mut TSTree; -} -extern "C" { - #[link_name = "\u{1}_ts_tree_copy"] - pub fn ts_tree_copy(arg1: *const TSTree) -> *mut TSTree; -} -extern "C" { - #[link_name = "\u{1}_ts_tree_delete"] - pub fn ts_tree_delete(arg1: *mut TSTree); -} -extern "C" { - #[link_name = "\u{1}_ts_tree_root_node"] - pub fn ts_tree_root_node(arg1: *const TSTree) -> TSNode; -} -extern "C" { - #[link_name = "\u{1}_ts_tree_edit"] - pub fn ts_tree_edit(arg1: *mut TSTree, arg2: *const TSInputEdit); -} -extern "C" { - #[link_name = "\u{1}_ts_tree_get_changed_ranges"] - pub fn ts_tree_get_changed_ranges( - arg1: *const TSTree, - arg2: *const TSTree, - arg3: *mut u32, - ) -> *mut TSRange; -} -extern "C" { - #[link_name = "\u{1}_ts_tree_print_dot_graph"] - pub fn ts_tree_print_dot_graph(arg1: *const TSTree, arg2: *mut FILE); -} -extern "C" { - #[link_name = "\u{1}_ts_node_start_byte"] - pub fn ts_node_start_byte(arg1: TSNode) -> u32; -} -extern "C" { - #[link_name = "\u{1}_ts_node_start_point"] - pub fn ts_node_start_point(arg1: 
TSNode) -> TSPoint; -} -extern "C" { - #[link_name = "\u{1}_ts_node_end_byte"] - pub fn ts_node_end_byte(arg1: TSNode) -> u32; -} -extern "C" { - #[link_name = "\u{1}_ts_node_end_point"] - pub fn ts_node_end_point(arg1: TSNode) -> TSPoint; -} -extern "C" { - #[link_name = "\u{1}_ts_node_symbol"] - pub fn ts_node_symbol(arg1: TSNode) -> TSSymbol; -} -extern "C" { - #[link_name = "\u{1}_ts_node_type"] - pub fn ts_node_type(arg1: TSNode) -> *const ::std::os::raw::c_char; -} -extern "C" { - #[link_name = "\u{1}_ts_node_string"] - pub fn ts_node_string(arg1: TSNode) -> *mut ::std::os::raw::c_char; -} -extern "C" { - #[link_name = "\u{1}_ts_node_eq"] - pub fn ts_node_eq(arg1: TSNode, arg2: TSNode) -> bool; -} -extern "C" { - #[link_name = "\u{1}_ts_node_is_null"] - pub fn ts_node_is_null(arg1: TSNode) -> bool; -} -extern "C" { - #[link_name = "\u{1}_ts_node_is_named"] - pub fn ts_node_is_named(arg1: TSNode) -> bool; -} -extern "C" { - #[link_name = "\u{1}_ts_node_is_missing"] - pub fn ts_node_is_missing(arg1: TSNode) -> bool; -} -extern "C" { - #[link_name = "\u{1}_ts_node_has_changes"] - pub fn ts_node_has_changes(arg1: TSNode) -> bool; -} -extern "C" { - #[link_name = "\u{1}_ts_node_has_error"] - pub fn ts_node_has_error(arg1: TSNode) -> bool; -} -extern "C" { - #[link_name = "\u{1}_ts_node_parent"] - pub fn ts_node_parent(arg1: TSNode) -> TSNode; -} -extern "C" { - #[link_name = "\u{1}_ts_node_child"] - pub fn ts_node_child(arg1: TSNode, arg2: u32) -> TSNode; -} -extern "C" { - #[link_name = "\u{1}_ts_node_named_child"] - pub fn ts_node_named_child(arg1: TSNode, arg2: u32) -> TSNode; -} -extern "C" { - #[link_name = "\u{1}_ts_node_child_count"] - pub fn ts_node_child_count(arg1: TSNode) -> u32; -} -extern "C" { - #[link_name = "\u{1}_ts_node_named_child_count"] - pub fn ts_node_named_child_count(arg1: TSNode) -> u32; -} -extern "C" { - #[link_name = "\u{1}_ts_node_next_sibling"] - pub fn ts_node_next_sibling(arg1: TSNode) -> TSNode; -} -extern "C" { - #[link_name = "\u{1}_ts_node_next_named_sibling"] - pub fn ts_node_next_named_sibling(arg1: TSNode) -> TSNode; -} -extern "C" { - #[link_name = "\u{1}_ts_node_prev_sibling"] - pub fn ts_node_prev_sibling(arg1: TSNode) -> TSNode; -} -extern "C" { - #[link_name = "\u{1}_ts_node_prev_named_sibling"] - pub fn ts_node_prev_named_sibling(arg1: TSNode) -> TSNode; -} -extern "C" { - #[link_name = "\u{1}_ts_node_first_child_for_byte"] - pub fn ts_node_first_child_for_byte(arg1: TSNode, arg2: u32) -> TSNode; -} -extern "C" { - #[link_name = "\u{1}_ts_node_first_named_child_for_byte"] - pub fn ts_node_first_named_child_for_byte(arg1: TSNode, arg2: u32) -> TSNode; -} -extern "C" { - #[link_name = "\u{1}_ts_node_descendant_for_byte_range"] - pub fn ts_node_descendant_for_byte_range(arg1: TSNode, arg2: u32, arg3: u32) -> TSNode; -} -extern "C" { - #[link_name = "\u{1}_ts_node_named_descendant_for_byte_range"] - pub fn ts_node_named_descendant_for_byte_range(arg1: TSNode, arg2: u32, arg3: u32) -> TSNode; -} -extern "C" { - #[link_name = "\u{1}_ts_node_descendant_for_point_range"] - pub fn ts_node_descendant_for_point_range(arg1: TSNode, arg2: TSPoint, arg3: TSPoint) - -> TSNode; -} -extern "C" { - #[link_name = "\u{1}_ts_node_named_descendant_for_point_range"] - pub fn ts_node_named_descendant_for_point_range( - arg1: TSNode, - arg2: TSPoint, - arg3: TSPoint, - ) -> TSNode; -} -extern "C" { - #[link_name = "\u{1}_ts_tree_cursor_new"] - pub fn ts_tree_cursor_new(arg1: *const TSTree) -> TSTreeCursor; -} -extern "C" { - #[link_name = 
"\u{1}_ts_tree_cursor_delete"] - pub fn ts_tree_cursor_delete(arg1: *mut TSTreeCursor); -} -extern "C" { - #[link_name = "\u{1}_ts_tree_cursor_goto_first_child"] - pub fn ts_tree_cursor_goto_first_child(arg1: *mut TSTreeCursor) -> bool; -} -extern "C" { - #[link_name = "\u{1}_ts_tree_cursor_goto_first_child_for_byte"] - pub fn ts_tree_cursor_goto_first_child_for_byte(arg1: *mut TSTreeCursor, arg2: u32) -> i64; -} -extern "C" { - #[link_name = "\u{1}_ts_tree_cursor_goto_next_sibling"] - pub fn ts_tree_cursor_goto_next_sibling(arg1: *mut TSTreeCursor) -> bool; -} -extern "C" { - #[link_name = "\u{1}_ts_tree_cursor_goto_parent"] - pub fn ts_tree_cursor_goto_parent(arg1: *mut TSTreeCursor) -> bool; -} -extern "C" { - #[link_name = "\u{1}_ts_tree_cursor_current_node"] - pub fn ts_tree_cursor_current_node(arg1: *const TSTreeCursor) -> TSNode; -} -extern "C" { - #[link_name = "\u{1}_ts_language_symbol_count"] - pub fn ts_language_symbol_count(arg1: *const TSLanguage) -> u32; -} -extern "C" { - #[link_name = "\u{1}_ts_language_symbol_name"] - pub fn ts_language_symbol_name( - arg1: *const TSLanguage, - arg2: TSSymbol, - ) -> *const ::std::os::raw::c_char; -} -extern "C" { - #[link_name = "\u{1}_ts_language_symbol_type"] - pub fn ts_language_symbol_type(arg1: *const TSLanguage, arg2: TSSymbol) -> TSSymbolType; -} -extern "C" { - #[link_name = "\u{1}_ts_language_version"] - pub fn ts_language_version(arg1: *const TSLanguage) -> u32; -} - -pub const TREE_SITTER_LANGUAGE_VERSION: usize = 8; +include!("./bindings.rs"); From b1ff399960cb4a72fe9a4323ecfc9b633c35e545 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 17 May 2018 18:02:01 -0700 Subject: [PATCH 008/102] :arrow_up: tree-sitter for warning fixes --- build.rs | 21 +++++++++++---------- vendor/tree-sitter | 2 +- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/build.rs b/build.rs index 5fa5d408..8736b645 100644 --- a/build.rs +++ b/build.rs @@ -4,14 +4,15 @@ use std::env; use std::path::Path; fn main() { + let mut config = cc::Build::new(); let root_path = Path::new("vendor/tree-sitter"); - let mut config = cc::Build::new(); - config.flag_if_supported("-std=c99"); - config.flag_if_supported("-Wno-unused-parameter"); - config.include(root_path.join(Path::new("src"))); - config.include(root_path.join(Path::new("include"))); - config.include(root_path.join(Path::new("externals/utf8proc"))); + config + .flag("-std=c99") + .flag("-Wno-unused-parameter") + .include(root_path.join(Path::new("src"))) + .include(root_path.join(Path::new("include"))) + .include(root_path.join(Path::new("externals/utf8proc"))); let source_filenames = [ "get_changed_ranges.c", @@ -36,10 +37,10 @@ fn main() { if env::var("RUST_TREE_SITTER_TEST").is_ok() { let parser_dir = Path::new("fixtures/tree-sitter-rust/src"); - config.flag_if_supported("-Wno-typedef-redefinition"); - config.file(parser_dir.join("parser.c")); - config.file(parser_dir.join("scanner.c")); + config + .file(parser_dir.join("parser.c")) + .file(parser_dir.join("scanner.c")); } - config.compile("treesitter") + config.compile("treesitter_ffi"); } diff --git a/vendor/tree-sitter b/vendor/tree-sitter index 5ec3769c..3c01382b 160000 --- a/vendor/tree-sitter +++ b/vendor/tree-sitter @@ -1 +1 @@ -Subproject commit 5ec3769cb4c9acfda64f80d7c14abce939e8b4c5 +Subproject commit 3c01382b95364ce40f0cf9856865a30af77f9690 From 13e26b5007b19f2f98584adf594b478f2cbb9175 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 17 May 2018 18:08:44 -0700 Subject: [PATCH 009/102] Try a static flag --- build.rs 
| 1 + 1 file changed, 1 insertion(+) diff --git a/build.rs b/build.rs index 8736b645..b7433f54 100644 --- a/build.rs +++ b/build.rs @@ -10,6 +10,7 @@ fn main() { config .flag("-std=c99") .flag("-Wno-unused-parameter") + .static_flag(true) .include(root_path.join(Path::new("src"))) .include(root_path.join(Path::new("include"))) .include(root_path.join(Path::new("externals/utf8proc"))); From 29dfa0550413cecb9f2fb13798e60f95522bb0ba Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 17 May 2018 19:40:06 -0700 Subject: [PATCH 010/102] Try clang --- .travis.yml | 12 +++++++++++- build.rs | 1 - 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index 10fcfe94..5b99d596 100644 --- a/.travis.yml +++ b/.travis.yml @@ -4,11 +4,21 @@ rust: - stable env: - - RUST_TREE_SITTER_TEST=1 + - CC=clang-3.6 RUST_TREE_SITTER_TEST=1 before_install: - ./script/fetch-test-fixtures.sh +compiler: clang-3.6 + +addons: + apt: + sources: + - llvm-toolchain-precise-3.6 + - ubuntu-toolchain-r-test + packages: + - clang-3.6 + branches: only: - master diff --git a/build.rs b/build.rs index b7433f54..8736b645 100644 --- a/build.rs +++ b/build.rs @@ -10,7 +10,6 @@ fn main() { config .flag("-std=c99") .flag("-Wno-unused-parameter") - .static_flag(true) .include(root_path.join(Path::new("src"))) .include(root_path.join(Path::new("include"))) .include(root_path.join(Path::new("externals/utf8proc"))); From e61edf539824631b4e59a8d8ed022f7a065cf95a Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 18 May 2018 09:30:00 -0700 Subject: [PATCH 011/102] Don't perform platform-specific name mangling on C functions for bindings --- script/bindgen.sh | 1 + src/bindings.rs | 57 +---------------------------------------------- 2 files changed, 2 insertions(+), 56 deletions(-) diff --git a/script/bindgen.sh b/script/bindgen.sh index 1b9008b2..699f0339 100755 --- a/script/bindgen.sh +++ b/script/bindgen.sh @@ -8,6 +8,7 @@ bindgen \ --whitelist-type '^TS.*' \ --whitelist-function '^ts_.*' \ --opaque-type FILE \ + --distrust-clang-mangling \ $header_path > $output_path echo "" >> $output_path diff --git a/src/bindings.rs b/src/bindings.rs index 7d1c06e8..1ab49bde 100644 --- a/src/bindings.rs +++ b/src/bindings.rs @@ -1,7 +1,7 @@ /* automatically generated by rust-bindgen */ pub type FILE = [u64; 19usize]; -pub type TSSymbol = ::std::os::raw::c_ushort; +pub type TSSymbol = u16; #[repr(C)] #[derive(Debug, Copy, Clone)] pub struct TSLanguage { @@ -93,43 +93,33 @@ pub struct TSTreeCursor { pub tree: *const ::std::os::raw::c_void, } extern "C" { - #[link_name = "\u{1}_ts_parser_new"] pub fn ts_parser_new() -> *mut TSParser; } extern "C" { - #[link_name = "\u{1}_ts_parser_delete"] pub fn ts_parser_delete(arg1: *mut TSParser); } extern "C" { - #[link_name = "\u{1}_ts_parser_language"] pub fn ts_parser_language(arg1: *const TSParser) -> *const TSLanguage; } extern "C" { - #[link_name = "\u{1}_ts_parser_set_language"] pub fn ts_parser_set_language(arg1: *mut TSParser, arg2: *const TSLanguage) -> bool; } extern "C" { - #[link_name = "\u{1}_ts_parser_logger"] pub fn ts_parser_logger(arg1: *const TSParser) -> TSLogger; } extern "C" { - #[link_name = "\u{1}_ts_parser_set_logger"] pub fn ts_parser_set_logger(arg1: *mut TSParser, arg2: TSLogger); } extern "C" { - #[link_name = "\u{1}_ts_parser_print_dot_graphs"] pub fn ts_parser_print_dot_graphs(arg1: *mut TSParser, arg2: *mut FILE); } extern "C" { - #[link_name = "\u{1}_ts_parser_halt_on_error"] pub fn ts_parser_halt_on_error(arg1: *mut TSParser, arg2: bool); } 
extern "C" { - #[link_name = "\u{1}_ts_parser_parse"] pub fn ts_parser_parse(arg1: *mut TSParser, arg2: *const TSTree, arg3: TSInput) -> *mut TSTree; } extern "C" { - #[link_name = "\u{1}_ts_parser_parse_string"] pub fn ts_parser_parse_string( arg1: *mut TSParser, arg2: *const TSTree, @@ -138,23 +128,18 @@ extern "C" { ) -> *mut TSTree; } extern "C" { - #[link_name = "\u{1}_ts_tree_copy"] pub fn ts_tree_copy(arg1: *const TSTree) -> *mut TSTree; } extern "C" { - #[link_name = "\u{1}_ts_tree_delete"] pub fn ts_tree_delete(arg1: *mut TSTree); } extern "C" { - #[link_name = "\u{1}_ts_tree_root_node"] pub fn ts_tree_root_node(arg1: *const TSTree) -> TSNode; } extern "C" { - #[link_name = "\u{1}_ts_tree_edit"] pub fn ts_tree_edit(arg1: *mut TSTree, arg2: *const TSInputEdit); } extern "C" { - #[link_name = "\u{1}_ts_tree_get_changed_ranges"] pub fn ts_tree_get_changed_ranges( arg1: *const TSTree, arg2: *const TSTree, @@ -162,120 +147,91 @@ extern "C" { ) -> *mut TSRange; } extern "C" { - #[link_name = "\u{1}_ts_tree_print_dot_graph"] pub fn ts_tree_print_dot_graph(arg1: *const TSTree, arg2: *mut FILE); } extern "C" { - #[link_name = "\u{1}_ts_node_start_byte"] pub fn ts_node_start_byte(arg1: TSNode) -> u32; } extern "C" { - #[link_name = "\u{1}_ts_node_start_point"] pub fn ts_node_start_point(arg1: TSNode) -> TSPoint; } extern "C" { - #[link_name = "\u{1}_ts_node_end_byte"] pub fn ts_node_end_byte(arg1: TSNode) -> u32; } extern "C" { - #[link_name = "\u{1}_ts_node_end_point"] pub fn ts_node_end_point(arg1: TSNode) -> TSPoint; } extern "C" { - #[link_name = "\u{1}_ts_node_symbol"] pub fn ts_node_symbol(arg1: TSNode) -> TSSymbol; } extern "C" { - #[link_name = "\u{1}_ts_node_type"] pub fn ts_node_type(arg1: TSNode) -> *const ::std::os::raw::c_char; } extern "C" { - #[link_name = "\u{1}_ts_node_string"] pub fn ts_node_string(arg1: TSNode) -> *mut ::std::os::raw::c_char; } extern "C" { - #[link_name = "\u{1}_ts_node_eq"] pub fn ts_node_eq(arg1: TSNode, arg2: TSNode) -> bool; } extern "C" { - #[link_name = "\u{1}_ts_node_is_null"] pub fn ts_node_is_null(arg1: TSNode) -> bool; } extern "C" { - #[link_name = "\u{1}_ts_node_is_named"] pub fn ts_node_is_named(arg1: TSNode) -> bool; } extern "C" { - #[link_name = "\u{1}_ts_node_is_missing"] pub fn ts_node_is_missing(arg1: TSNode) -> bool; } extern "C" { - #[link_name = "\u{1}_ts_node_has_changes"] pub fn ts_node_has_changes(arg1: TSNode) -> bool; } extern "C" { - #[link_name = "\u{1}_ts_node_has_error"] pub fn ts_node_has_error(arg1: TSNode) -> bool; } extern "C" { - #[link_name = "\u{1}_ts_node_parent"] pub fn ts_node_parent(arg1: TSNode) -> TSNode; } extern "C" { - #[link_name = "\u{1}_ts_node_child"] pub fn ts_node_child(arg1: TSNode, arg2: u32) -> TSNode; } extern "C" { - #[link_name = "\u{1}_ts_node_named_child"] pub fn ts_node_named_child(arg1: TSNode, arg2: u32) -> TSNode; } extern "C" { - #[link_name = "\u{1}_ts_node_child_count"] pub fn ts_node_child_count(arg1: TSNode) -> u32; } extern "C" { - #[link_name = "\u{1}_ts_node_named_child_count"] pub fn ts_node_named_child_count(arg1: TSNode) -> u32; } extern "C" { - #[link_name = "\u{1}_ts_node_next_sibling"] pub fn ts_node_next_sibling(arg1: TSNode) -> TSNode; } extern "C" { - #[link_name = "\u{1}_ts_node_next_named_sibling"] pub fn ts_node_next_named_sibling(arg1: TSNode) -> TSNode; } extern "C" { - #[link_name = "\u{1}_ts_node_prev_sibling"] pub fn ts_node_prev_sibling(arg1: TSNode) -> TSNode; } extern "C" { - #[link_name = "\u{1}_ts_node_prev_named_sibling"] pub fn 
ts_node_prev_named_sibling(arg1: TSNode) -> TSNode; } extern "C" { - #[link_name = "\u{1}_ts_node_first_child_for_byte"] pub fn ts_node_first_child_for_byte(arg1: TSNode, arg2: u32) -> TSNode; } extern "C" { - #[link_name = "\u{1}_ts_node_first_named_child_for_byte"] pub fn ts_node_first_named_child_for_byte(arg1: TSNode, arg2: u32) -> TSNode; } extern "C" { - #[link_name = "\u{1}_ts_node_descendant_for_byte_range"] pub fn ts_node_descendant_for_byte_range(arg1: TSNode, arg2: u32, arg3: u32) -> TSNode; } extern "C" { - #[link_name = "\u{1}_ts_node_named_descendant_for_byte_range"] pub fn ts_node_named_descendant_for_byte_range(arg1: TSNode, arg2: u32, arg3: u32) -> TSNode; } extern "C" { - #[link_name = "\u{1}_ts_node_descendant_for_point_range"] pub fn ts_node_descendant_for_point_range(arg1: TSNode, arg2: TSPoint, arg3: TSPoint) -> TSNode; } extern "C" { - #[link_name = "\u{1}_ts_node_named_descendant_for_point_range"] pub fn ts_node_named_descendant_for_point_range( arg1: TSNode, arg2: TSPoint, @@ -283,50 +239,39 @@ extern "C" { ) -> TSNode; } extern "C" { - #[link_name = "\u{1}_ts_tree_cursor_new"] pub fn ts_tree_cursor_new(arg1: *const TSTree) -> TSTreeCursor; } extern "C" { - #[link_name = "\u{1}_ts_tree_cursor_delete"] pub fn ts_tree_cursor_delete(arg1: *mut TSTreeCursor); } extern "C" { - #[link_name = "\u{1}_ts_tree_cursor_goto_first_child"] pub fn ts_tree_cursor_goto_first_child(arg1: *mut TSTreeCursor) -> bool; } extern "C" { - #[link_name = "\u{1}_ts_tree_cursor_goto_first_child_for_byte"] pub fn ts_tree_cursor_goto_first_child_for_byte(arg1: *mut TSTreeCursor, arg2: u32) -> i64; } extern "C" { - #[link_name = "\u{1}_ts_tree_cursor_goto_next_sibling"] pub fn ts_tree_cursor_goto_next_sibling(arg1: *mut TSTreeCursor) -> bool; } extern "C" { - #[link_name = "\u{1}_ts_tree_cursor_goto_parent"] pub fn ts_tree_cursor_goto_parent(arg1: *mut TSTreeCursor) -> bool; } extern "C" { - #[link_name = "\u{1}_ts_tree_cursor_current_node"] pub fn ts_tree_cursor_current_node(arg1: *const TSTreeCursor) -> TSNode; } extern "C" { - #[link_name = "\u{1}_ts_language_symbol_count"] pub fn ts_language_symbol_count(arg1: *const TSLanguage) -> u32; } extern "C" { - #[link_name = "\u{1}_ts_language_symbol_name"] pub fn ts_language_symbol_name( arg1: *const TSLanguage, arg2: TSSymbol, ) -> *const ::std::os::raw::c_char; } extern "C" { - #[link_name = "\u{1}_ts_language_symbol_type"] pub fn ts_language_symbol_type(arg1: *const TSLanguage, arg2: TSSymbol) -> TSSymbolType; } extern "C" { - #[link_name = "\u{1}_ts_language_version"] pub fn ts_language_version(arg1: *const TSLanguage) -> u32; } From 29c0cd3aa4d9e569c0ea2d1b4ea2652e207ca51a Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 18 May 2018 09:48:24 -0700 Subject: [PATCH 012/102] Add appveyor config --- README.md | 1 + appveyor.yml | 24 ++++++++++++++++++++++++ script/fetch-test-fixtures.cmd | 16 ++++++++++++++++ 3 files changed, 41 insertions(+) create mode 100644 appveyor.yml create mode 100755 script/fetch-test-fixtures.cmd diff --git a/README.md b/README.md index 08df0e4e..40f5624f 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,7 @@ Rust Tree-sitter =========================== [![Build Status](https://travis-ci.org/tree-sitter/rust-tree-sitter.svg)](https://travis-ci.org/tree-sitter/rust-tree-sitter) +[![Build status](https://ci.appveyor.com/api/projects/status/d0f6vqq3rflxx3y6/branch/master?svg=true)](https://ci.appveyor.com/project/maxbrunsfeld/rust-tree-sitter/branch/master) Rust bindings to the [Tree-sitter][] parsing library. 
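[Aside, not part of the patch: the `RUST_TREE_SITTER_TEST` variable exported by this CI configuration is the same gate the build script uses to decide whether to compile the fixture grammar. A minimal sketch of that gating pattern, assuming the fixture layout used elsewhere in this series:]

```rust
// build.rs sketch: compile the test-only grammar sources just when the
// gate is set, so a plain `cargo build` never needs the fixtures checkout.
use std::env;
use std::path::PathBuf;

fn main() {
    if env::var("RUST_TREE_SITTER_TEST").is_ok() {
        let parser_dir: PathBuf = ["fixtures", "tree-sitter-rust", "src"].iter().collect();
        // The real build script hands this path to cc::Build::file.
        println!("cargo:rerun-if-changed={}", parser_dir.join("parser.c").display());
    }
}
```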
diff --git a/appveyor.yml b/appveyor.yml new file mode 100644 index 00000000..23fe3d97 --- /dev/null +++ b/appveyor.yml @@ -0,0 +1,24 @@ +environment: + RUST_TREE_SITTER_TEST: true + +build: false + +install: + - appveyor DownloadFile https://win.rustup.rs/ -FileName rustup-init.exe + - rustup-init -yv --default-toolchain stable + - set PATH=%PATH%;%USERPROFILE%\.cargo\bin + - rustc -vV + - cargo -vV + - script\fetch-test-fixtures.cmd + +test_script: + - cargo build + - cargo test + +branches: + only: + - master + +cache: + - fixtures + - C:\Users\appveyor\.cargo diff --git a/script/fetch-test-fixtures.cmd b/script/fetch-test-fixtures.cmd new file mode 100755 index 00000000..33543961 --- /dev/null +++ b/script/fetch-test-fixtures.cmd @@ -0,0 +1,16 @@ +@Echo off +SETLOCAL + +Set grammar_dir=fixtures\tree-sitter-rust +Set grammar_url=https://github.com/tree-sitter/tree-sitter-rust + +@IF NOT EXIST %grammar_dir% ( + git clone %grammar_url% %grammar_dir% --depth=1 +) + +pushd %grammar_dir% +git fetch origin master --depth=1 +git reset --hard origin/master +popd + +ENDLOCAL From 8d485857e10d90f76c344811a2da645ddfb74bd2 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 18 May 2018 10:01:37 -0700 Subject: [PATCH 013/102] Tweak build script for windows --- build.rs | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/build.rs b/build.rs index 8736b645..c1e768ff 100644 --- a/build.rs +++ b/build.rs @@ -1,18 +1,18 @@ extern crate cc; use std::env; -use std::path::Path; +use std::path::PathBuf; fn main() { let mut config = cc::Build::new(); - let root_path = Path::new("vendor/tree-sitter"); + let root_path: PathBuf = ["vendor", "tree-sitter"].iter().collect(); config - .flag("-std=c99") - .flag("-Wno-unused-parameter") - .include(root_path.join(Path::new("src"))) - .include(root_path.join(Path::new("include"))) - .include(root_path.join(Path::new("externals/utf8proc"))); + .flag_if_supported("-std=c99") + .flag_if_supported("-Wno-unused-parameter") + .include(root_path.join("src")) + .include(root_path.join("include")) + .include(root_path.join("externals").join("utf8proc")); let source_filenames = [ "get_changed_ranges.c", @@ -29,18 +29,19 @@ fn main() { config.files(source_filenames.iter().map(|source_filename| { root_path - .join(Path::new(&"src/runtime")) - .join(Path::new(&source_filename)) + .join("src") + .join("runtime") + .join(&source_filename) })); - config.file(root_path.join(Path::new("externals/utf8proc/utf8proc.c"))); + config.file(root_path.join("externals").join("utf8proc").join("utf8proc.c")); if env::var("RUST_TREE_SITTER_TEST").is_ok() { - let parser_dir = Path::new("fixtures/tree-sitter-rust/src"); + let parser_dir: PathBuf = ["fixtures", "tree-sitter-rust", "src"].iter().collect(); config .file(parser_dir.join("parser.c")) .file(parser_dir.join("scanner.c")); } - config.compile("treesitter_ffi"); + config.compile("tree-sitter-runtime"); } From 7748f8e1687042fc477890378fc653c152bc2b31 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 18 May 2018 10:16:28 -0700 Subject: [PATCH 014/102] Fetch submodules on appveyor --- appveyor.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/appveyor.yml b/appveyor.yml index 23fe3d97..22c8b96e 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -4,11 +4,14 @@ environment: build: false install: + - git submodule update --init --recursive + - appveyor DownloadFile https://win.rustup.rs/ -FileName rustup-init.exe - rustup-init -yv --default-toolchain stable - set 
PATH=%PATH%;%USERPROFILE%\.cargo\bin - rustc -vV - cargo -vV + - script\fetch-test-fixtures.cmd test_script: From 654789f92534b4fd6d59006a13353edc923da1cb Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 18 May 2018 10:27:36 -0700 Subject: [PATCH 015/102] Use UTF8PROC_STATIC macro --- build.rs | 1 + vendor/tree-sitter | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/build.rs b/build.rs index c1e768ff..7d9ee83e 100644 --- a/build.rs +++ b/build.rs @@ -8,6 +8,7 @@ fn main() { let root_path: PathBuf = ["vendor", "tree-sitter"].iter().collect(); config + .define("UTF8PROC_STATIC", "") .flag_if_supported("-std=c99") .flag_if_supported("-Wno-unused-parameter") .include(root_path.join("src")) diff --git a/vendor/tree-sitter b/vendor/tree-sitter index 3c01382b..9c1e82a7 160000 --- a/vendor/tree-sitter +++ b/vendor/tree-sitter @@ -1 +1 @@ -Subproject commit 3c01382b95364ce40f0cf9856865a30af77f9690 +Subproject commit 9c1e82a7eac97767cee0469faa2722fd5753b065 From 993bfea669b1ba49fa4a37b11abd82c5206f0209 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 18 May 2018 10:39:00 -0700 Subject: [PATCH 016/102] Add missing source file --- build.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/build.rs b/build.rs index 7d9ee83e..2843c758 100644 --- a/build.rs +++ b/build.rs @@ -22,6 +22,7 @@ fn main() { "node.c", "parser.c", "stack.c", + "string_input.c", "subtree.c", "tree_cursor.c", "tree.c", From 4603542747743e0f0bb1361a8cdb3d4abbb089b0 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 18 May 2018 10:44:14 -0700 Subject: [PATCH 017/102] Add more public methods and tests --- src/lib.rs | 134 +++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 124 insertions(+), 10 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index fa1db0f9..ef53e4de 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -253,15 +253,27 @@ impl<'a> Node<'a> { } } - pub fn name(&self) -> &'static str { + pub fn kind(&self) -> &'static str { unsafe { CStr::from_ptr(ffi::ts_node_type(self.0)) }.to_str().unwrap() } - pub fn start_index(&self) -> u32 { + pub fn is_named(&self) -> bool { + unsafe { ffi::ts_node_is_named(self.0) } + } + + pub fn has_changes(&self) -> bool { + unsafe { ffi::ts_node_has_changes(self.0) } + } + + pub fn has_error(&self) -> bool { + unsafe { ffi::ts_node_has_error(self.0) } + } + + pub fn start_byte(&self) -> u32 { unsafe { ffi::ts_node_start_byte(self.0) } } - pub fn end_index(&self) -> u32 { + pub fn end_byte(&self) -> u32 { unsafe { ffi::ts_node_end_byte(self.0) } } @@ -289,10 +301,34 @@ impl<'a> Node<'a> { unsafe { ffi::ts_node_child_count(self.0) } } + pub fn named_child(&self, i: u32) -> Option { + Self::new(unsafe { ffi::ts_node_named_child(self.0, i) }) + } + + pub fn named_child_count(&self) -> u32 { + unsafe { ffi::ts_node_named_child_count(self.0) } + } + pub fn parent(&self) -> Option { Self::new(unsafe { ffi::ts_node_parent(self.0) }) } + pub fn next_sibling(&self) -> Option { + Self::new(unsafe { ffi::ts_node_next_sibling(self.0) }) + } + + pub fn prev_sibling(&self) -> Option { + Self::new(unsafe { ffi::ts_node_prev_sibling(self.0) }) + } + + pub fn next_named_sibling(&self) -> Option { + Self::new(unsafe { ffi::ts_node_next_named_sibling(self.0) }) + } + + pub fn prev_named_sibling(&self) -> Option { + Self::new(unsafe { ffi::ts_node_prev_named_sibling(self.0) }) + } + pub fn to_sexp(&self) -> String { let c_string = unsafe { ffi::ts_node_string(self.0) }; let result = unsafe { CStr::from_ptr(c_string) }.to_str().unwrap().to_string(); @@ -304,26 
+340,26 @@ impl<'a> Node<'a> { extern "C" { fn free(pointer: *mut c_void); } impl<'a> TreeCursor<'a> { - fn node(&'a self) -> Node<'a> { + pub fn node(&'a self) -> Node<'a> { Node( unsafe { ffi::ts_tree_cursor_current_node(&self.0) }, PhantomData, ) } - fn goto_first_child(&mut self) -> bool { + pub fn goto_first_child(&mut self) -> bool { return unsafe { ffi::ts_tree_cursor_goto_first_child(&mut self.0) }; } - fn goto_parent(&mut self) -> bool { + pub fn goto_parent(&mut self) -> bool { return unsafe { ffi::ts_tree_cursor_goto_parent(&mut self.0) }; } - fn goto_next_sibling(&mut self) -> bool { + pub fn goto_next_sibling(&mut self) -> bool { return unsafe { ffi::ts_tree_cursor_goto_next_sibling(&mut self.0) }; } - fn goto_first_child_for_index(&mut self, index: u32) -> Option { + pub fn goto_first_child_for_index(&mut self, index: u32) -> Option { let result = unsafe { ffi::ts_tree_cursor_goto_first_child_for_byte(&mut self.0, index) }; if result < 0 { None @@ -378,7 +414,7 @@ mod tests { ", None).unwrap(); let root_node = tree.root_node(); - assert_eq!(root_node.name(), "source_file"); + assert_eq!(root_node.kind(), "source_file"); assert_eq!( root_node.to_sexp(), @@ -386,7 +422,7 @@ mod tests { ); let struct_node = root_node.child(0).unwrap(); - assert_eq!(struct_node.name(), "struct_item"); + assert_eq!(struct_node.kind(), "struct_item"); } #[test] @@ -407,4 +443,82 @@ mod tests { assert!(messages.contains(&(LogType::Parse, "reduce sym:struct_item, child_count:3".to_string()))); assert!(messages.contains(&(LogType::Lex, "skip character:' '".to_string()))); } + + #[test] + fn test_tree_cursor() { + let mut parser = Parser::new(); + parser.set_language(rust()); + + let tree = parser.parse_str(" + struct Stuff { + a: A; + b: Option, + } + ", None).unwrap(); + + let mut cursor = tree.walk(); + assert_eq!(cursor.node().kind(), "source_file"); + + assert!(cursor.goto_first_child()); + assert_eq!(cursor.node().kind(), "struct_item"); + + assert!(cursor.goto_first_child()); + assert_eq!(cursor.node().kind(), "struct"); + assert_eq!(cursor.node().is_named(), false); + + assert!(cursor.goto_next_sibling()); + assert_eq!(cursor.node().kind(), "type_identifier"); + assert_eq!(cursor.node().is_named(), true); + + assert!(cursor.goto_next_sibling()); + assert_eq!(cursor.node().kind(), "field_declaration_list"); + assert_eq!(cursor.node().is_named(), true); + } + + #[test] + fn test_custom_utf8_input() { + struct LineBasedInput { + lines: &'static [&'static str], + row: usize, + column: usize, + } + + impl Utf8Input for LineBasedInput { + fn read(&mut self) -> &[u8] { + if self.row < self.lines.len() { + let result = &self.lines[self.row].as_bytes()[self.column..]; + self.row += 1; + self.column = 0; + result + } else { + &[] + } + } + + fn seek(&mut self, _byte: u32, position: Point) { + self.row = position.row as usize; + self.column = position.column as usize; + } + } + + let mut parser = Parser::new(); + parser.set_language(rust()); + + let mut input = LineBasedInput { + lines: &[ + "pub fn main() {", + "}", + ], + row: 0, + column: 0 + }; + + let tree = parser.parse_utf8(&mut input, None).unwrap(); + let root = tree.root_node(); + assert_eq!(root.kind(), "source_file"); + assert_eq!(root.has_error(), false); + + let child = root.child(0).unwrap(); + assert_eq!(child.kind(), "function_item"); + } } From e10a817704c3982b4ed41928b2b504cdbdbaf702 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 18 May 2018 10:55:42 -0700 Subject: [PATCH 018/102] Switch back to default c compiler on travis --- 
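[Editorial note on this revert, not part of the patch: the `CC=clang-3.6` pin added in patch 010 became unnecessary once the build script probed its flags with `flag_if_supported` (patch 013) — unsupported flags are simply dropped for whatever compiler `cc` detects, so the Travis default toolchain builds cleanly. A minimal sketch of that behavior:]

```rust
// Each flag is tried against the detected compiler and silently skipped
// if rejected (e.g. both of these on MSVC), so no compiler pin is needed.
extern crate cc;

fn main() {
    cc::Build::new()
        .flag_if_supported("-std=c99")
        .flag_if_supported("-Wno-unused-parameter")
        .file("vendor/tree-sitter/src/runtime/parser.c")
        .compile("runtime-sketch");
}
```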
.travis.yml | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/.travis.yml b/.travis.yml index 5b99d596..10fcfe94 100644 --- a/.travis.yml +++ b/.travis.yml @@ -4,21 +4,11 @@ rust: - stable env: - - CC=clang-3.6 RUST_TREE_SITTER_TEST=1 + - RUST_TREE_SITTER_TEST=1 before_install: - ./script/fetch-test-fixtures.sh -compiler: clang-3.6 - -addons: - apt: - sources: - - llvm-toolchain-precise-3.6 - - ubuntu-toolchain-r-test - packages: - - clang-3.6 - branches: only: - master From 870dc11f791425f441eb6e84f86332f4a6b1a21a Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 18 May 2018 11:15:37 -0700 Subject: [PATCH 019/102] Implement Eq and Debug for Node --- src/lib.rs | 66 ++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 54 insertions(+), 12 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index ef53e4de..2ecc7341 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,13 +1,11 @@ mod ffi; +use std::fmt; use std::ffi::CStr; use std::marker::PhantomData; use std::os::raw::{c_char, c_int, c_void}; use std::ptr; -#[derive(Clone, Copy)] -pub struct Symbol(ffi::TSSymbol); - pub type Language = *const ffi::TSLanguage; pub trait Utf16Input { @@ -26,13 +24,13 @@ pub enum LogType { Lex, } -#[derive(Clone, Copy, PartialEq, Eq)] +#[derive(Clone, Copy, Debug, PartialEq, Eq)] pub struct Point { pub row: u32, pub column: u32, } -#[derive(Clone, Copy, PartialEq, Eq)] +#[derive(Clone, Copy, Debug, PartialEq, Eq)] pub struct InputEdit { pub start_byte: u32, pub old_end_byte: u32, @@ -63,9 +61,19 @@ impl Parser { } } - pub fn set_language(&mut self, language: Language) { + pub fn set_language(&mut self, language: Language) -> Result<(), String> { unsafe { - ffi::ts_parser_set_language(self.0, language); + let version = ffi::ts_language_version(language) as usize; + if version == ffi::TREE_SITTER_LANGUAGE_VERSION { + ffi::ts_parser_set_language(self.0, language); + Ok(()) + } else { + Err(format!( + "Incompatible language version {}. 
Expected {}.", + version, + ffi::TREE_SITTER_LANGUAGE_VERSION + )) + } } } @@ -253,6 +261,10 @@ impl<'a> Node<'a> { } } + pub fn kind_id(&self) -> u16 { + unsafe { ffi::ts_node_symbol(self.0) } + } + pub fn kind(&self) -> &'static str { unsafe { CStr::from_ptr(ffi::ts_node_type(self.0)) }.to_str().unwrap() } @@ -330,6 +342,8 @@ impl<'a> Node<'a> { } pub fn to_sexp(&self) -> String { + extern "C" { fn free(pointer: *mut c_void); } + let c_string = unsafe { ffi::ts_node_string(self.0) }; let result = unsafe { CStr::from_ptr(c_string) }.to_str().unwrap().to_string(); unsafe { free(c_string as *mut c_void) }; @@ -337,7 +351,17 @@ impl<'a> Node<'a> { } } -extern "C" { fn free(pointer: *mut c_void); } +impl<'a> PartialEq for Node<'a> { + fn eq(&self, other: &Self) -> bool { + self.0.id == other.0.id + } +} + +impl<'a> fmt::Debug for Node<'a> { + fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> { + write!(f, "{{Node {} {} - {}}}", self.kind(), self.start_position(), self.end_position()) + } +} impl<'a> TreeCursor<'a> { pub fn node(&'a self) -> Node<'a> { @@ -375,6 +399,12 @@ impl<'a> Drop for TreeCursor<'a> { } } +impl fmt::Display for Point { + fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> { + write!(f, "({}, {})", self.row, self.column) + } +} + impl Into for Point { fn into(self) -> ffi::TSPoint { ffi::TSPoint { @@ -406,7 +436,7 @@ mod tests { #[test] fn test_basic_parsing() { let mut parser = Parser::new(); - parser.set_language(rust()); + parser.set_language(rust()).unwrap(); let tree = parser.parse_str(" struct Stuff {} @@ -428,7 +458,7 @@ mod tests { #[test] fn test_logging() { let mut parser = Parser::new(); - parser.set_language(rust()); + parser.set_language(rust()).unwrap(); let mut messages = Vec::new(); parser.set_logger(Some(&mut |log_type, message| { @@ -447,7 +477,7 @@ mod tests { #[test] fn test_tree_cursor() { let mut parser = Parser::new(); - parser.set_language(rust()); + parser.set_language(rust()).unwrap(); let tree = parser.parse_str(" struct Stuff { @@ -502,7 +532,7 @@ mod tests { } let mut parser = Parser::new(); - parser.set_language(rust()); + parser.set_language(rust()).unwrap(); let mut input = LineBasedInput { lines: &[ @@ -521,4 +551,16 @@ mod tests { let child = root.child(0).unwrap(); assert_eq!(child.kind(), "function_item"); } + + #[test] + fn test_node_equality() { + let mut parser = Parser::new(); + parser.set_language(rust()).unwrap(); + let tree = parser.parse_str("struct A {}", None).unwrap(); + let node1 = tree.root_node(); + let node2 = tree.root_node(); + assert_eq!(node1, node2); + assert_eq!(node1.child(0).unwrap(), node2.child(0).unwrap()); + assert_ne!(node1.child(0).unwrap(), node2); + } } From a27ac49dea32cb296ff4ebdd939c7fa01a3d72e7 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 18 May 2018 11:42:13 -0700 Subject: [PATCH 020/102] Flesh out README --- README.md | 90 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 90 insertions(+) diff --git a/README.md b/README.md index 40f5624f..43270713 100644 --- a/README.md +++ b/README.md @@ -6,4 +6,94 @@ Rust Tree-sitter Rust bindings to the [Tree-sitter][] parsing library. +### Basic Usage + +First, create a parser: + +```rust +let parser = Parser::new(); +``` + +Then assign a language to the parser. Tree-sitter languages consist of generated C code. 
To use them from rust, you must declare them as `extern "C"` functions and invoke them with `unsafe`: + +```rust +extern "C" fn tree_sitter_c() -> Language; +extern "C" fn tree_sitter_rust() -> Language; +extern "C" fn tree_sitter_javascript() -> Language; + +parser.set_language(unsafe { tree_sitter_rust() }).unwrap(); +``` + +Now you can parse source code: + +```rust +let source_code = "fn test() {}"; + +let tree = parser.parse_str(source_code, None); +let root_node = tree.root_node(); +assert_eq!(root_node.kind(), "source_file"); +assert_eq!(root_node.start_position().column, 0); +assert_eq!(root_node.end_position().column, 12); +``` + +### Editing + +Once you have a syntax tree, you can update it when your source code changes: + +```rust +let new_source_code = "fn test(a: u32) {}" + +tree.edit(InputEdit { + start_byte: 8, + old_end_byte: 8, + new_end_byte: 14, + start_position: Point::new(0, 8), + old_end_position: Point::new(0, 8), + new_end_position: Point::new(0, 14), +}); +let new_tree = parser.parse_str(new_source_code, Some(tree)); +``` + +### Text Input + + +The code can be provided either as a simple string or by any type that implements Tree-sitter's `Utf8Input` or `Utf16Input` traits: + +```rust +struct LineWiseInput { + lines: &'static [&'static str], + row: usize, + column: usize, +} + +impl tree_sitter::Utf8Input for LineWiseInput { + fn read(&mut self) -> &[u8] { + if self.row < self.lines.len() { + let result = &self.lines[self.row].as_bytes()[self.column..]; + self.row += 1; + self.column = 0; + result + } else { + &[] + } + } + + fn seek(&mut self, _byte: u32, position: Point) { + self.row = position.row as usize; + self.column = position.column as usize; + } +} + +let mut input = LineBasedInput { + lines: &[ + "pub fn main() {", + "}", + ], + row: 0, + column: 0 +}; + +let tree = parser.parse_utf8(&mut input, None).unwrap(); +``` + [tree-sitter]: https://github.com/tree-sitter/tree-sitter From c0b49e99357fbe25d62d800c9da2fd47566e9b31 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 18 May 2018 11:51:46 -0700 Subject: [PATCH 021/102] Fix include globs in package manifest --- Cargo.toml | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index e20d40aa..560d9a71 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,12 +1,19 @@ [package] name = "tree-sitter" +description = "Rust bindings to the Tree-sitter parsing library" version = "0.1.0" authors = ["Max Brunsfeld "] build = "build.rs" -exclude = ["vendor/tree-sitter/**/*"] +license = "MIT" include = [ - "vendor/tree-sitter/src/runtime/*", - "vendor/tree-sitter/externals/utf8proc/utf8proc*" + "/build.rs", + "/Cargo.toml", + "/LICENSE", + "/README.md", + "/src/*", + "/vendor/tree-sitter/externals/utf8proc/utf8proc*", + "/vendor/tree-sitter/include/*", + "/vendor/tree-sitter/src/runtime/*", ] [build-dependencies] From e6d580597d5925f3d43bf01b2101d6e0ca9643fc Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 18 May 2018 12:02:52 -0700 Subject: [PATCH 022/102] Add crates.io badge to README --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 43270713..da6e1a80 100644 --- a/README.md +++ b/README.md @@ -3,6 +3,7 @@ Rust Tree-sitter [![Build Status](https://travis-ci.org/tree-sitter/rust-tree-sitter.svg)](https://travis-ci.org/tree-sitter/rust-tree-sitter) [![Build 
status](https://ci.appveyor.com/api/projects/status/d0f6vqq3rflxx3y6/branch/master?svg=true)](https://ci.appveyor.com/project/maxbrunsfeld/rust-tree-sitter/branch/master) +[![Crates.io](https://img.shields.io/crates/v/tree-sitter.svg)](https://crates.io/crates/tree-sitter) Rust bindings to the [Tree-sitter][] parsing library. From 819b14070123c4f6c61aa73c72654ce1b97fef16 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 18 May 2018 14:06:49 -0700 Subject: [PATCH 023/102] Make set_logger take a boxed function --- src/lib.rs | 68 ++++++++++++++++++++++++++++++++---------------------- 1 file changed, 41 insertions(+), 27 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 2ecc7341..5ef80f70 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -24,6 +24,8 @@ pub enum LogType { Lex, } +type Logger<'a> = Box<FnMut(LogType, &str) + 'a>; + #[derive(Clone, Copy, Debug, PartialEq, Eq)] pub struct Point { pub row: u32, @@ -44,7 +46,7 @@ pub struct Node<'a>(ffi::TSNode, PhantomData<&'a ()>); pub struct Parser(*mut ffi::TSParser); -pub struct Tree(*mut ffi::TSTree, ffi::TSInputEncoding); +pub struct Tree(*mut ffi::TSTree); pub struct TreeCursor<'a>(ffi::TSTreeCursor, PhantomData<&'a ()>); @@ -77,28 +79,42 @@ impl Parser { } } - pub fn set_logger<F: FnMut(LogType, &str) -> ()>(&mut self, logger: Option<&mut F>) { - unsafe extern "C" fn log<F: FnMut(LogType, &str) -> ()>( - payload: *mut c_void, - c_log_type: ffi::TSLogType, - c_message: *const c_char, - ) { - let callback = (payload as *mut F).as_mut().unwrap(); - if let Ok(message) = CStr::from_ptr(c_message).to_str() { - let log_type = if c_log_type == ffi::TSLogType_TSLogTypeParse { - LogType::Parse - } else { - LogType::Lex - }; - callback(log_type, message); - } - }; + pub fn logger(&self) -> Option<&Logger> { + let logger = unsafe { ffi::ts_parser_logger(self.0) }; + unsafe { (logger.payload as *mut Logger).as_ref() } + } + + pub fn set_logger(&mut self, logger: Option<Logger>) { + let prev_logger = unsafe { ffi::ts_parser_logger(self.0) }; + if !prev_logger.payload.is_null() { + unsafe { Box::from_raw(prev_logger.payload as *mut Logger) }; + } let c_logger; if let Some(logger) = logger { + let container = Box::new(logger); + + unsafe extern "C" fn log( + payload: *mut c_void, + c_log_type: ffi::TSLogType, + c_message: *const c_char, + ) { + let callback = (payload as *mut Logger).as_mut().unwrap(); + if let Ok(message) = CStr::from_ptr(c_message).to_str() { + let log_type = if c_log_type == ffi::TSLogType_TSLogTypeParse { + LogType::Parse + } else { + LogType::Lex + }; + callback(log_type, message); + } + }; + + let raw_container = Box::into_raw(container); + c_logger = ffi::TSLogger { + payload: raw_container as *mut c_void, + log: Some(log), + }; } else { c_logger = ffi::TSLogger { payload: ptr::null_mut(), log: None }; @@ -156,7 +172,7 @@ impl Parser { if new_tree_ptr.is_null() { None } else { - Some(Tree(new_tree_ptr, ffi::TSInputEncoding_TSInputEncodingUTF8)) + Some(Tree(new_tree_ptr)) } } @@ -204,16 +220,14 @@ impl Parser { if new_tree_ptr.is_null() { None } else { - Some(Tree( - new_tree_ptr, - ffi::TSInputEncoding_TSInputEncodingUTF16, - )) + Some(Tree(new_tree_ptr)) } } } impl Drop for Parser { fn drop(&mut self) { + self.set_logger(None); unsafe { ffi::ts_parser_delete(self.0) } } } @@ -248,7 +262,7 @@ impl Drop for Tree { impl Clone for Tree { fn clone(&self) -> Tree { - unsafe { Tree(ffi::ts_tree_copy(self.0), self.1) } + unsafe { Tree(ffi::ts_tree_copy(self.0)) } } } @@ -461,9 +475,9 @@ mod tests { parser.set_language(rust()).unwrap(); let mut messages =
Vec::new(); - parser.set_logger(Some(&mut |log_type, message| { + parser.set_logger(Some(Box::new(|log_type, message| { messages.push((log_type, message.to_string())); - })); + }))); parser.parse_str(" struct Stuff {} From 4da669ce8d23cbfaeaba2d2c5969b678779ff0e9 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 18 May 2018 14:27:08 -0700 Subject: [PATCH 024/102] Fix bugs in editing/reparsing --- README.md | 2 +- src/lib.rs | 101 ++++++++++++++++++++++++++++++++++++++++----- vendor/tree-sitter | 2 +- 3 files changed, 92 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index da6e1a80..d0806bbb 100644 --- a/README.md +++ b/README.md @@ -52,7 +52,7 @@ tree.edit(InputEdit { old_end_position: Point::new(0, 8), new_end_position: Point::new(0, 14), }); -let new_tree = parser.parse_str(new_source_code, Some(tree)); +let new_tree = parser.parse_str(new_source_code, Some(&tree)); ``` ### Text Input diff --git a/src/lib.rs b/src/lib.rs index 5ef80f70..0ac1300e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -123,7 +123,7 @@ impl Parser { unsafe { ffi::ts_parser_set_logger(self.0, c_logger) }; } - pub fn parse_str(&mut self, input: &str, old_tree: Option) -> Option { + pub fn parse_str(&mut self, input: &str, old_tree: Option<&Tree>) -> Option { let mut input = FlatInput { bytes: input.as_bytes(), offset: 0}; self.parse_utf8(&mut input, old_tree) } @@ -131,7 +131,7 @@ impl Parser { pub fn parse_utf8( &mut self, input: &mut T, - old_tree: Option, + old_tree: Option<&Tree>, ) -> Option { unsafe extern "C" fn read( payload: *mut c_void, @@ -179,7 +179,7 @@ impl Parser { pub fn parse_utf16( &mut self, input: &mut T, - old_tree: Option, + old_tree: Option<&Tree>, ) -> Option { unsafe extern "C" fn read( payload: *mut c_void, @@ -266,7 +266,7 @@ impl Clone for Tree { } } -impl<'a> Node<'a> { +impl<'tree> Node<'tree> { fn new(node: ffi::TSNode) -> Option { if node.id.is_null() { None @@ -319,7 +319,7 @@ impl<'a> Node<'a> { } } - pub fn child(&self, i: u32) -> Option { + pub fn child(&self, i: u32) -> Option { Self::new(unsafe { ffi::ts_node_child(self.0, i) }) } @@ -327,7 +327,7 @@ impl<'a> Node<'a> { unsafe { ffi::ts_node_child_count(self.0) } } - pub fn named_child(&self, i: u32) -> Option { + pub fn named_child<'a>(&'a self, i: u32) -> Option { Self::new(unsafe { ffi::ts_node_named_child(self.0, i) }) } @@ -335,23 +335,23 @@ impl<'a> Node<'a> { unsafe { ffi::ts_node_named_child_count(self.0) } } - pub fn parent(&self) -> Option { + pub fn parent(&self) -> Option { Self::new(unsafe { ffi::ts_node_parent(self.0) }) } - pub fn next_sibling(&self) -> Option { + pub fn next_sibling(&self) -> Option { Self::new(unsafe { ffi::ts_node_next_sibling(self.0) }) } - pub fn prev_sibling(&self) -> Option { + pub fn prev_sibling(&self) -> Option { Self::new(unsafe { ffi::ts_node_prev_sibling(self.0) }) } - pub fn next_named_sibling(&self) -> Option { + pub fn next_named_sibling(&self) -> Option { Self::new(unsafe { ffi::ts_node_next_named_sibling(self.0) }) } - pub fn prev_named_sibling(&self) -> Option { + pub fn prev_named_sibling(&self) -> Option { Self::new(unsafe { ffi::ts_node_prev_named_sibling(self.0) }) } @@ -413,6 +413,12 @@ impl<'a> Drop for TreeCursor<'a> { } } +impl Point { + pub fn new(row: u32, column: u32) -> Self { + Point { row, column } + } +} + impl fmt::Display for Point { fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> { write!(f, "({}, {})", self.row, self.column) @@ -577,4 +583,77 @@ mod tests { assert_eq!(node1.child(0).unwrap(), node2.child(0).unwrap()); 
assert_ne!(node1.child(0).unwrap(), node2); } + + #[test] + fn test_editing() { + struct SpyInput { + bytes: &'static [u8], + offset: usize, + bytes_read: Vec, + } + + impl Utf8Input for SpyInput { + fn read(&mut self) -> &[u8] { + if self.offset < self.bytes.len() { + let result = &self.bytes[self.offset..self.offset + 1]; + self.bytes_read.extend(result.iter()); + self.offset += 1; + result + } else { + &[] + } + } + + fn seek(&mut self, byte: u32, _position: Point) { + self.offset = byte as usize; + } + } + + let mut input = SpyInput { + bytes: "fn test(a: A, c: C) {}".as_bytes(), + offset: 0, + bytes_read: Vec::new(), + }; + + let mut parser = Parser::new(); + parser.set_language(rust()).unwrap(); + + let mut tree = parser.parse_utf8(&mut input, None).unwrap(); + let parameters_sexp = tree.root_node() + .named_child(0).unwrap() + .named_child(1).unwrap() + .to_sexp(); + assert_eq!( + parameters_sexp, + "(parameters (parameter (identifier) (type_identifier)) (parameter (identifier) (type_identifier)))" + ); + + input.offset = 0; + input.bytes_read.clear(); + input.bytes = "fn test(a: A, b: B, c: C) {}".as_bytes(); + tree.edit(&InputEdit{ + start_byte: 14, + old_end_byte: 14, + new_end_byte: 20, + start_position: Point::new(0, 14), + old_end_position: Point::new(0, 14), + new_end_position: Point::new(0, 20), + }); + + let tree = parser.parse_utf8(&mut input, Some(&tree)).unwrap(); + let parameters_sexp = tree.root_node() + .named_child(0).unwrap() + .named_child(1).unwrap() + .to_sexp(); + assert_eq!( + parameters_sexp, + "(parameters (parameter (identifier) (type_identifier)) (parameter (identifier) (type_identifier)) (parameter (identifier) (type_identifier)))" + ); + + let retokenized_content = String::from_utf8(input.bytes_read).unwrap(); + assert!(retokenized_content.contains("b: B")); + assert!(!retokenized_content.contains("a: A")); + assert!(!retokenized_content.contains("c: C")); + assert!(!retokenized_content.contains("{}")); + } } diff --git a/vendor/tree-sitter b/vendor/tree-sitter index 9c1e82a7..78f28b14 160000 --- a/vendor/tree-sitter +++ b/vendor/tree-sitter @@ -1 +1 @@ -Subproject commit 9c1e82a7eac97767cee0469faa2722fd5753b065 +Subproject commit 78f28b14ce519ba085ab7886c2fc19739f7f7da0 From 45660e7b4e5db579905924717fa4da22f6a1d97d Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 18 May 2018 14:27:08 -0700 Subject: [PATCH 025/102] Make syntax trees implement Send --- src/lib.rs | 107 +++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 84 insertions(+), 23 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 0ac1300e..6084516c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -254,6 +254,14 @@ impl Tree { } } +unsafe impl Send for Tree {} + +impl fmt::Debug for Tree { + fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> { + write!(f, "{{Tree {:?}}}", self.root_node()) + } +} + impl Drop for Tree { fn drop(&mut self) { unsafe { ffi::ts_tree_delete(self.0) } @@ -448,6 +456,7 @@ impl<'a> Utf8Input for FlatInput<'a> { #[cfg(test)] mod tests { + use std::thread; use super::*; fn rust() -> Language { unsafe { tree_sitter_rust() } } @@ -586,29 +595,6 @@ mod tests { #[test] fn test_editing() { - struct SpyInput { - bytes: &'static [u8], - offset: usize, - bytes_read: Vec, - } - - impl Utf8Input for SpyInput { - fn read(&mut self) -> &[u8] { - if self.offset < self.bytes.len() { - let result = &self.bytes[self.offset..self.offset + 1]; - self.bytes_read.extend(result.iter()); - self.offset += 1; - result - } else { - &[] - } - } - - fn seek(&mut 
self, byte: u32, _position: Point) { - self.offset = byte as usize; - } - } - let mut input = SpyInput { bytes: "fn test(a: A, c: C) {}".as_bytes(), offset: 0, @@ -656,4 +642,79 @@ mod tests { assert!(!retokenized_content.contains("c: C")); assert!(!retokenized_content.contains("{}")); } + + #[test] + fn test_parallel_parsing() { + // Parse this source file so that each thread has a non-trivial amount of + // work to do. + let this_file_source = include_str!("lib.rs"); + + let mut parser = Parser::new(); + parser.set_language(rust()).unwrap(); + let tree = parser.parse_str(this_file_source, None).unwrap(); + + let mut parse_threads = Vec::new(); + for thread_id in 1..5 { + let mut tree_clone = tree.clone(); + parse_threads.push(thread::spawn(move || { + + // For each thread, prepend a different number of declarations to the + // source code. + let mut prepend_line_count = 0; + let mut prepended_source = String::new(); + for _ in 0..thread_id { + prepend_line_count += 2; + prepended_source += "struct X {}\n\n"; + } + + tree_clone.edit(&InputEdit{ + start_byte: 0, + old_end_byte: 0, + new_end_byte: prepended_source.len() as u32, + start_position: Point::new(0, 0), + old_end_position: Point::new(0, 0), + new_end_position: Point::new(prepend_line_count, 0), + }); + prepended_source += this_file_source; + + // Reparse using the old tree as a starting point. + let mut parser = Parser::new(); + parser.set_language(rust()).unwrap(); + parser.parse_str(&prepended_source, Some(&tree_clone)).unwrap() + })); + } + + // Check that the trees have the expected relationship to one another. + let trees = parse_threads + .into_iter() + .map(|thread| thread.join().unwrap()); + let child_count_differences = trees + .map(|t| t.root_node().child_count() - tree.root_node().child_count()) + .collect::>(); + + assert_eq!(child_count_differences, &[1, 2, 3, 4]); + } + + struct SpyInput { + bytes: &'static [u8], + offset: usize, + bytes_read: Vec, + } + + impl Utf8Input for SpyInput { + fn read(&mut self) -> &[u8] { + if self.offset < self.bytes.len() { + let result = &self.bytes[self.offset..self.offset + 1]; + self.bytes_read.extend(result.iter()); + self.offset += 1; + result + } else { + &[] + } + } + + fn seek(&mut self, byte: u32, _position: Point) { + self.offset = byte as usize; + } + } } From 0034fce8093374bc5193727c96d45d98b9816a32 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 18 May 2018 15:05:31 -0700 Subject: [PATCH 026/102] Add some fields to the cargo manifest --- Cargo.toml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Cargo.toml b/Cargo.toml index 560d9a71..13c84759 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -5,6 +5,10 @@ version = "0.1.0" authors = ["Max Brunsfeld "] build = "build.rs" license = "MIT" +readme = "README.md" +keywords = ["incremental", "parsing"] +categories = ["parsing", "text editors", "api bindings"] + include = [ "/build.rs", "/Cargo.toml", From 16a7366ec75f5c03d497a12bb796d883bfd32466 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 18 May 2018 15:06:05 -0700 Subject: [PATCH 027/102] 0.1.1 --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 13c84759..12d92923 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "tree-sitter" description = "Rust bindings to the Tree-sitter parsing library" -version = "0.1.0" +version = "0.1.1" authors = ["Max Brunsfeld "] build = "build.rs" license = "MIT" From 5efc28f2f3741e9f3b1ff376be5de2890df80ed0 Mon Sep 17 00:00:00 2001 From: Max 
Brunsfeld Date: Tue, 19 Jun 2018 16:19:37 -0700 Subject: [PATCH 028/102] Update to latest tree-sitter API --- README.md | 66 ++++++----- build.rs | 1 - src/bindings.rs | 22 +++- src/lib.rs | 273 +++++++++++++++++++++------------------------ vendor/tree-sitter | 2 +- 5 files changed, 175 insertions(+), 189 deletions(-) diff --git a/README.md b/README.md index d0806bbb..ff7140c5 100644 --- a/README.md +++ b/README.md @@ -12,6 +12,10 @@ Rust bindings to the [Tree-sitter][] parsing library. First, create a parser: ```rust +use tree_sitter::{Parser, Language}; + +// ... + let parser = Parser::new(); ``` @@ -22,16 +26,17 @@ extern "C" fn tree_sitter_c() -> Language; extern "C" fn tree_sitter_rust() -> Language; extern "C" fn tree_sitter_javascript() -> Language; -parser.set_language(unsafe { tree_sitter_rust() }).unwrap(); +let language = unsafe { tree_sitter_rust() }; +parser.set_language(language).unwrap(); ``` Now you can parse source code: ```rust let source_code = "fn test() {}"; - let tree = parser.parse_str(source_code, None); let root_node = tree.root_node(); + assert_eq!(root_node.kind(), "source_file"); assert_eq!(root_node.start_position().column, 0); assert_eq!(root_node.end_position().column, 12); @@ -39,7 +44,7 @@ assert_eq!(root_node.end_position().column, 12); ### Editing -Once you have a syntax tree, you can update it when your source code changes: +Once you have a syntax tree, you can update it when your source code changes. Passing in the previous edited tree makes `parse` run much more quickly: ```rust let new_source_code = "fn test(a: u32) {}" @@ -52,49 +57,42 @@ tree.edit(InputEdit { old_end_position: Point::new(0, 8), new_end_position: Point::new(0, 14), }); + let new_tree = parser.parse_str(new_source_code, Some(&tree)); ``` ### Text Input - -The code can be provided either as a simple string or by any type that implements Tree-sitter's `Utf8Input` or `Utf16Input` traits: +The source code to parse can be provided either as a string or as a function that returns text encoded as either UTF8 or UTF16: ```rust -struct LineWiseInput { - lines: &'static [&'static str], - row: usize, - column: usize, -} +// Store some source code in an array of lines. +let lines = &[ + "pub fn foo() {", + " 1", + "}", +]; -impl tree_sitter::Utf8Input for LineWiseInput { - fn read(&mut self) -> &[u8] { - if self.row < self.lines.len() { - let result = &self.lines[self.row].as_bytes()[self.column..]; - self.row += 1; - self.column = 0; - result +// Parse the source code using a custom callback. The callback is called +// with both a byte offset and a row/column offset. +let tree = parser.parse_utf8(&mut |_byte: u32, position: Point| -> &[u8] { + let row = position.row as usize; + let column = position.column as usize; + if row < lines.len() { + if column < lines[row].as_bytes().len() { + &lines[row].as_bytes()[column..] 
} else { - &[] + "\n".as_bytes() } + } else { + &[] } +}, None).unwrap(); - fn seek(&mut self, _byte: u32, position: Point) { - self.row = position.row as usize; - self.column = position.column as usize; - } -} - -let mut input = LineBasedInput { - lines: &[ - "pub fn main() {", - "}", - ], - row: 0, - column: 0 -}; - -let tree = parser.parse_utf8(&mut input, None).unwrap(); +assert_eq!( + tree.root_node().to_sexp(), + "(source_file (function_item (visibility_modifier) (identifier) (parameters) (block (number_literal))))" +); ``` [tree-sitter]: https://github.com/tree-sitter/tree-sitter diff --git a/build.rs b/build.rs index 2843c758..7d9ee83e 100644 --- a/build.rs +++ b/build.rs @@ -22,7 +22,6 @@ fn main() { "node.c", "parser.c", "stack.c", - "string_input.c", "subtree.c", "tree_cursor.c", "tree.c", diff --git a/src/bindings.rs b/src/bindings.rs index 1ab49bde..b2d83729 100644 --- a/src/bindings.rs +++ b/src/bindings.rs @@ -41,15 +41,12 @@ pub struct TSRange { pub struct TSInput { pub payload: *mut ::std::os::raw::c_void, pub read: ::std::option::Option< - unsafe extern "C" fn(payload: *mut ::std::os::raw::c_void, bytes_read: *mut u32) - -> *const ::std::os::raw::c_char, - >, - pub seek: ::std::option::Option< unsafe extern "C" fn( payload: *mut ::std::os::raw::c_void, byte_index: u32, position: TSPoint, - ) -> ::std::os::raw::c_int, + bytes_read: *mut u32, + ) -> *const ::std::os::raw::c_char, >, pub encoding: TSInputEncoding, } @@ -127,6 +124,21 @@ extern "C" { arg4: u32, ) -> *mut TSTree; } +extern "C" { + pub fn ts_parser_enabled(arg1: *const TSParser) -> bool; +} +extern "C" { + pub fn ts_parser_set_enabled(arg1: *mut TSParser, arg2: bool); +} +extern "C" { + pub fn ts_parser_operation_limit(arg1: *const TSParser) -> usize; +} +extern "C" { + pub fn ts_parser_set_operation_limit(arg1: *mut TSParser, arg2: usize); +} +extern "C" { + pub fn ts_parser_reset(arg1: *mut TSParser); +} extern "C" { pub fn ts_tree_copy(arg1: *const TSTree) -> *mut TSTree; } diff --git a/src/lib.rs b/src/lib.rs index 6084516c..84d51f04 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -3,21 +3,11 @@ mod ffi; use std::fmt; use std::ffi::CStr; use std::marker::PhantomData; -use std::os::raw::{c_char, c_int, c_void}; +use std::os::raw::{c_char, c_void}; use std::ptr; pub type Language = *const ffi::TSLanguage; -pub trait Utf16Input { - fn read(&mut self) -> &[u16]; - fn seek(&mut self, u32, Point); -} - -pub trait Utf8Input { - fn read(&mut self) -> &[u8]; - fn seek(&mut self, u32, Point); -} - #[derive(Debug, PartialEq, Eq)] pub enum LogType { Parse, @@ -50,11 +40,6 @@ pub struct Tree(*mut ffi::TSTree); pub struct TreeCursor<'a>(ffi::TSTreeCursor, PhantomData<&'a ()>); -struct FlatInput<'a> { - bytes: &'a [u8], - offset: usize, -} - impl Parser { pub fn new() -> Parser { unsafe { @@ -124,105 +109,86 @@ impl Parser { } pub fn parse_str(&mut self, input: &str, old_tree: Option<&Tree>) -> Option { - let mut input = FlatInput { bytes: input.as_bytes(), offset: 0}; - self.parse_utf8(&mut input, old_tree) + let bytes = input.as_bytes(); + self.parse_utf8(&mut |offset, _| &bytes[(offset as usize)..], old_tree) } - pub fn parse_utf8( + pub fn parse_utf8<'a, T: 'a + FnMut(u32, Point) -> &'a [u8]>( &mut self, input: &mut T, old_tree: Option<&Tree>, ) -> Option { - unsafe extern "C" fn read( + unsafe extern "C" fn read<'a, T: 'a + FnMut(u32, Point) -> &'a [u8]>( payload: *mut c_void, + byte_offset: u32, + position: ffi::TSPoint, bytes_read: *mut u32, ) -> *const c_char { let input = (payload as *mut T).as_mut().unwrap(); - let 
result = input.read(); + let result = (*input)(byte_offset, position.into()); *bytes_read = result.len() as u32; return result.as_ptr() as *const c_char; }; - unsafe extern "C" fn seek( - payload: *mut c_void, - byte: u32, - position: ffi::TSPoint, - ) -> c_int { - let input = (payload as *mut T).as_mut().unwrap(); - input.seek( - byte, - Point { - row: position.row, - column: position.column, - }, - ); - return 1; - }; - let c_input = ffi::TSInput { payload: input as *mut T as *mut c_void, - read: Some(read::), - seek: Some(seek::), + read: Some(read::<'a, T>), encoding: ffi::TSInputEncoding_TSInputEncodingUTF8, }; - let old_tree_ptr = old_tree.map_or(ptr::null_mut(), |t| t.0); + let c_old_tree = old_tree.map_or(ptr::null_mut(), |t| t.0); - let new_tree_ptr = unsafe { ffi::ts_parser_parse(self.0, old_tree_ptr, c_input) }; - if new_tree_ptr.is_null() { + let c_new_tree = unsafe { ffi::ts_parser_parse(self.0, c_old_tree, c_input) }; + if c_new_tree.is_null() { None } else { - Some(Tree(new_tree_ptr)) + Some(Tree(c_new_tree)) } } - pub fn parse_utf16( + pub fn parse_utf16<'a, T: 'a + FnMut(u32, Point) -> &'a [u16]>( &mut self, input: &mut T, old_tree: Option<&Tree>, ) -> Option { - unsafe extern "C" fn read( + unsafe extern "C" fn read<'a, T: 'a + FnMut(u32, Point) -> &'a [u16]>( payload: *mut c_void, + byte_offset: u32, + position: ffi::TSPoint, bytes_read: *mut u32, ) -> *const c_char { let input = (payload as *mut T).as_mut().unwrap(); - let result = input.read(); + let result = (*input)(byte_offset, Point { + row: position.row, + column: position.column / 2, + }); *bytes_read = result.len() as u32 * 2; return result.as_ptr() as *const c_char; }; - unsafe extern "C" fn seek( - payload: *mut c_void, - byte: u32, - position: ffi::TSPoint, - ) -> c_int { - let input = (payload as *mut T).as_mut().unwrap(); - input.seek( - byte / 2, - Point { - row: position.row, - column: position.column / 2, - }, - ); - return 1; - }; - let c_input = ffi::TSInput { payload: input as *mut T as *mut c_void, - read: Some(read::), - seek: Some(seek::), - encoding: ffi::TSInputEncoding_TSInputEncodingUTF8, + read: Some(read::<'a, T>), + encoding: ffi::TSInputEncoding_TSInputEncodingUTF16, }; - let old_tree_ptr = old_tree.map_or(ptr::null_mut(), |t| t.0); + let c_old_tree = old_tree.map_or(ptr::null_mut(), |t| t.0); - let new_tree_ptr = unsafe { ffi::ts_parser_parse(self.0, old_tree_ptr, c_input) }; - if new_tree_ptr.is_null() { + let c_new_tree = unsafe { ffi::ts_parser_parse(self.0, c_old_tree, c_input) }; + if c_new_tree.is_null() { None } else { - Some(Tree(new_tree_ptr)) + Some(Tree(c_new_tree)) } } + + pub fn reset(&mut self) { + unsafe { ffi::ts_parser_reset(self.0) } + } + + pub fn set_operation_limit(&mut self, limit: usize) { + unsafe { ffi::ts_parser_set_operation_limit(self.0, limit) } + } } impl Drop for Parser { @@ -442,15 +408,12 @@ impl Into for Point { } } -impl<'a> Utf8Input for FlatInput<'a> { - fn read(&mut self) -> &[u8] { - let result = &self.bytes[self.offset..]; - self.offset = self.bytes.len(); - result - } - - fn seek(&mut self, offset: u32, _position: Point) { - self.offset = offset as usize; +impl From for Point { + fn from(point: ffi::TSPoint) -> Self { + Self { + row: point.row, + column: point.column, + } } } @@ -536,49 +499,70 @@ mod tests { #[test] fn test_custom_utf8_input() { - struct LineBasedInput { - lines: &'static [&'static str], - row: usize, - column: usize, - } - - impl Utf8Input for LineBasedInput { - fn read(&mut self) -> &[u8] { - if self.row < self.lines.len() { - 
let result = &self.lines[self.row].as_bytes()[self.column..]; - self.row += 1; - self.column = 0; - result - } else { - &[] - } - } - - fn seek(&mut self, _byte: u32, position: Point) { - self.row = position.row as usize; - self.column = position.column as usize; - } - } - let mut parser = Parser::new(); parser.set_language(rust()).unwrap(); - let mut input = LineBasedInput { - lines: &[ - "pub fn main() {", - "}", - ], - row: 0, - column: 0 - }; + let lines = &[ + "pub fn foo() {", + " 1", + "}", + ]; + + let tree = parser.parse_utf8(&mut |_, position| { + let row = position.row as usize; + let column = position.column as usize; + if row < lines.len() { + if column < lines[row].as_bytes().len() { + &lines[row].as_bytes()[column..] + } else { + "\n".as_bytes() + } + } else { + &[] + } + }, None).unwrap(); - let tree = parser.parse_utf8(&mut input, None).unwrap(); let root = tree.root_node(); + assert_eq!(root.to_sexp(), "(source_file (function_item (visibility_modifier) (identifier) (parameters) (block (number_literal))))"); assert_eq!(root.kind(), "source_file"); assert_eq!(root.has_error(), false); + assert_eq!(root.child(0).unwrap().kind(), "function_item"); + } - let child = root.child(0).unwrap(); - assert_eq!(child.kind(), "function_item"); + #[test] + fn test_custom_utf16_input() { + let mut parser = Parser::new(); + parser.set_language(rust()).unwrap(); + + parser.set_logger(Some(Box::new(|t, message| { + println!("log: {:?} {}", t, message); + }))); + + let lines: Vec> = [ + "pub fn foo() {", + " 1", + "}" + ].iter().map(|s| s.encode_utf16().collect()).collect(); + + let tree = parser.parse_utf16(&mut |_, position| { + let row = position.row as usize; + let column = position.column as usize; + if row < lines.len() { + if column < lines[row].len() { + &lines[row][column..] 
+ } else { + &[10] + } + } else { + &[] + } + }, None).unwrap(); + + let root = tree.root_node(); + assert_eq!(root.to_sexp(), "(source_file (function_item (visibility_modifier) (identifier) (parameters) (block (number_literal))))"); + assert_eq!(root.kind(), "source_file"); + assert_eq!(root.has_error(), false); + assert_eq!(root.child(0).unwrap().kind(), "function_item"); } #[test] @@ -595,16 +579,23 @@ mod tests { #[test] fn test_editing() { - let mut input = SpyInput { - bytes: "fn test(a: A, c: C) {}".as_bytes(), - offset: 0, - bytes_read: Vec::new(), - }; - let mut parser = Parser::new(); parser.set_language(rust()).unwrap(); - let mut tree = parser.parse_utf8(&mut input, None).unwrap(); + let mut input_bytes = "fn test(a: A, c: C) {}".as_bytes(); + let mut input_bytes_read = Vec::new(); + + let mut tree = parser.parse_utf8(&mut |offset, _| { + let offset = offset as usize; + if offset < input_bytes.len() { + let result = &input_bytes[offset..offset + 1]; + input_bytes_read.extend(result.iter()); + result + } else { + &[] + } + }, None).unwrap(); + let parameters_sexp = tree.root_node() .named_child(0).unwrap() .named_child(1).unwrap() @@ -614,9 +605,8 @@ mod tests { "(parameters (parameter (identifier) (type_identifier)) (parameter (identifier) (type_identifier)))" ); - input.offset = 0; - input.bytes_read.clear(); - input.bytes = "fn test(a: A, b: B, c: C) {}".as_bytes(); + input_bytes_read.clear(); + input_bytes = "fn test(a: A, b: B, c: C) {}".as_bytes(); tree.edit(&InputEdit{ start_byte: 14, old_end_byte: 14, @@ -626,7 +616,17 @@ mod tests { new_end_position: Point::new(0, 20), }); - let tree = parser.parse_utf8(&mut input, Some(&tree)).unwrap(); + let tree = parser.parse_utf8(&mut |offset, _| { + let offset = offset as usize; + if offset < input_bytes.len() { + let result = &input_bytes[offset..offset + 1]; + input_bytes_read.extend(result.iter()); + result + } else { + &[] + } + }, Some(&tree)).unwrap(); + let parameters_sexp = tree.root_node() .named_child(0).unwrap() .named_child(1).unwrap() @@ -636,7 +636,7 @@ mod tests { "(parameters (parameter (identifier) (type_identifier)) (parameter (identifier) (type_identifier)) (parameter (identifier) (type_identifier)))" ); - let retokenized_content = String::from_utf8(input.bytes_read).unwrap(); + let retokenized_content = String::from_utf8(input_bytes_read).unwrap(); assert!(retokenized_content.contains("b: B")); assert!(!retokenized_content.contains("a: A")); assert!(!retokenized_content.contains("c: C")); @@ -694,27 +694,4 @@ mod tests { assert_eq!(child_count_differences, &[1, 2, 3, 4]); } - - struct SpyInput { - bytes: &'static [u8], - offset: usize, - bytes_read: Vec, - } - - impl Utf8Input for SpyInput { - fn read(&mut self) -> &[u8] { - if self.offset < self.bytes.len() { - let result = &self.bytes[self.offset..self.offset + 1]; - self.bytes_read.extend(result.iter()); - self.offset += 1; - result - } else { - &[] - } - } - - fn seek(&mut self, byte: u32, _position: Point) { - self.offset = byte as usize; - } - } } diff --git a/vendor/tree-sitter b/vendor/tree-sitter index 78f28b14..26ab57a6 160000 --- a/vendor/tree-sitter +++ b/vendor/tree-sitter @@ -1 +1 @@ -Subproject commit 78f28b14ce519ba085ab7886c2fc19739f7f7da0 +Subproject commit 26ab57a6562aaeb48b579e3ca29eb064925e857c From 86c8206e35757694d37d3fe627236d22a75eb3ec Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Tue, 19 Jun 2018 16:20:58 -0700 Subject: [PATCH 029/102] 0.2.0 --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/Cargo.toml b/Cargo.toml index 12d92923..bfc6b2e2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "tree-sitter" description = "Rust bindings to the Tree-sitter parsing library" -version = "0.1.1" +version = "0.2.0" authors = ["Max Brunsfeld "] build = "build.rs" license = "MIT" From 2eff3225bac3422b19fc442482eb45f0462fa478 Mon Sep 17 00:00:00 2001 From: Stephan Renatus Date: Thu, 28 Jun 2018 10:25:01 +0200 Subject: [PATCH 030/102] README.md: small fixes To call .set_language on parser, it needs to be mut; also, the syntax for the extern "C" blocks seemed to be a bit off. Both now corresponds to what's in the tests. Signed-off-by: Stephan Renatus --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index ff7140c5..449c6c46 100644 --- a/README.md +++ b/README.md @@ -16,15 +16,15 @@ use tree_sitter::{Parser, Language}; // ... -let parser = Parser::new(); +let mut parser = Parser::new(); ``` Then assign a language to the parser. Tree-sitter languages consist of generated C code. To use them from rust, you must declare them as `extern "C"` functions and invoke them with `unsafe`: ```rust -extern "C" fn tree_sitter_c() -> Language; -extern "C" fn tree_sitter_rust() -> Language; -extern "C" fn tree_sitter_javascript() -> Language; +extern "C" { fn tree_sitter_c() -> Language; } +extern "C" { fn tree_sitter_rust() -> Language; } +extern "C" { fn tree_sitter_javascript() -> Language; } let language = unsafe { tree_sitter_rust() }; parser.set_language(language).unwrap(); From c477e45fccf746fcb9335ba777ace035a6292a48 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 20 Jul 2018 13:32:22 -0700 Subject: [PATCH 031/102] Update to the latest Tree-sitter --- src/bindings.rs | 30 +++++++++++++++++++++++++----- src/lib.rs | 6 +++++- vendor/tree-sitter | 2 +- 3 files changed, 31 insertions(+), 7 deletions(-) diff --git a/src/bindings.rs b/src/bindings.rs index b2d83729..58d0e510 100644 --- a/src/bindings.rs +++ b/src/bindings.rs @@ -33,8 +33,10 @@ pub struct TSPoint { #[repr(C)] #[derive(Debug, Copy, Clone)] pub struct TSRange { - pub start: TSPoint, - pub end: TSPoint, + pub start_point: TSPoint, + pub end_point: TSPoint, + pub start_byte: u32, + pub end_byte: u32, } #[repr(C)] #[derive(Debug, Copy, Clone)] @@ -80,7 +82,7 @@ pub struct TSInputEdit { pub struct TSNode { pub context: [u32; 4usize], pub id: *const ::std::os::raw::c_void, - pub tree: *const ::std::os::raw::c_void, + pub tree: *const TSTree, } #[repr(C)] #[derive(Debug, Copy, Clone)] @@ -139,6 +141,12 @@ extern "C" { extern "C" { pub fn ts_parser_reset(arg1: *mut TSParser); } +extern "C" { + pub fn ts_parser_set_included_ranges(arg1: *mut TSParser, arg2: *const TSRange, arg3: u32); +} +extern "C" { + pub fn ts_parser_included_ranges(arg1: *const TSParser, arg2: *mut u32) -> *const TSRange; +} extern "C" { pub fn ts_tree_copy(arg1: *const TSTree) -> *mut TSTree; } @@ -161,6 +169,9 @@ extern "C" { extern "C" { pub fn ts_tree_print_dot_graph(arg1: *const TSTree, arg2: *mut FILE); } +extern "C" { + pub fn ts_tree_language(arg1: *const TSTree) -> *const TSLanguage; +} extern "C" { pub fn ts_node_start_byte(arg1: TSNode) -> u32; } @@ -251,7 +262,10 @@ extern "C" { ) -> TSNode; } extern "C" { - pub fn ts_tree_cursor_new(arg1: *const TSTree) -> TSTreeCursor; + pub fn ts_node_edit(arg1: *mut TSNode, arg2: *const TSInputEdit); +} +extern "C" { + pub fn ts_tree_cursor_new(arg1: TSNode) -> TSTreeCursor; } extern "C" { pub fn ts_tree_cursor_delete(arg1: *mut 
TSTreeCursor); @@ -280,6 +294,12 @@ extern "C" { arg2: TSSymbol, ) -> *const ::std::os::raw::c_char; } +extern "C" { + pub fn ts_language_symbol_for_name( + arg1: *const TSLanguage, + arg2: *const ::std::os::raw::c_char, + ) -> TSSymbol; +} extern "C" { pub fn ts_language_symbol_type(arg1: *const TSLanguage, arg2: TSSymbol) -> TSSymbolType; } @@ -287,4 +307,4 @@ extern "C" { pub fn ts_language_version(arg1: *const TSLanguage) -> u32; } -pub const TREE_SITTER_LANGUAGE_VERSION: usize = 8; +pub const TREE_SITTER_LANGUAGE_VERSION: usize = 9; diff --git a/src/lib.rs b/src/lib.rs index 84d51f04..9f0ef9b9 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -216,7 +216,7 @@ impl Tree { } pub fn walk(&self) -> TreeCursor { - TreeCursor(unsafe { ffi::ts_tree_cursor_new(self.0) }, PhantomData) + self.root_node().walk() } } @@ -337,6 +337,10 @@ impl<'tree> Node<'tree> { unsafe { free(c_string as *mut c_void) }; result } + + pub fn walk(&self) -> TreeCursor<'tree> { + TreeCursor(unsafe { ffi::ts_tree_cursor_new(self.0) }, PhantomData) + } } impl<'a> PartialEq for Node<'a> { diff --git a/vendor/tree-sitter b/vendor/tree-sitter index 26ab57a6..16376c43 160000 --- a/vendor/tree-sitter +++ b/vendor/tree-sitter @@ -1 +1 @@ -Subproject commit 26ab57a6562aaeb48b579e3ca29eb064925e857c +Subproject commit 16376c43f5cc75bbc5297e6d5716bd94d55ccc05 From 47a7430da319b8e2a55cdb8998acc3f3f099a1c7 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 20 Jul 2018 13:32:56 -0700 Subject: [PATCH 032/102] 0.3.0 --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index bfc6b2e2..746d2d47 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "tree-sitter" description = "Rust bindings to the Tree-sitter parsing library" -version = "0.2.0" +version = "0.3.0" authors = ["Max Brunsfeld "] build = "build.rs" license = "MIT" From bdd52376a82ae2354b6226d9bb3b23649b81df4d Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 20 Jul 2018 13:36:12 -0700 Subject: [PATCH 033/102] Fix cargo category slugs --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 746d2d47..c2d733f2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -7,7 +7,7 @@ build = "build.rs" license = "MIT" readme = "README.md" keywords = ["incremental", "parsing"] -categories = ["parsing", "text editors", "api bindings"] +categories = ["api-bindings", "parsing", "text-editors"] include = [ "/build.rs", From 5fbb261316737117c827db935e667bcfd3932348 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 20 Jul 2018 13:36:42 -0700 Subject: [PATCH 034/102] 0.3.1 --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index c2d733f2..9adbcfd1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "tree-sitter" description = "Rust bindings to the Tree-sitter parsing library" -version = "0.3.0" +version = "0.3.1" authors = ["Max Brunsfeld "] build = "build.rs" license = "MIT" From c8125ec617ec4a3e2d93c460bcc22c89f1c06981 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Mon, 8 Oct 2018 11:32:40 -0700 Subject: [PATCH 035/102] Make Language send + sync, add language methods --- src/lib.rs | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 9f0ef9b9..434d05fb 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -6,7 +6,8 @@ use std::marker::PhantomData; use std::os::raw::{c_char, c_void}; use std::ptr; -pub type Language = *const 
ffi::TSLanguage; +#[repr(transparent)] +pub struct Language (*const ffi::TSLanguage); #[derive(Debug, PartialEq, Eq)] pub enum LogType { @@ -50,9 +51,9 @@ impl Parser { pub fn set_language(&mut self, language: Language) -> Result<(), String> { unsafe { - let version = ffi::ts_language_version(language) as usize; + let version = ffi::ts_language_version(language.0) as usize; if version == ffi::TREE_SITTER_LANGUAGE_VERSION { - ffi::ts_parser_set_language(self.0, language); + ffi::ts_parser_set_language(self.0, language.0); Ok(()) } else { Err(format!( @@ -222,6 +223,24 @@ impl Tree { unsafe impl Send for Tree {} +impl Language { + pub fn node_kind_count(&self) -> usize { + unsafe { ffi::ts_language_symbol_count(self.0) as usize } + } + + pub fn node_kind_for_id(&self, id: u16) -> &'static str { + unsafe { CStr::from_ptr(ffi::ts_language_symbol_name(self.0, id)) }.to_str().unwrap() + } + + pub fn node_kind_is_named(&self, id: u16) -> bool { + unsafe { ffi::ts_language_symbol_type(self.0, id) == ffi::TSSymbolType_TSSymbolTypeRegular } + } +} + +unsafe impl Send for Language {} + +unsafe impl Sync for Language {} + impl fmt::Debug for Tree { fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> { write!(f, "{{Tree {:?}}}", self.root_node()) @@ -527,7 +546,7 @@ mod tests { }, None).unwrap(); let root = tree.root_node(); - assert_eq!(root.to_sexp(), "(source_file (function_item (visibility_modifier) (identifier) (parameters) (block (number_literal))))"); + assert_eq!(root.to_sexp(), "(source_file (function_item (visibility_modifier) (identifier) (parameters) (block (integer_literal))))"); assert_eq!(root.kind(), "source_file"); assert_eq!(root.has_error(), false); assert_eq!(root.child(0).unwrap().kind(), "function_item"); @@ -563,7 +582,7 @@ mod tests { }, None).unwrap(); let root = tree.root_node(); - assert_eq!(root.to_sexp(), "(source_file (function_item (visibility_modifier) (identifier) (parameters) (block (number_literal))))"); + assert_eq!(root.to_sexp(), "(source_file (function_item (visibility_modifier) (identifier) (parameters) (block (integer_literal))))"); assert_eq!(root.kind(), "source_file"); assert_eq!(root.has_error(), false); assert_eq!(root.child(0).unwrap().kind(), "function_item"); From 0c2e1c189b2c4f696a1a1b48ee1ad04c7ef49936 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Mon, 8 Oct 2018 22:32:58 -0700 Subject: [PATCH 036/102] Implement Clone for Language --- src/lib.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/lib.rs b/src/lib.rs index 434d05fb..81b4d09a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -6,6 +6,7 @@ use std::marker::PhantomData; use std::os::raw::{c_char, c_void}; use std::ptr; +#[derive(Clone, Copy)] #[repr(transparent)] pub struct Language (*const ffi::TSLanguage); From 572e8c202e36c98e875a67f2edadbbad341602cf Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Mon, 8 Oct 2018 22:33:11 -0700 Subject: [PATCH 037/102] Implement Send for Parser --- src/lib.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/lib.rs b/src/lib.rs index 81b4d09a..c547974b 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -200,6 +200,8 @@ impl Drop for Parser { } } +unsafe impl Send for Parser {} + impl Tree { pub fn root_node(&self) -> Node { Node::new(unsafe { ffi::ts_tree_root_node(self.0) }).unwrap() From 91d35dec7d4ddf60054efbbc6631489af74c09f0 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Mon, 8 Oct 2018 22:33:43 -0700 Subject: [PATCH 038/102] Add Parser.parser_utf8_io() method --- src/lib.rs | 159 
+++++++++++++++++++++++++++++++++++------------------ 1 file changed, 107 insertions(+), 52 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index c547974b..ff272a29 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -5,6 +5,7 @@ use std::ffi::CStr; use std::marker::PhantomData; use std::os::raw::{c_char, c_void}; use std::ptr; +use std::io::{self, Read, Seek}; #[derive(Clone, Copy)] #[repr(transparent)] @@ -115,37 +116,15 @@ impl Parser { self.parse_utf8(&mut |offset, _| &bytes[(offset as usize)..], old_tree) } - pub fn parse_utf8<'a, T: 'a + FnMut(u32, Point) -> &'a [u8]>( + pub fn parse_utf8<'a, T: FnMut(u32, Point) -> &'a [u8]>( &mut self, input: &mut T, old_tree: Option<&Tree>, ) -> Option { - unsafe extern "C" fn read<'a, T: 'a + FnMut(u32, Point) -> &'a [u8]>( - payload: *mut c_void, - byte_offset: u32, - position: ffi::TSPoint, - bytes_read: *mut u32, - ) -> *const c_char { - let input = (payload as *mut T).as_mut().unwrap(); - let result = (*input)(byte_offset, position.into()); - *bytes_read = result.len() as u32; - return result.as_ptr() as *const c_char; - }; - - let c_input = ffi::TSInput { - payload: input as *mut T as *mut c_void, - read: Some(read::<'a, T>), - encoding: ffi::TSInputEncoding_TSInputEncodingUTF8, - }; - - let c_old_tree = old_tree.map_or(ptr::null_mut(), |t| t.0); - - let c_new_tree = unsafe { ffi::ts_parser_parse(self.0, c_old_tree, c_input) }; - if c_new_tree.is_null() { - None - } else { - Some(Tree(c_new_tree)) - } + self.parse_utf8_ptr(&mut |byte, position| { + let slice = input(byte, position); + (slice.as_ptr(), slice.len()) + }, old_tree) } pub fn parse_utf16<'a, T: 'a + FnMut(u32, Point) -> &'a [u16]>( @@ -153,34 +132,43 @@ impl Parser { input: &mut T, old_tree: Option<&Tree>, ) -> Option { - unsafe extern "C" fn read<'a, T: 'a + FnMut(u32, Point) -> &'a [u16]>( - payload: *mut c_void, - byte_offset: u32, - position: ffi::TSPoint, - bytes_read: *mut u32, - ) -> *const c_char { - let input = (payload as *mut T).as_mut().unwrap(); - let result = (*input)(byte_offset, Point { - row: position.row, - column: position.column / 2, - }); - *bytes_read = result.len() as u32 * 2; - return result.as_ptr() as *const c_char; - }; + self.parse_utf16_ptr(&mut |byte, position| { + let slice = input(byte, position); + (slice.as_ptr(), slice.len()) + }, old_tree) + } - let c_input = ffi::TSInput { - payload: input as *mut T as *mut c_void, - read: Some(read::<'a, T>), - encoding: ffi::TSInputEncoding_TSInputEncodingUTF16, - }; + pub fn parse_utf8_io( + &mut self, + mut input: impl Read + Seek, + old_tree: Option<&Tree>, + ) -> io::Result> { + let mut error = None; + let mut current_offset = 0; + let mut buffer = [0; 10 * 1024]; + let result = self.parse_utf8_ptr(&mut |byte, _| { + if byte as u64 != current_offset { + current_offset = byte as u64; + if let Err(e) = input.seek(io::SeekFrom::Start(current_offset)) { + error = Some(e); + return (ptr::null(), 0) + } + } - let c_old_tree = old_tree.map_or(ptr::null_mut(), |t| t.0); + match input.read(&mut buffer) { + Err(e) => { + error = Some(e); + (ptr::null(), 0) + }, + Ok(length) => { + (buffer.as_ptr(), length) + } + } + }, old_tree); - let c_new_tree = unsafe { ffi::ts_parser_parse(self.0, c_old_tree, c_input) }; - if c_new_tree.is_null() { - None - } else { - Some(Tree(c_new_tree)) + match error { + Some(e) => Err(e), + None => Ok(result) } } @@ -191,6 +179,73 @@ impl Parser { pub fn set_operation_limit(&mut self, limit: usize) { unsafe { ffi::ts_parser_set_operation_limit(self.0, limit) } } + + fn parse_utf8_ptr (*const 
u8, usize)>( + &mut self, + input: &mut T, + old_tree: Option<&Tree>, + ) -> Option { + unsafe extern "C" fn read (*const u8, usize)> ( + payload: *mut c_void, + byte_offset: u32, + position: ffi::TSPoint, + bytes_read: *mut u32, + ) -> *const c_char { + let input = (payload as *mut T).as_mut().unwrap(); + let (ptr, length) = (*input)(byte_offset, position.into()); + *bytes_read = length as u32; + return ptr as *const c_char; + }; + + let c_input = ffi::TSInput { + payload: input as *mut T as *mut c_void, + read: Some(read::), + encoding: ffi::TSInputEncoding_TSInputEncodingUTF8, + }; + + let c_old_tree = old_tree.map_or(ptr::null_mut(), |t| t.0); + let c_new_tree = unsafe { ffi::ts_parser_parse(self.0, c_old_tree, c_input) }; + if c_new_tree.is_null() { + None + } else { + Some(Tree(c_new_tree)) + } + } + + fn parse_utf16_ptr (*const u16, usize)>( + &mut self, + input: &mut T, + old_tree: Option<&Tree>, + ) -> Option { + unsafe extern "C" fn read (*const u16, usize)>( + payload: *mut c_void, + byte_offset: u32, + position: ffi::TSPoint, + bytes_read: *mut u32, + ) -> *const c_char { + let input = (payload as *mut T).as_mut().unwrap(); + let (ptr, length) = (*input)(byte_offset, Point { + row: position.row, + column: position.column / 2, + }); + *bytes_read = length as u32 * 2; + ptr as *const c_char + }; + + let c_input = ffi::TSInput { + payload: input as *mut T as *mut c_void, + read: Some(read::), + encoding: ffi::TSInputEncoding_TSInputEncodingUTF16, + }; + + let c_old_tree = old_tree.map_or(ptr::null_mut(), |t| t.0); + let c_new_tree = unsafe { ffi::ts_parser_parse(self.0, c_old_tree, c_input) }; + if c_new_tree.is_null() { + None + } else { + Some(Tree(c_new_tree)) + } + } } impl Drop for Parser { From a8cbde6dbfbc8ae9b7b37075ad0dffeed3e079b8 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Tue, 9 Oct 2018 08:23:02 -0700 Subject: [PATCH 039/102] Run rustfmt on lib.rs --- src/lib.rs | 336 ++++++++++++++++++++++++++++++++--------------------- 1 file changed, 204 insertions(+), 132 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index ff272a29..4a132a3f 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,15 +1,15 @@ mod ffi; -use std::fmt; use std::ffi::CStr; +use std::fmt; +use std::io::{self, Read, Seek}; use std::marker::PhantomData; use std::os::raw::{c_char, c_void}; use std::ptr; -use std::io::{self, Read, Seek}; #[derive(Clone, Copy)] #[repr(transparent)] -pub struct Language (*const ffi::TSLanguage); +pub struct Language(*const ffi::TSLanguage); #[derive(Debug, PartialEq, Eq)] pub enum LogType { @@ -43,6 +43,26 @@ pub struct Tree(*mut ffi::TSTree); pub struct TreeCursor<'a>(ffi::TSTreeCursor, PhantomData<&'a ()>); +impl Language { + pub fn node_kind_count(&self) -> usize { + unsafe { ffi::ts_language_symbol_count(self.0) as usize } + } + + pub fn node_kind_for_id(&self, id: u16) -> &'static str { + unsafe { CStr::from_ptr(ffi::ts_language_symbol_name(self.0, id)) } + .to_str() + .unwrap() + } + + pub fn node_kind_is_named(&self, id: u16) -> bool { + unsafe { ffi::ts_language_symbol_type(self.0, id) == ffi::TSSymbolType_TSSymbolTypeRegular } + } +} + +unsafe impl Send for Language {} + +unsafe impl Sync for Language {} + impl Parser { pub fn new() -> Parser { unsafe { @@ -105,7 +125,10 @@ impl Parser { log: Some(log), }; } else { - c_logger = ffi::TSLogger { payload: ptr::null_mut(), log: None }; + c_logger = ffi::TSLogger { + payload: ptr::null_mut(), + log: None, + }; } unsafe { ffi::ts_parser_set_logger(self.0, c_logger) }; @@ -121,10 +144,13 @@ impl Parser { input: &mut T, 
old_tree: Option<&Tree>, ) -> Option { - self.parse_utf8_ptr(&mut |byte, position| { - let slice = input(byte, position); - (slice.as_ptr(), slice.len()) - }, old_tree) + self.parse_utf8_ptr( + &mut |byte, position| { + let slice = input(byte, position); + (slice.as_ptr(), slice.len()) + }, + old_tree, + ) } pub fn parse_utf16<'a, T: 'a + FnMut(u32, Point) -> &'a [u16]>( @@ -132,10 +158,13 @@ impl Parser { input: &mut T, old_tree: Option<&Tree>, ) -> Option { - self.parse_utf16_ptr(&mut |byte, position| { - let slice = input(byte, position); - (slice.as_ptr(), slice.len()) - }, old_tree) + self.parse_utf16_ptr( + &mut |byte, position| { + let slice = input(byte, position); + (slice.as_ptr(), slice.len()) + }, + old_tree, + ) } pub fn parse_utf8_io( @@ -146,29 +175,30 @@ impl Parser { let mut error = None; let mut current_offset = 0; let mut buffer = [0; 10 * 1024]; - let result = self.parse_utf8_ptr(&mut |byte, _| { - if byte as u64 != current_offset { - current_offset = byte as u64; - if let Err(e) = input.seek(io::SeekFrom::Start(current_offset)) { - error = Some(e); - return (ptr::null(), 0) + let result = self.parse_utf8_ptr( + &mut |byte, _| { + if byte as u64 != current_offset { + current_offset = byte as u64; + if let Err(e) = input.seek(io::SeekFrom::Start(current_offset)) { + error = Some(e); + return (ptr::null(), 0); + } } - } - match input.read(&mut buffer) { - Err(e) => { - error = Some(e); - (ptr::null(), 0) - }, - Ok(length) => { - (buffer.as_ptr(), length) + match input.read(&mut buffer) { + Err(e) => { + error = Some(e); + (ptr::null(), 0) + } + Ok(length) => (buffer.as_ptr(), length), } - } - }, old_tree); + }, + old_tree, + ); match error { Some(e) => Err(e), - None => Ok(result) + None => Ok(result), } } @@ -185,7 +215,7 @@ impl Parser { input: &mut T, old_tree: Option<&Tree>, ) -> Option { - unsafe extern "C" fn read (*const u8, usize)> ( + unsafe extern "C" fn read (*const u8, usize)>( payload: *mut c_void, byte_offset: u32, position: ffi::TSPoint, @@ -224,10 +254,13 @@ impl Parser { bytes_read: *mut u32, ) -> *const c_char { let input = (payload as *mut T).as_mut().unwrap(); - let (ptr, length) = (*input)(byte_offset, Point { - row: position.row, - column: position.column / 2, - }); + let (ptr, length) = (*input)( + byte_offset, + Point { + row: position.row, + column: position.column / 2, + }, + ); *bytes_read = length as u32 * 2; ptr as *const c_char }; @@ -281,24 +314,6 @@ impl Tree { unsafe impl Send for Tree {} -impl Language { - pub fn node_kind_count(&self) -> usize { - unsafe { ffi::ts_language_symbol_count(self.0) as usize } - } - - pub fn node_kind_for_id(&self, id: u16) -> &'static str { - unsafe { CStr::from_ptr(ffi::ts_language_symbol_name(self.0, id)) }.to_str().unwrap() - } - - pub fn node_kind_is_named(&self, id: u16) -> bool { - unsafe { ffi::ts_language_symbol_type(self.0, id) == ffi::TSSymbolType_TSSymbolTypeRegular } - } -} - -unsafe impl Send for Language {} - -unsafe impl Sync for Language {} - impl fmt::Debug for Tree { fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> { write!(f, "{{Tree {:?}}}", self.root_node()) @@ -331,7 +346,9 @@ impl<'tree> Node<'tree> { } pub fn kind(&self) -> &'static str { - unsafe { CStr::from_ptr(ffi::ts_node_type(self.0)) }.to_str().unwrap() + unsafe { CStr::from_ptr(ffi::ts_node_type(self.0)) } + .to_str() + .unwrap() } pub fn is_named(&self) -> bool { @@ -407,10 +424,15 @@ impl<'tree> Node<'tree> { } pub fn to_sexp(&self) -> String { - extern "C" { fn free(pointer: *mut c_void); } + extern "C" { + fn 
free(pointer: *mut c_void); + } let c_string = unsafe { ffi::ts_node_string(self.0) }; - let result = unsafe { CStr::from_ptr(c_string) }.to_str().unwrap().to_string(); + let result = unsafe { CStr::from_ptr(c_string) } + .to_str() + .unwrap() + .to_string(); unsafe { free(c_string as *mut c_void) }; result } @@ -428,7 +450,13 @@ impl<'a> PartialEq for Node<'a> { impl<'a> fmt::Debug for Node<'a> { fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> { - write!(f, "{{Node {} {} - {}}}", self.kind(), self.start_position(), self.end_position()) + write!( + f, + "{{Node {} {} - {}}}", + self.kind(), + self.start_position(), + self.end_position() + ) } } @@ -500,21 +528,30 @@ impl From for Point { #[cfg(test)] mod tests { - use std::thread; use super::*; + use std::thread; - fn rust() -> Language { unsafe { tree_sitter_rust() } } - extern "C" { fn tree_sitter_rust() -> Language; } + fn rust() -> Language { + unsafe { tree_sitter_rust() } + } + extern "C" { + fn tree_sitter_rust() -> Language; + } #[test] fn test_basic_parsing() { let mut parser = Parser::new(); parser.set_language(rust()).unwrap(); - let tree = parser.parse_str(" + let tree = parser + .parse_str( + " struct Stuff {} fn main() {} - ", None).unwrap(); + ", + None, + ) + .unwrap(); let root_node = tree.root_node(); assert_eq!(root_node.kind(), "source_file"); @@ -538,12 +575,20 @@ mod tests { messages.push((log_type, message.to_string())); }))); - parser.parse_str(" + parser + .parse_str( + " struct Stuff {} fn main() {} - ", None).unwrap(); + ", + None, + ) + .unwrap(); - assert!(messages.contains(&(LogType::Parse, "reduce sym:struct_item, child_count:3".to_string()))); + assert!(messages.contains(&( + LogType::Parse, + "reduce sym:struct_item, child_count:3".to_string() + ))); assert!(messages.contains(&(LogType::Lex, "skip character:' '".to_string()))); } @@ -552,12 +597,17 @@ mod tests { let mut parser = Parser::new(); parser.set_language(rust()).unwrap(); - let tree = parser.parse_str(" + let tree = parser + .parse_str( + " struct Stuff { a: A; b: Option, } - ", None).unwrap(); + ", + None, + ) + .unwrap(); let mut cursor = tree.walk(); assert_eq!(cursor.node().kind(), "source_file"); @@ -583,25 +633,26 @@ mod tests { let mut parser = Parser::new(); parser.set_language(rust()).unwrap(); - let lines = &[ - "pub fn foo() {", - " 1", - "}", - ]; + let lines = &["pub fn foo() {", " 1", "}"]; - let tree = parser.parse_utf8(&mut |_, position| { - let row = position.row as usize; - let column = position.column as usize; - if row < lines.len() { - if column < lines[row].as_bytes().len() { - &lines[row].as_bytes()[column..] - } else { - "\n".as_bytes() - } - } else { - &[] - } - }, None).unwrap(); + let tree = parser + .parse_utf8( + &mut |_, position| { + let row = position.row as usize; + let column = position.column as usize; + if row < lines.len() { + if column < lines[row].as_bytes().len() { + &lines[row].as_bytes()[column..] 
+ } else { + "\n".as_bytes() + } + } else { + &[] + } + }, + None, + ) + .unwrap(); let root = tree.root_node(); assert_eq!(root.to_sexp(), "(source_file (function_item (visibility_modifier) (identifier) (parameters) (block (integer_literal))))"); @@ -619,25 +670,29 @@ mod tests { println!("log: {:?} {}", t, message); }))); - let lines: Vec> = [ - "pub fn foo() {", - " 1", - "}" - ].iter().map(|s| s.encode_utf16().collect()).collect(); + let lines: Vec> = ["pub fn foo() {", " 1", "}"] + .iter() + .map(|s| s.encode_utf16().collect()) + .collect(); - let tree = parser.parse_utf16(&mut |_, position| { - let row = position.row as usize; - let column = position.column as usize; - if row < lines.len() { - if column < lines[row].len() { - &lines[row][column..] - } else { - &[10] - } - } else { - &[] - } - }, None).unwrap(); + let tree = parser + .parse_utf16( + &mut |_, position| { + let row = position.row as usize; + let column = position.column as usize; + if row < lines.len() { + if column < lines[row].len() { + &lines[row][column..] + } else { + &[10] + } + } else { + &[] + } + }, + None, + ) + .unwrap(); let root = tree.root_node(); assert_eq!(root.to_sexp(), "(source_file (function_item (visibility_modifier) (identifier) (parameters) (block (integer_literal))))"); @@ -666,20 +721,28 @@ mod tests { let mut input_bytes = "fn test(a: A, c: C) {}".as_bytes(); let mut input_bytes_read = Vec::new(); - let mut tree = parser.parse_utf8(&mut |offset, _| { - let offset = offset as usize; - if offset < input_bytes.len() { - let result = &input_bytes[offset..offset + 1]; - input_bytes_read.extend(result.iter()); - result - } else { - &[] - } - }, None).unwrap(); + let mut tree = parser + .parse_utf8( + &mut |offset, _| { + let offset = offset as usize; + if offset < input_bytes.len() { + let result = &input_bytes[offset..offset + 1]; + input_bytes_read.extend(result.iter()); + result + } else { + &[] + } + }, + None, + ) + .unwrap(); - let parameters_sexp = tree.root_node() - .named_child(0).unwrap() - .named_child(1).unwrap() + let parameters_sexp = tree + .root_node() + .named_child(0) + .unwrap() + .named_child(1) + .unwrap() .to_sexp(); assert_eq!( parameters_sexp, @@ -688,7 +751,7 @@ mod tests { input_bytes_read.clear(); input_bytes = "fn test(a: A, b: B, c: C) {}".as_bytes(); - tree.edit(&InputEdit{ + tree.edit(&InputEdit { start_byte: 14, old_end_byte: 14, new_end_byte: 20, @@ -697,20 +760,28 @@ mod tests { new_end_position: Point::new(0, 20), }); - let tree = parser.parse_utf8(&mut |offset, _| { - let offset = offset as usize; - if offset < input_bytes.len() { - let result = &input_bytes[offset..offset + 1]; - input_bytes_read.extend(result.iter()); - result - } else { - &[] - } - }, Some(&tree)).unwrap(); + let tree = parser + .parse_utf8( + &mut |offset, _| { + let offset = offset as usize; + if offset < input_bytes.len() { + let result = &input_bytes[offset..offset + 1]; + input_bytes_read.extend(result.iter()); + result + } else { + &[] + } + }, + Some(&tree), + ) + .unwrap(); - let parameters_sexp = tree.root_node() - .named_child(0).unwrap() - .named_child(1).unwrap() + let parameters_sexp = tree + .root_node() + .named_child(0) + .unwrap() + .named_child(1) + .unwrap() .to_sexp(); assert_eq!( parameters_sexp, @@ -738,7 +809,6 @@ mod tests { for thread_id in 1..5 { let mut tree_clone = tree.clone(); parse_threads.push(thread::spawn(move || { - // For each thread, prepend a different number of declarations to the // source code. 
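// `Tree` is `Send` and cheap to clone, so each thread below owns its own
// copy of the tree, applies its own `InputEdit`, and then reparses using
// that edited copy as the `old_tree` argument, independently of the others.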
let mut prepend_line_count = 0; @@ -748,7 +818,7 @@ mod tests { prepended_source += "struct X {}\n\n"; } - tree_clone.edit(&InputEdit{ + tree_clone.edit(&InputEdit { start_byte: 0, old_end_byte: 0, new_end_byte: prepended_source.len() as u32, @@ -761,7 +831,9 @@ mod tests { // Reparse using the old tree as a starting point. let mut parser = Parser::new(); parser.set_language(rust()).unwrap(); - parser.parse_str(&prepended_source, Some(&tree_clone)).unwrap() + parser + .parse_str(&prepended_source, Some(&tree_clone)) + .unwrap() })); } From db360b73fb33d5c03a226b42b1bfa60398645873 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Sat, 13 Oct 2018 14:09:36 -0700 Subject: [PATCH 040/102] Add Tree.walk_with_properties --- Cargo.toml | 5 + src/lib.rs | 294 +++++++++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 292 insertions(+), 7 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 9adbcfd1..485d369e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -20,5 +20,10 @@ include = [ "/vendor/tree-sitter/src/runtime/*", ] +[dependencies] +serde = "1.0" +serde_json = "1.0" +serde_derive = "1.0" + [build-dependencies] cc = "1.0" diff --git a/src/lib.rs b/src/lib.rs index 4a132a3f..19b9a670 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,11 +1,17 @@ mod ffi; +#[macro_use] +extern crate serde_derive; +extern crate serde_json; + +use std::collections::HashMap; use std::ffi::CStr; use std::fmt; use std::io::{self, Read, Seek}; use std::marker::PhantomData; use std::os::raw::{c_char, c_void}; use std::ptr; +use std::str; #[derive(Clone, Copy)] #[repr(transparent)] @@ -19,7 +25,7 @@ pub enum LogType { type Logger<'a> = Box; -#[derive(Clone, Copy, Debug, PartialEq, Eq)] +#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)] pub struct Point { pub row: u32, pub column: u32, @@ -35,6 +41,22 @@ pub struct InputEdit { pub new_end_position: Point, } +struct PropertyTransition { + state_id: u32, + child_index: Option, +} + +struct PropertyState { + transitions: HashMap>, + property_set_id: u32, + default_next_state_id: u32, +} + +pub struct PropertySheet { + states: Vec, + property_sets: Vec>, +} + pub struct Node<'a>(ffi::TSNode, PhantomData<&'a ()>); pub struct Parser(*mut ffi::TSParser); @@ -43,6 +65,13 @@ pub struct Tree(*mut ffi::TSTree); pub struct TreeCursor<'a>(ffi::TSTreeCursor, PhantomData<&'a ()>); +pub struct TreePropertyCursor<'a> { + cursor: TreeCursor<'a>, + state_stack: Vec, + child_index_stack: Vec, + property_sheet: &'a PropertySheet, +} + impl Language { pub fn node_kind_count(&self) -> usize { unsafe { ffi::ts_language_symbol_count(self.0) as usize } @@ -310,6 +339,13 @@ impl Tree { pub fn walk(&self) -> TreeCursor { self.root_node().walk() } + + pub fn walk_with_properties<'a>( + &'a self, + property_sheet: &'a PropertySheet, + ) -> TreePropertyCursor<'a> { + TreePropertyCursor::new(self, property_sheet) + } } unsafe impl Send for Tree {} @@ -437,6 +473,14 @@ impl<'tree> Node<'tree> { result } + pub fn utf8_text<'a>(&self, source: &'a str) -> Result<&'a str, str::Utf8Error> { + str::from_utf8(&source.as_bytes()[self.start_byte() as usize..self.end_byte() as usize]) + } + + pub fn utf16_text<'a>(&self, source: &'a [u16]) -> &'a [u16] { + &source[self.start_byte() as usize..self.end_byte() as usize] + } + pub fn walk(&self) -> TreeCursor<'tree> { TreeCursor(unsafe { ffi::ts_tree_cursor_new(self.0) }, PhantomData) } @@ -461,7 +505,7 @@ impl<'a> fmt::Debug for Node<'a> { } impl<'a> TreeCursor<'a> { - pub fn node(&'a self) -> Node<'a> { + pub fn node(&self) -> Node<'a> { Node( 
unsafe { ffi::ts_tree_cursor_current_node(&self.0) }, PhantomData, @@ -496,6 +540,87 @@ impl<'a> Drop for TreeCursor<'a> { } } +impl<'a> TreePropertyCursor<'a> { + fn new(tree: &'a Tree, property_sheet: &'a PropertySheet) -> Self { + Self { + cursor: tree.root_node().walk(), + child_index_stack: vec![0], + state_stack: vec![0], + property_sheet, + } + } + + pub fn node(&self) -> Node<'a> { + self.cursor.node() + } + + pub fn node_properties(&self) -> &'a HashMap { + &self.property_sheet.property_sets[self.current_state().property_set_id as usize] + } + + pub fn goto_first_child(&mut self) -> bool { + if self.cursor.goto_first_child() { + let child_index = 0; + let next_state_id = { + let state = &self.current_state(); + let kind_id = self.cursor.node().kind_id(); + self.next_state(state, kind_id, child_index) + }; + self.state_stack.push(next_state_id); + self.child_index_stack.push(child_index); + true + } else { + false + } + } + + pub fn goto_next_sibling(&mut self) -> bool { + if self.cursor.goto_next_sibling() { + let child_index = self.child_index_stack.pop().unwrap() + 1; + self.state_stack.pop(); + let next_state_id = { + let state = &self.current_state(); + let kind_id = self.cursor.node().kind_id(); + self.next_state(state, kind_id, child_index) + }; + self.state_stack.push(next_state_id); + self.child_index_stack.push(child_index); + true + } else { + false + } + } + + pub fn goto_parent(&mut self) -> bool { + if self.cursor.goto_parent() { + self.state_stack.pop(); + self.child_index_stack.pop(); + true + } else { + false + } + } + + fn next_state(&self, state: &PropertyState, node_kind_id: u16, node_child_index: u32) -> u32 { + state + .transitions + .get(&node_kind_id) + .and_then(|transitions| { + for transition in transitions.iter() { + if transition.child_index == Some(node_child_index) || transition.child_index == None { + return Some(transition.state_id); + } + } + None + }) + .unwrap_or(state.default_next_state_id) + } + + fn current_state(&self) -> &PropertyState { + &self.property_sheet.states[*self.state_stack.last().unwrap() as usize] + } +} + impl Point { pub fn new(row: u32, column: u32) -> Self { Point { row, column } @@ -526,6 +651,64 @@ impl From for Point { } } +impl PropertySheet { + pub fn new(language: Language, json: &str) -> Result { + #[derive(Deserialize, Debug)] + struct PropertyTransitionJSON { + #[serde(rename = "type")] + kind: String, + named: bool, + index: Option, + state_id: u32, + } + + #[derive(Deserialize, Debug)] + struct PropertyStateJSON { + transitions: Vec, + property_set_id: u32, + default_next_state_id: u32, + } + + #[derive(Deserialize, Debug)] + struct PropertySheetJSON { + states: Vec, + property_sets: Vec>, + } + + let input: PropertySheetJSON = serde_json::from_str(json)?; + Ok(PropertySheet { + property_sets: input.property_sets, + states: input + .states + .iter() + .map(|state| { + let mut transitions = HashMap::new(); + let node_kind_count = language.node_kind_count(); + for transition in state.transitions.iter() { + for i in 0..node_kind_count { + let i = i as u16; + if language.node_kind_is_named(i) == transition.named + && transition.kind == language.node_kind_for_id(i) + { + let entry = transitions.entry(i).or_insert(Vec::new()); + entry.push(PropertyTransition { + child_index: transition.index, + state_id: transition.state_id, + }); + } + } + } + PropertyState { + transitions, + default_next_state_id: state.default_next_state_id, + property_set_id: state.property_set_id, + } + }) + .collect(), + }) + } +} + #[cfg(test)] 
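// A property sheet, as constructed above, is a table-driven state machine:
// each state maps node kinds (optionally constrained by child index) to a
// next state, and each state selects one of the `property_sets`. A minimal
// sketch of a sheet that tags every `identifier` node with
// `{"highlight": "name"}` (the property names here are illustrative, not
// taken from any real sheet):
//
//     let sheet = PropertySheet::new(language, r#"{
//         "states": [
//             {"transitions": [{"type": "identifier", "named": true, "state_id": 1}],
//              "default_next_state_id": 0, "property_set_id": 0},
//             {"transitions": [], "default_next_state_id": 0, "property_set_id": 1}
//         ],
//         "property_sets": [{}, {"highlight": "name"}]
//     }"#).unwrap();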
mod tests { use super::*; @@ -600,11 +783,11 @@ mod tests { let tree = parser .parse_str( " - struct Stuff { - a: A; - b: Option, - } - ", + struct Stuff { + a: A; + b: Option, + } + ", None, ) .unwrap(); @@ -628,6 +811,103 @@ mod tests { assert_eq!(cursor.node().is_named(), true); } + #[test] + fn test_tree_property_matching() { + let mut parser = Parser::new(); + parser.set_language(rust()).unwrap(); + let tree = parser.parse_str("fn f1() { f2(); }", None).unwrap(); + + let property_sheet = PropertySheet::new( + rust(), + r##" + { + "states": [ + { + "transitions": [ + {"type": "call_expression", "named": true, "state_id": 1}, + {"type": "function_item", "named": true, "state_id": 2} + ], + "default_next_state_id": 0, + "property_set_id": 0 + }, + { + "transitions": [ + {"type": "identifier", "named": true, "state_id": 3} + ], + "default_next_state_id": 0, + "property_set_id": 0 + }, + { + "transitions": [ + {"type": "identifier", "named": true, "state_id": 4} + ], + "default_next_state_id": 0, + "property_set_id": 0 + }, + { + "transitions": [], + "default_next_state_id": 0, + "property_set_id": 1 + }, + { + "transitions": [], + "default_next_state_id": 0, + "property_set_id": 2 + } + ], + "property_sets": [ + {}, + {"reference": "function"}, + {"define": "function"} + ] + } + "##, + ) + .unwrap(); + + let mut cursor = tree.walk_with_properties(&property_sheet); + assert_eq!(cursor.node().kind(), "source_file"); + assert_eq!(*cursor.node_properties(), HashMap::new()); + + assert!(cursor.goto_first_child()); + assert_eq!(cursor.node().kind(), "function_item"); + assert_eq!(*cursor.node_properties(), HashMap::new()); + + assert!(cursor.goto_first_child()); + assert_eq!(cursor.node().kind(), "fn"); + assert_eq!(*cursor.node_properties(), HashMap::new()); + assert!(!cursor.goto_first_child()); + + assert!(cursor.goto_next_sibling()); + assert_eq!(cursor.node().kind(), "identifier"); + assert_eq!(cursor.node_properties()["define"], "function"); + assert!(!cursor.goto_first_child()); + + assert!(cursor.goto_next_sibling()); + assert_eq!(cursor.node().kind(), "parameters"); + assert_eq!(*cursor.node_properties(), HashMap::new()); + + assert!(cursor.goto_first_child()); + assert_eq!(cursor.node().kind(), "("); + assert!(cursor.goto_next_sibling()); + assert_eq!(cursor.node().kind(), ")"); + assert_eq!(*cursor.node_properties(), HashMap::new()); + + assert!(cursor.goto_parent()); + assert!(cursor.goto_next_sibling()); + assert_eq!(cursor.node().kind(), "block"); + assert_eq!(*cursor.node_properties(), HashMap::new()); + + assert!(cursor.goto_first_child()); + assert!(cursor.goto_next_sibling()); + assert_eq!(cursor.node().kind(), "call_expression"); + assert_eq!(*cursor.node_properties(), HashMap::new()); + + assert!(cursor.goto_first_child()); + assert_eq!(cursor.node().kind(), "identifier"); + assert_eq!(cursor.node_properties()["reference"], "function"); + } + #[test] fn test_custom_utf8_input() { let mut parser = Parser::new(); From afe722358236dfb1389471a1037531b7c5422d0f Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Tue, 20 Nov 2018 15:56:16 -0800 Subject: [PATCH 041/102] Upgrade Tree-sitter, use single source file in build script --- build.rs | 25 ++----------------------- vendor/tree-sitter | 2 +- 2 files changed, 3 insertions(+), 24 deletions(-) diff --git a/build.rs b/build.rs index 7d9ee83e..add3bec7 100644 --- a/build.rs +++ b/build.rs @@ -13,29 +13,8 @@ fn main() { .flag_if_supported("-Wno-unused-parameter") .include(root_path.join("src")) .include(root_path.join("include")) - 
.include(root_path.join("externals").join("utf8proc")); - - let source_filenames = [ - "get_changed_ranges.c", - "language.c", - "lexer.c", - "node.c", - "parser.c", - "stack.c", - "subtree.c", - "tree_cursor.c", - "tree.c", - "utf16.c", - ]; - - config.files(source_filenames.iter().map(|source_filename| { - root_path - .join("src") - .join("runtime") - .join(&source_filename) - })); - - config.file(root_path.join("externals").join("utf8proc").join("utf8proc.c")); + .include(root_path.join("externals").join("utf8proc")) + .file(root_path.join("src").join("runtime").join("runtime.c")); if env::var("RUST_TREE_SITTER_TEST").is_ok() { let parser_dir: PathBuf = ["fixtures", "tree-sitter-rust", "src"].iter().collect(); diff --git a/vendor/tree-sitter b/vendor/tree-sitter index 16376c43..6b8e5bd1 160000 --- a/vendor/tree-sitter +++ b/vendor/tree-sitter @@ -1 +1 @@ -Subproject commit 16376c43f5cc75bbc5297e6d5716bd94d55ccc05 +Subproject commit 6b8e5bd1f96ab63f17873ef9f7a72569a421810f From 8fdcf84ff3396e4c8fc8ee4cdc9e37ebe9f126cf Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Tue, 20 Nov 2018 16:00:45 -0800 Subject: [PATCH 042/102] 0.3.2 --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 485d369e..2c92acc5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "tree-sitter" description = "Rust bindings to the Tree-sitter parsing library" -version = "0.3.1" +version = "0.3.2" authors = ["Max Brunsfeld "] build = "build.rs" license = "MIT" From a741265ead8dc67de991046d295e2f316681cce0 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Wed, 28 Nov 2018 17:26:16 -0800 Subject: [PATCH 043/102] Replace all u32s in the API with usizes Co-Authored-By: Timothy Clem --- src/lib.rs | 146 +++++++++++++++++++++++++++-------------------------- 1 file changed, 75 insertions(+), 71 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 19b9a670..fa3d970e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -27,29 +27,36 @@ type Logger<'a> = Box; #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)] pub struct Point { - pub row: u32, - pub column: u32, + pub row: usize, + pub column: usize, } #[derive(Clone, Copy, Debug, PartialEq, Eq)] pub struct InputEdit { - pub start_byte: u32, - pub old_end_byte: u32, - pub new_end_byte: u32, + pub start_byte: usize, + pub old_end_byte: usize, + pub new_end_byte: usize, pub start_position: Point, pub old_end_position: Point, pub new_end_position: Point, } struct PropertyTransition { - state_id: u32, - child_index: Option, + state_id: usize, + child_index: Option, + text_regex: Option, } struct PropertyState { transitions: HashMap>, - property_set_id: u32, - default_next_state_id: u32, + property_set_id: usize, + default_next_state_id: usize, +} + +#[derive(Debug)] +pub enum PropertySheetError { + InvalidJSON(serde_json::Error), + InvalidRegex(regex::Error) } pub struct PropertySheet { @@ -67,9 +74,10 @@ pub struct TreeCursor<'a>(ffi::TSTreeCursor, PhantomData<&'a ()>); pub struct TreePropertyCursor<'a> { cursor: TreeCursor<'a>, - state_stack: Vec, - child_index_stack: Vec, + state_stack: Vec, + child_index_stack: Vec, property_sheet: &'a PropertySheet, + source: &'a str, } impl Language { @@ -165,10 +173,10 @@ impl Parser { pub fn parse_str(&mut self, input: &str, old_tree: Option<&Tree>) -> Option { let bytes = input.as_bytes(); - self.parse_utf8(&mut |offset, _| &bytes[(offset as usize)..], old_tree) + self.parse_utf8(&mut |offset, _| &bytes[offset..], old_tree) } - pub fn parse_utf8<'a, T: 
FnMut(u32, Point) -> &'a [u8]>( + pub fn parse_utf8<'a, T: FnMut(usize, Point) -> &'a [u8]>( &mut self, input: &mut T, old_tree: Option<&Tree>, @@ -182,7 +190,7 @@ impl Parser { ) } - pub fn parse_utf16<'a, T: 'a + FnMut(u32, Point) -> &'a [u16]>( + pub fn parse_utf16<'a, T: 'a + FnMut(usize, Point) -> &'a [u16]>( &mut self, input: &mut T, old_tree: Option<&Tree>, @@ -239,19 +247,19 @@ impl Parser { unsafe { ffi::ts_parser_set_operation_limit(self.0, limit) } } - fn parse_utf8_ptr (*const u8, usize)>( + fn parse_utf8_ptr (*const u8, usize)>( &mut self, input: &mut T, old_tree: Option<&Tree>, ) -> Option { - unsafe extern "C" fn read (*const u8, usize)>( + unsafe extern "C" fn read (*const u8, usize)>( payload: *mut c_void, byte_offset: u32, position: ffi::TSPoint, bytes_read: *mut u32, ) -> *const c_char { let input = (payload as *mut T).as_mut().unwrap(); - let (ptr, length) = (*input)(byte_offset, position.into()); + let (ptr, length) = (*input)(byte_offset as usize, position.into()); *bytes_read = length as u32; return ptr as *const c_char; }; @@ -271,12 +279,12 @@ impl Parser { } } - fn parse_utf16_ptr (*const u16, usize)>( + fn parse_utf16_ptr (*const u16, usize)>( &mut self, input: &mut T, old_tree: Option<&Tree>, ) -> Option { - unsafe extern "C" fn read (*const u16, usize)>( + unsafe extern "C" fn read (*const u16, usize)>( payload: *mut c_void, byte_offset: u32, position: ffi::TSPoint, @@ -284,10 +292,10 @@ impl Parser { ) -> *const c_char { let input = (payload as *mut T).as_mut().unwrap(); let (ptr, length) = (*input)( - byte_offset, + byte_offset as usize, Point { - row: position.row, - column: position.column / 2, + row: position.row as usize, + column: position.column as usize / 2, }, ); *bytes_read = length as u32 * 2; @@ -326,9 +334,9 @@ impl Tree { pub fn edit(&mut self, edit: &InputEdit) { let edit = ffi::TSInputEdit { - start_byte: edit.start_byte, - old_end_byte: edit.old_end_byte, - new_end_byte: edit.new_end_byte, + start_byte: edit.start_byte as u32, + old_end_byte: edit.old_end_byte as u32, + new_end_byte: edit.new_end_byte as u32, start_point: edit.start_position.into(), old_end_point: edit.old_end_position.into(), new_end_point: edit.new_end_position.into(), @@ -399,44 +407,38 @@ impl<'tree> Node<'tree> { unsafe { ffi::ts_node_has_error(self.0) } } - pub fn start_byte(&self) -> u32 { - unsafe { ffi::ts_node_start_byte(self.0) } + pub fn start_byte(&self) -> usize { + unsafe { ffi::ts_node_start_byte(self.0) as usize } } - pub fn end_byte(&self) -> u32 { - unsafe { ffi::ts_node_end_byte(self.0) } + pub fn end_byte(&self) -> usize { + unsafe { ffi::ts_node_end_byte(self.0) as usize } } pub fn start_position(&self) -> Point { let result = unsafe { ffi::ts_node_start_point(self.0) }; - Point { - row: result.row, - column: result.column, - } + result.into() } pub fn end_position(&self) -> Point { let result = unsafe { ffi::ts_node_end_point(self.0) }; - Point { - row: result.row, - column: result.column, - } + result.into() } - pub fn child(&self, i: u32) -> Option { - Self::new(unsafe { ffi::ts_node_child(self.0, i) }) + pub fn child(&self, i: usize) -> Option { + Self::new(unsafe { ffi::ts_node_child(self.0, i as u32) }) } - pub fn child_count(&self) -> u32 { - unsafe { ffi::ts_node_child_count(self.0) } + pub fn child_count(&self) -> usize { + unsafe { ffi::ts_node_child_count(self.0) as usize } } - pub fn named_child<'a>(&'a self, i: u32) -> Option { - Self::new(unsafe { ffi::ts_node_named_child(self.0, i) }) + pub fn named_child<'a>(&'a self, i: usize) -> Option { 
+ Self::new(unsafe { ffi::ts_node_named_child(self.0, i as u32) }) } - pub fn named_child_count(&self) -> u32 { - unsafe { ffi::ts_node_named_child_count(self.0) } + pub fn named_child_count(&self) -> usize { + unsafe { ffi::ts_node_named_child_count(self.0) as usize } } pub fn parent(&self) -> Option { @@ -474,11 +476,11 @@ impl<'tree> Node<'tree> { } pub fn utf8_text<'a>(&self, source: &'a str) -> Result<&'a str, str::Utf8Error> { - str::from_utf8(&source.as_bytes()[self.start_byte() as usize..self.end_byte() as usize]) + str::from_utf8(&source.as_bytes()[self.start_byte()..self.end_byte()]) } pub fn utf16_text<'a>(&self, source: &'a [u16]) -> &'a [u16] { - &source[self.start_byte() as usize..self.end_byte() as usize] + &source[self.start_byte()..self.end_byte()] } pub fn walk(&self) -> TreeCursor<'tree> { @@ -524,12 +526,12 @@ impl<'a> TreeCursor<'a> { return unsafe { ffi::ts_tree_cursor_goto_next_sibling(&mut self.0) }; } - pub fn goto_first_child_for_index(&mut self, index: u32) -> Option { - let result = unsafe { ffi::ts_tree_cursor_goto_first_child_for_byte(&mut self.0, index) }; + pub fn goto_first_child_for_index(&mut self, index: usize) -> Option { + let result = unsafe { ffi::ts_tree_cursor_goto_first_child_for_byte(&mut self.0, index as u32) }; if result < 0 { None } else { - Some(result as u32) + Some(result as usize) } } } @@ -541,12 +543,13 @@ impl<'a> Drop for TreeCursor<'a> { } impl<'a> TreePropertyCursor<'a> { - fn new(tree: &'a Tree, property_sheet: &'a PropertySheet) -> Self { + fn new(tree: &'a Tree, property_sheet: &'a PropertySheet, source: &'a str) -> Self { Self { cursor: tree.root_node().walk(), child_index_stack: vec![0], state_stack: vec![0], property_sheet, + source, } } @@ -555,7 +558,7 @@ impl<'a> TreePropertyCursor<'a> { } pub fn node_properties(&self) -> &'a HashMap { - &self.property_sheet.property_sets[self.current_state().property_set_id as usize] + &self.property_sheet.property_sets[self.current_state().property_set_id] } pub fn goto_first_child(&mut self) -> bool { @@ -601,7 +604,7 @@ impl<'a> TreePropertyCursor<'a> { } } - fn next_state(&self, state: &PropertyState, node_kind_id: u16, node_child_index: u32) -> u32 { + fn next_state(&self, state: &PropertyState, node_kind_id: u16, node_child_index: usize) -> usize { state .transitions .get(&node_kind_id) @@ -617,12 +620,12 @@ impl<'a> TreePropertyCursor<'a> { } fn current_state(&self) -> &PropertyState { - &self.property_sheet.states[*self.state_stack.last().unwrap() as usize] + &self.property_sheet.states[*self.state_stack.last().unwrap()] } } impl Point { - pub fn new(row: u32, column: u32) -> Self { + pub fn new(row: usize, column: usize) -> Self { Point { row, column } } } @@ -636,8 +639,8 @@ impl fmt::Display for Point { impl Into for Point { fn into(self) -> ffi::TSPoint { ffi::TSPoint { - row: self.row, - column: self.column, + row: self.row as u32, + column: self.column as u32, } } } @@ -645,28 +648,29 @@ impl Into for Point { impl From for Point { fn from(point: ffi::TSPoint) -> Self { Self { - row: point.row, - column: point.column, + row: point.row as usize, + column: point.column as usize, } } } impl PropertySheet { - pub fn new(language: Language, json: &str) -> Result { + pub fn new(language: Language, json: &str) -> Result { #[derive(Deserialize, Debug)] struct PropertyTransitionJSON { #[serde(rename = "type")] kind: String, named: bool, - index: Option, - state_id: u32, + index: Option, + text: Option, + state_id: usize, } #[derive(Deserialize, Debug)] struct PropertyStateJSON { 
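// One state in the deserialized sheet: the transitions that can fire on a
// child node's kind, the property set in force while in this state, and
// the fallback state used when no transition matches.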
transitions: Vec, - property_set_id: u32, - default_next_state_id: u32, + property_set_id: usize, + default_next_state_id: usize, } #[derive(Deserialize, Debug)] @@ -918,8 +922,8 @@ mod tests { let tree = parser .parse_utf8( &mut |_, position| { - let row = position.row as usize; - let column = position.column as usize; + let row = position.row; + let column = position.column; if row < lines.len() { if column < lines[row].as_bytes().len() { &lines[row].as_bytes()[column..] @@ -958,8 +962,8 @@ mod tests { let tree = parser .parse_utf16( &mut |_, position| { - let row = position.row as usize; - let column = position.column as usize; + let row = position.row; + let column = position.column; if row < lines.len() { if column < lines[row].len() { &lines[row][column..] @@ -1004,7 +1008,7 @@ mod tests { let mut tree = parser .parse_utf8( &mut |offset, _| { - let offset = offset as usize; + let offset = offset; if offset < input_bytes.len() { let result = &input_bytes[offset..offset + 1]; input_bytes_read.extend(result.iter()); @@ -1043,7 +1047,7 @@ mod tests { let tree = parser .parse_utf8( &mut |offset, _| { - let offset = offset as usize; + let offset = offset; if offset < input_bytes.len() { let result = &input_bytes[offset..offset + 1]; input_bytes_read.extend(result.iter()); @@ -1101,7 +1105,7 @@ mod tests { tree_clone.edit(&InputEdit { start_byte: 0, old_end_byte: 0, - new_end_byte: prepended_source.len() as u32, + new_end_byte: prepended_source.len(), start_position: Point::new(0, 0), old_end_position: Point::new(0, 0), new_end_position: Point::new(prepend_line_count, 0), From d5b53cde7dded6ebbc0d78ed131e9a10f2a62c5b Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Wed, 28 Nov 2018 17:26:48 -0800 Subject: [PATCH 044/102] Respect the `:text` pseudo-class in TreePropertyCursor Co-Authored-By: Timothy Clem --- Cargo.toml | 1 + src/lib.rs | 87 ++++++++++++++++++++++++++++++++++-------------------- 2 files changed, 56 insertions(+), 32 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 2c92acc5..0ffee772 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,6 +21,7 @@ include = [ ] [dependencies] +regex = "1" serde = "1.0" serde_json = "1.0" serde_derive = "1.0" diff --git a/src/lib.rs b/src/lib.rs index fa3d970e..a76ed115 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -3,7 +3,9 @@ mod ffi; #[macro_use] extern crate serde_derive; extern crate serde_json; +extern crate regex; +use regex::Regex; use std::collections::HashMap; use std::ffi::CStr; use std::fmt; @@ -351,8 +353,9 @@ impl Tree { pub fn walk_with_properties<'a>( &'a self, property_sheet: &'a PropertySheet, + source: &'a str, ) -> TreePropertyCursor<'a> { - TreePropertyCursor::new(self, property_sheet) + TreePropertyCursor::new(self, property_sheet, source) } } @@ -610,9 +613,23 @@ impl<'a> TreePropertyCursor<'a> { .get(&node_kind_id) .and_then(|transitions| { for transition in transitions.iter() { - if transition.child_index == Some(node_child_index) || transition.child_index == None { - return Some(transition.state_id); + if let Some(text_regex) = transition.text_regex.as_ref() { + let node = self.cursor.node(); + let text = &self.source.as_bytes()[node.start_byte()..node.end_byte()]; + if let Ok(text) = str::from_utf8(text) { + if !text_regex.is_match(text) { + continue; + } + } } + + if let Some(child_index) = transition.child_index { + if child_index != node_child_index { + continue; + } + } + + return Some(transition.state_id); } None }) @@ -679,36 +696,42 @@ impl PropertySheet { property_sets: Vec>, } - let input: PropertySheetJSON 
= serde_json::from_str(json)?; + let input: PropertySheetJSON = serde_json::from_str(json) + .map_err(|e| PropertySheetError::InvalidJSON(e))?; + let mut states = Vec::new(); + + for state in input.states.iter() { + let mut transitions = HashMap::new(); + let node_kind_count = language.node_kind_count(); + for transition in state.transitions.iter() { + for i in 0..node_kind_count { + let i = i as u16; + if language.node_kind_is_named(i) == transition.named + && transition.kind == language.node_kind_for_id(i) + { + let entry = transitions.entry(i).or_insert(Vec::new()); + let text_regex = if let Some(text) = transition.text.as_ref() { + Some(Regex::new(&text).map_err(|e| PropertySheetError::InvalidRegex(e))?) + } else { + None + }; + entry.push(PropertyTransition { + child_index: transition.index, + state_id: transition.state_id, + text_regex + }); + } + } + } + states.push(PropertyState { + transitions, + default_next_state_id: state.default_next_state_id, + property_set_id: state.property_set_id, + }); + } Ok(PropertySheet { property_sets: input.property_sets, - states: input - .states - .iter() - .map(|state| { - let mut transitions = HashMap::new(); - let node_kind_count = language.node_kind_count(); - for transition in state.transitions.iter() { - for i in 0..node_kind_count { - let i = i as u16; - if language.node_kind_is_named(i) == transition.named - && transition.kind == language.node_kind_for_id(i) - { - let entry = transitions.entry(i).or_insert(Vec::new()); - entry.push(PropertyTransition { - child_index: transition.index, - state_id: transition.state_id, - }); - } - } - } - PropertyState { - transitions, - default_next_state_id: state.default_next_state_id, - property_set_id: state.property_set_id, - } - }) - .collect(), + states, }) } } @@ -869,7 +892,7 @@ mod tests { ) .unwrap(); - let mut cursor = tree.walk_with_properties(&property_sheet); + let mut cursor = tree.walk_with_properties(&property_sheet, ""); assert_eq!(cursor.node().kind(), "source_file"); assert_eq!(*cursor.node_properties(), HashMap::new()); From c9ce314695a5bad674aed9b267b9c430411bb731 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 29 Nov 2018 16:21:01 -0800 Subject: [PATCH 045/102] Make PropertySheet generic on the properties type Co-Authored-By: Timothy Clem --- src/lib.rs | 38 +++++++++++++++++++++----------------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index a76ed115..68715879 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -4,7 +4,9 @@ mod ffi; extern crate serde_derive; extern crate serde_json; extern crate regex; +extern crate serde; +use serde::Deserialize; use regex::Regex; use std::collections::HashMap; use std::ffi::CStr; @@ -61,9 +63,10 @@ pub enum PropertySheetError { InvalidRegex(regex::Error) } -pub struct PropertySheet { +pub struct PropertySheet<'d, P: Deserialize<'d>> { states: Vec, - property_sets: Vec>, + property_sets: Vec
<P>
, + _phantom: &'d std::marker::PhantomData<()>, } pub struct Node<'a>(ffi::TSNode, PhantomData<&'a ()>); @@ -74,11 +77,11 @@ pub struct Tree(*mut ffi::TSTree); pub struct TreeCursor<'a>(ffi::TSTreeCursor, PhantomData<&'a ()>); -pub struct TreePropertyCursor<'a> { +pub struct TreePropertyCursor<'a, 'd, P: Deserialize<'d>> { cursor: TreeCursor<'a>, state_stack: Vec, child_index_stack: Vec, - property_sheet: &'a PropertySheet, + property_sheet: &'a PropertySheet<'d, P>, source: &'a str, } @@ -350,11 +353,11 @@ impl Tree { self.root_node().walk() } - pub fn walk_with_properties<'a>( + pub fn walk_with_properties<'a, 'd, P: Deserialize<'d>>( &'a self, - property_sheet: &'a PropertySheet, + property_sheet: &'a PropertySheet<'d, P>, source: &'a str, - ) -> TreePropertyCursor<'a> { + ) -> TreePropertyCursor<'a, 'd, P> { TreePropertyCursor::new(self, property_sheet, source) } } @@ -545,8 +548,8 @@ impl<'a> Drop for TreeCursor<'a> { } } -impl<'a> TreePropertyCursor<'a> { - fn new(tree: &'a Tree, property_sheet: &'a PropertySheet, source: &'a str) -> Self { +impl<'a, 'd, P: Deserialize<'d>> TreePropertyCursor<'a, 'd, P> { + fn new(tree: &'a Tree, property_sheet: &'a PropertySheet<'d, P>, source: &'a str) -> Self { Self { cursor: tree.root_node().walk(), child_index_stack: vec![0], @@ -560,7 +563,7 @@ impl<'a> TreePropertyCursor<'a> { self.cursor.node() } - pub fn node_properties(&self) -> &'a HashMap { + pub fn node_properties(&self) -> &'a P { &self.property_sheet.property_sets[self.current_state().property_set_id] } @@ -671,8 +674,8 @@ impl From for Point { } } -impl PropertySheet { - pub fn new(language: Language, json: &str) -> Result { +impl<'a, P: Deserialize<'a>> PropertySheet<'a, P> { + pub fn new(language: Language, json: &'a str) -> Result { #[derive(Deserialize, Debug)] struct PropertyTransitionJSON { #[serde(rename = "type")] @@ -691,12 +694,12 @@ impl PropertySheet { } #[derive(Deserialize, Debug)] - struct PropertySheetJSON { + struct PropertySheetJSON
<P>
{ states: Vec<PropertyStateJSON>, property_sets: Vec
<P>
, } - let input: PropertySheetJSON = serde_json::from_str(json) + let input: PropertySheetJSON
<P>
= serde_json::from_str(json) .map_err(|e| PropertySheetError::InvalidJSON(e))?; let mut states = Vec::new(); @@ -729,9 +732,10 @@ impl PropertySheet { property_set_id: state.property_set_id, }); } - Ok(PropertySheet { + Ok(Self { property_sets: input.property_sets, states, + _phantom: &std::marker::PhantomData, }) } } @@ -844,7 +848,7 @@ mod tests { parser.set_language(rust()).unwrap(); let tree = parser.parse_str("fn f1() { f2(); }", None).unwrap(); - let property_sheet = PropertySheet::new( + let property_sheet = PropertySheet::>::new( rust(), r##" { From 11610e1df66214a1bf58bff2565b52d270bf0d5b Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 29 Nov 2018 20:51:50 -0800 Subject: [PATCH 046/102] Eliminate deserializer lifetime on PropertySheet The PropertySheet is intended to be a long-lived object, whereas its JSON source is not needed once the property sheet is instantiated. Co-Authored-By: Timothy Clem --- src/lib.rs | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 68715879..681af7fb 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -6,7 +6,7 @@ extern crate serde_json; extern crate regex; extern crate serde; -use serde::Deserialize; +use serde::de::DeserializeOwned; use regex::Regex; use std::collections::HashMap; use std::ffi::CStr; @@ -63,10 +63,9 @@ pub enum PropertySheetError { InvalidRegex(regex::Error) } -pub struct PropertySheet<'d, P: Deserialize<'d>> { +pub struct PropertySheet> { states: Vec, property_sets: Vec
<P>
, - _phantom: &'d std::marker::PhantomData<()>, } pub struct Node<'a>(ffi::TSNode, PhantomData<&'a ()>); @@ -77,11 +76,11 @@ pub struct Tree(*mut ffi::TSTree); pub struct TreeCursor<'a>(ffi::TSTreeCursor, PhantomData<&'a ()>); -pub struct TreePropertyCursor<'a, 'd, P: Deserialize<'d>> { +pub struct TreePropertyCursor<'a, P: 'a + DeserializeOwned> { cursor: TreeCursor<'a>, state_stack: Vec, child_index_stack: Vec, - property_sheet: &'a PropertySheet<'d, P>, + property_sheet: &'a PropertySheet
<P>
, source: &'a str, } @@ -353,11 +352,11 @@ impl Tree { self.root_node().walk() } - pub fn walk_with_properties<'a, 'd, P: Deserialize<'d>>( + pub fn walk_with_properties<'a, P: DeserializeOwned>( &'a self, - property_sheet: &'a PropertySheet<'d, P>, + property_sheet: &'a PropertySheet
<P>
, source: &'a str, - ) -> TreePropertyCursor<'a, 'd, P> { + ) -> TreePropertyCursor<'a, P> { TreePropertyCursor::new(self, property_sheet, source) } } @@ -548,8 +547,8 @@ impl<'a> Drop for TreeCursor<'a> { } } -impl<'a, 'd, P: Deserialize<'d>> TreePropertyCursor<'a, 'd, P> { - fn new(tree: &'a Tree, property_sheet: &'a PropertySheet<'d, P>, source: &'a str) -> Self { +impl<'a, P: DeserializeOwned> TreePropertyCursor<'a, P> { + fn new(tree: &'a Tree, property_sheet: &'a PropertySheet
<P>
, source: &'a str) -> Self { Self { cursor: tree.root_node().walk(), child_index_stack: vec![0], @@ -674,8 +673,8 @@ impl From<ffi::TSPoint> for Point { } } -impl<'a, P: Deserialize<'a>> PropertySheet<'a, P> { - pub fn new(language: Language, json: &'a str) -> Result<Self, PropertySheetError> { +impl<P: DeserializeOwned> PropertySheet<P>
{ + pub fn new(language: Language, json: &str) -> Result<Self, PropertySheetError> { #[derive(Deserialize, Debug)] struct PropertyTransitionJSON { #[serde(rename = "type")] @@ -735,7 +734,6 @@ impl<'a, P: Deserialize<'a>> PropertySheet<'a, P> { Ok(Self { property_sets: input.property_sets, states, - _phantom: &std::marker::PhantomData, }) } } From fbb220f19302ff44f172b6a48362ece7f62167ee Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Mon, 3 Dec 2018 10:43:58 -0800 Subject: [PATCH 047/102] Add test for regexes in property sheets --- src/lib.rs | 112 +++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 100 insertions(+), 12 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 681af7fb..724a08bd 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -844,9 +844,18 @@ mod tests { fn test_tree_property_matching() { let mut parser = Parser::new(); parser.set_language(rust()).unwrap(); - let tree = parser.parse_str("fn f1() { f2(); }", None).unwrap(); + let source_code = "fn f1() { f2(); }"; + let tree = parser.parse_str(source_code, None).unwrap(); - let property_sheet = PropertySheet::<HashMap<String, String>>::new( + #[derive(Debug, Deserialize, PartialEq, Eq)] + struct Properties { + reference: Option<String>, + define: Option<String>, + } + + let empty_properties = Properties { reference: None, define: None }; + + let property_sheet = PropertySheet::<Properties>::new( rust(), r##" { @@ -894,47 +903,126 @@ ) .unwrap(); - let mut cursor = tree.walk_with_properties(&property_sheet, ""); + let mut cursor = tree.walk_with_properties(&property_sheet, source_code); assert_eq!(cursor.node().kind(), "source_file"); - assert_eq!(*cursor.node_properties(), HashMap::new()); + assert_eq!(*cursor.node_properties(), empty_properties); assert!(cursor.goto_first_child()); assert_eq!(cursor.node().kind(), "function_item"); - assert_eq!(*cursor.node_properties(), HashMap::new()); + assert_eq!(*cursor.node_properties(), empty_properties); assert!(cursor.goto_first_child()); assert_eq!(cursor.node().kind(), "fn"); - assert_eq!(*cursor.node_properties(), HashMap::new()); + assert_eq!(*cursor.node_properties(), empty_properties); assert!(!cursor.goto_first_child()); assert!(cursor.goto_next_sibling()); assert_eq!(cursor.node().kind(), "identifier"); - assert_eq!(cursor.node_properties()["define"], "function"); + assert_eq!(cursor.node_properties().define, Some("function".to_owned())); assert!(!cursor.goto_first_child()); assert!(cursor.goto_next_sibling()); assert_eq!(cursor.node().kind(), "parameters"); - assert_eq!(*cursor.node_properties(), HashMap::new()); + assert_eq!(*cursor.node_properties(), empty_properties); assert!(cursor.goto_first_child()); assert_eq!(cursor.node().kind(), "("); assert!(cursor.goto_next_sibling()); assert_eq!(cursor.node().kind(), ")"); - assert_eq!(*cursor.node_properties(), HashMap::new()); + assert_eq!(*cursor.node_properties(), empty_properties); assert!(cursor.goto_parent()); assert!(cursor.goto_next_sibling()); assert_eq!(cursor.node().kind(), "block"); - assert_eq!(*cursor.node_properties(), HashMap::new()); + assert_eq!(*cursor.node_properties(), empty_properties); assert!(cursor.goto_first_child()); assert!(cursor.goto_next_sibling()); assert_eq!(cursor.node().kind(), "call_expression"); - assert_eq!(*cursor.node_properties(), HashMap::new()); + assert_eq!(*cursor.node_properties(), empty_properties); assert!(cursor.goto_first_child()); assert_eq!(cursor.node().kind(), "identifier"); - assert_eq!(cursor.node_properties()["reference"], "function"); + assert_eq!(cursor.node_properties().reference, Some("function".to_owned())); +
} + + #[test] + fn test_tree_property_matching_with_regexes() { + let mut parser = Parser::new(); + parser.set_language(rust()).unwrap(); + let source_code = "fn f1() { None(a()) }"; + let tree = parser.parse_str(source_code, None).unwrap(); + + #[derive(Debug, Deserialize, PartialEq, Eq)] + struct Properties { + scope: Option<String>, + } + + let empty_properties = Properties { scope: None }; + + let property_sheet = PropertySheet::<Properties>::new( + rust(), + r##" + { + "states": [ + { + "id": 0, + "transitions": [ + {"type": "call_expression", "named": true, "state_id": 1} + ], + "default_next_state_id": 0, + "property_set_id": 0 + }, + { + "id": 1, + "transitions": [ + {"type": "identifier", "named": true, "text": "^[A-Z]", "state_id": 2}, + {"type": "identifier", "named": true, "state_id": 3} + ], + "default_next_state_id": 0, + "property_set_id": 0 + }, + { + "transitions": [], + "default_next_state_id": 0, + "property_set_id": 1 + }, + { + "transitions": [], + "default_next_state_id": 0, + "property_set_id": 2 + } + ], + "property_sets": [ + {}, + {"scope": "constructor"}, + {"scope": "function"} + ] + } + "##, + ) + .unwrap(); + + let mut cursor = tree.walk_with_properties(&property_sheet, source_code); + assert_eq!(cursor.node().kind(), "source_file"); + assert_eq!(*cursor.node_properties(), empty_properties); + + cursor.goto_first_child(); + assert!(cursor.goto_first_child()); + assert!(cursor.goto_next_sibling()); + assert!(cursor.goto_next_sibling()); + assert!(cursor.goto_next_sibling()); + assert_eq!(cursor.node().kind(), "block"); + assert_eq!(*cursor.node_properties(), empty_properties); + + assert!(cursor.goto_first_child()); + assert!(cursor.goto_next_sibling()); + assert_eq!(cursor.node().kind(), "call_expression"); + assert_eq!(*cursor.node_properties(), empty_properties); + + assert!(cursor.goto_first_child()); + assert_eq!(cursor.node().kind(), "identifier"); + assert_eq!(cursor.node_properties().scope, Some("constructor".to_owned())); } #[test] From beb60194d12b62cf70bc6b9e8652258ae07a9b44 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Mon, 3 Dec 2018 14:42:18 -0800 Subject: [PATCH 048/102] 0.3.3 --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 0ffee772..f61b1583 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "tree-sitter" description = "Rust bindings to the Tree-sitter parsing library" -version = "0.3.2" +version = "0.3.3" authors = ["Max Brunsfeld "] build = "build.rs" license = "MIT" From a4c4b85a16ce0ecbb550d6de47801d2e387e629b Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Wed, 5 Dec 2018 12:50:12 -0800 Subject: [PATCH 049/102] Initial commit --- .gitignore | 2 + Cargo.lock | 812 ++++++++++++++++++ Cargo.toml | 17 + src/build_tables/item.rs | 22 + src/build_tables/mod.rs | 34 + src/error.rs | 13 + src/generate.rs | 26 + src/grammars.rs | 98 +++ src/main.rs | 35 + src/parse_grammar.rs | 153 ++++ src/prepare_grammar/expand_repeats.rs | 220 +++++ src/prepare_grammar/extract_simple_aliases.rs | 10 + src/prepare_grammar/extract_tokens.rs | 7 + src/prepare_grammar/flatten_grammar.rs | 7 + src/prepare_grammar/intern_symbols.rs | 237 +++++ src/prepare_grammar/mod.rs | 40 + src/prepare_grammar/normalize_rules.rs | 5 + src/render/mod.rs | 16 + src/rules.rs | 205 +++++ src/tables.rs | 77 ++ 20 files changed, 2036 insertions(+) create mode 100644 .gitignore create mode 100644 Cargo.lock create mode 100644 Cargo.toml create mode 100644 src/build_tables/item.rs create mode 100644
src/build_tables/mod.rs create mode 100644 src/error.rs create mode 100644 src/generate.rs create mode 100644 src/grammars.rs create mode 100644 src/main.rs create mode 100644 src/parse_grammar.rs create mode 100644 src/prepare_grammar/expand_repeats.rs create mode 100644 src/prepare_grammar/extract_simple_aliases.rs create mode 100644 src/prepare_grammar/extract_tokens.rs create mode 100644 src/prepare_grammar/flatten_grammar.rs create mode 100644 src/prepare_grammar/intern_symbols.rs create mode 100644 src/prepare_grammar/mod.rs create mode 100644 src/prepare_grammar/normalize_rules.rs create mode 100644 src/render/mod.rs create mode 100644 src/rules.rs create mode 100644 src/tables.rs diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..53eaa219 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +/target +**/*.rs.bk diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 00000000..20908681 --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,812 @@ +[[package]] +name = "aho-corasick" +version = "0.6.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "memchr 2.1.1 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "ansi_term" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "argon2rs" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "blake2-rfc 0.2.18 (registry+https://github.com/rust-lang/crates.io-index)", + "scoped_threadpool 0.1.9 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "arrayvec" +version = "0.4.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "nodrop 0.1.13 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "atty" +version = "0.2.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "libc 0.2.44 (registry+https://github.com/rust-lang/crates.io-index)", + "termion 1.5.1 (registry+https://github.com/rust-lang/crates.io-index)", + "winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "backtrace" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "backtrace-sys 0.1.24 (registry+https://github.com/rust-lang/crates.io-index)", + "cfg-if 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.44 (registry+https://github.com/rust-lang/crates.io-index)", + "rustc-demangle 0.1.9 (registry+https://github.com/rust-lang/crates.io-index)", + "winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "backtrace-sys" +version = "0.1.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "cc 1.0.25 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.44 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "bitflags" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "bitvec" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "blake2-rfc" +version = "0.2.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "arrayvec 0.4.8 
(registry+https://github.com/rust-lang/crates.io-index)", + "constant_time_eq 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "cc" +version = "1.0.25" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "cfg-if" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "clap" +version = "2.32.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "ansi_term 0.11.0 (registry+https://github.com/rust-lang/crates.io-index)", + "atty 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)", + "bitflags 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)", + "strsim 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)", + "textwrap 0.10.0 (registry+https://github.com/rust-lang/crates.io-index)", + "unicode-width 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)", + "vec_map 0.8.1 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "cloudabi" +version = "0.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "bitflags 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "constant_time_eq" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "crossbeam-channel" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "crossbeam-epoch 0.6.1 (registry+https://github.com/rust-lang/crates.io-index)", + "crossbeam-utils 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)", + "parking_lot 0.6.4 (registry+https://github.com/rust-lang/crates.io-index)", + "rand 0.5.5 (registry+https://github.com/rust-lang/crates.io-index)", + "smallvec 0.6.7 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "arrayvec 0.4.8 (registry+https://github.com/rust-lang/crates.io-index)", + "cfg-if 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)", + "crossbeam-utils 0.6.1 (registry+https://github.com/rust-lang/crates.io-index)", + "lazy_static 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)", + "memoffset 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)", + "scopeguard 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "crossbeam-utils" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "crossbeam-utils" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "cfg-if 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "dirs" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "libc 0.2.44 (registry+https://github.com/rust-lang/crates.io-index)", + "redox_users 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)", + "winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "failure" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "backtrace 0.3.9 (registry+https://github.com/rust-lang/crates.io-index)", + "failure_derive 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)", +] + 
+[[package]] +name = "failure_derive" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "proc-macro2 0.4.24 (registry+https://github.com/rust-lang/crates.io-index)", + "quote 0.6.10 (registry+https://github.com/rust-lang/crates.io-index)", + "syn 0.15.22 (registry+https://github.com/rust-lang/crates.io-index)", + "synstructure 0.10.1 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "fnv" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "fuchsia-zircon" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "bitflags 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)", + "fuchsia-zircon-sys 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "fuchsia-zircon-sys" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "globset" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "aho-corasick 0.6.9 (registry+https://github.com/rust-lang/crates.io-index)", + "fnv 1.0.6 (registry+https://github.com/rust-lang/crates.io-index)", + "log 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)", + "memchr 2.1.1 (registry+https://github.com/rust-lang/crates.io-index)", + "regex 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "ignore" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "crossbeam-channel 0.2.6 (registry+https://github.com/rust-lang/crates.io-index)", + "globset 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)", + "lazy_static 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)", + "log 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)", + "memchr 2.1.1 (registry+https://github.com/rust-lang/crates.io-index)", + "regex 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)", + "same-file 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)", + "thread_local 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", + "walkdir 2.2.7 (registry+https://github.com/rust-lang/crates.io-index)", + "winapi-util 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "itoa" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "lazy_static" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "libc" +version = "0.2.44" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "libloading" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "cc 1.0.25 (registry+https://github.com/rust-lang/crates.io-index)", + "winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "libsqlite3-sys" +version = "0.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "pkg-config 0.3.14 (registry+https://github.com/rust-lang/crates.io-index)", + "vcpkg 0.2.6 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "linked-hash-map" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "lock_api" +version = "0.1.5" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "owning_ref 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)", + "scopeguard 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "log" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "cfg-if 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "lru-cache" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "linked-hash-map 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "memchr" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "cfg-if 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.44 (registry+https://github.com/rust-lang/crates.io-index)", + "version_check 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "memoffset" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "nodrop" +version = "0.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "owning_ref" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "stable_deref_trait 1.1.1 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "parking_lot" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "lock_api 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)", + "parking_lot_core 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "parking_lot_core" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "libc 0.2.44 (registry+https://github.com/rust-lang/crates.io-index)", + "rand 0.5.5 (registry+https://github.com/rust-lang/crates.io-index)", + "rustc_version 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)", + "smallvec 0.6.7 (registry+https://github.com/rust-lang/crates.io-index)", + "winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "pkg-config" +version = "0.3.14" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "proc-macro2" +version = "0.4.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "quote" +version = "0.6.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "proc-macro2 0.4.24 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "rand" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "fuchsia-zircon 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.44 (registry+https://github.com/rust-lang/crates.io-index)", + "winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "rand" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "cloudabi 0.0.3 (registry+https://github.com/rust-lang/crates.io-index)", + "fuchsia-zircon 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 
0.2.44 (registry+https://github.com/rust-lang/crates.io-index)", + "rand_core 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)", + "winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "rand_core" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "rand_core 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "rand_core" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "redox_syscall" +version = "0.1.43" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "redox_termios" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "redox_syscall 0.1.43 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "redox_users" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "argon2rs 0.2.5 (registry+https://github.com/rust-lang/crates.io-index)", + "failure 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)", + "rand 0.4.3 (registry+https://github.com/rust-lang/crates.io-index)", + "redox_syscall 0.1.43 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "regex" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "aho-corasick 0.6.9 (registry+https://github.com/rust-lang/crates.io-index)", + "memchr 2.1.1 (registry+https://github.com/rust-lang/crates.io-index)", + "regex-syntax 0.6.4 (registry+https://github.com/rust-lang/crates.io-index)", + "thread_local 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", + "utf8-ranges 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "regex-syntax" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "ucd-util 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "rusqlite" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "bitflags 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)", + "libsqlite3-sys 0.9.3 (registry+https://github.com/rust-lang/crates.io-index)", + "lru-cache 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)", + "time 0.1.40 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "rust-tree-sitter-cli" +version = "0.1.0" +dependencies = [ + "bitvec 0.8.0 (registry+https://github.com/rust-lang/crates.io-index)", + "clap 2.32.0 (registry+https://github.com/rust-lang/crates.io-index)", + "dirs 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)", + "ignore 0.4.4 (registry+https://github.com/rust-lang/crates.io-index)", + "libloading 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)", + "rusqlite 0.14.0 (registry+https://github.com/rust-lang/crates.io-index)", + "serde 1.0.80 (registry+https://github.com/rust-lang/crates.io-index)", + "serde_derive 1.0.80 (registry+https://github.com/rust-lang/crates.io-index)", + "serde_json 1.0.33 (registry+https://github.com/rust-lang/crates.io-index)", + "tree-sitter 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "rustc-demangle" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = 
"rustc_version" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "semver 0.9.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "ryu" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "same-file" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "winapi-util 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "scoped_threadpool" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "scopeguard" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "semver" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "semver-parser 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "semver-parser" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "serde" +version = "1.0.80" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "serde_derive" +version = "1.0.80" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "proc-macro2 0.4.24 (registry+https://github.com/rust-lang/crates.io-index)", + "quote 0.6.10 (registry+https://github.com/rust-lang/crates.io-index)", + "syn 0.15.22 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "serde_json" +version = "1.0.33" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "itoa 0.4.3 (registry+https://github.com/rust-lang/crates.io-index)", + "ryu 0.2.7 (registry+https://github.com/rust-lang/crates.io-index)", + "serde 1.0.80 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "smallvec" +version = "0.6.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "unreachable 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "stable_deref_trait" +version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "strsim" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "syn" +version = "0.15.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "proc-macro2 0.4.24 (registry+https://github.com/rust-lang/crates.io-index)", + "quote 0.6.10 (registry+https://github.com/rust-lang/crates.io-index)", + "unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "synstructure" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "proc-macro2 0.4.24 (registry+https://github.com/rust-lang/crates.io-index)", + "quote 0.6.10 (registry+https://github.com/rust-lang/crates.io-index)", + "syn 0.15.22 (registry+https://github.com/rust-lang/crates.io-index)", + "unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "termion" +version = "1.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "libc 0.2.44 (registry+https://github.com/rust-lang/crates.io-index)", + "redox_syscall 0.1.43 (registry+https://github.com/rust-lang/crates.io-index)", + 
"redox_termios 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "textwrap" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "unicode-width 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "thread_local" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "lazy_static 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "time" +version = "0.1.40" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "libc 0.2.44 (registry+https://github.com/rust-lang/crates.io-index)", + "redox_syscall 0.1.43 (registry+https://github.com/rust-lang/crates.io-index)", + "winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "tree-sitter" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "cc 1.0.25 (registry+https://github.com/rust-lang/crates.io-index)", + "regex 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)", + "serde 1.0.80 (registry+https://github.com/rust-lang/crates.io-index)", + "serde_derive 1.0.80 (registry+https://github.com/rust-lang/crates.io-index)", + "serde_json 1.0.33 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "ucd-util" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "unicode-width" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "unicode-xid" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "unreachable" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "void 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "utf8-ranges" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "vcpkg" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "vec_map" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "version_check" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "void" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "walkdir" +version = "2.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "same-file 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)", + "winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", + "winapi-util 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "winapi" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "winapi-i686-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)", + "winapi-x86_64-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "winapi-util" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+dependencies = [ + "winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[metadata] +"checksum aho-corasick 0.6.9 (registry+https://github.com/rust-lang/crates.io-index)" = "1e9a933f4e58658d7b12defcf96dc5c720f20832deebe3e0a19efd3b6aaeeb9e" +"checksum ansi_term 0.11.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ee49baf6cb617b853aa8d93bf420db2383fab46d314482ca2803b40d5fde979b" +"checksum argon2rs 0.2.5 (registry+https://github.com/rust-lang/crates.io-index)" = "3f67b0b6a86dae6e67ff4ca2b6201396074996379fba2b92ff649126f37cb392" +"checksum arrayvec 0.4.8 (registry+https://github.com/rust-lang/crates.io-index)" = "f405cc4c21cd8b784f6c8fc2adf9bc00f59558f0049b5ec21517f875963040cc" +"checksum atty 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)" = "9a7d5b8723950951411ee34d271d99dddcc2035a16ab25310ea2c8cfd4369652" +"checksum backtrace 0.3.9 (registry+https://github.com/rust-lang/crates.io-index)" = "89a47830402e9981c5c41223151efcced65a0510c13097c769cede7efb34782a" +"checksum backtrace-sys 0.1.24 (registry+https://github.com/rust-lang/crates.io-index)" = "c66d56ac8dabd07f6aacdaf633f4b8262f5b3601a810a0dcddffd5c22c69daa0" +"checksum bitflags 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)" = "228047a76f468627ca71776ecdebd732a3423081fcf5125585bcd7c49886ce12" +"checksum bitvec 0.8.0 (registry+https://github.com/rust-lang/crates.io-index)" = "e37e2176261200377c7cde4c6de020394174df556c356f965e4bc239f5ce1c5a" +"checksum blake2-rfc 0.2.18 (registry+https://github.com/rust-lang/crates.io-index)" = "5d6d530bdd2d52966a6d03b7a964add7ae1a288d25214066fd4b600f0f796400" +"checksum cc 1.0.25 (registry+https://github.com/rust-lang/crates.io-index)" = "f159dfd43363c4d08055a07703eb7a3406b0dac4d0584d96965a3262db3c9d16" +"checksum cfg-if 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "082bb9b28e00d3c9d39cc03e64ce4cea0f1bb9b3fde493f0cbc008472d22bdf4" +"checksum clap 2.32.0 (registry+https://github.com/rust-lang/crates.io-index)" = "b957d88f4b6a63b9d70d5f454ac8011819c6efa7727858f458ab71c756ce2d3e" +"checksum cloudabi 0.0.3 (registry+https://github.com/rust-lang/crates.io-index)" = "ddfc5b9aa5d4507acaf872de71051dfd0e309860e88966e1051e462a077aac4f" +"checksum constant_time_eq 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)" = "8ff012e225ce166d4422e0e78419d901719760f62ae2b7969ca6b564d1b54a9e" +"checksum crossbeam-channel 0.2.6 (registry+https://github.com/rust-lang/crates.io-index)" = "7b85741761b7f160bc5e7e0c14986ef685b7f8bf9b7ad081c60c604bb4649827" +"checksum crossbeam-epoch 0.6.1 (registry+https://github.com/rust-lang/crates.io-index)" = "2449aaa4ec7ef96e5fb24db16024b935df718e9ae1cec0a1e68feeca2efca7b8" +"checksum crossbeam-utils 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)" = "677d453a17e8bd2b913fa38e8b9cf04bcdbb5be790aa294f2389661d72036015" +"checksum crossbeam-utils 0.6.1 (registry+https://github.com/rust-lang/crates.io-index)" = "c55913cc2799171a550e307918c0a360e8c16004820291bf3b638969b4a01816" +"checksum dirs 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)" = "88972de891f6118092b643d85a0b28e0678e0f948d7f879aa32f2d5aafe97d2a" +"checksum failure 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)" = "6dd377bcc1b1b7ce911967e3ec24fa19c3224394ec05b54aa7b083d498341ac7" +"checksum failure_derive 0.1.3 
(registry+https://github.com/rust-lang/crates.io-index)" = "64c2d913fe8ed3b6c6518eedf4538255b989945c14c2a7d5cbff62a5e2120596" +"checksum fnv 1.0.6 (registry+https://github.com/rust-lang/crates.io-index)" = "2fad85553e09a6f881f739c29f0b00b0f01357c743266d478b68951ce23285f3" +"checksum fuchsia-zircon 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "2e9763c69ebaae630ba35f74888db465e49e259ba1bc0eda7d06f4a067615d82" +"checksum fuchsia-zircon-sys 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "3dcaa9ae7725d12cdb85b3ad99a434db70b468c09ded17e012d86b5c1010f7a7" +"checksum globset 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)" = "4743617a7464bbda3c8aec8558ff2f9429047e025771037df561d383337ff865" +"checksum ignore 0.4.4 (registry+https://github.com/rust-lang/crates.io-index)" = "36ecfc5ad80f0b1226df948c562e2cddd446096be3f644c95106400eae8a5e01" +"checksum itoa 0.4.3 (registry+https://github.com/rust-lang/crates.io-index)" = "1306f3464951f30e30d12373d31c79fbd52d236e5e896fd92f96ec7babbbe60b" +"checksum lazy_static 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "a374c89b9db55895453a74c1e38861d9deec0b01b405a82516e9d5de4820dea1" +"checksum libc 0.2.44 (registry+https://github.com/rust-lang/crates.io-index)" = "10923947f84a519a45c8fefb7dd1b3e8c08747993381adee176d7a82b4195311" +"checksum libloading 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)" = "9c3ad660d7cb8c5822cd83d10897b0f1f1526792737a179e73896152f85b88c2" +"checksum libsqlite3-sys 0.9.3 (registry+https://github.com/rust-lang/crates.io-index)" = "d3711dfd91a1081d2458ad2d06ea30a8755256e74038be2ad927d94e1c955ca8" +"checksum linked-hash-map 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)" = "7860ec297f7008ff7a1e3382d7f7e1dcd69efc94751a2284bafc3d013c2aa939" +"checksum lock_api 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "62ebf1391f6acad60e5c8b43706dde4582df75c06698ab44511d15016bc2442c" +"checksum log 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)" = "c84ec4b527950aa83a329754b01dbe3f58361d1c5efacd1f6d68c494d08a17c6" +"checksum lru-cache 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "4d06ff7ff06f729ce5f4e227876cb88d10bc59cd4ae1e09fbb2bde15c850dc21" +"checksum memchr 2.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "0a3eb002f0535929f1199681417029ebea04aadc0c7a4224b46be99c7f5d6a16" +"checksum memoffset 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "0f9dc261e2b62d7a622bf416ea3c5245cdd5d9a7fcc428c0d06804dfce1775b3" +"checksum nodrop 0.1.13 (registry+https://github.com/rust-lang/crates.io-index)" = "2f9667ddcc6cc8a43afc9b7917599d7216aa09c463919ea32c59ed6cac8bc945" +"checksum owning_ref 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "49a4b8ea2179e6a2e27411d3bca09ca6dd630821cf6894c6c7c8467a8ee7ef13" +"checksum parking_lot 0.6.4 (registry+https://github.com/rust-lang/crates.io-index)" = "f0802bff09003b291ba756dc7e79313e51cc31667e94afbe847def490424cde5" +"checksum parking_lot_core 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)" = "ad7f7e6ebdc79edff6fdcb87a55b620174f7a989e3eb31b65231f4af57f00b8c" +"checksum pkg-config 0.3.14 (registry+https://github.com/rust-lang/crates.io-index)" = "676e8eb2b1b4c9043511a9b7bea0915320d7e502b0a079fb03f9635a5252b18c" +"checksum proc-macro2 0.4.24 (registry+https://github.com/rust-lang/crates.io-index)" = "77619697826f31a02ae974457af0b29b723e5619e113e9397b8b82c6bd253f09" +"checksum quote 0.6.10 
(registry+https://github.com/rust-lang/crates.io-index)" = "53fa22a1994bd0f9372d7a816207d8a2677ad0325b073f5c5332760f0fb62b5c" +"checksum rand 0.4.3 (registry+https://github.com/rust-lang/crates.io-index)" = "8356f47b32624fef5b3301c1be97e5944ecdd595409cc5da11d05f211db6cfbd" +"checksum rand 0.5.5 (registry+https://github.com/rust-lang/crates.io-index)" = "e464cd887e869cddcae8792a4ee31d23c7edd516700695608f5b98c67ee0131c" +"checksum rand_core 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "1961a422c4d189dfb50ffa9320bf1f2a9bd54ecb92792fb9477f99a1045f3372" +"checksum rand_core 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "0905b6b7079ec73b314d4c748701f6931eb79fd97c668caa3f1899b22b32c6db" +"checksum redox_syscall 0.1.43 (registry+https://github.com/rust-lang/crates.io-index)" = "679da7508e9a6390aeaf7fbd02a800fdc64b73fe2204dd2c8ae66d22d9d5ad5d" +"checksum redox_termios 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "7e891cfe48e9100a70a3b6eb652fef28920c117d366339687bd5576160db0f76" +"checksum redox_users 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "214a97e49be64fd2c86f568dd0cb2c757d2cc53de95b273b6ad0a1c908482f26" +"checksum regex 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "37e7cbbd370869ce2e8dff25c7018702d10b21a20ef7135316f8daecd6c25b7f" +"checksum regex-syntax 0.6.4 (registry+https://github.com/rust-lang/crates.io-index)" = "4e47a2ed29da7a9e1960e1639e7a982e6edc6d49be308a3b02daf511504a16d1" +"checksum rusqlite 0.14.0 (registry+https://github.com/rust-lang/crates.io-index)" = "c9d9118f1ce84d8d0b67f9779936432fb42bb620cef2122409d786892cce9a3c" +"checksum rustc-demangle 0.1.9 (registry+https://github.com/rust-lang/crates.io-index)" = "bcfe5b13211b4d78e5c2cadfebd7769197d95c639c35a50057eb4c05de811395" +"checksum rustc_version 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)" = "138e3e0acb6c9fb258b19b67cb8abd63c00679d2851805ea151465464fe9030a" +"checksum ryu 0.2.7 (registry+https://github.com/rust-lang/crates.io-index)" = "eb9e9b8cde282a9fe6a42dd4681319bfb63f121b8a8ee9439c6f4107e58a46f7" +"checksum same-file 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)" = "8f20c4be53a8a1ff4c1f1b2bd14570d2f634628709752f0702ecdd2b3f9a5267" +"checksum scoped_threadpool 0.1.9 (registry+https://github.com/rust-lang/crates.io-index)" = "1d51f5df5af43ab3f1360b429fa5e0152ac5ce8c0bd6485cae490332e96846a8" +"checksum scopeguard 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "94258f53601af11e6a49f722422f6e3425c52b06245a5cf9bc09908b174f5e27" +"checksum semver 0.9.0 (registry+https://github.com/rust-lang/crates.io-index)" = "1d7eb9ef2c18661902cc47e535f9bc51b78acd254da71d375c2f6720d9a40403" +"checksum semver-parser 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)" = "388a1df253eca08550bef6c72392cfe7c30914bf41df5269b68cbd6ff8f570a3" +"checksum serde 1.0.80 (registry+https://github.com/rust-lang/crates.io-index)" = "15c141fc7027dd265a47c090bf864cf62b42c4d228bbcf4e51a0c9e2b0d3f7ef" +"checksum serde_derive 1.0.80 (registry+https://github.com/rust-lang/crates.io-index)" = "225de307c6302bec3898c51ca302fc94a7a1697ef0845fcee6448f33c032249c" +"checksum serde_json 1.0.33 (registry+https://github.com/rust-lang/crates.io-index)" = "c37ccd6be3ed1fdf419ee848f7c758eb31b054d7cd3ae3600e3bae0adf569811" +"checksum smallvec 0.6.7 (registry+https://github.com/rust-lang/crates.io-index)" = "b73ea3738b47563803ef814925e69be00799a8c07420be8b996f8e98fb2336db" +"checksum stable_deref_trait 1.1.1 
(registry+https://github.com/rust-lang/crates.io-index)" = "dba1a27d3efae4351c8051072d619e3ade2820635c3958d826bfea39d59b54c8" +"checksum strsim 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)" = "bb4f380125926a99e52bc279241539c018323fab05ad6368b56f93d9369ff550" +"checksum syn 0.15.22 (registry+https://github.com/rust-lang/crates.io-index)" = "ae8b29eb5210bc5cf63ed6149cbf9adfc82ac0be023d8735c176ee74a2db4da7" +"checksum synstructure 0.10.1 (registry+https://github.com/rust-lang/crates.io-index)" = "73687139bf99285483c96ac0add482c3776528beac1d97d444f6e91f203a2015" +"checksum termion 1.5.1 (registry+https://github.com/rust-lang/crates.io-index)" = "689a3bdfaab439fd92bc87df5c4c78417d3cbe537487274e9b0b2dce76e92096" +"checksum textwrap 0.10.0 (registry+https://github.com/rust-lang/crates.io-index)" = "307686869c93e71f94da64286f9a9524c0f308a9e1c87a583de8e9c9039ad3f6" +"checksum thread_local 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)" = "c6b53e329000edc2b34dbe8545fd20e55a333362d0a321909685a19bd28c3f1b" +"checksum time 0.1.40 (registry+https://github.com/rust-lang/crates.io-index)" = "d825be0eb33fda1a7e68012d51e9c7f451dc1a69391e7fdc197060bb8c56667b" +"checksum tree-sitter 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "311adf1e004ac816285a1196c93ea36364857c3adc37ffc9fd5ed0d70545391a" +"checksum ucd-util 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)" = "535c204ee4d8434478593480b8f86ab45ec9aae0e83c568ca81abf0fd0e88f86" +"checksum unicode-width 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "882386231c45df4700b275c7ff55b6f3698780a650026380e72dabe76fa46526" +"checksum unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "fc72304796d0818e357ead4e000d19c9c174ab23dc11093ac919054d20a6a7fc" +"checksum unreachable 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "382810877fe448991dfc7f0dd6e3ae5d58088fd0ea5e35189655f84e6814fa56" +"checksum utf8-ranges 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)" = "796f7e48bef87609f7ade7e06495a87d5cd06c7866e6a5cbfceffc558a243737" +"checksum vcpkg 0.2.6 (registry+https://github.com/rust-lang/crates.io-index)" = "def296d3eb3b12371b2c7d0e83bfe1403e4db2d7a0bba324a12b21c4ee13143d" +"checksum vec_map 0.8.1 (registry+https://github.com/rust-lang/crates.io-index)" = "05c78687fb1a80548ae3250346c3db86a80a7cdd77bda190189f2d0a0987c81a" +"checksum version_check 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "914b1a6776c4c929a602fafd8bc742e06365d4bcbe48c30f9cca5824f70dc9dd" +"checksum void 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)" = "6a02e4885ed3bc0f2de90ea6dd45ebcbb66dacffe03547fadbb0eeae2770887d" +"checksum walkdir 2.2.7 (registry+https://github.com/rust-lang/crates.io-index)" = "9d9d7ed3431229a144296213105a390676cc49c9b6a72bd19f3176c98e129fa1" +"checksum winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)" = "92c1eb33641e276cfa214a0522acad57be5c56b10cb348b3c5117db75f3ac4b0" +"checksum winapi-i686-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" +"checksum winapi-util 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "afc5508759c5bf4285e61feb862b6083c8480aec864fa17a81fdec6f69b461ab" +"checksum winapi-x86_64-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" diff --git a/Cargo.toml 
b/Cargo.toml new file mode 100644 index 00000000..965cc81e --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,17 @@ +[package] +name = "rust-tree-sitter-cli" +version = "0.1.0" +authors = ["Max Brunsfeld "] +edition = "2018" + +[dependencies] +bitvec = "0.8" +clap = "2.32" +dirs = "1.0.2" +ignore = "0.4.4" +libloading = "0.5" +rusqlite = "0.14.0" +serde = "1.0" +serde_derive = "1.0" +serde_json = "1.0" +tree-sitter = "0.3.1" diff --git a/src/build_tables/item.rs b/src/build_tables/item.rs new file mode 100644 index 00000000..c8d30997 --- /dev/null +++ b/src/build_tables/item.rs @@ -0,0 +1,22 @@ +use crate::grammars::Production; +use std::collections::HashMap; +use bitvec::BitVec; + +#[derive(Debug, PartialEq, Eq)] +pub(super) struct LookaheadSet { + terminal_bits: BitVec, + external_bits: BitVec, + eof: bool, +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] +pub(super) struct ParseItem { + variable_index: u32, + production_index: u32, + step_index: u32, +} + +#[derive(Debug, PartialEq, Eq)] +pub(super) struct ParseItemSet { + entries: HashMap<ParseItem, LookaheadSet> +} diff --git a/src/build_tables/mod.rs b/src/build_tables/mod.rs new file mode 100644 index 00000000..c5dd5b54 --- /dev/null +++ b/src/build_tables/mod.rs @@ -0,0 +1,34 @@ +mod item; + +use std::collections::{HashMap, VecDeque}; +use crate::grammars::{SyntaxGrammar, LexicalGrammar}; +use crate::tables::{ParseTable, LexTable, ParseStateId}; +use crate::rules::{AliasMap, Symbol}; +use crate::error::Result; +use self::item::ParseItemSet; + +type SymbolSequence = Vec<Symbol>; + +struct ParseStateQueueEntry { + preceding_symbols: SymbolSequence, + item_set: ParseItemSet, + state_id: ParseStateId, +} + +struct ParseTableBuilder<'a> { + syntax_grammar: &'a SyntaxGrammar, + lexical_grammar: &'a LexicalGrammar, + simple_aliases: &'a AliasMap, + state_ids_by_item_set: HashMap<ParseItemSet, ParseStateId>, + item_sets_by_state_id: Vec<&'a ParseItemSet>, + parse_state_queue: VecDeque<ParseStateQueueEntry>, + parse_table: ParseTable, +} + +pub fn build_tables( + syntax_grammar: &SyntaxGrammar, + lexical_grammar: &LexicalGrammar, + simple_aliases: &AliasMap +) -> Result<(ParseTable, LexTable, LexTable, Option<Symbol>)> { + unimplemented!(); } diff --git a/src/error.rs b/src/error.rs new file mode 100644 index 00000000..90e7b8f9 --- /dev/null +++ b/src/error.rs @@ -0,0 +1,13 @@ +#[derive(Debug)] +pub enum Error { + GrammarError(String), + SymbolError(String), +} + +pub type Result<T> = std::result::Result<T, Error>; + +impl From<serde_json::Error> for Error { + fn from(error: serde_json::Error) -> Self { + Error::GrammarError(error.to_string()) + } +} diff --git a/src/generate.rs b/src/generate.rs new file mode 100644 index 00000000..4507fb6f --- /dev/null +++ b/src/generate.rs @@ -0,0 +1,26 @@ +use crate::error::Result; +use crate::parse_grammar::parse_grammar; +use crate::prepare_grammar::prepare_grammar; +use crate::build_tables::build_tables; +use crate::render::render_c_code; + +pub fn generate_parser_for_grammar(input: String) -> Result<String> { + let input_grammar = parse_grammar(&input)?; + let (syntax_grammar, lexical_grammar, simple_aliases) = prepare_grammar(&input_grammar)?; + let (parse_table, main_lex_table, keyword_lex_table, keyword_capture_token) = build_tables( + &syntax_grammar, + &lexical_grammar, + &simple_aliases + )?; + let c_code = render_c_code( + &input_grammar.name, + parse_table, + main_lex_table, + keyword_lex_table, + keyword_capture_token, + syntax_grammar, + lexical_grammar, + simple_aliases + ); + Ok(c_code) +} diff --git a/src/grammars.rs b/src/grammars.rs new file mode 100644 index 00000000..6f5b772e --- /dev/null +++
b/src/grammars.rs @@ -0,0 +1,98 @@ +use crate::rules::{Associativity, Alias, Rule, Symbol}; + +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum VariableType { + Hidden, + Auxiliary, + Anonymous, + Named +} + +// Input grammar + +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct InputVariable { + pub name: String, + pub kind: VariableType, + pub rule: Rule, +} + +#[derive(PartialEq, Eq)] +pub struct InputGrammar { + pub name: String, + pub variables: Vec<InputVariable>, + pub extra_tokens: Vec<Rule>, + pub expected_conflicts: Vec<Vec<String>>, + pub external_tokens: Vec<Rule>, + pub variables_to_inline: Vec<String>, + pub word_token: Option<String>, +} + +// Extracted lexical grammar + +#[derive(PartialEq, Eq)] +pub struct LexicalVariable { + name: String, + kind: VariableType, + rule: Rule, + is_string: bool, +} + +pub struct LexicalGrammar { + variables: Vec<LexicalVariable>, + separators: Vec<Rule>, +} + +// Extracted syntax grammar + +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct ProductionStep { + symbol: Symbol, + precedence: i32, + associativity: Option<Associativity>, + alias: Option<Alias>, + is_excluded: bool, +} + +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct Production { + steps: Vec<ProductionStep>, + dynamic_precedence: i32, +} + +#[derive(Clone, PartialEq, Eq)] +pub struct SyntaxVariable { + name: String, + kind: VariableType, +} + +#[derive(Clone, PartialEq, Eq)] +pub struct ExternalToken { + name: String, + kind: VariableType, + corresponding_internal_token: Symbol, +} + +pub struct SyntaxGrammar { + variables: Vec<SyntaxVariable>, + extra_tokens: Vec<Symbol>, + expected_conflicts: Vec<Vec<Symbol>>, + external_tokens: Vec<ExternalToken>, + variables_to_inline: Vec<Symbol>, + word_token: Symbol, +} + +#[cfg(test)] +impl InputVariable { + pub fn named(name: &str, rule: Rule) -> Self { + Self { name: name.to_string(), kind: VariableType::Named, rule } + } + + pub fn auxiliary(name: &str, rule: Rule) -> Self { + Self { name: name.to_string(), kind: VariableType::Auxiliary, rule } + } + + pub fn hidden(name: &str, rule: Rule) -> Self { + Self { name: name.to_string(), kind: VariableType::Hidden, rule } + } +} diff --git a/src/main.rs b/src/main.rs new file mode 100644 index 00000000..3eeb306a --- /dev/null +++ b/src/main.rs @@ -0,0 +1,35 @@ +use clap::{App, Arg, SubCommand}; + +#[macro_use] extern crate serde_derive; +#[macro_use] extern crate serde_json; + +mod build_tables; +mod error; +mod generate; +mod grammars; +mod parse_grammar; +mod prepare_grammar; +mod render; +mod rules; +mod tables; + +fn main() { + let matches = App::new("tree-sitter") + .version("0.1") + .author("Max Brunsfeld ") + .about("Generates and tests parsers") + .subcommand( + SubCommand::with_name("generate") + .about("Generate a parser") + ).subcommand( + SubCommand::with_name("parse") + .about("Parse a file") + .arg(Arg::with_name("path").index(1)) + ).subcommand( + SubCommand::with_name("test") + .about("Run a parser's tests") + .arg(Arg::with_name("path").index(1).required(true)) + .arg(Arg::with_name("line").index(2).required(true)) + .arg(Arg::with_name("column").index(3).required(true)) + ); } diff --git a/src/parse_grammar.rs b/src/parse_grammar.rs new file mode 100644 index 00000000..4c21e5ba --- /dev/null +++ b/src/parse_grammar.rs @@ -0,0 +1,153 @@ +use serde_json::{Map, Value}; +use crate::error::Result; +use crate::grammars::{InputGrammar, InputVariable, VariableType}; +use crate::rules::Rule; +use std::collections::HashMap; + +#[derive(Deserialize)] +#[serde(tag = "type")] +#[allow(non_camel_case_types)] +pub enum RuleJSON { + BLANK, + STRING { + value: String, + }, + PATTERN { + value: String, + }, + SYMBOL { + name: String, + }, + CHOICE {
members: Vec<RuleJSON>, + }, + SEQ { + members: Vec<RuleJSON>, + }, + REPEAT { + content: Box<RuleJSON>, + }, + PREC_LEFT { + value: i32, + content: Box<RuleJSON>, + }, + PREC_RIGHT { + value: i32, + content: Box<RuleJSON>, + }, + PREC { + value: i32, + content: Box<RuleJSON>, + }, + TOKEN { + content: Box<RuleJSON>, + }, + TOKEN_IMMEDIATE { + content: Box<RuleJSON>, + }, +} + +#[derive(Deserialize)] +struct GrammarJSON { + name: String, + rules: Map<String, Value>, + conflicts: Option<Vec<Vec<String>>>, + externals: Option<Vec<RuleJSON>>, + extras: Option<Vec<RuleJSON>>, + inline: Option<Vec<String>>, + word: Option<String>, +} + +pub fn parse_grammar(input: &str) -> Result<InputGrammar> { + let grammar_json: GrammarJSON = serde_json::from_str(&input)?; + + let mut variables = Vec::with_capacity(grammar_json.rules.len()); + for (name, value) in grammar_json.rules { + variables.push(InputVariable { + name: name.to_owned(), + kind: VariableType::Named, + rule: parse_rule(serde_json::from_value(value)?), + }) + } + + let extra_tokens = grammar_json.extras + .unwrap_or(Vec::new()) + .into_iter() + .map(parse_rule) + .collect(); + let external_tokens = grammar_json.externals + .unwrap_or(Vec::new()) + .into_iter() + .map(parse_rule) + .collect(); + let expected_conflicts = grammar_json.conflicts + .unwrap_or(Vec::new()); + let variables_to_inline = grammar_json.inline + .unwrap_or(Vec::new()); + + Ok(InputGrammar { + name: grammar_json.name, + word_token: grammar_json.word, + variables, + extra_tokens, + expected_conflicts, + external_tokens, + variables_to_inline, + }) +} + +fn parse_rule(json: RuleJSON) -> Rule { + match json { + RuleJSON::BLANK => Rule::Blank, + RuleJSON::STRING { value } => Rule::String(value), + RuleJSON::PATTERN { value } => Rule::Pattern(value), + RuleJSON::SYMBOL { name } => Rule::NamedSymbol(name), + RuleJSON::CHOICE { members } => Rule::choice(members.into_iter().map(parse_rule).collect()), + RuleJSON::SEQ { members } => Rule::seq(members.into_iter().map(parse_rule).collect()), + RuleJSON::REPEAT { content } => Rule::repeat(parse_rule(*content)), + RuleJSON::PREC { value, content } => Rule::prec(value, parse_rule(*content)), + RuleJSON::PREC_LEFT { value, content } => Rule::prec_left(value, parse_rule(*content)), + RuleJSON::PREC_RIGHT { value, content } => Rule::prec_right(value, parse_rule(*content)), + RuleJSON::TOKEN { content } => Rule::token(parse_rule(*content)), + RuleJSON::TOKEN_IMMEDIATE { content } => Rule::immediate_token(parse_rule(*content)), + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_parse_grammar() { + let grammar = parse_grammar(&json!({ + "name": "my_lang", + "rules": { + "file": { + "type": "REPEAT", + "content": { + "type": "SYMBOL", + "name": "statement" + } + }, + "statement": { + "type": "STRING", + "value": "foo" + } + } + }).to_string()).unwrap(); + + assert_eq!(grammar.name, "my_lang"); + assert_eq!(grammar.variables, vec![ + InputVariable { + name: "file".to_string(), + kind: VariableType::Named, + rule: Rule::repeat(Rule::NamedSymbol("statement".to_string())) + }, + InputVariable { + name: "statement".to_string(), + kind: VariableType::Named, + rule: Rule::String("foo".to_string()) + }, + ]); + } +} diff --git a/src/prepare_grammar/expand_repeats.rs b/src/prepare_grammar/expand_repeats.rs new file mode 100644 index 00000000..69db150c --- /dev/null +++ b/src/prepare_grammar/expand_repeats.rs @@ -0,0 +1,220 @@ +use crate::rules::{Rule, Symbol}; +use crate::grammars::{InputVariable, VariableType}; +use std::collections::HashMap; +use std::mem; +use std::rc::Rc; +use super::ExtractedGrammar; + +struct Expander { + variable_name: String, + repeat_count_in_variable: usize,
preceding_symbol_count: usize, + auxiliary_variables: Vec, + existing_repeats: HashMap +} + +impl Expander { + fn expand_variable(&mut self, variable: &mut InputVariable) { + self.variable_name.clear(); + self.variable_name.push_str(&variable.name); + self.repeat_count_in_variable = 0; + let mut rule = Rule::Blank; + mem::swap(&mut rule, &mut variable.rule); + variable.rule = self.expand_rule(&rule); + } + + fn expand_rule(&mut self, rule: &Rule) -> Rule { + match rule { + Rule::Choice { elements } => + Rule::Choice { + elements: elements.iter().map(|element| self.expand_rule(element)).collect() + }, + + Rule::Seq { left, right } => + Rule::Seq { + left: Rc::new(self.expand_rule(left)), + right: Rc::new(self.expand_rule(right)), + }, + + Rule::Repeat(content) => { + let inner_rule = self.expand_rule(content); + + if let Some(existing_symbol) = self.existing_repeats.get(&inner_rule) { + return Rule::Symbol(*existing_symbol); + } + + self.repeat_count_in_variable += 1; + let rule_name = format!("{}_repeat{}", self.variable_name, self.repeat_count_in_variable); + let repeat_symbol = Symbol::non_terminal(self.preceding_symbol_count + self.auxiliary_variables.len()); + let rc_symbol = Rc::new(Rule::Symbol(repeat_symbol)); + self.existing_repeats.insert(inner_rule.clone(), repeat_symbol); + self.auxiliary_variables.push(InputVariable { + name: rule_name, + kind: VariableType::Auxiliary, + rule: Rule::Choice { + elements: vec![ + Rule::Seq { + left: rc_symbol.clone(), + right: rc_symbol + }, + inner_rule + ], + }, + }); + + Rule::Symbol(repeat_symbol) + } + + Rule::Metadata { rule, params } => Rule::Metadata { + rule: Rc::new(self.expand_rule(rule)), + params: params.clone() + }, + + _ => rule.clone() + } + } +} + +pub(super) fn expand_repeats(mut grammar: ExtractedGrammar) -> ExtractedGrammar { + let mut expander = Expander { + variable_name: String::new(), + repeat_count_in_variable: 0, + preceding_symbol_count: grammar.variables.len(), + auxiliary_variables: Vec::new(), + existing_repeats: HashMap::new(), + }; + + for mut variable in grammar.variables.iter_mut() { + expander.expand_variable(&mut variable); + } + + grammar.variables.extend(expander.auxiliary_variables.into_iter()); + grammar +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_basic_repeat_expansion() { + // Repeats nested inside of sequences and choices are expanded. + let grammar = expand_repeats(build_grammar(vec![ + InputVariable::named("rule0", Rule::seq(vec![ + Rule::terminal(10), + Rule::choice(vec![ + Rule::repeat(Rule::terminal(11)), + Rule::repeat(Rule::terminal(12)), + ]), + Rule::terminal(13), + ])), + ])); + + assert_eq!(grammar.variables, vec![ + InputVariable::named("rule0", Rule::seq(vec![ + Rule::terminal(10), + Rule::choice(vec![ + Rule::non_terminal(1), + Rule::non_terminal(2), + ]), + Rule::terminal(13), + ])), + InputVariable::auxiliary("rule0_repeat1", Rule::choice(vec![ + Rule::seq(vec![ + Rule::non_terminal(1), + Rule::non_terminal(1), + ]), + Rule::terminal(11), + ])), + InputVariable::auxiliary("rule0_repeat2", Rule::choice(vec![ + Rule::seq(vec![ + Rule::non_terminal(2), + Rule::non_terminal(2), + ]), + Rule::terminal(12), + ])), + ]); + } + + #[test] + fn test_repeat_deduplication() { + // Terminal 4 appears inside of a repeat in three different places. 
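+ // Deduplication works because `existing_repeats` is keyed on the expanded inner rule, so all three occurrences should collapse to one auxiliary symbol.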
+ let grammar = expand_repeats(build_grammar(vec![ + InputVariable::named("rule0", Rule::choice(vec![ + Rule::seq(vec![ Rule::terminal(1), Rule::repeat(Rule::terminal(4)) ]), + Rule::seq(vec![ Rule::terminal(2), Rule::repeat(Rule::terminal(4)) ]), + ])), + InputVariable::named("rule1", Rule::seq(vec![ + Rule::terminal(3), + Rule::repeat(Rule::terminal(4)), + ])), + ])); + + // Only one auxiliary rule is created for repeating terminal 4. + assert_eq!(grammar.variables, vec![ + InputVariable::named("rule0", Rule::choice(vec![ + Rule::seq(vec![ Rule::terminal(1), Rule::non_terminal(2) ]), + Rule::seq(vec![ Rule::terminal(2), Rule::non_terminal(2) ]), + ])), + InputVariable::named("rule1", Rule::seq(vec![ + Rule::terminal(3), + Rule::non_terminal(2), + ])), + InputVariable::auxiliary("rule0_repeat1", Rule::choice(vec![ + Rule::seq(vec![ + Rule::non_terminal(2), + Rule::non_terminal(2), + ]), + Rule::terminal(4), + ])) + ]); + } + + #[test] + fn test_expansion_of_nested_repeats() { + let grammar = expand_repeats(build_grammar(vec![ + InputVariable::named("rule0", Rule::seq(vec![ + Rule::terminal(10), + Rule::repeat(Rule::seq(vec![ + Rule::terminal(11), + Rule::repeat(Rule::terminal(12)) + ])), + ])), + ])); + + assert_eq!(grammar.variables, vec![ + InputVariable::named("rule0", Rule::seq(vec![ + Rule::terminal(10), + Rule::non_terminal(2), + ])), + InputVariable::auxiliary("rule0_repeat1", Rule::choice(vec![ + Rule::seq(vec![ + Rule::non_terminal(1), + Rule::non_terminal(1), + ]), + Rule::terminal(12), + ])), + InputVariable::auxiliary("rule0_repeat2", Rule::choice(vec![ + Rule::seq(vec![ + Rule::non_terminal(2), + Rule::non_terminal(2), + ]), + Rule::seq(vec![ + Rule::terminal(11), + Rule::non_terminal(1), + ]), + ])), + ]); + } + + fn build_grammar(variables: Vec) -> ExtractedGrammar { + ExtractedGrammar { + variables, + extra_tokens: Vec::new(), + external_tokens: Vec::new(), + expected_conflicts: Vec::new(), + variables_to_inline: Vec::new(), + word_token: None, + } + } +} diff --git a/src/prepare_grammar/extract_simple_aliases.rs b/src/prepare_grammar/extract_simple_aliases.rs new file mode 100644 index 00000000..250246f3 --- /dev/null +++ b/src/prepare_grammar/extract_simple_aliases.rs @@ -0,0 +1,10 @@ +use crate::rules::AliasMap; +use crate::grammars::{LexicalGrammar, SyntaxGrammar}; +use super::ExtractedGrammar; + +pub(super) fn extract_simple_aliases( + syntax_grammar: &mut SyntaxGrammar, + lexical_grammar: &mut LexicalGrammar +) -> AliasMap { + unimplemented!(); +} diff --git a/src/prepare_grammar/extract_tokens.rs b/src/prepare_grammar/extract_tokens.rs new file mode 100644 index 00000000..660d3819 --- /dev/null +++ b/src/prepare_grammar/extract_tokens.rs @@ -0,0 +1,7 @@ +use crate::error::Result; +use crate::grammars::LexicalGrammar; +use super::{InternedGrammar, ExtractedGrammar}; + +pub(super) fn extract_tokens(grammar: InternedGrammar) -> Result<(ExtractedGrammar, LexicalGrammar)> { + unimplemented!(); +} diff --git a/src/prepare_grammar/flatten_grammar.rs b/src/prepare_grammar/flatten_grammar.rs new file mode 100644 index 00000000..36fe76c9 --- /dev/null +++ b/src/prepare_grammar/flatten_grammar.rs @@ -0,0 +1,7 @@ +use crate::error::Result; +use crate::grammars::SyntaxGrammar; +use super::ExtractedGrammar; + +pub(super) fn flatten_grammar(grammar: ExtractedGrammar) -> Result { + unimplemented!(); +} diff --git a/src/prepare_grammar/intern_symbols.rs b/src/prepare_grammar/intern_symbols.rs new file mode 100644 index 00000000..00a5c330 --- /dev/null +++ 
b/src/prepare_grammar/intern_symbols.rs @@ -0,0 +1,237 @@ +use crate::error::{Error, Result}; +use crate::rules::{Rule, Symbol}; +use crate::grammars::{InputGrammar, InputVariable, VariableType}; +use std::rc::Rc; +use super::InternedGrammar; + +pub(super) fn intern_symbols(grammar: &InputGrammar) -> Result { + let interner = Interner { grammar }; + + if variable_type_for_name(&grammar.variables[0].name) == VariableType::Hidden { + return Err(Error::GrammarError("Grammar's start rule must be visible".to_string())); + } + + let mut variables = Vec::with_capacity(grammar.variables.len()); + for variable in grammar.variables.iter() { + variables.push(InputVariable { + name: variable.name.clone(), + kind: variable_type_for_name(&variable.name), + rule: interner.intern_rule(&variable.rule)?, + }); + } + + let mut external_tokens = Vec::with_capacity(grammar.external_tokens.len()); + for external_token in grammar.external_tokens.iter() { + let rule = interner.intern_rule(&external_token)?; + let (name, kind) = if let Rule::NamedSymbol(name) = external_token { + (name.clone(), variable_type_for_name(&name)) + } else { + (String::new(), VariableType::Anonymous) + }; + external_tokens.push(InputVariable { name, kind, rule }); + } + + let mut extra_tokens = Vec::with_capacity(grammar.extra_tokens.len()); + for extra_token in grammar.extra_tokens.iter() { + extra_tokens.push(interner.intern_rule(extra_token)?); + } + + let mut expected_conflicts = Vec::new(); + for conflict in grammar.expected_conflicts.iter() { + let mut interned_conflict = Vec::with_capacity(conflict.len()); + for name in conflict { + interned_conflict.push(interner + .intern_name(&name) + .ok_or_else(|| symbol_error(name))? + ); + } + expected_conflicts.push(interned_conflict); + } + + let mut variables_to_inline = Vec::new(); + for name in grammar.variables_to_inline.iter() { + if let Some(symbol) = interner.intern_name(&name) { + variables_to_inline.push(symbol); + } + } + + let mut word_token = None; + if let Some(name) = grammar.word_token.as_ref() { + word_token = Some(interner + .intern_name(&name) + .ok_or_else(|| symbol_error(&name))? 
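+ // (A missing name here is an undefined-symbol error, surfaced via symbol_error.)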
+ ); + } + + Ok(InternedGrammar { + variables, + external_tokens, + extra_tokens, + expected_conflicts, + variables_to_inline, + word_token, + }) +} + +struct Interner<'a> { + grammar: &'a InputGrammar +} + +impl<'a> Interner<'a> { + fn intern_rule(&self, rule: &Rule) -> Result { + match rule { + Rule::Choice { elements } => { + let mut result = Vec::with_capacity(elements.len()); + for element in elements { + result.push(self.intern_rule(element)?); + } + Ok(Rule::Choice { elements: result }) + }, + + Rule::Seq { left, right } => + Ok(Rule::Seq { + left: Rc::new(self.intern_rule(left)?), + right: Rc::new(self.intern_rule(right)?), + }), + + Rule::Repeat(content) => + Ok(Rule::Repeat(Rc::new(self.intern_rule(content)?))), + + Rule::Metadata { rule, params } => + Ok(Rule::Metadata { + rule: Rc::new(self.intern_rule(rule)?), + params: params.clone() + }), + + Rule::NamedSymbol(name) => { + if let Some(symbol) = self.intern_name(&name) { + Ok(Rule::Symbol(symbol)) + } else { + Err(symbol_error(name)) + } + }, + + _ => Ok(rule.clone()) + + } + } + + fn intern_name(&self, symbol: &str) -> Option { + for (i, variable) in self.grammar.variables.iter().enumerate() { + if variable.name == symbol { + return Some(Symbol::non_terminal(i)) + } + } + + for (i, external_token) in self.grammar.external_tokens.iter().enumerate() { + if let Rule::NamedSymbol(name) = external_token { + if name == symbol { + return Some(Symbol::external(i)) + } + } + } + + return None + } +} + +fn symbol_error(name: &str) -> Error { + Error::SymbolError(format!("Undefined symbol '{}'", name)) +} + +fn variable_type_for_name(name: &str) -> VariableType { + if name.starts_with("_") { + VariableType::Hidden + } else { + VariableType::Named + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_basic_repeat_expansion() { + let grammar = intern_symbols(&build_grammar(vec![ + InputVariable::named("x", Rule::choice(vec![ + Rule::named("y"), + Rule::named("_z"), + ])), + InputVariable::named("y", Rule::named("_z")), + InputVariable::named("_z", Rule::string("a")), + ])).unwrap(); + + assert_eq!(grammar.variables, vec![ + InputVariable::named("x", Rule::choice(vec![ + Rule::non_terminal(1), + Rule::non_terminal(2), + ])), + InputVariable::named("y", Rule::non_terminal(2)), + InputVariable::hidden("_z", Rule::string("a")), + ]); + } + + #[test] + fn test_interning_external_token_names() { + // Variable `y` is both an internal and an external token. + // Variable `z` is just an external token. + let mut input_grammar = build_grammar(vec![ + InputVariable::named("w", Rule::choice(vec![ + Rule::named("x"), + Rule::named("y"), + Rule::named("z"), + ])), + InputVariable::named("x", Rule::string("a")), + InputVariable::named("y", Rule::string("b")), + ]); + input_grammar.external_tokens.extend(vec![ + Rule::named("y"), + Rule::named("z"), + ]); + + let grammar = intern_symbols(&input_grammar).unwrap(); + + // Variable `y` is referred to by its internal index. + // Variable `z` is referred to by its external index. + assert_eq!(grammar.variables, vec![ + InputVariable::named("w", Rule::choice(vec![ + Rule::non_terminal(1), + Rule::non_terminal(2), + Rule::external(1), + ])), + InputVariable::named("x", Rule::string("a")), + InputVariable::named("y", Rule::string("b")), + ]); + + // The external token for `y` refers back to its internal index. 
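+ // (non_terminal(2) is `y`'s index among the grammar variables; external(1) is `z`'s index in the externals list.)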
+ assert_eq!(grammar.external_tokens, vec![ + InputVariable::named("y", Rule::non_terminal(2)), + InputVariable::named("z", Rule::external(1)), + ]); + } + + #[test] + fn test_grammar_with_undefined_symbols() { + let result = intern_symbols(&build_grammar(vec![ + InputVariable::named("x", Rule::named("y")), + ])); + + match result { + Err(Error::SymbolError(message)) => assert_eq!(message, "Undefined symbol 'y'"), + _ => panic!("Expected an error but got none"), + } + } + + fn build_grammar(variables: Vec) -> InputGrammar { + InputGrammar { + variables, + name: "the_language".to_string(), + extra_tokens: Vec::new(), + external_tokens: Vec::new(), + expected_conflicts: Vec::new(), + variables_to_inline: Vec::new(), + word_token: None, + } + } +} diff --git a/src/prepare_grammar/mod.rs b/src/prepare_grammar/mod.rs new file mode 100644 index 00000000..0788edca --- /dev/null +++ b/src/prepare_grammar/mod.rs @@ -0,0 +1,40 @@ +mod intern_symbols; +mod extract_tokens; +mod expand_repeats; +mod flatten_grammar; +mod normalize_rules; +mod extract_simple_aliases; + +use crate::rules::{AliasMap, Rule, Symbol}; +use crate::grammars::{InputGrammar, SyntaxGrammar, LexicalGrammar, InputVariable, ExternalToken}; +use crate::error::Result; +use self::intern_symbols::intern_symbols; +use self::extract_tokens::extract_tokens; +use self::expand_repeats::expand_repeats; +use self::flatten_grammar::flatten_grammar; +use self::normalize_rules::normalize_rules; +use self::extract_simple_aliases::extract_simple_aliases; + +pub(self) struct IntermediateGrammar { + variables: Vec, + extra_tokens: Vec, + expected_conflicts: Vec>, + external_tokens: Vec, + variables_to_inline: Vec, + word_token: Option, +} + +pub(self) type InternedGrammar = IntermediateGrammar; +pub(self) type ExtractedGrammar = IntermediateGrammar; + +pub fn prepare_grammar( + input_grammar: &InputGrammar +) -> Result<(SyntaxGrammar, LexicalGrammar, AliasMap)> { + let interned_grammar = intern_symbols(input_grammar)?; + let (syntax_grammar, lexical_grammar) = extract_tokens(interned_grammar)?; + let syntax_grammar = expand_repeats(syntax_grammar); + let mut syntax_grammar = flatten_grammar(syntax_grammar)?; + let mut lexical_grammar = normalize_rules(lexical_grammar); + let simple_aliases = extract_simple_aliases(&mut syntax_grammar, &mut lexical_grammar); + Ok((syntax_grammar, lexical_grammar, simple_aliases)) +} diff --git a/src/prepare_grammar/normalize_rules.rs b/src/prepare_grammar/normalize_rules.rs new file mode 100644 index 00000000..9e625ef5 --- /dev/null +++ b/src/prepare_grammar/normalize_rules.rs @@ -0,0 +1,5 @@ +use crate::grammars::LexicalGrammar; + +pub(super) fn normalize_rules(grammar: LexicalGrammar) -> LexicalGrammar { + unimplemented!(); +} diff --git a/src/render/mod.rs b/src/render/mod.rs new file mode 100644 index 00000000..85ce1f32 --- /dev/null +++ b/src/render/mod.rs @@ -0,0 +1,16 @@ +use crate::rules::{Symbol, AliasMap}; +use crate::grammars::{SyntaxGrammar, LexicalGrammar}; +use crate::tables::{ParseTable, LexTable}; + +pub fn render_c_code( + name: &str, + parse_table: ParseTable, + main_lex_table: LexTable, + keyword_lex_table: LexTable, + keyword_capture_token: Option, + syntax_grammar: SyntaxGrammar, + lexical_grammar: LexicalGrammar, + simple_aliases: AliasMap, +) -> String { + unimplemented!(); +} diff --git a/src/rules.rs b/src/rules.rs new file mode 100644 index 00000000..3cccca0d --- /dev/null +++ b/src/rules.rs @@ -0,0 +1,205 @@ +use std::rc::Rc; +use std::collections::HashMap; + +#[derive(Clone, Copy, Debug, 
PartialEq, Eq, Hash)] +pub enum SymbolType { + External, + Terminal, + NonTerminal, +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] +pub enum Associativity { + Left, + Right +} + +#[derive(Clone, Debug, PartialEq, Eq, Hash)] +pub struct Alias { + value: String, + is_named: bool, +} + +pub type AliasMap = HashMap; + +#[derive(Clone, Debug, Default, PartialEq, Eq, Hash)] +pub struct MetadataParams { + precedence: Option, + dynamic_precedence: i32, + associativity: Option, + is_token: bool, + is_string: bool, + is_active: bool, + is_main_token: bool, + is_excluded: bool, + alias: Option, +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] +pub struct Symbol { + kind: SymbolType, + index: usize, +} + +#[derive(Clone, Debug, PartialEq, Eq, Hash)] +pub enum Rule { + Blank, + CharacterSet(Vec), + String(String), + Pattern(String), + NamedSymbol(String), + Symbol(Symbol), + Choice { + elements: Vec, + }, + Metadata { + params: MetadataParams, + rule: Rc, + }, + Repeat(Rc), + Seq { + left: Rc, + right: Rc, + } +} + +impl Rule { + pub fn token(content: Rule) -> Self { + add_metadata(content, |params| { + params.is_token = true; + }) + } + + pub fn immediate_token(content: Rule) -> Self { + add_metadata(content, |params| { + params.is_token = true; + params.is_main_token = true; + }) + } + + pub fn prec(value: i32, content: Rule) -> Self { + add_metadata(content, |params| { + params.precedence = Some(value); + }) + } + + pub fn prec_left(value: i32, content: Rule) -> Self { + add_metadata(content, |params| { + params.associativity = Some(Associativity::Left); + params.precedence = Some(value); + }) + } + + pub fn prec_right(value: i32, content: Rule) -> Self { + add_metadata(content, |params| { + params.associativity = Some(Associativity::Right); + params.precedence = Some(value); + }) + } + + pub fn repeat(rule: Rule) -> Self { + Rule::Repeat(Rc::new(rule)) + } + + pub fn choice(rules: Vec) -> Self { + let mut elements = Vec::with_capacity(rules.len()); + for rule in rules { + choice_helper(&mut elements, rule); + } + Rule::Choice { elements } + } + + pub fn seq(rules: Vec) -> Self { + let mut result = Rule::Blank; + for rule in rules { + match rule { + Rule::Blank => continue, + Rule::Metadata { rule, params: _ } => { + if *rule == Rule::Blank { + continue; + } + }, + _ => { + if result == Rule::Blank { + result = rule; + } else { + result = Rule::Seq { + left: Rc::new(result), + right: Rc::new(rule), + } + } + } + } + } + result + } + + pub fn terminal(index: usize) -> Self { + Rule::Symbol(Symbol::terminal(index)) + } + + pub fn non_terminal(index: usize) -> Self { + Rule::Symbol(Symbol::non_terminal(index)) + } + + pub fn external(index: usize) -> Self { + Rule::Symbol(Symbol::external(index)) + } + + pub fn named(name: &'static str) -> Self { + Rule::NamedSymbol(name.to_string()) + } + + pub fn string(value: &'static str) -> Self { + Rule::String(value.to_string()) + } +} + +impl Symbol { + pub fn non_terminal(index: usize) -> Self { + Symbol { kind: SymbolType::NonTerminal, index } + } + + pub fn terminal(index: usize) -> Self { + Symbol { kind: SymbolType::Terminal, index } + } + + pub fn external(index: usize) -> Self { + Symbol { kind: SymbolType::External, index } + } +} + +impl From for Rule { + fn from(symbol: Symbol) -> Self { + Rule::Symbol(symbol) + } +} + +fn add_metadata(input: Rule, f: T) -> Rule { + match input { + Rule::Metadata { rule, mut params } => { + f(&mut params); + Rule::Metadata { rule, params } + }, + _ => { + let mut params = MetadataParams::default(); + 
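// Start from default MetadataParams and let the caller's closure set the relevant flags. +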
f(&mut params); + Rule::Metadata { rule: Rc::new(input), params } + } + } +} + +fn choice_helper(result: &mut Vec, rule: Rule) { + match rule { + Rule::Choice {elements} => { + for element in elements { + choice_helper(result, element); + } + }, + _ => { + if !result.contains(&rule) { + result.push(rule); + } + } + } +} diff --git a/src/tables.rs b/src/tables.rs new file mode 100644 index 00000000..10b1e41d --- /dev/null +++ b/src/tables.rs @@ -0,0 +1,77 @@ +use std::collections::HashMap; +use std::ops::Range; +use crate::rules::{Associativity, Symbol, Alias}; + +pub type AliasSequenceId = usize; +pub type ParseStateId = usize; +pub type LexStateId = usize; + +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum ParseActionType { + Error, + Shift, + Reduce, + Accept, + Recover, +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum ParseAction { + Accept, + Error, + Shift(ParseStateId), + ShiftExtra, + Recover, + Reduce { + symbol: Symbol, + child_count: usize, + precedence: i32, + dynamic_precedence: i32, + associativity: Option, + alias_sequence_id: Option, + is_repetition: bool, + } +} + +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct ParseTableEntry { + actions: Vec, + reusable: bool, +} + +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct ParseState { + terminal_entries: HashMap, + nonterminal_entries: HashMap +} + +#[derive(Debug, PartialEq, Eq)] +pub struct ParseTable { + states: Vec, + alias_sequences: Vec>, +} + +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct AdvanceAction { + state: LexStateId, + precedence: Range, + in_main_token: bool, +} + +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct AcceptTokenAction { + symbol: Symbol, + precedence: i32, + implicit_precedence: i32, +} + +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct LexState { + advance_actions: HashMap, + accept_action: Option, +} + +#[derive(Debug, PartialEq, Eq)] +pub struct LexTable { + states: Vec, +} From 0688a5edd387e01ca7c83f9bbf2fb732852d2f5d Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 6 Dec 2018 22:11:52 -0800 Subject: [PATCH 050/102] Implement extract_tokens --- src/build_tables/mod.rs | 2 +- src/grammars.rs | 83 +++-- src/parse_grammar.rs | 12 +- src/prepare_grammar/expand_repeats.rs | 36 +- src/prepare_grammar/extract_tokens.rs | 492 +++++++++++++++++++++++++- src/prepare_grammar/intern_symbols.rs | 38 +- src/prepare_grammar/mod.rs | 8 +- src/render/mod.rs | 2 +- src/rules.rs | 52 +-- src/tables.rs | 24 +- 10 files changed, 621 insertions(+), 128 deletions(-) diff --git a/src/build_tables/mod.rs b/src/build_tables/mod.rs index c5dd5b54..c3518428 100644 --- a/src/build_tables/mod.rs +++ b/src/build_tables/mod.rs @@ -25,7 +25,7 @@ struct ParseTableBuilder<'a> { parse_table: ParseTable, } -pub fn build_tables( +pub(crate) fn build_tables( syntax_grammar: &SyntaxGrammar, lexical_grammar: &LexicalGrammar, simple_aliases: &AliasMap diff --git a/src/grammars.rs b/src/grammars.rs index 6f5b772e..62910637 100644 --- a/src/grammars.rs +++ b/src/grammars.rs @@ -1,7 +1,7 @@ use crate::rules::{Associativity, Alias, Rule, Symbol}; #[derive(Clone, Copy, Debug, PartialEq, Eq)] -pub enum VariableType { +pub(crate) enum VariableType { Hidden, Auxiliary, Anonymous, @@ -11,16 +11,16 @@ pub enum VariableType { // Input grammar #[derive(Clone, Debug, PartialEq, Eq)] -pub struct InputVariable { +pub(crate) struct Variable { pub name: String, pub kind: VariableType, pub rule: Rule, } -#[derive(PartialEq, Eq)] -pub struct InputGrammar { +#[derive(Debug, PartialEq, Eq)] +pub(crate) struct 
InputGrammar { pub name: String, - pub variables: Vec, + pub variables: Vec, pub extra_tokens: Vec, pub expected_conflicts: Vec>, pub external_tokens: Vec, @@ -30,60 +30,53 @@ pub struct InputGrammar { // Extracted lexical grammar -#[derive(PartialEq, Eq)] -pub struct LexicalVariable { - name: String, - kind: VariableType, - rule: Rule, - is_string: bool, -} - -pub struct LexicalGrammar { - variables: Vec, - separators: Vec, +#[derive(Debug, PartialEq, Eq)] +pub(crate) struct LexicalGrammar { + pub variables: Vec, + pub separators: Vec, } // Extracted syntax grammar #[derive(Clone, Debug, PartialEq, Eq)] -pub struct ProductionStep { - symbol: Symbol, - precedence: i32, - associativity: Option, - alias: Option, - is_excluded: bool, +pub(crate) struct ProductionStep { + pub symbol: Symbol, + pub precedence: i32, + pub associativity: Option, + pub alias: Option, + pub is_excluded: bool, } #[derive(Clone, Debug, PartialEq, Eq)] -pub struct Production { - steps: Vec, - dynamic_precedence: i32, +pub(crate) struct Production { + pub steps: Vec, + pub dynamic_precedence: i32, } -#[derive(Clone, PartialEq, Eq)] -pub struct SyntaxVariable { - name: String, - kind: VariableType, +#[derive(Clone, Debug, PartialEq, Eq)] +pub(crate) struct SyntaxVariable { + pub name: String, + pub kind: VariableType, } -#[derive(Clone, PartialEq, Eq)] -pub struct ExternalToken { - name: String, - kind: VariableType, - corresponding_internal_token: Symbol, +#[derive(Clone, Debug, PartialEq, Eq)] +pub(crate) struct ExternalToken { + pub name: String, + pub kind: VariableType, + pub corresponding_internal_token: Option, } -pub struct SyntaxGrammar { - variables: Vec, - extra_tokens: Vec, - expected_conflicts: Vec>, - external_tokens: Vec, - variables_to_inline: Vec, - word_token: Symbol, +#[derive(Debug)] +pub(crate) struct SyntaxGrammar { + pub variables: Vec, + pub extra_tokens: Vec, + pub expected_conflicts: Vec>, + pub external_tokens: Vec, + pub variables_to_inline: Vec, + pub word_token: Symbol, } -#[cfg(test)] -impl InputVariable { +impl Variable { pub fn named(name: &str, rule: Rule) -> Self { Self { name: name.to_string(), kind: VariableType::Named, rule } } @@ -95,4 +88,8 @@ impl InputVariable { pub fn hidden(name: &str, rule: Rule) -> Self { Self { name: name.to_string(), kind: VariableType::Hidden, rule } } + + pub fn anonymous(name: &str, rule: Rule) -> Self { + Self { name: name.to_string(), kind: VariableType::Anonymous, rule } + } } diff --git a/src/parse_grammar.rs b/src/parse_grammar.rs index 4c21e5ba..0f1f5008 100644 --- a/src/parse_grammar.rs +++ b/src/parse_grammar.rs @@ -1,13 +1,13 @@ use serde_json::{Map, Value}; use crate::error::Result; -use crate::grammars::{InputGrammar, InputVariable, VariableType}; +use crate::grammars::{InputGrammar, Variable, VariableType}; use crate::rules::Rule; use std::collections::HashMap; #[derive(Deserialize)] #[serde(tag = "type")] #[allow(non_camel_case_types)] -pub enum RuleJSON { +enum RuleJSON { BLANK, STRING { value: String, @@ -58,12 +58,12 @@ struct GrammarJSON { word: Option, } -pub fn parse_grammar(input: &str) -> Result { +pub(crate) fn parse_grammar(input: &str) -> Result { let grammar_json: GrammarJSON = serde_json::from_str(&input)?; let mut variables = Vec::with_capacity(grammar_json.rules.len()); for (name, value) in grammar_json.rules { - variables.push(InputVariable { + variables.push(Variable { name: name.to_owned(), kind: VariableType::Named, rule: parse_rule(serde_json::from_value(value)?), @@ -138,12 +138,12 @@ mod tests { 
assert_eq!(grammar.name, "my_lang"); assert_eq!(grammar.variables, vec![ - InputVariable { + Variable { name: "file".to_string(), kind: VariableType::Named, rule: Rule::repeat(Rule::NamedSymbol("statement".to_string())) }, - InputVariable { + Variable { name: "statement".to_string(), kind: VariableType::Named, rule: Rule::String("foo".to_string()) diff --git a/src/prepare_grammar/expand_repeats.rs b/src/prepare_grammar/expand_repeats.rs index 69db150c..dcb8f916 100644 --- a/src/prepare_grammar/expand_repeats.rs +++ b/src/prepare_grammar/expand_repeats.rs @@ -1,5 +1,5 @@ use crate::rules::{Rule, Symbol}; -use crate::grammars::{InputVariable, VariableType}; +use crate::grammars::{Variable, VariableType}; use std::collections::HashMap; use std::mem; use std::rc::Rc; @@ -9,12 +9,12 @@ struct Expander { variable_name: String, repeat_count_in_variable: usize, preceding_symbol_count: usize, - auxiliary_variables: Vec, + auxiliary_variables: Vec, existing_repeats: HashMap } impl Expander { - fn expand_variable(&mut self, variable: &mut InputVariable) { + fn expand_variable(&mut self, variable: &mut Variable) { self.variable_name.clear(); self.variable_name.push_str(&variable.name); self.repeat_count_in_variable = 0; @@ -48,7 +48,7 @@ impl Expander { let repeat_symbol = Symbol::non_terminal(self.preceding_symbol_count + self.auxiliary_variables.len()); let rc_symbol = Rc::new(Rule::Symbol(repeat_symbol)); self.existing_repeats.insert(inner_rule.clone(), repeat_symbol); - self.auxiliary_variables.push(InputVariable { + self.auxiliary_variables.push(Variable { name: rule_name, kind: VariableType::Auxiliary, rule: Rule::Choice { @@ -100,7 +100,7 @@ mod tests { fn test_basic_repeat_expansion() { // Repeats nested inside of sequences and choices are expanded. let grammar = expand_repeats(build_grammar(vec![ - InputVariable::named("rule0", Rule::seq(vec![ + Variable::named("rule0", Rule::seq(vec![ Rule::terminal(10), Rule::choice(vec![ Rule::repeat(Rule::terminal(11)), @@ -111,7 +111,7 @@ mod tests { ])); assert_eq!(grammar.variables, vec![ - InputVariable::named("rule0", Rule::seq(vec![ + Variable::named("rule0", Rule::seq(vec![ Rule::terminal(10), Rule::choice(vec![ Rule::non_terminal(1), @@ -119,14 +119,14 @@ mod tests { ]), Rule::terminal(13), ])), - InputVariable::auxiliary("rule0_repeat1", Rule::choice(vec![ + Variable::auxiliary("rule0_repeat1", Rule::choice(vec![ Rule::seq(vec![ Rule::non_terminal(1), Rule::non_terminal(1), ]), Rule::terminal(11), ])), - InputVariable::auxiliary("rule0_repeat2", Rule::choice(vec![ + Variable::auxiliary("rule0_repeat2", Rule::choice(vec![ Rule::seq(vec![ Rule::non_terminal(2), Rule::non_terminal(2), @@ -140,11 +140,11 @@ mod tests { fn test_repeat_deduplication() { // Terminal 4 appears inside of a repeat in three different places. let grammar = expand_repeats(build_grammar(vec![ - InputVariable::named("rule0", Rule::choice(vec![ + Variable::named("rule0", Rule::choice(vec![ Rule::seq(vec![ Rule::terminal(1), Rule::repeat(Rule::terminal(4)) ]), Rule::seq(vec![ Rule::terminal(2), Rule::repeat(Rule::terminal(4)) ]), ])), - InputVariable::named("rule1", Rule::seq(vec![ + Variable::named("rule1", Rule::seq(vec![ Rule::terminal(3), Rule::repeat(Rule::terminal(4)), ])), @@ -152,15 +152,15 @@ mod tests { // Only one auxiliary rule is created for repeating terminal 4. 
assert_eq!(grammar.variables, vec![ - InputVariable::named("rule0", Rule::choice(vec![ + Variable::named("rule0", Rule::choice(vec![ Rule::seq(vec![ Rule::terminal(1), Rule::non_terminal(2) ]), Rule::seq(vec![ Rule::terminal(2), Rule::non_terminal(2) ]), ])), - InputVariable::named("rule1", Rule::seq(vec![ + Variable::named("rule1", Rule::seq(vec![ Rule::terminal(3), Rule::non_terminal(2), ])), - InputVariable::auxiliary("rule0_repeat1", Rule::choice(vec![ + Variable::auxiliary("rule0_repeat1", Rule::choice(vec![ Rule::seq(vec![ Rule::non_terminal(2), Rule::non_terminal(2), @@ -173,7 +173,7 @@ mod tests { #[test] fn test_expansion_of_nested_repeats() { let grammar = expand_repeats(build_grammar(vec![ - InputVariable::named("rule0", Rule::seq(vec![ + Variable::named("rule0", Rule::seq(vec![ Rule::terminal(10), Rule::repeat(Rule::seq(vec![ Rule::terminal(11), @@ -183,18 +183,18 @@ mod tests { ])); assert_eq!(grammar.variables, vec![ - InputVariable::named("rule0", Rule::seq(vec![ + Variable::named("rule0", Rule::seq(vec![ Rule::terminal(10), Rule::non_terminal(2), ])), - InputVariable::auxiliary("rule0_repeat1", Rule::choice(vec![ + Variable::auxiliary("rule0_repeat1", Rule::choice(vec![ Rule::seq(vec![ Rule::non_terminal(1), Rule::non_terminal(1), ]), Rule::terminal(12), ])), - InputVariable::auxiliary("rule0_repeat2", Rule::choice(vec![ + Variable::auxiliary("rule0_repeat2", Rule::choice(vec![ Rule::seq(vec![ Rule::non_terminal(2), Rule::non_terminal(2), @@ -207,7 +207,7 @@ mod tests { ]); } - fn build_grammar(variables: Vec) -> ExtractedGrammar { + fn build_grammar(variables: Vec) -> ExtractedGrammar { ExtractedGrammar { variables, extra_tokens: Vec::new(), diff --git a/src/prepare_grammar/extract_tokens.rs b/src/prepare_grammar/extract_tokens.rs index 660d3819..ee90b3c8 100644 --- a/src/prepare_grammar/extract_tokens.rs +++ b/src/prepare_grammar/extract_tokens.rs @@ -1,7 +1,491 @@ -use crate::error::Result; -use crate::grammars::LexicalGrammar; +use std::collections::HashMap; +use std::rc::Rc; +use std::mem; +use crate::error::{Error, Result}; +use crate::rules::{Rule, MetadataParams, Symbol, SymbolType}; +use crate::grammars::{Variable, VariableType, LexicalGrammar, ExternalToken}; use super::{InternedGrammar, ExtractedGrammar}; -pub(super) fn extract_tokens(grammar: InternedGrammar) -> Result<(ExtractedGrammar, LexicalGrammar)> { - unimplemented!(); +pub(super) fn extract_tokens( + mut grammar: InternedGrammar +) -> Result<(ExtractedGrammar, LexicalGrammar)> { + let mut extractor = TokenExtractor { + current_variable_name: String::new(), + current_variable_token_count: 0, + extracted_variables: Vec::new(), + extracted_usage_counts: Vec::new(), + }; + + for mut variable in grammar.variables.iter_mut() { + extractor.extract_tokens_in_variable(&mut variable); + } + + for mut variable in grammar.external_tokens.iter_mut() { + extractor.extract_tokens_in_variable(&mut variable); + } + + let mut lexical_variables = Vec::with_capacity(extractor.extracted_variables.len()); + for variable in extractor.extracted_variables { + lexical_variables.push(Variable { + name: variable.name, + kind: variable.kind, + rule: variable.rule, + }); + } + + // If a variable's entire rule was extracted as a token and that token didn't + // appear within any other rule, then remove that variable from the syntax + // grammar, giving its name to the token in the lexical grammar. Any symbols + // that pointed to that variable will need to be updated to point to the + // variable in the lexical grammar. 
Symbols that pointed to later variables + // will need to have their indices decremented. + let mut variables = Vec::new(); + let mut symbol_replacer = SymbolReplacer { replacements: HashMap::new() }; + for (i, variable) in grammar.variables.into_iter().enumerate() { + if let Rule::Symbol(Symbol { kind: SymbolType::Terminal, index }) = variable.rule { + if i > 0 && extractor.extracted_usage_counts[index] == 1 { + let mut lexical_variable = &mut lexical_variables[index]; + lexical_variable.kind = variable.kind; + lexical_variable.name = variable.name; + symbol_replacer.replacements.insert(i, index); + continue; + } + } + variables.push(variable); + } + + for variable in variables.iter_mut() { + variable.rule = symbol_replacer.replace_symbols_in_rule(&variable.rule); + } + + let expected_conflicts = grammar.expected_conflicts + .into_iter() + .map(|conflict| + conflict + .iter() + .map(|symbol| symbol_replacer.replace_symbol(*symbol)) + .collect() + ).collect(); + + let variables_to_inline = grammar.variables_to_inline + .into_iter() + .map(|symbol| symbol_replacer.replace_symbol(symbol)) + .collect(); + + let mut separators = Vec::new(); + let mut extra_tokens = Vec::new(); + for rule in grammar.extra_tokens { + if let Rule::Symbol(symbol) = rule { + let new_symbol = symbol_replacer.replace_symbol(symbol); + if new_symbol.is_non_terminal() { + return Err(Error::GrammarError(format!( + "Non-token symbol '{}' cannot be used as an extra token", + &variables[new_symbol.index].name + ))); + } else { + extra_tokens.push(new_symbol); + } + } else { + if let Some(index) = lexical_variables.iter().position(|v| v.rule == rule) { + extra_tokens.push(Symbol::terminal(index)); + } else { + separators.push(rule); + } + } + } + + let mut external_tokens = Vec::new(); + for external_token in grammar.external_tokens { + let rule = symbol_replacer.replace_symbols_in_rule(&external_token.rule); + if let Rule::Symbol(symbol) = rule { + if symbol.is_non_terminal() { + return Err(Error::GrammarError(format!( + "Rule '{}' cannot be used as both an external token and a non-terminal rule", + &variables[symbol.index].name, + ))); + } + + if symbol.is_external() { + external_tokens.push(ExternalToken { + name: external_token.name, + kind: external_token.kind, + corresponding_internal_token: None, + }) + } else { + external_tokens.push(ExternalToken { + name: lexical_variables[symbol.index].name.clone(), + kind: external_token.kind, + corresponding_internal_token: Some(symbol), + }) + } + } else { + return Err(Error::GrammarError(format!( + "Non-symbol rules cannot be used as external tokens" + ))); + } + } + + let mut word_token = None; + if let Some(token) = grammar.word_token { + let token = symbol_replacer.replace_symbol(token); + if token.is_non_terminal() { + return Err(Error::GrammarError(format!( + "Non-terminal symbol '{}' cannot be used as the word token", + &variables[token.index].name + ))); + } + word_token = Some(token); + } + + Ok(( + ExtractedGrammar { + variables, + expected_conflicts, + extra_tokens, + variables_to_inline, + external_tokens, + word_token, + }, + LexicalGrammar { + variables: lexical_variables, + separators, + } + )) +} + +struct TokenExtractor { + current_variable_name: String, + current_variable_token_count: usize, + extracted_variables: Vec, + extracted_usage_counts: Vec, +} + +struct SymbolReplacer { + replacements: HashMap +} + +impl TokenExtractor { + fn extract_tokens_in_variable(&mut self, variable: &mut Variable) { + self.current_variable_name.clear(); + 
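// Reuse one String buffer to name the tokens extracted from this variable. +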
self.current_variable_name.push_str(&variable.name); + self.current_variable_token_count = 0; + let mut rule = Rule::Blank; + mem::swap(&mut rule, &mut variable.rule); + variable.rule = self.extract_tokens_in_rule(&rule); + } + + fn extract_tokens_in_rule(&mut self, input: &Rule) -> Rule { + match input { + Rule::String(name) => self.extract_token(input, Some(name)).into(), + Rule::Pattern(..) => self.extract_token(input, None).into(), + Rule::Metadata { params, rule } => { + if params.is_token { + let mut params = params.clone(); + params.is_token = false; + + let mut string_value = None; + if let Rule::String(value) = rule.as_ref() { + string_value = Some(value); + } + + let rule_to_extract = if params == MetadataParams::default() { + rule.as_ref() + } else { + input + }; + + self.extract_token(rule_to_extract, string_value).into() + } else { + Rule::Metadata { + params: params.clone(), + rule: Rc::new(self.extract_tokens_in_rule((&rule).clone())) + } + } + }, + Rule::Repeat(content) => Rule::Repeat( + Rc::new(self.extract_tokens_in_rule(content)) + ), + Rule::Seq { left, right } => Rule::Seq { + left: Rc::new(self.extract_tokens_in_rule(left)), + right: Rc::new(self.extract_tokens_in_rule(right)), + }, + Rule::Choice { elements } => Rule::Choice { + elements: elements.iter().map(|e| self.extract_tokens_in_rule(e)).collect() + }, + _ => input.clone() + } + } + + fn extract_token(&mut self, rule: &Rule, string_value: Option<&String>) -> Symbol { + for (i, variable) in self.extracted_variables.iter_mut().enumerate() { + if variable.rule == *rule { + self.extracted_usage_counts[i] += 1; + return Symbol::terminal(i) + } + } + + let index = self.extracted_variables.len(); + let variable = if let Some(string_value) = string_value { + Variable::anonymous(string_value, rule.clone()) + } else { + self.current_variable_token_count += 1; + Variable::auxiliary( + &format!( + "{}_token{}", + &self.current_variable_name, + self.current_variable_token_count + ), + rule.clone() + ) + }; + + self.extracted_variables.push(variable); + self.extracted_usage_counts.push(1); + Symbol::terminal(index) + } +} + +impl SymbolReplacer { + fn replace_symbols_in_rule(&mut self, rule: &Rule) -> Rule { + match rule { + Rule::Symbol(symbol) => self.replace_symbol(*symbol).into(), + Rule::Choice { elements } => Rule::Choice { + elements: elements.iter().map(|e| self.replace_symbols_in_rule(e)).collect() + }, + Rule::Seq { left, right } => Rule::Seq { + left: Rc::new(self.replace_symbols_in_rule(left)), + right: Rc::new(self.replace_symbols_in_rule(right)), + }, + Rule::Repeat(content) => Rule::Repeat( + Rc::new(self.replace_symbols_in_rule(content)) + ), + Rule::Metadata { rule, params } => Rule::Metadata { + params: params.clone(), + rule: Rc::new(self.replace_symbols_in_rule(rule)), + }, + _ => rule.clone() + } + } + + fn replace_symbol(&self, symbol: Symbol) -> Symbol { + if !symbol.is_non_terminal() { + return symbol + } + + if let Some(replacement) = self.replacements.get(&symbol.index) { + return Symbol::terminal(*replacement); + } + + let mut adjusted_index = symbol.index; + for (replaced_index, _) in self.replacements.iter() { + if *replaced_index < symbol.index { + adjusted_index -= 1; + } + } + + return Symbol::non_terminal(adjusted_index); + } +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_extraction() { + let (syntax_grammar, lexical_grammar) = extract_tokens(build_grammar(vec![ + Variable::named("rule_0", Rule::repeat(Rule::seq(vec![ + Rule::string("a"), + Rule::pattern("b"), + 
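// ("b" also defines rule_2 below, so this pattern is used in two places.) +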
Rule::choice(vec![ + Rule::non_terminal(1), + Rule::non_terminal(2), + Rule::token(Rule::repeat(Rule::choice(vec![ + Rule::string("c"), + Rule::string("d"), + ]))) + ]) + ]))), + Variable::named("rule_1", Rule::pattern("e")), + Variable::named("rule_2", Rule::pattern("b")), + Variable::named("rule_3", Rule::seq(vec![ + Rule::non_terminal(2), + Rule::Blank, + ])), + ])).unwrap(); + + assert_eq!(syntax_grammar.variables, vec![ + Variable::named("rule_0", Rule::repeat(Rule::seq(vec![ + // The string "a" was replaced by a symbol referencing the lexical grammar + Rule::terminal(0), + + // The pattern "b" was replaced by a symbol referencing the lexical grammar + Rule::terminal(1), + Rule::choice(vec![ + // The symbol referencing `rule_1` was replaced by a symbol referencing + // the lexical grammar. + Rule::terminal(3), + + // The symbol referencing `rule_2` had its index decremented because + // `rule_1` was moved to the lexical grammar. + Rule::non_terminal(1), + + // The rule wrapped in `token` was replaced by a symbol referencing + // the lexical grammar. + Rule::terminal(2), + ]) + ]))), + + // The pattern "e" was only used in once place: as the definition of `rule_1`, + // so that rule was moved to the lexical grammar. The pattern "b" appeared in + // two places, so it was not moved into the lexical grammar. + Variable::named("rule_2", Rule::terminal(1)), + Variable::named("rule_3", Rule::seq(vec![ + Rule::non_terminal(1), + Rule::Blank, + ])), + ]); + + assert_eq!(lexical_grammar.variables, vec![ + Variable::anonymous("a", Rule::string("a")), + Variable::auxiliary("rule_0_token1", Rule::pattern("b")), + Variable::auxiliary("rule_0_token2", Rule::repeat(Rule::choice(vec![ + Rule::string("c"), + Rule::string("d"), + ]))), + Variable::named("rule_1", Rule::pattern("e")), + ]); + } + + #[test] + fn test_start_rule_is_token() { + let (syntax_grammar, lexical_grammar) = extract_tokens(build_grammar(vec![ + Variable::named("rule_0", Rule::string("hello")), + ])).unwrap(); + + assert_eq!(syntax_grammar.variables, vec![ + Variable::named("rule_0", Rule::terminal(0)), + ]); + assert_eq!(lexical_grammar.variables, vec![ + Variable::anonymous("hello", Rule::string("hello")), + ]) + } + + #[test] + fn test_extracting_extra_tokens() { + let mut grammar = build_grammar(vec![ + Variable::named("rule_0", Rule::string("x")), + Variable::named("comment", Rule::pattern("//.*")), + ]); + grammar.extra_tokens = vec![ + Rule::string(" "), + Rule::non_terminal(1), + ]; + + let (syntax_grammar, lexical_grammar) = extract_tokens(grammar).unwrap(); + assert_eq!(syntax_grammar.extra_tokens, vec![ + Symbol::terminal(1), + ]); + assert_eq!(lexical_grammar.separators, vec![ + Rule::string(" "), + ]); + } + + #[test] + fn test_extract_externals() { + let mut grammar = build_grammar(vec![ + Variable::named("rule_0", Rule::seq(vec![ + Rule::external(0), + Rule::string("a"), + Rule::non_terminal(1), + Rule::non_terminal(2), + ])), + Variable::named("rule_1", Rule::string("b")), + Variable::named("rule_2", Rule::string("c")), + ]); + grammar.external_tokens = vec![ + Variable::named("external_0", Rule::external(0)), + Variable::anonymous("a", Rule::string("a")), + Variable::named("rule_2", Rule::non_terminal(2)), + ]; + + let (syntax_grammar, _) = extract_tokens(grammar).unwrap(); + + assert_eq!(syntax_grammar.external_tokens, vec![ + ExternalToken { + name: "external_0".to_string(), + kind: VariableType::Named, + corresponding_internal_token: None, + }, + ExternalToken { + name: "a".to_string(), + kind: 
VariableType::Anonymous, + corresponding_internal_token: Some(Symbol::terminal(0)), + }, + ExternalToken { + name: "rule_2".to_string(), + kind: VariableType::Named, + corresponding_internal_token: Some(Symbol::terminal(2)), + }, + ]); + } + + #[test] + fn test_error_on_non_terminal_symbol_extras() { + let mut grammar = build_grammar(vec![ + Variable::named("rule_0", Rule::non_terminal(1)), + Variable::named("rule_1", Rule::non_terminal(2)), + Variable::named("rule_2", Rule::string("x")), + ]); + grammar.extra_tokens = vec![ + Rule::non_terminal(1), + ]; + + match extract_tokens(grammar) { + Err(Error::GrammarError(s)) => { + assert_eq!(s, "Non-token symbol 'rule_1' cannot be used as an extra token"); + }, + _ => { + panic!("Expected an error but got no error"); + } + } + } + + #[test] + fn test_error_on_external_with_same_name_as_non_terminal() { + let mut grammar = build_grammar(vec![ + Variable::named("rule_0", Rule::seq(vec![ + Rule::non_terminal(1), + Rule::non_terminal(2), + ])), + Variable::named("rule_1", Rule::seq(vec![ + Rule::non_terminal(2), + Rule::non_terminal(2), + ])), + Variable::named("rule_2", Rule::string("a")), + ]); + grammar.external_tokens = vec![ + Variable::named("rule_1", Rule::non_terminal(1)), + ]; + + match extract_tokens(grammar) { + Err(Error::GrammarError(s)) => { + assert_eq!(s, "Rule 'rule_1' cannot be used as both an external token and a non-terminal rule"); + }, + _ => { + panic!("Expected an error but got no error"); + } + } + } + + fn build_grammar(variables: Vec) -> InternedGrammar { + InternedGrammar { + variables, + extra_tokens: Vec::new(), + external_tokens: Vec::new(), + expected_conflicts: Vec::new(), + variables_to_inline: Vec::new(), + word_token: None, + } + } } diff --git a/src/prepare_grammar/intern_symbols.rs b/src/prepare_grammar/intern_symbols.rs index 00a5c330..e4cf7ff1 100644 --- a/src/prepare_grammar/intern_symbols.rs +++ b/src/prepare_grammar/intern_symbols.rs @@ -1,6 +1,6 @@ use crate::error::{Error, Result}; use crate::rules::{Rule, Symbol}; -use crate::grammars::{InputGrammar, InputVariable, VariableType}; +use crate::grammars::{InputGrammar, Variable, VariableType}; use std::rc::Rc; use super::InternedGrammar; @@ -13,7 +13,7 @@ pub(super) fn intern_symbols(grammar: &InputGrammar) -> Result let mut variables = Vec::with_capacity(grammar.variables.len()); for variable in grammar.variables.iter() { - variables.push(InputVariable { + variables.push(Variable { name: variable.name.clone(), kind: variable_type_for_name(&variable.name), rule: interner.intern_rule(&variable.rule)?, @@ -28,7 +28,7 @@ pub(super) fn intern_symbols(grammar: &InputGrammar) -> Result } else { (String::new(), VariableType::Anonymous) }; - external_tokens.push(InputVariable { name, kind, rule }); + external_tokens.push(Variable { name, kind, rule }); } let mut extra_tokens = Vec::with_capacity(grammar.extra_tokens.len()); @@ -154,21 +154,21 @@ mod tests { #[test] fn test_basic_repeat_expansion() { let grammar = intern_symbols(&build_grammar(vec![ - InputVariable::named("x", Rule::choice(vec![ + Variable::named("x", Rule::choice(vec![ Rule::named("y"), Rule::named("_z"), ])), - InputVariable::named("y", Rule::named("_z")), - InputVariable::named("_z", Rule::string("a")), + Variable::named("y", Rule::named("_z")), + Variable::named("_z", Rule::string("a")), ])).unwrap(); assert_eq!(grammar.variables, vec![ - InputVariable::named("x", Rule::choice(vec![ + Variable::named("x", Rule::choice(vec![ Rule::non_terminal(1), Rule::non_terminal(2), ])), - 
InputVariable::named("y", Rule::non_terminal(2)), - InputVariable::hidden("_z", Rule::string("a")), + Variable::named("y", Rule::non_terminal(2)), + Variable::hidden("_z", Rule::string("a")), ]); } @@ -177,13 +177,13 @@ mod tests { // Variable `y` is both an internal and an external token. // Variable `z` is just an external token. let mut input_grammar = build_grammar(vec![ - InputVariable::named("w", Rule::choice(vec![ + Variable::named("w", Rule::choice(vec![ Rule::named("x"), Rule::named("y"), Rule::named("z"), ])), - InputVariable::named("x", Rule::string("a")), - InputVariable::named("y", Rule::string("b")), + Variable::named("x", Rule::string("a")), + Variable::named("y", Rule::string("b")), ]); input_grammar.external_tokens.extend(vec![ Rule::named("y"), @@ -195,26 +195,26 @@ mod tests { // Variable `y` is referred to by its internal index. // Variable `z` is referred to by its external index. assert_eq!(grammar.variables, vec![ - InputVariable::named("w", Rule::choice(vec![ + Variable::named("w", Rule::choice(vec![ Rule::non_terminal(1), Rule::non_terminal(2), Rule::external(1), ])), - InputVariable::named("x", Rule::string("a")), - InputVariable::named("y", Rule::string("b")), + Variable::named("x", Rule::string("a")), + Variable::named("y", Rule::string("b")), ]); // The external token for `y` refers back to its internal index. assert_eq!(grammar.external_tokens, vec![ - InputVariable::named("y", Rule::non_terminal(2)), - InputVariable::named("z", Rule::external(1)), + Variable::named("y", Rule::non_terminal(2)), + Variable::named("z", Rule::external(1)), ]); } #[test] fn test_grammar_with_undefined_symbols() { let result = intern_symbols(&build_grammar(vec![ - InputVariable::named("x", Rule::named("y")), + Variable::named("x", Rule::named("y")), ])); match result { @@ -223,7 +223,7 @@ mod tests { } } - fn build_grammar(variables: Vec) -> InputGrammar { + fn build_grammar(variables: Vec) -> InputGrammar { InputGrammar { variables, name: "the_language".to_string(), diff --git a/src/prepare_grammar/mod.rs b/src/prepare_grammar/mod.rs index 0788edca..b860807a 100644 --- a/src/prepare_grammar/mod.rs +++ b/src/prepare_grammar/mod.rs @@ -6,7 +6,7 @@ mod normalize_rules; mod extract_simple_aliases; use crate::rules::{AliasMap, Rule, Symbol}; -use crate::grammars::{InputGrammar, SyntaxGrammar, LexicalGrammar, InputVariable, ExternalToken}; +use crate::grammars::{InputGrammar, SyntaxGrammar, LexicalGrammar, Variable, ExternalToken}; use crate::error::Result; use self::intern_symbols::intern_symbols; use self::extract_tokens::extract_tokens; @@ -16,7 +16,7 @@ use self::normalize_rules::normalize_rules; use self::extract_simple_aliases::extract_simple_aliases; pub(self) struct IntermediateGrammar { - variables: Vec, + variables: Vec, extra_tokens: Vec, expected_conflicts: Vec>, external_tokens: Vec, @@ -24,10 +24,10 @@ pub(self) struct IntermediateGrammar { word_token: Option, } -pub(self) type InternedGrammar = IntermediateGrammar; +pub(self) type InternedGrammar = IntermediateGrammar; pub(self) type ExtractedGrammar = IntermediateGrammar; -pub fn prepare_grammar( +pub(crate) fn prepare_grammar( input_grammar: &InputGrammar ) -> Result<(SyntaxGrammar, LexicalGrammar, AliasMap)> { let interned_grammar = intern_symbols(input_grammar)?; diff --git a/src/render/mod.rs b/src/render/mod.rs index 85ce1f32..5bd11a34 100644 --- a/src/render/mod.rs +++ b/src/render/mod.rs @@ -2,7 +2,7 @@ use crate::rules::{Symbol, AliasMap}; use crate::grammars::{SyntaxGrammar, LexicalGrammar}; use 
crate::tables::{ParseTable, LexTable}; -pub fn render_c_code( +pub(crate) fn render_c_code( name: &str, parse_table: ParseTable, main_lex_table: LexTable, diff --git a/src/rules.rs b/src/rules.rs index 3cccca0d..5c3b65fd 100644 --- a/src/rules.rs +++ b/src/rules.rs @@ -2,47 +2,47 @@ use std::rc::Rc; use std::collections::HashMap; #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] -pub enum SymbolType { +pub(crate) enum SymbolType { External, Terminal, NonTerminal, } #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] -pub enum Associativity { +pub(crate) enum Associativity { Left, Right } #[derive(Clone, Debug, PartialEq, Eq, Hash)] -pub struct Alias { - value: String, - is_named: bool, +pub(crate) struct Alias { + pub value: String, + pub is_named: bool, } -pub type AliasMap = HashMap; +pub(crate) type AliasMap = HashMap; #[derive(Clone, Debug, Default, PartialEq, Eq, Hash)] -pub struct MetadataParams { - precedence: Option, - dynamic_precedence: i32, - associativity: Option, - is_token: bool, - is_string: bool, - is_active: bool, - is_main_token: bool, - is_excluded: bool, - alias: Option, +pub(crate) struct MetadataParams { + pub precedence: Option, + pub dynamic_precedence: i32, + pub associativity: Option, + pub is_token: bool, + pub is_string: bool, + pub is_active: bool, + pub is_main_token: bool, + pub is_excluded: bool, + pub alias: Option, } #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] -pub struct Symbol { - kind: SymbolType, - index: usize, +pub(crate) struct Symbol { + pub kind: SymbolType, + pub index: usize, } #[derive(Clone, Debug, PartialEq, Eq, Hash)] -pub enum Rule { +pub(crate) enum Rule { Blank, CharacterSet(Vec), String(String), @@ -153,9 +153,21 @@ impl Rule { pub fn string(value: &'static str) -> Self { Rule::String(value.to_string()) } + + pub fn pattern(value: &'static str) -> Self { + Rule::Pattern(value.to_string()) + } } impl Symbol { + pub fn is_non_terminal(&self) -> bool { + return self.kind == SymbolType::NonTerminal + } + + pub fn is_external(&self) -> bool { + return self.kind == SymbolType::External + } + pub fn non_terminal(index: usize) -> Self { Symbol { kind: SymbolType::NonTerminal, index } } diff --git a/src/tables.rs b/src/tables.rs index 10b1e41d..de66253c 100644 --- a/src/tables.rs +++ b/src/tables.rs @@ -2,12 +2,12 @@ use std::collections::HashMap; use std::ops::Range; use crate::rules::{Associativity, Symbol, Alias}; -pub type AliasSequenceId = usize; -pub type ParseStateId = usize; -pub type LexStateId = usize; +pub(crate) type AliasSequenceId = usize; +pub(crate) type ParseStateId = usize; +pub(crate) type LexStateId = usize; #[derive(Clone, Copy, Debug, PartialEq, Eq)] -pub enum ParseActionType { +pub(crate) enum ParseActionType { Error, Shift, Reduce, @@ -16,7 +16,7 @@ pub enum ParseActionType { } #[derive(Clone, Copy, Debug, PartialEq, Eq)] -pub enum ParseAction { +pub(crate) enum ParseAction { Accept, Error, Shift(ParseStateId), @@ -34,44 +34,44 @@ pub enum ParseAction { } #[derive(Clone, Debug, PartialEq, Eq)] -pub struct ParseTableEntry { +pub(crate) struct ParseTableEntry { actions: Vec, reusable: bool, } #[derive(Clone, Debug, PartialEq, Eq)] -pub struct ParseState { +pub(crate) struct ParseState { terminal_entries: HashMap, nonterminal_entries: HashMap } #[derive(Debug, PartialEq, Eq)] -pub struct ParseTable { +pub(crate) struct ParseTable { states: Vec, alias_sequences: Vec>, } #[derive(Clone, Debug, PartialEq, Eq)] -pub struct AdvanceAction { +pub(crate) struct AdvanceAction { state: LexStateId, precedence: Range, 
in_main_token: bool, } #[derive(Clone, Debug, PartialEq, Eq)] -pub struct AcceptTokenAction { +pub(crate) struct AcceptTokenAction { symbol: Symbol, precedence: i32, implicit_precedence: i32, } #[derive(Clone, Debug, PartialEq, Eq)] -pub struct LexState { +pub(crate) struct LexState { advance_actions: HashMap, accept_action: Option, } #[derive(Debug, PartialEq, Eq)] -pub struct LexTable { +pub(crate) struct LexTable { states: Vec, } From ead6ca1738c52e8da4a2eb577d1c4c50b08593b4 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Sat, 8 Dec 2018 13:44:11 -0800 Subject: [PATCH 051/102] Generate NFAs from regexes --- Cargo.lock | 1 + Cargo.toml | 1 + src/error.rs | 11 ++ src/main.rs | 1 + src/nfa.rs | 160 ++++++++++++++++++ src/prepare_grammar/normalize_rules.rs | 224 +++++++++++++++++++++++++ src/rules.rs | 2 +- 7 files changed, 399 insertions(+), 1 deletion(-) create mode 100644 src/nfa.rs diff --git a/Cargo.lock b/Cargo.lock index 20908681..d5109fb7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -466,6 +466,7 @@ dependencies = [ "dirs 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)", "ignore 0.4.4 (registry+https://github.com/rust-lang/crates.io-index)", "libloading 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)", + "regex-syntax 0.6.4 (registry+https://github.com/rust-lang/crates.io-index)", "rusqlite 0.14.0 (registry+https://github.com/rust-lang/crates.io-index)", "serde 1.0.80 (registry+https://github.com/rust-lang/crates.io-index)", "serde_derive 1.0.80 (registry+https://github.com/rust-lang/crates.io-index)", diff --git a/Cargo.toml b/Cargo.toml index 965cc81e..93a49d2c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -15,3 +15,4 @@ serde = "1.0" serde_derive = "1.0" serde_json = "1.0" tree-sitter = "0.3.1" +regex-syntax = "0.6.4" diff --git a/src/error.rs b/src/error.rs index 90e7b8f9..49064c22 100644 --- a/src/error.rs +++ b/src/error.rs @@ -2,10 +2,21 @@ pub enum Error { GrammarError(String), SymbolError(String), + RegexError(String), } pub type Result = std::result::Result; +impl Error { + pub fn grammar(message: &str) -> Self { + Error::GrammarError(message.to_string()) + } + + pub fn regex(message: &str) -> Self { + Error::RegexError(message.to_string()) + } +} + impl From for Error { fn from(error: serde_json::Error) -> Self { Error::GrammarError(error.to_string()) diff --git a/src/main.rs b/src/main.rs index 3eeb306a..4d376929 100644 --- a/src/main.rs +++ b/src/main.rs @@ -7,6 +7,7 @@ mod build_tables; mod error; mod generate; mod grammars; +mod nfa; mod parse_grammar; mod prepare_grammar; mod render; diff --git a/src/nfa.rs b/src/nfa.rs new file mode 100644 index 00000000..55aa11dc --- /dev/null +++ b/src/nfa.rs @@ -0,0 +1,160 @@ +use std::fmt; +use std::char; + +#[derive(Clone, Debug, PartialEq, Eq, Hash)] +pub enum CharacterSet { + Include(Vec), + Exclude(Vec), +} + +#[derive(Debug)] +pub enum NfaState { + Advance(CharacterSet, u32), + Split(u32, u32), + Accept, +} + +pub struct Nfa { + pub states: Vec +} + +#[derive(Debug)] +pub struct NfaCursor<'a> { + indices: Vec, + nfa: &'a Nfa, +} + +impl CharacterSet { + pub fn empty() -> Self { + CharacterSet::Include(Vec::new()) + } + + pub fn all() -> Self { + CharacterSet::Exclude(Vec::new()) + } + + pub fn negate(self) -> CharacterSet { + match self { + CharacterSet::Include(chars) => CharacterSet::Exclude(chars), + CharacterSet::Exclude(chars) => CharacterSet::Include(chars), + } + } + + pub fn add_char(self, c: char) -> Self { + if let CharacterSet::Include(mut chars) = self { + if let Err(i) = 
chars.binary_search(&c) { + chars.insert(i, c); + } + CharacterSet::Include(chars) + } else { + panic!("Called add with a negated character set"); + } + } + + pub fn add_range(self, start: char, end: char) -> Self { + if let CharacterSet::Include(mut chars) = self { + let mut c = start as u32; + while c <= end as u32 { + chars.push(char::from_u32(c).unwrap()); + c += 1; + } + chars.sort_unstable(); + chars.dedup(); + CharacterSet::Include(chars) + } else { + panic!("Called add with a negated character set"); + } + } + + pub fn add(self, other: CharacterSet) -> Self { + if let (CharacterSet::Include(mut chars), CharacterSet::Include(other_chars)) = (self, other) { + chars.extend(other_chars); + chars.sort_unstable(); + chars.dedup(); + CharacterSet::Include(chars) + } else { + panic!("Called add with a negated character set"); + } + } + + pub fn contains(&self, c: char) -> bool { + match self { + CharacterSet::Include(chars) => chars.contains(&c), + CharacterSet::Exclude(chars) => !chars.contains(&c), + } + } +} + +impl Nfa { + pub fn new() -> Self { + Nfa { states: vec![NfaState::Accept] } + } + + pub fn start_index(&self) -> u32 { + self.states.len() as u32 - 1 + } + + pub fn prepend(&mut self, f: impl Fn(u32) -> NfaState) { + self.states.push(f(self.start_index())); + } +} + +impl fmt::Debug for Nfa { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "Nfa {{ states: {{")?; + for (i, state) in self.states.iter().enumerate() { + if i > 0 { + write!(f, ", ")?; + } + write!(f, "{}: {:?}", i, state)?; + } + write!(f, "}} }}")?; + Ok(()) + } +} + +impl<'a> NfaCursor<'a> { + pub fn new(nfa: &'a Nfa) -> Self { + let mut result = Self { nfa, indices: Vec::new() }; + result.add_indices(&mut vec![nfa.start_index()]); + result + } + + pub fn advance(&mut self, c: char) -> bool { + let mut result = false; + let mut new_indices = Vec::new(); + for index in &self.indices { + if let NfaState::Advance(chars, next_index) = &self.nfa.states[*index as usize] { + if chars.contains(c) { + new_indices.push(*next_index); + result = true; + } + } + } + self.indices.clear(); + self.add_indices(&mut new_indices); + result + } + + pub fn is_done(&self) -> bool { + self.indices.iter().any(|index| { + if let NfaState::Accept = self.nfa.states[*index as usize] { + true + } else { + false + } + }) + } + + pub fn add_indices(&mut self, new_indices: &mut Vec) { + while let Some(index) = new_indices.pop() { + let state = &self.nfa.states[index as usize]; + if let NfaState::Split(left, right) = state { + new_indices.push(*left); + new_indices.push(*right); + } else if let Err(i) = self.indices.binary_search(&index) { + self.indices.insert(i, index); + } + } + } +} diff --git a/src/prepare_grammar/normalize_rules.rs b/src/prepare_grammar/normalize_rules.rs index 9e625ef5..67177b4f 100644 --- a/src/prepare_grammar/normalize_rules.rs +++ b/src/prepare_grammar/normalize_rules.rs @@ -1,5 +1,229 @@ +use crate::error::{Error, Result}; +use crate::rules::Rule; use crate::grammars::LexicalGrammar; +use crate::nfa::{Nfa, NfaState, NfaCursor, CharacterSet}; +use regex_syntax::ast::{parse, Ast, Class, ClassPerlKind, ClassSet, ClassSetItem, RepetitionKind}; + +fn evaluate_perl_class(item: &ClassPerlKind) -> CharacterSet { + match item { + ClassPerlKind::Digit => CharacterSet::empty() + .add_range('0', '9'), + ClassPerlKind::Space => CharacterSet::empty() + .add_char(' ') + .add_char('\t') + .add_char('\r') + .add_char('\n'), + ClassPerlKind::Word => CharacterSet::empty() + .add_char('_') + .add_range('A', 'Z') + 
.add_range('a', 'z') + .add_range('0', '9') + } +} + +fn evaluate_character_class(item: &ClassSetItem) -> Result { + match item { + ClassSetItem::Empty(_) => Ok(CharacterSet::Include(Vec::new())), + ClassSetItem::Literal(literal) => Ok(CharacterSet::Include(vec![literal.c])), + ClassSetItem::Range(range) => Ok(CharacterSet::empty().add_range(range.start.c, range.end.c)), + ClassSetItem::Union(union) => { + let mut result = CharacterSet::empty(); + for item in &union.items { + result = result.add(evaluate_character_class(&item)?); + } + Ok(result) + } + _ => Err(Error::regex("Unsupported character class syntax")), + } +} + +fn regex_to_nfa(ast: &Ast, nfa: &mut Nfa, mut next_state_index: u32) -> Result<()> { + match ast { + Ast::Empty(_) => Ok(()), + Ast::Flags(_) => Err(Error::regex("Flags are not supported")), + Ast::Literal(literal) => { + nfa.states.push(NfaState::Advance(CharacterSet::Include(vec![literal.c]), next_state_index)); + Ok(()) + }, + Ast::Dot(_) => { + nfa.states.push(NfaState::Advance(CharacterSet::Exclude(vec!['\n']), next_state_index)); + Ok(()) + }, + Ast::Assertion(_) => Err(Error::regex("Assertions are not supported")), + Ast::Class(class) => match class { + Class::Unicode(_) => Err(Error::regex("Unicode character classes are not supported")), + Class::Perl(class) => { + nfa.states.push(NfaState::Advance(evaluate_perl_class(&class.kind), next_state_index)); + Ok(()) + }, + Class::Bracketed(class) => match &class.kind { + ClassSet::Item(item) => { + let character_set = evaluate_character_class(&item)?; + nfa.states.push(NfaState::Advance(character_set, next_state_index)); + Ok(()) + }, + ClassSet::BinaryOp(_) => { + Err(Error::regex("Binary operators in character classes aren't supported")) + } + } + }, + Ast::Repetition(repetition) => match repetition.op.kind { + RepetitionKind::ZeroOrOne => { + regex_to_nfa(&repetition.ast, nfa, next_state_index)?; + nfa.prepend(|start_index| NfaState::Split(next_state_index, start_index)); + Ok(()) + }, + RepetitionKind::OneOrMore => { + nfa.states.push(NfaState::Accept); // Placeholder for split + let split_index = nfa.start_index(); + regex_to_nfa(&repetition.ast, nfa, split_index)?; + nfa.states[split_index as usize] = NfaState::Split( + nfa.start_index(), + next_state_index + ); + Ok(()) + }, + RepetitionKind::ZeroOrMore => { + nfa.states.push(NfaState::Accept); // Placeholder for split + let split_index = nfa.start_index(); + regex_to_nfa(&repetition.ast, nfa, split_index)?; + nfa.states[split_index as usize] = NfaState::Split( + nfa.start_index(), + next_state_index + ); + nfa.prepend(|start_index| NfaState::Split(start_index, next_state_index)); + Ok(()) + }, + RepetitionKind::Range(_) => unimplemented!(), + }, + Ast::Group(group) => regex_to_nfa(&group.ast, nfa, nfa.start_index()), + Ast::Alternation(alternation) => { + let mut alternative_start_indices = Vec::new(); + for ast in alternation.asts.iter() { + regex_to_nfa(&ast, nfa, next_state_index)?; + alternative_start_indices.push(nfa.start_index()); + } + alternative_start_indices.pop(); + for alternative_start_index in alternative_start_indices { + nfa.prepend(|start_index| NfaState::Split(start_index, alternative_start_index)); + } + Ok(()) + }, + Ast::Concat(concat) => { + for ast in concat.asts.iter().rev() { + regex_to_nfa(&ast, nfa, next_state_index)?; + next_state_index = nfa.start_index(); + } + Ok(()) + } + } +} + +fn expand_rule(rule: Rule) -> Result { + match rule { + Rule::Pattern(s) => { + let ast = parse::Parser::new().parse(&s).map_err(|e| 
Error::GrammarError(e.to_string()))?; + let mut nfa = Nfa::new(); + regex_to_nfa(&ast, &mut nfa, 0)?; + Ok(nfa) + }, + Rule::String(s) => { + let mut nfa = Nfa::new(); + for c in s.chars().rev() { + nfa.prepend(|start_index| NfaState::Advance(CharacterSet::empty().add_char(c), start_index)); + } + Ok(nfa) + }, + _ => Err(Error::grammar("Unexpected rule type")), + } +} pub(super) fn normalize_rules(grammar: LexicalGrammar) -> LexicalGrammar { unimplemented!(); } + +#[cfg(test)] +mod tests { + use super::*; + + fn simulate_nfa<'a>(nfa: &'a Nfa, s: &'a str) -> Option<&'a str> { + let mut result = None; + let mut char_count = 0; + let mut cursor = NfaCursor::new(nfa); + for c in s.chars() { + if cursor.is_done() { + result = Some(&s[0..char_count]); + } + if cursor.advance(c) { + char_count += 1; + } else { + break; + } + } + result + } + + #[test] + fn test_regex_expansion() { + struct Row { + pattern: &'static str, + examples: Vec<(&'static str, Option<&'static str>)>, + } + + let table = [ + Row { + pattern: "a|bc", + examples: vec![ + ("a12", Some("a")), + ("bc12", Some("bc")), + ("b12", None), + ("c12", None), + ], + }, + Row { + pattern: "(a|b|c)d(e|f|g)h?", + examples: vec![ + ("ade1", Some("ade")), + ("bdf1", Some("bdf")), + ("bdfh1", Some("bdfh")), + ("ad1", None), + ], + }, + Row { + pattern: "a*", + examples: vec![ + ("aaa1", Some("aaa")), + ("b", Some("")), + ], + }, + Row { + pattern: "a((bc)+|(de)*)f", + examples: vec![ + ("af1", Some("af")), + ("adedef1", Some("adedef")), + ("abcbcbcf1", Some("abcbcbcf")), + ("a", None), + ], + }, + Row { + pattern: "[a-fA-F0-9]+", + examples: vec![ + ("A1ff0", Some("A1ff")), + ], + }, + Row { + pattern: "\\w\\d\\s", + examples: vec![ + ("_0 ", Some("_0 ")), + ], + }, + ]; + + for Row { pattern, examples } in table.iter() { + let nfa = expand_rule(Rule::pattern(pattern)).unwrap(); + for (haystack, needle) in examples.iter() { + assert_eq!(simulate_nfa(&nfa, haystack), *needle); + } + } + } +} diff --git a/src/rules.rs b/src/rules.rs index 5c3b65fd..b593496a 100644 --- a/src/rules.rs +++ b/src/rules.rs @@ -1,4 +1,5 @@ use std::rc::Rc; +use std::char; use std::collections::HashMap; #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] @@ -44,7 +45,6 @@ pub(crate) struct Symbol { #[derive(Clone, Debug, PartialEq, Eq, Hash)] pub(crate) enum Rule { Blank, - CharacterSet(Vec), String(String), Pattern(String), NamedSymbol(String), From d482894c7d40b9b563262fef49e2ec81f96d346a Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Sat, 8 Dec 2018 23:35:48 -0800 Subject: [PATCH 052/102] Implement expand_tokens --- src/grammars.rs | 12 +- src/main.rs | 11 +- src/nfa.rs | 3 +- src/prepare_grammar/expand_repeats.rs | 40 +++--- .../{normalize_rules.rs => expand_tokens.rs} | 130 +++++++++++++----- src/prepare_grammar/extract_simple_aliases.rs | 1 - src/prepare_grammar/extract_tokens.rs | 45 +++--- src/prepare_grammar/flatten_grammar.rs | 4 +- src/prepare_grammar/intern_symbols.rs | 26 ++-- src/prepare_grammar/mod.rs | 15 +- src/rules.rs | 44 ++---- 11 files changed, 192 insertions(+), 139 deletions(-) rename src/prepare_grammar/{normalize_rules.rs => expand_tokens.rs} (61%) diff --git a/src/grammars.rs b/src/grammars.rs index 62910637..c5e9aaa1 100644 --- a/src/grammars.rs +++ b/src/grammars.rs @@ -1,4 +1,5 @@ use crate::rules::{Associativity, Alias, Rule, Symbol}; +use crate::nfa::Nfa; #[derive(Clone, Copy, Debug, PartialEq, Eq)] pub(crate) enum VariableType { @@ -30,10 +31,17 @@ pub(crate) struct InputGrammar { // Extracted lexical grammar +#[derive(Debug, 
PartialEq, Eq)] +pub(crate) struct LexicalVariable { + pub name: String, + pub kind: VariableType, + pub nfa: Nfa, +} + #[derive(Debug, PartialEq, Eq)] pub(crate) struct LexicalGrammar { - pub variables: Vec, - pub separators: Vec, + pub variables: Vec, + pub separators: Vec, } // Extracted syntax grammar diff --git a/src/main.rs b/src/main.rs index 4d376929..b83764fc 100644 --- a/src/main.rs +++ b/src/main.rs @@ -14,7 +14,7 @@ mod render; mod rules; mod tables; -fn main() { +fn main() -> error::Result<()> { let matches = App::new("tree-sitter") .version("0.1") .author("Max Brunsfeld ") @@ -32,5 +32,12 @@ fn main() { .arg(Arg::with_name("path").index(1).required(true)) .arg(Arg::with_name("line").index(2).required(true)) .arg(Arg::with_name("column").index(3).required(true)) - ); + ).get_matches(); + + if let Some(matches) = matches.subcommand_matches("generate") { + let code = generate::generate_parser_for_grammar(String::new())?; + println!("{}", code); + } + + Ok(()) } diff --git a/src/nfa.rs b/src/nfa.rs index 55aa11dc..22cb2a2e 100644 --- a/src/nfa.rs +++ b/src/nfa.rs @@ -7,13 +7,14 @@ pub enum CharacterSet { Exclude(Vec), } -#[derive(Debug)] +#[derive(Debug, PartialEq, Eq)] pub enum NfaState { Advance(CharacterSet, u32), Split(u32, u32), Accept, } +#[derive(PartialEq, Eq)] pub struct Nfa { pub states: Vec } diff --git a/src/prepare_grammar/expand_repeats.rs b/src/prepare_grammar/expand_repeats.rs index dcb8f916..85f37c80 100644 --- a/src/prepare_grammar/expand_repeats.rs +++ b/src/prepare_grammar/expand_repeats.rs @@ -3,7 +3,7 @@ use crate::grammars::{Variable, VariableType}; use std::collections::HashMap; use std::mem; use std::rc::Rc; -use super::ExtractedGrammar; +use super::ExtractedSyntaxGrammar; struct Expander { variable_name: String, @@ -25,16 +25,11 @@ impl Expander { fn expand_rule(&mut self, rule: &Rule) -> Rule { match rule { - Rule::Choice { elements } => - Rule::Choice { - elements: elements.iter().map(|element| self.expand_rule(element)).collect() - }, + Rule::Choice(elements) => + Rule::Choice(elements.iter().map(|element| self.expand_rule(element)).collect()), - Rule::Seq { left, right } => - Rule::Seq { - left: Rc::new(self.expand_rule(left)), - right: Rc::new(self.expand_rule(right)), - }, + Rule::Seq(elements) => + Rule::Seq(elements.iter().map(|element| self.expand_rule(element)).collect()), Rule::Repeat(content) => { let inner_rule = self.expand_rule(content); @@ -46,27 +41,24 @@ impl Expander { self.repeat_count_in_variable += 1; let rule_name = format!("{}_repeat{}", self.variable_name, self.repeat_count_in_variable); let repeat_symbol = Symbol::non_terminal(self.preceding_symbol_count + self.auxiliary_variables.len()); - let rc_symbol = Rc::new(Rule::Symbol(repeat_symbol)); self.existing_repeats.insert(inner_rule.clone(), repeat_symbol); self.auxiliary_variables.push(Variable { name: rule_name, kind: VariableType::Auxiliary, - rule: Rule::Choice { - elements: vec![ - Rule::Seq { - left: rc_symbol.clone(), - right: rc_symbol - }, - inner_rule - ], - }, + rule: Rule::Choice(vec![ + Rule::Seq(vec![ + Rule::Symbol(repeat_symbol), + Rule::Symbol(repeat_symbol), + ]), + inner_rule + ]), }); Rule::Symbol(repeat_symbol) } Rule::Metadata { rule, params } => Rule::Metadata { - rule: Rc::new(self.expand_rule(rule)), + rule: Box::new(self.expand_rule(rule)), params: params.clone() }, @@ -75,7 +67,7 @@ impl Expander { } } -pub(super) fn expand_repeats(mut grammar: ExtractedGrammar) -> ExtractedGrammar { +pub(super) fn expand_repeats(mut grammar: ExtractedSyntaxGrammar) -> 
ExtractedSyntaxGrammar { let mut expander = Expander { variable_name: String::new(), repeat_count_in_variable: 0, @@ -207,8 +199,8 @@ mod tests { ]); } - fn build_grammar(variables: Vec) -> ExtractedGrammar { - ExtractedGrammar { + fn build_grammar(variables: Vec) -> ExtractedSyntaxGrammar { + ExtractedSyntaxGrammar { variables, extra_tokens: Vec::new(), external_tokens: Vec::new(), diff --git a/src/prepare_grammar/normalize_rules.rs b/src/prepare_grammar/expand_tokens.rs similarity index 61% rename from src/prepare_grammar/normalize_rules.rs rename to src/prepare_grammar/expand_tokens.rs index 67177b4f..9cfa819f 100644 --- a/src/prepare_grammar/normalize_rules.rs +++ b/src/prepare_grammar/expand_tokens.rs @@ -1,10 +1,11 @@ use crate::error::{Error, Result}; use crate::rules::Rule; -use crate::grammars::LexicalGrammar; -use crate::nfa::{Nfa, NfaState, NfaCursor, CharacterSet}; +use crate::grammars::{LexicalGrammar, LexicalVariable}; +use crate::nfa::{Nfa, NfaState, CharacterSet}; +use super::{ExtractedLexicalGrammar}; use regex_syntax::ast::{parse, Ast, Class, ClassPerlKind, ClassSet, ClassSetItem, RepetitionKind}; -fn evaluate_perl_class(item: &ClassPerlKind) -> CharacterSet { +fn expand_perl_character_class(item: &ClassPerlKind) -> CharacterSet { match item { ClassPerlKind::Digit => CharacterSet::empty() .add_range('0', '9'), @@ -21,7 +22,7 @@ fn evaluate_perl_class(item: &ClassPerlKind) -> CharacterSet { } } -fn evaluate_character_class(item: &ClassSetItem) -> Result { +fn expand_character_class(item: &ClassSetItem) -> Result { match item { ClassSetItem::Empty(_) => Ok(CharacterSet::Include(Vec::new())), ClassSetItem::Literal(literal) => Ok(CharacterSet::Include(vec![literal.c])), @@ -29,7 +30,7 @@ fn evaluate_character_class(item: &ClassSetItem) -> Result { ClassSetItem::Union(union) => { let mut result = CharacterSet::empty(); for item in &union.items { - result = result.add(evaluate_character_class(&item)?); + result = result.add(expand_character_class(&item)?); } Ok(result) } @@ -37,7 +38,7 @@ fn evaluate_character_class(item: &ClassSetItem) -> Result { } } -fn regex_to_nfa(ast: &Ast, nfa: &mut Nfa, mut next_state_index: u32) -> Result<()> { +fn expand_regex(ast: &Ast, nfa: &mut Nfa, mut next_state_index: u32) -> Result<()> { match ast { Ast::Empty(_) => Ok(()), Ast::Flags(_) => Err(Error::regex("Flags are not supported")), @@ -53,12 +54,12 @@ fn regex_to_nfa(ast: &Ast, nfa: &mut Nfa, mut next_state_index: u32) -> Result<( Ast::Class(class) => match class { Class::Unicode(_) => Err(Error::regex("Unicode character classes are not supported")), Class::Perl(class) => { - nfa.states.push(NfaState::Advance(evaluate_perl_class(&class.kind), next_state_index)); + nfa.states.push(NfaState::Advance(expand_perl_character_class(&class.kind), next_state_index)); Ok(()) }, Class::Bracketed(class) => match &class.kind { ClassSet::Item(item) => { - let character_set = evaluate_character_class(&item)?; + let character_set = expand_character_class(&item)?; nfa.states.push(NfaState::Advance(character_set, next_state_index)); Ok(()) }, @@ -69,14 +70,14 @@ fn regex_to_nfa(ast: &Ast, nfa: &mut Nfa, mut next_state_index: u32) -> Result<( }, Ast::Repetition(repetition) => match repetition.op.kind { RepetitionKind::ZeroOrOne => { - regex_to_nfa(&repetition.ast, nfa, next_state_index)?; + expand_regex(&repetition.ast, nfa, next_state_index)?; nfa.prepend(|start_index| NfaState::Split(next_state_index, start_index)); Ok(()) }, RepetitionKind::OneOrMore => { nfa.states.push(NfaState::Accept); // Placeholder 
for split let split_index = nfa.start_index(); - regex_to_nfa(&repetition.ast, nfa, split_index)?; + expand_regex(&repetition.ast, nfa, split_index)?; nfa.states[split_index as usize] = NfaState::Split( nfa.start_index(), next_state_index @@ -86,7 +87,7 @@ fn regex_to_nfa(ast: &Ast, nfa: &mut Nfa, mut next_state_index: u32) -> Result<( RepetitionKind::ZeroOrMore => { nfa.states.push(NfaState::Accept); // Placeholder for split let split_index = nfa.start_index(); - regex_to_nfa(&repetition.ast, nfa, split_index)?; + expand_regex(&repetition.ast, nfa, split_index)?; nfa.states[split_index as usize] = NfaState::Split( nfa.start_index(), next_state_index @@ -96,11 +97,11 @@ fn regex_to_nfa(ast: &Ast, nfa: &mut Nfa, mut next_state_index: u32) -> Result<( }, RepetitionKind::Range(_) => unimplemented!(), }, - Ast::Group(group) => regex_to_nfa(&group.ast, nfa, nfa.start_index()), + Ast::Group(group) => expand_regex(&group.ast, nfa, nfa.start_index()), Ast::Alternation(alternation) => { let mut alternative_start_indices = Vec::new(); for ast in alternation.asts.iter() { - regex_to_nfa(&ast, nfa, next_state_index)?; + expand_regex(&ast, nfa, next_state_index)?; alternative_start_indices.push(nfa.start_index()); } alternative_start_indices.pop(); @@ -111,7 +112,7 @@ fn regex_to_nfa(ast: &Ast, nfa: &mut Nfa, mut next_state_index: u32) -> Result<( }, Ast::Concat(concat) => { for ast in concat.asts.iter().rev() { - regex_to_nfa(&ast, nfa, next_state_index)?; + expand_regex(&ast, nfa, next_state_index)?; next_state_index = nfa.start_index(); } Ok(()) @@ -119,32 +120,77 @@ fn regex_to_nfa(ast: &Ast, nfa: &mut Nfa, mut next_state_index: u32) -> Result<( } } -fn expand_rule(rule: Rule) -> Result { +fn expand_rule(rule: Rule, nfa: &mut Nfa, mut next_state_index: u32) -> Result<()> { match rule { Rule::Pattern(s) => { let ast = parse::Parser::new().parse(&s).map_err(|e| Error::GrammarError(e.to_string()))?; - let mut nfa = Nfa::new(); - regex_to_nfa(&ast, &mut nfa, 0)?; - Ok(nfa) + expand_regex(&ast, nfa, next_state_index)?; + Ok(()) }, Rule::String(s) => { - let mut nfa = Nfa::new(); for c in s.chars().rev() { nfa.prepend(|start_index| NfaState::Advance(CharacterSet::empty().add_char(c), start_index)); } - Ok(nfa) + Ok(()) + }, + Rule::Choice(elements) => { + let mut alternative_start_indices = Vec::new(); + for element in elements { + expand_rule(element, nfa, next_state_index)?; + alternative_start_indices.push(nfa.start_index()); + } + alternative_start_indices.pop(); + for alternative_start_index in alternative_start_indices { + nfa.prepend(|start_index| NfaState::Split(start_index, alternative_start_index)); + } + Ok(()) + }, + Rule::Seq(elements) => { + for element in elements.into_iter().rev() { + expand_rule(element, nfa, next_state_index)?; + next_state_index = nfa.start_index(); + } + Ok(()) + }, + Rule::Repeat(rule) => { + nfa.states.push(NfaState::Accept); // Placeholder for split + let split_index = nfa.start_index(); + expand_rule(*rule, nfa, split_index)?; + nfa.states[split_index as usize] = NfaState::Split( + nfa.start_index(), + next_state_index + ); + Ok(()) }, _ => Err(Error::grammar("Unexpected rule type")), } } -pub(super) fn normalize_rules(grammar: LexicalGrammar) -> LexicalGrammar { - unimplemented!(); +pub(super) fn expand_tokens(grammar: ExtractedLexicalGrammar) -> Result { + let mut variables = Vec::new(); + for variable in grammar.variables { + let mut nfa = Nfa::new(); + expand_rule(variable.rule, &mut nfa, 0)?; + variables.push(LexicalVariable { + name: variable.name, + kind: 
variable.kind, + nfa, + }); + } + let mut separators = Vec::new(); + for separator in grammar.separators { + let mut nfa = Nfa::new(); + expand_rule(separator, &mut nfa, 0)?; + separators.push(nfa); + } + + Ok(LexicalGrammar { variables, separators }) } #[cfg(test)] mod tests { use super::*; + use crate::nfa::NfaCursor; fn simulate_nfa<'a>(nfa: &'a Nfa, s: &'a str) -> Option<&'a str> { let mut result = None; @@ -164,15 +210,15 @@ mod tests { } #[test] - fn test_regex_expansion() { + fn test_rule_expansion() { struct Row { - pattern: &'static str, + rule: Rule, examples: Vec<(&'static str, Option<&'static str>)>, } let table = [ Row { - pattern: "a|bc", + rule: Rule::pattern("a|bc"), examples: vec![ ("a12", Some("a")), ("bc12", Some("bc")), @@ -181,7 +227,7 @@ mod tests { ], }, Row { - pattern: "(a|b|c)d(e|f|g)h?", + rule: Rule::pattern("(a|b|c)d(e|f|g)h?"), examples: vec![ ("ade1", Some("ade")), ("bdf1", Some("bdf")), @@ -190,14 +236,14 @@ mod tests { ], }, Row { - pattern: "a*", + rule: Rule::pattern("a*"), examples: vec![ ("aaa1", Some("aaa")), ("b", Some("")), ], }, Row { - pattern: "a((bc)+|(de)*)f", + rule: Rule::pattern("a((bc)+|(de)*)f"), examples: vec![ ("af1", Some("af")), ("adedef1", Some("adedef")), @@ -206,21 +252,41 @@ mod tests { ], }, Row { - pattern: "[a-fA-F0-9]+", + rule: Rule::pattern("[a-fA-F0-9]+"), examples: vec![ ("A1ff0", Some("A1ff")), ], }, Row { - pattern: "\\w\\d\\s", + rule: Rule::pattern("\\w\\d\\s"), examples: vec![ ("_0 ", Some("_0 ")), ], }, + Row { + rule: Rule::string("abc"), + examples: vec![ + ("abcd", Some("abc")), + ("ab", None), + ], + }, + Row { + rule: Rule::repeat(Rule::seq(vec![ + Rule::string("{"), + Rule::pattern("[a-f]+"), + Rule::string("}"), + ])), + examples: vec![ + ("{a}{", Some("{a}")), + ("{a}{d", Some("{a}")), + ("ab", None), + ], + }, ]; - for Row { pattern, examples } in table.iter() { - let nfa = expand_rule(Rule::pattern(pattern)).unwrap(); + for Row { rule, examples } in table.iter() { + let mut nfa = Nfa::new(); + expand_rule(rule.clone(), &mut nfa, 0).unwrap(); for (haystack, needle) in examples.iter() { assert_eq!(simulate_nfa(&nfa, haystack), *needle); } diff --git a/src/prepare_grammar/extract_simple_aliases.rs b/src/prepare_grammar/extract_simple_aliases.rs index 250246f3..2a175242 100644 --- a/src/prepare_grammar/extract_simple_aliases.rs +++ b/src/prepare_grammar/extract_simple_aliases.rs @@ -1,6 +1,5 @@ use crate::rules::AliasMap; use crate::grammars::{LexicalGrammar, SyntaxGrammar}; -use super::ExtractedGrammar; pub(super) fn extract_simple_aliases( syntax_grammar: &mut SyntaxGrammar, diff --git a/src/prepare_grammar/extract_tokens.rs b/src/prepare_grammar/extract_tokens.rs index ee90b3c8..7322516f 100644 --- a/src/prepare_grammar/extract_tokens.rs +++ b/src/prepare_grammar/extract_tokens.rs @@ -3,12 +3,12 @@ use std::rc::Rc; use std::mem; use crate::error::{Error, Result}; use crate::rules::{Rule, MetadataParams, Symbol, SymbolType}; -use crate::grammars::{Variable, VariableType, LexicalGrammar, ExternalToken}; -use super::{InternedGrammar, ExtractedGrammar}; +use crate::grammars::{Variable, ExternalToken}; +use super::{InternedGrammar, ExtractedSyntaxGrammar, ExtractedLexicalGrammar}; pub(super) fn extract_tokens( mut grammar: InternedGrammar -) -> Result<(ExtractedGrammar, LexicalGrammar)> { +) -> Result<(ExtractedSyntaxGrammar, ExtractedLexicalGrammar)> { let mut extractor = TokenExtractor { current_variable_name: String::new(), current_variable_token_count: 0, @@ -138,7 +138,7 @@ pub(super) fn extract_tokens( } Ok(( 
- ExtractedGrammar { + ExtractedSyntaxGrammar { variables, expected_conflicts, extra_tokens, @@ -146,7 +146,7 @@ pub(super) fn extract_tokens( external_tokens, word_token, }, - LexicalGrammar { + ExtractedLexicalGrammar { variables: lexical_variables, separators, } @@ -198,20 +198,19 @@ impl TokenExtractor { } else { Rule::Metadata { params: params.clone(), - rule: Rc::new(self.extract_tokens_in_rule((&rule).clone())) + rule: Box::new(self.extract_tokens_in_rule((&rule).clone())) } } }, Rule::Repeat(content) => Rule::Repeat( - Rc::new(self.extract_tokens_in_rule(content)) + Box::new(self.extract_tokens_in_rule(content)) + ), + Rule::Seq(elements) => Rule::Seq( + elements.iter().map(|e| self.extract_tokens_in_rule(e)).collect() + ), + Rule::Choice(elements) => Rule::Choice( + elements.iter().map(|e| self.extract_tokens_in_rule(e)).collect() ), - Rule::Seq { left, right } => Rule::Seq { - left: Rc::new(self.extract_tokens_in_rule(left)), - right: Rc::new(self.extract_tokens_in_rule(right)), - }, - Rule::Choice { elements } => Rule::Choice { - elements: elements.iter().map(|e| self.extract_tokens_in_rule(e)).collect() - }, _ => input.clone() } } @@ -249,19 +248,18 @@ impl SymbolReplacer { fn replace_symbols_in_rule(&mut self, rule: &Rule) -> Rule { match rule { Rule::Symbol(symbol) => self.replace_symbol(*symbol).into(), - Rule::Choice { elements } => Rule::Choice { - elements: elements.iter().map(|e| self.replace_symbols_in_rule(e)).collect() - }, - Rule::Seq { left, right } => Rule::Seq { - left: Rc::new(self.replace_symbols_in_rule(left)), - right: Rc::new(self.replace_symbols_in_rule(right)), - }, + Rule::Choice(elements) => Rule::Choice( + elements.iter().map(|e| self.replace_symbols_in_rule(e)).collect() + ), + Rule::Seq(elements) => Rule::Seq( + elements.iter().map(|e| self.replace_symbols_in_rule(e)).collect() + ), Rule::Repeat(content) => Rule::Repeat( - Rc::new(self.replace_symbols_in_rule(content)) + Box::new(self.replace_symbols_in_rule(content)) ), Rule::Metadata { rule, params } => Rule::Metadata { params: params.clone(), - rule: Rc::new(self.replace_symbols_in_rule(rule)), + rule: Box::new(self.replace_symbols_in_rule(rule)), }, _ => rule.clone() } @@ -290,6 +288,7 @@ impl SymbolReplacer { #[cfg(test)] mod test { use super::*; + use crate::grammars::VariableType; #[test] fn test_extraction() { diff --git a/src/prepare_grammar/flatten_grammar.rs b/src/prepare_grammar/flatten_grammar.rs index 36fe76c9..0f09cd14 100644 --- a/src/prepare_grammar/flatten_grammar.rs +++ b/src/prepare_grammar/flatten_grammar.rs @@ -1,7 +1,7 @@ use crate::error::Result; use crate::grammars::SyntaxGrammar; -use super::ExtractedGrammar; +use super::ExtractedSyntaxGrammar; -pub(super) fn flatten_grammar(grammar: ExtractedGrammar) -> Result { +pub(super) fn flatten_grammar(grammar: ExtractedSyntaxGrammar) -> Result { unimplemented!(); } diff --git a/src/prepare_grammar/intern_symbols.rs b/src/prepare_grammar/intern_symbols.rs index e4cf7ff1..17132262 100644 --- a/src/prepare_grammar/intern_symbols.rs +++ b/src/prepare_grammar/intern_symbols.rs @@ -80,26 +80,26 @@ struct Interner<'a> { impl<'a> Interner<'a> { fn intern_rule(&self, rule: &Rule) -> Result { match rule { - Rule::Choice { elements } => { + Rule::Choice(elements) => { let mut result = Vec::with_capacity(elements.len()); for element in elements { result.push(self.intern_rule(element)?); } - Ok(Rule::Choice { elements: result }) + Ok(Rule::Choice(result)) }, - - Rule::Seq { left, right } => - Ok(Rule::Seq { - left: 
Rc::new(self.intern_rule(left)?), - right: Rc::new(self.intern_rule(right)?), - }), - - Rule::Repeat(content) => - Ok(Rule::Repeat(Rc::new(self.intern_rule(content)?))), - + Rule::Seq(elements) => { + let mut result = Vec::with_capacity(elements.len()); + for element in elements { + result.push(self.intern_rule(element)?); + } + Ok(Rule::Seq(result)) + }, + Rule::Repeat(content) => Ok(Rule::Repeat( + Box::new(self.intern_rule(content)?) + )), Rule::Metadata { rule, params } => Ok(Rule::Metadata { - rule: Rc::new(self.intern_rule(rule)?), + rule: Box::new(self.intern_rule(rule)?), params: params.clone() }), diff --git a/src/prepare_grammar/mod.rs b/src/prepare_grammar/mod.rs index b860807a..e2615479 100644 --- a/src/prepare_grammar/mod.rs +++ b/src/prepare_grammar/mod.rs @@ -2,7 +2,7 @@ mod intern_symbols; mod extract_tokens; mod expand_repeats; mod flatten_grammar; -mod normalize_rules; +mod expand_tokens; mod extract_simple_aliases; use crate::rules::{AliasMap, Rule, Symbol}; @@ -12,7 +12,7 @@ use self::intern_symbols::intern_symbols; use self::extract_tokens::extract_tokens; use self::expand_repeats::expand_repeats; use self::flatten_grammar::flatten_grammar; -use self::normalize_rules::normalize_rules; +use self::expand_tokens::expand_tokens; use self::extract_simple_aliases::extract_simple_aliases; pub(self) struct IntermediateGrammar { @@ -25,7 +25,14 @@ pub(self) struct IntermediateGrammar { } pub(self) type InternedGrammar = IntermediateGrammar; -pub(self) type ExtractedGrammar = IntermediateGrammar; + +pub(self) type ExtractedSyntaxGrammar = IntermediateGrammar; + +#[derive(Debug, PartialEq, Eq)] +pub(self) struct ExtractedLexicalGrammar { + variables: Vec, + separators: Vec, +} pub(crate) fn prepare_grammar( input_grammar: &InputGrammar @@ -34,7 +41,7 @@ pub(crate) fn prepare_grammar( let (syntax_grammar, lexical_grammar) = extract_tokens(interned_grammar)?; let syntax_grammar = expand_repeats(syntax_grammar); let mut syntax_grammar = flatten_grammar(syntax_grammar)?; - let mut lexical_grammar = normalize_rules(lexical_grammar); + let mut lexical_grammar = expand_tokens(lexical_grammar)?; let simple_aliases = extract_simple_aliases(&mut syntax_grammar, &mut lexical_grammar); Ok((syntax_grammar, lexical_grammar, simple_aliases)) } diff --git a/src/rules.rs b/src/rules.rs index b593496a..c6f18cf4 100644 --- a/src/rules.rs +++ b/src/rules.rs @@ -49,18 +49,13 @@ pub(crate) enum Rule { Pattern(String), NamedSymbol(String), Symbol(Symbol), - Choice { - elements: Vec, - }, + Choice(Vec), Metadata { params: MetadataParams, - rule: Rc, + rule: Box, }, - Repeat(Rc), - Seq { - left: Rc, - right: Rc, - } + Repeat(Box), + Seq(Vec), } impl Rule { @@ -98,7 +93,7 @@ impl Rule { } pub fn repeat(rule: Rule) -> Self { - Rule::Repeat(Rc::new(rule)) + Rule::Repeat(Box::new(rule)) } pub fn choice(rules: Vec) -> Self { @@ -106,32 +101,11 @@ impl Rule { for rule in rules { choice_helper(&mut elements, rule); } - Rule::Choice { elements } + Rule::Choice(elements) } pub fn seq(rules: Vec) -> Self { - let mut result = Rule::Blank; - for rule in rules { - match rule { - Rule::Blank => continue, - Rule::Metadata { rule, params: _ } => { - if *rule == Rule::Blank { - continue; - } - }, - _ => { - if result == Rule::Blank { - result = rule; - } else { - result = Rule::Seq { - left: Rc::new(result), - right: Rc::new(rule), - } - } - } - } - } - result + Rule::Seq(rules) } pub fn terminal(index: usize) -> Self { @@ -196,14 +170,14 @@ fn add_metadata(input: Rule, f: T) -> Rule { _ => { let mut params = 
MetadataParams::default(); f(&mut params); - Rule::Metadata { rule: Rc::new(input), params } + Rule::Metadata { rule: Box::new(input), params } } } fn choice_helper(result: &mut Vec<Rule>, rule: Rule) { match rule { - Rule::Choice {elements} => { + Rule::Choice(elements) => { for element in elements { choice_helper(result, element); } From b0a7c854a4939915703980c229093e70147a1615 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Mon, 10 Dec 2018 14:57:46 -0800 Subject: [PATCH 053/102] Avoid redundant regex compilation when instantiating PropertySheets --- src/lib.rs | 39 +++++++++++++++++++++++++-------------- 1 file changed, 25 insertions(+), 14 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 724a08bd..d70dc607 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -48,7 +48,7 @@ pub struct InputEdit { struct PropertyTransition { state_id: usize, child_index: Option<usize>, - text_regex: Option<Regex>, + text_regex_index: Option<usize>, } struct PropertyState { @@ -66,6 +66,7 @@ pub enum PropertySheetError { pub struct PropertySheet<P = HashMap<String, String>> { states: Vec<PropertyState>, property_sets: Vec
<P>
, + text_regexes: Vec, } pub struct Node<'a>(ffi::TSNode, PhantomData<&'a ()>); @@ -615,11 +616,11 @@ impl<'a, P: DeserializeOwned> TreePropertyCursor<'a, P> { .get(&node_kind_id) .and_then(|transitions| { for transition in transitions.iter() { - if let Some(text_regex) = transition.text_regex.as_ref() { + if let Some(text_regex_index) = transition.text_regex_index { let node = self.cursor.node(); let text = &self.source.as_bytes()[node.start_byte()..node.end_byte()]; if let Ok(text) = str::from_utf8(text) { - if !text_regex.is_match(text) { + if !self.property_sheet.text_regexes[text_regex_index].is_match(text) { continue; } } @@ -699,28 +700,37 @@ impl PropertySheet
<P>
{ } let input: PropertySheetJSON
<P>
= serde_json::from_str(json) - .map_err(|e| PropertySheetError::InvalidJSON(e))?; + .map_err(PropertySheetError::InvalidJSON)?; let mut states = Vec::new(); + let mut text_regexes = Vec::new(); + let mut text_regex_patterns = Vec::new(); for state in input.states.iter() { let mut transitions = HashMap::new(); let node_kind_count = language.node_kind_count(); for transition in state.transitions.iter() { - for i in 0..node_kind_count { - let i = i as u16; - if language.node_kind_is_named(i) == transition.named - && transition.kind == language.node_kind_for_id(i) + let text_regex_index = if let Some(regex_pattern) = transition.text.as_ref() { + if let Some(index) = text_regex_patterns.iter().position(|r| *r == regex_pattern) { + Some(index) + } else { + text_regex_patterns.push(regex_pattern); + text_regexes.push(Regex::new(&regex_pattern).map_err(PropertySheetError::InvalidRegex)?); + Some(text_regexes.len() - 1) + } + } else { + None + }; + + for i in 0..(node_kind_count as u16) { + if + transition.kind == language.node_kind_for_id(i) && + transition.named == language.node_kind_is_named(i) { let entry = transitions.entry(i).or_insert(Vec::new()); - let text_regex = if let Some(text) = transition.text.as_ref() { - Some(Regex::new(&text).map_err(|e| PropertySheetError::InvalidRegex(e))?) - } else { - None - }; entry.push(PropertyTransition { child_index: transition.index, state_id: transition.state_id, - text_regex + text_regex_index, }); } } @@ -734,6 +744,7 @@ impl PropertySheet
<P>
{ Ok(Self { property_sets: input.property_sets, states, + text_regexes, }) } } From ba9da0a9b48dd7d374438eece53749061453fefe Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Tue, 11 Dec 2018 10:35:03 -0800 Subject: [PATCH 054/102] 0.3.4 --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index f61b1583..fde4fd31 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "tree-sitter" description = "Rust bindings to the Tree-sitter parsing library" -version = "0.3.3" +version = "0.3.4" authors = ["Max Brunsfeld "] build = "build.rs" license = "MIT" From 7acfb2b74e5ba3d66aff67d9afb698add9cb8708 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Tue, 11 Dec 2018 12:14:34 -0800 Subject: [PATCH 055/102] Implement flatten_grammar --- src/grammars.rs | 19 +- src/prepare_grammar/expand_repeats.rs | 221 +++++++++-------- src/prepare_grammar/expand_tokens.rs | 119 +++++---- src/prepare_grammar/extract_tokens.rs | 327 +++++++++++++------------ src/prepare_grammar/flatten_grammar.rs | 312 ++++++++++++++++++++++- src/prepare_grammar/intern_symbols.rs | 137 ++++++----- src/prepare_grammar/mod.rs | 20 +- src/rules.rs | 8 +- 8 files changed, 773 insertions(+), 390 deletions(-) diff --git a/src/grammars.rs b/src/grammars.rs index c5e9aaa1..3b3d47f7 100644 --- a/src/grammars.rs +++ b/src/grammars.rs @@ -52,7 +52,6 @@ pub(crate) struct ProductionStep { pub precedence: i32, pub associativity: Option, pub alias: Option, - pub is_excluded: bool, } #[derive(Clone, Debug, PartialEq, Eq)] @@ -65,6 +64,7 @@ pub(crate) struct Production { pub(crate) struct SyntaxVariable { pub name: String, pub kind: VariableType, + pub productions: Vec, } #[derive(Clone, Debug, PartialEq, Eq)] @@ -81,7 +81,22 @@ pub(crate) struct SyntaxGrammar { pub expected_conflicts: Vec>, pub external_tokens: Vec, pub variables_to_inline: Vec, - pub word_token: Symbol, + pub word_token: Option, +} + +impl ProductionStep { + pub(crate) fn new(symbol: Symbol) -> Self { + Self { symbol, precedence: 0, associativity: None, alias: None } + } + + pub(crate) fn with_prec(self, precedence: i32, associativity: Option) -> Self { + Self { + symbol: self.symbol, + precedence, + associativity, + alias: self.alias, + } + } } impl Variable { diff --git a/src/prepare_grammar/expand_repeats.rs b/src/prepare_grammar/expand_repeats.rs index 85f37c80..f3811c5f 100644 --- a/src/prepare_grammar/expand_repeats.rs +++ b/src/prepare_grammar/expand_repeats.rs @@ -1,16 +1,15 @@ -use crate::rules::{Rule, Symbol}; +use super::ExtractedSyntaxGrammar; use crate::grammars::{Variable, VariableType}; +use crate::rules::{Rule, Symbol}; use std::collections::HashMap; use std::mem; -use std::rc::Rc; -use super::ExtractedSyntaxGrammar; struct Expander { variable_name: String, repeat_count_in_variable: usize, preceding_symbol_count: usize, auxiliary_variables: Vec, - existing_repeats: HashMap + existing_repeats: HashMap, } impl Expander { @@ -25,11 +24,19 @@ impl Expander { fn expand_rule(&mut self, rule: &Rule) -> Rule { match rule { - Rule::Choice(elements) => - Rule::Choice(elements.iter().map(|element| self.expand_rule(element)).collect()), + Rule::Choice(elements) => Rule::Choice( + elements + .iter() + .map(|element| self.expand_rule(element)) + .collect(), + ), - Rule::Seq(elements) => - Rule::Seq(elements.iter().map(|element| self.expand_rule(element)).collect()), + Rule::Seq(elements) => Rule::Seq( + elements + .iter() + .map(|element| self.expand_rule(element)) + .collect(), + ), Rule::Repeat(content) => 
{ let inner_rule = self.expand_rule(content); @@ -39,9 +46,15 @@ impl Expander { } self.repeat_count_in_variable += 1; - let rule_name = format!("{}_repeat{}", self.variable_name, self.repeat_count_in_variable); - let repeat_symbol = Symbol::non_terminal(self.preceding_symbol_count + self.auxiliary_variables.len()); - self.existing_repeats.insert(inner_rule.clone(), repeat_symbol); + let rule_name = format!( + "{}_repeat{}", + self.variable_name, self.repeat_count_in_variable + ); + let repeat_symbol = Symbol::non_terminal( + self.preceding_symbol_count + self.auxiliary_variables.len(), + ); + self.existing_repeats + .insert(inner_rule.clone(), repeat_symbol); self.auxiliary_variables.push(Variable { name: rule_name, kind: VariableType::Auxiliary, @@ -50,7 +63,7 @@ impl Expander { Rule::Symbol(repeat_symbol), Rule::Symbol(repeat_symbol), ]), - inner_rule + inner_rule, ]), }); @@ -59,10 +72,10 @@ impl Expander { Rule::Metadata { rule, params } => Rule::Metadata { rule: Box::new(self.expand_rule(rule)), - params: params.clone() + params: params.clone(), }, - _ => rule.clone() + _ => rule.clone(), } } } @@ -80,7 +93,9 @@ pub(super) fn expand_repeats(mut grammar: ExtractedSyntaxGrammar) -> ExtractedSy expander.expand_variable(&mut variable); } - grammar.variables.extend(expander.auxiliary_variables.into_iter()); + grammar + .variables + .extend(expander.auxiliary_variables.into_iter()); grammar } @@ -91,112 +106,126 @@ mod tests { #[test] fn test_basic_repeat_expansion() { // Repeats nested inside of sequences and choices are expanded. - let grammar = expand_repeats(build_grammar(vec![ - Variable::named("rule0", Rule::seq(vec![ + let grammar = expand_repeats(build_grammar(vec![Variable::named( + "rule0", + Rule::seq(vec![ Rule::terminal(10), Rule::choice(vec![ Rule::repeat(Rule::terminal(11)), Rule::repeat(Rule::terminal(12)), ]), Rule::terminal(13), - ])), - ])); + ]), + )])); - assert_eq!(grammar.variables, vec![ - Variable::named("rule0", Rule::seq(vec![ - Rule::terminal(10), - Rule::choice(vec![ - Rule::non_terminal(1), - Rule::non_terminal(2), - ]), - Rule::terminal(13), - ])), - Variable::auxiliary("rule0_repeat1", Rule::choice(vec![ - Rule::seq(vec![ - Rule::non_terminal(1), - Rule::non_terminal(1), - ]), - Rule::terminal(11), - ])), - Variable::auxiliary("rule0_repeat2", Rule::choice(vec![ - Rule::seq(vec![ - Rule::non_terminal(2), - Rule::non_terminal(2), - ]), - Rule::terminal(12), - ])), - ]); + assert_eq!( + grammar.variables, + vec![ + Variable::named( + "rule0", + Rule::seq(vec![ + Rule::terminal(10), + Rule::choice(vec![Rule::non_terminal(1), Rule::non_terminal(2),]), + Rule::terminal(13), + ]) + ), + Variable::auxiliary( + "rule0_repeat1", + Rule::choice(vec![ + Rule::seq(vec![Rule::non_terminal(1), Rule::non_terminal(1),]), + Rule::terminal(11), + ]) + ), + Variable::auxiliary( + "rule0_repeat2", + Rule::choice(vec![ + Rule::seq(vec![Rule::non_terminal(2), Rule::non_terminal(2),]), + Rule::terminal(12), + ]) + ), + ] + ); } #[test] fn test_repeat_deduplication() { // Terminal 4 appears inside of a repeat in three different places. 
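// If deduplication works, all three uses should collapse into a single auxiliary rule: `expand_rule` consults the `existing_repeats` map for a structurally identical inner rule before minting a new `_repeatN` non-terminal, so every occurrence of `repeat(terminal(4))` resolves to the same `Symbol` (asserted below).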
let grammar = expand_repeats(build_grammar(vec![ - Variable::named("rule0", Rule::choice(vec![ - Rule::seq(vec![ Rule::terminal(1), Rule::repeat(Rule::terminal(4)) ]), - Rule::seq(vec![ Rule::terminal(2), Rule::repeat(Rule::terminal(4)) ]), - ])), - Variable::named("rule1", Rule::seq(vec![ - Rule::terminal(3), - Rule::repeat(Rule::terminal(4)), - ])), + Variable::named( + "rule0", + Rule::choice(vec![ + Rule::seq(vec![Rule::terminal(1), Rule::repeat(Rule::terminal(4))]), + Rule::seq(vec![Rule::terminal(2), Rule::repeat(Rule::terminal(4))]), + ]), + ), + Variable::named( + "rule1", + Rule::seq(vec![Rule::terminal(3), Rule::repeat(Rule::terminal(4))]), + ), ])); // Only one auxiliary rule is created for repeating terminal 4. - assert_eq!(grammar.variables, vec![ - Variable::named("rule0", Rule::choice(vec![ - Rule::seq(vec![ Rule::terminal(1), Rule::non_terminal(2) ]), - Rule::seq(vec![ Rule::terminal(2), Rule::non_terminal(2) ]), - ])), - Variable::named("rule1", Rule::seq(vec![ - Rule::terminal(3), - Rule::non_terminal(2), - ])), - Variable::auxiliary("rule0_repeat1", Rule::choice(vec![ - Rule::seq(vec![ - Rule::non_terminal(2), - Rule::non_terminal(2), - ]), - Rule::terminal(4), - ])) - ]); + assert_eq!( + grammar.variables, + vec![ + Variable::named( + "rule0", + Rule::choice(vec![ + Rule::seq(vec![Rule::terminal(1), Rule::non_terminal(2)]), + Rule::seq(vec![Rule::terminal(2), Rule::non_terminal(2)]), + ]) + ), + Variable::named( + "rule1", + Rule::seq(vec![Rule::terminal(3), Rule::non_terminal(2),]) + ), + Variable::auxiliary( + "rule0_repeat1", + Rule::choice(vec![ + Rule::seq(vec![Rule::non_terminal(2), Rule::non_terminal(2),]), + Rule::terminal(4), + ]) + ) + ] + ); } #[test] fn test_expansion_of_nested_repeats() { - let grammar = expand_repeats(build_grammar(vec![ - Variable::named("rule0", Rule::seq(vec![ + let grammar = expand_repeats(build_grammar(vec![Variable::named( + "rule0", + Rule::seq(vec![ Rule::terminal(10), Rule::repeat(Rule::seq(vec![ Rule::terminal(11), - Rule::repeat(Rule::terminal(12)) + Rule::repeat(Rule::terminal(12)), ])), - ])), - ])); + ]), + )])); - assert_eq!(grammar.variables, vec![ - Variable::named("rule0", Rule::seq(vec![ - Rule::terminal(10), - Rule::non_terminal(2), - ])), - Variable::auxiliary("rule0_repeat1", Rule::choice(vec![ - Rule::seq(vec![ - Rule::non_terminal(1), - Rule::non_terminal(1), - ]), - Rule::terminal(12), - ])), - Variable::auxiliary("rule0_repeat2", Rule::choice(vec![ - Rule::seq(vec![ - Rule::non_terminal(2), - Rule::non_terminal(2), - ]), - Rule::seq(vec![ - Rule::terminal(11), - Rule::non_terminal(1), - ]), - ])), - ]); + assert_eq!( + grammar.variables, + vec![ + Variable::named( + "rule0", + Rule::seq(vec![Rule::terminal(10), Rule::non_terminal(2),]) + ), + Variable::auxiliary( + "rule0_repeat1", + Rule::choice(vec![ + Rule::seq(vec![Rule::non_terminal(1), Rule::non_terminal(1),]), + Rule::terminal(12), + ]) + ), + Variable::auxiliary( + "rule0_repeat2", + Rule::choice(vec![ + Rule::seq(vec![Rule::non_terminal(2), Rule::non_terminal(2),]), + Rule::seq(vec![Rule::terminal(11), Rule::non_terminal(1),]), + ]) + ), + ] + ); } fn build_grammar(variables: Vec) -> ExtractedSyntaxGrammar { diff --git a/src/prepare_grammar/expand_tokens.rs b/src/prepare_grammar/expand_tokens.rs index 9cfa819f..e0e1f9a9 100644 --- a/src/prepare_grammar/expand_tokens.rs +++ b/src/prepare_grammar/expand_tokens.rs @@ -1,14 +1,13 @@ +use super::ExtractedLexicalGrammar; use crate::error::{Error, Result}; -use crate::rules::Rule; use 
crate::grammars::{LexicalGrammar, LexicalVariable}; -use crate::nfa::{Nfa, NfaState, CharacterSet}; -use super::{ExtractedLexicalGrammar}; +use crate::nfa::{CharacterSet, Nfa, NfaState}; +use crate::rules::Rule; use regex_syntax::ast::{parse, Ast, Class, ClassPerlKind, ClassSet, ClassSetItem, RepetitionKind}; fn expand_perl_character_class(item: &ClassPerlKind) -> CharacterSet { match item { - ClassPerlKind::Digit => CharacterSet::empty() - .add_range('0', '9'), + ClassPerlKind::Digit => CharacterSet::empty().add_range('0', '9'), ClassPerlKind::Space => CharacterSet::empty() .add_char(' ') .add_char('\t') @@ -18,7 +17,7 @@ fn expand_perl_character_class(item: &ClassPerlKind) -> CharacterSet { .add_char('_') .add_range('A', 'Z') .add_range('a', 'z') - .add_range('0', '9') + .add_range('0', '9'), } } @@ -26,7 +25,9 @@ fn expand_character_class(item: &ClassSetItem) -> Result { match item { ClassSetItem::Empty(_) => Ok(CharacterSet::Include(Vec::new())), ClassSetItem::Literal(literal) => Ok(CharacterSet::Include(vec![literal.c])), - ClassSetItem::Range(range) => Ok(CharacterSet::empty().add_range(range.start.c, range.end.c)), + ClassSetItem::Range(range) => { + Ok(CharacterSet::empty().add_range(range.start.c, range.end.c)) + } ClassSetItem::Union(union) => { let mut result = CharacterSet::empty(); for item in &union.items { @@ -43,58 +44,64 @@ fn expand_regex(ast: &Ast, nfa: &mut Nfa, mut next_state_index: u32) -> Result<( Ast::Empty(_) => Ok(()), Ast::Flags(_) => Err(Error::regex("Flags are not supported")), Ast::Literal(literal) => { - nfa.states.push(NfaState::Advance(CharacterSet::Include(vec![literal.c]), next_state_index)); + nfa.states.push(NfaState::Advance( + CharacterSet::Include(vec![literal.c]), + next_state_index, + )); Ok(()) - }, + } Ast::Dot(_) => { - nfa.states.push(NfaState::Advance(CharacterSet::Exclude(vec!['\n']), next_state_index)); + nfa.states.push(NfaState::Advance( + CharacterSet::Exclude(vec!['\n']), + next_state_index, + )); Ok(()) - }, + } Ast::Assertion(_) => Err(Error::regex("Assertions are not supported")), Ast::Class(class) => match class { Class::Unicode(_) => Err(Error::regex("Unicode character classes are not supported")), Class::Perl(class) => { - nfa.states.push(NfaState::Advance(expand_perl_character_class(&class.kind), next_state_index)); + nfa.states.push(NfaState::Advance( + expand_perl_character_class(&class.kind), + next_state_index, + )); Ok(()) - }, + } Class::Bracketed(class) => match &class.kind { ClassSet::Item(item) => { let character_set = expand_character_class(&item)?; - nfa.states.push(NfaState::Advance(character_set, next_state_index)); + nfa.states + .push(NfaState::Advance(character_set, next_state_index)); Ok(()) - }, - ClassSet::BinaryOp(_) => { - Err(Error::regex("Binary operators in character classes aren't supported")) } - } + ClassSet::BinaryOp(_) => Err(Error::regex( + "Binary operators in character classes aren't supported", + )), + }, }, Ast::Repetition(repetition) => match repetition.op.kind { RepetitionKind::ZeroOrOne => { expand_regex(&repetition.ast, nfa, next_state_index)?; nfa.prepend(|start_index| NfaState::Split(next_state_index, start_index)); Ok(()) - }, + } RepetitionKind::OneOrMore => { nfa.states.push(NfaState::Accept); // Placeholder for split let split_index = nfa.start_index(); expand_regex(&repetition.ast, nfa, split_index)?; - nfa.states[split_index as usize] = NfaState::Split( - nfa.start_index(), - next_state_index - ); + nfa.states[split_index as usize] = + NfaState::Split(nfa.start_index(), 
next_state_index); Ok(()) - }, + } RepetitionKind::ZeroOrMore => { nfa.states.push(NfaState::Accept); // Placeholder for split let split_index = nfa.start_index(); expand_regex(&repetition.ast, nfa, split_index)?; - nfa.states[split_index as usize] = NfaState::Split( - nfa.start_index(), - next_state_index - ); + nfa.states[split_index as usize] = + NfaState::Split(nfa.start_index(), next_state_index); nfa.prepend(|start_index| NfaState::Split(start_index, next_state_index)); Ok(()) - }, + } RepetitionKind::Range(_) => unimplemented!(), }, Ast::Group(group) => expand_regex(&group.ast, nfa, nfa.start_index()), @@ -109,7 +116,7 @@ fn expand_regex(ast: &Ast, nfa: &mut Nfa, mut next_state_index: u32) -> Result<( nfa.prepend(|start_index| NfaState::Split(start_index, alternative_start_index)); } Ok(()) - }, + } Ast::Concat(concat) => { for ast in concat.asts.iter().rev() { expand_regex(&ast, nfa, next_state_index)?; @@ -123,16 +130,20 @@ fn expand_regex(ast: &Ast, nfa: &mut Nfa, mut next_state_index: u32) -> Result<( fn expand_rule(rule: Rule, nfa: &mut Nfa, mut next_state_index: u32) -> Result<()> { match rule { Rule::Pattern(s) => { - let ast = parse::Parser::new().parse(&s).map_err(|e| Error::GrammarError(e.to_string()))?; + let ast = parse::Parser::new() + .parse(&s) + .map_err(|e| Error::GrammarError(e.to_string()))?; expand_regex(&ast, nfa, next_state_index)?; Ok(()) - }, + } Rule::String(s) => { for c in s.chars().rev() { - nfa.prepend(|start_index| NfaState::Advance(CharacterSet::empty().add_char(c), start_index)); + nfa.prepend(|start_index| { + NfaState::Advance(CharacterSet::empty().add_char(c), start_index) + }); } Ok(()) - }, + } Rule::Choice(elements) => { let mut alternative_start_indices = Vec::new(); for element in elements { @@ -144,24 +155,21 @@ fn expand_rule(rule: Rule, nfa: &mut Nfa, mut next_state_index: u32) -> Result<( nfa.prepend(|start_index| NfaState::Split(start_index, alternative_start_index)); } Ok(()) - }, + } Rule::Seq(elements) => { for element in elements.into_iter().rev() { expand_rule(element, nfa, next_state_index)?; next_state_index = nfa.start_index(); } Ok(()) - }, + } Rule::Repeat(rule) => { nfa.states.push(NfaState::Accept); // Placeholder for split let split_index = nfa.start_index(); expand_rule(*rule, nfa, split_index)?; - nfa.states[split_index as usize] = NfaState::Split( - nfa.start_index(), - next_state_index - ); + nfa.states[split_index as usize] = NfaState::Split(nfa.start_index(), next_state_index); Ok(()) - }, + } _ => Err(Error::grammar("Unexpected rule type")), } } @@ -184,7 +192,10 @@ pub(super) fn expand_tokens(grammar: ExtractedLexicalGrammar) -> Result Result<(ExtractedSyntaxGrammar, ExtractedLexicalGrammar)> { let mut extractor = TokenExtractor { current_variable_name: String::new(), @@ -40,9 +39,15 @@ pub(super) fn extract_tokens( // variable in the lexical grammar. Symbols that pointed to later variables // will need to have their indices decremented. 
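// For example, if variable 1 is extracted into the lexical grammar, a rule that pointed at `non_terminal(2)` must be rewritten as `non_terminal(1)`; the `SymbolReplacer` defined below records each extracted variable in its `replacements` map and performs this shift wherever the old symbols appear. (The concrete indices here are illustrative.)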
let mut variables = Vec::new(); - let mut symbol_replacer = SymbolReplacer { replacements: HashMap::new() }; + let mut symbol_replacer = SymbolReplacer { + replacements: HashMap::new(), + }; for (i, variable) in grammar.variables.into_iter().enumerate() { - if let Rule::Symbol(Symbol { kind: SymbolType::Terminal, index }) = variable.rule { + if let Rule::Symbol(Symbol { + kind: SymbolType::Terminal, + index, + }) = variable.rule + { if i > 0 && extractor.extracted_usage_counts[index] == 1 { let mut lexical_variable = &mut lexical_variables[index]; lexical_variable.kind = variable.kind; @@ -58,16 +63,19 @@ pub(super) fn extract_tokens( variable.rule = symbol_replacer.replace_symbols_in_rule(&variable.rule); } - let expected_conflicts = grammar.expected_conflicts + let expected_conflicts = grammar + .expected_conflicts .into_iter() - .map(|conflict| + .map(|conflict| { conflict .iter() .map(|symbol| symbol_replacer.replace_symbol(*symbol)) .collect() - ).collect(); + }) + .collect(); - let variables_to_inline = grammar.variables_to_inline + let variables_to_inline = grammar + .variables_to_inline .into_iter() .map(|symbol| symbol_replacer.replace_symbol(symbol)) .collect(); @@ -149,7 +157,7 @@ pub(super) fn extract_tokens( ExtractedLexicalGrammar { variables: lexical_variables, separators, - } + }, )) } @@ -161,7 +169,7 @@ struct TokenExtractor { } struct SymbolReplacer { - replacements: HashMap + replacements: HashMap, } impl TokenExtractor { @@ -198,20 +206,24 @@ impl TokenExtractor { } else { Rule::Metadata { params: params.clone(), - rule: Box::new(self.extract_tokens_in_rule((&rule).clone())) + rule: Box::new(self.extract_tokens_in_rule((&rule).clone())), } } - }, - Rule::Repeat(content) => Rule::Repeat( - Box::new(self.extract_tokens_in_rule(content)) - ), + } + Rule::Repeat(content) => Rule::Repeat(Box::new(self.extract_tokens_in_rule(content))), Rule::Seq(elements) => Rule::Seq( - elements.iter().map(|e| self.extract_tokens_in_rule(e)).collect() + elements + .iter() + .map(|e| self.extract_tokens_in_rule(e)) + .collect(), ), Rule::Choice(elements) => Rule::Choice( - elements.iter().map(|e| self.extract_tokens_in_rule(e)).collect() + elements + .iter() + .map(|e| self.extract_tokens_in_rule(e)) + .collect(), ), - _ => input.clone() + _ => input.clone(), } } @@ -219,7 +231,7 @@ impl TokenExtractor { for (i, variable) in self.extracted_variables.iter_mut().enumerate() { if variable.rule == *rule { self.extracted_usage_counts[i] += 1; - return Symbol::terminal(i) + return Symbol::terminal(i); } } @@ -231,10 +243,9 @@ impl TokenExtractor { Variable::auxiliary( &format!( "{}_token{}", - &self.current_variable_name, - self.current_variable_token_count + &self.current_variable_name, self.current_variable_token_count ), - rule.clone() + rule.clone(), ) }; @@ -249,25 +260,29 @@ impl SymbolReplacer { match rule { Rule::Symbol(symbol) => self.replace_symbol(*symbol).into(), Rule::Choice(elements) => Rule::Choice( - elements.iter().map(|e| self.replace_symbols_in_rule(e)).collect() + elements + .iter() + .map(|e| self.replace_symbols_in_rule(e)) + .collect(), ), Rule::Seq(elements) => Rule::Seq( - elements.iter().map(|e| self.replace_symbols_in_rule(e)).collect() - ), - Rule::Repeat(content) => Rule::Repeat( - Box::new(self.replace_symbols_in_rule(content)) + elements + .iter() + .map(|e| self.replace_symbols_in_rule(e)) + .collect(), ), + Rule::Repeat(content) => Rule::Repeat(Box::new(self.replace_symbols_in_rule(content))), Rule::Metadata { rule, params } => Rule::Metadata { params: 
params.clone(), rule: Box::new(self.replace_symbols_in_rule(rule)), }, - _ => rule.clone() + _ => rule.clone(), } } fn replace_symbol(&self, symbol: Symbol) -> Symbol { if !symbol.is_non_terminal() { - return symbol + return symbol; } if let Some(replacement) = self.replacements.get(&symbol.index) { @@ -293,81 +308,95 @@ mod test { #[test] fn test_extraction() { let (syntax_grammar, lexical_grammar) = extract_tokens(build_grammar(vec![ - Variable::named("rule_0", Rule::repeat(Rule::seq(vec![ - Rule::string("a"), - Rule::pattern("b"), - Rule::choice(vec![ - Rule::non_terminal(1), - Rule::non_terminal(2), - Rule::token(Rule::repeat(Rule::choice(vec![ - Rule::string("c"), - Rule::string("d"), - ]))) - ]) - ]))), + Variable::named( + "rule_0", + Rule::repeat(Rule::seq(vec![ + Rule::string("a"), + Rule::pattern("b"), + Rule::choice(vec![ + Rule::non_terminal(1), + Rule::non_terminal(2), + Rule::token(Rule::repeat(Rule::choice(vec![ + Rule::string("c"), + Rule::string("d"), + ]))), + ]), + ])), + ), Variable::named("rule_1", Rule::pattern("e")), Variable::named("rule_2", Rule::pattern("b")), - Variable::named("rule_3", Rule::seq(vec![ - Rule::non_terminal(2), - Rule::Blank, - ])), - ])).unwrap(); + Variable::named( + "rule_3", + Rule::seq(vec![Rule::non_terminal(2), Rule::Blank]), + ), + ])) + .unwrap(); - assert_eq!(syntax_grammar.variables, vec![ - Variable::named("rule_0", Rule::repeat(Rule::seq(vec![ - // The string "a" was replaced by a symbol referencing the lexical grammar - Rule::terminal(0), + assert_eq!( + syntax_grammar.variables, + vec![ + Variable::named( + "rule_0", + Rule::repeat(Rule::seq(vec![ + // The string "a" was replaced by a symbol referencing the lexical grammar + Rule::terminal(0), + // The pattern "b" was replaced by a symbol referencing the lexical grammar + Rule::terminal(1), + Rule::choice(vec![ + // The symbol referencing `rule_1` was replaced by a symbol referencing + // the lexical grammar. + Rule::terminal(3), + // The symbol referencing `rule_2` had its index decremented because + // `rule_1` was moved to the lexical grammar. + Rule::non_terminal(1), + // The rule wrapped in `token` was replaced by a symbol referencing + // the lexical grammar. + Rule::terminal(2), + ]) + ])) + ), + // The pattern "e" was only used in once place: as the definition of `rule_1`, + // so that rule was moved to the lexical grammar. The pattern "b" appeared in + // two places, so it was not moved into the lexical grammar. + Variable::named("rule_2", Rule::terminal(1)), + Variable::named( + "rule_3", + Rule::seq(vec![Rule::non_terminal(1), Rule::Blank,]) + ), + ] + ); - // The pattern "b" was replaced by a symbol referencing the lexical grammar - Rule::terminal(1), - Rule::choice(vec![ - // The symbol referencing `rule_1` was replaced by a symbol referencing - // the lexical grammar. - Rule::terminal(3), - - // The symbol referencing `rule_2` had its index decremented because - // `rule_1` was moved to the lexical grammar. - Rule::non_terminal(1), - - // The rule wrapped in `token` was replaced by a symbol referencing - // the lexical grammar. - Rule::terminal(2), - ]) - ]))), - - // The pattern "e" was only used in once place: as the definition of `rule_1`, - // so that rule was moved to the lexical grammar. The pattern "b" appeared in - // two places, so it was not moved into the lexical grammar. 
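// In short: a token rule used exactly once is moved wholesale into the lexical grammar under its original name (like `rule_1` here), while a token that appears in several places becomes a shared auxiliary lexical variable (like `rule_0_token1`) that each use site references through a `Rule::terminal` symbol.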
- Variable::named("rule_2", Rule::terminal(1)), - Variable::named("rule_3", Rule::seq(vec![ - Rule::non_terminal(1), - Rule::Blank, - ])), - ]); - - assert_eq!(lexical_grammar.variables, vec![ - Variable::anonymous("a", Rule::string("a")), - Variable::auxiliary("rule_0_token1", Rule::pattern("b")), - Variable::auxiliary("rule_0_token2", Rule::repeat(Rule::choice(vec![ - Rule::string("c"), - Rule::string("d"), - ]))), - Variable::named("rule_1", Rule::pattern("e")), - ]); + assert_eq!( + lexical_grammar.variables, + vec![ + Variable::anonymous("a", Rule::string("a")), + Variable::auxiliary("rule_0_token1", Rule::pattern("b")), + Variable::auxiliary( + "rule_0_token2", + Rule::repeat(Rule::choice(vec![Rule::string("c"), Rule::string("d"),])) + ), + Variable::named("rule_1", Rule::pattern("e")), + ] + ); } #[test] fn test_start_rule_is_token() { - let (syntax_grammar, lexical_grammar) = extract_tokens(build_grammar(vec![ - Variable::named("rule_0", Rule::string("hello")), - ])).unwrap(); + let (syntax_grammar, lexical_grammar) = + extract_tokens(build_grammar(vec![Variable::named( + "rule_0", + Rule::string("hello"), + )])) + .unwrap(); - assert_eq!(syntax_grammar.variables, vec![ - Variable::named("rule_0", Rule::terminal(0)), - ]); - assert_eq!(lexical_grammar.variables, vec![ - Variable::anonymous("hello", Rule::string("hello")), - ]) + assert_eq!( + syntax_grammar.variables, + vec![Variable::named("rule_0", Rule::terminal(0)),] + ); + assert_eq!( + lexical_grammar.variables, + vec![Variable::anonymous("hello", Rule::string("hello")),] + ) } #[test] @@ -376,29 +405,25 @@ mod test { Variable::named("rule_0", Rule::string("x")), Variable::named("comment", Rule::pattern("//.*")), ]); - grammar.extra_tokens = vec![ - Rule::string(" "), - Rule::non_terminal(1), - ]; + grammar.extra_tokens = vec![Rule::string(" "), Rule::non_terminal(1)]; let (syntax_grammar, lexical_grammar) = extract_tokens(grammar).unwrap(); - assert_eq!(syntax_grammar.extra_tokens, vec![ - Symbol::terminal(1), - ]); - assert_eq!(lexical_grammar.separators, vec![ - Rule::string(" "), - ]); + assert_eq!(syntax_grammar.extra_tokens, vec![Symbol::terminal(1),]); + assert_eq!(lexical_grammar.separators, vec![Rule::string(" "),]); } #[test] fn test_extract_externals() { let mut grammar = build_grammar(vec![ - Variable::named("rule_0", Rule::seq(vec![ - Rule::external(0), - Rule::string("a"), - Rule::non_terminal(1), - Rule::non_terminal(2), - ])), + Variable::named( + "rule_0", + Rule::seq(vec![ + Rule::external(0), + Rule::string("a"), + Rule::non_terminal(1), + Rule::non_terminal(2), + ]), + ), Variable::named("rule_1", Rule::string("b")), Variable::named("rule_2", Rule::string("c")), ]); @@ -410,23 +435,26 @@ mod test { let (syntax_grammar, _) = extract_tokens(grammar).unwrap(); - assert_eq!(syntax_grammar.external_tokens, vec![ - ExternalToken { - name: "external_0".to_string(), - kind: VariableType::Named, - corresponding_internal_token: None, - }, - ExternalToken { - name: "a".to_string(), - kind: VariableType::Anonymous, - corresponding_internal_token: Some(Symbol::terminal(0)), - }, - ExternalToken { - name: "rule_2".to_string(), - kind: VariableType::Named, - corresponding_internal_token: Some(Symbol::terminal(2)), - }, - ]); + assert_eq!( + syntax_grammar.external_tokens, + vec![ + ExternalToken { + name: "external_0".to_string(), + kind: VariableType::Named, + corresponding_internal_token: None, + }, + ExternalToken { + name: "a".to_string(), + kind: VariableType::Anonymous, + corresponding_internal_token: 
Some(Symbol::terminal(0)), + }, + ExternalToken { + name: "rule_2".to_string(), + kind: VariableType::Named, + corresponding_internal_token: Some(Symbol::terminal(2)), + }, + ] + ); } #[test] @@ -436,14 +464,15 @@ mod test { Variable::named("rule_1", Rule::non_terminal(2)), Variable::named("rule_2", Rule::string("x")), ]); - grammar.extra_tokens = vec![ - Rule::non_terminal(1), - ]; + grammar.extra_tokens = vec![Rule::non_terminal(1)]; match extract_tokens(grammar) { Err(Error::GrammarError(s)) => { - assert_eq!(s, "Non-token symbol 'rule_1' cannot be used as an extra token"); - }, + assert_eq!( + s, + "Non-token symbol 'rule_1' cannot be used as an extra token" + ); + } _ => { panic!("Expected an error but got no error"); } @@ -453,24 +482,22 @@ mod test { #[test] fn test_error_on_external_with_same_name_as_non_terminal() { let mut grammar = build_grammar(vec![ - Variable::named("rule_0", Rule::seq(vec![ - Rule::non_terminal(1), - Rule::non_terminal(2), - ])), - Variable::named("rule_1", Rule::seq(vec![ - Rule::non_terminal(2), - Rule::non_terminal(2), - ])), + Variable::named( + "rule_0", + Rule::seq(vec![Rule::non_terminal(1), Rule::non_terminal(2)]), + ), + Variable::named( + "rule_1", + Rule::seq(vec![Rule::non_terminal(2), Rule::non_terminal(2)]), + ), Variable::named("rule_2", Rule::string("a")), ]); - grammar.external_tokens = vec![ - Variable::named("rule_1", Rule::non_terminal(1)), - ]; + grammar.external_tokens = vec![Variable::named("rule_1", Rule::non_terminal(1))]; match extract_tokens(grammar) { Err(Error::GrammarError(s)) => { assert_eq!(s, "Rule 'rule_1' cannot be used as both an external token and a non-terminal rule"); - }, + } _ => { panic!("Expected an error but got no error"); } diff --git a/src/prepare_grammar/flatten_grammar.rs b/src/prepare_grammar/flatten_grammar.rs index 0f09cd14..3ffef086 100644 --- a/src/prepare_grammar/flatten_grammar.rs +++ b/src/prepare_grammar/flatten_grammar.rs @@ -1,7 +1,313 @@ -use crate::error::Result; -use crate::grammars::SyntaxGrammar; use super::ExtractedSyntaxGrammar; +use crate::error::Result; +use crate::grammars::{Production, ProductionStep, SyntaxGrammar, SyntaxVariable, Variable}; +use crate::rules::{Alias, Associativity, Rule}; + +struct RuleFlattener { + production: Production, + precedence_stack: Vec, + associativity_stack: Vec, + alias_stack: Vec, +} + +impl RuleFlattener { + fn new() -> Self { + Self { + production: Production { + steps: Vec::new(), + dynamic_precedence: 0, + }, + precedence_stack: Vec::new(), + associativity_stack: Vec::new(), + alias_stack: Vec::new(), + } + } + + fn flatten(mut self, rule: Rule) -> Production { + self.apply(rule, true); + self.production + } + + fn apply(&mut self, rule: Rule, at_end: bool) { + match rule { + Rule::Seq(members) => { + let last_index = members.len() - 1; + for (i, member) in members.into_iter().enumerate() { + self.apply(member, i == last_index && at_end); + } + } + Rule::Metadata { rule, params } => { + let mut has_precedence = false; + if let Some(precedence) = params.precedence { + has_precedence = true; + self.precedence_stack.push(precedence); + } + + let mut has_associativity = false; + if let Some(associativity) = params.associativity { + has_associativity = true; + self.associativity_stack.push(associativity); + } + + let mut has_alias = false; + if let Some(alias) = params.alias { + has_alias = true; + self.alias_stack.push(alias); + } + + if params.dynamic_precedence.abs() > self.production.dynamic_precedence.abs() { + self.production.dynamic_precedence = 
params.dynamic_precedence; + } + + self.apply(*rule, at_end); + + if has_precedence { + self.precedence_stack.pop(); + if !at_end { + self.production.steps.last_mut().unwrap().precedence = + self.precedence_stack.last().cloned().unwrap_or(0); + } + } + + if has_associativity { + self.associativity_stack.pop(); + if !at_end { + self.production.steps.last_mut().unwrap().associativity = + self.associativity_stack.last().cloned(); + } + } + + if has_alias { + self.alias_stack.pop(); + } + } + Rule::Symbol(symbol) => { + self.production.steps.push(ProductionStep { + symbol, + precedence: self.precedence_stack.last().cloned().unwrap_or(0), + associativity: self.associativity_stack.last().cloned(), + alias: self.alias_stack.last().cloned(), + }); + } + _ => (), + } + } +} + +fn extract_choices(rule: Rule) -> Vec { + match rule { + Rule::Seq(elements) => { + let mut result = vec![Rule::Blank]; + for element in elements { + let extraction = extract_choices(element); + let mut next_result = Vec::new(); + for entry in result { + for extraction_entry in extraction.iter() { + next_result.push(Rule::Seq(vec![entry.clone(), extraction_entry.clone()])); + } + } + result = next_result; + } + result + } + Rule::Choice(elements) => { + let mut result = Vec::new(); + for element in elements { + for rule in extract_choices(element) { + result.push(rule); + } + } + result + } + Rule::Metadata { rule, params } => extract_choices(*rule) + .into_iter() + .map(|rule| Rule::Metadata { + rule: Box::new(rule), + params: params.clone(), + }) + .collect(), + _ => vec![rule], + } +} + +fn flatten_variable(variable: Variable) -> Result { + let mut productions = Vec::new(); + for rule in extract_choices(variable.rule) { + let production = RuleFlattener::new().flatten(rule); + if !productions.contains(&production) { + productions.push(production); + } + } + Ok(SyntaxVariable { + name: variable.name, + kind: variable.kind, + productions, + }) +} pub(super) fn flatten_grammar(grammar: ExtractedSyntaxGrammar) -> Result { - unimplemented!(); + let mut variables = Vec::new(); + for variable in grammar.variables { + variables.push(flatten_variable(variable)?); + } + Ok(SyntaxGrammar { + extra_tokens: grammar.extra_tokens, + expected_conflicts: grammar.expected_conflicts, + variables_to_inline: grammar.variables_to_inline, + external_tokens: grammar.external_tokens, + word_token: grammar.word_token, + variables, + }) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::grammars::VariableType; + use crate::rules::Symbol; + + #[test] + fn test_flatten_grammar() { + let result = flatten_variable(Variable { + name: "test".to_string(), + kind: VariableType::Named, + rule: Rule::seq(vec![ + Rule::non_terminal(1), + Rule::prec_left( + 101, + Rule::seq(vec![ + Rule::non_terminal(2), + Rule::choice(vec![ + Rule::prec_right( + 102, + Rule::seq(vec![Rule::non_terminal(3), Rule::non_terminal(4)]), + ), + Rule::non_terminal(5), + ]), + Rule::non_terminal(6), + ]), + ), + Rule::non_terminal(7), + ]), + }) + .unwrap(); + + assert_eq!( + result.productions, + vec![ + Production { + dynamic_precedence: 0, + steps: vec![ + ProductionStep::new(Symbol::non_terminal(1)), + ProductionStep::new(Symbol::non_terminal(2)) + .with_prec(101, Some(Associativity::Left)), + ProductionStep::new(Symbol::non_terminal(3)) + .with_prec(102, Some(Associativity::Right)), + ProductionStep::new(Symbol::non_terminal(4)) + .with_prec(101, Some(Associativity::Left)), + ProductionStep::new(Symbol::non_terminal(6)), + ProductionStep::new(Symbol::non_terminal(7)), + ] 
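+                // `non_terminal(4)` is the last step inside the `prec_right` scope,
+                // which closes mid-production, so its precedence is re-assigned from
+                // the enclosing `prec_left` scope (101, Left), while `non_terminal(3)`
+                // keeps the inner (102, Right).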
+ }, + Production { + dynamic_precedence: 0, + steps: vec![ + ProductionStep::new(Symbol::non_terminal(1)), + ProductionStep::new(Symbol::non_terminal(2)) + .with_prec(101, Some(Associativity::Left)), + ProductionStep::new(Symbol::non_terminal(5)) + .with_prec(101, Some(Associativity::Left)), + ProductionStep::new(Symbol::non_terminal(6)), + ProductionStep::new(Symbol::non_terminal(7)), + ] + }, + ] + ); + } + + #[test] + fn test_flatten_grammar_with_maximum_dynamic_precedence() { + let result = flatten_variable(Variable { + name: "test".to_string(), + kind: VariableType::Named, + rule: Rule::seq(vec![ + Rule::non_terminal(1), + Rule::prec_dynamic(101, Rule::seq(vec![ + Rule::non_terminal(2), + Rule::choice(vec![ + Rule::prec_dynamic(102, Rule::seq(vec![ + Rule::non_terminal(3), + Rule::non_terminal(4) + ])), + Rule::non_terminal(5), + ]), + Rule::non_terminal(6), + ])), + Rule::non_terminal(7), + ]) + }).unwrap(); + + assert_eq!(result.productions, vec![ + Production { + dynamic_precedence: 102, + steps: vec![ + ProductionStep::new(Symbol::non_terminal(1)), + ProductionStep::new(Symbol::non_terminal(2)), + ProductionStep::new(Symbol::non_terminal(3)), + ProductionStep::new(Symbol::non_terminal(4)), + ProductionStep::new(Symbol::non_terminal(6)), + ProductionStep::new(Symbol::non_terminal(7)), + ], + }, + Production { + dynamic_precedence: 101, + steps: vec![ + ProductionStep::new(Symbol::non_terminal(1)), + ProductionStep::new(Symbol::non_terminal(2)), + ProductionStep::new(Symbol::non_terminal(5)), + ProductionStep::new(Symbol::non_terminal(6)), + ProductionStep::new(Symbol::non_terminal(7)), + ], + }, + ]); + } + + #[test] + fn test_flatten_grammar_with_final_precedence() { + let result = flatten_variable(Variable { + name: "test".to_string(), + kind: VariableType::Named, + rule: Rule::prec_left(101, Rule::seq(vec![ + Rule::non_terminal(1), + Rule::non_terminal(2), + ])), + }).unwrap(); + + assert_eq!(result.productions, vec![ + Production { + dynamic_precedence: 0, + steps: vec![ + ProductionStep::new(Symbol::non_terminal(1)).with_prec(101, Some(Associativity::Left)), + ProductionStep::new(Symbol::non_terminal(2)).with_prec(101, Some(Associativity::Left)), + ] + } + ]); + + let result = flatten_variable(Variable { + name: "test".to_string(), + kind: VariableType::Named, + rule: Rule::prec_left(101, Rule::seq(vec![ + Rule::non_terminal(1), + ])), + }).unwrap(); + + assert_eq!(result.productions, vec![ + Production { + dynamic_precedence: 0, + steps: vec![ + ProductionStep::new(Symbol::non_terminal(1)).with_prec(101, Some(Associativity::Left)), + ] + } + ]); + } } diff --git a/src/prepare_grammar/intern_symbols.rs b/src/prepare_grammar/intern_symbols.rs index 17132262..5165875c 100644 --- a/src/prepare_grammar/intern_symbols.rs +++ b/src/prepare_grammar/intern_symbols.rs @@ -1,14 +1,15 @@ -use crate::error::{Error, Result}; -use crate::rules::{Rule, Symbol}; -use crate::grammars::{InputGrammar, Variable, VariableType}; -use std::rc::Rc; use super::InternedGrammar; +use crate::error::{Error, Result}; +use crate::grammars::{InputGrammar, Variable, VariableType}; +use crate::rules::{Rule, Symbol}; pub(super) fn intern_symbols(grammar: &InputGrammar) -> Result { let interner = Interner { grammar }; if variable_type_for_name(&grammar.variables[0].name) == VariableType::Hidden { - return Err(Error::GrammarError("Grammar's start rule must be visible".to_string())); + return Err(Error::GrammarError( + "Grammar's start rule must be visible".to_string(), + )); } let mut variables = 
Vec::with_capacity(grammar.variables.len()); @@ -40,9 +41,10 @@ pub(super) fn intern_symbols(grammar: &InputGrammar) -> Result for conflict in grammar.expected_conflicts.iter() { let mut interned_conflict = Vec::with_capacity(conflict.len()); for name in conflict { - interned_conflict.push(interner - .intern_name(&name) - .ok_or_else(|| symbol_error(name))? + interned_conflict.push( + interner + .intern_name(&name) + .ok_or_else(|| symbol_error(name))?, ); } expected_conflicts.push(interned_conflict); @@ -57,9 +59,10 @@ pub(super) fn intern_symbols(grammar: &InputGrammar) -> Result let mut word_token = None; if let Some(name) = grammar.word_token.as_ref() { - word_token = Some(interner - .intern_name(&name) - .ok_or_else(|| symbol_error(&name))? + word_token = Some( + interner + .intern_name(&name) + .ok_or_else(|| symbol_error(&name))?, ); } @@ -74,7 +77,7 @@ pub(super) fn intern_symbols(grammar: &InputGrammar) -> Result } struct Interner<'a> { - grammar: &'a InputGrammar + grammar: &'a InputGrammar, } impl<'a> Interner<'a> { @@ -86,22 +89,19 @@ impl<'a> Interner<'a> { result.push(self.intern_rule(element)?); } Ok(Rule::Choice(result)) - }, + } Rule::Seq(elements) => { let mut result = Vec::with_capacity(elements.len()); for element in elements { result.push(self.intern_rule(element)?); } Ok(Rule::Seq(result)) - }, - Rule::Repeat(content) => Ok(Rule::Repeat( - Box::new(self.intern_rule(content)?) - )), - Rule::Metadata { rule, params } => - Ok(Rule::Metadata { - rule: Box::new(self.intern_rule(rule)?), - params: params.clone() - }), + } + Rule::Repeat(content) => Ok(Rule::Repeat(Box::new(self.intern_rule(content)?))), + Rule::Metadata { rule, params } => Ok(Rule::Metadata { + rule: Box::new(self.intern_rule(rule)?), + params: params.clone(), + }), Rule::NamedSymbol(name) => { if let Some(symbol) = self.intern_name(&name) { @@ -109,29 +109,28 @@ impl<'a> Interner<'a> { } else { Err(symbol_error(name)) } - }, - - _ => Ok(rule.clone()) + } + _ => Ok(rule.clone()), } } fn intern_name(&self, symbol: &str) -> Option { for (i, variable) in self.grammar.variables.iter().enumerate() { if variable.name == symbol { - return Some(Symbol::non_terminal(i)) + return Some(Symbol::non_terminal(i)); } } for (i, external_token) in self.grammar.external_tokens.iter().enumerate() { if let Rule::NamedSymbol(name) = external_token { if name == symbol { - return Some(Symbol::external(i)) + return Some(Symbol::external(i)); } } } - return None + return None; } } @@ -154,22 +153,23 @@ mod tests { #[test] fn test_basic_repeat_expansion() { let grammar = intern_symbols(&build_grammar(vec![ - Variable::named("x", Rule::choice(vec![ - Rule::named("y"), - Rule::named("_z"), - ])), + Variable::named("x", Rule::choice(vec![Rule::named("y"), Rule::named("_z")])), Variable::named("y", Rule::named("_z")), Variable::named("_z", Rule::string("a")), - ])).unwrap(); + ])) + .unwrap(); - assert_eq!(grammar.variables, vec![ - Variable::named("x", Rule::choice(vec![ - Rule::non_terminal(1), - Rule::non_terminal(2), - ])), - Variable::named("y", Rule::non_terminal(2)), - Variable::hidden("_z", Rule::string("a")), - ]); + assert_eq!( + grammar.variables, + vec![ + Variable::named( + "x", + Rule::choice(vec![Rule::non_terminal(1), Rule::non_terminal(2),]) + ), + Variable::named("y", Rule::non_terminal(2)), + Variable::hidden("_z", Rule::string("a")), + ] + ); } #[test] @@ -177,45 +177,50 @@ mod tests { // Variable `y` is both an internal and an external token. // Variable `z` is just an external token. 
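        // Both are listed in `external_tokens` below; only `y` also has its own rule.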
let mut input_grammar = build_grammar(vec![ - Variable::named("w", Rule::choice(vec![ - Rule::named("x"), - Rule::named("y"), - Rule::named("z"), - ])), + Variable::named( + "w", + Rule::choice(vec![Rule::named("x"), Rule::named("y"), Rule::named("z")]), + ), Variable::named("x", Rule::string("a")), Variable::named("y", Rule::string("b")), ]); - input_grammar.external_tokens.extend(vec![ - Rule::named("y"), - Rule::named("z"), - ]); + input_grammar + .external_tokens + .extend(vec![Rule::named("y"), Rule::named("z")]); let grammar = intern_symbols(&input_grammar).unwrap(); // Variable `y` is referred to by its internal index. // Variable `z` is referred to by its external index. - assert_eq!(grammar.variables, vec![ - Variable::named("w", Rule::choice(vec![ - Rule::non_terminal(1), - Rule::non_terminal(2), - Rule::external(1), - ])), - Variable::named("x", Rule::string("a")), - Variable::named("y", Rule::string("b")), - ]); + assert_eq!( + grammar.variables, + vec![ + Variable::named( + "w", + Rule::choice(vec![ + Rule::non_terminal(1), + Rule::non_terminal(2), + Rule::external(1), + ]) + ), + Variable::named("x", Rule::string("a")), + Variable::named("y", Rule::string("b")), + ] + ); // The external token for `y` refers back to its internal index. - assert_eq!(grammar.external_tokens, vec![ - Variable::named("y", Rule::non_terminal(2)), - Variable::named("z", Rule::external(1)), - ]); + assert_eq!( + grammar.external_tokens, + vec![ + Variable::named("y", Rule::non_terminal(2)), + Variable::named("z", Rule::external(1)), + ] + ); } #[test] fn test_grammar_with_undefined_symbols() { - let result = intern_symbols(&build_grammar(vec![ - Variable::named("x", Rule::named("y")), - ])); + let result = intern_symbols(&build_grammar(vec![Variable::named("x", Rule::named("y"))])); match result { Err(Error::SymbolError(message)) => assert_eq!(message, "Undefined symbol 'y'"), diff --git a/src/prepare_grammar/mod.rs b/src/prepare_grammar/mod.rs index e2615479..08233c53 100644 --- a/src/prepare_grammar/mod.rs +++ b/src/prepare_grammar/mod.rs @@ -1,19 +1,19 @@ -mod intern_symbols; -mod extract_tokens; mod expand_repeats; -mod flatten_grammar; mod expand_tokens; mod extract_simple_aliases; +mod extract_tokens; +mod flatten_grammar; +mod intern_symbols; -use crate::rules::{AliasMap, Rule, Symbol}; -use crate::grammars::{InputGrammar, SyntaxGrammar, LexicalGrammar, Variable, ExternalToken}; -use crate::error::Result; -use self::intern_symbols::intern_symbols; -use self::extract_tokens::extract_tokens; use self::expand_repeats::expand_repeats; -use self::flatten_grammar::flatten_grammar; use self::expand_tokens::expand_tokens; use self::extract_simple_aliases::extract_simple_aliases; +use self::extract_tokens::extract_tokens; +use self::flatten_grammar::flatten_grammar; +use self::intern_symbols::intern_symbols; +use crate::error::Result; +use crate::grammars::{ExternalToken, InputGrammar, LexicalGrammar, SyntaxGrammar, Variable}; +use crate::rules::{AliasMap, Rule, Symbol}; pub(self) struct IntermediateGrammar { variables: Vec, @@ -35,7 +35,7 @@ pub(self) struct ExtractedLexicalGrammar { } pub(crate) fn prepare_grammar( - input_grammar: &InputGrammar + input_grammar: &InputGrammar, ) -> Result<(SyntaxGrammar, LexicalGrammar, AliasMap)> { let interned_grammar = intern_symbols(input_grammar)?; let (syntax_grammar, lexical_grammar) = extract_tokens(interned_grammar)?; diff --git a/src/rules.rs b/src/rules.rs index c6f18cf4..5d0af86c 100644 --- a/src/rules.rs +++ b/src/rules.rs @@ -1,5 +1,3 @@ -use 
std::rc::Rc; -use std::char; use std::collections::HashMap; #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] @@ -92,6 +90,12 @@ impl Rule { }) } + pub fn prec_dynamic(value: i32, content: Rule) -> Self { + add_metadata(content, |params| { + params.dynamic_precedence = value; + }) + } + pub fn repeat(rule: Rule) -> Self { Rule::Repeat(Box::new(rule)) } From 85347541f155736e423203944903033c76993187 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Tue, 11 Dec 2018 17:30:12 -0800 Subject: [PATCH 056/102] Allow PropertySheet selectors to match the root node Co-Authored-By: Timothy Clem --- src/lib.rs | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index d70dc607..ad31d3c4 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -550,13 +550,16 @@ impl<'a> Drop for TreeCursor<'a> { impl<'a, P: DeserializeOwned> TreePropertyCursor<'a, P> { fn new(tree: &'a Tree, property_sheet: &'a PropertySheet
<P>
, source: &'a str) -> Self { - Self { + let mut result = Self { cursor: tree.root_node().walk(), child_index_stack: vec![0], state_stack: vec![0], property_sheet, source, - } + }; + let state = result.next_state(&result.current_state(), result.cursor.node().kind_id(), 0); + result.state_stack.push(state); + result } pub fn node(&self) -> Node<'a> { From 40d24097ecdcc188f255a9fbb03adca05c5f39fd Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Tue, 11 Dec 2018 12:37:09 -0800 Subject: [PATCH 057/102] Implement extract_simple_aliases --- src/grammars.rs | 9 + src/prepare_grammar/extract_simple_aliases.rs | 191 +++++++++++++++++- src/prepare_grammar/mod.rs | 4 +- 3 files changed, 199 insertions(+), 5 deletions(-) diff --git a/src/grammars.rs b/src/grammars.rs index 3b3d47f7..b76a583e 100644 --- a/src/grammars.rs +++ b/src/grammars.rs @@ -97,6 +97,15 @@ impl ProductionStep { alias: self.alias, } } + + pub(crate) fn with_alias(self, value: &str, is_named: bool) -> Self { + Self { + symbol: self.symbol, + precedence: self.precedence, + associativity: self.associativity, + alias: Some(Alias { value: value.to_string(), is_named }), + } + } } impl Variable { diff --git a/src/prepare_grammar/extract_simple_aliases.rs b/src/prepare_grammar/extract_simple_aliases.rs index 2a175242..a10c7982 100644 --- a/src/prepare_grammar/extract_simple_aliases.rs +++ b/src/prepare_grammar/extract_simple_aliases.rs @@ -1,9 +1,194 @@ -use crate::rules::AliasMap; +use crate::rules::{Alias, AliasMap, Symbol, SymbolType}; use crate::grammars::{LexicalGrammar, SyntaxGrammar}; +#[derive(Clone, Default)] +struct SymbolStatus { + alias: Option, + conflicting: bool, +} + pub(super) fn extract_simple_aliases( syntax_grammar: &mut SyntaxGrammar, - lexical_grammar: &mut LexicalGrammar + lexical_grammar: &LexicalGrammar ) -> AliasMap { - unimplemented!(); + // Determine which symbols in the grammars are *always* aliased to a single name. + let mut terminal_status_list = vec![SymbolStatus::default(); lexical_grammar.variables.len()]; + let mut non_terminal_status_list = vec![SymbolStatus::default(); syntax_grammar.variables.len()]; + let mut external_status_list = vec![SymbolStatus::default(); syntax_grammar.external_tokens.len()]; + for variable in syntax_grammar.variables.iter() { + for production in variable.productions.iter() { + for step in production.steps.iter() { + let mut status = match step.symbol { + Symbol { kind: SymbolType::External, index} => &mut external_status_list[index], + Symbol { kind: SymbolType::NonTerminal, index} => &mut non_terminal_status_list[index], + Symbol { kind: SymbolType::Terminal, index} => &mut terminal_status_list[index], + }; + + if step.alias.is_none() { + status.alias = None; + status.conflicting = true; + } + + if !status.conflicting { + if status.alias.is_none() { + status.alias = step.alias.clone(); + } else if status.alias != step.alias { + status.alias = None; + status.conflicting = true; + } + } + } + } + } + + // Remove the aliases for those symbols. 
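+    // Each of these symbols is aliased the same way at every usage site, so the
+    // alias can be recorded once in the returned map instead of on every
+    // production step.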
+ for variable in syntax_grammar.variables.iter_mut() { + for production in variable.productions.iter_mut() { + for step in production.steps.iter_mut() { + let status = match step.symbol { + Symbol { kind: SymbolType::External, index} => &external_status_list[index], + Symbol { kind: SymbolType::NonTerminal, index} => &non_terminal_status_list[index], + Symbol { kind: SymbolType::Terminal, index} => &terminal_status_list[index], + }; + + if status.alias.is_some() { + step.alias = None; + } + } + } + } + + // Populate a map of the symbols to their aliases. + let mut result = AliasMap::new(); + for (i, status) in terminal_status_list.into_iter().enumerate() { + if let Some(alias) = status.alias { + result.insert(Symbol::terminal(i), alias); + } + } + for (i, status) in non_terminal_status_list.into_iter().enumerate() { + if let Some(alias) = status.alias { + result.insert(Symbol::non_terminal(i), alias); + } + } + for (i, status) in external_status_list.into_iter().enumerate() { + if let Some(alias) = status.alias { + result.insert(Symbol::external(i), alias); + } + } + result +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::grammars::{LexicalVariable, SyntaxVariable, VariableType, Production, ProductionStep}; + use crate::nfa::Nfa; + + #[test] + fn test_extract_simple_aliases() { + let mut syntax_grammar = SyntaxGrammar { + variables: vec![ + SyntaxVariable { + name: "v1".to_owned(), + kind: VariableType::Named, + productions: vec![ + Production { + dynamic_precedence: 0, + steps: vec![ + ProductionStep::new(Symbol::terminal(0)).with_alias("a1", true), + ProductionStep::new(Symbol::terminal(1)).with_alias("a2", true), + ProductionStep::new(Symbol::terminal(2)).with_alias("a3", true), + ], + }, + ], + }, + SyntaxVariable { + name: "v2".to_owned(), + kind: VariableType::Named, + productions: vec![ + Production { + dynamic_precedence: 0, + steps: vec![ + // Token 0 is always aliased as "a1". + ProductionStep::new(Symbol::terminal(0)).with_alias("a1", true), + + // Token 1 is aliased above, but not here. + ProductionStep::new(Symbol::terminal(1)), + + // Token 2 is aliased differently than above. 
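+                            // ("a4" here vs "a3" in `v1`), so it is not a
+                            // simple alias and keeps its step-level alias.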
+ ProductionStep::new(Symbol::terminal(2)).with_alias("a4", true), + ], + }, + ], + }, + ], + extra_tokens: Vec::new(), + expected_conflicts: Vec::new(), + variables_to_inline: Vec::new(), + external_tokens: Vec::new(), + word_token: None, + }; + + let lexical_grammar = LexicalGrammar { + variables: vec![ + LexicalVariable { + name: "t1".to_string(), + kind: VariableType::Anonymous, + nfa: Nfa::new(), + }, + LexicalVariable { + name: "t2".to_string(), + kind: VariableType::Anonymous, + nfa: Nfa::new(), + }, + LexicalVariable { + name: "t3".to_string(), + kind: VariableType::Anonymous, + nfa: Nfa::new(), + } + ], + separators: Vec::new(), + }; + + let simple_aliases = extract_simple_aliases(&mut syntax_grammar, &lexical_grammar); + assert_eq!(simple_aliases.len(), 1); + assert_eq!(simple_aliases[&Symbol::terminal(0)], Alias { + value: "a1".to_string(), + is_named: true, + }); + + assert_eq!(syntax_grammar.variables, vec![ + SyntaxVariable { + name: "v1".to_owned(), + kind: VariableType::Named, + productions: vec![ + Production { + dynamic_precedence: 0, + steps: vec![ + // 'Simple' alias removed + ProductionStep::new(Symbol::terminal(0)), + + // Other aliases unchanged + ProductionStep::new(Symbol::terminal(1)).with_alias("a2", true), + ProductionStep::new(Symbol::terminal(2)).with_alias("a3", true), + ], + }, + ], + }, + SyntaxVariable { + name: "v2".to_owned(), + kind: VariableType::Named, + productions: vec![ + Production { + dynamic_precedence: 0, + steps: vec![ + ProductionStep::new(Symbol::terminal(0)), + ProductionStep::new(Symbol::terminal(1)), + ProductionStep::new(Symbol::terminal(2)).with_alias("a4", true), + ], + }, + ], + }, + ]); + } } diff --git a/src/prepare_grammar/mod.rs b/src/prepare_grammar/mod.rs index 08233c53..22435fca 100644 --- a/src/prepare_grammar/mod.rs +++ b/src/prepare_grammar/mod.rs @@ -41,7 +41,7 @@ pub(crate) fn prepare_grammar( let (syntax_grammar, lexical_grammar) = extract_tokens(interned_grammar)?; let syntax_grammar = expand_repeats(syntax_grammar); let mut syntax_grammar = flatten_grammar(syntax_grammar)?; - let mut lexical_grammar = expand_tokens(lexical_grammar)?; - let simple_aliases = extract_simple_aliases(&mut syntax_grammar, &mut lexical_grammar); + let lexical_grammar = expand_tokens(lexical_grammar)?; + let simple_aliases = extract_simple_aliases(&mut syntax_grammar, &lexical_grammar); Ok((syntax_grammar, lexical_grammar, simple_aliases)) } From 0103a83f3f88cb8745706517a96f32c01ef1286a Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Wed, 12 Dec 2018 18:04:29 -0800 Subject: [PATCH 058/102] Integrate separator rules into lexer nfa --- src/grammars.rs | 4 +- src/nfa.rs | 44 +-- src/prepare_grammar/expand_tokens.rs | 265 +++++++++++------- src/prepare_grammar/extract_simple_aliases.rs | 8 +- src/rules.rs | 1 - 5 files changed, 199 insertions(+), 123 deletions(-) diff --git a/src/grammars.rs b/src/grammars.rs index b76a583e..74c213e1 100644 --- a/src/grammars.rs +++ b/src/grammars.rs @@ -35,13 +35,13 @@ pub(crate) struct InputGrammar { pub(crate) struct LexicalVariable { pub name: String, pub kind: VariableType, - pub nfa: Nfa, + pub start_state: u32, } #[derive(Debug, PartialEq, Eq)] pub(crate) struct LexicalGrammar { + pub nfa: Nfa, pub variables: Vec, - pub separators: Vec, } // Extracted syntax grammar diff --git a/src/nfa.rs b/src/nfa.rs index 22cb2a2e..66861434 100644 --- a/src/nfa.rs +++ b/src/nfa.rs @@ -9,9 +9,13 @@ pub enum CharacterSet { #[derive(Debug, PartialEq, Eq)] pub enum NfaState { - Advance(CharacterSet, u32), + Advance { + 
chars: CharacterSet, + state: u32, + is_sep: bool, + }, Split(u32, u32), - Accept, + Accept(usize), } #[derive(PartialEq, Eq)] @@ -23,6 +27,7 @@ pub struct Nfa { pub struct NfaCursor<'a> { indices: Vec, nfa: &'a Nfa, + in_sep: bool, } impl CharacterSet { @@ -88,15 +93,15 @@ impl CharacterSet { impl Nfa { pub fn new() -> Self { - Nfa { states: vec![NfaState::Accept] } + Nfa { states: Vec::new() } } - pub fn start_index(&self) -> u32 { + pub fn last_state(&self) -> u32 { self.states.len() as u32 - 1 } pub fn prepend(&mut self, f: impl Fn(u32) -> NfaState) { - self.states.push(f(self.start_index())); + self.states.push(f(self.last_state())); } } @@ -116,38 +121,45 @@ impl fmt::Debug for Nfa { impl<'a> NfaCursor<'a> { pub fn new(nfa: &'a Nfa) -> Self { - let mut result = Self { nfa, indices: Vec::new() }; - result.add_indices(&mut vec![nfa.start_index()]); + let mut result = Self { nfa, indices: Vec::new(), in_sep: true }; + result.add_states(&mut vec![nfa.last_state()]); result } pub fn advance(&mut self, c: char) -> bool { let mut result = false; let mut new_indices = Vec::new(); + let mut any_sep_transitions = false; for index in &self.indices { - if let NfaState::Advance(chars, next_index) = &self.nfa.states[*index as usize] { + if let NfaState::Advance { chars, state, is_sep } = &self.nfa.states[*index as usize] { + if *is_sep { + any_sep_transitions = true; + } if chars.contains(c) { - new_indices.push(*next_index); + new_indices.push(*state); result = true; } } } + if !any_sep_transitions { + self.in_sep = false; + } self.indices.clear(); - self.add_indices(&mut new_indices); + self.add_states(&mut new_indices); result } - pub fn is_done(&self) -> bool { - self.indices.iter().any(|index| { - if let NfaState::Accept = self.nfa.states[*index as usize] { - true + pub fn finished_ids<'b>(&'b self) -> impl Iterator + 'b { + self.indices.iter().filter_map(move |index| { + if let NfaState::Accept(i) = self.nfa.states[*index as usize] { + Some(i) } else { - false + None } }) } - pub fn add_indices(&mut self, new_indices: &mut Vec) { + pub fn add_states(&mut self, new_indices: &mut Vec) { while let Some(index) = new_indices.pop() { let state = &self.nfa.states[index as usize]; if let NfaState::Split(left, right) = state { diff --git a/src/prepare_grammar/expand_tokens.rs b/src/prepare_grammar/expand_tokens.rs index e0e1f9a9..3019b2be 100644 --- a/src/prepare_grammar/expand_tokens.rs +++ b/src/prepare_grammar/expand_tokens.rs @@ -39,40 +39,46 @@ fn expand_character_class(item: &ClassSetItem) -> Result { } } -fn expand_regex(ast: &Ast, nfa: &mut Nfa, mut next_state_index: u32) -> Result<()> { +fn expand_regex(ast: &Ast, nfa: &mut Nfa, mut next_state_index: u32, is_sep: bool) -> Result { match ast { - Ast::Empty(_) => Ok(()), + Ast::Empty(_) => Ok(false), Ast::Flags(_) => Err(Error::regex("Flags are not supported")), Ast::Literal(literal) => { - nfa.states.push(NfaState::Advance( - CharacterSet::Include(vec![literal.c]), - next_state_index, - )); - Ok(()) + nfa.states.push(NfaState::Advance { + chars: CharacterSet::Include(vec![literal.c]), + state: next_state_index, + is_sep, + }); + Ok(true) } Ast::Dot(_) => { - nfa.states.push(NfaState::Advance( - CharacterSet::Exclude(vec!['\n']), - next_state_index, - )); - Ok(()) + nfa.states.push(NfaState::Advance { + chars: CharacterSet::Exclude(vec!['\n']), + state: next_state_index, + is_sep, + }); + Ok(true) } Ast::Assertion(_) => Err(Error::regex("Assertions are not supported")), Ast::Class(class) => match class { Class::Unicode(_) => 
Err(Error::regex("Unicode character classes are not supported")), Class::Perl(class) => { - nfa.states.push(NfaState::Advance( - expand_perl_character_class(&class.kind), - next_state_index, - )); - Ok(()) + nfa.states.push(NfaState::Advance { + chars: expand_perl_character_class(&class.kind), + state: next_state_index, + is_sep, + }); + Ok(true) } Class::Bracketed(class) => match &class.kind { ClassSet::Item(item) => { let character_set = expand_character_class(&item)?; - nfa.states - .push(NfaState::Advance(character_set, next_state_index)); - Ok(()) + nfa.states.push(NfaState::Advance { + chars: character_set, + state: next_state_index, + is_sep, + }); + Ok(true) } ClassSet::BinaryOp(_) => Err(Error::regex( "Binary operators in character classes aren't supported", @@ -81,134 +87,171 @@ fn expand_regex(ast: &Ast, nfa: &mut Nfa, mut next_state_index: u32) -> Result<( }, Ast::Repetition(repetition) => match repetition.op.kind { RepetitionKind::ZeroOrOne => { - expand_regex(&repetition.ast, nfa, next_state_index)?; - nfa.prepend(|start_index| NfaState::Split(next_state_index, start_index)); - Ok(()) + if expand_regex(&repetition.ast, nfa, next_state_index, is_sep)? { + nfa.prepend(|last_state| NfaState::Split(next_state_index, last_state)); + Ok(true) + } else { + Ok(false) + } } RepetitionKind::OneOrMore => { - nfa.states.push(NfaState::Accept); // Placeholder for split - let split_index = nfa.start_index(); - expand_regex(&repetition.ast, nfa, split_index)?; - nfa.states[split_index as usize] = - NfaState::Split(nfa.start_index(), next_state_index); - Ok(()) + nfa.states.push(NfaState::Accept(0)); // Placeholder for split + let split_index = nfa.last_state(); + if expand_regex(&repetition.ast, nfa, split_index, is_sep)? { + nfa.states[split_index as usize] = + NfaState::Split(nfa.last_state(), next_state_index); + Ok(true) + } else { + nfa.states.pop(); + Ok(false) + } } RepetitionKind::ZeroOrMore => { - nfa.states.push(NfaState::Accept); // Placeholder for split - let split_index = nfa.start_index(); - expand_regex(&repetition.ast, nfa, split_index)?; - nfa.states[split_index as usize] = - NfaState::Split(nfa.start_index(), next_state_index); - nfa.prepend(|start_index| NfaState::Split(start_index, next_state_index)); - Ok(()) + nfa.states.push(NfaState::Accept(0)); // Placeholder for split + let split_index = nfa.last_state(); + if expand_regex(&repetition.ast, nfa, split_index, is_sep)? { + nfa.states[split_index as usize] = + NfaState::Split(nfa.last_state(), next_state_index); + nfa.prepend(|last_state| NfaState::Split(last_state, next_state_index)); + Ok(true) + } else { + Ok(false) + } } RepetitionKind::Range(_) => unimplemented!(), }, - Ast::Group(group) => expand_regex(&group.ast, nfa, nfa.start_index()), + Ast::Group(group) => expand_regex(&group.ast, nfa, nfa.last_state(), is_sep), Ast::Alternation(alternation) => { let mut alternative_start_indices = Vec::new(); for ast in alternation.asts.iter() { - expand_regex(&ast, nfa, next_state_index)?; - alternative_start_indices.push(nfa.start_index()); + if expand_regex(&ast, nfa, next_state_index, is_sep)? 
{ + alternative_start_indices.push(nfa.last_state()); + } } alternative_start_indices.pop(); for alternative_start_index in alternative_start_indices { - nfa.prepend(|start_index| NfaState::Split(start_index, alternative_start_index)); + nfa.prepend(|last_state| NfaState::Split(last_state, alternative_start_index)); } - Ok(()) + Ok(true) } Ast::Concat(concat) => { + let mut result = false; for ast in concat.asts.iter().rev() { - expand_regex(&ast, nfa, next_state_index)?; - next_state_index = nfa.start_index(); + if expand_regex(&ast, nfa, next_state_index, is_sep)? { + result = true; + } + next_state_index = nfa.last_state(); } - Ok(()) + Ok(result) } } } -fn expand_rule(rule: Rule, nfa: &mut Nfa, mut next_state_index: u32) -> Result<()> { +fn expand_rule(rule: &Rule, nfa: &mut Nfa, mut next_state_index: u32, is_sep: bool) -> Result { match rule { Rule::Pattern(s) => { let ast = parse::Parser::new() .parse(&s) .map_err(|e| Error::GrammarError(e.to_string()))?; - expand_regex(&ast, nfa, next_state_index)?; - Ok(()) + expand_regex(&ast, nfa, next_state_index, is_sep) } Rule::String(s) => { for c in s.chars().rev() { - nfa.prepend(|start_index| { - NfaState::Advance(CharacterSet::empty().add_char(c), start_index) + nfa.prepend(|last_state| { + NfaState::Advance { + chars: CharacterSet::empty().add_char(c), + state: last_state, + is_sep, + } }); } - Ok(()) + Ok(s.len() > 0) } Rule::Choice(elements) => { let mut alternative_start_indices = Vec::new(); for element in elements { - expand_rule(element, nfa, next_state_index)?; - alternative_start_indices.push(nfa.start_index()); + if expand_rule(element, nfa, next_state_index, is_sep)? { + alternative_start_indices.push(nfa.last_state()); + } } alternative_start_indices.pop(); for alternative_start_index in alternative_start_indices { - nfa.prepend(|start_index| NfaState::Split(start_index, alternative_start_index)); + nfa.prepend(|last_state| NfaState::Split(last_state, alternative_start_index)); } - Ok(()) + Ok(true) } Rule::Seq(elements) => { + let mut result = false; for element in elements.into_iter().rev() { - expand_rule(element, nfa, next_state_index)?; - next_state_index = nfa.start_index(); + if expand_rule(element, nfa, next_state_index, is_sep)? { + result = true; + } + next_state_index = nfa.last_state(); } - Ok(()) + Ok(result) } Rule::Repeat(rule) => { - nfa.states.push(NfaState::Accept); // Placeholder for split - let split_index = nfa.start_index(); - expand_rule(*rule, nfa, split_index)?; - nfa.states[split_index as usize] = NfaState::Split(nfa.start_index(), next_state_index); - Ok(()) + nfa.states.push(NfaState::Accept(0)); // Placeholder for split + let split_index = nfa.last_state(); + if expand_rule(rule, nfa, split_index, is_sep)? 
{ + nfa.states[split_index as usize] = NfaState::Split(nfa.last_state(), next_state_index); + Ok(true) + } else { + Ok(false) + } } - _ => Err(Error::grammar("Unexpected rule type")), + Rule::Blank => Ok(false), + _ => Err(Error::grammar(&format!("Unexpected rule {:?}", rule))), } } pub(super) fn expand_tokens(grammar: ExtractedLexicalGrammar) -> Result { + let mut nfa = Nfa::new(); + + let separator_rule = if grammar.separators.len() > 0 { + Rule::repeat(Rule::choice(grammar.separators)) + } else { + Rule::Blank + }; + let mut variables = Vec::new(); - for variable in grammar.variables { - let mut nfa = Nfa::new(); - expand_rule(variable.rule, &mut nfa, 0)?; + for (i, variable) in grammar.variables.into_iter().enumerate() { + let is_immediate_token = match &variable.rule { + Rule::Metadata { params, .. } => params.is_main_token, + _ => false, + }; + + nfa.states.push(NfaState::Accept(i)); + let last_state = nfa.last_state(); + expand_rule(&variable.rule, &mut nfa, last_state, false)?; + + if !is_immediate_token { + let last_state = nfa.last_state(); + expand_rule(&separator_rule, &mut nfa, last_state, true)?; + } + variables.push(LexicalVariable { name: variable.name, kind: variable.kind, - nfa, + start_state: nfa.last_state(), }); } - let mut separators = Vec::new(); - for separator in grammar.separators { - let mut nfa = Nfa::new(); - expand_rule(separator, &mut nfa, 0)?; - separators.push(nfa); - } - Ok(LexicalGrammar { - variables, - separators, - }) + Ok(LexicalGrammar { nfa, variables }) } #[cfg(test)] mod tests { use super::*; use crate::nfa::NfaCursor; + use crate::grammars::Variable; fn simulate_nfa<'a>(nfa: &'a Nfa, s: &'a str) -> Option<&'a str> { let mut result = None; let mut char_count = 0; let mut cursor = NfaCursor::new(nfa); for c in s.chars() { - if cursor.is_done() { + if cursor.finished_ids().count() > 0 { result = Some(&s[0..char_count]); } if cursor.advance(c) { @@ -223,13 +266,13 @@ mod tests { #[test] fn test_rule_expansion() { struct Row { - rule: Rule, + rules: Vec, examples: Vec<(&'static str, Option<&'static str>)>, } let table = [ Row { - rule: Rule::pattern("a|bc"), + rules: vec![Rule::pattern("a|bc")], examples: vec![ ("a12", Some("a")), ("bc12", Some("bc")), @@ -238,7 +281,7 @@ mod tests { ], }, Row { - rule: Rule::pattern("(a|b|c)d(e|f|g)h?"), + rules: vec![Rule::pattern("(a|b|c)d(e|f|g)h?")], examples: vec![ ("ade1", Some("ade")), ("bdf1", Some("bdf")), @@ -247,11 +290,14 @@ mod tests { ], }, Row { - rule: Rule::pattern("a*"), - examples: vec![("aaa1", Some("aaa")), ("b", Some(""))], + rules: vec![Rule::pattern("a*")], + examples: vec![ + ("aaa1", Some("aaa")), + ("b", Some("")), + ], }, Row { - rule: Rule::pattern("a((bc)+|(de)*)f"), + rules: vec![Rule::pattern("a((bc)+|(de)*)f")], examples: vec![ ("af1", Some("af")), ("adedef1", Some("adedef")), @@ -260,32 +306,51 @@ mod tests { ], }, Row { - rule: Rule::pattern("[a-fA-F0-9]+"), - examples: vec![("A1ff0", Some("A1ff"))], + rules: vec![Rule::pattern("[a-fA-F0-9]+")], + examples: vec![ + ("A1ff0", Some("A1ff")), + ], }, Row { - rule: Rule::pattern("\\w\\d\\s"), - examples: vec![("_0 ", Some("_0 "))], + rules: vec![Rule::pattern("\\w\\d\\s")], + examples: vec![ + ("_0 ", Some("_0 ")), + ], }, Row { - rule: Rule::string("abc"), - examples: vec![("abcd", Some("abc")), ("ab", None)], + rules: vec![Rule::string("abc")], + examples: vec![ + ("abcd", Some("abc")), + ("ab", None) + ], }, Row { - rule: Rule::repeat(Rule::seq(vec![ - Rule::string("{"), - Rule::pattern("[a-f]+"), - Rule::string("}"), - ])), - 
examples: vec![("{a}{", Some("{a}")), ("{a}{d", Some("{a}")), ("ab", None)], + rules: vec![ + Rule::repeat(Rule::seq(vec![ + Rule::string("{"), + Rule::pattern("[a-f]+"), + Rule::string("}"), + ])), + ], + examples: vec![ + ("{a}{", Some("{a}")), + ("{a}{d", Some("{a}")), + ("ab", None), + ], }, ]; - for Row { rule, examples } in table.iter() { - let mut nfa = Nfa::new(); - expand_rule(rule.clone(), &mut nfa, 0).unwrap(); + for Row { rules, examples } in &table { + let grammar = expand_tokens(ExtractedLexicalGrammar { + separators: vec![], + variables: rules + .into_iter() + .map(|rule| Variable::named("", rule.clone())) + .collect(), + }).unwrap(); + for (haystack, needle) in examples.iter() { - assert_eq!(simulate_nfa(&nfa, haystack), *needle); + assert_eq!(simulate_nfa(&grammar.nfa, haystack), *needle); } } } diff --git a/src/prepare_grammar/extract_simple_aliases.rs b/src/prepare_grammar/extract_simple_aliases.rs index a10c7982..8b87ea2e 100644 --- a/src/prepare_grammar/extract_simple_aliases.rs +++ b/src/prepare_grammar/extract_simple_aliases.rs @@ -130,24 +130,24 @@ mod tests { }; let lexical_grammar = LexicalGrammar { + nfa: Nfa::new(), variables: vec![ LexicalVariable { name: "t1".to_string(), kind: VariableType::Anonymous, - nfa: Nfa::new(), + start_state: 0, }, LexicalVariable { name: "t2".to_string(), kind: VariableType::Anonymous, - nfa: Nfa::new(), + start_state: 0, }, LexicalVariable { name: "t3".to_string(), kind: VariableType::Anonymous, - nfa: Nfa::new(), + start_state: 0, } ], - separators: Vec::new(), }; let simple_aliases = extract_simple_aliases(&mut syntax_grammar, &lexical_grammar); diff --git a/src/rules.rs b/src/rules.rs index 5d0af86c..d7234f45 100644 --- a/src/rules.rs +++ b/src/rules.rs @@ -30,7 +30,6 @@ pub(crate) struct MetadataParams { pub is_string: bool, pub is_active: bool, pub is_main_token: bool, - pub is_excluded: bool, pub alias: Option, } From 842421633c1161351ec0ba764be8927d09b15728 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Wed, 12 Dec 2018 20:58:26 -0800 Subject: [PATCH 059/102] Fix bugs in nfa generation --- src/nfa.rs | 95 +++++++---- src/prepare_grammar/expand_tokens.rs | 230 ++++++++++++++++++--------- 2 files changed, 212 insertions(+), 113 deletions(-) diff --git a/src/nfa.rs b/src/nfa.rs index 66861434..bc084ede 100644 --- a/src/nfa.rs +++ b/src/nfa.rs @@ -11,7 +11,7 @@ pub enum CharacterSet { pub enum NfaState { Advance { chars: CharacterSet, - state: u32, + state_id: u32, is_sep: bool, }, Split(u32, u32), @@ -25,7 +25,7 @@ pub struct Nfa { #[derive(Debug)] pub struct NfaCursor<'a> { - indices: Vec, + pub(crate) state_ids: Vec, nfa: &'a Nfa, in_sep: bool, } @@ -96,23 +96,20 @@ impl Nfa { Nfa { states: Vec::new() } } - pub fn last_state(&self) -> u32 { + pub fn last_state_id(&self) -> u32 { self.states.len() as u32 - 1 } pub fn prepend(&mut self, f: impl Fn(u32) -> NfaState) { - self.states.push(f(self.last_state())); + self.states.push(f(self.last_state_id())); } } impl fmt::Debug for Nfa { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "Nfa {{ states: {{")?; + write!(f, "Nfa {{ states: {{\n")?; for (i, state) in self.states.iter().enumerate() { - if i > 0 { - write!(f, ", ")?; - } - write!(f, "{}: {:?}", i, state)?; + write!(f, " {}: {:?},\n", i, state)?; } write!(f, "}} }}")?; Ok(()) @@ -120,23 +117,23 @@ impl fmt::Debug for Nfa { } impl<'a> NfaCursor<'a> { - pub fn new(nfa: &'a Nfa) -> Self { - let mut result = Self { nfa, indices: Vec::new(), in_sep: true }; - result.add_states(&mut vec![nfa.last_state()]); + 
pub fn new(nfa: &'a Nfa, mut states: Vec) -> Self { + let mut result = Self { nfa, state_ids: Vec::new(), in_sep: true }; + result.add_states(&mut states); result } pub fn advance(&mut self, c: char) -> bool { let mut result = false; - let mut new_indices = Vec::new(); + let mut new_state_ids = Vec::new(); let mut any_sep_transitions = false; - for index in &self.indices { - if let NfaState::Advance { chars, state, is_sep } = &self.nfa.states[*index as usize] { - if *is_sep { - any_sep_transitions = true; - } + for current_state_id in &self.state_ids { + if let NfaState::Advance { chars, state_id, is_sep } = &self.nfa.states[*current_state_id as usize] { if chars.contains(c) { - new_indices.push(*state); + if *is_sep { + any_sep_transitions = true; + } + new_state_ids.push(*state_id); result = true; } } @@ -144,30 +141,58 @@ impl<'a> NfaCursor<'a> { if !any_sep_transitions { self.in_sep = false; } - self.indices.clear(); - self.add_states(&mut new_indices); + self.state_ids.clear(); + self.add_states(&mut new_state_ids); result } - pub fn finished_ids<'b>(&'b self) -> impl Iterator + 'b { - self.indices.iter().filter_map(move |index| { - if let NfaState::Accept(i) = self.nfa.states[*index as usize] { - Some(i) - } else { - None + pub fn finished_id(&self) -> Option { + let mut result = None; + for state_id in self.state_ids.iter() { + if let NfaState::Accept(id) = self.nfa.states[*state_id as usize] { + match result { + None => { + result = Some(id) + }, + Some(existing_id) => if id < existing_id { + result = Some(id) + } + } } - }) + } + result } - pub fn add_states(&mut self, new_indices: &mut Vec) { - while let Some(index) = new_indices.pop() { - let state = &self.nfa.states[index as usize]; + pub fn in_separator(&self) -> bool { + self.in_sep + } + + pub fn add_states(&mut self, new_state_ids: &mut Vec) { + let mut i = 0; + while i < new_state_ids.len() { + let state_id = new_state_ids[i]; + let state = &self.nfa.states[state_id as usize]; if let NfaState::Split(left, right) = state { - new_indices.push(*left); - new_indices.push(*right); - } else if let Err(i) = self.indices.binary_search(&index) { - self.indices.insert(i, index); + let mut has_left = false; + let mut has_right = false; + for new_state_id in new_state_ids.iter() { + if *new_state_id == *left { + has_left = true; + } + if *new_state_id == *right { + has_right = true; + } + } + if !has_left { + new_state_ids.push(*left); + } + if !has_right { + new_state_ids.push(*right); + } + } else if let Err(i) = self.state_ids.binary_search(&state_id) { + self.state_ids.insert(i, state_id); } + i += 1; } } } diff --git a/src/prepare_grammar/expand_tokens.rs b/src/prepare_grammar/expand_tokens.rs index 3019b2be..8b8cd03a 100644 --- a/src/prepare_grammar/expand_tokens.rs +++ b/src/prepare_grammar/expand_tokens.rs @@ -39,14 +39,14 @@ fn expand_character_class(item: &ClassSetItem) -> Result { } } -fn expand_regex(ast: &Ast, nfa: &mut Nfa, mut next_state_index: u32, is_sep: bool) -> Result { +fn expand_regex(ast: &Ast, nfa: &mut Nfa, mut next_state_id: u32, is_sep: bool) -> Result { match ast { Ast::Empty(_) => Ok(false), Ast::Flags(_) => Err(Error::regex("Flags are not supported")), Ast::Literal(literal) => { nfa.states.push(NfaState::Advance { chars: CharacterSet::Include(vec![literal.c]), - state: next_state_index, + state_id: next_state_id, is_sep, }); Ok(true) @@ -54,7 +54,7 @@ fn expand_regex(ast: &Ast, nfa: &mut Nfa, mut next_state_index: u32, is_sep: boo Ast::Dot(_) => { nfa.states.push(NfaState::Advance { chars: 
CharacterSet::Exclude(vec!['\n']), - state: next_state_index, + state_id: next_state_id, is_sep, }); Ok(true) @@ -65,7 +65,7 @@ fn expand_regex(ast: &Ast, nfa: &mut Nfa, mut next_state_index: u32, is_sep: boo Class::Perl(class) => { nfa.states.push(NfaState::Advance { chars: expand_perl_character_class(&class.kind), - state: next_state_index, + state_id: next_state_id, is_sep, }); Ok(true) @@ -75,7 +75,7 @@ fn expand_regex(ast: &Ast, nfa: &mut Nfa, mut next_state_index: u32, is_sep: boo let character_set = expand_character_class(&item)?; nfa.states.push(NfaState::Advance { chars: character_set, - state: next_state_index, + state_id: next_state_id, is_sep, }); Ok(true) @@ -87,8 +87,8 @@ fn expand_regex(ast: &Ast, nfa: &mut Nfa, mut next_state_index: u32, is_sep: boo }, Ast::Repetition(repetition) => match repetition.op.kind { RepetitionKind::ZeroOrOne => { - if expand_regex(&repetition.ast, nfa, next_state_index, is_sep)? { - nfa.prepend(|last_state| NfaState::Split(next_state_index, last_state)); + if expand_regex(&repetition.ast, nfa, next_state_id, is_sep)? { + nfa.prepend(|last_state_id| NfaState::Split(next_state_id, last_state_id)); Ok(true) } else { Ok(false) @@ -96,10 +96,10 @@ fn expand_regex(ast: &Ast, nfa: &mut Nfa, mut next_state_index: u32, is_sep: boo } RepetitionKind::OneOrMore => { nfa.states.push(NfaState::Accept(0)); // Placeholder for split - let split_index = nfa.last_state(); - if expand_regex(&repetition.ast, nfa, split_index, is_sep)? { - nfa.states[split_index as usize] = - NfaState::Split(nfa.last_state(), next_state_index); + let split_state_id = nfa.last_state_id(); + if expand_regex(&repetition.ast, nfa, split_state_id, is_sep)? { + nfa.states[split_state_id as usize] = + NfaState::Split(nfa.last_state_id(), next_state_id); Ok(true) } else { nfa.states.pop(); @@ -108,11 +108,11 @@ fn expand_regex(ast: &Ast, nfa: &mut Nfa, mut next_state_index: u32, is_sep: boo } RepetitionKind::ZeroOrMore => { nfa.states.push(NfaState::Accept(0)); // Placeholder for split - let split_index = nfa.last_state(); - if expand_regex(&repetition.ast, nfa, split_index, is_sep)? { - nfa.states[split_index as usize] = - NfaState::Split(nfa.last_state(), next_state_index); - nfa.prepend(|last_state| NfaState::Split(last_state, next_state_index)); + let split_state_id = nfa.last_state_id(); + if expand_regex(&repetition.ast, nfa, split_state_id, is_sep)? { + nfa.states[split_state_id as usize] = + NfaState::Split(nfa.last_state_id(), next_state_id); + nfa.prepend(|last_state_id| NfaState::Split(last_state_id, next_state_id)); Ok(true) } else { Ok(false) @@ -120,47 +120,49 @@ fn expand_regex(ast: &Ast, nfa: &mut Nfa, mut next_state_index: u32, is_sep: boo } RepetitionKind::Range(_) => unimplemented!(), }, - Ast::Group(group) => expand_regex(&group.ast, nfa, nfa.last_state(), is_sep), + Ast::Group(group) => expand_regex(&group.ast, nfa, nfa.last_state_id(), is_sep), Ast::Alternation(alternation) => { - let mut alternative_start_indices = Vec::new(); + let mut alternative_state_ids = Vec::new(); for ast in alternation.asts.iter() { - if expand_regex(&ast, nfa, next_state_index, is_sep)? { - alternative_start_indices.push(nfa.last_state()); + if expand_regex(&ast, nfa, next_state_id, is_sep)? 
{ + alternative_state_ids.push(nfa.last_state_id()); + } else { + alternative_state_ids.push(next_state_id); } } - alternative_start_indices.pop(); - for alternative_start_index in alternative_start_indices { - nfa.prepend(|last_state| NfaState::Split(last_state, alternative_start_index)); + alternative_state_ids.retain(|i| *i != nfa.last_state_id()); + for alternative_state_id in alternative_state_ids { + nfa.prepend(|last_state_id| NfaState::Split(last_state_id, alternative_state_id)); } Ok(true) } Ast::Concat(concat) => { let mut result = false; for ast in concat.asts.iter().rev() { - if expand_regex(&ast, nfa, next_state_index, is_sep)? { + if expand_regex(&ast, nfa, next_state_id, is_sep)? { result = true; } - next_state_index = nfa.last_state(); + next_state_id = nfa.last_state_id(); } Ok(result) } } } -fn expand_rule(rule: &Rule, nfa: &mut Nfa, mut next_state_index: u32, is_sep: bool) -> Result { +fn expand_rule(rule: &Rule, nfa: &mut Nfa, mut next_state_id: u32, is_sep: bool) -> Result { match rule { Rule::Pattern(s) => { let ast = parse::Parser::new() .parse(&s) .map_err(|e| Error::GrammarError(e.to_string()))?; - expand_regex(&ast, nfa, next_state_index, is_sep) + expand_regex(&ast, nfa, next_state_id, is_sep) } Rule::String(s) => { for c in s.chars().rev() { - nfa.prepend(|last_state| { + nfa.prepend(|last_state_id| { NfaState::Advance { chars: CharacterSet::empty().add_char(c), - state: last_state, + state_id: last_state_id, is_sep, } }); @@ -168,33 +170,35 @@ fn expand_rule(rule: &Rule, nfa: &mut Nfa, mut next_state_index: u32, is_sep: bo Ok(s.len() > 0) } Rule::Choice(elements) => { - let mut alternative_start_indices = Vec::new(); + let mut alternative_state_ids = Vec::new(); for element in elements { - if expand_rule(element, nfa, next_state_index, is_sep)? { - alternative_start_indices.push(nfa.last_state()); + if expand_rule(element, nfa, next_state_id, is_sep)? { + alternative_state_ids.push(nfa.last_state_id()); + } else { + alternative_state_ids.push(next_state_id); } } - alternative_start_indices.pop(); - for alternative_start_index in alternative_start_indices { - nfa.prepend(|last_state| NfaState::Split(last_state, alternative_start_index)); + alternative_state_ids.retain(|i| *i != nfa.last_state_id()); + for alternative_state_id in alternative_state_ids { + nfa.prepend(|last_state_id| NfaState::Split(last_state_id, alternative_state_id)); } Ok(true) } Rule::Seq(elements) => { let mut result = false; for element in elements.into_iter().rev() { - if expand_rule(element, nfa, next_state_index, is_sep)? { + if expand_rule(element, nfa, next_state_id, is_sep)? { result = true; } - next_state_index = nfa.last_state(); + next_state_id = nfa.last_state_id(); } Ok(result) } Rule::Repeat(rule) => { nfa.states.push(NfaState::Accept(0)); // Placeholder for split - let split_index = nfa.last_state(); - if expand_rule(rule, nfa, split_index, is_sep)? { - nfa.states[split_index as usize] = NfaState::Split(nfa.last_state(), next_state_index); + let split_state_id = nfa.last_state_id(); + if expand_rule(rule, nfa, split_state_id, is_sep)? 
{ + nfa.states[split_state_id as usize] = NfaState::Split(nfa.last_state_id(), next_state_id); Ok(true) } else { Ok(false) @@ -205,10 +209,11 @@ fn expand_rule(rule: &Rule, nfa: &mut Nfa, mut next_state_index: u32, is_sep: bo } } -pub(super) fn expand_tokens(grammar: ExtractedLexicalGrammar) -> Result { +pub(super) fn expand_tokens(mut grammar: ExtractedLexicalGrammar) -> Result { let mut nfa = Nfa::new(); let separator_rule = if grammar.separators.len() > 0 { + grammar.separators.push(Rule::Blank); Rule::repeat(Rule::choice(grammar.separators)) } else { Rule::Blank @@ -222,18 +227,18 @@ pub(super) fn expand_tokens(grammar: ExtractedLexicalGrammar) -> Result(nfa: &'a Nfa, s: &'a str) -> Option<&'a str> { + fn simulate_nfa<'a>(grammar: &'a LexicalGrammar, s: &'a str) -> Option<(usize, &'a str)> { + let start_states = grammar.variables.iter().map(|v| v.start_state).collect(); + let mut cursor = NfaCursor::new(&grammar.nfa, start_states); + let mut result = None; - let mut char_count = 0; - let mut cursor = NfaCursor::new(nfa); + let mut start_char = 0; + let mut end_char = 0; for c in s.chars() { - if cursor.finished_ids().count() > 0 { - result = Some(&s[0..char_count]); + if let Some(id) = cursor.finished_id() { + result = Some((id, &s[start_char..end_char])); } if cursor.advance(c) { - char_count += 1; + end_char += 1; + if cursor.in_separator() { + start_char = end_char; + } } else { break; } } + + if let Some(id) = cursor.finished_id() { + result = Some((id, &s[start_char..end_char])); + } + result } @@ -267,63 +283,74 @@ mod tests { fn test_rule_expansion() { struct Row { rules: Vec, - examples: Vec<(&'static str, Option<&'static str>)>, + separators: Vec, + examples: Vec<(&'static str, Option<(usize, &'static str)>)>, } let table = [ - Row { - rules: vec![Rule::pattern("a|bc")], - examples: vec![ - ("a12", Some("a")), - ("bc12", Some("bc")), - ("b12", None), - ("c12", None), - ], - }, + // regex with sequences and alternatives Row { rules: vec![Rule::pattern("(a|b|c)d(e|f|g)h?")], + separators: vec![], examples: vec![ - ("ade1", Some("ade")), - ("bdf1", Some("bdf")), - ("bdfh1", Some("bdfh")), + ("ade1", Some((0, "ade"))), + ("bdf1", Some((0, "bdf"))), + ("bdfh1", Some((0, "bdfh"))), ("ad1", None), ], }, + + // regex with repeats Row { rules: vec![Rule::pattern("a*")], + separators: vec![], examples: vec![ - ("aaa1", Some("aaa")), - ("b", Some("")), + ("aaa1", Some((0, "aaa"))), + ("b", Some((0, ""))), ], }, + + // regex with repeats in sequences Row { rules: vec![Rule::pattern("a((bc)+|(de)*)f")], + separators: vec![], examples: vec![ - ("af1", Some("af")), - ("adedef1", Some("adedef")), - ("abcbcbcf1", Some("abcbcbcf")), + ("af1", Some((0, "af"))), + ("adedef1", Some((0, "adedef"))), + ("abcbcbcf1", Some((0, "abcbcbcf"))), ("a", None), ], }, + + // regex with character ranges Row { rules: vec![Rule::pattern("[a-fA-F0-9]+")], + separators: vec![], examples: vec![ - ("A1ff0", Some("A1ff")), + ("A1ff0.", Some((0, "A1ff0"))), ], }, + + // regex with perl character classes Row { rules: vec![Rule::pattern("\\w\\d\\s")], + separators: vec![], examples: vec![ - ("_0 ", Some("_0 ")), + ("_0 ", Some((0, "_0 "))), ], }, + + // string Row { rules: vec![Rule::string("abc")], + separators: vec![], examples: vec![ - ("abcd", Some("abc")), + ("abcd", Some((0, "abc"))), ("ab", None) ], }, + + // complex rule containing strings and regexes Row { rules: vec![ Rule::repeat(Rule::seq(vec![ @@ -332,17 +359,64 @@ mod tests { Rule::string("}"), ])), ], + separators: vec![], examples: vec![ - ("{a}{", 
Some("{a}")), - ("{a}{d", Some("{a}")), + ("{a}{", Some((0, "{a}"))), + ("{a}{d", Some((0, "{a}"))), ("ab", None), ], }, + + // longest match rule + Row { + rules: vec![ + Rule::pattern("a|bc"), + Rule::pattern("aa"), + Rule::pattern("bcd"), + ], + separators: vec![], + examples: vec![ + ("a.", Some((0, "a"))), + ("bc.", Some((0, "bc"))), + ("aa.", Some((1, "aa"))), + ("bcd?", Some((2, "bcd"))), + ("b.", None), + ("c.", None), + ], + }, + + // regexes with alternatives including the empty string + Row { + rules: vec![Rule::pattern("a(b|)+c")], + separators: vec![], + examples: vec![ + ("ac.", Some((0, "ac"))), + ("abc.", Some((0, "abc"))), + ("abbc.", Some((0, "abbc"))), + ], + }, + + // separators + Row { + rules: vec![ + Rule::pattern("[a-f]+"), + ], + separators: vec![ + Rule::string("\\\n"), + Rule::pattern("\\s"), + ], + examples: vec![ + (" a", Some((0, "a"))), + (" \nb", Some((0, "b"))), + (" \\a", None), + (" \\\na", Some((0, "a"))), + ], + }, ]; - for Row { rules, examples } in &table { + for Row { rules, separators, examples } in &table { let grammar = expand_tokens(ExtractedLexicalGrammar { - separators: vec![], + separators: separators.clone(), variables: rules .into_iter() .map(|rule| Variable::named("", rule.clone())) @@ -350,7 +424,7 @@ mod tests { }).unwrap(); for (haystack, needle) in examples.iter() { - assert_eq!(simulate_nfa(&grammar.nfa, haystack), *needle); + assert_eq!(simulate_nfa(&grammar, haystack), *needle); } } } From 5fa586f7c92916db288e258c91a0424e3af04f30 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Wed, 12 Dec 2018 21:01:41 -0800 Subject: [PATCH 060/102] Format expand_tokens file --- src/prepare_grammar/expand_tokens.rs | 281 +++++++++++++-------------- 1 file changed, 130 insertions(+), 151 deletions(-) diff --git a/src/prepare_grammar/expand_tokens.rs b/src/prepare_grammar/expand_tokens.rs index 8b8cd03a..7a1d2f4d 100644 --- a/src/prepare_grammar/expand_tokens.rs +++ b/src/prepare_grammar/expand_tokens.rs @@ -5,37 +5,98 @@ use crate::nfa::{CharacterSet, Nfa, NfaState}; use crate::rules::Rule; use regex_syntax::ast::{parse, Ast, Class, ClassPerlKind, ClassSet, ClassSetItem, RepetitionKind}; -fn expand_perl_character_class(item: &ClassPerlKind) -> CharacterSet { - match item { - ClassPerlKind::Digit => CharacterSet::empty().add_range('0', '9'), - ClassPerlKind::Space => CharacterSet::empty() - .add_char(' ') - .add_char('\t') - .add_char('\r') - .add_char('\n'), - ClassPerlKind::Word => CharacterSet::empty() - .add_char('_') - .add_range('A', 'Z') - .add_range('a', 'z') - .add_range('0', '9'), +pub(super) fn expand_tokens(mut grammar: ExtractedLexicalGrammar) -> Result { + let mut nfa = Nfa::new(); + + let separator_rule = if grammar.separators.len() > 0 { + grammar.separators.push(Rule::Blank); + Rule::repeat(Rule::choice(grammar.separators)) + } else { + Rule::Blank + }; + + let mut variables = Vec::new(); + for (i, variable) in grammar.variables.into_iter().enumerate() { + let is_immediate_token = match &variable.rule { + Rule::Metadata { params, .. 
} => params.is_main_token, + _ => false, + }; + + nfa.states.push(NfaState::Accept(i)); + let last_state_id = nfa.last_state_id(); + expand_rule(&variable.rule, &mut nfa, last_state_id, false)?; + + if !is_immediate_token { + let last_state_id = nfa.last_state_id(); + expand_rule(&separator_rule, &mut nfa, last_state_id, true)?; + } + + variables.push(LexicalVariable { + name: variable.name, + kind: variable.kind, + start_state: nfa.last_state_id(), + }); } + + Ok(LexicalGrammar { nfa, variables }) } -fn expand_character_class(item: &ClassSetItem) -> Result { - match item { - ClassSetItem::Empty(_) => Ok(CharacterSet::Include(Vec::new())), - ClassSetItem::Literal(literal) => Ok(CharacterSet::Include(vec![literal.c])), - ClassSetItem::Range(range) => { - Ok(CharacterSet::empty().add_range(range.start.c, range.end.c)) +fn expand_rule(rule: &Rule, nfa: &mut Nfa, mut next_state_id: u32, is_sep: bool) -> Result { + match rule { + Rule::Pattern(s) => { + let ast = parse::Parser::new() + .parse(&s) + .map_err(|e| Error::GrammarError(e.to_string()))?; + expand_regex(&ast, nfa, next_state_id, is_sep) } - ClassSetItem::Union(union) => { - let mut result = CharacterSet::empty(); - for item in &union.items { - result = result.add(expand_character_class(&item)?); + Rule::String(s) => { + for c in s.chars().rev() { + nfa.prepend(|last_state_id| NfaState::Advance { + chars: CharacterSet::empty().add_char(c), + state_id: last_state_id, + is_sep, + }); + } + Ok(s.len() > 0) + } + Rule::Choice(elements) => { + let mut alternative_state_ids = Vec::new(); + for element in elements { + if expand_rule(element, nfa, next_state_id, is_sep)? { + alternative_state_ids.push(nfa.last_state_id()); + } else { + alternative_state_ids.push(next_state_id); + } + } + alternative_state_ids.retain(|i| *i != nfa.last_state_id()); + for alternative_state_id in alternative_state_ids { + nfa.prepend(|last_state_id| NfaState::Split(last_state_id, alternative_state_id)); + } + Ok(true) + } + Rule::Seq(elements) => { + let mut result = false; + for element in elements.into_iter().rev() { + if expand_rule(element, nfa, next_state_id, is_sep)? { + result = true; + } + next_state_id = nfa.last_state_id(); } Ok(result) } - _ => Err(Error::regex("Unsupported character class syntax")), + Rule::Repeat(rule) => { + nfa.states.push(NfaState::Accept(0)); // Placeholder for split + let split_state_id = nfa.last_state_id(); + if expand_rule(rule, nfa, split_state_id, is_sep)? 
{ + nfa.states[split_state_id as usize] = + NfaState::Split(nfa.last_state_id(), next_state_id); + Ok(true) + } else { + Ok(false) + } + } + Rule::Blank => Ok(false), + _ => Err(Error::grammar(&format!("Unexpected rule {:?}", rule))), } } @@ -149,107 +210,45 @@ fn expand_regex(ast: &Ast, nfa: &mut Nfa, mut next_state_id: u32, is_sep: bool) } } -fn expand_rule(rule: &Rule, nfa: &mut Nfa, mut next_state_id: u32, is_sep: bool) -> Result { - match rule { - Rule::Pattern(s) => { - let ast = parse::Parser::new() - .parse(&s) - .map_err(|e| Error::GrammarError(e.to_string()))?; - expand_regex(&ast, nfa, next_state_id, is_sep) +fn expand_character_class(item: &ClassSetItem) -> Result { + match item { + ClassSetItem::Empty(_) => Ok(CharacterSet::Include(Vec::new())), + ClassSetItem::Literal(literal) => Ok(CharacterSet::Include(vec![literal.c])), + ClassSetItem::Range(range) => { + Ok(CharacterSet::empty().add_range(range.start.c, range.end.c)) } - Rule::String(s) => { - for c in s.chars().rev() { - nfa.prepend(|last_state_id| { - NfaState::Advance { - chars: CharacterSet::empty().add_char(c), - state_id: last_state_id, - is_sep, - } - }); - } - Ok(s.len() > 0) - } - Rule::Choice(elements) => { - let mut alternative_state_ids = Vec::new(); - for element in elements { - if expand_rule(element, nfa, next_state_id, is_sep)? { - alternative_state_ids.push(nfa.last_state_id()); - } else { - alternative_state_ids.push(next_state_id); - } - } - alternative_state_ids.retain(|i| *i != nfa.last_state_id()); - for alternative_state_id in alternative_state_ids { - nfa.prepend(|last_state_id| NfaState::Split(last_state_id, alternative_state_id)); - } - Ok(true) - } - Rule::Seq(elements) => { - let mut result = false; - for element in elements.into_iter().rev() { - if expand_rule(element, nfa, next_state_id, is_sep)? { - result = true; - } - next_state_id = nfa.last_state_id(); + ClassSetItem::Union(union) => { + let mut result = CharacterSet::empty(); + for item in &union.items { + result = result.add(expand_character_class(&item)?); } Ok(result) } - Rule::Repeat(rule) => { - nfa.states.push(NfaState::Accept(0)); // Placeholder for split - let split_state_id = nfa.last_state_id(); - if expand_rule(rule, nfa, split_state_id, is_sep)? { - nfa.states[split_state_id as usize] = NfaState::Split(nfa.last_state_id(), next_state_id); - Ok(true) - } else { - Ok(false) - } - } - Rule::Blank => Ok(false), - _ => Err(Error::grammar(&format!("Unexpected rule {:?}", rule))), + _ => Err(Error::regex("Unsupported character class syntax")), } } -pub(super) fn expand_tokens(mut grammar: ExtractedLexicalGrammar) -> Result { - let mut nfa = Nfa::new(); - - let separator_rule = if grammar.separators.len() > 0 { - grammar.separators.push(Rule::Blank); - Rule::repeat(Rule::choice(grammar.separators)) - } else { - Rule::Blank - }; - - let mut variables = Vec::new(); - for (i, variable) in grammar.variables.into_iter().enumerate() { - let is_immediate_token = match &variable.rule { - Rule::Metadata { params, .. 
} => params.is_main_token, - _ => false, - }; - - nfa.states.push(NfaState::Accept(i)); - let last_state_id = nfa.last_state_id(); - expand_rule(&variable.rule, &mut nfa, last_state_id, false)?; - - if !is_immediate_token { - let last_state_id = nfa.last_state_id(); - expand_rule(&separator_rule, &mut nfa, last_state_id, true)?; - } - - variables.push(LexicalVariable { - name: variable.name, - kind: variable.kind, - start_state: nfa.last_state_id(), - }); +fn expand_perl_character_class(item: &ClassPerlKind) -> CharacterSet { + match item { + ClassPerlKind::Digit => CharacterSet::empty().add_range('0', '9'), + ClassPerlKind::Space => CharacterSet::empty() + .add_char(' ') + .add_char('\t') + .add_char('\r') + .add_char('\n'), + ClassPerlKind::Word => CharacterSet::empty() + .add_char('_') + .add_range('A', 'Z') + .add_range('a', 'z') + .add_range('0', '9'), } - - Ok(LexicalGrammar { nfa, variables }) } #[cfg(test)] mod tests { use super::*; - use crate::nfa::NfaCursor; use crate::grammars::Variable; + use crate::nfa::NfaCursor; fn simulate_nfa<'a>(grammar: &'a LexicalGrammar, s: &'a str) -> Option<(usize, &'a str)> { let start_states = grammar.variables.iter().map(|v| v.start_state).collect(); @@ -299,17 +298,12 @@ mod tests { ("ad1", None), ], }, - // regex with repeats Row { rules: vec![Rule::pattern("a*")], separators: vec![], - examples: vec![ - ("aaa1", Some((0, "aaa"))), - ("b", Some((0, ""))), - ], + examples: vec![("aaa1", Some((0, "aaa"))), ("b", Some((0, "")))], }, - // regex with repeats in sequences Row { rules: vec![Rule::pattern("a((bc)+|(de)*)f")], @@ -321,44 +315,31 @@ mod tests { ("a", None), ], }, - // regex with character ranges Row { rules: vec![Rule::pattern("[a-fA-F0-9]+")], separators: vec![], - examples: vec![ - ("A1ff0.", Some((0, "A1ff0"))), - ], + examples: vec![("A1ff0.", Some((0, "A1ff0")))], }, - // regex with perl character classes Row { rules: vec![Rule::pattern("\\w\\d\\s")], separators: vec![], - examples: vec![ - ("_0 ", Some((0, "_0 "))), - ], + examples: vec![("_0 ", Some((0, "_0 ")))], }, - // string Row { rules: vec![Rule::string("abc")], separators: vec![], - examples: vec![ - ("abcd", Some((0, "abc"))), - ("ab", None) - ], + examples: vec![("abcd", Some((0, "abc"))), ("ab", None)], }, - // complex rule containing strings and regexes Row { - rules: vec![ - Rule::repeat(Rule::seq(vec![ - Rule::string("{"), - Rule::pattern("[a-f]+"), - Rule::string("}"), - ])), - ], + rules: vec![Rule::repeat(Rule::seq(vec![ + Rule::string("{"), + Rule::pattern("[a-f]+"), + Rule::string("}"), + ]))], separators: vec![], examples: vec![ ("{a}{", Some((0, "{a}"))), @@ -366,7 +347,6 @@ mod tests { ("ab", None), ], }, - // longest match rule Row { rules: vec![ @@ -384,8 +364,7 @@ mod tests { ("c.", None), ], }, - - // regexes with alternatives including the empty string + // regex with an alternative including the empty string Row { rules: vec![Rule::pattern("a(b|)+c")], separators: vec![], @@ -395,16 +374,10 @@ mod tests { ("abbc.", Some((0, "abbc"))), ], }, - // separators Row { - rules: vec![ - Rule::pattern("[a-f]+"), - ], - separators: vec![ - Rule::string("\\\n"), - Rule::pattern("\\s"), - ], + rules: vec![Rule::pattern("[a-f]+")], + separators: vec![Rule::string("\\\n"), Rule::pattern("\\s")], examples: vec![ (" a", Some((0, "a"))), (" \nb", Some((0, "b"))), @@ -414,14 +387,20 @@ mod tests { }, ]; - for Row { rules, separators, examples } in &table { + for Row { + rules, + separators, + examples, + } in &table + { let grammar = expand_tokens(ExtractedLexicalGrammar 
{ separators: separators.clone(), variables: rules .into_iter() .map(|rule| Variable::named("", rule.clone())) .collect(), - }).unwrap(); + }) + .unwrap(); for (haystack, needle) in examples.iter() { assert_eq!(simulate_nfa(&grammar, haystack), *needle); From 494329c93b4c54b583e68634132e1f45b383e91f Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 13 Dec 2018 10:08:25 -0800 Subject: [PATCH 061/102] Add Parser.set_included_ranges and Node.range --- src/lib.rs | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/src/lib.rs b/src/lib.rs index ad31d3c4..98d2234e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -35,6 +35,14 @@ pub struct Point { pub column: usize, } +#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)] +pub struct Range { + pub start_byte: usize, + pub end_byte: usize, + pub start_point: Point, + pub end_point: Point, +} + #[derive(Clone, Copy, Debug, PartialEq, Eq)] pub struct InputEdit { pub start_byte: usize, @@ -252,6 +260,14 @@ impl Parser { unsafe { ffi::ts_parser_set_operation_limit(self.0, limit) } } + pub fn set_included_ranges(&mut self, ranges: &[Range]) { + let ts_ranges: Vec = + ranges.iter().cloned().map(|range| range.into()).collect(); + unsafe { + ffi::ts_parser_set_included_ranges(self.0, ts_ranges.as_ptr(), ts_ranges.len() as u32) + }; + } + fn parse_utf8_ptr (*const u8, usize)>( &mut self, input: &mut T, @@ -421,6 +437,15 @@ impl<'tree> Node<'tree> { unsafe { ffi::ts_node_end_byte(self.0) as usize } } + pub fn range(&self) -> Range { + Range { + start_byte: self.start_byte(), + end_byte: self.end_byte(), + start_point: self.start_position(), + end_point: self.end_position(), + } + } + pub fn start_position(&self) -> Point { let result = unsafe { ffi::ts_node_start_point(self.0) }; result.into() @@ -677,6 +702,17 @@ impl From for Point { } } +impl Into for Range { + fn into(self) -> ffi::TSRange { + ffi::TSRange { + start_byte: self.start_byte as u32, + end_byte: self.end_byte as u32, + start_point: self.start_point.into(), + end_point: self.end_point.into(), + } + } +} + impl PropertySheet
<P>
{ pub fn new(language: Language, json: &str) -> Result { #[derive(Deserialize, Debug)] From 4a361fbb3fafa41ffa1247501f8199938e5aab6c Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 13 Dec 2018 10:08:50 -0800 Subject: [PATCH 062/102] Implement Copy for Node --- src/lib.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/lib.rs b/src/lib.rs index 98d2234e..428e8101 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -77,6 +77,7 @@ pub struct PropertySheet> { text_regexes: Vec, } +#[derive(Clone, Copy)] pub struct Node<'a>(ffi::TSNode, PhantomData<&'a ()>); pub struct Parser(*mut ffi::TSParser); From bdd3f20522eefe01831ad9cd74002dfe95de20d1 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 13 Dec 2018 16:30:40 -0800 Subject: [PATCH 063/102] Add PropertySheet::map method --- src/lib.rs | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 428e8101..0a53e320 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -71,7 +71,7 @@ pub enum PropertySheetError { InvalidRegex(regex::Error) } -pub struct PropertySheet> { +pub struct PropertySheet
<P = HashMap<String, String>
> { states: Vec<PropertyState>, property_sets: Vec
<P>
, text_regexes: Vec, @@ -86,7 +86,7 @@ pub struct Tree(*mut ffi::TSTree); pub struct TreeCursor<'a>(ffi::TSTreeCursor, PhantomData<&'a ()>); -pub struct TreePropertyCursor<'a, P: 'a + DeserializeOwned> { +pub struct TreePropertyCursor<'a, P> { cursor: TreeCursor<'a>, state_stack: Vec, child_index_stack: Vec, @@ -370,7 +370,7 @@ impl Tree { self.root_node().walk() } - pub fn walk_with_properties<'a, P: DeserializeOwned>( + pub fn walk_with_properties<'a, P>( &'a self, property_sheet: &'a PropertySheet
<P>
, source: &'a str, @@ -574,7 +574,7 @@ impl<'a> Drop for TreeCursor<'a> { } } -impl<'a, P: DeserializeOwned> TreePropertyCursor<'a, P> { +impl<'a, P> TreePropertyCursor<'a, P> { fn new(tree: &'a Tree, property_sheet: &'a PropertySheet
<P>
, source: &'a str) -> Self { let mut result = Self { cursor: tree.root_node().walk(), @@ -714,8 +714,11 @@ impl Into for Range { } } -impl PropertySheet
<P>
{ - pub fn new(language: Language, json: &str) -> Result<Self, PropertySheetError> { +impl
<P>
PropertySheet
<P>
{ + pub fn new(language: Language, json: &str) -> Result + where + P: DeserializeOwned, + { #[derive(Deserialize, Debug)] struct PropertyTransitionJSON { #[serde(rename = "type")] @@ -787,6 +790,21 @@ impl PropertySheet
<P>
{ text_regexes, }) } + + pub fn map(self, mut f: F) -> Result, E> + where + F: FnMut(P) -> Result, + { + let mut property_sets = Vec::with_capacity(self.property_sets.len()); + for set in self.property_sets { + property_sets.push(f(set)?); + } + Ok(PropertySheet { + states: self.states, + text_regexes: self.text_regexes, + property_sets, + }) + } } #[cfg(test)] From 6d3835d292e7bc37965ad5623c3688c4862ee4b1 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 13 Dec 2018 16:32:10 -0800 Subject: [PATCH 064/102] Add Node::children method --- src/lib.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/lib.rs b/src/lib.rs index 0a53e320..f1a83203 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -465,6 +465,12 @@ impl<'tree> Node<'tree> { unsafe { ffi::ts_node_child_count(self.0) as usize } } + pub fn children<'a>(&'a self) -> impl Iterator> + 'a { + (0..self.child_count()) + .into_iter() + .map(move |i| self.child(i).unwrap()) + } + pub fn named_child<'a>(&'a self, i: usize) -> Option { Self::new(unsafe { ffi::ts_node_named_child(self.0, i as u32) }) } From 3f1fc65a2736a573920c4139a844d99187ebb894 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 13 Dec 2018 16:32:22 -0800 Subject: [PATCH 065/102] Auto-format lib.rs --- src/lib.rs | 59 +++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 43 insertions(+), 16 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index f1a83203..65a57d16 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -2,12 +2,12 @@ mod ffi; #[macro_use] extern crate serde_derive; -extern crate serde_json; extern crate regex; extern crate serde; +extern crate serde_json; -use serde::de::DeserializeOwned; use regex::Regex; +use serde::de::DeserializeOwned; use std::collections::HashMap; use std::ffi::CStr; use std::fmt; @@ -68,7 +68,7 @@ struct PropertyState { #[derive(Debug)] pub enum PropertySheetError { InvalidJSON(serde_json::Error), - InvalidRegex(regex::Error) + InvalidRegex(regex::Error), } pub struct PropertySheet
<P = HashMap<String, String>
> { @@ -187,7 +187,16 @@ impl Parser { pub fn parse_str(&mut self, input: &str, old_tree: Option<&Tree>) -> Option { let bytes = input.as_bytes(); - self.parse_utf8(&mut |offset, _| &bytes[offset..], old_tree) + self.parse_utf8( + &mut |offset, _| { + if offset < bytes.len() { + &bytes[offset..] + } else { + &[] + } + }, + old_tree, + ) } pub fn parse_utf8<'a, T: FnMut(usize, Point) -> &'a [u8]>( @@ -565,7 +574,8 @@ impl<'a> TreeCursor<'a> { } pub fn goto_first_child_for_index(&mut self, index: usize) -> Option { - let result = unsafe { ffi::ts_tree_cursor_goto_first_child_for_byte(&mut self.0, index as u32) }; + let result = + unsafe { ffi::ts_tree_cursor_goto_first_child_for_byte(&mut self.0, index as u32) }; if result < 0 { None } else { @@ -645,7 +655,12 @@ impl<'a, P> TreePropertyCursor<'a, P> { } } - fn next_state(&self, state: &PropertyState, node_kind_id: u16, node_child_index: usize) -> usize { + fn next_state( + &self, + state: &PropertyState, + node_kind_id: u16, + node_child_index: usize, + ) -> usize { state .transitions .get(&node_kind_id) @@ -748,8 +763,8 @@ impl
<P>
PropertySheet
<P>
{ property_sets: Vec
<P>
, } - let input: PropertySheetJSON
<P>
= serde_json::from_str(json) - .map_err(PropertySheetError::InvalidJSON)?; + let input: PropertySheetJSON
<P>
= + serde_json::from_str(json).map_err(PropertySheetError::InvalidJSON)?; let mut states = Vec::new(); let mut text_regexes = Vec::new(); let mut text_regex_patterns = Vec::new(); @@ -759,11 +774,15 @@ impl
<P>
PropertySheet
<P>
{ let node_kind_count = language.node_kind_count(); for transition in state.transitions.iter() { let text_regex_index = if let Some(regex_pattern) = transition.text.as_ref() { - if let Some(index) = text_regex_patterns.iter().position(|r| *r == regex_pattern) { + if let Some(index) = + text_regex_patterns.iter().position(|r| *r == regex_pattern) + { Some(index) } else { text_regex_patterns.push(regex_pattern); - text_regexes.push(Regex::new(®ex_pattern).map_err(PropertySheetError::InvalidRegex)?); + text_regexes.push( + Regex::new(®ex_pattern).map_err(PropertySheetError::InvalidRegex)?, + ); Some(text_regexes.len() - 1) } } else { @@ -771,9 +790,8 @@ impl
<P>
PropertySheet
<P>
{ }; for i in 0..(node_kind_count as u16) { - if - transition.kind == language.node_kind_for_id(i) && - transition.named == language.node_kind_is_named(i) + if transition.kind == language.node_kind_for_id(i) + && transition.named == language.node_kind_is_named(i) { let entry = transitions.entry(i).or_insert(Vec::new()); entry.push(PropertyTransition { @@ -928,7 +946,10 @@ mod tests { define: Option, } - let empty_properties = Properties { reference: None, define: None }; + let empty_properties = Properties { + reference: None, + define: None, + }; let property_sheet = PropertySheet::::new( rust(), @@ -1018,7 +1039,10 @@ mod tests { assert!(cursor.goto_first_child()); assert_eq!(cursor.node().kind(), "identifier"); - assert_eq!(cursor.node_properties().reference, Some("function".to_owned())); + assert_eq!( + cursor.node_properties().reference, + Some("function".to_owned()) + ); } #[test] @@ -1097,7 +1121,10 @@ mod tests { assert!(cursor.goto_first_child()); assert_eq!(cursor.node().kind(), "identifier"); - assert_eq!(cursor.node_properties().scope, Some("constructor".to_owned())); + assert_eq!( + cursor.node_properties().scope, + Some("constructor".to_owned()) + ); } #[test] From d79203f58c7e3bb06232385a6da701ed5dfde739 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 13 Dec 2018 16:42:46 -0800 Subject: [PATCH 066/102] Add test script --- script/test.sh | 3 +++ 1 file changed, 3 insertions(+) create mode 100755 script/test.sh diff --git a/script/test.sh b/script/test.sh new file mode 100755 index 00000000..eb6183c0 --- /dev/null +++ b/script/test.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +RUST_TREE_SITTER_TEST=1 cargo test $@ From 7bd9eaa97065c3153ae44d1f219d3bfc741e82a6 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 13 Dec 2018 16:43:44 -0800 Subject: [PATCH 067/102] 0.3.5 --- Cargo.toml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index fde4fd31..7f0458ec 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,9 +1,8 @@ [package] name = "tree-sitter" description = "Rust bindings to the Tree-sitter parsing library" -version = "0.3.4" +version = "0.3.5" authors = ["Max Brunsfeld "] -build = "build.rs" license = "MIT" readme = "README.md" keywords = ["incremental", "parsing"] From 889f232b4ca2cbdc932510bb75da6f686059eceb Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Tue, 18 Dec 2018 16:05:36 -0800 Subject: [PATCH 068/102] Implement variable inlining --- Cargo.lock | 15 +- Cargo.toml | 3 +- src/build_tables/inline_variables.rs | 318 +++++++++++++++++++++++++++ src/build_tables/item.rs | 213 ++++++++++++++++-- src/build_tables/mod.rs | 1 + src/grammars.rs | 12 + src/main.rs | 1 + src/parse_grammar.rs | 1 - src/rules.rs | 34 ++- 9 files changed, 567 insertions(+), 31 deletions(-) create mode 100644 src/build_tables/inline_variables.rs diff --git a/Cargo.lock b/Cargo.lock index d5109fb7..410580fa 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -67,11 +67,6 @@ name = "bitflags" version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -[[package]] -name = "bitvec" -version = "0.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" - [[package]] name = "blake2-rfc" version = "0.2.18" @@ -461,16 +456,17 @@ dependencies = [ name = "rust-tree-sitter-cli" version = "0.1.0" dependencies = [ - "bitvec 0.8.0 (registry+https://github.com/rust-lang/crates.io-index)", "clap 2.32.0 (registry+https://github.com/rust-lang/crates.io-index)", "dirs 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)", "ignore 0.4.4 
(registry+https://github.com/rust-lang/crates.io-index)", + "lazy_static 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)", "libloading 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)", "regex-syntax 0.6.4 (registry+https://github.com/rust-lang/crates.io-index)", "rusqlite 0.14.0 (registry+https://github.com/rust-lang/crates.io-index)", "serde 1.0.80 (registry+https://github.com/rust-lang/crates.io-index)", "serde_derive 1.0.80 (registry+https://github.com/rust-lang/crates.io-index)", "serde_json 1.0.33 (registry+https://github.com/rust-lang/crates.io-index)", + "smallbitvec 2.3.0 (registry+https://github.com/rust-lang/crates.io-index)", "tree-sitter 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)", ] @@ -548,6 +544,11 @@ dependencies = [ "serde 1.0.80 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "smallbitvec" +version = "2.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" + [[package]] name = "smallvec" version = "0.6.7" @@ -729,7 +730,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index" "checksum backtrace 0.3.9 (registry+https://github.com/rust-lang/crates.io-index)" = "89a47830402e9981c5c41223151efcced65a0510c13097c769cede7efb34782a" "checksum backtrace-sys 0.1.24 (registry+https://github.com/rust-lang/crates.io-index)" = "c66d56ac8dabd07f6aacdaf633f4b8262f5b3601a810a0dcddffd5c22c69daa0" "checksum bitflags 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)" = "228047a76f468627ca71776ecdebd732a3423081fcf5125585bcd7c49886ce12" -"checksum bitvec 0.8.0 (registry+https://github.com/rust-lang/crates.io-index)" = "e37e2176261200377c7cde4c6de020394174df556c356f965e4bc239f5ce1c5a" "checksum blake2-rfc 0.2.18 (registry+https://github.com/rust-lang/crates.io-index)" = "5d6d530bdd2d52966a6d03b7a964add7ae1a288d25214066fd4b600f0f796400" "checksum cc 1.0.25 (registry+https://github.com/rust-lang/crates.io-index)" = "f159dfd43363c4d08055a07703eb7a3406b0dac4d0584d96965a3262db3c9d16" "checksum cfg-if 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "082bb9b28e00d3c9d39cc03e64ce4cea0f1bb9b3fde493f0cbc008472d22bdf4" @@ -787,6 +787,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" "checksum serde 1.0.80 (registry+https://github.com/rust-lang/crates.io-index)" = "15c141fc7027dd265a47c090bf864cf62b42c4d228bbcf4e51a0c9e2b0d3f7ef" "checksum serde_derive 1.0.80 (registry+https://github.com/rust-lang/crates.io-index)" = "225de307c6302bec3898c51ca302fc94a7a1697ef0845fcee6448f33c032249c" "checksum serde_json 1.0.33 (registry+https://github.com/rust-lang/crates.io-index)" = "c37ccd6be3ed1fdf419ee848f7c758eb31b054d7cd3ae3600e3bae0adf569811" +"checksum smallbitvec 2.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "1764fe2b30ee783bfe3b9b37b2649d8d590b3148bb12e0079715d4d5c673562e" "checksum smallvec 0.6.7 (registry+https://github.com/rust-lang/crates.io-index)" = "b73ea3738b47563803ef814925e69be00799a8c07420be8b996f8e98fb2336db" "checksum stable_deref_trait 1.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "dba1a27d3efae4351c8051072d619e3ade2820635c3958d826bfea39d59b54c8" "checksum strsim 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)" = "bb4f380125926a99e52bc279241539c018323fab05ad6368b56f93d9369ff550" diff --git a/Cargo.toml b/Cargo.toml index 93a49d2c..f3880a1c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -5,7 +5,8 @@ authors = ["Max Brunsfeld "] edition = "2018" [dependencies] -bitvec = "0.8" +lazy_static = 
"1.2.0" +smallbitvec = "2.3.0" clap = "2.32" dirs = "1.0.2" ignore = "0.4.4" diff --git a/src/build_tables/inline_variables.rs b/src/build_tables/inline_variables.rs new file mode 100644 index 00000000..d201519f --- /dev/null +++ b/src/build_tables/inline_variables.rs @@ -0,0 +1,318 @@ +use super::item::ParseItem; +use crate::grammars::{Production, SyntaxGrammar}; +use std::collections::HashMap; + +pub(crate) struct InlinedProductionMap { + pub inlined_productions: Vec, + item_map: HashMap>, +} + +impl InlinedProductionMap { + pub fn new(grammar: &SyntaxGrammar) -> Self { + let mut result = Self { + inlined_productions: Vec::new(), + item_map: HashMap::new(), + }; + + let mut items_to_process = Vec::new(); + for (variable_index, variable) in grammar.variables.iter().enumerate() { + for production_index in 0..variable.productions.len() { + items_to_process.push(ParseItem::Normal { + variable_index: variable_index as u32, + production_index: production_index as u32, + step_index: 0, + }); + while !items_to_process.is_empty() { + let mut i = 0; + while i < items_to_process.len() { + let item = &items_to_process[i]; + if let Some(step) = item.step(grammar, &result) { + if grammar.variables_to_inline.contains(&step.symbol) { + let inlined_items = result + .inline(*item, grammar) + .into_iter() + .map(|production_index| ParseItem::Inlined { + variable_index: item.variable_index(), + production_index: *production_index, + step_index: item.step_index() as u32, + }) + .collect::>(); + items_to_process.splice(i..i + 1, inlined_items); + } else { + items_to_process[i] = item.successor(); + i += 1; + } + } else { + items_to_process.remove(i); + } + } + } + } + } + + result + } + + pub fn inlined_items<'a>( + &'a self, + item: ParseItem, + ) -> Option + 'a> { + self.item_map.get(&item).map(|production_indices| { + production_indices + .iter() + .cloned() + .map(move |production_index| ParseItem::Inlined { + variable_index: item.variable_index(), + production_index, + step_index: item.step_index() as u32, + }) + }) + } + + fn inline(&mut self, item: ParseItem, grammar: &SyntaxGrammar) -> &Vec { + let step_index = item.step_index(); + let mut productions_to_add = grammar.variables + [item.step(grammar, self).unwrap().symbol.index] + .productions + .clone(); + + let mut i = 0; + while i < productions_to_add.len() { + if let Some(first_symbol) = productions_to_add[i].first_symbol() { + if grammar.variables_to_inline.contains(&first_symbol) { + // Remove the production from the vector, replacing it with a placeholder. + let production = productions_to_add + .splice(i..i + 1, [Production::default()].iter().cloned()) + .next() + .unwrap(); + + // Replace the placeholder with the inlined productions. 
+ productions_to_add.splice( + i..i + 1, + grammar.variables[first_symbol.index] + .productions + .iter() + .map(|p| { + let mut p = p.clone(); + p.steps.extend(production.steps[1..].iter().cloned()); + p + }), + ); + continue; + } + } + i += 1; + } + + let result = productions_to_add + .into_iter() + .map(|production_to_add| { + let mut inlined_production = item.production(grammar, &self).clone(); + inlined_production.steps.splice( + step_index..step_index + 1, + production_to_add.steps.iter().cloned(), + ); + self.inlined_productions + .iter() + .position(|p| *p == inlined_production) + .unwrap_or({ + self.inlined_productions.push(inlined_production); + self.inlined_productions.len() - 1 + }) as u32 + }) + .collect(); + + self.item_map.entry(item).or_insert(result) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::grammars::{ProductionStep, SyntaxVariable, VariableType}; + use crate::rules::Symbol; + + #[test] + fn test_basic_inlining() { + let grammar = SyntaxGrammar { + expected_conflicts: Vec::new(), + extra_tokens: Vec::new(), + external_tokens: Vec::new(), + word_token: None, + variables_to_inline: vec![Symbol::non_terminal(1)], + variables: vec![ + SyntaxVariable { + name: "var0".to_string(), + kind: VariableType::Named, + productions: vec![Production { + dynamic_precedence: 0, + steps: vec![ + ProductionStep::new(Symbol::terminal(10)), + ProductionStep::new(Symbol::non_terminal(1)), // inlined + ProductionStep::new(Symbol::terminal(11)), + ], + }], + }, + SyntaxVariable { + name: "var1".to_string(), + kind: VariableType::Named, + productions: vec![ + Production { + dynamic_precedence: 0, + steps: vec![ + ProductionStep::new(Symbol::terminal(12)), + ProductionStep::new(Symbol::terminal(13)), + ], + }, + Production { + dynamic_precedence: 0, + steps: vec![ProductionStep::new(Symbol::terminal(14))], + }, + ], + }, + ], + }; + + let inline_map = InlinedProductionMap::new(&grammar); + + // Nothing to inline at step 0. + assert_eq!( + display_items( + inline_map.inlined_items(ParseItem::Normal { + variable_index: 0, + production_index: 0, + step_index: 0 + }), + &grammar, + &inline_map + ), + None + ); + + // Inlining variable 1 yields two productions. 
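+        // (One production for each of var1's two productions, [12, 13] and [14],
+        // spliced in at step index 1.)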
+ assert_eq!( + display_items( + inline_map.inlined_items(ParseItem::Normal { + variable_index: 0, + production_index: 0, + step_index: 1 + }), + &grammar, + &inline_map + ), + Some(vec![ + "terminal-10 • terminal-12 terminal-13 terminal-11".to_string(), + "terminal-10 • terminal-14 terminal-11".to_string(), + ]) + ); + } + + #[test] + fn test_nested_inlining() { + let grammar = SyntaxGrammar { + variables: vec![ + SyntaxVariable { + name: "var0".to_string(), + kind: VariableType::Named, + productions: vec![ + Production { + dynamic_precedence: 0, + steps: vec![ + ProductionStep::new(Symbol::terminal(10)), + ProductionStep::new(Symbol::non_terminal(1)), // inlined + ProductionStep::new(Symbol::terminal(11)), + ProductionStep::new(Symbol::non_terminal(2)), // inlined + ProductionStep::new(Symbol::terminal(12)), + ], + }, + ], + }, + SyntaxVariable { + name: "var1".to_string(), + kind: VariableType::Named, + productions: vec![ + Production { + dynamic_precedence: 0, + steps: vec![ProductionStep::new(Symbol::terminal(13))], + }, + Production { + dynamic_precedence: 0, + steps: vec![ + ProductionStep::new(Symbol::non_terminal(3)), // inlined + ProductionStep::new(Symbol::terminal(14)), + ], + }, + ], + }, + SyntaxVariable { + name: "var2".to_string(), + kind: VariableType::Named, + productions: vec![Production { + dynamic_precedence: 0, + steps: vec![ProductionStep::new(Symbol::terminal(15))], + }], + }, + SyntaxVariable { + name: "var3".to_string(), + kind: VariableType::Named, + productions: vec![Production { + dynamic_precedence: 0, + steps: vec![ProductionStep::new(Symbol::terminal(16))], + }], + }, + ], + variables_to_inline: vec![ + Symbol::non_terminal(1), + Symbol::non_terminal(2), + Symbol::non_terminal(3), + ], + expected_conflicts: Vec::new(), + extra_tokens: Vec::new(), + external_tokens: Vec::new(), + word_token: None, + }; + + let inline_map = InlinedProductionMap::new(&grammar); + + let items = inline_map.inlined_items(ParseItem::Normal { + variable_index: 0, + production_index: 0, + step_index: 1 + }).unwrap().collect::>(); + + assert_eq!( + display_items(Some(items.iter().cloned()), &grammar, &inline_map), + Some(vec![ + "terminal-10 • terminal-13 terminal-11 non-terminal-2 terminal-12".to_string(), + "terminal-10 • terminal-16 terminal-14 terminal-11 non-terminal-2 terminal-12".to_string() + ]) + ); + + let item = items[0].successor().successor(); + assert_eq!( + display_items(Some([item].iter().cloned()), &grammar, &inline_map), + Some(vec![ + "terminal-10 terminal-13 terminal-11 • non-terminal-2 terminal-12".to_string(), + ]) + ); + + assert_eq!( + display_items(inline_map.inlined_items(item), &grammar, &inline_map), + Some(vec![ + "terminal-10 terminal-13 terminal-11 • terminal-15 terminal-12".to_string(), + ]) + ); + } + + fn display_items( + items: Option>, + grammar: &SyntaxGrammar, + inline_map: &InlinedProductionMap, + ) -> Option> { + items.map(|items| { + items + .map(|item| format!("{}", item.with(grammar, inline_map))) + .collect() + }) + } +} diff --git a/src/build_tables/item.rs b/src/build_tables/item.rs index c8d30997..537b0928 100644 --- a/src/build_tables/item.rs +++ b/src/build_tables/item.rs @@ -1,22 +1,209 @@ -use crate::grammars::Production; +use super::inline_variables::InlinedProductionMap; +use crate::grammars::{Production, ProductionStep, SyntaxGrammar}; +use crate::rules::{Symbol, SymbolType}; +use smallbitvec::SmallBitVec; use std::collections::HashMap; -use bitvec::BitVec; +use std::hash::{Hash, Hasher}; +use std::fmt; -#[derive(Debug, PartialEq, 
Eq)] -pub(super) struct LookaheadSet { - terminal_bits: BitVec, - external_bits: BitVec, +lazy_static! { + static ref START_PRODUCTION: Production = Production { + dynamic_precedence: 0, + steps: vec![ProductionStep { + symbol: Symbol { + index: 0, + kind: SymbolType::NonTerminal, + }, + precedence: 0, + associativity: None, + alias: None, + }], + }; +} + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub(crate) struct LookaheadSet { + terminal_bits: SmallBitVec, + external_bits: SmallBitVec, eof: bool, } #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] -pub(super) struct ParseItem { - variable_index: u32, - production_index: u32, - step_index: u32, +pub(crate) enum ParseItem { + Start { + step_index: u32, + }, + Normal { + variable_index: u32, + production_index: u32, + step_index: u32, + }, + Inlined { + variable_index: u32, + production_index: u32, + step_index: u32, + }, } -#[derive(Debug, PartialEq, Eq)] -pub(super) struct ParseItemSet { - entries: HashMap +#[derive(Clone, Debug, PartialEq, Eq)] +pub(crate) struct ParseItemSet { + pub entries: HashMap, +} + +impl LookaheadSet { + pub fn new() -> Self { + Self { + terminal_bits: SmallBitVec::new(), + external_bits: SmallBitVec::new(), + eof: false, + } + } + + pub fn insert(&mut self, other: Symbol) { + match other.kind { + SymbolType::NonTerminal => panic!("Cannot store non-terminals in a LookaheadSet"), + SymbolType::Terminal => self.terminal_bits.set(other.index, true), + SymbolType::External => self.external_bits.set(other.index, true), + } + } + + pub fn insert_all(&mut self, other: &LookaheadSet) -> bool { + let mut result = false; + if other.terminal_bits.len() > self.terminal_bits.len() { + self.terminal_bits.resize(other.terminal_bits.len(), false); + } + if other.external_bits.len() > self.external_bits.len() { + self.external_bits.resize(other.external_bits.len(), false); + } + for (i, element) in other.terminal_bits.iter().enumerate() { + if element { + result |= !self.terminal_bits[i]; + self.terminal_bits.set(i, element); + } + } + for (i, element) in other.external_bits.iter().enumerate() { + if element { + result |= !self.external_bits[i]; + self.external_bits.set(i, element); + } + } + if other.eof { + result |= !self.eof; + self.eof = true; + } + result + } +} + +impl ParseItem { + pub fn is_kernel(&self) -> bool { + match self { + ParseItem::Start { .. } => true, + ParseItem::Normal { step_index, .. } | ParseItem::Inlined { step_index, .. } => { + *step_index > 0 + } + } + } + + pub fn production<'a>( + &'a self, + grammar: &'a SyntaxGrammar, + inlined_productions: &'a InlinedProductionMap, + ) -> &'a Production { + match self { + ParseItem::Start { .. } => &START_PRODUCTION, + ParseItem::Normal { + variable_index, + production_index, + .. + } => { + &grammar.variables[*variable_index as usize].productions[*production_index as usize] + } + ParseItem::Inlined { + production_index, + .. + } => &inlined_productions.inlined_productions[*production_index as usize], + } + } + + pub fn step<'a>( + &'a self, + grammar: &'a SyntaxGrammar, + inlined_productions: &'a InlinedProductionMap, + ) -> Option<&'a ProductionStep> { + self.production(grammar, inlined_productions).steps.get(self.step_index()) + } + + pub fn variable_index(&self) -> u32 { + match self { + ParseItem::Start { .. } => panic!("Start item doesn't have a variable index"), + ParseItem::Normal { variable_index, .. } + | ParseItem::Inlined { variable_index, .. 
} => *variable_index, + } + } + + pub fn step_index(&self) -> usize { + match self { + ParseItem::Start { step_index } + | ParseItem::Normal { step_index, .. } + | ParseItem::Inlined { step_index, .. } => *step_index as usize, + } + } + + fn step_index_mut(&mut self) -> &mut u32 { + match self { + ParseItem::Start { step_index } + | ParseItem::Normal { step_index, .. } + | ParseItem::Inlined { step_index, .. } => step_index, + } + } + + pub fn with<'a>(&'a self, grammar: &'a SyntaxGrammar, inlines: &'a InlinedProductionMap) -> ParseItemDisplay<'a> { + ParseItemDisplay(self, grammar, inlines) + } + + pub fn successor(&self) -> ParseItem { + let mut result = self.clone(); + *result.step_index_mut() += 1; + result + } +} + +pub struct ParseItemDisplay<'a>(&'a ParseItem, &'a SyntaxGrammar, &'a InlinedProductionMap); + +impl<'a> fmt::Display for ParseItemDisplay<'a> { + fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> { + let step_index = self.0.step_index(); + let production = self.0.production(self.1, self.2); + for (i, step) in production.steps.iter().enumerate() { + if i > 0 { + write!(f, " ")?; + } + + if i == step_index { + write!(f, "• ")?; + } + + let name = if step.symbol.is_terminal() { + "terminal" + } else if step.symbol.is_external() { + "external" + } else { + "non-terminal" + }; + + write!(f, "{}-{}", name, step.symbol.index)?; + } + Ok(()) + } +} + +impl Hash for ParseItemSet { + fn hash(&self, hasher: &mut H) { + hasher.write_usize(self.entries.len()); + for (item, lookaheads) in self.entries.iter() { + item.hash(hasher); + lookaheads.hash(hasher); + } + } } diff --git a/src/build_tables/mod.rs b/src/build_tables/mod.rs index c3518428..f7bb1f9c 100644 --- a/src/build_tables/mod.rs +++ b/src/build_tables/mod.rs @@ -1,4 +1,5 @@ mod item; +mod inline_variables; use std::collections::{HashMap, VecDeque}; use crate::grammars::{SyntaxGrammar, LexicalGrammar}; diff --git a/src/grammars.rs b/src/grammars.rs index 74c213e1..8abdad24 100644 --- a/src/grammars.rs +++ b/src/grammars.rs @@ -108,6 +108,18 @@ impl ProductionStep { } } +impl Production { + pub fn first_symbol(&self) -> Option { + self.steps.first().map(|s| s.symbol.clone()) + } +} + +impl Default for Production { + fn default() -> Self { + Production { dynamic_precedence: 0, steps: Vec::new() } + } +} + impl Variable { pub fn named(name: &str, rule: Rule) -> Self { Self { name: name.to_string(), kind: VariableType::Named, rule } diff --git a/src/main.rs b/src/main.rs index b83764fc..9dc9efb2 100644 --- a/src/main.rs +++ b/src/main.rs @@ -2,6 +2,7 @@ use clap::{App, Arg, SubCommand}; #[macro_use] extern crate serde_derive; #[macro_use] extern crate serde_json; +#[macro_use] extern crate lazy_static; mod build_tables; mod error; diff --git a/src/parse_grammar.rs b/src/parse_grammar.rs index 0f1f5008..27dc8b05 100644 --- a/src/parse_grammar.rs +++ b/src/parse_grammar.rs @@ -2,7 +2,6 @@ use serde_json::{Map, Value}; use crate::error::Result; use crate::grammars::{InputGrammar, Variable, VariableType}; use crate::rules::Rule; -use std::collections::HashMap; #[derive(Deserialize)] #[serde(tag = "type")] diff --git a/src/rules.rs b/src/rules.rs index d7234f45..9374a283 100644 --- a/src/rules.rs +++ b/src/rules.rs @@ -10,7 +10,7 @@ pub(crate) enum SymbolType { #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] pub(crate) enum Associativity { Left, - Right + Right, } #[derive(Clone, Debug, PartialEq, Eq, Hash)] @@ -137,24 +137,37 @@ impl Rule { } impl Symbol { + pub fn is_terminal(&self) -> bool { + self.kind == 
SymbolType::Terminal + } + pub fn is_non_terminal(&self) -> bool { - return self.kind == SymbolType::NonTerminal + self.kind == SymbolType::NonTerminal } pub fn is_external(&self) -> bool { - return self.kind == SymbolType::External + self.kind == SymbolType::External } pub fn non_terminal(index: usize) -> Self { - Symbol { kind: SymbolType::NonTerminal, index } + Symbol { + kind: SymbolType::NonTerminal, + index, + } } pub fn terminal(index: usize) -> Self { - Symbol { kind: SymbolType::Terminal, index } + Symbol { + kind: SymbolType::Terminal, + index, + } } pub fn external(index: usize) -> Self { - Symbol { kind: SymbolType::External, index } + Symbol { + kind: SymbolType::External, + index, + } } } @@ -169,11 +182,14 @@ fn add_metadata(input: Rule, f: T) -> Rule { Rule::Metadata { rule, mut params } => { f(&mut params); Rule::Metadata { rule, params } - }, + } _ => { let mut params = MetadataParams::default(); f(&mut params); - Rule::Metadata { rule: Box::new(input), params } + Rule::Metadata { + rule: Box::new(input), + params, + } } } } @@ -184,7 +200,7 @@ fn choice_helper(result: &mut Vec, rule: Rule) { for element in elements { choice_helper(result, element); } - }, + } _ => { if !result.contains(&rule) { result.push(rule); From 143588c148a130217beb7c547647d8e3442b9762 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Tue, 18 Dec 2018 17:31:54 -0800 Subject: [PATCH 069/102] Implement ItemSetBuilder --- src/build_tables/item.rs | 16 +- src/build_tables/item_set_builder.rs | 279 +++++++++++++++++++++++++++ src/build_tables/mod.rs | 2 + 3 files changed, 294 insertions(+), 3 deletions(-) create mode 100644 src/build_tables/item_set_builder.rs diff --git a/src/build_tables/item.rs b/src/build_tables/item.rs index 537b0928..c99815eb 100644 --- a/src/build_tables/item.rs +++ b/src/build_tables/item.rs @@ -50,6 +50,8 @@ pub(crate) struct ParseItemSet { pub entries: HashMap, } +pub(crate) struct ParseItemDisplay<'a>(&'a ParseItem, &'a SyntaxGrammar, &'a InlinedProductionMap); + impl LookaheadSet { pub fn new() -> Self { Self { @@ -96,6 +98,10 @@ impl LookaheadSet { } impl ParseItem { + pub fn start() -> Self { + ParseItem::Start { step_index: 0 } + } + pub fn is_kernel(&self) -> bool { match self { ParseItem::Start { .. 
} => true, @@ -106,7 +112,7 @@ impl ParseItem { } pub fn production<'a>( - &'a self, + &self, grammar: &'a SyntaxGrammar, inlined_productions: &'a InlinedProductionMap, ) -> &'a Production { @@ -127,7 +133,7 @@ impl ParseItem { } pub fn step<'a>( - &'a self, + &self, grammar: &'a SyntaxGrammar, inlined_productions: &'a InlinedProductionMap, ) -> Option<&'a ProductionStep> { @@ -169,7 +175,11 @@ impl ParseItem { } } -pub struct ParseItemDisplay<'a>(&'a ParseItem, &'a SyntaxGrammar, &'a InlinedProductionMap); +impl ParseItemSet { + pub fn new() -> Self { + Self { entries: HashMap::new() } + } +} impl<'a> fmt::Display for ParseItemDisplay<'a> { fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> { diff --git a/src/build_tables/item_set_builder.rs b/src/build_tables/item_set_builder.rs new file mode 100644 index 00000000..61d45ded --- /dev/null +++ b/src/build_tables/item_set_builder.rs @@ -0,0 +1,279 @@ +use super::inline_variables::InlinedProductionMap; +use super::item::{LookaheadSet, ParseItem, ParseItemSet}; +use crate::grammars::{LexicalGrammar, SyntaxGrammar}; +use crate::rules::Symbol; +use std::collections::{HashMap, HashSet}; + +#[derive(Clone, Debug, PartialEq, Eq)] +struct TransitiveClosureAddition { + item: ParseItem, + info: FollowSetInfo, +} + +#[derive(Clone, Debug, PartialEq, Eq)] +struct FollowSetInfo { + lookaheads: LookaheadSet, + propagates_lookaheads: bool, +} + +pub(crate) struct ParseItemSetBuilder { + first_sets: HashMap, + last_sets: HashMap, + transitive_closure_additions: Vec>, + inlined_production_map: InlinedProductionMap, +} + +fn find_or_push(vector: &mut Vec, value: T) { + if !vector.contains(&value) { + vector.push(value); + } +} + +impl ParseItemSetBuilder { + pub fn new(syntax_grammar: &SyntaxGrammar, lexical_grammar: &LexicalGrammar) -> Self { + let mut result = Self { + first_sets: HashMap::new(), + last_sets: HashMap::new(), + transitive_closure_additions: vec![Vec::new(); syntax_grammar.variables.len()], + inlined_production_map: InlinedProductionMap::new(syntax_grammar), + }; + + // For each grammar symbol, populate the FIRST and LAST sets: the set of + // terminals that appear at the beginning and end that symbol's productions, + // respectively. + // + // For a terminal symbol, the FIRST and LAST set just consists of the + // terminal itself. + for i in 0..lexical_grammar.variables.len() { + let symbol = Symbol::terminal(i); + let mut set = LookaheadSet::new(); + set.insert(symbol); + result.first_sets.insert(symbol, set.clone()); + result.last_sets.insert(symbol, set); + } + + for i in 0..syntax_grammar.external_tokens.len() { + let symbol = Symbol::external(i); + let mut set = LookaheadSet::new(); + set.insert(symbol); + result.first_sets.insert(symbol, set.clone()); + result.last_sets.insert(symbol, set); + } + + // The FIRST set of a non-terminal `i` is the union of the following sets: + // * the set of all terminals that appear at the beginings of i's productions + // * the FIRST sets of all the non-terminals that appear at the beginnings + // of i's productions + // + // Rather than computing these sets using recursion, we use an explicit stack + // called `symbols_to_process`. 
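+        //
+        // For example, given the rules `A -> B c` and `B -> d | e`, the FIRST set
+        // of `A` is {d, e} and its LAST set is {c}.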
+ let mut symbols_to_process = Vec::new(); + let mut processed_non_terminals = HashSet::new(); + for i in 0..syntax_grammar.variables.len() { + let symbol = Symbol::non_terminal(i); + + let first_set = &mut result + .first_sets + .entry(symbol) + .or_insert(LookaheadSet::new()); + processed_non_terminals.clear(); + symbols_to_process.clear(); + symbols_to_process.push(symbol); + while let Some(current_symbol) = symbols_to_process.pop() { + if current_symbol.is_terminal() || current_symbol.is_external() { + first_set.insert(current_symbol); + } else if processed_non_terminals.insert(current_symbol) { + for production in syntax_grammar.variables[current_symbol.index] + .productions + .iter() + { + if let Some(step) = production.steps.first() { + symbols_to_process.push(step.symbol); + } + } + } + } + + // The LAST set is defined in a similar way to the FIRST set. + let last_set = &mut result + .last_sets + .entry(symbol) + .or_insert(LookaheadSet::new()); + processed_non_terminals.clear(); + symbols_to_process.clear(); + symbols_to_process.push(symbol); + while let Some(current_symbol) = symbols_to_process.pop() { + if current_symbol.is_terminal() || current_symbol.is_external() { + last_set.insert(current_symbol); + } else if processed_non_terminals.insert(current_symbol) { + for production in syntax_grammar.variables[current_symbol.index] + .productions + .iter() + { + if let Some(step) = production.steps.last() { + symbols_to_process.push(step.symbol); + } + } + } + } + } + + // To compute an item set's transitive closure, we find each item in the set + // whose next symbol is a non-terminal, and we add new items to the set for + // each of that symbols' productions. These productions might themselves begin + // with non-terminals, so the process continues recursively. In this process, + // the total set of entries that get added depends only on two things: + // * the set of non-terminal symbols that occur at each item's current position + // * the set of terminals that occurs after each of these non-terminal symbols + // + // So we can avoid a lot of duplicated recursive work by precomputing, for each + // non-terminal symbol `i`, a final list of *additions* that must be made to an + // item set when `i` occurs as the next symbol in one if its core items. The + // structure of an *addition* is as follows: + // * `item` - the new item that must be added as part of the expansion of `i` + // * `lookaheads` - lookahead tokens that can always come after that item in + // the expansion of `i` + // * `propagates_lookaheads` - a boolean indicating whether or not `item` can + // occur at the *end* of the expansion of `i`, so that i's own current + // lookahead tokens can occur after `item`. + // + // Again, rather than computing these additions recursively, we use an explicit + // stack called `entries_to_process`. + for i in 0..syntax_grammar.variables.len() { + let empty_lookaheads = LookaheadSet::new(); + let mut entries_to_process = vec![(i, &empty_lookaheads, true)]; + + // First, build up a map whose keys are all of the non-terminals that can + // appear at the beginning of non-terminal `i`, and whose values store + // information about the tokens that can follow each non-terminal. 
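+            //
+            // For example, if variable `i` has a production `A b`, then `A` maps to
+            // the lookahead set {b}; if `i` also has the production `A` by itself,
+            // then `A` additionally propagates `i`'s own lookaheads.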
+ let mut follow_set_info_by_non_terminal = HashMap::new(); + while let Some(entry) = entries_to_process.pop() { + let (variable_index, lookaheads, propagates_lookaheads) = entry; + let existing_info = follow_set_info_by_non_terminal + .entry(variable_index) + .or_insert_with(|| FollowSetInfo { + lookaheads: LookaheadSet::new(), + propagates_lookaheads: false, + }); + + let did_add_follow_set_info; + if propagates_lookaheads { + did_add_follow_set_info = !existing_info.propagates_lookaheads; + existing_info.propagates_lookaheads = true; + } else { + did_add_follow_set_info = existing_info.lookaheads.insert_all(lookaheads); + } + + if did_add_follow_set_info { + for production in &syntax_grammar.variables[variable_index].productions { + if let Some(symbol) = production.first_symbol() { + if symbol.is_non_terminal() { + if production.steps.len() == 1 { + entries_to_process.push(( + symbol.index, + lookaheads, + propagates_lookaheads, + )); + } else { + entries_to_process.push(( + symbol.index, + &result.first_sets[&production.steps[1].symbol], + false, + )); + } + } + } + } + } + } + + // Store all of those non-terminals' productions, along with their associated + // lookahead info, as *additions* associated with non-terminal `i`. + let additions_for_non_terminal = &mut result.transitive_closure_additions[i]; + for (variable_index, follow_set_info) in follow_set_info_by_non_terminal { + let variable = &syntax_grammar.variables[variable_index]; + for production_index in 0..variable.productions.len() { + let item = ParseItem::Normal { + variable_index: variable_index as u32, + production_index: production_index as u32, + step_index: 0, + }; + + if let Some(inlined_items) = result.inlined_production_map.inlined_items(item) { + for inlined_item in inlined_items { + find_or_push( + additions_for_non_terminal, + TransitiveClosureAddition { + item: inlined_item, + info: follow_set_info.clone(), + }, + ); + } + } else { + find_or_push( + additions_for_non_terminal, + TransitiveClosureAddition { + item, + info: follow_set_info.clone(), + }, + ); + } + } + } + } + + result + } + + pub(crate) fn transitive_closure( + &mut self, + item_set: ParseItemSet, + grammar: &SyntaxGrammar, + ) -> ParseItemSet { + let mut result = ParseItemSet::new(); + for (item, lookaheads) in item_set.entries { + if let Some(items) = self.inlined_production_map.inlined_items(item) { + for item in items { + self.add_item(&mut result, item, lookaheads.clone(), grammar); + } + } else { + self.add_item(&mut result, item, lookaheads, grammar); + } + } + result + } + + fn add_item( + &self, + set: &mut ParseItemSet, + item: ParseItem, + lookaheads: LookaheadSet, + grammar: &SyntaxGrammar, + ) { + if let Some(step) = item.step(grammar, &self.inlined_production_map) { + if step.symbol.is_non_terminal() { + let next_step = item.successor().step(grammar, &self.inlined_production_map); + + // Determine which tokens can follow this non-terminal. + let following_tokens = if let Some(next_step) = next_step { + self.first_sets.get(&next_step.symbol).unwrap() + } else { + &lookaheads + }; + + // Use the pre-computed *additions* to expand the non-terminal. 
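+                // Every addition contributes its fixed lookaheads; an addition that
+                // can end the expansion also inherits the tokens that follow this item.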
+ for addition in &self.transitive_closure_additions[step.symbol.index] { + let lookaheads = set + .entries + .entry(addition.item) + .or_insert_with(|| LookaheadSet::new()); + lookaheads.insert_all(&addition.info.lookaheads); + if addition.info.propagates_lookaheads { + lookaheads.insert_all(following_tokens); + } + } + } + } + set.entries.insert(item, lookaheads); + } +} diff --git a/src/build_tables/mod.rs b/src/build_tables/mod.rs index f7bb1f9c..01d9219d 100644 --- a/src/build_tables/mod.rs +++ b/src/build_tables/mod.rs @@ -1,5 +1,7 @@ mod item; mod inline_variables; +mod item; +mod item_set_builder; use std::collections::{HashMap, VecDeque}; use crate::grammars::{SyntaxGrammar, LexicalGrammar}; From d078c263b0fc003c24ba2d08355fb1a87af6b65f Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 20 Dec 2018 13:35:13 -0800 Subject: [PATCH 070/102] Fix bugs in grammar JSON parsing --- Cargo.lock | 7 +++++++ Cargo.toml | 5 ++++- src/parse_grammar.rs | 23 +++++++++++++++++++---- 3 files changed, 30 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 410580fa..538517f1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -229,6 +229,11 @@ dependencies = [ "winapi-util 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "indexmap" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" + [[package]] name = "itoa" version = "0.4.3" @@ -539,6 +544,7 @@ name = "serde_json" version = "1.0.33" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ + "indexmap 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)", "itoa 0.4.3 (registry+https://github.com/rust-lang/crates.io-index)", "ryu 0.2.7 (registry+https://github.com/rust-lang/crates.io-index)", "serde 1.0.80 (registry+https://github.com/rust-lang/crates.io-index)", @@ -748,6 +754,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" "checksum fuchsia-zircon-sys 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "3dcaa9ae7725d12cdb85b3ad99a434db70b468c09ded17e012d86b5c1010f7a7" "checksum globset 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)" = "4743617a7464bbda3c8aec8558ff2f9429047e025771037df561d383337ff865" "checksum ignore 0.4.4 (registry+https://github.com/rust-lang/crates.io-index)" = "36ecfc5ad80f0b1226df948c562e2cddd446096be3f644c95106400eae8a5e01" +"checksum indexmap 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)" = "7e81a7c05f79578dbc15793d8b619db9ba32b4577003ef3af1a91c416798c58d" "checksum itoa 0.4.3 (registry+https://github.com/rust-lang/crates.io-index)" = "1306f3464951f30e30d12373d31c79fbd52d236e5e896fd92f96ec7babbbe60b" "checksum lazy_static 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "a374c89b9db55895453a74c1e38861d9deec0b01b405a82516e9d5de4820dea1" "checksum libc 0.2.44 (registry+https://github.com/rust-lang/crates.io-index)" = "10923947f84a519a45c8fefb7dd1b3e8c08747993381adee176d7a82b4195311" diff --git a/Cargo.toml b/Cargo.toml index f3880a1c..b29bc85e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,6 +14,9 @@ libloading = "0.5" rusqlite = "0.14.0" serde = "1.0" serde_derive = "1.0" -serde_json = "1.0" tree-sitter = "0.3.1" regex-syntax = "0.6.4" + +[dependencies.serde_json] +version = "1.0" +features = ["preserve_order"] diff --git a/src/parse_grammar.rs b/src/parse_grammar.rs index 27dc8b05..07396329 100644 --- a/src/parse_grammar.rs +++ b/src/parse_grammar.rs @@ -7,6 +7,11 @@ use crate::rules::Rule; #[serde(tag = "type")] 
#[allow(non_camel_case_types)] enum RuleJSON { + ALIAS { + content: Box, + named: bool, + value: String, + }, BLANK, STRING { value: String, @@ -26,6 +31,13 @@ enum RuleJSON { REPEAT { content: Box, }, + REPEAT1 { + content: Box, + }, + PREC_DYNAMIC { + value: i32, + content: Box, + }, PREC_LEFT { value: i32, content: Box, @@ -41,7 +53,7 @@ enum RuleJSON { TOKEN { content: Box, }, - TOKEN_IMMEDIATE { + IMMEDIATE_TOKEN { content: Box, }, } @@ -97,18 +109,21 @@ pub(crate) fn parse_grammar(input: &str) -> Result { fn parse_rule(json: RuleJSON) -> Rule { match json { + RuleJSON::ALIAS { content, value, named } => Rule::alias(parse_rule(*content), value, named), RuleJSON::BLANK => Rule::Blank, RuleJSON::STRING { value } => Rule::String(value), RuleJSON::PATTERN { value } => Rule::Pattern(value), RuleJSON::SYMBOL { name } => Rule::NamedSymbol(name), RuleJSON::CHOICE { members } => Rule::choice(members.into_iter().map(parse_rule).collect()), RuleJSON::SEQ { members } => Rule::seq(members.into_iter().map(parse_rule).collect()), - RuleJSON::REPEAT { content } => Rule::repeat(parse_rule(*content)), + RuleJSON::REPEAT1 { content } => Rule::repeat(parse_rule(*content)), + RuleJSON::REPEAT { content } => Rule::choice(vec![Rule::repeat(parse_rule(*content)), Rule::Blank]), RuleJSON::PREC { value, content } => Rule::prec(value, parse_rule(*content)), RuleJSON::PREC_LEFT { value, content } => Rule::prec_left(value, parse_rule(*content)), RuleJSON::PREC_RIGHT { value, content } => Rule::prec_right(value, parse_rule(*content)), + RuleJSON::PREC_DYNAMIC { value, content } => Rule::prec_dynamic(value, parse_rule(*content)), RuleJSON::TOKEN { content } => Rule::token(parse_rule(*content)), - RuleJSON::TOKEN_IMMEDIATE { content } => Rule::immediate_token(parse_rule(*content)), + RuleJSON::IMMEDIATE_TOKEN { content } => Rule::immediate_token(parse_rule(*content)), } } @@ -122,7 +137,7 @@ mod tests { "name": "my_lang", "rules": { "file": { - "type": "REPEAT", + "type": "REPEAT1", "content": { "type": "SYMBOL", "name": "statement" From 988dc7de35278f2ab36df90190a83c3727f391c9 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 20 Dec 2018 13:35:34 -0800 Subject: [PATCH 071/102] Handle precedence and aliases properly when inlining variables --- src/build_tables/inline_variables.rs | 263 ++++++++++++++++++++------- 1 file changed, 193 insertions(+), 70 deletions(-) diff --git a/src/build_tables/inline_variables.rs b/src/build_tables/inline_variables.rs index d201519f..affbe163 100644 --- a/src/build_tables/inline_variables.rs +++ b/src/build_tables/inline_variables.rs @@ -108,10 +108,25 @@ impl InlinedProductionMap { .into_iter() .map(|production_to_add| { let mut inlined_production = item.production(grammar, &self).clone(); - inlined_production.steps.splice( - step_index..step_index + 1, - production_to_add.steps.iter().cloned(), - ); + let removed_step = inlined_production + .steps + .splice( + step_index..step_index + 1, + production_to_add.steps.iter().cloned(), + ) + .next() + .unwrap(); + let inserted_steps = &mut inlined_production.steps + [step_index..step_index + production_to_add.steps.len()]; + if let Some(alias) = removed_step.alias { + for inserted_step in inserted_steps.iter_mut() { + inserted_step.alias = Some(alias.clone()); + } + } + if let Some(last_inserted_step) = inserted_steps.last_mut() { + last_inserted_step.precedence = removed_step.precedence; + last_inserted_step.associativity = removed_step.associativity; + } self.inlined_productions .iter() .position(|p| *p == inlined_production) @@ 
-129,8 +144,9 @@ impl InlinedProductionMap { #[cfg(test)] mod tests { use super::*; - use crate::grammars::{ProductionStep, SyntaxVariable, VariableType}; - use crate::rules::Symbol; + use crate::grammars::{LexicalGrammar, ProductionStep, SyntaxVariable, VariableType}; + use crate::rules::{Alias, Associativity, Symbol}; + use std::borrow::Borrow; #[test] fn test_basic_inlining() { @@ -142,7 +158,7 @@ mod tests { variables_to_inline: vec![Symbol::non_terminal(1)], variables: vec![ SyntaxVariable { - name: "var0".to_string(), + name: "non-terminal-0".to_string(), kind: VariableType::Named, productions: vec![Production { dynamic_precedence: 0, @@ -154,7 +170,7 @@ mod tests { }], }, SyntaxVariable { - name: "var1".to_string(), + name: "non-terminal-1".to_string(), kind: VariableType::Named, productions: vec![ Production { @@ -176,34 +192,32 @@ mod tests { let inline_map = InlinedProductionMap::new(&grammar); // Nothing to inline at step 0. - assert_eq!( - display_items( - inline_map.inlined_items(ParseItem::Normal { - variable_index: 0, - production_index: 0, - step_index: 0 - }), - &grammar, - &inline_map - ), - None - ); + assert!(inline_map + .inlined_items(ParseItem::Normal { + variable_index: 0, + production_index: 0, + step_index: 0 + }) + .is_none()); // Inlining variable 1 yields two productions. assert_eq!( display_items( - inline_map.inlined_items(ParseItem::Normal { - variable_index: 0, - production_index: 0, - step_index: 1 - }), + inline_map + .inlined_items(ParseItem::Normal { + variable_index: 0, + production_index: 0, + step_index: 1 + }) + .unwrap(), &grammar, &inline_map ), - Some(vec![ - "terminal-10 • terminal-12 terminal-13 terminal-11".to_string(), - "terminal-10 • terminal-14 terminal-11".to_string(), - ]) + vec![ + "non-terminal-0 → terminal-10 • terminal-12 terminal-13 terminal-11" + .to_string(), + "non-terminal-0 → terminal-10 • terminal-14 terminal-11".to_string(), + ] ); } @@ -212,23 +226,21 @@ mod tests { let grammar = SyntaxGrammar { variables: vec![ SyntaxVariable { - name: "var0".to_string(), + name: "non-terminal-0".to_string(), kind: VariableType::Named, - productions: vec![ - Production { - dynamic_precedence: 0, - steps: vec![ - ProductionStep::new(Symbol::terminal(10)), - ProductionStep::new(Symbol::non_terminal(1)), // inlined - ProductionStep::new(Symbol::terminal(11)), - ProductionStep::new(Symbol::non_terminal(2)), // inlined - ProductionStep::new(Symbol::terminal(12)), - ], - }, - ], + productions: vec![Production { + dynamic_precedence: 0, + steps: vec![ + ProductionStep::new(Symbol::terminal(10)), + ProductionStep::new(Symbol::non_terminal(1)), // inlined + ProductionStep::new(Symbol::terminal(11)), + ProductionStep::new(Symbol::non_terminal(2)), // inlined + ProductionStep::new(Symbol::terminal(12)), + ], + }], }, SyntaxVariable { - name: "var1".to_string(), + name: "non-terminal-1".to_string(), kind: VariableType::Named, productions: vec![ Production { @@ -245,7 +257,7 @@ mod tests { ], }, SyntaxVariable { - name: "var2".to_string(), + name: "non-terminal-2".to_string(), kind: VariableType::Named, productions: vec![Production { dynamic_precedence: 0, @@ -253,7 +265,7 @@ mod tests { }], }, SyntaxVariable { - name: "var3".to_string(), + name: "non-terminal-3".to_string(), kind: VariableType::Named, productions: vec![Production { dynamic_precedence: 0, @@ -274,45 +286,156 @@ mod tests { let inline_map = InlinedProductionMap::new(&grammar); - let items = inline_map.inlined_items(ParseItem::Normal { - variable_index: 0, - production_index: 0, - 
step_index: 1 - }).unwrap().collect::>(); + let items = inline_map + .inlined_items(ParseItem::Normal { + variable_index: 0, + production_index: 0, + step_index: 1, + }) + .unwrap() + .collect::>(); assert_eq!( - display_items(Some(items.iter().cloned()), &grammar, &inline_map), - Some(vec![ - "terminal-10 • terminal-13 terminal-11 non-terminal-2 terminal-12".to_string(), - "terminal-10 • terminal-16 terminal-14 terminal-11 non-terminal-2 terminal-12".to_string() - ]) + display_items(&items, &grammar, &inline_map), + vec![ + "non-terminal-0 → terminal-10 • terminal-13 terminal-11 non-terminal-2 terminal-12".to_string(), + "non-terminal-0 → terminal-10 • terminal-16 terminal-14 terminal-11 non-terminal-2 terminal-12".to_string() + ] ); let item = items[0].successor().successor(); assert_eq!( - display_items(Some([item].iter().cloned()), &grammar, &inline_map), - Some(vec![ - "terminal-10 terminal-13 terminal-11 • non-terminal-2 terminal-12".to_string(), - ]) + display_items(&[item], &grammar, &inline_map), + vec![ + "non-terminal-0 → terminal-10 terminal-13 terminal-11 • non-terminal-2 terminal-12".to_string(), + ] ); assert_eq!( - display_items(inline_map.inlined_items(item), &grammar, &inline_map), - Some(vec![ - "terminal-10 terminal-13 terminal-11 • terminal-15 terminal-12".to_string(), - ]) + display_items(inline_map.inlined_items(item).unwrap(), &grammar, &inline_map), + vec![ + "non-terminal-0 → terminal-10 terminal-13 terminal-11 • terminal-15 terminal-12".to_string(), + ] ); } + #[test] + fn test_inlining_with_precedence_and_alias() { + let grammar = SyntaxGrammar { + variables_to_inline: vec![Symbol::non_terminal(1), Symbol::non_terminal(2)], + variables: vec![ + SyntaxVariable { + name: "non-terminal-0".to_string(), + kind: VariableType::Named, + productions: vec![Production { + dynamic_precedence: 0, + steps: vec![ + ProductionStep::new(Symbol::non_terminal(1)) // inlined + .with_prec(1, Some(Associativity::Left)), + ProductionStep::new(Symbol::terminal(10)), + ProductionStep::new(Symbol::non_terminal(2)), // inlined + ], + }], + }, + SyntaxVariable { + name: "non-terminal-1".to_string(), + kind: VariableType::Named, + productions: vec![Production { + dynamic_precedence: 0, + steps: vec![ + ProductionStep::new(Symbol::terminal(11)) + .with_prec(2, None) + .with_alias("inner_alias", true), + ProductionStep::new(Symbol::terminal(12)).with_prec(3, None), + ], + }], + }, + SyntaxVariable { + name: "non-terminal-2".to_string(), + kind: VariableType::Named, + productions: vec![Production { + dynamic_precedence: 0, + steps: vec![ProductionStep::new(Symbol::terminal(13)) + .with_alias("outer_alias", true)], + }], + }, + ], + expected_conflicts: Vec::new(), + extra_tokens: Vec::new(), + external_tokens: Vec::new(), + word_token: None, + }; + + let inline_map = InlinedProductionMap::new(&grammar); + + let items = inline_map + .inlined_items(ParseItem::Normal { + variable_index: 0, + production_index: 0, + step_index: 0, + }) + .unwrap() + .collect::>(); + assert_eq!( + display_items(&items, &grammar, &inline_map)[0], + "non-terminal-0 → • terminal-11 terminal-12 terminal-10 non-terminal-2".to_string(), + ); + + // The first step in the inlined production retains its precedence and alias. 
+ let item = items[0].successor(); + assert_eq!( + display_items(&[item], &grammar, &inline_map)[0], + "non-terminal-0 → terminal-11 • terminal-12 terminal-10 non-terminal-2".to_string(), + ); + assert_eq!(item.precedence(&grammar, &inline_map), 2); + assert_eq!( + items[0].step(&grammar, &inline_map).unwrap().alias, + Some(Alias { + value: "inner_alias".to_string(), + is_named: true, + }) + ); + + // The final terminal of the inlined production inherits the precedence of + // the inlined step. + let item = item.successor(); + assert_eq!( + display_items(&[item], &grammar, &inline_map)[0], + "non-terminal-0 → terminal-11 terminal-12 • terminal-10 non-terminal-2".to_string(), + ); + assert_eq!(item.precedence(&grammar, &inline_map), 1); + + let item = item.successor(); + assert_eq!( + display_items(&[item], &grammar, &inline_map)[0], + "non-terminal-0 → terminal-11 terminal-12 terminal-10 • non-terminal-2".to_string(), + ); + + // All steps of the inlined production inherit their alias from the + // inlined step. + let items = inline_map.inlined_items(item).unwrap().collect::>(); + assert_eq!( + display_items(&items, &grammar, &inline_map)[0], + "non-terminal-0 → terminal-11 terminal-12 terminal-10 • terminal-13".to_string(), + ); + assert_eq!( + items[0].step(&grammar, &inline_map).unwrap().alias, + Some(Alias { + value: "outer_alias".to_string(), + is_named: true, + }) + ) + } + fn display_items( - items: Option>, + items: impl IntoIterator>, grammar: &SyntaxGrammar, inline_map: &InlinedProductionMap, - ) -> Option> { - items.map(|items| { - items - .map(|item| format!("{}", item.with(grammar, inline_map))) - .collect() - }) + ) -> Vec { + let lex = LexicalGrammar::default(); + items + .into_iter() + .map(|item| format!("{}", item.borrow().display_with(grammar, &lex, inline_map))) + .collect() } } From 5eb88069597ed72d9dd6b4f5b2ed5d772463a853 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 20 Dec 2018 13:36:21 -0800 Subject: [PATCH 072/102] Handle repetition ranges in regexes --- src/prepare_grammar/expand_tokens.rs | 114 ++++++++++++++++++++------- 1 file changed, 86 insertions(+), 28 deletions(-) diff --git a/src/prepare_grammar/expand_tokens.rs b/src/prepare_grammar/expand_tokens.rs index 7a1d2f4d..37f75e5a 100644 --- a/src/prepare_grammar/expand_tokens.rs +++ b/src/prepare_grammar/expand_tokens.rs @@ -3,7 +3,9 @@ use crate::error::{Error, Result}; use crate::grammars::{LexicalGrammar, LexicalVariable}; use crate::nfa::{CharacterSet, Nfa, NfaState}; use crate::rules::Rule; -use regex_syntax::ast::{parse, Ast, Class, ClassPerlKind, ClassSet, ClassSetItem, RepetitionKind}; +use regex_syntax::ast::{ + parse, Ast, Class, ClassPerlKind, ClassSet, ClassSetItem, RepetitionKind, RepetitionRange, +}; pub(super) fn expand_tokens(mut grammar: ExtractedLexicalGrammar) -> Result { let mut nfa = Nfa::new(); @@ -24,7 +26,10 @@ pub(super) fn expand_tokens(mut grammar: ExtractedLexicalGrammar) -> Result Error::RegexError(format!("Rule {} {}", variable.name, msg)), + _ => e, + })?; if !is_immediate_token { let last_state_id = nfa.last_state_id(); @@ -95,11 +100,62 @@ fn expand_rule(rule: &Rule, nfa: &mut Nfa, mut next_state_id: u32, is_sep: bool) Ok(false) } } + Rule::Metadata { rule, .. 
} => { + // TODO - implement precedence + expand_rule(rule, nfa, next_state_id, is_sep) + } Rule::Blank => Ok(false), _ => Err(Error::grammar(&format!("Unexpected rule {:?}", rule))), } } +fn expand_one_or_more(ast: &Ast, nfa: &mut Nfa, next_state_id: u32, is_sep: bool) -> Result { + nfa.states.push(NfaState::Accept(0)); // Placeholder for split + let split_state_id = nfa.last_state_id(); + if expand_regex(&ast, nfa, split_state_id, is_sep)? { + nfa.states[split_state_id as usize] = NfaState::Split(nfa.last_state_id(), next_state_id); + Ok(true) + } else { + nfa.states.pop(); + Ok(false) + } +} + +fn expand_zero_or_one(ast: &Ast, nfa: &mut Nfa, next_state_id: u32, is_sep: bool) -> Result { + if expand_regex(ast, nfa, next_state_id, is_sep)? { + nfa.prepend(|last_state_id| NfaState::Split(next_state_id, last_state_id)); + Ok(true) + } else { + Ok(false) + } +} + +fn expand_zero_or_more(ast: &Ast, nfa: &mut Nfa, next_state_id: u32, is_sep: bool) -> Result { + if expand_one_or_more(&ast, nfa, next_state_id, is_sep)? { + nfa.prepend(|last_state_id| NfaState::Split(last_state_id, next_state_id)); + Ok(true) + } else { + Ok(false) + } +} + +fn expand_count( + ast: &Ast, + count: u32, + nfa: &mut Nfa, + mut next_state_id: u32, + is_sep: bool, +) -> Result { + let mut result = false; + for _ in 0..count { + if expand_regex(ast, nfa, next_state_id, is_sep)? { + result = true; + next_state_id = nfa.last_state_id(); + } + } + Ok(result) +} + fn expand_regex(ast: &Ast, nfa: &mut Nfa, mut next_state_id: u32, is_sep: bool) -> Result { match ast { Ast::Empty(_) => Ok(false), @@ -148,38 +204,36 @@ fn expand_regex(ast: &Ast, nfa: &mut Nfa, mut next_state_id: u32, is_sep: bool) }, Ast::Repetition(repetition) => match repetition.op.kind { RepetitionKind::ZeroOrOne => { - if expand_regex(&repetition.ast, nfa, next_state_id, is_sep)? { - nfa.prepend(|last_state_id| NfaState::Split(next_state_id, last_state_id)); - Ok(true) - } else { - Ok(false) - } + expand_zero_or_one(&repetition.ast, nfa, next_state_id, is_sep) } RepetitionKind::OneOrMore => { - nfa.states.push(NfaState::Accept(0)); // Placeholder for split - let split_state_id = nfa.last_state_id(); - if expand_regex(&repetition.ast, nfa, split_state_id, is_sep)? { - nfa.states[split_state_id as usize] = - NfaState::Split(nfa.last_state_id(), next_state_id); - Ok(true) - } else { - nfa.states.pop(); - Ok(false) - } + expand_one_or_more(&repetition.ast, nfa, next_state_id, is_sep) } RepetitionKind::ZeroOrMore => { - nfa.states.push(NfaState::Accept(0)); // Placeholder for split - let split_state_id = nfa.last_state_id(); - if expand_regex(&repetition.ast, nfa, split_state_id, is_sep)? { - nfa.states[split_state_id as usize] = - NfaState::Split(nfa.last_state_id(), next_state_id); - nfa.prepend(|last_state_id| NfaState::Split(last_state_id, next_state_id)); - Ok(true) + expand_zero_or_more(&repetition.ast, nfa, next_state_id, is_sep) + } + RepetitionKind::Range(RepetitionRange::Exactly(count)) => { + expand_count(&repetition.ast, count, nfa, next_state_id, is_sep) + } + RepetitionKind::Range(RepetitionRange::AtLeast(min)) => { + if expand_zero_or_more(&repetition.ast, nfa, next_state_id, is_sep)? 
{ + expand_count(ast, min, nfa, next_state_id, is_sep) } else { Ok(false) } } - RepetitionKind::Range(_) => unimplemented!(), + RepetitionKind::Range(RepetitionRange::Bounded(min, max)) => { + let mut result = expand_count(&repetition.ast, min, nfa, next_state_id, is_sep)?; + for _ in min..max { + if result { + next_state_id = nfa.last_state_id(); + } + if expand_zero_or_one(&repetition.ast, nfa, next_state_id, is_sep)? { + result = true; + } + } + Ok(result) + } }, Ast::Group(group) => expand_regex(&group.ast, nfa, nfa.last_state_id(), is_sep), Ast::Alternation(alternation) => { @@ -202,8 +256,8 @@ fn expand_regex(ast: &Ast, nfa: &mut Nfa, mut next_state_id: u32, is_sep: bool) for ast in concat.asts.iter().rev() { if expand_regex(&ast, nfa, next_state_id, is_sep)? { result = true; + next_state_id = nfa.last_state_id(); } - next_state_id = nfa.last_state_id(); } Ok(result) } @@ -224,7 +278,11 @@ fn expand_character_class(item: &ClassSetItem) -> Result { } Ok(result) } - _ => Err(Error::regex("Unsupported character class syntax")), + ClassSetItem::Perl(class) => Ok(expand_perl_character_class(&class.kind)), + _ => Err(Error::regex(&format!( + "Unsupported character class syntax {:?}", + item + ))), } } From a3dcfa0a52b74fc56a53aef270bd9f4a474732e8 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 20 Dec 2018 13:36:39 -0800 Subject: [PATCH 073/102] Implement more of parse table generation --- src/build_tables/item.rs | 260 +++++++- src/build_tables/item_set_builder.rs | 34 +- src/build_tables/mod.rs | 596 +++++++++++++++++- src/error.rs | 1 + src/generate.rs | 4 +- src/grammars.rs | 16 +- src/js/dsl.js | 334 ++++++++++ src/main.rs | 65 +- src/nfa.rs | 6 + src/prepare_grammar/extract_simple_aliases.rs | 2 + src/prepare_grammar/extract_tokens.rs | 7 +- src/render/mod.rs | 206 +++++- src/rules.rs | 23 +- src/tables.rs | 68 +- 14 files changed, 1515 insertions(+), 107 deletions(-) create mode 100644 src/js/dsl.js diff --git a/src/build_tables/item.rs b/src/build_tables/item.rs index c99815eb..9208f602 100644 --- a/src/build_tables/item.rs +++ b/src/build_tables/item.rs @@ -1,10 +1,10 @@ use super::inline_variables::InlinedProductionMap; -use crate::grammars::{Production, ProductionStep, SyntaxGrammar}; -use crate::rules::{Symbol, SymbolType}; +use crate::grammars::{LexicalGrammar, Production, ProductionStep, SyntaxGrammar}; +use crate::rules::{Associativity, Symbol, SymbolType}; use smallbitvec::SmallBitVec; -use std::collections::HashMap; -use std::hash::{Hash, Hasher}; +use std::collections::{HashMap, BTreeMap}; use std::fmt; +use std::hash::{Hash, Hasher}; lazy_static! 
{ static ref START_PRODUCTION: Production = Production { @@ -28,7 +28,7 @@ pub(crate) struct LookaheadSet { eof: bool, } -#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)] pub(crate) enum ParseItem { Start { step_index: u32, @@ -47,10 +47,29 @@ pub(crate) enum ParseItem { #[derive(Clone, Debug, PartialEq, Eq)] pub(crate) struct ParseItemSet { - pub entries: HashMap, + pub entries: BTreeMap, } -pub(crate) struct ParseItemDisplay<'a>(&'a ParseItem, &'a SyntaxGrammar, &'a InlinedProductionMap); +pub(crate) struct ParseItemDisplay<'a>( + &'a ParseItem, + &'a SyntaxGrammar, + &'a LexicalGrammar, + &'a InlinedProductionMap, +); + +pub(crate) struct LookaheadSetDisplay<'a>(&'a LookaheadSet, &'a SyntaxGrammar, &'a LexicalGrammar); + +pub(crate) struct ParseItemSetDisplay<'a>( + &'a ParseItemSet, + &'a SyntaxGrammar, + &'a LexicalGrammar, + &'a InlinedProductionMap, +); + +struct ParseItemSetMapEntry(ParseItemSet, u64); +pub(crate) struct ParseItemSetMap { + map: HashMap +} impl LookaheadSet { pub fn new() -> Self { @@ -61,12 +80,61 @@ impl LookaheadSet { } } - pub fn insert(&mut self, other: Symbol) { - match other.kind { - SymbolType::NonTerminal => panic!("Cannot store non-terminals in a LookaheadSet"), - SymbolType::Terminal => self.terminal_bits.set(other.index, true), - SymbolType::External => self.external_bits.set(other.index, true), + pub fn iter<'a>(&'a self) -> impl Iterator + 'a { + self.terminal_bits + .iter() + .enumerate() + .filter_map(|(i, value)| { + if value { + Some(Symbol::terminal(i)) + } else { + None + } + }) + .chain( + self.external_bits + .iter() + .enumerate() + .filter_map(|(i, value)| { + if value { + Some(Symbol::external(i)) + } else { + None + } + }), + ) + .chain(if self.eof { Some(Symbol::end()) } else { None }) + } + + pub fn with<'a>(symbols: impl IntoIterator) -> Self { + let mut result = Self::new(); + for symbol in symbols { + result.insert(*symbol); } + result + } + + pub fn contains(&self, symbol: &Symbol) -> bool { + match symbol.kind { + SymbolType::NonTerminal => panic!("Cannot store non-terminals in a LookaheadSet"), + SymbolType::Terminal => self.terminal_bits.get(symbol.index).unwrap_or(false), + SymbolType::External => self.external_bits.get(symbol.index).unwrap_or(false), + SymbolType::End => self.eof, + } + } + + pub fn insert(&mut self, other: Symbol) { + let vec = match other.kind { + SymbolType::NonTerminal => panic!("Cannot store non-terminals in a LookaheadSet"), + SymbolType::Terminal => &mut self.terminal_bits, + SymbolType::External => &mut self.external_bits, + SymbolType::End => { + self.eof = true; + return; + } + }; + vec.resize(other.index + 1, false); + vec.set(other.index, true); } pub fn insert_all(&mut self, other: &LookaheadSet) -> bool { @@ -95,6 +163,14 @@ impl LookaheadSet { } result } + + pub fn display_with<'a>( + &'a self, + syntax_grammar: &'a SyntaxGrammar, + lexical_grammar: &'a LexicalGrammar, + ) -> LookaheadSetDisplay<'a> { + LookaheadSetDisplay(self, syntax_grammar, lexical_grammar) + } } impl ParseItem { @@ -126,18 +202,53 @@ impl ParseItem { &grammar.variables[*variable_index as usize].productions[*production_index as usize] } ParseItem::Inlined { - production_index, - .. + production_index, .. 
} => &inlined_productions.inlined_productions[*production_index as usize], } } + pub fn symbol( + &self, + grammar: &SyntaxGrammar, + inlined_productions: &InlinedProductionMap, + ) -> Option { + self.step(grammar, inlined_productions).map(|s| s.symbol) + } + pub fn step<'a>( &self, grammar: &'a SyntaxGrammar, inlined_productions: &'a InlinedProductionMap, ) -> Option<&'a ProductionStep> { - self.production(grammar, inlined_productions).steps.get(self.step_index()) + self.production(grammar, inlined_productions) + .steps + .get(self.step_index()) + } + + pub fn precedence<'a>( + &self, + grammar: &'a SyntaxGrammar, + inlines: &'a InlinedProductionMap, + ) -> i32 { + self.production(grammar, inlines) + .steps + .get(self.step_index() - 1) + .map(|s| s.precedence) + .unwrap_or(0) + } + + pub fn associativity<'a>( + &self, + grammar: &'a SyntaxGrammar, + inlines: &'a InlinedProductionMap, + ) -> Option { + let production = self.production(grammar, inlines); + let step_index = self.step_index(); + if step_index == production.steps.len() { + production.steps.last().and_then(|s| s.associativity) + } else { + None + } } pub fn variable_index(&self) -> u32 { @@ -156,6 +267,14 @@ impl ParseItem { } } + pub fn is_final(&self) -> bool { + if let ParseItem::Start { step_index: 1 } = self { + true + } else { + false + } + } + fn step_index_mut(&mut self) -> &mut u32 { match self { ParseItem::Start { step_index } @@ -164,8 +283,13 @@ impl ParseItem { } } - pub fn with<'a>(&'a self, grammar: &'a SyntaxGrammar, inlines: &'a InlinedProductionMap) -> ParseItemDisplay<'a> { - ParseItemDisplay(self, grammar, inlines) + pub fn display_with<'a>( + &'a self, + syntax_grammar: &'a SyntaxGrammar, + lexical_grammar: &'a LexicalGrammar, + inlines: &'a InlinedProductionMap, + ) -> ParseItemDisplay<'a> { + ParseItemDisplay(self, syntax_grammar, lexical_grammar, inlines) } pub fn successor(&self) -> ParseItem { @@ -176,33 +300,107 @@ impl ParseItem { } impl ParseItemSet { - pub fn new() -> Self { - Self { entries: HashMap::new() } + pub fn with<'a>(elements: impl IntoIterator) -> Self { + let mut result = Self::default(); + for (item, lookaheads) in elements { + result.entries.insert(*item, lookaheads.clone()); + } + result + } + + pub fn display_with<'a>( + &'a self, + syntax_grammar: &'a SyntaxGrammar, + lexical_grammar: &'a LexicalGrammar, + inlines: &'a InlinedProductionMap, + ) -> ParseItemSetDisplay<'a> { + ParseItemSetDisplay(self, syntax_grammar, lexical_grammar, inlines) + } +} + +impl Default for ParseItemSet { + fn default() -> Self { + Self { + entries: BTreeMap::new(), + } } } impl<'a> fmt::Display for ParseItemDisplay<'a> { fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> { + if let ParseItem::Start { .. 
} = &self.0 { + write!(f, "START →")?; + } else { + write!( + f, + "{} →", + &self.1.variables[self.0.variable_index() as usize].name + )?; + } + let step_index = self.0.step_index(); - let production = self.0.production(self.1, self.2); + let production = self.0.production(self.1, self.3); for (i, step) in production.steps.iter().enumerate() { - if i > 0 { - write!(f, " ")?; - } - if i == step_index { - write!(f, "• ")?; + write!(f, " •")?; } - let name = if step.symbol.is_terminal() { - "terminal" + write!(f, " ")?; + if step.symbol.is_terminal() { + if let Some(variable) = self.2.variables.get(step.symbol.index) { + write!(f, "{}", &variable.name)?; + } else { + write!(f, "{}-{}", "terminal", step.symbol.index)?; + } } else if step.symbol.is_external() { - "external" + write!(f, "{}", &self.1.external_tokens[step.symbol.index].name)?; } else { - "non-terminal" - }; + write!(f, "{}", &self.1.variables[step.symbol.index].name)?; + } + } - write!(f, "{}-{}", name, step.symbol.index)?; + if production.steps.len() == step_index { + write!(f, " •")?; + } + + Ok(()) + } +} + +impl<'a> fmt::Display for LookaheadSetDisplay<'a> { + fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> { + write!(f, "[")?; + for (i, symbol) in self.0.iter().enumerate() { + if i > 0 { + write!(f, ", ")?; + } + + if symbol.is_terminal() { + if let Some(variable) = self.2.variables.get(symbol.index) { + write!(f, "{}", &variable.name)?; + } else { + write!(f, "{}-{}", "terminal", symbol.index)?; + } + } else if symbol.is_external() { + write!(f, "{}", &self.1.external_tokens[symbol.index].name)?; + } else { + write!(f, "{}", &self.1.variables[symbol.index].name)?; + } + } + write!(f, "]")?; + Ok(()) + } +} + +impl<'a> fmt::Display for ParseItemSetDisplay<'a> { + fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> { + for (item, lookaheads) in self.0.entries.iter() { + writeln!( + f, + "{}\t{}", + item.display_with(self.1, self.2, self.3), + lookaheads.display_with(self.1, self.2) + )?; } Ok(()) } diff --git a/src/build_tables/item_set_builder.rs b/src/build_tables/item_set_builder.rs index 61d45ded..530c1f25 100644 --- a/src/build_tables/item_set_builder.rs +++ b/src/build_tables/item_set_builder.rs @@ -20,7 +20,7 @@ pub(crate) struct ParseItemSetBuilder { first_sets: HashMap, last_sets: HashMap, transitive_closure_additions: Vec>, - inlined_production_map: InlinedProductionMap, + pub inlines: InlinedProductionMap, } fn find_or_push(vector: &mut Vec, value: T) { @@ -35,7 +35,7 @@ impl ParseItemSetBuilder { first_sets: HashMap::new(), last_sets: HashMap::new(), transitive_closure_additions: vec![Vec::new(); syntax_grammar.variables.len()], - inlined_production_map: InlinedProductionMap::new(syntax_grammar), + inlines: InlinedProductionMap::new(syntax_grammar), }; // For each grammar symbol, populate the FIRST and LAST sets: the set of @@ -192,6 +192,10 @@ impl ParseItemSetBuilder { let additions_for_non_terminal = &mut result.transitive_closure_additions[i]; for (variable_index, follow_set_info) in follow_set_info_by_non_terminal { let variable = &syntax_grammar.variables[variable_index]; + let non_terminal = Symbol::non_terminal(variable_index); + if syntax_grammar.variables_to_inline.contains(&non_terminal) { + continue; + } for production_index in 0..variable.productions.len() { let item = ParseItem::Normal { variable_index: variable_index as u32, @@ -199,7 +203,7 @@ impl ParseItemSetBuilder { step_index: 0, }; - if let Some(inlined_items) = result.inlined_production_map.inlined_items(item) { 
+ if let Some(inlined_items) = result.inlines.inlined_items(item) { for inlined_item in inlined_items { find_or_push( additions_for_non_terminal, @@ -227,32 +231,36 @@ impl ParseItemSetBuilder { pub(crate) fn transitive_closure( &mut self, - item_set: ParseItemSet, + item_set: &ParseItemSet, grammar: &SyntaxGrammar, ) -> ParseItemSet { - let mut result = ParseItemSet::new(); - for (item, lookaheads) in item_set.entries { - if let Some(items) = self.inlined_production_map.inlined_items(item) { + let mut result = ParseItemSet::default(); + for (item, lookaheads) in &item_set.entries { + if let Some(items) = self.inlines.inlined_items(*item) { for item in items { - self.add_item(&mut result, item, lookaheads.clone(), grammar); + self.add_item(&mut result, item, lookaheads, grammar); } } else { - self.add_item(&mut result, item, lookaheads, grammar); + self.add_item(&mut result, *item, lookaheads, grammar); } } result } + pub fn first_set(&self, symbol: &Symbol) -> &LookaheadSet { + &self.first_sets[symbol] + } + fn add_item( &self, set: &mut ParseItemSet, item: ParseItem, - lookaheads: LookaheadSet, + lookaheads: &LookaheadSet, grammar: &SyntaxGrammar, ) { - if let Some(step) = item.step(grammar, &self.inlined_production_map) { + if let Some(step) = item.step(grammar, &self.inlines) { if step.symbol.is_non_terminal() { - let next_step = item.successor().step(grammar, &self.inlined_production_map); + let next_step = item.successor().step(grammar, &self.inlines); // Determine which tokens can follow this non-terminal. let following_tokens = if let Some(next_step) = next_step { @@ -274,6 +282,6 @@ impl ParseItemSetBuilder { } } } - set.entries.insert(item, lookaheads); + set.entries.insert(item, lookaheads.clone()); } } diff --git a/src/build_tables/mod.rs b/src/build_tables/mod.rs index 01d9219d..091c5486 100644 --- a/src/build_tables/mod.rs +++ b/src/build_tables/mod.rs @@ -1,37 +1,611 @@ -mod item; mod inline_variables; mod item; mod item_set_builder; -use std::collections::{HashMap, VecDeque}; -use crate::grammars::{SyntaxGrammar, LexicalGrammar}; -use crate::tables::{ParseTable, LexTable, ParseStateId}; -use crate::rules::{AliasMap, Symbol}; -use crate::error::Result; -use self::item::ParseItemSet; +use self::item::{LookaheadSet, ParseItem, ParseItemSet}; +use self::item_set_builder::ParseItemSetBuilder; +use crate::error::{Error, Result}; +use crate::grammars::{LexicalGrammar, SyntaxGrammar, VariableType}; +use crate::rules::{AliasMap, Associativity, Symbol, SymbolType}; +use crate::tables::ParseTableEntry; +use crate::tables::{AliasSequenceId, LexTable, ParseAction, ParseState, ParseStateId, ParseTable}; +use core::ops::Range; +use std::collections::hash_map::Entry; +use std::collections::{HashMap, HashSet, VecDeque}; +use std::fmt::Write; + +#[derive(Clone)] +struct AuxiliarySymbolInfo { + auxiliary_symbol: Symbol, + parent_symbols: Vec, +} type SymbolSequence = Vec; +type AuxiliarySymbolSequence = Vec; struct ParseStateQueueEntry { preceding_symbols: SymbolSequence, - item_set: ParseItemSet, + preceding_auxiliary_symbols: AuxiliarySymbolSequence, state_id: ParseStateId, } struct ParseTableBuilder<'a> { + item_set_builder: ParseItemSetBuilder, syntax_grammar: &'a SyntaxGrammar, lexical_grammar: &'a LexicalGrammar, simple_aliases: &'a AliasMap, state_ids_by_item_set: HashMap, - item_sets_by_state_id: Vec<&'a ParseItemSet>, + item_sets_by_state_id: Vec, parse_state_queue: VecDeque, parse_table: ParseTable, } +impl<'a> ParseTableBuilder<'a> { + fn build(mut self) -> Result<(ParseTable, 
LexTable, LexTable, Option)> { + // Ensure that the empty rename sequence has index 0. + self.parse_table.alias_sequences.push(Vec::new()); + + // Ensure that the error state has index 0. + let error_state_id = self.add_parse_state( + &Vec::new(), + &Vec::new(), + ParseItemSet::default(), + ); + + self.add_parse_state( + &Vec::new(), + &Vec::new(), + ParseItemSet::with(&[(ParseItem::start(), LookaheadSet::with(&[Symbol::end()]))]), + ); + + self.process_part_state_queue()?; + self.populate_used_symbols(); + + Err(Error::grammar("oh no")) + } + + fn add_parse_state( + &mut self, + preceding_symbols: &SymbolSequence, + preceding_auxiliary_symbols: &AuxiliarySymbolSequence, + item_set: ParseItemSet, + ) -> ParseStateId { + match self.state_ids_by_item_set.entry(item_set) { + Entry::Occupied(o) => { + // eprintln!("Item set already processed at state {}", *o.get()); + *o.get() + } + Entry::Vacant(v) => { + // eprintln!("Item set not yet processed"); + let state_id = self.parse_table.states.len(); + self.item_sets_by_state_id.push(v.key().clone()); + self.parse_table.states.push(ParseState { + terminal_entries: HashMap::new(), + nonterminal_entries: HashMap::new(), + }); + self.parse_state_queue.push_back(ParseStateQueueEntry { + state_id, + preceding_symbols: preceding_symbols.clone(), + preceding_auxiliary_symbols: preceding_auxiliary_symbols.clone(), + }); + v.insert(state_id); + state_id + } + } + } + + fn process_part_state_queue(&mut self) -> Result<()> { + while let Some(entry) = self.parse_state_queue.pop_front() { + println!( + "ITEM SET {}:\n{}", + entry.state_id, + self.item_sets_by_state_id[entry.state_id].display_with( + &self.syntax_grammar, + &self.lexical_grammar, + &self.item_set_builder.inlines + ) + ); + + let item_set = self.item_set_builder.transitive_closure( + &self.item_sets_by_state_id[entry.state_id], + self.syntax_grammar, + ); + + // println!("TRANSITIVE CLOSURE:"); + // for item in item_set.entries.keys() { + // println!("{}", item.display_with(&self.syntax_grammar, &self.lexical_grammar, &self.item_set_builder.inlines)); + // } + // println!(""); + + self.add_actions( + entry.preceding_symbols, + entry.preceding_auxiliary_symbols, + item_set, + entry.state_id, + )?; + } + Ok(()) + } + + fn add_actions( + &mut self, + mut preceding_symbols: SymbolSequence, + mut preceding_auxiliary_symbols: Vec, + item_set: ParseItemSet, + state_id: ParseStateId, + ) -> Result<()> { + let mut terminal_successors = HashMap::new(); + let mut non_terminal_successors = HashMap::new(); + let mut lookaheads_with_conflicts = HashSet::new(); + + for (item, lookaheads) in &item_set.entries { + if let Some(next_symbol) = + item.symbol(self.syntax_grammar, &self.item_set_builder.inlines) + { + let successor = item.successor(); + if next_symbol.is_non_terminal() { + // Keep track of where auxiliary non-terminals (repeat symbols) are + // used within visible symbols. This information may be needed later + // for conflict resolution. 
+ if self.syntax_grammar.variables[next_symbol.index].is_auxiliary() { + preceding_auxiliary_symbols + .push(self.get_auxiliary_node_info(&item_set, next_symbol)); + } + + non_terminal_successors + .entry(next_symbol) + .or_insert_with(|| ParseItemSet::default()) + .entries + .entry(successor) + .or_insert_with(|| LookaheadSet::new()) + .insert_all(lookaheads); + } else { + terminal_successors + .entry(next_symbol) + .or_insert_with(|| ParseItemSet::default()) + .entries + .entry(successor) + .or_insert_with(|| LookaheadSet::new()) + .insert_all(lookaheads); + } + } else { + let action = if item.is_final() { + ParseAction::Accept + } else { + let production = + item.production(&self.syntax_grammar, &self.item_set_builder.inlines); + ParseAction::Reduce { + symbol: Symbol::non_terminal(item.variable_index() as usize), + child_count: item.step_index(), + precedence: production.last_precedence(), + associativity: production.last_associativity(), + dynamic_precedence: production.dynamic_precedence, + alias_sequence_id: self.get_alias_sequence_id(item), + } + }; + + for lookahead in lookaheads.iter() { + let entry = self.parse_table.states[state_id] + .terminal_entries + .entry(lookahead); + let entry = entry.or_insert_with(|| ParseTableEntry::new()); + if entry.actions.is_empty() { + entry.actions.push(action); + } else if action.precedence() > entry.actions[0].precedence() { + entry.actions.clear(); + entry.actions.push(action); + lookaheads_with_conflicts.remove(&lookahead); + } else if action.precedence() == entry.actions[0].precedence() { + entry.actions.push(action); + lookaheads_with_conflicts.insert(lookahead); + } + } + } + } + + for (symbol, next_item_set) in terminal_successors { + preceding_symbols.push(symbol); + let next_state_id = self.add_parse_state( + &preceding_symbols, + &preceding_auxiliary_symbols, + next_item_set, + ); + preceding_symbols.pop(); + + let entry = self.parse_table.states[state_id] + .terminal_entries + .entry(symbol); + if let Entry::Occupied(e) = &entry { + if !e.get().actions.is_empty() { + lookaheads_with_conflicts.insert(symbol); + } + } + + entry + .or_insert_with(|| ParseTableEntry::new()) + .actions + .push(ParseAction::Shift { + state: next_state_id, + is_repetition: false, + }); + } + + for (symbol, next_item_set) in non_terminal_successors { + preceding_symbols.push(symbol); + let next_state_id = self.add_parse_state( + &preceding_symbols, + &preceding_auxiliary_symbols, + next_item_set, + ); + preceding_symbols.pop(); + self.parse_table.states[state_id] + .nonterminal_entries + .insert(symbol, next_state_id); + } + + for symbol in lookaheads_with_conflicts { + self.handle_conflict( + &item_set, + state_id, + &preceding_symbols, + &preceding_auxiliary_symbols, + symbol, + )?; + } + + Ok(()) + } + + fn handle_conflict( + &mut self, + item_set: &ParseItemSet, + state_id: ParseStateId, + preceding_symbols: &SymbolSequence, + preceding_auxiliary_symbols: &Vec, + conflicting_lookahead: Symbol, + ) -> Result<()> { + let entry = self.parse_table.states[state_id] + .terminal_entries + .get_mut(&conflicting_lookahead) + .unwrap(); + + // Determine which items in the set conflict with each other, and the + // precedences associated with SHIFT vs REDUCE actions. There won't + // be multiple REDUCE actions with different precedences; that is + // sorted out ahead of time in `add_actions`. But there can still be + // REDUCE-REDUCE conflicts where all actions have the *same* + // precedence, and there can still be SHIFT/REDUCE conflicts. 
+ let reduce_precedence = entry.actions[0].precedence(); + let mut considered_associativity = false; + let mut shift_precedence: Option> = None; + let mut conflicting_items = HashSet::new(); + for (item, lookaheads) in &item_set.entries { + let production = item.production(&self.syntax_grammar, &self.item_set_builder.inlines); + let step_index = item.step_index(); + if let Some(step) = production.steps.get(step_index) { + if step_index > 0 { + if self + .item_set_builder + .first_set(&step.symbol) + .contains(&conflicting_lookahead) + { + conflicting_items.insert(item); + let precedence = production.steps[step_index - 1].precedence; + if let Some(range) = &mut shift_precedence { + if precedence < range.start { + range.start = precedence; + } else if precedence > range.end { + range.end = precedence; + } + } else { + shift_precedence = Some(precedence..precedence); + } + } + } + } else if lookaheads.contains(&conflicting_lookahead) { + conflicting_items.insert(item); + } + } + + if let ParseAction::Shift { is_repetition, .. } = entry.actions.last_mut().unwrap() { + let shift_precedence = shift_precedence.unwrap_or(0..0); + + // If all of the items in the conflict have the same parent symbol, + // and that parent symbols is auxiliary, then this is just the intentional + // ambiguity associated with a repeat rule. Resolve that class of ambiguity + // by leaving it in the parse table, but marking the SHIFT action with + // an `is_repetition` flag. + let conflicting_variable_index = + conflicting_items.iter().next().unwrap().variable_index(); + if self.syntax_grammar.variables[conflicting_variable_index as usize].is_auxiliary() { + if conflicting_items + .iter() + .all(|item| item.variable_index() == conflicting_variable_index) + { + *is_repetition = true; + return Ok(()); + } + } + + // If the SHIFT action has higher precedence, remove all the REDUCE actions. + if shift_precedence.start > reduce_precedence + || (shift_precedence.start == reduce_precedence + && shift_precedence.end > reduce_precedence) + { + entry.actions.drain(0..entry.actions.len() - 1); + } + // If the REDUCE actions have higher precedence, remove the SHIFT action. + else if shift_precedence.end < reduce_precedence + || (shift_precedence.end == reduce_precedence + && shift_precedence.start < reduce_precedence) + { + entry.actions.pop(); + conflicting_items.retain(|item| { + item.step(&self.syntax_grammar, &self.item_set_builder.inlines) + .is_none() + }); + } + // If the SHIFT and REDUCE actions have the same predence, consider + // the REDUCE actions' associativity. + else if shift_precedence == (reduce_precedence..reduce_precedence) { + considered_associativity = true; + let mut has_left = false; + let mut has_right = false; + let mut has_non = false; + for action in &entry.actions { + if let ParseAction::Reduce { associativity, .. } = action { + match associativity { + Some(Associativity::Left) => has_left = true, + Some(Associativity::Right) => has_right = true, + None => has_non = true, + } + } + } + + // If all reduce actions are left associative, remove the SHIFT action. + // If all reduce actions are right associative, remove the REDUCE actions. 
+ match (has_left, has_non, has_right) { + (true, false, false) => { + entry.actions.pop(); + conflicting_items.retain(|item| { + item.step(&self.syntax_grammar, &self.item_set_builder.inlines) + .is_none() + }); + } + (false, false, true) => { + entry.actions.drain(0..entry.actions.len() - 1); + } + _ => {} + } + } + } + + // If all of the actions but one have been eliminated, then there's no problem. + let entry = self.parse_table.states[state_id] + .terminal_entries + .get_mut(&conflicting_lookahead) + .unwrap(); + if entry.actions.len() == 1 { + return Ok(()); + } + + // Determine the set of parent symbols involved in this conflict. + let mut actual_conflict = Vec::new(); + for item in &conflicting_items { + let symbol = Symbol::non_terminal(item.variable_index() as usize); + if self.syntax_grammar.variables[symbol.index].is_auxiliary() { + actual_conflict.extend( + preceding_auxiliary_symbols + .iter() + .rev() + .find_map(|info| { + if info.auxiliary_symbol == symbol { + Some(&info.parent_symbols) + } else { + None + } + }) + .unwrap() + .iter(), + ); + } else { + actual_conflict.push(symbol); + } + } + actual_conflict.sort_unstable(); + actual_conflict.dedup(); + + // If this set of symbols has been whitelisted, then there's no error. + if self + .syntax_grammar + .expected_conflicts + .contains(&actual_conflict) + { + return Ok(()); + } + + let mut msg = "Unresolved conflict for symbol sequence:\n\n".to_string(); + for symbol in preceding_symbols { + write!(&mut msg, " {}", self.symbol_name(symbol)).unwrap(); + } + + write!( + &mut msg, + " • {} …\n\n", + self.symbol_name(&conflicting_lookahead) + ) + .unwrap(); + write!(&mut msg, "Possible interpretations:\n").unwrap(); + for (i, item) in conflicting_items.iter().enumerate() { + write!(&mut msg, "\n {}:", i).unwrap(); + + for preceding_symbol in preceding_symbols + .iter() + .take(preceding_symbols.len() - item.step_index()) + { + write!(&mut msg, " {}", self.symbol_name(preceding_symbol)).unwrap(); + } + + write!( + &mut msg, + " ({}", + &self.syntax_grammar.variables[item.variable_index() as usize].name + ) + .unwrap(); + + for (j, step) in item + .production(&self.syntax_grammar, &self.item_set_builder.inlines) + .steps + .iter() + .enumerate() + { + if j == item.step_index() { + write!(&mut msg, " •").unwrap(); + } + write!(&mut msg, " {}", self.symbol_name(&step.symbol)).unwrap(); + } + + write!(&mut msg, ")").unwrap(); + + if item + .step(&self.syntax_grammar, &self.item_set_builder.inlines) + .is_none() + { + write!( + &mut msg, + " • {}", + self.symbol_name(&conflicting_lookahead) + ) + .unwrap(); + } + + let precedence = item.precedence(&self.syntax_grammar, &self.item_set_builder.inlines); + let associativity = + item.associativity(&self.syntax_grammar, &self.item_set_builder.inlines); + if precedence != 0 || associativity.is_some() { + write!( + &mut msg, + "(precedence: {}, associativity: {:?})", + precedence, associativity + ) + .unwrap(); + } + } + + // TODO - generate suggested resolutions + + Err(Error::ConflictError(msg)) + } + + fn get_auxiliary_node_info( + &self, + item_set: &ParseItemSet, + symbol: Symbol, + ) -> AuxiliarySymbolInfo { + let parent_symbols = item_set + .entries + .keys() + .filter_map(|item| { + if item.symbol(&self.syntax_grammar, &self.item_set_builder.inlines) == Some(symbol) + { + None + } else { + None + } + }) + .collect(); + AuxiliarySymbolInfo { + auxiliary_symbol: symbol, + parent_symbols, + } + } + + fn populate_used_symbols(&mut self) { + let mut terminal_usages = vec![false; 
self.lexical_grammar.variables.len()]; + let mut non_terminal_usages = vec![false; self.syntax_grammar.variables.len()]; + let mut external_usages = vec![false; self.syntax_grammar.external_tokens.len()]; + for state in &self.parse_table.states { + for symbol in state.terminal_entries.keys() { + match symbol.kind { + SymbolType::Terminal => terminal_usages[symbol.index] = true, + SymbolType::External => external_usages[symbol.index] = true, + _ => {} + } + } + for symbol in state.nonterminal_entries.keys() { + non_terminal_usages[symbol.index] = true; + } + } + for (i, value) in terminal_usages.into_iter().enumerate() { + if value { + self.parse_table.symbols.push(Symbol::terminal(i)); + } + } + for (i, value) in non_terminal_usages.into_iter().enumerate() { + if value { + self.parse_table.symbols.push(Symbol::non_terminal(i)); + } + } + for (i, value) in external_usages.into_iter().enumerate() { + if value { + self.parse_table.symbols.push(Symbol::external(i)); + } + } + } + + fn get_alias_sequence_id(&mut self, item: &ParseItem) -> AliasSequenceId { + let production = item.production(&self.syntax_grammar, &self.item_set_builder.inlines); + let alias_sequence = production.steps.iter().map(|s| s.alias.clone()).collect(); + if let Some(index) = self + .parse_table + .alias_sequences + .iter() + .position(|seq| *seq == alias_sequence) + { + index + } else { + self.parse_table.alias_sequences.push(alias_sequence); + self.parse_table.alias_sequences.len() - 1 + } + } + + fn symbol_name(&self, symbol: &Symbol) -> String { + match symbol.kind { + SymbolType::End => "EOF".to_string(), + SymbolType::External => self.syntax_grammar.external_tokens[symbol.index] + .name + .clone(), + SymbolType::NonTerminal => self.syntax_grammar.variables[symbol.index].name.clone(), + SymbolType::Terminal => { + let variable = &self.lexical_grammar.variables[symbol.index]; + if variable.kind == VariableType::Named { + variable.name.clone() + } else { + format!("\"{}\"", &variable.name) + } + } + } + } +} + pub(crate) fn build_tables( syntax_grammar: &SyntaxGrammar, lexical_grammar: &LexicalGrammar, - simple_aliases: &AliasMap + simple_aliases: &AliasMap, ) -> Result<(ParseTable, LexTable, LexTable, Option)> { - unimplemented!(); + ParseTableBuilder { + syntax_grammar, + lexical_grammar, + simple_aliases, + item_set_builder: ParseItemSetBuilder::new(syntax_grammar, lexical_grammar), + state_ids_by_item_set: HashMap::new(), + item_sets_by_state_id: Vec::new(), + parse_state_queue: VecDeque::new(), + parse_table: ParseTable { + states: Vec::new(), + alias_sequences: Vec::new(), + symbols: Vec::new(), + }, + } + .build() } diff --git a/src/error.rs b/src/error.rs index 49064c22..b03efa93 100644 --- a/src/error.rs +++ b/src/error.rs @@ -3,6 +3,7 @@ pub enum Error { GrammarError(String), SymbolError(String), RegexError(String), + ConflictError(String), } pub type Result = std::result::Result; diff --git a/src/generate.rs b/src/generate.rs index 4507fb6f..dc3d5176 100644 --- a/src/generate.rs +++ b/src/generate.rs @@ -4,8 +4,8 @@ use crate::prepare_grammar::prepare_grammar; use crate::build_tables::build_tables; use crate::render::render_c_code; -pub fn generate_parser_for_grammar(input: String) -> Result { - let input_grammar = parse_grammar(&input)?; +pub fn generate_parser_for_grammar(input: &str) -> Result { + let input_grammar = parse_grammar(input)?; let (syntax_grammar, lexical_grammar, simple_aliases) = prepare_grammar(&input_grammar)?; let (parse_table, main_lex_table, keyword_lex_table, keyword_capture_token) = 
build_tables( &syntax_grammar, diff --git a/src/grammars.rs b/src/grammars.rs index 8abdad24..7512ec03 100644 --- a/src/grammars.rs +++ b/src/grammars.rs @@ -38,7 +38,7 @@ pub(crate) struct LexicalVariable { pub start_state: u32, } -#[derive(Debug, PartialEq, Eq)] +#[derive(Debug, Default, PartialEq, Eq)] pub(crate) struct LexicalGrammar { pub nfa: Nfa, pub variables: Vec, @@ -112,6 +112,14 @@ impl Production { pub fn first_symbol(&self) -> Option { self.steps.first().map(|s| s.symbol.clone()) } + + pub fn last_precedence(&self) -> i32 { + self.steps.last().map(|s| s.precedence).unwrap_or(0) + } + + pub fn last_associativity(&self) -> Option { + self.steps.last().map(|s| s.associativity).unwrap_or(None) + } } impl Default for Production { @@ -137,3 +145,9 @@ impl Variable { Self { name: name.to_string(), kind: VariableType::Anonymous, rule } } } + +impl SyntaxVariable { + pub fn is_auxiliary(&self) -> bool { + self.kind == VariableType::Auxiliary + } +} diff --git a/src/js/dsl.js b/src/js/dsl.js new file mode 100644 index 00000000..ba3962cd --- /dev/null +++ b/src/js/dsl.js @@ -0,0 +1,334 @@ +const UNICODE_ESCAPE_PATTERN = /\\u([0-9a-f]{4})/gi; +const DELIMITER_ESCAPE_PATTERN = /\\\//g; + +function alias(rule, value) { + const result = { + type: "ALIAS", + content: normalize(rule), + named: false, + value: null + }; + + switch (value.constructor) { + case String: + result.named = false; + result.value = value; + return result; + case ReferenceError: + result.named = true; + result.value = value.symbol.name; + return result; + case Object: + if (typeof value.type === 'string' && value.type === 'SYMBOL') { + result.named = true; + result.value = value.name; + return result; + } + } + + throw new Error('Invalid alias value ' + value); +} + +function blank() { + return { + type: "BLANK" + }; +} + +function choice(...elements) { + return { + type: "CHOICE", + members: elements.map(normalize) + }; +} + +function optional(value) { + return choice(value, blank()); +} + +function prec(number, rule) { + if (rule == null) { + rule = number; + number = 0; + } + + return { + type: "PREC", + value: number, + content: normalize(rule) + }; +} + +prec.left = function(number, rule) { + if (rule == null) { + rule = number; + number = 0; + } + + return { + type: "PREC_LEFT", + value: number, + content: normalize(rule) + }; +} + +prec.right = function(number, rule) { + if (rule == null) { + rule = number; + number = 0; + } + + return { + type: "PREC_RIGHT", + value: number, + content: normalize(rule) + }; +} + +prec.dynamic = function(number, rule) { + return { + type: "PREC_DYNAMIC", + value: number, + content: normalize(rule) + }; +} + +function repeat(rule) { + return { + type: "REPEAT", + content: normalize(rule) + }; +} + +function repeat1(rule) { + return { + type: "REPEAT1", + content: normalize(rule) + }; +} + +function seq(...elements) { + return { + type: "SEQ", + members: elements.map(normalize) + }; +} + +function sym(name) { + return { + type: "SYMBOL", + name: name + }; +} + +function token(value) { + return { + type: "TOKEN", + content: normalize(value) + }; +} + +token.immediate = function(value) { + return { + type: "IMMEDIATE_TOKEN", + content: normalize(value) + }; +} + +function normalize(value) { + + if (typeof value == "undefined") + throw new Error("Undefined symbol"); + + switch (value.constructor) { + case String: + return { + type: 'STRING', + value + }; + case RegExp: + return { + type: 'PATTERN', + value: value.source + .replace( + DELIMITER_ESCAPE_PATTERN, + '/' + ) + .replace( + 
UNICODE_ESCAPE_PATTERN, + (match, group) => String.fromCharCode(parseInt(group, 16)) + ) + }; + case ReferenceError: + throw value + default: + if (typeof value.type === 'string') { + return value; + } else { + throw new TypeError("Invalid rule: " + value.toString()); + } + } +} + +function RuleBuilder(ruleMap) { + return new Proxy({}, { + get(target, propertyName) { + const symbol = { + type: 'SYMBOL', + name: propertyName + }; + + if (!ruleMap || ruleMap.hasOwnProperty(propertyName)) { + return symbol; + } else { + const error = new ReferenceError(`Undefined symbol '${propertyName}'`); + error.symbol = symbol; + return error; + } + } + }) +} + +function grammar(baseGrammar, options) { + if (!options) { + options = baseGrammar; + baseGrammar = { + name: null, + rules: {}, + extras: [normalize(/\s/)], + conflicts: [], + externals: [], + inline: [] + }; + } + + let externals = baseGrammar.externals; + if (options.externals) { + if (typeof options.externals !== "function") { + throw new Error("Grammar's 'externals' property must be a function."); + } + + const externalsRuleBuilder = RuleBuilder(null) + const externalRules = options.externals.call(externalsRuleBuilder, externalsRuleBuilder, baseGrammar.externals); + + if (!Array.isArray(externalRules)) { + throw new Error("Grammar's 'externals' property must return an array of rules."); + } + + externals = externalRules.map(normalize); + } + + const ruleMap = {}; + for (const key in options.rules) { + ruleMap[key] = true; + } + for (const key in baseGrammar.rules) { + ruleMap[key] = true; + } + for (const external of externals) { + if (typeof external.name === 'string') { + ruleMap[external.name] = true; + } + } + + const ruleBuilder = RuleBuilder(ruleMap); + + const name = options.name; + if (typeof name !== "string") { + throw new Error("Grammar's 'name' property must be a string."); + } + + if (!/^[a-zA-Z_]\w*$/.test(name)) { + throw new Error("Grammar's 'name' property must not start with a digit and cannot contain non-word characters."); + } + + let rules = Object.assign({}, baseGrammar.rules); + if (options.rules) { + if (typeof options.rules !== "object") { + throw new Error("Grammar's 'rules' property must be an object."); + } + + for (const ruleName in options.rules) { + const ruleFn = options.rules[ruleName]; + if (typeof ruleFn !== "function") { + throw new Error("Grammar rules must all be functions. 
'" + ruleName + "' rule is not."); + } + rules[ruleName] = normalize(ruleFn.call(ruleBuilder, ruleBuilder, baseGrammar.rules[ruleName])); + } + } + + let extras = baseGrammar.extras.slice(); + if (options.extras) { + if (typeof options.extras !== "function") { + throw new Error("Grammar's 'extras' property must be a function."); + } + + extras = options.extras + .call(ruleBuilder, ruleBuilder, baseGrammar.extras) + .map(normalize); + } + + let word = baseGrammar.word; + if (options.word) { + word = options.word.call(ruleBuilder, ruleBuilder).name; + if (typeof word != 'string') { + throw new Error("Grammar's 'word' property must be a named rule."); + } + } + + let conflicts = baseGrammar.conflicts; + if (options.conflicts) { + if (typeof options.conflicts !== "function") { + throw new Error("Grammar's 'conflicts' property must be a function."); + } + + const baseConflictRules = baseGrammar.conflicts.map(conflict => conflict.map(sym)); + const conflictRules = options.conflicts.call(ruleBuilder, ruleBuilder, baseConflictRules); + + if (!Array.isArray(conflictRules)) { + throw new Error("Grammar's conflicts must be an array of arrays of rules."); + } + + conflicts = conflictRules.map(conflictSet => { + if (!Array.isArray(conflictSet)) { + throw new Error("Grammar's conflicts must be an array of arrays of rules."); + } + + return conflictSet.map(symbol => symbol.name); + }); + } + + let inline = baseGrammar.inline; + if (options.inline) { + if (typeof options.inline !== "function") { + throw new Error("Grammar's 'inline' property must be a function."); + } + + const baseInlineRules = baseGrammar.inline.map(sym); + const inlineRules = options.inline.call(ruleBuilder, ruleBuilder, baseInlineRules); + + if (!Array.isArray(inlineRules)) { + throw new Error("Grammar's inline must be an array of rules."); + } + + inline = inlineRules.map(symbol => symbol.name); + } + + if (Object.keys(rules).length == 0) { + throw new Error("Grammar must have at least one rule."); + } + + return {name, word, rules, extras, conflicts, externals, inline}; + } + +global.alias = alias; +global.blank = blank; +global.choice = choice; +global.optional = optional; +global.prec = prec; +global.repeat = repeat; +global.repeat1 = repeat1; +global.seq = seq; +global.sym = sym; +global.token = token; +global.grammar = grammar; diff --git a/src/main.rs b/src/main.rs index 9dc9efb2..c7ca2ca5 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,8 +1,15 @@ -use clap::{App, Arg, SubCommand}; +#[macro_use] +extern crate serde_derive; +#[macro_use] +extern crate serde_json; +#[macro_use] +extern crate lazy_static; -#[macro_use] extern crate serde_derive; -#[macro_use] extern crate serde_json; -#[macro_use] extern crate lazy_static; +use std::path::PathBuf; +use clap::{App, Arg, SubCommand}; +use std::env; +use std::io::Write; +use std::process::{Command, Stdio}; mod build_tables; mod error; @@ -20,25 +27,59 @@ fn main() -> error::Result<()> { .version("0.1") .author("Max Brunsfeld ") .about("Generates and tests parsers") + .subcommand(SubCommand::with_name("generate").about("Generate a parser")) .subcommand( - SubCommand::with_name("generate") - .about("Generate a parser") - ).subcommand( SubCommand::with_name("parse") .about("Parse a file") - .arg(Arg::with_name("path").index(1)) - ).subcommand( + .arg(Arg::with_name("path").index(1)), + ) + .subcommand( SubCommand::with_name("test") .about("Run a parser's tests") .arg(Arg::with_name("path").index(1).required(true)) .arg(Arg::with_name("line").index(2).required(true)) - 
.arg(Arg::with_name("column").index(3).required(true)) - ).get_matches(); + .arg(Arg::with_name("column").index(3).required(true)), + ) + .get_matches(); if let Some(matches) = matches.subcommand_matches("generate") { - let code = generate::generate_parser_for_grammar(String::new())?; + let mut grammar_path = env::current_dir().expect("Failed to read CWD"); + grammar_path.push("grammar.js"); + let grammar_json = load_js_grammar_file(grammar_path); + let code = generate::generate_parser_for_grammar(&grammar_json)?; println!("{}", code); } Ok(()) } + +fn load_js_grammar_file(grammar_path: PathBuf) -> String { + let mut node_process = Command::new("node") + .stdin(Stdio::piped()) + .stdout(Stdio::piped()) + .spawn() + .expect("Failed to run `node`"); + + let js_prelude = include_str!("./js/dsl.js"); + let mut node_stdin = node_process + .stdin + .take() + .expect("Failed to open stdin for node"); + write!( + node_stdin, + "{}\nconsole.log(JSON.stringify(require(\"{}\"), null, 2));\n", + js_prelude, + grammar_path.to_str().unwrap() + ).expect("Failed to write to node's stdin"); + drop(node_stdin); + let output = node_process + .wait_with_output() + .expect("Failed to read output from node"); + match output.status.code() { + None => panic!("Node process was killed"), + Some(0) => {} + Some(code) => panic!(format!("Node process exited with status {}", code)), + } + + String::from_utf8(output.stdout).expect("Got invalid UTF8 from node") +} diff --git a/src/nfa.rs b/src/nfa.rs index bc084ede..f6acb67a 100644 --- a/src/nfa.rs +++ b/src/nfa.rs @@ -23,6 +23,12 @@ pub struct Nfa { pub states: Vec } +impl Default for Nfa { + fn default() -> Self { + Self { states: Vec::new() } + } +} + #[derive(Debug)] pub struct NfaCursor<'a> { pub(crate) state_ids: Vec, diff --git a/src/prepare_grammar/extract_simple_aliases.rs b/src/prepare_grammar/extract_simple_aliases.rs index 8b87ea2e..ff7204a0 100644 --- a/src/prepare_grammar/extract_simple_aliases.rs +++ b/src/prepare_grammar/extract_simple_aliases.rs @@ -22,6 +22,7 @@ pub(super) fn extract_simple_aliases( Symbol { kind: SymbolType::External, index} => &mut external_status_list[index], Symbol { kind: SymbolType::NonTerminal, index} => &mut non_terminal_status_list[index], Symbol { kind: SymbolType::Terminal, index} => &mut terminal_status_list[index], + Symbol { kind: SymbolType::End, .. } => panic!("Unexpected end token"), }; if step.alias.is_none() { @@ -49,6 +50,7 @@ pub(super) fn extract_simple_aliases( Symbol { kind: SymbolType::External, index} => &external_status_list[index], Symbol { kind: SymbolType::NonTerminal, index} => &non_terminal_status_list[index], Symbol { kind: SymbolType::Terminal, index} => &terminal_status_list[index], + Symbol { kind: SymbolType::End, .. 
} => panic!("Unexpected end token"), }; if status.alias.is_some() { diff --git a/src/prepare_grammar/extract_tokens.rs b/src/prepare_grammar/extract_tokens.rs index d53555af..eaeede90 100644 --- a/src/prepare_grammar/extract_tokens.rs +++ b/src/prepare_grammar/extract_tokens.rs @@ -67,10 +67,13 @@ pub(super) fn extract_tokens( .expected_conflicts .into_iter() .map(|conflict| { - conflict + let mut result: Vec<_> = conflict .iter() .map(|symbol| symbol_replacer.replace_symbol(*symbol)) - .collect() + .collect(); + result.sort_unstable(); + result.dedup(); + result }) .collect(); diff --git a/src/render/mod.rs b/src/render/mod.rs index 5bd11a34..2ca610a6 100644 --- a/src/render/mod.rs +++ b/src/render/mod.rs @@ -1,6 +1,188 @@ -use crate::rules::{Symbol, AliasMap}; -use crate::grammars::{SyntaxGrammar, LexicalGrammar}; -use crate::tables::{ParseTable, LexTable}; +use crate::grammars::{LexicalGrammar, SyntaxGrammar, VariableType}; +use crate::rules::{Alias, AliasMap, Symbol, SymbolType}; +use crate::tables::{LexTable, ParseTable, ParseTableEntry}; +use std::collections::{HashMap, HashSet}; +use std::fmt::Write; + +macro_rules! add_line { + ($this: tt, $($arg: tt)*) => { + for _ in 0..$this.indent_level { + write!(&mut $this.buffer, " ").unwrap(); + } + $this.buffer.write_fmt(format_args!($($arg)*)).unwrap(); + $this.buffer += "\n"; + } +} + +struct Generator { + buffer: String, + indent_level: usize, + + language_name: String, + parse_table: ParseTable, + main_lex_table: LexTable, + keyword_lex_table: LexTable, + keyword_capture_token: Option, + syntax_grammar: SyntaxGrammar, + lexical_grammar: LexicalGrammar, + simple_aliases: AliasMap, + symbol_ids: HashMap, + parse_table_entries: Vec<(usize, ParseTableEntry)>, + next_parse_action_list_index: usize, + unique_aliases: HashSet, +} + +impl Generator { + fn generate(mut self) -> String { + self.add_includes(); + self.add_pragmas(); + self.add_stats(); + self.add_symbol_enum(); + self.add_symbol_names_list(); + self.buffer + } + + fn add_includes(&mut self) { + add_line!(self, "#include "); + add_line!(self, ""); + } + + fn add_pragmas(&mut self) { + add_line!(self, "#if defined(__GNUC__) || defined(__clang__)"); + add_line!(self, "#pragma GCC diagnostic push"); + add_line!(self, "#pragma GCC diagnostic ignored \"-Wmissing-field-initializers\""); + add_line!(self, "#endif"); + add_line!(self, ""); + + // Compiling large lexer functions can be very slow, especially when + // using Visual Studio on Windows. Disabling optimizations is not + // ideal, but only a very small fraction of overall parse time is + // spent lexing, so the performance impact of this is pretty small. 
+        if self.main_lex_table.states.len() > 500 {
+            add_line!(self, "#ifdef _MSC_VER");
+            add_line!(self, "#pragma optimize(\"\", off)");
+            add_line!(self, "#endif");
+            add_line!(self, "");
+        }
+    }
+
+    fn add_stats(&mut self) {
+        let mut token_count = 0;
+
+        for symbol in &self.parse_table.symbols {
+            if symbol.is_terminal() {
+                token_count += 1;
+            } else if symbol.is_external() {
+                let external_token = &self.syntax_grammar.external_tokens[symbol.index];
+                if external_token.corresponding_internal_token.is_none() {
+                    token_count += 1;
+                }
+            }
+        }
+
+        for alias_sequence in &self.parse_table.alias_sequences {
+            for entry in alias_sequence {
+                if let Some(alias) = entry {
+                    self.unique_aliases.insert(alias.clone());
+                }
+            }
+        }
+
+        let mut symbol_id_values = HashSet::new();
+        for i in 0..self.parse_table.symbols.len() {
+            self.assign_symbol_id(self.parse_table.symbols[i], &mut symbol_id_values);
+        }
+
+        add_line!(self, "#define LANGUAGE_VERSION {}", 6);
+        add_line!(self, "#define STATE_COUNT {}", self.parse_table.states.len());
+        add_line!(self, "#define SYMBOL_COUNT {}", self.parse_table.symbols.len());
+        add_line!(self, "#define ALIAS_COUNT {}", self.unique_aliases.len());
+        add_line!(self, "#define TOKEN_COUNT {}", token_count);
+        add_line!(self, "#define EXTERNAL_TOKEN_COUNT {}", self.syntax_grammar.external_tokens.len());
+        // add_line!(self, "#define MAX_ALIAS_SEQUENCE_LENGTH {}\n", self.parse_table.max_alias_sequence_length);
+        add_line!(self, "");
+    }
+
+    fn add_symbol_enum(&mut self) {
+        add_line!(self, "enum {{");
+        self.indent();
+        for i in 0..self.parse_table.symbols.len() {
+            let symbol = self.parse_table.symbols[i];
+            if symbol != Symbol::end() {
+                add_line!(self, "{} = {}", self.symbol_ids[&symbol], i);
+            }
+        }
+        self.dedent();
+        add_line!(self, "}};");
+        add_line!(self, "");
+    }
+
+    fn add_symbol_names_list(&mut self) {
+        add_line!(self, "static const char *ts_symbol_names[] = {{");
+        self.indent();
+        self.dedent();
+        add_line!(self, "}};");
+        add_line!(self, "");
+    }
+
+    fn assign_symbol_id(&mut self, symbol: Symbol, used_ids: &mut HashSet<String>) {
+        let mut id;
+        if symbol == Symbol::end() {
+            id = "ts_builtin_sym_end".to_string();
+        } else {
+            let (name, kind) = self.metadata_for_symbol(symbol);
+            id = match kind {
+                VariableType::Auxiliary => format!("aux_sym_{}", self.sanitize_name(name)),
+                VariableType::Anonymous => format!("anon_sym_{}", self.sanitize_name(name)),
+                VariableType::Hidden | VariableType::Named => {
+                    format!("sym_{}", self.sanitize_name(name))
+                }
+            };
+
+            let mut suffix_number = 1;
+            let mut suffix = String::new();
+            while used_ids.contains(&id) {
+                id.drain(id.len() - suffix.len()..);
+                suffix_number += 1;
+                suffix = suffix_number.to_string();
+                id += &suffix;
+            }
+        }
+
+        used_ids.insert(id.clone());
+        self.symbol_ids.insert(symbol, id);
+    }
+
+    fn metadata_for_symbol(&self, symbol: Symbol) -> (&str, VariableType) {
+        match symbol.kind {
+            SymbolType::End => ("end", VariableType::Auxiliary),
+            SymbolType::NonTerminal => {
+                let variable = &self.syntax_grammar.variables[symbol.index];
+                (&variable.name, variable.kind)
+            }
+            SymbolType::Terminal => {
+                let variable = &self.lexical_grammar.variables[symbol.index];
+                (&variable.name, variable.kind)
+            }
+            SymbolType::External => {
+                let token = &self.syntax_grammar.external_tokens[symbol.index];
+                (&token.name, token.kind)
+            }
+        }
+    }
+
+    fn sanitize_name(&self, name: &str) -> String {
+        name.to_string()
+    }
+
+    fn indent(&mut self) {
+        self.indent_level += 1;
+    }
+
+    fn dedent(&mut self) {
+        self.indent_level -= 1;
+    }
+}
 
 pub(crate) fn render_c_code(
     name: &str,
@@ -12,5 +194,21 @@ pub(crate) fn render_c_code(
     lexical_grammar: LexicalGrammar,
     simple_aliases: AliasMap,
 ) -> String {
-    unimplemented!();
+    Generator {
+        buffer: String::new(),
+        indent_level: 0,
+        language_name: name.to_string(),
+        parse_table,
+        main_lex_table,
+        keyword_lex_table,
+        keyword_capture_token,
+        syntax_grammar,
+        lexical_grammar,
+        simple_aliases,
+        symbol_ids: HashMap::new(),
+        parse_table_entries: Vec::new(),
+        next_parse_action_list_index: 0,
+        unique_aliases: HashSet::new(),
+    }
+    .generate()
 }
diff --git a/src/rules.rs b/src/rules.rs
index 9374a283..34f4c8b9 100644
--- a/src/rules.rs
+++ b/src/rules.rs
@@ -1,10 +1,11 @@
 use std::collections::HashMap;
 
-#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
+#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)]
 pub(crate) enum SymbolType {
     External,
     Terminal,
     NonTerminal,
+    End,
 }
 
 #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
@@ -33,7 +34,7 @@ pub(crate) struct MetadataParams {
     pub alias: Option<Alias>,
 }
 
-#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
+#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)]
 pub(crate) struct Symbol {
     pub kind: SymbolType,
     pub index: usize,
@@ -56,6 +57,15 @@ pub(crate) enum Rule {
 }
 
 impl Rule {
+    pub fn alias(content: Rule, value: String, is_named: bool) -> Self {
+        add_metadata(content, move |params| {
+            params.alias = Some(Alias {
+                is_named,
+                value
+            });
+        })
+    }
+
     pub fn token(content: Rule) -> Self {
         add_metadata(content, |params| {
             params.is_token = true;
@@ -169,6 +179,13 @@ impl Symbol {
             index,
         }
     }
+
+    pub fn end() -> Self {
+        Symbol {
+            kind: SymbolType::End,
+            index: 0,
+        }
+    }
 }
 
 impl From<Symbol> for Rule {
@@ -177,7 +194,7 @@ impl From<Symbol> for Rule {
     }
 }
 
-fn add_metadata<T: Fn(&mut MetadataParams)>(input: Rule, f: T) -> Rule {
+fn add_metadata<T: FnOnce(&mut MetadataParams)>(input: Rule, f: T) -> Rule {
     match input {
         Rule::Metadata { rule, mut params } => {
             f(&mut params);
diff --git a/src/tables.rs b/src/tables.rs
index de66253c..9100b81e 100644
--- a/src/tables.rs
+++ b/src/tables.rs
@@ -6,20 +6,13 @@ pub(crate) type AliasSequenceId = usize;
 pub(crate) type ParseStateId = usize;
 pub(crate) type LexStateId = usize;
 
-#[derive(Clone, Copy, Debug, PartialEq, Eq)]
-pub(crate) enum ParseActionType {
-    Error,
-    Shift,
-    Reduce,
-    Accept,
-    Recover,
-}
-
 #[derive(Clone, Copy, Debug, PartialEq, Eq)]
 pub(crate) enum ParseAction {
     Accept,
-    Error,
-    Shift(ParseStateId),
+    Shift {
+        state: ParseStateId,
+        is_repetition: bool,
+    },
     ShiftExtra,
     Recover,
     Reduce {
@@ -28,50 +21,69 @@ pub(crate) enum ParseAction {
         precedence: i32,
         dynamic_precedence: i32,
         associativity: Option<Associativity>,
-        alias_sequence_id: Option<AliasSequenceId>,
-        is_repetition: bool,
+        alias_sequence_id: AliasSequenceId,
     }
 }
 
 #[derive(Clone, Debug, PartialEq, Eq)]
 pub(crate) struct ParseTableEntry {
-    actions: Vec<ParseAction>,
-    reusable: bool,
+    pub actions: Vec<ParseAction>,
+    pub reusable: bool,
 }
 
 #[derive(Clone, Debug, PartialEq, Eq)]
 pub(crate) struct ParseState {
-    terminal_entries: HashMap<Symbol, ParseTableEntry>,
-    nonterminal_entries: HashMap<Symbol, ParseStateId>
+    pub terminal_entries: HashMap<Symbol, ParseTableEntry>,
+    pub nonterminal_entries: HashMap<Symbol, ParseStateId>
 }
 
 #[derive(Debug, PartialEq, Eq)]
 pub(crate) struct ParseTable {
-    states: Vec<ParseState>,
-    alias_sequences: Vec<Vec<Alias>>,
+    pub states: Vec<ParseState>,
+    pub symbols: Vec<Symbol>,
+    pub alias_sequences: Vec<Vec<Option<Alias>>>,
 }
 
 #[derive(Clone, Debug, PartialEq, Eq)]
 pub(crate) struct AdvanceAction {
-    state: LexStateId,
-    precedence: Range<i32>,
-    in_main_token: bool,
+    pub state: LexStateId,
+    pub precedence: Range<i32>,
+    pub in_main_token: bool,
 }
 
 #[derive(Clone, Debug, PartialEq, Eq)]
 pub(crate) struct AcceptTokenAction {
-    symbol: Symbol,
-    precedence: i32,
-    implicit_precedence: i32,
+    pub symbol: Symbol,
+    pub precedence: i32,
+    pub implicit_precedence: i32,
 }
 
 #[derive(Clone, Debug, PartialEq, Eq)]
 pub(crate) struct LexState {
-    advance_actions: HashMap,
-    accept_action: Option<AcceptTokenAction>,
+    pub advance_actions: HashMap,
+    pub accept_action: Option<AcceptTokenAction>,
 }
 
 #[derive(Debug, PartialEq, Eq)]
 pub(crate) struct LexTable {
-    states: Vec<LexState>,
+    pub states: Vec<LexState>,
+}
+
+impl ParseTableEntry {
+    pub fn new() -> Self {
+        Self {
+            reusable: true,
+            actions: Vec::new(),
+        }
+    }
+}
+
+impl ParseAction {
+    pub fn precedence(&self) -> i32 {
+        if let ParseAction::Reduce { precedence, .. } = self {
+            *precedence
+        } else {
+            0
+        }
+    }
+}

From 261a7fd07347b20ad500b58ac3d1dbf96990da81 Mon Sep 17 00:00:00 2001
From: Max Brunsfeld <maxbrunsfeld@gmail.com>
Date: Fri, 21 Dec 2018 15:02:48 -0800
Subject: [PATCH 074/102] Represent ParseItem with reference to Production

Implement comparisons in a way that disregards past steps.
---
 src/build_tables/inline_variables.rs   | 441 -----------------------
 src/build_tables/item.rs               | 315 ++++++++--------
 src/build_tables/item_set_builder.rs   |  66 ++--
 src/build_tables/mod.rs                | 120 +++----
 src/generate.rs                        |   5 +-
 src/grammars.rs                        |  78 +++-
 src/prepare_grammar/mod.rs             |  16 +-
 src/prepare_grammar/process_inlines.rs | 477 +++++++++++++++++++++++++
 src/rules.rs                           |   4 +-
 9 files changed, 803 insertions(+), 719 deletions(-)
 delete mode 100644 src/build_tables/inline_variables.rs
 create mode 100644 src/prepare_grammar/process_inlines.rs

diff --git a/src/build_tables/inline_variables.rs b/src/build_tables/inline_variables.rs
deleted file mode 100644
index affbe163..00000000
--- a/src/build_tables/inline_variables.rs
+++ /dev/null
@@ -1,441 +0,0 @@
-use super::item::ParseItem;
-use crate::grammars::{Production, SyntaxGrammar};
-use std::collections::HashMap;
-
-pub(crate) struct InlinedProductionMap {
-    pub inlined_productions: Vec<Production>,
-    item_map: HashMap<ParseItem, Vec<u32>>,
-}
-
-impl InlinedProductionMap {
-    pub fn new(grammar: &SyntaxGrammar) -> Self {
-        let mut result = Self {
-            inlined_productions: Vec::new(),
-            item_map: HashMap::new(),
-        };
-
-        let mut items_to_process = Vec::new();
-        for (variable_index, variable) in grammar.variables.iter().enumerate() {
-            for production_index in 0..variable.productions.len() {
-                items_to_process.push(ParseItem::Normal {
-                    variable_index: variable_index as u32,
-                    production_index: production_index as u32,
-                    step_index: 0,
-                });
-                while !items_to_process.is_empty() {
-                    let mut i = 0;
-                    while i < items_to_process.len() {
-                        let item = &items_to_process[i];
-                        if let Some(step) = item.step(grammar, &result) {
-                            if grammar.variables_to_inline.contains(&step.symbol) {
-                                let inlined_items = result
-                                    .inline(*item, grammar)
-                                    .into_iter()
-                                    .map(|production_index| ParseItem::Inlined {
-                                        variable_index: item.variable_index(),
-                                        production_index: *production_index,
-                                        step_index: item.step_index() as u32,
-                                    })
-                                    .collect::<Vec<_>>();
-                                items_to_process.splice(i..i + 1, inlined_items);
-                            } else {
-                                items_to_process[i] = item.successor();
-                                i += 1;
-                            }
-                        } else {
-                            items_to_process.remove(i);
-                        }
-                    }
-                }
-            }
-        }
-
-        result
-    }
-
-    pub fn inlined_items<'a>(
-        &'a self,
-        item: ParseItem,
-    ) -> Option<impl Iterator<Item = ParseItem> + 'a> {
-        self.item_map.get(&item).map(|production_indices| {
-            production_indices
-                .iter()
-                .cloned()
-                .map(move |production_index| ParseItem::Inlined {
-                    variable_index: item.variable_index(),
-                    production_index,
-                    step_index: item.step_index() as u32,
-                })
-        })
-    }
-
-    fn inline(&mut self, item: ParseItem, grammar: &SyntaxGrammar) -> &Vec<u32> {
-        let step_index =
item.step_index(); - let mut productions_to_add = grammar.variables - [item.step(grammar, self).unwrap().symbol.index] - .productions - .clone(); - - let mut i = 0; - while i < productions_to_add.len() { - if let Some(first_symbol) = productions_to_add[i].first_symbol() { - if grammar.variables_to_inline.contains(&first_symbol) { - // Remove the production from the vector, replacing it with a placeholder. - let production = productions_to_add - .splice(i..i + 1, [Production::default()].iter().cloned()) - .next() - .unwrap(); - - // Replace the placeholder with the inlined productions. - productions_to_add.splice( - i..i + 1, - grammar.variables[first_symbol.index] - .productions - .iter() - .map(|p| { - let mut p = p.clone(); - p.steps.extend(production.steps[1..].iter().cloned()); - p - }), - ); - continue; - } - } - i += 1; - } - - let result = productions_to_add - .into_iter() - .map(|production_to_add| { - let mut inlined_production = item.production(grammar, &self).clone(); - let removed_step = inlined_production - .steps - .splice( - step_index..step_index + 1, - production_to_add.steps.iter().cloned(), - ) - .next() - .unwrap(); - let inserted_steps = &mut inlined_production.steps - [step_index..step_index + production_to_add.steps.len()]; - if let Some(alias) = removed_step.alias { - for inserted_step in inserted_steps.iter_mut() { - inserted_step.alias = Some(alias.clone()); - } - } - if let Some(last_inserted_step) = inserted_steps.last_mut() { - last_inserted_step.precedence = removed_step.precedence; - last_inserted_step.associativity = removed_step.associativity; - } - self.inlined_productions - .iter() - .position(|p| *p == inlined_production) - .unwrap_or({ - self.inlined_productions.push(inlined_production); - self.inlined_productions.len() - 1 - }) as u32 - }) - .collect(); - - self.item_map.entry(item).or_insert(result) - } -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::grammars::{LexicalGrammar, ProductionStep, SyntaxVariable, VariableType}; - use crate::rules::{Alias, Associativity, Symbol}; - use std::borrow::Borrow; - - #[test] - fn test_basic_inlining() { - let grammar = SyntaxGrammar { - expected_conflicts: Vec::new(), - extra_tokens: Vec::new(), - external_tokens: Vec::new(), - word_token: None, - variables_to_inline: vec![Symbol::non_terminal(1)], - variables: vec![ - SyntaxVariable { - name: "non-terminal-0".to_string(), - kind: VariableType::Named, - productions: vec![Production { - dynamic_precedence: 0, - steps: vec![ - ProductionStep::new(Symbol::terminal(10)), - ProductionStep::new(Symbol::non_terminal(1)), // inlined - ProductionStep::new(Symbol::terminal(11)), - ], - }], - }, - SyntaxVariable { - name: "non-terminal-1".to_string(), - kind: VariableType::Named, - productions: vec![ - Production { - dynamic_precedence: 0, - steps: vec![ - ProductionStep::new(Symbol::terminal(12)), - ProductionStep::new(Symbol::terminal(13)), - ], - }, - Production { - dynamic_precedence: 0, - steps: vec![ProductionStep::new(Symbol::terminal(14))], - }, - ], - }, - ], - }; - - let inline_map = InlinedProductionMap::new(&grammar); - - // Nothing to inline at step 0. - assert!(inline_map - .inlined_items(ParseItem::Normal { - variable_index: 0, - production_index: 0, - step_index: 0 - }) - .is_none()); - - // Inlining variable 1 yields two productions. 
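An aside to make the deleted test above concrete: inlining a nonterminal splices each of that nonterminal's productions into the outer production in place of the inlined step, so the assertion that follows expects two expanded productions. Below is a minimal, runnable sketch of that splice, using plain strings rather than the crate's ParseItem and Production types (inline_at and all the values here are illustrative stand-ins, not code from this repository):

// Splice each of `inner`'s productions into `outer` at `index`, yielding
// one expanded production per inner production.
fn inline_at(outer: &[&str], index: usize, inner: &[Vec<&str>]) -> Vec<Vec<String>> {
    inner
        .iter()
        .map(|inner_steps| {
            let mut result: Vec<String> = outer[..index].iter().map(|s| s.to_string()).collect();
            result.extend(inner_steps.iter().map(|s| s.to_string()));
            result.extend(outer[index + 1..].iter().map(|s| s.to_string()));
            result
        })
        .collect()
}

fn main() {
    // Mirrors the test: non-terminal-0 -> terminal-10 non-terminal-1 terminal-11,
    // where non-terminal-1 has productions (terminal-12 terminal-13) and (terminal-14).
    let expanded = inline_at(
        &["terminal-10", "non-terminal-1", "terminal-11"],
        1,
        &[vec!["terminal-12", "terminal-13"], vec!["terminal-14"]],
    );
    assert_eq!(expanded[0], ["terminal-10", "terminal-12", "terminal-13", "terminal-11"]);
    assert_eq!(expanded[1], ["terminal-10", "terminal-14", "terminal-11"]);
}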
- assert_eq!( - display_items( - inline_map - .inlined_items(ParseItem::Normal { - variable_index: 0, - production_index: 0, - step_index: 1 - }) - .unwrap(), - &grammar, - &inline_map - ), - vec![ - "non-terminal-0 → terminal-10 • terminal-12 terminal-13 terminal-11" - .to_string(), - "non-terminal-0 → terminal-10 • terminal-14 terminal-11".to_string(), - ] - ); - } - - #[test] - fn test_nested_inlining() { - let grammar = SyntaxGrammar { - variables: vec![ - SyntaxVariable { - name: "non-terminal-0".to_string(), - kind: VariableType::Named, - productions: vec![Production { - dynamic_precedence: 0, - steps: vec![ - ProductionStep::new(Symbol::terminal(10)), - ProductionStep::new(Symbol::non_terminal(1)), // inlined - ProductionStep::new(Symbol::terminal(11)), - ProductionStep::new(Symbol::non_terminal(2)), // inlined - ProductionStep::new(Symbol::terminal(12)), - ], - }], - }, - SyntaxVariable { - name: "non-terminal-1".to_string(), - kind: VariableType::Named, - productions: vec![ - Production { - dynamic_precedence: 0, - steps: vec![ProductionStep::new(Symbol::terminal(13))], - }, - Production { - dynamic_precedence: 0, - steps: vec![ - ProductionStep::new(Symbol::non_terminal(3)), // inlined - ProductionStep::new(Symbol::terminal(14)), - ], - }, - ], - }, - SyntaxVariable { - name: "non-terminal-2".to_string(), - kind: VariableType::Named, - productions: vec![Production { - dynamic_precedence: 0, - steps: vec![ProductionStep::new(Symbol::terminal(15))], - }], - }, - SyntaxVariable { - name: "non-terminal-3".to_string(), - kind: VariableType::Named, - productions: vec![Production { - dynamic_precedence: 0, - steps: vec![ProductionStep::new(Symbol::terminal(16))], - }], - }, - ], - variables_to_inline: vec![ - Symbol::non_terminal(1), - Symbol::non_terminal(2), - Symbol::non_terminal(3), - ], - expected_conflicts: Vec::new(), - extra_tokens: Vec::new(), - external_tokens: Vec::new(), - word_token: None, - }; - - let inline_map = InlinedProductionMap::new(&grammar); - - let items = inline_map - .inlined_items(ParseItem::Normal { - variable_index: 0, - production_index: 0, - step_index: 1, - }) - .unwrap() - .collect::>(); - - assert_eq!( - display_items(&items, &grammar, &inline_map), - vec![ - "non-terminal-0 → terminal-10 • terminal-13 terminal-11 non-terminal-2 terminal-12".to_string(), - "non-terminal-0 → terminal-10 • terminal-16 terminal-14 terminal-11 non-terminal-2 terminal-12".to_string() - ] - ); - - let item = items[0].successor().successor(); - assert_eq!( - display_items(&[item], &grammar, &inline_map), - vec![ - "non-terminal-0 → terminal-10 terminal-13 terminal-11 • non-terminal-2 terminal-12".to_string(), - ] - ); - - assert_eq!( - display_items(inline_map.inlined_items(item).unwrap(), &grammar, &inline_map), - vec![ - "non-terminal-0 → terminal-10 terminal-13 terminal-11 • terminal-15 terminal-12".to_string(), - ] - ); - } - - #[test] - fn test_inlining_with_precedence_and_alias() { - let grammar = SyntaxGrammar { - variables_to_inline: vec![Symbol::non_terminal(1), Symbol::non_terminal(2)], - variables: vec![ - SyntaxVariable { - name: "non-terminal-0".to_string(), - kind: VariableType::Named, - productions: vec![Production { - dynamic_precedence: 0, - steps: vec![ - ProductionStep::new(Symbol::non_terminal(1)) // inlined - .with_prec(1, Some(Associativity::Left)), - ProductionStep::new(Symbol::terminal(10)), - ProductionStep::new(Symbol::non_terminal(2)), // inlined - ], - }], - }, - SyntaxVariable { - name: "non-terminal-1".to_string(), - kind: VariableType::Named, - 
productions: vec![Production { - dynamic_precedence: 0, - steps: vec![ - ProductionStep::new(Symbol::terminal(11)) - .with_prec(2, None) - .with_alias("inner_alias", true), - ProductionStep::new(Symbol::terminal(12)).with_prec(3, None), - ], - }], - }, - SyntaxVariable { - name: "non-terminal-2".to_string(), - kind: VariableType::Named, - productions: vec![Production { - dynamic_precedence: 0, - steps: vec![ProductionStep::new(Symbol::terminal(13)) - .with_alias("outer_alias", true)], - }], - }, - ], - expected_conflicts: Vec::new(), - extra_tokens: Vec::new(), - external_tokens: Vec::new(), - word_token: None, - }; - - let inline_map = InlinedProductionMap::new(&grammar); - - let items = inline_map - .inlined_items(ParseItem::Normal { - variable_index: 0, - production_index: 0, - step_index: 0, - }) - .unwrap() - .collect::>(); - assert_eq!( - display_items(&items, &grammar, &inline_map)[0], - "non-terminal-0 → • terminal-11 terminal-12 terminal-10 non-terminal-2".to_string(), - ); - - // The first step in the inlined production retains its precedence and alias. - let item = items[0].successor(); - assert_eq!( - display_items(&[item], &grammar, &inline_map)[0], - "non-terminal-0 → terminal-11 • terminal-12 terminal-10 non-terminal-2".to_string(), - ); - assert_eq!(item.precedence(&grammar, &inline_map), 2); - assert_eq!( - items[0].step(&grammar, &inline_map).unwrap().alias, - Some(Alias { - value: "inner_alias".to_string(), - is_named: true, - }) - ); - - // The final terminal of the inlined production inherits the precedence of - // the inlined step. - let item = item.successor(); - assert_eq!( - display_items(&[item], &grammar, &inline_map)[0], - "non-terminal-0 → terminal-11 terminal-12 • terminal-10 non-terminal-2".to_string(), - ); - assert_eq!(item.precedence(&grammar, &inline_map), 1); - - let item = item.successor(); - assert_eq!( - display_items(&[item], &grammar, &inline_map)[0], - "non-terminal-0 → terminal-11 terminal-12 terminal-10 • non-terminal-2".to_string(), - ); - - // All steps of the inlined production inherit their alias from the - // inlined step. - let items = inline_map.inlined_items(item).unwrap().collect::>(); - assert_eq!( - display_items(&items, &grammar, &inline_map)[0], - "non-terminal-0 → terminal-11 terminal-12 terminal-10 • terminal-13".to_string(), - ); - assert_eq!( - items[0].step(&grammar, &inline_map).unwrap().alias, - Some(Alias { - value: "outer_alias".to_string(), - is_named: true, - }) - ) - } - - fn display_items( - items: impl IntoIterator>, - grammar: &SyntaxGrammar, - inline_map: &InlinedProductionMap, - ) -> Vec { - let lex = LexicalGrammar::default(); - items - .into_iter() - .map(|item| format!("{}", item.borrow().display_with(grammar, &lex, inline_map))) - .collect() - } -} diff --git a/src/build_tables/item.rs b/src/build_tables/item.rs index 9208f602..49ab4f27 100644 --- a/src/build_tables/item.rs +++ b/src/build_tables/item.rs @@ -1,10 +1,12 @@ -use super::inline_variables::InlinedProductionMap; use crate::grammars::{LexicalGrammar, Production, ProductionStep, SyntaxGrammar}; -use crate::rules::{Associativity, Symbol, SymbolType}; +use crate::rules::Associativity; +use crate::rules::{Symbol, SymbolType}; use smallbitvec::SmallBitVec; use std::collections::{HashMap, BTreeMap}; use std::fmt; use std::hash::{Hash, Hasher}; +use std::u32; +use std::cmp::Ordering; lazy_static! 
{ static ref START_PRODUCTION: Production = Production { @@ -28,49 +30,26 @@ pub(crate) struct LookaheadSet { eof: bool, } -#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)] -pub(crate) enum ParseItem { - Start { - step_index: u32, - }, - Normal { - variable_index: u32, - production_index: u32, - step_index: u32, - }, - Inlined { - variable_index: u32, - production_index: u32, - step_index: u32, - }, +#[derive(Clone, Copy, Debug)] +pub(crate) struct ParseItem<'a> { + pub variable_index: u32, + pub step_index: u32, + pub production: &'a Production, } #[derive(Clone, Debug, PartialEq, Eq)] -pub(crate) struct ParseItemSet { - pub entries: BTreeMap, +pub(crate) struct ParseItemSet<'a> { + pub entries: BTreeMap, LookaheadSet>, } -pub(crate) struct ParseItemDisplay<'a>( - &'a ParseItem, - &'a SyntaxGrammar, - &'a LexicalGrammar, - &'a InlinedProductionMap, -); - +pub(crate) struct ParseItemDisplay<'a>(&'a ParseItem<'a>, &'a SyntaxGrammar, &'a LexicalGrammar); pub(crate) struct LookaheadSetDisplay<'a>(&'a LookaheadSet, &'a SyntaxGrammar, &'a LexicalGrammar); - pub(crate) struct ParseItemSetDisplay<'a>( - &'a ParseItemSet, + &'a ParseItemSet<'a>, &'a SyntaxGrammar, &'a LexicalGrammar, - &'a InlinedProductionMap, ); -struct ParseItemSetMapEntry(ParseItemSet, u64); -pub(crate) struct ParseItemSetMap { - map: HashMap -} - impl LookaheadSet { pub fn new() -> Self { Self { @@ -173,152 +152,79 @@ impl LookaheadSet { } } -impl ParseItem { +impl<'a> ParseItem<'a> { pub fn start() -> Self { - ParseItem::Start { step_index: 0 } - } - - pub fn is_kernel(&self) -> bool { - match self { - ParseItem::Start { .. } => true, - ParseItem::Normal { step_index, .. } | ParseItem::Inlined { step_index, .. } => { - *step_index > 0 - } + ParseItem { + variable_index: u32::MAX, + production: &START_PRODUCTION, + step_index: 0, } } - pub fn production<'a>( - &self, - grammar: &'a SyntaxGrammar, - inlined_productions: &'a InlinedProductionMap, - ) -> &'a Production { - match self { - ParseItem::Start { .. } => &START_PRODUCTION, - ParseItem::Normal { - variable_index, - production_index, - .. - } => { - &grammar.variables[*variable_index as usize].productions[*production_index as usize] - } - ParseItem::Inlined { - production_index, .. 
- } => &inlined_productions.inlined_productions[*production_index as usize], + pub fn step(&self) -> Option<&'a ProductionStep> { + self.production.steps.get(self.step_index as usize) + } + + pub fn symbol(&self) -> Option { + self.step().map(|step| step.symbol) + } + + pub fn associativity(&self) -> Option { + self.prev_step().and_then(|step| step.associativity) + } + + pub fn precedence(&self) -> i32 { + self.prev_step().map_or(0, |step| step.precedence) + } + + pub fn prev_step(&self) -> Option<&'a ProductionStep> { + self.production.steps.get(self.step_index as usize - 1) + } + + pub fn is_done(&self) -> bool { + self.step_index as usize == self.production.steps.len() + } + + pub fn is_augmented(&self) -> bool { + self.variable_index == u32::MAX + } + + pub fn successor(&self) -> ParseItem<'a> { + ParseItem { + variable_index: self.variable_index, + production: self.production, + step_index: self.step_index + 1, } } - pub fn symbol( - &self, - grammar: &SyntaxGrammar, - inlined_productions: &InlinedProductionMap, - ) -> Option { - self.step(grammar, inlined_productions).map(|s| s.symbol) - } - - pub fn step<'a>( - &self, - grammar: &'a SyntaxGrammar, - inlined_productions: &'a InlinedProductionMap, - ) -> Option<&'a ProductionStep> { - self.production(grammar, inlined_productions) - .steps - .get(self.step_index()) - } - - pub fn precedence<'a>( - &self, - grammar: &'a SyntaxGrammar, - inlines: &'a InlinedProductionMap, - ) -> i32 { - self.production(grammar, inlines) - .steps - .get(self.step_index() - 1) - .map(|s| s.precedence) - .unwrap_or(0) - } - - pub fn associativity<'a>( - &self, - grammar: &'a SyntaxGrammar, - inlines: &'a InlinedProductionMap, - ) -> Option { - let production = self.production(grammar, inlines); - let step_index = self.step_index(); - if step_index == production.steps.len() { - production.steps.last().and_then(|s| s.associativity) - } else { - None - } - } - - pub fn variable_index(&self) -> u32 { - match self { - ParseItem::Start { .. } => panic!("Start item doesn't have a variable index"), - ParseItem::Normal { variable_index, .. } - | ParseItem::Inlined { variable_index, .. } => *variable_index, - } - } - - pub fn step_index(&self) -> usize { - match self { - ParseItem::Start { step_index } - | ParseItem::Normal { step_index, .. } - | ParseItem::Inlined { step_index, .. } => *step_index as usize, - } - } - - pub fn is_final(&self) -> bool { - if let ParseItem::Start { step_index: 1 } = self { - true - } else { - false - } - } - - fn step_index_mut(&mut self) -> &mut u32 { - match self { - ParseItem::Start { step_index } - | ParseItem::Normal { step_index, .. } - | ParseItem::Inlined { step_index, .. 
} => step_index, - } - } - - pub fn display_with<'a>( + pub fn display_with( &'a self, syntax_grammar: &'a SyntaxGrammar, lexical_grammar: &'a LexicalGrammar, - inlines: &'a InlinedProductionMap, ) -> ParseItemDisplay<'a> { - ParseItemDisplay(self, syntax_grammar, lexical_grammar, inlines) - } - - pub fn successor(&self) -> ParseItem { - let mut result = self.clone(); - *result.step_index_mut() += 1; - result + ParseItemDisplay(self, syntax_grammar, lexical_grammar) } } -impl ParseItemSet { - pub fn with<'a>(elements: impl IntoIterator) -> Self { +impl<'a> ParseItemSet<'a> { + pub fn with(elements: impl IntoIterator, LookaheadSet)>) -> Self { let mut result = Self::default(); for (item, lookaheads) in elements { - result.entries.insert(*item, lookaheads.clone()); + result.entries.insert(item, lookaheads); } result } - pub fn display_with<'a>( + pub fn display_with( &'a self, syntax_grammar: &'a SyntaxGrammar, lexical_grammar: &'a LexicalGrammar, - inlines: &'a InlinedProductionMap, ) -> ParseItemSetDisplay<'a> { - ParseItemSetDisplay(self, syntax_grammar, lexical_grammar, inlines) + ParseItemSetDisplay(self, syntax_grammar, lexical_grammar) } } -impl Default for ParseItemSet { +impl<'a> Default for ParseItemSet<'a> { fn default() -> Self { Self { entries: BTreeMap::new(), @@ -328,20 +234,18 @@ impl Default for ParseItemSet { impl<'a> fmt::Display for ParseItemDisplay<'a> { fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> { - if let ParseItem::Start { .. } = &self.0 { + if self.0.is_augmented() { write!(f, "START →")?; } else { write!( f, "{} →", - &self.1.variables[self.0.variable_index() as usize].name + &self.1.variables[self.0.variable_index as usize].name )?; } - let step_index = self.0.step_index(); - let production = self.0.production(self.1, self.3); - for (i, step) in production.steps.iter().enumerate() { - if i == step_index { + for (i, step) in self.0.production.steps.iter().enumerate() { + if i == self.0.step_index as usize { write!(f, " •")?; } @@ -359,7 +263,7 @@ impl<'a> fmt::Display for ParseItemDisplay<'a> { } } - if production.steps.len() == step_index { + if self.0.is_done() { write!(f, " •")?; } @@ -398,7 +302,7 @@ impl<'a> fmt::Display for ParseItemSetDisplay<'a> { writeln!( f, "{}\t{}", - item.display_with(self.1, self.2, self.3), + item.display_with(self.1, self.2), lookaheads.display_with(self.1, self.2) )?; } @@ -406,7 +310,94 @@ impl<'a> fmt::Display for ParseItemSetDisplay<'a> { } } -impl Hash for ParseItemSet { +impl<'a> Hash for ParseItem<'a> { + fn hash(&self, hasher: &mut H) { + hasher.write_u32(self.variable_index); + hasher.write_u32(self.step_index); + hasher.write_i32(self.production.dynamic_precedence); + hasher.write_usize(self.production.steps.len()); + hasher.write_i32(self.precedence()); + self.associativity().hash(hasher); + for step in &self.production.steps[0..self.step_index as usize] { + step.alias.hash(hasher); + } + for step in &self.production.steps[self.step_index as usize..] 
{ + step.hash(hasher); + } + } +} + +impl<'a> PartialEq for ParseItem<'a> { + fn eq(&self, other: &Self) -> bool { + if self.variable_index != other.variable_index + || self.step_index != other.step_index + || self.production.dynamic_precedence != other.production.dynamic_precedence + || self.production.steps.len() != other.production.steps.len() + || self.precedence() != other.precedence() + || self.associativity() != other.associativity() + { + return false; + } + + for (i, step) in self.production.steps.iter().enumerate() { + if i < self.step_index as usize { + if step.alias != other.production.steps[i].alias { + return false; + } + } else { + if *step != other.production.steps[i] { + return false; + } + } + } + + return true; + } +} + +impl<'a> PartialOrd for ParseItem<'a> { + fn partial_cmp(&self, other: &Self) -> Option { + if let Some(o) = self.variable_index.partial_cmp(&other.variable_index) { + return Some(o); + } + if let Some(o) = self.step_index.partial_cmp(&other.step_index) { + return Some(o); + } + if let Some(o) = self.production.dynamic_precedence.partial_cmp(&other.production.dynamic_precedence) { + return Some(o); + } + if let Some(o) = self.production.steps.len().partial_cmp(&other.production.steps.len()) { + return Some(o); + } + if let Some(o) = self.precedence().partial_cmp(&other.precedence()) { + return Some(o); + } + if let Some(o) = self.associativity().partial_cmp(&other.associativity()) { + return Some(o); + } + for (i, step) in self.production.steps.iter().enumerate() { + let cmp = if i < self.step_index as usize { + step.alias.partial_cmp(&other.production.steps[i].alias) + } else { + step.partial_cmp(&other.production.steps[i]) + }; + if let Some(o) = cmp { + return Some(o); + } + } + return None; + } +} + +impl<'a> Ord for ParseItem<'a> { + fn cmp(&self, other: &Self) -> Ordering { + self.partial_cmp(other).unwrap_or(Ordering::Equal) + } +} + +impl<'a> Eq for ParseItem<'a> {} + +impl<'a> Hash for ParseItemSet<'a> { fn hash(&self, hasher: &mut H) { hasher.write_usize(self.entries.len()); for (item, lookaheads) in self.entries.iter() { diff --git a/src/build_tables/item_set_builder.rs b/src/build_tables/item_set_builder.rs index 530c1f25..52ee0a45 100644 --- a/src/build_tables/item_set_builder.rs +++ b/src/build_tables/item_set_builder.rs @@ -1,12 +1,11 @@ -use super::inline_variables::InlinedProductionMap; use super::item::{LookaheadSet, ParseItem, ParseItemSet}; -use crate::grammars::{LexicalGrammar, SyntaxGrammar}; +use crate::grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar}; use crate::rules::Symbol; use std::collections::{HashMap, HashSet}; #[derive(Clone, Debug, PartialEq, Eq)] -struct TransitiveClosureAddition { - item: ParseItem, +struct TransitiveClosureAddition<'a> { + item: ParseItem<'a>, info: FollowSetInfo, } @@ -16,11 +15,10 @@ struct FollowSetInfo { propagates_lookaheads: bool, } -pub(crate) struct ParseItemSetBuilder { +pub(crate) struct ParseItemSetBuilder<'a> { first_sets: HashMap, last_sets: HashMap, - transitive_closure_additions: Vec>, - pub inlines: InlinedProductionMap, + transitive_closure_additions: Vec>>, } fn find_or_push(vector: &mut Vec, value: T) { @@ -29,13 +27,16 @@ fn find_or_push(vector: &mut Vec, value: T) { } } -impl ParseItemSetBuilder { - pub fn new(syntax_grammar: &SyntaxGrammar, lexical_grammar: &LexicalGrammar) -> Self { +impl<'a> ParseItemSetBuilder<'a> { + pub fn new( + syntax_grammar: &'a SyntaxGrammar, + lexical_grammar: &'a LexicalGrammar, + inlines: &'a InlinedProductionMap, + ) -> Self { let mut 
result = Self { first_sets: HashMap::new(), last_sets: HashMap::new(), transitive_closure_additions: vec![Vec::new(); syntax_grammar.variables.len()], - inlines: InlinedProductionMap::new(syntax_grammar), }; // For each grammar symbol, populate the FIRST and LAST sets: the set of @@ -193,22 +194,28 @@ impl ParseItemSetBuilder { for (variable_index, follow_set_info) in follow_set_info_by_non_terminal { let variable = &syntax_grammar.variables[variable_index]; let non_terminal = Symbol::non_terminal(variable_index); + let variable_index = variable_index as u32; if syntax_grammar.variables_to_inline.contains(&non_terminal) { continue; } - for production_index in 0..variable.productions.len() { - let item = ParseItem::Normal { - variable_index: variable_index as u32, - production_index: production_index as u32, + for (production_index, production) in variable.productions.iter().enumerate() { + let item = ParseItem { + variable_index, + production, step_index: 0, }; - if let Some(inlined_items) = result.inlines.inlined_items(item) { - for inlined_item in inlined_items { + // let step_id = item.as_step_id(syntax_grammar, inlines); + if let Some(inlined_productions) = inlines.inlined_productions(item.production, item.step_index) { + for production in inlined_productions { find_or_push( additions_for_non_terminal, TransitiveClosureAddition { - item: inlined_item, + item: ParseItem { + variable_index, + production, + step_index: item.step_index, + }, info: follow_set_info.clone(), }, ); @@ -231,14 +238,19 @@ impl ParseItemSetBuilder { pub(crate) fn transitive_closure( &mut self, - item_set: &ParseItemSet, - grammar: &SyntaxGrammar, - ) -> ParseItemSet { + item_set: &ParseItemSet<'a>, + grammar: &'a SyntaxGrammar, + inlines: &'a InlinedProductionMap, + ) -> ParseItemSet<'a> { let mut result = ParseItemSet::default(); for (item, lookaheads) in &item_set.entries { - if let Some(items) = self.inlines.inlined_items(*item) { - for item in items { - self.add_item(&mut result, item, lookaheads, grammar); + if let Some(productions) = inlines.inlined_productions(item.production, item.step_index) { + for production in productions { + self.add_item(&mut result, ParseItem { + variable_index: item.variable_index, + production, + step_index: item.step_index, + }, lookaheads, grammar); } } else { self.add_item(&mut result, *item, lookaheads, grammar); @@ -253,14 +265,14 @@ impl ParseItemSetBuilder { fn add_item( &self, - set: &mut ParseItemSet, - item: ParseItem, + set: &mut ParseItemSet<'a>, + item: ParseItem<'a>, lookaheads: &LookaheadSet, grammar: &SyntaxGrammar, ) { - if let Some(step) = item.step(grammar, &self.inlines) { + if let Some(step) = item.step() { if step.symbol.is_non_terminal() { - let next_step = item.successor().step(grammar, &self.inlines); + let next_step = item.successor().step(); // Determine which tokens can follow this non-terminal. 
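            // (In LR(1)-closure terms: for an item A -> alpha . B beta, the
            // tokens that can follow B are taken from the FIRST set of beta's
            // leading symbol when a next step exists; when B is the item's
            // last step, the item's own lookaheads propagate instead.)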
let following_tokens = if let Some(next_step) = next_step { diff --git a/src/build_tables/mod.rs b/src/build_tables/mod.rs index 091c5486..27951453 100644 --- a/src/build_tables/mod.rs +++ b/src/build_tables/mod.rs @@ -1,14 +1,14 @@ -mod inline_variables; mod item; mod item_set_builder; use self::item::{LookaheadSet, ParseItem, ParseItemSet}; use self::item_set_builder::ParseItemSetBuilder; use crate::error::{Error, Result}; -use crate::grammars::{LexicalGrammar, SyntaxGrammar, VariableType}; +use crate::grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar, VariableType}; use crate::rules::{AliasMap, Associativity, Symbol, SymbolType}; -use crate::tables::ParseTableEntry; -use crate::tables::{AliasSequenceId, LexTable, ParseAction, ParseState, ParseStateId, ParseTable}; +use crate::tables::{ + AliasSequenceId, LexTable, ParseAction, ParseState, ParseStateId, ParseTable, ParseTableEntry, +}; use core::ops::Range; use std::collections::hash_map::Entry; use std::collections::{HashMap, HashSet, VecDeque}; @@ -30,12 +30,13 @@ struct ParseStateQueueEntry { } struct ParseTableBuilder<'a> { - item_set_builder: ParseItemSetBuilder, + item_set_builder: ParseItemSetBuilder<'a>, syntax_grammar: &'a SyntaxGrammar, lexical_grammar: &'a LexicalGrammar, + inlines: &'a InlinedProductionMap, simple_aliases: &'a AliasMap, - state_ids_by_item_set: HashMap, - item_sets_by_state_id: Vec, + state_ids_by_item_set: HashMap, ParseStateId>, + item_sets_by_state_id: Vec>, parse_state_queue: VecDeque, parse_table: ParseTable, } @@ -46,16 +47,17 @@ impl<'a> ParseTableBuilder<'a> { self.parse_table.alias_sequences.push(Vec::new()); // Ensure that the error state has index 0. - let error_state_id = self.add_parse_state( - &Vec::new(), - &Vec::new(), - ParseItemSet::default(), - ); + let error_state_id = + self.add_parse_state(&Vec::new(), &Vec::new(), ParseItemSet::default()); self.add_parse_state( &Vec::new(), &Vec::new(), - ParseItemSet::with(&[(ParseItem::start(), LookaheadSet::with(&[Symbol::end()]))]), + ParseItemSet::with( + [(ParseItem::start(), LookaheadSet::with(&[Symbol::end()]))] + .iter() + .cloned(), + ), ); self.process_part_state_queue()?; @@ -68,7 +70,7 @@ impl<'a> ParseTableBuilder<'a> { &mut self, preceding_symbols: &SymbolSequence, preceding_auxiliary_symbols: &AuxiliarySymbolSequence, - item_set: ParseItemSet, + item_set: ParseItemSet<'a>, ) -> ParseStateId { match self.state_ids_by_item_set.entry(item_set) { Entry::Occupied(o) => { @@ -99,16 +101,14 @@ impl<'a> ParseTableBuilder<'a> { println!( "ITEM SET {}:\n{}", entry.state_id, - self.item_sets_by_state_id[entry.state_id].display_with( - &self.syntax_grammar, - &self.lexical_grammar, - &self.item_set_builder.inlines - ) + self.item_sets_by_state_id[entry.state_id] + .display_with(&self.syntax_grammar, &self.lexical_grammar,) ); let item_set = self.item_set_builder.transitive_closure( &self.item_sets_by_state_id[entry.state_id], self.syntax_grammar, + self.inlines, ); // println!("TRANSITIVE CLOSURE:"); @@ -131,7 +131,7 @@ impl<'a> ParseTableBuilder<'a> { &mut self, mut preceding_symbols: SymbolSequence, mut preceding_auxiliary_symbols: Vec, - item_set: ParseItemSet, + item_set: ParseItemSet<'a>, state_id: ParseStateId, ) -> Result<()> { let mut terminal_successors = HashMap::new(); @@ -139,9 +139,7 @@ impl<'a> ParseTableBuilder<'a> { let mut lookaheads_with_conflicts = HashSet::new(); for (item, lookaheads) in &item_set.entries { - if let Some(next_symbol) = - item.symbol(self.syntax_grammar, &self.item_set_builder.inlines) - { + if let 
Some(next_symbol) = item.symbol() { let successor = item.successor(); if next_symbol.is_non_terminal() { // Keep track of where auxiliary non-terminals (repeat symbols) are @@ -169,17 +167,15 @@ impl<'a> ParseTableBuilder<'a> { .insert_all(lookaheads); } } else { - let action = if item.is_final() { + let action = if item.is_augmented() { ParseAction::Accept } else { - let production = - item.production(&self.syntax_grammar, &self.item_set_builder.inlines); ParseAction::Reduce { - symbol: Symbol::non_terminal(item.variable_index() as usize), - child_count: item.step_index(), - precedence: production.last_precedence(), - associativity: production.last_associativity(), - dynamic_precedence: production.dynamic_precedence, + symbol: Symbol::non_terminal(item.variable_index as usize), + child_count: item.step_index as usize, + precedence: item.precedence(), + associativity: item.associativity(), + dynamic_precedence: item.production.dynamic_precedence, alias_sequence_id: self.get_alias_sequence_id(item), } }; @@ -280,17 +276,15 @@ impl<'a> ParseTableBuilder<'a> { let mut shift_precedence: Option> = None; let mut conflicting_items = HashSet::new(); for (item, lookaheads) in &item_set.entries { - let production = item.production(&self.syntax_grammar, &self.item_set_builder.inlines); - let step_index = item.step_index(); - if let Some(step) = production.steps.get(step_index) { - if step_index > 0 { + if let Some(step) = item.step() { + if item.step_index > 0 { if self .item_set_builder .first_set(&step.symbol) .contains(&conflicting_lookahead) { conflicting_items.insert(item); - let precedence = production.steps[step_index - 1].precedence; + let precedence = item.precedence(); if let Some(range) = &mut shift_precedence { if precedence < range.start { range.start = precedence; @@ -316,11 +310,11 @@ impl<'a> ParseTableBuilder<'a> { // by leaving it in the parse table, but marking the SHIFT action with // an `is_repetition` flag. let conflicting_variable_index = - conflicting_items.iter().next().unwrap().variable_index(); + conflicting_items.iter().next().unwrap().variable_index; if self.syntax_grammar.variables[conflicting_variable_index as usize].is_auxiliary() { if conflicting_items .iter() - .all(|item| item.variable_index() == conflicting_variable_index) + .all(|item| item.variable_index == conflicting_variable_index) { *is_repetition = true; return Ok(()); @@ -340,10 +334,7 @@ impl<'a> ParseTableBuilder<'a> { && shift_precedence.start < reduce_precedence) { entry.actions.pop(); - conflicting_items.retain(|item| { - item.step(&self.syntax_grammar, &self.item_set_builder.inlines) - .is_none() - }); + conflicting_items.retain(|item| item.is_done()); } // If the SHIFT and REDUCE actions have the same predence, consider // the REDUCE actions' associativity. @@ -367,10 +358,7 @@ impl<'a> ParseTableBuilder<'a> { match (has_left, has_non, has_right) { (true, false, false) => { entry.actions.pop(); - conflicting_items.retain(|item| { - item.step(&self.syntax_grammar, &self.item_set_builder.inlines) - .is_none() - }); + conflicting_items.retain(|item| item.is_done()); } (false, false, true) => { entry.actions.drain(0..entry.actions.len() - 1); @@ -392,7 +380,7 @@ impl<'a> ParseTableBuilder<'a> { // Determine the set of parent symbols involved in this conflict. 
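        // (For an ordinary variable that parent symbol is just the item's own
        // nonterminal; for an auxiliary "repeat" variable, the conflict is
        // attributed to the nonterminals that introduced it, which were
        // recorded earlier in preceding_auxiliary_symbols.)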
let mut actual_conflict = Vec::new(); for item in &conflicting_items { - let symbol = Symbol::non_terminal(item.variable_index() as usize); + let symbol = Symbol::non_terminal(item.variable_index as usize); if self.syntax_grammar.variables[symbol.index].is_auxiliary() { actual_conflict.extend( preceding_auxiliary_symbols @@ -441,7 +429,7 @@ impl<'a> ParseTableBuilder<'a> { for preceding_symbol in preceding_symbols .iter() - .take(preceding_symbols.len() - item.step_index()) + .take(preceding_symbols.len() - item.step_index as usize) { write!(&mut msg, " {}", self.symbol_name(preceding_symbol)).unwrap(); } @@ -449,17 +437,12 @@ impl<'a> ParseTableBuilder<'a> { write!( &mut msg, " ({}", - &self.syntax_grammar.variables[item.variable_index() as usize].name + &self.syntax_grammar.variables[item.variable_index as usize].name ) .unwrap(); - for (j, step) in item - .production(&self.syntax_grammar, &self.item_set_builder.inlines) - .steps - .iter() - .enumerate() - { - if j == item.step_index() { + for (j, step) in item.production.steps.iter().enumerate() { + if j as u32 == item.step_index { write!(&mut msg, " •").unwrap(); } write!(&mut msg, " {}", self.symbol_name(&step.symbol)).unwrap(); @@ -467,10 +450,7 @@ impl<'a> ParseTableBuilder<'a> { write!(&mut msg, ")").unwrap(); - if item - .step(&self.syntax_grammar, &self.item_set_builder.inlines) - .is_none() - { + if item.is_done() { write!( &mut msg, " • {}", @@ -479,9 +459,8 @@ impl<'a> ParseTableBuilder<'a> { .unwrap(); } - let precedence = item.precedence(&self.syntax_grammar, &self.item_set_builder.inlines); - let associativity = - item.associativity(&self.syntax_grammar, &self.item_set_builder.inlines); + let precedence = item.precedence(); + let associativity = item.associativity(); if precedence != 0 || associativity.is_some() { write!( &mut msg, @@ -506,8 +485,7 @@ impl<'a> ParseTableBuilder<'a> { .entries .keys() .filter_map(|item| { - if item.symbol(&self.syntax_grammar, &self.item_set_builder.inlines) == Some(symbol) - { + if item.symbol() == Some(symbol) { None } else { None @@ -554,8 +532,12 @@ impl<'a> ParseTableBuilder<'a> { } fn get_alias_sequence_id(&mut self, item: &ParseItem) -> AliasSequenceId { - let production = item.production(&self.syntax_grammar, &self.item_set_builder.inlines); - let alias_sequence = production.steps.iter().map(|s| s.alias.clone()).collect(); + let alias_sequence = item + .production + .steps + .iter() + .map(|s| s.alias.clone()) + .collect(); if let Some(index) = self .parse_table .alias_sequences @@ -592,12 +574,14 @@ pub(crate) fn build_tables( syntax_grammar: &SyntaxGrammar, lexical_grammar: &LexicalGrammar, simple_aliases: &AliasMap, + inlines: &InlinedProductionMap, ) -> Result<(ParseTable, LexTable, LexTable, Option)> { ParseTableBuilder { syntax_grammar, lexical_grammar, simple_aliases, - item_set_builder: ParseItemSetBuilder::new(syntax_grammar, lexical_grammar), + inlines, + item_set_builder: ParseItemSetBuilder::new(syntax_grammar, lexical_grammar, inlines), state_ids_by_item_set: HashMap::new(), item_sets_by_state_id: Vec::new(), parse_state_queue: VecDeque::new(), diff --git a/src/generate.rs b/src/generate.rs index dc3d5176..cdbbea4f 100644 --- a/src/generate.rs +++ b/src/generate.rs @@ -6,11 +6,12 @@ use crate::render::render_c_code; pub fn generate_parser_for_grammar(input: &str) -> Result { let input_grammar = parse_grammar(input)?; - let (syntax_grammar, lexical_grammar, simple_aliases) = prepare_grammar(&input_grammar)?; + let (syntax_grammar, lexical_grammar, inlines, simple_aliases) 
= prepare_grammar(&input_grammar)?;
     let (parse_table, main_lex_table, keyword_lex_table, keyword_capture_token) = build_tables(
         &syntax_grammar,
         &lexical_grammar,
-        &simple_aliases
+        &simple_aliases,
+        &inlines
     )?;
     let c_code = render_c_code(
         &input_grammar.name,
diff --git a/src/grammars.rs b/src/grammars.rs
index 7512ec03..b751e4e4 100644
--- a/src/grammars.rs
+++ b/src/grammars.rs
@@ -1,12 +1,13 @@
-use crate::rules::{Associativity, Alias, Rule, Symbol};
 use crate::nfa::Nfa;
+use crate::rules::{Alias, Associativity, Rule, Symbol};
+use std::collections::HashMap;
 
 #[derive(Clone, Copy, Debug, PartialEq, Eq)]
 pub(crate) enum VariableType {
     Hidden,
     Auxiliary,
     Anonymous,
-    Named
+    Named,
 }
 
 // Input grammar
@@ -46,12 +47,12 @@ pub(crate) struct LexicalGrammar {
 
 // Extracted syntax grammar
 
-#[derive(Clone, Debug, PartialEq, Eq)]
+#[derive(Clone, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)]
 pub(crate) struct ProductionStep {
-  pub symbol: Symbol,
-  pub precedence: i32,
-  pub associativity: Option<Associativity>,
-  pub alias: Option<Alias>,
+    pub symbol: Symbol,
+    pub precedence: i32,
+    pub associativity: Option<Associativity>,
+    pub alias: Option<Alias>,
 }
 
 #[derive(Clone, Debug, PartialEq, Eq)]
@@ -60,6 +61,11 @@ pub(crate) struct Production {
     pub dynamic_precedence: i32,
 }
 
+pub(crate) struct InlinedProductionMap {
+    pub productions: Vec<Production>,
+    pub production_map: HashMap<(*const Production, u32), Vec<usize>>,
+}
+
 #[derive(Clone, Debug, PartialEq, Eq)]
 pub(crate) struct SyntaxVariable {
     pub name: String,
@@ -86,7 +92,12 @@ pub(crate) struct SyntaxGrammar {
 
 impl ProductionStep {
     pub(crate) fn new(symbol: Symbol) -> Self {
-        Self { symbol, precedence: 0, associativity: None, alias: None }
+        Self {
+            symbol,
+            precedence: 0,
+            associativity: None,
+            alias: None,
+        }
     }
 
     pub(crate) fn with_prec(self, precedence: i32, associativity: Option<Associativity>) -> Self {
@@ -103,7 +114,10 @@ impl ProductionStep {
             symbol: self.symbol,
             precedence: self.precedence,
             associativity: self.associativity,
-            alias: Some(Alias { value: value.to_string(), is_named }),
+            alias: Some(Alias {
+                value: value.to_string(),
+                is_named,
+            }),
         }
     }
 }
@@ -124,25 +138,44 @@ impl Production {
 
 impl Default for Production {
     fn default() -> Self {
-        Production { dynamic_precedence: 0, steps: Vec::new() }
+        Production {
+            dynamic_precedence: 0,
+            steps: Vec::new(),
+        }
     }
 }
 
 impl Variable {
     pub fn named(name: &str, rule: Rule) -> Self {
-        Self { name: name.to_string(), kind: VariableType::Named, rule }
+        Self {
+            name: name.to_string(),
+            kind: VariableType::Named,
+            rule,
+        }
     }
 
     pub fn auxiliary(name: &str, rule: Rule) -> Self {
-        Self { name: name.to_string(), kind: VariableType::Auxiliary, rule }
+        Self {
+            name: name.to_string(),
+            kind: VariableType::Auxiliary,
+            rule,
+        }
    }
 
     pub fn hidden(name: &str, rule: Rule) -> Self {
-        Self { name: name.to_string(), kind: VariableType::Hidden, rule }
+        Self {
+            name: name.to_string(),
+            kind: VariableType::Hidden,
+            rule,
+        }
     }
 
     pub fn anonymous(name: &str, rule: Rule) -> Self {
-        Self { name: name.to_string(), kind: VariableType::Anonymous, rule }
+        Self {
+            name: name.to_string(),
+            kind: VariableType::Anonymous,
+            rule,
+        }
     }
 }
 
@@ -151,3 +184,20 @@ impl SyntaxVariable {
         self.kind == VariableType::Auxiliary
     }
 }
+
+impl InlinedProductionMap {
+    pub fn inlined_productions<'a>(
+        &'a self,
+        production: &Production,
+        step_index: u32,
+    ) -> Option<impl Iterator<Item = &'a Production> + 'a> {
+        self.production_map
+            .get(&(production as *const Production, step_index))
+            .map(|production_indices| {
+                production_indices
+                    .iter()
+                    .cloned()
+                    .map(move |index| &self.productions[index])
+            })
+    }
+}
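A note on the map added above: production_map is keyed by the address of a Production plus a step index, so an inlined production can be looked up by identity without hashing or comparing its steps. Here is a minimal, self-contained sketch of that pointer-keyed pattern; the Production struct and all values below are illustrative stand-ins, not this crate's types:

use std::collections::HashMap;

struct Production {
    steps: Vec<&'static str>,
}

fn main() {
    // The productions live in one Vec that is built once, so their
    // addresses stay stable for as long as the map is used.
    let productions = vec![
        Production { steps: vec!["a", "b"] },
        Production { steps: vec!["c"] },
    ];

    // Key by (address, step index) rather than by the production's contents.
    let mut map: HashMap<(*const Production, u32), Vec<usize>> = HashMap::new();
    map.insert((&productions[0] as *const Production, 1), vec![1]);

    // Lookup succeeds only for the same object; a structurally equal
    // production stored elsewhere would have a different key.
    let indices = &map[&(&productions[0] as *const Production, 1)];
    println!("inlined replacement steps: {:?}", productions[indices[0]].steps);
}

The trade-off is that such keys are only meaningful while the productions stay at fixed addresses, which is presumably why the map is built once, after the syntax grammar is fully constructed, and not mutated afterwards.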
diff --git a/src/prepare_grammar/mod.rs b/src/prepare_grammar/mod.rs index 22435fca..f325383b 100644 --- a/src/prepare_grammar/mod.rs +++ b/src/prepare_grammar/mod.rs @@ -4,6 +4,7 @@ mod extract_simple_aliases; mod extract_tokens; mod flatten_grammar; mod intern_symbols; +mod process_inlines; use self::expand_repeats::expand_repeats; use self::expand_tokens::expand_tokens; @@ -11,8 +12,11 @@ use self::extract_simple_aliases::extract_simple_aliases; use self::extract_tokens::extract_tokens; use self::flatten_grammar::flatten_grammar; use self::intern_symbols::intern_symbols; +use self::process_inlines::process_inlines; use crate::error::Result; -use crate::grammars::{ExternalToken, InputGrammar, LexicalGrammar, SyntaxGrammar, Variable}; +use crate::grammars::{ + ExternalToken, InlinedProductionMap, InputGrammar, LexicalGrammar, SyntaxGrammar, Variable, +}; use crate::rules::{AliasMap, Rule, Symbol}; pub(self) struct IntermediateGrammar { @@ -36,12 +40,18 @@ pub(self) struct ExtractedLexicalGrammar { pub(crate) fn prepare_grammar( input_grammar: &InputGrammar, -) -> Result<(SyntaxGrammar, LexicalGrammar, AliasMap)> { +) -> Result<( + SyntaxGrammar, + LexicalGrammar, + InlinedProductionMap, + AliasMap, +)> { let interned_grammar = intern_symbols(input_grammar)?; let (syntax_grammar, lexical_grammar) = extract_tokens(interned_grammar)?; let syntax_grammar = expand_repeats(syntax_grammar); let mut syntax_grammar = flatten_grammar(syntax_grammar)?; let lexical_grammar = expand_tokens(lexical_grammar)?; let simple_aliases = extract_simple_aliases(&mut syntax_grammar, &lexical_grammar); - Ok((syntax_grammar, lexical_grammar, simple_aliases)) + let inlines = process_inlines(&syntax_grammar); + Ok((syntax_grammar, lexical_grammar, inlines, simple_aliases)) } diff --git a/src/prepare_grammar/process_inlines.rs b/src/prepare_grammar/process_inlines.rs new file mode 100644 index 00000000..0d7f6827 --- /dev/null +++ b/src/prepare_grammar/process_inlines.rs @@ -0,0 +1,477 @@ +use crate::grammars::{InlinedProductionMap, Production, ProductionStep, SyntaxGrammar}; +use std::collections::HashMap; + +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] +struct ProductionStepId { + variable_index: Option, + production_index: usize, + step_index: usize, +} + +struct InlinedProductionMapBuilder { + production_indices_by_step_id: HashMap>, + productions: Vec, +} + +impl ProductionStepId { + pub fn successor(&self) -> Self { + Self { + variable_index: self.variable_index, + production_index: self.production_index, + step_index: self.step_index + 1, + } + } +} + +fn production_for_id<'a>( + map: &'a InlinedProductionMapBuilder, + id: ProductionStepId, + grammar: &'a SyntaxGrammar, +) -> &'a Production { + if let Some(variable_index) = id.variable_index { + &grammar.variables[variable_index].productions[id.production_index] + } else { + &map.productions[id.production_index] + } +} + +fn production_step_for_id<'a>( + map: &'a InlinedProductionMapBuilder, + id: ProductionStepId, + grammar: &'a SyntaxGrammar, +) -> Option<&'a ProductionStep> { + production_for_id(map, id, grammar).steps.get(id.step_index) +} + +fn inline<'a>( + map: &'a mut InlinedProductionMapBuilder, + step_id: ProductionStepId, + grammar: &'a SyntaxGrammar, +) -> &'a Vec { + let step = production_step_for_id(map, step_id, grammar).unwrap(); + let mut productions_to_add = grammar.variables[step.symbol.index].productions.clone(); + + let mut i = 0; + while i < productions_to_add.len() { + if let Some(first_symbol) = productions_to_add[i].first_symbol() 
{ + if grammar.variables_to_inline.contains(&first_symbol) { + // Remove the production from the vector, replacing it with a placeholder. + let production = productions_to_add + .splice(i..i + 1, [Production::default()].iter().cloned()) + .next() + .unwrap(); + + // Replace the placeholder with the inlined productions. + productions_to_add.splice( + i..i + 1, + grammar.variables[first_symbol.index] + .productions + .iter() + .map(|p| { + let mut p = p.clone(); + p.steps.extend(production.steps[1..].iter().cloned()); + p + }), + ); + continue; + } + } + i += 1; + } + + let result = productions_to_add + .into_iter() + .map(|production_to_add| { + let mut inlined_production = production_for_id(&map, step_id, grammar).clone(); + let removed_step = inlined_production + .steps + .splice( + step_id.step_index..step_id.step_index + 1, + production_to_add.steps.iter().cloned(), + ) + .next() + .unwrap(); + let inserted_steps = &mut inlined_production.steps + [step_id.step_index..step_id.step_index + production_to_add.steps.len()]; + if let Some(alias) = removed_step.alias { + for inserted_step in inserted_steps.iter_mut() { + inserted_step.alias = Some(alias.clone()); + } + } + if let Some(last_inserted_step) = inserted_steps.last_mut() { + last_inserted_step.precedence = removed_step.precedence; + last_inserted_step.associativity = removed_step.associativity; + } + map.productions + .iter() + .position(|p| *p == inlined_production) + .unwrap_or({ + map.productions.push(inlined_production); + map.productions.len() - 1 + }) + }) + .collect(); + + map.production_indices_by_step_id + .entry(step_id) + .or_insert(result) +} + +pub(super) fn process_inlines(grammar: &SyntaxGrammar) -> InlinedProductionMap { + let mut result = InlinedProductionMapBuilder { + productions: Vec::new(), + production_indices_by_step_id: HashMap::new(), + }; + + let mut step_ids_to_process = Vec::new(); + for (variable_index, variable) in grammar.variables.iter().enumerate() { + for production_index in 0..variable.productions.len() { + step_ids_to_process.push(ProductionStepId { + variable_index: Some(variable_index), + production_index, + step_index: 0, + }); + while !step_ids_to_process.is_empty() { + let mut i = 0; + while i < step_ids_to_process.len() { + let step_id = step_ids_to_process[i]; + if let Some(step) = production_step_for_id(&result, step_id, grammar) { + if grammar.variables_to_inline.contains(&step.symbol) { + let inlined_step_ids = inline(&mut result, step_id, grammar) + .into_iter() + .cloned() + .map(|production_index| ProductionStepId { + variable_index: None, + production_index, + step_index: step_id.step_index, + }) + .collect::>(); + step_ids_to_process.splice(i..i + 1, inlined_step_ids); + } else { + step_ids_to_process[i] = step_id.successor(); + i += 1; + } + } else { + step_ids_to_process.remove(i); + } + } + } + } + } + + // result + let productions = result.productions; + let production_indices_by_step_id = result.production_indices_by_step_id; + + let production_map = production_indices_by_step_id + .into_iter() + .map(|(step_id, production_indices)| { + let production = if let Some(variable_index) = step_id.variable_index { + &grammar.variables[variable_index].productions[step_id.production_index] + } else { + &productions[step_id.production_index] + } as *const Production; + ((production, step_id.step_index as u32), production_indices) + }) + .collect(); + + InlinedProductionMap { productions, production_map } +} + +#[cfg(test)] +mod tests { + use super::*; + use 
crate::grammars::{ProductionStep, SyntaxVariable, VariableType}; + use crate::rules::{Associativity, Symbol}; + + #[test] + fn test_basic_inlining() { + let grammar = SyntaxGrammar { + expected_conflicts: Vec::new(), + extra_tokens: Vec::new(), + external_tokens: Vec::new(), + word_token: None, + variables_to_inline: vec![Symbol::non_terminal(1)], + variables: vec![ + SyntaxVariable { + name: "non-terminal-0".to_string(), + kind: VariableType::Named, + productions: vec![Production { + dynamic_precedence: 0, + steps: vec![ + ProductionStep::new(Symbol::terminal(10)), + ProductionStep::new(Symbol::non_terminal(1)), // inlined + ProductionStep::new(Symbol::terminal(11)), + ], + }], + }, + SyntaxVariable { + name: "non-terminal-1".to_string(), + kind: VariableType::Named, + productions: vec![ + Production { + dynamic_precedence: 0, + steps: vec![ + ProductionStep::new(Symbol::terminal(12)), + ProductionStep::new(Symbol::terminal(13)), + ], + }, + Production { + dynamic_precedence: 0, + steps: vec![ProductionStep::new(Symbol::terminal(14))], + }, + ], + }, + ], + }; + let inline_map = process_inlines(&grammar); + + // Nothing to inline at step 0. + assert!(inline_map + .inlined_productions(&grammar.variables[0].productions[0], 0) + .is_none()); + + // Inlining variable 1 yields two productions. + assert_eq!( + inline_map + .inlined_productions(&grammar.variables[0].productions[0], 1) + .unwrap() + .cloned() + .collect::>(), + vec![ + Production { + dynamic_precedence: 0, + steps: vec![ + ProductionStep::new(Symbol::terminal(10)), + ProductionStep::new(Symbol::terminal(12)), + ProductionStep::new(Symbol::terminal(13)), + ProductionStep::new(Symbol::terminal(11)), + ], + }, + Production { + dynamic_precedence: 0, + steps: vec![ + ProductionStep::new(Symbol::terminal(10)), + ProductionStep::new(Symbol::terminal(14)), + ProductionStep::new(Symbol::terminal(11)), + ], + }, + ] + ); + } + + #[test] + fn test_nested_inlining() { + let grammar = SyntaxGrammar { + variables: vec![ + SyntaxVariable { + name: "non-terminal-0".to_string(), + kind: VariableType::Named, + productions: vec![Production { + dynamic_precedence: 0, + steps: vec![ + ProductionStep::new(Symbol::terminal(10)), + ProductionStep::new(Symbol::non_terminal(1)), // inlined + ProductionStep::new(Symbol::terminal(11)), + ProductionStep::new(Symbol::non_terminal(2)), // inlined + ProductionStep::new(Symbol::terminal(12)), + ], + }], + }, + SyntaxVariable { + name: "non-terminal-1".to_string(), + kind: VariableType::Named, + productions: vec![ + Production { + dynamic_precedence: 0, + steps: vec![ProductionStep::new(Symbol::terminal(13))], + }, + Production { + dynamic_precedence: 0, + steps: vec![ + ProductionStep::new(Symbol::non_terminal(3)), // inlined + ProductionStep::new(Symbol::terminal(14)), + ], + }, + ], + }, + SyntaxVariable { + name: "non-terminal-2".to_string(), + kind: VariableType::Named, + productions: vec![Production { + dynamic_precedence: 0, + steps: vec![ProductionStep::new(Symbol::terminal(15))], + }], + }, + SyntaxVariable { + name: "non-terminal-3".to_string(), + kind: VariableType::Named, + productions: vec![Production { + dynamic_precedence: 0, + steps: vec![ProductionStep::new(Symbol::terminal(16))], + }], + }, + ], + variables_to_inline: vec![ + Symbol::non_terminal(1), + Symbol::non_terminal(2), + Symbol::non_terminal(3), + ], + expected_conflicts: Vec::new(), + extra_tokens: Vec::new(), + external_tokens: Vec::new(), + word_token: None, + }; + let inline_map = process_inlines(&grammar); + + let productions: 
Vec<&Production> = inline_map + .inlined_productions(&grammar.variables[0].productions[0], 1) + .unwrap() + .collect(); + + assert_eq!( + productions.iter().cloned().cloned().collect::>(), + vec![ + Production { + dynamic_precedence: 0, + steps: vec![ + ProductionStep::new(Symbol::terminal(10)), + ProductionStep::new(Symbol::terminal(13)), + ProductionStep::new(Symbol::terminal(11)), + ProductionStep::new(Symbol::non_terminal(2)), + ProductionStep::new(Symbol::terminal(12)), + ], + }, + Production { + dynamic_precedence: 0, + steps: vec![ + ProductionStep::new(Symbol::terminal(10)), + ProductionStep::new(Symbol::terminal(16)), + ProductionStep::new(Symbol::terminal(14)), + ProductionStep::new(Symbol::terminal(11)), + ProductionStep::new(Symbol::non_terminal(2)), + ProductionStep::new(Symbol::terminal(12)), + ], + }, + ] + ); + + assert_eq!( + inline_map + .inlined_productions(productions[0], 3) + .unwrap() + .cloned() + .collect::>(), + vec![Production { + dynamic_precedence: 0, + steps: vec![ + ProductionStep::new(Symbol::terminal(10)), + ProductionStep::new(Symbol::terminal(13)), + ProductionStep::new(Symbol::terminal(11)), + ProductionStep::new(Symbol::terminal(15)), + ProductionStep::new(Symbol::terminal(12)), + ], + },] + ); + } + + #[test] + fn test_inlining_with_precedence_and_alias() { + let grammar = SyntaxGrammar { + variables_to_inline: vec![Symbol::non_terminal(1), Symbol::non_terminal(2)], + variables: vec![ + SyntaxVariable { + name: "non-terminal-0".to_string(), + kind: VariableType::Named, + productions: vec![Production { + dynamic_precedence: 0, + steps: vec![ + // inlined + ProductionStep::new(Symbol::non_terminal(1)) + .with_prec(1, Some(Associativity::Left)), + ProductionStep::new(Symbol::terminal(10)), + // inlined + ProductionStep::new(Symbol::non_terminal(2)) + .with_alias("outer_alias", true), + ], + }], + }, + SyntaxVariable { + name: "non-terminal-1".to_string(), + kind: VariableType::Named, + productions: vec![Production { + dynamic_precedence: 0, + steps: vec![ + ProductionStep::new(Symbol::terminal(11)) + .with_prec(2, None) + .with_alias("inner_alias", true), + ProductionStep::new(Symbol::terminal(12)).with_prec(3, None), + ], + }], + }, + SyntaxVariable { + name: "non-terminal-2".to_string(), + kind: VariableType::Named, + productions: vec![Production { + dynamic_precedence: 0, + steps: vec![ProductionStep::new(Symbol::terminal(13))], + }], + }, + ], + expected_conflicts: Vec::new(), + extra_tokens: Vec::new(), + external_tokens: Vec::new(), + word_token: None, + }; + + let inline_map = process_inlines(&grammar); + + let productions: Vec<_> = inline_map + .inlined_productions(&grammar.variables[0].productions[0], 0) + .unwrap() + .collect(); + + assert_eq!( + productions.iter().cloned().cloned().collect::>(), + vec![Production { + dynamic_precedence: 0, + steps: vec![ + // The first step in the inlined production retains its precedence + // and alias. + ProductionStep::new(Symbol::terminal(11)) + .with_prec(2, None) + .with_alias("inner_alias", true), + // The final step of the inlined production inherits the precedence of + // the inlined step. 
+ ProductionStep::new(Symbol::terminal(12)) + .with_prec(1, Some(Associativity::Left)), + ProductionStep::new(Symbol::terminal(10)), + ProductionStep::new(Symbol::non_terminal(2)) + .with_alias("outer_alias", true), + ] + }], + ); + + assert_eq!( + inline_map + .inlined_productions(productions[0], 3) + .unwrap() + .cloned() + .collect::>(), + vec![Production { + dynamic_precedence: 0, + steps: vec![ + ProductionStep::new(Symbol::terminal(11)) + .with_prec(2, None) + .with_alias("inner_alias", true), + ProductionStep::new(Symbol::terminal(12)) + .with_prec(1, Some(Associativity::Left)), + ProductionStep::new(Symbol::terminal(10)), + // All steps of the inlined production inherit their alias from the + // inlined step. + ProductionStep::new(Symbol::terminal(13)).with_alias("outer_alias", true), + ] + }], + ); + } +} diff --git a/src/rules.rs b/src/rules.rs index 34f4c8b9..3bfd5181 100644 --- a/src/rules.rs +++ b/src/rules.rs @@ -8,13 +8,13 @@ pub(crate) enum SymbolType { End, } -#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)] pub(crate) enum Associativity { Left, Right, } -#[derive(Clone, Debug, PartialEq, Eq, Hash)] +#[derive(Clone, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)] pub(crate) struct Alias { pub value: String, pub is_named: bool, From 99ecf29e4b4bb394b17f9818ce31f5da781f7575 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Sun, 23 Dec 2018 10:15:23 -0800 Subject: [PATCH 075/102] Fix typo causing infinite recursion in expand_regex --- src/prepare_grammar/expand_tokens.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/prepare_grammar/expand_tokens.rs b/src/prepare_grammar/expand_tokens.rs index 37f75e5a..5ee9861f 100644 --- a/src/prepare_grammar/expand_tokens.rs +++ b/src/prepare_grammar/expand_tokens.rs @@ -217,7 +217,7 @@ fn expand_regex(ast: &Ast, nfa: &mut Nfa, mut next_state_id: u32, is_sep: bool) } RepetitionKind::Range(RepetitionRange::AtLeast(min)) => { if expand_zero_or_more(&repetition.ast, nfa, next_state_id, is_sep)? 
{ - expand_count(ast, min, nfa, next_state_id, is_sep) + expand_count(&repetition.ast, min, nfa, next_state_id, is_sep) } else { Ok(false) } From 5258ee2e6ad3f202e43f98a093c82da1143a27fa Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Sun, 23 Dec 2018 10:16:03 -0800 Subject: [PATCH 076/102] Implement more C code generation --- src/build_tables/item.rs | 60 +- src/build_tables/item_set_builder.rs | 27 +- src/build_tables/lex_table_builder.rs | 24 + src/build_tables/mod.rs | 61 ++- src/render/mod.rs | 761 ++++++++++++++++++++++++-- src/tables.rs | 12 +- 6 files changed, 840 insertions(+), 105 deletions(-) create mode 100644 src/build_tables/lex_table_builder.rs diff --git a/src/build_tables/item.rs b/src/build_tables/item.rs index 49ab4f27..28723d24 100644 --- a/src/build_tables/item.rs +++ b/src/build_tables/item.rs @@ -2,7 +2,7 @@ use crate::grammars::{LexicalGrammar, Production, ProductionStep, SyntaxGrammar} use crate::rules::Associativity; use crate::rules::{Symbol, SymbolType}; use smallbitvec::SmallBitVec; -use std::collections::{HashMap, BTreeMap}; +use std::collections::BTreeMap; use std::fmt; use std::hash::{Hash, Hasher}; use std::u32; @@ -178,7 +178,11 @@ impl<'a> ParseItem<'a> { } pub fn prev_step(&self) -> Option<&'a ProductionStep> { - self.production.steps.get(self.step_index as usize - 1) + if self.step_index > 0 { + Some(&self.production.steps[self.step_index as usize - 1]) + } else { + None + } } pub fn is_done(&self) -> bool { @@ -355,43 +359,49 @@ impl<'a> PartialEq for ParseItem<'a> { } } -impl<'a> PartialOrd for ParseItem<'a> { - fn partial_cmp(&self, other: &Self) -> Option { - if let Some(o) = self.variable_index.partial_cmp(&other.variable_index) { - return Some(o); +impl<'a> Ord for ParseItem<'a> { + fn cmp(&self, other: &Self) -> Ordering { + let o = self.variable_index.cmp(&other.variable_index); + if o != Ordering::Equal { + return o; } - if let Some(o) = self.step_index.partial_cmp(&other.step_index) { - return Some(o); + let o = self.step_index.cmp(&other.step_index); + if o != Ordering::Equal { + return o; } - if let Some(o) = self.production.dynamic_precedence.partial_cmp(&other.production.dynamic_precedence) { - return Some(o); + let o = self.production.dynamic_precedence.cmp(&other.production.dynamic_precedence); + if o != Ordering::Equal { + return o; } - if let Some(o) = self.production.steps.len().partial_cmp(&other.production.steps.len()) { - return Some(o); + let o = self.production.steps.len().cmp(&other.production.steps.len()); + if o != Ordering::Equal { + return o; } - if let Some(o) = self.precedence().partial_cmp(&other.precedence()) { - return Some(o); + let o = self.precedence().cmp(&other.precedence()); + if o != Ordering::Equal { + return o; } - if let Some(o) = self.associativity().partial_cmp(&other.associativity()) { - return Some(o); + let o = self.associativity().cmp(&other.associativity()); + if o != Ordering::Equal { + return o; } for (i, step) in self.production.steps.iter().enumerate() { - let cmp = if i < self.step_index as usize { - step.alias.partial_cmp(&other.production.steps[i].alias) + let o = if i < self.step_index as usize { + step.alias.cmp(&other.production.steps[i].alias) } else { - step.partial_cmp(&other.production.steps[i]) + step.cmp(&other.production.steps[i]) }; - if let Some(o) = cmp { - return Some(o); + if o != Ordering::Equal { + return o; } } - return None; + return Ordering::Equal; } } -impl<'a> Ord for ParseItem<'a> { - fn cmp(&self, other: &Self) -> Ordering { - 
self.partial_cmp(other).unwrap_or(Ordering::Equal) +impl<'a> PartialOrd for ParseItem<'a> { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) } } diff --git a/src/build_tables/item_set_builder.rs b/src/build_tables/item_set_builder.rs index 52ee0a45..d7883988 100644 --- a/src/build_tables/item_set_builder.rs +++ b/src/build_tables/item_set_builder.rs @@ -198,15 +198,16 @@ impl<'a> ParseItemSetBuilder<'a> { if syntax_grammar.variables_to_inline.contains(&non_terminal) { continue; } - for (production_index, production) in variable.productions.iter().enumerate() { + for production in &variable.productions { let item = ParseItem { variable_index, production, step_index: 0, }; - // let step_id = item.as_step_id(syntax_grammar, inlines); - if let Some(inlined_productions) = inlines.inlined_productions(item.production, item.step_index) { + if let Some(inlined_productions) = + inlines.inlined_productions(item.production, item.step_index) + { for production in inlined_productions { find_or_push( additions_for_non_terminal, @@ -244,16 +245,21 @@ impl<'a> ParseItemSetBuilder<'a> { ) -> ParseItemSet<'a> { let mut result = ParseItemSet::default(); for (item, lookaheads) in &item_set.entries { - if let Some(productions) = inlines.inlined_productions(item.production, item.step_index) { + if let Some(productions) = inlines.inlined_productions(item.production, item.step_index) + { for production in productions { - self.add_item(&mut result, ParseItem { - variable_index: item.variable_index, - production, - step_index: item.step_index, - }, lookaheads, grammar); + self.add_item( + &mut result, + ParseItem { + variable_index: item.variable_index, + production, + step_index: item.step_index, + }, + lookaheads, + ); } } else { - self.add_item(&mut result, *item, lookaheads, grammar); + self.add_item(&mut result, *item, lookaheads); } } result @@ -268,7 +274,6 @@ impl<'a> ParseItemSetBuilder<'a> { set: &mut ParseItemSet<'a>, item: ParseItem<'a>, lookaheads: &LookaheadSet, - grammar: &SyntaxGrammar, ) { if let Some(step) = item.step() { if step.symbol.is_non_terminal() { diff --git a/src/build_tables/lex_table_builder.rs b/src/build_tables/lex_table_builder.rs new file mode 100644 index 00000000..86d1578b --- /dev/null +++ b/src/build_tables/lex_table_builder.rs @@ -0,0 +1,24 @@ +use crate::rules::Symbol; +use crate::tables::LexTable; +use crate::grammars::{SyntaxGrammar, LexicalGrammar}; + +pub(crate) struct LexTableBuilder<'a> { + syntax_grammar: &'a SyntaxGrammar, + lexical_grammar: &'a LexicalGrammar, + table: LexTable, +} + +impl<'a> LexTableBuilder<'a> { + pub fn new( + syntax_grammar: &'a SyntaxGrammar, + lexical_grammar: &'a LexicalGrammar, + ) -> Self { + Self { + syntax_grammar, lexical_grammar, table: LexTable::default() + } + } + + pub fn build(self) -> (LexTable, LexTable, Option) { + (LexTable::default(), LexTable::default(), None) + } +} diff --git a/src/build_tables/mod.rs b/src/build_tables/mod.rs index 27951453..fc17ce7f 100644 --- a/src/build_tables/mod.rs +++ b/src/build_tables/mod.rs @@ -1,10 +1,13 @@ mod item; mod item_set_builder; +mod lex_table_builder; use self::item::{LookaheadSet, ParseItem, ParseItemSet}; use self::item_set_builder::ParseItemSetBuilder; +use self::lex_table_builder::LexTableBuilder; use crate::error::{Error, Result}; use crate::grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar, VariableType}; +use crate::rules::Alias; use crate::rules::{AliasMap, Associativity, Symbol, SymbolType}; use crate::tables::{ AliasSequenceId, LexTable, 
ParseAction, ParseState, ParseStateId, ParseTable, ParseTableEntry, @@ -43,7 +46,7 @@ struct ParseTableBuilder<'a> { impl<'a> ParseTableBuilder<'a> { fn build(mut self) -> Result<(ParseTable, LexTable, LexTable, Option)> { - // Ensure that the empty rename sequence has index 0. + // Ensure that the empty alias sequence has index 0. self.parse_table.alias_sequences.push(Vec::new()); // Ensure that the error state has index 0. @@ -61,9 +64,18 @@ impl<'a> ParseTableBuilder<'a> { ); self.process_part_state_queue()?; + + let lex_table_builder = LexTableBuilder::new(self.syntax_grammar, self.lexical_grammar); + self.populate_used_symbols(); - Err(Error::grammar("oh no")) + let (main_lex_table, keyword_lex_table, keyword_capture_token) = lex_table_builder.build(); + Ok(( + self.parse_table, + main_lex_table, + keyword_lex_table, + keyword_capture_token, + )) } fn add_parse_state( @@ -82,6 +94,7 @@ impl<'a> ParseTableBuilder<'a> { let state_id = self.parse_table.states.len(); self.item_sets_by_state_id.push(v.key().clone()); self.parse_table.states.push(ParseState { + lex_state_id: 0, terminal_entries: HashMap::new(), nonterminal_entries: HashMap::new(), }); @@ -98,12 +111,16 @@ impl<'a> ParseTableBuilder<'a> { fn process_part_state_queue(&mut self) -> Result<()> { while let Some(entry) = self.parse_state_queue.pop_front() { - println!( - "ITEM SET {}:\n{}", - entry.state_id, - self.item_sets_by_state_id[entry.state_id] - .display_with(&self.syntax_grammar, &self.lexical_grammar,) - ); + let debug = false; + + if debug { + println!( + "ITEM SET {}:\n{}", + entry.state_id, + self.item_sets_by_state_id[entry.state_id] + .display_with(&self.syntax_grammar, &self.lexical_grammar,) + ); + } let item_set = self.item_set_builder.transitive_closure( &self.item_sets_by_state_id[entry.state_id], @@ -111,11 +128,12 @@ impl<'a> ParseTableBuilder<'a> { self.inlines, ); - // println!("TRANSITIVE CLOSURE:"); - // for item in item_set.entries.keys() { - // println!("{}", item.display_with(&self.syntax_grammar, &self.lexical_grammar, &self.item_set_builder.inlines)); - // } - // println!(""); + if debug { + println!( + "TRANSITIVE CLOSURE:\n{}", + item_set.display_with(&self.syntax_grammar, &self.lexical_grammar) + ); + } self.add_actions( entry.preceding_symbols, @@ -249,6 +267,17 @@ impl<'a> ParseTableBuilder<'a> { )?; } + let state = &mut self.parse_table.states[state_id]; + for extra_token in &self.syntax_grammar.extra_tokens { + state + .terminal_entries + .entry(*extra_token) + .or_insert(ParseTableEntry { + reusable: true, + actions: vec![ParseAction::ShiftExtra], + }); + } + Ok(()) } @@ -514,6 +543,7 @@ impl<'a> ParseTableBuilder<'a> { non_terminal_usages[symbol.index] = true; } } + self.parse_table.symbols.push(Symbol::end()); for (i, value) in terminal_usages.into_iter().enumerate() { if value { self.parse_table.symbols.push(Symbol::terminal(i)); @@ -532,12 +562,15 @@ impl<'a> ParseTableBuilder<'a> { } fn get_alias_sequence_id(&mut self, item: &ParseItem) -> AliasSequenceId { - let alias_sequence = item + let mut alias_sequence: Vec> = item .production .steps .iter() .map(|s| s.alias.clone()) .collect(); + while alias_sequence.last() == Some(&None) { + alias_sequence.pop(); + } if let Some(index) = self .parse_table .alias_sequences diff --git a/src/render/mod.rs b/src/render/mod.rs index 2ca610a6..fc4cdafb 100644 --- a/src/render/mod.rs +++ b/src/render/mod.rs @@ -1,8 +1,16 @@ -use crate::grammars::{LexicalGrammar, SyntaxGrammar, VariableType}; +use crate::grammars::{ExternalToken, LexicalGrammar, 
SyntaxGrammar, VariableType}; +use crate::nfa::CharacterSet; use crate::rules::{Alias, AliasMap, Symbol, SymbolType}; -use crate::tables::{LexTable, ParseTable, ParseTableEntry}; +use crate::tables::{LexState, LexTable, ParseAction, ParseTable, ParseTableEntry}; use std::collections::{HashMap, HashSet}; use std::fmt::Write; +use std::mem::swap; + +macro_rules! add { + ($this: tt, $($arg: tt)*) => {{ + $this.buffer.write_fmt(format_args!($($arg)*)).unwrap(); + }} +} macro_rules! add_line { ($this: tt, $($arg: tt)*) => { @@ -14,10 +22,21 @@ macro_rules! add_line { } } +macro_rules! indent { + ($this: tt) => { + $this.indent_level += 1; + }; +} + +macro_rules! dedent { + ($this: tt) => { + $this.indent_level -= 1; + }; +} + struct Generator { buffer: String, indent_level: usize, - language_name: String, parse_table: ParseTable, main_lex_table: LexTable, @@ -27,9 +46,9 @@ struct Generator { lexical_grammar: LexicalGrammar, simple_aliases: AliasMap, symbol_ids: HashMap, - parse_table_entries: Vec<(usize, ParseTableEntry)>, - next_parse_action_list_index: usize, - unique_aliases: HashSet, + alias_ids: HashMap, + external_scanner_states: Vec>, + alias_map: HashMap>, } impl Generator { @@ -39,6 +58,30 @@ impl Generator { self.add_stats(); self.add_symbol_enum(); self.add_symbol_names_list(); + self.add_symbol_metadata_list(); + self.add_alias_sequences(); + + let mut main_lex_table = LexTable::default(); + swap(&mut main_lex_table, &mut self.main_lex_table); + self.add_lex_function("ts_lex", main_lex_table); + + if self.keyword_capture_token.is_some() { + let mut keyword_lex_table = LexTable::default(); + swap(&mut keyword_lex_table, &mut self.keyword_lex_table); + self.add_lex_function("ts_lex_keywords", keyword_lex_table); + } + + self.add_lex_modes_list(); + + if !self.syntax_grammar.external_tokens.is_empty() { + self.add_external_token_enum(); + self.add_external_scanner_symbol_map(); + self.add_external_scanner_states_list(); + } + + self.add_parse_table(); + self.add_parser_export(); + self.buffer } @@ -50,7 +93,10 @@ impl Generator { fn add_pragmas(&mut self) { add_line!(self, "#if defined(__GNUC__) || defined(__clang__)"); add_line!(self, "#pragma GCC diagnostic push"); - add_line!(self, "#pragma GCC diagnostic ignored \"-Wmissing-field-initializers\""); + add_line!( + self, + "#pragma GCC diagnostic ignored \"-Wmissing-field-initializers\"" + ); add_line!(self, "#endif"); add_line!(self, ""); @@ -67,81 +113,639 @@ impl Generator { } fn add_stats(&mut self) { - let mut token_count = 0; - - for symbol in &self.parse_table.symbols { - if symbol.is_terminal() { - token_count += 1; - } else if symbol.is_external() { - let external_token = &self.syntax_grammar.external_tokens[symbol.index]; - if external_token.corresponding_internal_token.is_none() { - token_count += 1; + let token_count = self + .parse_table + .symbols + .iter() + .filter(|symbol| { + if symbol.is_terminal() { + true + } else if symbol.is_external() { + self.syntax_grammar.external_tokens[symbol.index] + .corresponding_internal_token + .is_none() + } else { + false } - } + }) + .count(); + + let mut symbol_identifiers = HashSet::new(); + for i in 0..self.parse_table.symbols.len() { + self.assign_symbol_id(self.parse_table.symbols[i], &mut symbol_identifiers); } for alias_sequence in &self.parse_table.alias_sequences { for entry in alias_sequence { if let Some(alias) = entry { - self.unique_aliases.insert(alias.clone()); + let alias_kind = if alias.is_named { + VariableType::Named + } else { + VariableType::Anonymous + }; + 
let matching_symbol = self.parse_table.symbols.iter().cloned().find(|symbol| { + let (name, kind) = self.metadata_for_symbol(*symbol); + name == alias.value && kind == alias_kind + }); + let alias_id = if let Some(symbol) = matching_symbol { + self.symbol_ids[&symbol].clone() + } else if alias.is_named { + format!("alias_sym_{}", self.sanitize_identifier(&alias.value)) + } else { + format!("anon_alias_sym_{}", self.sanitize_identifier(&alias.value)) + }; + self.alias_ids.entry(alias.clone()).or_insert(alias_id); + self.alias_map + .entry(alias.clone()) + .or_insert(matching_symbol); } } } - let mut symbol_id_values = HashSet::new(); - for i in 0..self.parse_table.symbols.len() { - self.assign_symbol_id(self.parse_table.symbols[i], &mut symbol_id_values); - } - add_line!(self, "#define LANGUAGE_VERSION {}", 6); - add_line!(self, "#define STATE_COUNT {}", self.parse_table.states.len()); - add_line!(self, "#define SYMBOL_COUNT {}", self.parse_table.symbols.len()); - add_line!(self, "#define ALIAS_COUNT {}", self.unique_aliases.len()); + add_line!( + self, + "#define STATE_COUNT {}", + self.parse_table.states.len() + ); + add_line!( + self, + "#define SYMBOL_COUNT {}", + self.parse_table.symbols.len() + ); + add_line!( + self, + "#define ALIAS_COUNT {}", + self.alias_map.iter().filter(|e| e.1.is_none()).count() + ); add_line!(self, "#define TOKEN_COUNT {}", token_count); - add_line!(self, "#define EXTERNAL_TOKEN_COUNT {}", self.syntax_grammar.external_tokens.len()); - // add_line!(self, "#define MAX_ALIAS_SEQUENCE_LENGTH {}\n", self.parse_table.max_alias_sequence_length); + add_line!( + self, + "#define EXTERNAL_TOKEN_COUNT {}", + self.syntax_grammar.external_tokens.len() + ); + if let Some(max_alias_sequence_length) = self + .parse_table + .alias_sequences + .iter() + .map(|seq| seq.len()) + .max() + { + add_line!( + self, + "#define MAX_ALIAS_SEQUENCE_LENGTH {}", + max_alias_sequence_length + ); + } add_line!(self, ""); } fn add_symbol_enum(&mut self) { add_line!(self, "enum {{"); - self.indent(); - for i in 0..self.parse_table.symbols.len() { - let symbol = self.parse_table.symbols[i]; - if symbol != Symbol::end() { - add_line!(self, "{} = {}", self.symbol_ids[&symbol], i); + indent!(self); + let mut i = 1; + for symbol in self.parse_table.symbols.iter() { + if *symbol != Symbol::end() { + add_line!(self, "{} = {},", self.symbol_ids[&symbol], i); + i += 1; } } - self.dedent(); + for (alias, symbol) in &self.alias_map { + if symbol.is_none() { + add_line!(self, "{} = {},", self.alias_ids[&alias], i); + } + i += 1; + } + dedent!(self); add_line!(self, "}};"); add_line!(self, ""); } fn add_symbol_names_list(&mut self) { add_line!(self, "static const char *ts_symbol_names[] = {{"); - self.indent(); - self.dedent(); + indent!(self); + for symbol in self.parse_table.symbols.iter() { + if *symbol != Symbol::end() { + add_line!( + self, + "[{}] = \"{}\",", + self.symbol_ids[&symbol], + self.sanitize_string(self.metadata_for_symbol(*symbol).0) + ); + } + } + for (alias, symbol) in &self.alias_map { + if symbol.is_none() { + add_line!( + self, + "[{}] = \"{}\",", + self.alias_ids[&alias], + self.sanitize_string(&alias.value) + ); + } + } + dedent!(self); add_line!(self, "}};"); add_line!(self, ""); } - fn assign_symbol_id(&mut self, symbol: Symbol, used_ids: &mut HashSet) { + fn add_symbol_metadata_list(&mut self) { + add_line!( + self, + "static const TSSymbolMetadata ts_symbol_metadata[] = {{" + ); + indent!(self); + for symbol in &self.parse_table.symbols { + add_line!(self, "[{}] = {{", 
self.symbol_ids[&symbol]); + indent!(self); + match self.metadata_for_symbol(*symbol).1 { + VariableType::Named => { + add_line!(self, ".visible = true,"); + add_line!(self, ".named = true,"); + } + VariableType::Anonymous => { + add_line!(self, ".visible = true,"); + add_line!(self, ".named = false,"); + } + VariableType::Hidden => { + add_line!(self, ".visible = false,"); + add_line!(self, ".named = true,"); + } + VariableType::Auxiliary => { + add_line!(self, ".visible = false,"); + add_line!(self, ".named = false,"); + } + } + dedent!(self); + add_line!(self, "}},"); + } + for (alias, matching_symbol) in &self.alias_map { + if matching_symbol.is_none() { + add_line!(self, "[{}] = {{", self.alias_ids[&alias]); + indent!(self); + add_line!(self, ".visible = true,"); + add_line!(self, ".named = {},", alias.is_named); + dedent!(self); + add_line!(self, "}},"); + } + } + dedent!(self); + add_line!(self, "}};"); + add_line!(self, ""); + } + + fn add_alias_sequences(&mut self) { + add_line!( + self, + "static TSSymbol ts_alias_sequences[{}][MAX_ALIAS_SEQUENCE_LENGTH] = {{", + self.parse_table.alias_sequences.len() + ); + indent!(self); + for (i, sequence) in self.parse_table.alias_sequences.iter().enumerate().skip(1) { + add_line!(self, "[{}] = {{", i); + indent!(self); + for (j, alias) in sequence.iter().enumerate() { + if let Some(alias) = alias { + add_line!(self, "[{}] = {},", j, self.alias_ids[&alias]); + } + } + dedent!(self); + add_line!(self, "}},"); + } + dedent!(self); + add_line!(self, "}};"); + add_line!(self, ""); + } + + fn add_lex_function(&mut self, name: &str, lex_table: LexTable) { + add_line!( + self, + "static bool {}(TSLexer *lexer, TSStateId state) {{", + name + ); + indent!(self); + add_line!(self, "START_LEXER();"); + add_line!(self, "switch (state) {{"); + indent!(self); + + for (i, state) in lex_table.states.into_iter().enumerate() { + add_line!(self, "case {}:", i); + indent!(self); + self.add_lex_state(state); + dedent!(self); + } + + add_line!(self, "default:"); + indent!(self); + add_line!(self, "return false;"); + dedent!(self); + + dedent!(self); + add_line!(self, "}}"); + dedent!(self); + add_line!(self, "}}"); + add_line!(self, ""); + } + + fn add_lex_state(&mut self, state: LexState) { + if let Some(accept_action) = state.accept_action { + add_line!( + self, + "ACCEPT_TOKEN({})", + self.symbol_ids[&accept_action.symbol] + ); + } + + let mut ruled_out_characters = HashSet::new(); + for (characters, action) in state.advance_actions { + let previous_length = self.buffer.len(); + + add!(self, "if ("); + if self.add_character_set_condition(&characters, &ruled_out_characters) { + add!(self, ")"); + indent!(self); + if action.in_main_token { + add_line!(self, "ADVANCE({});", action.state); + } else { + add_line!(self, "SKIP({});", action.state); + } + if let CharacterSet::Include(chars) = characters { + ruled_out_characters.extend(chars.iter()); + } + dedent!(self); + } else { + self.buffer.truncate(previous_length); + } + } + + add_line!(self, "END_STATE();"); + } + + fn add_character_set_condition( + &mut self, + characters: &CharacterSet, + ruled_out_characters: &HashSet, + ) -> bool { + true + } + + fn add_lex_modes_list(&mut self) { + self.get_external_scanner_state_id(HashSet::new()); + + let mut external_tokens_by_corresponding_internal_token = HashMap::new(); + for (i, external_token) in self.syntax_grammar.external_tokens.iter().enumerate() { + if let Some(symbol) = external_token.corresponding_internal_token { + 
external_tokens_by_corresponding_internal_token.insert(symbol.index, i); + } + } + + add_line!(self, "static TSLexMode ts_lex_modes[STATE_COUNT] = {{"); + indent!(self); + for i in 0..self.parse_table.states.len() { + let mut external_tokens = HashSet::new(); + for token in self.parse_table.states[i].terminal_entries.keys() { + if token.is_external() { + external_tokens.insert(token.index); + } else if token.is_terminal() { + if let Some(external_index) = + external_tokens_by_corresponding_internal_token.get(&token.index) + { + external_tokens.insert(*external_index); + } + } + } + + let external_state_id = self.get_external_scanner_state_id(external_tokens); + let state = &self.parse_table.states[i]; + if external_state_id > 0 { + add_line!( + self, + "[{}] = {{.lex_state = {}, .external_lex_state = {}}},", + i, + state.lex_state_id, + external_state_id + ); + } else { + add_line!(self, "[{}] = {{.lex_state = {}}},", i, state.lex_state_id); + } + } + dedent!(self); + add_line!(self, "}};"); + add_line!(self, ""); + } + + fn add_external_token_enum(&mut self) { + add_line!(self, "enum {{"); + indent!(self); + for i in 0..self.syntax_grammar.external_tokens.len() { + add_line!( + self, + "{} = {},", + self.external_token_id(&self.syntax_grammar.external_tokens[i]), + i + ); + } + dedent!(self); + add_line!(self, "}};"); + add_line!(self, ""); + } + + fn add_external_scanner_symbol_map(&mut self) { + add_line!( + self, + "static TSSymbol ts_external_scanner_symbol_map[EXTERNAL_TOKEN_COUNT] = {{" + ); + indent!(self); + for i in 0..self.syntax_grammar.external_tokens.len() { + add_line!( + self, + "[{}] = {},", + self.external_token_id(&self.syntax_grammar.external_tokens[i]), + self.symbol_ids[&Symbol::external(i)], + ); + } + dedent!(self); + add_line!(self, "}};"); + add_line!(self, ""); + } + + fn add_external_scanner_states_list(&mut self) { + add_line!( + self, + "static bool ts_external_scanner_states[{}][EXTERNAL_TOKEN_COUNT] = {{", + self.external_scanner_states.len(), + ); + indent!(self); + for i in 0..self.external_scanner_states.len() { + if !self.external_scanner_states[i].is_empty() { + add_line!(self, "[{}] = {{", i); + indent!(self); + for token_index in &self.external_scanner_states[i] { + add_line!( + self, + "[{}] = true,", + self.external_token_id(&self.syntax_grammar.external_tokens[*token_index]) + ); + } + dedent!(self); + add_line!(self, "}},"); + } + } + dedent!(self); + add_line!(self, "}};"); + add_line!(self, ""); + } + + fn add_parse_table(&mut self) { + let mut parse_table_entries = Vec::new(); + let mut next_parse_action_list_index = 0; + + self.get_parse_action_list_id( + &ParseTableEntry { + actions: Vec::new(), + reusable: false, + }, + &mut parse_table_entries, + &mut next_parse_action_list_index, + ); + + add_line!( + self, + "static uint16_t ts_parse_table[STATE_COUNT][SYMBOL_COUNT] = {{" + ); + indent!(self); + for (i, state) in self.parse_table.states.iter().enumerate() { + add_line!(self, "[{}] = {{", i); + indent!(self); + for (symbol, state_id) in &state.nonterminal_entries { + add_line!(self, "[{}] = STATE({}),", self.symbol_ids[symbol], state_id); + } + for (symbol, entry) in &state.terminal_entries { + let entry_id = self.get_parse_action_list_id( + entry, + &mut parse_table_entries, + &mut next_parse_action_list_index, + ); + add_line!( + self, + "[{}] = ACTIONS({}),", + self.symbol_ids[symbol], + entry_id + ); + } + dedent!(self); + add_line!(self, "}},"); + } + dedent!(self); + add_line!(self, "}};"); + add_line!(self, ""); + + 
self.add_parse_action_list(parse_table_entries); + } + + fn add_parse_action_list(&mut self, parse_table_entries: Vec<(usize, ParseTableEntry)>) { + add_line!(self, "static TSParseActionEntry ts_parse_actions[] = {{"); + indent!(self); + for (i, entry) in parse_table_entries { + add!( + self, + " [{}] = {{.count = {}, .reusable = {}}},", + i, + entry.actions.len(), + entry.reusable + ); + for action in entry.actions { + add!(self, " "); + match action { + ParseAction::Accept => add!(self, " ACCEPT_INPUT()"), + ParseAction::Recover => add!(self, "RECOVER()"), + ParseAction::ShiftExtra => add!(self, "SHIFT_EXTRA()"), + ParseAction::Shift { + state, + is_repetition, + } => { + if is_repetition { + add!(self, "SHIFT_REPEAT({})", state); + } else { + add!(self, "SHIFT({})", state); + } + } + ParseAction::Reduce { + symbol, + child_count, + dynamic_precedence, + alias_sequence_id, + .. + } => { + if !self.symbol_ids.contains_key(&symbol) { + eprintln!( + "SYMBOL: {:?} {:?}", + symbol, + self.metadata_for_symbol(symbol) + ); + } + add!(self, "REDUCE({}, {}", self.symbol_ids[&symbol], child_count); + if dynamic_precedence != 0 { + add!(self, ", .dynamic_precedence = {}", dynamic_precedence); + } + if alias_sequence_id != 0 { + add!(self, ", .alias_sequence_id = {}", alias_sequence_id); + } + add!(self, ")"); + } + } + add!(self, ",") + } + add!(self, "\n"); + } + dedent!(self); + add_line!(self, "}};"); + add_line!(self, ""); + } + + fn add_parser_export(&mut self) { + let language_function_name = format!("tree_sitter_{}", self.language_name); + let external_scanner_name = format!("{}_external_scanner", language_function_name); + + if !self.syntax_grammar.external_tokens.is_empty() { + add_line!(self, "void *{}_create();", external_scanner_name); + add_line!(self, "void {}_destroy(void *);", external_scanner_name); + add_line!( + self, + "bool {}_scan(void *, TSLexer *, const bool *);", + external_scanner_name + ); + add_line!( + self, + "unsigned {}_serialize(void *, char *);", + external_scanner_name + ); + add_line!( + self, + "void {}_deserialize(void *, const char *, unsigned);", + external_scanner_name + ); + add_line!(self, ""); + } + + add_line!(self, "#ifdef _WIN32"); + add_line!(self, "#define extern __declspec(dllexport)"); + add_line!(self, "#endif"); + add_line!(self, ""); + + add_line!( + self, + "extern const TSLanguage *{}() {{", + language_function_name + ); + indent!(self); + add_line!(self, "static TSLanguage language = {{"); + indent!(self); + add_line!(self, ".version = LANGUAGE_VERSION,"); + add_line!(self, ".symbol_count = SYMBOL_COUNT,"); + add_line!(self, ".alias_count = ALIAS_COUNT,"); + add_line!(self, ".token_count = TOKEN_COUNT,"); + add_line!(self, ".symbol_metadata = ts_symbol_metadata,"); + add_line!( + self, + ".parse_table = (const unsigned short *)ts_parse_table," + ); + add_line!(self, ".parse_actions = ts_parse_actions,"); + add_line!(self, ".lex_modes = ts_lex_modes,"); + add_line!(self, ".symbol_names = ts_symbol_names,"); + add_line!( + self, + ".alias_sequences = (const TSSymbol *)ts_alias_sequences," + ); + + add_line!( + self, + ".max_alias_sequence_length = MAX_ALIAS_SEQUENCE_LENGTH," + ); + add_line!(self, ".lex_fn = ts_lex,"); + + if let Some(keyword_capture_token) = self.keyword_capture_token { + add_line!(self, ".keyword_lex_fn = ts_lex_keywords,"); + add_line!( + self, + ".keyword_capture_token = {},", + self.symbol_ids[&keyword_capture_token] + ); + } + + add_line!(self, ".external_token_count = EXTERNAL_TOKEN_COUNT,"); + + if 
!self.syntax_grammar.external_tokens.is_empty() { + add_line!(self, ".external_scanner = {{"); + indent!(self); + add_line!(self, "(const bool *)ts_external_scanner_states,"); + add_line!(self, "ts_external_scanner_symbol_map,"); + add_line!(self, "{}_create,", external_scanner_name); + add_line!(self, "{}_destroy,", external_scanner_name); + add_line!(self, "{}_scan,", external_scanner_name); + add_line!(self, "{}_serialize,", external_scanner_name); + add_line!(self, "{}_deserialize,", external_scanner_name); + dedent!(self); + add_line!(self, "}},"); + } + dedent!(self); + + add_line!(self, "}};"); + add_line!(self, "return &language;"); + dedent!(self); + add_line!(self, "}}"); + } + + fn get_parse_action_list_id( + &self, + entry: &ParseTableEntry, + parse_table_entries: &mut Vec<(usize, ParseTableEntry)>, + next_parse_action_list_index: &mut usize, + ) -> usize { + if let Some((index, _)) = parse_table_entries.iter().find(|(_, e)| *e == *entry) { + return *index; + } + + let result = *next_parse_action_list_index; + parse_table_entries.push((result, entry.clone())); + *next_parse_action_list_index += 1 + entry.actions.len(); + result + } + + fn get_external_scanner_state_id(&mut self, external_tokens: HashSet) -> usize { + self.external_scanner_states + .iter() + .position(|tokens| *tokens == external_tokens) + .unwrap_or_else(|| { + self.external_scanner_states.push(external_tokens); + self.external_scanner_states.len() - 1 + }) + } + + fn external_token_id(&self, token: &ExternalToken) -> String { + format!( + "ts_external_token_{}", + self.sanitize_identifier(&token.name) + ) + } + + fn assign_symbol_id(&mut self, symbol: Symbol, used_identifiers: &mut HashSet) { let mut id; if symbol == Symbol::end() { id = "ts_builtin_sym_end".to_string(); } else { let (name, kind) = self.metadata_for_symbol(symbol); id = match kind { - VariableType::Auxiliary => format!("aux_sym_{}", self.sanitize_name(name)), - VariableType::Anonymous => format!("anon_sym_{}", self.sanitize_name(name)), + VariableType::Auxiliary => format!("aux_sym_{}", self.sanitize_identifier(name)), + VariableType::Anonymous => format!("anon_sym_{}", self.sanitize_identifier(name)), VariableType::Hidden | VariableType::Named => { - format!("sym_{}", self.sanitize_name(name)) + format!("sym_{}", self.sanitize_identifier(name)) } }; let mut suffix_number = 1; let mut suffix = String::new(); - while used_ids.contains(&id) { + while used_identifiers.contains(&id) { id.drain(id.len() - suffix.len()..); suffix_number += 1; suffix = suffix_number.to_string(); @@ -149,7 +753,7 @@ impl Generator { } } - used_ids.insert(id.clone()); + used_identifiers.insert(id.clone()); self.symbol_ids.insert(symbol, id); } @@ -171,16 +775,67 @@ impl Generator { } } - fn sanitize_name(&self, name: &str) -> String { - name.to_string() + fn sanitize_identifier(&self, name: &str) -> String { + let mut result = String::with_capacity(name.len()); + for c in name.chars() { + if ('a' <= c && c <= 'z') + || ('A' <= c && c <= 'Z') + || ('0' <= c && c <= '9') + || c == '_' + { + result.push(c); + } else { + result += match c { + '~' => "TILDE", + '`' => "BQUOTE", + '!' 
=> "BANG", + '@' => "AT", + '#' => "POUND", + '$' => "DOLLAR", + '%' => "PERCENT", + '^' => "CARET", + '&' => "AMP", + '*' => "STAR", + '(' => "LPAREN", + ')' => "RPAREN", + '-' => "DASH", + '+' => "PLUS", + '=' => "EQ", + '{' => "LBRACE", + '}' => "RBRACE", + '[' => "LBRACK", + ']' => "RBRACK", + '\\' => "BSLASH", + '|' => "PIPE", + ':' => "COLON", + ';' => "SEMI", + '"' => "DQUOTE", + '\'' => "SQUOTE", + '<' => "LT", + '>' => "GT", + ',' => "COMMA", + '.' => "DOT", + '?' => "QMARK", + '/' => "SLASH", + '\n' => "LF", + '\r' => "CR", + '\t' => "TAB", + _ => continue, + } + } + } + result } - fn indent(&mut self) { - self.indent_level += 1; - } - - fn dedent(&mut self) { - self.indent_level -= 1; + fn sanitize_string(&self, name: &str) -> String { + let mut result = String::with_capacity(name.len()); + for c in name.chars() { + if ['\\', '\n', '\r', '\"'].contains(&c) { + result.push('\\'); + } + result.push(c); + } + result } } @@ -206,9 +861,9 @@ pub(crate) fn render_c_code( lexical_grammar, simple_aliases, symbol_ids: HashMap::new(), - parse_table_entries: Vec::new(), - next_parse_action_list_index: 0, - unique_aliases: HashSet::new(), + alias_ids: HashMap::new(), + external_scanner_states: Vec::new(), + alias_map: HashMap::new(), } .generate() } diff --git a/src/tables.rs b/src/tables.rs index 9100b81e..01cecb49 100644 --- a/src/tables.rs +++ b/src/tables.rs @@ -1,6 +1,7 @@ use std::collections::HashMap; use std::ops::Range; use crate::rules::{Associativity, Symbol, Alias}; +use crate::nfa::CharacterSet; pub(crate) type AliasSequenceId = usize; pub(crate) type ParseStateId = usize; @@ -34,7 +35,8 @@ pub(crate) struct ParseTableEntry { #[derive(Clone, Debug, PartialEq, Eq)] pub(crate) struct ParseState { pub terminal_entries: HashMap, - pub nonterminal_entries: HashMap + pub nonterminal_entries: HashMap, + pub lex_state_id: usize, } #[derive(Debug, PartialEq, Eq)] @@ -60,7 +62,7 @@ pub(crate) struct AcceptTokenAction { #[derive(Clone, Debug, PartialEq, Eq)] pub(crate) struct LexState { - pub advance_actions: HashMap, + pub advance_actions: HashMap, pub accept_action: Option, } @@ -78,6 +80,12 @@ impl ParseTableEntry { } } +impl Default for LexTable { + fn default() -> Self { + LexTable { states: Vec::new() } + } +} + impl ParseAction { pub fn precedence(&self) -> i32 { if let ParseAction::Reduce { precedence, .. 
} = self { From 479400e5d3e7fdc1395868c0f19fe6415cb68bda Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Sat, 29 Dec 2018 13:56:00 -0800 Subject: [PATCH 077/102] Add handling of precedence within tokens --- src/nfa.rs | 366 +++++++++++++++++- src/prepare_grammar/expand_tokens.rs | 557 +++++++++++++++------------ src/prepare_grammar/mod.rs | 14 +- 3 files changed, 670 insertions(+), 267 deletions(-) diff --git a/src/nfa.rs b/src/nfa.rs index f6acb67a..4a4fa17b 100644 --- a/src/nfa.rs +++ b/src/nfa.rs @@ -1,5 +1,8 @@ -use std::fmt; use std::char; +use std::cmp::max; +use std::cmp::Ordering; +use std::fmt; +use std::mem::swap; #[derive(Clone, Debug, PartialEq, Eq, Hash)] pub enum CharacterSet { @@ -13,14 +16,18 @@ pub enum NfaState { chars: CharacterSet, state_id: u32, is_sep: bool, + precedence: i32, }, Split(u32, u32), - Accept(usize), + Accept { + variable_index: usize, + precedence: i32, + }, } #[derive(PartialEq, Eq)] pub struct Nfa { - pub states: Vec + pub states: Vec, } impl Default for Nfa { @@ -78,14 +85,57 @@ impl CharacterSet { } } - pub fn add(self, other: CharacterSet) -> Self { - if let (CharacterSet::Include(mut chars), CharacterSet::Include(other_chars)) = (self, other) { - chars.extend(other_chars); - chars.sort_unstable(); - chars.dedup(); - CharacterSet::Include(chars) + pub fn add(self, other: &CharacterSet) -> Self { + if let CharacterSet::Include(other_chars) = other { + if let CharacterSet::Include(mut chars) = self { + chars.extend(other_chars); + chars.sort_unstable(); + chars.dedup(); + return CharacterSet::Include(chars); + } + } + panic!("Called add with a negated character set"); + } + + pub fn remove_intersection(&mut self, other: &mut CharacterSet) -> CharacterSet { + match self { + CharacterSet::Include(chars) => match other { + CharacterSet::Include(other_chars) => { + CharacterSet::Include(remove_chars(chars, other_chars, true)) + } + CharacterSet::Exclude(other_chars) => { + let mut removed = remove_chars(chars, other_chars, false); + add_chars(other_chars, chars); + swap(&mut removed, chars); + CharacterSet::Include(removed) + } + }, + CharacterSet::Exclude(chars) => match other { + CharacterSet::Include(other_chars) => { + let mut removed = remove_chars(other_chars, chars, false); + add_chars(chars, other_chars); + swap(&mut removed, other_chars); + CharacterSet::Include(removed) + } + CharacterSet::Exclude(other_chars) => { + let removed = remove_chars(chars, other_chars, true); + let mut included_characters = Vec::new(); + let mut other_included_characters = Vec::new(); + swap(&mut included_characters, other_chars); + swap(&mut other_included_characters, chars); + *self = CharacterSet::Include(included_characters); + *other = CharacterSet::Include(other_included_characters); + CharacterSet::Exclude(removed) + } + }, + } + } + + pub fn is_empty(&self) -> bool { + if let CharacterSet::Include(c) = self { + c.is_empty() } else { - panic!("Called add with a negated character set"); + false } } @@ -97,6 +147,84 @@ impl CharacterSet { } } +impl Ord for CharacterSet { + fn cmp(&self, other: &CharacterSet) -> Ordering { + match self { + CharacterSet::Include(chars) => { + if let CharacterSet::Include(other_chars) = other { + compare_chars(chars, other_chars) + } else { + Ordering::Less + } + } + CharacterSet::Exclude(chars) => { + if let CharacterSet::Exclude(other_chars) = other { + compare_chars(chars, other_chars) + } else { + Ordering::Greater + } + } + } + } +} + +impl PartialOrd for CharacterSet { + fn partial_cmp(&self, other: &CharacterSet) -> Option 
{ + Some(self.cmp(other)) + } +} + +fn add_chars(left: &mut Vec, right: &Vec) { + for c in right { + match left.binary_search(c) { + Err(i) => left.insert(i, *c), + _ => {} + } + } +} + +fn remove_chars(left: &mut Vec, right: &mut Vec, mutate_right: bool) -> Vec { + let mut result = Vec::new(); + right.retain(|right_char| { + if let Some(index) = left.iter().position(|left_char| *left_char == *right_char) { + left.remove(index); + result.push(*right_char); + false || !mutate_right + } else { + true + } + }); + result +} + +fn compare_chars(chars: &Vec, other_chars: &Vec) -> Ordering { + if chars.is_empty() { + if other_chars.is_empty() { + Ordering::Equal + } else { + Ordering::Less + } + } else if other_chars.is_empty() { + Ordering::Greater + } else { + let mut other_c = other_chars.iter(); + for c in chars.iter() { + if let Some(other_c) = other_c.next() { + let cmp = c.cmp(other_c); + if cmp != Ordering::Equal { + return cmp; + } + } else { + return Ordering::Greater; + } + } + if other_c.next().is_some() { + return Ordering::Less; + } + Ordering::Equal + } +} + impl Nfa { pub fn new() -> Self { Nfa { states: Vec::new() } @@ -124,17 +252,32 @@ impl fmt::Debug for Nfa { impl<'a> NfaCursor<'a> { pub fn new(nfa: &'a Nfa, mut states: Vec) -> Self { - let mut result = Self { nfa, state_ids: Vec::new(), in_sep: true }; + let mut result = Self { + nfa, + state_ids: Vec::new(), + in_sep: true, + }; result.add_states(&mut states); result } + pub fn reset(&mut self, mut states: Vec) { + self.state_ids.clear(); + self.add_states(&mut states); + } + pub fn advance(&mut self, c: char) -> bool { let mut result = false; let mut new_state_ids = Vec::new(); let mut any_sep_transitions = false; for current_state_id in &self.state_ids { - if let NfaState::Advance { chars, state_id, is_sep } = &self.nfa.states[*current_state_id as usize] { + if let NfaState::Advance { + chars, + state_id, + is_sep, + .. + } = &self.nfa.states[*current_state_id as usize] + { if chars.contains(c) { if *is_sep { any_sep_transitions = true; @@ -152,16 +295,68 @@ impl<'a> NfaCursor<'a> { result } - pub fn finished_id(&self) -> Option { + pub fn successors(&self) -> impl Iterator { + self.state_ids.iter().filter_map(move |id| { + if let NfaState::Advance { + chars, + state_id, + precedence, + .. 
+ } = &self.nfa.states[*id as usize] + { + Some((chars, *precedence, *state_id)) + } else { + None + } + }) + } + + pub fn grouped_successors(&self) -> Vec<(CharacterSet, i32, Vec)> { + Self::group_successors(self.successors()) + } + + fn group_successors<'b>( + iter: impl Iterator, + ) -> Vec<(CharacterSet, i32, Vec)> { + let mut result: Vec<(CharacterSet, i32, Vec)> = Vec::new(); + for (chars, prec, state) in iter { + let mut chars = chars.clone(); + let mut i = 0; + while i < result.len() { + let intersection = result[i].0.remove_intersection(&mut chars); + if !intersection.is_empty() { + let mut states = result[i].2.clone(); + let mut precedence = result[i].1; + states.push(state); + result.insert(i, (intersection, max(precedence, prec), states)); + i += 1; + } + i += 1; + } + if !chars.is_empty() { + result.push((chars, prec, vec![state])); + } + } + result.sort_unstable_by(|a, b| a.0.cmp(&b.0)); + result + } + + pub fn finished_id(&self) -> Option<(usize, i32)> { let mut result = None; for state_id in self.state_ids.iter() { - if let NfaState::Accept(id) = self.nfa.states[*state_id as usize] { + if let NfaState::Accept { + variable_index, + precedence, + } = self.nfa.states[*state_id as usize] + { match result { - None => { - result = Some(id) - }, - Some(existing_id) => if id < existing_id { - result = Some(id) + None => result = Some((variable_index, precedence)), + Some((existing_id, existing_precedence)) => { + if precedence > existing_precedence + || (precedence == existing_precedence && variable_index < existing_id) + { + result = Some((variable_index, precedence)) + } } } } @@ -202,3 +397,136 @@ impl<'a> NfaCursor<'a> { } } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_group_successors() { + let table = [ + ( + vec![ + (CharacterSet::empty().add_range('a', 'f'), 0, 1), + (CharacterSet::empty().add_range('d', 'i'), 1, 2), + ], + vec![ + (CharacterSet::empty().add_range('a', 'c'), 0, vec![1]), + (CharacterSet::empty().add_range('d', 'f'), 1, vec![1, 2]), + (CharacterSet::empty().add_range('g', 'i'), 1, vec![2]), + ], + ), + ( + vec![ + (CharacterSet::empty().add_range('a', 'z'), 0, 1), + (CharacterSet::empty().add_char('d'), 0, 2), + (CharacterSet::empty().add_char('i'), 0, 3), + (CharacterSet::empty().add_char('f'), 0, 4), + ], + vec![ + ( + CharacterSet::empty() + .add_range('a', 'c') + .add_char('e') + .add_range('g', 'h') + .add_range('j', 'z'), + 0, + vec![1], + ), + (CharacterSet::empty().add_char('d'), 0, vec![1, 2]), + (CharacterSet::empty().add_char('f'), 0, vec![1, 4]), + (CharacterSet::empty().add_char('i'), 0, vec![1, 3]), + ], + ), + ]; + + for row in table.iter() { + assert_eq!( + NfaCursor::group_successors(row.0.iter().map(|(c, p, s)| (c, *p, *s))), + row.1 + ); + } + + // let successors = NfaCursor::group_successors( + // [ + // (&CharacterSet::empty().add_range('a', 'f'), 1), + // (&CharacterSet::empty().add_range('d', 'i'), 2), + // ] + // .iter() + // .cloned(), + // ); + // + // assert_eq!( + // successors, + // vec![ + // (CharacterSet::empty().add_range('a', 'c'), vec![1],), + // (CharacterSet::empty().add_range('d', 'f'), vec![1, 2],), + // (CharacterSet::empty().add_range('g', 'i'), vec![2],), + // ] + // ); + } + + #[test] + fn test_character_set_intersection() { + // whitelist - whitelist + // both sets contain 'c', 'd', and 'f' + let mut a = CharacterSet::empty().add_range('a', 'f'); + let mut b = CharacterSet::empty().add_range('c', 'h'); + assert_eq!( + a.remove_intersection(&mut b), + CharacterSet::empty().add_range('c', 'f') 
+ ); + assert_eq!(a, CharacterSet::empty().add_range('a', 'b')); + assert_eq!(b, CharacterSet::empty().add_range('g', 'h')); + + let mut a = CharacterSet::empty().add_range('a', 'f'); + let mut b = CharacterSet::empty().add_range('c', 'h'); + assert_eq!( + b.remove_intersection(&mut a), + CharacterSet::empty().add_range('c', 'f') + ); + assert_eq!(a, CharacterSet::empty().add_range('a', 'b')); + assert_eq!(b, CharacterSet::empty().add_range('g', 'h')); + + // whitelist - blacklist + // both sets contain 'e', 'f', and 'm' + let mut a = CharacterSet::empty() + .add_range('c', 'h') + .add_range('k', 'm'); + let mut b = CharacterSet::empty() + .add_range('a', 'd') + .add_range('g', 'l') + .negate(); + assert_eq!( + a.remove_intersection(&mut b), + CharacterSet::Include(vec!['e', 'f', 'm']) + ); + assert_eq!(a, CharacterSet::Include(vec!['c', 'd', 'g', 'h', 'k', 'l'])); + assert_eq!(b, CharacterSet::empty().add_range('a', 'm').negate()); + + let mut a = CharacterSet::empty() + .add_range('c', 'h') + .add_range('k', 'm'); + let mut b = CharacterSet::empty() + .add_range('a', 'd') + .add_range('g', 'l') + .negate(); + assert_eq!( + b.remove_intersection(&mut a), + CharacterSet::Include(vec!['e', 'f', 'm']) + ); + assert_eq!(a, CharacterSet::Include(vec!['c', 'd', 'g', 'h', 'k', 'l'])); + assert_eq!(b, CharacterSet::empty().add_range('a', 'm').negate()); + + // blacklist - blacklist + // both sets exclude 'c', 'd', and 'e' + let mut a = CharacterSet::empty().add_range('a', 'e').negate(); + let mut b = CharacterSet::empty().add_range('c', 'h').negate(); + assert_eq!( + a.remove_intersection(&mut b), + CharacterSet::Exclude(vec!['c', 'd', 'e']) + ); + assert_eq!(a, CharacterSet::Include(vec!['f', 'g', 'h'])); + assert_eq!(b, CharacterSet::Include(vec!['a', 'b'])); + } +} diff --git a/src/prepare_grammar/expand_tokens.rs b/src/prepare_grammar/expand_tokens.rs index 5ee9861f..b0d2ae04 100644 --- a/src/prepare_grammar/expand_tokens.rs +++ b/src/prepare_grammar/expand_tokens.rs @@ -7,8 +7,18 @@ use regex_syntax::ast::{ parse, Ast, Class, ClassPerlKind, ClassSet, ClassSetItem, RepetitionKind, RepetitionRange, }; -pub(super) fn expand_tokens(mut grammar: ExtractedLexicalGrammar) -> Result { - let mut nfa = Nfa::new(); +struct NfaBuilder { + nfa: Nfa, + is_sep: bool, + precedence_stack: Vec, +} + +pub(crate) fn expand_tokens(mut grammar: ExtractedLexicalGrammar) -> Result { + let mut builder = NfaBuilder { + nfa: Nfa::new(), + is_sep: true, + precedence_stack: vec![0], + }; let separator_rule = if grammar.separators.len() > 0 { grammar.separators.push(Rule::Blank); @@ -24,281 +34,325 @@ pub(super) fn expand_tokens(mut grammar: ExtractedLexicalGrammar) -> Result false, }; - nfa.states.push(NfaState::Accept(i)); - let last_state_id = nfa.last_state_id(); - expand_rule(&variable.rule, &mut nfa, last_state_id, false).map_err(|e| match e { - Error::RegexError(msg) => Error::RegexError(format!("Rule {} {}", variable.name, msg)), - _ => e, - })?; + builder.is_sep = false; + builder.nfa.states.push(NfaState::Accept { + variable_index: i, + precedence: 0, + }); + let last_state_id = builder.nfa.last_state_id(); + builder + .expand_rule(&variable.rule, last_state_id) + .map_err(|e| match e { + Error::RegexError(msg) => { + Error::RegexError(format!("Rule {} {}", variable.name, msg)) + } + _ => e, + })?; if !is_immediate_token { - let last_state_id = nfa.last_state_id(); - expand_rule(&separator_rule, &mut nfa, last_state_id, true)?; + builder.is_sep = true; + let last_state_id = builder.nfa.last_state_id(); + 
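+ // Note that the NFA is built back-to-front, by prepending states.
+ // Expanding the separator rule after the token rule therefore makes
+ // the separator's states run *before* the token's own states when
+ // this variable is matched.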
builder.expand_rule(&separator_rule, last_state_id)?; } variables.push(LexicalVariable { name: variable.name, kind: variable.kind, - start_state: nfa.last_state_id(), + start_state: builder.nfa.last_state_id(), }); } - Ok(LexicalGrammar { nfa, variables }) + Ok(LexicalGrammar { + nfa: builder.nfa, + variables, + }) } -fn expand_rule(rule: &Rule, nfa: &mut Nfa, mut next_state_id: u32, is_sep: bool) -> Result { - match rule { - Rule::Pattern(s) => { - let ast = parse::Parser::new() - .parse(&s) - .map_err(|e| Error::GrammarError(e.to_string()))?; - expand_regex(&ast, nfa, next_state_id, is_sep) - } - Rule::String(s) => { - for c in s.chars().rev() { - nfa.prepend(|last_state_id| NfaState::Advance { - chars: CharacterSet::empty().add_char(c), - state_id: last_state_id, - is_sep, - }); +impl NfaBuilder { + fn expand_rule(&mut self, rule: &Rule, mut next_state_id: u32) -> Result { + match rule { + Rule::Pattern(s) => { + let ast = parse::Parser::new() + .parse(&s) + .map_err(|e| Error::GrammarError(e.to_string()))?; + self.expand_regex(&ast, next_state_id) } - Ok(s.len() > 0) - } - Rule::Choice(elements) => { - let mut alternative_state_ids = Vec::new(); - for element in elements { - if expand_rule(element, nfa, next_state_id, is_sep)? { - alternative_state_ids.push(nfa.last_state_id()); - } else { - alternative_state_ids.push(next_state_id); + Rule::String(s) => { + for c in s.chars().rev() { + self.push_advance(CharacterSet::empty().add_char(c), self.nfa.last_state_id()); } + Ok(s.len() > 0) } - alternative_state_ids.retain(|i| *i != nfa.last_state_id()); - for alternative_state_id in alternative_state_ids { - nfa.prepend(|last_state_id| NfaState::Split(last_state_id, alternative_state_id)); - } - Ok(true) - } - Rule::Seq(elements) => { - let mut result = false; - for element in elements.into_iter().rev() { - if expand_rule(element, nfa, next_state_id, is_sep)? { - result = true; + Rule::Choice(elements) => { + let mut alternative_state_ids = Vec::new(); + for element in elements { + if self.expand_rule(element, next_state_id)? { + alternative_state_ids.push(self.nfa.last_state_id()); + } else { + alternative_state_ids.push(next_state_id); + } } - next_state_id = nfa.last_state_id(); - } - Ok(result) - } - Rule::Repeat(rule) => { - nfa.states.push(NfaState::Accept(0)); // Placeholder for split - let split_state_id = nfa.last_state_id(); - if expand_rule(rule, nfa, split_state_id, is_sep)? { - nfa.states[split_state_id as usize] = - NfaState::Split(nfa.last_state_id(), next_state_id); - Ok(true) - } else { - Ok(false) - } - } - Rule::Metadata { rule, .. } => { - // TODO - implement precedence - expand_rule(rule, nfa, next_state_id, is_sep) - } - Rule::Blank => Ok(false), - _ => Err(Error::grammar(&format!("Unexpected rule {:?}", rule))), - } -} - -fn expand_one_or_more(ast: &Ast, nfa: &mut Nfa, next_state_id: u32, is_sep: bool) -> Result { - nfa.states.push(NfaState::Accept(0)); // Placeholder for split - let split_state_id = nfa.last_state_id(); - if expand_regex(&ast, nfa, split_state_id, is_sep)? { - nfa.states[split_state_id as usize] = NfaState::Split(nfa.last_state_id(), next_state_id); - Ok(true) - } else { - nfa.states.pop(); - Ok(false) - } -} - -fn expand_zero_or_one(ast: &Ast, nfa: &mut Nfa, next_state_id: u32, is_sep: bool) -> Result { - if expand_regex(ast, nfa, next_state_id, is_sep)? 
{ - nfa.prepend(|last_state_id| NfaState::Split(next_state_id, last_state_id)); - Ok(true) - } else { - Ok(false) - } -} - -fn expand_zero_or_more(ast: &Ast, nfa: &mut Nfa, next_state_id: u32, is_sep: bool) -> Result { - if expand_one_or_more(&ast, nfa, next_state_id, is_sep)? { - nfa.prepend(|last_state_id| NfaState::Split(last_state_id, next_state_id)); - Ok(true) - } else { - Ok(false) - } -} - -fn expand_count( - ast: &Ast, - count: u32, - nfa: &mut Nfa, - mut next_state_id: u32, - is_sep: bool, -) -> Result { - let mut result = false; - for _ in 0..count { - if expand_regex(ast, nfa, next_state_id, is_sep)? { - result = true; - next_state_id = nfa.last_state_id(); - } - } - Ok(result) -} - -fn expand_regex(ast: &Ast, nfa: &mut Nfa, mut next_state_id: u32, is_sep: bool) -> Result { - match ast { - Ast::Empty(_) => Ok(false), - Ast::Flags(_) => Err(Error::regex("Flags are not supported")), - Ast::Literal(literal) => { - nfa.states.push(NfaState::Advance { - chars: CharacterSet::Include(vec![literal.c]), - state_id: next_state_id, - is_sep, - }); - Ok(true) - } - Ast::Dot(_) => { - nfa.states.push(NfaState::Advance { - chars: CharacterSet::Exclude(vec!['\n']), - state_id: next_state_id, - is_sep, - }); - Ok(true) - } - Ast::Assertion(_) => Err(Error::regex("Assertions are not supported")), - Ast::Class(class) => match class { - Class::Unicode(_) => Err(Error::regex("Unicode character classes are not supported")), - Class::Perl(class) => { - nfa.states.push(NfaState::Advance { - chars: expand_perl_character_class(&class.kind), - state_id: next_state_id, - is_sep, - }); - Ok(true) - } - Class::Bracketed(class) => match &class.kind { - ClassSet::Item(item) => { - let character_set = expand_character_class(&item)?; - nfa.states.push(NfaState::Advance { - chars: character_set, - state_id: next_state_id, - is_sep, + alternative_state_ids.retain(|i| *i != self.nfa.last_state_id()); + for alternative_state_id in alternative_state_ids { + self.nfa.prepend(|last_state_id| { + NfaState::Split(last_state_id, alternative_state_id) }); - Ok(true) } - ClassSet::BinaryOp(_) => Err(Error::regex( - "Binary operators in character classes aren't supported", - )), - }, - }, - Ast::Repetition(repetition) => match repetition.op.kind { - RepetitionKind::ZeroOrOne => { - expand_zero_or_one(&repetition.ast, nfa, next_state_id, is_sep) + Ok(true) } - RepetitionKind::OneOrMore => { - expand_one_or_more(&repetition.ast, nfa, next_state_id, is_sep) + Rule::Seq(elements) => { + let mut result = false; + for element in elements.into_iter().rev() { + if self.expand_rule(element, next_state_id)? { + result = true; + } + next_state_id = self.nfa.last_state_id(); + } + Ok(result) } - RepetitionKind::ZeroOrMore => { - expand_zero_or_more(&repetition.ast, nfa, next_state_id, is_sep) - } - RepetitionKind::Range(RepetitionRange::Exactly(count)) => { - expand_count(&repetition.ast, count, nfa, next_state_id, is_sep) - } - RepetitionKind::Range(RepetitionRange::AtLeast(min)) => { - if expand_zero_or_more(&repetition.ast, nfa, next_state_id, is_sep)? { - expand_count(&repetition.ast, min, nfa, next_state_id, is_sep) + Rule::Repeat(rule) => { + self.nfa.states.push(NfaState::Accept { + variable_index: 0, + precedence: 0, + }); // Placeholder for split + let split_state_id = self.nfa.last_state_id(); + if self.expand_rule(rule, split_state_id)? 
{ + self.nfa.states[split_state_id as usize] = + NfaState::Split(self.nfa.last_state_id(), next_state_id); + Ok(true) } else { Ok(false) } } - RepetitionKind::Range(RepetitionRange::Bounded(min, max)) => { - let mut result = expand_count(&repetition.ast, min, nfa, next_state_id, is_sep)?; - for _ in min..max { - if result { - next_state_id = nfa.last_state_id(); + Rule::Metadata { rule, params } => { + if let Some(precedence) = params.precedence { + self.precedence_stack.push(precedence); + } + let result = self.expand_rule(rule, next_state_id); + if params.precedence.is_some() { + self.precedence_stack.pop(); + } + result + } + Rule::Blank => Ok(false), + _ => Err(Error::grammar(&format!("Unexpected rule {:?}", rule))), + } + } + + fn expand_regex(&mut self, ast: &Ast, mut next_state_id: u32) -> Result { + match ast { + Ast::Empty(_) => Ok(false), + Ast::Flags(_) => Err(Error::regex("Flags are not supported")), + Ast::Literal(literal) => { + self.push_advance(CharacterSet::Include(vec![literal.c]), next_state_id); + Ok(true) + } + Ast::Dot(_) => { + self.push_advance(CharacterSet::Exclude(vec!['\n']), next_state_id); + Ok(true) + } + Ast::Assertion(_) => Err(Error::regex("Assertions are not supported")), + Ast::Class(class) => match class { + Class::Unicode(_) => { + Err(Error::regex("Unicode character classes are not supported")) + } + Class::Perl(class) => { + self.push_advance(self.expand_perl_character_class(&class.kind), next_state_id); + Ok(true) + } + Class::Bracketed(class) => match &class.kind { + ClassSet::Item(item) => { + self.push_advance(self.expand_character_class(&item)?, next_state_id); + Ok(true) } - if expand_zero_or_one(&repetition.ast, nfa, next_state_id, is_sep)? { + ClassSet::BinaryOp(_) => Err(Error::regex( + "Binary operators in character classes aren't supported", + )), + }, + }, + Ast::Repetition(repetition) => match repetition.op.kind { + RepetitionKind::ZeroOrOne => { + self.expand_zero_or_one(&repetition.ast, next_state_id) + } + RepetitionKind::OneOrMore => { + self.expand_one_or_more(&repetition.ast, next_state_id) + } + RepetitionKind::ZeroOrMore => { + self.expand_zero_or_more(&repetition.ast, next_state_id) + } + RepetitionKind::Range(RepetitionRange::Exactly(count)) => { + self.expand_count(&repetition.ast, count, next_state_id) + } + RepetitionKind::Range(RepetitionRange::AtLeast(min)) => { + if self.expand_zero_or_more(&repetition.ast, next_state_id)? { + self.expand_count(&repetition.ast, min, next_state_id) + } else { + Ok(false) + } + } + RepetitionKind::Range(RepetitionRange::Bounded(min, max)) => { + let mut result = self.expand_count(&repetition.ast, min, next_state_id)?; + for _ in min..max { + if result { + next_state_id = self.nfa.last_state_id(); + } + if self.expand_zero_or_one(&repetition.ast, next_state_id)? { + result = true; + } + } + Ok(result) + } + }, + Ast::Group(group) => self.expand_regex(&group.ast, self.nfa.last_state_id()), + Ast::Alternation(alternation) => { + let mut alternative_state_ids = Vec::new(); + for ast in alternation.asts.iter() { + if self.expand_regex(&ast, next_state_id)? 
{ + alternative_state_ids.push(self.nfa.last_state_id()); + } else { + alternative_state_ids.push(next_state_id); + } + } + alternative_state_ids.sort_unstable(); + alternative_state_ids.dedup(); + alternative_state_ids.retain(|i| *i != self.nfa.last_state_id()); + + for alternative_state_id in alternative_state_ids { + self.nfa.prepend(|last_state_id| { + NfaState::Split(last_state_id, alternative_state_id) + }); + } + Ok(true) + } + Ast::Concat(concat) => { + let mut result = false; + for ast in concat.asts.iter().rev() { + if self.expand_regex(&ast, next_state_id)? { result = true; + next_state_id = self.nfa.last_state_id(); } } Ok(result) } - }, - Ast::Group(group) => expand_regex(&group.ast, nfa, nfa.last_state_id(), is_sep), - Ast::Alternation(alternation) => { - let mut alternative_state_ids = Vec::new(); - for ast in alternation.asts.iter() { - if expand_regex(&ast, nfa, next_state_id, is_sep)? { - alternative_state_ids.push(nfa.last_state_id()); - } else { - alternative_state_ids.push(next_state_id); - } - } - alternative_state_ids.retain(|i| *i != nfa.last_state_id()); - for alternative_state_id in alternative_state_ids { - nfa.prepend(|last_state_id| NfaState::Split(last_state_id, alternative_state_id)); - } + } + } + + fn expand_one_or_more(&mut self, ast: &Ast, next_state_id: u32) -> Result { + self.nfa.states.push(NfaState::Accept { + variable_index: 0, + precedence: 0, + }); // Placeholder for split + let split_state_id = self.nfa.last_state_id(); + if self.expand_regex(&ast, split_state_id)? { + self.nfa.states[split_state_id as usize] = + NfaState::Split(self.nfa.last_state_id(), next_state_id); Ok(true) + } else { + self.nfa.states.pop(); + Ok(false) } - Ast::Concat(concat) => { - let mut result = false; - for ast in concat.asts.iter().rev() { - if expand_regex(&ast, nfa, next_state_id, is_sep)? { - result = true; - next_state_id = nfa.last_state_id(); + } + + fn expand_zero_or_one(&mut self, ast: &Ast, next_state_id: u32) -> Result { + if self.expand_regex(ast, next_state_id)? { + self.nfa + .prepend(|last_state_id| NfaState::Split(next_state_id, last_state_id)); + Ok(true) + } else { + Ok(false) + } + } + + fn expand_zero_or_more(&mut self, ast: &Ast, next_state_id: u32) -> Result { + if self.expand_one_or_more(&ast, next_state_id)? { + self.nfa + .prepend(|last_state_id| NfaState::Split(last_state_id, next_state_id)); + Ok(true) + } else { + Ok(false) + } + } + + fn expand_count(&mut self, ast: &Ast, count: u32, mut next_state_id: u32) -> Result { + let mut result = false; + for _ in 0..count { + if self.expand_regex(ast, next_state_id)? 
{ + result = true; + next_state_id = self.nfa.last_state_id(); + } + } + Ok(result) + } + + fn expand_character_class(&self, item: &ClassSetItem) -> Result { + match item { + ClassSetItem::Empty(_) => Ok(CharacterSet::Include(Vec::new())), + ClassSetItem::Literal(literal) => Ok(CharacterSet::Include(vec![literal.c])), + ClassSetItem::Range(range) => { + Ok(CharacterSet::empty().add_range(range.start.c, range.end.c)) + } + ClassSetItem::Union(union) => { + let mut result = CharacterSet::empty(); + for item in &union.items { + result = result.add(&self.expand_character_class(&item)?); } + Ok(result) } - Ok(result) + ClassSetItem::Perl(class) => Ok(self.expand_perl_character_class(&class.kind)), + _ => Err(Error::regex(&format!( + "Unsupported character class syntax {:?}", + item + ))), } } -} -fn expand_character_class(item: &ClassSetItem) -> Result { - match item { - ClassSetItem::Empty(_) => Ok(CharacterSet::Include(Vec::new())), - ClassSetItem::Literal(literal) => Ok(CharacterSet::Include(vec![literal.c])), - ClassSetItem::Range(range) => { - Ok(CharacterSet::empty().add_range(range.start.c, range.end.c)) + fn expand_perl_character_class(&self, item: &ClassPerlKind) -> CharacterSet { + match item { + ClassPerlKind::Digit => CharacterSet::empty().add_range('0', '9'), + ClassPerlKind::Space => CharacterSet::empty() + .add_char(' ') + .add_char('\t') + .add_char('\r') + .add_char('\n'), + ClassPerlKind::Word => CharacterSet::empty() + .add_char('_') + .add_range('A', 'Z') + .add_range('a', 'z') + .add_range('0', '9'), } - ClassSetItem::Union(union) => { - let mut result = CharacterSet::empty(); - for item in &union.items { - result = result.add(expand_character_class(&item)?); - } - Ok(result) - } - ClassSetItem::Perl(class) => Ok(expand_perl_character_class(&class.kind)), - _ => Err(Error::regex(&format!( - "Unsupported character class syntax {:?}", - item - ))), } -} -fn expand_perl_character_class(item: &ClassPerlKind) -> CharacterSet { - match item { - ClassPerlKind::Digit => CharacterSet::empty().add_range('0', '9'), - ClassPerlKind::Space => CharacterSet::empty() - .add_char(' ') - .add_char('\t') - .add_char('\r') - .add_char('\n'), - ClassPerlKind::Word => CharacterSet::empty() - .add_char('_') - .add_range('A', 'Z') - .add_range('a', 'z') - .add_range('0', '9'), + fn push_advance(&mut self, chars: CharacterSet, state_id: u32) { + let precedence = *self.precedence_stack.last().unwrap(); + self.add_precedence(precedence, vec![state_id]); + self.nfa.states.push(NfaState::Advance { + chars, + state_id, + precedence, + is_sep: self.is_sep, + }); + } + + fn add_precedence(&mut self, prec: i32, mut state_ids: Vec) { + let mut i = 0; + while i < state_ids.len() { + let state_id = state_ids[i]; + let (left, right) = match &mut self.nfa.states[state_id as usize] { + NfaState::Accept {precedence, ..} => { + *precedence = prec; + return; + }, + NfaState::Split(left, right) => (*left, *right), + _ => return + }; + if !state_ids.contains(&left) { + state_ids.push(left); + } + if !state_ids.contains(&right) { + state_ids.push(right); + } + i += 1; + } } } @@ -313,11 +367,15 @@ mod tests { let mut cursor = NfaCursor::new(&grammar.nfa, start_states); let mut result = None; + let mut result_precedence = 0; let mut start_char = 0; let mut end_char = 0; for c in s.chars() { - if let Some(id) = cursor.finished_id() { - result = Some((id, &s[start_char..end_char])); + if let Some((id, finished_precedence)) = cursor.finished_id() { + if result.is_none() || result_precedence <= finished_precedence { + 
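+ // The `<=` lets a later match at equal precedence replace an
+ // earlier one, so longer tokens win ties; a strictly higher
+ // precedence always wins outright.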
result = Some((id, &s[start_char..end_char])); + result_precedence = finished_precedence; + } } if cursor.advance(c) { end_char += 1; @@ -329,8 +387,11 @@ mod tests { } } - if let Some(id) = cursor.finished_id() { - result = Some((id, &s[start_char..end_char])); + if let Some((id, finished_precedence)) = cursor.finished_id() { + if result.is_none() || result_precedence <= finished_precedence { + result = Some((id, &s[start_char..end_char])); + result_precedence = finished_precedence; + } } result @@ -443,6 +504,20 @@ mod tests { (" \\\na", Some((0, "a"))), ], }, + // shorter tokens with higher precedence + Row { + rules: vec![ + Rule::prec(2, Rule::pattern("abc")), + Rule::prec(1, Rule::pattern("ab[cd]e")), + Rule::pattern("[a-e]+"), + ], + separators: vec![Rule::string("\\\n"), Rule::pattern("\\s")], + examples: vec![ + ("abceef", Some((0, "abc"))), + ("abdeef", Some((1, "abde"))), + ("aeeeef", Some((2, "aeeee"))), + ], + }, ]; for Row { diff --git a/src/prepare_grammar/mod.rs b/src/prepare_grammar/mod.rs index f325383b..b0c1d2a3 100644 --- a/src/prepare_grammar/mod.rs +++ b/src/prepare_grammar/mod.rs @@ -7,7 +7,7 @@ mod intern_symbols; mod process_inlines; use self::expand_repeats::expand_repeats; -use self::expand_tokens::expand_tokens; +pub(crate) use self::expand_tokens::expand_tokens; use self::extract_simple_aliases::extract_simple_aliases; use self::extract_tokens::extract_tokens; use self::flatten_grammar::flatten_grammar; @@ -19,7 +19,7 @@ use crate::grammars::{ }; use crate::rules::{AliasMap, Rule, Symbol}; -pub(self) struct IntermediateGrammar { +pub(crate) struct IntermediateGrammar { variables: Vec, extra_tokens: Vec, expected_conflicts: Vec>, @@ -28,14 +28,14 @@ pub(self) struct IntermediateGrammar { word_token: Option, } -pub(self) type InternedGrammar = IntermediateGrammar; +pub(crate) type InternedGrammar = IntermediateGrammar; -pub(self) type ExtractedSyntaxGrammar = IntermediateGrammar; +pub(crate) type ExtractedSyntaxGrammar = IntermediateGrammar; #[derive(Debug, PartialEq, Eq)] -pub(self) struct ExtractedLexicalGrammar { - variables: Vec, - separators: Vec, +pub(crate) struct ExtractedLexicalGrammar { + pub variables: Vec, + pub separators: Vec, } pub(crate) fn prepare_grammar( From 605b50e58bf03661774ce7eb18f3b98dbd767ce3 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Sat, 29 Dec 2018 13:57:34 -0800 Subject: [PATCH 078/102] Start work on shrinking parse table --- src/build_tables/build_parse_table.rs | 605 ++++++++++++++++++++++++ src/build_tables/mod.rs | 630 +------------------------ src/build_tables/shrink_parse_table.rs | 117 +++++ src/build_tables/token_conflict_map.rs | 77 +++ src/tables.rs | 56 ++- 5 files changed, 866 insertions(+), 619 deletions(-) create mode 100644 src/build_tables/build_parse_table.rs create mode 100644 src/build_tables/shrink_parse_table.rs create mode 100644 src/build_tables/token_conflict_map.rs diff --git a/src/build_tables/build_parse_table.rs b/src/build_tables/build_parse_table.rs new file mode 100644 index 00000000..5087c55c --- /dev/null +++ b/src/build_tables/build_parse_table.rs @@ -0,0 +1,605 @@ +use super::item::{LookaheadSet, ParseItem, ParseItemSet}; +use super::item_set_builder::ParseItemSetBuilder; +use crate::error::{Error, Result}; +use crate::grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar, VariableType}; +use crate::rules::{Alias, AliasMap, Associativity, Symbol, SymbolType}; +use crate::tables::{ + AliasSequenceId, ParseAction, ParseState, ParseStateId, ParseTable, ParseTableEntry, +}; +use 
core::ops::Range; +use std::collections::hash_map::Entry; +use std::collections::{HashMap, HashSet, VecDeque}; +use std::fmt::Write; + +#[derive(Clone)] +struct AuxiliarySymbolInfo { + auxiliary_symbol: Symbol, + parent_symbols: Vec, +} + +type SymbolSequence = Vec; +type AuxiliarySymbolSequence = Vec; + +struct ParseStateQueueEntry { + preceding_symbols: SymbolSequence, + preceding_auxiliary_symbols: AuxiliarySymbolSequence, + state_id: ParseStateId, +} + +struct ParseTableBuilder<'a> { + item_set_builder: ParseItemSetBuilder<'a>, + syntax_grammar: &'a SyntaxGrammar, + lexical_grammar: &'a LexicalGrammar, + inlines: &'a InlinedProductionMap, + state_ids_by_item_set: HashMap, ParseStateId>, + item_sets_by_state_id: Vec>, + parse_state_queue: VecDeque, + parse_table: ParseTable, +} + +impl<'a> ParseTableBuilder<'a> { + fn build(mut self) -> Result { + // Ensure that the empty alias sequence has index 0. + self.parse_table.alias_sequences.push(Vec::new()); + + // Ensure that the error state has index 0. + let error_state_id = + self.add_parse_state(&Vec::new(), &Vec::new(), ParseItemSet::default()); + + self.add_parse_state( + &Vec::new(), + &Vec::new(), + ParseItemSet::with( + [(ParseItem::start(), LookaheadSet::with(&[Symbol::end()]))] + .iter() + .cloned(), + ), + ); + + self.process_part_state_queue()?; + self.populate_used_symbols(); + Ok(self.parse_table) + } + + fn add_parse_state( + &mut self, + preceding_symbols: &SymbolSequence, + preceding_auxiliary_symbols: &AuxiliarySymbolSequence, + item_set: ParseItemSet<'a>, + ) -> ParseStateId { + match self.state_ids_by_item_set.entry(item_set) { + Entry::Occupied(o) => *o.get(), + Entry::Vacant(v) => { + let state_id = self.parse_table.states.len(); + self.item_sets_by_state_id.push(v.key().clone()); + self.parse_table.states.push(ParseState { + lex_state_id: 0, + terminal_entries: HashMap::new(), + nonterminal_entries: HashMap::new(), + }); + self.parse_state_queue.push_back(ParseStateQueueEntry { + state_id, + preceding_symbols: preceding_symbols.clone(), + preceding_auxiliary_symbols: preceding_auxiliary_symbols.clone(), + }); + v.insert(state_id); + state_id + } + } + } + + fn process_part_state_queue(&mut self) -> Result<()> { + while let Some(entry) = self.parse_state_queue.pop_front() { + let debug = false; + + if debug { + println!( + "ITEM SET {}:\n{}", + entry.state_id, + self.item_sets_by_state_id[entry.state_id] + .display_with(&self.syntax_grammar, &self.lexical_grammar,) + ); + } + + let item_set = self.item_set_builder.transitive_closure( + &self.item_sets_by_state_id[entry.state_id], + self.syntax_grammar, + self.inlines, + ); + + if debug { + println!( + "TRANSITIVE CLOSURE:\n{}", + item_set.display_with(&self.syntax_grammar, &self.lexical_grammar) + ); + } + + self.add_actions( + entry.preceding_symbols, + entry.preceding_auxiliary_symbols, + item_set, + entry.state_id, + )?; + } + Ok(()) + } + + fn add_actions( + &mut self, + mut preceding_symbols: SymbolSequence, + mut preceding_auxiliary_symbols: Vec, + item_set: ParseItemSet<'a>, + state_id: ParseStateId, + ) -> Result<()> { + let mut terminal_successors = HashMap::new(); + let mut non_terminal_successors = HashMap::new(); + let mut lookaheads_with_conflicts = HashSet::new(); + + for (item, lookaheads) in &item_set.entries { + if let Some(next_symbol) = item.symbol() { + let successor = item.successor(); + if next_symbol.is_non_terminal() { + // Keep track of where auxiliary non-terminals (repeat symbols) are + // used within visible symbols. 
This information may be needed later + // for conflict resolution. + if self.syntax_grammar.variables[next_symbol.index].is_auxiliary() { + preceding_auxiliary_symbols + .push(self.get_auxiliary_node_info(&item_set, next_symbol)); + } + + non_terminal_successors + .entry(next_symbol) + .or_insert_with(|| ParseItemSet::default()) + .entries + .entry(successor) + .or_insert_with(|| LookaheadSet::new()) + .insert_all(lookaheads); + } else { + terminal_successors + .entry(next_symbol) + .or_insert_with(|| ParseItemSet::default()) + .entries + .entry(successor) + .or_insert_with(|| LookaheadSet::new()) + .insert_all(lookaheads); + } + } else { + let action = if item.is_augmented() { + ParseAction::Accept + } else { + ParseAction::Reduce { + symbol: Symbol::non_terminal(item.variable_index as usize), + child_count: item.step_index as usize, + precedence: item.precedence(), + associativity: item.associativity(), + dynamic_precedence: item.production.dynamic_precedence, + alias_sequence_id: self.get_alias_sequence_id(item), + } + }; + + for lookahead in lookaheads.iter() { + let entry = self.parse_table.states[state_id] + .terminal_entries + .entry(lookahead); + let entry = entry.or_insert_with(|| ParseTableEntry::new()); + if entry.actions.is_empty() { + entry.actions.push(action); + } else if action.precedence() > entry.actions[0].precedence() { + entry.actions.clear(); + entry.actions.push(action); + lookaheads_with_conflicts.remove(&lookahead); + } else if action.precedence() == entry.actions[0].precedence() { + entry.actions.push(action); + lookaheads_with_conflicts.insert(lookahead); + } + } + } + } + + for (symbol, next_item_set) in terminal_successors { + preceding_symbols.push(symbol); + let next_state_id = self.add_parse_state( + &preceding_symbols, + &preceding_auxiliary_symbols, + next_item_set, + ); + preceding_symbols.pop(); + + let entry = self.parse_table.states[state_id] + .terminal_entries + .entry(symbol); + if let Entry::Occupied(e) = &entry { + if !e.get().actions.is_empty() { + lookaheads_with_conflicts.insert(symbol); + } + } + + entry + .or_insert_with(|| ParseTableEntry::new()) + .actions + .push(ParseAction::Shift { + state: next_state_id, + is_repetition: false, + }); + } + + for (symbol, next_item_set) in non_terminal_successors { + preceding_symbols.push(symbol); + let next_state_id = self.add_parse_state( + &preceding_symbols, + &preceding_auxiliary_symbols, + next_item_set, + ); + preceding_symbols.pop(); + self.parse_table.states[state_id] + .nonterminal_entries + .insert(symbol, next_state_id); + } + + for symbol in lookaheads_with_conflicts { + self.handle_conflict( + &item_set, + state_id, + &preceding_symbols, + &preceding_auxiliary_symbols, + symbol, + )?; + } + + let state = &mut self.parse_table.states[state_id]; + for extra_token in &self.syntax_grammar.extra_tokens { + state + .terminal_entries + .entry(*extra_token) + .or_insert(ParseTableEntry { + reusable: true, + actions: vec![ParseAction::ShiftExtra], + }); + } + + Ok(()) + } + + fn handle_conflict( + &mut self, + item_set: &ParseItemSet, + state_id: ParseStateId, + preceding_symbols: &SymbolSequence, + preceding_auxiliary_symbols: &Vec, + conflicting_lookahead: Symbol, + ) -> Result<()> { + let entry = self.parse_table.states[state_id] + .terminal_entries + .get_mut(&conflicting_lookahead) + .unwrap(); + + // Determine which items in the set conflict with each other, and the + // precedences associated with SHIFT vs REDUCE actions. 
There won't
+ // be multiple REDUCE actions with different precedences; that is
+ // sorted out ahead of time in `add_actions`. But there can still be
+ // REDUCE-REDUCE conflicts where all actions have the *same*
+ // precedence, and there can still be SHIFT/REDUCE conflicts.
+ let reduce_precedence = entry.actions[0].precedence();
+ let mut considered_associativity = false;
+ let mut shift_precedence: Option<Range<i32>> = None;
+ let mut conflicting_items = HashSet::new();
+ for (item, lookaheads) in &item_set.entries {
+ if let Some(step) = item.step() {
+ if item.step_index > 0 {
+ if self
+ .item_set_builder
+ .first_set(&step.symbol)
+ .contains(&conflicting_lookahead)
+ {
+ conflicting_items.insert(item);
+ let precedence = item.precedence();
+ if let Some(range) = &mut shift_precedence {
+ if precedence < range.start {
+ range.start = precedence;
+ } else if precedence > range.end {
+ range.end = precedence;
+ }
+ } else {
+ shift_precedence = Some(precedence..precedence);
+ }
+ }
+ }
+ } else if lookaheads.contains(&conflicting_lookahead) {
+ conflicting_items.insert(item);
+ }
+ }
+
+ if let ParseAction::Shift { is_repetition, .. } = entry.actions.last_mut().unwrap() {
+ let shift_precedence = shift_precedence.unwrap_or(0..0);
+
+ // If all of the items in the conflict have the same parent symbol,
+ // and that parent symbol is auxiliary, then this is just the intentional
+ // ambiguity associated with a repeat rule. Resolve that class of ambiguity
+ // by leaving it in the parse table, but marking the SHIFT action with
+ // an `is_repetition` flag.
+ let conflicting_variable_index =
+ conflicting_items.iter().next().unwrap().variable_index;
+ if self.syntax_grammar.variables[conflicting_variable_index as usize].is_auxiliary() {
+ if conflicting_items
+ .iter()
+ .all(|item| item.variable_index == conflicting_variable_index)
+ {
+ *is_repetition = true;
+ return Ok(());
+ }
+ }
+
+ // If the SHIFT action has higher precedence, remove all the REDUCE actions.
+ if shift_precedence.start > reduce_precedence
+ || (shift_precedence.start == reduce_precedence
+ && shift_precedence.end > reduce_precedence)
+ {
+ entry.actions.drain(0..entry.actions.len() - 1);
+ }
+ // If the REDUCE actions have higher precedence, remove the SHIFT action.
+ else if shift_precedence.end < reduce_precedence
+ || (shift_precedence.end == reduce_precedence
+ && shift_precedence.start < reduce_precedence)
+ {
+ entry.actions.pop();
+ conflicting_items.retain(|item| item.is_done());
+ }
+ // If the SHIFT and REDUCE actions have the same precedence, consider
+ // the REDUCE actions' associativity.
+ else if shift_precedence == (reduce_precedence..reduce_precedence) {
+ considered_associativity = true;
+ let mut has_left = false;
+ let mut has_right = false;
+ let mut has_non = false;
+ for action in &entry.actions {
+ if let ParseAction::Reduce { associativity, .. } = action {
+ match associativity {
+ Some(Associativity::Left) => has_left = true,
+ Some(Associativity::Right) => has_right = true,
+ None => has_non = true,
+ }
+ }
+ }
+
+ // If all reduce actions are left associative, remove the SHIFT action.
+ // If all reduce actions are right associative, remove the REDUCE actions.
+ match (has_left, has_non, has_right) {
+ (true, false, false) => {
+ entry.actions.pop();
+ conflicting_items.retain(|item| item.is_done());
+ }
+ (false, false, true) => {
+ entry.actions.drain(0..entry.actions.len() - 1);
+ }
+ _ => {}
+ }
+ }
+ }
+
+ // If all of the actions but one have been eliminated, then there's no problem.
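+ // For instance (hypothetical grammar): with a rule like
+ // `prec.left(seq($.expr, '+', $.expr))`, the SHIFT/REDUCE conflict on
+ // '+' sees equal precedences and only left-associative REDUCE actions,
+ // so the SHIFT action was popped above and a single action remains here.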
+ let entry = self.parse_table.states[state_id] + .terminal_entries + .get_mut(&conflicting_lookahead) + .unwrap(); + if entry.actions.len() == 1 { + return Ok(()); + } + + // Determine the set of parent symbols involved in this conflict. + let mut actual_conflict = Vec::new(); + for item in &conflicting_items { + let symbol = Symbol::non_terminal(item.variable_index as usize); + if self.syntax_grammar.variables[symbol.index].is_auxiliary() { + actual_conflict.extend( + preceding_auxiliary_symbols + .iter() + .rev() + .find_map(|info| { + if info.auxiliary_symbol == symbol { + Some(&info.parent_symbols) + } else { + None + } + }) + .unwrap() + .iter(), + ); + } else { + actual_conflict.push(symbol); + } + } + actual_conflict.sort_unstable(); + actual_conflict.dedup(); + + // If this set of symbols has been whitelisted, then there's no error. + if self + .syntax_grammar + .expected_conflicts + .contains(&actual_conflict) + { + return Ok(()); + } + + let mut msg = "Unresolved conflict for symbol sequence:\n\n".to_string(); + for symbol in preceding_symbols { + write!(&mut msg, " {}", self.symbol_name(symbol)).unwrap(); + } + + write!( + &mut msg, + " • {} …\n\n", + self.symbol_name(&conflicting_lookahead) + ) + .unwrap(); + write!(&mut msg, "Possible interpretations:\n").unwrap(); + for (i, item) in conflicting_items.iter().enumerate() { + write!(&mut msg, "\n {}:", i).unwrap(); + + for preceding_symbol in preceding_symbols + .iter() + .take(preceding_symbols.len() - item.step_index as usize) + { + write!(&mut msg, " {}", self.symbol_name(preceding_symbol)).unwrap(); + } + + write!( + &mut msg, + " ({}", + &self.syntax_grammar.variables[item.variable_index as usize].name + ) + .unwrap(); + + for (j, step) in item.production.steps.iter().enumerate() { + if j as u32 == item.step_index { + write!(&mut msg, " •").unwrap(); + } + write!(&mut msg, " {}", self.symbol_name(&step.symbol)).unwrap(); + } + + write!(&mut msg, ")").unwrap(); + + if item.is_done() { + write!( + &mut msg, + " • {}", + self.symbol_name(&conflicting_lookahead) + ) + .unwrap(); + } + + let precedence = item.precedence(); + let associativity = item.associativity(); + if precedence != 0 || associativity.is_some() { + write!( + &mut msg, + "(precedence: {}, associativity: {:?})", + precedence, associativity + ) + .unwrap(); + } + } + + // TODO - generate suggested resolutions + + Err(Error::ConflictError(msg)) + } + + fn get_auxiliary_node_info( + &self, + item_set: &ParseItemSet, + symbol: Symbol, + ) -> AuxiliarySymbolInfo { + let parent_symbols = item_set + .entries + .keys() + .filter_map(|item| { + if item.symbol() == Some(symbol) { + None + } else { + None + } + }) + .collect(); + AuxiliarySymbolInfo { + auxiliary_symbol: symbol, + parent_symbols, + } + } + + fn populate_used_symbols(&mut self) { + let mut terminal_usages = vec![false; self.lexical_grammar.variables.len()]; + let mut non_terminal_usages = vec![false; self.syntax_grammar.variables.len()]; + let mut external_usages = vec![false; self.syntax_grammar.external_tokens.len()]; + for state in &self.parse_table.states { + for symbol in state.terminal_entries.keys() { + match symbol.kind { + SymbolType::Terminal => terminal_usages[symbol.index] = true, + SymbolType::External => external_usages[symbol.index] = true, + _ => {} + } + } + for symbol in state.nonterminal_entries.keys() { + non_terminal_usages[symbol.index] = true; + } + } + self.parse_table.symbols.push(Symbol::end()); + for (i, value) in terminal_usages.into_iter().enumerate() { + if value { + 
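+ // Terminal `i` is referenced by at least one parse state, so
+ // keep it in the symbol list.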
self.parse_table.symbols.push(Symbol::terminal(i)); + } + } + for (i, value) in non_terminal_usages.into_iter().enumerate() { + if value { + self.parse_table.symbols.push(Symbol::non_terminal(i)); + } + } + for (i, value) in external_usages.into_iter().enumerate() { + if value { + self.parse_table.symbols.push(Symbol::external(i)); + } + } + } + + fn get_alias_sequence_id(&mut self, item: &ParseItem) -> AliasSequenceId { + let mut alias_sequence: Vec> = item + .production + .steps + .iter() + .map(|s| s.alias.clone()) + .collect(); + while alias_sequence.last() == Some(&None) { + alias_sequence.pop(); + } + if let Some(index) = self + .parse_table + .alias_sequences + .iter() + .position(|seq| *seq == alias_sequence) + { + index + } else { + self.parse_table.alias_sequences.push(alias_sequence); + self.parse_table.alias_sequences.len() - 1 + } + } + + fn symbol_name(&self, symbol: &Symbol) -> String { + match symbol.kind { + SymbolType::End => "EOF".to_string(), + SymbolType::External => self.syntax_grammar.external_tokens[symbol.index] + .name + .clone(), + SymbolType::NonTerminal => self.syntax_grammar.variables[symbol.index].name.clone(), + SymbolType::Terminal => { + let variable = &self.lexical_grammar.variables[symbol.index]; + if variable.kind == VariableType::Named { + variable.name.clone() + } else { + format!("\"{}\"", &variable.name) + } + } + } + } +} + +pub(crate) fn build_parse_table( + syntax_grammar: &SyntaxGrammar, + lexical_grammar: &LexicalGrammar, + inlines: &InlinedProductionMap, +) -> Result { + ParseTableBuilder { + syntax_grammar, + lexical_grammar, + inlines, + item_set_builder: ParseItemSetBuilder::new(syntax_grammar, lexical_grammar, inlines), + state_ids_by_item_set: HashMap::new(), + item_sets_by_state_id: Vec::new(), + parse_state_queue: VecDeque::new(), + parse_table: ParseTable { + states: Vec::new(), + alias_sequences: Vec::new(), + symbols: Vec::new(), + }, + } + .build() +} diff --git a/src/build_tables/mod.rs b/src/build_tables/mod.rs index fc17ce7f..a5ac74fb 100644 --- a/src/build_tables/mod.rs +++ b/src/build_tables/mod.rs @@ -1,607 +1,17 @@ +use crate::error::Result; +use crate::grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar}; +use crate::rules::{AliasMap, Symbol}; +use crate::tables::{LexTable, ParseTable}; + +mod build_parse_table; mod item; mod item_set_builder; mod lex_table_builder; +mod shrink_parse_table; +mod token_conflict_map; -use self::item::{LookaheadSet, ParseItem, ParseItemSet}; -use self::item_set_builder::ParseItemSetBuilder; -use self::lex_table_builder::LexTableBuilder; -use crate::error::{Error, Result}; -use crate::grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar, VariableType}; -use crate::rules::Alias; -use crate::rules::{AliasMap, Associativity, Symbol, SymbolType}; -use crate::tables::{ - AliasSequenceId, LexTable, ParseAction, ParseState, ParseStateId, ParseTable, ParseTableEntry, -}; -use core::ops::Range; -use std::collections::hash_map::Entry; -use std::collections::{HashMap, HashSet, VecDeque}; -use std::fmt::Write; - -#[derive(Clone)] -struct AuxiliarySymbolInfo { - auxiliary_symbol: Symbol, - parent_symbols: Vec, -} - -type SymbolSequence = Vec; -type AuxiliarySymbolSequence = Vec; - -struct ParseStateQueueEntry { - preceding_symbols: SymbolSequence, - preceding_auxiliary_symbols: AuxiliarySymbolSequence, - state_id: ParseStateId, -} - -struct ParseTableBuilder<'a> { - item_set_builder: ParseItemSetBuilder<'a>, - syntax_grammar: &'a SyntaxGrammar, - lexical_grammar: &'a LexicalGrammar, - 
inlines: &'a InlinedProductionMap, - simple_aliases: &'a AliasMap, - state_ids_by_item_set: HashMap, ParseStateId>, - item_sets_by_state_id: Vec>, - parse_state_queue: VecDeque, - parse_table: ParseTable, -} - -impl<'a> ParseTableBuilder<'a> { - fn build(mut self) -> Result<(ParseTable, LexTable, LexTable, Option)> { - // Ensure that the empty alias sequence has index 0. - self.parse_table.alias_sequences.push(Vec::new()); - - // Ensure that the error state has index 0. - let error_state_id = - self.add_parse_state(&Vec::new(), &Vec::new(), ParseItemSet::default()); - - self.add_parse_state( - &Vec::new(), - &Vec::new(), - ParseItemSet::with( - [(ParseItem::start(), LookaheadSet::with(&[Symbol::end()]))] - .iter() - .cloned(), - ), - ); - - self.process_part_state_queue()?; - - let lex_table_builder = LexTableBuilder::new(self.syntax_grammar, self.lexical_grammar); - - self.populate_used_symbols(); - - let (main_lex_table, keyword_lex_table, keyword_capture_token) = lex_table_builder.build(); - Ok(( - self.parse_table, - main_lex_table, - keyword_lex_table, - keyword_capture_token, - )) - } - - fn add_parse_state( - &mut self, - preceding_symbols: &SymbolSequence, - preceding_auxiliary_symbols: &AuxiliarySymbolSequence, - item_set: ParseItemSet<'a>, - ) -> ParseStateId { - match self.state_ids_by_item_set.entry(item_set) { - Entry::Occupied(o) => { - // eprintln!("Item set already processed at state {}", *o.get()); - *o.get() - } - Entry::Vacant(v) => { - // eprintln!("Item set not yet processed"); - let state_id = self.parse_table.states.len(); - self.item_sets_by_state_id.push(v.key().clone()); - self.parse_table.states.push(ParseState { - lex_state_id: 0, - terminal_entries: HashMap::new(), - nonterminal_entries: HashMap::new(), - }); - self.parse_state_queue.push_back(ParseStateQueueEntry { - state_id, - preceding_symbols: preceding_symbols.clone(), - preceding_auxiliary_symbols: preceding_auxiliary_symbols.clone(), - }); - v.insert(state_id); - state_id - } - } - } - - fn process_part_state_queue(&mut self) -> Result<()> { - while let Some(entry) = self.parse_state_queue.pop_front() { - let debug = false; - - if debug { - println!( - "ITEM SET {}:\n{}", - entry.state_id, - self.item_sets_by_state_id[entry.state_id] - .display_with(&self.syntax_grammar, &self.lexical_grammar,) - ); - } - - let item_set = self.item_set_builder.transitive_closure( - &self.item_sets_by_state_id[entry.state_id], - self.syntax_grammar, - self.inlines, - ); - - if debug { - println!( - "TRANSITIVE CLOSURE:\n{}", - item_set.display_with(&self.syntax_grammar, &self.lexical_grammar) - ); - } - - self.add_actions( - entry.preceding_symbols, - entry.preceding_auxiliary_symbols, - item_set, - entry.state_id, - )?; - } - Ok(()) - } - - fn add_actions( - &mut self, - mut preceding_symbols: SymbolSequence, - mut preceding_auxiliary_symbols: Vec, - item_set: ParseItemSet<'a>, - state_id: ParseStateId, - ) -> Result<()> { - let mut terminal_successors = HashMap::new(); - let mut non_terminal_successors = HashMap::new(); - let mut lookaheads_with_conflicts = HashSet::new(); - - for (item, lookaheads) in &item_set.entries { - if let Some(next_symbol) = item.symbol() { - let successor = item.successor(); - if next_symbol.is_non_terminal() { - // Keep track of where auxiliary non-terminals (repeat symbols) are - // used within visible symbols. This information may be needed later - // for conflict resolution. 
- if self.syntax_grammar.variables[next_symbol.index].is_auxiliary() { - preceding_auxiliary_symbols - .push(self.get_auxiliary_node_info(&item_set, next_symbol)); - } - - non_terminal_successors - .entry(next_symbol) - .or_insert_with(|| ParseItemSet::default()) - .entries - .entry(successor) - .or_insert_with(|| LookaheadSet::new()) - .insert_all(lookaheads); - } else { - terminal_successors - .entry(next_symbol) - .or_insert_with(|| ParseItemSet::default()) - .entries - .entry(successor) - .or_insert_with(|| LookaheadSet::new()) - .insert_all(lookaheads); - } - } else { - let action = if item.is_augmented() { - ParseAction::Accept - } else { - ParseAction::Reduce { - symbol: Symbol::non_terminal(item.variable_index as usize), - child_count: item.step_index as usize, - precedence: item.precedence(), - associativity: item.associativity(), - dynamic_precedence: item.production.dynamic_precedence, - alias_sequence_id: self.get_alias_sequence_id(item), - } - }; - - for lookahead in lookaheads.iter() { - let entry = self.parse_table.states[state_id] - .terminal_entries - .entry(lookahead); - let entry = entry.or_insert_with(|| ParseTableEntry::new()); - if entry.actions.is_empty() { - entry.actions.push(action); - } else if action.precedence() > entry.actions[0].precedence() { - entry.actions.clear(); - entry.actions.push(action); - lookaheads_with_conflicts.remove(&lookahead); - } else if action.precedence() == entry.actions[0].precedence() { - entry.actions.push(action); - lookaheads_with_conflicts.insert(lookahead); - } - } - } - } - - for (symbol, next_item_set) in terminal_successors { - preceding_symbols.push(symbol); - let next_state_id = self.add_parse_state( - &preceding_symbols, - &preceding_auxiliary_symbols, - next_item_set, - ); - preceding_symbols.pop(); - - let entry = self.parse_table.states[state_id] - .terminal_entries - .entry(symbol); - if let Entry::Occupied(e) = &entry { - if !e.get().actions.is_empty() { - lookaheads_with_conflicts.insert(symbol); - } - } - - entry - .or_insert_with(|| ParseTableEntry::new()) - .actions - .push(ParseAction::Shift { - state: next_state_id, - is_repetition: false, - }); - } - - for (symbol, next_item_set) in non_terminal_successors { - preceding_symbols.push(symbol); - let next_state_id = self.add_parse_state( - &preceding_symbols, - &preceding_auxiliary_symbols, - next_item_set, - ); - preceding_symbols.pop(); - self.parse_table.states[state_id] - .nonterminal_entries - .insert(symbol, next_state_id); - } - - for symbol in lookaheads_with_conflicts { - self.handle_conflict( - &item_set, - state_id, - &preceding_symbols, - &preceding_auxiliary_symbols, - symbol, - )?; - } - - let state = &mut self.parse_table.states[state_id]; - for extra_token in &self.syntax_grammar.extra_tokens { - state - .terminal_entries - .entry(*extra_token) - .or_insert(ParseTableEntry { - reusable: true, - actions: vec![ParseAction::ShiftExtra], - }); - } - - Ok(()) - } - - fn handle_conflict( - &mut self, - item_set: &ParseItemSet, - state_id: ParseStateId, - preceding_symbols: &SymbolSequence, - preceding_auxiliary_symbols: &Vec, - conflicting_lookahead: Symbol, - ) -> Result<()> { - let entry = self.parse_table.states[state_id] - .terminal_entries - .get_mut(&conflicting_lookahead) - .unwrap(); - - // Determine which items in the set conflict with each other, and the - // precedences associated with SHIFT vs REDUCE actions. There won't - // be multiple REDUCE actions with different precedences; that is - // sorted out ahead of time in `add_actions`. 
But there can still be - // REDUCE-REDUCE conflicts where all actions have the *same* - // precedence, and there can still be SHIFT/REDUCE conflicts. - let reduce_precedence = entry.actions[0].precedence(); - let mut considered_associativity = false; - let mut shift_precedence: Option> = None; - let mut conflicting_items = HashSet::new(); - for (item, lookaheads) in &item_set.entries { - if let Some(step) = item.step() { - if item.step_index > 0 { - if self - .item_set_builder - .first_set(&step.symbol) - .contains(&conflicting_lookahead) - { - conflicting_items.insert(item); - let precedence = item.precedence(); - if let Some(range) = &mut shift_precedence { - if precedence < range.start { - range.start = precedence; - } else if precedence > range.end { - range.end = precedence; - } - } else { - shift_precedence = Some(precedence..precedence); - } - } - } - } else if lookaheads.contains(&conflicting_lookahead) { - conflicting_items.insert(item); - } - } - - if let ParseAction::Shift { is_repetition, .. } = entry.actions.last_mut().unwrap() { - let shift_precedence = shift_precedence.unwrap_or(0..0); - - // If all of the items in the conflict have the same parent symbol, - // and that parent symbols is auxiliary, then this is just the intentional - // ambiguity associated with a repeat rule. Resolve that class of ambiguity - // by leaving it in the parse table, but marking the SHIFT action with - // an `is_repetition` flag. - let conflicting_variable_index = - conflicting_items.iter().next().unwrap().variable_index; - if self.syntax_grammar.variables[conflicting_variable_index as usize].is_auxiliary() { - if conflicting_items - .iter() - .all(|item| item.variable_index == conflicting_variable_index) - { - *is_repetition = true; - return Ok(()); - } - } - - // If the SHIFT action has higher precedence, remove all the REDUCE actions. - if shift_precedence.start > reduce_precedence - || (shift_precedence.start == reduce_precedence - && shift_precedence.end > reduce_precedence) - { - entry.actions.drain(0..entry.actions.len() - 1); - } - // If the REDUCE actions have higher precedence, remove the SHIFT action. - else if shift_precedence.end < reduce_precedence - || (shift_precedence.end == reduce_precedence - && shift_precedence.start < reduce_precedence) - { - entry.actions.pop(); - conflicting_items.retain(|item| item.is_done()); - } - // If the SHIFT and REDUCE actions have the same predence, consider - // the REDUCE actions' associativity. - else if shift_precedence == (reduce_precedence..reduce_precedence) { - considered_associativity = true; - let mut has_left = false; - let mut has_right = false; - let mut has_non = false; - for action in &entry.actions { - if let ParseAction::Reduce { associativity, .. } = action { - match associativity { - Some(Associativity::Left) => has_left = true, - Some(Associativity::Right) => has_right = true, - None => has_non = true, - } - } - } - - // If all reduce actions are left associative, remove the SHIFT action. - // If all reduce actions are right associative, remove the REDUCE actions. - match (has_left, has_non, has_right) { - (true, false, false) => { - entry.actions.pop(); - conflicting_items.retain(|item| item.is_done()); - } - (false, false, true) => { - entry.actions.drain(0..entry.actions.len() - 1); - } - _ => {} - } - } - } - - // If all of the actions but one have been eliminated, then there's no problem. 
- let entry = self.parse_table.states[state_id] - .terminal_entries - .get_mut(&conflicting_lookahead) - .unwrap(); - if entry.actions.len() == 1 { - return Ok(()); - } - - // Determine the set of parent symbols involved in this conflict. - let mut actual_conflict = Vec::new(); - for item in &conflicting_items { - let symbol = Symbol::non_terminal(item.variable_index as usize); - if self.syntax_grammar.variables[symbol.index].is_auxiliary() { - actual_conflict.extend( - preceding_auxiliary_symbols - .iter() - .rev() - .find_map(|info| { - if info.auxiliary_symbol == symbol { - Some(&info.parent_symbols) - } else { - None - } - }) - .unwrap() - .iter(), - ); - } else { - actual_conflict.push(symbol); - } - } - actual_conflict.sort_unstable(); - actual_conflict.dedup(); - - // If this set of symbols has been whitelisted, then there's no error. - if self - .syntax_grammar - .expected_conflicts - .contains(&actual_conflict) - { - return Ok(()); - } - - let mut msg = "Unresolved conflict for symbol sequence:\n\n".to_string(); - for symbol in preceding_symbols { - write!(&mut msg, " {}", self.symbol_name(symbol)).unwrap(); - } - - write!( - &mut msg, - " • {} …\n\n", - self.symbol_name(&conflicting_lookahead) - ) - .unwrap(); - write!(&mut msg, "Possible interpretations:\n").unwrap(); - for (i, item) in conflicting_items.iter().enumerate() { - write!(&mut msg, "\n {}:", i).unwrap(); - - for preceding_symbol in preceding_symbols - .iter() - .take(preceding_symbols.len() - item.step_index as usize) - { - write!(&mut msg, " {}", self.symbol_name(preceding_symbol)).unwrap(); - } - - write!( - &mut msg, - " ({}", - &self.syntax_grammar.variables[item.variable_index as usize].name - ) - .unwrap(); - - for (j, step) in item.production.steps.iter().enumerate() { - if j as u32 == item.step_index { - write!(&mut msg, " •").unwrap(); - } - write!(&mut msg, " {}", self.symbol_name(&step.symbol)).unwrap(); - } - - write!(&mut msg, ")").unwrap(); - - if item.is_done() { - write!( - &mut msg, - " • {}", - self.symbol_name(&conflicting_lookahead) - ) - .unwrap(); - } - - let precedence = item.precedence(); - let associativity = item.associativity(); - if precedence != 0 || associativity.is_some() { - write!( - &mut msg, - "(precedence: {}, associativity: {:?})", - precedence, associativity - ) - .unwrap(); - } - } - - // TODO - generate suggested resolutions - - Err(Error::ConflictError(msg)) - } - - fn get_auxiliary_node_info( - &self, - item_set: &ParseItemSet, - symbol: Symbol, - ) -> AuxiliarySymbolInfo { - let parent_symbols = item_set - .entries - .keys() - .filter_map(|item| { - if item.symbol() == Some(symbol) { - None - } else { - None - } - }) - .collect(); - AuxiliarySymbolInfo { - auxiliary_symbol: symbol, - parent_symbols, - } - } - - fn populate_used_symbols(&mut self) { - let mut terminal_usages = vec![false; self.lexical_grammar.variables.len()]; - let mut non_terminal_usages = vec![false; self.syntax_grammar.variables.len()]; - let mut external_usages = vec![false; self.syntax_grammar.external_tokens.len()]; - for state in &self.parse_table.states { - for symbol in state.terminal_entries.keys() { - match symbol.kind { - SymbolType::Terminal => terminal_usages[symbol.index] = true, - SymbolType::External => external_usages[symbol.index] = true, - _ => {} - } - } - for symbol in state.nonterminal_entries.keys() { - non_terminal_usages[symbol.index] = true; - } - } - self.parse_table.symbols.push(Symbol::end()); - for (i, value) in terminal_usages.into_iter().enumerate() { - if value { - 
self.parse_table.symbols.push(Symbol::terminal(i)); - } - } - for (i, value) in non_terminal_usages.into_iter().enumerate() { - if value { - self.parse_table.symbols.push(Symbol::non_terminal(i)); - } - } - for (i, value) in external_usages.into_iter().enumerate() { - if value { - self.parse_table.symbols.push(Symbol::external(i)); - } - } - } - - fn get_alias_sequence_id(&mut self, item: &ParseItem) -> AliasSequenceId { - let mut alias_sequence: Vec> = item - .production - .steps - .iter() - .map(|s| s.alias.clone()) - .collect(); - while alias_sequence.last() == Some(&None) { - alias_sequence.pop(); - } - if let Some(index) = self - .parse_table - .alias_sequences - .iter() - .position(|seq| *seq == alias_sequence) - { - index - } else { - self.parse_table.alias_sequences.push(alias_sequence); - self.parse_table.alias_sequences.len() - 1 - } - } - - fn symbol_name(&self, symbol: &Symbol) -> String { - match symbol.kind { - SymbolType::End => "EOF".to_string(), - SymbolType::External => self.syntax_grammar.external_tokens[symbol.index] - .name - .clone(), - SymbolType::NonTerminal => self.syntax_grammar.variables[symbol.index].name.clone(), - SymbolType::Terminal => { - let variable = &self.lexical_grammar.variables[symbol.index]; - if variable.kind == VariableType::Named { - variable.name.clone() - } else { - format!("\"{}\"", &variable.name) - } - } - } - } -} +use self::build_parse_table::build_parse_table; +use self::shrink_parse_table::shrink_parse_table; pub(crate) fn build_tables( syntax_grammar: &SyntaxGrammar, @@ -609,20 +19,8 @@ pub(crate) fn build_tables( simple_aliases: &AliasMap, inlines: &InlinedProductionMap, ) -> Result<(ParseTable, LexTable, LexTable, Option)> { - ParseTableBuilder { - syntax_grammar, - lexical_grammar, - simple_aliases, - inlines, - item_set_builder: ParseItemSetBuilder::new(syntax_grammar, lexical_grammar, inlines), - state_ids_by_item_set: HashMap::new(), - item_sets_by_state_id: Vec::new(), - parse_state_queue: VecDeque::new(), - parse_table: ParseTable { - states: Vec::new(), - alias_sequences: Vec::new(), - symbols: Vec::new(), - }, - } - .build() + + let mut parse_table = build_parse_table(syntax_grammar, lexical_grammar, inlines)?; + shrink_parse_table(&mut parse_table, syntax_grammar, simple_aliases); + Ok((parse_table, LexTable::default(), LexTable::default(), None)) } diff --git a/src/build_tables/shrink_parse_table.rs b/src/build_tables/shrink_parse_table.rs new file mode 100644 index 00000000..8e826f5c --- /dev/null +++ b/src/build_tables/shrink_parse_table.rs @@ -0,0 +1,117 @@ +use crate::grammars::{SyntaxGrammar, VariableType}; +use crate::rules::AliasMap; +use crate::tables::{ParseAction, ParseTable}; +use std::collections::{HashMap, HashSet}; + +pub(crate) fn shrink_parse_table( + parse_table: &mut ParseTable, + syntax_grammar: &SyntaxGrammar, + simple_aliases: &AliasMap, +) { + remove_unit_reductions(parse_table, syntax_grammar, simple_aliases); + remove_unused_states(parse_table); +} + +fn remove_unit_reductions( + parse_table: &mut ParseTable, + syntax_grammar: &SyntaxGrammar, + simple_aliases: &AliasMap, +) { + let mut aliased_symbols = HashSet::new(); + for variable in &syntax_grammar.variables { + for production in &variable.productions { + for step in &production.steps { + if step.alias.is_some() { + aliased_symbols.insert(step.symbol); + } + } + } + } + + let mut unit_reduction_symbols_by_state = HashMap::new(); + for (i, state) in parse_table.states.iter().enumerate() { + let mut only_unit_reductions = true; + let mut 
unit_reduction_symbol = None; + for (_, entry) in &state.terminal_entries { + for action in &entry.actions { + match action { + ParseAction::ShiftExtra => continue, + ParseAction::Reduce { + child_count: 1, + alias_sequence_id: 0, + symbol, + .. + } => { + if !simple_aliases.contains_key(&symbol) + && !aliased_symbols.contains(&symbol) + && syntax_grammar.variables[symbol.index].kind != VariableType::Named + && (unit_reduction_symbol.is_none() + || unit_reduction_symbol == Some(symbol)) + { + unit_reduction_symbol = Some(symbol); + continue; + } + } + _ => {} + } + only_unit_reductions = false; + break; + } + + if !only_unit_reductions { + break; + } + } + + if let Some(symbol) = unit_reduction_symbol { + if only_unit_reductions { + unit_reduction_symbols_by_state.insert(i, *symbol); + } + } + } + + for state in parse_table.states.iter_mut() { + let mut done = false; + while !done { + done = true; + state.update_referenced_states(|other_state_id, state| { + if let Some(symbol) = unit_reduction_symbols_by_state.get(&other_state_id) { + done = false; + state.nonterminal_entries[symbol] + } else { + other_state_id + } + }) + } + } +} + +fn remove_unused_states(parse_table: &mut ParseTable) { + let mut state_usage_map = vec![false; parse_table.states.len()]; + for state in &parse_table.states { + for referenced_state in state.referenced_states() { + state_usage_map[referenced_state] = true; + } + } + let mut removed_predecessor_count = 0; + let mut state_replacement_map = vec![0; parse_table.states.len()]; + for state_id in 0..parse_table.states.len() { + state_replacement_map[state_id] = state_id - removed_predecessor_count; + if !state_usage_map[state_id] { + removed_predecessor_count += 1; + } + } + let mut state_id = 0; + let mut original_state_id = 0; + while state_id < parse_table.states.len() { + if state_usage_map[original_state_id] { + parse_table.states[state_id].update_referenced_states(|other_state_id, _| { + state_replacement_map[other_state_id] + }); + state_id += 1; + } else { + parse_table.states.remove(state_id); + } + original_state_id += 1; + } +} diff --git a/src/build_tables/token_conflict_map.rs b/src/build_tables/token_conflict_map.rs new file mode 100644 index 00000000..46a00986 --- /dev/null +++ b/src/build_tables/token_conflict_map.rs @@ -0,0 +1,77 @@ +use crate::grammars::{LexicalGrammar, LexicalVariable}; +use crate::nfa::{CharacterSet, NfaCursor}; +use std::collections::HashSet; + +#[derive(Default)] +struct TokenConflictStatus { + matches_same_string: bool, + matches_longer_string_with_valid_next_char: bool, +} + +pub(crate) struct TokenConflictMap { + starting_chars_by_index: Vec, + status_matrix: Vec, +} + +impl TokenConflictMap { + pub fn new(grammar: &LexicalGrammar) -> Self { + let mut cursor = NfaCursor::new(&grammar.nfa, Vec::new()); + + let mut starting_chars_by_index = Vec::with_capacity(grammar.variables.len()); + for variable in &grammar.variables { + cursor.reset(vec![variable.start_state]); + let mut all_chars = CharacterSet::empty(); + for (chars, _, _) in cursor.successors() { + all_chars = all_chars.add(chars); + } + starting_chars_by_index.push(all_chars); + } + + let status_matrix = + Vec::with_capacity(grammar.variables.len() * grammar.variables.len()); + + TokenConflictMap { + starting_chars_by_index, + status_matrix, + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::grammars::{Variable, VariableType}; + use crate::prepare_grammar::{expand_tokens, ExtractedLexicalGrammar}; + use crate::rules::Rule; + + #[test] + fn 
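The renumbering scheme in `remove_unused_states` above is worth spelling out; a hedged, self-contained sketch (illustrative names, not part of the patch): each surviving state's new id is its old id minus the number of removed states that precede it.

    fn state_replacement_map(state_usage_map: &[bool]) -> Vec<usize> {
        let mut removed_predecessor_count = 0;
        state_usage_map
            .iter()
            .enumerate()
            .map(|(state_id, &used)| {
                let new_id = state_id - removed_predecessor_count;
                if !used {
                    removed_predecessor_count += 1;
                }
                new_id
            })
            .collect()
    }

For `[true, false, true]` this yields `[0, 1, 1]`: state 2 slides into the slot vacated by the unused state 1, and the entry for state 1 itself is never consulted because nothing references a removed state.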
test_starting_characters() { + let grammar = expand_tokens(ExtractedLexicalGrammar { + separators: Vec::new(), + variables: vec![ + Variable { + name: "token_0".to_string(), + kind: VariableType::Named, + rule: Rule::pattern("[a-f]1|0x\\d"), + }, + Variable { + name: "token_1".to_string(), + kind: VariableType::Named, + rule: Rule::pattern("d*ef"), + }, + ], + }) + .unwrap(); + + let token_map = TokenConflictMap::new(&grammar); + + assert_eq!( + token_map.starting_chars_by_index[0], + CharacterSet::empty().add_range('a', 'f').add_char('0') + ); + assert_eq!( + token_map.starting_chars_by_index[1], + CharacterSet::empty().add_range('d', 'e') + ); + } +} diff --git a/src/tables.rs b/src/tables.rs index 01cecb49..0815aac8 100644 --- a/src/tables.rs +++ b/src/tables.rs @@ -1,7 +1,7 @@ +use crate::nfa::CharacterSet; +use crate::rules::{Alias, Associativity, Symbol}; use std::collections::HashMap; use std::ops::Range; -use crate::rules::{Associativity, Symbol, Alias}; -use crate::nfa::CharacterSet; pub(crate) type AliasSequenceId = usize; pub(crate) type ParseStateId = usize; @@ -23,7 +23,7 @@ pub(crate) enum ParseAction { dynamic_precedence: i32, associativity: Option, alias_sequence_id: AliasSequenceId, - } + }, } #[derive(Clone, Debug, PartialEq, Eq)] @@ -86,6 +86,56 @@ impl Default for LexTable { } } +impl ParseState { + pub fn referenced_states<'a>(&'a self) -> impl Iterator + 'a { + self.terminal_entries + .iter() + .flat_map(|(_, entry)| { + entry.actions.iter().filter_map(|action| match action { + ParseAction::Shift { state, .. } => Some(*state), + _ => None, + }) + }) + .chain(self.nonterminal_entries.iter().map(|(_, state)| *state)) + } + + pub fn update_referenced_states(&mut self, mut f: F) + where + F: FnMut(usize, &ParseState) -> usize, + { + let mut updates = Vec::new(); + for (symbol, entry) in &self.terminal_entries { + for (i, action) in entry.actions.iter().enumerate() { + if let ParseAction::Shift { state, .. } = action { + let result = f(*state, self); + if result != *state { + updates.push((*symbol, i, result)); + } + } + } + } + for (symbol, other_state) in &self.nonterminal_entries { + let result = f(*other_state, self); + if result != *other_state { + updates.push((*symbol, 0, result)); + } + } + for (symbol, action_index, new_state) in updates { + if symbol.is_non_terminal() { + self.nonterminal_entries.insert(symbol, new_state); + } else { + let entry = self.terminal_entries.get_mut(&symbol).unwrap(); + if let ParseAction::Shift { is_repetition, .. } = entry.actions[action_index] { + entry.actions[action_index] = ParseAction::Shift { + state: new_state, + is_repetition, + }; + } + } + } + } +} + impl ParseAction { pub fn precedence(&self) -> i32 { if let ParseAction::Reduce { precedence, .. 
} = self { From c6b9e97c5820bd2f24c42e58fd2e82944354a6b6 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Sun, 30 Dec 2018 19:31:17 -0800 Subject: [PATCH 079/102] Implement token conflict map --- src/build_tables/build_parse_table.rs | 20 +- src/build_tables/item_set_builder.rs | 4 + src/build_tables/mod.rs | 6 +- src/build_tables/token_conflict_map.rs | 315 +++++++++++++++++- src/grammars.rs | 7 + src/nfa.rs | 156 ++++++--- src/prepare_grammar/expand_tokens.rs | 40 ++- src/prepare_grammar/extract_simple_aliases.rs | 3 + 8 files changed, 471 insertions(+), 80 deletions(-) diff --git a/src/build_tables/build_parse_table.rs b/src/build_tables/build_parse_table.rs index 5087c55c..a7911689 100644 --- a/src/build_tables/build_parse_table.rs +++ b/src/build_tables/build_parse_table.rs @@ -2,7 +2,7 @@ use super::item::{LookaheadSet, ParseItem, ParseItemSet}; use super::item_set_builder::ParseItemSetBuilder; use crate::error::{Error, Result}; use crate::grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar, VariableType}; -use crate::rules::{Alias, AliasMap, Associativity, Symbol, SymbolType}; +use crate::rules::{Alias, Associativity, Symbol, SymbolType}; use crate::tables::{ AliasSequenceId, ParseAction, ParseState, ParseStateId, ParseTable, ParseTableEntry, }; @@ -35,10 +35,11 @@ struct ParseTableBuilder<'a> { item_sets_by_state_id: Vec<ParseItemSet<'a>>, parse_state_queue: VecDeque<ParseStateQueueEntry>, parse_table: ParseTable, + following_tokens: Vec<LookaheadSet>, } impl<'a> ParseTableBuilder<'a> { - fn build(mut self) -> Result { + fn build(mut self) -> Result<(ParseTable, Vec<LookaheadSet>)> { // Ensure that the empty alias sequence has index 0. self.parse_table.alias_sequences.push(Vec::new()); @@ -58,7 +59,7 @@ impl<'a> ParseTableBuilder<'a> { self.process_parse_state_queue()?; self.populate_used_symbols(); - Ok(self.parse_table) + Ok((self.parse_table, self.following_tokens)) } fn add_parse_state( @@ -67,6 +68,16 @@ impl<'a> ParseTableBuilder<'a> { preceding_auxiliary_symbols: &AuxiliarySymbolSequence, item_set: ParseItemSet<'a>, ) -> ParseStateId { + if preceding_symbols.len() > 1 { + let left_tokens = self.item_set_builder.last_set(&preceding_symbols[preceding_symbols.len() - 2]); + let right_tokens = self.item_set_builder.first_set(&preceding_symbols[preceding_symbols.len() - 1]); + for left_token in left_tokens.iter() { + if left_token.is_terminal() { + self.following_tokens[left_token.index].insert_all(right_tokens); + } + } + } + match self.state_ids_by_item_set.entry(item_set) { Entry::Occupied(o) => *o.get(), Entry::Vacant(v) => { @@ -586,7 +597,7 @@ pub(crate) fn build_parse_table( syntax_grammar: &SyntaxGrammar, lexical_grammar: &LexicalGrammar, inlines: &InlinedProductionMap, -) -> Result { +) -> Result<(ParseTable, Vec<LookaheadSet>)> { ParseTableBuilder { syntax_grammar, lexical_grammar, @@ -600,6 +611,7 @@ pub(crate) fn build_parse_table( alias_sequences: Vec::new(), symbols: Vec::new(), }, + following_tokens: vec![LookaheadSet::new(); lexical_grammar.variables.len()], } .build() } diff --git a/src/build_tables/item_set_builder.rs b/src/build_tables/item_set_builder.rs index d7883988..8649cb52 100644 --- a/src/build_tables/item_set_builder.rs +++ b/src/build_tables/item_set_builder.rs @@ -269,6 +269,10 @@ impl<'a> ParseItemSetBuilder<'a> { &self.first_sets[symbol] } + pub fn last_set(&self, symbol: &Symbol) -> &LookaheadSet { + &self.last_sets[symbol] + } + fn add_item( &self, set: &mut ParseItemSet<'a>, diff --git a/src/build_tables/mod.rs b/src/build_tables/mod.rs index a5ac74fb..d1983068 100644 --- a/src/build_tables/mod.rs +++ 
b/src/build_tables/mod.rs @@ -12,6 +12,7 @@ mod token_conflict_map; use self::build_parse_table::build_parse_table; use self::shrink_parse_table::shrink_parse_table; +use self::token_conflict_map::TokenConflictMap; pub(crate) fn build_tables( syntax_grammar: &SyntaxGrammar, @@ -19,8 +20,9 @@ pub(crate) fn build_tables( simple_aliases: &AliasMap, inlines: &InlinedProductionMap, ) -> Result<(ParseTable, LexTable, LexTable, Option<Symbol>)> { - - let mut parse_table = build_parse_table(syntax_grammar, lexical_grammar, inlines)?; + let (mut parse_table, following_tokens) = + build_parse_table(syntax_grammar, lexical_grammar, inlines)?; + let token_conflict_map = TokenConflictMap::new(lexical_grammar, following_tokens); shrink_parse_table(&mut parse_table, syntax_grammar, simple_aliases); Ok((parse_table, LexTable::default(), LexTable::default(), None)) } diff --git a/src/build_tables/token_conflict_map.rs b/src/build_tables/token_conflict_map.rs index 46a00986..52c68cc7 100644 --- a/src/build_tables/token_conflict_map.rs +++ b/src/build_tables/token_conflict_map.rs @@ -1,40 +1,262 @@ -use crate::grammars::{LexicalGrammar, LexicalVariable}; +use crate::build_tables::item::LookaheadSet; +use crate::grammars::LexicalGrammar; use crate::nfa::{CharacterSet, NfaCursor}; use std::collections::HashSet; +use std::fmt; -#[derive(Default)] +#[derive(Clone, Debug, Default)] struct TokenConflictStatus { + does_overlap: bool, + does_match_valid_continuation: bool, matches_same_string: bool, - matches_longer_string_with_valid_next_char: bool, } pub(crate) struct TokenConflictMap { - starting_chars_by_index: Vec<CharacterSet>, + n: usize, status_matrix: Vec<TokenConflictStatus>, + starting_chars_by_index: Vec<CharacterSet>, + following_chars_by_index: Vec<CharacterSet>, } impl TokenConflictMap { - pub fn new(grammar: &LexicalGrammar) -> Self { + pub fn new(grammar: &LexicalGrammar, following_tokens: Vec<LookaheadSet>) -> Self { let mut cursor = NfaCursor::new(&grammar.nfa, Vec::new()); + let starting_chars = get_starting_chars(&mut cursor, grammar); + let following_chars = get_following_chars(&starting_chars, following_tokens); - let mut starting_chars_by_index = Vec::with_capacity(grammar.variables.len()); - for variable in &grammar.variables { - cursor.reset(vec![variable.start_state]); - let mut all_chars = CharacterSet::empty(); - for (chars, _, _) in cursor.successors() { - all_chars = all_chars.add(chars); + let n = grammar.variables.len(); + let mut status_matrix = vec![TokenConflictStatus::default(); n * n]; + for i in 0..grammar.variables.len() { + for j in 0..i { + let status = compute_conflict_status(&mut cursor, grammar, &following_chars, i, j); + status_matrix[matrix_index(n, i, j)] = status.0; + status_matrix[matrix_index(n, j, i)] = status.1; } - starting_chars_by_index.push(all_chars); } - let status_matrix = - Vec::with_capacity(grammar.variables.len() * grammar.variables.len()); - TokenConflictMap { - starting_chars_by_index, + n, status_matrix, + starting_chars_by_index: starting_chars, + following_chars_by_index: following_chars, } } + + pub fn does_match_same_string(&self, i: usize, j: usize) -> bool { + self.status_matrix[matrix_index(self.n, i, j)].matches_same_string + } + + pub fn does_match_valid_continuation(&self, i: usize, j: usize) -> bool { + self.status_matrix[matrix_index(self.n, i, j)].does_match_valid_continuation + } + + pub fn does_overlap(&self, i: usize, j: usize) -> bool { + self.status_matrix[matrix_index(self.n, i, j)].does_overlap + } +} + +impl fmt::Debug for TokenConflictMap { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, 
"TokenConflictMap {{\n")?; + + write!(f, " starting_characters: {{\n")?; + for i in 0..self.n { + write!(f, " {}: {:?},\n", i, self.starting_chars_by_index[i])?; + } + write!(f, " }},\n")?; + + write!(f, " following_characters: {{\n")?; + for i in 0..self.n { + write!(f, " {}: {:?},\n", i, self.following_chars_by_index[i])?; + } + write!(f, " }},\n")?; + + write!(f, " status_matrix: {{\n")?; + for i in 0..self.n { + write!(f, " {}: {{\n", i)?; + for j in 0..self.n { + write!( + f, + " {}: {:?},\n", + j, + self.status_matrix[matrix_index(self.n, i, j)] + )?; + } + write!(f, " }},\n")?; + } + write!(f, " }},")?; + write!(f, "}}")?; + Ok(()) + } +} + +fn matrix_index(variable_count: usize, i: usize, j: usize) -> usize { + variable_count * i + j +} + +fn get_starting_chars(cursor: &mut NfaCursor, grammar: &LexicalGrammar) -> Vec { + let mut result = Vec::with_capacity(grammar.variables.len()); + for variable in &grammar.variables { + cursor.reset(vec![variable.start_state]); + let mut all_chars = CharacterSet::empty(); + for (chars, _, _) in cursor.successors() { + all_chars = all_chars.add(chars); + } + result.push(all_chars); + } + result +} + +fn get_following_chars( + starting_chars: &Vec, + following_tokens: Vec, +) -> Vec { + following_tokens + .into_iter() + .map(|following_tokens| { + let mut chars = CharacterSet::empty(); + for token in following_tokens.iter() { + if token.is_terminal() { + chars = chars.add(&starting_chars[token.index]); + } + } + chars + }) + .collect() +} + +fn compute_conflict_status( + cursor: &mut NfaCursor, + grammar: &LexicalGrammar, + following_chars: &Vec, + i: usize, + j: usize, +) -> (TokenConflictStatus, TokenConflictStatus) { + let mut visited_state_sets = HashSet::new(); + let mut state_set_queue = vec![vec![ + grammar.variables[i].start_state, + grammar.variables[j].start_state, + ]]; + let mut result = ( + TokenConflictStatus::default(), + TokenConflictStatus::default(), + ); + + while let Some(state_set) = state_set_queue.pop() { + // Don't pursue states where there's no potential for conflict. + if variable_ids_for_states(&state_set, grammar).count() > 1 { + cursor.reset(state_set); + } else { + continue; + } + + let mut completion = None; + for (id, precedence) in cursor.completions() { + if let Some((prev_id, prev_precedence)) = completion { + if id == prev_id { + continue; + } + + // Prefer tokens with higher precedence. For tokens with equal precedence, + // prefer those listed earlier in the grammar. 
+ let winning_id; + if prefer_token(grammar, (prev_precedence, prev_id), (precedence, id)) { + winning_id = prev_id; + } else { + winning_id = id; + completion = Some((id, precedence)); + } + + if winning_id == i { + result.0.matches_same_string = true; + result.0.does_overlap = true; + } else { + result.1.matches_same_string = true; + result.1.does_overlap = true; + } + } else { + completion = Some((id, precedence)); + } + } + + for (chars, advance_precedence, next_states) in cursor.grouped_successors() { + let mut can_advance = true; + if let Some((completed_id, completed_precedence)) = completion { + let mut other_id = None; + let mut successor_contains_completed_id = false; + for variable_id in variable_ids_for_states(&next_states, grammar) { + if variable_id == completed_id { + successor_contains_completed_id = true; + break; + } else { + other_id = Some(variable_id); + } + } + + if let (Some(other_id), false) = (other_id, successor_contains_completed_id) { + let winning_id; + if advance_precedence < completed_precedence { + winning_id = completed_id; + can_advance = false; + } else { + winning_id = other_id; + } + + if winning_id == i { + result.0.does_overlap = true; + if chars.does_intersect(&following_chars[j]) { + result.0.does_match_valid_continuation = true; + } + } else { + result.1.does_overlap = true; + if chars.does_intersect(&following_chars[i]) { + result.1.does_match_valid_continuation = true; + } + } + } + } + + if can_advance && visited_state_sets.insert(next_states.clone()) { + state_set_queue.push(next_states); + } + } + } + result +} + +fn prefer_token(grammar: &LexicalGrammar, left: (i32, usize), right: (i32, usize)) -> bool { + if left.0 > right.0 { + return true; + } else if left.0 < right.0 { + return false; + } + + match ( + grammar.variables[left.1].is_string, + grammar.variables[right.1].is_string, + ) { + (true, false) => return true, + (false, true) => return false, + _ => {} + } + + left.1 < right.1 +} + +fn variable_ids_for_states<'a>( + state_ids: &'a Vec<u32>, + grammar: &'a LexicalGrammar, +) -> impl Iterator<Item = usize> + 'a { + let mut prev = None; + state_ids.iter().filter_map(move |state_id| { + let variable_id = grammar.variable_index_for_nfa_state(*state_id); + if prev != Some(variable_id) { + prev = Some(variable_id); + prev + } else { + None + } + }) } #[cfg(test)] @@ -42,7 +264,7 @@ mod tests { use super::*; use crate::grammars::{Variable, VariableType}; use crate::prepare_grammar::{expand_tokens, ExtractedLexicalGrammar}; - use crate::rules::Rule; + use crate::rules::{Rule, Symbol}; #[test] fn test_starting_characters() { @@ -63,7 +285,7 @@ mod tests { }) .unwrap(); - let token_map = TokenConflictMap::new(&grammar); + let token_map = TokenConflictMap::new(&grammar, Vec::new()); assert_eq!( token_map.starting_chars_by_index[0], @@ -74,4 +296,61 @@ mod tests { CharacterSet::empty().add_range('d', 'e') ); } + + #[test] + fn test_token_conflicts() { + let grammar = expand_tokens(ExtractedLexicalGrammar { + separators: Vec::new(), + variables: vec![ + Variable { + name: "in".to_string(), + kind: VariableType::Named, + rule: Rule::string("in"), + }, + Variable { + name: "identifier".to_string(), + kind: VariableType::Named, + rule: Rule::pattern("\\w+"), + }, + Variable { + name: "instanceof".to_string(), + kind: VariableType::Named, + rule: Rule::string("instanceof"), + }, + ], + }) + .unwrap(); + + let var = |name| index_of_var(&grammar, name); + + let token_map = TokenConflictMap::new( + &grammar, + vec![ + 
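Each `LookaheadSet` in this vector lists the tokens that may immediately follow the corresponding token. For reference, a hedged sketch of how `get_following_chars` above turns those following tokens into following characters (plain `Vec`s stand in for `CharacterSet` and `LookaheadSet`):

    fn following_chars(starting: &[Vec<char>], following_tokens: &[Vec<usize>]) -> Vec<Vec<char>> {
        following_tokens
            .iter()
            .map(|tokens| {
                // Union of the starting characters of every possible follower.
                let mut chars: Vec<char> = tokens
                    .iter()
                    .flat_map(|&token| starting[token].iter().copied())
                    .collect();
                chars.sort_unstable();
                chars.dedup();
                chars
            })
            .collect()
    }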
LookaheadSet::with(&[Symbol::terminal(var("identifier"))]), + LookaheadSet::with(&[Symbol::terminal(var("in"))]), + LookaheadSet::with(&[Symbol::terminal(var("identifier"))]), + ], + ); + + // Given the string "in", the `in` token is preferred over the `identifier` token + assert!(token_map.does_match_same_string(var("in"), var("identifier"))); + assert!(!token_map.does_match_same_string(var("identifier"), var("in"))); + + // Depending on what character follows, the string "in" may be treated as part of an + // `identifier` token. + assert!(token_map.does_match_valid_continuation(var("identifier"), var("in"))); + + // Depending on what character follows, the string "instanceof" may be treated as part of + // an `identifier` token. + assert!(token_map.does_match_valid_continuation(var("identifier"), var("instanceof"))); + assert!(token_map.does_match_valid_continuation(var("instanceof"), var("in"))); + } + + fn index_of_var(grammar: &LexicalGrammar, name: &str) -> usize { + grammar + .variables + .iter() + .position(|v| v.name == name) + .unwrap() + } } diff --git a/src/grammars.rs b/src/grammars.rs index b751e4e4..18da86d8 100644 --- a/src/grammars.rs +++ b/src/grammars.rs @@ -36,6 +36,7 @@ pub(crate) struct InputGrammar { pub(crate) struct LexicalVariable { pub name: String, pub kind: VariableType, + pub is_string: bool, pub start_state: u32, } @@ -179,6 +180,12 @@ impl Variable { } } +impl LexicalGrammar { + pub fn variable_index_for_nfa_state(&self, state_id: u32) -> usize { + self.variables.iter().position(|v| v.start_state >= state_id).unwrap() + } +} + impl SyntaxVariable { pub fn is_auxiliary(&self) -> bool { self.kind == VariableType::Auxiliary diff --git a/src/nfa.rs b/src/nfa.rs index 4a4fa17b..738d1b40 100644 --- a/src/nfa.rs +++ b/src/nfa.rs @@ -97,6 +97,19 @@ impl CharacterSet { panic!("Called add with a negated character set"); } + pub fn does_intersect(&self, other: &CharacterSet) -> bool { + match self { + CharacterSet::Include(chars) => match other { + CharacterSet::Include(other_chars) => compare_chars(chars, other_chars).common, + CharacterSet::Exclude(other_chars) => compare_chars(chars, other_chars).left_only, + }, + CharacterSet::Exclude(chars) => match other { + CharacterSet::Include(other_chars) => compare_chars(chars, other_chars).right_only, + CharacterSet::Exclude(_) => true, + }, + } + } + pub fn remove_intersection(&mut self, other: &mut CharacterSet) -> CharacterSet { match self { CharacterSet::Include(chars) => match other { @@ -152,14 +165,14 @@ impl Ord for CharacterSet { match self { CharacterSet::Include(chars) => { if let CharacterSet::Include(other_chars) = other { - compare_chars(chars, other_chars) + order_chars(chars, other_chars) } else { Ordering::Less } } CharacterSet::Exclude(chars) => { if let CharacterSet::Exclude(other_chars) = other { - compare_chars(chars, other_chars) + order_chars(chars, other_chars) } else { Ordering::Greater } @@ -197,7 +210,39 @@ fn remove_chars(left: &mut Vec, right: &mut Vec, mutate_right: bool) result } -fn compare_chars(chars: &Vec, other_chars: &Vec) -> Ordering { +struct SetComparision { + left_only: bool, + common: bool, + right_only: bool, +} + +fn compare_chars(left: &Vec, right: &Vec) -> SetComparision { + let mut result = SetComparision { + left_only: false, + common: false, + right_only: false, + }; + let mut left = left.iter().cloned(); + let mut right = right.iter().cloned(); + let mut i = left.next(); + let mut j = right.next(); + while let (Some(left_char), Some(right_char)) = (i, j) { + if left_char < 
right_char { + i = left.next(); + result.left_only = true; + } else if left_char > right_char { + j = right.next(); + result.right_only = true; + } else { + i = left.next(); + j = right.next(); + result.common = true; + } + } + result +} + +fn order_chars(chars: &Vec, other_chars: &Vec) -> Ordering { if chars.is_empty() { if other_chars.is_empty() { Ordering::Equal @@ -207,19 +252,15 @@ fn compare_chars(chars: &Vec, other_chars: &Vec) -> Ordering { } else if other_chars.is_empty() { Ordering::Greater } else { - let mut other_c = other_chars.iter(); - for c in chars.iter() { - if let Some(other_c) = other_c.next() { - let cmp = c.cmp(other_c); - if cmp != Ordering::Equal { - return cmp; - } - } else { - return Ordering::Greater; - } + let cmp = chars.len().cmp(&other_chars.len()); + if cmp != Ordering::Equal { + return cmp; } - if other_c.next().is_some() { - return Ordering::Less; + for (c, other_c) in chars.iter().zip(other_chars.iter()) { + let cmp = c.cmp(other_c); + if cmp != Ordering::Equal { + return cmp; + } } Ordering::Equal } @@ -233,10 +274,6 @@ impl Nfa { pub fn last_state_id(&self) -> u32 { self.states.len() as u32 - 1 } - - pub fn prepend(&mut self, f: impl Fn(u32) -> NfaState) { - self.states.push(f(self.last_state_id())); - } } impl fmt::Debug for Nfa { @@ -325,11 +362,17 @@ impl<'a> NfaCursor<'a> { while i < result.len() { let intersection = result[i].0.remove_intersection(&mut chars); if !intersection.is_empty() { - let mut states = result[i].2.clone(); - let mut precedence = result[i].1; - states.push(state); - result.insert(i, (intersection, max(precedence, prec), states)); - i += 1; + if result[i].0.is_empty() { + result[i].0 = intersection; + result[i].1 = max(result[i].1, prec); + result[i].2.push(state); + } else { + let mut states = result[i].2.clone(); + let mut precedence = result[i].1; + states.push(state); + result.insert(i, (intersection, max(precedence, prec), states)); + i += 1; + } } i += 1; } @@ -341,27 +384,18 @@ impl<'a> NfaCursor<'a> { result } - pub fn finished_id(&self) -> Option<(usize, i32)> { - let mut result = None; - for state_id in self.state_ids.iter() { + pub fn completions(&self) -> impl Iterator + '_ { + self.state_ids.iter().filter_map(move |state_id| { if let NfaState::Accept { variable_index, precedence, } = self.nfa.states[*state_id as usize] { - match result { - None => result = Some((variable_index, precedence)), - Some((existing_id, existing_precedence)) => { - if precedence > existing_precedence - || (precedence == existing_precedence && variable_index < existing_id) - { - result = Some((variable_index, precedence)) - } - } - } + Some((variable_index, precedence)) + } else { + None } - } - result + }) } pub fn in_separator(&self) -> bool { @@ -467,7 +501,7 @@ mod tests { } #[test] - fn test_character_set_intersection() { + fn test_character_set_remove_intersection() { // whitelist - whitelist // both sets contain 'c', 'd', and 'f' let mut a = CharacterSet::empty().add_range('a', 'f'); @@ -529,4 +563,46 @@ mod tests { assert_eq!(a, CharacterSet::Include(vec!['f', 'g', 'h'])); assert_eq!(b, CharacterSet::Include(vec!['a', 'b'])); } + + #[test] + fn test_character_set_does_intersect() { + let (a, b) = (CharacterSet::empty(), CharacterSet::empty()); + assert!(!a.does_intersect(&b)); + assert!(!b.does_intersect(&a)); + + let (a, b) = ( + CharacterSet::empty().add_char('a'), + CharacterSet::empty().add_char('a'), + ); + assert!(a.does_intersect(&b)); + assert!(b.does_intersect(&a)); + + let (a, b) = ( + CharacterSet::empty().add_char('b'), 
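The sorted-merge walk used by `compare_chars` above generalizes a familiar pattern; a minimal sketch of the same idea, simplified to a pure intersection test over sorted `char` slices: because both lists stay sorted, one simultaneous pass classifies every character in O(n + m) time.

    fn shares_element(left: &[char], right: &[char]) -> bool {
        let (mut i, mut j) = (0, 0);
        while i < left.len() && j < right.len() {
            if left[i] < right[j] {
                i += 1; // present only on the left
            } else if left[i] > right[j] {
                j += 1; // present only on the right
            } else {
                return true; // common element found
            }
        }
        false
    }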
+ CharacterSet::empty().add_char('a').add_char('c'), + ); + assert!(!a.does_intersect(&b)); + assert!(!b.does_intersect(&a)); + + let (a, b) = ( + CharacterSet::Include(vec!['b']), + CharacterSet::Exclude(vec!['a', 'b', 'c']), + ); + assert!(!a.does_intersect(&b)); + assert!(!b.does_intersect(&a)); + + let (a, b) = ( + CharacterSet::Include(vec!['b']), + CharacterSet::Exclude(vec!['a', 'c']), + ); + assert!(a.does_intersect(&b)); + assert!(b.does_intersect(&a)); + + let (a, b) = ( + CharacterSet::Exclude(vec!['a']), + CharacterSet::Exclude(vec!['a']), + ); + assert!(a.does_intersect(&b)); + assert!(b.does_intersect(&a)); + } } diff --git a/src/prepare_grammar/expand_tokens.rs b/src/prepare_grammar/expand_tokens.rs index b0d2ae04..2b7e7b4d 100644 --- a/src/prepare_grammar/expand_tokens.rs +++ b/src/prepare_grammar/expand_tokens.rs @@ -13,6 +13,14 @@ struct NfaBuilder { precedence_stack: Vec, } +fn is_string(rule: &Rule) -> bool { + match rule { + Rule::String(_) => true, + Rule::Metadata { rule, .. } => is_string(rule), + _ => false + } +} + pub(crate) fn expand_tokens(mut grammar: ExtractedLexicalGrammar) -> Result { let mut builder = NfaBuilder { nfa: Nfa::new(), @@ -58,6 +66,7 @@ pub(crate) fn expand_tokens(mut grammar: ExtractedLexicalGrammar) -> Result Result { if self.expand_regex(ast, next_state_id)? { - self.nfa - .prepend(|last_state_id| NfaState::Split(next_state_id, last_state_id)); + self.push_split(next_state_id); Ok(true) } else { Ok(false) @@ -265,8 +269,7 @@ impl NfaBuilder { fn expand_zero_or_more(&mut self, ast: &Ast, next_state_id: u32) -> Result { if self.expand_one_or_more(&ast, next_state_id)? { - self.nfa - .prepend(|last_state_id| NfaState::Split(last_state_id, next_state_id)); + self.push_split(next_state_id); Ok(true) } else { Ok(false) @@ -333,6 +336,11 @@ impl NfaBuilder { }); } + fn push_split(&mut self, state_id: u32) { + let last_state_id = self.nfa.last_state_id(); + self.nfa.states.push(NfaState::Split(state_id, last_state_id)); + } + fn add_precedence(&mut self, prec: i32, mut state_ids: Vec) { let mut i = 0; while i < state_ids.len() { @@ -371,10 +379,10 @@ mod tests { let mut start_char = 0; let mut end_char = 0; for c in s.chars() { - if let Some((id, finished_precedence)) = cursor.finished_id() { - if result.is_none() || result_precedence <= finished_precedence { + for (id, precedence) in cursor.completions() { + if result.is_none() || result_precedence <= precedence { result = Some((id, &s[start_char..end_char])); - result_precedence = finished_precedence; + result_precedence = precedence; } } if cursor.advance(c) { @@ -387,10 +395,10 @@ mod tests { } } - if let Some((id, finished_precedence)) = cursor.finished_id() { - if result.is_none() || result_precedence <= finished_precedence { + for (id, precedence) in cursor.completions() { + if result.is_none() || result_precedence <= precedence { result = Some((id, &s[start_char..end_char])); - result_precedence = finished_precedence; + result_precedence = precedence; } } diff --git a/src/prepare_grammar/extract_simple_aliases.rs b/src/prepare_grammar/extract_simple_aliases.rs index ff7204a0..ee748f5d 100644 --- a/src/prepare_grammar/extract_simple_aliases.rs +++ b/src/prepare_grammar/extract_simple_aliases.rs @@ -137,16 +137,19 @@ mod tests { LexicalVariable { name: "t1".to_string(), kind: VariableType::Anonymous, + is_string: true, start_state: 0, }, LexicalVariable { name: "t2".to_string(), kind: VariableType::Anonymous, + is_string: true, start_state: 0, }, LexicalVariable { name: "t3".to_string(), kind: 
VariableType::Anonymous, + is_string: true, start_state: 0, } ], From a46b8fcb46a1f8799bd50ebe7e04e7cddf4bff2d Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Tue, 1 Jan 2019 13:47:29 -0800 Subject: [PATCH 080/102] Implement parse state merging --- src/build_tables/build_parse_table.rs | 13 +- src/build_tables/coincident_tokens.rs | 36 ++++ src/build_tables/item.rs | 32 +++- src/build_tables/mod.rs | 88 +++++++++- src/build_tables/shrink_parse_table.rs | 158 +++++++++++++++++- ...ken_conflict_map.rs => token_conflicts.rs} | 23 ++- src/nfa.rs | 41 +++-- src/prepare_grammar/expand_tokens.rs | 12 +- src/tables.rs | 1 + 9 files changed, 364 insertions(+), 40 deletions(-) create mode 100644 src/build_tables/coincident_tokens.rs rename src/build_tables/{token_conflict_map.rs => token_conflicts.rs} (92%) diff --git a/src/build_tables/build_parse_table.rs b/src/build_tables/build_parse_table.rs index a7911689..2fe6fd8d 100644 --- a/src/build_tables/build_parse_table.rs +++ b/src/build_tables/build_parse_table.rs @@ -7,7 +7,8 @@ use crate::tables::{ AliasSequenceId, ParseAction, ParseState, ParseStateId, ParseTable, ParseTableEntry, }; use core::ops::Range; -use std::collections::hash_map::Entry; +use std::hash::Hasher; +use std::collections::hash_map::{Entry, DefaultHasher}; use std::collections::{HashMap, HashSet, VecDeque}; use std::fmt::Write; @@ -44,14 +45,13 @@ impl<'a> ParseTableBuilder<'a> { self.parse_table.alias_sequences.push(Vec::new()); // Ensure that the error state has index 0. - let error_state_id = - self.add_parse_state(&Vec::new(), &Vec::new(), ParseItemSet::default()); + self.add_parse_state(&Vec::new(), &Vec::new(), ParseItemSet::default()); self.add_parse_state( &Vec::new(), &Vec::new(), ParseItemSet::with( - [(ParseItem::start(), LookaheadSet::with(&[Symbol::end()]))] + [(ParseItem::start(), LookaheadSet::with([Symbol::end()].iter().cloned()))] .iter() .cloned(), ), @@ -78,6 +78,10 @@ impl<'a> ParseTableBuilder<'a> { } } + let mut hasher = DefaultHasher::new(); + item_set.hash_unfinished_items(&mut hasher); + let unfinished_item_signature = hasher.finish(); + match self.state_ids_by_item_set.entry(item_set) { Entry::Occupied(o) => *o.get(), Entry::Vacant(v) => { @@ -87,6 +91,7 @@ impl<'a> ParseTableBuilder<'a> { lex_state_id: 0, terminal_entries: HashMap::new(), nonterminal_entries: HashMap::new(), + unfinished_item_signature, }); self.parse_state_queue.push_back(ParseStateQueueEntry { state_id, diff --git a/src/build_tables/coincident_tokens.rs b/src/build_tables/coincident_tokens.rs new file mode 100644 index 00000000..10707489 --- /dev/null +++ b/src/build_tables/coincident_tokens.rs @@ -0,0 +1,36 @@ +use crate::rules::Symbol; +use crate::tables::{ParseStateId, ParseTable}; +use std::collections::{HashMap, HashSet}; + +pub(crate) struct CoincidentTokenIndex { + entries: HashMap<(Symbol, Symbol), HashSet>, + empty: HashSet, +} + +impl CoincidentTokenIndex { + pub fn new(table: &ParseTable) -> Self { + let mut entries = HashMap::new(); + for (i, state) in table.states.iter().enumerate() { + for symbol in state.terminal_entries.keys() { + for other_symbol in state.terminal_entries.keys() { + entries + .entry((*symbol, *other_symbol)) + .or_insert(HashSet::new()) + .insert(i); + } + } + } + Self { + entries, + empty: HashSet::new(), + } + } + + pub fn states_with(&self, a: Symbol, b: Symbol) -> &HashSet { + self.entries.get(&(a, b)).unwrap_or(&self.empty) + } + + pub fn contains(&self, a: Symbol, b: Symbol) -> bool { + self.entries.contains_key(&(a, b)) + } +} diff --git 
a/src/build_tables/item.rs b/src/build_tables/item.rs index 28723d24..4cd2f643 100644 --- a/src/build_tables/item.rs +++ b/src/build_tables/item.rs @@ -2,11 +2,11 @@ use crate::grammars::{LexicalGrammar, Production, ProductionStep, SyntaxGrammar} use crate::rules::Associativity; use crate::rules::{Symbol, SymbolType}; use smallbitvec::SmallBitVec; +use std::cmp::Ordering; use std::collections::BTreeMap; use std::fmt; use std::hash::{Hash, Hasher}; use std::u32; -use std::cmp::Ordering; lazy_static! { static ref START_PRODUCTION: Production = Production { @@ -85,10 +85,10 @@ impl LookaheadSet { .chain(if self.eof { Some(Symbol::end()) } else { None }) } - pub fn with<'a>(symbols: impl IntoIterator) -> Self { + pub fn with(symbols: impl IntoIterator) -> Self { let mut result = Self::new(); for symbol in symbols { - result.insert(*symbol); + result.insert(symbol); } result } @@ -219,6 +219,21 @@ impl<'a> ParseItemSet<'a> { result } + pub fn hash_unfinished_items(&self, h: &mut impl Hasher) { + let mut previous_variable_index = u32::MAX; + let mut previous_step_index = u32::MAX; + for item in self.entries.keys() { + if item.step().is_none() && item.variable_index != previous_variable_index + || item.step_index != previous_step_index + { + h.write_u32(item.variable_index); + h.write_u32(item.step_index); + previous_variable_index = item.variable_index; + previous_step_index = item.step_index; + } + } + } + pub fn display_with( &'a self, syntax_grammar: &'a SyntaxGrammar, @@ -369,11 +384,18 @@ impl<'a> Ord for ParseItem<'a> { if o != Ordering::Equal { return o; } - let o = self.production.dynamic_precedence.cmp(&other.production.dynamic_precedence); + let o = self + .production + .dynamic_precedence + .cmp(&other.production.dynamic_precedence); if o != Ordering::Equal { return o; } - let o = self.production.steps.len().cmp(&other.production.steps.len()); + let o = self + .production + .steps + .len() + .cmp(&other.production.steps.len()); if o != Ordering::Equal { return o; } diff --git a/src/build_tables/mod.rs b/src/build_tables/mod.rs index d1983068..665c56a0 100644 --- a/src/build_tables/mod.rs +++ b/src/build_tables/mod.rs @@ -1,18 +1,20 @@ -use crate::error::Result; -use crate::grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar}; -use crate::rules::{AliasMap, Symbol}; -use crate::tables::{LexTable, ParseTable}; - mod build_parse_table; +mod coincident_tokens; mod item; mod item_set_builder; mod lex_table_builder; mod shrink_parse_table; -mod token_conflict_map; +mod token_conflicts; use self::build_parse_table::build_parse_table; +use self::coincident_tokens::CoincidentTokenIndex; +use self::item::LookaheadSet; use self::shrink_parse_table::shrink_parse_table; -use self::token_conflict_map::TokenConflictMap; +use self::token_conflicts::TokenConflictMap; +use crate::error::Result; +use crate::grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar}; +use crate::rules::{AliasMap, Symbol}; +use crate::tables::{LexTable, ParseAction, ParseTable, ParseTableEntry}; pub(crate) fn build_tables( syntax_grammar: &SyntaxGrammar, @@ -23,6 +25,76 @@ pub(crate) fn build_tables( let (mut parse_table, following_tokens) = build_parse_table(syntax_grammar, lexical_grammar, inlines)?; let token_conflict_map = TokenConflictMap::new(lexical_grammar, following_tokens); - shrink_parse_table(&mut parse_table, syntax_grammar, simple_aliases); + let coincident_token_index = CoincidentTokenIndex::new(&parse_table); + populate_error_state( + &mut parse_table, + syntax_grammar, + lexical_grammar, + 
&coincident_token_index, + &token_conflict_map, + ); + shrink_parse_table( + &mut parse_table, + syntax_grammar, + simple_aliases, + &token_conflict_map, + ); Ok((parse_table, LexTable::default(), LexTable::default(), None)) } + +fn populate_error_state( + parse_table: &mut ParseTable, + syntax_grammar: &SyntaxGrammar, + lexical_grammar: &LexicalGrammar, + coincident_token_index: &CoincidentTokenIndex, + token_conflict_map: &TokenConflictMap, +) { + let state = &mut parse_table.states[0]; + let n = lexical_grammar.variables.len(); + let conflict_free_tokens = LookaheadSet::with((0..n).into_iter().filter_map(|i| { + let is_conflict_free = (0..n).into_iter().all(|j| { + j == i + || coincident_token_index.contains(Symbol::terminal(i), Symbol::terminal(j)) + || !token_conflict_map.does_conflict(i, j) + }); + if is_conflict_free { + Some(Symbol::terminal(i)) + } else { + None + } + })); + + let recover_entry = ParseTableEntry { + reusable: false, + actions: vec![ParseAction::Recover], + }; + + for i in 0..n { + let symbol = Symbol::terminal(i); + let can_be_used_for_recovery = conflict_free_tokens.contains(&symbol) + || conflict_free_tokens.iter().all(|t| { + coincident_token_index.contains(symbol, t) + || !token_conflict_map.does_conflict(i, t.index) + }); + if can_be_used_for_recovery { + eprintln!("include {}", &lexical_grammar.variables[symbol.index].name); + state + .terminal_entries + .entry(symbol) + .or_insert_with(|| recover_entry.clone()); + } else { + eprintln!("exclude {}", &lexical_grammar.variables[symbol.index].name); + } + } + + for (i, external_token) in syntax_grammar.external_tokens.iter().enumerate() { + if external_token.corresponding_internal_token.is_none() { + state + .terminal_entries + .entry(Symbol::external(i)) + .or_insert_with(|| recover_entry.clone()); + } + } + + state.terminal_entries.insert(Symbol::end(), recover_entry); +} diff --git a/src/build_tables/shrink_parse_table.rs b/src/build_tables/shrink_parse_table.rs index 8e826f5c..026c3058 100644 --- a/src/build_tables/shrink_parse_table.rs +++ b/src/build_tables/shrink_parse_table.rs @@ -1,14 +1,17 @@ +use super::token_conflicts::TokenConflictMap; use crate::grammars::{SyntaxGrammar, VariableType}; -use crate::rules::AliasMap; -use crate::tables::{ParseAction, ParseTable}; +use crate::rules::{AliasMap, Symbol}; +use crate::tables::{ParseAction, ParseState, ParseTable, ParseTableEntry}; use std::collections::{HashMap, HashSet}; pub(crate) fn shrink_parse_table( parse_table: &mut ParseTable, syntax_grammar: &SyntaxGrammar, simple_aliases: &AliasMap, + token_conflict_map: &TokenConflictMap, ) { remove_unit_reductions(parse_table, syntax_grammar, simple_aliases); + merge_compatible_states(parse_table, syntax_grammar, token_conflict_map); remove_unused_states(parse_table); } @@ -86,6 +89,157 @@ fn remove_unit_reductions( } } +fn merge_compatible_states( + parse_table: &mut ParseTable, + syntax_grammar: &SyntaxGrammar, + token_conflict_map: &TokenConflictMap, +) { + let mut state_ids_by_signature = HashMap::new(); + for (i, state) in parse_table.states.iter().enumerate() { + state_ids_by_signature + .entry(state.unfinished_item_signature) + .or_insert(Vec::new()) + .push(i); + } + + let mut deleted_states = HashSet::new(); + loop { + let mut state_replacements = HashMap::new(); + for (_, state_ids) in &state_ids_by_signature { + for i in state_ids { + for j in state_ids { + if j == i { + break; + } + if deleted_states.contains(j) || deleted_states.contains(i) { + continue; + } + if 
merge_parse_state(syntax_grammar, token_conflict_map, parse_table, *j, *i) { + deleted_states.insert(*i); + state_replacements.insert(*i, *j); + } + } + } + } + + if state_replacements.is_empty() { + break; + } + + for state in parse_table.states.iter_mut() { + state.update_referenced_states(|other_state_id, _| { + *state_replacements + .get(&other_state_id) + .unwrap_or(&other_state_id) + }); + } + } +} + +fn merge_parse_state( + syntax_grammar: &SyntaxGrammar, + token_conflict_map: &TokenConflictMap, + parse_table: &mut ParseTable, + left: usize, + right: usize, +) -> bool { + let left_state = &parse_table.states[left]; + let right_state = &parse_table.states[right]; + + if left_state.nonterminal_entries != right_state.nonterminal_entries { + return false; + } + + for (symbol, left_entry) in &left_state.terminal_entries { + if let Some(right_entry) = right_state.terminal_entries.get(symbol) { + if right_entry.actions != left_entry.actions { + return false; + } + } else if !can_add_entry_to_state( + syntax_grammar, + token_conflict_map, + right_state, + *symbol, + left_entry, + ) { + return false; + } + } + + eprintln!("maybe merge {} {}", left, right); + + let mut symbols_to_add = Vec::new(); + for (symbol, right_entry) in &right_state.terminal_entries { + if !left_state.terminal_entries.contains_key(&symbol) { + if !can_add_entry_to_state( + syntax_grammar, + token_conflict_map, + left_state, + *symbol, + right_entry, + ) { + return false; + } + symbols_to_add.push(*symbol); + } + } + + for symbol in symbols_to_add { + let entry = parse_table.states[right].terminal_entries[&symbol].clone(); + parse_table.states[left] + .terminal_entries + .insert(symbol, entry); + } + + true +} + +fn can_add_entry_to_state( + syntax_grammar: &SyntaxGrammar, + token_conflict_map: &TokenConflictMap, + state: &ParseState, + token: Symbol, + entry: &ParseTableEntry, +) -> bool { + // Do not add external tokens; they could conflict lexically with any of the state's + // existing lookahead tokens. + if token.is_external() { + return false; + } + + // Only merge parse states by allowing existing reductions to happen + // with additional lookahead tokens. Do not alter parse states in ways + // that allow entirely new types of actions to happen. + if state.terminal_entries.iter().all(|(_, e)| e != entry) { + return false; + } + match entry.actions.last() { + Some(ParseAction::Reduce { .. }) => {} + _ => return false, + } + + // Do not add tokens which are both internal and external. Their validity could + // influence the behavior of the external scanner. + if syntax_grammar + .external_tokens + .iter() + .any(|t| t.corresponding_internal_token == Some(token)) + { + return false; + } + + // Do not add a token if it conflicts with an existing token. 
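Taken together, the guards in this function can be summarized in one hedged sketch (a hypothetical helper; the real checks operate on `ParseState`, the grammar, and the conflict map):

    fn may_add_lookahead(
        is_external: bool,                // external tokens are never added
        duplicates_existing_reduce: bool, // entry must repeat a reduce the state already has
        mirrors_external_token: bool,     // internal twins of external tokens are excluded
        lexically_conflicts: bool,        // must not conflict with current lookaheads
    ) -> bool {
        !is_external && duplicates_existing_reduce && !mirrors_external_token && !lexically_conflicts
    }

The last guard is the reason `TokenConflictMap` is threaded into `shrink_parse_table`: a merge is only safe if every lookahead one state gains still lexes unambiguously in the other state's contexts.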
+ if token.is_terminal() { + for existing_token in state.terminal_entries.keys() { + if token_conflict_map.does_conflict(token.index, existing_token.index) { + return false; + } + } + } + + true +} + fn remove_unused_states(parse_table: &mut ParseTable) { let mut state_usage_map = vec![false; parse_table.states.len()]; for state in &parse_table.states { diff --git a/src/build_tables/token_conflict_map.rs b/src/build_tables/token_conflicts.rs similarity index 92% rename from src/build_tables/token_conflict_map.rs rename to src/build_tables/token_conflicts.rs index 52c68cc7..09d5e97c 100644 --- a/src/build_tables/token_conflict_map.rs +++ b/src/build_tables/token_conflicts.rs @@ -8,6 +8,7 @@ use std::fmt; struct TokenConflictStatus { does_overlap: bool, does_match_valid_continuation: bool, + does_match_separators: bool, matches_same_string: bool, } @@ -46,8 +47,9 @@ impl TokenConflictMap { self.status_matrix[matrix_index(self.n, i, j)].matches_same_string } - pub fn does_match_valid_continuation(&self, i: usize, j: usize) -> bool { - self.status_matrix[matrix_index(self.n, i, j)].does_match_valid_continuation + pub fn does_conflict(&self, i: usize, j: usize) -> bool { + let entry = &self.status_matrix[matrix_index(self.n, i, j)]; + entry.does_match_valid_continuation || entry.does_match_separators } pub fn does_overlap(&self, i: usize, j: usize) -> bool { @@ -207,10 +209,15 @@ fn compute_conflict_status( if chars.does_intersect(&following_chars[j]) { result.0.does_match_valid_continuation = true; } + if cursor.in_separator() { + result.0.does_match_separators = true; + } } else { result.1.does_overlap = true; if chars.does_intersect(&following_chars[i]) { result.1.does_match_valid_continuation = true; + } else { + result.1.does_match_separators = true; } } } @@ -326,9 +333,9 @@ mod tests { let token_map = TokenConflictMap::new( &grammar, vec![ - LookaheadSet::with(&[Symbol::terminal(var("identifier"))]), - LookaheadSet::with(&[Symbol::terminal(var("in"))]), - LookaheadSet::with(&[Symbol::terminal(var("identifier"))]), + LookaheadSet::with([Symbol::terminal(var("identifier"))].iter().cloned()), + LookaheadSet::with([Symbol::terminal(var("in"))].iter().cloned()), + LookaheadSet::with([Symbol::terminal(var("identifier"))].iter().cloned()), ], ); @@ -338,12 +345,12 @@ mod tests { // Depending on what character follows, the string "in" may be treated as part of an // `identifier` token. - assert!(token_map.does_match_valid_continuation(var("identifier"), var("in"))); + assert!(token_map.does_conflict(var("identifier"), var("in"))); // Depending on what character follows, the string "instanceof" may be treated as part of // an `identifier` token. 
- assert!(token_map.does_match_valid_continuation(var("identifier"), var("instanceof"))); - assert!(token_map.does_match_valid_continuation(var("instanceof"), var("in"))); + assert!(token_map.does_conflict(var("identifier"), var("instanceof"))); + assert!(token_map.does_conflict(var("instanceof"), var("in"))); } fn index_of_var(grammar: &LexicalGrammar, name: &str) -> usize { diff --git a/src/nfa.rs b/src/nfa.rs index 738d1b40..ee39d178 100644 --- a/src/nfa.rs +++ b/src/nfa.rs @@ -86,15 +86,34 @@ impl CharacterSet { } pub fn add(self, other: &CharacterSet) -> Self { - if let CharacterSet::Include(other_chars) = other { - if let CharacterSet::Include(mut chars) = self { - chars.extend(other_chars); - chars.sort_unstable(); - chars.dedup(); - return CharacterSet::Include(chars); - } + match self { + CharacterSet::Include(mut chars) => match other { + CharacterSet::Include(other_chars) => { + chars.extend(other_chars); + chars.sort_unstable(); + chars.dedup(); + CharacterSet::Include(chars) + } + CharacterSet::Exclude(other_chars) => { + let excluded_chars = other_chars + .iter() + .cloned() + .filter(|c| !chars.contains(&c)) + .collect(); + CharacterSet::Exclude(excluded_chars) + } + }, + CharacterSet::Exclude(mut chars) => match other { + CharacterSet::Include(other_chars) => { + chars.retain(|c| !other_chars.contains(&c)); + CharacterSet::Exclude(chars) + } + CharacterSet::Exclude(other_chars) => { + chars.retain(|c| other_chars.contains(&c)); + CharacterSet::Exclude(chars) + }, + }, } - panic!("Called add with a negated character set"); } pub fn does_intersect(&self, other: &CharacterSet) -> bool { @@ -458,6 +477,9 @@ mod tests { (CharacterSet::empty().add_char('f'), 0, 4), ], vec![ + (CharacterSet::empty().add_char('d'), 0, vec![1, 2]), + (CharacterSet::empty().add_char('f'), 0, vec![1, 4]), + (CharacterSet::empty().add_char('i'), 0, vec![1, 3]), ( CharacterSet::empty() .add_range('a', 'c') @@ -467,9 +489,6 @@ mod tests { 0, vec![1], ), - (CharacterSet::empty().add_char('d'), 0, vec![1, 2]), - (CharacterSet::empty().add_char('f'), 0, vec![1, 4]), - (CharacterSet::empty().add_char('i'), 0, vec![1, 3]), ], ), ]; diff --git a/src/prepare_grammar/expand_tokens.rs b/src/prepare_grammar/expand_tokens.rs index 2b7e7b4d..4ef17b27 100644 --- a/src/prepare_grammar/expand_tokens.rs +++ b/src/prepare_grammar/expand_tokens.rs @@ -164,12 +164,20 @@ impl NfaBuilder { Err(Error::regex("Unicode character classes are not supported")) } Class::Perl(class) => { - self.push_advance(self.expand_perl_character_class(&class.kind), next_state_id); + let mut chars = self.expand_perl_character_class(&class.kind); + if class.negated { + chars = chars.negate(); + } + self.push_advance(chars, next_state_id); Ok(true) } Class::Bracketed(class) => match &class.kind { ClassSet::Item(item) => { - self.push_advance(self.expand_character_class(&item)?, next_state_id); + let mut chars = self.expand_character_class(&item)?; + if class.negated { + chars = chars.negate(); + } + self.push_advance(chars, next_state_id); Ok(true) } ClassSet::BinaryOp(_) => Err(Error::regex( diff --git a/src/tables.rs b/src/tables.rs index 0815aac8..344c4816 100644 --- a/src/tables.rs +++ b/src/tables.rs @@ -37,6 +37,7 @@ pub(crate) struct ParseState { pub terminal_entries: HashMap, pub nonterminal_entries: HashMap, pub lex_state_id: usize, + pub unfinished_item_signature: u64, } #[derive(Debug, PartialEq, Eq)] From 9824ebbbc31f7cda43f8a5aa5b3847462ab4c6aa Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Wed, 2 Jan 2019 12:34:40 -0800 Subject: 
[PATCH 081/102] Implement lex table construction --- src/build_tables/build_lex_table.rs | 124 ++++++++++++++++ src/build_tables/build_parse_table.rs | 31 ++-- src/build_tables/item_set_builder.rs | 20 +-- src/build_tables/lex_table_builder.rs | 24 --- src/build_tables/mod.rs | 131 ++++++++++++++++- src/build_tables/shrink_parse_table.rs | 2 - src/build_tables/token_conflicts.rs | 80 +++++----- src/grammars.rs | 10 +- src/main.rs | 2 +- src/nfa.rs | 130 ++++++----------- src/prepare_grammar/expand_tokens.rs | 24 ++- src/prepare_grammar/extract_tokens.rs | 17 ++- src/render/mod.rs | 195 +++++++++++++++++++++++-- src/rules.rs | 3 + src/tables.rs | 15 +- 15 files changed, 581 insertions(+), 227 deletions(-) create mode 100644 src/build_tables/build_lex_table.rs delete mode 100644 src/build_tables/lex_table_builder.rs diff --git a/src/build_tables/build_lex_table.rs b/src/build_tables/build_lex_table.rs new file mode 100644 index 00000000..aa929d97 --- /dev/null +++ b/src/build_tables/build_lex_table.rs @@ -0,0 +1,124 @@ +use super::item::LookaheadSet; +use super::token_conflicts::TokenConflictMap; +use crate::grammars::{LexicalGrammar, SyntaxGrammar}; +use crate::nfa::NfaCursor; +use crate::rules::Symbol; +use crate::tables::{AdvanceAction, LexState, LexTable, ParseTable}; +use std::collections::hash_map::Entry; +use std::collections::{HashMap, VecDeque}; + +pub(crate) fn build_lex_table( + parse_table: &mut ParseTable, + syntax_grammar: &SyntaxGrammar, + lexical_grammar: &LexicalGrammar, + keywords: &LookaheadSet, +) -> (LexTable, LexTable) { + let keyword_lex_table; + if syntax_grammar.word_token.is_some() { + let mut builder = LexTableBuilder::new(lexical_grammar); + builder.add_state_for_tokens(keywords.iter()); + keyword_lex_table = builder.table; + } else { + keyword_lex_table = LexTable::default(); + } + + let mut builder = LexTableBuilder::new(lexical_grammar); + for state in parse_table.states.iter_mut() { + let tokens = state.terminal_entries.keys().filter_map(|token| { + if token.is_terminal() { + if keywords.contains(&token) { + syntax_grammar.word_token + } else { + Some(*token) + } + } else { + None + } + }); + state.lex_state_id = builder.add_state_for_tokens(tokens); + } + + (builder.table, keyword_lex_table) +} + +struct LexTableBuilder<'a> { + lexical_grammar: &'a LexicalGrammar, + cursor: NfaCursor<'a>, + table: LexTable, + state_queue: VecDeque<(usize, Vec)>, + state_ids_by_nfa_state_set: HashMap, usize>, +} + +impl<'a> LexTableBuilder<'a> { + fn new(lexical_grammar: &'a LexicalGrammar) -> Self { + Self { + lexical_grammar, + cursor: NfaCursor::new(&lexical_grammar.nfa, vec![]), + table: LexTable::default(), + state_queue: VecDeque::new(), + state_ids_by_nfa_state_set: HashMap::new(), + } + } + + fn add_state_for_tokens(&mut self, tokens: impl Iterator) -> usize { + let nfa_states = tokens + .map(|token| self.lexical_grammar.variables[token.index].start_state) + .collect(); + let result = self.add_state(nfa_states); + while let Some((state_id, nfa_states)) = self.state_queue.pop_front() { + self.populate_state(state_id, nfa_states); + } + result + } + + fn add_state(&mut self, nfa_states: Vec) -> usize { + match self.state_ids_by_nfa_state_set.entry(nfa_states) { + Entry::Occupied(o) => *o.get(), + Entry::Vacant(v) => { + let state_id = self.table.states.len(); + self.table.states.push(LexState::default()); + self.state_queue.push_back((state_id, v.key().clone())); + v.insert(state_id); + state_id + } + } + } + + fn populate_state(&mut self, state_id: usize, nfa_states: 
Vec) { + self.cursor.reset(nfa_states); + + let mut completion = None; + for (id, prec) in self.cursor.completions() { + if let Some((prev_id, prev_precedence)) = completion { + if TokenConflictMap::prefer_token( + self.lexical_grammar, + (prev_precedence, prev_id), + (prec, id), + ) { + continue; + } + } + completion = Some((id, prec)); + } + + for (chars, advance_precedence, next_states, is_sep) in self.cursor.grouped_successors() { + if let Some((_, completed_precedence)) = completion { + if advance_precedence < completed_precedence { + continue; + } + } + let next_state_id = self.add_state(next_states); + self.table.states[state_id].advance_actions.push(( + chars, + AdvanceAction { + state: next_state_id, + in_main_token: !is_sep, + }, + )); + } + + if let Some((completion_index, _)) = completion { + self.table.states[state_id].accept_action = Some(completion_index); + } + } +} diff --git a/src/build_tables/build_parse_table.rs b/src/build_tables/build_parse_table.rs index 2fe6fd8d..c17261dc 100644 --- a/src/build_tables/build_parse_table.rs +++ b/src/build_tables/build_parse_table.rs @@ -7,10 +7,10 @@ use crate::tables::{ AliasSequenceId, ParseAction, ParseState, ParseStateId, ParseTable, ParseTableEntry, }; use core::ops::Range; -use std::hash::Hasher; -use std::collections::hash_map::{Entry, DefaultHasher}; +use std::collections::hash_map::{DefaultHasher, Entry}; use std::collections::{HashMap, HashSet, VecDeque}; use std::fmt::Write; +use std::hash::Hasher; #[derive(Clone)] struct AuxiliarySymbolInfo { @@ -31,7 +31,6 @@ struct ParseTableBuilder<'a> { item_set_builder: ParseItemSetBuilder<'a>, syntax_grammar: &'a SyntaxGrammar, lexical_grammar: &'a LexicalGrammar, - inlines: &'a InlinedProductionMap, state_ids_by_item_set: HashMap, ParseStateId>, item_sets_by_state_id: Vec>, parse_state_queue: VecDeque, @@ -51,9 +50,12 @@ impl<'a> ParseTableBuilder<'a> { &Vec::new(), &Vec::new(), ParseItemSet::with( - [(ParseItem::start(), LookaheadSet::with([Symbol::end()].iter().cloned()))] - .iter() - .cloned(), + [( + ParseItem::start(), + LookaheadSet::with([Symbol::end()].iter().cloned()), + )] + .iter() + .cloned(), ), ); @@ -69,8 +71,12 @@ impl<'a> ParseTableBuilder<'a> { item_set: ParseItemSet<'a>, ) -> ParseStateId { if preceding_symbols.len() > 1 { - let left_tokens = self.item_set_builder.last_set(&preceding_symbols[preceding_symbols.len() - 2]); - let right_tokens = self.item_set_builder.first_set(&preceding_symbols[preceding_symbols.len() - 1]); + let left_tokens = self + .item_set_builder + .last_set(&preceding_symbols[preceding_symbols.len() - 2]); + let right_tokens = self + .item_set_builder + .first_set(&preceding_symbols[preceding_symbols.len() - 1]); for left_token in left_tokens.iter() { if left_token.is_terminal() { self.following_tokens[left_token.index].insert_all(right_tokens); @@ -117,11 +123,9 @@ impl<'a> ParseTableBuilder<'a> { ); } - let item_set = self.item_set_builder.transitive_closure( - &self.item_sets_by_state_id[entry.state_id], - self.syntax_grammar, - self.inlines, - ); + let item_set = self + .item_set_builder + .transitive_closure(&self.item_sets_by_state_id[entry.state_id]); if debug { println!( @@ -606,7 +610,6 @@ pub(crate) fn build_parse_table( ParseTableBuilder { syntax_grammar, lexical_grammar, - inlines, item_set_builder: ParseItemSetBuilder::new(syntax_grammar, lexical_grammar, inlines), state_ids_by_item_set: HashMap::new(), item_sets_by_state_id: Vec::new(), diff --git a/src/build_tables/item_set_builder.rs b/src/build_tables/item_set_builder.rs 
index 8649cb52..5e61bfcc 100644 --- a/src/build_tables/item_set_builder.rs +++ b/src/build_tables/item_set_builder.rs @@ -18,6 +18,7 @@ struct FollowSetInfo { pub(crate) struct ParseItemSetBuilder<'a> { first_sets: HashMap, last_sets: HashMap, + inlines: &'a InlinedProductionMap, transitive_closure_additions: Vec>>, } @@ -36,6 +37,7 @@ impl<'a> ParseItemSetBuilder<'a> { let mut result = Self { first_sets: HashMap::new(), last_sets: HashMap::new(), + inlines, transitive_closure_additions: vec![Vec::new(); syntax_grammar.variables.len()], }; @@ -237,15 +239,12 @@ impl<'a> ParseItemSetBuilder<'a> { result } - pub(crate) fn transitive_closure( - &mut self, - item_set: &ParseItemSet<'a>, - grammar: &'a SyntaxGrammar, - inlines: &'a InlinedProductionMap, - ) -> ParseItemSet<'a> { + pub(crate) fn transitive_closure(&mut self, item_set: &ParseItemSet<'a>) -> ParseItemSet<'a> { let mut result = ParseItemSet::default(); for (item, lookaheads) in &item_set.entries { - if let Some(productions) = inlines.inlined_productions(item.production, item.step_index) + if let Some(productions) = self + .inlines + .inlined_productions(item.production, item.step_index) { for production in productions { self.add_item( @@ -273,12 +272,7 @@ impl<'a> ParseItemSetBuilder<'a> { &self.first_sets[symbol] } - fn add_item( - &self, - set: &mut ParseItemSet<'a>, - item: ParseItem<'a>, - lookaheads: &LookaheadSet, - ) { + fn add_item(&self, set: &mut ParseItemSet<'a>, item: ParseItem<'a>, lookaheads: &LookaheadSet) { if let Some(step) = item.step() { if step.symbol.is_non_terminal() { let next_step = item.successor().step(); diff --git a/src/build_tables/lex_table_builder.rs b/src/build_tables/lex_table_builder.rs deleted file mode 100644 index 86d1578b..00000000 --- a/src/build_tables/lex_table_builder.rs +++ /dev/null @@ -1,24 +0,0 @@ -use crate::rules::Symbol; -use crate::tables::LexTable; -use crate::grammars::{SyntaxGrammar, LexicalGrammar}; - -pub(crate) struct LexTableBuilder<'a> { - syntax_grammar: &'a SyntaxGrammar, - lexical_grammar: &'a LexicalGrammar, - table: LexTable, -} - -impl<'a> LexTableBuilder<'a> { - pub fn new( - syntax_grammar: &'a SyntaxGrammar, - lexical_grammar: &'a LexicalGrammar, - ) -> Self { - Self { - syntax_grammar, lexical_grammar, table: LexTable::default() - } - } - - pub fn build(self) -> (LexTable, LexTable, Option) { - (LexTable::default(), LexTable::default(), None) - } -} diff --git a/src/build_tables/mod.rs b/src/build_tables/mod.rs index 665c56a0..8b3a2db4 100644 --- a/src/build_tables/mod.rs +++ b/src/build_tables/mod.rs @@ -1,11 +1,12 @@ +mod build_lex_table; mod build_parse_table; mod coincident_tokens; mod item; mod item_set_builder; -mod lex_table_builder; mod shrink_parse_table; mod token_conflicts; +use self::build_lex_table::build_lex_table; use self::build_parse_table::build_parse_table; use self::coincident_tokens::CoincidentTokenIndex; use self::item::LookaheadSet; @@ -13,6 +14,7 @@ use self::shrink_parse_table::shrink_parse_table; use self::token_conflicts::TokenConflictMap; use crate::error::Result; use crate::grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar}; +use crate::nfa::{CharacterSet, NfaCursor}; use crate::rules::{AliasMap, Symbol}; use crate::tables::{LexTable, ParseAction, ParseTable, ParseTableEntry}; @@ -25,7 +27,22 @@ pub(crate) fn build_tables( let (mut parse_table, following_tokens) = build_parse_table(syntax_grammar, lexical_grammar, inlines)?; let token_conflict_map = TokenConflictMap::new(lexical_grammar, following_tokens); + + 
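The `LexTableBuilder` introduced in build_lex_table.rs above hinges on one memoization pattern; a hedged, self-contained sketch (simplified: the real builder also records advance actions and accept tokens per state): lex states are keyed by the exact set of NFA states they represent, so parse states that expect the same tokens share one lex state, and freshly created states wait on a queue until populated.

    use std::collections::{HashMap, VecDeque};

    struct LexStateBuilder {
        state_sets: Vec<Vec<u32>>,            // each lex state's NFA state set
        ids_by_set: HashMap<Vec<u32>, usize>,
        queue: VecDeque<usize>,               // states not yet populated
    }

    impl LexStateBuilder {
        fn add_state(&mut self, nfa_states: Vec<u32>) -> usize {
            if let Some(&id) = self.ids_by_set.get(&nfa_states) {
                return id; // an equivalent lex state already exists
            }
            let id = self.state_sets.len();
            self.state_sets.push(nfa_states.clone());
            self.ids_by_set.insert(nfa_states, id);
            self.queue.push_back(id);
            id
        }
    }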
eprintln!("{:?}", token_conflict_map); + let coincident_token_index = CoincidentTokenIndex::new(&parse_table); + let keywords = if let Some(word_token) = syntax_grammar.word_token { + identify_keywords( + lexical_grammar, + &parse_table, + word_token, + &token_conflict_map, + &coincident_token_index, + ) + } else { + LookaheadSet::new() + }; + populate_error_state( &mut parse_table, syntax_grammar, @@ -39,7 +56,14 @@ pub(crate) fn build_tables( simple_aliases, &token_conflict_map, ); - Ok((parse_table, LexTable::default(), LexTable::default(), None)) + let (main_lex_table, keyword_lex_table) = + build_lex_table(&mut parse_table, syntax_grammar, lexical_grammar, &keywords); + Ok(( + parse_table, + main_lex_table, + keyword_lex_table, + syntax_grammar.word_token, + )) } fn populate_error_state( @@ -77,13 +101,10 @@ fn populate_error_state( || !token_conflict_map.does_conflict(i, t.index) }); if can_be_used_for_recovery { - eprintln!("include {}", &lexical_grammar.variables[symbol.index].name); state .terminal_entries .entry(symbol) .or_insert_with(|| recover_entry.clone()); - } else { - eprintln!("exclude {}", &lexical_grammar.variables[symbol.index].name); } } @@ -98,3 +119,103 @@ fn populate_error_state( state.terminal_entries.insert(Symbol::end(), recover_entry); } + +fn identify_keywords( + lexical_grammar: &LexicalGrammar, + parse_table: &ParseTable, + word_token: Symbol, + token_conflict_map: &TokenConflictMap, + coincident_token_index: &CoincidentTokenIndex, +) -> LookaheadSet { + let mut cursor = NfaCursor::new(&lexical_grammar.nfa, Vec::new()); + + // First find all of the candidate keyword tokens: tokens that start with + // letters or underscore and can match the same string as a word token. + let keywords = LookaheadSet::with(lexical_grammar.variables.iter().enumerate().filter_map( + |(i, variable)| { + cursor.reset(vec![variable.start_state]); + if all_chars_are_alphabetical(&cursor) + && token_conflict_map.does_match_same_string(i, word_token.index) + { + Some(Symbol::terminal(i)) + } else { + None + } + }, + )); + + // Exclude keyword candidates that shadow another keyword candidate. + let keywords = LookaheadSet::with(keywords.iter().filter(|token| { + for other_token in keywords.iter() { + if other_token != *token + && token_conflict_map.does_match_same_string(token.index, other_token.index) + { + eprintln!( + "Exclude {} from keywords because it matches the same string as {}", + lexical_grammar.variables[token.index].name, + lexical_grammar.variables[other_token.index].name + ); + return false; + } + } + true + })); + + // Exclude keyword candidates for which substituting the keyword capture + // token would introduce new lexical conflicts with other tokens. + let keywords = LookaheadSet::with(keywords.iter().filter(|token| { + for other_index in 0..lexical_grammar.variables.len() { + if keywords.contains(&Symbol::terminal(other_index)) { + continue; + } + + // If the word token was already valid in every state containing + // this keyword candidate, then substituting the word token won't + // introduce any new lexical conflicts. 
+ if coincident_token_index + .states_with(*token, Symbol::terminal(other_index)) + .iter() + .all(|state_id| { + parse_table.states[*state_id] + .terminal_entries + .contains_key(&word_token) + }) + { + continue; + } + + if !token_conflict_map.has_same_conflict_status( + token.index, + word_token.index, + other_index, + ) { + eprintln!( + "Exclude {} from keywords because of conflict with {}", + lexical_grammar.variables[token.index].name, + lexical_grammar.variables[other_index].name + ); + return false; + } + } + + eprintln!( + "Include {} in keywords", + lexical_grammar.variables[token.index].name, + ); + true + })); + + keywords +} + +fn all_chars_are_alphabetical(cursor: &NfaCursor) -> bool { + cursor.successors().all(|(chars, _, _, is_sep)| { + if is_sep { + true + } else if let CharacterSet::Include(chars) = chars { + chars.iter().all(|c| c.is_alphabetic() || *c == '_') + } else { + false + } + }) +} diff --git a/src/build_tables/shrink_parse_table.rs b/src/build_tables/shrink_parse_table.rs index 026c3058..b943158f 100644 --- a/src/build_tables/shrink_parse_table.rs +++ b/src/build_tables/shrink_parse_table.rs @@ -166,8 +166,6 @@ fn merge_parse_state( } } - eprintln!("maybe merge {} {}", left, right); - let mut symbols_to_add = Vec::new(); for (symbol, right_entry) in &right_state.terminal_entries { if !left_state.terminal_entries.contains_key(&symbol) { diff --git a/src/build_tables/token_conflicts.rs b/src/build_tables/token_conflicts.rs index 09d5e97c..9f1c4426 100644 --- a/src/build_tables/token_conflicts.rs +++ b/src/build_tables/token_conflicts.rs @@ -4,7 +4,7 @@ use crate::nfa::{CharacterSet, NfaCursor}; use std::collections::HashSet; use std::fmt; -#[derive(Clone, Debug, Default)] +#[derive(Clone, Debug, Default, PartialEq, Eq)] struct TokenConflictStatus { does_overlap: bool, does_match_valid_continuation: bool, @@ -12,15 +12,16 @@ struct TokenConflictStatus { matches_same_string: bool, } -pub(crate) struct TokenConflictMap { +pub(crate) struct TokenConflictMap<'a> { n: usize, status_matrix: Vec, starting_chars_by_index: Vec, following_chars_by_index: Vec, + grammar: &'a LexicalGrammar, } -impl TokenConflictMap { - pub fn new(grammar: &LexicalGrammar, following_tokens: Vec) -> Self { +impl<'a> TokenConflictMap<'a> { + pub fn new(grammar: &'a LexicalGrammar, following_tokens: Vec) -> Self { let mut cursor = NfaCursor::new(&grammar.nfa, Vec::new()); let starting_chars = get_starting_chars(&mut cursor, grammar); let following_chars = get_following_chars(&starting_chars, following_tokens); @@ -40,9 +41,16 @@ impl TokenConflictMap { status_matrix, starting_chars_by_index: starting_chars, following_chars_by_index: following_chars, + grammar, } } + pub fn has_same_conflict_status(&self, a: usize, b: usize, other: usize) -> bool { + let left = &self.status_matrix[matrix_index(self.n, a, other)]; + let right = &self.status_matrix[matrix_index(self.n, b, other)]; + left == right + } + pub fn does_match_same_string(&self, i: usize, j: usize) -> bool { self.status_matrix[matrix_index(self.n, i, j)].matches_same_string } @@ -55,9 +63,28 @@ impl TokenConflictMap { pub fn does_overlap(&self, i: usize, j: usize) -> bool { self.status_matrix[matrix_index(self.n, i, j)].does_overlap } + + pub fn prefer_token(grammar: &LexicalGrammar, left: (i32, usize), right: (i32, usize)) -> bool { + if left.0 > right.0 { + return true; + } else if left.0 < right.0 { + return false; + } + + match ( + grammar.variables[left.1].is_string, + grammar.variables[right.1].is_string, + ) { + (true, false) => 
return true, + (false, true) => return false, + _ => {} + } + + left.0 < right.0 + } } -impl fmt::Debug for TokenConflictMap { +impl<'a> fmt::Debug for TokenConflictMap<'a> { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { write!(f, "TokenConflictMap {{\n")?; @@ -69,18 +96,22 @@ impl fmt::Debug for TokenConflictMap { write!(f, " following_characters: {{\n")?; for i in 0..self.n { - write!(f, " {}: {:?},\n", i, self.following_chars_by_index[i])?; + write!( + f, + " {}: {:?},\n", + self.grammar.variables[i].name, self.following_chars_by_index[i] + )?; } write!(f, " }},\n")?; write!(f, " status_matrix: {{\n")?; for i in 0..self.n { - write!(f, " {}: {{\n", i)?; + write!(f, " {}: {{\n", self.grammar.variables[i].name)?; for j in 0..self.n { write!( f, " {}: {:?},\n", - j, + self.grammar.variables[j].name, self.status_matrix[matrix_index(self.n, i, j)] )?; } @@ -101,7 +132,7 @@ fn get_starting_chars(cursor: &mut NfaCursor, grammar: &LexicalGrammar) -> Vec bool { - if left.0 > right.0 { - return true; - } else if left.0 < right.0 { - return false; - } - - match ( - grammar.variables[left.1].is_string, - grammar.variables[right.1].is_string, - ) { - (true, false) => return true, - (false, true) => return false, - _ => {} - } - - left.0 < right.0 -} - fn variable_ids_for_states<'a>( state_ids: &'a Vec, grammar: &'a LexicalGrammar, diff --git a/src/grammars.rs b/src/grammars.rs index 18da86d8..d23e8ca6 100644 --- a/src/grammars.rs +++ b/src/grammars.rs @@ -91,6 +91,7 @@ pub(crate) struct SyntaxGrammar { pub word_token: Option, } +#[cfg(test)] impl ProductionStep { pub(crate) fn new(symbol: Symbol) -> Self { Self { @@ -127,14 +128,6 @@ impl Production { pub fn first_symbol(&self) -> Option { self.steps.first().map(|s| s.symbol.clone()) } - - pub fn last_precedence(&self) -> i32 { - self.steps.last().map(|s| s.precedence).unwrap_or(0) - } - - pub fn last_associativity(&self) -> Option { - self.steps.last().map(|s| s.associativity).unwrap_or(None) - } } impl Default for Production { @@ -146,6 +139,7 @@ impl Default for Production { } } +#[cfg(test)] impl Variable { pub fn named(name: &str, rule: Rule) -> Self { Self { diff --git a/src/main.rs b/src/main.rs index c7ca2ca5..cd672186 100644 --- a/src/main.rs +++ b/src/main.rs @@ -42,7 +42,7 @@ fn main() -> error::Result<()> { ) .get_matches(); - if let Some(matches) = matches.subcommand_matches("generate") { + if let Some(_) = matches.subcommand_matches("generate") { let mut grammar_path = env::current_dir().expect("Failed to read CWD"); grammar_path.push("grammar.js"); let grammar_json = load_js_grammar_file(grammar_path); diff --git a/src/nfa.rs b/src/nfa.rs index ee39d178..e14dac44 100644 --- a/src/nfa.rs +++ b/src/nfa.rs @@ -40,7 +40,6 @@ impl Default for Nfa { pub struct NfaCursor<'a> { pub(crate) state_ids: Vec, nfa: &'a Nfa, - in_sep: bool, } impl CharacterSet { @@ -111,7 +110,7 @@ impl CharacterSet { CharacterSet::Exclude(other_chars) => { chars.retain(|c| other_chars.contains(&c)); CharacterSet::Exclude(chars) - }, + } }, } } @@ -311,7 +310,6 @@ impl<'a> NfaCursor<'a> { let mut result = Self { nfa, state_ids: Vec::new(), - in_sep: true, }; result.add_states(&mut states); result @@ -322,81 +320,59 @@ impl<'a> NfaCursor<'a> { self.add_states(&mut states); } - pub fn advance(&mut self, c: char) -> bool { - let mut result = false; - let mut new_state_ids = Vec::new(); - let mut any_sep_transitions = false; - for current_state_id in &self.state_ids { - if let NfaState::Advance { - chars, - state_id, - is_sep, - .. 
- } = &self.nfa.states[*current_state_id as usize] - { - if chars.contains(c) { - if *is_sep { - any_sep_transitions = true; - } - new_state_ids.push(*state_id); - result = true; - } - } - } - if !any_sep_transitions { - self.in_sep = false; - } - self.state_ids.clear(); - self.add_states(&mut new_state_ids); - result - } - - pub fn successors(&self) -> impl Iterator { + pub fn successors(&self) -> impl Iterator { self.state_ids.iter().filter_map(move |id| { if let NfaState::Advance { chars, state_id, precedence, - .. + is_sep, } = &self.nfa.states[*id as usize] { - Some((chars, *precedence, *state_id)) + Some((chars, *precedence, *state_id, *is_sep)) } else { None } }) } - pub fn grouped_successors(&self) -> Vec<(CharacterSet, i32, Vec)> { + pub fn grouped_successors(&self) -> Vec<(CharacterSet, i32, Vec, bool)> { Self::group_successors(self.successors()) } fn group_successors<'b>( - iter: impl Iterator, - ) -> Vec<(CharacterSet, i32, Vec)> { - let mut result: Vec<(CharacterSet, i32, Vec)> = Vec::new(); - for (chars, prec, state) in iter { + iter: impl Iterator, + ) -> Vec<(CharacterSet, i32, Vec, bool)> { + let mut result: Vec<(CharacterSet, i32, Vec, bool)> = Vec::new(); + for (chars, prec, state, is_sep) in iter { let mut chars = chars.clone(); let mut i = 0; while i < result.len() { - let intersection = result[i].0.remove_intersection(&mut chars); - if !intersection.is_empty() { - if result[i].0.is_empty() { - result[i].0 = intersection; - result[i].1 = max(result[i].1, prec); - result[i].2.push(state); - } else { + if result[i].0 == chars { + result[i].1 = max(result[i].1, prec); + result[i].2.push(state); + result[i].3 |= is_sep; + } else { + let intersection = result[i].0.remove_intersection(&mut chars); + if !intersection.is_empty() { let mut states = result[i].2.clone(); - let mut precedence = result[i].1; states.push(state); - result.insert(i, (intersection, max(precedence, prec), states)); + result.insert( + i, + ( + intersection, + max(result[i].1, prec), + states, + result[i].3 || is_sep, + ), + ); i += 1; } } i += 1; } if !chars.is_empty() { - result.push((chars, prec, vec![state])); + result.push((chars, prec, vec![state], is_sep)); } } result.sort_unstable_by(|a, b| a.0.cmp(&b.0)); @@ -417,10 +393,6 @@ impl<'a> NfaCursor<'a> { }) } - pub fn in_separator(&self) -> bool { - self.in_sep - } - pub fn add_states(&mut self, new_state_ids: &mut Vec) { let mut i = 0; while i < new_state_ids.len() { @@ -460,26 +432,31 @@ mod tests { let table = [ ( vec![ - (CharacterSet::empty().add_range('a', 'f'), 0, 1), - (CharacterSet::empty().add_range('d', 'i'), 1, 2), + (CharacterSet::empty().add_range('a', 'f'), 0, 1, false), + (CharacterSet::empty().add_range('d', 'i'), 1, 2, false), ], vec![ - (CharacterSet::empty().add_range('a', 'c'), 0, vec![1]), - (CharacterSet::empty().add_range('d', 'f'), 1, vec![1, 2]), - (CharacterSet::empty().add_range('g', 'i'), 1, vec![2]), + (CharacterSet::empty().add_range('a', 'c'), 0, vec![1], false), + ( + CharacterSet::empty().add_range('d', 'f'), + 1, + vec![1, 2], + false, + ), + (CharacterSet::empty().add_range('g', 'i'), 1, vec![2], false), ], ), ( vec![ - (CharacterSet::empty().add_range('a', 'z'), 0, 1), - (CharacterSet::empty().add_char('d'), 0, 2), - (CharacterSet::empty().add_char('i'), 0, 3), - (CharacterSet::empty().add_char('f'), 0, 4), + (CharacterSet::empty().add_range('a', 'z'), 0, 1, false), + (CharacterSet::empty().add_char('d'), 0, 2, false), + (CharacterSet::empty().add_char('i'), 0, 3, false), + (CharacterSet::empty().add_char('f'), 0, 
4, false), ], vec![ - (CharacterSet::empty().add_char('d'), 0, vec![1, 2]), - (CharacterSet::empty().add_char('f'), 0, vec![1, 4]), - (CharacterSet::empty().add_char('i'), 0, vec![1, 3]), + (CharacterSet::empty().add_char('d'), 0, vec![1, 2], false), + (CharacterSet::empty().add_char('f'), 0, vec![1, 4], false), + (CharacterSet::empty().add_char('i'), 0, vec![1, 3], false), ( CharacterSet::empty() .add_range('a', 'c') @@ -488,6 +465,7 @@ mod tests { .add_range('j', 'z'), 0, vec![1], + false, ), ], ), @@ -495,28 +473,10 @@ mod tests { for row in table.iter() { assert_eq!( - NfaCursor::group_successors(row.0.iter().map(|(c, p, s)| (c, *p, *s))), + NfaCursor::group_successors(row.0.iter().map(|(c, p, s, sep)| (c, *p, *s, *sep))), row.1 ); } - - // let successors = NfaCursor::group_successors( - // [ - // (&CharacterSet::empty().add_range('a', 'f'), 1), - // (&CharacterSet::empty().add_range('d', 'i'), 2), - // ] - // .iter() - // .cloned(), - // ); - // - // assert_eq!( - // successors, - // vec![ - // (CharacterSet::empty().add_range('a', 'c'), vec![1],), - // (CharacterSet::empty().add_range('d', 'f'), vec![1, 2],), - // (CharacterSet::empty().add_range('g', 'i'), vec![2],), - // ] - // ); } #[test] diff --git a/src/prepare_grammar/expand_tokens.rs b/src/prepare_grammar/expand_tokens.rs index 4ef17b27..fdf085f6 100644 --- a/src/prepare_grammar/expand_tokens.rs +++ b/src/prepare_grammar/expand_tokens.rs @@ -6,6 +6,7 @@ use crate::rules::Rule; use regex_syntax::ast::{ parse, Ast, Class, ClassPerlKind, ClassSet, ClassSetItem, RepetitionKind, RepetitionRange, }; +use std::i32; struct NfaBuilder { nfa: Nfa, @@ -17,7 +18,7 @@ fn is_string(rule: &Rule) -> bool { match rule { Rule::String(_) => true, Rule::Metadata { rule, .. } => is_string(rule), - _ => false + _ => false, } } @@ -346,7 +347,9 @@ impl NfaBuilder { fn push_split(&mut self, state_id: u32) { let last_state_id = self.nfa.last_state_id(); - self.nfa.states.push(NfaState::Split(state_id, last_state_id)); + self.nfa + .states + .push(NfaState::Split(state_id, last_state_id)); } fn add_precedence(&mut self, prec: i32, mut state_ids: Vec) { @@ -354,12 +357,12 @@ impl NfaBuilder { while i < state_ids.len() { let state_id = state_ids[i]; let (left, right) = match &mut self.nfa.states[state_id as usize] { - NfaState::Accept {precedence, ..} => { + NfaState::Accept { precedence, .. 
} => { *precedence = prec; return; - }, + } NfaState::Split(left, right) => (*left, *right), - _ => return + _ => return, }; if !state_ids.contains(&left) { state_ids.push(left); @@ -383,7 +386,7 @@ mod tests { let mut cursor = NfaCursor::new(&grammar.nfa, start_states); let mut result = None; - let mut result_precedence = 0; + let mut result_precedence = i32::MIN; let mut start_char = 0; let mut end_char = 0; for c in s.chars() { @@ -393,9 +396,14 @@ mod tests { result_precedence = precedence; } } - if cursor.advance(c) { + if let Some((_, _, next_states, in_sep)) = cursor + .grouped_successors() + .into_iter() + .find(|(chars, prec, _, _)| chars.contains(c) && *prec >= result_precedence) + { + cursor.reset(next_states); end_char += 1; - if cursor.in_separator() { + if in_sep { start_char = end_char; } } else { diff --git a/src/prepare_grammar/extract_tokens.rs b/src/prepare_grammar/extract_tokens.rs index eaeede90..5f3f6e16 100644 --- a/src/prepare_grammar/extract_tokens.rs +++ b/src/prepare_grammar/extract_tokens.rs @@ -1,6 +1,6 @@ use super::{ExtractedLexicalGrammar, ExtractedSyntaxGrammar, InternedGrammar}; use crate::error::{Error, Result}; -use crate::grammars::{ExternalToken, Variable}; +use crate::grammars::{ExternalToken, Variable, VariableType}; use crate::rules::{MetadataParams, Rule, Symbol, SymbolType}; use std::collections::HashMap; use std::mem; @@ -240,16 +240,21 @@ impl TokenExtractor { let index = self.extracted_variables.len(); let variable = if let Some(string_value) = string_value { - Variable::anonymous(string_value, rule.clone()) + Variable { + name: string_value.clone(), + kind: VariableType::Anonymous, + rule: rule.clone() + } } else { self.current_variable_token_count += 1; - Variable::auxiliary( - &format!( + Variable { + name: format!( "{}_token{}", &self.current_variable_name, self.current_variable_token_count ), - rule.clone(), - ) + kind: VariableType::Auxiliary, + rule: rule.clone(), + } }; self.extracted_variables.push(variable); diff --git a/src/render/mod.rs b/src/render/mod.rs index fc4cdafb..cbb8ba0d 100644 --- a/src/render/mod.rs +++ b/src/render/mod.rs @@ -2,6 +2,7 @@ use crate::grammars::{ExternalToken, LexicalGrammar, SyntaxGrammar, VariableType use crate::nfa::CharacterSet; use crate::rules::{Alias, AliasMap, Symbol, SymbolType}; use crate::tables::{LexState, LexTable, ParseAction, ParseTable, ParseTableEntry}; +use core::ops::Range; use std::collections::{HashMap, HashSet}; use std::fmt::Write; use std::mem::swap; @@ -12,11 +13,17 @@ macro_rules! add { }} } -macro_rules! add_line { - ($this: tt, $($arg: tt)*) => { +macro_rules! add_whitespace { + ($this: tt) => {{ for _ in 0..$this.indent_level { write!(&mut $this.buffer, " ").unwrap(); } + }}; +} + +macro_rules! 
add_line { + ($this: tt, $($arg: tt)*) => { + add_whitespace!($this); $this.buffer.write_fmt(format_args!($($arg)*)).unwrap(); $this.buffer += "\n"; } @@ -162,7 +169,7 @@ impl Generator { } } - add_line!(self, "#define LANGUAGE_VERSION {}", 6); + add_line!(self, "#define LANGUAGE_VERSION {}", 9); add_line!( self, "#define STATE_COUNT {}", @@ -352,7 +359,7 @@ impl Generator { add_line!( self, "ACCEPT_TOKEN({})", - self.symbol_ids[&accept_action.symbol] + self.symbol_ids[&Symbol::terminal(accept_action)] ); } @@ -360,9 +367,10 @@ impl Generator { for (characters, action) in state.advance_actions { let previous_length = self.buffer.len(); + add_whitespace!(self); add!(self, "if ("); if self.add_character_set_condition(&characters, &ruled_out_characters) { - add!(self, ")"); + add!(self, ")\n"); indent!(self); if action.in_main_token { add_line!(self, "ADVANCE({});", action.state); @@ -370,7 +378,7 @@ impl Generator { add_line!(self, "SKIP({});", action.state); } if let CharacterSet::Include(chars) = characters { - ruled_out_characters.extend(chars.iter()); + ruled_out_characters.extend(chars.iter().map(|c| *c as u32)); } dedent!(self); } else { @@ -384,9 +392,106 @@ impl Generator { fn add_character_set_condition( &mut self, characters: &CharacterSet, - ruled_out_characters: &HashSet, + ruled_out_characters: &HashSet, ) -> bool { - true + match characters { + CharacterSet::Include(chars) => { + let ranges = Self::get_ranges(chars, ruled_out_characters); + self.add_character_range_conditions(ranges, false) + } + CharacterSet::Exclude(chars) => { + let ranges = Self::get_ranges(chars, ruled_out_characters); + self.add_character_range_conditions(ranges, true) + } + } + } + + fn add_character_range_conditions( + &mut self, + ranges: impl Iterator>, + is_negated: bool, + ) -> bool { + let line_break = "\n "; + let mut did_add = false; + for range in ranges { + if is_negated { + if did_add { + add!(self, " &&{}", line_break); + } + if range.end == range.start { + add!(self, "lookahead != "); + self.add_character(range.start); + } else if range.end as u32 == range.start as u32 + 1 { + add!(self, "lookahead != "); + self.add_character(range.start); + add!(self, " &&{}lookahead != ", line_break); + self.add_character(range.end); + } else { + add!(self, "(lookahead < "); + self.add_character(range.start); + add!(self, " || "); + self.add_character(range.end); + add!(self, " < lookahead)"); + } + } else { + if did_add { + add!(self, " ||{}", line_break); + } + if range.end == range.start { + add!(self, "lookahead == "); + self.add_character(range.start); + } else if range.end as u32 == range.start as u32 + 1 { + add!(self, "lookahead == "); + self.add_character(range.start); + add!(self, " ||{}lookahead == ", line_break); + self.add_character(range.end); + } else { + add!(self, "("); + self.add_character(range.start); + add!(self, " <= lookahead && lookahead <= "); + self.add_character(range.end); + add!(self, ")"); + } + } + did_add = true; + } + did_add + } + + fn get_ranges<'a>( + chars: &'a Vec, + ruled_out_characters: &'a HashSet, + ) -> impl Iterator> + 'a { + let mut prev_range: Option> = None; + chars + .iter() + .cloned() + .chain(Some('\0')) + .filter_map(move |c| { + if ruled_out_characters.contains(&(c as u32)) { + return None; + } + if let Some(range) = prev_range.clone() { + if c == '\0' { + prev_range = Some(c..c); + return Some(range); + } + + let mut prev_range_successor = range.end as u32 + 1; + while prev_range_successor < c as u32 { + if 
!ruled_out_characters.contains(&prev_range_successor) { + prev_range = Some(c..c); + return Some(range); + } + prev_range_successor += 1; + } + prev_range = Some(range.start..c); + None + } else { + prev_range = Some(c..c); + None + } + }) } fn add_lex_modes_list(&mut self) { @@ -577,13 +682,6 @@ impl Generator { alias_sequence_id, .. } => { - if !self.symbol_ids.contains_key(&symbol) { - eprintln!( - "SYMBOL: {:?} {:?}", - symbol, - self.metadata_for_symbol(symbol) - ); - } add!(self, "REDUCE({}, {}", self.symbol_ids[&symbol], child_count); if dynamic_precedence != 0 { add!(self, ", .dynamic_precedence = {}", dynamic_precedence); @@ -785,7 +883,7 @@ impl Generator { { result.push(c); } else { - result += match c { + let replacement = match c { '~' => "TILDE", '`' => "BQUOTE", '!' => "BANG", @@ -821,7 +919,11 @@ impl Generator { '\r' => "CR", '\t' => "TAB", _ => continue, + }; + if !result.is_empty() && !result.ends_with("_") { + result.push('_'); } + result += replacement; } } result @@ -837,6 +939,21 @@ impl Generator { } result } + + fn add_character(&mut self, c: char) { + if c.is_ascii() { + match c { + '\'' => add!(self, "'\\''"), + '\\' => add!(self, "'\\\\'"), + '\t' => add!(self, "'\\t'"), + '\n' => add!(self, "'\\n'"), + '\r' => add!(self, "'\\r'"), + _ => add!(self, "'{}'", c), + } + } else { + add!(self, "{}", c as u32) + } + } } pub(crate) fn render_c_code( @@ -867,3 +984,49 @@ pub(crate) fn render_c_code( } .generate() } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_get_char_ranges() { + struct Row { + chars: Vec, + ruled_out_chars: Vec, + expected_ranges: Vec>, + } + + let table = [ + Row { + chars: vec!['a'], + ruled_out_chars: vec![], + expected_ranges: vec!['a'..'a'], + }, + Row { + chars: vec!['a', 'b', 'c', 'e', 'z'], + ruled_out_chars: vec![], + expected_ranges: vec!['a'..'c', 'e'..'e', 'z'..'z'], + }, + Row { + chars: vec!['a', 'b', 'c', 'e', 'h', 'z'], + ruled_out_chars: vec!['d', 'f', 'g'], + expected_ranges: vec!['a'..'h', 'z'..'z'], + }, + ]; + + for Row { + chars, + ruled_out_chars, + expected_ranges, + } in table.iter() + { + let ruled_out_chars = ruled_out_chars + .into_iter() + .map(|c: &char| *c as u32) + .collect(); + let ranges = Generator::get_ranges(chars, &ruled_out_chars).collect::>(); + assert_eq!(ranges, *expected_ranges); + } + } +} diff --git a/src/rules.rs b/src/rules.rs index 3bfd5181..77e50d3c 100644 --- a/src/rules.rs +++ b/src/rules.rs @@ -120,7 +120,10 @@ impl Rule { pub fn seq(rules: Vec) -> Self { Rule::Seq(rules) } +} +#[cfg(test)] +impl Rule { pub fn terminal(index: usize) -> Self { Rule::Symbol(Symbol::terminal(index)) } diff --git a/src/tables.rs b/src/tables.rs index 344c4816..1c125621 100644 --- a/src/tables.rs +++ b/src/tables.rs @@ -1,7 +1,6 @@ use crate::nfa::CharacterSet; use crate::rules::{Alias, Associativity, Symbol}; use std::collections::HashMap; -use std::ops::Range; pub(crate) type AliasSequenceId = usize; pub(crate) type ParseStateId = usize; @@ -50,21 +49,13 @@ pub(crate) struct ParseTable { #[derive(Clone, Debug, PartialEq, Eq)] pub(crate) struct AdvanceAction { pub state: LexStateId, - pub precedence: Range, pub in_main_token: bool, } -#[derive(Clone, Debug, PartialEq, Eq)] -pub(crate) struct AcceptTokenAction { - pub symbol: Symbol, - pub precedence: i32, - pub implicit_precedence: i32, -} - -#[derive(Clone, Debug, PartialEq, Eq)] +#[derive(Clone, Debug, Default, PartialEq, Eq)] pub(crate) struct LexState { - pub advance_actions: HashMap, - pub accept_action: Option, + pub advance_actions: 
Vec<(CharacterSet, AdvanceAction)>, + pub accept_action: Option, } #[derive(Debug, PartialEq, Eq)] From 3fbaff5e69a1bfd200a7c9979e52412b55a26ba0 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Wed, 2 Jan 2019 16:48:44 -0800 Subject: [PATCH 082/102] Fix various logic errors in parse table construction --- Cargo.lock | 18 ++++ Cargo.toml | 5 ++ src/build_tables/build_lex_table.rs | 116 +++++++++++++++++++++---- src/build_tables/build_parse_table.rs | 59 +++++++------ src/build_tables/coincident_tokens.rs | 38 ++++---- src/build_tables/item.rs | 4 +- src/build_tables/item_set_builder.rs | 2 +- src/build_tables/mod.rs | 44 +++++----- src/build_tables/shrink_parse_table.rs | 6 +- src/build_tables/token_conflicts.rs | 2 +- src/grammars.rs | 2 +- src/logger.rs | 29 +++++++ src/main.rs | 28 ++++-- src/nfa.rs | 26 ++++-- src/parse_grammar.rs | 4 +- src/prepare_grammar/expand_repeats.rs | 2 +- src/prepare_grammar/extract_tokens.rs | 2 +- src/prepare_grammar/process_inlines.rs | 2 +- src/render/mod.rs | 19 ++-- src/rules.rs | 2 +- src/tables.rs | 2 +- 21 files changed, 297 insertions(+), 115 deletions(-) create mode 100644 src/logger.rs diff --git a/Cargo.lock b/Cargo.lock index 538517f1..2312d362 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -76,6 +76,11 @@ dependencies = [ "constant_time_eq 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "byteorder" +version = "1.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" + [[package]] name = "cc" version = "1.0.25" @@ -212,6 +217,15 @@ dependencies = [ "regex 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "hashbrown" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "byteorder 1.2.7 (registry+https://github.com/rust-lang/crates.io-index)", + "scopeguard 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "ignore" version = "0.4.4" @@ -463,9 +477,11 @@ version = "0.1.0" dependencies = [ "clap 2.32.0 (registry+https://github.com/rust-lang/crates.io-index)", "dirs 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)", + "hashbrown 0.1.7 (registry+https://github.com/rust-lang/crates.io-index)", "ignore 0.4.4 (registry+https://github.com/rust-lang/crates.io-index)", "lazy_static 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)", "libloading 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)", + "log 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)", "regex-syntax 0.6.4 (registry+https://github.com/rust-lang/crates.io-index)", "rusqlite 0.14.0 (registry+https://github.com/rust-lang/crates.io-index)", "serde 1.0.80 (registry+https://github.com/rust-lang/crates.io-index)", @@ -737,6 +753,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" "checksum backtrace-sys 0.1.24 (registry+https://github.com/rust-lang/crates.io-index)" = "c66d56ac8dabd07f6aacdaf633f4b8262f5b3601a810a0dcddffd5c22c69daa0" "checksum bitflags 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)" = "228047a76f468627ca71776ecdebd732a3423081fcf5125585bcd7c49886ce12" "checksum blake2-rfc 0.2.18 (registry+https://github.com/rust-lang/crates.io-index)" = "5d6d530bdd2d52966a6d03b7a964add7ae1a288d25214066fd4b600f0f796400" +"checksum byteorder 1.2.7 (registry+https://github.com/rust-lang/crates.io-index)" = "94f88df23a25417badc922ab0f5716cc1330e87f71ddd9203b3a3ccd9cedf75d" "checksum cc 1.0.25 
(registry+https://github.com/rust-lang/crates.io-index)" = "f159dfd43363c4d08055a07703eb7a3406b0dac4d0584d96965a3262db3c9d16" "checksum cfg-if 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "082bb9b28e00d3c9d39cc03e64ce4cea0f1bb9b3fde493f0cbc008472d22bdf4" "checksum clap 2.32.0 (registry+https://github.com/rust-lang/crates.io-index)" = "b957d88f4b6a63b9d70d5f454ac8011819c6efa7727858f458ab71c756ce2d3e" @@ -753,6 +770,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" "checksum fuchsia-zircon 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "2e9763c69ebaae630ba35f74888db465e49e259ba1bc0eda7d06f4a067615d82" "checksum fuchsia-zircon-sys 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "3dcaa9ae7725d12cdb85b3ad99a434db70b468c09ded17e012d86b5c1010f7a7" "checksum globset 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)" = "4743617a7464bbda3c8aec8558ff2f9429047e025771037df561d383337ff865" +"checksum hashbrown 0.1.7 (registry+https://github.com/rust-lang/crates.io-index)" = "64b7d419d0622ae02fe5da6b9a5e1964b610a65bb37923b976aeebb6dbb8f86e" "checksum ignore 0.4.4 (registry+https://github.com/rust-lang/crates.io-index)" = "36ecfc5ad80f0b1226df948c562e2cddd446096be3f644c95106400eae8a5e01" "checksum indexmap 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)" = "7e81a7c05f79578dbc15793d8b619db9ba32b4577003ef3af1a91c416798c58d" "checksum itoa 0.4.3 (registry+https://github.com/rust-lang/crates.io-index)" = "1306f3464951f30e30d12373d31c79fbd52d236e5e896fd92f96ec7babbbe60b" diff --git a/Cargo.toml b/Cargo.toml index b29bc85e..29b10e17 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -9,6 +9,7 @@ lazy_static = "1.2.0" smallbitvec = "2.3.0" clap = "2.32" dirs = "1.0.2" +hashbrown = "0.1" ignore = "0.4.4" libloading = "0.5" rusqlite = "0.14.0" @@ -20,3 +21,7 @@ regex-syntax = "0.6.4" [dependencies.serde_json] version = "1.0" features = ["preserve_order"] + +[dependencies.log] +version = "0.4.6" +features = ["std"] diff --git a/src/build_tables/build_lex_table.rs b/src/build_tables/build_lex_table.rs index aa929d97..c002f427 100644 --- a/src/build_tables/build_lex_table.rs +++ b/src/build_tables/build_lex_table.rs @@ -2,10 +2,9 @@ use super::item::LookaheadSet; use super::token_conflicts::TokenConflictMap; use crate::grammars::{LexicalGrammar, SyntaxGrammar}; use crate::nfa::NfaCursor; -use crate::rules::Symbol; use crate::tables::{AdvanceAction, LexState, LexTable, ParseTable}; use std::collections::hash_map::Entry; -use std::collections::{HashMap, VecDeque}; +use std::collections::{BTreeMap, HashMap, VecDeque}; pub(crate) fn build_lex_table( parse_table: &mut ParseTable, @@ -16,15 +15,16 @@ pub(crate) fn build_lex_table( let keyword_lex_table; if syntax_grammar.word_token.is_some() { let mut builder = LexTableBuilder::new(lexical_grammar); - builder.add_state_for_tokens(keywords.iter()); + builder.add_state_for_tokens(keywords); keyword_lex_table = builder.table; } else { keyword_lex_table = LexTable::default(); } let mut builder = LexTableBuilder::new(lexical_grammar); - for state in parse_table.states.iter_mut() { - let tokens = state.terminal_entries.keys().filter_map(|token| { + for (i, state) in parse_table.states.iter_mut().enumerate() { + info!("populate lex state for parse state {}", i); + let tokens = LookaheadSet::with(state.terminal_entries.keys().filter_map(|token| { if token.is_terminal() { if keywords.contains(&token) { syntax_grammar.word_token @@ -34,11 +34,14 @@ pub(crate) fn build_lex_table( } else { 
None } - }); - state.lex_state_id = builder.add_state_for_tokens(tokens); + })); + state.lex_state_id = builder.add_state_for_tokens(&tokens); } - (builder.table, keyword_lex_table) + let mut table = builder.table; + shrink_lex_table(&mut table, parse_table); + + (table, keyword_lex_table) } struct LexTableBuilder<'a> { @@ -60,32 +63,49 @@ impl<'a> LexTableBuilder<'a> { } } - fn add_state_for_tokens(&mut self, tokens: impl Iterator) -> usize { + fn add_state_for_tokens(&mut self, tokens: &LookaheadSet) -> usize { let nfa_states = tokens + .iter() .map(|token| self.lexical_grammar.variables[token.index].start_state) .collect(); - let result = self.add_state(nfa_states); - while let Some((state_id, nfa_states)) = self.state_queue.pop_front() { + let (state_id, is_new) = self.add_state(nfa_states); + + if is_new { + info!( + "entry point state: {}, tokens: {:?}", + state_id, + tokens + .iter() + .map(|t| &self.lexical_grammar.variables[t.index].name) + .collect::>() + ); + } + + while let Some((state_id, nfa_states)) = self.state_queue.pop_back() { self.populate_state(state_id, nfa_states); } - result + state_id } - fn add_state(&mut self, nfa_states: Vec) -> usize { - match self.state_ids_by_nfa_state_set.entry(nfa_states) { - Entry::Occupied(o) => *o.get(), + fn add_state(&mut self, nfa_states: Vec) -> (usize, bool) { + self.cursor.reset(nfa_states); + match self + .state_ids_by_nfa_state_set + .entry(self.cursor.state_ids.clone()) + { + Entry::Occupied(o) => (*o.get(), false), Entry::Vacant(v) => { let state_id = self.table.states.len(); self.table.states.push(LexState::default()); self.state_queue.push_back((state_id, v.key().clone())); v.insert(state_id); - state_id + (state_id, true) } } } fn populate_state(&mut self, state_id: usize, nfa_states: Vec) { - self.cursor.reset(nfa_states); + self.cursor.force_reset(nfa_states); let mut completion = None; for (id, prec) in self.cursor.completions() { @@ -102,12 +122,16 @@ impl<'a> LexTableBuilder<'a> { } for (chars, advance_precedence, next_states, is_sep) in self.cursor.grouped_successors() { + info!( + "populate state: {}, characters: {:?}, precedence: {:?}", + state_id, chars, advance_precedence + ); if let Some((_, completed_precedence)) = completion { if advance_precedence < completed_precedence { continue; } } - let next_state_id = self.add_state(next_states); + let (next_state_id, _) = self.add_state(next_states); self.table.states[state_id].advance_actions.push(( chars, AdvanceAction { @@ -122,3 +146,59 @@ impl<'a> LexTableBuilder<'a> { } } } + +fn shrink_lex_table(table: &mut LexTable, parse_table: &mut ParseTable) { + let mut state_replacements = BTreeMap::new(); + let mut done = false; + while !done { + done = true; + for (i, state_i) in table.states.iter().enumerate() { + if state_replacements.contains_key(&i) { + continue; + } + for (j, state_j) in table.states.iter().enumerate() { + if state_replacements.contains_key(&j) { + continue; + } + if j == i { + break; + } + if state_i == state_j { + info!("replace state {} with state {}", i, j); + state_replacements.insert(i, j); + done = false; + } + } + } + for state in table.states.iter_mut() { + for advance_action in state.advance_actions.iter_mut() { + if let Some(new_state_id) = state_replacements.get(&advance_action.1.state) { + advance_action.1.state = *new_state_id; + } + } + } + } + + let final_state_replacements = (0..table.states.len()).into_iter().map(|state_id| { + let replacement = state_replacements.get(&state_id).cloned().unwrap_or(state_id); + let prior_removed = 
state_replacements.iter().take_while(|i| *i.0 < replacement).count(); + replacement - prior_removed + }).collect::>(); + + for state in parse_table.states.iter_mut() { + state.lex_state_id = final_state_replacements[state.lex_state_id]; + } + + for state in table.states.iter_mut() { + for advance_action in state.advance_actions.iter_mut() { + advance_action.1.state = final_state_replacements[advance_action.1.state]; + } + } + + let mut i = 0; + table.states.retain(|_| { + let result = !state_replacements.contains_key(&i); + i += 1; + result + }); +} diff --git a/src/build_tables/build_parse_table.rs b/src/build_tables/build_parse_table.rs index c17261dc..ada34dff 100644 --- a/src/build_tables/build_parse_table.rs +++ b/src/build_tables/build_parse_table.rs @@ -7,8 +7,11 @@ use crate::tables::{ AliasSequenceId, ParseAction, ParseState, ParseStateId, ParseTable, ParseTableEntry, }; use core::ops::Range; -use std::collections::hash_map::{DefaultHasher, Entry}; -use std::collections::{HashMap, HashSet, VecDeque}; +use hashbrown::hash_map::Entry; +use hashbrown::{HashMap, HashSet}; +use std::collections::hash_map::DefaultHasher; +use std::collections::VecDeque; + use std::fmt::Write; use std::hash::Hasher; @@ -43,9 +46,10 @@ impl<'a> ParseTableBuilder<'a> { // Ensure that the empty alias sequence has index 0. self.parse_table.alias_sequences.push(Vec::new()); - // Ensure that the error state has index 0. + // Add the error state at index 0. self.add_parse_state(&Vec::new(), &Vec::new(), ParseItemSet::default()); + // Add the starting state at index 1. self.add_parse_state( &Vec::new(), &Vec::new(), @@ -61,6 +65,8 @@ impl<'a> ParseTableBuilder<'a> { self.process_part_state_queue()?; self.populate_used_symbols(); + self.remove_precedences(); + Ok((self.parse_table, self.following_tokens)) } @@ -112,28 +118,9 @@ impl<'a> ParseTableBuilder<'a> { fn process_part_state_queue(&mut self) -> Result<()> { while let Some(entry) = self.parse_state_queue.pop_front() { - let debug = false; - - if debug { - println!( - "ITEM SET {}:\n{}", - entry.state_id, - self.item_sets_by_state_id[entry.state_id] - .display_with(&self.syntax_grammar, &self.lexical_grammar,) - ); - } - let item_set = self .item_set_builder .transitive_closure(&self.item_sets_by_state_id[entry.state_id]); - - if debug { - println!( - "TRANSITIVE CLOSURE:\n{}", - item_set.display_with(&self.syntax_grammar, &self.lexical_grammar) - ); - } - self.add_actions( entry.preceding_symbols, entry.preceding_auxiliary_symbols, @@ -527,6 +514,7 @@ impl<'a> ParseTableBuilder<'a> { } fn populate_used_symbols(&mut self) { + self.parse_table.symbols.push(Symbol::end()); let mut terminal_usages = vec![false; self.lexical_grammar.variables.len()]; let mut non_terminal_usages = vec![false; self.syntax_grammar.variables.len()]; let mut external_usages = vec![false; self.syntax_grammar.external_tokens.len()]; @@ -542,20 +530,39 @@ impl<'a> ParseTableBuilder<'a> { non_terminal_usages[symbol.index] = true; } } - self.parse_table.symbols.push(Symbol::end()); for (i, value) in terminal_usages.into_iter().enumerate() { if value { self.parse_table.symbols.push(Symbol::terminal(i)); } } + for (i, value) in external_usages.into_iter().enumerate() { + if value { + self.parse_table.symbols.push(Symbol::external(i)); + } + } for (i, value) in non_terminal_usages.into_iter().enumerate() { if value { self.parse_table.symbols.push(Symbol::non_terminal(i)); } } - for (i, value) in external_usages.into_iter().enumerate() { - if value { - 
self.parse_table.symbols.push(Symbol::external(i)); + } + + fn remove_precedences(&mut self) { + for state in self.parse_table.states.iter_mut() { + for (_, entry) in state.terminal_entries.iter_mut() { + for action in entry.actions.iter_mut() { + match action { + ParseAction::Reduce { + precedence, + associativity, + .. + } => { + *precedence = 0; + *associativity = None; + } + _ => {} + } + } } } } diff --git a/src/build_tables/coincident_tokens.rs b/src/build_tables/coincident_tokens.rs index 10707489..5f2bb3ec 100644 --- a/src/build_tables/coincident_tokens.rs +++ b/src/build_tables/coincident_tokens.rs @@ -1,36 +1,44 @@ +use crate::grammars::LexicalGrammar; use crate::rules::Symbol; use crate::tables::{ParseStateId, ParseTable}; -use std::collections::{HashMap, HashSet}; +use std::collections::HashSet; pub(crate) struct CoincidentTokenIndex { - entries: HashMap<(Symbol, Symbol), HashSet>, - empty: HashSet, + entries: Vec>, + n: usize, } impl CoincidentTokenIndex { - pub fn new(table: &ParseTable) -> Self { - let mut entries = HashMap::new(); + pub fn new(table: &ParseTable, lexical_grammar: &LexicalGrammar) -> Self { + let n = lexical_grammar.variables.len(); + let mut result = Self { + n, + entries: vec![HashSet::new(); n * n], + }; for (i, state) in table.states.iter().enumerate() { for symbol in state.terminal_entries.keys() { for other_symbol in state.terminal_entries.keys() { - entries - .entry((*symbol, *other_symbol)) - .or_insert(HashSet::new()) - .insert(i); + let index = result.index(*symbol, *other_symbol); + result.entries[index].insert(i); } } } - Self { - entries, - empty: HashSet::new(), - } + result } pub fn states_with(&self, a: Symbol, b: Symbol) -> &HashSet { - self.entries.get(&(a, b)).unwrap_or(&self.empty) + &self.entries[self.index(a, b)] } pub fn contains(&self, a: Symbol, b: Symbol) -> bool { - self.entries.contains_key(&(a, b)) + !self.entries[self.index(a, b)].is_empty() + } + + fn index(&self, a: Symbol, b: Symbol) -> usize { + if a.index < b.index { + a.index * self.n + b.index + } else { + b.index * self.n + a.index + } } } diff --git a/src/build_tables/item.rs b/src/build_tables/item.rs index 4cd2f643..511d7bef 100644 --- a/src/build_tables/item.rs +++ b/src/build_tables/item.rs @@ -112,7 +112,9 @@ impl LookaheadSet { return; } }; - vec.resize(other.index + 1, false); + if other.index >= vec.len() { + vec.resize(other.index + 1, false); + } vec.set(other.index, true); } diff --git a/src/build_tables/item_set_builder.rs b/src/build_tables/item_set_builder.rs index 5e61bfcc..5714e7e2 100644 --- a/src/build_tables/item_set_builder.rs +++ b/src/build_tables/item_set_builder.rs @@ -1,7 +1,7 @@ use super::item::{LookaheadSet, ParseItem, ParseItemSet}; use crate::grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar}; use crate::rules::Symbol; -use std::collections::{HashMap, HashSet}; +use hashbrown::{HashMap, HashSet}; #[derive(Clone, Debug, PartialEq, Eq)] struct TransitiveClosureAddition<'a> { diff --git a/src/build_tables/mod.rs b/src/build_tables/mod.rs index 8b3a2db4..207431dd 100644 --- a/src/build_tables/mod.rs +++ b/src/build_tables/mod.rs @@ -27,22 +27,14 @@ pub(crate) fn build_tables( let (mut parse_table, following_tokens) = build_parse_table(syntax_grammar, lexical_grammar, inlines)?; let token_conflict_map = TokenConflictMap::new(lexical_grammar, following_tokens); - - eprintln!("{:?}", token_conflict_map); - - let coincident_token_index = CoincidentTokenIndex::new(&parse_table); - let keywords = if let Some(word_token) = 
syntax_grammar.word_token { - identify_keywords( - lexical_grammar, - &parse_table, - word_token, - &token_conflict_map, - &coincident_token_index, - ) - } else { - LookaheadSet::new() - }; - + let coincident_token_index = CoincidentTokenIndex::new(&parse_table, lexical_grammar); + let keywords = identify_keywords( + lexical_grammar, + &parse_table, + syntax_grammar.word_token, + &token_conflict_map, + &coincident_token_index, + ); populate_error_state( &mut parse_table, syntax_grammar, @@ -123,10 +115,15 @@ fn populate_error_state( fn identify_keywords( lexical_grammar: &LexicalGrammar, parse_table: &ParseTable, - word_token: Symbol, + word_token: Option, token_conflict_map: &TokenConflictMap, coincident_token_index: &CoincidentTokenIndex, ) -> LookaheadSet { + if word_token.is_none() { + return LookaheadSet::new(); + } + + let word_token = word_token.unwrap(); let mut cursor = NfaCursor::new(&lexical_grammar.nfa, Vec::new()); // First find all of the candidate keyword tokens: tokens that start with @@ -137,6 +134,7 @@ fn identify_keywords( if all_chars_are_alphabetical(&cursor) && token_conflict_map.does_match_same_string(i, word_token.index) { + info!("Keywords - add candidate {}", lexical_grammar.variables[i].name); Some(Symbol::terminal(i)) } else { None @@ -150,8 +148,8 @@ fn identify_keywords( if other_token != *token && token_conflict_map.does_match_same_string(token.index, other_token.index) { - eprintln!( - "Exclude {} from keywords because it matches the same string as {}", + info!( + "Keywords - exclude {} because it matches the same string as {}", lexical_grammar.variables[token.index].name, lexical_grammar.variables[other_token.index].name ); @@ -189,8 +187,8 @@ fn identify_keywords( word_token.index, other_index, ) { - eprintln!( - "Exclude {} from keywords because of conflict with {}", + info!( + "Keywords - exclude {} because of conflict with {}", lexical_grammar.variables[token.index].name, lexical_grammar.variables[other_index].name ); @@ -198,8 +196,8 @@ fn identify_keywords( } } - eprintln!( - "Include {} in keywords", + info!( + "Keywords - include {}", lexical_grammar.variables[token.index].name, ); true diff --git a/src/build_tables/shrink_parse_table.rs b/src/build_tables/shrink_parse_table.rs index b943158f..33b72c32 100644 --- a/src/build_tables/shrink_parse_table.rs +++ b/src/build_tables/shrink_parse_table.rs @@ -2,7 +2,7 @@ use super::token_conflicts::TokenConflictMap; use crate::grammars::{SyntaxGrammar, VariableType}; use crate::rules::{AliasMap, Symbol}; use crate::tables::{ParseAction, ParseState, ParseTable, ParseTableEntry}; -use std::collections::{HashMap, HashSet}; +use hashbrown::{HashMap, HashSet}; pub(crate) fn shrink_parse_table( parse_table: &mut ParseTable, @@ -240,6 +240,10 @@ fn can_add_entry_to_state( fn remove_unused_states(parse_table: &mut ParseTable) { let mut state_usage_map = vec![false; parse_table.states.len()]; + + state_usage_map[0] = true; + state_usage_map[1] = true; + for state in &parse_table.states { for referenced_state in state.referenced_states() { state_usage_map[referenced_state] = true; diff --git a/src/build_tables/token_conflicts.rs b/src/build_tables/token_conflicts.rs index 9f1c4426..18a80484 100644 --- a/src/build_tables/token_conflicts.rs +++ b/src/build_tables/token_conflicts.rs @@ -1,7 +1,7 @@ use crate::build_tables::item::LookaheadSet; use crate::grammars::LexicalGrammar; use crate::nfa::{CharacterSet, NfaCursor}; -use std::collections::HashSet; +use hashbrown::HashSet; use std::fmt; #[derive(Clone, Debug, 
Default, PartialEq, Eq)] diff --git a/src/grammars.rs b/src/grammars.rs index d23e8ca6..7f587a8c 100644 --- a/src/grammars.rs +++ b/src/grammars.rs @@ -1,6 +1,6 @@ use crate::nfa::Nfa; use crate::rules::{Alias, Associativity, Rule, Symbol}; -use std::collections::HashMap; +use hashbrown::HashMap; #[derive(Clone, Copy, Debug, PartialEq, Eq)] pub(crate) enum VariableType { diff --git a/src/logger.rs b/src/logger.rs new file mode 100644 index 00000000..18df763d --- /dev/null +++ b/src/logger.rs @@ -0,0 +1,29 @@ +use log::{LevelFilter, Log, Metadata, Record}; + +struct Logger { + pub filter: Option, +} + +impl Log for Logger { + fn enabled(&self, _: &Metadata) -> bool { + true + } + + fn log(&self, record: &Record) { + eprintln!( + "[{}] {}", + record + .module_path() + .unwrap_or_default() + .trim_start_matches("rust_tree_sitter_cli::"), + record.args() + ); + } + + fn flush(&self) {} +} + +pub(crate) fn init() { + log::set_boxed_logger(Box::new(Logger { filter: None })).unwrap(); + log::set_max_level(LevelFilter::Info); +} diff --git a/src/main.rs b/src/main.rs index cd672186..a08922b7 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,20 +1,23 @@ #[macro_use] -extern crate serde_derive; -#[macro_use] -extern crate serde_json; -#[macro_use] extern crate lazy_static; +#[macro_use] +extern crate log; +#[macro_use] +extern crate serde_derive; +extern crate hashbrown; +extern crate serde_json; -use std::path::PathBuf; use clap::{App, Arg, SubCommand}; use std::env; use std::io::Write; +use std::path::PathBuf; use std::process::{Command, Stdio}; mod build_tables; mod error; mod generate; mod grammars; +mod logger; mod nfa; mod parse_grammar; mod prepare_grammar; @@ -27,7 +30,11 @@ fn main() -> error::Result<()> { .version("0.1") .author("Max Brunsfeld ") .about("Generates and tests parsers") - .subcommand(SubCommand::with_name("generate").about("Generate a parser")) + .subcommand( + SubCommand::with_name("generate") + .about("Generate a parser") + .arg(Arg::with_name("log").long("log")), + ) .subcommand( SubCommand::with_name("parse") .about("Parse a file") @@ -42,7 +49,11 @@ fn main() -> error::Result<()> { ) .get_matches(); - if let Some(_) = matches.subcommand_matches("generate") { + if let Some(matches) = matches.subcommand_matches("generate") { + if matches.is_present("log") { + logger::init(); + } + let mut grammar_path = env::current_dir().expect("Failed to read CWD"); grammar_path.push("grammar.js"); let grammar_json = load_js_grammar_file(grammar_path); @@ -70,7 +81,8 @@ fn load_js_grammar_file(grammar_path: PathBuf) -> String { "{}\nconsole.log(JSON.stringify(require(\"{}\"), null, 2));\n", js_prelude, grammar_path.to_str().unwrap() - ).expect("Failed to write to node's stdin"); + ) + .expect("Failed to write to node's stdin"); drop(node_stdin); let output = node_process .wait_with_output() diff --git a/src/nfa.rs b/src/nfa.rs index e14dac44..1c7ff53b 100644 --- a/src/nfa.rs +++ b/src/nfa.rs @@ -320,6 +320,10 @@ impl<'a> NfaCursor<'a> { self.add_states(&mut states); } + pub fn force_reset(&mut self, states: Vec) { + self.state_ids = states + } + pub fn successors(&self) -> impl Iterator { self.state_ids.iter().filter_map(move |id| { if let NfaState::Advance { @@ -352,16 +356,26 @@ impl<'a> NfaCursor<'a> { result[i].1 = max(result[i].1, prec); result[i].2.push(state); result[i].3 |= is_sep; - } else { - let intersection = result[i].0.remove_intersection(&mut chars); - if !intersection.is_empty() { - let mut states = result[i].2.clone(); - states.push(state); + chars = 
CharacterSet::empty(); + break; + } + + let intersection = result[i].0.remove_intersection(&mut chars); + if !intersection.is_empty() { + let mut states = result[i].2.clone(); + let max_prec = max(result[i].1, prec); + states.push(state); + if result[i].0.is_empty() { + result[i].0 = intersection; + result[i].1 = max_prec; + result[i].2 = states; + result[i].3 |= is_sep; + } else { result.insert( i, ( intersection, - max(result[i].1, prec), + max_prec, states, result[i].3 || is_sep, ), diff --git a/src/parse_grammar.rs b/src/parse_grammar.rs index 07396329..6808f402 100644 --- a/src/parse_grammar.rs +++ b/src/parse_grammar.rs @@ -133,7 +133,7 @@ mod tests { #[test] fn test_parse_grammar() { - let grammar = parse_grammar(&json!({ + let grammar = parse_grammar(r#"{ "name": "my_lang", "rules": { "file": { @@ -148,7 +148,7 @@ mod tests { "value": "foo" } } - }).to_string()).unwrap(); + }"#).unwrap(); assert_eq!(grammar.name, "my_lang"); assert_eq!(grammar.variables, vec![ diff --git a/src/prepare_grammar/expand_repeats.rs b/src/prepare_grammar/expand_repeats.rs index f3811c5f..4589bd11 100644 --- a/src/prepare_grammar/expand_repeats.rs +++ b/src/prepare_grammar/expand_repeats.rs @@ -1,7 +1,7 @@ use super::ExtractedSyntaxGrammar; use crate::grammars::{Variable, VariableType}; use crate::rules::{Rule, Symbol}; -use std::collections::HashMap; +use hashbrown::HashMap; use std::mem; struct Expander { diff --git a/src/prepare_grammar/extract_tokens.rs b/src/prepare_grammar/extract_tokens.rs index 5f3f6e16..115933ee 100644 --- a/src/prepare_grammar/extract_tokens.rs +++ b/src/prepare_grammar/extract_tokens.rs @@ -2,7 +2,7 @@ use super::{ExtractedLexicalGrammar, ExtractedSyntaxGrammar, InternedGrammar}; use crate::error::{Error, Result}; use crate::grammars::{ExternalToken, Variable, VariableType}; use crate::rules::{MetadataParams, Rule, Symbol, SymbolType}; -use std::collections::HashMap; +use hashbrown::HashMap; use std::mem; pub(super) fn extract_tokens( diff --git a/src/prepare_grammar/process_inlines.rs b/src/prepare_grammar/process_inlines.rs index 0d7f6827..24bbc14d 100644 --- a/src/prepare_grammar/process_inlines.rs +++ b/src/prepare_grammar/process_inlines.rs @@ -1,5 +1,5 @@ use crate::grammars::{InlinedProductionMap, Production, ProductionStep, SyntaxGrammar}; -use std::collections::HashMap; +use hashbrown::HashMap; #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] struct ProductionStepId { diff --git a/src/render/mod.rs b/src/render/mod.rs index cbb8ba0d..250218c1 100644 --- a/src/render/mod.rs +++ b/src/render/mod.rs @@ -1,9 +1,9 @@ use crate::grammars::{ExternalToken, LexicalGrammar, SyntaxGrammar, VariableType}; use crate::nfa::CharacterSet; use crate::rules::{Alias, AliasMap, Symbol, SymbolType}; -use crate::tables::{LexState, LexTable, ParseAction, ParseTable, ParseTableEntry}; +use crate::tables::{AdvanceAction, LexState, LexTable, ParseAction, ParseTable, ParseTableEntry}; use core::ops::Range; -use std::collections::{HashMap, HashSet}; +use hashbrown::{HashMap, HashSet}; use std::fmt::Write; use std::mem::swap; @@ -372,17 +372,14 @@ impl Generator { if self.add_character_set_condition(&characters, &ruled_out_characters) { add!(self, ")\n"); indent!(self); - if action.in_main_token { - add_line!(self, "ADVANCE({});", action.state); - } else { - add_line!(self, "SKIP({});", action.state); - } + self.add_advance_action(&action); if let CharacterSet::Include(chars) = characters { ruled_out_characters.extend(chars.iter().map(|c| *c as u32)); } dedent!(self); } else { 
self.buffer.truncate(previous_length); + self.add_advance_action(&action); } } @@ -494,6 +491,14 @@ impl Generator { }) } + fn add_advance_action(&mut self, action: &AdvanceAction) { + if action.in_main_token { + add_line!(self, "ADVANCE({});", action.state); + } else { + add_line!(self, "SKIP({});", action.state); + } + } + fn add_lex_modes_list(&mut self) { self.get_external_scanner_state_id(HashSet::new()); diff --git a/src/rules.rs b/src/rules.rs index 77e50d3c..ad16c632 100644 --- a/src/rules.rs +++ b/src/rules.rs @@ -1,4 +1,4 @@ -use std::collections::HashMap; +use hashbrown::HashMap; #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)] pub(crate) enum SymbolType { diff --git a/src/tables.rs b/src/tables.rs index 1c125621..21222135 100644 --- a/src/tables.rs +++ b/src/tables.rs @@ -1,6 +1,6 @@ use crate::nfa::CharacterSet; use crate::rules::{Alias, Associativity, Symbol}; -use std::collections::HashMap; +use hashbrown::HashMap; pub(crate) type AliasSequenceId = usize; pub(crate) type ParseStateId = usize; From 92d4fe419c291f48233a8cbcd5073111e2ebfaa7 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 3 Jan 2019 10:30:59 -0800 Subject: [PATCH 083/102] Fix character set intersection bugs --- src/nfa.rs | 159 +++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 125 insertions(+), 34 deletions(-) diff --git a/src/nfa.rs b/src/nfa.rs index 1c7ff53b..b746200f 100644 --- a/src/nfa.rs +++ b/src/nfa.rs @@ -149,14 +149,18 @@ impl CharacterSet { CharacterSet::Include(removed) } CharacterSet::Exclude(other_chars) => { - let removed = remove_chars(chars, other_chars, true); + let mut result_exclusion = chars.clone(); + result_exclusion.extend(other_chars.iter().cloned()); + result_exclusion.sort_unstable(); + result_exclusion.dedup(); + remove_chars(chars, other_chars, true); let mut included_characters = Vec::new(); let mut other_included_characters = Vec::new(); swap(&mut included_characters, other_chars); swap(&mut other_included_characters, chars); *self = CharacterSet::Include(included_characters); *other = CharacterSet::Include(other_included_characters); - CharacterSet::Exclude(removed) + CharacterSet::Exclude(result_exclusion) } }, } @@ -351,35 +355,24 @@ impl<'a> NfaCursor<'a> { for (chars, prec, state, is_sep) in iter { let mut chars = chars.clone(); let mut i = 0; - while i < result.len() { - if result[i].0 == chars { - result[i].1 = max(result[i].1, prec); - result[i].2.push(state); - result[i].3 |= is_sep; - chars = CharacterSet::empty(); - break; - } - + while i < result.len() && !chars.is_empty() { let intersection = result[i].0.remove_intersection(&mut chars); if !intersection.is_empty() { - let mut states = result[i].2.clone(); - let max_prec = max(result[i].1, prec); - states.push(state); + let mut intersection_states = result[i].2.clone(); + match intersection_states.binary_search(&state) { + Err(j) => intersection_states.insert(j, state), + _ => {} + } + let intersection_entry = ( + intersection, + max(result[i].1, prec), + intersection_states, + result[i].3 || is_sep, + ); if result[i].0.is_empty() { - result[i].0 = intersection; - result[i].1 = max_prec; - result[i].2 = states; - result[i].3 |= is_sep; + result[i] = intersection_entry; } else { - result.insert( - i, - ( - intersection, - max_prec, - states, - result[i].3 || is_sep, - ), - ); + result.insert(i, intersection_entry); i += 1; } } @@ -444,6 +437,7 @@ mod tests { #[test] fn test_group_successors() { let table = [ + // overlapping character classes ( vec![ 
(CharacterSet::empty().add_range('a', 'f'), 0, 1, false), @@ -460,6 +454,7 @@ mod tests { (CharacterSet::empty().add_range('g', 'i'), 1, vec![2], false), ], ), + // large character class followed by many individual characters ( vec![ (CharacterSet::empty().add_range('a', 'z'), 0, 1, false), @@ -483,6 +478,63 @@ mod tests { ), ], ), + // negated character class followed by an individual character + ( + vec![ + (CharacterSet::empty().add_char('0'), 0, 1, false), + (CharacterSet::empty().add_char('b'), 0, 2, false), + ( + CharacterSet::empty().add_range('a', 'f').negate(), + 0, + 3, + false, + ), + (CharacterSet::empty().add_char('c'), 0, 4, false), + ], + vec![ + (CharacterSet::empty().add_char('0'), 0, vec![1, 3], false), + (CharacterSet::empty().add_char('b'), 0, vec![2], false), + (CharacterSet::empty().add_char('c'), 0, vec![4], false), + ( + CharacterSet::empty() + .add_range('a', 'f') + .add_char('0') + .negate(), + 0, + vec![3], + false, + ), + ], + ), + // multiple negated character classes + ( + vec![ + (CharacterSet::Include(vec!['a']), 0, 1, false), + (CharacterSet::Exclude(vec!['a', 'b', 'c']), 0, 2, false), + (CharacterSet::Include(vec!['g']), 0, 6, false), + (CharacterSet::Exclude(vec!['d', 'e', 'f']), 0, 3, false), + (CharacterSet::Exclude(vec!['g', 'h', 'i']), 0, 4, false), + (CharacterSet::Include(vec!['g']), 0, 5, false), + ], + vec![ + (CharacterSet::Include(vec!['a']), 0, vec![1, 3, 4], false), + (CharacterSet::Include(vec!['g']), 0, vec![2, 3, 5, 6], false), + (CharacterSet::Include(vec!['b', 'c']), 0, vec![3, 4], false), + (CharacterSet::Include(vec!['h', 'i']), 0, vec![2, 3], false), + ( + CharacterSet::Include(vec!['d', 'e', 'f']), + 0, + vec![2, 4], + false, + ), + ( + CharacterSet::Exclude(vec!['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i']), + 0, + vec![2, 3, 4], + false, + ), + ], + ), ]; for row in table.iter() { @@ -495,8 +547,8 @@ mod tests { #[test] fn test_character_set_remove_intersection() { - // whitelist - whitelist - // both sets contain 'c', 'd', and 'f' + // A whitelist and an overlapping whitelist. + // Both sets contain 'c', 'd', and 'f' let mut a = CharacterSet::empty().add_range('a', 'f'); let mut b = CharacterSet::empty().add_range('c', 'h'); assert_eq!( @@ -515,8 +567,37 @@ mod tests { assert_eq!(a, CharacterSet::empty().add_range('a', 'b')); assert_eq!(b, CharacterSet::empty().add_range('g', 'h')); - // whitelist - blacklist - // both sets contain 'e', 'f', and 'm' + // A whitelist and a larger whitelist. + let mut a = CharacterSet::empty().add_char('c'); + let mut b = CharacterSet::empty().add_range('a', 'e'); + assert_eq!( + a.remove_intersection(&mut b), + CharacterSet::empty().add_char('c') + ); + assert_eq!(a, CharacterSet::empty()); + assert_eq!( + b, + CharacterSet::empty() + .add_range('a', 'b') + .add_range('d', 'e') + ); + + let mut a = CharacterSet::empty().add_char('c'); + let mut b = CharacterSet::empty().add_range('a', 'e'); + assert_eq!( + b.remove_intersection(&mut a), + CharacterSet::empty().add_char('c') + ); + assert_eq!(a, CharacterSet::empty()); + assert_eq!( + b, + CharacterSet::empty() + .add_range('a', 'b') + .add_range('d', 'e') + ); + + // A whitelist and an intersecting blacklist. 
+ // Both sets contain 'e', 'f', and 'm' let mut a = CharacterSet::empty() .add_range('c', 'h') .add_range('k', 'm'); @@ -545,16 +626,26 @@ mod tests { assert_eq!(a, CharacterSet::Include(vec!['c', 'd', 'g', 'h', 'k', 'l'])); assert_eq!(b, CharacterSet::empty().add_range('a', 'm').negate()); - // blacklist - blacklist - // both sets exclude 'c', 'd', and 'e' + // A blacklist and an overlapping blacklist. + // Both sets exclude 'c', 'd', and 'e' let mut a = CharacterSet::empty().add_range('a', 'e').negate(); let mut b = CharacterSet::empty().add_range('c', 'h').negate(); assert_eq!( a.remove_intersection(&mut b), - CharacterSet::Exclude(vec!['c', 'd', 'e']) + CharacterSet::empty().add_range('a', 'h').negate(), ); assert_eq!(a, CharacterSet::Include(vec!['f', 'g', 'h'])); assert_eq!(b, CharacterSet::Include(vec!['a', 'b'])); + + // A blacklist and a larger blacklist. + let mut a = CharacterSet::empty().add_range('b', 'c').negate(); + let mut b = CharacterSet::empty().add_range('a', 'd').negate(); + assert_eq!( + a.remove_intersection(&mut b), + CharacterSet::empty().add_range('a', 'd').negate(), + ); + assert_eq!(a, CharacterSet::empty().add_char('a').add_char('d')); + assert_eq!(b, CharacterSet::empty()); } #[test] From 82fda8929e0019f6ba676f659677e84000ae1632 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 3 Jan 2019 10:31:14 -0800 Subject: [PATCH 084/102] Add EOF actions to lex table --- src/build_tables/build_lex_table.rs | 97 +++++++++++++++++++++------ src/build_tables/coincident_tokens.rs | 11 +-- src/render/mod.rs | 19 +++--- src/rules.rs | 4 ++ src/tables.rs | 2 +- 5 files changed, 96 insertions(+), 37 deletions(-) diff --git a/src/build_tables/build_lex_table.rs b/src/build_tables/build_lex_table.rs index c002f427..66a4fe43 100644 --- a/src/build_tables/build_lex_table.rs +++ b/src/build_tables/build_lex_table.rs @@ -1,7 +1,8 @@ use super::item::LookaheadSet; use super::token_conflicts::TokenConflictMap; use crate::grammars::{LexicalGrammar, SyntaxGrammar}; -use crate::nfa::NfaCursor; +use crate::nfa::{CharacterSet, NfaCursor}; +use crate::rules::Symbol; use crate::tables::{AdvanceAction, LexState, LexTable, ParseTable}; use std::collections::hash_map::Entry; use std::collections::{BTreeMap, HashMap, VecDeque}; @@ -23,7 +24,6 @@ pub(crate) fn build_lex_table( let mut builder = LexTableBuilder::new(lexical_grammar); for (i, state) in parse_table.states.iter_mut().enumerate() { - info!("populate lex state for parse state {}", i); let tokens = LookaheadSet::with(state.terminal_entries.keys().filter_map(|token| { if token.is_terminal() { if keywords.contains(&token) { @@ -31,10 +31,13 @@ pub(crate) fn build_lex_table( } else { Some(*token) } + } else if token.is_eof() { + Some(*token) } else { None } })); + info!("populate lex state for parse state {}", i); state.lex_state_id = builder.add_state_for_tokens(&tokens); } @@ -44,12 +47,18 @@ pub(crate) fn build_lex_table( (table, keyword_lex_table) } +struct QueueEntry { + state_id: usize, + nfa_states: Vec, + eof_valid: bool, +} + struct LexTableBuilder<'a> { lexical_grammar: &'a LexicalGrammar, cursor: NfaCursor<'a>, table: LexTable, - state_queue: VecDeque<(usize, Vec)>, - state_ids_by_nfa_state_set: HashMap, usize>, + state_queue: VecDeque, + state_ids_by_nfa_state_set: HashMap<(Vec, bool), usize>, } impl<'a> LexTableBuilder<'a> { @@ -64,11 +73,19 @@ impl<'a> LexTableBuilder<'a> { } fn add_state_for_tokens(&mut self, tokens: &LookaheadSet) -> usize { + let mut eof_valid = false; let nfa_states = tokens .iter() - .map(|token| 
self.lexical_grammar.variables[token.index].start_state)
+            .filter_map(|token| {
+                if token.is_terminal() {
+                    Some(self.lexical_grammar.variables[token.index].start_state)
+                } else {
+                    eof_valid = true;
+                    None
+                }
+            })
             .collect();
-        let (state_id, is_new) = self.add_state(nfa_states);
+        let (state_id, is_new) = self.add_state(nfa_states, eof_valid);
 
         if is_new {
             info!(
@@ -81,32 +98,42 @@ impl<'a> LexTableBuilder<'a> {
             );
         }
 
-        while let Some((state_id, nfa_states)) = self.state_queue.pop_back() {
-            self.populate_state(state_id, nfa_states);
+        while let Some(QueueEntry {
+            state_id,
+            nfa_states,
+            eof_valid,
+        }) = self.state_queue.pop_front()
+        {
+            self.populate_state(state_id, nfa_states, eof_valid);
         }
 
         state_id
     }
 
-    fn add_state(&mut self, nfa_states: Vec<u32>) -> (usize, bool) {
+    fn add_state(&mut self, nfa_states: Vec<u32>, eof_valid: bool) -> (usize, bool) {
         self.cursor.reset(nfa_states);
         match self
             .state_ids_by_nfa_state_set
-            .entry(self.cursor.state_ids.clone())
+            .entry((self.cursor.state_ids.clone(), eof_valid))
         {
             Entry::Occupied(o) => (*o.get(), false),
             Entry::Vacant(v) => {
                 let state_id = self.table.states.len();
                 self.table.states.push(LexState::default());
-                self.state_queue.push_back((state_id, v.key().clone()));
+                self.state_queue.push_back(QueueEntry {
+                    state_id,
+                    nfa_states: v.key().0.clone(),
+                    eof_valid,
+                });
                 v.insert(state_id);
                 (state_id, true)
             }
         }
     }
 
-    fn populate_state(&mut self, state_id: usize, nfa_states: Vec<u32>) {
+    fn populate_state(&mut self, state_id: usize, nfa_states: Vec<u32>, eof_valid: bool) {
         self.cursor.force_reset(nfa_states);
+        // The EOF state is represented as an empty list of NFA states.
         let mut completion = None;
         for (id, prec) in self.cursor.completions() {
             if let Some((prev_id, prev_precedence)) = completion {
@@ -121,7 +148,24 @@ impl<'a> LexTableBuilder<'a> {
             completion = Some((id, prec));
         }
 
-        for (chars, advance_precedence, next_states, is_sep) in self.cursor.grouped_successors() {
+        info!("raw successors: {:?}", self.cursor.successors().collect::<Vec<_>>());
+        let successors = self.cursor.grouped_successors();
+
+        // If EOF is a valid lookahead token, add a transition predicated on the null
+        // character that leads to the empty set of NFA states.
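To make the EOF convention above concrete before the hunk continues: a minimal sketch, with `LexState`, `AdvanceAction`, and the symbol type replaced by simplified stand-ins (these shapes are assumptions for illustration, not the definitions in this patch), of a state that treats the null character as the EOF lookahead.

// Sketch only; these simplified shapes are assumptions, not this patch's types.
#[derive(Debug, PartialEq)]
struct AdvanceAction {
    state: usize,
    in_main_token: bool,
}

struct LexState {
    advance_actions: Vec<(char, AdvanceAction)>, // CharacterSet narrowed to one char
    accept_action: Option<&'static str>,         // Symbol narrowed to a plain name
}

fn main() {
    // State 1 plays the role of the EOF state: it has no outgoing transitions
    // (the "empty set of NFA states") and accepts the `end` symbol.
    let eof_state = LexState {
        advance_actions: Vec::new(),
        accept_action: Some("end"),
    };

    // A state in which EOF is valid lookahead advances into it on '\0'.
    let start = LexState {
        advance_actions: vec![('\0', AdvanceAction { state: 1, in_main_token: true })],
        accept_action: None,
    };

    assert_eq!(start.advance_actions[0].0, '\0');
    assert_eq!(eof_state.accept_action, Some("end"));
}

The patch's real implementation of this transition follows below.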
+ if eof_valid { + let (next_state_id, _) = self.add_state(Vec::new(), false); + info!("populate state: {}, character: EOF", state_id); + self.table.states[state_id].advance_actions.push(( + CharacterSet::empty().add_char('\0'), + AdvanceAction { + state: next_state_id, + in_main_token: true, + }, + )); + } + + for (chars, advance_precedence, next_states, is_sep) in successors { info!( "populate state: {}, characters: {:?}, precedence: {:?}", state_id, chars, advance_precedence @@ -131,7 +175,7 @@ impl<'a> LexTableBuilder<'a> { continue; } } - let (next_state_id, _) = self.add_state(next_states); + let (next_state_id, _) = self.add_state(next_states, eof_valid && is_sep); self.table.states[state_id].advance_actions.push(( chars, AdvanceAction { @@ -141,8 +185,10 @@ impl<'a> LexTableBuilder<'a> { )); } - if let Some((completion_index, _)) = completion { - self.table.states[state_id].accept_action = Some(completion_index); + if let Some((complete_id, _)) = completion { + self.table.states[state_id].accept_action = Some(Symbol::terminal(complete_id)); + } else if self.cursor.state_ids.is_empty() { + self.table.states[state_id].accept_action = Some(Symbol::end()); } } } @@ -179,11 +225,20 @@ fn shrink_lex_table(table: &mut LexTable, parse_table: &mut ParseTable) { } } - let final_state_replacements = (0..table.states.len()).into_iter().map(|state_id| { - let replacement = state_replacements.get(&state_id).cloned().unwrap_or(state_id); - let prior_removed = state_replacements.iter().take_while(|i| *i.0 < replacement).count(); - replacement - prior_removed - }).collect::>(); + let final_state_replacements = (0..table.states.len()) + .into_iter() + .map(|state_id| { + let replacement = state_replacements + .get(&state_id) + .cloned() + .unwrap_or(state_id); + let prior_removed = state_replacements + .iter() + .take_while(|i| *i.0 < replacement) + .count(); + replacement - prior_removed + }) + .collect::>(); for state in parse_table.states.iter_mut() { state.lex_state_id = final_state_replacements[state.lex_state_id]; diff --git a/src/build_tables/coincident_tokens.rs b/src/build_tables/coincident_tokens.rs index 5f2bb3ec..ac5931e1 100644 --- a/src/build_tables/coincident_tokens.rs +++ b/src/build_tables/coincident_tokens.rs @@ -1,10 +1,9 @@ use crate::grammars::LexicalGrammar; use crate::rules::Symbol; use crate::tables::{ParseStateId, ParseTable}; -use std::collections::HashSet; pub(crate) struct CoincidentTokenIndex { - entries: Vec>, + entries: Vec>, n: usize, } @@ -13,20 +12,22 @@ impl CoincidentTokenIndex { let n = lexical_grammar.variables.len(); let mut result = Self { n, - entries: vec![HashSet::new(); n * n], + entries: vec![Vec::new(); n * n], }; for (i, state) in table.states.iter().enumerate() { for symbol in state.terminal_entries.keys() { for other_symbol in state.terminal_entries.keys() { let index = result.index(*symbol, *other_symbol); - result.entries[index].insert(i); + if result.entries[index].last().cloned() != Some(i) { + result.entries[index].push(i); + } } } } result } - pub fn states_with(&self, a: Symbol, b: Symbol) -> &HashSet { + pub fn states_with(&self, a: Symbol, b: Symbol) -> &Vec { &self.entries[self.index(a, b)] } diff --git a/src/render/mod.rs b/src/render/mod.rs index 250218c1..624fa1e0 100644 --- a/src/render/mod.rs +++ b/src/render/mod.rs @@ -125,7 +125,7 @@ impl Generator { .symbols .iter() .filter(|symbol| { - if symbol.is_terminal() { + if symbol.is_terminal() || symbol.is_eof() { true } else if symbol.is_external() { 
self.syntax_grammar.external_tokens[symbol.index] @@ -359,7 +359,7 @@ impl Generator { add_line!( self, "ACCEPT_TOKEN({})", - self.symbol_ids[&Symbol::terminal(accept_action)] + self.symbol_ids[&accept_action] ); } @@ -462,18 +462,16 @@ impl Generator { let mut prev_range: Option> = None; chars .iter() - .cloned() - .chain(Some('\0')) - .filter_map(move |c| { + .map(|c| (*c, false)) + .chain(Some(('\0', true))) + .filter_map(move |(c, done)| { + if done { + return prev_range.clone(); + } if ruled_out_characters.contains(&(c as u32)) { return None; } if let Some(range) = prev_range.clone() { - if c == '\0' { - prev_range = Some(c..c); - return Some(range); - } - let mut prev_range_successor = range.end as u32 + 1; while prev_range_successor < c as u32 { if !ruled_out_characters.contains(&prev_range_successor) { @@ -948,6 +946,7 @@ impl Generator { fn add_character(&mut self, c: char) { if c.is_ascii() { match c { + '\0' => add!(self, "'\\0'"), '\'' => add!(self, "'\\''"), '\\' => add!(self, "'\\\\'"), '\t' => add!(self, "'\\t'"), diff --git a/src/rules.rs b/src/rules.rs index ad16c632..bd0340fc 100644 --- a/src/rules.rs +++ b/src/rules.rs @@ -162,6 +162,10 @@ impl Symbol { self.kind == SymbolType::External } + pub fn is_eof(&self) -> bool { + self.kind == SymbolType::End + } + pub fn non_terminal(index: usize) -> Self { Symbol { kind: SymbolType::NonTerminal, diff --git a/src/tables.rs b/src/tables.rs index 21222135..f400d25c 100644 --- a/src/tables.rs +++ b/src/tables.rs @@ -55,7 +55,7 @@ pub(crate) struct AdvanceAction { #[derive(Clone, Debug, Default, PartialEq, Eq)] pub(crate) struct LexState { pub advance_actions: Vec<(CharacterSet, AdvanceAction)>, - pub accept_action: Option, + pub accept_action: Option, } #[derive(Debug, PartialEq, Eq)] From 02ca84fb4ae339753f2742d69017bdb7c39dda44 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 3 Jan 2019 11:52:45 -0800 Subject: [PATCH 085/102] Add missing ';' in generated code --- src/render/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/render/mod.rs b/src/render/mod.rs index 624fa1e0..dd046c93 100644 --- a/src/render/mod.rs +++ b/src/render/mod.rs @@ -358,7 +358,7 @@ impl Generator { if let Some(accept_action) = state.accept_action { add_line!( self, - "ACCEPT_TOKEN({})", + "ACCEPT_TOKEN({});", self.symbol_ids[&accept_action] ); } From c0f48dff6f3128d94855826e63588847dfcabb61 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 3 Jan 2019 11:52:57 -0800 Subject: [PATCH 086/102] Fix incorrect NFA generation for string rules --- src/build_tables/build_lex_table.rs | 6 +-- src/prepare_grammar/expand_tokens.rs | 63 +++++++++++++++++++++++++++- 2 files changed, 63 insertions(+), 6 deletions(-) diff --git a/src/build_tables/build_lex_table.rs b/src/build_tables/build_lex_table.rs index 66a4fe43..6cd9a1ce 100644 --- a/src/build_tables/build_lex_table.rs +++ b/src/build_tables/build_lex_table.rs @@ -148,8 +148,8 @@ impl<'a> LexTableBuilder<'a> { completion = Some((id, prec)); } - info!("raw successors: {:?}", self.cursor.successors().collect::>()); let successors = self.cursor.grouped_successors(); + info!("populate state: {}, successors: {:?}", state_id, successors); // If EOF is a valid lookahead token, add a transition predicated on the null // character that leads to the empty set of NFA states. 
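For the string-rule fix in the expand_tokens.rs hunk below, a toy illustration of the chaining it restores: when building NFA states for a string in reverse, each character must advance to the state created for the following character, not to whichever state happens to have been created last. The names here are illustrative stand-ins, not this crate's API.

// Toy NFA: each state either accepts or advances on one character.
#[derive(Debug, PartialEq)]
enum State {
    Advance(char, usize),
    Accept,
}

fn states_for_string(s: &str) -> Vec<State> {
    let mut nfa = vec![State::Accept]; // state 0 accepts the whole string
    let mut next_state_id = 0;
    for c in s.chars().rev() {
        // The fix: advance to the previously-created state, then re-point
        // `next_state_id` at the state just created.
        nfa.push(State::Advance(c, next_state_id));
        next_state_id = nfa.len() - 1;
    }
    nfa
}

fn main() {
    // For "ab": state 2 ('a') -> state 1 ('b') -> state 0 (accept).
    assert_eq!(
        states_for_string("ab"),
        vec![State::Accept, State::Advance('b', 0), State::Advance('a', 1)]
    );
}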
@@ -166,10 +166,6 @@ impl<'a> LexTableBuilder<'a> { } for (chars, advance_precedence, next_states, is_sep) in successors { - info!( - "populate state: {}, characters: {:?}, precedence: {:?}", - state_id, chars, advance_precedence - ); if let Some((_, completed_precedence)) = completion { if advance_precedence < completed_precedence { continue; diff --git a/src/prepare_grammar/expand_tokens.rs b/src/prepare_grammar/expand_tokens.rs index fdf085f6..61b1897c 100644 --- a/src/prepare_grammar/expand_tokens.rs +++ b/src/prepare_grammar/expand_tokens.rs @@ -89,7 +89,8 @@ impl NfaBuilder { } Rule::String(s) => { for c in s.chars().rev() { - self.push_advance(CharacterSet::empty().add_char(c), self.nfa.last_state_id()); + self.push_advance(CharacterSet::empty().add_char(c), next_state_id); + next_state_id = self.nfa.last_state_id(); } Ok(s.len() > 0) } @@ -102,6 +103,8 @@ impl NfaBuilder { alternative_state_ids.push(next_state_id); } } + alternative_state_ids.sort_unstable(); + alternative_state_ids.dedup(); alternative_state_ids.retain(|i| *i != self.nfa.last_state_id()); for alternative_state_id in alternative_state_ids { self.push_split(alternative_state_id); @@ -542,6 +545,64 @@ mod tests { ("aeeeef", Some((2, "aeeee"))), ], }, + Row { + rules: vec![ + Rule::seq(vec![ + Rule::string("a"), + Rule::choice(vec![ + Rule::string("b"), + Rule::string("c"), + ]), + Rule::string("d"), + ]) + ], + separators: vec![], + examples: vec![ + ("abd", Some((0, "abd"))), + ("acd", Some((0, "acd"))), + ("abc", None), + ("ad", None), + ("d", None), + ("a", None), + ] + }, + // nested choices within sequences + Row { + rules: vec![ + Rule::seq(vec![ + Rule::pattern("[0-9]+"), + Rule::choice(vec![ + Rule::Blank, + Rule::choice(vec![ + Rule::seq(vec![ + Rule::choice(vec![ + Rule::string("e"), + Rule::string("E") + ]), + Rule::choice(vec![ + Rule::Blank, + Rule::choice(vec![ + Rule::string("+"), + Rule::string("-"), + ]) + ]), + Rule::pattern("[0-9]+"), + ]) + ]) + ]), + ]), + ], + separators: vec![], + examples: vec![ + ("12", Some((0, "12"))), + ("12e", Some((0, "12"))), + ("12g", Some((0, "12"))), + ("12e3", Some((0, "12e3"))), + ("12e+", Some((0, "12"))), + ("12E+34 +", Some((0, "12E+34"))), + ("12e34", Some((0, "12e34"))), + ], + }, ]; for Row { From 70f00d1a1e2e82582c576605d7f3e10c01345511 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 3 Jan 2019 13:49:37 -0800 Subject: [PATCH 087/102] Give immediate tokens higher implicit precedence than other tokens --- src/build_tables/token_conflicts.rs | 17 ++++++++--------- src/grammars.rs | 2 +- src/prepare_grammar/expand_tokens.rs | 16 +++++++++++----- src/prepare_grammar/extract_simple_aliases.rs | 6 +++--- 4 files changed, 23 insertions(+), 18 deletions(-) diff --git a/src/build_tables/token_conflicts.rs b/src/build_tables/token_conflicts.rs index 18a80484..91edadec 100644 --- a/src/build_tables/token_conflicts.rs +++ b/src/build_tables/token_conflicts.rs @@ -2,6 +2,7 @@ use crate::build_tables::item::LookaheadSet; use crate::grammars::LexicalGrammar; use crate::nfa::{CharacterSet, NfaCursor}; use hashbrown::HashSet; +use std::cmp::Ordering; use std::fmt; #[derive(Clone, Debug, Default, PartialEq, Eq)] @@ -71,16 +72,14 @@ impl<'a> TokenConflictMap<'a> { return false; } - match ( - grammar.variables[left.1].is_string, - grammar.variables[right.1].is_string, - ) { - (true, false) => return true, - (false, true) => return false, - _ => {} + match grammar.variables[left.1] + .implicit_precedence + .cmp(&grammar.variables[right.1].implicit_precedence) + { + 
Ordering::Less => false, + Ordering::Greater => true, + Ordering::Equal => left.1 < right.1, } - - left.0 < right.0 } } diff --git a/src/grammars.rs b/src/grammars.rs index 7f587a8c..f82d6b02 100644 --- a/src/grammars.rs +++ b/src/grammars.rs @@ -36,7 +36,7 @@ pub(crate) struct InputGrammar { pub(crate) struct LexicalVariable { pub name: String, pub kind: VariableType, - pub is_string: bool, + pub implicit_precedence: i32, pub start_state: u32, } diff --git a/src/prepare_grammar/expand_tokens.rs b/src/prepare_grammar/expand_tokens.rs index 61b1897c..6520c432 100644 --- a/src/prepare_grammar/expand_tokens.rs +++ b/src/prepare_grammar/expand_tokens.rs @@ -14,11 +14,17 @@ struct NfaBuilder { precedence_stack: Vec, } -fn is_string(rule: &Rule) -> bool { +fn get_implicit_precedence(rule: &Rule) -> i32 { match rule { - Rule::String(_) => true, - Rule::Metadata { rule, .. } => is_string(rule), - _ => false, + Rule::String(_) => 1, + Rule::Metadata { rule, params } => { + if params.is_main_token { + get_implicit_precedence(rule) + 2 + } else { + get_implicit_precedence(rule) + } + } + _ => 0, } } @@ -67,7 +73,7 @@ pub(crate) fn expand_tokens(mut grammar: ExtractedLexicalGrammar) -> Result Date: Thu, 3 Jan 2019 13:49:50 -0800 Subject: [PATCH 088/102] Fix logic for identifying error recovery tokens --- src/build_tables/coincident_tokens.rs | 46 ++- src/build_tables/mod.rs | 54 ++- src/build_tables/shrink_parse_table.rs | 464 +++++++++++++------------ 3 files changed, 311 insertions(+), 253 deletions(-) diff --git a/src/build_tables/coincident_tokens.rs b/src/build_tables/coincident_tokens.rs index ac5931e1..62295073 100644 --- a/src/build_tables/coincident_tokens.rs +++ b/src/build_tables/coincident_tokens.rs @@ -1,23 +1,26 @@ use crate::grammars::LexicalGrammar; use crate::rules::Symbol; use crate::tables::{ParseStateId, ParseTable}; +use std::fmt; -pub(crate) struct CoincidentTokenIndex { +pub(crate) struct CoincidentTokenIndex<'a> { entries: Vec>, + grammar: &'a LexicalGrammar, n: usize, } -impl CoincidentTokenIndex { - pub fn new(table: &ParseTable, lexical_grammar: &LexicalGrammar) -> Self { +impl<'a> CoincidentTokenIndex<'a> { + pub fn new(table: &ParseTable, lexical_grammar: &'a LexicalGrammar) -> Self { let n = lexical_grammar.variables.len(); let mut result = Self { n, + grammar: lexical_grammar, entries: vec![Vec::new(); n * n], }; for (i, state) in table.states.iter().enumerate() { for symbol in state.terminal_entries.keys() { for other_symbol in state.terminal_entries.keys() { - let index = result.index(*symbol, *other_symbol); + let index = result.index(symbol.index, other_symbol.index); if result.entries[index].last().cloned() != Some(i) { result.entries[index].push(i); } @@ -28,18 +31,41 @@ impl CoincidentTokenIndex { } pub fn states_with(&self, a: Symbol, b: Symbol) -> &Vec { - &self.entries[self.index(a, b)] + &self.entries[self.index(a.index, b.index)] } pub fn contains(&self, a: Symbol, b: Symbol) -> bool { - !self.entries[self.index(a, b)].is_empty() + !self.entries[self.index(a.index, b.index)].is_empty() } - fn index(&self, a: Symbol, b: Symbol) -> usize { - if a.index < b.index { - a.index * self.n + b.index + fn index(&self, a: usize, b: usize) -> usize { + if a < b { + a * self.n + b } else { - b.index * self.n + a.index + b * self.n + a } } } + +impl<'a> fmt::Debug for CoincidentTokenIndex<'a> { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "CoincidentTokenIndex {{\n")?; + + write!(f, " entries: {{\n")?; + for i in 0..self.n { + write!(f, " {}: {{\n", 
self.grammar.variables[i].name)?; + for j in 0..self.n { + write!( + f, + " {}: {:?},\n", + self.grammar.variables[j].name, + self.entries[self.index(i, j)].len() + )?; + } + write!(f, " }},\n")?; + } + write!(f, " }},")?; + write!(f, "}}")?; + Ok(()) + } +} diff --git a/src/build_tables/mod.rs b/src/build_tables/mod.rs index 207431dd..84659600 100644 --- a/src/build_tables/mod.rs +++ b/src/build_tables/mod.rs @@ -47,6 +47,7 @@ pub(crate) fn build_tables( syntax_grammar, simple_aliases, &token_conflict_map, + &keywords, ); let (main_lex_table, keyword_lex_table) = build_lex_table(&mut parse_table, syntax_grammar, lexical_grammar, &keywords); @@ -67,15 +68,22 @@ fn populate_error_state( ) { let state = &mut parse_table.states[0]; let n = lexical_grammar.variables.len(); + + // First identify the *conflict-free tokens*: tokens that do not overlap with + // any other token in any way. let conflict_free_tokens = LookaheadSet::with((0..n).into_iter().filter_map(|i| { - let conflicts_with_other_tokens = (0..n).into_iter().all(|j| { - j == i - || coincident_token_index.contains(Symbol::terminal(i), Symbol::terminal(j)) - || !token_conflict_map.does_conflict(i, j) + let conflicts_with_other_tokens = (0..n).into_iter().any(|j| { + j != i + && !coincident_token_index.contains(Symbol::terminal(i), Symbol::terminal(j)) + && token_conflict_map.does_conflict(i, j) }); if conflicts_with_other_tokens { None } else { + info!( + "error recovery - token {} has no conflicts", + lexical_grammar.variables[i].name + ); Some(Symbol::terminal(i)) } })); @@ -85,19 +93,32 @@ fn populate_error_state( actions: vec![ParseAction::Recover], }; + // Exclude from the error-recovery state any token that conflicts with one of + // the *conflict-free tokens* identified above. for i in 0..n { let symbol = Symbol::terminal(i); - let can_be_used_for_recovery = conflict_free_tokens.contains(&symbol) - || conflict_free_tokens.iter().all(|t| { - coincident_token_index.contains(symbol, t) - || !token_conflict_map.does_conflict(i, t.index) - }); - if can_be_used_for_recovery { - state - .terminal_entries - .entry(symbol) - .or_insert_with(|| recover_entry.clone()); + if !conflict_free_tokens.contains(&symbol) { + if syntax_grammar.word_token != Some(symbol) { + if let Some(t) = conflict_free_tokens.iter().find(|t| { + !coincident_token_index.contains(symbol, *t) + && token_conflict_map.does_conflict(symbol.index, t.index) + }) { + info!( + "error recovery - exclude token {} because of conflict with {}", + lexical_grammar.variables[i].name, lexical_grammar.variables[t.index].name + ); + continue; + } + } } + info!( + "error recovery - include token {}", + lexical_grammar.variables[i].name + ); + state + .terminal_entries + .entry(symbol) + .or_insert_with(|| recover_entry.clone()); } for (i, external_token) in syntax_grammar.external_tokens.iter().enumerate() { @@ -134,7 +155,10 @@ fn identify_keywords( if all_chars_are_alphabetical(&cursor) && token_conflict_map.does_match_same_string(i, word_token.index) { - info!("Keywords - add candidate {}", lexical_grammar.variables[i].name); + info!( + "Keywords - add candidate {}", + lexical_grammar.variables[i].name + ); Some(Symbol::terminal(i)) } else { None diff --git a/src/build_tables/shrink_parse_table.rs b/src/build_tables/shrink_parse_table.rs index 33b72c32..64a4b259 100644 --- a/src/build_tables/shrink_parse_table.rs +++ b/src/build_tables/shrink_parse_table.rs @@ -1,3 +1,4 @@ +use super::item::LookaheadSet; use super::token_conflicts::TokenConflictMap; use 
crate::grammars::{SyntaxGrammar, VariableType}; use crate::rules::{AliasMap, Symbol}; @@ -9,265 +10,272 @@ pub(crate) fn shrink_parse_table( syntax_grammar: &SyntaxGrammar, simple_aliases: &AliasMap, token_conflict_map: &TokenConflictMap, + keywords: &LookaheadSet, ) { - remove_unit_reductions(parse_table, syntax_grammar, simple_aliases); - merge_compatible_states(parse_table, syntax_grammar, token_conflict_map); - remove_unused_states(parse_table); + let mut optimizer = Optimizer { + parse_table, + syntax_grammar, + token_conflict_map, + keywords, + simple_aliases, + }; + optimizer.remove_unit_reductions(); + optimizer.merge_compatible_states(); + optimizer.remove_unused_states(); } -fn remove_unit_reductions( - parse_table: &mut ParseTable, - syntax_grammar: &SyntaxGrammar, - simple_aliases: &AliasMap, -) { - let mut aliased_symbols = HashSet::new(); - for variable in &syntax_grammar.variables { - for production in &variable.productions { - for step in &production.steps { - if step.alias.is_some() { - aliased_symbols.insert(step.symbol); +struct Optimizer<'a> { + parse_table: &'a mut ParseTable, + syntax_grammar: &'a SyntaxGrammar, + token_conflict_map: &'a TokenConflictMap<'a>, + keywords: &'a LookaheadSet, + simple_aliases: &'a AliasMap, +} + +impl<'a> Optimizer<'a> { + fn remove_unit_reductions(&mut self) { + let mut aliased_symbols = HashSet::new(); + for variable in &self.syntax_grammar.variables { + for production in &variable.productions { + for step in &production.steps { + if step.alias.is_some() { + aliased_symbols.insert(step.symbol); + } } } } + + let mut unit_reduction_symbols_by_state = HashMap::new(); + for (i, state) in self.parse_table.states.iter().enumerate() { + let mut only_unit_reductions = true; + let mut unit_reduction_symbol = None; + for (_, entry) in &state.terminal_entries { + for action in &entry.actions { + match action { + ParseAction::ShiftExtra => continue, + ParseAction::Reduce { + child_count: 1, + alias_sequence_id: 0, + symbol, + .. + } => { + if !self.simple_aliases.contains_key(&symbol) + && !aliased_symbols.contains(&symbol) + && self.syntax_grammar.variables[symbol.index].kind + != VariableType::Named + && (unit_reduction_symbol.is_none() + || unit_reduction_symbol == Some(symbol)) + { + unit_reduction_symbol = Some(symbol); + continue; + } + } + _ => {} + } + only_unit_reductions = false; + break; + } + + if !only_unit_reductions { + break; + } + } + + if let Some(symbol) = unit_reduction_symbol { + if only_unit_reductions { + unit_reduction_symbols_by_state.insert(i, *symbol); + } + } + } + + for state in self.parse_table.states.iter_mut() { + let mut done = false; + while !done { + done = true; + state.update_referenced_states(|other_state_id, state| { + if let Some(symbol) = unit_reduction_symbols_by_state.get(&other_state_id) { + done = false; + state.nonterminal_entries[symbol] + } else { + other_state_id + } + }) + } + } } - let mut unit_reduction_symbols_by_state = HashMap::new(); - for (i, state) in parse_table.states.iter().enumerate() { - let mut only_unit_reductions = true; - let mut unit_reduction_symbol = None; - for (_, entry) in &state.terminal_entries { - for action in &entry.actions { - match action { - ParseAction::ShiftExtra => continue, - ParseAction::Reduce { - child_count: 1, - alias_sequence_id: 0, - symbol, - .. 
- } => { - if !simple_aliases.contains_key(&symbol) - && !aliased_symbols.contains(&symbol) - && syntax_grammar.variables[symbol.index].kind != VariableType::Named - && (unit_reduction_symbol.is_none() - || unit_reduction_symbol == Some(symbol)) - { - unit_reduction_symbol = Some(symbol); + fn merge_compatible_states(&mut self) { + let mut state_ids_by_signature = HashMap::new(); + for (i, state) in self.parse_table.states.iter().enumerate() { + state_ids_by_signature + .entry(state.unfinished_item_signature) + .or_insert(Vec::new()) + .push(i); + } + + let mut deleted_states = HashSet::new(); + loop { + let mut state_replacements = HashMap::new(); + for (_, state_ids) in &state_ids_by_signature { + for i in state_ids { + for j in state_ids { + if j == i { + break; + } + if deleted_states.contains(j) || deleted_states.contains(i) { continue; } + if self.merge_parse_state(*j, *i) { + deleted_states.insert(*i); + state_replacements.insert(*i, *j); + } } - _ => {} } - only_unit_reductions = false; + } + + if state_replacements.is_empty() { break; } - if !only_unit_reductions { - break; - } - } - - if let Some(symbol) = unit_reduction_symbol { - if only_unit_reductions { - unit_reduction_symbols_by_state.insert(i, *symbol); + for state in self.parse_table.states.iter_mut() { + state.update_referenced_states(|other_state_id, _| { + *state_replacements + .get(&other_state_id) + .unwrap_or(&other_state_id) + }); } } } - for state in parse_table.states.iter_mut() { - let mut done = false; - while !done { - done = true; - state.update_referenced_states(|other_state_id, state| { - if let Some(symbol) = unit_reduction_symbols_by_state.get(&other_state_id) { - done = false; - state.nonterminal_entries[symbol] - } else { - other_state_id - } - }) - } - } -} + fn merge_parse_state(&mut self, left: usize, right: usize) -> bool { + let left_state = &self.parse_table.states[left]; + let right_state = &self.parse_table.states[right]; -fn merge_compatible_states( - parse_table: &mut ParseTable, - syntax_grammar: &SyntaxGrammar, - token_conflict_map: &TokenConflictMap, -) { - let mut state_ids_by_signature = HashMap::new(); - for (i, state) in parse_table.states.iter().enumerate() { - state_ids_by_signature - .entry(state.unfinished_item_signature) - .or_insert(Vec::new()) - .push(i); - } - - let mut deleted_states = HashSet::new(); - loop { - let mut state_replacements = HashMap::new(); - for (_, state_ids) in &state_ids_by_signature { - for i in state_ids { - for j in state_ids { - if j == i { - break; - } - if deleted_states.contains(j) || deleted_states.contains(i) { - continue; - } - if merge_parse_state(syntax_grammar, token_conflict_map, parse_table, *j, *i) { - deleted_states.insert(*i); - state_replacements.insert(*i, *j); - } - } - } - } - - if state_replacements.is_empty() { - break; - } - - for state in parse_table.states.iter_mut() { - state.update_referenced_states(|other_state_id, _| { - *state_replacements - .get(&other_state_id) - .unwrap_or(&other_state_id) - }); - } - } -} - -fn merge_parse_state( - syntax_grammar: &SyntaxGrammar, - token_conflict_map: &TokenConflictMap, - parse_table: &mut ParseTable, - left: usize, - right: usize, -) -> bool { - let left_state = &parse_table.states[left]; - let right_state = &parse_table.states[right]; - - if left_state.nonterminal_entries != right_state.nonterminal_entries { - return false; - } - - for (symbol, left_entry) in &left_state.terminal_entries { - if let Some(right_entry) = right_state.terminal_entries.get(symbol) { - if right_entry.actions != 
left_entry.actions { - return false; - } - } else if !can_add_entry_to_state( - syntax_grammar, - token_conflict_map, - right_state, - *symbol, - left_entry, - ) { + if left_state.nonterminal_entries != right_state.nonterminal_entries { return false; } - } - let mut symbols_to_add = Vec::new(); - for (symbol, right_entry) in &right_state.terminal_entries { - if !left_state.terminal_entries.contains_key(&symbol) { - if !can_add_entry_to_state( - syntax_grammar, - token_conflict_map, - left_state, - *symbol, - right_entry, - ) { - return false; - } - symbols_to_add.push(*symbol); - } - } - - for symbol in symbols_to_add { - let entry = parse_table.states[right].terminal_entries[&symbol].clone(); - parse_table.states[left] - .terminal_entries - .insert(symbol, entry); - } - - true -} - -fn can_add_entry_to_state( - syntax_grammar: &SyntaxGrammar, - token_conflict_map: &TokenConflictMap, - state: &ParseState, - token: Symbol, - entry: &ParseTableEntry, -) -> bool { - // Do not add external tokens; they could conflict lexically with any of the state's - // existing lookahead tokens. - if token.is_external() { - return false; - } - - // Only merge parse states by allowing existing reductions to happen - // with additional lookahead tokens. Do not alter parse states in ways - // that allow entirely new types of actions to happen. - if state.terminal_entries.iter().all(|(_, e)| e != entry) { - return false; - } - match entry.actions.last() { - Some(ParseAction::Reduce { .. }) => {} - _ => return false, - } - - // Do not add tokens which are both internal and external. Their validity could - // influence the behavior of the external scanner. - if syntax_grammar - .external_tokens - .iter() - .any(|t| t.corresponding_internal_token == Some(token)) - { - return false; - } - - // Do not add a token if it conflicts with an existing token. - if token.is_terminal() { - for existing_token in state.terminal_entries.keys() { - if token_conflict_map.does_conflict(token.index, existing_token.index) { + for (symbol, left_entry) in &left_state.terminal_entries { + if let Some(right_entry) = right_state.terminal_entries.get(symbol) { + if right_entry.actions != left_entry.actions { + return false; + } + } else if !self.can_add_entry_to_state(right_state, *symbol, left_entry) { return false; } } + + let mut symbols_to_add = Vec::new(); + for (symbol, right_entry) in &right_state.terminal_entries { + if !left_state.terminal_entries.contains_key(&symbol) { + if !self.can_add_entry_to_state(left_state, *symbol, right_entry) { + return false; + } + symbols_to_add.push(*symbol); + } + } + + for symbol in symbols_to_add { + let entry = self.parse_table.states[right].terminal_entries[&symbol].clone(); + self.parse_table.states[left] + .terminal_entries + .insert(symbol, entry); + } + + true } - true -} - -fn remove_unused_states(parse_table: &mut ParseTable) { - let mut state_usage_map = vec![false; parse_table.states.len()]; - - state_usage_map[0] = true; - state_usage_map[1] = true; - - for state in &parse_table.states { - for referenced_state in state.referenced_states() { - state_usage_map[referenced_state] = true; + fn can_add_entry_to_state( + &self, + state: &ParseState, + token: Symbol, + entry: &ParseTableEntry, + ) -> bool { + // Do not add external tokens; they could conflict lexically with any of the state's + // existing lookahead tokens. 
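The checks that follow can be distilled into one rule: a (token, entry) pair may be added to a state only if an identical entry already exists there and that entry ends in a reduce action. A minimal sketch of that rule, using stand-in types rather than the real `ParseState` and `ParseTableEntry`:

use std::collections::HashMap;

// Stand-in types: a lookahead token is a name; an entry is a list of action names.
type Entry = Vec<&'static str>;

// A new (token, entry) pair may join a state only if the state already holds an
// identical entry and that entry ends in a reduce action.
fn can_add(state: &HashMap<&'static str, Entry>, entry: &Entry) -> bool {
    state.values().any(|e| e == entry) && entry.last() == Some(&"reduce")
}

fn main() {
    let mut state = HashMap::new();
    state.insert("token_a", vec!["reduce"]);

    assert!(can_add(&state, &vec!["reduce"])); // same reduce, new lookahead: allowed
    assert!(!can_add(&state, &vec!["shift"])); // a new kind of action: rejected
}

The real checks below add further exclusions (external tokens, word tokens, and lexical conflicts) on top of this rule.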
+ if token.is_external() { + return false; } + + // Only merge_compatible_states parse states by allowing existing reductions to happen + // with additional lookahead tokens. Do not alter parse states in ways + // that allow entirely new types of actions to happen. + if state.terminal_entries.iter().all(|(_, e)| e != entry) { + return false; + } + match entry.actions.last() { + Some(ParseAction::Reduce { .. }) => {} + _ => return false, + } + + // Do not add tokens which are both internal and external. Their validity could + // influence the behavior of the external scanner. + if self + .syntax_grammar + .external_tokens + .iter() + .any(|t| t.corresponding_internal_token == Some(token)) + { + return false; + } + + let is_word_token = self.syntax_grammar.word_token == Some(token); + let is_keyword = self.keywords.contains(&token); + + // Do not add a token if it conflicts with an existing token. + if token.is_terminal() { + for existing_token in state.terminal_entries.keys() { + if (is_word_token && self.keywords.contains(existing_token)) + || is_keyword && self.syntax_grammar.word_token.as_ref() == Some(existing_token) + { + continue; + } + if self + .token_conflict_map + .does_conflict(token.index, existing_token.index) + || self + .token_conflict_map + .does_match_same_string(token.index, existing_token.index) + { + return false; + } + } + } + + true } - let mut removed_predecessor_count = 0; - let mut state_replacement_map = vec![0; parse_table.states.len()]; - for state_id in 0..parse_table.states.len() { - state_replacement_map[state_id] = state_id - removed_predecessor_count; - if !state_usage_map[state_id] { - removed_predecessor_count += 1; + + fn remove_unused_states(&mut self) { + let mut state_usage_map = vec![false; self.parse_table.states.len()]; + + state_usage_map[0] = true; + state_usage_map[1] = true; + + for state in &self.parse_table.states { + for referenced_state in state.referenced_states() { + state_usage_map[referenced_state] = true; + } } - } - let mut state_id = 0; - let mut original_state_id = 0; - while state_id < parse_table.states.len() { - if state_usage_map[original_state_id] { - parse_table.states[state_id].update_referenced_states(|other_state_id, _| { - state_replacement_map[other_state_id] - }); - state_id += 1; - } else { - parse_table.states.remove(state_id); + let mut removed_predecessor_count = 0; + let mut state_replacement_map = vec![0; self.parse_table.states.len()]; + for state_id in 0..self.parse_table.states.len() { + state_replacement_map[state_id] = state_id - removed_predecessor_count; + if !state_usage_map[state_id] { + removed_predecessor_count += 1; + } + } + let mut state_id = 0; + let mut original_state_id = 0; + while state_id < self.parse_table.states.len() { + if state_usage_map[original_state_id] { + self.parse_table.states[state_id].update_referenced_states(|other_state_id, _| { + state_replacement_map[other_state_id] + }); + state_id += 1; + } else { + self.parse_table.states.remove(state_id); + } + original_state_id += 1; } - original_state_id += 1; } } From 5d3d161c057f112baed490bb767f16cfecde9948 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 3 Jan 2019 14:08:24 -0800 Subject: [PATCH 089/102] Respect simple aliases in code gen --- src/render/mod.rs | 58 +++++++++++++++++++++++++---------------------- 1 file changed, 31 insertions(+), 27 deletions(-) diff --git a/src/render/mod.rs b/src/render/mod.rs index dd046c93..0c0e6e59 100644 --- a/src/render/mod.rs +++ b/src/render/mod.rs @@ -233,12 +233,13 @@ impl Generator { 
indent!(self); for symbol in self.parse_table.symbols.iter() { if *symbol != Symbol::end() { - add_line!( - self, - "[{}] = \"{}\",", - self.symbol_ids[&symbol], - self.sanitize_string(self.metadata_for_symbol(*symbol).0) + let name = self.sanitize_string( + self.simple_aliases + .get(symbol) + .map(|alias| alias.value.as_str()) + .unwrap_or(self.metadata_for_symbol(*symbol).0), ); + add_line!(self, "[{}] = \"{}\",", self.symbol_ids[&symbol], name); } } for (alias, symbol) in &self.alias_map { @@ -265,22 +266,27 @@ impl Generator { for symbol in &self.parse_table.symbols { add_line!(self, "[{}] = {{", self.symbol_ids[&symbol]); indent!(self); - match self.metadata_for_symbol(*symbol).1 { - VariableType::Named => { - add_line!(self, ".visible = true,"); - add_line!(self, ".named = true,"); - } - VariableType::Anonymous => { - add_line!(self, ".visible = true,"); - add_line!(self, ".named = false,"); - } - VariableType::Hidden => { - add_line!(self, ".visible = false,"); - add_line!(self, ".named = true,"); - } - VariableType::Auxiliary => { - add_line!(self, ".visible = false,"); - add_line!(self, ".named = false,"); + if let Some(Alias { is_named, .. }) = self.simple_aliases.get(symbol) { + add_line!(self, ".visible = true,"); + add_line!(self, ".named = {},", is_named); + } else { + match self.metadata_for_symbol(*symbol).1 { + VariableType::Named => { + add_line!(self, ".visible = true,"); + add_line!(self, ".named = true,"); + } + VariableType::Anonymous => { + add_line!(self, ".visible = true,"); + add_line!(self, ".named = false,"); + } + VariableType::Hidden => { + add_line!(self, ".visible = false,"); + add_line!(self, ".named = true,"); + } + VariableType::Auxiliary => { + add_line!(self, ".visible = false,"); + add_line!(self, ".named = false,"); + } } } dedent!(self); @@ -356,11 +362,7 @@ impl Generator { fn add_lex_state(&mut self, state: LexState) { if let Some(accept_action) = state.accept_action { - add_line!( - self, - "ACCEPT_TOKEN({});", - self.symbol_ids[&accept_action] - ); + add_line!(self, "ACCEPT_TOKEN({});", self.symbol_ids[&accept_action]); } let mut ruled_out_characters = HashSet::new(); @@ -397,7 +399,9 @@ impl Generator { self.add_character_range_conditions(ranges, false) } CharacterSet::Exclude(chars) => { - let ranges = Self::get_ranges(chars, ruled_out_characters); + let ranges = Some('\0'..'\0') + .into_iter() + .chain(Self::get_ranges(chars, ruled_out_characters)); self.add_character_range_conditions(ranges, true) } } From bf9556dadc470dd2c543f9aab94070cc801e3d96 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 3 Jan 2019 16:35:16 -0800 Subject: [PATCH 090/102] Fix recursive processing of rule inlining --- src/build_tables/build_lex_table.rs | 10 +- src/build_tables/build_parse_table.rs | 40 ++-- src/build_tables/item.rs | 48 ++-- src/build_tables/item_set_builder.rs | 30 ++- src/prepare_grammar/process_inlines.rs | 311 ++++++++++++------------- 5 files changed, 230 insertions(+), 209 deletions(-) diff --git a/src/build_tables/build_lex_table.rs b/src/build_tables/build_lex_table.rs index 6cd9a1ce..60810f83 100644 --- a/src/build_tables/build_lex_table.rs +++ b/src/build_tables/build_lex_table.rs @@ -23,7 +23,7 @@ pub(crate) fn build_lex_table( } let mut builder = LexTableBuilder::new(lexical_grammar); - for (i, state) in parse_table.states.iter_mut().enumerate() { + for state in parse_table.states.iter_mut() { let tokens = LookaheadSet::with(state.terminal_entries.keys().filter_map(|token| { if token.is_terminal() { if keywords.contains(&token) { @@ 
-37,7 +37,6 @@ pub(crate) fn build_lex_table( None } })); - info!("populate lex state for parse state {}", i); state.lex_state_id = builder.add_state_for_tokens(&tokens); } @@ -199,16 +198,17 @@ fn shrink_lex_table(table: &mut LexTable, parse_table: &mut ParseTable) { continue; } for (j, state_j) in table.states.iter().enumerate() { - if state_replacements.contains_key(&j) { - continue; - } if j == i { break; } + if state_replacements.contains_key(&j) { + continue; + } if state_i == state_j { info!("replace state {} with state {}", i, j); state_replacements.insert(i, j); done = false; + break; } } } diff --git a/src/build_tables/build_parse_table.rs b/src/build_tables/build_parse_table.rs index ada34dff..6f930463 100644 --- a/src/build_tables/build_parse_table.rs +++ b/src/build_tables/build_parse_table.rs @@ -63,7 +63,28 @@ impl<'a> ParseTableBuilder<'a> { ), ); - self.process_part_state_queue()?; + while let Some(entry) = self.parse_state_queue.pop_front() { + // info!( + // "state: {}, item set: {}", + // entry.state_id, + // ParseItemSetDisplay( + // &self.item_sets_by_state_id[entry.state_id], + // self.syntax_grammar, + // self.lexical_grammar, + // ) + // ); + + let item_set = self + .item_set_builder + .transitive_closure(&self.item_sets_by_state_id[entry.state_id]); + self.add_actions( + entry.preceding_symbols, + entry.preceding_auxiliary_symbols, + entry.state_id, + item_set, + )?; + } + self.populate_used_symbols(); self.remove_precedences(); @@ -116,27 +137,12 @@ impl<'a> ParseTableBuilder<'a> { } } - fn process_part_state_queue(&mut self) -> Result<()> { - while let Some(entry) = self.parse_state_queue.pop_front() { - let item_set = self - .item_set_builder - .transitive_closure(&self.item_sets_by_state_id[entry.state_id]); - self.add_actions( - entry.preceding_symbols, - entry.preceding_auxiliary_symbols, - item_set, - entry.state_id, - )?; - } - Ok(()) - } - fn add_actions( &mut self, mut preceding_symbols: SymbolSequence, mut preceding_auxiliary_symbols: Vec, - item_set: ParseItemSet<'a>, state_id: ParseStateId, + item_set: ParseItemSet<'a>, ) -> Result<()> { let mut terminal_successors = HashMap::new(); let mut non_terminal_successors = HashMap::new(); diff --git a/src/build_tables/item.rs b/src/build_tables/item.rs index 511d7bef..d1d0cbbf 100644 --- a/src/build_tables/item.rs +++ b/src/build_tables/item.rs @@ -42,12 +42,19 @@ pub(crate) struct ParseItemSet<'a> { pub entries: BTreeMap, LookaheadSet>, } -pub(crate) struct ParseItemDisplay<'a>(&'a ParseItem<'a>, &'a SyntaxGrammar, &'a LexicalGrammar); +pub(crate) struct ParseItemDisplay<'a>( + pub &'a ParseItem<'a>, + pub &'a SyntaxGrammar, + pub &'a LexicalGrammar +); + pub(crate) struct LookaheadSetDisplay<'a>(&'a LookaheadSet, &'a SyntaxGrammar, &'a LexicalGrammar); + +#[allow(dead_code)] pub(crate) struct ParseItemSetDisplay<'a>( - &'a ParseItemSet<'a>, - &'a SyntaxGrammar, - &'a LexicalGrammar, + pub &'a ParseItemSet<'a>, + pub &'a SyntaxGrammar, + pub &'a LexicalGrammar, ); impl LookaheadSet { @@ -144,14 +151,6 @@ impl LookaheadSet { } result } - - pub fn display_with<'a>( - &'a self, - syntax_grammar: &'a SyntaxGrammar, - lexical_grammar: &'a LexicalGrammar, - ) -> LookaheadSetDisplay<'a> { - LookaheadSetDisplay(self, syntax_grammar, lexical_grammar) - } } impl<'a> ParseItem<'a> { @@ -202,14 +201,6 @@ impl<'a> ParseItem<'a> { step_index: self.step_index + 1, } } - - pub fn display_with( - &'a self, - syntax_grammar: &'a SyntaxGrammar, - lexical_grammar: &'a LexicalGrammar, - ) -> ParseItemDisplay<'a> { - 
ParseItemDisplay(self, syntax_grammar, lexical_grammar)
-    }
 }
 
 impl<'a> ParseItemSet<'a> {
@@ -235,14 +226,6 @@ impl<'a> ParseItemSet<'a> {
             }
         }
     }
-
-    pub fn display_with(
-        &'a self,
-        syntax_grammar: &'a SyntaxGrammar,
-        lexical_grammar: &'a LexicalGrammar,
-    ) -> ParseItemSetDisplay<'a> {
-        ParseItemSetDisplay(self, syntax_grammar, lexical_grammar)
-    }
 }
 
 impl<'a> Default for ParseItemSet<'a> {
@@ -253,6 +236,7 @@ impl<'a> Default for ParseItemSet<'a> {
     }
 }
 
+#[allow(dead_code)]
 impl<'a> fmt::Display for ParseItemDisplay<'a> {
     fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> {
         if self.0.is_augmented() {
@@ -282,6 +266,10 @@ impl<'a> fmt::Display for ParseItemDisplay<'a> {
             } else {
                 write!(f, "{}", &self.1.variables[step.symbol.index].name)?;
             }
+
+            if let Some(alias) = &step.alias {
+                write!(f, " (alias {})", alias.value)?;
+            }
         }
 
         if self.0.is_done() {
@@ -323,8 +311,8 @@ impl<'a> fmt::Display for ParseItemSetDisplay<'a> {
             writeln!(
                 f,
                 "{}\t{}",
-                item.display_with(self.1, self.2),
-                lookaheads.display_with(self.1, self.2)
+                ParseItemDisplay(item, self.1, self.2),
+                LookaheadSetDisplay(lookaheads, self.1, self.2)
             )?;
         }
         Ok(())
diff --git a/src/build_tables/item_set_builder.rs b/src/build_tables/item_set_builder.rs
index 5714e7e2..939d700c 100644
--- a/src/build_tables/item_set_builder.rs
+++ b/src/build_tables/item_set_builder.rs
@@ -1,7 +1,8 @@
-use super::item::{LookaheadSet, ParseItem, ParseItemSet};
+use super::item::{LookaheadSet, ParseItem, ParseItemDisplay, ParseItemSet};
 use crate::grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar};
 use crate::rules::Symbol;
 use hashbrown::{HashMap, HashSet};
+use std::fmt;
 
 #[derive(Clone, Debug, PartialEq, Eq)]
 struct TransitiveClosureAddition<'a> {
@@ -16,6 +17,8 @@ struct FollowSetInfo {
 }
 
 pub(crate) struct ParseItemSetBuilder<'a> {
+    syntax_grammar: &'a SyntaxGrammar,
+    lexical_grammar: &'a LexicalGrammar,
     first_sets: HashMap<Symbol, LookaheadSet>,
     last_sets: HashMap<Symbol, LookaheadSet>,
     inlines: &'a InlinedProductionMap,
@@ -35,6 +38,8 @@ impl<'a> ParseItemSetBuilder<'a> {
         inlines: &'a InlinedProductionMap,
     ) -> Self {
         let mut result = Self {
+            syntax_grammar,
+            lexical_grammar,
             first_sets: HashMap::new(),
             last_sets: HashMap::new(),
             inlines,
@@ -300,3 +305,26 @@ impl<'a> ParseItemSetBuilder<'a> {
         set.entries.insert(item, lookaheads.clone());
     }
 }
+
+impl<'a> fmt::Debug for ParseItemSetBuilder<'a> {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        write!(f, "ParseItemSetBuilder {{\n")?;
+
+        write!(f, "  additions: {{\n")?;
+        for (i, variable) in self.syntax_grammar.variables.iter().enumerate() {
+            write!(f, "    {}: {{\n", variable.name)?;
+            for addition in &self.transitive_closure_additions[i] {
+                write!(
+                    f,
+                    "      {}\n",
+                    ParseItemDisplay(&addition.item, self.syntax_grammar, self.lexical_grammar)
+                )?;
+            }
+            write!(f, "    }},\n")?;
+        }
+        write!(f, "  }},")?;
+
+        write!(f, "}}")?;
+        Ok(())
+    }
+}
diff --git a/src/prepare_grammar/process_inlines.rs b/src/prepare_grammar/process_inlines.rs
index 24bbc14d..9fd2f2c6 100644
--- a/src/prepare_grammar/process_inlines.rs
+++ b/src/prepare_grammar/process_inlines.rs
@@ -3,6 +3,9 @@ use hashbrown::HashMap;
 
 #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
 struct ProductionStepId {
+    // A `None` value here means that the production itself was produced via inlining,
+    // and is stored in the builder's `productions` vector, as opposed to being
+    // stored in one of the grammar's variables.
variable_index: Option, production_index: usize, step_index: usize, @@ -13,169 +16,166 @@ struct InlinedProductionMapBuilder { productions: Vec, } -impl ProductionStepId { - pub fn successor(&self) -> Self { - Self { - variable_index: self.variable_index, - production_index: self.production_index, - step_index: self.step_index + 1, - } - } -} - -fn production_for_id<'a>( - map: &'a InlinedProductionMapBuilder, - id: ProductionStepId, - grammar: &'a SyntaxGrammar, -) -> &'a Production { - if let Some(variable_index) = id.variable_index { - &grammar.variables[variable_index].productions[id.production_index] - } else { - &map.productions[id.production_index] - } -} - -fn production_step_for_id<'a>( - map: &'a InlinedProductionMapBuilder, - id: ProductionStepId, - grammar: &'a SyntaxGrammar, -) -> Option<&'a ProductionStep> { - production_for_id(map, id, grammar).steps.get(id.step_index) -} - -fn inline<'a>( - map: &'a mut InlinedProductionMapBuilder, - step_id: ProductionStepId, - grammar: &'a SyntaxGrammar, -) -> &'a Vec { - let step = production_step_for_id(map, step_id, grammar).unwrap(); - let mut productions_to_add = grammar.variables[step.symbol.index].productions.clone(); - - let mut i = 0; - while i < productions_to_add.len() { - if let Some(first_symbol) = productions_to_add[i].first_symbol() { - if grammar.variables_to_inline.contains(&first_symbol) { - // Remove the production from the vector, replacing it with a placeholder. - let production = productions_to_add - .splice(i..i + 1, [Production::default()].iter().cloned()) - .next() - .unwrap(); - - // Replace the placeholder with the inlined productions. - productions_to_add.splice( - i..i + 1, - grammar.variables[first_symbol.index] - .productions - .iter() - .map(|p| { - let mut p = p.clone(); - p.steps.extend(production.steps[1..].iter().cloned()); - p - }), - ); - continue; - } - } - i += 1; - } - - let result = productions_to_add - .into_iter() - .map(|production_to_add| { - let mut inlined_production = production_for_id(&map, step_id, grammar).clone(); - let removed_step = inlined_production - .steps - .splice( - step_id.step_index..step_id.step_index + 1, - production_to_add.steps.iter().cloned(), - ) - .next() - .unwrap(); - let inserted_steps = &mut inlined_production.steps - [step_id.step_index..step_id.step_index + production_to_add.steps.len()]; - if let Some(alias) = removed_step.alias { - for inserted_step in inserted_steps.iter_mut() { - inserted_step.alias = Some(alias.clone()); - } - } - if let Some(last_inserted_step) = inserted_steps.last_mut() { - last_inserted_step.precedence = removed_step.precedence; - last_inserted_step.associativity = removed_step.associativity; - } - map.productions - .iter() - .position(|p| *p == inlined_production) - .unwrap_or({ - map.productions.push(inlined_production); - map.productions.len() - 1 - }) - }) - .collect(); - - map.production_indices_by_step_id - .entry(step_id) - .or_insert(result) -} - -pub(super) fn process_inlines(grammar: &SyntaxGrammar) -> InlinedProductionMap { - let mut result = InlinedProductionMapBuilder { - productions: Vec::new(), - production_indices_by_step_id: HashMap::new(), - }; - - let mut step_ids_to_process = Vec::new(); - for (variable_index, variable) in grammar.variables.iter().enumerate() { - for production_index in 0..variable.productions.len() { - step_ids_to_process.push(ProductionStepId { - variable_index: Some(variable_index), - production_index, - step_index: 0, - }); - while !step_ids_to_process.is_empty() { - let mut i = 0; - while i < 
step_ids_to_process.len() { - let step_id = step_ids_to_process[i]; - if let Some(step) = production_step_for_id(&result, step_id, grammar) { - if grammar.variables_to_inline.contains(&step.symbol) { - let inlined_step_ids = inline(&mut result, step_id, grammar) - .into_iter() - .cloned() - .map(|production_index| ProductionStepId { - variable_index: None, - production_index, - step_index: step_id.step_index, - }) - .collect::>(); - step_ids_to_process.splice(i..i + 1, inlined_step_ids); +impl InlinedProductionMapBuilder { + fn build<'a>(mut self, grammar: &'a SyntaxGrammar) -> InlinedProductionMap { + let mut step_ids_to_process = Vec::new(); + for (variable_index, variable) in grammar.variables.iter().enumerate() { + for production_index in 0..variable.productions.len() { + step_ids_to_process.push(ProductionStepId { + variable_index: Some(variable_index), + production_index, + step_index: 0, + }); + while !step_ids_to_process.is_empty() { + let mut i = 0; + while i < step_ids_to_process.len() { + let step_id = step_ids_to_process[i]; + if let Some(step) = self.production_step_for_id(step_id, grammar) { + if grammar.variables_to_inline.contains(&step.symbol) { + let inlined_step_ids = self + .inline_production_at_step(step_id, grammar) + .into_iter() + .cloned() + .map(|production_index| ProductionStepId { + variable_index: None, + production_index, + step_index: step_id.step_index, + }); + step_ids_to_process.splice(i..i + 1, inlined_step_ids); + } else { + step_ids_to_process[i] = ProductionStepId { + variable_index: step_id.variable_index, + production_index: step_id.production_index, + step_index: step_id.step_index + 1, + }; + i += 1; + } } else { - step_ids_to_process[i] = step_id.successor(); - i += 1; + step_ids_to_process.remove(i); } - } else { - step_ids_to_process.remove(i); } } } } + + let productions = self.productions; + let production_indices_by_step_id = self.production_indices_by_step_id; + let production_map = production_indices_by_step_id + .into_iter() + .map(|(step_id, production_indices)| { + let production = if let Some(variable_index) = step_id.variable_index { + &grammar.variables[variable_index].productions[step_id.production_index] + } else { + &productions[step_id.production_index] + } as *const Production; + ((production, step_id.step_index as u32), production_indices) + }) + .collect(); + + InlinedProductionMap { + productions, + production_map, + } } - // result - let productions = result.productions; - let production_indices_by_step_id = result.production_indices_by_step_id; + fn inline_production_at_step<'a>( + &'a mut self, + step_id: ProductionStepId, + grammar: &'a SyntaxGrammar, + ) -> &'a Vec { + // Build a list of productions produced by inlining rules. 
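As a stripped-down picture of what the loop below computes, assume productions are plain lists of symbol names: inlining a variable at a step replaces that step with each of the variable's productions, yielding one new production per alternative. This sketch is illustrative only, not the types in this file.

// Illustrative only: a production reduced to a list of symbol names.
type Production = Vec<&'static str>;

// Replace `production[step]` with each alternative of the inlined variable,
// producing one new production per alternative.
fn inline_at(production: &Production, step: usize, alternatives: &[Production]) -> Vec<Production> {
    alternatives
        .iter()
        .map(|alt| {
            let mut p = production.clone();
            p.splice(step..step + 1, alt.iter().cloned());
            p
        })
        .collect()
}

fn main() {
    // Inlining `_expr` (two alternatives) at step 1 of `a _expr b`
    // yields `a x b` and `a y z b`.
    let result = inline_at(&vec!["a", "_expr", "b"], 1, &[vec!["x"], vec!["y", "z"]]);
    assert_eq!(result, vec![vec!["a", "x", "b"], vec!["a", "y", "z", "b"]]);
}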
+ let mut i = 0; + let step_index = step_id.step_index; + let mut productions_to_add = vec![self.production_for_id(step_id, grammar).clone()]; + while i < productions_to_add.len() { + if let Some(step) = productions_to_add[i].steps.get(step_index) { + let symbol = step.symbol.clone(); - let production_map = production_indices_by_step_id - .into_iter() - .map(|(step_id, production_indices)| { - let production = if let Some(variable_index) = step_id.variable_index { - &grammar.variables[variable_index].productions[step_id.production_index] - } else { - &productions[step_id.production_index] - } as *const Production; - ((production, step_id.step_index as u32), production_indices) - }) - .collect(); + if grammar.variables_to_inline.contains(&symbol) { + // Remove the production from the vector, replacing it with a placeholder. + let production = productions_to_add + .splice(i..i + 1, [Production::default()].iter().cloned()) + .next() + .unwrap(); - InlinedProductionMap { productions, production_map } + // Replace the placeholder with the inlined productions. + productions_to_add.splice( + i..i + 1, + grammar.variables[symbol.index].productions.iter().map(|p| { + let mut production = production.clone(); + let removed_step = production + .steps + .splice(step_index..(step_index + 1), p.steps.iter().cloned()) + .next() + .unwrap(); + let inserted_steps = + &mut production.steps[step_index..(step_index + p.steps.len())]; + if let Some(alias) = removed_step.alias { + for inserted_step in inserted_steps.iter_mut() { + inserted_step.alias = Some(alias.clone()); + } + } + if let Some(last_inserted_step) = inserted_steps.last_mut() { + last_inserted_step.precedence = removed_step.precedence; + last_inserted_step.associativity = removed_step.associativity; + } + production + }), + ); + + continue; + } + } + i += 1; + } + + // Store all the computed productions. + let result = productions_to_add + .into_iter() + .map(|production| { + self.productions + .iter() + .position(|p| *p == production) + .unwrap_or({ + self.productions.push(production); + self.productions.len() - 1 + }) + }) + .collect(); + + // Cache these productions based on the original production step. + self.production_indices_by_step_id + .entry(step_id) + .or_insert(result) + } + + fn production_for_id<'a>( + &'a self, + id: ProductionStepId, + grammar: &'a SyntaxGrammar, + ) -> &'a Production { + if let Some(variable_index) = id.variable_index { + &grammar.variables[variable_index].productions[id.production_index] + } else { + &self.productions[id.production_index] + } + } + + fn production_step_for_id<'a>( + &'a self, + id: ProductionStepId, + grammar: &'a SyntaxGrammar, + ) -> Option<&'a ProductionStep> { + self.production_for_id(id, grammar).steps.get(id.step_index) + } +} + +pub(super) fn process_inlines(grammar: &SyntaxGrammar) -> InlinedProductionMap { + InlinedProductionMapBuilder { + productions: Vec::new(), + production_indices_by_step_id: HashMap::new(), + } + .build(grammar) } #[cfg(test)] @@ -234,7 +234,7 @@ mod tests { // Inlining variable 1 yields two productions. 
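// For illustration (hypothetical shapes, not the exact fixtures built above): if the outer production is `a variable_1 b` and variable_1 has the two productions `c` and `d`, then inlining at step index 1 yields `a c b` and `a d b`.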
assert_eq!( inline_map - .inlined_productions(&grammar.variables[0].productions[0], 1) + .inlined_productions(&grammar.variables[0].productions[0], 1) .unwrap() .cloned() .collect::<Vec<_>>(), @@ -446,8 +446,7 @@ mod tests { ProductionStep::new(Symbol::terminal(12)) .with_prec(1, Some(Associativity::Left)), ProductionStep::new(Symbol::terminal(10)), - ProductionStep::new(Symbol::non_terminal(2)) - .with_alias("outer_alias", true), + ProductionStep::new(Symbol::non_terminal(2)).with_alias("outer_alias", true), ] }], ); From 70aa4c2b2d97fbcf6e330f85e4d4fd0df026cfce Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 4 Jan 2019 09:11:44 -0800 Subject: [PATCH 091/102] Add a --no-minimize flag to suppress table minimization for debugging --- src/build_tables/build_lex_table.rs | 22 ++++++++++++++----- src/build_tables/build_parse_table.rs | 13 ++++++++++- ...parse_table.rs => minimize_parse_table.rs} | 14 ++++++------ src/build_tables/mod.rs | 22 +++++++++++++------ src/generate.rs | 5 +++-- src/main.rs | 6 +++-- 6 files changed, 58 insertions(+), 24 deletions(-) rename src/build_tables/{shrink_parse_table.rs => minimize_parse_table.rs} (97%) diff --git a/src/build_tables/build_lex_table.rs b/src/build_tables/build_lex_table.rs index 60810f83..9c440f4e 100644 --- a/src/build_tables/build_lex_table.rs +++ b/src/build_tables/build_lex_table.rs @@ -12,6 +12,7 @@ pub(crate) fn build_lex_table( syntax_grammar: &SyntaxGrammar, lexical_grammar: &LexicalGrammar, keywords: &LookaheadSet, + minimize: bool, ) -> (LexTable, LexTable) { let keyword_lex_table; if syntax_grammar.word_token.is_some() { @@ -41,7 +42,10 @@ pub(crate) fn build_lex_table( } let mut table = builder.table; - shrink_lex_table(&mut table, parse_table); + + if minimize { + minimize_lex_table(&mut table, parse_table); + } (table, keyword_lex_table) } @@ -147,14 +151,20 @@ impl<'a> LexTableBuilder<'a> { completion = Some((id, prec)); } + info!( + "lex state: {}, completion: {:?}", + state_id, + completion.map(|(id, prec)| (&self.lexical_grammar.variables[id].name, prec)) + ); + let successors = self.cursor.grouped_successors(); - info!("populate state: {}, successors: {:?}", state_id, successors); + info!("lex state: {}, successors: {:?}", state_id, successors); // If EOF is a valid lookahead token, add a transition predicated on the null // character that leads to the empty set of NFA states. 
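// The '\0' lookahead acts as an end-of-file sentinel here: the target state is // built from an empty set of NFA states, so it has no outgoing transitions of // its own.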
if eof_valid { let (next_state_id, _) = self.add_state(Vec::new(), false); - info!("populate state: {}, character: EOF", state_id); + info!("lex state: {}, successor: EOF", state_id); self.table.states[state_id].advance_actions.push(( CharacterSet::empty().add_char('\0'), AdvanceAction { @@ -166,7 +176,9 @@ impl<'a> LexTableBuilder<'a> { for (chars, advance_precedence, next_states, is_sep) in successors { if let Some((_, completed_precedence)) = completion { - if advance_precedence < completed_precedence { + if advance_precedence < completed_precedence + || (advance_precedence == completed_precedence && is_sep) + { continue; } } @@ -188,7 +200,7 @@ impl<'a> LexTableBuilder<'a> { } } -fn shrink_lex_table(table: &mut LexTable, parse_table: &mut ParseTable) { +fn minimize_lex_table(table: &mut LexTable, parse_table: &mut ParseTable) { let mut state_replacements = BTreeMap::new(); let mut done = false; while !done { diff --git a/src/build_tables/build_parse_table.rs b/src/build_tables/build_parse_table.rs index 6f930463..9bccf238 100644 --- a/src/build_tables/build_parse_table.rs +++ b/src/build_tables/build_parse_table.rs @@ -67,7 +67,7 @@ impl<'a> ParseTableBuilder<'a> { // info!( // "state: {}, item set: {}", // entry.state_id, - // ParseItemSetDisplay( + // super::item::ParseItemSetDisplay( // &self.item_sets_by_state_id[entry.state_id], // self.syntax_grammar, // self.lexical_grammar, @@ -77,6 +77,17 @@ impl<'a> ParseTableBuilder<'a> { let item_set = self .item_set_builder .transitive_closure(&self.item_sets_by_state_id[entry.state_id]); + + // info!( + // "state: {}, closed item set: {}", + // entry.state_id, + // super::item::ParseItemSetDisplay( + // &item_set, + // self.syntax_grammar, + // self.lexical_grammar, + // ) + // ); + self.add_actions( entry.preceding_symbols, entry.preceding_auxiliary_symbols, diff --git a/src/build_tables/shrink_parse_table.rs b/src/build_tables/minimize_parse_table.rs similarity index 97% rename from src/build_tables/shrink_parse_table.rs rename to src/build_tables/minimize_parse_table.rs index 64a4b259..573bf974 100644 --- a/src/build_tables/shrink_parse_table.rs +++ b/src/build_tables/minimize_parse_table.rs @@ -5,26 +5,26 @@ use crate::rules::{AliasMap, Symbol}; use crate::tables::{ParseAction, ParseState, ParseTable, ParseTableEntry}; use hashbrown::{HashMap, HashSet}; -pub(crate) fn shrink_parse_table( +pub(crate) fn minimize_parse_table( parse_table: &mut ParseTable, syntax_grammar: &SyntaxGrammar, simple_aliases: &AliasMap, token_conflict_map: &TokenConflictMap, keywords: &LookaheadSet, ) { - let mut optimizer = Optimizer { + let mut minimizer = Minimizer { parse_table, syntax_grammar, token_conflict_map, keywords, simple_aliases, }; - optimizer.remove_unit_reductions(); - optimizer.merge_compatible_states(); - optimizer.remove_unused_states(); + minimizer.remove_unit_reductions(); + minimizer.merge_compatible_states(); + minimizer.remove_unused_states(); } -struct Optimizer<'a> { +struct Minimizer<'a> { parse_table: &'a mut ParseTable, syntax_grammar: &'a SyntaxGrammar, token_conflict_map: &'a TokenConflictMap<'a>, @@ -32,7 +32,7 @@ struct Optimizer<'a> { simple_aliases: &'a AliasMap, } -impl<'a> Optimizer<'a> { +impl<'a> Minimizer<'a> { fn remove_unit_reductions(&mut self) { let mut aliased_symbols = HashSet::new(); for variable in &self.syntax_grammar.variables { diff --git a/src/build_tables/mod.rs b/src/build_tables/mod.rs index 84659600..886594f8 100644 --- a/src/build_tables/mod.rs +++ b/src/build_tables/mod.rs @@ -3,14 +3,14 @@ mod 
build_parse_table; mod coincident_tokens; mod item; mod item_set_builder; -mod shrink_parse_table; +mod minimize_parse_table; mod token_conflicts; use self::build_lex_table::build_lex_table; use self::build_parse_table::build_parse_table; use self::coincident_tokens::CoincidentTokenIndex; use self::item::LookaheadSet; -use self::shrink_parse_table::shrink_parse_table; +use self::minimize_parse_table::minimize_parse_table; use self::token_conflicts::TokenConflictMap; use crate::error::Result; use crate::grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar}; @@ -23,6 +23,7 @@ pub(crate) fn build_tables( lexical_grammar: &LexicalGrammar, simple_aliases: &AliasMap, inlines: &InlinedProductionMap, + minimize: bool, ) -> Result<(ParseTable, LexTable, LexTable, Option<Symbol>)> { let (mut parse_table, following_tokens) = build_parse_table(syntax_grammar, lexical_grammar, inlines)?; @@ -42,15 +43,22 @@ pub(crate) fn build_tables( &coincident_token_index, &token_conflict_map, ); - shrink_parse_table( + if minimize { + minimize_parse_table( + &mut parse_table, + syntax_grammar, + simple_aliases, + &token_conflict_map, + &keywords, + ); + } + let (main_lex_table, keyword_lex_table) = build_lex_table( &mut parse_table, syntax_grammar, - simple_aliases, - &token_conflict_map, + lexical_grammar, &keywords, + minimize, ); - let (main_lex_table, keyword_lex_table) = - build_lex_table(&mut parse_table, syntax_grammar, lexical_grammar, &keywords); Ok(( parse_table, main_lex_table, diff --git a/src/generate.rs b/src/generate.rs index cdbbea4f..d574c165 100644 --- a/src/generate.rs +++ b/src/generate.rs @@ -4,14 +4,15 @@ use crate::prepare_grammar::prepare_grammar; use crate::build_tables::build_tables; use crate::render::render_c_code; -pub fn generate_parser_for_grammar(input: &str) -> Result<String> { +pub fn generate_parser_for_grammar(input: &str, minimize: bool) -> Result<String> { let input_grammar = parse_grammar(input)?; let (syntax_grammar, lexical_grammar, inlines, simple_aliases) = prepare_grammar(&input_grammar)?; let (parse_table, main_lex_table, keyword_lex_table, keyword_capture_token) = build_tables( &syntax_grammar, &lexical_grammar, &simple_aliases, - &inlines + &inlines, + minimize )?; let c_code = render_c_code( &input_grammar.name, @@ -33,7 +33,8 @@ fn main() -> error::Result<()> { .subcommand( SubCommand::with_name("generate") .about("Generate a parser") - .arg(Arg::with_name("log").long("log")), + .arg(Arg::with_name("log").long("log")) + .arg(Arg::with_name("no-minimize").long("no-minimize")), ) .subcommand( SubCommand::with_name("parse") @@ -54,10 +55,11 @@ fn main() -> error::Result<()> { logger::init(); } + let minimize = !matches.is_present("no-minimize"); let mut grammar_path = env::current_dir().expect("Failed to read CWD"); grammar_path.push("grammar.js"); let grammar_json = load_js_grammar_file(grammar_path); - let code = generate::generate_parser_for_grammar(&grammar_json)?; + let code = generate::generate_parser_for_grammar(&grammar_json, minimize)?; println!("{}", code); } From cc0fbc0d9306a838d10a7b258a58fa7f76c55cc3 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 4 Jan 2019 09:12:05 -0800 Subject: [PATCH 092/102] Fix and simplify handling of precedence for completion of tokens --- src/prepare_grammar/expand_tokens.rs | 88 +++++++++++----------- 1 file changed, 33 insertions(+), 55 deletions(-) diff --git a/src/prepare_grammar/expand_tokens.rs 
b/src/prepare_grammar/expand_tokens.rs index 6520c432..01b925f9 100644 --- a/src/prepare_grammar/expand_tokens.rs +++ b/src/prepare_grammar/expand_tokens.rs @@ -28,6 +28,13 @@ fn get_implicit_precedence(rule: &Rule) -> i32 { } } +fn get_completion_precedence(rule: &Rule) -> i32 { + match rule { + Rule::Metadata { params, .. } => params.precedence.unwrap_or(0), + _ => 0, + } +} + pub(crate) fn expand_tokens(mut grammar: ExtractedLexicalGrammar) -> Result<LexicalGrammar> { let mut builder = NfaBuilder { nfa: Nfa::new(), @@ -52,7 +59,7 @@ pub(crate) fn expand_tokens(mut grammar: ExtractedLexicalGrammar) -> Result) { - let mut i = 0; - while i < state_ids.len() { - let state_id = state_ids[i]; - let (left, right) = match &mut self.nfa.states[state_id as usize] { - NfaState::Accept { precedence, .. } => { - *precedence = prec; - return; - } - NfaState::Split(left, right) => (*left, *right), - _ => return, - }; - if !state_ids.contains(&left) { - state_ids.push(left); - } - if !state_ids.contains(&right) { - state_ids.push(right); - } - i += 1; - } - } } #[cfg(test)] @@ -551,17 +535,21 @@ mod tests { ("aeeeef", Some((2, "aeeee"))), ], }, + // immediate tokens with higher precedence Row { rules: vec![ - Rule::seq(vec![ - Rule::string("a"), - Rule::choice(vec![ - Rule::string("b"), - Rule::string("c"), - ]), - Rule::string("d"), - ]) + Rule::prec(1, Rule::pattern("[^a]+")), + Rule::immediate_token(Rule::prec(2, Rule::pattern("[^ab]+"))), ], + separators: vec![Rule::pattern("\\s")], + examples: vec![("cccb", Some((1, "ccc")))], + }, + Row { + rules: vec![Rule::seq(vec![ + Rule::string("a"), + Rule::choice(vec![Rule::string("b"), Rule::string("c")]), + Rule::string("d"), + ])], separators: vec![], examples: vec![ ("abd", Some((0, "abd"))), ("acd", Some((0, "acd"))), ("ad", None), ("d", None), ("a", None), - ] + ], }, // nested choices within sequences Row { - rules: vec![ - Rule::seq(vec![ - Rule::pattern("[0-9]+"), - Rule::choice(vec![ - Rule::Blank, - Rule::choice(vec![ - Rule::seq(vec![ - Rule::choice(vec![ - Rule::string("e"), - Rule::string("E") - ]), - Rule::choice(vec![ - Rule::Blank, - Rule::choice(vec![ - Rule::string("+"), - Rule::string("-"), - ]) - ]), - Rule::pattern("[0-9]+"), - ]) - ]) - ]), - ], + rules: vec![Rule::seq(vec![ + Rule::pattern("[0-9]+"), + Rule::choice(vec![ + Rule::Blank, + Rule::choice(vec![Rule::seq(vec![ + Rule::choice(vec![Rule::string("e"), Rule::string("E")]), + Rule::choice(vec![ + Rule::Blank, + Rule::choice(vec![Rule::string("+"), Rule::string("-")]), + ]), + Rule::pattern("[0-9]+"), + ])]), ]), + ])], separators: vec![], examples: vec![ ("12", Some((0, "12"))), From d845b81ee961d37e8506a2b421d54b867bb7e3c7 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 4 Jan 2019 09:42:06 -0800 Subject: [PATCH 093/102] Represent nfa transitions as structs with named fields, not tuples --- src/build_tables/build_lex_table.rs | 24 ++- src/build_tables/mod.rs | 2 +- src/build_tables/token_conflicts.rs | 26 +-- src/nfa.rs | 259 +++++++++++++++++---------- src/prepare_grammar/expand_tokens.rs | 16 +- 5 files changed, 211 insertions(+), 116 deletions(-) diff --git a/src/build_tables/build_lex_table.rs b/src/build_tables/build_lex_table.rs index 9c440f4e..4212d62b 100644 --- a/src/build_tables/build_lex_table.rs +++ b/src/build_tables/build_lex_table.rs @@ -1,7 +1,7 @@ use super::item::LookaheadSet; use super::token_conflicts::TokenConflictMap; use crate::grammars::{LexicalGrammar, SyntaxGrammar}; -use crate::nfa::{CharacterSet, NfaCursor}; +use crate::nfa::{CharacterSet, NfaCursor, NfaTransition}; use 
crate::rules::Symbol; use crate::tables::{AdvanceAction, LexState, LexTable, ParseTable}; use std::collections::hash_map::Entry; @@ -157,8 +157,8 @@ impl<'a> LexTableBuilder<'a> { completion.map(|(id, prec)| (&self.lexical_grammar.variables[id].name, prec)) ); - let successors = self.cursor.grouped_successors(); - info!("lex state: {}, successors: {:?}", state_id, successors); + let transitions = self.cursor.transitions(); + info!("lex state: {}, transitions: {:?}", state_id, transitions); // If EOF is a valid lookahead token, add a transition predicated on the null // character that leads to the empty set of NFA states. @@ -174,20 +174,26 @@ impl<'a> LexTableBuilder<'a> { )); } - for (chars, advance_precedence, next_states, is_sep) in successors { + for NfaTransition { + characters, + precedence, + states, + is_separator, + } in transitions + { if let Some((_, completed_precedence)) = completion { - if advance_precedence < completed_precedence - || (advance_precedence == completed_precedence && is_sep) + if precedence < completed_precedence + || (precedence == completed_precedence && is_separator) { continue; } } - let (next_state_id, _) = self.add_state(next_states, eof_valid && is_sep); + let (next_state_id, _) = self.add_state(states, eof_valid && is_separator); self.table.states[state_id].advance_actions.push(( - chars, + characters, AdvanceAction { state: next_state_id, - in_main_token: !is_sep, + in_main_token: !is_separator, }, )); } diff --git a/src/build_tables/mod.rs b/src/build_tables/mod.rs index 886594f8..78798732 100644 --- a/src/build_tables/mod.rs +++ b/src/build_tables/mod.rs @@ -239,7 +239,7 @@ fn identify_keywords( } fn all_chars_are_alphabetical(cursor: &NfaCursor) -> bool { - cursor.successors().all(|(chars, _, _, is_sep)| { + cursor.transition_chars().all(|(chars, is_sep)| { if is_sep { true } else if let CharacterSet::Include(chars) = chars { diff --git a/src/build_tables/token_conflicts.rs b/src/build_tables/token_conflicts.rs index 91edadec..cb2b6efe 100644 --- a/src/build_tables/token_conflicts.rs +++ b/src/build_tables/token_conflicts.rs @@ -1,6 +1,6 @@ use crate::build_tables::item::LookaheadSet; use crate::grammars::LexicalGrammar; -use crate::nfa::{CharacterSet, NfaCursor}; +use crate::nfa::{CharacterSet, NfaCursor, NfaTransition}; use hashbrown::HashSet; use std::cmp::Ordering; use std::fmt; @@ -131,7 +131,7 @@ fn get_starting_chars(cursor: &mut NfaCursor, grammar: &LexicalGrammar) -> Vec, } -impl Default for Nfa { - fn default() -> Self { - Self { states: Vec::new() } - } -} - #[derive(Debug)] pub struct NfaCursor<'a> { pub(crate) state_ids: Vec<u32>, nfa: &'a Nfa, } +#[derive(Debug, PartialEq, Eq)] +pub struct NfaTransition { + pub characters: CharacterSet, + pub is_separator: bool, + pub precedence: i32, + pub states: Vec<u32>, +} + +impl Default for Nfa { + fn default() -> Self { + Self { states: Vec::new() } + } +} + impl CharacterSet { pub fn empty() -> Self { CharacterSet::Include(Vec::new()) } @@ -328,7 +336,15 @@ impl<'a> NfaCursor<'a> { self.state_ids = states } - pub fn successors(&self) -> impl Iterator<Item = (&CharacterSet, i32, u32, bool)> { + pub fn transition_chars(&self) -> impl Iterator<Item = (&CharacterSet, bool)> { + self.raw_transitions().map(|t| (t.0, t.1)) + } + + pub fn transitions(&self) -> Vec<NfaTransition> { + Self::group_transitions(self.raw_transitions()) + } + + fn raw_transitions(&self) -> impl Iterator<Item = (&CharacterSet, bool, i32, u32)> { self.state_ids.iter().filter_map(move |id| { if let NfaState::Advance { chars, state_id, precedence, is_sep, } = &self.nfa.states[*id as usize] { - Some((chars, *precedence, *state_id, *is_sep)) + Some((chars, *is_sep, *precedence, *state_id)) } else { None } }) } - pub fn grouped_successors(&self) -> Vec<(CharacterSet, i32, Vec<u32>, bool)> { - Self::group_successors(self.successors()) - } - - fn group_successors<'b>( - iter: impl Iterator<Item = (&'b CharacterSet, i32, u32, bool)>, - ) -> Vec<(CharacterSet, i32, Vec<u32>, bool)> { - let mut result: Vec<(CharacterSet, i32, Vec<u32>, bool)> = Vec::new(); - for (chars, prec, state, is_sep) in iter { + fn group_transitions<'b>( + iter: impl Iterator<Item = (&'b CharacterSet, bool, i32, u32)>, + ) -> Vec<NfaTransition> { + let mut result: Vec<NfaTransition> = Vec::new(); + for (chars, is_sep, prec, state) in iter { let mut chars = chars.clone(); let mut i = 0; while i < result.len() && !chars.is_empty() { - let intersection = result[i].0.remove_intersection(&mut chars); + let intersection = result[i].characters.remove_intersection(&mut chars); if !intersection.is_empty() { - let mut intersection_states = result[i].2.clone(); + let mut intersection_states = result[i].states.clone(); match intersection_states.binary_search(&state) { Err(j) => intersection_states.insert(j, state), _ => {} } - let intersection_entry = ( - intersection, - max(result[i].1, prec), - intersection_states, - result[i].3 || is_sep, - ); - if result[i].0.is_empty() { - result[i] = intersection_entry; + let intersection_transition = NfaTransition { + characters: intersection, + is_separator: result[i].is_separator || is_sep, + precedence: max(result[i].precedence, prec), + states: intersection_states, + }; + if result[i].characters.is_empty() { + result[i] = intersection_transition; } else { - result.insert(i, intersection_entry); + result.insert(i, intersection_transition); i += 1; } } i += 1; } if !chars.is_empty() { - result.push((chars, prec, vec![state], is_sep)); + result.push(NfaTransition { + characters: chars, + precedence: prec, + states: vec![state], + is_separator: is_sep, + }); } } - result.sort_unstable_by(|a, b| a.0.cmp(&b.0)); + result.sort_unstable_by(|a, b| a.characters.cmp(&b.characters)); result } @@ -435,111 +452,173 @@ mod tests { use super::*; #[test] - fn test_group_successors() { + fn test_group_transitions() { let table = [ // overlapping character classes ( vec![ - (CharacterSet::empty().add_range('a', 'f'), 0, 1, false), - (CharacterSet::empty().add_range('d', 'i'), 1, 2, false), + (CharacterSet::empty().add_range('a', 'f'), false, 0, 1), + (CharacterSet::empty().add_range('d', 'i'), false, 1, 2), ], vec![ - (CharacterSet::empty().add_range('a', 'c'), 0, vec![1], false), - ( - CharacterSet::empty().add_range('d', 'f'), - 1, - vec![1, 2], - false, - ), - (CharacterSet::empty().add_range('g', 'i'), 1, vec![2], false), + NfaTransition { + characters: CharacterSet::empty().add_range('a', 'c'), + is_separator: false, + precedence: 0, + states: vec![1], + }, + NfaTransition { + characters: CharacterSet::empty().add_range('d', 'f'), + is_separator: false, + precedence: 1, + states: vec![1, 2], + }, + NfaTransition { + characters: CharacterSet::empty().add_range('g', 'i'), + is_separator: false, + precedence: 1, + states: vec![2], + }, ], ), // large character class followed by many individual characters ( vec![ - (CharacterSet::empty().add_range('a', 'z'), 0, 1, false), - (CharacterSet::empty().add_char('d'), 0, 2, false), - (CharacterSet::empty().add_char('i'), 0, 3, false), - (CharacterSet::empty().add_char('f'), 0, 4, false), + (CharacterSet::empty().add_range('a', 'z'), false, 0, 1), + (CharacterSet::empty().add_char('d'), false, 0, 2), + (CharacterSet::empty().add_char('i'), false, 0, 3), + (CharacterSet::empty().add_char('f'), false, 0, 4), ], vec![ - 
(CharacterSet::empty().add_char('d'), 0, vec![1, 2], false), - (CharacterSet::empty().add_char('f'), 0, vec![1, 4], false), - (CharacterSet::empty().add_char('i'), 0, vec![1, 3], false), - ( - CharacterSet::empty() + NfaTransition { + characters: CharacterSet::empty().add_char('d'), + is_separator: false, + precedence: 0, + states: vec![1, 2], + }, + NfaTransition { + characters: CharacterSet::empty().add_char('f'), + is_separator: false, + precedence: 0, + states: vec![1, 4], + }, + NfaTransition { + characters: CharacterSet::empty().add_char('i'), + is_separator: false, + precedence: 0, + states: vec![1, 3], + }, + NfaTransition { + characters: CharacterSet::empty() .add_range('a', 'c') .add_char('e') .add_range('g', 'h') .add_range('j', 'z'), - 0, - vec![1], - false, - ), + is_separator: false, + precedence: 0, + states: vec![1], + }, ], ), // negated character class followed by an individual character ( vec![ - (CharacterSet::empty().add_char('0'), 0, 1, false), - (CharacterSet::empty().add_char('b'), 0, 2, false), + (CharacterSet::empty().add_char('0'), false, 0, 1), + (CharacterSet::empty().add_char('b'), false, 0, 2), ( CharacterSet::empty().add_range('a', 'f').negate(), + false, 0, 3, - false, ), - (CharacterSet::empty().add_char('c'), 0, 4, false), + (CharacterSet::empty().add_char('c'), false, 0, 4), ], vec![ - (CharacterSet::empty().add_char('0'), 0, vec![1, 3], false), - (CharacterSet::empty().add_char('b'), 0, vec![2], false), - (CharacterSet::empty().add_char('c'), 0, vec![4], false), - ( - CharacterSet::empty() + NfaTransition { + characters: CharacterSet::empty().add_char('0'), + precedence: 0, + states: vec![1, 3], + is_separator: false, + }, + NfaTransition { + characters: CharacterSet::empty().add_char('b'), + precedence: 0, + states: vec![2], + is_separator: false, + }, + NfaTransition { + characters: CharacterSet::empty().add_char('c'), + precedence: 0, + states: vec![4], + is_separator: false, + }, + NfaTransition { + characters: CharacterSet::empty() .add_range('a', 'f') .add_char('0') .negate(), - 0, - vec![3], - false, - ), + precedence: 0, + states: vec![3], + is_separator: false, + }, ], ), // multiple negated character classes ( vec![ - (CharacterSet::Include(vec!['a']), 0, 1, false), - (CharacterSet::Exclude(vec!['a', 'b', 'c']), 0, 2, false), - (CharacterSet::Include(vec!['g']), 0, 6, false), - (CharacterSet::Exclude(vec!['d', 'e', 'f']), 0, 3, false), - (CharacterSet::Exclude(vec!['g', 'h', 'i']), 0, 4, false), - (CharacterSet::Include(vec!['g']), 0, 5, false), + (CharacterSet::Include(vec!['a']), false, 0, 1), + (CharacterSet::Exclude(vec!['a', 'b', 'c']), false, 0, 2), + (CharacterSet::Include(vec!['g']), false, 0, 6), + (CharacterSet::Exclude(vec!['d', 'e', 'f']), false, 0, 3), + (CharacterSet::Exclude(vec!['g', 'h', 'i']), false, 0, 4), + (CharacterSet::Include(vec!['g']), false, 0, 5), ], vec![ - (CharacterSet::Include(vec!['a']), 0, vec![1, 3, 4], false), - (CharacterSet::Include(vec!['g']), 0, vec![2, 3, 5, 6], false), - (CharacterSet::Include(vec!['b', 'c']), 0, vec![3, 4], false), - (CharacterSet::Include(vec!['h', 'i']), 0, vec![2, 3], false), - ( - CharacterSet::Include(vec!['d', 'e', 'f']), - 0, - vec![2, 4], - false, - ), - ( - CharacterSet::Exclude(vec!['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i']), - 0, - vec![2, 3, 4], - false, - ), + NfaTransition { + characters: CharacterSet::Include(vec!['a']), + precedence: 0, + states: vec![1, 3, 4], + is_separator: false, + }, + NfaTransition { + characters: CharacterSet::Include(vec!['g']), + 
precedence: 0, + states: vec![2, 3, 5, 6], + is_separator: false, + }, + NfaTransition { + characters: CharacterSet::Include(vec!['b', 'c']), + precedence: 0, + states: vec![3, 4], + is_separator: false, + }, + NfaTransition { + characters: CharacterSet::Include(vec!['h', 'i']), + precedence: 0, + states: vec![2, 3], + is_separator: false, + }, + NfaTransition { + characters: CharacterSet::Include(vec!['d', 'e', 'f']), + precedence: 0, + states: vec![2, 4], + is_separator: false, + }, + NfaTransition { + characters: CharacterSet::Exclude(vec![ + 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', + ]), + precedence: 0, + states: vec![2, 3, 4], + is_separator: false, + }, ], ), ]; for row in table.iter() { assert_eq!( - NfaCursor::group_successors(row.0.iter().map(|(c, p, s, sep)| (c, *p, *s, *sep))), + NfaCursor::group_transitions(row.0.iter().map(|(c, sep, p, s)| (c, *sep, *p, *s))), row.1 ); } diff --git a/src/prepare_grammar/expand_tokens.rs b/src/prepare_grammar/expand_tokens.rs index 01b925f9..91a0e364 100644 --- a/src/prepare_grammar/expand_tokens.rs +++ b/src/prepare_grammar/expand_tokens.rs @@ -372,7 +372,7 @@ impl NfaBuilder { mod tests { use super::*; use crate::grammars::Variable; - use crate::nfa::NfaCursor; + use crate::nfa::{NfaCursor, NfaTransition}; fn simulate_nfa<'a>(grammar: &'a LexicalGrammar, s: &'a str) -> Option<(usize, &'a str)> { let start_states = grammar.variables.iter().map(|v| v.start_state).collect(); @@ -389,14 +389,18 @@ mod tests { result_precedence = precedence; } } - if let Some((_, _, next_states, in_sep)) = cursor - .grouped_successors() + if let Some(NfaTransition { + states, + is_separator, + .. + }) = cursor + .transitions() .into_iter() - .find(|(chars, prec, _, _)| chars.contains(c) && *prec >= result_precedence) + .find(|t| t.characters.contains(c) && t.precedence >= result_precedence) { - cursor.reset(next_states); + cursor.reset(states); end_char += 1; - if in_sep { + if is_separator { start_char = end_char; } } else { From 79b9d5ebed3470195e05b50d3f0b42b21cb7c69b Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 4 Jan 2019 11:19:53 -0800 Subject: [PATCH 094/102] Fix minor differences in generated C code --- src/build_tables/build_parse_table.rs | 12 ++++++------ src/render/mod.rs | 20 +++++++++----------- src/rules.rs | 2 +- 3 files changed, 16 insertions(+), 18 deletions(-) diff --git a/src/build_tables/build_parse_table.rs b/src/build_tables/build_parse_table.rs index 9bccf238..5fc015af 100644 --- a/src/build_tables/build_parse_table.rs +++ b/src/build_tables/build_parse_table.rs @@ -531,7 +531,6 @@ impl<'a> ParseTableBuilder<'a> { } fn populate_used_symbols(&mut self) { - self.parse_table.symbols.push(Symbol::end()); let mut terminal_usages = vec![false; self.lexical_grammar.variables.len()]; let mut non_terminal_usages = vec![false; self.syntax_grammar.variables.len()]; let mut external_usages = vec![false; self.syntax_grammar.external_tokens.len()]; @@ -547,16 +546,17 @@ impl<'a> ParseTableBuilder<'a> { non_terminal_usages[symbol.index] = true; } } - for (i, value) in terminal_usages.into_iter().enumerate() { - if value { - self.parse_table.symbols.push(Symbol::terminal(i)); - } - } for (i, value) in external_usages.into_iter().enumerate() { if value { self.parse_table.symbols.push(Symbol::external(i)); } } + self.parse_table.symbols.push(Symbol::end()); + for (i, value) in terminal_usages.into_iter().enumerate() { + if value { + self.parse_table.symbols.push(Symbol::terminal(i)); + } + } for (i, value) in 
non_terminal_usages.into_iter().enumerate() { if value { self.parse_table.symbols.push(Symbol::non_terminal(i)); diff --git a/src/render/mod.rs b/src/render/mod.rs index 0c0e6e59..61c167bb 100644 --- a/src/render/mod.rs +++ b/src/render/mod.rs @@ -232,15 +232,13 @@ impl Generator { add_line!(self, "static const char *ts_symbol_names[] = {{"); indent!(self); for symbol in self.parse_table.symbols.iter() { - if *symbol != Symbol::end() { - let name = self.sanitize_string( - self.simple_aliases - .get(symbol) - .map(|alias| alias.value.as_str()) - .unwrap_or(self.metadata_for_symbol(*symbol).0), - ); - add_line!(self, "[{}] = \"{}\",", self.symbol_ids[&symbol], name); - } + let name = self.sanitize_string( + self.simple_aliases + .get(symbol) + .map(|alias| alias.value.as_str()) + .unwrap_or(self.metadata_for_symbol(*symbol).0), + ); + add_line!(self, "[{}] = \"{}\",", self.symbol_ids[&symbol], name); } for (alias, symbol) in &self.alias_map { if symbol.is_none() { @@ -864,7 +862,7 @@ impl Generator { fn metadata_for_symbol(&self, symbol: Symbol) -> (&str, VariableType) { match symbol.kind { - SymbolType::End => ("end", VariableType::Auxiliary), + SymbolType::End => ("end", VariableType::Hidden), SymbolType::NonTerminal => { let variable = &self.syntax_grammar.variables[symbol.index]; (&variable.name, variable.kind) @@ -950,7 +948,7 @@ impl Generator { fn add_character(&mut self, c: char) { if c.is_ascii() { match c { - '\0' => add!(self, "'\\0'"), + '\0' => add!(self, "0"), '\'' => add!(self, "'\\''"), '\\' => add!(self, "'\\\\'"), '\t' => add!(self, "'\\t'"), diff --git a/src/rules.rs b/src/rules.rs index bd0340fc..e15070ea 100644 --- a/src/rules.rs +++ b/src/rules.rs @@ -3,9 +3,9 @@ use hashbrown::HashMap; #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)] pub(crate) enum SymbolType { External, + End, Terminal, NonTerminal, - End, } #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)] From baf7f3603c5eca1c338be4665d516ff6d189a020 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 4 Jan 2019 11:30:53 -0800 Subject: [PATCH 095/102] Mark fragile tokens --- src/build_tables/mod.rs | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/src/build_tables/mod.rs b/src/build_tables/mod.rs index 78798732..ed47665e 100644 --- a/src/build_tables/mod.rs +++ b/src/build_tables/mod.rs @@ -43,6 +43,11 @@ pub(crate) fn build_tables( &coincident_token_index, &token_conflict_map, ); + mark_fragile_tokens( + &mut parse_table, + lexical_grammar, + &token_conflict_map, + ); if minimize { minimize_parse_table( &mut parse_table, @@ -238,6 +243,34 @@ fn identify_keywords( keywords } +fn mark_fragile_tokens( + parse_table: &mut ParseTable, + lexical_grammar: &LexicalGrammar, + token_conflict_map: &TokenConflictMap, +) { + let n = lexical_grammar.variables.len(); + let mut valid_tokens_mask = Vec::with_capacity(n); + for state in parse_table.states.iter_mut() { + valid_tokens_mask.clear(); + valid_tokens_mask.resize(n, false); + for token in state.terminal_entries.keys() { + if token.is_terminal() { + valid_tokens_mask[token.index] = true; + } + } + for (token, entry) in state.terminal_entries.iter_mut() { + for i in 0..n { + if token_conflict_map.does_overlap(i, token.index) { + if valid_tokens_mask[i] { + entry.reusable = false; + break; + } + } + } + } + } +} + fn all_chars_are_alphabetical(cursor: &NfaCursor) -> bool { cursor.transition_chars().all(|(chars, is_sep)| { if is_sep { From d0c3e26e8409637f4752a4dafe20297fac4420bc Mon Sep 17 00:00:00 
2001 From: Max Brunsfeld Date: Fri, 4 Jan 2019 11:52:52 -0800 Subject: [PATCH 096/102] Don't let lex state merging be fooled by trivial loops --- src/build_tables/build_lex_table.rs | 21 +++++++++++++-------- src/render/mod.rs | 14 +++++++------- src/tables.rs | 2 +- 3 files changed, 21 insertions(+), 16 deletions(-) diff --git a/src/build_tables/build_lex_table.rs b/src/build_tables/build_lex_table.rs index 4212d62b..bcc1bf3d 100644 --- a/src/build_tables/build_lex_table.rs +++ b/src/build_tables/build_lex_table.rs @@ -168,7 +168,7 @@ impl<'a> LexTableBuilder<'a> { self.table.states[state_id].advance_actions.push(( CharacterSet::empty().add_char('\0'), AdvanceAction { - state: next_state_id, + state: Some(next_state_id), in_main_token: true, }, )); @@ -189,10 +189,15 @@ impl<'a> LexTableBuilder<'a> { } } let (next_state_id, _) = self.add_state(states, eof_valid && is_separator); + let next_state = if next_state_id == state_id { + None + } else { + Some(next_state_id) + }; self.table.states[state_id].advance_actions.push(( characters, AdvanceAction { - state: next_state_id, + state: next_state, in_main_token: !is_separator, }, )); @@ -231,10 +236,10 @@ fn minimize_lex_table(table: &mut LexTable, parse_table: &mut ParseTable) { } } for state in table.states.iter_mut() { - for advance_action in state.advance_actions.iter_mut() { - if let Some(new_state_id) = state_replacements.get(&advance_action.1.state) { - advance_action.1.state = *new_state_id; - } + for (_, advance_action) in state.advance_actions.iter_mut() { + advance_action.state = advance_action + .state + .map(|s| state_replacements.get(&s).cloned().unwrap_or(s)) } } } @@ -259,8 +264,8 @@ fn minimize_lex_table(table: &mut LexTable, parse_table: &mut ParseTable) { } for state in table.states.iter_mut() { - for advance_action in state.advance_actions.iter_mut() { - advance_action.1.state = final_state_replacements[advance_action.1.state]; + for (_, advance_action) in state.advance_actions.iter_mut() { + advance_action.state = advance_action.state.map(|s| final_state_replacements[s]); } } diff --git a/src/render/mod.rs b/src/render/mod.rs index 61c167bb..58235fd9 100644 --- a/src/render/mod.rs +++ b/src/render/mod.rs @@ -342,7 +342,7 @@ impl Generator { for (i, state) in lex_table.states.into_iter().enumerate() { add_line!(self, "case {}:", i); indent!(self); - self.add_lex_state(state); + self.add_lex_state(i, state); dedent!(self); } @@ -358,7 +358,7 @@ impl Generator { add_line!(self, ""); } - fn add_lex_state(&mut self, state: LexState) { + fn add_lex_state(&mut self, index: usize, state: LexState) { if let Some(accept_action) = state.accept_action { add_line!(self, "ACCEPT_TOKEN({});", self.symbol_ids[&accept_action]); } @@ -372,14 +372,14 @@ impl Generator { if self.add_character_set_condition(&characters, &ruled_out_characters) { add!(self, ")\n"); indent!(self); - self.add_advance_action(&action); + self.add_advance_action(index, &action); if let CharacterSet::Include(chars) = characters { ruled_out_characters.extend(chars.iter().map(|c| *c as u32)); } dedent!(self); } else { self.buffer.truncate(previous_length); - self.add_advance_action(&action); + self.add_advance_action(index, &action); } } @@ -491,11 +491,11 @@ impl Generator { }) } - fn add_advance_action(&mut self, action: &AdvanceAction) { + fn add_advance_action(&mut self, index: usize, action: &AdvanceAction) { if action.in_main_token { - add_line!(self, "ADVANCE({});", action.state); + add_line!(self, "ADVANCE({});", action.state.unwrap_or(index)); } else { - 
add_line!(self, "SKIP({});", action.state); + add_line!(self, "SKIP({});", action.state.unwrap_or(index)); } } diff --git a/src/tables.rs b/src/tables.rs index f400d25c..c8f7e1e4 100644 --- a/src/tables.rs +++ b/src/tables.rs @@ -48,7 +48,7 @@ pub(crate) struct ParseTable { #[derive(Clone, Debug, PartialEq, Eq)] pub(crate) struct AdvanceAction { - pub state: LexStateId, + pub state: Option, pub in_main_token: bool, } From ba96e4961b9710728e6a9ef02be475e2e942d3ca Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 4 Jan 2019 12:42:45 -0800 Subject: [PATCH 097/102] Simplify error handling, finish up LR conflict message generation --- src/build_tables/build_parse_table.rs | 93 +++++++++++++++++++++++++-- src/error.rs | 17 +++-- src/main.rs | 11 +++- src/prepare_grammar/expand_tokens.rs | 9 +-- src/prepare_grammar/extract_tokens.rs | 12 ++-- src/prepare_grammar/intern_symbols.rs | 14 ++-- 6 files changed, 117 insertions(+), 39 deletions(-) diff --git a/src/build_tables/build_parse_table.rs b/src/build_tables/build_parse_table.rs index 5fc015af..e642c3cd 100644 --- a/src/build_tables/build_parse_table.rs +++ b/src/build_tables/build_parse_table.rs @@ -455,9 +455,9 @@ impl<'a> ParseTableBuilder<'a> { self.symbol_name(&conflicting_lookahead) ) .unwrap(); - write!(&mut msg, "Possible interpretations:\n").unwrap(); + write!(&mut msg, "Possible interpretations:\n\n").unwrap(); for (i, item) in conflicting_items.iter().enumerate() { - write!(&mut msg, "\n {}:", i).unwrap(); + write!(&mut msg, " {}:", i + 1).unwrap(); for preceding_symbol in preceding_symbols .iter() @@ -501,11 +501,89 @@ impl<'a> ParseTableBuilder<'a> { ) .unwrap(); } + + write!(&mut msg, "\n").unwrap(); } - // TODO - generate suggested resolutions + let mut resolution_count = 0; + write!(&mut msg, "\nPossible resolutions:\n\n").unwrap(); + let shift_items = conflicting_items + .iter() + .filter(|i| !i.is_done()) + .cloned() + .collect::>(); + if shift_items.len() > 0 { + resolution_count += 1; + write!( + &mut msg, + " {}: Specify a higher precedence in", + resolution_count + ) + .unwrap(); + for (i, item) in shift_items.iter().enumerate() { + if i > 0 { + write!(&mut msg, " and").unwrap(); + } + write!( + &mut msg, + " `{}`", + self.symbol_name(&Symbol::non_terminal(item.variable_index as usize)) + ) + .unwrap(); + } + write!(&mut msg, " than in the other rules.\n").unwrap(); + } - Err(Error::ConflictError(msg)) + if considered_associativity { + resolution_count += 1; + write!( + &mut msg, + " {}: Specify a left or right associativity in ", + resolution_count + ) + .unwrap(); + for (i, item) in conflicting_items.iter().filter(|i| i.is_done()).enumerate() { + if i > 0 { + write!(&mut msg, " and ").unwrap(); + } + write!( + &mut msg, + "{}", + self.symbol_name(&Symbol::non_terminal(item.variable_index as usize)) + ) + .unwrap(); + } + } + + for item in &conflicting_items { + if item.is_done() { + resolution_count += 1; + write!( + &mut msg, + " {}: Specify a higher precedence in `{}` than in the other rules.\n", + resolution_count, + self.symbol_name(&Symbol::non_terminal(item.variable_index as usize)) + ) + .unwrap(); + } + } + + resolution_count += 1; + write!( + &mut msg, + " {}: Add a conflict for these rules: ", + resolution_count + ) + .unwrap(); + for (i, symbol) in actual_conflict.iter().enumerate() { + if i > 0 { + write!(&mut msg, ", ").unwrap(); + } + write!(&mut msg, "{}", self.symbol_name(symbol)).unwrap(); + } + write!(&mut msg, "\n").unwrap(); + + Err(Error(msg)) } fn get_auxiliary_node_info( @@ -517,8 +595,11 @@ 
impl<'a> ParseTableBuilder<'a> { self.symbol_name(&conflicting_lookahead) ) .unwrap(); - write!(&mut msg, "Possible interpretations:\n").unwrap(); + write!(&mut msg, "Possible interpretations:\n\n").unwrap(); for (i, item) in conflicting_items.iter().enumerate() { - write!(&mut msg, "\n {}:", i).unwrap(); + write!(&mut msg, " {}:", i + 1).unwrap(); for preceding_symbol in preceding_symbols .iter() @@ -501,11 +501,89 @@ impl<'a> ParseTableBuilder<'a> { ) .unwrap(); } + + write!(&mut msg, "\n").unwrap(); } - // TODO - generate suggested resolutions + let mut resolution_count = 0; + write!(&mut msg, "\nPossible resolutions:\n\n").unwrap(); + let shift_items = conflicting_items + .iter() + .filter(|i| !i.is_done()) + .cloned() + .collect::<Vec<_>>(); + if shift_items.len() > 0 { + resolution_count += 1; + write!( + &mut msg, + " {}: Specify a higher precedence in", + resolution_count + ) + .unwrap(); + for (i, item) in shift_items.iter().enumerate() { + if i > 0 { + write!(&mut msg, " and").unwrap(); + } + write!( + &mut msg, + " `{}`", + self.symbol_name(&Symbol::non_terminal(item.variable_index as usize)) + ) + .unwrap(); + } + write!(&mut msg, " than in the other rules.\n").unwrap(); + } - if considered_associativity { + resolution_count += 1; + write!( + &mut msg, + " {}: Specify a left or right associativity in ", + resolution_count + ) + .unwrap(); + for (i, item) in conflicting_items.iter().filter(|i| i.is_done()).enumerate() { + if i > 0 { + write!(&mut msg, " and ").unwrap(); + } + write!( + &mut msg, + "{}", + self.symbol_name(&Symbol::non_terminal(item.variable_index as usize)) + ) + .unwrap(); + } + } + + for item in &conflicting_items { + if item.is_done() { + resolution_count += 1; + write!( + &mut msg, + " {}: Specify a higher precedence in `{}` than in the other rules.\n", + resolution_count, + self.symbol_name(&Symbol::non_terminal(item.variable_index as usize)) + ) + .unwrap(); + } + } + + resolution_count += 1; + write!( + &mut msg, + " {}: Add a conflict for these rules: ", + resolution_count + ) + .unwrap(); + for (i, symbol) in actual_conflict.iter().enumerate() { + if i > 0 { + write!(&mut msg, ", ").unwrap(); + } + write!(&mut msg, "{}", self.symbol_name(symbol)).unwrap(); + } + write!(&mut msg, "\n").unwrap(); + + Err(Error(msg)) } fn get_auxiliary_node_info( @@ -517,8 +595,11 @@ impl<'a> ParseTableBuilder<'a> { .entries .keys() .filter_map(|item| { - if item.symbol() == Some(symbol) { - None + let variable_index = item.variable_index as usize; + if item.symbol() == Some(symbol) + && !self.syntax_grammar.variables[variable_index].is_auxiliary() + { + Some(Symbol::non_terminal(variable_index)) } else { None } diff --git a/src/error.rs b/src/error.rs index b03efa93..9a5801f8 100644 --- a/src/error.rs +++ b/src/error.rs @@ -1,25 +1,24 @@ #[derive(Debug)] -pub enum Error { - GrammarError(String), - SymbolError(String), - RegexError(String), - ConflictError(String), -} +pub struct Error(pub String); pub type Result<T> = std::result::Result<T, Error>; impl Error { pub fn grammar(message: &str) -> Self { - Error::GrammarError(message.to_string()) + Error(format!("Grammar error: {}", message)) } pub fn regex(message: &str) -> Self { - Error::RegexError(message.to_string()) + Error(format!("Regex error: {}", message)) + } + + pub fn undefined_symbol(name: &str) -> Self { + Error(format!("Undefined symbol `{}`", name)) } } impl From<serde_json::Error> for Error { fn from(error: serde_json::Error) -> Self { - Error::GrammarError(error.to_string()) + Error(error.to_string()) } } diff --git a/src/main.rs b/src/main.rs index 10820ed1..c3dbf33d 100644 --- a/src/main.rs +++ b/src/main.rs @@ -11,7 +11,7 @@ use clap::{App, Arg, SubCommand}; use std::env; use std::io::Write; use std::path::PathBuf; -use std::process::{Command, Stdio}; +use std::process::{exit, Command, Stdio}; mod build_tables; mod error; @@ -25,7 +25,14 @@ mod render; mod rules; mod tables; -fn main() -> error::Result<()> { +fn main() { + if let Err(e) = run() { + eprintln!("{}", e.0); + exit(1); + } +} + +fn run() -> error::Result<()> { let matches = App::new("tree-sitter") .version("0.1") .author("Max Brunsfeld ") diff --git a/src/prepare_grammar/expand_tokens.rs b/src/prepare_grammar/expand_tokens.rs index 91a0e364..2678df19 100644 --- a/src/prepare_grammar/expand_tokens.rs +++ b/src/prepare_grammar/expand_tokens.rs @@ -64,12 +64,7 @@ pub(crate) fn expand_tokens(mut grammar: ExtractedLexicalGrammar) -> Result<LexicalGrammar> { - Error::RegexError(format!("Rule {} {}", variable.name, msg)) - } - _ => e, - })?; + .map_err(|Error(msg)| Error(format!("Rule {} {}", variable.name, msg)))?; if !is_immediate_token { builder.is_sep = true; @@ -97,7 +92,7 @@ impl NfaBuilder { Rule::Pattern(s) => { let ast = parse::Parser::new() .parse(&s) - .map_err(|e| Error::GrammarError(e.to_string()))?; + .map_err(|e| Error(e.to_string()))?; self.expand_regex(&ast, next_state_id) } Rule::String(s) => { diff --git a/src/prepare_grammar/extract_tokens.rs b/src/prepare_grammar/extract_tokens.rs index 115933ee..5a54d34e 100644 --- a/src/prepare_grammar/extract_tokens.rs +++ b/src/prepare_grammar/extract_tokens.rs @@ -89,7 +89,7 @@ pub(super) fn extract_tokens( if let Rule::Symbol(symbol) = rule { let new_symbol = symbol_replacer.replace_symbol(symbol); if new_symbol.is_non_terminal() { - return Err(Error::GrammarError(format!( +
return Err(Error::GrammarError(format!( + return Err(Error(format!( "Non-symbol rules cannot be used as external tokens" ))); } @@ -140,7 +140,7 @@ pub(super) fn extract_tokens( if let Some(token) = grammar.word_token { let token = symbol_replacer.replace_symbol(token); if token.is_non_terminal() { - return Err(Error::GrammarError(format!( + return Err(Error(format!( "Non-terminal symbol '{}' cannot be used as the word token", &variables[token.index].name ))); @@ -475,7 +475,7 @@ mod test { grammar.extra_tokens = vec![Rule::non_terminal(1)]; match extract_tokens(grammar) { - Err(Error::GrammarError(s)) => { + Err(Error(s)) => { assert_eq!( s, "Non-token symbol 'rule_1' cannot be used as an extra token" @@ -503,7 +503,7 @@ mod test { grammar.external_tokens = vec![Variable::named("rule_1", Rule::non_terminal(1))]; match extract_tokens(grammar) { - Err(Error::GrammarError(s)) => { + Err(Error(s)) => { assert_eq!(s, "Rule 'rule_1' cannot be used as both an external token and a non-terminal rule"); } _ => { diff --git a/src/prepare_grammar/intern_symbols.rs b/src/prepare_grammar/intern_symbols.rs index 5165875c..2e6f5b1c 100644 --- a/src/prepare_grammar/intern_symbols.rs +++ b/src/prepare_grammar/intern_symbols.rs @@ -7,7 +7,7 @@ pub(super) fn intern_symbols(grammar: &InputGrammar) -> Result let interner = Interner { grammar }; if variable_type_for_name(&grammar.variables[0].name) == VariableType::Hidden { - return Err(Error::GrammarError( + return Err(Error( "Grammar's start rule must be visible".to_string(), )); } @@ -44,7 +44,7 @@ pub(super) fn intern_symbols(grammar: &InputGrammar) -> Result interned_conflict.push( interner .intern_name(&name) - .ok_or_else(|| symbol_error(name))?, + .ok_or_else(|| Error::undefined_symbol(name))?, ); } expected_conflicts.push(interned_conflict); @@ -62,7 +62,7 @@ pub(super) fn intern_symbols(grammar: &InputGrammar) -> Result word_token = Some( interner .intern_name(&name) - .ok_or_else(|| symbol_error(&name))?, + .ok_or_else(|| Error::undefined_symbol(&name))?, ); } @@ -107,7 +107,7 @@ impl<'a> Interner<'a> { if let Some(symbol) = self.intern_name(&name) { Ok(Rule::Symbol(symbol)) } else { - Err(symbol_error(name)) + Err(Error::undefined_symbol(name)) } } @@ -134,10 +134,6 @@ impl<'a> Interner<'a> { } } -fn symbol_error(name: &str) -> Error { - Error::SymbolError(format!("Undefined symbol '{}'", name)) -} - fn variable_type_for_name(name: &str) -> VariableType { if name.starts_with("_") { VariableType::Hidden @@ -223,7 +219,7 @@ mod tests { let result = intern_symbols(&build_grammar(vec![Variable::named("x", Rule::named("y"))])); match result { - Err(Error::SymbolError(message)) => assert_eq!(message, "Undefined symbol 'y'"), + Err(Error(message)) => assert_eq!(message, "Undefined symbol 'y'"), _ => panic!("Expected an error but got none"), } } From a0e65018ba8282fc8c77734092618e87cfb8cf2d Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 4 Jan 2019 13:01:07 -0800 Subject: [PATCH 098/102] Fix computation of MAX_ALIAS_SEQUENCE_LENGTH --- src/build_tables/build_parse_table.rs | 6 +++++- src/render/mod.rs | 10 ++-------- src/tables.rs | 1 + 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/src/build_tables/build_parse_table.rs b/src/build_tables/build_parse_table.rs index e642c3cd..7fb668dd 100644 --- a/src/build_tables/build_parse_table.rs +++ b/src/build_tables/build_parse_table.rs @@ -675,6 +675,9 @@ impl<'a> ParseTableBuilder<'a> { while alias_sequence.last() == Some(&None) { alias_sequence.pop(); } + if item.production.steps.len() > 
self.parse_table.max_aliased_production_length { + self.parse_table.max_aliased_production_length = item.production.steps.len() + } if let Some(index) = self .parse_table .alias_sequences @@ -721,8 +724,9 @@ pub(crate) fn build_parse_table( parse_state_queue: VecDeque::new(), parse_table: ParseTable { states: Vec::new(), - alias_sequences: Vec::new(), symbols: Vec::new(), + alias_sequences: Vec::new(), + max_aliased_production_length: 0, }, following_tokens: vec![LookaheadSet::new(); lexical_grammar.variables.len()], } diff --git a/src/render/mod.rs b/src/render/mod.rs index 58235fd9..8d3ee195 100644 --- a/src/render/mod.rs +++ b/src/render/mod.rs @@ -191,17 +191,11 @@ impl Generator { "#define EXTERNAL_TOKEN_COUNT {}", self.syntax_grammar.external_tokens.len() ); - if let Some(max_alias_sequence_length) = self - .parse_table - .alias_sequences - .iter() - .map(|seq| seq.len()) - .max() - { + if self.parse_table.max_aliased_production_length > 0 { add_line!( self, "#define MAX_ALIAS_SEQUENCE_LENGTH {}", - max_alias_sequence_length + self.parse_table.max_aliased_production_length ); } add_line!(self, ""); diff --git a/src/tables.rs b/src/tables.rs index c8f7e1e4..edbbaaab 100644 --- a/src/tables.rs +++ b/src/tables.rs @@ -44,6 +44,7 @@ pub(crate) struct ParseTable { pub states: Vec<ParseState>, pub symbols: Vec<Symbol>, pub alias_sequences: Vec<Vec<Option<Alias>>>, + pub max_aliased_production_length: usize, } #[derive(Clone, Debug, PartialEq, Eq)] From 3a727af2645fb41d3f2151d1b1b4893232e49c06 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 4 Jan 2019 15:26:48 -0800 Subject: [PATCH 099/102] Add flag for logging the item set associated with a certain parse state --- src/build_tables/build_parse_table.rs | 43 ++++++++++++++------------- src/build_tables/item.rs | 18 ++++++++++- src/build_tables/mod.rs | 3 +- src/generate.rs | 16 ++++++---- src/main.rs | 15 +++++++++- 5 files changed, 66 insertions(+), 29 deletions(-) diff --git a/src/build_tables/build_parse_table.rs b/src/build_tables/build_parse_table.rs index 7fb668dd..cda1d7ea 100644 --- a/src/build_tables/build_parse_table.rs +++ b/src/build_tables/build_parse_table.rs @@ -39,6 +39,7 @@ struct ParseTableBuilder<'a> { parse_state_queue: VecDeque, parse_table: ParseTable, following_tokens: Vec<LookaheadSet>, + state_ids_to_log: Vec<usize>, } impl<'a> ParseTableBuilder<'a> { @@ -64,29 +65,26 @@ impl<'a> ParseTableBuilder<'a> { ); while let Some(entry) = self.parse_state_queue.pop_front() { - // info!( - // "state: {}, item set: {}", - // entry.state_id, - // super::item::ParseItemSetDisplay( - // &self.item_sets_by_state_id[entry.state_id], - // self.syntax_grammar, - // self.lexical_grammar, - // ) - // ); - let item_set = self .item_set_builder .transitive_closure(&self.item_sets_by_state_id[entry.state_id]); - // info!( - // "state: {}, closed item set: {}", - // entry.state_id, - // super::item::ParseItemSetDisplay( - // &item_set, - // self.syntax_grammar, - // self.lexical_grammar, - // ) - // ); + if self.state_ids_to_log.contains(&entry.state_id) { + eprintln!( + "state: {}\n\ninitial item set:\n\n{}closed item set:\n\n{}", + entry.state_id, + super::item::ParseItemSetDisplay( + &self.item_sets_by_state_id[entry.state_id], + self.syntax_grammar, + self.lexical_grammar, + ), + super::item::ParseItemSetDisplay( + &item_set, + self.syntax_grammar, + self.lexical_grammar, + ) + ); + } self.add_actions( entry.preceding_symbols, entry.preceding_auxiliary_symbols, @@ -553,6 +551,7 @@ impl<'a> ParseTableBuilder<'a> { resolution_count += 1; write!( &mut msg, - " {}: Specify a higher precedence in `{}` than in the other rules.\n", + " {}: Specify a higher precedence in `{}` than in the other rules.\n", resolution_count, self.symbol_name(&Symbol::non_terminal(item.variable_index as usize)) ) @@ -571,7 +570,7 @@ impl<'a> ParseTableBuilder<'a> { resolution_count += 1; write!( &mut msg, - " {}: Add a conflict for these rules: ", + " {}: Add a conflict for these rules: ", resolution_count ) .unwrap(); @@ -714,10 +713,12 @@ pub(crate) fn build_parse_table( syntax_grammar: &SyntaxGrammar, lexical_grammar: &LexicalGrammar, inlines: &InlinedProductionMap, + state_ids_to_log: Vec<usize>, ) -> Result<(ParseTable, Vec<LookaheadSet>)> { ParseTableBuilder { syntax_grammar, lexical_grammar, + state_ids_to_log, item_set_builder: ParseItemSetBuilder::new(syntax_grammar, lexical_grammar, inlines), state_ids_by_item_set: HashMap::new(), item_sets_by_state_id: Vec::new(), diff --git a/src/build_tables/item.rs b/src/build_tables/item.rs index d1d0cbbf..bbd5bbfa 100644 --- a/src/build_tables/item.rs +++ b/src/build_tables/item.rs @@ -45,7 +45,7 @@ pub(crate) struct ParseItemSet<'a> { pub(crate) struct ParseItemDisplay<'a>( pub &'a ParseItem<'a>, pub &'a SyntaxGrammar, - pub &'a LexicalGrammar + pub &'a LexicalGrammar, ); pub(crate) struct LookaheadSetDisplay<'a>(&'a LookaheadSet, &'a SyntaxGrammar, &'a LexicalGrammar); @@ -252,6 +252,13 @@ impl<'a> fmt::Display for ParseItemDisplay<'a> { for (i, step) in self.0.production.steps.iter().enumerate() { if i == self.0.step_index as usize { write!(f, " •")?; + if step.precedence != 0 || step.associativity.is_some() { + write!( + f, + " (prec {:?} assoc {:?})", + step.precedence, step.associativity + )?; + } } write!(f, " ")?; @@ -274,6 +281,15 @@ impl<'a> fmt::Display for ParseItemDisplay<'a> { if self.0.is_done() { write!(f, " •")?; + if let Some(step) = self.0.production.steps.last() { + if step.precedence != 0 || step.associativity.is_some() { + write!( + f, + " (prec {:?} assoc {:?})", + step.precedence, step.associativity + )?; + } + } } Ok(()) } diff --git a/src/build_tables/mod.rs b/src/build_tables/mod.rs index ed47665e..04b750e3 100644 --- a/src/build_tables/mod.rs +++ b/src/build_tables/mod.rs @@ -24,9 +24,10 @@ pub(crate) fn build_tables( simple_aliases: &AliasMap, inlines: &InlinedProductionMap, minimize: bool, + state_ids_to_log: Vec<usize>, ) -> Result<(ParseTable, LexTable, LexTable, Option<Symbol>)> { let (mut parse_table, following_tokens) = - build_parse_table(syntax_grammar, lexical_grammar, inlines)?; + build_parse_table(syntax_grammar, lexical_grammar, inlines, state_ids_to_log)?; let token_conflict_map = TokenConflictMap::new(lexical_grammar, following_tokens); let coincident_token_index = CoincidentTokenIndex::new(&parse_table, lexical_grammar); let keywords = identify_keywords( diff --git a/src/generate.rs b/src/generate.rs index d574c165..aa8f3b5b 100644 --- a/src/generate.rs +++ b/src/generate.rs @@ -1,18 +1,24 @@ +use crate::build_tables::build_tables; use crate::error::Result; use crate::parse_grammar::parse_grammar; use crate::prepare_grammar::prepare_grammar; -use crate::build_tables::build_tables; use crate::render::render_c_code; -pub fn generate_parser_for_grammar(input: &str, minimize: bool) -> Result<String> { +pub fn generate_parser_for_grammar( + input: &str, + minimize: bool, + state_ids_to_log: Vec<usize>, +) -> Result<String> { let input_grammar = parse_grammar(input)?; - let (syntax_grammar, lexical_grammar, inlines, simple_aliases) = prepare_grammar(&input_grammar)?; 
+ let (syntax_grammar, lexical_grammar, inlines, simple_aliases) = + prepare_grammar(&input_grammar)?; let (parse_table, main_lex_table, keyword_lex_table, keyword_capture_token) = build_tables( &syntax_grammar, &lexical_grammar, &simple_aliases, &inlines, - minimize + minimize, + state_ids_to_log, )?; let c_code = render_c_code( &input_grammar.name, @@ -22,7 +28,7 @@ pub fn generate_parser_for_grammar(input: &str, minimize: bool) -> Result error::Result<()> { SubCommand::with_name("generate") .about("Generate a parser") .arg(Arg::with_name("log").long("log")) + .arg( + Arg::with_name("state-ids-to-log") + .long("log-state") + .takes_value(true), + ) .arg(Arg::with_name("no-minimize").long("no-minimize")), ) .subcommand( @@ -63,10 +69,17 @@ fn run() -> error::Result<()> { } let minimize = !matches.is_present("no-minimize"); + let state_ids_to_log = matches + .values_of("state-ids-to-log") + .map_or(Vec::new(), |ids| { + ids.filter_map(|id| usize::from_str_radix(id, 10).ok()) + .collect() + }); let mut grammar_path = env::current_dir().expect("Failed to read CWD"); grammar_path.push("grammar.js"); let grammar_json = load_js_grammar_file(grammar_path); - let code = generate::generate_parser_for_grammar(&grammar_json, minimize)?; + let code = + generate::generate_parser_for_grammar(&grammar_json, minimize, state_ids_to_log)?; println!("{}", code); } From d8f8bd288eece27626c02407054b454b8102b7f8 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 4 Jan 2019 15:27:15 -0800 Subject: [PATCH 100/102] Fix error in code generation w/ tokens that are internal and external --- src/render/mod.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/render/mod.rs b/src/render/mod.rs index 8d3ee195..36429848 100644 --- a/src/render/mod.rs +++ b/src/render/mod.rs @@ -561,11 +561,13 @@ impl Generator { ); indent!(self); for i in 0..self.syntax_grammar.external_tokens.len() { + let token = &self.syntax_grammar.external_tokens[i]; + let id_token = token.corresponding_internal_token.unwrap_or(Symbol::external(i)); add_line!( self, "[{}] = {},", - self.external_token_id(&self.syntax_grammar.external_tokens[i]), - self.symbol_ids[&Symbol::external(i)], + self.external_token_id(&token), + self.symbol_ids[&id_token], ); } dedent!(self); From b8dd5d2640f2011d016d0dfd750e804824771c68 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 4 Jan 2019 15:27:35 -0800 Subject: [PATCH 101/102] Fix handling of precedence and associativity with inlining --- src/prepare_grammar/process_inlines.rs | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/prepare_grammar/process_inlines.rs b/src/prepare_grammar/process_inlines.rs index 9fd2f2c6..557b0fa4 100644 --- a/src/prepare_grammar/process_inlines.rs +++ b/src/prepare_grammar/process_inlines.rs @@ -90,7 +90,6 @@ impl InlinedProductionMapBuilder { while i < productions_to_add.len() { if let Some(step) = productions_to_add[i].steps.get(step_index) { let symbol = step.symbol.clone(); - if grammar.variables_to_inline.contains(&symbol) { // Remove the production from the vector, replacing it with a placeholder. 
From d8f8bd288eece27626c02407054b454b8102b7f8 Mon Sep 17 00:00:00 2001
From: Max Brunsfeld
Date: Fri, 4 Jan 2019 15:27:15 -0800
Subject: [PATCH 100/102] Fix error in code generation w/ tokens that are
 internal and external

---
 src/render/mod.rs | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/render/mod.rs b/src/render/mod.rs
index 8d3ee195..36429848 100644
--- a/src/render/mod.rs
+++ b/src/render/mod.rs
@@ -561,11 +561,13 @@ impl Generator {
         );
         indent!(self);
         for i in 0..self.syntax_grammar.external_tokens.len() {
+            let token = &self.syntax_grammar.external_tokens[i];
+            let id_token = token.corresponding_internal_token.unwrap_or(Symbol::external(i));
             add_line!(
                 self,
                 "[{}] = {},",
-                self.external_token_id(&self.syntax_grammar.external_tokens[i]),
-                self.symbol_ids[&Symbol::external(i)],
+                self.external_token_id(&token),
+                self.symbol_ids[&id_token],
             );
         }
         dedent!(self);
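The fix above targets tokens declared both in the grammar itself and in an external scanner: the generated external-token map must reuse the internal symbol's ID rather than minting a second one. A reduced sketch of that lookup rule, with ExternalToken and Symbol as simplified stand-ins for the real types:

    #[derive(Clone, Copy, Debug, PartialEq)]
    enum Symbol {
        External(usize),
        Terminal(usize),
    }

    struct ExternalToken {
        corresponding_internal_token: Option<Symbol>,
    }

    // The rule the patch installs: prefer the internal symbol when one exists,
    // otherwise fall back to the external symbol for this index.
    fn id_symbol(token: &ExternalToken, index: usize) -> Symbol {
        token.corresponding_internal_token.unwrap_or(Symbol::External(index))
    }

    fn main() {
        let shared = ExternalToken { corresponding_internal_token: Some(Symbol::Terminal(7)) };
        let external_only = ExternalToken { corresponding_internal_token: None };
        assert_eq!(id_symbol(&shared, 0), Symbol::Terminal(7));
        assert_eq!(id_symbol(&external_only, 1), Symbol::External(1));
    }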
From b8dd5d2640f2011d016d0dfd750e804824771c68 Mon Sep 17 00:00:00 2001
From: Max Brunsfeld
Date: Fri, 4 Jan 2019 15:27:35 -0800
Subject: [PATCH 101/102] Fix handling of precedence and associativity with
 inlining

---
 src/prepare_grammar/process_inlines.rs | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/prepare_grammar/process_inlines.rs b/src/prepare_grammar/process_inlines.rs
index 9fd2f2c6..557b0fa4 100644
--- a/src/prepare_grammar/process_inlines.rs
+++ b/src/prepare_grammar/process_inlines.rs
@@ -90,7 +90,6 @@ impl InlinedProductionMapBuilder {
         while i < productions_to_add.len() {
             if let Some(step) = productions_to_add[i].steps.get(step_index) {
                 let symbol = step.symbol.clone();
-
                 if grammar.variables_to_inline.contains(&symbol) {
                     // Remove the production from the vector, replacing it with a placeholder.
                     let production = productions_to_add
@@ -116,8 +115,12 @@ impl InlinedProductionMapBuilder {
                         }
                     }
                     if let Some(last_inserted_step) = inserted_steps.last_mut() {
-                        last_inserted_step.precedence = removed_step.precedence;
-                        last_inserted_step.associativity = removed_step.associativity;
+                        if last_inserted_step.precedence == 0 {
+                            last_inserted_step.precedence = removed_step.precedence;
+                        }
+                        if last_inserted_step.associativity == None {
+                            last_inserted_step.associativity = removed_step.associativity;
+                        }
                     }
                     production
                 }),
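Before this patch, inlining unconditionally overwrote the last inserted step's precedence and associativity, clobbering values the inlined production had set itself. The guarded assignments above copy a value only when the inlined step left it unset. A minimal sketch of the corrected merge rule, with Step and Associativity again as simplified stand-ins:

    #[derive(Clone, Copy, Debug, PartialEq)]
    #[allow(dead_code)]
    enum Associativity { Left, Right }

    #[derive(Clone, Copy, Debug, PartialEq)]
    struct Step {
        precedence: i32,
        associativity: Option<Associativity>,
    }

    // Only fill in values the inlined step left unset (0 / None), as in the patch.
    fn merge_outer_step(last_inserted: &mut Step, removed: &Step) {
        if last_inserted.precedence == 0 {
            last_inserted.precedence = removed.precedence;
        }
        if last_inserted.associativity.is_none() {
            last_inserted.associativity = removed.associativity;
        }
    }

    fn main() {
        // The inlined production already declared prec 2; the replaced outer
        // step's prec 1 must not clobber it, but its associativity fills the gap.
        let mut inner = Step { precedence: 2, associativity: None };
        let outer = Step { precedence: 1, associativity: Some(Associativity::Left) };
        merge_outer_step(&mut inner, &outer);
        assert_eq!(inner, Step { precedence: 2, associativity: Some(Associativity::Left) });
    }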
From 5b0e12ea332ebe231ba103b078f832f2ee2148c5 Mon Sep 17 00:00:00 2001
From: Max Brunsfeld
Date: Fri, 4 Jan 2019 16:50:52 -0800
Subject: [PATCH 102/102] Move code into cli directory

---
 Cargo.toml                                    | 31 +++++----------
 cli/Cargo.toml                                | 27 ++++++++++++++++
 .../src}/build_tables/build_lex_table.rs      |  0
 .../src}/build_tables/build_parse_table.rs    |  0
 .../src}/build_tables/coincident_tokens.rs    |  0
 {src => cli/src}/build_tables/item.rs         |  0
 .../src}/build_tables/item_set_builder.rs     |  0
 .../src}/build_tables/minimize_parse_table.rs |  0
 {src => cli/src}/build_tables/mod.rs          |  0
 .../src}/build_tables/token_conflicts.rs      |  0
 {src => cli/src}/error.rs                     |  0
 {src => cli/src}/generate.rs                  |  0
 {src => cli/src}/grammars.rs                  |  0
 {src => cli/src}/js/dsl.js                    |  0
 {src => cli/src}/logger.rs                    |  0
 {src => cli/src}/main.rs                      |  0
 {src => cli/src}/nfa.rs                       |  0
 {src => cli/src}/parse_grammar.rs             |  0
 .../src}/prepare_grammar/expand_repeats.rs    |  0
 .../src}/prepare_grammar/expand_tokens.rs     |  0
 .../prepare_grammar/extract_simple_aliases.rs |  0
 .../src}/prepare_grammar/extract_tokens.rs    |  0
 .../src}/prepare_grammar/flatten_grammar.rs   |  0
 .../src}/prepare_grammar/intern_symbols.rs    |  0
 {src => cli/src}/prepare_grammar/mod.rs       |  0
 .../src}/prepare_grammar/process_inlines.rs   |  0
 {src => cli/src}/render/mod.rs                |  0
 {src => cli/src}/rules.rs                     |  0
 {src => cli/src}/tables.rs                    |  0
 29 files changed, 32 insertions(+), 26 deletions(-)
 create mode 100644 cli/Cargo.toml
 rename {src => cli/src}/build_tables/build_lex_table.rs (100%)
 rename {src => cli/src}/build_tables/build_parse_table.rs (100%)
 rename {src => cli/src}/build_tables/coincident_tokens.rs (100%)
 rename {src => cli/src}/build_tables/item.rs (100%)
 rename {src => cli/src}/build_tables/item_set_builder.rs (100%)
 rename {src => cli/src}/build_tables/minimize_parse_table.rs (100%)
 rename {src => cli/src}/build_tables/mod.rs (100%)
 rename {src => cli/src}/build_tables/token_conflicts.rs (100%)
 rename {src => cli/src}/error.rs (100%)
 rename {src => cli/src}/generate.rs (100%)
 rename {src => cli/src}/grammars.rs (100%)
 rename {src => cli/src}/js/dsl.js (100%)
 rename {src => cli/src}/logger.rs (100%)
 rename {src => cli/src}/main.rs (100%)
 rename {src => cli/src}/nfa.rs (100%)
 rename {src => cli/src}/parse_grammar.rs (100%)
 rename {src => cli/src}/prepare_grammar/expand_repeats.rs (100%)
 rename {src => cli/src}/prepare_grammar/expand_tokens.rs (100%)
 rename {src => cli/src}/prepare_grammar/extract_simple_aliases.rs (100%)
 rename {src => cli/src}/prepare_grammar/extract_tokens.rs (100%)
 rename {src => cli/src}/prepare_grammar/flatten_grammar.rs (100%)
 rename {src => cli/src}/prepare_grammar/intern_symbols.rs (100%)
 rename {src => cli/src}/prepare_grammar/mod.rs (100%)
 rename {src => cli/src}/prepare_grammar/process_inlines.rs (100%)
 rename {src => cli/src}/render/mod.rs (100%)
 rename {src => cli/src}/rules.rs (100%)
 rename {src => cli/src}/tables.rs (100%)

diff --git a/Cargo.toml b/Cargo.toml
index 29b10e17..75d3b403 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,27 +1,6 @@
-[package]
-name = "rust-tree-sitter-cli"
-version = "0.1.0"
-authors = ["Max Brunsfeld "]
-edition = "2018"
+[workspace]
 
-[dependencies]
-lazy_static = "1.2.0"
-smallbitvec = "2.3.0"
-clap = "2.32"
-dirs = "1.0.2"
-hashbrown = "0.1"
-ignore = "0.4.4"
-libloading = "0.5"
-rusqlite = "0.14.0"
-serde = "1.0"
-serde_derive = "1.0"
-tree-sitter = "0.3.1"
-regex-syntax = "0.6.4"
-
-[dependencies.serde_json]
-version = "1.0"
-features = ["preserve_order"]
-
-[dependencies.log]
-version = "0.4.6"
-features = ["std"]
+members = [
+  "cli",
+  "lib",
+]
diff --git a/cli/Cargo.toml b/cli/Cargo.toml
new file mode 100644
index 00000000..29b10e17
--- /dev/null
+++ b/cli/Cargo.toml
@@ -0,0 +1,27 @@
+[package]
+name = "rust-tree-sitter-cli"
+version = "0.1.0"
+authors = ["Max Brunsfeld "]
+edition = "2018"
+
+[dependencies]
+lazy_static = "1.2.0"
+smallbitvec = "2.3.0"
+clap = "2.32"
+dirs = "1.0.2"
+hashbrown = "0.1"
+ignore = "0.4.4"
+libloading = "0.5"
+rusqlite = "0.14.0"
+serde = "1.0"
+serde_derive = "1.0"
+tree-sitter = "0.3.1"
+regex-syntax = "0.6.4"
+
+[dependencies.serde_json]
+version = "1.0"
+features = ["preserve_order"]
+
+[dependencies.log]
+version = "0.4.6"
+features = ["std"]
diff --git a/src/build_tables/build_lex_table.rs b/cli/src/build_tables/build_lex_table.rs
similarity index 100%
rename from src/build_tables/build_lex_table.rs
rename to cli/src/build_tables/build_lex_table.rs
diff --git a/src/build_tables/build_parse_table.rs b/cli/src/build_tables/build_parse_table.rs
similarity index 100%
rename from src/build_tables/build_parse_table.rs
rename to cli/src/build_tables/build_parse_table.rs
diff --git a/src/build_tables/coincident_tokens.rs b/cli/src/build_tables/coincident_tokens.rs
similarity index 100%
rename from src/build_tables/coincident_tokens.rs
rename to cli/src/build_tables/coincident_tokens.rs
diff --git a/src/build_tables/item.rs b/cli/src/build_tables/item.rs
similarity index 100%
rename from src/build_tables/item.rs
rename to cli/src/build_tables/item.rs
diff --git a/src/build_tables/item_set_builder.rs b/cli/src/build_tables/item_set_builder.rs
similarity index 100%
rename from src/build_tables/item_set_builder.rs
rename to cli/src/build_tables/item_set_builder.rs
diff --git a/src/build_tables/minimize_parse_table.rs b/cli/src/build_tables/minimize_parse_table.rs
similarity index 100%
rename from src/build_tables/minimize_parse_table.rs
rename to cli/src/build_tables/minimize_parse_table.rs
diff --git a/src/build_tables/mod.rs b/cli/src/build_tables/mod.rs
similarity index 100%
rename from src/build_tables/mod.rs
rename to cli/src/build_tables/mod.rs
diff --git a/src/build_tables/token_conflicts.rs b/cli/src/build_tables/token_conflicts.rs
similarity index 100%
rename from src/build_tables/token_conflicts.rs
rename to cli/src/build_tables/token_conflicts.rs
diff --git a/src/error.rs b/cli/src/error.rs
similarity index 100%
rename from src/error.rs
rename to cli/src/error.rs
diff --git a/src/generate.rs b/cli/src/generate.rs
similarity index 100%
rename from src/generate.rs
rename to cli/src/generate.rs
diff --git a/src/grammars.rs b/cli/src/grammars.rs
similarity index 100%
rename from src/grammars.rs
rename to cli/src/grammars.rs
diff --git a/src/js/dsl.js b/cli/src/js/dsl.js
similarity index 100%
rename from src/js/dsl.js
rename to cli/src/js/dsl.js
diff --git a/src/logger.rs b/cli/src/logger.rs
similarity index 100%
rename from src/logger.rs
rename to cli/src/logger.rs
diff --git a/src/main.rs b/cli/src/main.rs
similarity index 100%
rename from src/main.rs
rename to cli/src/main.rs
diff --git a/src/nfa.rs b/cli/src/nfa.rs
similarity index 100%
rename from src/nfa.rs
rename to cli/src/nfa.rs
diff --git a/src/parse_grammar.rs b/cli/src/parse_grammar.rs
similarity index 100%
rename from src/parse_grammar.rs
rename to cli/src/parse_grammar.rs
diff --git a/src/prepare_grammar/expand_repeats.rs b/cli/src/prepare_grammar/expand_repeats.rs
similarity index 100%
rename from src/prepare_grammar/expand_repeats.rs
rename to cli/src/prepare_grammar/expand_repeats.rs
diff --git a/src/prepare_grammar/expand_tokens.rs b/cli/src/prepare_grammar/expand_tokens.rs
similarity index 100%
rename from src/prepare_grammar/expand_tokens.rs
rename to cli/src/prepare_grammar/expand_tokens.rs
diff --git a/src/prepare_grammar/extract_simple_aliases.rs b/cli/src/prepare_grammar/extract_simple_aliases.rs
similarity index 100%
rename from src/prepare_grammar/extract_simple_aliases.rs
rename to cli/src/prepare_grammar/extract_simple_aliases.rs
diff --git a/src/prepare_grammar/extract_tokens.rs b/cli/src/prepare_grammar/extract_tokens.rs
similarity index 100%
rename from src/prepare_grammar/extract_tokens.rs
rename to cli/src/prepare_grammar/extract_tokens.rs
diff --git a/src/prepare_grammar/flatten_grammar.rs b/cli/src/prepare_grammar/flatten_grammar.rs
similarity index 100%
rename from src/prepare_grammar/flatten_grammar.rs
rename to cli/src/prepare_grammar/flatten_grammar.rs
diff --git a/src/prepare_grammar/intern_symbols.rs b/cli/src/prepare_grammar/intern_symbols.rs
similarity index 100%
rename from src/prepare_grammar/intern_symbols.rs
rename to cli/src/prepare_grammar/intern_symbols.rs
diff --git a/src/prepare_grammar/mod.rs b/cli/src/prepare_grammar/mod.rs
similarity index 100%
rename from src/prepare_grammar/mod.rs
rename to cli/src/prepare_grammar/mod.rs
diff --git a/src/prepare_grammar/process_inlines.rs b/cli/src/prepare_grammar/process_inlines.rs
similarity index 100%
rename from src/prepare_grammar/process_inlines.rs
rename to cli/src/prepare_grammar/process_inlines.rs
diff --git a/src/render/mod.rs b/cli/src/render/mod.rs
similarity index 100%
rename from src/render/mod.rs
rename to cli/src/render/mod.rs
diff --git a/src/rules.rs b/cli/src/rules.rs
similarity index 100%
rename from src/rules.rs
rename to cli/src/rules.rs
diff --git a/src/tables.rs b/cli/src/tables.rs
similarity index 100%
rename from src/tables.rs
rename to cli/src/tables.rs
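With the crate split into a Cargo workspace (the `lib` member listed above is not populated by this patch), the CLI builds from the repository root. A usage sketch, assuming the package name from cli/Cargo.toml and the subcommands shown earlier:

    cargo build -p rust-tree-sitter-cli
    cargo run -p rust-tree-sitter-cli -- generate --log-state 12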