From 6e4115548c1982a764ca22f819544455ca9f7807 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Sun, 10 Jul 2016 14:03:00 -0700 Subject: [PATCH 001/208] Initial commit --- .gitignore | 2 + .gitmodules | 3 + Cargo.toml | 15 ++ build.rs | 35 +++++ script/bindgen.sh | 16 +++ src/ffi.rs | 333 +++++++++++++++++++++++++++++++++++++++++++++ src/lib.rs | 329 ++++++++++++++++++++++++++++++++++++++++++++ vendor/tree-sitter | 1 + 8 files changed, 734 insertions(+) create mode 100644 .gitignore create mode 100644 .gitmodules create mode 100644 Cargo.toml create mode 100644 build.rs create mode 100755 script/bindgen.sh create mode 100644 src/ffi.rs create mode 100644 src/lib.rs create mode 160000 vendor/tree-sitter diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..a9d37c56 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +target +Cargo.lock diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 00000000..eef86f94 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "vendor/tree-sitter"] + path = vendor/tree-sitter + url = https://github.com/tree-sitter/tree-sitter diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 00000000..0a93febe --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,15 @@ +[package] +name = "tree-sitter" +version = "0.1.0" +authors = ["Max Brunsfeld "] +build = "build.rs" +exclude = ["vendor/tree-sitter/**/*"] +include = [ + "vendor/tree-sitter/src/runtime/*", + "vendor/tree-sitter/externals/utf8proc/utf8proc*" +] + +[dependencies] + +[build-dependencies] +cc = "1.0" diff --git a/build.rs b/build.rs new file mode 100644 index 00000000..3427ed5f --- /dev/null +++ b/build.rs @@ -0,0 +1,35 @@ +extern crate cc; + +use std::path::Path; + + +fn main() { + let dir_path = Path::new("vendor/tree-sitter/src/runtime"); + + let source_filenames = [ + "get_changed_ranges.c", + "language.c", + "lexer.c", + "node.c", + "parser.c", + "parser.c", + "stack.c", + "subtree.c", + "tree_cursor.c", + "tree.c", + "utf16.c", + ]; + + let mut 
config = cc::Build::new(); + config.include("vendor/tree-sitter/src"); + config.include("vendor/tree-sitter/include"); + config.include("vendor/tree-sitter/externals/utf8proc"); + config.flag_if_supported("-Wno-unused-parameter"); + + for source_filename in source_filenames.iter() { + let source_path = dir_path.join(Path::new(&source_filename)); + config.file(&source_path.to_str().unwrap()); + } + + config.compile("libruntime.a") +} diff --git a/script/bindgen.sh b/script/bindgen.sh new file mode 100755 index 00000000..190e7a4f --- /dev/null +++ b/script/bindgen.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +output_path=src/ffi.rs +header_path='vendor/tree-sitter/include/tree_sitter/runtime.h' + +bindgen \ + --no-layout-tests \ + --whitelist-type '^TS.*' \ + --whitelist-function '^ts_.*' \ + --opaque-type FILE \ + $header_path > $output_path + +echo "" >> $output_path +version_constant='TREE_SITTER_LANGUAGE_VERSION' +version_number=$(egrep "#define $version_constant (.*)" $header_path | cut -d' ' -f3) +echo "pub const $version_constant: usize = $version_number;" >> $output_path diff --git a/src/ffi.rs b/src/ffi.rs new file mode 100644 index 00000000..7d1c06e8 --- /dev/null +++ b/src/ffi.rs @@ -0,0 +1,333 @@ +/* automatically generated by rust-bindgen */ + +pub type FILE = [u64; 19usize]; +pub type TSSymbol = ::std::os::raw::c_ushort; +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct TSLanguage { + _unused: [u8; 0], +} +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct TSParser { + _unused: [u8; 0], +} +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct TSTree { + _unused: [u8; 0], +} +pub const TSInputEncoding_TSInputEncodingUTF8: TSInputEncoding = 0; +pub const TSInputEncoding_TSInputEncodingUTF16: TSInputEncoding = 1; +pub type TSInputEncoding = u32; +pub const TSSymbolType_TSSymbolTypeRegular: TSSymbolType = 0; +pub const TSSymbolType_TSSymbolTypeAnonymous: TSSymbolType = 1; +pub const TSSymbolType_TSSymbolTypeAuxiliary: TSSymbolType = 2; +pub type TSSymbolType 
= u32; +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct TSPoint { + pub row: u32, + pub column: u32, +} +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct TSRange { + pub start: TSPoint, + pub end: TSPoint, +} +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct TSInput { + pub payload: *mut ::std::os::raw::c_void, + pub read: ::std::option::Option< + unsafe extern "C" fn(payload: *mut ::std::os::raw::c_void, bytes_read: *mut u32) + -> *const ::std::os::raw::c_char, + >, + pub seek: ::std::option::Option< + unsafe extern "C" fn( + payload: *mut ::std::os::raw::c_void, + byte_index: u32, + position: TSPoint, + ) -> ::std::os::raw::c_int, + >, + pub encoding: TSInputEncoding, +} +pub const TSLogType_TSLogTypeParse: TSLogType = 0; +pub const TSLogType_TSLogTypeLex: TSLogType = 1; +pub type TSLogType = u32; +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct TSLogger { + pub payload: *mut ::std::os::raw::c_void, + pub log: ::std::option::Option< + unsafe extern "C" fn( + payload: *mut ::std::os::raw::c_void, + arg1: TSLogType, + arg2: *const ::std::os::raw::c_char, + ), + >, +} +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct TSInputEdit { + pub start_byte: u32, + pub old_end_byte: u32, + pub new_end_byte: u32, + pub start_point: TSPoint, + pub old_end_point: TSPoint, + pub new_end_point: TSPoint, +} +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct TSNode { + pub context: [u32; 4usize], + pub id: *const ::std::os::raw::c_void, + pub tree: *const ::std::os::raw::c_void, +} +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct TSTreeCursor { + pub context: [u32; 2usize], + pub id: *const ::std::os::raw::c_void, + pub tree: *const ::std::os::raw::c_void, +} +extern "C" { + #[link_name = "\u{1}_ts_parser_new"] + pub fn ts_parser_new() -> *mut TSParser; +} +extern "C" { + #[link_name = "\u{1}_ts_parser_delete"] + pub fn ts_parser_delete(arg1: *mut TSParser); +} +extern "C" { + #[link_name = "\u{1}_ts_parser_language"] + pub fn 
ts_parser_language(arg1: *const TSParser) -> *const TSLanguage; +} +extern "C" { + #[link_name = "\u{1}_ts_parser_set_language"] + pub fn ts_parser_set_language(arg1: *mut TSParser, arg2: *const TSLanguage) -> bool; +} +extern "C" { + #[link_name = "\u{1}_ts_parser_logger"] + pub fn ts_parser_logger(arg1: *const TSParser) -> TSLogger; +} +extern "C" { + #[link_name = "\u{1}_ts_parser_set_logger"] + pub fn ts_parser_set_logger(arg1: *mut TSParser, arg2: TSLogger); +} +extern "C" { + #[link_name = "\u{1}_ts_parser_print_dot_graphs"] + pub fn ts_parser_print_dot_graphs(arg1: *mut TSParser, arg2: *mut FILE); +} +extern "C" { + #[link_name = "\u{1}_ts_parser_halt_on_error"] + pub fn ts_parser_halt_on_error(arg1: *mut TSParser, arg2: bool); +} +extern "C" { + #[link_name = "\u{1}_ts_parser_parse"] + pub fn ts_parser_parse(arg1: *mut TSParser, arg2: *const TSTree, arg3: TSInput) -> *mut TSTree; +} +extern "C" { + #[link_name = "\u{1}_ts_parser_parse_string"] + pub fn ts_parser_parse_string( + arg1: *mut TSParser, + arg2: *const TSTree, + arg3: *const ::std::os::raw::c_char, + arg4: u32, + ) -> *mut TSTree; +} +extern "C" { + #[link_name = "\u{1}_ts_tree_copy"] + pub fn ts_tree_copy(arg1: *const TSTree) -> *mut TSTree; +} +extern "C" { + #[link_name = "\u{1}_ts_tree_delete"] + pub fn ts_tree_delete(arg1: *mut TSTree); +} +extern "C" { + #[link_name = "\u{1}_ts_tree_root_node"] + pub fn ts_tree_root_node(arg1: *const TSTree) -> TSNode; +} +extern "C" { + #[link_name = "\u{1}_ts_tree_edit"] + pub fn ts_tree_edit(arg1: *mut TSTree, arg2: *const TSInputEdit); +} +extern "C" { + #[link_name = "\u{1}_ts_tree_get_changed_ranges"] + pub fn ts_tree_get_changed_ranges( + arg1: *const TSTree, + arg2: *const TSTree, + arg3: *mut u32, + ) -> *mut TSRange; +} +extern "C" { + #[link_name = "\u{1}_ts_tree_print_dot_graph"] + pub fn ts_tree_print_dot_graph(arg1: *const TSTree, arg2: *mut FILE); +} +extern "C" { + #[link_name = "\u{1}_ts_node_start_byte"] + pub fn ts_node_start_byte(arg1: 
TSNode) -> u32; +} +extern "C" { + #[link_name = "\u{1}_ts_node_start_point"] + pub fn ts_node_start_point(arg1: TSNode) -> TSPoint; +} +extern "C" { + #[link_name = "\u{1}_ts_node_end_byte"] + pub fn ts_node_end_byte(arg1: TSNode) -> u32; +} +extern "C" { + #[link_name = "\u{1}_ts_node_end_point"] + pub fn ts_node_end_point(arg1: TSNode) -> TSPoint; +} +extern "C" { + #[link_name = "\u{1}_ts_node_symbol"] + pub fn ts_node_symbol(arg1: TSNode) -> TSSymbol; +} +extern "C" { + #[link_name = "\u{1}_ts_node_type"] + pub fn ts_node_type(arg1: TSNode) -> *const ::std::os::raw::c_char; +} +extern "C" { + #[link_name = "\u{1}_ts_node_string"] + pub fn ts_node_string(arg1: TSNode) -> *mut ::std::os::raw::c_char; +} +extern "C" { + #[link_name = "\u{1}_ts_node_eq"] + pub fn ts_node_eq(arg1: TSNode, arg2: TSNode) -> bool; +} +extern "C" { + #[link_name = "\u{1}_ts_node_is_null"] + pub fn ts_node_is_null(arg1: TSNode) -> bool; +} +extern "C" { + #[link_name = "\u{1}_ts_node_is_named"] + pub fn ts_node_is_named(arg1: TSNode) -> bool; +} +extern "C" { + #[link_name = "\u{1}_ts_node_is_missing"] + pub fn ts_node_is_missing(arg1: TSNode) -> bool; +} +extern "C" { + #[link_name = "\u{1}_ts_node_has_changes"] + pub fn ts_node_has_changes(arg1: TSNode) -> bool; +} +extern "C" { + #[link_name = "\u{1}_ts_node_has_error"] + pub fn ts_node_has_error(arg1: TSNode) -> bool; +} +extern "C" { + #[link_name = "\u{1}_ts_node_parent"] + pub fn ts_node_parent(arg1: TSNode) -> TSNode; +} +extern "C" { + #[link_name = "\u{1}_ts_node_child"] + pub fn ts_node_child(arg1: TSNode, arg2: u32) -> TSNode; +} +extern "C" { + #[link_name = "\u{1}_ts_node_named_child"] + pub fn ts_node_named_child(arg1: TSNode, arg2: u32) -> TSNode; +} +extern "C" { + #[link_name = "\u{1}_ts_node_child_count"] + pub fn ts_node_child_count(arg1: TSNode) -> u32; +} +extern "C" { + #[link_name = "\u{1}_ts_node_named_child_count"] + pub fn ts_node_named_child_count(arg1: TSNode) -> u32; +} +extern "C" { + #[link_name = 
"\u{1}_ts_node_next_sibling"] + pub fn ts_node_next_sibling(arg1: TSNode) -> TSNode; +} +extern "C" { + #[link_name = "\u{1}_ts_node_next_named_sibling"] + pub fn ts_node_next_named_sibling(arg1: TSNode) -> TSNode; +} +extern "C" { + #[link_name = "\u{1}_ts_node_prev_sibling"] + pub fn ts_node_prev_sibling(arg1: TSNode) -> TSNode; +} +extern "C" { + #[link_name = "\u{1}_ts_node_prev_named_sibling"] + pub fn ts_node_prev_named_sibling(arg1: TSNode) -> TSNode; +} +extern "C" { + #[link_name = "\u{1}_ts_node_first_child_for_byte"] + pub fn ts_node_first_child_for_byte(arg1: TSNode, arg2: u32) -> TSNode; +} +extern "C" { + #[link_name = "\u{1}_ts_node_first_named_child_for_byte"] + pub fn ts_node_first_named_child_for_byte(arg1: TSNode, arg2: u32) -> TSNode; +} +extern "C" { + #[link_name = "\u{1}_ts_node_descendant_for_byte_range"] + pub fn ts_node_descendant_for_byte_range(arg1: TSNode, arg2: u32, arg3: u32) -> TSNode; +} +extern "C" { + #[link_name = "\u{1}_ts_node_named_descendant_for_byte_range"] + pub fn ts_node_named_descendant_for_byte_range(arg1: TSNode, arg2: u32, arg3: u32) -> TSNode; +} +extern "C" { + #[link_name = "\u{1}_ts_node_descendant_for_point_range"] + pub fn ts_node_descendant_for_point_range(arg1: TSNode, arg2: TSPoint, arg3: TSPoint) + -> TSNode; +} +extern "C" { + #[link_name = "\u{1}_ts_node_named_descendant_for_point_range"] + pub fn ts_node_named_descendant_for_point_range( + arg1: TSNode, + arg2: TSPoint, + arg3: TSPoint, + ) -> TSNode; +} +extern "C" { + #[link_name = "\u{1}_ts_tree_cursor_new"] + pub fn ts_tree_cursor_new(arg1: *const TSTree) -> TSTreeCursor; +} +extern "C" { + #[link_name = "\u{1}_ts_tree_cursor_delete"] + pub fn ts_tree_cursor_delete(arg1: *mut TSTreeCursor); +} +extern "C" { + #[link_name = "\u{1}_ts_tree_cursor_goto_first_child"] + pub fn ts_tree_cursor_goto_first_child(arg1: *mut TSTreeCursor) -> bool; +} +extern "C" { + #[link_name = "\u{1}_ts_tree_cursor_goto_first_child_for_byte"] + pub fn 
ts_tree_cursor_goto_first_child_for_byte(arg1: *mut TSTreeCursor, arg2: u32) -> i64; +} +extern "C" { + #[link_name = "\u{1}_ts_tree_cursor_goto_next_sibling"] + pub fn ts_tree_cursor_goto_next_sibling(arg1: *mut TSTreeCursor) -> bool; +} +extern "C" { + #[link_name = "\u{1}_ts_tree_cursor_goto_parent"] + pub fn ts_tree_cursor_goto_parent(arg1: *mut TSTreeCursor) -> bool; +} +extern "C" { + #[link_name = "\u{1}_ts_tree_cursor_current_node"] + pub fn ts_tree_cursor_current_node(arg1: *const TSTreeCursor) -> TSNode; +} +extern "C" { + #[link_name = "\u{1}_ts_language_symbol_count"] + pub fn ts_language_symbol_count(arg1: *const TSLanguage) -> u32; +} +extern "C" { + #[link_name = "\u{1}_ts_language_symbol_name"] + pub fn ts_language_symbol_name( + arg1: *const TSLanguage, + arg2: TSSymbol, + ) -> *const ::std::os::raw::c_char; +} +extern "C" { + #[link_name = "\u{1}_ts_language_symbol_type"] + pub fn ts_language_symbol_type(arg1: *const TSLanguage, arg2: TSSymbol) -> TSSymbolType; +} +extern "C" { + #[link_name = "\u{1}_ts_language_version"] + pub fn ts_language_version(arg1: *const TSLanguage) -> u32; +} + +pub const TREE_SITTER_LANGUAGE_VERSION: usize = 8; diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 00000000..ef11757a --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,329 @@ +mod ffi; + +use std::ffi::CStr; +use std::marker::PhantomData; +use std::os::raw::{c_char, c_int, c_void}; +use std::ptr; + +#[derive(Clone, Copy)] +pub struct Symbol(ffi::TSSymbol); + +#[derive(Clone, Copy)] +pub struct Language(*const ffi::TSLanguage); + +pub trait Utf16Input { + fn read(&self) -> &[u16]; + fn seek(&self, u32, Point); +} + +pub trait Utf8Input { + fn read(&self) -> &[u8]; + fn seek(&self, u32, Point); +} + +pub enum LogType { + Parse, + Lex, +} + +#[derive(Clone, Copy, PartialEq, Eq)] +pub struct Point { + pub row: u32, + pub column: u32, +} + +#[derive(Clone, Copy, PartialEq, Eq)] +pub struct InputEdit { + pub start_byte: u32, + pub old_end_byte: u32, + pub 
new_end_byte: u32, + pub start_position: Point, + pub old_end_position: Point, + pub new_end_position: Point, +} + +pub struct Node<'a>(ffi::TSNode, PhantomData<&'a ()>); + +pub struct Parser(*mut ffi::TSParser); + +pub struct Tree(*mut ffi::TSTree, ffi::TSInputEncoding); + +pub struct TreeCursor<'a>(ffi::TSTreeCursor, PhantomData<&'a ()>); + +impl Parser { + pub fn new() -> Parser { + unsafe { + let parser = ffi::ts_parser_new(); + Parser(parser) + } + } + + pub fn set_language(&mut self, language: Language) { + unsafe { + ffi::ts_parser_set_language(self.0, language.0); + } + } + + pub fn set_logger ()>(&mut self, logger: &mut F) { + unsafe extern "C" fn log ()>( + payload: *mut c_void, + c_log_type: ffi::TSLogType, + c_message: *const c_char, + ) { + let callback = (payload as *mut F).as_mut().unwrap(); + if let Ok(message) = CStr::from_ptr(c_message).to_str() { + let log_type = if c_log_type == ffi::TSLogType_TSLogTypeParse { + LogType::Parse + } else { + LogType::Lex + }; + callback(log_type, message); + } + }; + + let c_logger = ffi::TSLogger { + payload: logger as *mut F as *mut c_void, + log: Some(log::), + }; + + unsafe { ffi::ts_parser_set_logger(self.0, c_logger) }; + } + + pub fn parse_utf8( + &mut self, + input: &mut T, + old_tree: Option, + ) -> Option { + unsafe extern "C" fn read( + payload: *mut c_void, + bytes_read: *mut u32, + ) -> *const c_char { + let input = (payload as *mut T).as_mut().unwrap(); + let result = input.read(); + *bytes_read = result.len() as u32; + return result.as_ptr() as *const c_char; + }; + + unsafe extern "C" fn seek( + payload: *mut c_void, + byte: u32, + position: ffi::TSPoint, + ) -> c_int { + let input = (payload as *mut T).as_mut().unwrap(); + input.seek( + byte, + Point { + row: position.row, + column: position.column, + }, + ); + return 1; + }; + + let c_input = ffi::TSInput { + payload: input as *mut T as *mut c_void, + read: Some(read::), + seek: Some(seek::), + encoding: ffi::TSInputEncoding_TSInputEncodingUTF8, 
+ }; + + let old_tree_ptr = old_tree.map_or(ptr::null_mut(), |t| t.0); + + let new_tree_ptr = unsafe { ffi::ts_parser_parse(self.0, old_tree_ptr, c_input) }; + if new_tree_ptr.is_null() { + None + } else { + Some(Tree(new_tree_ptr, ffi::TSInputEncoding_TSInputEncodingUTF8)) + } + } + + pub fn parse_utf16( + &mut self, + input: &mut T, + old_tree: Option, + ) -> Option { + unsafe extern "C" fn read( + payload: *mut c_void, + bytes_read: *mut u32, + ) -> *const c_char { + let input = (payload as *mut T).as_mut().unwrap(); + let result = input.read(); + *bytes_read = result.len() as u32 * 2; + return result.as_ptr() as *const c_char; + }; + + unsafe extern "C" fn seek( + payload: *mut c_void, + byte: u32, + position: ffi::TSPoint, + ) -> c_int { + let input = (payload as *mut T).as_mut().unwrap(); + input.seek( + byte / 2, + Point { + row: position.row, + column: position.column / 2, + }, + ); + return 1; + }; + + let c_input = ffi::TSInput { + payload: input as *mut T as *mut c_void, + read: Some(read::), + seek: Some(seek::), + encoding: ffi::TSInputEncoding_TSInputEncodingUTF8, + }; + + let old_tree_ptr = old_tree.map_or(ptr::null_mut(), |t| t.0); + + let new_tree_ptr = unsafe { ffi::ts_parser_parse(self.0, old_tree_ptr, c_input) }; + if new_tree_ptr.is_null() { + None + } else { + Some(Tree( + new_tree_ptr, + ffi::TSInputEncoding_TSInputEncodingUTF16, + )) + } + } +} + +impl Drop for Parser { + fn drop(&mut self) { + unsafe { ffi::ts_parser_delete(self.0) } + } +} + +impl Tree { + pub fn root_node(&self) -> Node { + Node::new(unsafe { ffi::ts_tree_root_node(self.0) }).unwrap() + } + + pub fn edit(&mut self, edit: &InputEdit) { + let edit = ffi::TSInputEdit { + start_byte: edit.start_byte, + old_end_byte: edit.old_end_byte, + new_end_byte: edit.new_end_byte, + start_point: edit.start_position.into(), + old_end_point: edit.old_end_position.into(), + new_end_point: edit.new_end_position.into(), + }; + unsafe { ffi::ts_tree_edit(self.0, &edit) }; + } + + pub fn 
walk(&self) -> TreeCursor { + TreeCursor(unsafe { ffi::ts_tree_cursor_new(self.0) }, PhantomData) + } +} + +impl Drop for Tree { + fn drop(&mut self) { + unsafe { ffi::ts_tree_delete(self.0) } + } +} + +impl Clone for Tree { + fn clone(&self) -> Tree { + unsafe { Tree(ffi::ts_tree_copy(self.0), self.1) } + } +} + +impl<'a> Node<'a> { + fn new(node: ffi::TSNode) -> Option { + if node.id.is_null() { + None + } else { + Some(Node(node, PhantomData)) + } + } + + pub fn name(&self) -> &'static str { + unsafe { CStr::from_ptr(ffi::ts_node_type(self.0)) } + .to_str() + .unwrap() + } + + pub fn start_index(&self) -> u32 { + unsafe { ffi::ts_node_start_byte(self.0) } + } + + pub fn end_index(&self) -> u32 { + unsafe { ffi::ts_node_end_byte(self.0) } + } + + pub fn start_position(&self) -> Point { + let result = unsafe { ffi::ts_node_start_point(self.0) }; + Point { + row: result.row, + column: result.column, + } + } + + pub fn end_position(&self) -> Point { + let result = unsafe { ffi::ts_node_end_point(self.0) }; + Point { + row: result.row, + column: result.column, + } + } + + pub fn child(&self, i: u32) -> Option { + Self::new(unsafe { ffi::ts_node_child(self.0, i) }) + } + + pub fn parent(&self) -> Option { + Self::new(unsafe { ffi::ts_node_parent(self.0) }) + } +} + +impl<'a> TreeCursor<'a> { + fn node(&'a self) -> Node<'a> { + Node( + unsafe { ffi::ts_tree_cursor_current_node(&self.0) }, + PhantomData, + ) + } + + fn goto_first_child(&mut self) -> bool { + return unsafe { ffi::ts_tree_cursor_goto_first_child(&mut self.0) }; + } + + fn goto_parent(&mut self) -> bool { + return unsafe { ffi::ts_tree_cursor_goto_parent(&mut self.0) }; + } + + fn goto_next_sibling(&mut self) -> bool { + return unsafe { ffi::ts_tree_cursor_goto_next_sibling(&mut self.0) }; + } + + fn goto_first_child_for_index(&mut self, index: u32) -> Option { + let result = unsafe { ffi::ts_tree_cursor_goto_first_child_for_byte(&mut self.0, index) }; + if result < 0 { + None + } else { + Some(result as 
u32) + } + } +} + +impl<'a> Drop for TreeCursor<'a> { + fn drop(&mut self) { + unsafe { ffi::ts_tree_cursor_delete(&mut self.0) } + } +} + +impl Into for Point { + fn into(self) -> ffi::TSPoint { + ffi::TSPoint { + row: self.row, + column: self.column, + } + } +} + +#[cfg(test)] +mod tests { + #[test] + fn it_works() {} +} diff --git a/vendor/tree-sitter b/vendor/tree-sitter new file mode 160000 index 00000000..5ec3769c --- /dev/null +++ b/vendor/tree-sitter @@ -0,0 +1 @@ +Subproject commit 5ec3769cb4c9acfda64f80d7c14abce939e8b4c5 From 8918d1a5b14f9a54ef23dcb4b29d8bf2bccd6384 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 17 May 2018 14:35:31 -0700 Subject: [PATCH 002/208] Add boilerplate --- .travis.yml | 8 ++++++++ LICENSE | 21 +++++++++++++++++++++ README.md | 8 ++++++++ 3 files changed, 37 insertions(+) create mode 100644 .travis.yml create mode 100644 LICENSE create mode 100644 README.md diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 00000000..32e3a71f --- /dev/null +++ b/.travis.yml @@ -0,0 +1,8 @@ +language: rust + +rust: + - stable + +branches: + only: + - master diff --git a/LICENSE b/LICENSE new file mode 100644 index 00000000..971b81f9 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +The MIT License (MIT) + +Copyright (c) 2018 Max Brunsfeld + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md new file mode 100644 index 00000000..08df0e4e --- /dev/null +++ b/README.md @@ -0,0 +1,8 @@ +Rust Tree-sitter +=========================== + +[![Build Status](https://travis-ci.org/tree-sitter/rust-tree-sitter.svg)](https://travis-ci.org/tree-sitter/rust-tree-sitter) + +Rust bindings to the [Tree-sitter][] parsing library. + +[tree-sitter]: https://github.com/tree-sitter/tree-sitter From f07f710db7633dc26d86163972512799ae407540 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 17 May 2018 14:40:31 -0700 Subject: [PATCH 003/208] Compile tree-sitter sources in c99 mode --- build.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/build.rs b/build.rs index 3427ed5f..53265655 100644 --- a/build.rs +++ b/build.rs @@ -24,6 +24,7 @@ fn main() { config.include("vendor/tree-sitter/src"); config.include("vendor/tree-sitter/include"); config.include("vendor/tree-sitter/externals/utf8proc"); + config.flag_if_supported("-std=c99"); config.flag_if_supported("-Wno-unused-parameter"); for source_filename in source_filenames.iter() { From ead0e312624a4e20a312875c073be4dc51a2f29b Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 17 May 2018 14:43:30 -0700 Subject: [PATCH 004/208] Fix duplicated compile of parser.c --- build.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/build.rs b/build.rs index 53265655..ad62f3a1 100644 --- a/build.rs +++ b/build.rs @@ -12,7 +12,6 @@ fn main() { "lexer.c", "node.c", "parser.c", - "parser.c", "stack.c", "subtree.c", "tree_cursor.c", 
From 08217fff8dfc7a80b2348679144ff44344d63008 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 17 May 2018 17:16:35 -0700 Subject: [PATCH 005/208] Get basic parsing working, add some unit tests --- .gitignore | 1 + .travis.yml | 6 ++ Cargo.toml | 2 - build.rs | 31 ++++++---- fixtures/.gitkeep | 0 script/fetch-test-fixtures.sh | 14 +++++ src/lib.rs | 113 +++++++++++++++++++++++++++++----- 7 files changed, 138 insertions(+), 29 deletions(-) create mode 100644 fixtures/.gitkeep create mode 100755 script/fetch-test-fixtures.sh diff --git a/.gitignore b/.gitignore index a9d37c56..fbd4fda0 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ target Cargo.lock +fixtures/tree-sitter-rust diff --git a/.travis.yml b/.travis.yml index 32e3a71f..10fcfe94 100644 --- a/.travis.yml +++ b/.travis.yml @@ -3,6 +3,12 @@ language: rust rust: - stable +env: + - RUST_TREE_SITTER_TEST=1 + +before_install: + - ./script/fetch-test-fixtures.sh + branches: only: - master diff --git a/Cargo.toml b/Cargo.toml index 0a93febe..e20d40aa 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -9,7 +9,5 @@ include = [ "vendor/tree-sitter/externals/utf8proc/utf8proc*" ] -[dependencies] - [build-dependencies] cc = "1.0" diff --git a/build.rs b/build.rs index ad62f3a1..fa8b41ea 100644 --- a/build.rs +++ b/build.rs @@ -1,10 +1,17 @@ extern crate cc; +use std::env; use std::path::Path; - fn main() { - let dir_path = Path::new("vendor/tree-sitter/src/runtime"); + let root_path = Path::new("vendor/tree-sitter"); + + let mut config = cc::Build::new(); + config.flag_if_supported("-std=c99"); + config.flag_if_supported("-Wno-unused-parameter"); + config.include(root_path.join(Path::new("src"))); + config.include(root_path.join(Path::new("include"))); + config.include(root_path.join(Path::new("externals/utf8proc"))); let source_filenames = [ "get_changed_ranges.c", @@ -19,16 +26,18 @@ fn main() { "utf16.c", ]; - let mut config = cc::Build::new(); - config.include("vendor/tree-sitter/src"); - 
config.include("vendor/tree-sitter/include"); - config.include("vendor/tree-sitter/externals/utf8proc"); - config.flag_if_supported("-std=c99"); - config.flag_if_supported("-Wno-unused-parameter"); + config.files(source_filenames.iter().map(|source_filename| { + root_path + .join(Path::new(&"src/runtime")) + .join(Path::new(&source_filename)) + })); - for source_filename in source_filenames.iter() { - let source_path = dir_path.join(Path::new(&source_filename)); - config.file(&source_path.to_str().unwrap()); + config.file(root_path.join(Path::new("externals/utf8proc/utf8proc.c"))); + + if env::var("RUST_TREE_SITTER_TEST").is_ok() { + let parser_dir = Path::new("fixtures/tree-sitter-rust/src"); + config.file(parser_dir.join("parser.c")); + config.file(parser_dir.join("scanner.c")); } config.compile("libruntime.a") diff --git a/fixtures/.gitkeep b/fixtures/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/script/fetch-test-fixtures.sh b/script/fetch-test-fixtures.sh new file mode 100755 index 00000000..24cc316a --- /dev/null +++ b/script/fetch-test-fixtures.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +grammar_dir='fixtures/tree-sitter-rust' +grammar_url='https://github.com/tree-sitter/tree-sitter-rust' + +if [ ! 
-d $grammar_dir ]; then + git clone $grammar_url $grammar_dir --depth=1 +fi + +( + cd $grammar_dir; + git fetch origin master --depth=1 + git reset --hard origin/master; +) diff --git a/src/lib.rs b/src/lib.rs index ef11757a..fa1db0f9 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -8,19 +8,19 @@ use std::ptr; #[derive(Clone, Copy)] pub struct Symbol(ffi::TSSymbol); -#[derive(Clone, Copy)] -pub struct Language(*const ffi::TSLanguage); +pub type Language = *const ffi::TSLanguage; pub trait Utf16Input { - fn read(&self) -> &[u16]; - fn seek(&self, u32, Point); + fn read(&mut self) -> &[u16]; + fn seek(&mut self, u32, Point); } pub trait Utf8Input { - fn read(&self) -> &[u8]; - fn seek(&self, u32, Point); + fn read(&mut self) -> &[u8]; + fn seek(&mut self, u32, Point); } +#[derive(Debug, PartialEq, Eq)] pub enum LogType { Parse, Lex, @@ -50,6 +50,11 @@ pub struct Tree(*mut ffi::TSTree, ffi::TSInputEncoding); pub struct TreeCursor<'a>(ffi::TSTreeCursor, PhantomData<&'a ()>); +struct FlatInput<'a> { + bytes: &'a [u8], + offset: usize, +} + impl Parser { pub fn new() -> Parser { unsafe { @@ -60,11 +65,11 @@ impl Parser { pub fn set_language(&mut self, language: Language) { unsafe { - ffi::ts_parser_set_language(self.0, language.0); + ffi::ts_parser_set_language(self.0, language); } } - pub fn set_logger ()>(&mut self, logger: &mut F) { + pub fn set_logger ()>(&mut self, logger: Option<&mut F>) { unsafe extern "C" fn log ()>( payload: *mut c_void, c_log_type: ffi::TSLogType, @@ -81,14 +86,24 @@ impl Parser { } }; - let c_logger = ffi::TSLogger { - payload: logger as *mut F as *mut c_void, - log: Some(log::), - }; + let c_logger; + if let Some(logger) = logger { + c_logger = ffi::TSLogger { + payload: logger as *mut F as *mut c_void, + log: Some(log::), + }; + } else { + c_logger = ffi::TSLogger { payload: ptr::null_mut(), log: None }; + } unsafe { ffi::ts_parser_set_logger(self.0, c_logger) }; } + pub fn parse_str(&mut self, input: &str, old_tree: Option) -> Option { + let 
mut input = FlatInput { bytes: input.as_bytes(), offset: 0}; + self.parse_utf8(&mut input, old_tree) + } + pub fn parse_utf8( &mut self, input: &mut T, @@ -239,9 +254,7 @@ impl<'a> Node<'a> { } pub fn name(&self) -> &'static str { - unsafe { CStr::from_ptr(ffi::ts_node_type(self.0)) } - .to_str() - .unwrap() + unsafe { CStr::from_ptr(ffi::ts_node_type(self.0)) }.to_str().unwrap() } pub fn start_index(&self) -> u32 { @@ -272,11 +285,24 @@ impl<'a> Node<'a> { Self::new(unsafe { ffi::ts_node_child(self.0, i) }) } + pub fn child_count(&self) -> u32 { + unsafe { ffi::ts_node_child_count(self.0) } + } + pub fn parent(&self) -> Option { Self::new(unsafe { ffi::ts_node_parent(self.0) }) } + + pub fn to_sexp(&self) -> String { + let c_string = unsafe { ffi::ts_node_string(self.0) }; + let result = unsafe { CStr::from_ptr(c_string) }.to_str().unwrap().to_string(); + unsafe { free(c_string as *mut c_void) }; + result + } } +extern "C" { fn free(pointer: *mut c_void); } + impl<'a> TreeCursor<'a> { fn node(&'a self) -> Node<'a> { Node( @@ -322,8 +348,63 @@ impl Into for Point { } } +impl<'a> Utf8Input for FlatInput<'a> { + fn read(&mut self) -> &[u8] { + let result = &self.bytes[self.offset..]; + self.offset = self.bytes.len(); + result + } + + fn seek(&mut self, offset: u32, _position: Point) { + self.offset = offset as usize; + } +} + #[cfg(test)] mod tests { + use super::*; + + fn rust() -> Language { unsafe { tree_sitter_rust() } } + extern "C" { fn tree_sitter_rust() -> Language; } + #[test] - fn it_works() {} + fn test_basic_parsing() { + let mut parser = Parser::new(); + parser.set_language(rust()); + + let tree = parser.parse_str(" + struct Stuff {} + fn main() {} + ", None).unwrap(); + + let root_node = tree.root_node(); + assert_eq!(root_node.name(), "source_file"); + + assert_eq!( + root_node.to_sexp(), + "(source_file (struct_item (type_identifier) (field_declaration_list)) (function_item (identifier) (parameters) (block)))" + ); + + let struct_node = 
root_node.child(0).unwrap(); + assert_eq!(struct_node.name(), "struct_item"); + } + + #[test] + fn test_logging() { + let mut parser = Parser::new(); + parser.set_language(rust()); + + let mut messages = Vec::new(); + parser.set_logger(Some(&mut |log_type, message| { + messages.push((log_type, message.to_string())); + })); + + parser.parse_str(" + struct Stuff {} + fn main() {} + ", None).unwrap(); + + assert!(messages.contains(&(LogType::Parse, "reduce sym:struct_item, child_count:3".to_string()))); + assert!(messages.contains(&(LogType::Lex, "skip character:' '".to_string()))); + } } From 7e6675d56effa6177eaf387b13942c8219107ae1 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 17 May 2018 17:23:35 -0700 Subject: [PATCH 006/208] Use a more unique library name when building C sources --- build.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build.rs b/build.rs index fa8b41ea..4e2c3b8f 100644 --- a/build.rs +++ b/build.rs @@ -40,5 +40,5 @@ fn main() { config.file(parser_dir.join("scanner.c")); } - config.compile("libruntime.a") + config.compile("treesitter") } From 572a60183c86920b0c1bc83941d70b3772534e3a Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 17 May 2018 17:29:23 -0700 Subject: [PATCH 007/208] Suppress warnings associated w/ generated bindings --- build.rs | 1 + script/bindgen.sh | 2 +- src/bindings.rs | 333 +++++++++++++++++++++++++++++++++++++++++++++ src/ffi.rs | 335 +--------------------------------------------- 4 files changed, 338 insertions(+), 333 deletions(-) create mode 100644 src/bindings.rs diff --git a/build.rs b/build.rs index 4e2c3b8f..5fa5d408 100644 --- a/build.rs +++ b/build.rs @@ -36,6 +36,7 @@ fn main() { if env::var("RUST_TREE_SITTER_TEST").is_ok() { let parser_dir = Path::new("fixtures/tree-sitter-rust/src"); + config.flag_if_supported("-Wno-typedef-redefinition"); config.file(parser_dir.join("parser.c")); config.file(parser_dir.join("scanner.c")); } diff --git a/script/bindgen.sh 
b/script/bindgen.sh index 190e7a4f..1b9008b2 100755 --- a/script/bindgen.sh +++ b/script/bindgen.sh @@ -1,6 +1,6 @@ #!/bin/bash -output_path=src/ffi.rs +output_path=src/bindings.rs header_path='vendor/tree-sitter/include/tree_sitter/runtime.h' bindgen \ diff --git a/src/bindings.rs b/src/bindings.rs new file mode 100644 index 00000000..7d1c06e8 --- /dev/null +++ b/src/bindings.rs @@ -0,0 +1,333 @@ +/* automatically generated by rust-bindgen */ + +pub type FILE = [u64; 19usize]; +pub type TSSymbol = ::std::os::raw::c_ushort; +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct TSLanguage { + _unused: [u8; 0], +} +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct TSParser { + _unused: [u8; 0], +} +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct TSTree { + _unused: [u8; 0], +} +pub const TSInputEncoding_TSInputEncodingUTF8: TSInputEncoding = 0; +pub const TSInputEncoding_TSInputEncodingUTF16: TSInputEncoding = 1; +pub type TSInputEncoding = u32; +pub const TSSymbolType_TSSymbolTypeRegular: TSSymbolType = 0; +pub const TSSymbolType_TSSymbolTypeAnonymous: TSSymbolType = 1; +pub const TSSymbolType_TSSymbolTypeAuxiliary: TSSymbolType = 2; +pub type TSSymbolType = u32; +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct TSPoint { + pub row: u32, + pub column: u32, +} +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct TSRange { + pub start: TSPoint, + pub end: TSPoint, +} +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct TSInput { + pub payload: *mut ::std::os::raw::c_void, + pub read: ::std::option::Option< + unsafe extern "C" fn(payload: *mut ::std::os::raw::c_void, bytes_read: *mut u32) + -> *const ::std::os::raw::c_char, + >, + pub seek: ::std::option::Option< + unsafe extern "C" fn( + payload: *mut ::std::os::raw::c_void, + byte_index: u32, + position: TSPoint, + ) -> ::std::os::raw::c_int, + >, + pub encoding: TSInputEncoding, +} +pub const TSLogType_TSLogTypeParse: TSLogType = 0; +pub const TSLogType_TSLogTypeLex: TSLogType = 1; +pub type 
TSLogType = u32; +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct TSLogger { + pub payload: *mut ::std::os::raw::c_void, + pub log: ::std::option::Option< + unsafe extern "C" fn( + payload: *mut ::std::os::raw::c_void, + arg1: TSLogType, + arg2: *const ::std::os::raw::c_char, + ), + >, +} +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct TSInputEdit { + pub start_byte: u32, + pub old_end_byte: u32, + pub new_end_byte: u32, + pub start_point: TSPoint, + pub old_end_point: TSPoint, + pub new_end_point: TSPoint, +} +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct TSNode { + pub context: [u32; 4usize], + pub id: *const ::std::os::raw::c_void, + pub tree: *const ::std::os::raw::c_void, +} +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct TSTreeCursor { + pub context: [u32; 2usize], + pub id: *const ::std::os::raw::c_void, + pub tree: *const ::std::os::raw::c_void, +} +extern "C" { + #[link_name = "\u{1}_ts_parser_new"] + pub fn ts_parser_new() -> *mut TSParser; +} +extern "C" { + #[link_name = "\u{1}_ts_parser_delete"] + pub fn ts_parser_delete(arg1: *mut TSParser); +} +extern "C" { + #[link_name = "\u{1}_ts_parser_language"] + pub fn ts_parser_language(arg1: *const TSParser) -> *const TSLanguage; +} +extern "C" { + #[link_name = "\u{1}_ts_parser_set_language"] + pub fn ts_parser_set_language(arg1: *mut TSParser, arg2: *const TSLanguage) -> bool; +} +extern "C" { + #[link_name = "\u{1}_ts_parser_logger"] + pub fn ts_parser_logger(arg1: *const TSParser) -> TSLogger; +} +extern "C" { + #[link_name = "\u{1}_ts_parser_set_logger"] + pub fn ts_parser_set_logger(arg1: *mut TSParser, arg2: TSLogger); +} +extern "C" { + #[link_name = "\u{1}_ts_parser_print_dot_graphs"] + pub fn ts_parser_print_dot_graphs(arg1: *mut TSParser, arg2: *mut FILE); +} +extern "C" { + #[link_name = "\u{1}_ts_parser_halt_on_error"] + pub fn ts_parser_halt_on_error(arg1: *mut TSParser, arg2: bool); +} +extern "C" { + #[link_name = "\u{1}_ts_parser_parse"] + pub fn 
ts_parser_parse(arg1: *mut TSParser, arg2: *const TSTree, arg3: TSInput) -> *mut TSTree; +} +extern "C" { + #[link_name = "\u{1}_ts_parser_parse_string"] + pub fn ts_parser_parse_string( + arg1: *mut TSParser, + arg2: *const TSTree, + arg3: *const ::std::os::raw::c_char, + arg4: u32, + ) -> *mut TSTree; +} +extern "C" { + #[link_name = "\u{1}_ts_tree_copy"] + pub fn ts_tree_copy(arg1: *const TSTree) -> *mut TSTree; +} +extern "C" { + #[link_name = "\u{1}_ts_tree_delete"] + pub fn ts_tree_delete(arg1: *mut TSTree); +} +extern "C" { + #[link_name = "\u{1}_ts_tree_root_node"] + pub fn ts_tree_root_node(arg1: *const TSTree) -> TSNode; +} +extern "C" { + #[link_name = "\u{1}_ts_tree_edit"] + pub fn ts_tree_edit(arg1: *mut TSTree, arg2: *const TSInputEdit); +} +extern "C" { + #[link_name = "\u{1}_ts_tree_get_changed_ranges"] + pub fn ts_tree_get_changed_ranges( + arg1: *const TSTree, + arg2: *const TSTree, + arg3: *mut u32, + ) -> *mut TSRange; +} +extern "C" { + #[link_name = "\u{1}_ts_tree_print_dot_graph"] + pub fn ts_tree_print_dot_graph(arg1: *const TSTree, arg2: *mut FILE); +} +extern "C" { + #[link_name = "\u{1}_ts_node_start_byte"] + pub fn ts_node_start_byte(arg1: TSNode) -> u32; +} +extern "C" { + #[link_name = "\u{1}_ts_node_start_point"] + pub fn ts_node_start_point(arg1: TSNode) -> TSPoint; +} +extern "C" { + #[link_name = "\u{1}_ts_node_end_byte"] + pub fn ts_node_end_byte(arg1: TSNode) -> u32; +} +extern "C" { + #[link_name = "\u{1}_ts_node_end_point"] + pub fn ts_node_end_point(arg1: TSNode) -> TSPoint; +} +extern "C" { + #[link_name = "\u{1}_ts_node_symbol"] + pub fn ts_node_symbol(arg1: TSNode) -> TSSymbol; +} +extern "C" { + #[link_name = "\u{1}_ts_node_type"] + pub fn ts_node_type(arg1: TSNode) -> *const ::std::os::raw::c_char; +} +extern "C" { + #[link_name = "\u{1}_ts_node_string"] + pub fn ts_node_string(arg1: TSNode) -> *mut ::std::os::raw::c_char; +} +extern "C" { + #[link_name = "\u{1}_ts_node_eq"] + pub fn ts_node_eq(arg1: TSNode, arg2: TSNode) 
-> bool; +} +extern "C" { + #[link_name = "\u{1}_ts_node_is_null"] + pub fn ts_node_is_null(arg1: TSNode) -> bool; +} +extern "C" { + #[link_name = "\u{1}_ts_node_is_named"] + pub fn ts_node_is_named(arg1: TSNode) -> bool; +} +extern "C" { + #[link_name = "\u{1}_ts_node_is_missing"] + pub fn ts_node_is_missing(arg1: TSNode) -> bool; +} +extern "C" { + #[link_name = "\u{1}_ts_node_has_changes"] + pub fn ts_node_has_changes(arg1: TSNode) -> bool; +} +extern "C" { + #[link_name = "\u{1}_ts_node_has_error"] + pub fn ts_node_has_error(arg1: TSNode) -> bool; +} +extern "C" { + #[link_name = "\u{1}_ts_node_parent"] + pub fn ts_node_parent(arg1: TSNode) -> TSNode; +} +extern "C" { + #[link_name = "\u{1}_ts_node_child"] + pub fn ts_node_child(arg1: TSNode, arg2: u32) -> TSNode; +} +extern "C" { + #[link_name = "\u{1}_ts_node_named_child"] + pub fn ts_node_named_child(arg1: TSNode, arg2: u32) -> TSNode; +} +extern "C" { + #[link_name = "\u{1}_ts_node_child_count"] + pub fn ts_node_child_count(arg1: TSNode) -> u32; +} +extern "C" { + #[link_name = "\u{1}_ts_node_named_child_count"] + pub fn ts_node_named_child_count(arg1: TSNode) -> u32; +} +extern "C" { + #[link_name = "\u{1}_ts_node_next_sibling"] + pub fn ts_node_next_sibling(arg1: TSNode) -> TSNode; +} +extern "C" { + #[link_name = "\u{1}_ts_node_next_named_sibling"] + pub fn ts_node_next_named_sibling(arg1: TSNode) -> TSNode; +} +extern "C" { + #[link_name = "\u{1}_ts_node_prev_sibling"] + pub fn ts_node_prev_sibling(arg1: TSNode) -> TSNode; +} +extern "C" { + #[link_name = "\u{1}_ts_node_prev_named_sibling"] + pub fn ts_node_prev_named_sibling(arg1: TSNode) -> TSNode; +} +extern "C" { + #[link_name = "\u{1}_ts_node_first_child_for_byte"] + pub fn ts_node_first_child_for_byte(arg1: TSNode, arg2: u32) -> TSNode; +} +extern "C" { + #[link_name = "\u{1}_ts_node_first_named_child_for_byte"] + pub fn ts_node_first_named_child_for_byte(arg1: TSNode, arg2: u32) -> TSNode; +} +extern "C" { + #[link_name = 
"\u{1}_ts_node_descendant_for_byte_range"] + pub fn ts_node_descendant_for_byte_range(arg1: TSNode, arg2: u32, arg3: u32) -> TSNode; +} +extern "C" { + #[link_name = "\u{1}_ts_node_named_descendant_for_byte_range"] + pub fn ts_node_named_descendant_for_byte_range(arg1: TSNode, arg2: u32, arg3: u32) -> TSNode; +} +extern "C" { + #[link_name = "\u{1}_ts_node_descendant_for_point_range"] + pub fn ts_node_descendant_for_point_range(arg1: TSNode, arg2: TSPoint, arg3: TSPoint) + -> TSNode; +} +extern "C" { + #[link_name = "\u{1}_ts_node_named_descendant_for_point_range"] + pub fn ts_node_named_descendant_for_point_range( + arg1: TSNode, + arg2: TSPoint, + arg3: TSPoint, + ) -> TSNode; +} +extern "C" { + #[link_name = "\u{1}_ts_tree_cursor_new"] + pub fn ts_tree_cursor_new(arg1: *const TSTree) -> TSTreeCursor; +} +extern "C" { + #[link_name = "\u{1}_ts_tree_cursor_delete"] + pub fn ts_tree_cursor_delete(arg1: *mut TSTreeCursor); +} +extern "C" { + #[link_name = "\u{1}_ts_tree_cursor_goto_first_child"] + pub fn ts_tree_cursor_goto_first_child(arg1: *mut TSTreeCursor) -> bool; +} +extern "C" { + #[link_name = "\u{1}_ts_tree_cursor_goto_first_child_for_byte"] + pub fn ts_tree_cursor_goto_first_child_for_byte(arg1: *mut TSTreeCursor, arg2: u32) -> i64; +} +extern "C" { + #[link_name = "\u{1}_ts_tree_cursor_goto_next_sibling"] + pub fn ts_tree_cursor_goto_next_sibling(arg1: *mut TSTreeCursor) -> bool; +} +extern "C" { + #[link_name = "\u{1}_ts_tree_cursor_goto_parent"] + pub fn ts_tree_cursor_goto_parent(arg1: *mut TSTreeCursor) -> bool; +} +extern "C" { + #[link_name = "\u{1}_ts_tree_cursor_current_node"] + pub fn ts_tree_cursor_current_node(arg1: *const TSTreeCursor) -> TSNode; +} +extern "C" { + #[link_name = "\u{1}_ts_language_symbol_count"] + pub fn ts_language_symbol_count(arg1: *const TSLanguage) -> u32; +} +extern "C" { + #[link_name = "\u{1}_ts_language_symbol_name"] + pub fn ts_language_symbol_name( + arg1: *const TSLanguage, + arg2: TSSymbol, + ) -> *const 
::std::os::raw::c_char; +} +extern "C" { + #[link_name = "\u{1}_ts_language_symbol_type"] + pub fn ts_language_symbol_type(arg1: *const TSLanguage, arg2: TSSymbol) -> TSSymbolType; +} +extern "C" { + #[link_name = "\u{1}_ts_language_version"] + pub fn ts_language_version(arg1: *const TSLanguage) -> u32; +} + +pub const TREE_SITTER_LANGUAGE_VERSION: usize = 8; diff --git a/src/ffi.rs b/src/ffi.rs index 7d1c06e8..323609e0 100644 --- a/src/ffi.rs +++ b/src/ffi.rs @@ -1,333 +1,4 @@ -/* automatically generated by rust-bindgen */ +#![allow(dead_code)] +#![allow(non_upper_case_globals)] -pub type FILE = [u64; 19usize]; -pub type TSSymbol = ::std::os::raw::c_ushort; -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct TSLanguage { - _unused: [u8; 0], -} -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct TSParser { - _unused: [u8; 0], -} -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct TSTree { - _unused: [u8; 0], -} -pub const TSInputEncoding_TSInputEncodingUTF8: TSInputEncoding = 0; -pub const TSInputEncoding_TSInputEncodingUTF16: TSInputEncoding = 1; -pub type TSInputEncoding = u32; -pub const TSSymbolType_TSSymbolTypeRegular: TSSymbolType = 0; -pub const TSSymbolType_TSSymbolTypeAnonymous: TSSymbolType = 1; -pub const TSSymbolType_TSSymbolTypeAuxiliary: TSSymbolType = 2; -pub type TSSymbolType = u32; -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct TSPoint { - pub row: u32, - pub column: u32, -} -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct TSRange { - pub start: TSPoint, - pub end: TSPoint, -} -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct TSInput { - pub payload: *mut ::std::os::raw::c_void, - pub read: ::std::option::Option< - unsafe extern "C" fn(payload: *mut ::std::os::raw::c_void, bytes_read: *mut u32) - -> *const ::std::os::raw::c_char, - >, - pub seek: ::std::option::Option< - unsafe extern "C" fn( - payload: *mut ::std::os::raw::c_void, - byte_index: u32, - position: TSPoint, - ) -> ::std::os::raw::c_int, - >, - pub encoding: 
TSInputEncoding, -} -pub const TSLogType_TSLogTypeParse: TSLogType = 0; -pub const TSLogType_TSLogTypeLex: TSLogType = 1; -pub type TSLogType = u32; -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct TSLogger { - pub payload: *mut ::std::os::raw::c_void, - pub log: ::std::option::Option< - unsafe extern "C" fn( - payload: *mut ::std::os::raw::c_void, - arg1: TSLogType, - arg2: *const ::std::os::raw::c_char, - ), - >, -} -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct TSInputEdit { - pub start_byte: u32, - pub old_end_byte: u32, - pub new_end_byte: u32, - pub start_point: TSPoint, - pub old_end_point: TSPoint, - pub new_end_point: TSPoint, -} -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct TSNode { - pub context: [u32; 4usize], - pub id: *const ::std::os::raw::c_void, - pub tree: *const ::std::os::raw::c_void, -} -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct TSTreeCursor { - pub context: [u32; 2usize], - pub id: *const ::std::os::raw::c_void, - pub tree: *const ::std::os::raw::c_void, -} -extern "C" { - #[link_name = "\u{1}_ts_parser_new"] - pub fn ts_parser_new() -> *mut TSParser; -} -extern "C" { - #[link_name = "\u{1}_ts_parser_delete"] - pub fn ts_parser_delete(arg1: *mut TSParser); -} -extern "C" { - #[link_name = "\u{1}_ts_parser_language"] - pub fn ts_parser_language(arg1: *const TSParser) -> *const TSLanguage; -} -extern "C" { - #[link_name = "\u{1}_ts_parser_set_language"] - pub fn ts_parser_set_language(arg1: *mut TSParser, arg2: *const TSLanguage) -> bool; -} -extern "C" { - #[link_name = "\u{1}_ts_parser_logger"] - pub fn ts_parser_logger(arg1: *const TSParser) -> TSLogger; -} -extern "C" { - #[link_name = "\u{1}_ts_parser_set_logger"] - pub fn ts_parser_set_logger(arg1: *mut TSParser, arg2: TSLogger); -} -extern "C" { - #[link_name = "\u{1}_ts_parser_print_dot_graphs"] - pub fn ts_parser_print_dot_graphs(arg1: *mut TSParser, arg2: *mut FILE); -} -extern "C" { - #[link_name = "\u{1}_ts_parser_halt_on_error"] - pub fn 
ts_parser_halt_on_error(arg1: *mut TSParser, arg2: bool); -} -extern "C" { - #[link_name = "\u{1}_ts_parser_parse"] - pub fn ts_parser_parse(arg1: *mut TSParser, arg2: *const TSTree, arg3: TSInput) -> *mut TSTree; -} -extern "C" { - #[link_name = "\u{1}_ts_parser_parse_string"] - pub fn ts_parser_parse_string( - arg1: *mut TSParser, - arg2: *const TSTree, - arg3: *const ::std::os::raw::c_char, - arg4: u32, - ) -> *mut TSTree; -} -extern "C" { - #[link_name = "\u{1}_ts_tree_copy"] - pub fn ts_tree_copy(arg1: *const TSTree) -> *mut TSTree; -} -extern "C" { - #[link_name = "\u{1}_ts_tree_delete"] - pub fn ts_tree_delete(arg1: *mut TSTree); -} -extern "C" { - #[link_name = "\u{1}_ts_tree_root_node"] - pub fn ts_tree_root_node(arg1: *const TSTree) -> TSNode; -} -extern "C" { - #[link_name = "\u{1}_ts_tree_edit"] - pub fn ts_tree_edit(arg1: *mut TSTree, arg2: *const TSInputEdit); -} -extern "C" { - #[link_name = "\u{1}_ts_tree_get_changed_ranges"] - pub fn ts_tree_get_changed_ranges( - arg1: *const TSTree, - arg2: *const TSTree, - arg3: *mut u32, - ) -> *mut TSRange; -} -extern "C" { - #[link_name = "\u{1}_ts_tree_print_dot_graph"] - pub fn ts_tree_print_dot_graph(arg1: *const TSTree, arg2: *mut FILE); -} -extern "C" { - #[link_name = "\u{1}_ts_node_start_byte"] - pub fn ts_node_start_byte(arg1: TSNode) -> u32; -} -extern "C" { - #[link_name = "\u{1}_ts_node_start_point"] - pub fn ts_node_start_point(arg1: TSNode) -> TSPoint; -} -extern "C" { - #[link_name = "\u{1}_ts_node_end_byte"] - pub fn ts_node_end_byte(arg1: TSNode) -> u32; -} -extern "C" { - #[link_name = "\u{1}_ts_node_end_point"] - pub fn ts_node_end_point(arg1: TSNode) -> TSPoint; -} -extern "C" { - #[link_name = "\u{1}_ts_node_symbol"] - pub fn ts_node_symbol(arg1: TSNode) -> TSSymbol; -} -extern "C" { - #[link_name = "\u{1}_ts_node_type"] - pub fn ts_node_type(arg1: TSNode) -> *const ::std::os::raw::c_char; -} -extern "C" { - #[link_name = "\u{1}_ts_node_string"] - pub fn ts_node_string(arg1: TSNode) -> *mut 
::std::os::raw::c_char; -} -extern "C" { - #[link_name = "\u{1}_ts_node_eq"] - pub fn ts_node_eq(arg1: TSNode, arg2: TSNode) -> bool; -} -extern "C" { - #[link_name = "\u{1}_ts_node_is_null"] - pub fn ts_node_is_null(arg1: TSNode) -> bool; -} -extern "C" { - #[link_name = "\u{1}_ts_node_is_named"] - pub fn ts_node_is_named(arg1: TSNode) -> bool; -} -extern "C" { - #[link_name = "\u{1}_ts_node_is_missing"] - pub fn ts_node_is_missing(arg1: TSNode) -> bool; -} -extern "C" { - #[link_name = "\u{1}_ts_node_has_changes"] - pub fn ts_node_has_changes(arg1: TSNode) -> bool; -} -extern "C" { - #[link_name = "\u{1}_ts_node_has_error"] - pub fn ts_node_has_error(arg1: TSNode) -> bool; -} -extern "C" { - #[link_name = "\u{1}_ts_node_parent"] - pub fn ts_node_parent(arg1: TSNode) -> TSNode; -} -extern "C" { - #[link_name = "\u{1}_ts_node_child"] - pub fn ts_node_child(arg1: TSNode, arg2: u32) -> TSNode; -} -extern "C" { - #[link_name = "\u{1}_ts_node_named_child"] - pub fn ts_node_named_child(arg1: TSNode, arg2: u32) -> TSNode; -} -extern "C" { - #[link_name = "\u{1}_ts_node_child_count"] - pub fn ts_node_child_count(arg1: TSNode) -> u32; -} -extern "C" { - #[link_name = "\u{1}_ts_node_named_child_count"] - pub fn ts_node_named_child_count(arg1: TSNode) -> u32; -} -extern "C" { - #[link_name = "\u{1}_ts_node_next_sibling"] - pub fn ts_node_next_sibling(arg1: TSNode) -> TSNode; -} -extern "C" { - #[link_name = "\u{1}_ts_node_next_named_sibling"] - pub fn ts_node_next_named_sibling(arg1: TSNode) -> TSNode; -} -extern "C" { - #[link_name = "\u{1}_ts_node_prev_sibling"] - pub fn ts_node_prev_sibling(arg1: TSNode) -> TSNode; -} -extern "C" { - #[link_name = "\u{1}_ts_node_prev_named_sibling"] - pub fn ts_node_prev_named_sibling(arg1: TSNode) -> TSNode; -} -extern "C" { - #[link_name = "\u{1}_ts_node_first_child_for_byte"] - pub fn ts_node_first_child_for_byte(arg1: TSNode, arg2: u32) -> TSNode; -} -extern "C" { - #[link_name = "\u{1}_ts_node_first_named_child_for_byte"] - pub fn 
ts_node_first_named_child_for_byte(arg1: TSNode, arg2: u32) -> TSNode; -} -extern "C" { - #[link_name = "\u{1}_ts_node_descendant_for_byte_range"] - pub fn ts_node_descendant_for_byte_range(arg1: TSNode, arg2: u32, arg3: u32) -> TSNode; -} -extern "C" { - #[link_name = "\u{1}_ts_node_named_descendant_for_byte_range"] - pub fn ts_node_named_descendant_for_byte_range(arg1: TSNode, arg2: u32, arg3: u32) -> TSNode; -} -extern "C" { - #[link_name = "\u{1}_ts_node_descendant_for_point_range"] - pub fn ts_node_descendant_for_point_range(arg1: TSNode, arg2: TSPoint, arg3: TSPoint) - -> TSNode; -} -extern "C" { - #[link_name = "\u{1}_ts_node_named_descendant_for_point_range"] - pub fn ts_node_named_descendant_for_point_range( - arg1: TSNode, - arg2: TSPoint, - arg3: TSPoint, - ) -> TSNode; -} -extern "C" { - #[link_name = "\u{1}_ts_tree_cursor_new"] - pub fn ts_tree_cursor_new(arg1: *const TSTree) -> TSTreeCursor; -} -extern "C" { - #[link_name = "\u{1}_ts_tree_cursor_delete"] - pub fn ts_tree_cursor_delete(arg1: *mut TSTreeCursor); -} -extern "C" { - #[link_name = "\u{1}_ts_tree_cursor_goto_first_child"] - pub fn ts_tree_cursor_goto_first_child(arg1: *mut TSTreeCursor) -> bool; -} -extern "C" { - #[link_name = "\u{1}_ts_tree_cursor_goto_first_child_for_byte"] - pub fn ts_tree_cursor_goto_first_child_for_byte(arg1: *mut TSTreeCursor, arg2: u32) -> i64; -} -extern "C" { - #[link_name = "\u{1}_ts_tree_cursor_goto_next_sibling"] - pub fn ts_tree_cursor_goto_next_sibling(arg1: *mut TSTreeCursor) -> bool; -} -extern "C" { - #[link_name = "\u{1}_ts_tree_cursor_goto_parent"] - pub fn ts_tree_cursor_goto_parent(arg1: *mut TSTreeCursor) -> bool; -} -extern "C" { - #[link_name = "\u{1}_ts_tree_cursor_current_node"] - pub fn ts_tree_cursor_current_node(arg1: *const TSTreeCursor) -> TSNode; -} -extern "C" { - #[link_name = "\u{1}_ts_language_symbol_count"] - pub fn ts_language_symbol_count(arg1: *const TSLanguage) -> u32; -} -extern "C" { - #[link_name = 
"\u{1}_ts_language_symbol_name"] - pub fn ts_language_symbol_name( - arg1: *const TSLanguage, - arg2: TSSymbol, - ) -> *const ::std::os::raw::c_char; -} -extern "C" { - #[link_name = "\u{1}_ts_language_symbol_type"] - pub fn ts_language_symbol_type(arg1: *const TSLanguage, arg2: TSSymbol) -> TSSymbolType; -} -extern "C" { - #[link_name = "\u{1}_ts_language_version"] - pub fn ts_language_version(arg1: *const TSLanguage) -> u32; -} - -pub const TREE_SITTER_LANGUAGE_VERSION: usize = 8; +include!("./bindings.rs"); From b1ff399960cb4a72fe9a4323ecfc9b633c35e545 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 17 May 2018 18:02:01 -0700 Subject: [PATCH 008/208] :arrow_up: tree-sitter for warning fixes --- build.rs | 21 +++++++++++---------- vendor/tree-sitter | 2 +- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/build.rs b/build.rs index 5fa5d408..8736b645 100644 --- a/build.rs +++ b/build.rs @@ -4,14 +4,15 @@ use std::env; use std::path::Path; fn main() { + let mut config = cc::Build::new(); let root_path = Path::new("vendor/tree-sitter"); - let mut config = cc::Build::new(); - config.flag_if_supported("-std=c99"); - config.flag_if_supported("-Wno-unused-parameter"); - config.include(root_path.join(Path::new("src"))); - config.include(root_path.join(Path::new("include"))); - config.include(root_path.join(Path::new("externals/utf8proc"))); + config + .flag("-std=c99") + .flag("-Wno-unused-parameter") + .include(root_path.join(Path::new("src"))) + .include(root_path.join(Path::new("include"))) + .include(root_path.join(Path::new("externals/utf8proc"))); let source_filenames = [ "get_changed_ranges.c", @@ -36,10 +37,10 @@ fn main() { if env::var("RUST_TREE_SITTER_TEST").is_ok() { let parser_dir = Path::new("fixtures/tree-sitter-rust/src"); - config.flag_if_supported("-Wno-typedef-redefinition"); - config.file(parser_dir.join("parser.c")); - config.file(parser_dir.join("scanner.c")); + config + .file(parser_dir.join("parser.c")) + 
.file(parser_dir.join("scanner.c")); } - config.compile("treesitter") + config.compile("treesitter_ffi"); } diff --git a/vendor/tree-sitter b/vendor/tree-sitter index 5ec3769c..3c01382b 160000 --- a/vendor/tree-sitter +++ b/vendor/tree-sitter @@ -1 +1 @@ -Subproject commit 5ec3769cb4c9acfda64f80d7c14abce939e8b4c5 +Subproject commit 3c01382b95364ce40f0cf9856865a30af77f9690 From 13e26b5007b19f2f98584adf594b478f2cbb9175 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 17 May 2018 18:08:44 -0700 Subject: [PATCH 009/208] Try a static flag --- build.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/build.rs b/build.rs index 8736b645..b7433f54 100644 --- a/build.rs +++ b/build.rs @@ -10,6 +10,7 @@ fn main() { config .flag("-std=c99") .flag("-Wno-unused-parameter") + .static_flag(true) .include(root_path.join(Path::new("src"))) .include(root_path.join(Path::new("include"))) .include(root_path.join(Path::new("externals/utf8proc"))); From 29dfa0550413cecb9f2fb13798e60f95522bb0ba Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 17 May 2018 19:40:06 -0700 Subject: [PATCH 010/208] Try clang --- .travis.yml | 12 +++++++++++- build.rs | 1 - 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index 10fcfe94..5b99d596 100644 --- a/.travis.yml +++ b/.travis.yml @@ -4,11 +4,21 @@ rust: - stable env: - - RUST_TREE_SITTER_TEST=1 + - CC=clang-3.6 RUST_TREE_SITTER_TEST=1 before_install: - ./script/fetch-test-fixtures.sh +compiler: clang-3.6 + +addons: + apt: + sources: + - llvm-toolchain-precise-3.6 + - ubuntu-toolchain-r-test + packages: + - clang-3.6 + branches: only: - master diff --git a/build.rs b/build.rs index b7433f54..8736b645 100644 --- a/build.rs +++ b/build.rs @@ -10,7 +10,6 @@ fn main() { config .flag("-std=c99") .flag("-Wno-unused-parameter") - .static_flag(true) .include(root_path.join(Path::new("src"))) .include(root_path.join(Path::new("include"))) .include(root_path.join(Path::new("externals/utf8proc"))); From 
e61edf539824631b4e59a8d8ed022f7a065cf95a Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 18 May 2018 09:30:00 -0700 Subject: [PATCH 011/208] Don't perform platform-specific name mangling on C functions for bindings --- script/bindgen.sh | 1 + src/bindings.rs | 57 +---------------------------------------------- 2 files changed, 2 insertions(+), 56 deletions(-) diff --git a/script/bindgen.sh b/script/bindgen.sh index 1b9008b2..699f0339 100755 --- a/script/bindgen.sh +++ b/script/bindgen.sh @@ -8,6 +8,7 @@ bindgen \ --whitelist-type '^TS.*' \ --whitelist-function '^ts_.*' \ --opaque-type FILE \ + --distrust-clang-mangling \ $header_path > $output_path echo "" >> $output_path diff --git a/src/bindings.rs b/src/bindings.rs index 7d1c06e8..1ab49bde 100644 --- a/src/bindings.rs +++ b/src/bindings.rs @@ -1,7 +1,7 @@ /* automatically generated by rust-bindgen */ pub type FILE = [u64; 19usize]; -pub type TSSymbol = ::std::os::raw::c_ushort; +pub type TSSymbol = u16; #[repr(C)] #[derive(Debug, Copy, Clone)] pub struct TSLanguage { @@ -93,43 +93,33 @@ pub struct TSTreeCursor { pub tree: *const ::std::os::raw::c_void, } extern "C" { - #[link_name = "\u{1}_ts_parser_new"] pub fn ts_parser_new() -> *mut TSParser; } extern "C" { - #[link_name = "\u{1}_ts_parser_delete"] pub fn ts_parser_delete(arg1: *mut TSParser); } extern "C" { - #[link_name = "\u{1}_ts_parser_language"] pub fn ts_parser_language(arg1: *const TSParser) -> *const TSLanguage; } extern "C" { - #[link_name = "\u{1}_ts_parser_set_language"] pub fn ts_parser_set_language(arg1: *mut TSParser, arg2: *const TSLanguage) -> bool; } extern "C" { - #[link_name = "\u{1}_ts_parser_logger"] pub fn ts_parser_logger(arg1: *const TSParser) -> TSLogger; } extern "C" { - #[link_name = "\u{1}_ts_parser_set_logger"] pub fn ts_parser_set_logger(arg1: *mut TSParser, arg2: TSLogger); } extern "C" { - #[link_name = "\u{1}_ts_parser_print_dot_graphs"] pub fn ts_parser_print_dot_graphs(arg1: *mut TSParser, arg2: *mut FILE); } extern 
"C" { - #[link_name = "\u{1}_ts_parser_halt_on_error"] pub fn ts_parser_halt_on_error(arg1: *mut TSParser, arg2: bool); } extern "C" { - #[link_name = "\u{1}_ts_parser_parse"] pub fn ts_parser_parse(arg1: *mut TSParser, arg2: *const TSTree, arg3: TSInput) -> *mut TSTree; } extern "C" { - #[link_name = "\u{1}_ts_parser_parse_string"] pub fn ts_parser_parse_string( arg1: *mut TSParser, arg2: *const TSTree, @@ -138,23 +128,18 @@ extern "C" { ) -> *mut TSTree; } extern "C" { - #[link_name = "\u{1}_ts_tree_copy"] pub fn ts_tree_copy(arg1: *const TSTree) -> *mut TSTree; } extern "C" { - #[link_name = "\u{1}_ts_tree_delete"] pub fn ts_tree_delete(arg1: *mut TSTree); } extern "C" { - #[link_name = "\u{1}_ts_tree_root_node"] pub fn ts_tree_root_node(arg1: *const TSTree) -> TSNode; } extern "C" { - #[link_name = "\u{1}_ts_tree_edit"] pub fn ts_tree_edit(arg1: *mut TSTree, arg2: *const TSInputEdit); } extern "C" { - #[link_name = "\u{1}_ts_tree_get_changed_ranges"] pub fn ts_tree_get_changed_ranges( arg1: *const TSTree, arg2: *const TSTree, @@ -162,120 +147,91 @@ extern "C" { ) -> *mut TSRange; } extern "C" { - #[link_name = "\u{1}_ts_tree_print_dot_graph"] pub fn ts_tree_print_dot_graph(arg1: *const TSTree, arg2: *mut FILE); } extern "C" { - #[link_name = "\u{1}_ts_node_start_byte"] pub fn ts_node_start_byte(arg1: TSNode) -> u32; } extern "C" { - #[link_name = "\u{1}_ts_node_start_point"] pub fn ts_node_start_point(arg1: TSNode) -> TSPoint; } extern "C" { - #[link_name = "\u{1}_ts_node_end_byte"] pub fn ts_node_end_byte(arg1: TSNode) -> u32; } extern "C" { - #[link_name = "\u{1}_ts_node_end_point"] pub fn ts_node_end_point(arg1: TSNode) -> TSPoint; } extern "C" { - #[link_name = "\u{1}_ts_node_symbol"] pub fn ts_node_symbol(arg1: TSNode) -> TSSymbol; } extern "C" { - #[link_name = "\u{1}_ts_node_type"] pub fn ts_node_type(arg1: TSNode) -> *const ::std::os::raw::c_char; } extern "C" { - #[link_name = "\u{1}_ts_node_string"] pub fn ts_node_string(arg1: TSNode) -> *mut 
::std::os::raw::c_char; } extern "C" { - #[link_name = "\u{1}_ts_node_eq"] pub fn ts_node_eq(arg1: TSNode, arg2: TSNode) -> bool; } extern "C" { - #[link_name = "\u{1}_ts_node_is_null"] pub fn ts_node_is_null(arg1: TSNode) -> bool; } extern "C" { - #[link_name = "\u{1}_ts_node_is_named"] pub fn ts_node_is_named(arg1: TSNode) -> bool; } extern "C" { - #[link_name = "\u{1}_ts_node_is_missing"] pub fn ts_node_is_missing(arg1: TSNode) -> bool; } extern "C" { - #[link_name = "\u{1}_ts_node_has_changes"] pub fn ts_node_has_changes(arg1: TSNode) -> bool; } extern "C" { - #[link_name = "\u{1}_ts_node_has_error"] pub fn ts_node_has_error(arg1: TSNode) -> bool; } extern "C" { - #[link_name = "\u{1}_ts_node_parent"] pub fn ts_node_parent(arg1: TSNode) -> TSNode; } extern "C" { - #[link_name = "\u{1}_ts_node_child"] pub fn ts_node_child(arg1: TSNode, arg2: u32) -> TSNode; } extern "C" { - #[link_name = "\u{1}_ts_node_named_child"] pub fn ts_node_named_child(arg1: TSNode, arg2: u32) -> TSNode; } extern "C" { - #[link_name = "\u{1}_ts_node_child_count"] pub fn ts_node_child_count(arg1: TSNode) -> u32; } extern "C" { - #[link_name = "\u{1}_ts_node_named_child_count"] pub fn ts_node_named_child_count(arg1: TSNode) -> u32; } extern "C" { - #[link_name = "\u{1}_ts_node_next_sibling"] pub fn ts_node_next_sibling(arg1: TSNode) -> TSNode; } extern "C" { - #[link_name = "\u{1}_ts_node_next_named_sibling"] pub fn ts_node_next_named_sibling(arg1: TSNode) -> TSNode; } extern "C" { - #[link_name = "\u{1}_ts_node_prev_sibling"] pub fn ts_node_prev_sibling(arg1: TSNode) -> TSNode; } extern "C" { - #[link_name = "\u{1}_ts_node_prev_named_sibling"] pub fn ts_node_prev_named_sibling(arg1: TSNode) -> TSNode; } extern "C" { - #[link_name = "\u{1}_ts_node_first_child_for_byte"] pub fn ts_node_first_child_for_byte(arg1: TSNode, arg2: u32) -> TSNode; } extern "C" { - #[link_name = "\u{1}_ts_node_first_named_child_for_byte"] pub fn ts_node_first_named_child_for_byte(arg1: TSNode, arg2: u32) -> TSNode; 
} extern "C" { - #[link_name = "\u{1}_ts_node_descendant_for_byte_range"] pub fn ts_node_descendant_for_byte_range(arg1: TSNode, arg2: u32, arg3: u32) -> TSNode; } extern "C" { - #[link_name = "\u{1}_ts_node_named_descendant_for_byte_range"] pub fn ts_node_named_descendant_for_byte_range(arg1: TSNode, arg2: u32, arg3: u32) -> TSNode; } extern "C" { - #[link_name = "\u{1}_ts_node_descendant_for_point_range"] pub fn ts_node_descendant_for_point_range(arg1: TSNode, arg2: TSPoint, arg3: TSPoint) -> TSNode; } extern "C" { - #[link_name = "\u{1}_ts_node_named_descendant_for_point_range"] pub fn ts_node_named_descendant_for_point_range( arg1: TSNode, arg2: TSPoint, @@ -283,50 +239,39 @@ extern "C" { ) -> TSNode; } extern "C" { - #[link_name = "\u{1}_ts_tree_cursor_new"] pub fn ts_tree_cursor_new(arg1: *const TSTree) -> TSTreeCursor; } extern "C" { - #[link_name = "\u{1}_ts_tree_cursor_delete"] pub fn ts_tree_cursor_delete(arg1: *mut TSTreeCursor); } extern "C" { - #[link_name = "\u{1}_ts_tree_cursor_goto_first_child"] pub fn ts_tree_cursor_goto_first_child(arg1: *mut TSTreeCursor) -> bool; } extern "C" { - #[link_name = "\u{1}_ts_tree_cursor_goto_first_child_for_byte"] pub fn ts_tree_cursor_goto_first_child_for_byte(arg1: *mut TSTreeCursor, arg2: u32) -> i64; } extern "C" { - #[link_name = "\u{1}_ts_tree_cursor_goto_next_sibling"] pub fn ts_tree_cursor_goto_next_sibling(arg1: *mut TSTreeCursor) -> bool; } extern "C" { - #[link_name = "\u{1}_ts_tree_cursor_goto_parent"] pub fn ts_tree_cursor_goto_parent(arg1: *mut TSTreeCursor) -> bool; } extern "C" { - #[link_name = "\u{1}_ts_tree_cursor_current_node"] pub fn ts_tree_cursor_current_node(arg1: *const TSTreeCursor) -> TSNode; } extern "C" { - #[link_name = "\u{1}_ts_language_symbol_count"] pub fn ts_language_symbol_count(arg1: *const TSLanguage) -> u32; } extern "C" { - #[link_name = "\u{1}_ts_language_symbol_name"] pub fn ts_language_symbol_name( arg1: *const TSLanguage, arg2: TSSymbol, ) -> *const ::std::os::raw::c_char; 
} extern "C" { - #[link_name = "\u{1}_ts_language_symbol_type"] pub fn ts_language_symbol_type(arg1: *const TSLanguage, arg2: TSSymbol) -> TSSymbolType; } extern "C" { - #[link_name = "\u{1}_ts_language_version"] pub fn ts_language_version(arg1: *const TSLanguage) -> u32; } From 29c0cd3aa4d9e569c0ea2d1b4ea2652e207ca51a Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 18 May 2018 09:48:24 -0700 Subject: [PATCH 012/208] Add appveyor config --- README.md | 1 + appveyor.yml | 24 ++++++++++++++++++++++++ script/fetch-test-fixtures.cmd | 16 ++++++++++++++++ 3 files changed, 41 insertions(+) create mode 100644 appveyor.yml create mode 100755 script/fetch-test-fixtures.cmd diff --git a/README.md b/README.md index 08df0e4e..40f5624f 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,7 @@ Rust Tree-sitter =========================== [![Build Status](https://travis-ci.org/tree-sitter/rust-tree-sitter.svg)](https://travis-ci.org/tree-sitter/rust-tree-sitter) +[![Build status](https://ci.appveyor.com/api/projects/status/d0f6vqq3rflxx3y6/branch/master?svg=true)](https://ci.appveyor.com/project/maxbrunsfeld/rust-tree-sitter/branch/master) Rust bindings to the [Tree-sitter][] parsing library. 
diff --git a/appveyor.yml b/appveyor.yml new file mode 100644 index 00000000..23fe3d97 --- /dev/null +++ b/appveyor.yml @@ -0,0 +1,24 @@ +environment: + RUST_TREE_SITTER_TEST: true + +build: false + +install: + - appveyor DownloadFile https://win.rustup.rs/ -FileName rustup-init.exe + - rustup-init -yv --default-toolchain stable + - set PATH=%PATH%;%USERPROFILE%\.cargo\bin + - rustc -vV + - cargo -vV + - script\fetch-test-fixtures.cmd + +test_script: + - cargo build + - cargo test + +branches: + only: + - master + +cache: + - fixtures + - C:\Users\appveyor\.cargo diff --git a/script/fetch-test-fixtures.cmd b/script/fetch-test-fixtures.cmd new file mode 100755 index 00000000..33543961 --- /dev/null +++ b/script/fetch-test-fixtures.cmd @@ -0,0 +1,16 @@ +@Echo off +SETLOCAL + +Set grammar_dir=fixtures\tree-sitter-rust +Set grammar_url=https://github.com/tree-sitter/tree-sitter-rust + +@IF NOT EXIST %grammar_dir% ( + git clone %grammar_url% %grammar_dir% --depth=1 +) + +pushd %grammar_dir% +git fetch origin master --depth=1 +git reset --hard origin/master +popd + +ENDLOCAL From 8d485857e10d90f76c344811a2da645ddfb74bd2 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 18 May 2018 10:01:37 -0700 Subject: [PATCH 013/208] Tweak build script for windows --- build.rs | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/build.rs b/build.rs index 8736b645..c1e768ff 100644 --- a/build.rs +++ b/build.rs @@ -1,18 +1,18 @@ extern crate cc; use std::env; -use std::path::Path; +use std::path::PathBuf; fn main() { let mut config = cc::Build::new(); - let root_path = Path::new("vendor/tree-sitter"); + let root_path: PathBuf = ["vendor", "tree-sitter"].iter().collect(); config - .flag("-std=c99") - .flag("-Wno-unused-parameter") - .include(root_path.join(Path::new("src"))) - .include(root_path.join(Path::new("include"))) - .include(root_path.join(Path::new("externals/utf8proc"))); + .flag_if_supported("-std=c99") + 
.flag_if_supported("-Wno-unused-parameter") + .include(root_path.join("src")) + .include(root_path.join("include")) + .include(root_path.join("externals").join("utf8proc")); let source_filenames = [ "get_changed_ranges.c", @@ -29,18 +29,19 @@ fn main() { config.files(source_filenames.iter().map(|source_filename| { root_path - .join(Path::new(&"src/runtime")) - .join(Path::new(&source_filename)) + .join("src") + .join("runtime") + .join(&source_filename) })); - config.file(root_path.join(Path::new("externals/utf8proc/utf8proc.c"))); + config.file(root_path.join("externals").join("utf8proc").join("utf8proc.c")); if env::var("RUST_TREE_SITTER_TEST").is_ok() { - let parser_dir = Path::new("fixtures/tree-sitter-rust/src"); + let parser_dir: PathBuf = ["fixtures", "tree-sitter-rust", "src"].iter().collect(); config .file(parser_dir.join("parser.c")) .file(parser_dir.join("scanner.c")); } - config.compile("treesitter_ffi"); + config.compile("tree-sitter-runtime"); } From 7748f8e1687042fc477890378fc653c152bc2b31 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 18 May 2018 10:16:28 -0700 Subject: [PATCH 014/208] Fetch submodules on appveyor --- appveyor.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/appveyor.yml b/appveyor.yml index 23fe3d97..22c8b96e 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -4,11 +4,14 @@ environment: build: false install: + - git submodule update --init --recursive + - appveyor DownloadFile https://win.rustup.rs/ -FileName rustup-init.exe - rustup-init -yv --default-toolchain stable - set PATH=%PATH%;%USERPROFILE%\.cargo\bin - rustc -vV - cargo -vV + - script\fetch-test-fixtures.cmd test_script: From 654789f92534b4fd6d59006a13353edc923da1cb Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 18 May 2018 10:27:36 -0700 Subject: [PATCH 015/208] Use UTF8PROC_STATIC macro --- build.rs | 1 + vendor/tree-sitter | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/build.rs b/build.rs index c1e768ff..7d9ee83e 100644 --- 
a/build.rs +++ b/build.rs @@ -8,6 +8,7 @@ fn main() { let root_path: PathBuf = ["vendor", "tree-sitter"].iter().collect(); config + .define("UTF8PROC_STATIC", "") .flag_if_supported("-std=c99") .flag_if_supported("-Wno-unused-parameter") .include(root_path.join("src")) diff --git a/vendor/tree-sitter b/vendor/tree-sitter index 3c01382b..9c1e82a7 160000 --- a/vendor/tree-sitter +++ b/vendor/tree-sitter @@ -1 +1 @@ -Subproject commit 3c01382b95364ce40f0cf9856865a30af77f9690 +Subproject commit 9c1e82a7eac97767cee0469faa2722fd5753b065 From 993bfea669b1ba49fa4a37b11abd82c5206f0209 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 18 May 2018 10:39:00 -0700 Subject: [PATCH 016/208] Add missing source file --- build.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/build.rs b/build.rs index 7d9ee83e..2843c758 100644 --- a/build.rs +++ b/build.rs @@ -22,6 +22,7 @@ fn main() { "node.c", "parser.c", "stack.c", + "string_input.c", "subtree.c", "tree_cursor.c", "tree.c", From 4603542747743e0f0bb1361a8cdb3d4abbb089b0 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 18 May 2018 10:44:14 -0700 Subject: [PATCH 017/208] Add more public methods and tests --- src/lib.rs | 134 +++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 124 insertions(+), 10 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index fa1db0f9..ef53e4de 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -253,15 +253,27 @@ impl<'a> Node<'a> { } } - pub fn name(&self) -> &'static str { + pub fn kind(&self) -> &'static str { unsafe { CStr::from_ptr(ffi::ts_node_type(self.0)) }.to_str().unwrap() } - pub fn start_index(&self) -> u32 { + pub fn is_named(&self) -> bool { + unsafe { ffi::ts_node_is_named(self.0) } + } + + pub fn has_changes(&self) -> bool { + unsafe { ffi::ts_node_has_changes(self.0) } + } + + pub fn has_error(&self) -> bool { + unsafe { ffi::ts_node_has_error(self.0) } + } + + pub fn start_byte(&self) -> u32 { unsafe { ffi::ts_node_start_byte(self.0) } } - pub fn 
end_index(&self) -> u32 { + pub fn end_byte(&self) -> u32 { unsafe { ffi::ts_node_end_byte(self.0) } } @@ -289,10 +301,34 @@ impl<'a> Node<'a> { unsafe { ffi::ts_node_child_count(self.0) } } + pub fn named_child(&self, i: u32) -> Option { + Self::new(unsafe { ffi::ts_node_named_child(self.0, i) }) + } + + pub fn named_child_count(&self) -> u32 { + unsafe { ffi::ts_node_named_child_count(self.0) } + } + pub fn parent(&self) -> Option { Self::new(unsafe { ffi::ts_node_parent(self.0) }) } + pub fn next_sibling(&self) -> Option { + Self::new(unsafe { ffi::ts_node_next_sibling(self.0) }) + } + + pub fn prev_sibling(&self) -> Option { + Self::new(unsafe { ffi::ts_node_prev_sibling(self.0) }) + } + + pub fn next_named_sibling(&self) -> Option { + Self::new(unsafe { ffi::ts_node_next_named_sibling(self.0) }) + } + + pub fn prev_named_sibling(&self) -> Option { + Self::new(unsafe { ffi::ts_node_prev_named_sibling(self.0) }) + } + pub fn to_sexp(&self) -> String { let c_string = unsafe { ffi::ts_node_string(self.0) }; let result = unsafe { CStr::from_ptr(c_string) }.to_str().unwrap().to_string(); @@ -304,26 +340,26 @@ impl<'a> Node<'a> { extern "C" { fn free(pointer: *mut c_void); } impl<'a> TreeCursor<'a> { - fn node(&'a self) -> Node<'a> { + pub fn node(&'a self) -> Node<'a> { Node( unsafe { ffi::ts_tree_cursor_current_node(&self.0) }, PhantomData, ) } - fn goto_first_child(&mut self) -> bool { + pub fn goto_first_child(&mut self) -> bool { return unsafe { ffi::ts_tree_cursor_goto_first_child(&mut self.0) }; } - fn goto_parent(&mut self) -> bool { + pub fn goto_parent(&mut self) -> bool { return unsafe { ffi::ts_tree_cursor_goto_parent(&mut self.0) }; } - fn goto_next_sibling(&mut self) -> bool { + pub fn goto_next_sibling(&mut self) -> bool { return unsafe { ffi::ts_tree_cursor_goto_next_sibling(&mut self.0) }; } - fn goto_first_child_for_index(&mut self, index: u32) -> Option { + pub fn goto_first_child_for_index(&mut self, index: u32) -> Option { let result = unsafe { 
ffi::ts_tree_cursor_goto_first_child_for_byte(&mut self.0, index) }; if result < 0 { None @@ -378,7 +414,7 @@ mod tests { ", None).unwrap(); let root_node = tree.root_node(); - assert_eq!(root_node.name(), "source_file"); + assert_eq!(root_node.kind(), "source_file"); assert_eq!( root_node.to_sexp(), @@ -386,7 +422,7 @@ mod tests { ); let struct_node = root_node.child(0).unwrap(); - assert_eq!(struct_node.name(), "struct_item"); + assert_eq!(struct_node.kind(), "struct_item"); } #[test] @@ -407,4 +443,82 @@ mod tests { assert!(messages.contains(&(LogType::Parse, "reduce sym:struct_item, child_count:3".to_string()))); assert!(messages.contains(&(LogType::Lex, "skip character:' '".to_string()))); } + + #[test] + fn test_tree_cursor() { + let mut parser = Parser::new(); + parser.set_language(rust()); + + let tree = parser.parse_str(" + struct Stuff { + a: A; + b: Option, + } + ", None).unwrap(); + + let mut cursor = tree.walk(); + assert_eq!(cursor.node().kind(), "source_file"); + + assert!(cursor.goto_first_child()); + assert_eq!(cursor.node().kind(), "struct_item"); + + assert!(cursor.goto_first_child()); + assert_eq!(cursor.node().kind(), "struct"); + assert_eq!(cursor.node().is_named(), false); + + assert!(cursor.goto_next_sibling()); + assert_eq!(cursor.node().kind(), "type_identifier"); + assert_eq!(cursor.node().is_named(), true); + + assert!(cursor.goto_next_sibling()); + assert_eq!(cursor.node().kind(), "field_declaration_list"); + assert_eq!(cursor.node().is_named(), true); + } + + #[test] + fn test_custom_utf8_input() { + struct LineBasedInput { + lines: &'static [&'static str], + row: usize, + column: usize, + } + + impl Utf8Input for LineBasedInput { + fn read(&mut self) -> &[u8] { + if self.row < self.lines.len() { + let result = &self.lines[self.row].as_bytes()[self.column..]; + self.row += 1; + self.column = 0; + result + } else { + &[] + } + } + + fn seek(&mut self, _byte: u32, position: Point) { + self.row = position.row as usize; + self.column = 
position.column as usize; + } + } + + let mut parser = Parser::new(); + parser.set_language(rust()); + + let mut input = LineBasedInput { + lines: &[ + "pub fn main() {", + "}", + ], + row: 0, + column: 0 + }; + + let tree = parser.parse_utf8(&mut input, None).unwrap(); + let root = tree.root_node(); + assert_eq!(root.kind(), "source_file"); + assert_eq!(root.has_error(), false); + + let child = root.child(0).unwrap(); + assert_eq!(child.kind(), "function_item"); + } } From e10a817704c3982b4ed41928b2b504cdbdbaf702 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 18 May 2018 10:55:42 -0700 Subject: [PATCH 018/208] Switch back to default c compiler on travis --- .travis.yml | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/.travis.yml b/.travis.yml index 5b99d596..10fcfe94 100644 --- a/.travis.yml +++ b/.travis.yml @@ -4,21 +4,11 @@ rust: - stable env: - - CC=clang-3.6 RUST_TREE_SITTER_TEST=1 + - RUST_TREE_SITTER_TEST=1 before_install: - ./script/fetch-test-fixtures.sh -compiler: clang-3.6 - -addons: - apt: - sources: - - llvm-toolchain-precise-3.6 - - ubuntu-toolchain-r-test - packages: - - clang-3.6 - branches: only: - master From 870dc11f791425f441eb6e84f86332f4a6b1a21a Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 18 May 2018 11:15:37 -0700 Subject: [PATCH 019/208] Implement Eq and Debug for Node --- src/lib.rs | 66 ++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 54 insertions(+), 12 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index ef53e4de..2ecc7341 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,13 +1,11 @@ mod ffi; +use std::fmt; use std::ffi::CStr; use std::marker::PhantomData; use std::os::raw::{c_char, c_int, c_void}; use std::ptr; -#[derive(Clone, Copy)] -pub struct Symbol(ffi::TSSymbol); - pub type Language = *const ffi::TSLanguage; pub trait Utf16Input { @@ -26,13 +24,13 @@ pub enum LogType { Lex, } -#[derive(Clone, Copy, PartialEq, Eq)] +#[derive(Clone, Copy, Debug, PartialEq, 
Eq)] pub struct Point { pub row: u32, pub column: u32, } -#[derive(Clone, Copy, PartialEq, Eq)] +#[derive(Clone, Copy, Debug, PartialEq, Eq)] pub struct InputEdit { pub start_byte: u32, pub old_end_byte: u32, @@ -63,9 +61,19 @@ impl Parser { } } - pub fn set_language(&mut self, language: Language) { + pub fn set_language(&mut self, language: Language) -> Result<(), String> { unsafe { - ffi::ts_parser_set_language(self.0, language); + let version = ffi::ts_language_version(language) as usize; + if version == ffi::TREE_SITTER_LANGUAGE_VERSION { + ffi::ts_parser_set_language(self.0, language); + Ok(()) + } else { + Err(format!( + "Incompatible language version {}. Expected {}.", + version, + ffi::TREE_SITTER_LANGUAGE_VERSION + )) + } } } @@ -253,6 +261,10 @@ impl<'a> Node<'a> { } } + pub fn kind_id(&self) -> u16 { + unsafe { ffi::ts_node_symbol(self.0) } + } + pub fn kind(&self) -> &'static str { unsafe { CStr::from_ptr(ffi::ts_node_type(self.0)) }.to_str().unwrap() } @@ -330,6 +342,8 @@ impl<'a> Node<'a> { } pub fn to_sexp(&self) -> String { + extern "C" { fn free(pointer: *mut c_void); } + let c_string = unsafe { ffi::ts_node_string(self.0) }; let result = unsafe { CStr::from_ptr(c_string) }.to_str().unwrap().to_string(); unsafe { free(c_string as *mut c_void) }; @@ -337,7 +351,17 @@ impl<'a> Node<'a> { } } -extern "C" { fn free(pointer: *mut c_void); } +impl<'a> PartialEq for Node<'a> { + fn eq(&self, other: &Self) -> bool { + self.0.id == other.0.id + } +} + +impl<'a> fmt::Debug for Node<'a> { + fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> { + write!(f, "{{Node {} {} - {}}}", self.kind(), self.start_position(), self.end_position()) + } +} impl<'a> TreeCursor<'a> { pub fn node(&'a self) -> Node<'a> { @@ -375,6 +399,12 @@ impl<'a> Drop for TreeCursor<'a> { } } +impl fmt::Display for Point { + fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> { + write!(f, "({}, {})", self.row, self.column) + } +} + impl Into for Point { fn into(self) 
-> ffi::TSPoint { ffi::TSPoint { @@ -406,7 +436,7 @@ mod tests { #[test] fn test_basic_parsing() { let mut parser = Parser::new(); - parser.set_language(rust()); + parser.set_language(rust()).unwrap(); let tree = parser.parse_str(" struct Stuff {} @@ -428,7 +458,7 @@ mod tests { #[test] fn test_logging() { let mut parser = Parser::new(); - parser.set_language(rust()); + parser.set_language(rust()).unwrap(); let mut messages = Vec::new(); parser.set_logger(Some(&mut |log_type, message| { @@ -447,7 +477,7 @@ mod tests { #[test] fn test_tree_cursor() { let mut parser = Parser::new(); - parser.set_language(rust()); + parser.set_language(rust()).unwrap(); let tree = parser.parse_str(" struct Stuff { @@ -502,7 +532,7 @@ mod tests { } let mut parser = Parser::new(); - parser.set_language(rust()); + parser.set_language(rust()).unwrap(); let mut input = LineBasedInput { lines: &[ @@ -521,4 +551,16 @@ mod tests { let child = root.child(0).unwrap(); assert_eq!(child.kind(), "function_item"); } + + #[test] + fn test_node_equality() { + let mut parser = Parser::new(); + parser.set_language(rust()).unwrap(); + let tree = parser.parse_str("struct A {}", None).unwrap(); + let node1 = tree.root_node(); + let node2 = tree.root_node(); + assert_eq!(node1, node2); + assert_eq!(node1.child(0).unwrap(), node2.child(0).unwrap()); + assert_ne!(node1.child(0).unwrap(), node2); + } } From a27ac49dea32cb296ff4ebdd939c7fa01a3d72e7 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 18 May 2018 11:42:13 -0700 Subject: [PATCH 020/208] Flesh out README --- README.md | 90 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 90 insertions(+) diff --git a/README.md b/README.md index 40f5624f..43270713 100644 --- a/README.md +++ b/README.md @@ -6,4 +6,94 @@ Rust Tree-sitter Rust bindings to the [Tree-sitter][] parsing library. +### Basic Usage + +First, create a parser: + +```rust +let parser = Parser::new(); +``` + +Then assign a language to the parser. 
Tree-sitter languages consist of generated C code. To use them from rust, you must declare them as `extern "C"` functions and invoke them with `unsafe`: + +```rust +extern "C" fn tree_sitter_c() -> Language; +extern "C" fn tree_sitter_rust() -> Language; +extern "C" fn tree_sitter_javascript() -> Language; + +parser.set_language(unsafe { tree_sitter_rust() }).unwrap(); +``` + +Now you can parse source code: + +```rust +let source_code = "fn test() {}"; + +let tree = parser.parse_str(source_code, None); +let root_node = tree.root_node(); +assert_eq!(root_node.kind(), "source_file"); +assert_eq!(root_node.start_position().column, 0); +assert_eq!(root_node.end_position().column, 12); +``` + +### Editing + +Once you have a syntax tree, you can update it when your source code changes: + +```rust +let new_source_code = "fn test(a: u32) {}" + +tree.edit(InputEdit { + start_byte: 8, + old_end_byte: 8, + new_end_byte: 14, + start_position: Point::new(0, 8), + old_end_position: Point::new(0, 8), + new_end_position: Point::new(0, 14), +}); +let new_tree = parser.parse_str(new_source_code, Some(tree)); +``` + +### Text Input + + +The code can be provided either as a simple string or by any type that implements Tree-sitter's `Utf8Input` or `Utf16Input` traits: + +```rust +struct LineWiseInput { + lines: &'static [&'static str], + row: usize, + column: usize, +} + +impl tree_sitter::Utf8Input for LineWiseInput { + fn read(&mut self) -> &[u8] { + if self.row < self.lines.len() { + let result = &self.lines[self.row].as_bytes()[self.column..]; + self.row += 1; + self.column = 0; + result + } else { + &[] + } + } + + fn seek(&mut self, _byte: u32, position: Point) { + self.row = position.row as usize; + self.column = position.column as usize; + } +} + +let mut input = LineBasedInput { + lines: &[ + "pub fn main() {", + "}", + ], + row: 0, + column: 0 +}; + +let tree = parser.parse_utf8(&mut input, None).unwrap(); +``` + [tree-sitter]: https://github.com/tree-sitter/tree-sitter From 
c0b49e99357fbe25d62d800c9da2fd47566e9b31 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 18 May 2018 11:51:46 -0700 Subject: [PATCH 021/208] Fix include globs in package manifest --- Cargo.toml | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index e20d40aa..560d9a71 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,12 +1,19 @@ [package] name = "tree-sitter" +description = "Rust bindings to the Tree-sitter parsing library" version = "0.1.0" authors = ["Max Brunsfeld "] build = "build.rs" -exclude = ["vendor/tree-sitter/**/*"] +license = "MIT" include = [ - "vendor/tree-sitter/src/runtime/*", - "vendor/tree-sitter/externals/utf8proc/utf8proc*" + "/build.rs", + "/Cargo.toml", + "/LICENSE", + "/README.md", + "/src/*", + "/vendor/tree-sitter/externals/utf8proc/utf8proc*", + "/vendor/tree-sitter/include/*", + "/vendor/tree-sitter/src/runtime/*", ] [build-dependencies] From e6d580597d5925f3d43bf01b2101d6e0ca9643fc Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 18 May 2018 12:02:52 -0700 Subject: [PATCH 022/208] Add crates.io badge to README --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 43270713..da6e1a80 100644 --- a/README.md +++ b/README.md @@ -3,6 +3,7 @@ Rust Tree-sitter [![Build Status](https://travis-ci.org/tree-sitter/rust-tree-sitter.svg)](https://travis-ci.org/tree-sitter/rust-tree-sitter) [![Build status](https://ci.appveyor.com/api/projects/status/d0f6vqq3rflxx3y6/branch/master?svg=true)](https://ci.appveyor.com/project/maxbrunsfeld/rust-tree-sitter/branch/master) +[![Crates.io](https://img.shields.io/crates/v/tree-sitter.svg)](https://crates.io/crates/tree-sitter) Rust bindings to the [Tree-sitter][] parsing library. 
From 819b14070123c4f6c61aa73c72654ce1b97fef16 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 18 May 2018 14:06:49 -0700 Subject: [PATCH 023/208] Make set_logger take a boxed function --- src/lib.rs | 68 ++++++++++++++++++++++++++++++++---------------------- 1 file changed, 41 insertions(+), 27 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 2ecc7341..5ef80f70 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -24,6 +24,8 @@ pub enum LogType { Lex, } +type Logger<'a> = Box; + #[derive(Clone, Copy, Debug, PartialEq, Eq)] pub struct Point { pub row: u32, @@ -44,7 +46,7 @@ pub struct Node<'a>(ffi::TSNode, PhantomData<&'a ()>); pub struct Parser(*mut ffi::TSParser); -pub struct Tree(*mut ffi::TSTree, ffi::TSInputEncoding); +pub struct Tree(*mut ffi::TSTree); pub struct TreeCursor<'a>(ffi::TSTreeCursor, PhantomData<&'a ()>); @@ -77,28 +79,42 @@ impl Parser { } } - pub fn set_logger ()>(&mut self, logger: Option<&mut F>) { - unsafe extern "C" fn log ()>( - payload: *mut c_void, - c_log_type: ffi::TSLogType, - c_message: *const c_char, - ) { - let callback = (payload as *mut F).as_mut().unwrap(); - if let Ok(message) = CStr::from_ptr(c_message).to_str() { - let log_type = if c_log_type == ffi::TSLogType_TSLogTypeParse { - LogType::Parse - } else { - LogType::Lex - }; - callback(log_type, message); - } - }; + pub fn logger(&self) -> Option<&Logger> { + let logger = unsafe { ffi::ts_parser_logger(self.0) }; + unsafe { (logger.payload as *mut Logger).as_ref() } + } + + pub fn set_logger(&mut self, logger: Option) { + let prev_logger = unsafe { ffi::ts_parser_logger(self.0) }; + if !prev_logger.payload.is_null() { + unsafe { Box::from_raw(prev_logger.payload as *mut Logger) }; + } let c_logger; if let Some(logger) = logger { + let container = Box::new(logger); + + unsafe extern "C" fn log( + payload: *mut c_void, + c_log_type: ffi::TSLogType, + c_message: *const c_char, + ) { + let callback = (payload as *mut Logger).as_mut().unwrap(); + if let Ok(message) = 
CStr::from_ptr(c_message).to_str() { + let log_type = if c_log_type == ffi::TSLogType_TSLogTypeParse { + LogType::Parse + } else { + LogType::Lex + }; + callback(log_type, message); + } + }; + + let raw_container = Box::into_raw(container); + c_logger = ffi::TSLogger { - payload: logger as *mut F as *mut c_void, - log: Some(log::), + payload: raw_container as *mut c_void, + log: Some(log), }; } else { c_logger = ffi::TSLogger { payload: ptr::null_mut(), log: None }; @@ -156,7 +172,7 @@ impl Parser { if new_tree_ptr.is_null() { None } else { - Some(Tree(new_tree_ptr, ffi::TSInputEncoding_TSInputEncodingUTF8)) + Some(Tree(new_tree_ptr)) } } @@ -204,16 +220,14 @@ impl Parser { if new_tree_ptr.is_null() { None } else { - Some(Tree( - new_tree_ptr, - ffi::TSInputEncoding_TSInputEncodingUTF16, - )) + Some(Tree(new_tree_ptr)) } } } impl Drop for Parser { fn drop(&mut self) { + self.set_logger(None); unsafe { ffi::ts_parser_delete(self.0) } } } @@ -248,7 +262,7 @@ impl Drop for Tree { impl Clone for Tree { fn clone(&self) -> Tree { - unsafe { Tree(ffi::ts_tree_copy(self.0), self.1) } + unsafe { Tree(ffi::ts_tree_copy(self.0)) } } } @@ -461,9 +475,9 @@ mod tests { parser.set_language(rust()).unwrap(); let mut messages = Vec::new(); - parser.set_logger(Some(&mut |log_type, message| { + parser.set_logger(Some(Box::new(|log_type, message| { messages.push((log_type, message.to_string())); - })); + }))); parser.parse_str(" struct Stuff {} From 4da669ce8d23cbfaeaba2d2c5969b678779ff0e9 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 18 May 2018 14:27:08 -0700 Subject: [PATCH 024/208] Fix bugs in editing/reparsing --- README.md | 2 +- src/lib.rs | 101 ++++++++++++++++++++++++++++++++++++++++----- vendor/tree-sitter | 2 +- 3 files changed, 92 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index da6e1a80..d0806bbb 100644 --- a/README.md +++ b/README.md @@ -52,7 +52,7 @@ tree.edit(InputEdit { old_end_position: Point::new(0, 8), new_end_position: 
Point::new(0, 14), }); -let new_tree = parser.parse_str(new_source_code, Some(tree)); +let new_tree = parser.parse_str(new_source_code, Some(&tree)); ``` ### Text Input diff --git a/src/lib.rs b/src/lib.rs index 5ef80f70..0ac1300e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -123,7 +123,7 @@ impl Parser { unsafe { ffi::ts_parser_set_logger(self.0, c_logger) }; } - pub fn parse_str(&mut self, input: &str, old_tree: Option) -> Option { + pub fn parse_str(&mut self, input: &str, old_tree: Option<&Tree>) -> Option { let mut input = FlatInput { bytes: input.as_bytes(), offset: 0}; self.parse_utf8(&mut input, old_tree) } @@ -131,7 +131,7 @@ impl Parser { pub fn parse_utf8( &mut self, input: &mut T, - old_tree: Option, + old_tree: Option<&Tree>, ) -> Option { unsafe extern "C" fn read( payload: *mut c_void, @@ -179,7 +179,7 @@ impl Parser { pub fn parse_utf16( &mut self, input: &mut T, - old_tree: Option, + old_tree: Option<&Tree>, ) -> Option { unsafe extern "C" fn read( payload: *mut c_void, @@ -266,7 +266,7 @@ impl Clone for Tree { } } -impl<'a> Node<'a> { +impl<'tree> Node<'tree> { fn new(node: ffi::TSNode) -> Option { if node.id.is_null() { None @@ -319,7 +319,7 @@ impl<'a> Node<'a> { } } - pub fn child(&self, i: u32) -> Option { + pub fn child(&self, i: u32) -> Option { Self::new(unsafe { ffi::ts_node_child(self.0, i) }) } @@ -327,7 +327,7 @@ impl<'a> Node<'a> { unsafe { ffi::ts_node_child_count(self.0) } } - pub fn named_child(&self, i: u32) -> Option { + pub fn named_child<'a>(&'a self, i: u32) -> Option { Self::new(unsafe { ffi::ts_node_named_child(self.0, i) }) } @@ -335,23 +335,23 @@ impl<'a> Node<'a> { unsafe { ffi::ts_node_named_child_count(self.0) } } - pub fn parent(&self) -> Option { + pub fn parent(&self) -> Option { Self::new(unsafe { ffi::ts_node_parent(self.0) }) } - pub fn next_sibling(&self) -> Option { + pub fn next_sibling(&self) -> Option { Self::new(unsafe { ffi::ts_node_next_sibling(self.0) }) } - pub fn prev_sibling(&self) -> Option { + pub fn 
prev_sibling(&self) -> Option { Self::new(unsafe { ffi::ts_node_prev_sibling(self.0) }) } - pub fn next_named_sibling(&self) -> Option { + pub fn next_named_sibling(&self) -> Option { Self::new(unsafe { ffi::ts_node_next_named_sibling(self.0) }) } - pub fn prev_named_sibling(&self) -> Option { + pub fn prev_named_sibling(&self) -> Option { Self::new(unsafe { ffi::ts_node_prev_named_sibling(self.0) }) } @@ -413,6 +413,12 @@ impl<'a> Drop for TreeCursor<'a> { } } +impl Point { + pub fn new(row: u32, column: u32) -> Self { + Point { row, column } + } +} + impl fmt::Display for Point { fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> { write!(f, "({}, {})", self.row, self.column) @@ -577,4 +583,77 @@ mod tests { assert_eq!(node1.child(0).unwrap(), node2.child(0).unwrap()); assert_ne!(node1.child(0).unwrap(), node2); } + + #[test] + fn test_editing() { + struct SpyInput { + bytes: &'static [u8], + offset: usize, + bytes_read: Vec, + } + + impl Utf8Input for SpyInput { + fn read(&mut self) -> &[u8] { + if self.offset < self.bytes.len() { + let result = &self.bytes[self.offset..self.offset + 1]; + self.bytes_read.extend(result.iter()); + self.offset += 1; + result + } else { + &[] + } + } + + fn seek(&mut self, byte: u32, _position: Point) { + self.offset = byte as usize; + } + } + + let mut input = SpyInput { + bytes: "fn test(a: A, c: C) {}".as_bytes(), + offset: 0, + bytes_read: Vec::new(), + }; + + let mut parser = Parser::new(); + parser.set_language(rust()).unwrap(); + + let mut tree = parser.parse_utf8(&mut input, None).unwrap(); + let parameters_sexp = tree.root_node() + .named_child(0).unwrap() + .named_child(1).unwrap() + .to_sexp(); + assert_eq!( + parameters_sexp, + "(parameters (parameter (identifier) (type_identifier)) (parameter (identifier) (type_identifier)))" + ); + + input.offset = 0; + input.bytes_read.clear(); + input.bytes = "fn test(a: A, b: B, c: C) {}".as_bytes(); + tree.edit(&InputEdit{ + start_byte: 14, + old_end_byte: 14, + 
new_end_byte: 20, + start_position: Point::new(0, 14), + old_end_position: Point::new(0, 14), + new_end_position: Point::new(0, 20), + }); + + let tree = parser.parse_utf8(&mut input, Some(&tree)).unwrap(); + let parameters_sexp = tree.root_node() + .named_child(0).unwrap() + .named_child(1).unwrap() + .to_sexp(); + assert_eq!( + parameters_sexp, + "(parameters (parameter (identifier) (type_identifier)) (parameter (identifier) (type_identifier)) (parameter (identifier) (type_identifier)))" + ); + + let retokenized_content = String::from_utf8(input.bytes_read).unwrap(); + assert!(retokenized_content.contains("b: B")); + assert!(!retokenized_content.contains("a: A")); + assert!(!retokenized_content.contains("c: C")); + assert!(!retokenized_content.contains("{}")); + } } diff --git a/vendor/tree-sitter b/vendor/tree-sitter index 9c1e82a7..78f28b14 160000 --- a/vendor/tree-sitter +++ b/vendor/tree-sitter @@ -1 +1 @@ -Subproject commit 9c1e82a7eac97767cee0469faa2722fd5753b065 +Subproject commit 78f28b14ce519ba085ab7886c2fc19739f7f7da0 From 45660e7b4e5db579905924717fa4da22f6a1d97d Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 18 May 2018 14:27:08 -0700 Subject: [PATCH 025/208] Make syntax trees implement Send --- src/lib.rs | 107 +++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 84 insertions(+), 23 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 0ac1300e..6084516c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -254,6 +254,14 @@ impl Tree { } } +unsafe impl Send for Tree {} + +impl fmt::Debug for Tree { + fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> { + write!(f, "{{Tree {:?}}}", self.root_node()) + } +} + impl Drop for Tree { fn drop(&mut self) { unsafe { ffi::ts_tree_delete(self.0) } @@ -448,6 +456,7 @@ impl<'a> Utf8Input for FlatInput<'a> { #[cfg(test)] mod tests { + use std::thread; use super::*; fn rust() -> Language { unsafe { tree_sitter_rust() } } @@ -586,29 +595,6 @@ mod tests { #[test] fn test_editing() { - 
struct SpyInput { - bytes: &'static [u8], - offset: usize, - bytes_read: Vec, - } - - impl Utf8Input for SpyInput { - fn read(&mut self) -> &[u8] { - if self.offset < self.bytes.len() { - let result = &self.bytes[self.offset..self.offset + 1]; - self.bytes_read.extend(result.iter()); - self.offset += 1; - result - } else { - &[] - } - } - - fn seek(&mut self, byte: u32, _position: Point) { - self.offset = byte as usize; - } - } - let mut input = SpyInput { bytes: "fn test(a: A, c: C) {}".as_bytes(), offset: 0, @@ -656,4 +642,79 @@ mod tests { assert!(!retokenized_content.contains("c: C")); assert!(!retokenized_content.contains("{}")); } + + #[test] + fn test_parallel_parsing() { + // Parse this source file so that each thread has a non-trivial amount of + // work to do. + let this_file_source = include_str!("lib.rs"); + + let mut parser = Parser::new(); + parser.set_language(rust()).unwrap(); + let tree = parser.parse_str(this_file_source, None).unwrap(); + + let mut parse_threads = Vec::new(); + for thread_id in 1..5 { + let mut tree_clone = tree.clone(); + parse_threads.push(thread::spawn(move || { + + // For each thread, prepend a different number of declarations to the + // source code. + let mut prepend_line_count = 0; + let mut prepended_source = String::new(); + for _ in 0..thread_id { + prepend_line_count += 2; + prepended_source += "struct X {}\n\n"; + } + + tree_clone.edit(&InputEdit{ + start_byte: 0, + old_end_byte: 0, + new_end_byte: prepended_source.len() as u32, + start_position: Point::new(0, 0), + old_end_position: Point::new(0, 0), + new_end_position: Point::new(prepend_line_count, 0), + }); + prepended_source += this_file_source; + + // Reparse using the old tree as a starting point. + let mut parser = Parser::new(); + parser.set_language(rust()).unwrap(); + parser.parse_str(&prepended_source, Some(&tree_clone)).unwrap() + })); + } + + // Check that the trees have the expected relationship to one another. 
+ let trees = parse_threads + .into_iter() + .map(|thread| thread.join().unwrap()); + let child_count_differences = trees + .map(|t| t.root_node().child_count() - tree.root_node().child_count()) + .collect::>(); + + assert_eq!(child_count_differences, &[1, 2, 3, 4]); + } + + struct SpyInput { + bytes: &'static [u8], + offset: usize, + bytes_read: Vec, + } + + impl Utf8Input for SpyInput { + fn read(&mut self) -> &[u8] { + if self.offset < self.bytes.len() { + let result = &self.bytes[self.offset..self.offset + 1]; + self.bytes_read.extend(result.iter()); + self.offset += 1; + result + } else { + &[] + } + } + + fn seek(&mut self, byte: u32, _position: Point) { + self.offset = byte as usize; + } + } } From 0034fce8093374bc5193727c96d45d98b9816a32 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 18 May 2018 15:05:31 -0700 Subject: [PATCH 026/208] Add some fields to the cargo manifest --- Cargo.toml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Cargo.toml b/Cargo.toml index 560d9a71..13c84759 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -5,6 +5,10 @@ version = "0.1.0" authors = ["Max Brunsfeld "] build = "build.rs" license = "MIT" +readme = "README.md" +keywords = ["incremental", "parsing"] +categories = ["parsing", "text editors", "api bindings"] + include = [ "/build.rs", "/Cargo.toml", From 16a7366ec75f5c03d497a12bb796d883bfd32466 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 18 May 2018 15:06:05 -0700 Subject: [PATCH 027/208] 0.1.1 --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 13c84759..12d92923 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "tree-sitter" description = "Rust bindings to the Tree-sitter parsing library" -version = "0.1.0" +version = "0.1.1" authors = ["Max Brunsfeld "] build = "build.rs" license = "MIT" From 5efc28f2f3741e9f3b1ff376be5de2890df80ed0 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Tue, 19 Jun 2018 16:19:37 -0700 
Subject: [PATCH 028/208] Update to latest tree-sitter API --- README.md | 66 ++++++----- build.rs | 1 - src/bindings.rs | 22 +++- src/lib.rs | 273 +++++++++++++++++++++------------------------ vendor/tree-sitter | 2 +- 5 files changed, 175 insertions(+), 189 deletions(-) diff --git a/README.md b/README.md index d0806bbb..ff7140c5 100644 --- a/README.md +++ b/README.md @@ -12,6 +12,10 @@ Rust bindings to the [Tree-sitter][] parsing library. First, create a parser: ```rust +use tree_sitter::{Parser, Language}; + +// ... + let parser = Parser::new(); ``` @@ -22,16 +26,17 @@ extern "C" fn tree_sitter_c() -> Language; extern "C" fn tree_sitter_rust() -> Language; extern "C" fn tree_sitter_javascript() -> Language; -parser.set_language(unsafe { tree_sitter_rust() }).unwrap(); +let language = unsafe { tree_sitter_rust() }; +parser.set_language(language).unwrap(); ``` Now you can parse source code: ```rust let source_code = "fn test() {}"; - let tree = parser.parse_str(source_code, None); let root_node = tree.root_node(); + assert_eq!(root_node.kind(), "source_file"); assert_eq!(root_node.start_position().column, 0); assert_eq!(root_node.end_position().column, 12); @@ -39,7 +44,7 @@ assert_eq!(root_node.end_position().column, 12); ### Editing -Once you have a syntax tree, you can update it when your source code changes: +Once you have a syntax tree, you can update it when your source code changes. 
Passing in the previous edited tree makes `parse` run much more quickly: ```rust let new_source_code = "fn test(a: u32) {}" @@ -52,49 +57,42 @@ tree.edit(InputEdit { old_end_position: Point::new(0, 8), new_end_position: Point::new(0, 14), }); + let new_tree = parser.parse_str(new_source_code, Some(&tree)); ``` ### Text Input - -The code can be provided either as a simple string or by any type that implements Tree-sitter's `Utf8Input` or `Utf16Input` traits: +The source code to parse can be provided either as a string or as a function that returns text encoded as either UTF8 or UTF16: ```rust -struct LineWiseInput { - lines: &'static [&'static str], - row: usize, - column: usize, -} +// Store some source code in an array of lines. +let lines = &[ + "pub fn foo() {", + " 1", + "}", +]; -impl tree_sitter::Utf8Input for LineWiseInput { - fn read(&mut self) -> &[u8] { - if self.row < self.lines.len() { - let result = &self.lines[self.row].as_bytes()[self.column..]; - self.row += 1; - self.column = 0; - result +// Parse the source code using a custom callback. The callback is called +// with both a byte offset and a row/column offset. +let tree = parser.parse_utf8(&mut |_byte: u32, position: Point| -> &[u8] { + let row = position.row as usize; + let column = position.column as usize; + if row < lines.len() { + if column < lines[row].as_bytes().len() { + &lines[row].as_bytes()[column..] 
} else { - &[] + "\n".as_bytes() } + } else { + &[] } +}, None).unwrap(); - fn seek(&mut self, _byte: u32, position: Point) { - self.row = position.row as usize; - self.column = position.column as usize; - } -} - -let mut input = LineBasedInput { - lines: &[ - "pub fn main() {", - "}", - ], - row: 0, - column: 0 -}; - -let tree = parser.parse_utf8(&mut input, None).unwrap(); +assert_eq!( + tree.root_node().to_sexp(), + "(source_file (function_item (visibility_modifier) (identifier) (parameters) (block (number_literal))))" +); ``` [tree-sitter]: https://github.com/tree-sitter/tree-sitter diff --git a/build.rs b/build.rs index 2843c758..7d9ee83e 100644 --- a/build.rs +++ b/build.rs @@ -22,7 +22,6 @@ fn main() { "node.c", "parser.c", "stack.c", - "string_input.c", "subtree.c", "tree_cursor.c", "tree.c", diff --git a/src/bindings.rs b/src/bindings.rs index 1ab49bde..b2d83729 100644 --- a/src/bindings.rs +++ b/src/bindings.rs @@ -41,15 +41,12 @@ pub struct TSRange { pub struct TSInput { pub payload: *mut ::std::os::raw::c_void, pub read: ::std::option::Option< - unsafe extern "C" fn(payload: *mut ::std::os::raw::c_void, bytes_read: *mut u32) - -> *const ::std::os::raw::c_char, - >, - pub seek: ::std::option::Option< unsafe extern "C" fn( payload: *mut ::std::os::raw::c_void, byte_index: u32, position: TSPoint, - ) -> ::std::os::raw::c_int, + bytes_read: *mut u32, + ) -> *const ::std::os::raw::c_char, >, pub encoding: TSInputEncoding, } @@ -127,6 +124,21 @@ extern "C" { arg4: u32, ) -> *mut TSTree; } +extern "C" { + pub fn ts_parser_enabled(arg1: *const TSParser) -> bool; +} +extern "C" { + pub fn ts_parser_set_enabled(arg1: *mut TSParser, arg2: bool); +} +extern "C" { + pub fn ts_parser_operation_limit(arg1: *const TSParser) -> usize; +} +extern "C" { + pub fn ts_parser_set_operation_limit(arg1: *mut TSParser, arg2: usize); +} +extern "C" { + pub fn ts_parser_reset(arg1: *mut TSParser); +} extern "C" { pub fn ts_tree_copy(arg1: *const TSTree) -> *mut TSTree; } diff 
--git a/src/lib.rs b/src/lib.rs index 6084516c..84d51f04 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -3,21 +3,11 @@ mod ffi; use std::fmt; use std::ffi::CStr; use std::marker::PhantomData; -use std::os::raw::{c_char, c_int, c_void}; +use std::os::raw::{c_char, c_void}; use std::ptr; pub type Language = *const ffi::TSLanguage; -pub trait Utf16Input { - fn read(&mut self) -> &[u16]; - fn seek(&mut self, u32, Point); -} - -pub trait Utf8Input { - fn read(&mut self) -> &[u8]; - fn seek(&mut self, u32, Point); -} - #[derive(Debug, PartialEq, Eq)] pub enum LogType { Parse, @@ -50,11 +40,6 @@ pub struct Tree(*mut ffi::TSTree); pub struct TreeCursor<'a>(ffi::TSTreeCursor, PhantomData<&'a ()>); -struct FlatInput<'a> { - bytes: &'a [u8], - offset: usize, -} - impl Parser { pub fn new() -> Parser { unsafe { @@ -124,105 +109,86 @@ impl Parser { } pub fn parse_str(&mut self, input: &str, old_tree: Option<&Tree>) -> Option { - let mut input = FlatInput { bytes: input.as_bytes(), offset: 0}; - self.parse_utf8(&mut input, old_tree) + let bytes = input.as_bytes(); + self.parse_utf8(&mut |offset, _| &bytes[(offset as usize)..], old_tree) } - pub fn parse_utf8( + pub fn parse_utf8<'a, T: 'a + FnMut(u32, Point) -> &'a [u8]>( &mut self, input: &mut T, old_tree: Option<&Tree>, ) -> Option { - unsafe extern "C" fn read( + unsafe extern "C" fn read<'a, T: 'a + FnMut(u32, Point) -> &'a [u8]>( payload: *mut c_void, + byte_offset: u32, + position: ffi::TSPoint, bytes_read: *mut u32, ) -> *const c_char { let input = (payload as *mut T).as_mut().unwrap(); - let result = input.read(); + let result = (*input)(byte_offset, position.into()); *bytes_read = result.len() as u32; return result.as_ptr() as *const c_char; }; - unsafe extern "C" fn seek( - payload: *mut c_void, - byte: u32, - position: ffi::TSPoint, - ) -> c_int { - let input = (payload as *mut T).as_mut().unwrap(); - input.seek( - byte, - Point { - row: position.row, - column: position.column, - }, - ); - return 1; - }; - let c_input = 
ffi::TSInput { payload: input as *mut T as *mut c_void, - read: Some(read::), - seek: Some(seek::), + read: Some(read::<'a, T>), encoding: ffi::TSInputEncoding_TSInputEncodingUTF8, }; - let old_tree_ptr = old_tree.map_or(ptr::null_mut(), |t| t.0); + let c_old_tree = old_tree.map_or(ptr::null_mut(), |t| t.0); - let new_tree_ptr = unsafe { ffi::ts_parser_parse(self.0, old_tree_ptr, c_input) }; - if new_tree_ptr.is_null() { + let c_new_tree = unsafe { ffi::ts_parser_parse(self.0, c_old_tree, c_input) }; + if c_new_tree.is_null() { None } else { - Some(Tree(new_tree_ptr)) + Some(Tree(c_new_tree)) } } - pub fn parse_utf16( + pub fn parse_utf16<'a, T: 'a + FnMut(u32, Point) -> &'a [u16]>( &mut self, input: &mut T, old_tree: Option<&Tree>, ) -> Option { - unsafe extern "C" fn read( + unsafe extern "C" fn read<'a, T: 'a + FnMut(u32, Point) -> &'a [u16]>( payload: *mut c_void, + byte_offset: u32, + position: ffi::TSPoint, bytes_read: *mut u32, ) -> *const c_char { let input = (payload as *mut T).as_mut().unwrap(); - let result = input.read(); + let result = (*input)(byte_offset, Point { + row: position.row, + column: position.column / 2, + }); *bytes_read = result.len() as u32 * 2; return result.as_ptr() as *const c_char; }; - unsafe extern "C" fn seek( - payload: *mut c_void, - byte: u32, - position: ffi::TSPoint, - ) -> c_int { - let input = (payload as *mut T).as_mut().unwrap(); - input.seek( - byte / 2, - Point { - row: position.row, - column: position.column / 2, - }, - ); - return 1; - }; - let c_input = ffi::TSInput { payload: input as *mut T as *mut c_void, - read: Some(read::), - seek: Some(seek::), - encoding: ffi::TSInputEncoding_TSInputEncodingUTF8, + read: Some(read::<'a, T>), + encoding: ffi::TSInputEncoding_TSInputEncodingUTF16, }; - let old_tree_ptr = old_tree.map_or(ptr::null_mut(), |t| t.0); + let c_old_tree = old_tree.map_or(ptr::null_mut(), |t| t.0); - let new_tree_ptr = unsafe { ffi::ts_parser_parse(self.0, old_tree_ptr, c_input) }; - if 
new_tree_ptr.is_null() { + let c_new_tree = unsafe { ffi::ts_parser_parse(self.0, c_old_tree, c_input) }; + if c_new_tree.is_null() { None } else { - Some(Tree(new_tree_ptr)) + Some(Tree(c_new_tree)) } } + + pub fn reset(&mut self) { + unsafe { ffi::ts_parser_reset(self.0) } + } + + pub fn set_operation_limit(&mut self, limit: usize) { + unsafe { ffi::ts_parser_set_operation_limit(self.0, limit) } + } } impl Drop for Parser { @@ -442,15 +408,12 @@ impl Into for Point { } } -impl<'a> Utf8Input for FlatInput<'a> { - fn read(&mut self) -> &[u8] { - let result = &self.bytes[self.offset..]; - self.offset = self.bytes.len(); - result - } - - fn seek(&mut self, offset: u32, _position: Point) { - self.offset = offset as usize; +impl From for Point { + fn from(point: ffi::TSPoint) -> Self { + Self { + row: point.row, + column: point.column, + } } } @@ -536,49 +499,70 @@ mod tests { #[test] fn test_custom_utf8_input() { - struct LineBasedInput { - lines: &'static [&'static str], - row: usize, - column: usize, - } - - impl Utf8Input for LineBasedInput { - fn read(&mut self) -> &[u8] { - if self.row < self.lines.len() { - let result = &self.lines[self.row].as_bytes()[self.column..]; - self.row += 1; - self.column = 0; - result - } else { - &[] - } - } - - fn seek(&mut self, _byte: u32, position: Point) { - self.row = position.row as usize; - self.column = position.column as usize; - } - } - let mut parser = Parser::new(); parser.set_language(rust()).unwrap(); - let mut input = LineBasedInput { - lines: &[ - "pub fn main() {", - "}", - ], - row: 0, - column: 0 - }; + let lines = &[ + "pub fn foo() {", + " 1", + "}", + ]; + + let tree = parser.parse_utf8(&mut |_, position| { + let row = position.row as usize; + let column = position.column as usize; + if row < lines.len() { + if column < lines[row].as_bytes().len() { + &lines[row].as_bytes()[column..] 
+ } else { + "\n".as_bytes() + } + } else { + &[] + } + }, None).unwrap(); - let tree = parser.parse_utf8(&mut input, None).unwrap(); let root = tree.root_node(); + assert_eq!(root.to_sexp(), "(source_file (function_item (visibility_modifier) (identifier) (parameters) (block (number_literal))))"); assert_eq!(root.kind(), "source_file"); assert_eq!(root.has_error(), false); + assert_eq!(root.child(0).unwrap().kind(), "function_item"); + } - let child = root.child(0).unwrap(); - assert_eq!(child.kind(), "function_item"); + #[test] + fn test_custom_utf16_input() { + let mut parser = Parser::new(); + parser.set_language(rust()).unwrap(); + + parser.set_logger(Some(Box::new(|t, message| { + println!("log: {:?} {}", t, message); + }))); + + let lines: Vec> = [ + "pub fn foo() {", + " 1", + "}" + ].iter().map(|s| s.encode_utf16().collect()).collect(); + + let tree = parser.parse_utf16(&mut |_, position| { + let row = position.row as usize; + let column = position.column as usize; + if row < lines.len() { + if column < lines[row].len() { + &lines[row][column..] 
+ } else { + &[10] + } + } else { + &[] + } + }, None).unwrap(); + + let root = tree.root_node(); + assert_eq!(root.to_sexp(), "(source_file (function_item (visibility_modifier) (identifier) (parameters) (block (number_literal))))"); + assert_eq!(root.kind(), "source_file"); + assert_eq!(root.has_error(), false); + assert_eq!(root.child(0).unwrap().kind(), "function_item"); } #[test] @@ -595,16 +579,23 @@ mod tests { #[test] fn test_editing() { - let mut input = SpyInput { - bytes: "fn test(a: A, c: C) {}".as_bytes(), - offset: 0, - bytes_read: Vec::new(), - }; - let mut parser = Parser::new(); parser.set_language(rust()).unwrap(); - let mut tree = parser.parse_utf8(&mut input, None).unwrap(); + let mut input_bytes = "fn test(a: A, c: C) {}".as_bytes(); + let mut input_bytes_read = Vec::new(); + + let mut tree = parser.parse_utf8(&mut |offset, _| { + let offset = offset as usize; + if offset < input_bytes.len() { + let result = &input_bytes[offset..offset + 1]; + input_bytes_read.extend(result.iter()); + result + } else { + &[] + } + }, None).unwrap(); + let parameters_sexp = tree.root_node() .named_child(0).unwrap() .named_child(1).unwrap() @@ -614,9 +605,8 @@ mod tests { "(parameters (parameter (identifier) (type_identifier)) (parameter (identifier) (type_identifier)))" ); - input.offset = 0; - input.bytes_read.clear(); - input.bytes = "fn test(a: A, b: B, c: C) {}".as_bytes(); + input_bytes_read.clear(); + input_bytes = "fn test(a: A, b: B, c: C) {}".as_bytes(); tree.edit(&InputEdit{ start_byte: 14, old_end_byte: 14, @@ -626,7 +616,17 @@ mod tests { new_end_position: Point::new(0, 20), }); - let tree = parser.parse_utf8(&mut input, Some(&tree)).unwrap(); + let tree = parser.parse_utf8(&mut |offset, _| { + let offset = offset as usize; + if offset < input_bytes.len() { + let result = &input_bytes[offset..offset + 1]; + input_bytes_read.extend(result.iter()); + result + } else { + &[] + } + }, Some(&tree)).unwrap(); + let parameters_sexp = tree.root_node() 
.named_child(0).unwrap() .named_child(1).unwrap() @@ -636,7 +636,7 @@ mod tests { "(parameters (parameter (identifier) (type_identifier)) (parameter (identifier) (type_identifier)) (parameter (identifier) (type_identifier)))" ); - let retokenized_content = String::from_utf8(input.bytes_read).unwrap(); + let retokenized_content = String::from_utf8(input_bytes_read).unwrap(); assert!(retokenized_content.contains("b: B")); assert!(!retokenized_content.contains("a: A")); assert!(!retokenized_content.contains("c: C")); @@ -694,27 +694,4 @@ mod tests { assert_eq!(child_count_differences, &[1, 2, 3, 4]); } - - struct SpyInput { - bytes: &'static [u8], - offset: usize, - bytes_read: Vec, - } - - impl Utf8Input for SpyInput { - fn read(&mut self) -> &[u8] { - if self.offset < self.bytes.len() { - let result = &self.bytes[self.offset..self.offset + 1]; - self.bytes_read.extend(result.iter()); - self.offset += 1; - result - } else { - &[] - } - } - - fn seek(&mut self, byte: u32, _position: Point) { - self.offset = byte as usize; - } - } } diff --git a/vendor/tree-sitter b/vendor/tree-sitter index 78f28b14..26ab57a6 160000 --- a/vendor/tree-sitter +++ b/vendor/tree-sitter @@ -1 +1 @@ -Subproject commit 78f28b14ce519ba085ab7886c2fc19739f7f7da0 +Subproject commit 26ab57a6562aaeb48b579e3ca29eb064925e857c From 86c8206e35757694d37d3fe627236d22a75eb3ec Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Tue, 19 Jun 2018 16:20:58 -0700 Subject: [PATCH 029/208] 0.2.0 --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 12d92923..bfc6b2e2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "tree-sitter" description = "Rust bindings to the Tree-sitter parsing library" -version = "0.1.1" +version = "0.2.0" authors = ["Max Brunsfeld "] build = "build.rs" license = "MIT" From 2eff3225bac3422b19fc442482eb45f0462fa478 Mon Sep 17 00:00:00 2001 From: Stephan Renatus Date: Thu, 28 Jun 2018 10:25:01 +0200 Subject: 
[PATCH 030/208] README.md: small fixes To call .set_language on parser, it needs to be mut; also, the syntax for the extern "C" blocks seemed to be a bit off. Both now corresponds to what's in the tests. Signed-off-by: Stephan Renatus --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index ff7140c5..449c6c46 100644 --- a/README.md +++ b/README.md @@ -16,15 +16,15 @@ use tree_sitter::{Parser, Language}; // ... -let parser = Parser::new(); +let mut parser = Parser::new(); ``` Then assign a language to the parser. Tree-sitter languages consist of generated C code. To use them from rust, you must declare them as `extern "C"` functions and invoke them with `unsafe`: ```rust -extern "C" fn tree_sitter_c() -> Language; -extern "C" fn tree_sitter_rust() -> Language; -extern "C" fn tree_sitter_javascript() -> Language; +extern "C" { fn tree_sitter_c() -> Language; } +extern "C" { fn tree_sitter_rust() -> Language; } +extern "C" { fn tree_sitter_javascript() -> Language; } let language = unsafe { tree_sitter_rust() }; parser.set_language(language).unwrap(); From c477e45fccf746fcb9335ba777ace035a6292a48 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 20 Jul 2018 13:32:22 -0700 Subject: [PATCH 031/208] Update to the latest Tree-sitter --- src/bindings.rs | 30 +++++++++++++++++++++++++----- src/lib.rs | 6 +++++- vendor/tree-sitter | 2 +- 3 files changed, 31 insertions(+), 7 deletions(-) diff --git a/src/bindings.rs b/src/bindings.rs index b2d83729..58d0e510 100644 --- a/src/bindings.rs +++ b/src/bindings.rs @@ -33,8 +33,10 @@ pub struct TSPoint { #[repr(C)] #[derive(Debug, Copy, Clone)] pub struct TSRange { - pub start: TSPoint, - pub end: TSPoint, + pub start_point: TSPoint, + pub end_point: TSPoint, + pub start_byte: u32, + pub end_byte: u32, } #[repr(C)] #[derive(Debug, Copy, Clone)] @@ -80,7 +82,7 @@ pub struct TSInputEdit { pub struct TSNode { pub context: [u32; 4usize], pub id: *const 
::std::os::raw::c_void, - pub tree: *const ::std::os::raw::c_void, + pub tree: *const TSTree, } #[repr(C)] #[derive(Debug, Copy, Clone)] @@ -139,6 +141,12 @@ extern "C" { extern "C" { pub fn ts_parser_reset(arg1: *mut TSParser); } +extern "C" { + pub fn ts_parser_set_included_ranges(arg1: *mut TSParser, arg2: *const TSRange, arg3: u32); +} +extern "C" { + pub fn ts_parser_included_ranges(arg1: *const TSParser, arg2: *mut u32) -> *const TSRange; +} extern "C" { pub fn ts_tree_copy(arg1: *const TSTree) -> *mut TSTree; } @@ -161,6 +169,9 @@ extern "C" { extern "C" { pub fn ts_tree_print_dot_graph(arg1: *const TSTree, arg2: *mut FILE); } +extern "C" { + pub fn ts_tree_language(arg1: *const TSTree) -> *const TSLanguage; +} extern "C" { pub fn ts_node_start_byte(arg1: TSNode) -> u32; } @@ -251,7 +262,10 @@ extern "C" { ) -> TSNode; } extern "C" { - pub fn ts_tree_cursor_new(arg1: *const TSTree) -> TSTreeCursor; + pub fn ts_node_edit(arg1: *mut TSNode, arg2: *const TSInputEdit); +} +extern "C" { + pub fn ts_tree_cursor_new(arg1: TSNode) -> TSTreeCursor; } extern "C" { pub fn ts_tree_cursor_delete(arg1: *mut TSTreeCursor); @@ -280,6 +294,12 @@ extern "C" { arg2: TSSymbol, ) -> *const ::std::os::raw::c_char; } +extern "C" { + pub fn ts_language_symbol_for_name( + arg1: *const TSLanguage, + arg2: *const ::std::os::raw::c_char, + ) -> TSSymbol; +} extern "C" { pub fn ts_language_symbol_type(arg1: *const TSLanguage, arg2: TSSymbol) -> TSSymbolType; } @@ -287,4 +307,4 @@ extern "C" { pub fn ts_language_version(arg1: *const TSLanguage) -> u32; } -pub const TREE_SITTER_LANGUAGE_VERSION: usize = 8; +pub const TREE_SITTER_LANGUAGE_VERSION: usize = 9; diff --git a/src/lib.rs b/src/lib.rs index 84d51f04..9f0ef9b9 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -216,7 +216,7 @@ impl Tree { } pub fn walk(&self) -> TreeCursor { - TreeCursor(unsafe { ffi::ts_tree_cursor_new(self.0) }, PhantomData) + self.root_node().walk() } } @@ -337,6 +337,10 @@ impl<'tree> Node<'tree> { unsafe { 
free(c_string as *mut c_void) }; result } + + pub fn walk(&self) -> TreeCursor<'tree> { + TreeCursor(unsafe { ffi::ts_tree_cursor_new(self.0) }, PhantomData) + } } impl<'a> PartialEq for Node<'a> { diff --git a/vendor/tree-sitter b/vendor/tree-sitter index 26ab57a6..16376c43 160000 --- a/vendor/tree-sitter +++ b/vendor/tree-sitter @@ -1 +1 @@ -Subproject commit 26ab57a6562aaeb48b579e3ca29eb064925e857c +Subproject commit 16376c43f5cc75bbc5297e6d5716bd94d55ccc05 From 47a7430da319b8e2a55cdb8998acc3f3f099a1c7 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 20 Jul 2018 13:32:56 -0700 Subject: [PATCH 032/208] 0.3.0 --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index bfc6b2e2..746d2d47 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "tree-sitter" description = "Rust bindings to the Tree-sitter parsing library" -version = "0.2.0" +version = "0.3.0" authors = ["Max Brunsfeld "] build = "build.rs" license = "MIT" From bdd52376a82ae2354b6226d9bb3b23649b81df4d Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 20 Jul 2018 13:36:12 -0700 Subject: [PATCH 033/208] Fix cargo category slugs --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 746d2d47..c2d733f2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -7,7 +7,7 @@ build = "build.rs" license = "MIT" readme = "README.md" keywords = ["incremental", "parsing"] -categories = ["parsing", "text editors", "api bindings"] +categories = ["api-bindings", "parsing", "text-editors"] include = [ "/build.rs", From 5fbb261316737117c827db935e667bcfd3932348 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 20 Jul 2018 13:36:42 -0700 Subject: [PATCH 034/208] 0.3.1 --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index c2d733f2..9adbcfd1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "tree-sitter" 
description = "Rust bindings to the Tree-sitter parsing library" -version = "0.3.0" +version = "0.3.1" authors = ["Max Brunsfeld "] build = "build.rs" license = "MIT" From c8125ec617ec4a3e2d93c460bcc22c89f1c06981 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Mon, 8 Oct 2018 11:32:40 -0700 Subject: [PATCH 035/208] Make Language send + sync, add language methods --- src/lib.rs | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 9f0ef9b9..434d05fb 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -6,7 +6,8 @@ use std::marker::PhantomData; use std::os::raw::{c_char, c_void}; use std::ptr; -pub type Language = *const ffi::TSLanguage; +#[repr(transparent)] +pub struct Language (*const ffi::TSLanguage); #[derive(Debug, PartialEq, Eq)] pub enum LogType { @@ -50,9 +51,9 @@ impl Parser { pub fn set_language(&mut self, language: Language) -> Result<(), String> { unsafe { - let version = ffi::ts_language_version(language) as usize; + let version = ffi::ts_language_version(language.0) as usize; if version == ffi::TREE_SITTER_LANGUAGE_VERSION { - ffi::ts_parser_set_language(self.0, language); + ffi::ts_parser_set_language(self.0, language.0); Ok(()) } else { Err(format!( @@ -222,6 +223,24 @@ impl Tree { unsafe impl Send for Tree {} +impl Language { + pub fn node_kind_count(&self) -> usize { + unsafe { ffi::ts_language_symbol_count(self.0) as usize } + } + + pub fn node_kind_for_id(&self, id: u16) -> &'static str { + unsafe { CStr::from_ptr(ffi::ts_language_symbol_name(self.0, id)) }.to_str().unwrap() + } + + pub fn node_kind_is_named(&self, id: u16) -> bool { + unsafe { ffi::ts_language_symbol_type(self.0, id) == ffi::TSSymbolType_TSSymbolTypeRegular } + } +} + +unsafe impl Send for Language {} + +unsafe impl Sync for Language {} + impl fmt::Debug for Tree { fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> { write!(f, "{{Tree {:?}}}", self.root_node()) @@ -527,7 +546,7 @@ mod tests { }, 
None).unwrap(); let root = tree.root_node(); - assert_eq!(root.to_sexp(), "(source_file (function_item (visibility_modifier) (identifier) (parameters) (block (number_literal))))"); + assert_eq!(root.to_sexp(), "(source_file (function_item (visibility_modifier) (identifier) (parameters) (block (integer_literal))))"); assert_eq!(root.kind(), "source_file"); assert_eq!(root.has_error(), false); assert_eq!(root.child(0).unwrap().kind(), "function_item"); @@ -563,7 +582,7 @@ mod tests { }, None).unwrap(); let root = tree.root_node(); - assert_eq!(root.to_sexp(), "(source_file (function_item (visibility_modifier) (identifier) (parameters) (block (number_literal))))"); + assert_eq!(root.to_sexp(), "(source_file (function_item (visibility_modifier) (identifier) (parameters) (block (integer_literal))))"); assert_eq!(root.kind(), "source_file"); assert_eq!(root.has_error(), false); assert_eq!(root.child(0).unwrap().kind(), "function_item"); From 0c2e1c189b2c4f696a1a1b48ee1ad04c7ef49936 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Mon, 8 Oct 2018 22:32:58 -0700 Subject: [PATCH 036/208] Implement Clone for Language --- src/lib.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/lib.rs b/src/lib.rs index 434d05fb..81b4d09a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -6,6 +6,7 @@ use std::marker::PhantomData; use std::os::raw::{c_char, c_void}; use std::ptr; +#[derive(Clone, Copy)] #[repr(transparent)] pub struct Language (*const ffi::TSLanguage); From 572e8c202e36c98e875a67f2edadbbad341602cf Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Mon, 8 Oct 2018 22:33:11 -0700 Subject: [PATCH 037/208] Implement Send for Parser --- src/lib.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/lib.rs b/src/lib.rs index 81b4d09a..c547974b 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -200,6 +200,8 @@ impl Drop for Parser { } } +unsafe impl Send for Parser {} + impl Tree { pub fn root_node(&self) -> Node { Node::new(unsafe { ffi::ts_tree_root_node(self.0) }).unwrap() From 
91d35dec7d4ddf60054efbbc6631489af74c09f0 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Mon, 8 Oct 2018 22:33:43 -0700 Subject: [PATCH 038/208] Add Parser.parser_utf8_io() method --- src/lib.rs | 159 +++++++++++++++++++++++++++++++++++------------------ 1 file changed, 107 insertions(+), 52 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index c547974b..ff272a29 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -5,6 +5,7 @@ use std::ffi::CStr; use std::marker::PhantomData; use std::os::raw::{c_char, c_void}; use std::ptr; +use std::io::{self, Read, Seek}; #[derive(Clone, Copy)] #[repr(transparent)] @@ -115,37 +116,15 @@ impl Parser { self.parse_utf8(&mut |offset, _| &bytes[(offset as usize)..], old_tree) } - pub fn parse_utf8<'a, T: 'a + FnMut(u32, Point) -> &'a [u8]>( + pub fn parse_utf8<'a, T: FnMut(u32, Point) -> &'a [u8]>( &mut self, input: &mut T, old_tree: Option<&Tree>, ) -> Option { - unsafe extern "C" fn read<'a, T: 'a + FnMut(u32, Point) -> &'a [u8]>( - payload: *mut c_void, - byte_offset: u32, - position: ffi::TSPoint, - bytes_read: *mut u32, - ) -> *const c_char { - let input = (payload as *mut T).as_mut().unwrap(); - let result = (*input)(byte_offset, position.into()); - *bytes_read = result.len() as u32; - return result.as_ptr() as *const c_char; - }; - - let c_input = ffi::TSInput { - payload: input as *mut T as *mut c_void, - read: Some(read::<'a, T>), - encoding: ffi::TSInputEncoding_TSInputEncodingUTF8, - }; - - let c_old_tree = old_tree.map_or(ptr::null_mut(), |t| t.0); - - let c_new_tree = unsafe { ffi::ts_parser_parse(self.0, c_old_tree, c_input) }; - if c_new_tree.is_null() { - None - } else { - Some(Tree(c_new_tree)) - } + self.parse_utf8_ptr(&mut |byte, position| { + let slice = input(byte, position); + (slice.as_ptr(), slice.len()) + }, old_tree) } pub fn parse_utf16<'a, T: 'a + FnMut(u32, Point) -> &'a [u16]>( @@ -153,34 +132,43 @@ impl Parser { input: &mut T, old_tree: Option<&Tree>, ) -> Option { - unsafe extern "C" fn read<'a, T: 'a + 
FnMut(u32, Point) -> &'a [u16]>( - payload: *mut c_void, - byte_offset: u32, - position: ffi::TSPoint, - bytes_read: *mut u32, - ) -> *const c_char { - let input = (payload as *mut T).as_mut().unwrap(); - let result = (*input)(byte_offset, Point { - row: position.row, - column: position.column / 2, - }); - *bytes_read = result.len() as u32 * 2; - return result.as_ptr() as *const c_char; - }; + self.parse_utf16_ptr(&mut |byte, position| { + let slice = input(byte, position); + (slice.as_ptr(), slice.len()) + }, old_tree) + } - let c_input = ffi::TSInput { - payload: input as *mut T as *mut c_void, - read: Some(read::<'a, T>), - encoding: ffi::TSInputEncoding_TSInputEncodingUTF16, - }; + pub fn parse_utf8_io( + &mut self, + mut input: impl Read + Seek, + old_tree: Option<&Tree>, + ) -> io::Result> { + let mut error = None; + let mut current_offset = 0; + let mut buffer = [0; 10 * 1024]; + let result = self.parse_utf8_ptr(&mut |byte, _| { + if byte as u64 != current_offset { + current_offset = byte as u64; + if let Err(e) = input.seek(io::SeekFrom::Start(current_offset)) { + error = Some(e); + return (ptr::null(), 0) + } + } - let c_old_tree = old_tree.map_or(ptr::null_mut(), |t| t.0); + match input.read(&mut buffer) { + Err(e) => { + error = Some(e); + (ptr::null(), 0) + }, + Ok(length) => { + (buffer.as_ptr(), length) + } + } + }, old_tree); - let c_new_tree = unsafe { ffi::ts_parser_parse(self.0, c_old_tree, c_input) }; - if c_new_tree.is_null() { - None - } else { - Some(Tree(c_new_tree)) + match error { + Some(e) => Err(e), + None => Ok(result) } } @@ -191,6 +179,73 @@ impl Parser { pub fn set_operation_limit(&mut self, limit: usize) { unsafe { ffi::ts_parser_set_operation_limit(self.0, limit) } } + + fn parse_utf8_ptr (*const u8, usize)>( + &mut self, + input: &mut T, + old_tree: Option<&Tree>, + ) -> Option { + unsafe extern "C" fn read (*const u8, usize)> ( + payload: *mut c_void, + byte_offset: u32, + position: ffi::TSPoint, + bytes_read: *mut u32, + ) -> 
*const c_char { + let input = (payload as *mut T).as_mut().unwrap(); + let (ptr, length) = (*input)(byte_offset, position.into()); + *bytes_read = length as u32; + return ptr as *const c_char; + }; + + let c_input = ffi::TSInput { + payload: input as *mut T as *mut c_void, + read: Some(read::), + encoding: ffi::TSInputEncoding_TSInputEncodingUTF8, + }; + + let c_old_tree = old_tree.map_or(ptr::null_mut(), |t| t.0); + let c_new_tree = unsafe { ffi::ts_parser_parse(self.0, c_old_tree, c_input) }; + if c_new_tree.is_null() { + None + } else { + Some(Tree(c_new_tree)) + } + } + + fn parse_utf16_ptr (*const u16, usize)>( + &mut self, + input: &mut T, + old_tree: Option<&Tree>, + ) -> Option { + unsafe extern "C" fn read (*const u16, usize)>( + payload: *mut c_void, + byte_offset: u32, + position: ffi::TSPoint, + bytes_read: *mut u32, + ) -> *const c_char { + let input = (payload as *mut T).as_mut().unwrap(); + let (ptr, length) = (*input)(byte_offset, Point { + row: position.row, + column: position.column / 2, + }); + *bytes_read = length as u32 * 2; + ptr as *const c_char + }; + + let c_input = ffi::TSInput { + payload: input as *mut T as *mut c_void, + read: Some(read::), + encoding: ffi::TSInputEncoding_TSInputEncodingUTF16, + }; + + let c_old_tree = old_tree.map_or(ptr::null_mut(), |t| t.0); + let c_new_tree = unsafe { ffi::ts_parser_parse(self.0, c_old_tree, c_input) }; + if c_new_tree.is_null() { + None + } else { + Some(Tree(c_new_tree)) + } + } } impl Drop for Parser { From a8cbde6dbfbc8ae9b7b37075ad0dffeed3e079b8 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Tue, 9 Oct 2018 08:23:02 -0700 Subject: [PATCH 039/208] Run rustfmt on lib.rs --- src/lib.rs | 336 ++++++++++++++++++++++++++++++++--------------------- 1 file changed, 204 insertions(+), 132 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index ff272a29..4a132a3f 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,15 +1,15 @@ mod ffi; -use std::fmt; use std::ffi::CStr; +use std::fmt; +use std::io::{self, 
Read, Seek}; use std::marker::PhantomData; use std::os::raw::{c_char, c_void}; use std::ptr; -use std::io::{self, Read, Seek}; #[derive(Clone, Copy)] #[repr(transparent)] -pub struct Language (*const ffi::TSLanguage); +pub struct Language(*const ffi::TSLanguage); #[derive(Debug, PartialEq, Eq)] pub enum LogType { @@ -43,6 +43,26 @@ pub struct Tree(*mut ffi::TSTree); pub struct TreeCursor<'a>(ffi::TSTreeCursor, PhantomData<&'a ()>); +impl Language { + pub fn node_kind_count(&self) -> usize { + unsafe { ffi::ts_language_symbol_count(self.0) as usize } + } + + pub fn node_kind_for_id(&self, id: u16) -> &'static str { + unsafe { CStr::from_ptr(ffi::ts_language_symbol_name(self.0, id)) } + .to_str() + .unwrap() + } + + pub fn node_kind_is_named(&self, id: u16) -> bool { + unsafe { ffi::ts_language_symbol_type(self.0, id) == ffi::TSSymbolType_TSSymbolTypeRegular } + } +} + +unsafe impl Send for Language {} + +unsafe impl Sync for Language {} + impl Parser { pub fn new() -> Parser { unsafe { @@ -105,7 +125,10 @@ impl Parser { log: Some(log), }; } else { - c_logger = ffi::TSLogger { payload: ptr::null_mut(), log: None }; + c_logger = ffi::TSLogger { + payload: ptr::null_mut(), + log: None, + }; } unsafe { ffi::ts_parser_set_logger(self.0, c_logger) }; @@ -121,10 +144,13 @@ impl Parser { input: &mut T, old_tree: Option<&Tree>, ) -> Option { - self.parse_utf8_ptr(&mut |byte, position| { - let slice = input(byte, position); - (slice.as_ptr(), slice.len()) - }, old_tree) + self.parse_utf8_ptr( + &mut |byte, position| { + let slice = input(byte, position); + (slice.as_ptr(), slice.len()) + }, + old_tree, + ) } pub fn parse_utf16<'a, T: 'a + FnMut(u32, Point) -> &'a [u16]>( @@ -132,10 +158,13 @@ impl Parser { input: &mut T, old_tree: Option<&Tree>, ) -> Option { - self.parse_utf16_ptr(&mut |byte, position| { - let slice = input(byte, position); - (slice.as_ptr(), slice.len()) - }, old_tree) + self.parse_utf16_ptr( + &mut |byte, position| { + let slice = input(byte, position); + 
(slice.as_ptr(), slice.len()) + }, + old_tree, + ) } pub fn parse_utf8_io( @@ -146,29 +175,30 @@ impl Parser { let mut error = None; let mut current_offset = 0; let mut buffer = [0; 10 * 1024]; - let result = self.parse_utf8_ptr(&mut |byte, _| { - if byte as u64 != current_offset { - current_offset = byte as u64; - if let Err(e) = input.seek(io::SeekFrom::Start(current_offset)) { - error = Some(e); - return (ptr::null(), 0) + let result = self.parse_utf8_ptr( + &mut |byte, _| { + if byte as u64 != current_offset { + current_offset = byte as u64; + if let Err(e) = input.seek(io::SeekFrom::Start(current_offset)) { + error = Some(e); + return (ptr::null(), 0); + } } - } - match input.read(&mut buffer) { - Err(e) => { - error = Some(e); - (ptr::null(), 0) - }, - Ok(length) => { - (buffer.as_ptr(), length) + match input.read(&mut buffer) { + Err(e) => { + error = Some(e); + (ptr::null(), 0) + } + Ok(length) => (buffer.as_ptr(), length), } - } - }, old_tree); + }, + old_tree, + ); match error { Some(e) => Err(e), - None => Ok(result) + None => Ok(result), } } @@ -185,7 +215,7 @@ impl Parser { input: &mut T, old_tree: Option<&Tree>, ) -> Option { - unsafe extern "C" fn read (*const u8, usize)> ( + unsafe extern "C" fn read (*const u8, usize)>( payload: *mut c_void, byte_offset: u32, position: ffi::TSPoint, @@ -224,10 +254,13 @@ impl Parser { bytes_read: *mut u32, ) -> *const c_char { let input = (payload as *mut T).as_mut().unwrap(); - let (ptr, length) = (*input)(byte_offset, Point { - row: position.row, - column: position.column / 2, - }); + let (ptr, length) = (*input)( + byte_offset, + Point { + row: position.row, + column: position.column / 2, + }, + ); *bytes_read = length as u32 * 2; ptr as *const c_char }; @@ -281,24 +314,6 @@ impl Tree { unsafe impl Send for Tree {} -impl Language { - pub fn node_kind_count(&self) -> usize { - unsafe { ffi::ts_language_symbol_count(self.0) as usize } - } - - pub fn node_kind_for_id(&self, id: u16) -> &'static str { - unsafe { 
CStr::from_ptr(ffi::ts_language_symbol_name(self.0, id)) }.to_str().unwrap() - } - - pub fn node_kind_is_named(&self, id: u16) -> bool { - unsafe { ffi::ts_language_symbol_type(self.0, id) == ffi::TSSymbolType_TSSymbolTypeRegular } - } -} - -unsafe impl Send for Language {} - -unsafe impl Sync for Language {} - impl fmt::Debug for Tree { fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> { write!(f, "{{Tree {:?}}}", self.root_node()) @@ -331,7 +346,9 @@ impl<'tree> Node<'tree> { } pub fn kind(&self) -> &'static str { - unsafe { CStr::from_ptr(ffi::ts_node_type(self.0)) }.to_str().unwrap() + unsafe { CStr::from_ptr(ffi::ts_node_type(self.0)) } + .to_str() + .unwrap() } pub fn is_named(&self) -> bool { @@ -407,10 +424,15 @@ impl<'tree> Node<'tree> { } pub fn to_sexp(&self) -> String { - extern "C" { fn free(pointer: *mut c_void); } + extern "C" { + fn free(pointer: *mut c_void); + } let c_string = unsafe { ffi::ts_node_string(self.0) }; - let result = unsafe { CStr::from_ptr(c_string) }.to_str().unwrap().to_string(); + let result = unsafe { CStr::from_ptr(c_string) } + .to_str() + .unwrap() + .to_string(); unsafe { free(c_string as *mut c_void) }; result } @@ -428,7 +450,13 @@ impl<'a> PartialEq for Node<'a> { impl<'a> fmt::Debug for Node<'a> { fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> { - write!(f, "{{Node {} {} - {}}}", self.kind(), self.start_position(), self.end_position()) + write!( + f, + "{{Node {} {} - {}}}", + self.kind(), + self.start_position(), + self.end_position() + ) } } @@ -500,21 +528,30 @@ impl From for Point { #[cfg(test)] mod tests { - use std::thread; use super::*; + use std::thread; - fn rust() -> Language { unsafe { tree_sitter_rust() } } - extern "C" { fn tree_sitter_rust() -> Language; } + fn rust() -> Language { + unsafe { tree_sitter_rust() } + } + extern "C" { + fn tree_sitter_rust() -> Language; + } #[test] fn test_basic_parsing() { let mut parser = Parser::new(); parser.set_language(rust()).unwrap(); - 
let tree = parser.parse_str(" + let tree = parser + .parse_str( + " struct Stuff {} fn main() {} - ", None).unwrap(); + ", + None, + ) + .unwrap(); let root_node = tree.root_node(); assert_eq!(root_node.kind(), "source_file"); @@ -538,12 +575,20 @@ mod tests { messages.push((log_type, message.to_string())); }))); - parser.parse_str(" + parser + .parse_str( + " struct Stuff {} fn main() {} - ", None).unwrap(); + ", + None, + ) + .unwrap(); - assert!(messages.contains(&(LogType::Parse, "reduce sym:struct_item, child_count:3".to_string()))); + assert!(messages.contains(&( + LogType::Parse, + "reduce sym:struct_item, child_count:3".to_string() + ))); assert!(messages.contains(&(LogType::Lex, "skip character:' '".to_string()))); } @@ -552,12 +597,17 @@ mod tests { let mut parser = Parser::new(); parser.set_language(rust()).unwrap(); - let tree = parser.parse_str(" + let tree = parser + .parse_str( + " struct Stuff { a: A; b: Option, } - ", None).unwrap(); + ", + None, + ) + .unwrap(); let mut cursor = tree.walk(); assert_eq!(cursor.node().kind(), "source_file"); @@ -583,25 +633,26 @@ mod tests { let mut parser = Parser::new(); parser.set_language(rust()).unwrap(); - let lines = &[ - "pub fn foo() {", - " 1", - "}", - ]; + let lines = &["pub fn foo() {", " 1", "}"]; - let tree = parser.parse_utf8(&mut |_, position| { - let row = position.row as usize; - let column = position.column as usize; - if row < lines.len() { - if column < lines[row].as_bytes().len() { - &lines[row].as_bytes()[column..] - } else { - "\n".as_bytes() - } - } else { - &[] - } - }, None).unwrap(); + let tree = parser + .parse_utf8( + &mut |_, position| { + let row = position.row as usize; + let column = position.column as usize; + if row < lines.len() { + if column < lines[row].as_bytes().len() { + &lines[row].as_bytes()[column..] 
+ } else { + "\n".as_bytes() + } + } else { + &[] + } + }, + None, + ) + .unwrap(); let root = tree.root_node(); assert_eq!(root.to_sexp(), "(source_file (function_item (visibility_modifier) (identifier) (parameters) (block (integer_literal))))"); @@ -619,25 +670,29 @@ mod tests { println!("log: {:?} {}", t, message); }))); - let lines: Vec> = [ - "pub fn foo() {", - " 1", - "}" - ].iter().map(|s| s.encode_utf16().collect()).collect(); + let lines: Vec> = ["pub fn foo() {", " 1", "}"] + .iter() + .map(|s| s.encode_utf16().collect()) + .collect(); - let tree = parser.parse_utf16(&mut |_, position| { - let row = position.row as usize; - let column = position.column as usize; - if row < lines.len() { - if column < lines[row].len() { - &lines[row][column..] - } else { - &[10] - } - } else { - &[] - } - }, None).unwrap(); + let tree = parser + .parse_utf16( + &mut |_, position| { + let row = position.row as usize; + let column = position.column as usize; + if row < lines.len() { + if column < lines[row].len() { + &lines[row][column..] 
+ } else { + &[10] + } + } else { + &[] + } + }, + None, + ) + .unwrap(); let root = tree.root_node(); assert_eq!(root.to_sexp(), "(source_file (function_item (visibility_modifier) (identifier) (parameters) (block (integer_literal))))"); @@ -666,20 +721,28 @@ mod tests { let mut input_bytes = "fn test(a: A, c: C) {}".as_bytes(); let mut input_bytes_read = Vec::new(); - let mut tree = parser.parse_utf8(&mut |offset, _| { - let offset = offset as usize; - if offset < input_bytes.len() { - let result = &input_bytes[offset..offset + 1]; - input_bytes_read.extend(result.iter()); - result - } else { - &[] - } - }, None).unwrap(); + let mut tree = parser + .parse_utf8( + &mut |offset, _| { + let offset = offset as usize; + if offset < input_bytes.len() { + let result = &input_bytes[offset..offset + 1]; + input_bytes_read.extend(result.iter()); + result + } else { + &[] + } + }, + None, + ) + .unwrap(); - let parameters_sexp = tree.root_node() - .named_child(0).unwrap() - .named_child(1).unwrap() + let parameters_sexp = tree + .root_node() + .named_child(0) + .unwrap() + .named_child(1) + .unwrap() .to_sexp(); assert_eq!( parameters_sexp, @@ -688,7 +751,7 @@ mod tests { input_bytes_read.clear(); input_bytes = "fn test(a: A, b: B, c: C) {}".as_bytes(); - tree.edit(&InputEdit{ + tree.edit(&InputEdit { start_byte: 14, old_end_byte: 14, new_end_byte: 20, @@ -697,20 +760,28 @@ mod tests { new_end_position: Point::new(0, 20), }); - let tree = parser.parse_utf8(&mut |offset, _| { - let offset = offset as usize; - if offset < input_bytes.len() { - let result = &input_bytes[offset..offset + 1]; - input_bytes_read.extend(result.iter()); - result - } else { - &[] - } - }, Some(&tree)).unwrap(); + let tree = parser + .parse_utf8( + &mut |offset, _| { + let offset = offset as usize; + if offset < input_bytes.len() { + let result = &input_bytes[offset..offset + 1]; + input_bytes_read.extend(result.iter()); + result + } else { + &[] + } + }, + Some(&tree), + ) + .unwrap(); - let 
parameters_sexp = tree.root_node() - .named_child(0).unwrap() - .named_child(1).unwrap() + let parameters_sexp = tree + .root_node() + .named_child(0) + .unwrap() + .named_child(1) + .unwrap() .to_sexp(); assert_eq!( parameters_sexp, @@ -738,7 +809,6 @@ mod tests { for thread_id in 1..5 { let mut tree_clone = tree.clone(); parse_threads.push(thread::spawn(move || { - // For each thread, prepend a different number of declarations to the // source code. let mut prepend_line_count = 0; @@ -748,7 +818,7 @@ mod tests { prepended_source += "struct X {}\n\n"; } - tree_clone.edit(&InputEdit{ + tree_clone.edit(&InputEdit { start_byte: 0, old_end_byte: 0, new_end_byte: prepended_source.len() as u32, @@ -761,7 +831,9 @@ mod tests { // Reparse using the old tree as a starting point. let mut parser = Parser::new(); parser.set_language(rust()).unwrap(); - parser.parse_str(&prepended_source, Some(&tree_clone)).unwrap() + parser + .parse_str(&prepended_source, Some(&tree_clone)) + .unwrap() })); } From db360b73fb33d5c03a226b42b1bfa60398645873 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Sat, 13 Oct 2018 14:09:36 -0700 Subject: [PATCH 040/208] Add Tree.walk_with_properties --- Cargo.toml | 5 + src/lib.rs | 294 +++++++++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 292 insertions(+), 7 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 9adbcfd1..485d369e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -20,5 +20,10 @@ include = [ "/vendor/tree-sitter/src/runtime/*", ] +[dependencies] +serde = "1.0" +serde_json = "1.0" +serde_derive = "1.0" + [build-dependencies] cc = "1.0" diff --git a/src/lib.rs b/src/lib.rs index 4a132a3f..19b9a670 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,11 +1,17 @@ mod ffi; +#[macro_use] +extern crate serde_derive; +extern crate serde_json; + +use std::collections::HashMap; use std::ffi::CStr; use std::fmt; use std::io::{self, Read, Seek}; use std::marker::PhantomData; use std::os::raw::{c_char, c_void}; use std::ptr; +use std::str; 
#[derive(Clone, Copy)] #[repr(transparent)] @@ -19,7 +25,7 @@ pub enum LogType { type Logger<'a> = Box; -#[derive(Clone, Copy, Debug, PartialEq, Eq)] +#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)] pub struct Point { pub row: u32, pub column: u32, @@ -35,6 +41,22 @@ pub struct InputEdit { pub new_end_position: Point, } +struct PropertyTransition { + state_id: u32, + child_index: Option, +} + +struct PropertyState { + transitions: HashMap>, + property_set_id: u32, + default_next_state_id: u32, +} + +pub struct PropertySheet { + states: Vec, + property_sets: Vec>, +} + pub struct Node<'a>(ffi::TSNode, PhantomData<&'a ()>); pub struct Parser(*mut ffi::TSParser); @@ -43,6 +65,13 @@ pub struct Tree(*mut ffi::TSTree); pub struct TreeCursor<'a>(ffi::TSTreeCursor, PhantomData<&'a ()>); +pub struct TreePropertyCursor<'a> { + cursor: TreeCursor<'a>, + state_stack: Vec, + child_index_stack: Vec, + property_sheet: &'a PropertySheet, +} + impl Language { pub fn node_kind_count(&self) -> usize { unsafe { ffi::ts_language_symbol_count(self.0) as usize } @@ -310,6 +339,13 @@ impl Tree { pub fn walk(&self) -> TreeCursor { self.root_node().walk() } + + pub fn walk_with_properties<'a>( + &'a self, + property_sheet: &'a PropertySheet, + ) -> TreePropertyCursor<'a> { + TreePropertyCursor::new(self, property_sheet) + } } unsafe impl Send for Tree {} @@ -437,6 +473,14 @@ impl<'tree> Node<'tree> { result } + pub fn utf8_text<'a>(&self, source: &'a str) -> Result<&'a str, str::Utf8Error> { + str::from_utf8(&source.as_bytes()[self.start_byte() as usize..self.end_byte() as usize]) + } + + pub fn utf16_text<'a>(&self, source: &'a [u16]) -> &'a [u16] { + &source[self.start_byte() as usize..self.end_byte() as usize] + } + pub fn walk(&self) -> TreeCursor<'tree> { TreeCursor(unsafe { ffi::ts_tree_cursor_new(self.0) }, PhantomData) } @@ -461,7 +505,7 @@ impl<'a> fmt::Debug for Node<'a> { } impl<'a> TreeCursor<'a> { - pub fn node(&'a self) -> Node<'a> { + pub fn node(&self) -> 
Node<'a> { Node( unsafe { ffi::ts_tree_cursor_current_node(&self.0) }, PhantomData, @@ -496,6 +540,87 @@ impl<'a> Drop for TreeCursor<'a> { } } +impl<'a> TreePropertyCursor<'a> { + fn new(tree: &'a Tree, property_sheet: &'a PropertySheet) -> Self { + Self { + cursor: tree.root_node().walk(), + child_index_stack: vec![0], + state_stack: vec![0], + property_sheet, + } + } + + pub fn node(&self) -> Node<'a> { + self.cursor.node() + } + + pub fn node_properties(&self) -> &'a HashMap { + &self.property_sheet.property_sets[self.current_state().property_set_id as usize] + } + + pub fn goto_first_child(&mut self) -> bool { + if self.cursor.goto_first_child() { + let child_index = 0; + let next_state_id = { + let state = &self.current_state(); + let kind_id = self.cursor.node().kind_id(); + self.next_state(state, kind_id, child_index) + }; + self.state_stack.push(next_state_id); + self.child_index_stack.push(child_index); + true + } else { + false + } + } + + pub fn goto_next_sibling(&mut self) -> bool { + if self.cursor.goto_next_sibling() { + let child_index = self.child_index_stack.pop().unwrap() + 1; + self.state_stack.pop(); + let next_state_id = { + let state = &self.current_state(); + let kind_id = self.cursor.node().kind_id(); + self.next_state(state, kind_id, child_index) + }; + self.state_stack.push(next_state_id); + self.child_index_stack.push(child_index); + true + } else { + false + } + } + + pub fn goto_parent(&mut self) -> bool { + if self.cursor.goto_parent() { + self.state_stack.pop(); + self.child_index_stack.pop(); + true + } else { + false + } + } + + fn next_state(&self, state: &PropertyState, node_kind_id: u16, node_child_index: u32) -> u32 { + state + .transitions + .get(&node_kind_id) + .and_then(|transitions| { + for transition in transitions.iter() { + if transition.child_index == Some(node_child_index) || transition.child_index == None { + return Some(transition.state_id); + } + } + None + }) + .unwrap_or(state.default_next_state_id) + } + + fn 
current_state(&self) -> &PropertyState { + &self.property_sheet.states[*self.state_stack.last().unwrap() as usize] + } +} + impl Point { pub fn new(row: u32, column: u32) -> Self { Point { row, column } @@ -526,6 +651,64 @@ impl From for Point { } } +impl PropertySheet { + pub fn new(language: Language, json: &str) -> Result { + #[derive(Deserialize, Debug)] + struct PropertyTransitionJSON { + #[serde(rename = "type")] + kind: String, + named: bool, + index: Option, + state_id: u32, + } + + #[derive(Deserialize, Debug)] + struct PropertyStateJSON { + transitions: Vec, + property_set_id: u32, + default_next_state_id: u32, + } + + #[derive(Deserialize, Debug)] + struct PropertySheetJSON { + states: Vec, + property_sets: Vec>, + } + + let input: PropertySheetJSON = serde_json::from_str(json)?; + Ok(PropertySheet { + property_sets: input.property_sets, + states: input + .states + .iter() + .map(|state| { + let mut transitions = HashMap::new(); + let node_kind_count = language.node_kind_count(); + for transition in state.transitions.iter() { + for i in 0..node_kind_count { + let i = i as u16; + if language.node_kind_is_named(i) == transition.named + && transition.kind == language.node_kind_for_id(i) + { + let entry = transitions.entry(i).or_insert(Vec::new()); + entry.push(PropertyTransition { + child_index: transition.index, + state_id: transition.state_id, + }); + } + } + } + PropertyState { + transitions, + default_next_state_id: state.default_next_state_id, + property_set_id: state.property_set_id, + } + }) + .collect(), + }) + } +} + #[cfg(test)] mod tests { use super::*; @@ -600,11 +783,11 @@ mod tests { let tree = parser .parse_str( " - struct Stuff { - a: A; - b: Option, - } - ", + struct Stuff { + a: A; + b: Option, + } + ", None, ) .unwrap(); @@ -628,6 +811,103 @@ mod tests { assert_eq!(cursor.node().is_named(), true); } + #[test] + fn test_tree_property_matching() { + let mut parser = Parser::new(); + parser.set_language(rust()).unwrap(); + let tree = 
parser.parse_str("fn f1() { f2(); }", None).unwrap(); + + let property_sheet = PropertySheet::new( + rust(), + r##" + { + "states": [ + { + "transitions": [ + {"type": "call_expression", "named": true, "state_id": 1}, + {"type": "function_item", "named": true, "state_id": 2} + ], + "default_next_state_id": 0, + "property_set_id": 0 + }, + { + "transitions": [ + {"type": "identifier", "named": true, "state_id": 3} + ], + "default_next_state_id": 0, + "property_set_id": 0 + }, + { + "transitions": [ + {"type": "identifier", "named": true, "state_id": 4} + ], + "default_next_state_id": 0, + "property_set_id": 0 + }, + { + "transitions": [], + "default_next_state_id": 0, + "property_set_id": 1 + }, + { + "transitions": [], + "default_next_state_id": 0, + "property_set_id": 2 + } + ], + "property_sets": [ + {}, + {"reference": "function"}, + {"define": "function"} + ] + } + "##, + ) + .unwrap(); + + let mut cursor = tree.walk_with_properties(&property_sheet); + assert_eq!(cursor.node().kind(), "source_file"); + assert_eq!(*cursor.node_properties(), HashMap::new()); + + assert!(cursor.goto_first_child()); + assert_eq!(cursor.node().kind(), "function_item"); + assert_eq!(*cursor.node_properties(), HashMap::new()); + + assert!(cursor.goto_first_child()); + assert_eq!(cursor.node().kind(), "fn"); + assert_eq!(*cursor.node_properties(), HashMap::new()); + assert!(!cursor.goto_first_child()); + + assert!(cursor.goto_next_sibling()); + assert_eq!(cursor.node().kind(), "identifier"); + assert_eq!(cursor.node_properties()["define"], "function"); + assert!(!cursor.goto_first_child()); + + assert!(cursor.goto_next_sibling()); + assert_eq!(cursor.node().kind(), "parameters"); + assert_eq!(*cursor.node_properties(), HashMap::new()); + + assert!(cursor.goto_first_child()); + assert_eq!(cursor.node().kind(), "("); + assert!(cursor.goto_next_sibling()); + assert_eq!(cursor.node().kind(), ")"); + assert_eq!(*cursor.node_properties(), HashMap::new()); + + assert!(cursor.goto_parent()); + 
assert!(cursor.goto_next_sibling()); + assert_eq!(cursor.node().kind(), "block"); + assert_eq!(*cursor.node_properties(), HashMap::new()); + + assert!(cursor.goto_first_child()); + assert!(cursor.goto_next_sibling()); + assert_eq!(cursor.node().kind(), "call_expression"); + assert_eq!(*cursor.node_properties(), HashMap::new()); + + assert!(cursor.goto_first_child()); + assert_eq!(cursor.node().kind(), "identifier"); + assert_eq!(cursor.node_properties()["reference"], "function"); + } + #[test] fn test_custom_utf8_input() { let mut parser = Parser::new(); From afe722358236dfb1389471a1037531b7c5422d0f Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Tue, 20 Nov 2018 15:56:16 -0800 Subject: [PATCH 041/208] Upgrade Tree-sitter, use single source file in build script --- build.rs | 25 ++----------------------- vendor/tree-sitter | 2 +- 2 files changed, 3 insertions(+), 24 deletions(-) diff --git a/build.rs b/build.rs index 7d9ee83e..add3bec7 100644 --- a/build.rs +++ b/build.rs @@ -13,29 +13,8 @@ fn main() { .flag_if_supported("-Wno-unused-parameter") .include(root_path.join("src")) .include(root_path.join("include")) - .include(root_path.join("externals").join("utf8proc")); - - let source_filenames = [ - "get_changed_ranges.c", - "language.c", - "lexer.c", - "node.c", - "parser.c", - "stack.c", - "subtree.c", - "tree_cursor.c", - "tree.c", - "utf16.c", - ]; - - config.files(source_filenames.iter().map(|source_filename| { - root_path - .join("src") - .join("runtime") - .join(&source_filename) - })); - - config.file(root_path.join("externals").join("utf8proc").join("utf8proc.c")); + .include(root_path.join("externals").join("utf8proc")) + .file(root_path.join("src").join("runtime").join("runtime.c")); if env::var("RUST_TREE_SITTER_TEST").is_ok() { let parser_dir: PathBuf = ["fixtures", "tree-sitter-rust", "src"].iter().collect(); diff --git a/vendor/tree-sitter b/vendor/tree-sitter index 16376c43..6b8e5bd1 160000 --- a/vendor/tree-sitter +++ b/vendor/tree-sitter @@ -1 
+1 @@ -Subproject commit 16376c43f5cc75bbc5297e6d5716bd94d55ccc05 +Subproject commit 6b8e5bd1f96ab63f17873ef9f7a72569a421810f From 8fdcf84ff3396e4c8fc8ee4cdc9e37ebe9f126cf Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Tue, 20 Nov 2018 16:00:45 -0800 Subject: [PATCH 042/208] 0.3.2 --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 485d369e..2c92acc5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "tree-sitter" description = "Rust bindings to the Tree-sitter parsing library" -version = "0.3.1" +version = "0.3.2" authors = ["Max Brunsfeld "] build = "build.rs" license = "MIT" From a741265ead8dc67de991046d295e2f316681cce0 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Wed, 28 Nov 2018 17:26:16 -0800 Subject: [PATCH 043/208] Replace all u32s in the API with usizes Co-Authored-By: Timothy Clem --- src/lib.rs | 146 +++++++++++++++++++++++++++-------------------------- 1 file changed, 75 insertions(+), 71 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 19b9a670..fa3d970e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -27,29 +27,36 @@ type Logger<'a> = Box; #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)] pub struct Point { - pub row: u32, - pub column: u32, + pub row: usize, + pub column: usize, } #[derive(Clone, Copy, Debug, PartialEq, Eq)] pub struct InputEdit { - pub start_byte: u32, - pub old_end_byte: u32, - pub new_end_byte: u32, + pub start_byte: usize, + pub old_end_byte: usize, + pub new_end_byte: usize, pub start_position: Point, pub old_end_position: Point, pub new_end_position: Point, } struct PropertyTransition { - state_id: u32, - child_index: Option, + state_id: usize, + child_index: Option, + text_regex: Option, } struct PropertyState { transitions: HashMap>, - property_set_id: u32, - default_next_state_id: u32, + property_set_id: usize, + default_next_state_id: usize, +} + +#[derive(Debug)] +pub enum PropertySheetError { + 
InvalidJSON(serde_json::Error), + InvalidRegex(regex::Error) } pub struct PropertySheet { @@ -67,9 +74,10 @@ pub struct TreeCursor<'a>(ffi::TSTreeCursor, PhantomData<&'a ()>); pub struct TreePropertyCursor<'a> { cursor: TreeCursor<'a>, - state_stack: Vec, - child_index_stack: Vec, + state_stack: Vec, + child_index_stack: Vec, property_sheet: &'a PropertySheet, + source: &'a str, } impl Language { @@ -165,10 +173,10 @@ impl Parser { pub fn parse_str(&mut self, input: &str, old_tree: Option<&Tree>) -> Option { let bytes = input.as_bytes(); - self.parse_utf8(&mut |offset, _| &bytes[(offset as usize)..], old_tree) + self.parse_utf8(&mut |offset, _| &bytes[offset..], old_tree) } - pub fn parse_utf8<'a, T: FnMut(u32, Point) -> &'a [u8]>( + pub fn parse_utf8<'a, T: FnMut(usize, Point) -> &'a [u8]>( &mut self, input: &mut T, old_tree: Option<&Tree>, @@ -182,7 +190,7 @@ impl Parser { ) } - pub fn parse_utf16<'a, T: 'a + FnMut(u32, Point) -> &'a [u16]>( + pub fn parse_utf16<'a, T: 'a + FnMut(usize, Point) -> &'a [u16]>( &mut self, input: &mut T, old_tree: Option<&Tree>, @@ -239,19 +247,19 @@ impl Parser { unsafe { ffi::ts_parser_set_operation_limit(self.0, limit) } } - fn parse_utf8_ptr (*const u8, usize)>( + fn parse_utf8_ptr (*const u8, usize)>( &mut self, input: &mut T, old_tree: Option<&Tree>, ) -> Option { - unsafe extern "C" fn read (*const u8, usize)>( + unsafe extern "C" fn read (*const u8, usize)>( payload: *mut c_void, byte_offset: u32, position: ffi::TSPoint, bytes_read: *mut u32, ) -> *const c_char { let input = (payload as *mut T).as_mut().unwrap(); - let (ptr, length) = (*input)(byte_offset, position.into()); + let (ptr, length) = (*input)(byte_offset as usize, position.into()); *bytes_read = length as u32; return ptr as *const c_char; }; @@ -271,12 +279,12 @@ impl Parser { } } - fn parse_utf16_ptr (*const u16, usize)>( + fn parse_utf16_ptr (*const u16, usize)>( &mut self, input: &mut T, old_tree: Option<&Tree>, ) -> Option { - unsafe extern "C" fn read (*const 
u16, usize)>( + unsafe extern "C" fn read (*const u16, usize)>( payload: *mut c_void, byte_offset: u32, position: ffi::TSPoint, @@ -284,10 +292,10 @@ impl Parser { ) -> *const c_char { let input = (payload as *mut T).as_mut().unwrap(); let (ptr, length) = (*input)( - byte_offset, + byte_offset as usize, Point { - row: position.row, - column: position.column / 2, + row: position.row as usize, + column: position.column as usize / 2, }, ); *bytes_read = length as u32 * 2; @@ -326,9 +334,9 @@ impl Tree { pub fn edit(&mut self, edit: &InputEdit) { let edit = ffi::TSInputEdit { - start_byte: edit.start_byte, - old_end_byte: edit.old_end_byte, - new_end_byte: edit.new_end_byte, + start_byte: edit.start_byte as u32, + old_end_byte: edit.old_end_byte as u32, + new_end_byte: edit.new_end_byte as u32, start_point: edit.start_position.into(), old_end_point: edit.old_end_position.into(), new_end_point: edit.new_end_position.into(), @@ -399,44 +407,38 @@ impl<'tree> Node<'tree> { unsafe { ffi::ts_node_has_error(self.0) } } - pub fn start_byte(&self) -> u32 { - unsafe { ffi::ts_node_start_byte(self.0) } + pub fn start_byte(&self) -> usize { + unsafe { ffi::ts_node_start_byte(self.0) as usize } } - pub fn end_byte(&self) -> u32 { - unsafe { ffi::ts_node_end_byte(self.0) } + pub fn end_byte(&self) -> usize { + unsafe { ffi::ts_node_end_byte(self.0) as usize } } pub fn start_position(&self) -> Point { let result = unsafe { ffi::ts_node_start_point(self.0) }; - Point { - row: result.row, - column: result.column, - } + result.into() } pub fn end_position(&self) -> Point { let result = unsafe { ffi::ts_node_end_point(self.0) }; - Point { - row: result.row, - column: result.column, - } + result.into() } - pub fn child(&self, i: u32) -> Option { - Self::new(unsafe { ffi::ts_node_child(self.0, i) }) + pub fn child(&self, i: usize) -> Option { + Self::new(unsafe { ffi::ts_node_child(self.0, i as u32) }) } - pub fn child_count(&self) -> u32 { - unsafe { ffi::ts_node_child_count(self.0) } + 
pub fn child_count(&self) -> usize { + unsafe { ffi::ts_node_child_count(self.0) as usize } } - pub fn named_child<'a>(&'a self, i: u32) -> Option { - Self::new(unsafe { ffi::ts_node_named_child(self.0, i) }) + pub fn named_child<'a>(&'a self, i: usize) -> Option { + Self::new(unsafe { ffi::ts_node_named_child(self.0, i as u32) }) } - pub fn named_child_count(&self) -> u32 { - unsafe { ffi::ts_node_named_child_count(self.0) } + pub fn named_child_count(&self) -> usize { + unsafe { ffi::ts_node_named_child_count(self.0) as usize } } pub fn parent(&self) -> Option { @@ -474,11 +476,11 @@ impl<'tree> Node<'tree> { } pub fn utf8_text<'a>(&self, source: &'a str) -> Result<&'a str, str::Utf8Error> { - str::from_utf8(&source.as_bytes()[self.start_byte() as usize..self.end_byte() as usize]) + str::from_utf8(&source.as_bytes()[self.start_byte()..self.end_byte()]) } pub fn utf16_text<'a>(&self, source: &'a [u16]) -> &'a [u16] { - &source[self.start_byte() as usize..self.end_byte() as usize] + &source[self.start_byte()..self.end_byte()] } pub fn walk(&self) -> TreeCursor<'tree> { @@ -524,12 +526,12 @@ impl<'a> TreeCursor<'a> { return unsafe { ffi::ts_tree_cursor_goto_next_sibling(&mut self.0) }; } - pub fn goto_first_child_for_index(&mut self, index: u32) -> Option { - let result = unsafe { ffi::ts_tree_cursor_goto_first_child_for_byte(&mut self.0, index) }; + pub fn goto_first_child_for_index(&mut self, index: usize) -> Option { + let result = unsafe { ffi::ts_tree_cursor_goto_first_child_for_byte(&mut self.0, index as u32) }; if result < 0 { None } else { - Some(result as u32) + Some(result as usize) } } } @@ -541,12 +543,13 @@ impl<'a> Drop for TreeCursor<'a> { } impl<'a> TreePropertyCursor<'a> { - fn new(tree: &'a Tree, property_sheet: &'a PropertySheet) -> Self { + fn new(tree: &'a Tree, property_sheet: &'a PropertySheet, source: &'a str) -> Self { Self { cursor: tree.root_node().walk(), child_index_stack: vec![0], state_stack: vec![0], property_sheet, + source, } } @@ 
-555,7 +558,7 @@ impl<'a> TreePropertyCursor<'a> { } pub fn node_properties(&self) -> &'a HashMap { - &self.property_sheet.property_sets[self.current_state().property_set_id as usize] + &self.property_sheet.property_sets[self.current_state().property_set_id] } pub fn goto_first_child(&mut self) -> bool { @@ -601,7 +604,7 @@ impl<'a> TreePropertyCursor<'a> { } } - fn next_state(&self, state: &PropertyState, node_kind_id: u16, node_child_index: u32) -> u32 { + fn next_state(&self, state: &PropertyState, node_kind_id: u16, node_child_index: usize) -> usize { state .transitions .get(&node_kind_id) @@ -617,12 +620,12 @@ impl<'a> TreePropertyCursor<'a> { } fn current_state(&self) -> &PropertyState { - &self.property_sheet.states[*self.state_stack.last().unwrap() as usize] + &self.property_sheet.states[*self.state_stack.last().unwrap()] } } impl Point { - pub fn new(row: u32, column: u32) -> Self { + pub fn new(row: usize, column: usize) -> Self { Point { row, column } } } @@ -636,8 +639,8 @@ impl fmt::Display for Point { impl Into for Point { fn into(self) -> ffi::TSPoint { ffi::TSPoint { - row: self.row, - column: self.column, + row: self.row as u32, + column: self.column as u32, } } } @@ -645,28 +648,29 @@ impl Into for Point { impl From for Point { fn from(point: ffi::TSPoint) -> Self { Self { - row: point.row, - column: point.column, + row: point.row as usize, + column: point.column as usize, } } } impl PropertySheet { - pub fn new(language: Language, json: &str) -> Result { + pub fn new(language: Language, json: &str) -> Result { #[derive(Deserialize, Debug)] struct PropertyTransitionJSON { #[serde(rename = "type")] kind: String, named: bool, - index: Option, - state_id: u32, + index: Option, + text: Option, + state_id: usize, } #[derive(Deserialize, Debug)] struct PropertyStateJSON { transitions: Vec, - property_set_id: u32, - default_next_state_id: u32, + property_set_id: usize, + default_next_state_id: usize, } #[derive(Deserialize, Debug)] @@ -918,8 +922,8 @@ 
mod tests { let tree = parser .parse_utf8( &mut |_, position| { - let row = position.row as usize; - let column = position.column as usize; + let row = position.row; + let column = position.column; if row < lines.len() { if column < lines[row].as_bytes().len() { &lines[row].as_bytes()[column..] @@ -958,8 +962,8 @@ mod tests { let tree = parser .parse_utf16( &mut |_, position| { - let row = position.row as usize; - let column = position.column as usize; + let row = position.row; + let column = position.column; if row < lines.len() { if column < lines[row].len() { &lines[row][column..] @@ -1004,7 +1008,7 @@ mod tests { let mut tree = parser .parse_utf8( &mut |offset, _| { - let offset = offset as usize; + let offset = offset; if offset < input_bytes.len() { let result = &input_bytes[offset..offset + 1]; input_bytes_read.extend(result.iter()); @@ -1043,7 +1047,7 @@ mod tests { let tree = parser .parse_utf8( &mut |offset, _| { - let offset = offset as usize; + let offset = offset; if offset < input_bytes.len() { let result = &input_bytes[offset..offset + 1]; input_bytes_read.extend(result.iter()); @@ -1101,7 +1105,7 @@ mod tests { tree_clone.edit(&InputEdit { start_byte: 0, old_end_byte: 0, - new_end_byte: prepended_source.len() as u32, + new_end_byte: prepended_source.len(), start_position: Point::new(0, 0), old_end_position: Point::new(0, 0), new_end_position: Point::new(prepend_line_count, 0), From d5b53cde7dded6ebbc0d78ed131e9a10f2a62c5b Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Wed, 28 Nov 2018 17:26:48 -0800 Subject: [PATCH 044/208] Respect the `:text` pseudo-class in TreePropertyCursor Co-Authored-By: Timothy Clem --- Cargo.toml | 1 + src/lib.rs | 87 ++++++++++++++++++++++++++++++++++-------------------- 2 files changed, 56 insertions(+), 32 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 2c92acc5..0ffee772 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,6 +21,7 @@ include = [ ] [dependencies] +regex = "1" serde = "1.0" serde_json = "1.0" 
serde_derive = "1.0" diff --git a/src/lib.rs b/src/lib.rs index fa3d970e..a76ed115 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -3,7 +3,9 @@ mod ffi; #[macro_use] extern crate serde_derive; extern crate serde_json; +extern crate regex; +use regex::Regex; use std::collections::HashMap; use std::ffi::CStr; use std::fmt; @@ -351,8 +353,9 @@ impl Tree { pub fn walk_with_properties<'a>( &'a self, property_sheet: &'a PropertySheet, + source: &'a str, ) -> TreePropertyCursor<'a> { - TreePropertyCursor::new(self, property_sheet) + TreePropertyCursor::new(self, property_sheet, source) } } @@ -610,9 +613,23 @@ impl<'a> TreePropertyCursor<'a> { .get(&node_kind_id) .and_then(|transitions| { for transition in transitions.iter() { - if transition.child_index == Some(node_child_index) || transition.child_index == None { - return Some(transition.state_id); + if let Some(text_regex) = transition.text_regex.as_ref() { + let node = self.cursor.node(); + let text = &self.source.as_bytes()[node.start_byte()..node.end_byte()]; + if let Ok(text) = str::from_utf8(text) { + if !text_regex.is_match(text) { + continue; + } + } } + + if let Some(child_index) = transition.child_index { + if child_index != node_child_index { + continue; + } + } + + return Some(transition.state_id); } None }) @@ -679,36 +696,42 @@ impl PropertySheet { property_sets: Vec>, } - let input: PropertySheetJSON = serde_json::from_str(json)?; + let input: PropertySheetJSON = serde_json::from_str(json) + .map_err(|e| PropertySheetError::InvalidJSON(e))?; + let mut states = Vec::new(); + + for state in input.states.iter() { + let mut transitions = HashMap::new(); + let node_kind_count = language.node_kind_count(); + for transition in state.transitions.iter() { + for i in 0..node_kind_count { + let i = i as u16; + if language.node_kind_is_named(i) == transition.named + && transition.kind == language.node_kind_for_id(i) + { + let entry = transitions.entry(i).or_insert(Vec::new()); + let text_regex = if let Some(text) = 
transition.text.as_ref() { + Some(Regex::new(&text).map_err(|e| PropertySheetError::InvalidRegex(e))?) + } else { + None + }; + entry.push(PropertyTransition { + child_index: transition.index, + state_id: transition.state_id, + text_regex + }); + } + } + } + states.push(PropertyState { + transitions, + default_next_state_id: state.default_next_state_id, + property_set_id: state.property_set_id, + }); + } Ok(PropertySheet { property_sets: input.property_sets, - states: input - .states - .iter() - .map(|state| { - let mut transitions = HashMap::new(); - let node_kind_count = language.node_kind_count(); - for transition in state.transitions.iter() { - for i in 0..node_kind_count { - let i = i as u16; - if language.node_kind_is_named(i) == transition.named - && transition.kind == language.node_kind_for_id(i) - { - let entry = transitions.entry(i).or_insert(Vec::new()); - entry.push(PropertyTransition { - child_index: transition.index, - state_id: transition.state_id, - }); - } - } - } - PropertyState { - transitions, - default_next_state_id: state.default_next_state_id, - property_set_id: state.property_set_id, - } - }) - .collect(), + states, }) } } @@ -869,7 +892,7 @@ mod tests { ) .unwrap(); - let mut cursor = tree.walk_with_properties(&property_sheet); + let mut cursor = tree.walk_with_properties(&property_sheet, ""); assert_eq!(cursor.node().kind(), "source_file"); assert_eq!(*cursor.node_properties(), HashMap::new()); From c9ce314695a5bad674aed9b267b9c430411bb731 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 29 Nov 2018 16:21:01 -0800 Subject: [PATCH 045/208] Make PropertySheet generic on the properties type Co-Authored-By: Timothy Clem --- src/lib.rs | 38 +++++++++++++++++++++----------------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index a76ed115..68715879 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -4,7 +4,9 @@ mod ffi; extern crate serde_derive; extern crate serde_json; extern crate regex; +extern crate 
serde; +use serde::Deserialize; use regex::Regex; use std::collections::HashMap; use std::ffi::CStr; @@ -61,9 +63,10 @@ pub enum PropertySheetError { InvalidRegex(regex::Error) } -pub struct PropertySheet { +pub struct PropertySheet<'d, P: Deserialize<'d>> { states: Vec, - property_sets: Vec>, + property_sets: Vec

, + _phantom: &'d std::marker::PhantomData<()>, } pub struct Node<'a>(ffi::TSNode, PhantomData<&'a ()>); @@ -74,11 +77,11 @@ pub struct Tree(*mut ffi::TSTree); pub struct TreeCursor<'a>(ffi::TSTreeCursor, PhantomData<&'a ()>); -pub struct TreePropertyCursor<'a> { +pub struct TreePropertyCursor<'a, 'd, P: Deserialize<'d>> { cursor: TreeCursor<'a>, state_stack: Vec, child_index_stack: Vec, - property_sheet: &'a PropertySheet, + property_sheet: &'a PropertySheet<'d, P>, source: &'a str, } @@ -350,11 +353,11 @@ impl Tree { self.root_node().walk() } - pub fn walk_with_properties<'a>( + pub fn walk_with_properties<'a, 'd, P: Deserialize<'d>>( &'a self, - property_sheet: &'a PropertySheet, + property_sheet: &'a PropertySheet<'d, P>, source: &'a str, - ) -> TreePropertyCursor<'a> { + ) -> TreePropertyCursor<'a, 'd, P> { TreePropertyCursor::new(self, property_sheet, source) } } @@ -545,8 +548,8 @@ impl<'a> Drop for TreeCursor<'a> { } } -impl<'a> TreePropertyCursor<'a> { - fn new(tree: &'a Tree, property_sheet: &'a PropertySheet, source: &'a str) -> Self { +impl<'a, 'd, P: Deserialize<'d>> TreePropertyCursor<'a, 'd, P> { + fn new(tree: &'a Tree, property_sheet: &'a PropertySheet<'d, P>, source: &'a str) -> Self { Self { cursor: tree.root_node().walk(), child_index_stack: vec![0], @@ -560,7 +563,7 @@ impl<'a> TreePropertyCursor<'a> { self.cursor.node() } - pub fn node_properties(&self) -> &'a HashMap { + pub fn node_properties(&self) -> &'a P { &self.property_sheet.property_sets[self.current_state().property_set_id] } @@ -671,8 +674,8 @@ impl From for Point { } } -impl PropertySheet { - pub fn new(language: Language, json: &str) -> Result { +impl<'a, P: Deserialize<'a>> PropertySheet<'a, P> { + pub fn new(language: Language, json: &'a str) -> Result { #[derive(Deserialize, Debug)] struct PropertyTransitionJSON { #[serde(rename = "type")] @@ -691,12 +694,12 @@ impl PropertySheet { } #[derive(Deserialize, Debug)] - struct PropertySheetJSON { + struct PropertySheetJSON

{ states: Vec, - property_sets: Vec>, + property_sets: Vec

, } - let input: PropertySheetJSON = serde_json::from_str(json) + let input: PropertySheetJSON

= serde_json::from_str(json) .map_err(|e| PropertySheetError::InvalidJSON(e))?; let mut states = Vec::new(); @@ -729,9 +732,10 @@ impl PropertySheet { property_set_id: state.property_set_id, }); } - Ok(PropertySheet { + Ok(Self { property_sets: input.property_sets, states, + _phantom: &std::marker::PhantomData, }) } } @@ -844,7 +848,7 @@ mod tests { parser.set_language(rust()).unwrap(); let tree = parser.parse_str("fn f1() { f2(); }", None).unwrap(); - let property_sheet = PropertySheet::new( + let property_sheet = PropertySheet::>::new( rust(), r##" { From 11610e1df66214a1bf58bff2565b52d270bf0d5b Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 29 Nov 2018 20:51:50 -0800 Subject: [PATCH 046/208] Eliminate deserializer lifetime on PropertySheet The PropertySheet is intended to be a long-lived object, whereas its JSON source is not needed once the property sheet is instantiated. Co-Authored-By: Timothy Clem --- src/lib.rs | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 68715879..681af7fb 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -6,7 +6,7 @@ extern crate serde_json; extern crate regex; extern crate serde; -use serde::Deserialize; +use serde::de::DeserializeOwned; use regex::Regex; use std::collections::HashMap; use std::ffi::CStr; @@ -63,10 +63,9 @@ pub enum PropertySheetError { InvalidRegex(regex::Error) } -pub struct PropertySheet<'d, P: Deserialize<'d>> { +pub struct PropertySheet> { states: Vec, property_sets: Vec

, - _phantom: &'d std::marker::PhantomData<()>, } pub struct Node<'a>(ffi::TSNode, PhantomData<&'a ()>); @@ -77,11 +76,11 @@ pub struct Tree(*mut ffi::TSTree); pub struct TreeCursor<'a>(ffi::TSTreeCursor, PhantomData<&'a ()>); -pub struct TreePropertyCursor<'a, 'd, P: Deserialize<'d>> { +pub struct TreePropertyCursor<'a, P: 'a + DeserializeOwned> { cursor: TreeCursor<'a>, state_stack: Vec, child_index_stack: Vec, - property_sheet: &'a PropertySheet<'d, P>, + property_sheet: &'a PropertySheet

, source: &'a str, } @@ -353,11 +352,11 @@ impl Tree { self.root_node().walk() } - pub fn walk_with_properties<'a, 'd, P: Deserialize<'d>>( + pub fn walk_with_properties<'a, P: DeserializeOwned>( &'a self, - property_sheet: &'a PropertySheet<'d, P>, + property_sheet: &'a PropertySheet

, source: &'a str, - ) -> TreePropertyCursor<'a, 'd, P> { + ) -> TreePropertyCursor<'a, P> { TreePropertyCursor::new(self, property_sheet, source) } } @@ -548,8 +547,8 @@ impl<'a> Drop for TreeCursor<'a> { } } -impl<'a, 'd, P: Deserialize<'d>> TreePropertyCursor<'a, 'd, P> { - fn new(tree: &'a Tree, property_sheet: &'a PropertySheet<'d, P>, source: &'a str) -> Self { +impl<'a, P: DeserializeOwned> TreePropertyCursor<'a, P> { + fn new(tree: &'a Tree, property_sheet: &'a PropertySheet

, source: &'a str) -> Self { Self { cursor: tree.root_node().walk(), child_index_stack: vec![0], @@ -674,8 +673,8 @@ impl From for Point { } } -impl<'a, P: Deserialize<'a>> PropertySheet<'a, P> { - pub fn new(language: Language, json: &'a str) -> Result { +impl PropertySheet

{ + pub fn new(language: Language, json: &str) -> Result { #[derive(Deserialize, Debug)] struct PropertyTransitionJSON { #[serde(rename = "type")] @@ -735,7 +734,6 @@ impl<'a, P: Deserialize<'a>> PropertySheet<'a, P> { Ok(Self { property_sets: input.property_sets, states, - _phantom: &std::marker::PhantomData, }) } } From fbb220f19302ff44f172b6a48362ece7f62167ee Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Mon, 3 Dec 2018 10:43:58 -0800 Subject: [PATCH 047/208] Add test for regexes in property sheets --- src/lib.rs | 112 +++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 100 insertions(+), 12 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 681af7fb..724a08bd 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -844,9 +844,18 @@ mod tests { fn test_tree_property_matching() { let mut parser = Parser::new(); parser.set_language(rust()).unwrap(); - let tree = parser.parse_str("fn f1() { f2(); }", None).unwrap(); + let source_code = "fn f1() { f2(); }"; + let tree = parser.parse_str(source_code, None).unwrap(); - let property_sheet = PropertySheet::>::new( + #[derive(Debug, Deserialize, PartialEq, Eq)] + struct Properties { + reference: Option, + define: Option, + } + + let empty_properties = Properties { reference: None, define: None }; + + let property_sheet = PropertySheet::::new( rust(), r##" { @@ -894,47 +903,126 @@ mod tests { ) .unwrap(); - let mut cursor = tree.walk_with_properties(&property_sheet, ""); + let mut cursor = tree.walk_with_properties(&property_sheet, source_code); assert_eq!(cursor.node().kind(), "source_file"); - assert_eq!(*cursor.node_properties(), HashMap::new()); + assert_eq!(*cursor.node_properties(), empty_properties); assert!(cursor.goto_first_child()); assert_eq!(cursor.node().kind(), "function_item"); - assert_eq!(*cursor.node_properties(), HashMap::new()); + assert_eq!(*cursor.node_properties(), empty_properties); assert!(cursor.goto_first_child()); assert_eq!(cursor.node().kind(), "fn"); - 
assert_eq!(*cursor.node_properties(), HashMap::new()); + assert_eq!(*cursor.node_properties(), empty_properties); assert!(!cursor.goto_first_child()); assert!(cursor.goto_next_sibling()); assert_eq!(cursor.node().kind(), "identifier"); - assert_eq!(cursor.node_properties()["define"], "function"); + assert_eq!(cursor.node_properties().define, Some("function".to_owned())); assert!(!cursor.goto_first_child()); assert!(cursor.goto_next_sibling()); assert_eq!(cursor.node().kind(), "parameters"); - assert_eq!(*cursor.node_properties(), HashMap::new()); + assert_eq!(*cursor.node_properties(), empty_properties); assert!(cursor.goto_first_child()); assert_eq!(cursor.node().kind(), "("); assert!(cursor.goto_next_sibling()); assert_eq!(cursor.node().kind(), ")"); - assert_eq!(*cursor.node_properties(), HashMap::new()); + assert_eq!(*cursor.node_properties(), empty_properties); assert!(cursor.goto_parent()); assert!(cursor.goto_next_sibling()); assert_eq!(cursor.node().kind(), "block"); - assert_eq!(*cursor.node_properties(), HashMap::new()); + assert_eq!(*cursor.node_properties(), empty_properties); assert!(cursor.goto_first_child()); assert!(cursor.goto_next_sibling()); assert_eq!(cursor.node().kind(), "call_expression"); - assert_eq!(*cursor.node_properties(), HashMap::new()); + assert_eq!(*cursor.node_properties(), empty_properties); assert!(cursor.goto_first_child()); assert_eq!(cursor.node().kind(), "identifier"); - assert_eq!(cursor.node_properties()["reference"], "function"); + assert_eq!(cursor.node_properties().reference, Some("function".to_owned())); + } + + #[test] + fn test_tree_property_matching_with_regexes() { + let mut parser = Parser::new(); + parser.set_language(rust()).unwrap(); + let source_code = "fn f1() { None(a()) }"; + let tree = parser.parse_str(source_code, None).unwrap(); + + #[derive(Debug, Deserialize, PartialEq, Eq)] + struct Properties { + scope: Option, + } + + let empty_properties = Properties { scope: None }; + + let property_sheet = 
PropertySheet::::new( + rust(), + r##" + { + "states": [ + { + "id": 0, + "transitions": [ + {"type": "call_expression", "named": true, "state_id": 1} + ], + "default_next_state_id": 0, + "property_set_id": 0 + }, + { + "id": 1, + "transitions": [ + {"type": "identifier", "named": true, "text": "^[A-Z]", "state_id": 2}, + {"type": "identifier", "named": true, "state_id": 3} + ], + "default_next_state_id": 0, + "property_set_id": 0 + }, + { + "transitions": [], + "default_next_state_id": 0, + "property_set_id": 1 + }, + { + "transitions": [], + "default_next_state_id": 0, + "property_set_id": 2 + } + ], + "property_sets": [ + {}, + {"scope": "constructor"}, + {"scope": "function"} + ] + } + "##, + ) + .unwrap(); + + let mut cursor = tree.walk_with_properties(&property_sheet, source_code); + assert_eq!(cursor.node().kind(), "source_file"); + assert_eq!(*cursor.node_properties(), empty_properties); + + cursor.goto_first_child(); + assert!(cursor.goto_first_child()); + assert!(cursor.goto_next_sibling()); + assert!(cursor.goto_next_sibling()); + assert!(cursor.goto_next_sibling()); + assert_eq!(cursor.node().kind(), "block"); + assert_eq!(*cursor.node_properties(), empty_properties); + + assert!(cursor.goto_first_child()); + assert!(cursor.goto_next_sibling()); + assert_eq!(cursor.node().kind(), "call_expression"); + assert_eq!(*cursor.node_properties(), empty_properties); + + assert!(cursor.goto_first_child()); + assert_eq!(cursor.node().kind(), "identifier"); + assert_eq!(cursor.node_properties().scope, Some("constructor".to_owned())); } #[test] From beb60194d12b62cf70bc6b9e8652258ae07a9b44 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Mon, 3 Dec 2018 14:42:18 -0800 Subject: [PATCH 048/208] 0.3.3 --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 0ffee772..f61b1583 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "tree-sitter" description = "Rust bindings to the Tree-sitter 
parsing library" -version = "0.3.2" +version = "0.3.3" authors = ["Max Brunsfeld "] build = "build.rs" license = "MIT" From a4c4b85a16ce0ecbb550d6de47801d2e387e629b Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Wed, 5 Dec 2018 12:50:12 -0800 Subject: [PATCH 049/208] Initial commit --- .gitignore | 2 + Cargo.lock | 812 ++++++++++++++++++ Cargo.toml | 17 + src/build_tables/item.rs | 22 + src/build_tables/mod.rs | 34 + src/error.rs | 13 + src/generate.rs | 26 + src/grammars.rs | 98 +++ src/main.rs | 35 + src/parse_grammar.rs | 153 ++++ src/prepare_grammar/expand_repeats.rs | 220 +++++ src/prepare_grammar/extract_simple_aliases.rs | 10 + src/prepare_grammar/extract_tokens.rs | 7 + src/prepare_grammar/flatten_grammar.rs | 7 + src/prepare_grammar/intern_symbols.rs | 237 +++++ src/prepare_grammar/mod.rs | 40 + src/prepare_grammar/normalize_rules.rs | 5 + src/render/mod.rs | 16 + src/rules.rs | 205 +++++ src/tables.rs | 77 ++ 20 files changed, 2036 insertions(+) create mode 100644 .gitignore create mode 100644 Cargo.lock create mode 100644 Cargo.toml create mode 100644 src/build_tables/item.rs create mode 100644 src/build_tables/mod.rs create mode 100644 src/error.rs create mode 100644 src/generate.rs create mode 100644 src/grammars.rs create mode 100644 src/main.rs create mode 100644 src/parse_grammar.rs create mode 100644 src/prepare_grammar/expand_repeats.rs create mode 100644 src/prepare_grammar/extract_simple_aliases.rs create mode 100644 src/prepare_grammar/extract_tokens.rs create mode 100644 src/prepare_grammar/flatten_grammar.rs create mode 100644 src/prepare_grammar/intern_symbols.rs create mode 100644 src/prepare_grammar/mod.rs create mode 100644 src/prepare_grammar/normalize_rules.rs create mode 100644 src/render/mod.rs create mode 100644 src/rules.rs create mode 100644 src/tables.rs diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..53eaa219 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +/target +**/*.rs.bk diff --git a/Cargo.lock 
b/Cargo.lock new file mode 100644 index 00000000..20908681 --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,812 @@ +[[package]] +name = "aho-corasick" +version = "0.6.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "memchr 2.1.1 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "ansi_term" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "argon2rs" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "blake2-rfc 0.2.18 (registry+https://github.com/rust-lang/crates.io-index)", + "scoped_threadpool 0.1.9 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "arrayvec" +version = "0.4.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "nodrop 0.1.13 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "atty" +version = "0.2.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "libc 0.2.44 (registry+https://github.com/rust-lang/crates.io-index)", + "termion 1.5.1 (registry+https://github.com/rust-lang/crates.io-index)", + "winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "backtrace" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "backtrace-sys 0.1.24 (registry+https://github.com/rust-lang/crates.io-index)", + "cfg-if 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.44 (registry+https://github.com/rust-lang/crates.io-index)", + "rustc-demangle 0.1.9 (registry+https://github.com/rust-lang/crates.io-index)", + "winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "backtrace-sys" +version = 
"0.1.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "cc 1.0.25 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.44 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "bitflags" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "bitvec" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "blake2-rfc" +version = "0.2.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "arrayvec 0.4.8 (registry+https://github.com/rust-lang/crates.io-index)", + "constant_time_eq 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "cc" +version = "1.0.25" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "cfg-if" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "clap" +version = "2.32.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "ansi_term 0.11.0 (registry+https://github.com/rust-lang/crates.io-index)", + "atty 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)", + "bitflags 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)", + "strsim 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)", + "textwrap 0.10.0 (registry+https://github.com/rust-lang/crates.io-index)", + "unicode-width 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)", + "vec_map 0.8.1 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "cloudabi" +version = "0.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "bitflags 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "constant_time_eq" +version = "0.1.3" +source = 
"registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "crossbeam-channel" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "crossbeam-epoch 0.6.1 (registry+https://github.com/rust-lang/crates.io-index)", + "crossbeam-utils 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)", + "parking_lot 0.6.4 (registry+https://github.com/rust-lang/crates.io-index)", + "rand 0.5.5 (registry+https://github.com/rust-lang/crates.io-index)", + "smallvec 0.6.7 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "arrayvec 0.4.8 (registry+https://github.com/rust-lang/crates.io-index)", + "cfg-if 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)", + "crossbeam-utils 0.6.1 (registry+https://github.com/rust-lang/crates.io-index)", + "lazy_static 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)", + "memoffset 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)", + "scopeguard 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "crossbeam-utils" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "crossbeam-utils" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "cfg-if 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "dirs" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "libc 0.2.44 (registry+https://github.com/rust-lang/crates.io-index)", + "redox_users 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)", + "winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "failure" +version = "0.1.3" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "backtrace 0.3.9 (registry+https://github.com/rust-lang/crates.io-index)", + "failure_derive 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "failure_derive" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "proc-macro2 0.4.24 (registry+https://github.com/rust-lang/crates.io-index)", + "quote 0.6.10 (registry+https://github.com/rust-lang/crates.io-index)", + "syn 0.15.22 (registry+https://github.com/rust-lang/crates.io-index)", + "synstructure 0.10.1 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "fnv" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "fuchsia-zircon" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "bitflags 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)", + "fuchsia-zircon-sys 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "fuchsia-zircon-sys" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "globset" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "aho-corasick 0.6.9 (registry+https://github.com/rust-lang/crates.io-index)", + "fnv 1.0.6 (registry+https://github.com/rust-lang/crates.io-index)", + "log 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)", + "memchr 2.1.1 (registry+https://github.com/rust-lang/crates.io-index)", + "regex 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "ignore" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "crossbeam-channel 0.2.6 (registry+https://github.com/rust-lang/crates.io-index)", + "globset 0.4.2 
(registry+https://github.com/rust-lang/crates.io-index)", + "lazy_static 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)", + "log 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)", + "memchr 2.1.1 (registry+https://github.com/rust-lang/crates.io-index)", + "regex 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)", + "same-file 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)", + "thread_local 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", + "walkdir 2.2.7 (registry+https://github.com/rust-lang/crates.io-index)", + "winapi-util 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "itoa" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "lazy_static" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "libc" +version = "0.2.44" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "libloading" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "cc 1.0.25 (registry+https://github.com/rust-lang/crates.io-index)", + "winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "libsqlite3-sys" +version = "0.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "pkg-config 0.3.14 (registry+https://github.com/rust-lang/crates.io-index)", + "vcpkg 0.2.6 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "linked-hash-map" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "lock_api" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "owning_ref 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)", + "scopeguard 0.3.3 
(registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "log" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "cfg-if 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "lru-cache" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "linked-hash-map 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "memchr" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "cfg-if 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.44 (registry+https://github.com/rust-lang/crates.io-index)", + "version_check 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "memoffset" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "nodrop" +version = "0.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "owning_ref" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "stable_deref_trait 1.1.1 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "parking_lot" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "lock_api 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)", + "parking_lot_core 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "parking_lot_core" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "libc 0.2.44 (registry+https://github.com/rust-lang/crates.io-index)", + "rand 0.5.5 (registry+https://github.com/rust-lang/crates.io-index)", + "rustc_version 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)", 
+ "smallvec 0.6.7 (registry+https://github.com/rust-lang/crates.io-index)", + "winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "pkg-config" +version = "0.3.14" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "proc-macro2" +version = "0.4.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "quote" +version = "0.6.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "proc-macro2 0.4.24 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "rand" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "fuchsia-zircon 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.44 (registry+https://github.com/rust-lang/crates.io-index)", + "winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "rand" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "cloudabi 0.0.3 (registry+https://github.com/rust-lang/crates.io-index)", + "fuchsia-zircon 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.44 (registry+https://github.com/rust-lang/crates.io-index)", + "rand_core 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)", + "winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "rand_core" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "rand_core 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "rand_core" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "redox_syscall" +version = "0.1.43" 
+source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "redox_termios" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "redox_syscall 0.1.43 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "redox_users" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "argon2rs 0.2.5 (registry+https://github.com/rust-lang/crates.io-index)", + "failure 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)", + "rand 0.4.3 (registry+https://github.com/rust-lang/crates.io-index)", + "redox_syscall 0.1.43 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "regex" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "aho-corasick 0.6.9 (registry+https://github.com/rust-lang/crates.io-index)", + "memchr 2.1.1 (registry+https://github.com/rust-lang/crates.io-index)", + "regex-syntax 0.6.4 (registry+https://github.com/rust-lang/crates.io-index)", + "thread_local 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", + "utf8-ranges 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "regex-syntax" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "ucd-util 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "rusqlite" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "bitflags 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)", + "libsqlite3-sys 0.9.3 (registry+https://github.com/rust-lang/crates.io-index)", + "lru-cache 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)", + "time 0.1.40 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "rust-tree-sitter-cli" +version = 
"0.1.0" +dependencies = [ + "bitvec 0.8.0 (registry+https://github.com/rust-lang/crates.io-index)", + "clap 2.32.0 (registry+https://github.com/rust-lang/crates.io-index)", + "dirs 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)", + "ignore 0.4.4 (registry+https://github.com/rust-lang/crates.io-index)", + "libloading 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)", + "rusqlite 0.14.0 (registry+https://github.com/rust-lang/crates.io-index)", + "serde 1.0.80 (registry+https://github.com/rust-lang/crates.io-index)", + "serde_derive 1.0.80 (registry+https://github.com/rust-lang/crates.io-index)", + "serde_json 1.0.33 (registry+https://github.com/rust-lang/crates.io-index)", + "tree-sitter 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "rustc-demangle" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "rustc_version" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "semver 0.9.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "ryu" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "same-file" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "winapi-util 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "scoped_threadpool" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "scopeguard" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "semver" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "semver-parser 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "semver-parser" +version = "0.7.0" 
+source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "serde" +version = "1.0.80" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "serde_derive" +version = "1.0.80" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "proc-macro2 0.4.24 (registry+https://github.com/rust-lang/crates.io-index)", + "quote 0.6.10 (registry+https://github.com/rust-lang/crates.io-index)", + "syn 0.15.22 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "serde_json" +version = "1.0.33" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "itoa 0.4.3 (registry+https://github.com/rust-lang/crates.io-index)", + "ryu 0.2.7 (registry+https://github.com/rust-lang/crates.io-index)", + "serde 1.0.80 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "smallvec" +version = "0.6.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "unreachable 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "stable_deref_trait" +version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "strsim" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "syn" +version = "0.15.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "proc-macro2 0.4.24 (registry+https://github.com/rust-lang/crates.io-index)", + "quote 0.6.10 (registry+https://github.com/rust-lang/crates.io-index)", + "unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "synstructure" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "proc-macro2 0.4.24 (registry+https://github.com/rust-lang/crates.io-index)", + "quote 0.6.10 
(registry+https://github.com/rust-lang/crates.io-index)", + "syn 0.15.22 (registry+https://github.com/rust-lang/crates.io-index)", + "unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "termion" +version = "1.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "libc 0.2.44 (registry+https://github.com/rust-lang/crates.io-index)", + "redox_syscall 0.1.43 (registry+https://github.com/rust-lang/crates.io-index)", + "redox_termios 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "textwrap" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "unicode-width 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "thread_local" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "lazy_static 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "time" +version = "0.1.40" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "libc 0.2.44 (registry+https://github.com/rust-lang/crates.io-index)", + "redox_syscall 0.1.43 (registry+https://github.com/rust-lang/crates.io-index)", + "winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "tree-sitter" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "cc 1.0.25 (registry+https://github.com/rust-lang/crates.io-index)", + "regex 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)", + "serde 1.0.80 (registry+https://github.com/rust-lang/crates.io-index)", + "serde_derive 1.0.80 (registry+https://github.com/rust-lang/crates.io-index)", + "serde_json 1.0.33 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "ucd-util" +version = "0.1.3" +source = 
"registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "unicode-width" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "unicode-xid" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "unreachable" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "void 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "utf8-ranges" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "vcpkg" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "vec_map" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "version_check" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "void" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "walkdir" +version = "2.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "same-file 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)", + "winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", + "winapi-util 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "winapi" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "winapi-i686-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)", + "winapi-x86_64-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "winapi-util" +version = 
"0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[metadata] +"checksum aho-corasick 0.6.9 (registry+https://github.com/rust-lang/crates.io-index)" = "1e9a933f4e58658d7b12defcf96dc5c720f20832deebe3e0a19efd3b6aaeeb9e" +"checksum ansi_term 0.11.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ee49baf6cb617b853aa8d93bf420db2383fab46d314482ca2803b40d5fde979b" +"checksum argon2rs 0.2.5 (registry+https://github.com/rust-lang/crates.io-index)" = "3f67b0b6a86dae6e67ff4ca2b6201396074996379fba2b92ff649126f37cb392" +"checksum arrayvec 0.4.8 (registry+https://github.com/rust-lang/crates.io-index)" = "f405cc4c21cd8b784f6c8fc2adf9bc00f59558f0049b5ec21517f875963040cc" +"checksum atty 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)" = "9a7d5b8723950951411ee34d271d99dddcc2035a16ab25310ea2c8cfd4369652" +"checksum backtrace 0.3.9 (registry+https://github.com/rust-lang/crates.io-index)" = "89a47830402e9981c5c41223151efcced65a0510c13097c769cede7efb34782a" +"checksum backtrace-sys 0.1.24 (registry+https://github.com/rust-lang/crates.io-index)" = "c66d56ac8dabd07f6aacdaf633f4b8262f5b3601a810a0dcddffd5c22c69daa0" +"checksum bitflags 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)" = "228047a76f468627ca71776ecdebd732a3423081fcf5125585bcd7c49886ce12" +"checksum bitvec 0.8.0 (registry+https://github.com/rust-lang/crates.io-index)" = "e37e2176261200377c7cde4c6de020394174df556c356f965e4bc239f5ce1c5a" +"checksum blake2-rfc 0.2.18 (registry+https://github.com/rust-lang/crates.io-index)" = "5d6d530bdd2d52966a6d03b7a964add7ae1a288d25214066fd4b600f0f796400" +"checksum cc 1.0.25 (registry+https://github.com/rust-lang/crates.io-index)" = "f159dfd43363c4d08055a07703eb7a3406b0dac4d0584d96965a3262db3c9d16" 
+"checksum cfg-if 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "082bb9b28e00d3c9d39cc03e64ce4cea0f1bb9b3fde493f0cbc008472d22bdf4" +"checksum clap 2.32.0 (registry+https://github.com/rust-lang/crates.io-index)" = "b957d88f4b6a63b9d70d5f454ac8011819c6efa7727858f458ab71c756ce2d3e" +"checksum cloudabi 0.0.3 (registry+https://github.com/rust-lang/crates.io-index)" = "ddfc5b9aa5d4507acaf872de71051dfd0e309860e88966e1051e462a077aac4f" +"checksum constant_time_eq 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)" = "8ff012e225ce166d4422e0e78419d901719760f62ae2b7969ca6b564d1b54a9e" +"checksum crossbeam-channel 0.2.6 (registry+https://github.com/rust-lang/crates.io-index)" = "7b85741761b7f160bc5e7e0c14986ef685b7f8bf9b7ad081c60c604bb4649827" +"checksum crossbeam-epoch 0.6.1 (registry+https://github.com/rust-lang/crates.io-index)" = "2449aaa4ec7ef96e5fb24db16024b935df718e9ae1cec0a1e68feeca2efca7b8" +"checksum crossbeam-utils 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)" = "677d453a17e8bd2b913fa38e8b9cf04bcdbb5be790aa294f2389661d72036015" +"checksum crossbeam-utils 0.6.1 (registry+https://github.com/rust-lang/crates.io-index)" = "c55913cc2799171a550e307918c0a360e8c16004820291bf3b638969b4a01816" +"checksum dirs 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)" = "88972de891f6118092b643d85a0b28e0678e0f948d7f879aa32f2d5aafe97d2a" +"checksum failure 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)" = "6dd377bcc1b1b7ce911967e3ec24fa19c3224394ec05b54aa7b083d498341ac7" +"checksum failure_derive 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)" = "64c2d913fe8ed3b6c6518eedf4538255b989945c14c2a7d5cbff62a5e2120596" +"checksum fnv 1.0.6 (registry+https://github.com/rust-lang/crates.io-index)" = "2fad85553e09a6f881f739c29f0b00b0f01357c743266d478b68951ce23285f3" +"checksum fuchsia-zircon 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = 
"2e9763c69ebaae630ba35f74888db465e49e259ba1bc0eda7d06f4a067615d82" +"checksum fuchsia-zircon-sys 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "3dcaa9ae7725d12cdb85b3ad99a434db70b468c09ded17e012d86b5c1010f7a7" +"checksum globset 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)" = "4743617a7464bbda3c8aec8558ff2f9429047e025771037df561d383337ff865" +"checksum ignore 0.4.4 (registry+https://github.com/rust-lang/crates.io-index)" = "36ecfc5ad80f0b1226df948c562e2cddd446096be3f644c95106400eae8a5e01" +"checksum itoa 0.4.3 (registry+https://github.com/rust-lang/crates.io-index)" = "1306f3464951f30e30d12373d31c79fbd52d236e5e896fd92f96ec7babbbe60b" +"checksum lazy_static 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "a374c89b9db55895453a74c1e38861d9deec0b01b405a82516e9d5de4820dea1" +"checksum libc 0.2.44 (registry+https://github.com/rust-lang/crates.io-index)" = "10923947f84a519a45c8fefb7dd1b3e8c08747993381adee176d7a82b4195311" +"checksum libloading 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)" = "9c3ad660d7cb8c5822cd83d10897b0f1f1526792737a179e73896152f85b88c2" +"checksum libsqlite3-sys 0.9.3 (registry+https://github.com/rust-lang/crates.io-index)" = "d3711dfd91a1081d2458ad2d06ea30a8755256e74038be2ad927d94e1c955ca8" +"checksum linked-hash-map 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)" = "7860ec297f7008ff7a1e3382d7f7e1dcd69efc94751a2284bafc3d013c2aa939" +"checksum lock_api 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "62ebf1391f6acad60e5c8b43706dde4582df75c06698ab44511d15016bc2442c" +"checksum log 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)" = "c84ec4b527950aa83a329754b01dbe3f58361d1c5efacd1f6d68c494d08a17c6" +"checksum lru-cache 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "4d06ff7ff06f729ce5f4e227876cb88d10bc59cd4ae1e09fbb2bde15c850dc21" +"checksum memchr 2.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = 
"0a3eb002f0535929f1199681417029ebea04aadc0c7a4224b46be99c7f5d6a16" +"checksum memoffset 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "0f9dc261e2b62d7a622bf416ea3c5245cdd5d9a7fcc428c0d06804dfce1775b3" +"checksum nodrop 0.1.13 (registry+https://github.com/rust-lang/crates.io-index)" = "2f9667ddcc6cc8a43afc9b7917599d7216aa09c463919ea32c59ed6cac8bc945" +"checksum owning_ref 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "49a4b8ea2179e6a2e27411d3bca09ca6dd630821cf6894c6c7c8467a8ee7ef13" +"checksum parking_lot 0.6.4 (registry+https://github.com/rust-lang/crates.io-index)" = "f0802bff09003b291ba756dc7e79313e51cc31667e94afbe847def490424cde5" +"checksum parking_lot_core 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)" = "ad7f7e6ebdc79edff6fdcb87a55b620174f7a989e3eb31b65231f4af57f00b8c" +"checksum pkg-config 0.3.14 (registry+https://github.com/rust-lang/crates.io-index)" = "676e8eb2b1b4c9043511a9b7bea0915320d7e502b0a079fb03f9635a5252b18c" +"checksum proc-macro2 0.4.24 (registry+https://github.com/rust-lang/crates.io-index)" = "77619697826f31a02ae974457af0b29b723e5619e113e9397b8b82c6bd253f09" +"checksum quote 0.6.10 (registry+https://github.com/rust-lang/crates.io-index)" = "53fa22a1994bd0f9372d7a816207d8a2677ad0325b073f5c5332760f0fb62b5c" +"checksum rand 0.4.3 (registry+https://github.com/rust-lang/crates.io-index)" = "8356f47b32624fef5b3301c1be97e5944ecdd595409cc5da11d05f211db6cfbd" +"checksum rand 0.5.5 (registry+https://github.com/rust-lang/crates.io-index)" = "e464cd887e869cddcae8792a4ee31d23c7edd516700695608f5b98c67ee0131c" +"checksum rand_core 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "1961a422c4d189dfb50ffa9320bf1f2a9bd54ecb92792fb9477f99a1045f3372" +"checksum rand_core 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "0905b6b7079ec73b314d4c748701f6931eb79fd97c668caa3f1899b22b32c6db" +"checksum redox_syscall 0.1.43 (registry+https://github.com/rust-lang/crates.io-index)" = 
"679da7508e9a6390aeaf7fbd02a800fdc64b73fe2204dd2c8ae66d22d9d5ad5d" +"checksum redox_termios 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "7e891cfe48e9100a70a3b6eb652fef28920c117d366339687bd5576160db0f76" +"checksum redox_users 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "214a97e49be64fd2c86f568dd0cb2c757d2cc53de95b273b6ad0a1c908482f26" +"checksum regex 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "37e7cbbd370869ce2e8dff25c7018702d10b21a20ef7135316f8daecd6c25b7f" +"checksum regex-syntax 0.6.4 (registry+https://github.com/rust-lang/crates.io-index)" = "4e47a2ed29da7a9e1960e1639e7a982e6edc6d49be308a3b02daf511504a16d1" +"checksum rusqlite 0.14.0 (registry+https://github.com/rust-lang/crates.io-index)" = "c9d9118f1ce84d8d0b67f9779936432fb42bb620cef2122409d786892cce9a3c" +"checksum rustc-demangle 0.1.9 (registry+https://github.com/rust-lang/crates.io-index)" = "bcfe5b13211b4d78e5c2cadfebd7769197d95c639c35a50057eb4c05de811395" +"checksum rustc_version 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)" = "138e3e0acb6c9fb258b19b67cb8abd63c00679d2851805ea151465464fe9030a" +"checksum ryu 0.2.7 (registry+https://github.com/rust-lang/crates.io-index)" = "eb9e9b8cde282a9fe6a42dd4681319bfb63f121b8a8ee9439c6f4107e58a46f7" +"checksum same-file 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)" = "8f20c4be53a8a1ff4c1f1b2bd14570d2f634628709752f0702ecdd2b3f9a5267" +"checksum scoped_threadpool 0.1.9 (registry+https://github.com/rust-lang/crates.io-index)" = "1d51f5df5af43ab3f1360b429fa5e0152ac5ce8c0bd6485cae490332e96846a8" +"checksum scopeguard 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "94258f53601af11e6a49f722422f6e3425c52b06245a5cf9bc09908b174f5e27" +"checksum semver 0.9.0 (registry+https://github.com/rust-lang/crates.io-index)" = "1d7eb9ef2c18661902cc47e535f9bc51b78acd254da71d375c2f6720d9a40403" +"checksum semver-parser 0.7.0 
(registry+https://github.com/rust-lang/crates.io-index)" = "388a1df253eca08550bef6c72392cfe7c30914bf41df5269b68cbd6ff8f570a3" +"checksum serde 1.0.80 (registry+https://github.com/rust-lang/crates.io-index)" = "15c141fc7027dd265a47c090bf864cf62b42c4d228bbcf4e51a0c9e2b0d3f7ef" +"checksum serde_derive 1.0.80 (registry+https://github.com/rust-lang/crates.io-index)" = "225de307c6302bec3898c51ca302fc94a7a1697ef0845fcee6448f33c032249c" +"checksum serde_json 1.0.33 (registry+https://github.com/rust-lang/crates.io-index)" = "c37ccd6be3ed1fdf419ee848f7c758eb31b054d7cd3ae3600e3bae0adf569811" +"checksum smallvec 0.6.7 (registry+https://github.com/rust-lang/crates.io-index)" = "b73ea3738b47563803ef814925e69be00799a8c07420be8b996f8e98fb2336db" +"checksum stable_deref_trait 1.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "dba1a27d3efae4351c8051072d619e3ade2820635c3958d826bfea39d59b54c8" +"checksum strsim 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)" = "bb4f380125926a99e52bc279241539c018323fab05ad6368b56f93d9369ff550" +"checksum syn 0.15.22 (registry+https://github.com/rust-lang/crates.io-index)" = "ae8b29eb5210bc5cf63ed6149cbf9adfc82ac0be023d8735c176ee74a2db4da7" +"checksum synstructure 0.10.1 (registry+https://github.com/rust-lang/crates.io-index)" = "73687139bf99285483c96ac0add482c3776528beac1d97d444f6e91f203a2015" +"checksum termion 1.5.1 (registry+https://github.com/rust-lang/crates.io-index)" = "689a3bdfaab439fd92bc87df5c4c78417d3cbe537487274e9b0b2dce76e92096" +"checksum textwrap 0.10.0 (registry+https://github.com/rust-lang/crates.io-index)" = "307686869c93e71f94da64286f9a9524c0f308a9e1c87a583de8e9c9039ad3f6" +"checksum thread_local 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)" = "c6b53e329000edc2b34dbe8545fd20e55a333362d0a321909685a19bd28c3f1b" +"checksum time 0.1.40 (registry+https://github.com/rust-lang/crates.io-index)" = "d825be0eb33fda1a7e68012d51e9c7f451dc1a69391e7fdc197060bb8c56667b" +"checksum tree-sitter 0.3.3 
(registry+https://github.com/rust-lang/crates.io-index)" = "311adf1e004ac816285a1196c93ea36364857c3adc37ffc9fd5ed0d70545391a" +"checksum ucd-util 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)" = "535c204ee4d8434478593480b8f86ab45ec9aae0e83c568ca81abf0fd0e88f86" +"checksum unicode-width 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "882386231c45df4700b275c7ff55b6f3698780a650026380e72dabe76fa46526" +"checksum unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "fc72304796d0818e357ead4e000d19c9c174ab23dc11093ac919054d20a6a7fc" +"checksum unreachable 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "382810877fe448991dfc7f0dd6e3ae5d58088fd0ea5e35189655f84e6814fa56" +"checksum utf8-ranges 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)" = "796f7e48bef87609f7ade7e06495a87d5cd06c7866e6a5cbfceffc558a243737" +"checksum vcpkg 0.2.6 (registry+https://github.com/rust-lang/crates.io-index)" = "def296d3eb3b12371b2c7d0e83bfe1403e4db2d7a0bba324a12b21c4ee13143d" +"checksum vec_map 0.8.1 (registry+https://github.com/rust-lang/crates.io-index)" = "05c78687fb1a80548ae3250346c3db86a80a7cdd77bda190189f2d0a0987c81a" +"checksum version_check 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "914b1a6776c4c929a602fafd8bc742e06365d4bcbe48c30f9cca5824f70dc9dd" +"checksum void 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)" = "6a02e4885ed3bc0f2de90ea6dd45ebcbb66dacffe03547fadbb0eeae2770887d" +"checksum walkdir 2.2.7 (registry+https://github.com/rust-lang/crates.io-index)" = "9d9d7ed3431229a144296213105a390676cc49c9b6a72bd19f3176c98e129fa1" +"checksum winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)" = "92c1eb33641e276cfa214a0522acad57be5c56b10cb348b3c5117db75f3ac4b0" +"checksum winapi-i686-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" +"checksum winapi-util 
0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "afc5508759c5bf4285e61feb862b6083c8480aec864fa17a81fdec6f69b461ab" +"checksum winapi-x86_64-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 00000000..965cc81e --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,17 @@ +[package] +name = "rust-tree-sitter-cli" +version = "0.1.0" +authors = ["Max Brunsfeld <maxbrunsfeld@gmail.com>"] +edition = "2018" + +[dependencies] +bitvec = "0.8" +clap = "2.32" +dirs = "1.0.2" +ignore = "0.4.4" +libloading = "0.5" +rusqlite = "0.14.0" +serde = "1.0" +serde_derive = "1.0" +serde_json = "1.0" +tree-sitter = "0.3.1" diff --git a/src/build_tables/item.rs b/src/build_tables/item.rs new file mode 100644 index 00000000..c8d30997 --- /dev/null +++ b/src/build_tables/item.rs @@ -0,0 +1,22 @@ +use crate::grammars::Production; +use std::collections::HashMap; +use bitvec::BitVec; + +#[derive(Debug, PartialEq, Eq)] +pub(super) struct LookaheadSet { + terminal_bits: BitVec, + external_bits: BitVec, + eof: bool, +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] +pub(super) struct ParseItem { + variable_index: u32, + production_index: u32, + step_index: u32, +} + +#[derive(Debug, PartialEq, Eq)] +pub(super) struct ParseItemSet { + entries: HashMap<ParseItem, LookaheadSet> +} diff --git a/src/build_tables/mod.rs b/src/build_tables/mod.rs new file mode 100644 index 00000000..c5dd5b54 --- /dev/null +++ b/src/build_tables/mod.rs @@ -0,0 +1,34 @@ +mod item; + +use std::collections::{HashMap, VecDeque}; +use crate::grammars::{SyntaxGrammar, LexicalGrammar}; +use crate::tables::{ParseTable, LexTable, ParseStateId}; +use crate::rules::{AliasMap, Symbol}; +use crate::error::Result; +use self::item::ParseItemSet; + +type SymbolSequence = Vec<Symbol>; + +struct ParseStateQueueEntry { + preceding_symbols: SymbolSequence, + item_set: ParseItemSet, + state_id: ParseStateId, +} + +struct 
ParseTableBuilder<'a> { + syntax_grammar: &'a SyntaxGrammar, + lexical_grammar: &'a LexicalGrammar, + simple_aliases: &'a AliasMap, + state_ids_by_item_set: HashMap<ParseItemSet, ParseStateId>, + item_sets_by_state_id: Vec<&'a ParseItemSet>, + parse_state_queue: VecDeque<ParseStateQueueEntry>, + parse_table: ParseTable, +} + +pub fn build_tables( + syntax_grammar: &SyntaxGrammar, + lexical_grammar: &LexicalGrammar, + simple_aliases: &AliasMap +) -> Result<(ParseTable, LexTable, LexTable, Option<Symbol>)> { + unimplemented!(); +} diff --git a/src/error.rs b/src/error.rs new file mode 100644 index 00000000..90e7b8f9 --- /dev/null +++ b/src/error.rs @@ -0,0 +1,13 @@ +#[derive(Debug)] +pub enum Error { + GrammarError(String), + SymbolError(String), +} + +pub type Result<T> = std::result::Result<T, Error>; + +impl From<serde_json::Error> for Error { + fn from(error: serde_json::Error) -> Self { + Error::GrammarError(error.to_string()) + } +} diff --git a/src/generate.rs b/src/generate.rs new file mode 100644 index 00000000..4507fb6f --- /dev/null +++ b/src/generate.rs @@ -0,0 +1,26 @@ +use crate::error::Result; +use crate::parse_grammar::parse_grammar; +use crate::prepare_grammar::prepare_grammar; +use crate::build_tables::build_tables; +use crate::render::render_c_code; + +pub fn generate_parser_for_grammar(input: String) -> Result<String> { + let input_grammar = parse_grammar(&input)?; + let (syntax_grammar, lexical_grammar, simple_aliases) = prepare_grammar(&input_grammar)?; + let (parse_table, main_lex_table, keyword_lex_table, keyword_capture_token) = build_tables( + &syntax_grammar, + &lexical_grammar, + &simple_aliases + )?; + let c_code = render_c_code( + &input_grammar.name, + parse_table, + main_lex_table, + keyword_lex_table, + keyword_capture_token, + syntax_grammar, + lexical_grammar, + simple_aliases + ); + Ok(c_code) +} diff --git a/src/grammars.rs b/src/grammars.rs new file mode 100644 index 00000000..6f5b772e --- /dev/null +++ b/src/grammars.rs @@ -0,0 +1,98 @@ +use crate::rules::{Associativity, Alias, Rule, Symbol}; + +#[derive(Clone, Copy, Debug, 
PartialEq, Eq)] +pub enum VariableType { + Hidden, + Auxiliary, + Anonymous, + Named +} + +// Input grammar + +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct InputVariable { + pub name: String, + pub kind: VariableType, + pub rule: Rule, +} + +#[derive(PartialEq, Eq)] +pub struct InputGrammar { + pub name: String, + pub variables: Vec<InputVariable>, + pub extra_tokens: Vec<Rule>, + pub expected_conflicts: Vec<Vec<String>>, + pub external_tokens: Vec<Rule>, + pub variables_to_inline: Vec<String>, + pub word_token: Option<String>, +} + +// Extracted lexical grammar + +#[derive(PartialEq, Eq)] +pub struct LexicalVariable { + name: String, + kind: VariableType, + rule: Rule, + is_string: bool, +} + +pub struct LexicalGrammar { + variables: Vec<LexicalVariable>, + separators: Vec<Rule>, +} + +// Extracted syntax grammar + +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct ProductionStep { + symbol: Symbol, + precedence: i32, + associativity: Option<Associativity>, + alias: Option<Alias>, + is_excluded: bool, +} + +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct Production { + steps: Vec<ProductionStep>, + dynamic_precedence: i32, +} + +#[derive(Clone, PartialEq, Eq)] +pub struct SyntaxVariable { + name: String, + kind: VariableType, +} + +#[derive(Clone, PartialEq, Eq)] +pub struct ExternalToken { + name: String, + kind: VariableType, + corresponding_internal_token: Symbol, +} + +pub struct SyntaxGrammar { + variables: Vec<SyntaxVariable>, + extra_tokens: Vec<Symbol>, + expected_conflicts: Vec<Vec<Symbol>>, + external_tokens: Vec<ExternalToken>, + variables_to_inline: Vec<Symbol>, + word_token: Symbol, +} + +#[cfg(test)] +impl InputVariable { + pub fn named(name: &str, rule: Rule) -> Self { + Self { name: name.to_string(), kind: VariableType::Named, rule } + } + + pub fn auxiliary(name: &str, rule: Rule) -> Self { + Self { name: name.to_string(), kind: VariableType::Auxiliary, rule } + } + + pub fn hidden(name: &str, rule: Rule) -> Self { + Self { name: name.to_string(), kind: VariableType::Hidden, rule } + } +} diff --git a/src/main.rs b/src/main.rs new file mode 100644 index 00000000..3eeb306a --- /dev/null +++ b/src/main.rs @@ 
-0,0 +1,35 @@ +use clap::{App, Arg, SubCommand}; + +#[macro_use] extern crate serde_derive; +#[macro_use] extern crate serde_json; + +mod build_tables; +mod error; +mod generate; +mod grammars; +mod parse_grammar; +mod prepare_grammar; +mod render; +mod rules; +mod tables; + +fn main() { + let matches = App::new("tree-sitter") + .version("0.1") + .author("Max Brunsfeld <maxbrunsfeld@gmail.com>") + .about("Generates and tests parsers") + .subcommand( + SubCommand::with_name("generate") + .about("Generate a parser") + ).subcommand( + SubCommand::with_name("parse") + .about("Parse a file") + .arg(Arg::with_name("path").index(1)) + ).subcommand( + SubCommand::with_name("test") + .about("Run a parser's tests") + .arg(Arg::with_name("path").index(1).required(true)) + .arg(Arg::with_name("line").index(2).required(true)) + .arg(Arg::with_name("column").index(3).required(true)) + ); +} diff --git a/src/parse_grammar.rs b/src/parse_grammar.rs new file mode 100644 index 00000000..4c21e5ba --- /dev/null +++ b/src/parse_grammar.rs @@ -0,0 +1,153 @@ +use serde_json::{Map, Value}; +use crate::error::Result; +use crate::grammars::{InputGrammar, InputVariable, VariableType}; +use crate::rules::Rule; +use std::collections::HashMap; + +#[derive(Deserialize)] +#[serde(tag = "type")] +#[allow(non_camel_case_types)] +pub enum RuleJSON { + BLANK, + STRING { + value: String, + }, + PATTERN { + value: String, + }, + SYMBOL { + name: String, + }, + CHOICE { + members: Vec<RuleJSON>, + }, + SEQ { + members: Vec<RuleJSON>, + }, + REPEAT { + content: Box<RuleJSON>, + }, + PREC_LEFT { + value: i32, + content: Box<RuleJSON>, + }, + PREC_RIGHT { + value: i32, + content: Box<RuleJSON>, + }, + PREC { + value: i32, + content: Box<RuleJSON>, + }, + TOKEN { + content: Box<RuleJSON>, + }, + TOKEN_IMMEDIATE { + content: Box<RuleJSON>, + }, +} + +#[derive(Deserialize)] +struct GrammarJSON { + name: String, + rules: Map<String, Value>, + conflicts: Option<Vec<Vec<String>>>, + externals: Option<Vec<RuleJSON>>, + extras: Option<Vec<RuleJSON>>, + inline: Option<Vec<String>>, + word: Option<String>, +} + +pub fn parse_grammar(input: &str) -> Result<InputGrammar> { + let grammar_json: GrammarJSON = 
serde_json::from_str(&input)?; + + let mut variables = Vec::with_capacity(grammar_json.rules.len()); + for (name, value) in grammar_json.rules { + variables.push(InputVariable { + name: name.to_owned(), + kind: VariableType::Named, + rule: parse_rule(serde_json::from_value(value)?), + }) + } + + let extra_tokens = grammar_json.extras + .unwrap_or(Vec::new()) + .into_iter() + .map(parse_rule) + .collect(); + let external_tokens = grammar_json.externals + .unwrap_or(Vec::new()) + .into_iter() + .map(parse_rule) + .collect(); + let expected_conflicts = grammar_json.conflicts + .unwrap_or(Vec::new()); + let variables_to_inline = grammar_json.inline + .unwrap_or(Vec::new()); + + Ok(InputGrammar { + name: grammar_json.name, + word_token: grammar_json.word, + variables, + extra_tokens, + expected_conflicts, + external_tokens, + variables_to_inline, + }) +} + +fn parse_rule(json: RuleJSON) -> Rule { + match json { + RuleJSON::BLANK => Rule::Blank, + RuleJSON::STRING { value } => Rule::String(value), + RuleJSON::PATTERN { value } => Rule::Pattern(value), + RuleJSON::SYMBOL { name } => Rule::NamedSymbol(name), + RuleJSON::CHOICE { members } => Rule::choice(members.into_iter().map(parse_rule).collect()), + RuleJSON::SEQ { members } => Rule::seq(members.into_iter().map(parse_rule).collect()), + RuleJSON::REPEAT { content } => Rule::repeat(parse_rule(*content)), + RuleJSON::PREC { value, content } => Rule::prec(value, parse_rule(*content)), + RuleJSON::PREC_LEFT { value, content } => Rule::prec_left(value, parse_rule(*content)), + RuleJSON::PREC_RIGHT { value, content } => Rule::prec_right(value, parse_rule(*content)), + RuleJSON::TOKEN { content } => Rule::token(parse_rule(*content)), + RuleJSON::TOKEN_IMMEDIATE { content } => Rule::immediate_token(parse_rule(*content)), + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_parse_grammar() { + let grammar = parse_grammar(&json!({ + "name": "my_lang", + "rules": { + "file": { + "type": "REPEAT", + "content": 
{ + "type": "SYMBOL", + "name": "statement" + } + }, + "statement": { + "type": "STRING", + "value": "foo" + } + } + }).to_string()).unwrap(); + + assert_eq!(grammar.name, "my_lang"); + assert_eq!(grammar.variables, vec![ + InputVariable { + name: "file".to_string(), + kind: VariableType::Named, + rule: Rule::repeat(Rule::NamedSymbol("statement".to_string())) + }, + InputVariable { + name: "statement".to_string(), + kind: VariableType::Named, + rule: Rule::String("foo".to_string()) + }, + ]); + } +} diff --git a/src/prepare_grammar/expand_repeats.rs b/src/prepare_grammar/expand_repeats.rs new file mode 100644 index 00000000..69db150c --- /dev/null +++ b/src/prepare_grammar/expand_repeats.rs @@ -0,0 +1,220 @@ +use crate::rules::{Rule, Symbol}; +use crate::grammars::{InputVariable, VariableType}; +use std::collections::HashMap; +use std::mem; +use std::rc::Rc; +use super::ExtractedGrammar; + +struct Expander { + variable_name: String, + repeat_count_in_variable: usize, + preceding_symbol_count: usize, + auxiliary_variables: Vec, + existing_repeats: HashMap +} + +impl Expander { + fn expand_variable(&mut self, variable: &mut InputVariable) { + self.variable_name.clear(); + self.variable_name.push_str(&variable.name); + self.repeat_count_in_variable = 0; + let mut rule = Rule::Blank; + mem::swap(&mut rule, &mut variable.rule); + variable.rule = self.expand_rule(&rule); + } + + fn expand_rule(&mut self, rule: &Rule) -> Rule { + match rule { + Rule::Choice { elements } => + Rule::Choice { + elements: elements.iter().map(|element| self.expand_rule(element)).collect() + }, + + Rule::Seq { left, right } => + Rule::Seq { + left: Rc::new(self.expand_rule(left)), + right: Rc::new(self.expand_rule(right)), + }, + + Rule::Repeat(content) => { + let inner_rule = self.expand_rule(content); + + if let Some(existing_symbol) = self.existing_repeats.get(&inner_rule) { + return Rule::Symbol(*existing_symbol); + } + + self.repeat_count_in_variable += 1; + let rule_name = 
format!("{}_repeat{}", self.variable_name, self.repeat_count_in_variable); + let repeat_symbol = Symbol::non_terminal(self.preceding_symbol_count + self.auxiliary_variables.len()); + let rc_symbol = Rc::new(Rule::Symbol(repeat_symbol)); + self.existing_repeats.insert(inner_rule.clone(), repeat_symbol); + self.auxiliary_variables.push(InputVariable { + name: rule_name, + kind: VariableType::Auxiliary, + rule: Rule::Choice { + elements: vec![ + Rule::Seq { + left: rc_symbol.clone(), + right: rc_symbol + }, + inner_rule + ], + }, + }); + + Rule::Symbol(repeat_symbol) + } + + Rule::Metadata { rule, params } => Rule::Metadata { + rule: Rc::new(self.expand_rule(rule)), + params: params.clone() + }, + + _ => rule.clone() + } + } +} + +pub(super) fn expand_repeats(mut grammar: ExtractedGrammar) -> ExtractedGrammar { + let mut expander = Expander { + variable_name: String::new(), + repeat_count_in_variable: 0, + preceding_symbol_count: grammar.variables.len(), + auxiliary_variables: Vec::new(), + existing_repeats: HashMap::new(), + }; + + for mut variable in grammar.variables.iter_mut() { + expander.expand_variable(&mut variable); + } + + grammar.variables.extend(expander.auxiliary_variables.into_iter()); + grammar +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_basic_repeat_expansion() { + // Repeats nested inside of sequences and choices are expanded. 
+ let grammar = expand_repeats(build_grammar(vec![ + InputVariable::named("rule0", Rule::seq(vec![ + Rule::terminal(10), + Rule::choice(vec![ + Rule::repeat(Rule::terminal(11)), + Rule::repeat(Rule::terminal(12)), + ]), + Rule::terminal(13), + ])), + ])); + + assert_eq!(grammar.variables, vec![ + InputVariable::named("rule0", Rule::seq(vec![ + Rule::terminal(10), + Rule::choice(vec![ + Rule::non_terminal(1), + Rule::non_terminal(2), + ]), + Rule::terminal(13), + ])), + InputVariable::auxiliary("rule0_repeat1", Rule::choice(vec![ + Rule::seq(vec![ + Rule::non_terminal(1), + Rule::non_terminal(1), + ]), + Rule::terminal(11), + ])), + InputVariable::auxiliary("rule0_repeat2", Rule::choice(vec![ + Rule::seq(vec![ + Rule::non_terminal(2), + Rule::non_terminal(2), + ]), + Rule::terminal(12), + ])), + ]); + } + + #[test] + fn test_repeat_deduplication() { + // Terminal 4 appears inside of a repeat in three different places. + let grammar = expand_repeats(build_grammar(vec![ + InputVariable::named("rule0", Rule::choice(vec![ + Rule::seq(vec![ Rule::terminal(1), Rule::repeat(Rule::terminal(4)) ]), + Rule::seq(vec![ Rule::terminal(2), Rule::repeat(Rule::terminal(4)) ]), + ])), + InputVariable::named("rule1", Rule::seq(vec![ + Rule::terminal(3), + Rule::repeat(Rule::terminal(4)), + ])), + ])); + + // Only one auxiliary rule is created for repeating terminal 4. 
+ assert_eq!(grammar.variables, vec![ + InputVariable::named("rule0", Rule::choice(vec![ + Rule::seq(vec![ Rule::terminal(1), Rule::non_terminal(2) ]), + Rule::seq(vec![ Rule::terminal(2), Rule::non_terminal(2) ]), + ])), + InputVariable::named("rule1", Rule::seq(vec![ + Rule::terminal(3), + Rule::non_terminal(2), + ])), + InputVariable::auxiliary("rule0_repeat1", Rule::choice(vec![ + Rule::seq(vec![ + Rule::non_terminal(2), + Rule::non_terminal(2), + ]), + Rule::terminal(4), + ])) + ]); + } + + #[test] + fn test_expansion_of_nested_repeats() { + let grammar = expand_repeats(build_grammar(vec![ + InputVariable::named("rule0", Rule::seq(vec![ + Rule::terminal(10), + Rule::repeat(Rule::seq(vec![ + Rule::terminal(11), + Rule::repeat(Rule::terminal(12)) + ])), + ])), + ])); + + assert_eq!(grammar.variables, vec![ + InputVariable::named("rule0", Rule::seq(vec![ + Rule::terminal(10), + Rule::non_terminal(2), + ])), + InputVariable::auxiliary("rule0_repeat1", Rule::choice(vec![ + Rule::seq(vec![ + Rule::non_terminal(1), + Rule::non_terminal(1), + ]), + Rule::terminal(12), + ])), + InputVariable::auxiliary("rule0_repeat2", Rule::choice(vec![ + Rule::seq(vec![ + Rule::non_terminal(2), + Rule::non_terminal(2), + ]), + Rule::seq(vec![ + Rule::terminal(11), + Rule::non_terminal(1), + ]), + ])), + ]); + } + + fn build_grammar(variables: Vec) -> ExtractedGrammar { + ExtractedGrammar { + variables, + extra_tokens: Vec::new(), + external_tokens: Vec::new(), + expected_conflicts: Vec::new(), + variables_to_inline: Vec::new(), + word_token: None, + } + } +} diff --git a/src/prepare_grammar/extract_simple_aliases.rs b/src/prepare_grammar/extract_simple_aliases.rs new file mode 100644 index 00000000..250246f3 --- /dev/null +++ b/src/prepare_grammar/extract_simple_aliases.rs @@ -0,0 +1,10 @@ +use crate::rules::AliasMap; +use crate::grammars::{LexicalGrammar, SyntaxGrammar}; +use super::ExtractedGrammar; + +pub(super) fn extract_simple_aliases( + syntax_grammar: &mut SyntaxGrammar, + 
lexical_grammar: &mut LexicalGrammar +) -> AliasMap { + unimplemented!(); +} diff --git a/src/prepare_grammar/extract_tokens.rs b/src/prepare_grammar/extract_tokens.rs new file mode 100644 index 00000000..660d3819 --- /dev/null +++ b/src/prepare_grammar/extract_tokens.rs @@ -0,0 +1,7 @@ +use crate::error::Result; +use crate::grammars::LexicalGrammar; +use super::{InternedGrammar, ExtractedGrammar}; + +pub(super) fn extract_tokens(grammar: InternedGrammar) -> Result<(ExtractedGrammar, LexicalGrammar)> { + unimplemented!(); +} diff --git a/src/prepare_grammar/flatten_grammar.rs b/src/prepare_grammar/flatten_grammar.rs new file mode 100644 index 00000000..36fe76c9 --- /dev/null +++ b/src/prepare_grammar/flatten_grammar.rs @@ -0,0 +1,7 @@ +use crate::error::Result; +use crate::grammars::SyntaxGrammar; +use super::ExtractedGrammar; + +pub(super) fn flatten_grammar(grammar: ExtractedGrammar) -> Result { + unimplemented!(); +} diff --git a/src/prepare_grammar/intern_symbols.rs b/src/prepare_grammar/intern_symbols.rs new file mode 100644 index 00000000..00a5c330 --- /dev/null +++ b/src/prepare_grammar/intern_symbols.rs @@ -0,0 +1,237 @@ +use crate::error::{Error, Result}; +use crate::rules::{Rule, Symbol}; +use crate::grammars::{InputGrammar, InputVariable, VariableType}; +use std::rc::Rc; +use super::InternedGrammar; + +pub(super) fn intern_symbols(grammar: &InputGrammar) -> Result { + let interner = Interner { grammar }; + + if variable_type_for_name(&grammar.variables[0].name) == VariableType::Hidden { + return Err(Error::GrammarError("Grammar's start rule must be visible".to_string())); + } + + let mut variables = Vec::with_capacity(grammar.variables.len()); + for variable in grammar.variables.iter() { + variables.push(InputVariable { + name: variable.name.clone(), + kind: variable_type_for_name(&variable.name), + rule: interner.intern_rule(&variable.rule)?, + }); + } + + let mut external_tokens = Vec::with_capacity(grammar.external_tokens.len()); + for external_token 
in grammar.external_tokens.iter() { + let rule = interner.intern_rule(&external_token)?; + let (name, kind) = if let Rule::NamedSymbol(name) = external_token { + (name.clone(), variable_type_for_name(&name)) + } else { + (String::new(), VariableType::Anonymous) + }; + external_tokens.push(InputVariable { name, kind, rule }); + } + + let mut extra_tokens = Vec::with_capacity(grammar.extra_tokens.len()); + for extra_token in grammar.extra_tokens.iter() { + extra_tokens.push(interner.intern_rule(extra_token)?); + } + + let mut expected_conflicts = Vec::new(); + for conflict in grammar.expected_conflicts.iter() { + let mut interned_conflict = Vec::with_capacity(conflict.len()); + for name in conflict { + interned_conflict.push(interner + .intern_name(&name) + .ok_or_else(|| symbol_error(name))? + ); + } + expected_conflicts.push(interned_conflict); + } + + let mut variables_to_inline = Vec::new(); + for name in grammar.variables_to_inline.iter() { + if let Some(symbol) = interner.intern_name(&name) { + variables_to_inline.push(symbol); + } + } + + let mut word_token = None; + if let Some(name) = grammar.word_token.as_ref() { + word_token = Some(interner + .intern_name(&name) + .ok_or_else(|| symbol_error(&name))? 
+ ); + } + + Ok(InternedGrammar { + variables, + external_tokens, + extra_tokens, + expected_conflicts, + variables_to_inline, + word_token, + }) +} + +struct Interner<'a> { + grammar: &'a InputGrammar +} + +impl<'a> Interner<'a> { + fn intern_rule(&self, rule: &Rule) -> Result { + match rule { + Rule::Choice { elements } => { + let mut result = Vec::with_capacity(elements.len()); + for element in elements { + result.push(self.intern_rule(element)?); + } + Ok(Rule::Choice { elements: result }) + }, + + Rule::Seq { left, right } => + Ok(Rule::Seq { + left: Rc::new(self.intern_rule(left)?), + right: Rc::new(self.intern_rule(right)?), + }), + + Rule::Repeat(content) => + Ok(Rule::Repeat(Rc::new(self.intern_rule(content)?))), + + Rule::Metadata { rule, params } => + Ok(Rule::Metadata { + rule: Rc::new(self.intern_rule(rule)?), + params: params.clone() + }), + + Rule::NamedSymbol(name) => { + if let Some(symbol) = self.intern_name(&name) { + Ok(Rule::Symbol(symbol)) + } else { + Err(symbol_error(name)) + } + }, + + _ => Ok(rule.clone()) + + } + } + + fn intern_name(&self, symbol: &str) -> Option { + for (i, variable) in self.grammar.variables.iter().enumerate() { + if variable.name == symbol { + return Some(Symbol::non_terminal(i)) + } + } + + for (i, external_token) in self.grammar.external_tokens.iter().enumerate() { + if let Rule::NamedSymbol(name) = external_token { + if name == symbol { + return Some(Symbol::external(i)) + } + } + } + + return None + } +} + +fn symbol_error(name: &str) -> Error { + Error::SymbolError(format!("Undefined symbol '{}'", name)) +} + +fn variable_type_for_name(name: &str) -> VariableType { + if name.starts_with("_") { + VariableType::Hidden + } else { + VariableType::Named + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_basic_repeat_expansion() { + let grammar = intern_symbols(&build_grammar(vec![ + InputVariable::named("x", Rule::choice(vec![ + Rule::named("y"), + Rule::named("_z"), + ])), + 
InputVariable::named("y", Rule::named("_z")), + InputVariable::named("_z", Rule::string("a")), + ])).unwrap(); + + assert_eq!(grammar.variables, vec![ + InputVariable::named("x", Rule::choice(vec![ + Rule::non_terminal(1), + Rule::non_terminal(2), + ])), + InputVariable::named("y", Rule::non_terminal(2)), + InputVariable::hidden("_z", Rule::string("a")), + ]); + } + + #[test] + fn test_interning_external_token_names() { + // Variable `y` is both an internal and an external token. + // Variable `z` is just an external token. + let mut input_grammar = build_grammar(vec![ + InputVariable::named("w", Rule::choice(vec![ + Rule::named("x"), + Rule::named("y"), + Rule::named("z"), + ])), + InputVariable::named("x", Rule::string("a")), + InputVariable::named("y", Rule::string("b")), + ]); + input_grammar.external_tokens.extend(vec![ + Rule::named("y"), + Rule::named("z"), + ]); + + let grammar = intern_symbols(&input_grammar).unwrap(); + + // Variable `y` is referred to by its internal index. + // Variable `z` is referred to by its external index. + assert_eq!(grammar.variables, vec![ + InputVariable::named("w", Rule::choice(vec![ + Rule::non_terminal(1), + Rule::non_terminal(2), + Rule::external(1), + ])), + InputVariable::named("x", Rule::string("a")), + InputVariable::named("y", Rule::string("b")), + ]); + + // The external token for `y` refers back to its internal index. 
+ assert_eq!(grammar.external_tokens, vec![ + InputVariable::named("y", Rule::non_terminal(2)), + InputVariable::named("z", Rule::external(1)), + ]); + } + + #[test] + fn test_grammar_with_undefined_symbols() { + let result = intern_symbols(&build_grammar(vec![ + InputVariable::named("x", Rule::named("y")), + ])); + + match result { + Err(Error::SymbolError(message)) => assert_eq!(message, "Undefined symbol 'y'"), + _ => panic!("Expected an error but got none"), + } + } + + fn build_grammar(variables: Vec) -> InputGrammar { + InputGrammar { + variables, + name: "the_language".to_string(), + extra_tokens: Vec::new(), + external_tokens: Vec::new(), + expected_conflicts: Vec::new(), + variables_to_inline: Vec::new(), + word_token: None, + } + } +} diff --git a/src/prepare_grammar/mod.rs b/src/prepare_grammar/mod.rs new file mode 100644 index 00000000..0788edca --- /dev/null +++ b/src/prepare_grammar/mod.rs @@ -0,0 +1,40 @@ +mod intern_symbols; +mod extract_tokens; +mod expand_repeats; +mod flatten_grammar; +mod normalize_rules; +mod extract_simple_aliases; + +use crate::rules::{AliasMap, Rule, Symbol}; +use crate::grammars::{InputGrammar, SyntaxGrammar, LexicalGrammar, InputVariable, ExternalToken}; +use crate::error::Result; +use self::intern_symbols::intern_symbols; +use self::extract_tokens::extract_tokens; +use self::expand_repeats::expand_repeats; +use self::flatten_grammar::flatten_grammar; +use self::normalize_rules::normalize_rules; +use self::extract_simple_aliases::extract_simple_aliases; + +pub(self) struct IntermediateGrammar { + variables: Vec, + extra_tokens: Vec, + expected_conflicts: Vec>, + external_tokens: Vec, + variables_to_inline: Vec, + word_token: Option, +} + +pub(self) type InternedGrammar = IntermediateGrammar; +pub(self) type ExtractedGrammar = IntermediateGrammar; + +pub fn prepare_grammar( + input_grammar: &InputGrammar +) -> Result<(SyntaxGrammar, LexicalGrammar, AliasMap)> { + let interned_grammar = intern_symbols(input_grammar)?; + let 
(syntax_grammar, lexical_grammar) = extract_tokens(interned_grammar)?; + let syntax_grammar = expand_repeats(syntax_grammar); + let mut syntax_grammar = flatten_grammar(syntax_grammar)?; + let mut lexical_grammar = normalize_rules(lexical_grammar); + let simple_aliases = extract_simple_aliases(&mut syntax_grammar, &mut lexical_grammar); + Ok((syntax_grammar, lexical_grammar, simple_aliases)) +} diff --git a/src/prepare_grammar/normalize_rules.rs b/src/prepare_grammar/normalize_rules.rs new file mode 100644 index 00000000..9e625ef5 --- /dev/null +++ b/src/prepare_grammar/normalize_rules.rs @@ -0,0 +1,5 @@ +use crate::grammars::LexicalGrammar; + +pub(super) fn normalize_rules(grammar: LexicalGrammar) -> LexicalGrammar { + unimplemented!(); +} diff --git a/src/render/mod.rs b/src/render/mod.rs new file mode 100644 index 00000000..85ce1f32 --- /dev/null +++ b/src/render/mod.rs @@ -0,0 +1,16 @@ +use crate::rules::{Symbol, AliasMap}; +use crate::grammars::{SyntaxGrammar, LexicalGrammar}; +use crate::tables::{ParseTable, LexTable}; + +pub fn render_c_code( + name: &str, + parse_table: ParseTable, + main_lex_table: LexTable, + keyword_lex_table: LexTable, + keyword_capture_token: Option, + syntax_grammar: SyntaxGrammar, + lexical_grammar: LexicalGrammar, + simple_aliases: AliasMap, +) -> String { + unimplemented!(); +} diff --git a/src/rules.rs b/src/rules.rs new file mode 100644 index 00000000..3cccca0d --- /dev/null +++ b/src/rules.rs @@ -0,0 +1,205 @@ +use std::rc::Rc; +use std::collections::HashMap; + +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] +pub enum SymbolType { + External, + Terminal, + NonTerminal, +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] +pub enum Associativity { + Left, + Right +} + +#[derive(Clone, Debug, PartialEq, Eq, Hash)] +pub struct Alias { + value: String, + is_named: bool, +} + +pub type AliasMap = HashMap; + +#[derive(Clone, Debug, Default, PartialEq, Eq, Hash)] +pub struct MetadataParams { + precedence: Option, + 
dynamic_precedence: i32, + associativity: Option, + is_token: bool, + is_string: bool, + is_active: bool, + is_main_token: bool, + is_excluded: bool, + alias: Option, +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] +pub struct Symbol { + kind: SymbolType, + index: usize, +} + +#[derive(Clone, Debug, PartialEq, Eq, Hash)] +pub enum Rule { + Blank, + CharacterSet(Vec), + String(String), + Pattern(String), + NamedSymbol(String), + Symbol(Symbol), + Choice { + elements: Vec, + }, + Metadata { + params: MetadataParams, + rule: Rc, + }, + Repeat(Rc), + Seq { + left: Rc, + right: Rc, + } +} + +impl Rule { + pub fn token(content: Rule) -> Self { + add_metadata(content, |params| { + params.is_token = true; + }) + } + + pub fn immediate_token(content: Rule) -> Self { + add_metadata(content, |params| { + params.is_token = true; + params.is_main_token = true; + }) + } + + pub fn prec(value: i32, content: Rule) -> Self { + add_metadata(content, |params| { + params.precedence = Some(value); + }) + } + + pub fn prec_left(value: i32, content: Rule) -> Self { + add_metadata(content, |params| { + params.associativity = Some(Associativity::Left); + params.precedence = Some(value); + }) + } + + pub fn prec_right(value: i32, content: Rule) -> Self { + add_metadata(content, |params| { + params.associativity = Some(Associativity::Right); + params.precedence = Some(value); + }) + } + + pub fn repeat(rule: Rule) -> Self { + Rule::Repeat(Rc::new(rule)) + } + + pub fn choice(rules: Vec) -> Self { + let mut elements = Vec::with_capacity(rules.len()); + for rule in rules { + choice_helper(&mut elements, rule); + } + Rule::Choice { elements } + } + + pub fn seq(rules: Vec) -> Self { + let mut result = Rule::Blank; + for rule in rules { + match rule { + Rule::Blank => continue, + Rule::Metadata { rule, params: _ } => { + if *rule == Rule::Blank { + continue; + } + }, + _ => { + if result == Rule::Blank { + result = rule; + } else { + result = Rule::Seq { + left: Rc::new(result), + 
right: Rc::new(rule), + } + } + } + } + } + result + } + + pub fn terminal(index: usize) -> Self { + Rule::Symbol(Symbol::terminal(index)) + } + + pub fn non_terminal(index: usize) -> Self { + Rule::Symbol(Symbol::non_terminal(index)) + } + + pub fn external(index: usize) -> Self { + Rule::Symbol(Symbol::external(index)) + } + + pub fn named(name: &'static str) -> Self { + Rule::NamedSymbol(name.to_string()) + } + + pub fn string(value: &'static str) -> Self { + Rule::String(value.to_string()) + } +} + +impl Symbol { + pub fn non_terminal(index: usize) -> Self { + Symbol { kind: SymbolType::NonTerminal, index } + } + + pub fn terminal(index: usize) -> Self { + Symbol { kind: SymbolType::Terminal, index } + } + + pub fn external(index: usize) -> Self { + Symbol { kind: SymbolType::External, index } + } +} + +impl From for Rule { + fn from(symbol: Symbol) -> Self { + Rule::Symbol(symbol) + } +} + +fn add_metadata(input: Rule, f: T) -> Rule { + match input { + Rule::Metadata { rule, mut params } => { + f(&mut params); + Rule::Metadata { rule, params } + }, + _ => { + let mut params = MetadataParams::default(); + f(&mut params); + Rule::Metadata { rule: Rc::new(input), params } + } + } +} + +fn choice_helper(result: &mut Vec, rule: Rule) { + match rule { + Rule::Choice {elements} => { + for element in elements { + choice_helper(result, element); + } + }, + _ => { + if !result.contains(&rule) { + result.push(rule); + } + } + } +} diff --git a/src/tables.rs b/src/tables.rs new file mode 100644 index 00000000..10b1e41d --- /dev/null +++ b/src/tables.rs @@ -0,0 +1,77 @@ +use std::collections::HashMap; +use std::ops::Range; +use crate::rules::{Associativity, Symbol, Alias}; + +pub type AliasSequenceId = usize; +pub type ParseStateId = usize; +pub type LexStateId = usize; + +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum ParseActionType { + Error, + Shift, + Reduce, + Accept, + Recover, +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum ParseAction { + 
Accept, + Error, + Shift(ParseStateId), + ShiftExtra, + Recover, + Reduce { + symbol: Symbol, + child_count: usize, + precedence: i32, + dynamic_precedence: i32, + associativity: Option, + alias_sequence_id: Option, + is_repetition: bool, + } +} + +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct ParseTableEntry { + actions: Vec, + reusable: bool, +} + +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct ParseState { + terminal_entries: HashMap, + nonterminal_entries: HashMap +} + +#[derive(Debug, PartialEq, Eq)] +pub struct ParseTable { + states: Vec, + alias_sequences: Vec>, +} + +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct AdvanceAction { + state: LexStateId, + precedence: Range, + in_main_token: bool, +} + +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct AcceptTokenAction { + symbol: Symbol, + precedence: i32, + implicit_precedence: i32, +} + +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct LexState { + advance_actions: HashMap, + accept_action: Option, +} + +#[derive(Debug, PartialEq, Eq)] +pub struct LexTable { + states: Vec, +} From 0688a5edd387e01ca7c83f9bbf2fb732852d2f5d Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 6 Dec 2018 22:11:52 -0800 Subject: [PATCH 050/208] Implement extract_tokens --- src/build_tables/mod.rs | 2 +- src/grammars.rs | 83 +++-- src/parse_grammar.rs | 12 +- src/prepare_grammar/expand_repeats.rs | 36 +- src/prepare_grammar/extract_tokens.rs | 492 +++++++++++++++++++++++++- src/prepare_grammar/intern_symbols.rs | 38 +- src/prepare_grammar/mod.rs | 8 +- src/render/mod.rs | 2 +- src/rules.rs | 52 +-- src/tables.rs | 24 +- 10 files changed, 621 insertions(+), 128 deletions(-) diff --git a/src/build_tables/mod.rs b/src/build_tables/mod.rs index c5dd5b54..c3518428 100644 --- a/src/build_tables/mod.rs +++ b/src/build_tables/mod.rs @@ -25,7 +25,7 @@ struct ParseTableBuilder<'a> { parse_table: ParseTable, } -pub fn build_tables( +pub(crate) fn build_tables( syntax_grammar: &SyntaxGrammar, lexical_grammar: 
&LexicalGrammar, simple_aliases: &AliasMap diff --git a/src/grammars.rs b/src/grammars.rs index 6f5b772e..62910637 100644 --- a/src/grammars.rs +++ b/src/grammars.rs @@ -1,7 +1,7 @@ use crate::rules::{Associativity, Alias, Rule, Symbol}; #[derive(Clone, Copy, Debug, PartialEq, Eq)] -pub enum VariableType { +pub(crate) enum VariableType { Hidden, Auxiliary, Anonymous, @@ -11,16 +11,16 @@ pub enum VariableType { // Input grammar #[derive(Clone, Debug, PartialEq, Eq)] -pub struct InputVariable { +pub(crate) struct Variable { pub name: String, pub kind: VariableType, pub rule: Rule, } -#[derive(PartialEq, Eq)] -pub struct InputGrammar { +#[derive(Debug, PartialEq, Eq)] +pub(crate) struct InputGrammar { pub name: String, - pub variables: Vec, + pub variables: Vec, pub extra_tokens: Vec, pub expected_conflicts: Vec>, pub external_tokens: Vec, @@ -30,60 +30,53 @@ pub struct InputGrammar { // Extracted lexical grammar -#[derive(PartialEq, Eq)] -pub struct LexicalVariable { - name: String, - kind: VariableType, - rule: Rule, - is_string: bool, -} - -pub struct LexicalGrammar { - variables: Vec, - separators: Vec, +#[derive(Debug, PartialEq, Eq)] +pub(crate) struct LexicalGrammar { + pub variables: Vec, + pub separators: Vec, } // Extracted syntax grammar #[derive(Clone, Debug, PartialEq, Eq)] -pub struct ProductionStep { - symbol: Symbol, - precedence: i32, - associativity: Option, - alias: Option, - is_excluded: bool, +pub(crate) struct ProductionStep { + pub symbol: Symbol, + pub precedence: i32, + pub associativity: Option, + pub alias: Option, + pub is_excluded: bool, } #[derive(Clone, Debug, PartialEq, Eq)] -pub struct Production { - steps: Vec, - dynamic_precedence: i32, +pub(crate) struct Production { + pub steps: Vec, + pub dynamic_precedence: i32, } -#[derive(Clone, PartialEq, Eq)] -pub struct SyntaxVariable { - name: String, - kind: VariableType, +#[derive(Clone, Debug, PartialEq, Eq)] +pub(crate) struct SyntaxVariable { + pub name: String, + pub kind: 
VariableType, } -#[derive(Clone, PartialEq, Eq)] -pub struct ExternalToken { - name: String, - kind: VariableType, - corresponding_internal_token: Symbol, +#[derive(Clone, Debug, PartialEq, Eq)] +pub(crate) struct ExternalToken { + pub name: String, + pub kind: VariableType, + pub corresponding_internal_token: Option, } -pub struct SyntaxGrammar { - variables: Vec, - extra_tokens: Vec, - expected_conflicts: Vec>, - external_tokens: Vec, - variables_to_inline: Vec, - word_token: Symbol, +#[derive(Debug)] +pub(crate) struct SyntaxGrammar { + pub variables: Vec, + pub extra_tokens: Vec, + pub expected_conflicts: Vec>, + pub external_tokens: Vec, + pub variables_to_inline: Vec, + pub word_token: Symbol, } -#[cfg(test)] -impl InputVariable { +impl Variable { pub fn named(name: &str, rule: Rule) -> Self { Self { name: name.to_string(), kind: VariableType::Named, rule } } @@ -95,4 +88,8 @@ impl InputVariable { pub fn hidden(name: &str, rule: Rule) -> Self { Self { name: name.to_string(), kind: VariableType::Hidden, rule } } + + pub fn anonymous(name: &str, rule: Rule) -> Self { + Self { name: name.to_string(), kind: VariableType::Anonymous, rule } + } } diff --git a/src/parse_grammar.rs b/src/parse_grammar.rs index 4c21e5ba..0f1f5008 100644 --- a/src/parse_grammar.rs +++ b/src/parse_grammar.rs @@ -1,13 +1,13 @@ use serde_json::{Map, Value}; use crate::error::Result; -use crate::grammars::{InputGrammar, InputVariable, VariableType}; +use crate::grammars::{InputGrammar, Variable, VariableType}; use crate::rules::Rule; use std::collections::HashMap; #[derive(Deserialize)] #[serde(tag = "type")] #[allow(non_camel_case_types)] -pub enum RuleJSON { +enum RuleJSON { BLANK, STRING { value: String, @@ -58,12 +58,12 @@ struct GrammarJSON { word: Option, } -pub fn parse_grammar(input: &str) -> Result { +pub(crate) fn parse_grammar(input: &str) -> Result { let grammar_json: GrammarJSON = serde_json::from_str(&input)?; let mut variables = Vec::with_capacity(grammar_json.rules.len()); 
for (name, value) in grammar_json.rules { - variables.push(InputVariable { + variables.push(Variable { name: name.to_owned(), kind: VariableType::Named, rule: parse_rule(serde_json::from_value(value)?), @@ -138,12 +138,12 @@ mod tests { assert_eq!(grammar.name, "my_lang"); assert_eq!(grammar.variables, vec![ - InputVariable { + Variable { name: "file".to_string(), kind: VariableType::Named, rule: Rule::repeat(Rule::NamedSymbol("statement".to_string())) }, - InputVariable { + Variable { name: "statement".to_string(), kind: VariableType::Named, rule: Rule::String("foo".to_string()) diff --git a/src/prepare_grammar/expand_repeats.rs b/src/prepare_grammar/expand_repeats.rs index 69db150c..dcb8f916 100644 --- a/src/prepare_grammar/expand_repeats.rs +++ b/src/prepare_grammar/expand_repeats.rs @@ -1,5 +1,5 @@ use crate::rules::{Rule, Symbol}; -use crate::grammars::{InputVariable, VariableType}; +use crate::grammars::{Variable, VariableType}; use std::collections::HashMap; use std::mem; use std::rc::Rc; @@ -9,12 +9,12 @@ struct Expander { variable_name: String, repeat_count_in_variable: usize, preceding_symbol_count: usize, - auxiliary_variables: Vec, + auxiliary_variables: Vec, existing_repeats: HashMap } impl Expander { - fn expand_variable(&mut self, variable: &mut InputVariable) { + fn expand_variable(&mut self, variable: &mut Variable) { self.variable_name.clear(); self.variable_name.push_str(&variable.name); self.repeat_count_in_variable = 0; @@ -48,7 +48,7 @@ impl Expander { let repeat_symbol = Symbol::non_terminal(self.preceding_symbol_count + self.auxiliary_variables.len()); let rc_symbol = Rc::new(Rule::Symbol(repeat_symbol)); self.existing_repeats.insert(inner_rule.clone(), repeat_symbol); - self.auxiliary_variables.push(InputVariable { + self.auxiliary_variables.push(Variable { name: rule_name, kind: VariableType::Auxiliary, rule: Rule::Choice { @@ -100,7 +100,7 @@ mod tests { fn test_basic_repeat_expansion() { // Repeats nested inside of sequences and choices 
are expanded. let grammar = expand_repeats(build_grammar(vec![ - InputVariable::named("rule0", Rule::seq(vec![ + Variable::named("rule0", Rule::seq(vec![ Rule::terminal(10), Rule::choice(vec![ Rule::repeat(Rule::terminal(11)), @@ -111,7 +111,7 @@ mod tests { ])); assert_eq!(grammar.variables, vec![ - InputVariable::named("rule0", Rule::seq(vec![ + Variable::named("rule0", Rule::seq(vec![ Rule::terminal(10), Rule::choice(vec![ Rule::non_terminal(1), @@ -119,14 +119,14 @@ mod tests { ]), Rule::terminal(13), ])), - InputVariable::auxiliary("rule0_repeat1", Rule::choice(vec![ + Variable::auxiliary("rule0_repeat1", Rule::choice(vec![ Rule::seq(vec![ Rule::non_terminal(1), Rule::non_terminal(1), ]), Rule::terminal(11), ])), - InputVariable::auxiliary("rule0_repeat2", Rule::choice(vec![ + Variable::auxiliary("rule0_repeat2", Rule::choice(vec![ Rule::seq(vec![ Rule::non_terminal(2), Rule::non_terminal(2), @@ -140,11 +140,11 @@ mod tests { fn test_repeat_deduplication() { // Terminal 4 appears inside of a repeat in three different places. let grammar = expand_repeats(build_grammar(vec![ - InputVariable::named("rule0", Rule::choice(vec![ + Variable::named("rule0", Rule::choice(vec![ Rule::seq(vec![ Rule::terminal(1), Rule::repeat(Rule::terminal(4)) ]), Rule::seq(vec![ Rule::terminal(2), Rule::repeat(Rule::terminal(4)) ]), ])), - InputVariable::named("rule1", Rule::seq(vec![ + Variable::named("rule1", Rule::seq(vec![ Rule::terminal(3), Rule::repeat(Rule::terminal(4)), ])), @@ -152,15 +152,15 @@ mod tests { // Only one auxiliary rule is created for repeating terminal 4. 
assert_eq!(grammar.variables, vec![ - InputVariable::named("rule0", Rule::choice(vec![ + Variable::named("rule0", Rule::choice(vec![ Rule::seq(vec![ Rule::terminal(1), Rule::non_terminal(2) ]), Rule::seq(vec![ Rule::terminal(2), Rule::non_terminal(2) ]), ])), - InputVariable::named("rule1", Rule::seq(vec![ + Variable::named("rule1", Rule::seq(vec![ Rule::terminal(3), Rule::non_terminal(2), ])), - InputVariable::auxiliary("rule0_repeat1", Rule::choice(vec![ + Variable::auxiliary("rule0_repeat1", Rule::choice(vec![ Rule::seq(vec![ Rule::non_terminal(2), Rule::non_terminal(2), @@ -173,7 +173,7 @@ mod tests { #[test] fn test_expansion_of_nested_repeats() { let grammar = expand_repeats(build_grammar(vec![ - InputVariable::named("rule0", Rule::seq(vec![ + Variable::named("rule0", Rule::seq(vec![ Rule::terminal(10), Rule::repeat(Rule::seq(vec![ Rule::terminal(11), @@ -183,18 +183,18 @@ mod tests { ])); assert_eq!(grammar.variables, vec![ - InputVariable::named("rule0", Rule::seq(vec![ + Variable::named("rule0", Rule::seq(vec![ Rule::terminal(10), Rule::non_terminal(2), ])), - InputVariable::auxiliary("rule0_repeat1", Rule::choice(vec![ + Variable::auxiliary("rule0_repeat1", Rule::choice(vec![ Rule::seq(vec![ Rule::non_terminal(1), Rule::non_terminal(1), ]), Rule::terminal(12), ])), - InputVariable::auxiliary("rule0_repeat2", Rule::choice(vec![ + Variable::auxiliary("rule0_repeat2", Rule::choice(vec![ Rule::seq(vec![ Rule::non_terminal(2), Rule::non_terminal(2), @@ -207,7 +207,7 @@ mod tests { ]); } - fn build_grammar(variables: Vec) -> ExtractedGrammar { + fn build_grammar(variables: Vec) -> ExtractedGrammar { ExtractedGrammar { variables, extra_tokens: Vec::new(), diff --git a/src/prepare_grammar/extract_tokens.rs b/src/prepare_grammar/extract_tokens.rs index 660d3819..ee90b3c8 100644 --- a/src/prepare_grammar/extract_tokens.rs +++ b/src/prepare_grammar/extract_tokens.rs @@ -1,7 +1,491 @@ -use crate::error::Result; -use crate::grammars::LexicalGrammar; +use 
std::collections::HashMap; +use std::rc::Rc; +use std::mem; +use crate::error::{Error, Result}; +use crate::rules::{Rule, MetadataParams, Symbol, SymbolType}; +use crate::grammars::{Variable, VariableType, LexicalGrammar, ExternalToken}; use super::{InternedGrammar, ExtractedGrammar}; -pub(super) fn extract_tokens(grammar: InternedGrammar) -> Result<(ExtractedGrammar, LexicalGrammar)> { - unimplemented!(); +pub(super) fn extract_tokens( + mut grammar: InternedGrammar +) -> Result<(ExtractedGrammar, LexicalGrammar)> { + let mut extractor = TokenExtractor { + current_variable_name: String::new(), + current_variable_token_count: 0, + extracted_variables: Vec::new(), + extracted_usage_counts: Vec::new(), + }; + + for mut variable in grammar.variables.iter_mut() { + extractor.extract_tokens_in_variable(&mut variable); + } + + for mut variable in grammar.external_tokens.iter_mut() { + extractor.extract_tokens_in_variable(&mut variable); + } + + let mut lexical_variables = Vec::with_capacity(extractor.extracted_variables.len()); + for variable in extractor.extracted_variables { + lexical_variables.push(Variable { + name: variable.name, + kind: variable.kind, + rule: variable.rule, + }); + } + + // If a variable's entire rule was extracted as a token and that token didn't + // appear within any other rule, then remove that variable from the syntax + // grammar, giving its name to the token in the lexical grammar. Any symbols + // that pointed to that variable will need to be updated to point to the + // variable in the lexical grammar. Symbols that pointed to later variables + // will need to have their indices decremented. 
+ let mut variables = Vec::new(); + let mut symbol_replacer = SymbolReplacer { replacements: HashMap::new() }; + for (i, variable) in grammar.variables.into_iter().enumerate() { + if let Rule::Symbol(Symbol { kind: SymbolType::Terminal, index }) = variable.rule { + if i > 0 && extractor.extracted_usage_counts[index] == 1 { + let mut lexical_variable = &mut lexical_variables[index]; + lexical_variable.kind = variable.kind; + lexical_variable.name = variable.name; + symbol_replacer.replacements.insert(i, index); + continue; + } + } + variables.push(variable); + } + + for variable in variables.iter_mut() { + variable.rule = symbol_replacer.replace_symbols_in_rule(&variable.rule); + } + + let expected_conflicts = grammar.expected_conflicts + .into_iter() + .map(|conflict| + conflict + .iter() + .map(|symbol| symbol_replacer.replace_symbol(*symbol)) + .collect() + ).collect(); + + let variables_to_inline = grammar.variables_to_inline + .into_iter() + .map(|symbol| symbol_replacer.replace_symbol(symbol)) + .collect(); + + let mut separators = Vec::new(); + let mut extra_tokens = Vec::new(); + for rule in grammar.extra_tokens { + if let Rule::Symbol(symbol) = rule { + let new_symbol = symbol_replacer.replace_symbol(symbol); + if new_symbol.is_non_terminal() { + return Err(Error::GrammarError(format!( + "Non-token symbol '{}' cannot be used as an extra token", + &variables[new_symbol.index].name + ))); + } else { + extra_tokens.push(new_symbol); + } + } else { + if let Some(index) = lexical_variables.iter().position(|v| v.rule == rule) { + extra_tokens.push(Symbol::terminal(index)); + } else { + separators.push(rule); + } + } + } + + let mut external_tokens = Vec::new(); + for external_token in grammar.external_tokens { + let rule = symbol_replacer.replace_symbols_in_rule(&external_token.rule); + if let Rule::Symbol(symbol) = rule { + if symbol.is_non_terminal() { + return Err(Error::GrammarError(format!( + "Rule '{}' cannot be used as both an external token and a 
non-terminal rule", + &variables[symbol.index].name, + ))); + } + + if symbol.is_external() { + external_tokens.push(ExternalToken { + name: external_token.name, + kind: external_token.kind, + corresponding_internal_token: None, + }) + } else { + external_tokens.push(ExternalToken { + name: lexical_variables[symbol.index].name.clone(), + kind: external_token.kind, + corresponding_internal_token: Some(symbol), + }) + } + } else { + return Err(Error::GrammarError(format!( + "Non-symbol rules cannot be used as external tokens" + ))); + } + } + + let mut word_token = None; + if let Some(token) = grammar.word_token { + let token = symbol_replacer.replace_symbol(token); + if token.is_non_terminal() { + return Err(Error::GrammarError(format!( + "Non-terminal symbol '{}' cannot be used as the word token", + &variables[token.index].name + ))); + } + word_token = Some(token); + } + + Ok(( + ExtractedGrammar { + variables, + expected_conflicts, + extra_tokens, + variables_to_inline, + external_tokens, + word_token, + }, + LexicalGrammar { + variables: lexical_variables, + separators, + } + )) +} + +struct TokenExtractor { + current_variable_name: String, + current_variable_token_count: usize, + extracted_variables: Vec, + extracted_usage_counts: Vec, +} + +struct SymbolReplacer { + replacements: HashMap +} + +impl TokenExtractor { + fn extract_tokens_in_variable(&mut self, variable: &mut Variable) { + self.current_variable_name.clear(); + self.current_variable_name.push_str(&variable.name); + self.current_variable_token_count = 0; + let mut rule = Rule::Blank; + mem::swap(&mut rule, &mut variable.rule); + variable.rule = self.extract_tokens_in_rule(&rule); + } + + fn extract_tokens_in_rule(&mut self, input: &Rule) -> Rule { + match input { + Rule::String(name) => self.extract_token(input, Some(name)).into(), + Rule::Pattern(..) 
=> self.extract_token(input, None).into(), + Rule::Metadata { params, rule } => { + if params.is_token { + let mut params = params.clone(); + params.is_token = false; + + let mut string_value = None; + if let Rule::String(value) = rule.as_ref() { + string_value = Some(value); + } + + let rule_to_extract = if params == MetadataParams::default() { + rule.as_ref() + } else { + input + }; + + self.extract_token(rule_to_extract, string_value).into() + } else { + Rule::Metadata { + params: params.clone(), + rule: Rc::new(self.extract_tokens_in_rule((&rule).clone())) + } + } + }, + Rule::Repeat(content) => Rule::Repeat( + Rc::new(self.extract_tokens_in_rule(content)) + ), + Rule::Seq { left, right } => Rule::Seq { + left: Rc::new(self.extract_tokens_in_rule(left)), + right: Rc::new(self.extract_tokens_in_rule(right)), + }, + Rule::Choice { elements } => Rule::Choice { + elements: elements.iter().map(|e| self.extract_tokens_in_rule(e)).collect() + }, + _ => input.clone() + } + } + + fn extract_token(&mut self, rule: &Rule, string_value: Option<&String>) -> Symbol { + for (i, variable) in self.extracted_variables.iter_mut().enumerate() { + if variable.rule == *rule { + self.extracted_usage_counts[i] += 1; + return Symbol::terminal(i) + } + } + + let index = self.extracted_variables.len(); + let variable = if let Some(string_value) = string_value { + Variable::anonymous(string_value, rule.clone()) + } else { + self.current_variable_token_count += 1; + Variable::auxiliary( + &format!( + "{}_token{}", + &self.current_variable_name, + self.current_variable_token_count + ), + rule.clone() + ) + }; + + self.extracted_variables.push(variable); + self.extracted_usage_counts.push(1); + Symbol::terminal(index) + } +} + +impl SymbolReplacer { + fn replace_symbols_in_rule(&mut self, rule: &Rule) -> Rule { + match rule { + Rule::Symbol(symbol) => self.replace_symbol(*symbol).into(), + Rule::Choice { elements } => Rule::Choice { + elements: elements.iter().map(|e| 
self.replace_symbols_in_rule(e)).collect() + }, + Rule::Seq { left, right } => Rule::Seq { + left: Rc::new(self.replace_symbols_in_rule(left)), + right: Rc::new(self.replace_symbols_in_rule(right)), + }, + Rule::Repeat(content) => Rule::Repeat( + Rc::new(self.replace_symbols_in_rule(content)) + ), + Rule::Metadata { rule, params } => Rule::Metadata { + params: params.clone(), + rule: Rc::new(self.replace_symbols_in_rule(rule)), + }, + _ => rule.clone() + } + } + + fn replace_symbol(&self, symbol: Symbol) -> Symbol { + if !symbol.is_non_terminal() { + return symbol + } + + if let Some(replacement) = self.replacements.get(&symbol.index) { + return Symbol::terminal(*replacement); + } + + let mut adjusted_index = symbol.index; + for (replaced_index, _) in self.replacements.iter() { + if *replaced_index < symbol.index { + adjusted_index -= 1; + } + } + + return Symbol::non_terminal(adjusted_index); + } +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_extraction() { + let (syntax_grammar, lexical_grammar) = extract_tokens(build_grammar(vec![ + Variable::named("rule_0", Rule::repeat(Rule::seq(vec![ + Rule::string("a"), + Rule::pattern("b"), + Rule::choice(vec![ + Rule::non_terminal(1), + Rule::non_terminal(2), + Rule::token(Rule::repeat(Rule::choice(vec![ + Rule::string("c"), + Rule::string("d"), + ]))) + ]) + ]))), + Variable::named("rule_1", Rule::pattern("e")), + Variable::named("rule_2", Rule::pattern("b")), + Variable::named("rule_3", Rule::seq(vec![ + Rule::non_terminal(2), + Rule::Blank, + ])), + ])).unwrap(); + + assert_eq!(syntax_grammar.variables, vec![ + Variable::named("rule_0", Rule::repeat(Rule::seq(vec![ + // The string "a" was replaced by a symbol referencing the lexical grammar + Rule::terminal(0), + + // The pattern "b" was replaced by a symbol referencing the lexical grammar + Rule::terminal(1), + Rule::choice(vec![ + // The symbol referencing `rule_1` was replaced by a symbol referencing + // the lexical grammar. 
+ Rule::terminal(3), + + // The symbol referencing `rule_2` had its index decremented because + // `rule_1` was moved to the lexical grammar. + Rule::non_terminal(1), + + // The rule wrapped in `token` was replaced by a symbol referencing + // the lexical grammar. + Rule::terminal(2), + ]) + ]))), + + // The pattern "e" was only used in once place: as the definition of `rule_1`, + // so that rule was moved to the lexical grammar. The pattern "b" appeared in + // two places, so it was not moved into the lexical grammar. + Variable::named("rule_2", Rule::terminal(1)), + Variable::named("rule_3", Rule::seq(vec![ + Rule::non_terminal(1), + Rule::Blank, + ])), + ]); + + assert_eq!(lexical_grammar.variables, vec![ + Variable::anonymous("a", Rule::string("a")), + Variable::auxiliary("rule_0_token1", Rule::pattern("b")), + Variable::auxiliary("rule_0_token2", Rule::repeat(Rule::choice(vec![ + Rule::string("c"), + Rule::string("d"), + ]))), + Variable::named("rule_1", Rule::pattern("e")), + ]); + } + + #[test] + fn test_start_rule_is_token() { + let (syntax_grammar, lexical_grammar) = extract_tokens(build_grammar(vec![ + Variable::named("rule_0", Rule::string("hello")), + ])).unwrap(); + + assert_eq!(syntax_grammar.variables, vec![ + Variable::named("rule_0", Rule::terminal(0)), + ]); + assert_eq!(lexical_grammar.variables, vec![ + Variable::anonymous("hello", Rule::string("hello")), + ]) + } + + #[test] + fn test_extracting_extra_tokens() { + let mut grammar = build_grammar(vec![ + Variable::named("rule_0", Rule::string("x")), + Variable::named("comment", Rule::pattern("//.*")), + ]); + grammar.extra_tokens = vec![ + Rule::string(" "), + Rule::non_terminal(1), + ]; + + let (syntax_grammar, lexical_grammar) = extract_tokens(grammar).unwrap(); + assert_eq!(syntax_grammar.extra_tokens, vec![ + Symbol::terminal(1), + ]); + assert_eq!(lexical_grammar.separators, vec![ + Rule::string(" "), + ]); + } + + #[test] + fn test_extract_externals() { + let mut grammar = 
build_grammar(vec![ + Variable::named("rule_0", Rule::seq(vec![ + Rule::external(0), + Rule::string("a"), + Rule::non_terminal(1), + Rule::non_terminal(2), + ])), + Variable::named("rule_1", Rule::string("b")), + Variable::named("rule_2", Rule::string("c")), + ]); + grammar.external_tokens = vec![ + Variable::named("external_0", Rule::external(0)), + Variable::anonymous("a", Rule::string("a")), + Variable::named("rule_2", Rule::non_terminal(2)), + ]; + + let (syntax_grammar, _) = extract_tokens(grammar).unwrap(); + + assert_eq!(syntax_grammar.external_tokens, vec![ + ExternalToken { + name: "external_0".to_string(), + kind: VariableType::Named, + corresponding_internal_token: None, + }, + ExternalToken { + name: "a".to_string(), + kind: VariableType::Anonymous, + corresponding_internal_token: Some(Symbol::terminal(0)), + }, + ExternalToken { + name: "rule_2".to_string(), + kind: VariableType::Named, + corresponding_internal_token: Some(Symbol::terminal(2)), + }, + ]); + } + + #[test] + fn test_error_on_non_terminal_symbol_extras() { + let mut grammar = build_grammar(vec![ + Variable::named("rule_0", Rule::non_terminal(1)), + Variable::named("rule_1", Rule::non_terminal(2)), + Variable::named("rule_2", Rule::string("x")), + ]); + grammar.extra_tokens = vec![ + Rule::non_terminal(1), + ]; + + match extract_tokens(grammar) { + Err(Error::GrammarError(s)) => { + assert_eq!(s, "Non-token symbol 'rule_1' cannot be used as an extra token"); + }, + _ => { + panic!("Expected an error but got no error"); + } + } + } + + #[test] + fn test_error_on_external_with_same_name_as_non_terminal() { + let mut grammar = build_grammar(vec![ + Variable::named("rule_0", Rule::seq(vec![ + Rule::non_terminal(1), + Rule::non_terminal(2), + ])), + Variable::named("rule_1", Rule::seq(vec![ + Rule::non_terminal(2), + Rule::non_terminal(2), + ])), + Variable::named("rule_2", Rule::string("a")), + ]); + grammar.external_tokens = vec![ + Variable::named("rule_1", Rule::non_terminal(1)), + ]; + + 
match extract_tokens(grammar) { + Err(Error::GrammarError(s)) => { + assert_eq!(s, "Rule 'rule_1' cannot be used as both an external token and a non-terminal rule"); + }, + _ => { + panic!("Expected an error but got no error"); + } + } + } + + fn build_grammar(variables: Vec) -> InternedGrammar { + InternedGrammar { + variables, + extra_tokens: Vec::new(), + external_tokens: Vec::new(), + expected_conflicts: Vec::new(), + variables_to_inline: Vec::new(), + word_token: None, + } + } } diff --git a/src/prepare_grammar/intern_symbols.rs b/src/prepare_grammar/intern_symbols.rs index 00a5c330..e4cf7ff1 100644 --- a/src/prepare_grammar/intern_symbols.rs +++ b/src/prepare_grammar/intern_symbols.rs @@ -1,6 +1,6 @@ use crate::error::{Error, Result}; use crate::rules::{Rule, Symbol}; -use crate::grammars::{InputGrammar, InputVariable, VariableType}; +use crate::grammars::{InputGrammar, Variable, VariableType}; use std::rc::Rc; use super::InternedGrammar; @@ -13,7 +13,7 @@ pub(super) fn intern_symbols(grammar: &InputGrammar) -> Result let mut variables = Vec::with_capacity(grammar.variables.len()); for variable in grammar.variables.iter() { - variables.push(InputVariable { + variables.push(Variable { name: variable.name.clone(), kind: variable_type_for_name(&variable.name), rule: interner.intern_rule(&variable.rule)?, @@ -28,7 +28,7 @@ pub(super) fn intern_symbols(grammar: &InputGrammar) -> Result } else { (String::new(), VariableType::Anonymous) }; - external_tokens.push(InputVariable { name, kind, rule }); + external_tokens.push(Variable { name, kind, rule }); } let mut extra_tokens = Vec::with_capacity(grammar.extra_tokens.len()); @@ -154,21 +154,21 @@ mod tests { #[test] fn test_basic_repeat_expansion() { let grammar = intern_symbols(&build_grammar(vec![ - InputVariable::named("x", Rule::choice(vec![ + Variable::named("x", Rule::choice(vec![ Rule::named("y"), Rule::named("_z"), ])), - InputVariable::named("y", Rule::named("_z")), - InputVariable::named("_z", 
Rule::string("a")), + Variable::named("y", Rule::named("_z")), + Variable::named("_z", Rule::string("a")), ])).unwrap(); assert_eq!(grammar.variables, vec![ - InputVariable::named("x", Rule::choice(vec![ + Variable::named("x", Rule::choice(vec![ Rule::non_terminal(1), Rule::non_terminal(2), ])), - InputVariable::named("y", Rule::non_terminal(2)), - InputVariable::hidden("_z", Rule::string("a")), + Variable::named("y", Rule::non_terminal(2)), + Variable::hidden("_z", Rule::string("a")), ]); } @@ -177,13 +177,13 @@ mod tests { // Variable `y` is both an internal and an external token. // Variable `z` is just an external token. let mut input_grammar = build_grammar(vec![ - InputVariable::named("w", Rule::choice(vec![ + Variable::named("w", Rule::choice(vec![ Rule::named("x"), Rule::named("y"), Rule::named("z"), ])), - InputVariable::named("x", Rule::string("a")), - InputVariable::named("y", Rule::string("b")), + Variable::named("x", Rule::string("a")), + Variable::named("y", Rule::string("b")), ]); input_grammar.external_tokens.extend(vec![ Rule::named("y"), @@ -195,26 +195,26 @@ mod tests { // Variable `y` is referred to by its internal index. // Variable `z` is referred to by its external index. assert_eq!(grammar.variables, vec![ - InputVariable::named("w", Rule::choice(vec![ + Variable::named("w", Rule::choice(vec![ Rule::non_terminal(1), Rule::non_terminal(2), Rule::external(1), ])), - InputVariable::named("x", Rule::string("a")), - InputVariable::named("y", Rule::string("b")), + Variable::named("x", Rule::string("a")), + Variable::named("y", Rule::string("b")), ]); // The external token for `y` refers back to its internal index. 
assert_eq!(grammar.external_tokens, vec![ - InputVariable::named("y", Rule::non_terminal(2)), - InputVariable::named("z", Rule::external(1)), + Variable::named("y", Rule::non_terminal(2)), + Variable::named("z", Rule::external(1)), ]); } #[test] fn test_grammar_with_undefined_symbols() { let result = intern_symbols(&build_grammar(vec![ - InputVariable::named("x", Rule::named("y")), + Variable::named("x", Rule::named("y")), ])); match result { @@ -223,7 +223,7 @@ mod tests { } } - fn build_grammar(variables: Vec) -> InputGrammar { + fn build_grammar(variables: Vec) -> InputGrammar { InputGrammar { variables, name: "the_language".to_string(), diff --git a/src/prepare_grammar/mod.rs b/src/prepare_grammar/mod.rs index 0788edca..b860807a 100644 --- a/src/prepare_grammar/mod.rs +++ b/src/prepare_grammar/mod.rs @@ -6,7 +6,7 @@ mod normalize_rules; mod extract_simple_aliases; use crate::rules::{AliasMap, Rule, Symbol}; -use crate::grammars::{InputGrammar, SyntaxGrammar, LexicalGrammar, InputVariable, ExternalToken}; +use crate::grammars::{InputGrammar, SyntaxGrammar, LexicalGrammar, Variable, ExternalToken}; use crate::error::Result; use self::intern_symbols::intern_symbols; use self::extract_tokens::extract_tokens; @@ -16,7 +16,7 @@ use self::normalize_rules::normalize_rules; use self::extract_simple_aliases::extract_simple_aliases; pub(self) struct IntermediateGrammar { - variables: Vec, + variables: Vec, extra_tokens: Vec, expected_conflicts: Vec>, external_tokens: Vec, @@ -24,10 +24,10 @@ pub(self) struct IntermediateGrammar { word_token: Option, } -pub(self) type InternedGrammar = IntermediateGrammar; +pub(self) type InternedGrammar = IntermediateGrammar; pub(self) type ExtractedGrammar = IntermediateGrammar; -pub fn prepare_grammar( +pub(crate) fn prepare_grammar( input_grammar: &InputGrammar ) -> Result<(SyntaxGrammar, LexicalGrammar, AliasMap)> { let interned_grammar = intern_symbols(input_grammar)?; diff --git a/src/render/mod.rs b/src/render/mod.rs index 
85ce1f32..5bd11a34 100644 --- a/src/render/mod.rs +++ b/src/render/mod.rs @@ -2,7 +2,7 @@ use crate::rules::{Symbol, AliasMap}; use crate::grammars::{SyntaxGrammar, LexicalGrammar}; use crate::tables::{ParseTable, LexTable}; -pub fn render_c_code( +pub(crate) fn render_c_code( name: &str, parse_table: ParseTable, main_lex_table: LexTable, diff --git a/src/rules.rs b/src/rules.rs index 3cccca0d..5c3b65fd 100644 --- a/src/rules.rs +++ b/src/rules.rs @@ -2,47 +2,47 @@ use std::rc::Rc; use std::collections::HashMap; #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] -pub enum SymbolType { +pub(crate) enum SymbolType { External, Terminal, NonTerminal, } #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] -pub enum Associativity { +pub(crate) enum Associativity { Left, Right } #[derive(Clone, Debug, PartialEq, Eq, Hash)] -pub struct Alias { - value: String, - is_named: bool, +pub(crate) struct Alias { + pub value: String, + pub is_named: bool, } -pub type AliasMap = HashMap; +pub(crate) type AliasMap = HashMap; #[derive(Clone, Debug, Default, PartialEq, Eq, Hash)] -pub struct MetadataParams { - precedence: Option, - dynamic_precedence: i32, - associativity: Option, - is_token: bool, - is_string: bool, - is_active: bool, - is_main_token: bool, - is_excluded: bool, - alias: Option, +pub(crate) struct MetadataParams { + pub precedence: Option, + pub dynamic_precedence: i32, + pub associativity: Option, + pub is_token: bool, + pub is_string: bool, + pub is_active: bool, + pub is_main_token: bool, + pub is_excluded: bool, + pub alias: Option, } #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] -pub struct Symbol { - kind: SymbolType, - index: usize, +pub(crate) struct Symbol { + pub kind: SymbolType, + pub index: usize, } #[derive(Clone, Debug, PartialEq, Eq, Hash)] -pub enum Rule { +pub(crate) enum Rule { Blank, CharacterSet(Vec), String(String), @@ -153,9 +153,21 @@ impl Rule { pub fn string(value: &'static str) -> Self { Rule::String(value.to_string()) } + + pub fn 
pattern(value: &'static str) -> Self { + Rule::Pattern(value.to_string()) + } } impl Symbol { + pub fn is_non_terminal(&self) -> bool { + return self.kind == SymbolType::NonTerminal + } + + pub fn is_external(&self) -> bool { + return self.kind == SymbolType::External + } + pub fn non_terminal(index: usize) -> Self { Symbol { kind: SymbolType::NonTerminal, index } } diff --git a/src/tables.rs b/src/tables.rs index 10b1e41d..de66253c 100644 --- a/src/tables.rs +++ b/src/tables.rs @@ -2,12 +2,12 @@ use std::collections::HashMap; use std::ops::Range; use crate::rules::{Associativity, Symbol, Alias}; -pub type AliasSequenceId = usize; -pub type ParseStateId = usize; -pub type LexStateId = usize; +pub(crate) type AliasSequenceId = usize; +pub(crate) type ParseStateId = usize; +pub(crate) type LexStateId = usize; #[derive(Clone, Copy, Debug, PartialEq, Eq)] -pub enum ParseActionType { +pub(crate) enum ParseActionType { Error, Shift, Reduce, @@ -16,7 +16,7 @@ pub enum ParseActionType { } #[derive(Clone, Copy, Debug, PartialEq, Eq)] -pub enum ParseAction { +pub(crate) enum ParseAction { Accept, Error, Shift(ParseStateId), @@ -34,44 +34,44 @@ pub enum ParseAction { } #[derive(Clone, Debug, PartialEq, Eq)] -pub struct ParseTableEntry { +pub(crate) struct ParseTableEntry { actions: Vec, reusable: bool, } #[derive(Clone, Debug, PartialEq, Eq)] -pub struct ParseState { +pub(crate) struct ParseState { terminal_entries: HashMap, nonterminal_entries: HashMap } #[derive(Debug, PartialEq, Eq)] -pub struct ParseTable { +pub(crate) struct ParseTable { states: Vec, alias_sequences: Vec>, } #[derive(Clone, Debug, PartialEq, Eq)] -pub struct AdvanceAction { +pub(crate) struct AdvanceAction { state: LexStateId, precedence: Range, in_main_token: bool, } #[derive(Clone, Debug, PartialEq, Eq)] -pub struct AcceptTokenAction { +pub(crate) struct AcceptTokenAction { symbol: Symbol, precedence: i32, implicit_precedence: i32, } #[derive(Clone, Debug, PartialEq, Eq)] -pub struct LexState { 
+pub(crate) struct LexState { advance_actions: HashMap, accept_action: Option, } #[derive(Debug, PartialEq, Eq)] -pub struct LexTable { +pub(crate) struct LexTable { states: Vec, } From ead6ca1738c52e8da4a2eb577d1c4c50b08593b4 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Sat, 8 Dec 2018 13:44:11 -0800 Subject: [PATCH 051/208] Generate NFAs from regexes --- Cargo.lock | 1 + Cargo.toml | 1 + src/error.rs | 11 ++ src/main.rs | 1 + src/nfa.rs | 160 ++++++++++++++++++ src/prepare_grammar/normalize_rules.rs | 224 +++++++++++++++++++++++++ src/rules.rs | 2 +- 7 files changed, 399 insertions(+), 1 deletion(-) create mode 100644 src/nfa.rs diff --git a/Cargo.lock b/Cargo.lock index 20908681..d5109fb7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -466,6 +466,7 @@ dependencies = [ "dirs 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)", "ignore 0.4.4 (registry+https://github.com/rust-lang/crates.io-index)", "libloading 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)", + "regex-syntax 0.6.4 (registry+https://github.com/rust-lang/crates.io-index)", "rusqlite 0.14.0 (registry+https://github.com/rust-lang/crates.io-index)", "serde 1.0.80 (registry+https://github.com/rust-lang/crates.io-index)", "serde_derive 1.0.80 (registry+https://github.com/rust-lang/crates.io-index)", diff --git a/Cargo.toml b/Cargo.toml index 965cc81e..93a49d2c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -15,3 +15,4 @@ serde = "1.0" serde_derive = "1.0" serde_json = "1.0" tree-sitter = "0.3.1" +regex-syntax = "0.6.4" diff --git a/src/error.rs b/src/error.rs index 90e7b8f9..49064c22 100644 --- a/src/error.rs +++ b/src/error.rs @@ -2,10 +2,21 @@ pub enum Error { GrammarError(String), SymbolError(String), + RegexError(String), } pub type Result = std::result::Result; +impl Error { + pub fn grammar(message: &str) -> Self { + Error::GrammarError(message.to_string()) + } + + pub fn regex(message: &str) -> Self { + Error::RegexError(message.to_string()) + } +} + impl From for Error { 
fn from(error: serde_json::Error) -> Self { Error::GrammarError(error.to_string()) diff --git a/src/main.rs b/src/main.rs index 3eeb306a..4d376929 100644 --- a/src/main.rs +++ b/src/main.rs @@ -7,6 +7,7 @@ mod build_tables; mod error; mod generate; mod grammars; +mod nfa; mod parse_grammar; mod prepare_grammar; mod render; diff --git a/src/nfa.rs b/src/nfa.rs new file mode 100644 index 00000000..55aa11dc --- /dev/null +++ b/src/nfa.rs @@ -0,0 +1,160 @@ +use std::fmt; +use std::char; + +#[derive(Clone, Debug, PartialEq, Eq, Hash)] +pub enum CharacterSet { + Include(Vec), + Exclude(Vec), +} + +#[derive(Debug)] +pub enum NfaState { + Advance(CharacterSet, u32), + Split(u32, u32), + Accept, +} + +pub struct Nfa { + pub states: Vec +} + +#[derive(Debug)] +pub struct NfaCursor<'a> { + indices: Vec, + nfa: &'a Nfa, +} + +impl CharacterSet { + pub fn empty() -> Self { + CharacterSet::Include(Vec::new()) + } + + pub fn all() -> Self { + CharacterSet::Exclude(Vec::new()) + } + + pub fn negate(self) -> CharacterSet { + match self { + CharacterSet::Include(chars) => CharacterSet::Exclude(chars), + CharacterSet::Exclude(chars) => CharacterSet::Include(chars), + } + } + + pub fn add_char(self, c: char) -> Self { + if let CharacterSet::Include(mut chars) = self { + if let Err(i) = chars.binary_search(&c) { + chars.insert(i, c); + } + CharacterSet::Include(chars) + } else { + panic!("Called add with a negated character set"); + } + } + + pub fn add_range(self, start: char, end: char) -> Self { + if let CharacterSet::Include(mut chars) = self { + let mut c = start as u32; + while c <= end as u32 { + chars.push(char::from_u32(c).unwrap()); + c += 1; + } + chars.sort_unstable(); + chars.dedup(); + CharacterSet::Include(chars) + } else { + panic!("Called add with a negated character set"); + } + } + + pub fn add(self, other: CharacterSet) -> Self { + if let (CharacterSet::Include(mut chars), CharacterSet::Include(other_chars)) = (self, other) { + chars.extend(other_chars); + 
chars.sort_unstable(); + chars.dedup(); + CharacterSet::Include(chars) + } else { + panic!("Called add with a negated character set"); + } + } + + pub fn contains(&self, c: char) -> bool { + match self { + CharacterSet::Include(chars) => chars.contains(&c), + CharacterSet::Exclude(chars) => !chars.contains(&c), + } + } +} + +impl Nfa { + pub fn new() -> Self { + Nfa { states: vec![NfaState::Accept] } + } + + pub fn start_index(&self) -> u32 { + self.states.len() as u32 - 1 + } + + pub fn prepend(&mut self, f: impl Fn(u32) -> NfaState) { + self.states.push(f(self.start_index())); + } +} + +impl fmt::Debug for Nfa { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "Nfa {{ states: {{")?; + for (i, state) in self.states.iter().enumerate() { + if i > 0 { + write!(f, ", ")?; + } + write!(f, "{}: {:?}", i, state)?; + } + write!(f, "}} }}")?; + Ok(()) + } +} + +impl<'a> NfaCursor<'a> { + pub fn new(nfa: &'a Nfa) -> Self { + let mut result = Self { nfa, indices: Vec::new() }; + result.add_indices(&mut vec![nfa.start_index()]); + result + } + + pub fn advance(&mut self, c: char) -> bool { + let mut result = false; + let mut new_indices = Vec::new(); + for index in &self.indices { + if let NfaState::Advance(chars, next_index) = &self.nfa.states[*index as usize] { + if chars.contains(c) { + new_indices.push(*next_index); + result = true; + } + } + } + self.indices.clear(); + self.add_indices(&mut new_indices); + result + } + + pub fn is_done(&self) -> bool { + self.indices.iter().any(|index| { + if let NfaState::Accept = self.nfa.states[*index as usize] { + true + } else { + false + } + }) + } + + pub fn add_indices(&mut self, new_indices: &mut Vec) { + while let Some(index) = new_indices.pop() { + let state = &self.nfa.states[index as usize]; + if let NfaState::Split(left, right) = state { + new_indices.push(*left); + new_indices.push(*right); + } else if let Err(i) = self.indices.binary_search(&index) { + self.indices.insert(i, index); + } + } + } +} diff 
--git a/src/prepare_grammar/normalize_rules.rs b/src/prepare_grammar/normalize_rules.rs index 9e625ef5..67177b4f 100644 --- a/src/prepare_grammar/normalize_rules.rs +++ b/src/prepare_grammar/normalize_rules.rs @@ -1,5 +1,229 @@ +use crate::error::{Error, Result}; +use crate::rules::Rule; use crate::grammars::LexicalGrammar; +use crate::nfa::{Nfa, NfaState, NfaCursor, CharacterSet}; +use regex_syntax::ast::{parse, Ast, Class, ClassPerlKind, ClassSet, ClassSetItem, RepetitionKind}; + +fn evaluate_perl_class(item: &ClassPerlKind) -> CharacterSet { + match item { + ClassPerlKind::Digit => CharacterSet::empty() + .add_range('0', '9'), + ClassPerlKind::Space => CharacterSet::empty() + .add_char(' ') + .add_char('\t') + .add_char('\r') + .add_char('\n'), + ClassPerlKind::Word => CharacterSet::empty() + .add_char('_') + .add_range('A', 'Z') + .add_range('a', 'z') + .add_range('0', '9') + } +} + +fn evaluate_character_class(item: &ClassSetItem) -> Result { + match item { + ClassSetItem::Empty(_) => Ok(CharacterSet::Include(Vec::new())), + ClassSetItem::Literal(literal) => Ok(CharacterSet::Include(vec![literal.c])), + ClassSetItem::Range(range) => Ok(CharacterSet::empty().add_range(range.start.c, range.end.c)), + ClassSetItem::Union(union) => { + let mut result = CharacterSet::empty(); + for item in &union.items { + result = result.add(evaluate_character_class(&item)?); + } + Ok(result) + } + _ => Err(Error::regex("Unsupported character class syntax")), + } +} + +fn regex_to_nfa(ast: &Ast, nfa: &mut Nfa, mut next_state_index: u32) -> Result<()> { + match ast { + Ast::Empty(_) => Ok(()), + Ast::Flags(_) => Err(Error::regex("Flags are not supported")), + Ast::Literal(literal) => { + nfa.states.push(NfaState::Advance(CharacterSet::Include(vec![literal.c]), next_state_index)); + Ok(()) + }, + Ast::Dot(_) => { + nfa.states.push(NfaState::Advance(CharacterSet::Exclude(vec!['\n']), next_state_index)); + Ok(()) + }, + Ast::Assertion(_) => Err(Error::regex("Assertions are not 
supported")), + Ast::Class(class) => match class { + Class::Unicode(_) => Err(Error::regex("Unicode character classes are not supported")), + Class::Perl(class) => { + nfa.states.push(NfaState::Advance(evaluate_perl_class(&class.kind), next_state_index)); + Ok(()) + }, + Class::Bracketed(class) => match &class.kind { + ClassSet::Item(item) => { + let character_set = evaluate_character_class(&item)?; + nfa.states.push(NfaState::Advance(character_set, next_state_index)); + Ok(()) + }, + ClassSet::BinaryOp(_) => { + Err(Error::regex("Binary operators in character classes aren't supported")) + } + } + }, + Ast::Repetition(repetition) => match repetition.op.kind { + RepetitionKind::ZeroOrOne => { + regex_to_nfa(&repetition.ast, nfa, next_state_index)?; + nfa.prepend(|start_index| NfaState::Split(next_state_index, start_index)); + Ok(()) + }, + RepetitionKind::OneOrMore => { + nfa.states.push(NfaState::Accept); // Placeholder for split + let split_index = nfa.start_index(); + regex_to_nfa(&repetition.ast, nfa, split_index)?; + nfa.states[split_index as usize] = NfaState::Split( + nfa.start_index(), + next_state_index + ); + Ok(()) + }, + RepetitionKind::ZeroOrMore => { + nfa.states.push(NfaState::Accept); // Placeholder for split + let split_index = nfa.start_index(); + regex_to_nfa(&repetition.ast, nfa, split_index)?; + nfa.states[split_index as usize] = NfaState::Split( + nfa.start_index(), + next_state_index + ); + nfa.prepend(|start_index| NfaState::Split(start_index, next_state_index)); + Ok(()) + }, + RepetitionKind::Range(_) => unimplemented!(), + }, + Ast::Group(group) => regex_to_nfa(&group.ast, nfa, nfa.start_index()), + Ast::Alternation(alternation) => { + let mut alternative_start_indices = Vec::new(); + for ast in alternation.asts.iter() { + regex_to_nfa(&ast, nfa, next_state_index)?; + alternative_start_indices.push(nfa.start_index()); + } + alternative_start_indices.pop(); + for alternative_start_index in alternative_start_indices { + 
nfa.prepend(|start_index| NfaState::Split(start_index, alternative_start_index)); + } + Ok(()) + }, + Ast::Concat(concat) => { + for ast in concat.asts.iter().rev() { + regex_to_nfa(&ast, nfa, next_state_index)?; + next_state_index = nfa.start_index(); + } + Ok(()) + } + } +} + +fn expand_rule(rule: Rule) -> Result { + match rule { + Rule::Pattern(s) => { + let ast = parse::Parser::new().parse(&s).map_err(|e| Error::GrammarError(e.to_string()))?; + let mut nfa = Nfa::new(); + regex_to_nfa(&ast, &mut nfa, 0)?; + Ok(nfa) + }, + Rule::String(s) => { + let mut nfa = Nfa::new(); + for c in s.chars().rev() { + nfa.prepend(|start_index| NfaState::Advance(CharacterSet::empty().add_char(c), start_index)); + } + Ok(nfa) + }, + _ => Err(Error::grammar("Unexpected rule type")), + } +} pub(super) fn normalize_rules(grammar: LexicalGrammar) -> LexicalGrammar { unimplemented!(); } + +#[cfg(test)] +mod tests { + use super::*; + + fn simulate_nfa<'a>(nfa: &'a Nfa, s: &'a str) -> Option<&'a str> { + let mut result = None; + let mut char_count = 0; + let mut cursor = NfaCursor::new(nfa); + for c in s.chars() { + if cursor.is_done() { + result = Some(&s[0..char_count]); + } + if cursor.advance(c) { + char_count += 1; + } else { + break; + } + } + result + } + + #[test] + fn test_regex_expansion() { + struct Row { + pattern: &'static str, + examples: Vec<(&'static str, Option<&'static str>)>, + } + + let table = [ + Row { + pattern: "a|bc", + examples: vec![ + ("a12", Some("a")), + ("bc12", Some("bc")), + ("b12", None), + ("c12", None), + ], + }, + Row { + pattern: "(a|b|c)d(e|f|g)h?", + examples: vec![ + ("ade1", Some("ade")), + ("bdf1", Some("bdf")), + ("bdfh1", Some("bdfh")), + ("ad1", None), + ], + }, + Row { + pattern: "a*", + examples: vec![ + ("aaa1", Some("aaa")), + ("b", Some("")), + ], + }, + Row { + pattern: "a((bc)+|(de)*)f", + examples: vec![ + ("af1", Some("af")), + ("adedef1", Some("adedef")), + ("abcbcbcf1", Some("abcbcbcf")), + ("a", None), + ], + }, + Row { + pattern: 
"[a-fA-F0-9]+", + examples: vec![ + ("A1ff0", Some("A1ff")), + ], + }, + Row { + pattern: "\\w\\d\\s", + examples: vec![ + ("_0 ", Some("_0 ")), + ], + }, + ]; + + for Row { pattern, examples } in table.iter() { + let nfa = expand_rule(Rule::pattern(pattern)).unwrap(); + for (haystack, needle) in examples.iter() { + assert_eq!(simulate_nfa(&nfa, haystack), *needle); + } + } + } +} diff --git a/src/rules.rs b/src/rules.rs index 5c3b65fd..b593496a 100644 --- a/src/rules.rs +++ b/src/rules.rs @@ -1,4 +1,5 @@ use std::rc::Rc; +use std::char; use std::collections::HashMap; #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] @@ -44,7 +45,6 @@ pub(crate) struct Symbol { #[derive(Clone, Debug, PartialEq, Eq, Hash)] pub(crate) enum Rule { Blank, - CharacterSet(Vec), String(String), Pattern(String), NamedSymbol(String), From d482894c7d40b9b563262fef49e2ec81f96d346a Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Sat, 8 Dec 2018 23:35:48 -0800 Subject: [PATCH 052/208] Implement expand_tokens --- src/grammars.rs | 12 +- src/main.rs | 11 +- src/nfa.rs | 3 +- src/prepare_grammar/expand_repeats.rs | 40 +++--- .../{normalize_rules.rs => expand_tokens.rs} | 130 +++++++++++++----- src/prepare_grammar/extract_simple_aliases.rs | 1 - src/prepare_grammar/extract_tokens.rs | 45 +++--- src/prepare_grammar/flatten_grammar.rs | 4 +- src/prepare_grammar/intern_symbols.rs | 26 ++-- src/prepare_grammar/mod.rs | 15 +- src/rules.rs | 44 ++---- 11 files changed, 192 insertions(+), 139 deletions(-) rename src/prepare_grammar/{normalize_rules.rs => expand_tokens.rs} (61%) diff --git a/src/grammars.rs b/src/grammars.rs index 62910637..c5e9aaa1 100644 --- a/src/grammars.rs +++ b/src/grammars.rs @@ -1,4 +1,5 @@ use crate::rules::{Associativity, Alias, Rule, Symbol}; +use crate::nfa::Nfa; #[derive(Clone, Copy, Debug, PartialEq, Eq)] pub(crate) enum VariableType { @@ -30,10 +31,17 @@ pub(crate) struct InputGrammar { // Extracted lexical grammar +#[derive(Debug, PartialEq, Eq)] +pub(crate) struct 
LexicalVariable { + pub name: String, + pub kind: VariableType, + pub nfa: Nfa, +} + #[derive(Debug, PartialEq, Eq)] pub(crate) struct LexicalGrammar { - pub variables: Vec, - pub separators: Vec, + pub variables: Vec, + pub separators: Vec, } // Extracted syntax grammar diff --git a/src/main.rs b/src/main.rs index 4d376929..b83764fc 100644 --- a/src/main.rs +++ b/src/main.rs @@ -14,7 +14,7 @@ mod render; mod rules; mod tables; -fn main() { +fn main() -> error::Result<()> { let matches = App::new("tree-sitter") .version("0.1") .author("Max Brunsfeld ") @@ -32,5 +32,12 @@ fn main() { .arg(Arg::with_name("path").index(1).required(true)) .arg(Arg::with_name("line").index(2).required(true)) .arg(Arg::with_name("column").index(3).required(true)) - ); + ).get_matches(); + + if let Some(matches) = matches.subcommand_matches("generate") { + let code = generate::generate_parser_for_grammar(String::new())?; + println!("{}", code); + } + + Ok(()) } diff --git a/src/nfa.rs b/src/nfa.rs index 55aa11dc..22cb2a2e 100644 --- a/src/nfa.rs +++ b/src/nfa.rs @@ -7,13 +7,14 @@ pub enum CharacterSet { Exclude(Vec), } -#[derive(Debug)] +#[derive(Debug, PartialEq, Eq)] pub enum NfaState { Advance(CharacterSet, u32), Split(u32, u32), Accept, } +#[derive(PartialEq, Eq)] pub struct Nfa { pub states: Vec } diff --git a/src/prepare_grammar/expand_repeats.rs b/src/prepare_grammar/expand_repeats.rs index dcb8f916..85f37c80 100644 --- a/src/prepare_grammar/expand_repeats.rs +++ b/src/prepare_grammar/expand_repeats.rs @@ -3,7 +3,7 @@ use crate::grammars::{Variable, VariableType}; use std::collections::HashMap; use std::mem; use std::rc::Rc; -use super::ExtractedGrammar; +use super::ExtractedSyntaxGrammar; struct Expander { variable_name: String, @@ -25,16 +25,11 @@ impl Expander { fn expand_rule(&mut self, rule: &Rule) -> Rule { match rule { - Rule::Choice { elements } => - Rule::Choice { - elements: elements.iter().map(|element| self.expand_rule(element)).collect() - }, + Rule::Choice(elements) 
=> + Rule::Choice(elements.iter().map(|element| self.expand_rule(element)).collect()), - Rule::Seq { left, right } => - Rule::Seq { - left: Rc::new(self.expand_rule(left)), - right: Rc::new(self.expand_rule(right)), - }, + Rule::Seq(elements) => + Rule::Seq(elements.iter().map(|element| self.expand_rule(element)).collect()), Rule::Repeat(content) => { let inner_rule = self.expand_rule(content); @@ -46,27 +41,24 @@ impl Expander { self.repeat_count_in_variable += 1; let rule_name = format!("{}_repeat{}", self.variable_name, self.repeat_count_in_variable); let repeat_symbol = Symbol::non_terminal(self.preceding_symbol_count + self.auxiliary_variables.len()); - let rc_symbol = Rc::new(Rule::Symbol(repeat_symbol)); self.existing_repeats.insert(inner_rule.clone(), repeat_symbol); self.auxiliary_variables.push(Variable { name: rule_name, kind: VariableType::Auxiliary, - rule: Rule::Choice { - elements: vec![ - Rule::Seq { - left: rc_symbol.clone(), - right: rc_symbol - }, - inner_rule - ], - }, + rule: Rule::Choice(vec![ + Rule::Seq(vec![ + Rule::Symbol(repeat_symbol), + Rule::Symbol(repeat_symbol), + ]), + inner_rule + ]), }); Rule::Symbol(repeat_symbol) } Rule::Metadata { rule, params } => Rule::Metadata { - rule: Rc::new(self.expand_rule(rule)), + rule: Box::new(self.expand_rule(rule)), params: params.clone() }, @@ -75,7 +67,7 @@ impl Expander { } } -pub(super) fn expand_repeats(mut grammar: ExtractedGrammar) -> ExtractedGrammar { +pub(super) fn expand_repeats(mut grammar: ExtractedSyntaxGrammar) -> ExtractedSyntaxGrammar { let mut expander = Expander { variable_name: String::new(), repeat_count_in_variable: 0, @@ -207,8 +199,8 @@ mod tests { ]); } - fn build_grammar(variables: Vec) -> ExtractedGrammar { - ExtractedGrammar { + fn build_grammar(variables: Vec) -> ExtractedSyntaxGrammar { + ExtractedSyntaxGrammar { variables, extra_tokens: Vec::new(), external_tokens: Vec::new(), diff --git a/src/prepare_grammar/normalize_rules.rs b/src/prepare_grammar/expand_tokens.rs 
similarity index 61% rename from src/prepare_grammar/normalize_rules.rs rename to src/prepare_grammar/expand_tokens.rs index 67177b4f..9cfa819f 100644 --- a/src/prepare_grammar/normalize_rules.rs +++ b/src/prepare_grammar/expand_tokens.rs @@ -1,10 +1,11 @@ use crate::error::{Error, Result}; use crate::rules::Rule; -use crate::grammars::LexicalGrammar; -use crate::nfa::{Nfa, NfaState, NfaCursor, CharacterSet}; +use crate::grammars::{LexicalGrammar, LexicalVariable}; +use crate::nfa::{Nfa, NfaState, CharacterSet}; +use super::{ExtractedLexicalGrammar}; use regex_syntax::ast::{parse, Ast, Class, ClassPerlKind, ClassSet, ClassSetItem, RepetitionKind}; -fn evaluate_perl_class(item: &ClassPerlKind) -> CharacterSet { +fn expand_perl_character_class(item: &ClassPerlKind) -> CharacterSet { match item { ClassPerlKind::Digit => CharacterSet::empty() .add_range('0', '9'), @@ -21,7 +22,7 @@ fn evaluate_perl_class(item: &ClassPerlKind) -> CharacterSet { } } -fn evaluate_character_class(item: &ClassSetItem) -> Result { +fn expand_character_class(item: &ClassSetItem) -> Result { match item { ClassSetItem::Empty(_) => Ok(CharacterSet::Include(Vec::new())), ClassSetItem::Literal(literal) => Ok(CharacterSet::Include(vec![literal.c])), @@ -29,7 +30,7 @@ fn evaluate_character_class(item: &ClassSetItem) -> Result { ClassSetItem::Union(union) => { let mut result = CharacterSet::empty(); for item in &union.items { - result = result.add(evaluate_character_class(&item)?); + result = result.add(expand_character_class(&item)?); } Ok(result) } @@ -37,7 +38,7 @@ fn evaluate_character_class(item: &ClassSetItem) -> Result { } } -fn regex_to_nfa(ast: &Ast, nfa: &mut Nfa, mut next_state_index: u32) -> Result<()> { +fn expand_regex(ast: &Ast, nfa: &mut Nfa, mut next_state_index: u32) -> Result<()> { match ast { Ast::Empty(_) => Ok(()), Ast::Flags(_) => Err(Error::regex("Flags are not supported")), @@ -53,12 +54,12 @@ fn regex_to_nfa(ast: &Ast, nfa: &mut Nfa, mut next_state_index: u32) -> Result<( 
Ast::Class(class) => match class { Class::Unicode(_) => Err(Error::regex("Unicode character classes are not supported")), Class::Perl(class) => { - nfa.states.push(NfaState::Advance(evaluate_perl_class(&class.kind), next_state_index)); + nfa.states.push(NfaState::Advance(expand_perl_character_class(&class.kind), next_state_index)); Ok(()) }, Class::Bracketed(class) => match &class.kind { ClassSet::Item(item) => { - let character_set = evaluate_character_class(&item)?; + let character_set = expand_character_class(&item)?; nfa.states.push(NfaState::Advance(character_set, next_state_index)); Ok(()) }, @@ -69,14 +70,14 @@ fn regex_to_nfa(ast: &Ast, nfa: &mut Nfa, mut next_state_index: u32) -> Result<( }, Ast::Repetition(repetition) => match repetition.op.kind { RepetitionKind::ZeroOrOne => { - regex_to_nfa(&repetition.ast, nfa, next_state_index)?; + expand_regex(&repetition.ast, nfa, next_state_index)?; nfa.prepend(|start_index| NfaState::Split(next_state_index, start_index)); Ok(()) }, RepetitionKind::OneOrMore => { nfa.states.push(NfaState::Accept); // Placeholder for split let split_index = nfa.start_index(); - regex_to_nfa(&repetition.ast, nfa, split_index)?; + expand_regex(&repetition.ast, nfa, split_index)?; nfa.states[split_index as usize] = NfaState::Split( nfa.start_index(), next_state_index @@ -86,7 +87,7 @@ fn regex_to_nfa(ast: &Ast, nfa: &mut Nfa, mut next_state_index: u32) -> Result<( RepetitionKind::ZeroOrMore => { nfa.states.push(NfaState::Accept); // Placeholder for split let split_index = nfa.start_index(); - regex_to_nfa(&repetition.ast, nfa, split_index)?; + expand_regex(&repetition.ast, nfa, split_index)?; nfa.states[split_index as usize] = NfaState::Split( nfa.start_index(), next_state_index @@ -96,11 +97,11 @@ fn regex_to_nfa(ast: &Ast, nfa: &mut Nfa, mut next_state_index: u32) -> Result<( }, RepetitionKind::Range(_) => unimplemented!(), }, - Ast::Group(group) => regex_to_nfa(&group.ast, nfa, nfa.start_index()), + Ast::Group(group) => 
expand_regex(&group.ast, nfa, nfa.start_index()), Ast::Alternation(alternation) => { let mut alternative_start_indices = Vec::new(); for ast in alternation.asts.iter() { - regex_to_nfa(&ast, nfa, next_state_index)?; + expand_regex(&ast, nfa, next_state_index)?; alternative_start_indices.push(nfa.start_index()); } alternative_start_indices.pop(); @@ -111,7 +112,7 @@ fn regex_to_nfa(ast: &Ast, nfa: &mut Nfa, mut next_state_index: u32) -> Result<( }, Ast::Concat(concat) => { for ast in concat.asts.iter().rev() { - regex_to_nfa(&ast, nfa, next_state_index)?; + expand_regex(&ast, nfa, next_state_index)?; next_state_index = nfa.start_index(); } Ok(()) @@ -119,32 +120,77 @@ fn regex_to_nfa(ast: &Ast, nfa: &mut Nfa, mut next_state_index: u32) -> Result<( } } -fn expand_rule(rule: Rule) -> Result { +fn expand_rule(rule: Rule, nfa: &mut Nfa, mut next_state_index: u32) -> Result<()> { match rule { Rule::Pattern(s) => { let ast = parse::Parser::new().parse(&s).map_err(|e| Error::GrammarError(e.to_string()))?; - let mut nfa = Nfa::new(); - regex_to_nfa(&ast, &mut nfa, 0)?; - Ok(nfa) + expand_regex(&ast, nfa, next_state_index)?; + Ok(()) }, Rule::String(s) => { - let mut nfa = Nfa::new(); for c in s.chars().rev() { nfa.prepend(|start_index| NfaState::Advance(CharacterSet::empty().add_char(c), start_index)); } - Ok(nfa) + Ok(()) + }, + Rule::Choice(elements) => { + let mut alternative_start_indices = Vec::new(); + for element in elements { + expand_rule(element, nfa, next_state_index)?; + alternative_start_indices.push(nfa.start_index()); + } + alternative_start_indices.pop(); + for alternative_start_index in alternative_start_indices { + nfa.prepend(|start_index| NfaState::Split(start_index, alternative_start_index)); + } + Ok(()) + }, + Rule::Seq(elements) => { + for element in elements.into_iter().rev() { + expand_rule(element, nfa, next_state_index)?; + next_state_index = nfa.start_index(); + } + Ok(()) + }, + Rule::Repeat(rule) => { + nfa.states.push(NfaState::Accept); // 
Placeholder for split + let split_index = nfa.start_index(); + expand_rule(*rule, nfa, split_index)?; + nfa.states[split_index as usize] = NfaState::Split( + nfa.start_index(), + next_state_index + ); + Ok(()) }, _ => Err(Error::grammar("Unexpected rule type")), } } -pub(super) fn normalize_rules(grammar: LexicalGrammar) -> LexicalGrammar { - unimplemented!(); +pub(super) fn expand_tokens(grammar: ExtractedLexicalGrammar) -> Result { + let mut variables = Vec::new(); + for variable in grammar.variables { + let mut nfa = Nfa::new(); + expand_rule(variable.rule, &mut nfa, 0)?; + variables.push(LexicalVariable { + name: variable.name, + kind: variable.kind, + nfa, + }); + } + let mut separators = Vec::new(); + for separator in grammar.separators { + let mut nfa = Nfa::new(); + expand_rule(separator, &mut nfa, 0)?; + separators.push(nfa); + } + + Ok(LexicalGrammar { variables, separators }) } #[cfg(test)] mod tests { use super::*; + use crate::nfa::NfaCursor; fn simulate_nfa<'a>(nfa: &'a Nfa, s: &'a str) -> Option<&'a str> { let mut result = None; @@ -164,15 +210,15 @@ mod tests { } #[test] - fn test_regex_expansion() { + fn test_rule_expansion() { struct Row { - pattern: &'static str, + rule: Rule, examples: Vec<(&'static str, Option<&'static str>)>, } let table = [ Row { - pattern: "a|bc", + rule: Rule::pattern("a|bc"), examples: vec![ ("a12", Some("a")), ("bc12", Some("bc")), @@ -181,7 +227,7 @@ mod tests { ], }, Row { - pattern: "(a|b|c)d(e|f|g)h?", + rule: Rule::pattern("(a|b|c)d(e|f|g)h?"), examples: vec![ ("ade1", Some("ade")), ("bdf1", Some("bdf")), @@ -190,14 +236,14 @@ mod tests { ], }, Row { - pattern: "a*", + rule: Rule::pattern("a*"), examples: vec![ ("aaa1", Some("aaa")), ("b", Some("")), ], }, Row { - pattern: "a((bc)+|(de)*)f", + rule: Rule::pattern("a((bc)+|(de)*)f"), examples: vec![ ("af1", Some("af")), ("adedef1", Some("adedef")), @@ -206,21 +252,41 @@ mod tests { ], }, Row { - pattern: "[a-fA-F0-9]+", + rule: Rule::pattern("[a-fA-F0-9]+"), examples: 
vec![ ("A1ff0", Some("A1ff")), ], }, Row { - pattern: "\\w\\d\\s", + rule: Rule::pattern("\\w\\d\\s"), examples: vec![ ("_0 ", Some("_0 ")), ], }, + Row { + rule: Rule::string("abc"), + examples: vec![ + ("abcd", Some("abc")), + ("ab", None), + ], + }, + Row { + rule: Rule::repeat(Rule::seq(vec![ + Rule::string("{"), + Rule::pattern("[a-f]+"), + Rule::string("}"), + ])), + examples: vec![ + ("{a}{", Some("{a}")), + ("{a}{d", Some("{a}")), + ("ab", None), + ], + }, ]; - for Row { pattern, examples } in table.iter() { - let nfa = expand_rule(Rule::pattern(pattern)).unwrap(); + for Row { rule, examples } in table.iter() { + let mut nfa = Nfa::new(); + expand_rule(rule.clone(), &mut nfa, 0).unwrap(); for (haystack, needle) in examples.iter() { assert_eq!(simulate_nfa(&nfa, haystack), *needle); } diff --git a/src/prepare_grammar/extract_simple_aliases.rs b/src/prepare_grammar/extract_simple_aliases.rs index 250246f3..2a175242 100644 --- a/src/prepare_grammar/extract_simple_aliases.rs +++ b/src/prepare_grammar/extract_simple_aliases.rs @@ -1,6 +1,5 @@ use crate::rules::AliasMap; use crate::grammars::{LexicalGrammar, SyntaxGrammar}; -use super::ExtractedGrammar; pub(super) fn extract_simple_aliases( syntax_grammar: &mut SyntaxGrammar, diff --git a/src/prepare_grammar/extract_tokens.rs b/src/prepare_grammar/extract_tokens.rs index ee90b3c8..7322516f 100644 --- a/src/prepare_grammar/extract_tokens.rs +++ b/src/prepare_grammar/extract_tokens.rs @@ -3,12 +3,12 @@ use std::rc::Rc; use std::mem; use crate::error::{Error, Result}; use crate::rules::{Rule, MetadataParams, Symbol, SymbolType}; -use crate::grammars::{Variable, VariableType, LexicalGrammar, ExternalToken}; -use super::{InternedGrammar, ExtractedGrammar}; +use crate::grammars::{Variable, ExternalToken}; +use super::{InternedGrammar, ExtractedSyntaxGrammar, ExtractedLexicalGrammar}; pub(super) fn extract_tokens( mut grammar: InternedGrammar -) -> Result<(ExtractedGrammar, LexicalGrammar)> { +) -> 
Result<(ExtractedSyntaxGrammar, ExtractedLexicalGrammar)> { let mut extractor = TokenExtractor { current_variable_name: String::new(), current_variable_token_count: 0, @@ -138,7 +138,7 @@ pub(super) fn extract_tokens( } Ok(( - ExtractedGrammar { + ExtractedSyntaxGrammar { variables, expected_conflicts, extra_tokens, @@ -146,7 +146,7 @@ pub(super) fn extract_tokens( external_tokens, word_token, }, - LexicalGrammar { + ExtractedLexicalGrammar { variables: lexical_variables, separators, } @@ -198,20 +198,19 @@ impl TokenExtractor { } else { Rule::Metadata { params: params.clone(), - rule: Rc::new(self.extract_tokens_in_rule((&rule).clone())) + rule: Box::new(self.extract_tokens_in_rule((&rule).clone())) } } }, Rule::Repeat(content) => Rule::Repeat( - Rc::new(self.extract_tokens_in_rule(content)) + Box::new(self.extract_tokens_in_rule(content)) + ), + Rule::Seq(elements) => Rule::Seq( + elements.iter().map(|e| self.extract_tokens_in_rule(e)).collect() + ), + Rule::Choice(elements) => Rule::Choice( + elements.iter().map(|e| self.extract_tokens_in_rule(e)).collect() ), - Rule::Seq { left, right } => Rule::Seq { - left: Rc::new(self.extract_tokens_in_rule(left)), - right: Rc::new(self.extract_tokens_in_rule(right)), - }, - Rule::Choice { elements } => Rule::Choice { - elements: elements.iter().map(|e| self.extract_tokens_in_rule(e)).collect() - }, _ => input.clone() } } @@ -249,19 +248,18 @@ impl SymbolReplacer { fn replace_symbols_in_rule(&mut self, rule: &Rule) -> Rule { match rule { Rule::Symbol(symbol) => self.replace_symbol(*symbol).into(), - Rule::Choice { elements } => Rule::Choice { - elements: elements.iter().map(|e| self.replace_symbols_in_rule(e)).collect() - }, - Rule::Seq { left, right } => Rule::Seq { - left: Rc::new(self.replace_symbols_in_rule(left)), - right: Rc::new(self.replace_symbols_in_rule(right)), - }, + Rule::Choice(elements) => Rule::Choice( + elements.iter().map(|e| self.replace_symbols_in_rule(e)).collect() + ), + Rule::Seq(elements) => 
Rule::Seq( + elements.iter().map(|e| self.replace_symbols_in_rule(e)).collect() + ), Rule::Repeat(content) => Rule::Repeat( - Rc::new(self.replace_symbols_in_rule(content)) + Box::new(self.replace_symbols_in_rule(content)) ), Rule::Metadata { rule, params } => Rule::Metadata { params: params.clone(), - rule: Rc::new(self.replace_symbols_in_rule(rule)), + rule: Box::new(self.replace_symbols_in_rule(rule)), }, _ => rule.clone() } @@ -290,6 +288,7 @@ impl SymbolReplacer { #[cfg(test)] mod test { use super::*; + use crate::grammars::VariableType; #[test] fn test_extraction() { diff --git a/src/prepare_grammar/flatten_grammar.rs b/src/prepare_grammar/flatten_grammar.rs index 36fe76c9..0f09cd14 100644 --- a/src/prepare_grammar/flatten_grammar.rs +++ b/src/prepare_grammar/flatten_grammar.rs @@ -1,7 +1,7 @@ use crate::error::Result; use crate::grammars::SyntaxGrammar; -use super::ExtractedGrammar; +use super::ExtractedSyntaxGrammar; -pub(super) fn flatten_grammar(grammar: ExtractedGrammar) -> Result { +pub(super) fn flatten_grammar(grammar: ExtractedSyntaxGrammar) -> Result { unimplemented!(); } diff --git a/src/prepare_grammar/intern_symbols.rs b/src/prepare_grammar/intern_symbols.rs index e4cf7ff1..17132262 100644 --- a/src/prepare_grammar/intern_symbols.rs +++ b/src/prepare_grammar/intern_symbols.rs @@ -80,26 +80,26 @@ struct Interner<'a> { impl<'a> Interner<'a> { fn intern_rule(&self, rule: &Rule) -> Result { match rule { - Rule::Choice { elements } => { + Rule::Choice(elements) => { let mut result = Vec::with_capacity(elements.len()); for element in elements { result.push(self.intern_rule(element)?); } - Ok(Rule::Choice { elements: result }) + Ok(Rule::Choice(result)) }, - - Rule::Seq { left, right } => - Ok(Rule::Seq { - left: Rc::new(self.intern_rule(left)?), - right: Rc::new(self.intern_rule(right)?), - }), - - Rule::Repeat(content) => - Ok(Rule::Repeat(Rc::new(self.intern_rule(content)?))), - + Rule::Seq(elements) => { + let mut result = 
Vec::with_capacity(elements.len()); + for element in elements { + result.push(self.intern_rule(element)?); + } + Ok(Rule::Seq(result)) + }, + Rule::Repeat(content) => Ok(Rule::Repeat( + Box::new(self.intern_rule(content)?) + )), Rule::Metadata { rule, params } => Ok(Rule::Metadata { - rule: Rc::new(self.intern_rule(rule)?), + rule: Box::new(self.intern_rule(rule)?), params: params.clone() }), diff --git a/src/prepare_grammar/mod.rs b/src/prepare_grammar/mod.rs index b860807a..e2615479 100644 --- a/src/prepare_grammar/mod.rs +++ b/src/prepare_grammar/mod.rs @@ -2,7 +2,7 @@ mod intern_symbols; mod extract_tokens; mod expand_repeats; mod flatten_grammar; -mod normalize_rules; +mod expand_tokens; mod extract_simple_aliases; use crate::rules::{AliasMap, Rule, Symbol}; @@ -12,7 +12,7 @@ use self::intern_symbols::intern_symbols; use self::extract_tokens::extract_tokens; use self::expand_repeats::expand_repeats; use self::flatten_grammar::flatten_grammar; -use self::normalize_rules::normalize_rules; +use self::expand_tokens::expand_tokens; use self::extract_simple_aliases::extract_simple_aliases; pub(self) struct IntermediateGrammar { @@ -25,7 +25,14 @@ pub(self) struct IntermediateGrammar { } pub(self) type InternedGrammar = IntermediateGrammar; -pub(self) type ExtractedGrammar = IntermediateGrammar; + +pub(self) type ExtractedSyntaxGrammar = IntermediateGrammar; + +#[derive(Debug, PartialEq, Eq)] +pub(self) struct ExtractedLexicalGrammar { + variables: Vec, + separators: Vec, +} pub(crate) fn prepare_grammar( input_grammar: &InputGrammar @@ -34,7 +41,7 @@ pub(crate) fn prepare_grammar( let (syntax_grammar, lexical_grammar) = extract_tokens(interned_grammar)?; let syntax_grammar = expand_repeats(syntax_grammar); let mut syntax_grammar = flatten_grammar(syntax_grammar)?; - let mut lexical_grammar = normalize_rules(lexical_grammar); + let mut lexical_grammar = expand_tokens(lexical_grammar)?; let simple_aliases = extract_simple_aliases(&mut syntax_grammar, &mut 
lexical_grammar); Ok((syntax_grammar, lexical_grammar, simple_aliases)) } diff --git a/src/rules.rs b/src/rules.rs index b593496a..c6f18cf4 100644 --- a/src/rules.rs +++ b/src/rules.rs @@ -49,18 +49,13 @@ pub(crate) enum Rule { Pattern(String), NamedSymbol(String), Symbol(Symbol), - Choice { - elements: Vec, - }, + Choice(Vec), Metadata { params: MetadataParams, - rule: Rc, + rule: Box, }, - Repeat(Rc), - Seq { - left: Rc, - right: Rc, - } + Repeat(Box), + Seq(Vec), } impl Rule { @@ -98,7 +93,7 @@ impl Rule { } pub fn repeat(rule: Rule) -> Self { - Rule::Repeat(Rc::new(rule)) + Rule::Repeat(Box::new(rule)) } pub fn choice(rules: Vec) -> Self { @@ -106,32 +101,11 @@ impl Rule { for rule in rules { choice_helper(&mut elements, rule); } - Rule::Choice { elements } + Rule::Choice(elements) } pub fn seq(rules: Vec) -> Self { - let mut result = Rule::Blank; - for rule in rules { - match rule { - Rule::Blank => continue, - Rule::Metadata { rule, params: _ } => { - if *rule == Rule::Blank { - continue; - } - }, - _ => { - if result == Rule::Blank { - result = rule; - } else { - result = Rule::Seq { - left: Rc::new(result), - right: Rc::new(rule), - } - } - } - } - } - result + Rule::Seq(rules) } pub fn terminal(index: usize) -> Self { @@ -196,14 +170,14 @@ fn add_metadata(input: Rule, f: T) -> Rule { _ => { let mut params = MetadataParams::default(); f(&mut params); - Rule::Metadata { rule: Rc::new(input), params } + Rule::Metadata { rule: Box::new(input), params } } } } fn choice_helper(result: &mut Vec, rule: Rule) { match rule { - Rule::Choice {elements} => { + Rule::Choice(elements) => { for element in elements { choice_helper(result, element); } From b0a7c854a4939915703980c229093e70147a1615 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Mon, 10 Dec 2018 14:57:46 -0800 Subject: [PATCH 053/208] Avoid redundant regex complication when instantiating PropertySheets --- src/lib.rs | 39 +++++++++++++++++++++++++-------------- 1 file changed, 25 insertions(+), 14 
deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 724a08bd..d70dc607 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -48,7 +48,7 @@ pub struct InputEdit { struct PropertyTransition { state_id: usize, child_index: Option, - text_regex: Option, + text_regex_index: Option, } struct PropertyState { @@ -66,6 +66,7 @@ pub enum PropertySheetError { pub struct PropertySheet> { states: Vec, property_sets: Vec

, + text_regexes: Vec, } pub struct Node<'a>(ffi::TSNode, PhantomData<&'a ()>); @@ -615,11 +616,11 @@ impl<'a, P: DeserializeOwned> TreePropertyCursor<'a, P> { .get(&node_kind_id) .and_then(|transitions| { for transition in transitions.iter() { - if let Some(text_regex) = transition.text_regex.as_ref() { + if let Some(text_regex_index) = transition.text_regex_index { let node = self.cursor.node(); let text = &self.source.as_bytes()[node.start_byte()..node.end_byte()]; if let Ok(text) = str::from_utf8(text) { - if !text_regex.is_match(text) { + if !self.property_sheet.text_regexes[text_regex_index].is_match(text) { continue; } } @@ -699,28 +700,37 @@ impl PropertySheet

{ } let input: PropertySheetJSON

= serde_json::from_str(json) - .map_err(|e| PropertySheetError::InvalidJSON(e))?; + .map_err(PropertySheetError::InvalidJSON)?; let mut states = Vec::new(); + let mut text_regexes = Vec::new(); + let mut text_regex_patterns = Vec::new(); for state in input.states.iter() { let mut transitions = HashMap::new(); let node_kind_count = language.node_kind_count(); for transition in state.transitions.iter() { - for i in 0..node_kind_count { - let i = i as u16; - if language.node_kind_is_named(i) == transition.named - && transition.kind == language.node_kind_for_id(i) + let text_regex_index = if let Some(regex_pattern) = transition.text.as_ref() { + if let Some(index) = text_regex_patterns.iter().position(|r| *r == regex_pattern) { + Some(index) + } else { + text_regex_patterns.push(regex_pattern); + text_regexes.push(Regex::new(®ex_pattern).map_err(PropertySheetError::InvalidRegex)?); + Some(text_regexes.len() - 1) + } + } else { + None + }; + + for i in 0..(node_kind_count as u16) { + if + transition.kind == language.node_kind_for_id(i) && + transition.named == language.node_kind_is_named(i) { let entry = transitions.entry(i).or_insert(Vec::new()); - let text_regex = if let Some(text) = transition.text.as_ref() { - Some(Regex::new(&text).map_err(|e| PropertySheetError::InvalidRegex(e))?) - } else { - None - }; entry.push(PropertyTransition { child_index: transition.index, state_id: transition.state_id, - text_regex + text_regex_index, }); } } @@ -734,6 +744,7 @@ impl PropertySheet

{ Ok(Self { property_sets: input.property_sets, states, + text_regexes, }) } } From ba9da0a9b48dd7d374438eece53749061453fefe Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Tue, 11 Dec 2018 10:35:03 -0800 Subject: [PATCH 054/208] 0.3.4 --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index f61b1583..fde4fd31 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "tree-sitter" description = "Rust bindings to the Tree-sitter parsing library" -version = "0.3.3" +version = "0.3.4" authors = ["Max Brunsfeld "] build = "build.rs" license = "MIT" From 7acfb2b74e5ba3d66aff67d9afb698add9cb8708 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Tue, 11 Dec 2018 12:14:34 -0800 Subject: [PATCH 055/208] Implement flatten_grammar --- src/grammars.rs | 19 +- src/prepare_grammar/expand_repeats.rs | 221 +++++++++-------- src/prepare_grammar/expand_tokens.rs | 119 +++++---- src/prepare_grammar/extract_tokens.rs | 327 +++++++++++++------------ src/prepare_grammar/flatten_grammar.rs | 312 ++++++++++++++++++++++- src/prepare_grammar/intern_symbols.rs | 137 ++++++----- src/prepare_grammar/mod.rs | 20 +- src/rules.rs | 8 +- 8 files changed, 773 insertions(+), 390 deletions(-) diff --git a/src/grammars.rs b/src/grammars.rs index c5e9aaa1..3b3d47f7 100644 --- a/src/grammars.rs +++ b/src/grammars.rs @@ -52,7 +52,6 @@ pub(crate) struct ProductionStep { pub precedence: i32, pub associativity: Option, pub alias: Option, - pub is_excluded: bool, } #[derive(Clone, Debug, PartialEq, Eq)] @@ -65,6 +64,7 @@ pub(crate) struct Production { pub(crate) struct SyntaxVariable { pub name: String, pub kind: VariableType, + pub productions: Vec, } #[derive(Clone, Debug, PartialEq, Eq)] @@ -81,7 +81,22 @@ pub(crate) struct SyntaxGrammar { pub expected_conflicts: Vec>, pub external_tokens: Vec, pub variables_to_inline: Vec, - pub word_token: Symbol, + pub word_token: Option, +} + +impl ProductionStep { + pub(crate) fn new(symbol: 
Symbol) -> Self { + Self { symbol, precedence: 0, associativity: None, alias: None } + } + + pub(crate) fn with_prec(self, precedence: i32, associativity: Option) -> Self { + Self { + symbol: self.symbol, + precedence, + associativity, + alias: self.alias, + } + } } impl Variable { diff --git a/src/prepare_grammar/expand_repeats.rs b/src/prepare_grammar/expand_repeats.rs index 85f37c80..f3811c5f 100644 --- a/src/prepare_grammar/expand_repeats.rs +++ b/src/prepare_grammar/expand_repeats.rs @@ -1,16 +1,15 @@ -use crate::rules::{Rule, Symbol}; +use super::ExtractedSyntaxGrammar; use crate::grammars::{Variable, VariableType}; +use crate::rules::{Rule, Symbol}; use std::collections::HashMap; use std::mem; -use std::rc::Rc; -use super::ExtractedSyntaxGrammar; struct Expander { variable_name: String, repeat_count_in_variable: usize, preceding_symbol_count: usize, auxiliary_variables: Vec, - existing_repeats: HashMap + existing_repeats: HashMap, } impl Expander { @@ -25,11 +24,19 @@ impl Expander { fn expand_rule(&mut self, rule: &Rule) -> Rule { match rule { - Rule::Choice(elements) => - Rule::Choice(elements.iter().map(|element| self.expand_rule(element)).collect()), + Rule::Choice(elements) => Rule::Choice( + elements + .iter() + .map(|element| self.expand_rule(element)) + .collect(), + ), - Rule::Seq(elements) => - Rule::Seq(elements.iter().map(|element| self.expand_rule(element)).collect()), + Rule::Seq(elements) => Rule::Seq( + elements + .iter() + .map(|element| self.expand_rule(element)) + .collect(), + ), Rule::Repeat(content) => { let inner_rule = self.expand_rule(content); @@ -39,9 +46,15 @@ impl Expander { } self.repeat_count_in_variable += 1; - let rule_name = format!("{}_repeat{}", self.variable_name, self.repeat_count_in_variable); - let repeat_symbol = Symbol::non_terminal(self.preceding_symbol_count + self.auxiliary_variables.len()); - self.existing_repeats.insert(inner_rule.clone(), repeat_symbol); + let rule_name = format!( + "{}_repeat{}", + 
self.variable_name, self.repeat_count_in_variable + ); + let repeat_symbol = Symbol::non_terminal( + self.preceding_symbol_count + self.auxiliary_variables.len(), + ); + self.existing_repeats + .insert(inner_rule.clone(), repeat_symbol); self.auxiliary_variables.push(Variable { name: rule_name, kind: VariableType::Auxiliary, @@ -50,7 +63,7 @@ impl Expander { Rule::Symbol(repeat_symbol), Rule::Symbol(repeat_symbol), ]), - inner_rule + inner_rule, ]), }); @@ -59,10 +72,10 @@ impl Expander { Rule::Metadata { rule, params } => Rule::Metadata { rule: Box::new(self.expand_rule(rule)), - params: params.clone() + params: params.clone(), }, - _ => rule.clone() + _ => rule.clone(), } } } @@ -80,7 +93,9 @@ pub(super) fn expand_repeats(mut grammar: ExtractedSyntaxGrammar) -> ExtractedSy expander.expand_variable(&mut variable); } - grammar.variables.extend(expander.auxiliary_variables.into_iter()); + grammar + .variables + .extend(expander.auxiliary_variables.into_iter()); grammar } @@ -91,112 +106,126 @@ mod tests { #[test] fn test_basic_repeat_expansion() { // Repeats nested inside of sequences and choices are expanded. 
- let grammar = expand_repeats(build_grammar(vec![ - Variable::named("rule0", Rule::seq(vec![ + let grammar = expand_repeats(build_grammar(vec![Variable::named( + "rule0", + Rule::seq(vec![ Rule::terminal(10), Rule::choice(vec![ Rule::repeat(Rule::terminal(11)), Rule::repeat(Rule::terminal(12)), ]), Rule::terminal(13), - ])), - ])); + ]), + )])); - assert_eq!(grammar.variables, vec![ - Variable::named("rule0", Rule::seq(vec![ - Rule::terminal(10), - Rule::choice(vec![ - Rule::non_terminal(1), - Rule::non_terminal(2), - ]), - Rule::terminal(13), - ])), - Variable::auxiliary("rule0_repeat1", Rule::choice(vec![ - Rule::seq(vec![ - Rule::non_terminal(1), - Rule::non_terminal(1), - ]), - Rule::terminal(11), - ])), - Variable::auxiliary("rule0_repeat2", Rule::choice(vec![ - Rule::seq(vec![ - Rule::non_terminal(2), - Rule::non_terminal(2), - ]), - Rule::terminal(12), - ])), - ]); + assert_eq!( + grammar.variables, + vec![ + Variable::named( + "rule0", + Rule::seq(vec![ + Rule::terminal(10), + Rule::choice(vec![Rule::non_terminal(1), Rule::non_terminal(2),]), + Rule::terminal(13), + ]) + ), + Variable::auxiliary( + "rule0_repeat1", + Rule::choice(vec![ + Rule::seq(vec![Rule::non_terminal(1), Rule::non_terminal(1),]), + Rule::terminal(11), + ]) + ), + Variable::auxiliary( + "rule0_repeat2", + Rule::choice(vec![ + Rule::seq(vec![Rule::non_terminal(2), Rule::non_terminal(2),]), + Rule::terminal(12), + ]) + ), + ] + ); } #[test] fn test_repeat_deduplication() { // Terminal 4 appears inside of a repeat in three different places. 
let grammar = expand_repeats(build_grammar(vec![ - Variable::named("rule0", Rule::choice(vec![ - Rule::seq(vec![ Rule::terminal(1), Rule::repeat(Rule::terminal(4)) ]), - Rule::seq(vec![ Rule::terminal(2), Rule::repeat(Rule::terminal(4)) ]), - ])), - Variable::named("rule1", Rule::seq(vec![ - Rule::terminal(3), - Rule::repeat(Rule::terminal(4)), - ])), + Variable::named( + "rule0", + Rule::choice(vec![ + Rule::seq(vec![Rule::terminal(1), Rule::repeat(Rule::terminal(4))]), + Rule::seq(vec![Rule::terminal(2), Rule::repeat(Rule::terminal(4))]), + ]), + ), + Variable::named( + "rule1", + Rule::seq(vec![Rule::terminal(3), Rule::repeat(Rule::terminal(4))]), + ), ])); // Only one auxiliary rule is created for repeating terminal 4. - assert_eq!(grammar.variables, vec![ - Variable::named("rule0", Rule::choice(vec![ - Rule::seq(vec![ Rule::terminal(1), Rule::non_terminal(2) ]), - Rule::seq(vec![ Rule::terminal(2), Rule::non_terminal(2) ]), - ])), - Variable::named("rule1", Rule::seq(vec![ - Rule::terminal(3), - Rule::non_terminal(2), - ])), - Variable::auxiliary("rule0_repeat1", Rule::choice(vec![ - Rule::seq(vec![ - Rule::non_terminal(2), - Rule::non_terminal(2), - ]), - Rule::terminal(4), - ])) - ]); + assert_eq!( + grammar.variables, + vec![ + Variable::named( + "rule0", + Rule::choice(vec![ + Rule::seq(vec![Rule::terminal(1), Rule::non_terminal(2)]), + Rule::seq(vec![Rule::terminal(2), Rule::non_terminal(2)]), + ]) + ), + Variable::named( + "rule1", + Rule::seq(vec![Rule::terminal(3), Rule::non_terminal(2),]) + ), + Variable::auxiliary( + "rule0_repeat1", + Rule::choice(vec![ + Rule::seq(vec![Rule::non_terminal(2), Rule::non_terminal(2),]), + Rule::terminal(4), + ]) + ) + ] + ); } #[test] fn test_expansion_of_nested_repeats() { - let grammar = expand_repeats(build_grammar(vec![ - Variable::named("rule0", Rule::seq(vec![ + let grammar = expand_repeats(build_grammar(vec![Variable::named( + "rule0", + Rule::seq(vec![ Rule::terminal(10), Rule::repeat(Rule::seq(vec![ 
Rule::terminal(11), - Rule::repeat(Rule::terminal(12)) + Rule::repeat(Rule::terminal(12)), ])), - ])), - ])); + ]), + )])); - assert_eq!(grammar.variables, vec![ - Variable::named("rule0", Rule::seq(vec![ - Rule::terminal(10), - Rule::non_terminal(2), - ])), - Variable::auxiliary("rule0_repeat1", Rule::choice(vec![ - Rule::seq(vec![ - Rule::non_terminal(1), - Rule::non_terminal(1), - ]), - Rule::terminal(12), - ])), - Variable::auxiliary("rule0_repeat2", Rule::choice(vec![ - Rule::seq(vec![ - Rule::non_terminal(2), - Rule::non_terminal(2), - ]), - Rule::seq(vec![ - Rule::terminal(11), - Rule::non_terminal(1), - ]), - ])), - ]); + assert_eq!( + grammar.variables, + vec![ + Variable::named( + "rule0", + Rule::seq(vec![Rule::terminal(10), Rule::non_terminal(2),]) + ), + Variable::auxiliary( + "rule0_repeat1", + Rule::choice(vec![ + Rule::seq(vec![Rule::non_terminal(1), Rule::non_terminal(1),]), + Rule::terminal(12), + ]) + ), + Variable::auxiliary( + "rule0_repeat2", + Rule::choice(vec![ + Rule::seq(vec![Rule::non_terminal(2), Rule::non_terminal(2),]), + Rule::seq(vec![Rule::terminal(11), Rule::non_terminal(1),]), + ]) + ), + ] + ); } fn build_grammar(variables: Vec) -> ExtractedSyntaxGrammar { diff --git a/src/prepare_grammar/expand_tokens.rs b/src/prepare_grammar/expand_tokens.rs index 9cfa819f..e0e1f9a9 100644 --- a/src/prepare_grammar/expand_tokens.rs +++ b/src/prepare_grammar/expand_tokens.rs @@ -1,14 +1,13 @@ +use super::ExtractedLexicalGrammar; use crate::error::{Error, Result}; -use crate::rules::Rule; use crate::grammars::{LexicalGrammar, LexicalVariable}; -use crate::nfa::{Nfa, NfaState, CharacterSet}; -use super::{ExtractedLexicalGrammar}; +use crate::nfa::{CharacterSet, Nfa, NfaState}; +use crate::rules::Rule; use regex_syntax::ast::{parse, Ast, Class, ClassPerlKind, ClassSet, ClassSetItem, RepetitionKind}; fn expand_perl_character_class(item: &ClassPerlKind) -> CharacterSet { match item { - ClassPerlKind::Digit => CharacterSet::empty() - .add_range('0', 
'9'), + ClassPerlKind::Digit => CharacterSet::empty().add_range('0', '9'), ClassPerlKind::Space => CharacterSet::empty() .add_char(' ') .add_char('\t') @@ -18,7 +17,7 @@ fn expand_perl_character_class(item: &ClassPerlKind) -> CharacterSet { .add_char('_') .add_range('A', 'Z') .add_range('a', 'z') - .add_range('0', '9') + .add_range('0', '9'), } } @@ -26,7 +25,9 @@ fn expand_character_class(item: &ClassSetItem) -> Result { match item { ClassSetItem::Empty(_) => Ok(CharacterSet::Include(Vec::new())), ClassSetItem::Literal(literal) => Ok(CharacterSet::Include(vec![literal.c])), - ClassSetItem::Range(range) => Ok(CharacterSet::empty().add_range(range.start.c, range.end.c)), + ClassSetItem::Range(range) => { + Ok(CharacterSet::empty().add_range(range.start.c, range.end.c)) + } ClassSetItem::Union(union) => { let mut result = CharacterSet::empty(); for item in &union.items { @@ -43,58 +44,64 @@ fn expand_regex(ast: &Ast, nfa: &mut Nfa, mut next_state_index: u32) -> Result<( Ast::Empty(_) => Ok(()), Ast::Flags(_) => Err(Error::regex("Flags are not supported")), Ast::Literal(literal) => { - nfa.states.push(NfaState::Advance(CharacterSet::Include(vec![literal.c]), next_state_index)); + nfa.states.push(NfaState::Advance( + CharacterSet::Include(vec![literal.c]), + next_state_index, + )); Ok(()) - }, + } Ast::Dot(_) => { - nfa.states.push(NfaState::Advance(CharacterSet::Exclude(vec!['\n']), next_state_index)); + nfa.states.push(NfaState::Advance( + CharacterSet::Exclude(vec!['\n']), + next_state_index, + )); Ok(()) - }, + } Ast::Assertion(_) => Err(Error::regex("Assertions are not supported")), Ast::Class(class) => match class { Class::Unicode(_) => Err(Error::regex("Unicode character classes are not supported")), Class::Perl(class) => { - nfa.states.push(NfaState::Advance(expand_perl_character_class(&class.kind), next_state_index)); + nfa.states.push(NfaState::Advance( + expand_perl_character_class(&class.kind), + next_state_index, + )); Ok(()) - }, + } 
Class::Bracketed(class) => match &class.kind { ClassSet::Item(item) => { let character_set = expand_character_class(&item)?; - nfa.states.push(NfaState::Advance(character_set, next_state_index)); + nfa.states + .push(NfaState::Advance(character_set, next_state_index)); Ok(()) - }, - ClassSet::BinaryOp(_) => { - Err(Error::regex("Binary operators in character classes aren't supported")) } - } + ClassSet::BinaryOp(_) => Err(Error::regex( + "Binary operators in character classes aren't supported", + )), + }, }, Ast::Repetition(repetition) => match repetition.op.kind { RepetitionKind::ZeroOrOne => { expand_regex(&repetition.ast, nfa, next_state_index)?; nfa.prepend(|start_index| NfaState::Split(next_state_index, start_index)); Ok(()) - }, + } RepetitionKind::OneOrMore => { nfa.states.push(NfaState::Accept); // Placeholder for split let split_index = nfa.start_index(); expand_regex(&repetition.ast, nfa, split_index)?; - nfa.states[split_index as usize] = NfaState::Split( - nfa.start_index(), - next_state_index - ); + nfa.states[split_index as usize] = + NfaState::Split(nfa.start_index(), next_state_index); Ok(()) - }, + } RepetitionKind::ZeroOrMore => { nfa.states.push(NfaState::Accept); // Placeholder for split let split_index = nfa.start_index(); expand_regex(&repetition.ast, nfa, split_index)?; - nfa.states[split_index as usize] = NfaState::Split( - nfa.start_index(), - next_state_index - ); + nfa.states[split_index as usize] = + NfaState::Split(nfa.start_index(), next_state_index); nfa.prepend(|start_index| NfaState::Split(start_index, next_state_index)); Ok(()) - }, + } RepetitionKind::Range(_) => unimplemented!(), }, Ast::Group(group) => expand_regex(&group.ast, nfa, nfa.start_index()), @@ -109,7 +116,7 @@ fn expand_regex(ast: &Ast, nfa: &mut Nfa, mut next_state_index: u32) -> Result<( nfa.prepend(|start_index| NfaState::Split(start_index, alternative_start_index)); } Ok(()) - }, + } Ast::Concat(concat) => { for ast in concat.asts.iter().rev() { expand_regex(&ast, 
nfa, next_state_index)?; @@ -123,16 +130,20 @@ fn expand_regex(ast: &Ast, nfa: &mut Nfa, mut next_state_index: u32) -> Result<( fn expand_rule(rule: Rule, nfa: &mut Nfa, mut next_state_index: u32) -> Result<()> { match rule { Rule::Pattern(s) => { - let ast = parse::Parser::new().parse(&s).map_err(|e| Error::GrammarError(e.to_string()))?; + let ast = parse::Parser::new() + .parse(&s) + .map_err(|e| Error::GrammarError(e.to_string()))?; expand_regex(&ast, nfa, next_state_index)?; Ok(()) - }, + } Rule::String(s) => { for c in s.chars().rev() { - nfa.prepend(|start_index| NfaState::Advance(CharacterSet::empty().add_char(c), start_index)); + nfa.prepend(|start_index| { + NfaState::Advance(CharacterSet::empty().add_char(c), start_index) + }); } Ok(()) - }, + } Rule::Choice(elements) => { let mut alternative_start_indices = Vec::new(); for element in elements { @@ -144,24 +155,21 @@ fn expand_rule(rule: Rule, nfa: &mut Nfa, mut next_state_index: u32) -> Result<( nfa.prepend(|start_index| NfaState::Split(start_index, alternative_start_index)); } Ok(()) - }, + } Rule::Seq(elements) => { for element in elements.into_iter().rev() { expand_rule(element, nfa, next_state_index)?; next_state_index = nfa.start_index(); } Ok(()) - }, + } Rule::Repeat(rule) => { nfa.states.push(NfaState::Accept); // Placeholder for split let split_index = nfa.start_index(); expand_rule(*rule, nfa, split_index)?; - nfa.states[split_index as usize] = NfaState::Split( - nfa.start_index(), - next_state_index - ); + nfa.states[split_index as usize] = NfaState::Split(nfa.start_index(), next_state_index); Ok(()) - }, + } _ => Err(Error::grammar("Unexpected rule type")), } } @@ -184,7 +192,10 @@ pub(super) fn expand_tokens(grammar: ExtractedLexicalGrammar) -> Result Result<(ExtractedSyntaxGrammar, ExtractedLexicalGrammar)> { let mut extractor = TokenExtractor { current_variable_name: String::new(), @@ -40,9 +39,15 @@ pub(super) fn extract_tokens( // variable in the lexical grammar. 
Symbols that pointed to later variables // will need to have their indices decremented. let mut variables = Vec::new(); - let mut symbol_replacer = SymbolReplacer { replacements: HashMap::new() }; + let mut symbol_replacer = SymbolReplacer { + replacements: HashMap::new(), + }; for (i, variable) in grammar.variables.into_iter().enumerate() { - if let Rule::Symbol(Symbol { kind: SymbolType::Terminal, index }) = variable.rule { + if let Rule::Symbol(Symbol { + kind: SymbolType::Terminal, + index, + }) = variable.rule + { if i > 0 && extractor.extracted_usage_counts[index] == 1 { let mut lexical_variable = &mut lexical_variables[index]; lexical_variable.kind = variable.kind; @@ -58,16 +63,19 @@ pub(super) fn extract_tokens( variable.rule = symbol_replacer.replace_symbols_in_rule(&variable.rule); } - let expected_conflicts = grammar.expected_conflicts + let expected_conflicts = grammar + .expected_conflicts .into_iter() - .map(|conflict| + .map(|conflict| { conflict .iter() .map(|symbol| symbol_replacer.replace_symbol(*symbol)) .collect() - ).collect(); + }) + .collect(); - let variables_to_inline = grammar.variables_to_inline + let variables_to_inline = grammar + .variables_to_inline .into_iter() .map(|symbol| symbol_replacer.replace_symbol(symbol)) .collect(); @@ -149,7 +157,7 @@ pub(super) fn extract_tokens( ExtractedLexicalGrammar { variables: lexical_variables, separators, - } + }, )) } @@ -161,7 +169,7 @@ struct TokenExtractor { } struct SymbolReplacer { - replacements: HashMap + replacements: HashMap, } impl TokenExtractor { @@ -198,20 +206,24 @@ impl TokenExtractor { } else { Rule::Metadata { params: params.clone(), - rule: Box::new(self.extract_tokens_in_rule((&rule).clone())) + rule: Box::new(self.extract_tokens_in_rule((&rule).clone())), } } - }, - Rule::Repeat(content) => Rule::Repeat( - Box::new(self.extract_tokens_in_rule(content)) - ), + } + Rule::Repeat(content) => Rule::Repeat(Box::new(self.extract_tokens_in_rule(content))), Rule::Seq(elements) => 
Rule::Seq( - elements.iter().map(|e| self.extract_tokens_in_rule(e)).collect() + elements + .iter() + .map(|e| self.extract_tokens_in_rule(e)) + .collect(), ), Rule::Choice(elements) => Rule::Choice( - elements.iter().map(|e| self.extract_tokens_in_rule(e)).collect() + elements + .iter() + .map(|e| self.extract_tokens_in_rule(e)) + .collect(), ), - _ => input.clone() + _ => input.clone(), } } @@ -219,7 +231,7 @@ impl TokenExtractor { for (i, variable) in self.extracted_variables.iter_mut().enumerate() { if variable.rule == *rule { self.extracted_usage_counts[i] += 1; - return Symbol::terminal(i) + return Symbol::terminal(i); } } @@ -231,10 +243,9 @@ impl TokenExtractor { Variable::auxiliary( &format!( "{}_token{}", - &self.current_variable_name, - self.current_variable_token_count + &self.current_variable_name, self.current_variable_token_count ), - rule.clone() + rule.clone(), ) }; @@ -249,25 +260,29 @@ impl SymbolReplacer { match rule { Rule::Symbol(symbol) => self.replace_symbol(*symbol).into(), Rule::Choice(elements) => Rule::Choice( - elements.iter().map(|e| self.replace_symbols_in_rule(e)).collect() + elements + .iter() + .map(|e| self.replace_symbols_in_rule(e)) + .collect(), ), Rule::Seq(elements) => Rule::Seq( - elements.iter().map(|e| self.replace_symbols_in_rule(e)).collect() - ), - Rule::Repeat(content) => Rule::Repeat( - Box::new(self.replace_symbols_in_rule(content)) + elements + .iter() + .map(|e| self.replace_symbols_in_rule(e)) + .collect(), ), + Rule::Repeat(content) => Rule::Repeat(Box::new(self.replace_symbols_in_rule(content))), Rule::Metadata { rule, params } => Rule::Metadata { params: params.clone(), rule: Box::new(self.replace_symbols_in_rule(rule)), }, - _ => rule.clone() + _ => rule.clone(), } } fn replace_symbol(&self, symbol: Symbol) -> Symbol { if !symbol.is_non_terminal() { - return symbol + return symbol; } if let Some(replacement) = self.replacements.get(&symbol.index) { @@ -293,81 +308,95 @@ mod test { #[test] fn test_extraction() 
{ let (syntax_grammar, lexical_grammar) = extract_tokens(build_grammar(vec![ - Variable::named("rule_0", Rule::repeat(Rule::seq(vec![ - Rule::string("a"), - Rule::pattern("b"), - Rule::choice(vec![ - Rule::non_terminal(1), - Rule::non_terminal(2), - Rule::token(Rule::repeat(Rule::choice(vec![ - Rule::string("c"), - Rule::string("d"), - ]))) - ]) - ]))), + Variable::named( + "rule_0", + Rule::repeat(Rule::seq(vec![ + Rule::string("a"), + Rule::pattern("b"), + Rule::choice(vec![ + Rule::non_terminal(1), + Rule::non_terminal(2), + Rule::token(Rule::repeat(Rule::choice(vec![ + Rule::string("c"), + Rule::string("d"), + ]))), + ]), + ])), + ), Variable::named("rule_1", Rule::pattern("e")), Variable::named("rule_2", Rule::pattern("b")), - Variable::named("rule_3", Rule::seq(vec![ - Rule::non_terminal(2), - Rule::Blank, - ])), - ])).unwrap(); + Variable::named( + "rule_3", + Rule::seq(vec![Rule::non_terminal(2), Rule::Blank]), + ), + ])) + .unwrap(); - assert_eq!(syntax_grammar.variables, vec![ - Variable::named("rule_0", Rule::repeat(Rule::seq(vec![ - // The string "a" was replaced by a symbol referencing the lexical grammar - Rule::terminal(0), + assert_eq!( + syntax_grammar.variables, + vec![ + Variable::named( + "rule_0", + Rule::repeat(Rule::seq(vec![ + // The string "a" was replaced by a symbol referencing the lexical grammar + Rule::terminal(0), + // The pattern "b" was replaced by a symbol referencing the lexical grammar + Rule::terminal(1), + Rule::choice(vec![ + // The symbol referencing `rule_1` was replaced by a symbol referencing + // the lexical grammar. + Rule::terminal(3), + // The symbol referencing `rule_2` had its index decremented because + // `rule_1` was moved to the lexical grammar. + Rule::non_terminal(1), + // The rule wrapped in `token` was replaced by a symbol referencing + // the lexical grammar. 
+ Rule::terminal(2), + ]) + ])) + ), + // The pattern "e" was only used in once place: as the definition of `rule_1`, + // so that rule was moved to the lexical grammar. The pattern "b" appeared in + // two places, so it was not moved into the lexical grammar. + Variable::named("rule_2", Rule::terminal(1)), + Variable::named( + "rule_3", + Rule::seq(vec![Rule::non_terminal(1), Rule::Blank,]) + ), + ] + ); - // The pattern "b" was replaced by a symbol referencing the lexical grammar - Rule::terminal(1), - Rule::choice(vec![ - // The symbol referencing `rule_1` was replaced by a symbol referencing - // the lexical grammar. - Rule::terminal(3), - - // The symbol referencing `rule_2` had its index decremented because - // `rule_1` was moved to the lexical grammar. - Rule::non_terminal(1), - - // The rule wrapped in `token` was replaced by a symbol referencing - // the lexical grammar. - Rule::terminal(2), - ]) - ]))), - - // The pattern "e" was only used in once place: as the definition of `rule_1`, - // so that rule was moved to the lexical grammar. The pattern "b" appeared in - // two places, so it was not moved into the lexical grammar. 
- Variable::named("rule_2", Rule::terminal(1)), - Variable::named("rule_3", Rule::seq(vec![ - Rule::non_terminal(1), - Rule::Blank, - ])), - ]); - - assert_eq!(lexical_grammar.variables, vec![ - Variable::anonymous("a", Rule::string("a")), - Variable::auxiliary("rule_0_token1", Rule::pattern("b")), - Variable::auxiliary("rule_0_token2", Rule::repeat(Rule::choice(vec![ - Rule::string("c"), - Rule::string("d"), - ]))), - Variable::named("rule_1", Rule::pattern("e")), - ]); + assert_eq!( + lexical_grammar.variables, + vec![ + Variable::anonymous("a", Rule::string("a")), + Variable::auxiliary("rule_0_token1", Rule::pattern("b")), + Variable::auxiliary( + "rule_0_token2", + Rule::repeat(Rule::choice(vec![Rule::string("c"), Rule::string("d"),])) + ), + Variable::named("rule_1", Rule::pattern("e")), + ] + ); } #[test] fn test_start_rule_is_token() { - let (syntax_grammar, lexical_grammar) = extract_tokens(build_grammar(vec![ - Variable::named("rule_0", Rule::string("hello")), - ])).unwrap(); + let (syntax_grammar, lexical_grammar) = + extract_tokens(build_grammar(vec![Variable::named( + "rule_0", + Rule::string("hello"), + )])) + .unwrap(); - assert_eq!(syntax_grammar.variables, vec![ - Variable::named("rule_0", Rule::terminal(0)), - ]); - assert_eq!(lexical_grammar.variables, vec![ - Variable::anonymous("hello", Rule::string("hello")), - ]) + assert_eq!( + syntax_grammar.variables, + vec![Variable::named("rule_0", Rule::terminal(0)),] + ); + assert_eq!( + lexical_grammar.variables, + vec![Variable::anonymous("hello", Rule::string("hello")),] + ) } #[test] @@ -376,29 +405,25 @@ mod test { Variable::named("rule_0", Rule::string("x")), Variable::named("comment", Rule::pattern("//.*")), ]); - grammar.extra_tokens = vec![ - Rule::string(" "), - Rule::non_terminal(1), - ]; + grammar.extra_tokens = vec![Rule::string(" "), Rule::non_terminal(1)]; let (syntax_grammar, lexical_grammar) = extract_tokens(grammar).unwrap(); - assert_eq!(syntax_grammar.extra_tokens, vec![ - 
Symbol::terminal(1), - ]); - assert_eq!(lexical_grammar.separators, vec![ - Rule::string(" "), - ]); + assert_eq!(syntax_grammar.extra_tokens, vec![Symbol::terminal(1),]); + assert_eq!(lexical_grammar.separators, vec![Rule::string(" "),]); } #[test] fn test_extract_externals() { let mut grammar = build_grammar(vec![ - Variable::named("rule_0", Rule::seq(vec![ - Rule::external(0), - Rule::string("a"), - Rule::non_terminal(1), - Rule::non_terminal(2), - ])), + Variable::named( + "rule_0", + Rule::seq(vec![ + Rule::external(0), + Rule::string("a"), + Rule::non_terminal(1), + Rule::non_terminal(2), + ]), + ), Variable::named("rule_1", Rule::string("b")), Variable::named("rule_2", Rule::string("c")), ]); @@ -410,23 +435,26 @@ mod test { let (syntax_grammar, _) = extract_tokens(grammar).unwrap(); - assert_eq!(syntax_grammar.external_tokens, vec![ - ExternalToken { - name: "external_0".to_string(), - kind: VariableType::Named, - corresponding_internal_token: None, - }, - ExternalToken { - name: "a".to_string(), - kind: VariableType::Anonymous, - corresponding_internal_token: Some(Symbol::terminal(0)), - }, - ExternalToken { - name: "rule_2".to_string(), - kind: VariableType::Named, - corresponding_internal_token: Some(Symbol::terminal(2)), - }, - ]); + assert_eq!( + syntax_grammar.external_tokens, + vec![ + ExternalToken { + name: "external_0".to_string(), + kind: VariableType::Named, + corresponding_internal_token: None, + }, + ExternalToken { + name: "a".to_string(), + kind: VariableType::Anonymous, + corresponding_internal_token: Some(Symbol::terminal(0)), + }, + ExternalToken { + name: "rule_2".to_string(), + kind: VariableType::Named, + corresponding_internal_token: Some(Symbol::terminal(2)), + }, + ] + ); } #[test] @@ -436,14 +464,15 @@ mod test { Variable::named("rule_1", Rule::non_terminal(2)), Variable::named("rule_2", Rule::string("x")), ]); - grammar.extra_tokens = vec![ - Rule::non_terminal(1), - ]; + grammar.extra_tokens = vec![Rule::non_terminal(1)]; match 
extract_tokens(grammar) { Err(Error::GrammarError(s)) => { - assert_eq!(s, "Non-token symbol 'rule_1' cannot be used as an extra token"); - }, + assert_eq!( + s, + "Non-token symbol 'rule_1' cannot be used as an extra token" + ); + } _ => { panic!("Expected an error but got no error"); } @@ -453,24 +482,22 @@ mod test { #[test] fn test_error_on_external_with_same_name_as_non_terminal() { let mut grammar = build_grammar(vec![ - Variable::named("rule_0", Rule::seq(vec![ - Rule::non_terminal(1), - Rule::non_terminal(2), - ])), - Variable::named("rule_1", Rule::seq(vec![ - Rule::non_terminal(2), - Rule::non_terminal(2), - ])), + Variable::named( + "rule_0", + Rule::seq(vec![Rule::non_terminal(1), Rule::non_terminal(2)]), + ), + Variable::named( + "rule_1", + Rule::seq(vec![Rule::non_terminal(2), Rule::non_terminal(2)]), + ), Variable::named("rule_2", Rule::string("a")), ]); - grammar.external_tokens = vec![ - Variable::named("rule_1", Rule::non_terminal(1)), - ]; + grammar.external_tokens = vec![Variable::named("rule_1", Rule::non_terminal(1))]; match extract_tokens(grammar) { Err(Error::GrammarError(s)) => { assert_eq!(s, "Rule 'rule_1' cannot be used as both an external token and a non-terminal rule"); - }, + } _ => { panic!("Expected an error but got no error"); } diff --git a/src/prepare_grammar/flatten_grammar.rs b/src/prepare_grammar/flatten_grammar.rs index 0f09cd14..3ffef086 100644 --- a/src/prepare_grammar/flatten_grammar.rs +++ b/src/prepare_grammar/flatten_grammar.rs @@ -1,7 +1,313 @@ -use crate::error::Result; -use crate::grammars::SyntaxGrammar; use super::ExtractedSyntaxGrammar; +use crate::error::Result; +use crate::grammars::{Production, ProductionStep, SyntaxGrammar, SyntaxVariable, Variable}; +use crate::rules::{Alias, Associativity, Rule}; + +struct RuleFlattener { + production: Production, + precedence_stack: Vec, + associativity_stack: Vec, + alias_stack: Vec, +} + +impl RuleFlattener { + fn new() -> Self { + Self { + production: Production { + 
steps: Vec::new(), + dynamic_precedence: 0, + }, + precedence_stack: Vec::new(), + associativity_stack: Vec::new(), + alias_stack: Vec::new(), + } + } + + fn flatten(mut self, rule: Rule) -> Production { + self.apply(rule, true); + self.production + } + + fn apply(&mut self, rule: Rule, at_end: bool) { + match rule { + Rule::Seq(members) => { + let last_index = members.len() - 1; + for (i, member) in members.into_iter().enumerate() { + self.apply(member, i == last_index && at_end); + } + } + Rule::Metadata { rule, params } => { + let mut has_precedence = false; + if let Some(precedence) = params.precedence { + has_precedence = true; + self.precedence_stack.push(precedence); + } + + let mut has_associativity = false; + if let Some(associativity) = params.associativity { + has_associativity = true; + self.associativity_stack.push(associativity); + } + + let mut has_alias = false; + if let Some(alias) = params.alias { + has_alias = true; + self.alias_stack.push(alias); + } + + if params.dynamic_precedence.abs() > self.production.dynamic_precedence.abs() { + self.production.dynamic_precedence = params.dynamic_precedence; + } + + self.apply(*rule, at_end); + + if has_precedence { + self.precedence_stack.pop(); + if !at_end { + self.production.steps.last_mut().unwrap().precedence = + self.precedence_stack.last().cloned().unwrap_or(0); + } + } + + if has_associativity { + self.associativity_stack.pop(); + if !at_end { + self.production.steps.last_mut().unwrap().associativity = + self.associativity_stack.last().cloned(); + } + } + + if has_alias { + self.alias_stack.pop(); + } + } + Rule::Symbol(symbol) => { + self.production.steps.push(ProductionStep { + symbol, + precedence: self.precedence_stack.last().cloned().unwrap_or(0), + associativity: self.associativity_stack.last().cloned(), + alias: self.alias_stack.last().cloned(), + }); + } + _ => (), + } + } +} + +fn extract_choices(rule: Rule) -> Vec { + match rule { + Rule::Seq(elements) => { + let mut result = 
vec![Rule::Blank]; + for element in elements { + let extraction = extract_choices(element); + let mut next_result = Vec::new(); + for entry in result { + for extraction_entry in extraction.iter() { + next_result.push(Rule::Seq(vec![entry.clone(), extraction_entry.clone()])); + } + } + result = next_result; + } + result + } + Rule::Choice(elements) => { + let mut result = Vec::new(); + for element in elements { + for rule in extract_choices(element) { + result.push(rule); + } + } + result + } + Rule::Metadata { rule, params } => extract_choices(*rule) + .into_iter() + .map(|rule| Rule::Metadata { + rule: Box::new(rule), + params: params.clone(), + }) + .collect(), + _ => vec![rule], + } +} + +fn flatten_variable(variable: Variable) -> Result { + let mut productions = Vec::new(); + for rule in extract_choices(variable.rule) { + let production = RuleFlattener::new().flatten(rule); + if !productions.contains(&production) { + productions.push(production); + } + } + Ok(SyntaxVariable { + name: variable.name, + kind: variable.kind, + productions, + }) +} pub(super) fn flatten_grammar(grammar: ExtractedSyntaxGrammar) -> Result { - unimplemented!(); + let mut variables = Vec::new(); + for variable in grammar.variables { + variables.push(flatten_variable(variable)?); + } + Ok(SyntaxGrammar { + extra_tokens: grammar.extra_tokens, + expected_conflicts: grammar.expected_conflicts, + variables_to_inline: grammar.variables_to_inline, + external_tokens: grammar.external_tokens, + word_token: grammar.word_token, + variables, + }) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::grammars::VariableType; + use crate::rules::Symbol; + + #[test] + fn test_flatten_grammar() { + let result = flatten_variable(Variable { + name: "test".to_string(), + kind: VariableType::Named, + rule: Rule::seq(vec![ + Rule::non_terminal(1), + Rule::prec_left( + 101, + Rule::seq(vec![ + Rule::non_terminal(2), + Rule::choice(vec![ + Rule::prec_right( + 102, + Rule::seq(vec![Rule::non_terminal(3), 
Rule::non_terminal(4)]), + ), + Rule::non_terminal(5), + ]), + Rule::non_terminal(6), + ]), + ), + Rule::non_terminal(7), + ]), + }) + .unwrap(); + + assert_eq!( + result.productions, + vec![ + Production { + dynamic_precedence: 0, + steps: vec![ + ProductionStep::new(Symbol::non_terminal(1)), + ProductionStep::new(Symbol::non_terminal(2)) + .with_prec(101, Some(Associativity::Left)), + ProductionStep::new(Symbol::non_terminal(3)) + .with_prec(102, Some(Associativity::Right)), + ProductionStep::new(Symbol::non_terminal(4)) + .with_prec(101, Some(Associativity::Left)), + ProductionStep::new(Symbol::non_terminal(6)), + ProductionStep::new(Symbol::non_terminal(7)), + ] + }, + Production { + dynamic_precedence: 0, + steps: vec![ + ProductionStep::new(Symbol::non_terminal(1)), + ProductionStep::new(Symbol::non_terminal(2)) + .with_prec(101, Some(Associativity::Left)), + ProductionStep::new(Symbol::non_terminal(5)) + .with_prec(101, Some(Associativity::Left)), + ProductionStep::new(Symbol::non_terminal(6)), + ProductionStep::new(Symbol::non_terminal(7)), + ] + }, + ] + ); + } + + #[test] + fn test_flatten_grammar_with_maximum_dynamic_precedence() { + let result = flatten_variable(Variable { + name: "test".to_string(), + kind: VariableType::Named, + rule: Rule::seq(vec![ + Rule::non_terminal(1), + Rule::prec_dynamic(101, Rule::seq(vec![ + Rule::non_terminal(2), + Rule::choice(vec![ + Rule::prec_dynamic(102, Rule::seq(vec![ + Rule::non_terminal(3), + Rule::non_terminal(4) + ])), + Rule::non_terminal(5), + ]), + Rule::non_terminal(6), + ])), + Rule::non_terminal(7), + ]) + }).unwrap(); + + assert_eq!(result.productions, vec![ + Production { + dynamic_precedence: 102, + steps: vec![ + ProductionStep::new(Symbol::non_terminal(1)), + ProductionStep::new(Symbol::non_terminal(2)), + ProductionStep::new(Symbol::non_terminal(3)), + ProductionStep::new(Symbol::non_terminal(4)), + ProductionStep::new(Symbol::non_terminal(6)), + ProductionStep::new(Symbol::non_terminal(7)), + ], + }, 
+ Production { + dynamic_precedence: 101, + steps: vec![ + ProductionStep::new(Symbol::non_terminal(1)), + ProductionStep::new(Symbol::non_terminal(2)), + ProductionStep::new(Symbol::non_terminal(5)), + ProductionStep::new(Symbol::non_terminal(6)), + ProductionStep::new(Symbol::non_terminal(7)), + ], + }, + ]); + } + + #[test] + fn test_flatten_grammar_with_final_precedence() { + let result = flatten_variable(Variable { + name: "test".to_string(), + kind: VariableType::Named, + rule: Rule::prec_left(101, Rule::seq(vec![ + Rule::non_terminal(1), + Rule::non_terminal(2), + ])), + }).unwrap(); + + assert_eq!(result.productions, vec![ + Production { + dynamic_precedence: 0, + steps: vec![ + ProductionStep::new(Symbol::non_terminal(1)).with_prec(101, Some(Associativity::Left)), + ProductionStep::new(Symbol::non_terminal(2)).with_prec(101, Some(Associativity::Left)), + ] + } + ]); + + let result = flatten_variable(Variable { + name: "test".to_string(), + kind: VariableType::Named, + rule: Rule::prec_left(101, Rule::seq(vec![ + Rule::non_terminal(1), + ])), + }).unwrap(); + + assert_eq!(result.productions, vec![ + Production { + dynamic_precedence: 0, + steps: vec![ + ProductionStep::new(Symbol::non_terminal(1)).with_prec(101, Some(Associativity::Left)), + ] + } + ]); + } } diff --git a/src/prepare_grammar/intern_symbols.rs b/src/prepare_grammar/intern_symbols.rs index 17132262..5165875c 100644 --- a/src/prepare_grammar/intern_symbols.rs +++ b/src/prepare_grammar/intern_symbols.rs @@ -1,14 +1,15 @@ -use crate::error::{Error, Result}; -use crate::rules::{Rule, Symbol}; -use crate::grammars::{InputGrammar, Variable, VariableType}; -use std::rc::Rc; use super::InternedGrammar; +use crate::error::{Error, Result}; +use crate::grammars::{InputGrammar, Variable, VariableType}; +use crate::rules::{Rule, Symbol}; pub(super) fn intern_symbols(grammar: &InputGrammar) -> Result { let interner = Interner { grammar }; if variable_type_for_name(&grammar.variables[0].name) == 
VariableType::Hidden { - return Err(Error::GrammarError("Grammar's start rule must be visible".to_string())); + return Err(Error::GrammarError( + "Grammar's start rule must be visible".to_string(), + )); } let mut variables = Vec::with_capacity(grammar.variables.len()); @@ -40,9 +41,10 @@ pub(super) fn intern_symbols(grammar: &InputGrammar) -> Result for conflict in grammar.expected_conflicts.iter() { let mut interned_conflict = Vec::with_capacity(conflict.len()); for name in conflict { - interned_conflict.push(interner - .intern_name(&name) - .ok_or_else(|| symbol_error(name))? + interned_conflict.push( + interner + .intern_name(&name) + .ok_or_else(|| symbol_error(name))?, ); } expected_conflicts.push(interned_conflict); @@ -57,9 +59,10 @@ pub(super) fn intern_symbols(grammar: &InputGrammar) -> Result let mut word_token = None; if let Some(name) = grammar.word_token.as_ref() { - word_token = Some(interner - .intern_name(&name) - .ok_or_else(|| symbol_error(&name))? + word_token = Some( + interner + .intern_name(&name) + .ok_or_else(|| symbol_error(&name))?, ); } @@ -74,7 +77,7 @@ pub(super) fn intern_symbols(grammar: &InputGrammar) -> Result } struct Interner<'a> { - grammar: &'a InputGrammar + grammar: &'a InputGrammar, } impl<'a> Interner<'a> { @@ -86,22 +89,19 @@ impl<'a> Interner<'a> { result.push(self.intern_rule(element)?); } Ok(Rule::Choice(result)) - }, + } Rule::Seq(elements) => { let mut result = Vec::with_capacity(elements.len()); for element in elements { result.push(self.intern_rule(element)?); } Ok(Rule::Seq(result)) - }, - Rule::Repeat(content) => Ok(Rule::Repeat( - Box::new(self.intern_rule(content)?) 
- )), - Rule::Metadata { rule, params } => - Ok(Rule::Metadata { - rule: Box::new(self.intern_rule(rule)?), - params: params.clone() - }), + } + Rule::Repeat(content) => Ok(Rule::Repeat(Box::new(self.intern_rule(content)?))), + Rule::Metadata { rule, params } => Ok(Rule::Metadata { + rule: Box::new(self.intern_rule(rule)?), + params: params.clone(), + }), Rule::NamedSymbol(name) => { if let Some(symbol) = self.intern_name(&name) { @@ -109,29 +109,28 @@ impl<'a> Interner<'a> { } else { Err(symbol_error(name)) } - }, - - _ => Ok(rule.clone()) + } + _ => Ok(rule.clone()), } } fn intern_name(&self, symbol: &str) -> Option { for (i, variable) in self.grammar.variables.iter().enumerate() { if variable.name == symbol { - return Some(Symbol::non_terminal(i)) + return Some(Symbol::non_terminal(i)); } } for (i, external_token) in self.grammar.external_tokens.iter().enumerate() { if let Rule::NamedSymbol(name) = external_token { if name == symbol { - return Some(Symbol::external(i)) + return Some(Symbol::external(i)); } } } - return None + return None; } } @@ -154,22 +153,23 @@ mod tests { #[test] fn test_basic_repeat_expansion() { let grammar = intern_symbols(&build_grammar(vec![ - Variable::named("x", Rule::choice(vec![ - Rule::named("y"), - Rule::named("_z"), - ])), + Variable::named("x", Rule::choice(vec![Rule::named("y"), Rule::named("_z")])), Variable::named("y", Rule::named("_z")), Variable::named("_z", Rule::string("a")), - ])).unwrap(); + ])) + .unwrap(); - assert_eq!(grammar.variables, vec![ - Variable::named("x", Rule::choice(vec![ - Rule::non_terminal(1), - Rule::non_terminal(2), - ])), - Variable::named("y", Rule::non_terminal(2)), - Variable::hidden("_z", Rule::string("a")), - ]); + assert_eq!( + grammar.variables, + vec![ + Variable::named( + "x", + Rule::choice(vec![Rule::non_terminal(1), Rule::non_terminal(2),]) + ), + Variable::named("y", Rule::non_terminal(2)), + Variable::hidden("_z", Rule::string("a")), + ] + ); } #[test] @@ -177,45 +177,50 @@ mod tests { 
// Variable `y` is both an internal and an external token. // Variable `z` is just an external token. let mut input_grammar = build_grammar(vec![ - Variable::named("w", Rule::choice(vec![ - Rule::named("x"), - Rule::named("y"), - Rule::named("z"), - ])), + Variable::named( + "w", + Rule::choice(vec![Rule::named("x"), Rule::named("y"), Rule::named("z")]), + ), Variable::named("x", Rule::string("a")), Variable::named("y", Rule::string("b")), ]); - input_grammar.external_tokens.extend(vec![ - Rule::named("y"), - Rule::named("z"), - ]); + input_grammar + .external_tokens + .extend(vec![Rule::named("y"), Rule::named("z")]); let grammar = intern_symbols(&input_grammar).unwrap(); // Variable `y` is referred to by its internal index. // Variable `z` is referred to by its external index. - assert_eq!(grammar.variables, vec![ - Variable::named("w", Rule::choice(vec![ - Rule::non_terminal(1), - Rule::non_terminal(2), - Rule::external(1), - ])), - Variable::named("x", Rule::string("a")), - Variable::named("y", Rule::string("b")), - ]); + assert_eq!( + grammar.variables, + vec![ + Variable::named( + "w", + Rule::choice(vec![ + Rule::non_terminal(1), + Rule::non_terminal(2), + Rule::external(1), + ]) + ), + Variable::named("x", Rule::string("a")), + Variable::named("y", Rule::string("b")), + ] + ); // The external token for `y` refers back to its internal index. 
- assert_eq!(grammar.external_tokens, vec![ - Variable::named("y", Rule::non_terminal(2)), - Variable::named("z", Rule::external(1)), - ]); + assert_eq!( + grammar.external_tokens, + vec![ + Variable::named("y", Rule::non_terminal(2)), + Variable::named("z", Rule::external(1)), + ] + ); } #[test] fn test_grammar_with_undefined_symbols() { - let result = intern_symbols(&build_grammar(vec![ - Variable::named("x", Rule::named("y")), - ])); + let result = intern_symbols(&build_grammar(vec![Variable::named("x", Rule::named("y"))])); match result { Err(Error::SymbolError(message)) => assert_eq!(message, "Undefined symbol 'y'"), diff --git a/src/prepare_grammar/mod.rs b/src/prepare_grammar/mod.rs index e2615479..08233c53 100644 --- a/src/prepare_grammar/mod.rs +++ b/src/prepare_grammar/mod.rs @@ -1,19 +1,19 @@ -mod intern_symbols; -mod extract_tokens; mod expand_repeats; -mod flatten_grammar; mod expand_tokens; mod extract_simple_aliases; +mod extract_tokens; +mod flatten_grammar; +mod intern_symbols; -use crate::rules::{AliasMap, Rule, Symbol}; -use crate::grammars::{InputGrammar, SyntaxGrammar, LexicalGrammar, Variable, ExternalToken}; -use crate::error::Result; -use self::intern_symbols::intern_symbols; -use self::extract_tokens::extract_tokens; use self::expand_repeats::expand_repeats; -use self::flatten_grammar::flatten_grammar; use self::expand_tokens::expand_tokens; use self::extract_simple_aliases::extract_simple_aliases; +use self::extract_tokens::extract_tokens; +use self::flatten_grammar::flatten_grammar; +use self::intern_symbols::intern_symbols; +use crate::error::Result; +use crate::grammars::{ExternalToken, InputGrammar, LexicalGrammar, SyntaxGrammar, Variable}; +use crate::rules::{AliasMap, Rule, Symbol}; pub(self) struct IntermediateGrammar { variables: Vec, @@ -35,7 +35,7 @@ pub(self) struct ExtractedLexicalGrammar { } pub(crate) fn prepare_grammar( - input_grammar: &InputGrammar + input_grammar: &InputGrammar, ) -> Result<(SyntaxGrammar, LexicalGrammar, 
AliasMap)> { let interned_grammar = intern_symbols(input_grammar)?; let (syntax_grammar, lexical_grammar) = extract_tokens(interned_grammar)?; diff --git a/src/rules.rs b/src/rules.rs index c6f18cf4..5d0af86c 100644 --- a/src/rules.rs +++ b/src/rules.rs @@ -1,5 +1,3 @@ -use std::rc::Rc; -use std::char; use std::collections::HashMap; #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] @@ -92,6 +90,12 @@ impl Rule { }) } + pub fn prec_dynamic(value: i32, content: Rule) -> Self { + add_metadata(content, |params| { + params.dynamic_precedence = value; + }) + } + pub fn repeat(rule: Rule) -> Self { Rule::Repeat(Box::new(rule)) } From 85347541f155736e423203944903033c76993187 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Tue, 11 Dec 2018 17:30:12 -0800 Subject: [PATCH 056/208] Allow PropertySheet selectors to match the root node Co-Authored-By: Timothy Clem --- src/lib.rs | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index d70dc607..ad31d3c4 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -550,13 +550,16 @@ impl<'a> Drop for TreeCursor<'a> { impl<'a, P: DeserializeOwned> TreePropertyCursor<'a, P> { fn new(tree: &'a Tree, property_sheet: &'a PropertySheet

, source: &'a str) -> Self { - Self { + let mut result = Self { cursor: tree.root_node().walk(), child_index_stack: vec![0], state_stack: vec![0], property_sheet, source, - } + }; + let state = result.next_state(&result.current_state(), result.cursor.node().kind_id(), 0); + result.state_stack.push(state); + result } pub fn node(&self) -> Node<'a> { From 40d24097ecdcc188f255a9fbb03adca05c5f39fd Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Tue, 11 Dec 2018 12:37:09 -0800 Subject: [PATCH 057/208] Implement extract_simple_aliases --- src/grammars.rs | 9 + src/prepare_grammar/extract_simple_aliases.rs | 191 +++++++++++++++++- src/prepare_grammar/mod.rs | 4 +- 3 files changed, 199 insertions(+), 5 deletions(-) diff --git a/src/grammars.rs b/src/grammars.rs index 3b3d47f7..b76a583e 100644 --- a/src/grammars.rs +++ b/src/grammars.rs @@ -97,6 +97,15 @@ impl ProductionStep { alias: self.alias, } } + + pub(crate) fn with_alias(self, value: &str, is_named: bool) -> Self { + Self { + symbol: self.symbol, + precedence: self.precedence, + associativity: self.associativity, + alias: Some(Alias { value: value.to_string(), is_named }), + } + } } impl Variable { diff --git a/src/prepare_grammar/extract_simple_aliases.rs b/src/prepare_grammar/extract_simple_aliases.rs index 2a175242..a10c7982 100644 --- a/src/prepare_grammar/extract_simple_aliases.rs +++ b/src/prepare_grammar/extract_simple_aliases.rs @@ -1,9 +1,194 @@ -use crate::rules::AliasMap; +use crate::rules::{Alias, AliasMap, Symbol, SymbolType}; use crate::grammars::{LexicalGrammar, SyntaxGrammar}; +#[derive(Clone, Default)] +struct SymbolStatus { + alias: Option, + conflicting: bool, +} + pub(super) fn extract_simple_aliases( syntax_grammar: &mut SyntaxGrammar, - lexical_grammar: &mut LexicalGrammar + lexical_grammar: &LexicalGrammar ) -> AliasMap { - unimplemented!(); + // Determine which symbols in the grammars are *always* aliased to a single name. 
+ let mut terminal_status_list = vec![SymbolStatus::default(); lexical_grammar.variables.len()]; + let mut non_terminal_status_list = vec![SymbolStatus::default(); syntax_grammar.variables.len()]; + let mut external_status_list = vec![SymbolStatus::default(); syntax_grammar.external_tokens.len()]; + for variable in syntax_grammar.variables.iter() { + for production in variable.productions.iter() { + for step in production.steps.iter() { + let mut status = match step.symbol { + Symbol { kind: SymbolType::External, index} => &mut external_status_list[index], + Symbol { kind: SymbolType::NonTerminal, index} => &mut non_terminal_status_list[index], + Symbol { kind: SymbolType::Terminal, index} => &mut terminal_status_list[index], + }; + + if step.alias.is_none() { + status.alias = None; + status.conflicting = true; + } + + if !status.conflicting { + if status.alias.is_none() { + status.alias = step.alias.clone(); + } else if status.alias != step.alias { + status.alias = None; + status.conflicting = true; + } + } + } + } + } + + // Remove the aliases for those symbols. + for variable in syntax_grammar.variables.iter_mut() { + for production in variable.productions.iter_mut() { + for step in production.steps.iter_mut() { + let status = match step.symbol { + Symbol { kind: SymbolType::External, index} => &external_status_list[index], + Symbol { kind: SymbolType::NonTerminal, index} => &non_terminal_status_list[index], + Symbol { kind: SymbolType::Terminal, index} => &terminal_status_list[index], + }; + + if status.alias.is_some() { + step.alias = None; + } + } + } + } + + // Populate a map of the symbols to their aliases. 
+ let mut result = AliasMap::new(); + for (i, status) in terminal_status_list.into_iter().enumerate() { + if let Some(alias) = status.alias { + result.insert(Symbol::terminal(i), alias); + } + } + for (i, status) in non_terminal_status_list.into_iter().enumerate() { + if let Some(alias) = status.alias { + result.insert(Symbol::non_terminal(i), alias); + } + } + for (i, status) in external_status_list.into_iter().enumerate() { + if let Some(alias) = status.alias { + result.insert(Symbol::external(i), alias); + } + } + result +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::grammars::{LexicalVariable, SyntaxVariable, VariableType, Production, ProductionStep}; + use crate::nfa::Nfa; + + #[test] + fn test_extract_simple_aliases() { + let mut syntax_grammar = SyntaxGrammar { + variables: vec![ + SyntaxVariable { + name: "v1".to_owned(), + kind: VariableType::Named, + productions: vec![ + Production { + dynamic_precedence: 0, + steps: vec![ + ProductionStep::new(Symbol::terminal(0)).with_alias("a1", true), + ProductionStep::new(Symbol::terminal(1)).with_alias("a2", true), + ProductionStep::new(Symbol::terminal(2)).with_alias("a3", true), + ], + }, + ], + }, + SyntaxVariable { + name: "v2".to_owned(), + kind: VariableType::Named, + productions: vec![ + Production { + dynamic_precedence: 0, + steps: vec![ + // Token 0 is always aliased as "a1". + ProductionStep::new(Symbol::terminal(0)).with_alias("a1", true), + + // Token 1 is aliased above, but not here. + ProductionStep::new(Symbol::terminal(1)), + + // Token 2 is aliased differently than above. 
+ ProductionStep::new(Symbol::terminal(2)).with_alias("a4", true), + ], + }, + ], + }, + ], + extra_tokens: Vec::new(), + expected_conflicts: Vec::new(), + variables_to_inline: Vec::new(), + external_tokens: Vec::new(), + word_token: None, + }; + + let lexical_grammar = LexicalGrammar { + variables: vec![ + LexicalVariable { + name: "t1".to_string(), + kind: VariableType::Anonymous, + nfa: Nfa::new(), + }, + LexicalVariable { + name: "t2".to_string(), + kind: VariableType::Anonymous, + nfa: Nfa::new(), + }, + LexicalVariable { + name: "t3".to_string(), + kind: VariableType::Anonymous, + nfa: Nfa::new(), + } + ], + separators: Vec::new(), + }; + + let simple_aliases = extract_simple_aliases(&mut syntax_grammar, &lexical_grammar); + assert_eq!(simple_aliases.len(), 1); + assert_eq!(simple_aliases[&Symbol::terminal(0)], Alias { + value: "a1".to_string(), + is_named: true, + }); + + assert_eq!(syntax_grammar.variables, vec![ + SyntaxVariable { + name: "v1".to_owned(), + kind: VariableType::Named, + productions: vec![ + Production { + dynamic_precedence: 0, + steps: vec![ + // 'Simple' alias removed + ProductionStep::new(Symbol::terminal(0)), + + // Other aliases unchanged + ProductionStep::new(Symbol::terminal(1)).with_alias("a2", true), + ProductionStep::new(Symbol::terminal(2)).with_alias("a3", true), + ], + }, + ], + }, + SyntaxVariable { + name: "v2".to_owned(), + kind: VariableType::Named, + productions: vec![ + Production { + dynamic_precedence: 0, + steps: vec![ + ProductionStep::new(Symbol::terminal(0)), + ProductionStep::new(Symbol::terminal(1)), + ProductionStep::new(Symbol::terminal(2)).with_alias("a4", true), + ], + }, + ], + }, + ]); + } } diff --git a/src/prepare_grammar/mod.rs b/src/prepare_grammar/mod.rs index 08233c53..22435fca 100644 --- a/src/prepare_grammar/mod.rs +++ b/src/prepare_grammar/mod.rs @@ -41,7 +41,7 @@ pub(crate) fn prepare_grammar( let (syntax_grammar, lexical_grammar) = extract_tokens(interned_grammar)?; let syntax_grammar = 
expand_repeats(syntax_grammar); let mut syntax_grammar = flatten_grammar(syntax_grammar)?; - let mut lexical_grammar = expand_tokens(lexical_grammar)?; - let simple_aliases = extract_simple_aliases(&mut syntax_grammar, &mut lexical_grammar); + let lexical_grammar = expand_tokens(lexical_grammar)?; + let simple_aliases = extract_simple_aliases(&mut syntax_grammar, &lexical_grammar); Ok((syntax_grammar, lexical_grammar, simple_aliases)) } From 0103a83f3f88cb8745706517a96f32c01ef1286a Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Wed, 12 Dec 2018 18:04:29 -0800 Subject: [PATCH 058/208] Integrate separator rules into lexer nfa --- src/grammars.rs | 4 +- src/nfa.rs | 44 +-- src/prepare_grammar/expand_tokens.rs | 265 +++++++++++------- src/prepare_grammar/extract_simple_aliases.rs | 8 +- src/rules.rs | 1 - 5 files changed, 199 insertions(+), 123 deletions(-) diff --git a/src/grammars.rs b/src/grammars.rs index b76a583e..74c213e1 100644 --- a/src/grammars.rs +++ b/src/grammars.rs @@ -35,13 +35,13 @@ pub(crate) struct InputGrammar { pub(crate) struct LexicalVariable { pub name: String, pub kind: VariableType, - pub nfa: Nfa, + pub start_state: u32, } #[derive(Debug, PartialEq, Eq)] pub(crate) struct LexicalGrammar { + pub nfa: Nfa, pub variables: Vec, - pub separators: Vec, } // Extracted syntax grammar diff --git a/src/nfa.rs b/src/nfa.rs index 22cb2a2e..66861434 100644 --- a/src/nfa.rs +++ b/src/nfa.rs @@ -9,9 +9,13 @@ pub enum CharacterSet { #[derive(Debug, PartialEq, Eq)] pub enum NfaState { - Advance(CharacterSet, u32), + Advance { + chars: CharacterSet, + state: u32, + is_sep: bool, + }, Split(u32, u32), - Accept, + Accept(usize), } #[derive(PartialEq, Eq)] @@ -23,6 +27,7 @@ pub struct Nfa { pub struct NfaCursor<'a> { indices: Vec, nfa: &'a Nfa, + in_sep: bool, } impl CharacterSet { @@ -88,15 +93,15 @@ impl CharacterSet { impl Nfa { pub fn new() -> Self { - Nfa { states: vec![NfaState::Accept] } + Nfa { states: Vec::new() } } - pub fn start_index(&self) -> u32 { 
+ pub fn last_state(&self) -> u32 { self.states.len() as u32 - 1 } pub fn prepend(&mut self, f: impl Fn(u32) -> NfaState) { - self.states.push(f(self.start_index())); + self.states.push(f(self.last_state())); } } @@ -116,38 +121,45 @@ impl fmt::Debug for Nfa { impl<'a> NfaCursor<'a> { pub fn new(nfa: &'a Nfa) -> Self { - let mut result = Self { nfa, indices: Vec::new() }; - result.add_indices(&mut vec![nfa.start_index()]); + let mut result = Self { nfa, indices: Vec::new(), in_sep: true }; + result.add_states(&mut vec![nfa.last_state()]); result } pub fn advance(&mut self, c: char) -> bool { let mut result = false; let mut new_indices = Vec::new(); + let mut any_sep_transitions = false; for index in &self.indices { - if let NfaState::Advance(chars, next_index) = &self.nfa.states[*index as usize] { + if let NfaState::Advance { chars, state, is_sep } = &self.nfa.states[*index as usize] { + if *is_sep { + any_sep_transitions = true; + } if chars.contains(c) { - new_indices.push(*next_index); + new_indices.push(*state); result = true; } } } + if !any_sep_transitions { + self.in_sep = false; + } self.indices.clear(); - self.add_indices(&mut new_indices); + self.add_states(&mut new_indices); result } - pub fn is_done(&self) -> bool { - self.indices.iter().any(|index| { - if let NfaState::Accept = self.nfa.states[*index as usize] { - true + pub fn finished_ids<'b>(&'b self) -> impl Iterator + 'b { + self.indices.iter().filter_map(move |index| { + if let NfaState::Accept(i) = self.nfa.states[*index as usize] { + Some(i) } else { - false + None } }) } - pub fn add_indices(&mut self, new_indices: &mut Vec) { + pub fn add_states(&mut self, new_indices: &mut Vec) { while let Some(index) = new_indices.pop() { let state = &self.nfa.states[index as usize]; if let NfaState::Split(left, right) = state { diff --git a/src/prepare_grammar/expand_tokens.rs b/src/prepare_grammar/expand_tokens.rs index e0e1f9a9..3019b2be 100644 --- a/src/prepare_grammar/expand_tokens.rs +++ 
b/src/prepare_grammar/expand_tokens.rs @@ -39,40 +39,46 @@ fn expand_character_class(item: &ClassSetItem) -> Result { } } -fn expand_regex(ast: &Ast, nfa: &mut Nfa, mut next_state_index: u32) -> Result<()> { +fn expand_regex(ast: &Ast, nfa: &mut Nfa, mut next_state_index: u32, is_sep: bool) -> Result { match ast { - Ast::Empty(_) => Ok(()), + Ast::Empty(_) => Ok(false), Ast::Flags(_) => Err(Error::regex("Flags are not supported")), Ast::Literal(literal) => { - nfa.states.push(NfaState::Advance( - CharacterSet::Include(vec![literal.c]), - next_state_index, - )); - Ok(()) + nfa.states.push(NfaState::Advance { + chars: CharacterSet::Include(vec![literal.c]), + state: next_state_index, + is_sep, + }); + Ok(true) } Ast::Dot(_) => { - nfa.states.push(NfaState::Advance( - CharacterSet::Exclude(vec!['\n']), - next_state_index, - )); - Ok(()) + nfa.states.push(NfaState::Advance { + chars: CharacterSet::Exclude(vec!['\n']), + state: next_state_index, + is_sep, + }); + Ok(true) } Ast::Assertion(_) => Err(Error::regex("Assertions are not supported")), Ast::Class(class) => match class { Class::Unicode(_) => Err(Error::regex("Unicode character classes are not supported")), Class::Perl(class) => { - nfa.states.push(NfaState::Advance( - expand_perl_character_class(&class.kind), - next_state_index, - )); - Ok(()) + nfa.states.push(NfaState::Advance { + chars: expand_perl_character_class(&class.kind), + state: next_state_index, + is_sep, + }); + Ok(true) } Class::Bracketed(class) => match &class.kind { ClassSet::Item(item) => { let character_set = expand_character_class(&item)?; - nfa.states - .push(NfaState::Advance(character_set, next_state_index)); - Ok(()) + nfa.states.push(NfaState::Advance { + chars: character_set, + state: next_state_index, + is_sep, + }); + Ok(true) } ClassSet::BinaryOp(_) => Err(Error::regex( "Binary operators in character classes aren't supported", @@ -81,134 +87,171 @@ fn expand_regex(ast: &Ast, nfa: &mut Nfa, mut next_state_index: u32) -> Result<( }, 
Ast::Repetition(repetition) => match repetition.op.kind { RepetitionKind::ZeroOrOne => { - expand_regex(&repetition.ast, nfa, next_state_index)?; - nfa.prepend(|start_index| NfaState::Split(next_state_index, start_index)); - Ok(()) + if expand_regex(&repetition.ast, nfa, next_state_index, is_sep)? { + nfa.prepend(|last_state| NfaState::Split(next_state_index, last_state)); + Ok(true) + } else { + Ok(false) + } } RepetitionKind::OneOrMore => { - nfa.states.push(NfaState::Accept); // Placeholder for split - let split_index = nfa.start_index(); - expand_regex(&repetition.ast, nfa, split_index)?; - nfa.states[split_index as usize] = - NfaState::Split(nfa.start_index(), next_state_index); - Ok(()) + nfa.states.push(NfaState::Accept(0)); // Placeholder for split + let split_index = nfa.last_state(); + if expand_regex(&repetition.ast, nfa, split_index, is_sep)? { + nfa.states[split_index as usize] = + NfaState::Split(nfa.last_state(), next_state_index); + Ok(true) + } else { + nfa.states.pop(); + Ok(false) + } } RepetitionKind::ZeroOrMore => { - nfa.states.push(NfaState::Accept); // Placeholder for split - let split_index = nfa.start_index(); - expand_regex(&repetition.ast, nfa, split_index)?; - nfa.states[split_index as usize] = - NfaState::Split(nfa.start_index(), next_state_index); - nfa.prepend(|start_index| NfaState::Split(start_index, next_state_index)); - Ok(()) + nfa.states.push(NfaState::Accept(0)); // Placeholder for split + let split_index = nfa.last_state(); + if expand_regex(&repetition.ast, nfa, split_index, is_sep)? 
{ + nfa.states[split_index as usize] = + NfaState::Split(nfa.last_state(), next_state_index); + nfa.prepend(|last_state| NfaState::Split(last_state, next_state_index)); + Ok(true) + } else { + Ok(false) + } } RepetitionKind::Range(_) => unimplemented!(), }, - Ast::Group(group) => expand_regex(&group.ast, nfa, nfa.start_index()), + Ast::Group(group) => expand_regex(&group.ast, nfa, nfa.last_state(), is_sep), Ast::Alternation(alternation) => { let mut alternative_start_indices = Vec::new(); for ast in alternation.asts.iter() { - expand_regex(&ast, nfa, next_state_index)?; - alternative_start_indices.push(nfa.start_index()); + if expand_regex(&ast, nfa, next_state_index, is_sep)? { + alternative_start_indices.push(nfa.last_state()); + } } alternative_start_indices.pop(); for alternative_start_index in alternative_start_indices { - nfa.prepend(|start_index| NfaState::Split(start_index, alternative_start_index)); + nfa.prepend(|last_state| NfaState::Split(last_state, alternative_start_index)); } - Ok(()) + Ok(true) } Ast::Concat(concat) => { + let mut result = false; for ast in concat.asts.iter().rev() { - expand_regex(&ast, nfa, next_state_index)?; - next_state_index = nfa.start_index(); + if expand_regex(&ast, nfa, next_state_index, is_sep)? 
{ + result = true; + } + next_state_index = nfa.last_state(); } - Ok(()) + Ok(result) } } } -fn expand_rule(rule: Rule, nfa: &mut Nfa, mut next_state_index: u32) -> Result<()> { +fn expand_rule(rule: &Rule, nfa: &mut Nfa, mut next_state_index: u32, is_sep: bool) -> Result { match rule { Rule::Pattern(s) => { let ast = parse::Parser::new() .parse(&s) .map_err(|e| Error::GrammarError(e.to_string()))?; - expand_regex(&ast, nfa, next_state_index)?; - Ok(()) + expand_regex(&ast, nfa, next_state_index, is_sep) } Rule::String(s) => { for c in s.chars().rev() { - nfa.prepend(|start_index| { - NfaState::Advance(CharacterSet::empty().add_char(c), start_index) + nfa.prepend(|last_state| { + NfaState::Advance { + chars: CharacterSet::empty().add_char(c), + state: last_state, + is_sep, + } }); } - Ok(()) + Ok(s.len() > 0) } Rule::Choice(elements) => { let mut alternative_start_indices = Vec::new(); for element in elements { - expand_rule(element, nfa, next_state_index)?; - alternative_start_indices.push(nfa.start_index()); + if expand_rule(element, nfa, next_state_index, is_sep)? { + alternative_start_indices.push(nfa.last_state()); + } } alternative_start_indices.pop(); for alternative_start_index in alternative_start_indices { - nfa.prepend(|start_index| NfaState::Split(start_index, alternative_start_index)); + nfa.prepend(|last_state| NfaState::Split(last_state, alternative_start_index)); } - Ok(()) + Ok(true) } Rule::Seq(elements) => { + let mut result = false; for element in elements.into_iter().rev() { - expand_rule(element, nfa, next_state_index)?; - next_state_index = nfa.start_index(); + if expand_rule(element, nfa, next_state_index, is_sep)? 
{ + result = true; + } + next_state_index = nfa.last_state(); } - Ok(()) + Ok(result) } Rule::Repeat(rule) => { - nfa.states.push(NfaState::Accept); // Placeholder for split - let split_index = nfa.start_index(); - expand_rule(*rule, nfa, split_index)?; - nfa.states[split_index as usize] = NfaState::Split(nfa.start_index(), next_state_index); - Ok(()) + nfa.states.push(NfaState::Accept(0)); // Placeholder for split + let split_index = nfa.last_state(); + if expand_rule(rule, nfa, split_index, is_sep)? { + nfa.states[split_index as usize] = NfaState::Split(nfa.last_state(), next_state_index); + Ok(true) + } else { + Ok(false) + } } - _ => Err(Error::grammar("Unexpected rule type")), + Rule::Blank => Ok(false), + _ => Err(Error::grammar(&format!("Unexpected rule {:?}", rule))), } } pub(super) fn expand_tokens(grammar: ExtractedLexicalGrammar) -> Result { + let mut nfa = Nfa::new(); + + let separator_rule = if grammar.separators.len() > 0 { + Rule::repeat(Rule::choice(grammar.separators)) + } else { + Rule::Blank + }; + let mut variables = Vec::new(); - for variable in grammar.variables { - let mut nfa = Nfa::new(); - expand_rule(variable.rule, &mut nfa, 0)?; + for (i, variable) in grammar.variables.into_iter().enumerate() { + let is_immediate_token = match &variable.rule { + Rule::Metadata { params, .. 
} => params.is_main_token, + _ => false, + }; + + nfa.states.push(NfaState::Accept(i)); + let last_state = nfa.last_state(); + expand_rule(&variable.rule, &mut nfa, last_state, false)?; + + if !is_immediate_token { + let last_state = nfa.last_state(); + expand_rule(&separator_rule, &mut nfa, last_state, true)?; + } + variables.push(LexicalVariable { name: variable.name, kind: variable.kind, - nfa, + start_state: nfa.last_state(), }); } - let mut separators = Vec::new(); - for separator in grammar.separators { - let mut nfa = Nfa::new(); - expand_rule(separator, &mut nfa, 0)?; - separators.push(nfa); - } - Ok(LexicalGrammar { - variables, - separators, - }) + Ok(LexicalGrammar { nfa, variables }) } #[cfg(test)] mod tests { use super::*; use crate::nfa::NfaCursor; + use crate::grammars::Variable; fn simulate_nfa<'a>(nfa: &'a Nfa, s: &'a str) -> Option<&'a str> { let mut result = None; let mut char_count = 0; let mut cursor = NfaCursor::new(nfa); for c in s.chars() { - if cursor.is_done() { + if cursor.finished_ids().count() > 0 { result = Some(&s[0..char_count]); } if cursor.advance(c) { @@ -223,13 +266,13 @@ mod tests { #[test] fn test_rule_expansion() { struct Row { - rule: Rule, + rules: Vec, examples: Vec<(&'static str, Option<&'static str>)>, } let table = [ Row { - rule: Rule::pattern("a|bc"), + rules: vec![Rule::pattern("a|bc")], examples: vec![ ("a12", Some("a")), ("bc12", Some("bc")), @@ -238,7 +281,7 @@ mod tests { ], }, Row { - rule: Rule::pattern("(a|b|c)d(e|f|g)h?"), + rules: vec![Rule::pattern("(a|b|c)d(e|f|g)h?")], examples: vec![ ("ade1", Some("ade")), ("bdf1", Some("bdf")), @@ -247,11 +290,14 @@ mod tests { ], }, Row { - rule: Rule::pattern("a*"), - examples: vec![("aaa1", Some("aaa")), ("b", Some(""))], + rules: vec![Rule::pattern("a*")], + examples: vec![ + ("aaa1", Some("aaa")), + ("b", Some("")), + ], }, Row { - rule: Rule::pattern("a((bc)+|(de)*)f"), + rules: vec![Rule::pattern("a((bc)+|(de)*)f")], examples: vec![ ("af1", Some("af")), 
("adedef1", Some("adedef")), @@ -260,32 +306,51 @@ mod tests { ], }, Row { - rule: Rule::pattern("[a-fA-F0-9]+"), - examples: vec![("A1ff0", Some("A1ff"))], + rules: vec![Rule::pattern("[a-fA-F0-9]+")], + examples: vec![ + ("A1ff0", Some("A1ff")), + ], }, Row { - rule: Rule::pattern("\\w\\d\\s"), - examples: vec![("_0 ", Some("_0 "))], + rules: vec![Rule::pattern("\\w\\d\\s")], + examples: vec![ + ("_0 ", Some("_0 ")), + ], }, Row { - rule: Rule::string("abc"), - examples: vec![("abcd", Some("abc")), ("ab", None)], + rules: vec![Rule::string("abc")], + examples: vec![ + ("abcd", Some("abc")), + ("ab", None) + ], }, Row { - rule: Rule::repeat(Rule::seq(vec![ - Rule::string("{"), - Rule::pattern("[a-f]+"), - Rule::string("}"), - ])), - examples: vec![("{a}{", Some("{a}")), ("{a}{d", Some("{a}")), ("ab", None)], + rules: vec![ + Rule::repeat(Rule::seq(vec![ + Rule::string("{"), + Rule::pattern("[a-f]+"), + Rule::string("}"), + ])), + ], + examples: vec![ + ("{a}{", Some("{a}")), + ("{a}{d", Some("{a}")), + ("ab", None), + ], }, ]; - for Row { rule, examples } in table.iter() { - let mut nfa = Nfa::new(); - expand_rule(rule.clone(), &mut nfa, 0).unwrap(); + for Row { rules, examples } in &table { + let grammar = expand_tokens(ExtractedLexicalGrammar { + separators: vec![], + variables: rules + .into_iter() + .map(|rule| Variable::named("", rule.clone())) + .collect(), + }).unwrap(); + for (haystack, needle) in examples.iter() { - assert_eq!(simulate_nfa(&nfa, haystack), *needle); + assert_eq!(simulate_nfa(&grammar.nfa, haystack), *needle); } } } diff --git a/src/prepare_grammar/extract_simple_aliases.rs b/src/prepare_grammar/extract_simple_aliases.rs index a10c7982..8b87ea2e 100644 --- a/src/prepare_grammar/extract_simple_aliases.rs +++ b/src/prepare_grammar/extract_simple_aliases.rs @@ -130,24 +130,24 @@ mod tests { }; let lexical_grammar = LexicalGrammar { + nfa: Nfa::new(), variables: vec![ LexicalVariable { name: "t1".to_string(), kind: VariableType::Anonymous, - 
nfa: Nfa::new(), + start_state: 0, }, LexicalVariable { name: "t2".to_string(), kind: VariableType::Anonymous, - nfa: Nfa::new(), + start_state: 0, }, LexicalVariable { name: "t3".to_string(), kind: VariableType::Anonymous, - nfa: Nfa::new(), + start_state: 0, } ], - separators: Vec::new(), }; let simple_aliases = extract_simple_aliases(&mut syntax_grammar, &lexical_grammar); diff --git a/src/rules.rs b/src/rules.rs index 5d0af86c..d7234f45 100644 --- a/src/rules.rs +++ b/src/rules.rs @@ -30,7 +30,6 @@ pub(crate) struct MetadataParams { pub is_string: bool, pub is_active: bool, pub is_main_token: bool, - pub is_excluded: bool, pub alias: Option, } From 842421633c1161351ec0ba764be8927d09b15728 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Wed, 12 Dec 2018 20:58:26 -0800 Subject: [PATCH 059/208] Fix bugs in nfa generation --- src/nfa.rs | 95 +++++++---- src/prepare_grammar/expand_tokens.rs | 230 ++++++++++++++++++--------- 2 files changed, 212 insertions(+), 113 deletions(-) diff --git a/src/nfa.rs b/src/nfa.rs index 66861434..bc084ede 100644 --- a/src/nfa.rs +++ b/src/nfa.rs @@ -11,7 +11,7 @@ pub enum CharacterSet { pub enum NfaState { Advance { chars: CharacterSet, - state: u32, + state_id: u32, is_sep: bool, }, Split(u32, u32), @@ -25,7 +25,7 @@ pub struct Nfa { #[derive(Debug)] pub struct NfaCursor<'a> { - indices: Vec, + pub(crate) state_ids: Vec, nfa: &'a Nfa, in_sep: bool, } @@ -96,23 +96,20 @@ impl Nfa { Nfa { states: Vec::new() } } - pub fn last_state(&self) -> u32 { + pub fn last_state_id(&self) -> u32 { self.states.len() as u32 - 1 } pub fn prepend(&mut self, f: impl Fn(u32) -> NfaState) { - self.states.push(f(self.last_state())); + self.states.push(f(self.last_state_id())); } } impl fmt::Debug for Nfa { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "Nfa {{ states: {{")?; + write!(f, "Nfa {{ states: {{\n")?; for (i, state) in self.states.iter().enumerate() { - if i > 0 { - write!(f, ", ")?; - } - write!(f, "{}: {:?}", i, state)?; 
+ write!(f, " {}: {:?},\n", i, state)?; } write!(f, "}} }}")?; Ok(()) @@ -120,23 +117,23 @@ impl fmt::Debug for Nfa { } impl<'a> NfaCursor<'a> { - pub fn new(nfa: &'a Nfa) -> Self { - let mut result = Self { nfa, indices: Vec::new(), in_sep: true }; - result.add_states(&mut vec![nfa.last_state()]); + pub fn new(nfa: &'a Nfa, mut states: Vec) -> Self { + let mut result = Self { nfa, state_ids: Vec::new(), in_sep: true }; + result.add_states(&mut states); result } pub fn advance(&mut self, c: char) -> bool { let mut result = false; - let mut new_indices = Vec::new(); + let mut new_state_ids = Vec::new(); let mut any_sep_transitions = false; - for index in &self.indices { - if let NfaState::Advance { chars, state, is_sep } = &self.nfa.states[*index as usize] { - if *is_sep { - any_sep_transitions = true; - } + for current_state_id in &self.state_ids { + if let NfaState::Advance { chars, state_id, is_sep } = &self.nfa.states[*current_state_id as usize] { if chars.contains(c) { - new_indices.push(*state); + if *is_sep { + any_sep_transitions = true; + } + new_state_ids.push(*state_id); result = true; } } @@ -144,30 +141,58 @@ impl<'a> NfaCursor<'a> { if !any_sep_transitions { self.in_sep = false; } - self.indices.clear(); - self.add_states(&mut new_indices); + self.state_ids.clear(); + self.add_states(&mut new_state_ids); result } - pub fn finished_ids<'b>(&'b self) -> impl Iterator + 'b { - self.indices.iter().filter_map(move |index| { - if let NfaState::Accept(i) = self.nfa.states[*index as usize] { - Some(i) - } else { - None + pub fn finished_id(&self) -> Option { + let mut result = None; + for state_id in self.state_ids.iter() { + if let NfaState::Accept(id) = self.nfa.states[*state_id as usize] { + match result { + None => { + result = Some(id) + }, + Some(existing_id) => if id < existing_id { + result = Some(id) + } + } } - }) + } + result } - pub fn add_states(&mut self, new_indices: &mut Vec) { - while let Some(index) = new_indices.pop() { - let state = 
&self.nfa.states[index as usize]; + pub fn in_separator(&self) -> bool { + self.in_sep + } + + pub fn add_states(&mut self, new_state_ids: &mut Vec) { + let mut i = 0; + while i < new_state_ids.len() { + let state_id = new_state_ids[i]; + let state = &self.nfa.states[state_id as usize]; if let NfaState::Split(left, right) = state { - new_indices.push(*left); - new_indices.push(*right); - } else if let Err(i) = self.indices.binary_search(&index) { - self.indices.insert(i, index); + let mut has_left = false; + let mut has_right = false; + for new_state_id in new_state_ids.iter() { + if *new_state_id == *left { + has_left = true; + } + if *new_state_id == *right { + has_right = true; + } + } + if !has_left { + new_state_ids.push(*left); + } + if !has_right { + new_state_ids.push(*right); + } + } else if let Err(i) = self.state_ids.binary_search(&state_id) { + self.state_ids.insert(i, state_id); } + i += 1; } } } diff --git a/src/prepare_grammar/expand_tokens.rs b/src/prepare_grammar/expand_tokens.rs index 3019b2be..8b8cd03a 100644 --- a/src/prepare_grammar/expand_tokens.rs +++ b/src/prepare_grammar/expand_tokens.rs @@ -39,14 +39,14 @@ fn expand_character_class(item: &ClassSetItem) -> Result { } } -fn expand_regex(ast: &Ast, nfa: &mut Nfa, mut next_state_index: u32, is_sep: bool) -> Result { +fn expand_regex(ast: &Ast, nfa: &mut Nfa, mut next_state_id: u32, is_sep: bool) -> Result { match ast { Ast::Empty(_) => Ok(false), Ast::Flags(_) => Err(Error::regex("Flags are not supported")), Ast::Literal(literal) => { nfa.states.push(NfaState::Advance { chars: CharacterSet::Include(vec![literal.c]), - state: next_state_index, + state_id: next_state_id, is_sep, }); Ok(true) @@ -54,7 +54,7 @@ fn expand_regex(ast: &Ast, nfa: &mut Nfa, mut next_state_index: u32, is_sep: boo Ast::Dot(_) => { nfa.states.push(NfaState::Advance { chars: CharacterSet::Exclude(vec!['\n']), - state: next_state_index, + state_id: next_state_id, is_sep, }); Ok(true) @@ -65,7 +65,7 @@ fn expand_regex(ast: 
&Ast, nfa: &mut Nfa, mut next_state_index: u32, is_sep: boo Class::Perl(class) => { nfa.states.push(NfaState::Advance { chars: expand_perl_character_class(&class.kind), - state: next_state_index, + state_id: next_state_id, is_sep, }); Ok(true) @@ -75,7 +75,7 @@ fn expand_regex(ast: &Ast, nfa: &mut Nfa, mut next_state_index: u32, is_sep: boo let character_set = expand_character_class(&item)?; nfa.states.push(NfaState::Advance { chars: character_set, - state: next_state_index, + state_id: next_state_id, is_sep, }); Ok(true) @@ -87,8 +87,8 @@ fn expand_regex(ast: &Ast, nfa: &mut Nfa, mut next_state_index: u32, is_sep: boo }, Ast::Repetition(repetition) => match repetition.op.kind { RepetitionKind::ZeroOrOne => { - if expand_regex(&repetition.ast, nfa, next_state_index, is_sep)? { - nfa.prepend(|last_state| NfaState::Split(next_state_index, last_state)); + if expand_regex(&repetition.ast, nfa, next_state_id, is_sep)? { + nfa.prepend(|last_state_id| NfaState::Split(next_state_id, last_state_id)); Ok(true) } else { Ok(false) @@ -96,10 +96,10 @@ fn expand_regex(ast: &Ast, nfa: &mut Nfa, mut next_state_index: u32, is_sep: boo } RepetitionKind::OneOrMore => { nfa.states.push(NfaState::Accept(0)); // Placeholder for split - let split_index = nfa.last_state(); - if expand_regex(&repetition.ast, nfa, split_index, is_sep)? { - nfa.states[split_index as usize] = - NfaState::Split(nfa.last_state(), next_state_index); + let split_state_id = nfa.last_state_id(); + if expand_regex(&repetition.ast, nfa, split_state_id, is_sep)? { + nfa.states[split_state_id as usize] = + NfaState::Split(nfa.last_state_id(), next_state_id); Ok(true) } else { nfa.states.pop(); @@ -108,11 +108,11 @@ fn expand_regex(ast: &Ast, nfa: &mut Nfa, mut next_state_index: u32, is_sep: boo } RepetitionKind::ZeroOrMore => { nfa.states.push(NfaState::Accept(0)); // Placeholder for split - let split_index = nfa.last_state(); - if expand_regex(&repetition.ast, nfa, split_index, is_sep)? 
{ - nfa.states[split_index as usize] = - NfaState::Split(nfa.last_state(), next_state_index); - nfa.prepend(|last_state| NfaState::Split(last_state, next_state_index)); + let split_state_id = nfa.last_state_id(); + if expand_regex(&repetition.ast, nfa, split_state_id, is_sep)? { + nfa.states[split_state_id as usize] = + NfaState::Split(nfa.last_state_id(), next_state_id); + nfa.prepend(|last_state_id| NfaState::Split(last_state_id, next_state_id)); Ok(true) } else { Ok(false) @@ -120,47 +120,49 @@ fn expand_regex(ast: &Ast, nfa: &mut Nfa, mut next_state_index: u32, is_sep: boo } RepetitionKind::Range(_) => unimplemented!(), }, - Ast::Group(group) => expand_regex(&group.ast, nfa, nfa.last_state(), is_sep), + Ast::Group(group) => expand_regex(&group.ast, nfa, nfa.last_state_id(), is_sep), Ast::Alternation(alternation) => { - let mut alternative_start_indices = Vec::new(); + let mut alternative_state_ids = Vec::new(); for ast in alternation.asts.iter() { - if expand_regex(&ast, nfa, next_state_index, is_sep)? { - alternative_start_indices.push(nfa.last_state()); + if expand_regex(&ast, nfa, next_state_id, is_sep)? { + alternative_state_ids.push(nfa.last_state_id()); + } else { + alternative_state_ids.push(next_state_id); } } - alternative_start_indices.pop(); - for alternative_start_index in alternative_start_indices { - nfa.prepend(|last_state| NfaState::Split(last_state, alternative_start_index)); + alternative_state_ids.retain(|i| *i != nfa.last_state_id()); + for alternative_state_id in alternative_state_ids { + nfa.prepend(|last_state_id| NfaState::Split(last_state_id, alternative_state_id)); } Ok(true) } Ast::Concat(concat) => { let mut result = false; for ast in concat.asts.iter().rev() { - if expand_regex(&ast, nfa, next_state_index, is_sep)? { + if expand_regex(&ast, nfa, next_state_id, is_sep)? 
{ result = true; } - next_state_index = nfa.last_state(); + next_state_id = nfa.last_state_id(); } Ok(result) } } } -fn expand_rule(rule: &Rule, nfa: &mut Nfa, mut next_state_index: u32, is_sep: bool) -> Result { +fn expand_rule(rule: &Rule, nfa: &mut Nfa, mut next_state_id: u32, is_sep: bool) -> Result { match rule { Rule::Pattern(s) => { let ast = parse::Parser::new() .parse(&s) .map_err(|e| Error::GrammarError(e.to_string()))?; - expand_regex(&ast, nfa, next_state_index, is_sep) + expand_regex(&ast, nfa, next_state_id, is_sep) } Rule::String(s) => { for c in s.chars().rev() { - nfa.prepend(|last_state| { + nfa.prepend(|last_state_id| { NfaState::Advance { chars: CharacterSet::empty().add_char(c), - state: last_state, + state_id: last_state_id, is_sep, } }); @@ -168,33 +170,35 @@ fn expand_rule(rule: &Rule, nfa: &mut Nfa, mut next_state_index: u32, is_sep: bo Ok(s.len() > 0) } Rule::Choice(elements) => { - let mut alternative_start_indices = Vec::new(); + let mut alternative_state_ids = Vec::new(); for element in elements { - if expand_rule(element, nfa, next_state_index, is_sep)? { - alternative_start_indices.push(nfa.last_state()); + if expand_rule(element, nfa, next_state_id, is_sep)? { + alternative_state_ids.push(nfa.last_state_id()); + } else { + alternative_state_ids.push(next_state_id); } } - alternative_start_indices.pop(); - for alternative_start_index in alternative_start_indices { - nfa.prepend(|last_state| NfaState::Split(last_state, alternative_start_index)); + alternative_state_ids.retain(|i| *i != nfa.last_state_id()); + for alternative_state_id in alternative_state_ids { + nfa.prepend(|last_state_id| NfaState::Split(last_state_id, alternative_state_id)); } Ok(true) } Rule::Seq(elements) => { let mut result = false; for element in elements.into_iter().rev() { - if expand_rule(element, nfa, next_state_index, is_sep)? { + if expand_rule(element, nfa, next_state_id, is_sep)? 
{ result = true; } - next_state_index = nfa.last_state(); + next_state_id = nfa.last_state_id(); } Ok(result) } Rule::Repeat(rule) => { nfa.states.push(NfaState::Accept(0)); // Placeholder for split - let split_index = nfa.last_state(); - if expand_rule(rule, nfa, split_index, is_sep)? { - nfa.states[split_index as usize] = NfaState::Split(nfa.last_state(), next_state_index); + let split_state_id = nfa.last_state_id(); + if expand_rule(rule, nfa, split_state_id, is_sep)? { + nfa.states[split_state_id as usize] = NfaState::Split(nfa.last_state_id(), next_state_id); Ok(true) } else { Ok(false) @@ -205,10 +209,11 @@ fn expand_rule(rule: &Rule, nfa: &mut Nfa, mut next_state_index: u32, is_sep: bo } } -pub(super) fn expand_tokens(grammar: ExtractedLexicalGrammar) -> Result { +pub(super) fn expand_tokens(mut grammar: ExtractedLexicalGrammar) -> Result { let mut nfa = Nfa::new(); let separator_rule = if grammar.separators.len() > 0 { + grammar.separators.push(Rule::Blank); Rule::repeat(Rule::choice(grammar.separators)) } else { Rule::Blank @@ -222,18 +227,18 @@ pub(super) fn expand_tokens(grammar: ExtractedLexicalGrammar) -> Result(nfa: &'a Nfa, s: &'a str) -> Option<&'a str> { + fn simulate_nfa<'a>(grammar: &'a LexicalGrammar, s: &'a str) -> Option<(usize, &'a str)> { + let start_states = grammar.variables.iter().map(|v| v.start_state).collect(); + let mut cursor = NfaCursor::new(&grammar.nfa, start_states); + let mut result = None; - let mut char_count = 0; - let mut cursor = NfaCursor::new(nfa); + let mut start_char = 0; + let mut end_char = 0; for c in s.chars() { - if cursor.finished_ids().count() > 0 { - result = Some(&s[0..char_count]); + if let Some(id) = cursor.finished_id() { + result = Some((id, &s[start_char..end_char])); } if cursor.advance(c) { - char_count += 1; + end_char += 1; + if cursor.in_separator() { + start_char = end_char; + } } else { break; } } + + if let Some(id) = cursor.finished_id() { + result = Some((id, &s[start_char..end_char])); + } + 
result } @@ -267,63 +283,74 @@ mod tests { fn test_rule_expansion() { struct Row { rules: Vec, - examples: Vec<(&'static str, Option<&'static str>)>, + separators: Vec, + examples: Vec<(&'static str, Option<(usize, &'static str)>)>, } let table = [ - Row { - rules: vec![Rule::pattern("a|bc")], - examples: vec![ - ("a12", Some("a")), - ("bc12", Some("bc")), - ("b12", None), - ("c12", None), - ], - }, + // regex with sequences and alternatives Row { rules: vec![Rule::pattern("(a|b|c)d(e|f|g)h?")], + separators: vec![], examples: vec![ - ("ade1", Some("ade")), - ("bdf1", Some("bdf")), - ("bdfh1", Some("bdfh")), + ("ade1", Some((0, "ade"))), + ("bdf1", Some((0, "bdf"))), + ("bdfh1", Some((0, "bdfh"))), ("ad1", None), ], }, + + // regex with repeats Row { rules: vec![Rule::pattern("a*")], + separators: vec![], examples: vec![ - ("aaa1", Some("aaa")), - ("b", Some("")), + ("aaa1", Some((0, "aaa"))), + ("b", Some((0, ""))), ], }, + + // regex with repeats in sequences Row { rules: vec![Rule::pattern("a((bc)+|(de)*)f")], + separators: vec![], examples: vec![ - ("af1", Some("af")), - ("adedef1", Some("adedef")), - ("abcbcbcf1", Some("abcbcbcf")), + ("af1", Some((0, "af"))), + ("adedef1", Some((0, "adedef"))), + ("abcbcbcf1", Some((0, "abcbcbcf"))), ("a", None), ], }, + + // regex with character ranges Row { rules: vec![Rule::pattern("[a-fA-F0-9]+")], + separators: vec![], examples: vec![ - ("A1ff0", Some("A1ff")), + ("A1ff0.", Some((0, "A1ff0"))), ], }, + + // regex with perl character classes Row { rules: vec![Rule::pattern("\\w\\d\\s")], + separators: vec![], examples: vec![ - ("_0 ", Some("_0 ")), + ("_0 ", Some((0, "_0 "))), ], }, + + // string Row { rules: vec![Rule::string("abc")], + separators: vec![], examples: vec![ - ("abcd", Some("abc")), + ("abcd", Some((0, "abc"))), ("ab", None) ], }, + + // complex rule containing strings and regexes Row { rules: vec![ Rule::repeat(Rule::seq(vec![ @@ -332,17 +359,64 @@ mod tests { Rule::string("}"), ])), ], + separators: 
vec![], examples: vec![ - ("{a}{", Some("{a}")), - ("{a}{d", Some("{a}")), + ("{a}{", Some((0, "{a}"))), + ("{a}{d", Some((0, "{a}"))), ("ab", None), ], }, + + // longest match rule + Row { + rules: vec![ + Rule::pattern("a|bc"), + Rule::pattern("aa"), + Rule::pattern("bcd"), + ], + separators: vec![], + examples: vec![ + ("a.", Some((0, "a"))), + ("bc.", Some((0, "bc"))), + ("aa.", Some((1, "aa"))), + ("bcd?", Some((2, "bcd"))), + ("b.", None), + ("c.", None), + ], + }, + + // regexes with alternatives including the empty string + Row { + rules: vec![Rule::pattern("a(b|)+c")], + separators: vec![], + examples: vec![ + ("ac.", Some((0, "ac"))), + ("abc.", Some((0, "abc"))), + ("abbc.", Some((0, "abbc"))), + ], + }, + + // separators + Row { + rules: vec![ + Rule::pattern("[a-f]+"), + ], + separators: vec![ + Rule::string("\\\n"), + Rule::pattern("\\s"), + ], + examples: vec![ + (" a", Some((0, "a"))), + (" \nb", Some((0, "b"))), + (" \\a", None), + (" \\\na", Some((0, "a"))), + ], + }, ]; - for Row { rules, examples } in &table { + for Row { rules, separators, examples } in &table { let grammar = expand_tokens(ExtractedLexicalGrammar { - separators: vec![], + separators: separators.clone(), variables: rules .into_iter() .map(|rule| Variable::named("", rule.clone())) @@ -350,7 +424,7 @@ mod tests { }).unwrap(); for (haystack, needle) in examples.iter() { - assert_eq!(simulate_nfa(&grammar.nfa, haystack), *needle); + assert_eq!(simulate_nfa(&grammar, haystack), *needle); } } } From 5fa586f7c92916db288e258c91a0424e3af04f30 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Wed, 12 Dec 2018 21:01:41 -0800 Subject: [PATCH 060/208] Format expand_tokens file --- src/prepare_grammar/expand_tokens.rs | 281 +++++++++++++-------------- 1 file changed, 130 insertions(+), 151 deletions(-) diff --git a/src/prepare_grammar/expand_tokens.rs b/src/prepare_grammar/expand_tokens.rs index 8b8cd03a..7a1d2f4d 100644 --- a/src/prepare_grammar/expand_tokens.rs +++ 
b/src/prepare_grammar/expand_tokens.rs @@ -5,37 +5,98 @@ use crate::nfa::{CharacterSet, Nfa, NfaState}; use crate::rules::Rule; use regex_syntax::ast::{parse, Ast, Class, ClassPerlKind, ClassSet, ClassSetItem, RepetitionKind}; -fn expand_perl_character_class(item: &ClassPerlKind) -> CharacterSet { - match item { - ClassPerlKind::Digit => CharacterSet::empty().add_range('0', '9'), - ClassPerlKind::Space => CharacterSet::empty() - .add_char(' ') - .add_char('\t') - .add_char('\r') - .add_char('\n'), - ClassPerlKind::Word => CharacterSet::empty() - .add_char('_') - .add_range('A', 'Z') - .add_range('a', 'z') - .add_range('0', '9'), +pub(super) fn expand_tokens(mut grammar: ExtractedLexicalGrammar) -> Result { + let mut nfa = Nfa::new(); + + let separator_rule = if grammar.separators.len() > 0 { + grammar.separators.push(Rule::Blank); + Rule::repeat(Rule::choice(grammar.separators)) + } else { + Rule::Blank + }; + + let mut variables = Vec::new(); + for (i, variable) in grammar.variables.into_iter().enumerate() { + let is_immediate_token = match &variable.rule { + Rule::Metadata { params, .. 
} => params.is_main_token, + _ => false, + }; + + nfa.states.push(NfaState::Accept(i)); + let last_state_id = nfa.last_state_id(); + expand_rule(&variable.rule, &mut nfa, last_state_id, false)?; + + if !is_immediate_token { + let last_state_id = nfa.last_state_id(); + expand_rule(&separator_rule, &mut nfa, last_state_id, true)?; + } + + variables.push(LexicalVariable { + name: variable.name, + kind: variable.kind, + start_state: nfa.last_state_id(), + }); } + + Ok(LexicalGrammar { nfa, variables }) } -fn expand_character_class(item: &ClassSetItem) -> Result { - match item { - ClassSetItem::Empty(_) => Ok(CharacterSet::Include(Vec::new())), - ClassSetItem::Literal(literal) => Ok(CharacterSet::Include(vec![literal.c])), - ClassSetItem::Range(range) => { - Ok(CharacterSet::empty().add_range(range.start.c, range.end.c)) +fn expand_rule(rule: &Rule, nfa: &mut Nfa, mut next_state_id: u32, is_sep: bool) -> Result { + match rule { + Rule::Pattern(s) => { + let ast = parse::Parser::new() + .parse(&s) + .map_err(|e| Error::GrammarError(e.to_string()))?; + expand_regex(&ast, nfa, next_state_id, is_sep) } - ClassSetItem::Union(union) => { - let mut result = CharacterSet::empty(); - for item in &union.items { - result = result.add(expand_character_class(&item)?); + Rule::String(s) => { + for c in s.chars().rev() { + nfa.prepend(|last_state_id| NfaState::Advance { + chars: CharacterSet::empty().add_char(c), + state_id: last_state_id, + is_sep, + }); + } + Ok(s.len() > 0) + } + Rule::Choice(elements) => { + let mut alternative_state_ids = Vec::new(); + for element in elements { + if expand_rule(element, nfa, next_state_id, is_sep)? 
{ + alternative_state_ids.push(nfa.last_state_id()); + } else { + alternative_state_ids.push(next_state_id); + } + } + alternative_state_ids.retain(|i| *i != nfa.last_state_id()); + for alternative_state_id in alternative_state_ids { + nfa.prepend(|last_state_id| NfaState::Split(last_state_id, alternative_state_id)); + } + Ok(true) + } + Rule::Seq(elements) => { + let mut result = false; + for element in elements.into_iter().rev() { + if expand_rule(element, nfa, next_state_id, is_sep)? { + result = true; + } + next_state_id = nfa.last_state_id(); } Ok(result) } - _ => Err(Error::regex("Unsupported character class syntax")), + Rule::Repeat(rule) => { + nfa.states.push(NfaState::Accept(0)); // Placeholder for split + let split_state_id = nfa.last_state_id(); + if expand_rule(rule, nfa, split_state_id, is_sep)? { + nfa.states[split_state_id as usize] = + NfaState::Split(nfa.last_state_id(), next_state_id); + Ok(true) + } else { + Ok(false) + } + } + Rule::Blank => Ok(false), + _ => Err(Error::grammar(&format!("Unexpected rule {:?}", rule))), } } @@ -149,107 +210,45 @@ fn expand_regex(ast: &Ast, nfa: &mut Nfa, mut next_state_id: u32, is_sep: bool) } } -fn expand_rule(rule: &Rule, nfa: &mut Nfa, mut next_state_id: u32, is_sep: bool) -> Result { - match rule { - Rule::Pattern(s) => { - let ast = parse::Parser::new() - .parse(&s) - .map_err(|e| Error::GrammarError(e.to_string()))?; - expand_regex(&ast, nfa, next_state_id, is_sep) +fn expand_character_class(item: &ClassSetItem) -> Result { + match item { + ClassSetItem::Empty(_) => Ok(CharacterSet::Include(Vec::new())), + ClassSetItem::Literal(literal) => Ok(CharacterSet::Include(vec![literal.c])), + ClassSetItem::Range(range) => { + Ok(CharacterSet::empty().add_range(range.start.c, range.end.c)) } - Rule::String(s) => { - for c in s.chars().rev() { - nfa.prepend(|last_state_id| { - NfaState::Advance { - chars: CharacterSet::empty().add_char(c), - state_id: last_state_id, - is_sep, - } - }); - } - Ok(s.len() > 0) - } - 
Rule::Choice(elements) => { - let mut alternative_state_ids = Vec::new(); - for element in elements { - if expand_rule(element, nfa, next_state_id, is_sep)? { - alternative_state_ids.push(nfa.last_state_id()); - } else { - alternative_state_ids.push(next_state_id); - } - } - alternative_state_ids.retain(|i| *i != nfa.last_state_id()); - for alternative_state_id in alternative_state_ids { - nfa.prepend(|last_state_id| NfaState::Split(last_state_id, alternative_state_id)); - } - Ok(true) - } - Rule::Seq(elements) => { - let mut result = false; - for element in elements.into_iter().rev() { - if expand_rule(element, nfa, next_state_id, is_sep)? { - result = true; - } - next_state_id = nfa.last_state_id(); + ClassSetItem::Union(union) => { + let mut result = CharacterSet::empty(); + for item in &union.items { + result = result.add(expand_character_class(&item)?); } Ok(result) } - Rule::Repeat(rule) => { - nfa.states.push(NfaState::Accept(0)); // Placeholder for split - let split_state_id = nfa.last_state_id(); - if expand_rule(rule, nfa, split_state_id, is_sep)? { - nfa.states[split_state_id as usize] = NfaState::Split(nfa.last_state_id(), next_state_id); - Ok(true) - } else { - Ok(false) - } - } - Rule::Blank => Ok(false), - _ => Err(Error::grammar(&format!("Unexpected rule {:?}", rule))), + _ => Err(Error::regex("Unsupported character class syntax")), } } -pub(super) fn expand_tokens(mut grammar: ExtractedLexicalGrammar) -> Result { - let mut nfa = Nfa::new(); - - let separator_rule = if grammar.separators.len() > 0 { - grammar.separators.push(Rule::Blank); - Rule::repeat(Rule::choice(grammar.separators)) - } else { - Rule::Blank - }; - - let mut variables = Vec::new(); - for (i, variable) in grammar.variables.into_iter().enumerate() { - let is_immediate_token = match &variable.rule { - Rule::Metadata { params, .. 
} => params.is_main_token, - _ => false, - }; - - nfa.states.push(NfaState::Accept(i)); - let last_state_id = nfa.last_state_id(); - expand_rule(&variable.rule, &mut nfa, last_state_id, false)?; - - if !is_immediate_token { - let last_state_id = nfa.last_state_id(); - expand_rule(&separator_rule, &mut nfa, last_state_id, true)?; - } - - variables.push(LexicalVariable { - name: variable.name, - kind: variable.kind, - start_state: nfa.last_state_id(), - }); +fn expand_perl_character_class(item: &ClassPerlKind) -> CharacterSet { + match item { + ClassPerlKind::Digit => CharacterSet::empty().add_range('0', '9'), + ClassPerlKind::Space => CharacterSet::empty() + .add_char(' ') + .add_char('\t') + .add_char('\r') + .add_char('\n'), + ClassPerlKind::Word => CharacterSet::empty() + .add_char('_') + .add_range('A', 'Z') + .add_range('a', 'z') + .add_range('0', '9'), } - - Ok(LexicalGrammar { nfa, variables }) } #[cfg(test)] mod tests { use super::*; - use crate::nfa::NfaCursor; use crate::grammars::Variable; + use crate::nfa::NfaCursor; fn simulate_nfa<'a>(grammar: &'a LexicalGrammar, s: &'a str) -> Option<(usize, &'a str)> { let start_states = grammar.variables.iter().map(|v| v.start_state).collect(); @@ -299,17 +298,12 @@ mod tests { ("ad1", None), ], }, - // regex with repeats Row { rules: vec![Rule::pattern("a*")], separators: vec![], - examples: vec![ - ("aaa1", Some((0, "aaa"))), - ("b", Some((0, ""))), - ], + examples: vec![("aaa1", Some((0, "aaa"))), ("b", Some((0, "")))], }, - // regex with repeats in sequences Row { rules: vec![Rule::pattern("a((bc)+|(de)*)f")], @@ -321,44 +315,31 @@ mod tests { ("a", None), ], }, - // regex with character ranges Row { rules: vec![Rule::pattern("[a-fA-F0-9]+")], separators: vec![], - examples: vec![ - ("A1ff0.", Some((0, "A1ff0"))), - ], + examples: vec![("A1ff0.", Some((0, "A1ff0")))], }, - // regex with perl character classes Row { rules: vec![Rule::pattern("\\w\\d\\s")], separators: vec![], - examples: vec![ - ("_0 ", Some((0, 
"_0 "))), - ], + examples: vec![("_0 ", Some((0, "_0 ")))], }, - // string Row { rules: vec![Rule::string("abc")], separators: vec![], - examples: vec![ - ("abcd", Some((0, "abc"))), - ("ab", None) - ], + examples: vec![("abcd", Some((0, "abc"))), ("ab", None)], }, - // complex rule containing strings and regexes Row { - rules: vec![ - Rule::repeat(Rule::seq(vec![ - Rule::string("{"), - Rule::pattern("[a-f]+"), - Rule::string("}"), - ])), - ], + rules: vec![Rule::repeat(Rule::seq(vec![ + Rule::string("{"), + Rule::pattern("[a-f]+"), + Rule::string("}"), + ]))], separators: vec![], examples: vec![ ("{a}{", Some((0, "{a}"))), @@ -366,7 +347,6 @@ mod tests { ("ab", None), ], }, - // longest match rule Row { rules: vec![ @@ -384,8 +364,7 @@ mod tests { ("c.", None), ], }, - - // regexes with alternatives including the empty string + // regex with an alternative including the empty string Row { rules: vec![Rule::pattern("a(b|)+c")], separators: vec![], @@ -395,16 +374,10 @@ mod tests { ("abbc.", Some((0, "abbc"))), ], }, - // separators Row { - rules: vec![ - Rule::pattern("[a-f]+"), - ], - separators: vec![ - Rule::string("\\\n"), - Rule::pattern("\\s"), - ], + rules: vec![Rule::pattern("[a-f]+")], + separators: vec![Rule::string("\\\n"), Rule::pattern("\\s")], examples: vec![ (" a", Some((0, "a"))), (" \nb", Some((0, "b"))), @@ -414,14 +387,20 @@ mod tests { }, ]; - for Row { rules, separators, examples } in &table { + for Row { + rules, + separators, + examples, + } in &table + { let grammar = expand_tokens(ExtractedLexicalGrammar { separators: separators.clone(), variables: rules .into_iter() .map(|rule| Variable::named("", rule.clone())) .collect(), - }).unwrap(); + }) + .unwrap(); for (haystack, needle) in examples.iter() { assert_eq!(simulate_nfa(&grammar, haystack), *needle); From 494329c93b4c54b583e68634132e1f45b383e91f Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 13 Dec 2018 10:08:25 -0800 Subject: [PATCH 061/208] Add Parser.set_included_ranges and 
Node.range --- src/lib.rs | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/src/lib.rs b/src/lib.rs index ad31d3c4..98d2234e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -35,6 +35,14 @@ pub struct Point { pub column: usize, } +#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)] +pub struct Range { + pub start_byte: usize, + pub end_byte: usize, + pub start_point: Point, + pub end_point: Point, +} + #[derive(Clone, Copy, Debug, PartialEq, Eq)] pub struct InputEdit { pub start_byte: usize, @@ -252,6 +260,14 @@ impl Parser { unsafe { ffi::ts_parser_set_operation_limit(self.0, limit) } } + pub fn set_included_ranges(&mut self, ranges: &[Range]) { + let ts_ranges: Vec = + ranges.iter().cloned().map(|range| range.into()).collect(); + unsafe { + ffi::ts_parser_set_included_ranges(self.0, ts_ranges.as_ptr(), ts_ranges.len() as u32) + }; + } + fn parse_utf8_ptr (*const u8, usize)>( &mut self, input: &mut T, @@ -421,6 +437,15 @@ impl<'tree> Node<'tree> { unsafe { ffi::ts_node_end_byte(self.0) as usize } } + pub fn range(&self) -> Range { + Range { + start_byte: self.start_byte(), + end_byte: self.end_byte(), + start_point: self.start_position(), + end_point: self.end_position(), + } + } + pub fn start_position(&self) -> Point { let result = unsafe { ffi::ts_node_start_point(self.0) }; result.into() @@ -677,6 +702,17 @@ impl From for Point { } } +impl Into for Range { + fn into(self) -> ffi::TSRange { + ffi::TSRange { + start_byte: self.start_byte as u32, + end_byte: self.end_byte as u32, + start_point: self.start_point.into(), + end_point: self.end_point.into(), + } + } +} + impl PropertySheet

{ pub fn new(language: Language, json: &str) -> Result { #[derive(Deserialize, Debug)] From 4a361fbb3fafa41ffa1247501f8199938e5aab6c Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 13 Dec 2018 10:08:50 -0800 Subject: [PATCH 062/208] Implement Copy for Node --- src/lib.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/lib.rs b/src/lib.rs index 98d2234e..428e8101 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -77,6 +77,7 @@ pub struct PropertySheet> { text_regexes: Vec, } +#[derive(Clone, Copy)] pub struct Node<'a>(ffi::TSNode, PhantomData<&'a ()>); pub struct Parser(*mut ffi::TSParser); From bdd3f20522eefe01831ad9cd74002dfe95de20d1 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 13 Dec 2018 16:30:40 -0800 Subject: [PATCH 063/208] Add PropertySheet::map method --- src/lib.rs | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 428e8101..0a53e320 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -71,7 +71,7 @@ pub enum PropertySheetError { InvalidRegex(regex::Error) } -pub struct PropertySheet> { +pub struct PropertySheet

> { states: Vec, property_sets: Vec

, text_regexes: Vec, @@ -86,7 +86,7 @@ pub struct Tree(*mut ffi::TSTree); pub struct TreeCursor<'a>(ffi::TSTreeCursor, PhantomData<&'a ()>); -pub struct TreePropertyCursor<'a, P: 'a + DeserializeOwned> { +pub struct TreePropertyCursor<'a, P> { cursor: TreeCursor<'a>, state_stack: Vec, child_index_stack: Vec, @@ -370,7 +370,7 @@ impl Tree { self.root_node().walk() } - pub fn walk_with_properties<'a, P: DeserializeOwned>( + pub fn walk_with_properties<'a, P>( &'a self, property_sheet: &'a PropertySheet

, source: &'a str, @@ -574,7 +574,7 @@ impl<'a> Drop for TreeCursor<'a> { } } -impl<'a, P: DeserializeOwned> TreePropertyCursor<'a, P> { +impl<'a, P> TreePropertyCursor<'a, P> { fn new(tree: &'a Tree, property_sheet: &'a PropertySheet

, source: &'a str) -> Self { let mut result = Self { cursor: tree.root_node().walk(), @@ -714,8 +714,11 @@ impl Into for Range { } } -impl PropertySheet

{ - pub fn new(language: Language, json: &str) -> Result { +impl

PropertySheet

{ + pub fn new(language: Language, json: &str) -> Result + where + P: DeserializeOwned, + { #[derive(Deserialize, Debug)] struct PropertyTransitionJSON { #[serde(rename = "type")] @@ -787,6 +790,21 @@ impl PropertySheet

{ text_regexes, }) } + + pub fn map(self, mut f: F) -> Result, E> + where + F: FnMut(P) -> Result, + { + let mut property_sets = Vec::with_capacity(self.property_sets.len()); + for set in self.property_sets { + property_sets.push(f(set)?); + } + Ok(PropertySheet { + states: self.states, + text_regexes: self.text_regexes, + property_sets, + }) + } } #[cfg(test)] From 6d3835d292e7bc37965ad5623c3688c4862ee4b1 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 13 Dec 2018 16:32:10 -0800 Subject: [PATCH 064/208] Add Node::children method --- src/lib.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/lib.rs b/src/lib.rs index 0a53e320..f1a83203 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -465,6 +465,12 @@ impl<'tree> Node<'tree> { unsafe { ffi::ts_node_child_count(self.0) as usize } } + pub fn children<'a>(&'a self) -> impl Iterator> + 'a { + (0..self.child_count()) + .into_iter() + .map(move |i| self.child(i).unwrap()) + } + pub fn named_child<'a>(&'a self, i: usize) -> Option { Self::new(unsafe { ffi::ts_node_named_child(self.0, i as u32) }) } From 3f1fc65a2736a573920c4139a844d99187ebb894 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 13 Dec 2018 16:32:22 -0800 Subject: [PATCH 065/208] Auto-format lib.rs --- src/lib.rs | 59 +++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 43 insertions(+), 16 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index f1a83203..65a57d16 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -2,12 +2,12 @@ mod ffi; #[macro_use] extern crate serde_derive; -extern crate serde_json; extern crate regex; extern crate serde; +extern crate serde_json; -use serde::de::DeserializeOwned; use regex::Regex; +use serde::de::DeserializeOwned; use std::collections::HashMap; use std::ffi::CStr; use std::fmt; @@ -68,7 +68,7 @@ struct PropertyState { #[derive(Debug)] pub enum PropertySheetError { InvalidJSON(serde_json::Error), - InvalidRegex(regex::Error) + InvalidRegex(regex::Error), } pub struct PropertySheet

> { @@ -187,7 +187,16 @@ impl Parser { pub fn parse_str(&mut self, input: &str, old_tree: Option<&Tree>) -> Option { let bytes = input.as_bytes(); - self.parse_utf8(&mut |offset, _| &bytes[offset..], old_tree) + self.parse_utf8( + &mut |offset, _| { + if offset < bytes.len() { + &bytes[offset..] + } else { + &[] + } + }, + old_tree, + ) } pub fn parse_utf8<'a, T: FnMut(usize, Point) -> &'a [u8]>( @@ -565,7 +574,8 @@ impl<'a> TreeCursor<'a> { } pub fn goto_first_child_for_index(&mut self, index: usize) -> Option { - let result = unsafe { ffi::ts_tree_cursor_goto_first_child_for_byte(&mut self.0, index as u32) }; + let result = + unsafe { ffi::ts_tree_cursor_goto_first_child_for_byte(&mut self.0, index as u32) }; if result < 0 { None } else { @@ -645,7 +655,12 @@ impl<'a, P> TreePropertyCursor<'a, P> { } } - fn next_state(&self, state: &PropertyState, node_kind_id: u16, node_child_index: usize) -> usize { + fn next_state( + &self, + state: &PropertyState, + node_kind_id: u16, + node_child_index: usize, + ) -> usize { state .transitions .get(&node_kind_id) @@ -748,8 +763,8 @@ impl

PropertySheet

{ property_sets: Vec

, } - let input: PropertySheetJSON

= serde_json::from_str(json) - .map_err(PropertySheetError::InvalidJSON)?; + let input: PropertySheetJSON

= + serde_json::from_str(json).map_err(PropertySheetError::InvalidJSON)?; let mut states = Vec::new(); let mut text_regexes = Vec::new(); let mut text_regex_patterns = Vec::new(); @@ -759,11 +774,15 @@ impl

PropertySheet

{ let node_kind_count = language.node_kind_count(); for transition in state.transitions.iter() { let text_regex_index = if let Some(regex_pattern) = transition.text.as_ref() { - if let Some(index) = text_regex_patterns.iter().position(|r| *r == regex_pattern) { + if let Some(index) = + text_regex_patterns.iter().position(|r| *r == regex_pattern) + { Some(index) } else { text_regex_patterns.push(regex_pattern); - text_regexes.push(Regex::new(®ex_pattern).map_err(PropertySheetError::InvalidRegex)?); + text_regexes.push( + Regex::new(®ex_pattern).map_err(PropertySheetError::InvalidRegex)?, + ); Some(text_regexes.len() - 1) } } else { @@ -771,9 +790,8 @@ impl

PropertySheet

{ }; for i in 0..(node_kind_count as u16) { - if - transition.kind == language.node_kind_for_id(i) && - transition.named == language.node_kind_is_named(i) + if transition.kind == language.node_kind_for_id(i) + && transition.named == language.node_kind_is_named(i) { let entry = transitions.entry(i).or_insert(Vec::new()); entry.push(PropertyTransition { @@ -928,7 +946,10 @@ mod tests { define: Option, } - let empty_properties = Properties { reference: None, define: None }; + let empty_properties = Properties { + reference: None, + define: None, + }; let property_sheet = PropertySheet::::new( rust(), @@ -1018,7 +1039,10 @@ mod tests { assert!(cursor.goto_first_child()); assert_eq!(cursor.node().kind(), "identifier"); - assert_eq!(cursor.node_properties().reference, Some("function".to_owned())); + assert_eq!( + cursor.node_properties().reference, + Some("function".to_owned()) + ); } #[test] @@ -1097,7 +1121,10 @@ mod tests { assert!(cursor.goto_first_child()); assert_eq!(cursor.node().kind(), "identifier"); - assert_eq!(cursor.node_properties().scope, Some("constructor".to_owned())); + assert_eq!( + cursor.node_properties().scope, + Some("constructor".to_owned()) + ); } #[test] From d79203f58c7e3bb06232385a6da701ed5dfde739 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 13 Dec 2018 16:42:46 -0800 Subject: [PATCH 066/208] Add test script --- script/test.sh | 3 +++ 1 file changed, 3 insertions(+) create mode 100755 script/test.sh diff --git a/script/test.sh b/script/test.sh new file mode 100755 index 00000000..eb6183c0 --- /dev/null +++ b/script/test.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +RUST_TREE_SITTER_TEST=1 cargo test $@ From 7bd9eaa97065c3153ae44d1f219d3bfc741e82a6 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 13 Dec 2018 16:43:44 -0800 Subject: [PATCH 067/208] 0.3.5 --- Cargo.toml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index fde4fd31..7f0458ec 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,9 +1,8 
@@ [package] name = "tree-sitter" description = "Rust bindings to the Tree-sitter parsing library" -version = "0.3.4" +version = "0.3.5" authors = ["Max Brunsfeld "] -build = "build.rs" license = "MIT" readme = "README.md" keywords = ["incremental", "parsing"] From 889f232b4ca2cbdc932510bb75da6f686059eceb Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Tue, 18 Dec 2018 16:05:36 -0800 Subject: [PATCH 068/208] Implement variable inlining --- Cargo.lock | 15 +- Cargo.toml | 3 +- src/build_tables/inline_variables.rs | 318 +++++++++++++++++++++++++++ src/build_tables/item.rs | 213 ++++++++++++++++-- src/build_tables/mod.rs | 1 + src/grammars.rs | 12 + src/main.rs | 1 + src/parse_grammar.rs | 1 - src/rules.rs | 34 ++- 9 files changed, 567 insertions(+), 31 deletions(-) create mode 100644 src/build_tables/inline_variables.rs diff --git a/Cargo.lock b/Cargo.lock index d5109fb7..410580fa 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -67,11 +67,6 @@ name = "bitflags" version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -[[package]] -name = "bitvec" -version = "0.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" - [[package]] name = "blake2-rfc" version = "0.2.18" @@ -461,16 +456,17 @@ dependencies = [ name = "rust-tree-sitter-cli" version = "0.1.0" dependencies = [ - "bitvec 0.8.0 (registry+https://github.com/rust-lang/crates.io-index)", "clap 2.32.0 (registry+https://github.com/rust-lang/crates.io-index)", "dirs 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)", "ignore 0.4.4 (registry+https://github.com/rust-lang/crates.io-index)", + "lazy_static 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)", "libloading 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)", "regex-syntax 0.6.4 (registry+https://github.com/rust-lang/crates.io-index)", "rusqlite 0.14.0 (registry+https://github.com/rust-lang/crates.io-index)", "serde 1.0.80 (registry+https://github.com/rust-lang/crates.io-index)", 
"serde_derive 1.0.80 (registry+https://github.com/rust-lang/crates.io-index)", "serde_json 1.0.33 (registry+https://github.com/rust-lang/crates.io-index)", + "smallbitvec 2.3.0 (registry+https://github.com/rust-lang/crates.io-index)", "tree-sitter 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)", ] @@ -548,6 +544,11 @@ dependencies = [ "serde 1.0.80 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "smallbitvec" +version = "2.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" + [[package]] name = "smallvec" version = "0.6.7" @@ -729,7 +730,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index" "checksum backtrace 0.3.9 (registry+https://github.com/rust-lang/crates.io-index)" = "89a47830402e9981c5c41223151efcced65a0510c13097c769cede7efb34782a" "checksum backtrace-sys 0.1.24 (registry+https://github.com/rust-lang/crates.io-index)" = "c66d56ac8dabd07f6aacdaf633f4b8262f5b3601a810a0dcddffd5c22c69daa0" "checksum bitflags 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)" = "228047a76f468627ca71776ecdebd732a3423081fcf5125585bcd7c49886ce12" -"checksum bitvec 0.8.0 (registry+https://github.com/rust-lang/crates.io-index)" = "e37e2176261200377c7cde4c6de020394174df556c356f965e4bc239f5ce1c5a" "checksum blake2-rfc 0.2.18 (registry+https://github.com/rust-lang/crates.io-index)" = "5d6d530bdd2d52966a6d03b7a964add7ae1a288d25214066fd4b600f0f796400" "checksum cc 1.0.25 (registry+https://github.com/rust-lang/crates.io-index)" = "f159dfd43363c4d08055a07703eb7a3406b0dac4d0584d96965a3262db3c9d16" "checksum cfg-if 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "082bb9b28e00d3c9d39cc03e64ce4cea0f1bb9b3fde493f0cbc008472d22bdf4" @@ -787,6 +787,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" "checksum serde 1.0.80 (registry+https://github.com/rust-lang/crates.io-index)" = "15c141fc7027dd265a47c090bf864cf62b42c4d228bbcf4e51a0c9e2b0d3f7ef" "checksum serde_derive 
1.0.80 (registry+https://github.com/rust-lang/crates.io-index)" = "225de307c6302bec3898c51ca302fc94a7a1697ef0845fcee6448f33c032249c" "checksum serde_json 1.0.33 (registry+https://github.com/rust-lang/crates.io-index)" = "c37ccd6be3ed1fdf419ee848f7c758eb31b054d7cd3ae3600e3bae0adf569811" +"checksum smallbitvec 2.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "1764fe2b30ee783bfe3b9b37b2649d8d590b3148bb12e0079715d4d5c673562e" "checksum smallvec 0.6.7 (registry+https://github.com/rust-lang/crates.io-index)" = "b73ea3738b47563803ef814925e69be00799a8c07420be8b996f8e98fb2336db" "checksum stable_deref_trait 1.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "dba1a27d3efae4351c8051072d619e3ade2820635c3958d826bfea39d59b54c8" "checksum strsim 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)" = "bb4f380125926a99e52bc279241539c018323fab05ad6368b56f93d9369ff550" diff --git a/Cargo.toml b/Cargo.toml index 93a49d2c..f3880a1c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -5,7 +5,8 @@ authors = ["Max Brunsfeld "] edition = "2018" [dependencies] -bitvec = "0.8" +lazy_static = "1.2.0" +smallbitvec = "2.3.0" clap = "2.32" dirs = "1.0.2" ignore = "0.4.4" diff --git a/src/build_tables/inline_variables.rs b/src/build_tables/inline_variables.rs new file mode 100644 index 00000000..d201519f --- /dev/null +++ b/src/build_tables/inline_variables.rs @@ -0,0 +1,318 @@ +use super::item::ParseItem; +use crate::grammars::{Production, SyntaxGrammar}; +use std::collections::HashMap; + +pub(crate) struct InlinedProductionMap { + pub inlined_productions: Vec, + item_map: HashMap>, +} + +impl InlinedProductionMap { + pub fn new(grammar: &SyntaxGrammar) -> Self { + let mut result = Self { + inlined_productions: Vec::new(), + item_map: HashMap::new(), + }; + + let mut items_to_process = Vec::new(); + for (variable_index, variable) in grammar.variables.iter().enumerate() { + for production_index in 0..variable.productions.len() { + 
items_to_process.push(ParseItem::Normal { + variable_index: variable_index as u32, + production_index: production_index as u32, + step_index: 0, + }); + while !items_to_process.is_empty() { + let mut i = 0; + while i < items_to_process.len() { + let item = &items_to_process[i]; + if let Some(step) = item.step(grammar, &result) { + if grammar.variables_to_inline.contains(&step.symbol) { + let inlined_items = result + .inline(*item, grammar) + .into_iter() + .map(|production_index| ParseItem::Inlined { + variable_index: item.variable_index(), + production_index: *production_index, + step_index: item.step_index() as u32, + }) + .collect::>(); + items_to_process.splice(i..i + 1, inlined_items); + } else { + items_to_process[i] = item.successor(); + i += 1; + } + } else { + items_to_process.remove(i); + } + } + } + } + } + + result + } + + pub fn inlined_items<'a>( + &'a self, + item: ParseItem, + ) -> Option + 'a> { + self.item_map.get(&item).map(|production_indices| { + production_indices + .iter() + .cloned() + .map(move |production_index| ParseItem::Inlined { + variable_index: item.variable_index(), + production_index, + step_index: item.step_index() as u32, + }) + }) + } + + fn inline(&mut self, item: ParseItem, grammar: &SyntaxGrammar) -> &Vec { + let step_index = item.step_index(); + let mut productions_to_add = grammar.variables + [item.step(grammar, self).unwrap().symbol.index] + .productions + .clone(); + + let mut i = 0; + while i < productions_to_add.len() { + if let Some(first_symbol) = productions_to_add[i].first_symbol() { + if grammar.variables_to_inline.contains(&first_symbol) { + // Remove the production from the vector, replacing it with a placeholder. + let production = productions_to_add + .splice(i..i + 1, [Production::default()].iter().cloned()) + .next() + .unwrap(); + + // Replace the placeholder with the inlined productions. 
+ productions_to_add.splice( + i..i + 1, + grammar.variables[first_symbol.index] + .productions + .iter() + .map(|p| { + let mut p = p.clone(); + p.steps.extend(production.steps[1..].iter().cloned()); + p + }), + ); + continue; + } + } + i += 1; + } + + let result = productions_to_add + .into_iter() + .map(|production_to_add| { + let mut inlined_production = item.production(grammar, &self).clone(); + inlined_production.steps.splice( + step_index..step_index + 1, + production_to_add.steps.iter().cloned(), + ); + self.inlined_productions + .iter() + .position(|p| *p == inlined_production) + .unwrap_or({ + self.inlined_productions.push(inlined_production); + self.inlined_productions.len() - 1 + }) as u32 + }) + .collect(); + + self.item_map.entry(item).or_insert(result) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::grammars::{ProductionStep, SyntaxVariable, VariableType}; + use crate::rules::Symbol; + + #[test] + fn test_basic_inlining() { + let grammar = SyntaxGrammar { + expected_conflicts: Vec::new(), + extra_tokens: Vec::new(), + external_tokens: Vec::new(), + word_token: None, + variables_to_inline: vec![Symbol::non_terminal(1)], + variables: vec![ + SyntaxVariable { + name: "var0".to_string(), + kind: VariableType::Named, + productions: vec![Production { + dynamic_precedence: 0, + steps: vec![ + ProductionStep::new(Symbol::terminal(10)), + ProductionStep::new(Symbol::non_terminal(1)), // inlined + ProductionStep::new(Symbol::terminal(11)), + ], + }], + }, + SyntaxVariable { + name: "var1".to_string(), + kind: VariableType::Named, + productions: vec![ + Production { + dynamic_precedence: 0, + steps: vec![ + ProductionStep::new(Symbol::terminal(12)), + ProductionStep::new(Symbol::terminal(13)), + ], + }, + Production { + dynamic_precedence: 0, + steps: vec![ProductionStep::new(Symbol::terminal(14))], + }, + ], + }, + ], + }; + + let inline_map = InlinedProductionMap::new(&grammar); + + // Nothing to inline at step 0. 
+ assert_eq!( + display_items( + inline_map.inlined_items(ParseItem::Normal { + variable_index: 0, + production_index: 0, + step_index: 0 + }), + &grammar, + &inline_map + ), + None + ); + + // Inlining variable 1 yields two productions. + assert_eq!( + display_items( + inline_map.inlined_items(ParseItem::Normal { + variable_index: 0, + production_index: 0, + step_index: 1 + }), + &grammar, + &inline_map + ), + Some(vec![ + "terminal-10 • terminal-12 terminal-13 terminal-11".to_string(), + "terminal-10 • terminal-14 terminal-11".to_string(), + ]) + ); + } + + #[test] + fn test_nested_inlining() { + let grammar = SyntaxGrammar { + variables: vec![ + SyntaxVariable { + name: "var0".to_string(), + kind: VariableType::Named, + productions: vec![ + Production { + dynamic_precedence: 0, + steps: vec![ + ProductionStep::new(Symbol::terminal(10)), + ProductionStep::new(Symbol::non_terminal(1)), // inlined + ProductionStep::new(Symbol::terminal(11)), + ProductionStep::new(Symbol::non_terminal(2)), // inlined + ProductionStep::new(Symbol::terminal(12)), + ], + }, + ], + }, + SyntaxVariable { + name: "var1".to_string(), + kind: VariableType::Named, + productions: vec![ + Production { + dynamic_precedence: 0, + steps: vec![ProductionStep::new(Symbol::terminal(13))], + }, + Production { + dynamic_precedence: 0, + steps: vec![ + ProductionStep::new(Symbol::non_terminal(3)), // inlined + ProductionStep::new(Symbol::terminal(14)), + ], + }, + ], + }, + SyntaxVariable { + name: "var2".to_string(), + kind: VariableType::Named, + productions: vec![Production { + dynamic_precedence: 0, + steps: vec![ProductionStep::new(Symbol::terminal(15))], + }], + }, + SyntaxVariable { + name: "var3".to_string(), + kind: VariableType::Named, + productions: vec![Production { + dynamic_precedence: 0, + steps: vec![ProductionStep::new(Symbol::terminal(16))], + }], + }, + ], + variables_to_inline: vec![ + Symbol::non_terminal(1), + Symbol::non_terminal(2), + Symbol::non_terminal(3), + ], + 
expected_conflicts: Vec::new(), + extra_tokens: Vec::new(), + external_tokens: Vec::new(), + word_token: None, + }; + + let inline_map = InlinedProductionMap::new(&grammar); + + let items = inline_map.inlined_items(ParseItem::Normal { + variable_index: 0, + production_index: 0, + step_index: 1 + }).unwrap().collect::>(); + + assert_eq!( + display_items(Some(items.iter().cloned()), &grammar, &inline_map), + Some(vec![ + "terminal-10 • terminal-13 terminal-11 non-terminal-2 terminal-12".to_string(), + "terminal-10 • terminal-16 terminal-14 terminal-11 non-terminal-2 terminal-12".to_string() + ]) + ); + + let item = items[0].successor().successor(); + assert_eq!( + display_items(Some([item].iter().cloned()), &grammar, &inline_map), + Some(vec![ + "terminal-10 terminal-13 terminal-11 • non-terminal-2 terminal-12".to_string(), + ]) + ); + + assert_eq!( + display_items(inline_map.inlined_items(item), &grammar, &inline_map), + Some(vec![ + "terminal-10 terminal-13 terminal-11 • terminal-15 terminal-12".to_string(), + ]) + ); + } + + fn display_items( + items: Option>, + grammar: &SyntaxGrammar, + inline_map: &InlinedProductionMap, + ) -> Option> { + items.map(|items| { + items + .map(|item| format!("{}", item.with(grammar, inline_map))) + .collect() + }) + } +} diff --git a/src/build_tables/item.rs b/src/build_tables/item.rs index c8d30997..537b0928 100644 --- a/src/build_tables/item.rs +++ b/src/build_tables/item.rs @@ -1,22 +1,209 @@ -use crate::grammars::Production; +use super::inline_variables::InlinedProductionMap; +use crate::grammars::{Production, ProductionStep, SyntaxGrammar}; +use crate::rules::{Symbol, SymbolType}; +use smallbitvec::SmallBitVec; use std::collections::HashMap; -use bitvec::BitVec; +use std::hash::{Hash, Hasher}; +use std::fmt; -#[derive(Debug, PartialEq, Eq)] -pub(super) struct LookaheadSet { - terminal_bits: BitVec, - external_bits: BitVec, +lazy_static! 
{ + static ref START_PRODUCTION: Production = Production { + dynamic_precedence: 0, + steps: vec![ProductionStep { + symbol: Symbol { + index: 0, + kind: SymbolType::NonTerminal, + }, + precedence: 0, + associativity: None, + alias: None, + }], + }; +} + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub(crate) struct LookaheadSet { + terminal_bits: SmallBitVec, + external_bits: SmallBitVec, eof: bool, } #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] -pub(super) struct ParseItem { - variable_index: u32, - production_index: u32, - step_index: u32, +pub(crate) enum ParseItem { + Start { + step_index: u32, + }, + Normal { + variable_index: u32, + production_index: u32, + step_index: u32, + }, + Inlined { + variable_index: u32, + production_index: u32, + step_index: u32, + }, } -#[derive(Debug, PartialEq, Eq)] -pub(super) struct ParseItemSet { - entries: HashMap +#[derive(Clone, Debug, PartialEq, Eq)] +pub(crate) struct ParseItemSet { + pub entries: HashMap, +} + +impl LookaheadSet { + pub fn new() -> Self { + Self { + terminal_bits: SmallBitVec::new(), + external_bits: SmallBitVec::new(), + eof: false, + } + } + + pub fn insert(&mut self, other: Symbol) { + match other.kind { + SymbolType::NonTerminal => panic!("Cannot store non-terminals in a LookaheadSet"), + SymbolType::Terminal => self.terminal_bits.set(other.index, true), + SymbolType::External => self.external_bits.set(other.index, true), + } + } + + pub fn insert_all(&mut self, other: &LookaheadSet) -> bool { + let mut result = false; + if other.terminal_bits.len() > self.terminal_bits.len() { + self.terminal_bits.resize(other.terminal_bits.len(), false); + } + if other.external_bits.len() > self.external_bits.len() { + self.external_bits.resize(other.external_bits.len(), false); + } + for (i, element) in other.terminal_bits.iter().enumerate() { + if element { + result |= !self.terminal_bits[i]; + self.terminal_bits.set(i, element); + } + } + for (i, element) in other.external_bits.iter().enumerate() { + if 
element { + result |= !self.external_bits[i]; + self.external_bits.set(i, element); + } + } + if other.eof { + result |= !self.eof; + self.eof = true; + } + result + } +} + +impl ParseItem { + pub fn is_kernel(&self) -> bool { + match self { + ParseItem::Start { .. } => true, + ParseItem::Normal { step_index, .. } | ParseItem::Inlined { step_index, .. } => { + *step_index > 0 + } + } + } + + pub fn production<'a>( + &'a self, + grammar: &'a SyntaxGrammar, + inlined_productions: &'a InlinedProductionMap, + ) -> &'a Production { + match self { + ParseItem::Start { .. } => &START_PRODUCTION, + ParseItem::Normal { + variable_index, + production_index, + .. + } => { + &grammar.variables[*variable_index as usize].productions[*production_index as usize] + } + ParseItem::Inlined { + production_index, + .. + } => &inlined_productions.inlined_productions[*production_index as usize], + } + } + + pub fn step<'a>( + &'a self, + grammar: &'a SyntaxGrammar, + inlined_productions: &'a InlinedProductionMap, + ) -> Option<&'a ProductionStep> { + self.production(grammar, inlined_productions).steps.get(self.step_index()) + } + + pub fn variable_index(&self) -> u32 { + match self { + ParseItem::Start { .. } => panic!("Start item doesn't have a variable index"), + ParseItem::Normal { variable_index, .. } + | ParseItem::Inlined { variable_index, .. } => *variable_index, + } + } + + pub fn step_index(&self) -> usize { + match self { + ParseItem::Start { step_index } + | ParseItem::Normal { step_index, .. } + | ParseItem::Inlined { step_index, .. } => *step_index as usize, + } + } + + fn step_index_mut(&mut self) -> &mut u32 { + match self { + ParseItem::Start { step_index } + | ParseItem::Normal { step_index, .. } + | ParseItem::Inlined { step_index, .. 
} => step_index, + } + } + + pub fn with<'a>(&'a self, grammar: &'a SyntaxGrammar, inlines: &'a InlinedProductionMap) -> ParseItemDisplay<'a> { + ParseItemDisplay(self, grammar, inlines) + } + + pub fn successor(&self) -> ParseItem { + let mut result = self.clone(); + *result.step_index_mut() += 1; + result + } +} + +pub struct ParseItemDisplay<'a>(&'a ParseItem, &'a SyntaxGrammar, &'a InlinedProductionMap); + +impl<'a> fmt::Display for ParseItemDisplay<'a> { + fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> { + let step_index = self.0.step_index(); + let production = self.0.production(self.1, self.2); + for (i, step) in production.steps.iter().enumerate() { + if i > 0 { + write!(f, " ")?; + } + + if i == step_index { + write!(f, "• ")?; + } + + let name = if step.symbol.is_terminal() { + "terminal" + } else if step.symbol.is_external() { + "external" + } else { + "non-terminal" + }; + + write!(f, "{}-{}", name, step.symbol.index)?; + } + Ok(()) + } +} + +impl Hash for ParseItemSet { + fn hash(&self, hasher: &mut H) { + hasher.write_usize(self.entries.len()); + for (item, lookaheads) in self.entries.iter() { + item.hash(hasher); + lookaheads.hash(hasher); + } + } } diff --git a/src/build_tables/mod.rs b/src/build_tables/mod.rs index c3518428..f7bb1f9c 100644 --- a/src/build_tables/mod.rs +++ b/src/build_tables/mod.rs @@ -1,4 +1,5 @@ mod item; +mod inline_variables; use std::collections::{HashMap, VecDeque}; use crate::grammars::{SyntaxGrammar, LexicalGrammar}; diff --git a/src/grammars.rs b/src/grammars.rs index 74c213e1..8abdad24 100644 --- a/src/grammars.rs +++ b/src/grammars.rs @@ -108,6 +108,18 @@ impl ProductionStep { } } +impl Production { + pub fn first_symbol(&self) -> Option { + self.steps.first().map(|s| s.symbol.clone()) + } +} + +impl Default for Production { + fn default() -> Self { + Production { dynamic_precedence: 0, steps: Vec::new() } + } +} + impl Variable { pub fn named(name: &str, rule: Rule) -> Self { Self { name: 
name.to_string(), kind: VariableType::Named, rule } diff --git a/src/main.rs b/src/main.rs index b83764fc..9dc9efb2 100644 --- a/src/main.rs +++ b/src/main.rs @@ -2,6 +2,7 @@ use clap::{App, Arg, SubCommand}; #[macro_use] extern crate serde_derive; #[macro_use] extern crate serde_json; +#[macro_use] extern crate lazy_static; mod build_tables; mod error; diff --git a/src/parse_grammar.rs b/src/parse_grammar.rs index 0f1f5008..27dc8b05 100644 --- a/src/parse_grammar.rs +++ b/src/parse_grammar.rs @@ -2,7 +2,6 @@ use serde_json::{Map, Value}; use crate::error::Result; use crate::grammars::{InputGrammar, Variable, VariableType}; use crate::rules::Rule; -use std::collections::HashMap; #[derive(Deserialize)] #[serde(tag = "type")] diff --git a/src/rules.rs b/src/rules.rs index d7234f45..9374a283 100644 --- a/src/rules.rs +++ b/src/rules.rs @@ -10,7 +10,7 @@ pub(crate) enum SymbolType { #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] pub(crate) enum Associativity { Left, - Right + Right, } #[derive(Clone, Debug, PartialEq, Eq, Hash)] @@ -137,24 +137,37 @@ impl Rule { } impl Symbol { + pub fn is_terminal(&self) -> bool { + self.kind == SymbolType::Terminal + } + pub fn is_non_terminal(&self) -> bool { - return self.kind == SymbolType::NonTerminal + self.kind == SymbolType::NonTerminal } pub fn is_external(&self) -> bool { - return self.kind == SymbolType::External + self.kind == SymbolType::External } pub fn non_terminal(index: usize) -> Self { - Symbol { kind: SymbolType::NonTerminal, index } + Symbol { + kind: SymbolType::NonTerminal, + index, + } } pub fn terminal(index: usize) -> Self { - Symbol { kind: SymbolType::Terminal, index } + Symbol { + kind: SymbolType::Terminal, + index, + } } pub fn external(index: usize) -> Self { - Symbol { kind: SymbolType::External, index } + Symbol { + kind: SymbolType::External, + index, + } } } @@ -169,11 +182,14 @@ fn add_metadata(input: Rule, f: T) -> Rule { Rule::Metadata { rule, mut params } => { f(&mut params); Rule::Metadata 
{ rule, params } - }, + } _ => { let mut params = MetadataParams::default(); f(&mut params); - Rule::Metadata { rule: Box::new(input), params } + Rule::Metadata { + rule: Box::new(input), + params, + } } } } @@ -184,7 +200,7 @@ fn choice_helper(result: &mut Vec, rule: Rule) { for element in elements { choice_helper(result, element); } - }, + } _ => { if !result.contains(&rule) { result.push(rule); From 143588c148a130217beb7c547647d8e3442b9762 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Tue, 18 Dec 2018 17:31:54 -0800 Subject: [PATCH 069/208] Implement ItemSetBuilder --- src/build_tables/item.rs | 16 +- src/build_tables/item_set_builder.rs | 279 +++++++++++++++++++++++++++ src/build_tables/mod.rs | 2 + 3 files changed, 294 insertions(+), 3 deletions(-) create mode 100644 src/build_tables/item_set_builder.rs diff --git a/src/build_tables/item.rs b/src/build_tables/item.rs index 537b0928..c99815eb 100644 --- a/src/build_tables/item.rs +++ b/src/build_tables/item.rs @@ -50,6 +50,8 @@ pub(crate) struct ParseItemSet { pub entries: HashMap, } +pub(crate) struct ParseItemDisplay<'a>(&'a ParseItem, &'a SyntaxGrammar, &'a InlinedProductionMap); + impl LookaheadSet { pub fn new() -> Self { Self { @@ -96,6 +98,10 @@ impl LookaheadSet { } impl ParseItem { + pub fn start() -> Self { + ParseItem::Start { step_index: 0 } + } + pub fn is_kernel(&self) -> bool { match self { ParseItem::Start { .. 
} => true, @@ -106,7 +112,7 @@ impl ParseItem { } pub fn production<'a>( - &'a self, + &self, grammar: &'a SyntaxGrammar, inlined_productions: &'a InlinedProductionMap, ) -> &'a Production { @@ -127,7 +133,7 @@ impl ParseItem { } pub fn step<'a>( - &'a self, + &self, grammar: &'a SyntaxGrammar, inlined_productions: &'a InlinedProductionMap, ) -> Option<&'a ProductionStep> { @@ -169,7 +175,11 @@ impl ParseItem { } } -pub struct ParseItemDisplay<'a>(&'a ParseItem, &'a SyntaxGrammar, &'a InlinedProductionMap); +impl ParseItemSet { + pub fn new() -> Self { + Self { entries: HashMap::new() } + } +} impl<'a> fmt::Display for ParseItemDisplay<'a> { fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> { diff --git a/src/build_tables/item_set_builder.rs b/src/build_tables/item_set_builder.rs new file mode 100644 index 00000000..61d45ded --- /dev/null +++ b/src/build_tables/item_set_builder.rs @@ -0,0 +1,279 @@ +use super::inline_variables::InlinedProductionMap; +use super::item::{LookaheadSet, ParseItem, ParseItemSet}; +use crate::grammars::{LexicalGrammar, SyntaxGrammar}; +use crate::rules::Symbol; +use std::collections::{HashMap, HashSet}; + +#[derive(Clone, Debug, PartialEq, Eq)] +struct TransitiveClosureAddition { + item: ParseItem, + info: FollowSetInfo, +} + +#[derive(Clone, Debug, PartialEq, Eq)] +struct FollowSetInfo { + lookaheads: LookaheadSet, + propagates_lookaheads: bool, +} + +pub(crate) struct ParseItemSetBuilder { + first_sets: HashMap, + last_sets: HashMap, + transitive_closure_additions: Vec>, + inlined_production_map: InlinedProductionMap, +} + +fn find_or_push(vector: &mut Vec, value: T) { + if !vector.contains(&value) { + vector.push(value); + } +} + +impl ParseItemSetBuilder { + pub fn new(syntax_grammar: &SyntaxGrammar, lexical_grammar: &LexicalGrammar) -> Self { + let mut result = Self { + first_sets: HashMap::new(), + last_sets: HashMap::new(), + transitive_closure_additions: vec![Vec::new(); syntax_grammar.variables.len()], + 
inlined_production_map: InlinedProductionMap::new(syntax_grammar), + }; + + // For each grammar symbol, populate the FIRST and LAST sets: the set of + // terminals that appear at the beginning and end that symbol's productions, + // respectively. + // + // For a terminal symbol, the FIRST and LAST set just consists of the + // terminal itself. + for i in 0..lexical_grammar.variables.len() { + let symbol = Symbol::terminal(i); + let mut set = LookaheadSet::new(); + set.insert(symbol); + result.first_sets.insert(symbol, set.clone()); + result.last_sets.insert(symbol, set); + } + + for i in 0..syntax_grammar.external_tokens.len() { + let symbol = Symbol::external(i); + let mut set = LookaheadSet::new(); + set.insert(symbol); + result.first_sets.insert(symbol, set.clone()); + result.last_sets.insert(symbol, set); + } + + // The FIRST set of a non-terminal `i` is the union of the following sets: + // * the set of all terminals that appear at the beginings of i's productions + // * the FIRST sets of all the non-terminals that appear at the beginnings + // of i's productions + // + // Rather than computing these sets using recursion, we use an explicit stack + // called `symbols_to_process`. 
+ let mut symbols_to_process = Vec::new(); + let mut processed_non_terminals = HashSet::new(); + for i in 0..syntax_grammar.variables.len() { + let symbol = Symbol::non_terminal(i); + + let first_set = &mut result + .first_sets + .entry(symbol) + .or_insert(LookaheadSet::new()); + processed_non_terminals.clear(); + symbols_to_process.clear(); + symbols_to_process.push(symbol); + while let Some(current_symbol) = symbols_to_process.pop() { + if current_symbol.is_terminal() || current_symbol.is_external() { + first_set.insert(current_symbol); + } else if processed_non_terminals.insert(current_symbol) { + for production in syntax_grammar.variables[current_symbol.index] + .productions + .iter() + { + if let Some(step) = production.steps.first() { + symbols_to_process.push(step.symbol); + } + } + } + } + + // The LAST set is defined in a similar way to the FIRST set. + let last_set = &mut result + .last_sets + .entry(symbol) + .or_insert(LookaheadSet::new()); + processed_non_terminals.clear(); + symbols_to_process.clear(); + symbols_to_process.push(symbol); + while let Some(current_symbol) = symbols_to_process.pop() { + if current_symbol.is_terminal() || current_symbol.is_external() { + last_set.insert(current_symbol); + } else if processed_non_terminals.insert(current_symbol) { + for production in syntax_grammar.variables[current_symbol.index] + .productions + .iter() + { + if let Some(step) = production.steps.last() { + symbols_to_process.push(step.symbol); + } + } + } + } + } + + // To compute an item set's transitive closure, we find each item in the set + // whose next symbol is a non-terminal, and we add new items to the set for + // each of that symbols' productions. These productions might themselves begin + // with non-terminals, so the process continues recursively. 
In this process, + // the total set of entries that get added depends only on two things: + // * the set of non-terminal symbols that occur at each item's current position + // * the set of terminals that occurs after each of these non-terminal symbols + // + // So we can avoid a lot of duplicated recursive work by precomputing, for each + // non-terminal symbol `i`, a final list of *additions* that must be made to an + // item set when `i` occurs as the next symbol in one if its core items. The + // structure of an *addition* is as follows: + // * `item` - the new item that must be added as part of the expansion of `i` + // * `lookaheads` - lookahead tokens that can always come after that item in + // the expansion of `i` + // * `propagates_lookaheads` - a boolean indicating whether or not `item` can + // occur at the *end* of the expansion of `i`, so that i's own current + // lookahead tokens can occur after `item`. + // + // Again, rather than computing these additions recursively, we use an explicit + // stack called `entries_to_process`. + for i in 0..syntax_grammar.variables.len() { + let empty_lookaheads = LookaheadSet::new(); + let mut entries_to_process = vec![(i, &empty_lookaheads, true)]; + + // First, build up a map whose keys are all of the non-terminals that can + // appear at the beginning of non-terminal `i`, and whose values store + // information about the tokens that can follow each non-terminal. 
+ let mut follow_set_info_by_non_terminal = HashMap::new(); + while let Some(entry) = entries_to_process.pop() { + let (variable_index, lookaheads, propagates_lookaheads) = entry; + let existing_info = follow_set_info_by_non_terminal + .entry(variable_index) + .or_insert_with(|| FollowSetInfo { + lookaheads: LookaheadSet::new(), + propagates_lookaheads: false, + }); + + let did_add_follow_set_info; + if propagates_lookaheads { + did_add_follow_set_info = !existing_info.propagates_lookaheads; + existing_info.propagates_lookaheads = true; + } else { + did_add_follow_set_info = existing_info.lookaheads.insert_all(lookaheads); + } + + if did_add_follow_set_info { + for production in &syntax_grammar.variables[variable_index].productions { + if let Some(symbol) = production.first_symbol() { + if symbol.is_non_terminal() { + if production.steps.len() == 1 { + entries_to_process.push(( + symbol.index, + lookaheads, + propagates_lookaheads, + )); + } else { + entries_to_process.push(( + symbol.index, + &result.first_sets[&production.steps[1].symbol], + false, + )); + } + } + } + } + } + } + + // Store all of those non-terminals' productions, along with their associated + // lookahead info, as *additions* associated with non-terminal `i`. 
+ let additions_for_non_terminal = &mut result.transitive_closure_additions[i]; + for (variable_index, follow_set_info) in follow_set_info_by_non_terminal { + let variable = &syntax_grammar.variables[variable_index]; + for production_index in 0..variable.productions.len() { + let item = ParseItem::Normal { + variable_index: variable_index as u32, + production_index: production_index as u32, + step_index: 0, + }; + + if let Some(inlined_items) = result.inlined_production_map.inlined_items(item) { + for inlined_item in inlined_items { + find_or_push( + additions_for_non_terminal, + TransitiveClosureAddition { + item: inlined_item, + info: follow_set_info.clone(), + }, + ); + } + } else { + find_or_push( + additions_for_non_terminal, + TransitiveClosureAddition { + item, + info: follow_set_info.clone(), + }, + ); + } + } + } + } + + result + } + + pub(crate) fn transitive_closure( + &mut self, + item_set: ParseItemSet, + grammar: &SyntaxGrammar, + ) -> ParseItemSet { + let mut result = ParseItemSet::new(); + for (item, lookaheads) in item_set.entries { + if let Some(items) = self.inlined_production_map.inlined_items(item) { + for item in items { + self.add_item(&mut result, item, lookaheads.clone(), grammar); + } + } else { + self.add_item(&mut result, item, lookaheads, grammar); + } + } + result + } + + fn add_item( + &self, + set: &mut ParseItemSet, + item: ParseItem, + lookaheads: LookaheadSet, + grammar: &SyntaxGrammar, + ) { + if let Some(step) = item.step(grammar, &self.inlined_production_map) { + if step.symbol.is_non_terminal() { + let next_step = item.successor().step(grammar, &self.inlined_production_map); + + // Determine which tokens can follow this non-terminal. + let following_tokens = if let Some(next_step) = next_step { + self.first_sets.get(&next_step.symbol).unwrap() + } else { + &lookaheads + }; + + // Use the pre-computed *additions* to expand the non-terminal. 
+ for addition in &self.transitive_closure_additions[step.symbol.index] { + let lookaheads = set + .entries + .entry(addition.item) + .or_insert_with(|| LookaheadSet::new()); + lookaheads.insert_all(&addition.info.lookaheads); + if addition.info.propagates_lookaheads { + lookaheads.insert_all(following_tokens); + } + } + } + } + set.entries.insert(item, lookaheads); + } +} diff --git a/src/build_tables/mod.rs b/src/build_tables/mod.rs index f7bb1f9c..01d9219d 100644 --- a/src/build_tables/mod.rs +++ b/src/build_tables/mod.rs @@ -1,5 +1,7 @@ mod item; mod inline_variables; +mod item; +mod item_set_builder; use std::collections::{HashMap, VecDeque}; use crate::grammars::{SyntaxGrammar, LexicalGrammar}; From d078c263b0fc003c24ba2d08355fb1a87af6b65f Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 20 Dec 2018 13:35:13 -0800 Subject: [PATCH 070/208] Fix bugs in grammar JSON parsing --- Cargo.lock | 7 +++++++ Cargo.toml | 5 ++++- src/parse_grammar.rs | 23 +++++++++++++++++++---- 3 files changed, 30 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 410580fa..538517f1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -229,6 +229,11 @@ dependencies = [ "winapi-util 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "indexmap" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" + [[package]] name = "itoa" version = "0.4.3" @@ -539,6 +544,7 @@ name = "serde_json" version = "1.0.33" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ + "indexmap 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)", "itoa 0.4.3 (registry+https://github.com/rust-lang/crates.io-index)", "ryu 0.2.7 (registry+https://github.com/rust-lang/crates.io-index)", "serde 1.0.80 (registry+https://github.com/rust-lang/crates.io-index)", @@ -748,6 +754,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" "checksum fuchsia-zircon-sys 0.3.3 
(registry+https://github.com/rust-lang/crates.io-index)" = "3dcaa9ae7725d12cdb85b3ad99a434db70b468c09ded17e012d86b5c1010f7a7" "checksum globset 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)" = "4743617a7464bbda3c8aec8558ff2f9429047e025771037df561d383337ff865" "checksum ignore 0.4.4 (registry+https://github.com/rust-lang/crates.io-index)" = "36ecfc5ad80f0b1226df948c562e2cddd446096be3f644c95106400eae8a5e01" +"checksum indexmap 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)" = "7e81a7c05f79578dbc15793d8b619db9ba32b4577003ef3af1a91c416798c58d" "checksum itoa 0.4.3 (registry+https://github.com/rust-lang/crates.io-index)" = "1306f3464951f30e30d12373d31c79fbd52d236e5e896fd92f96ec7babbbe60b" "checksum lazy_static 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "a374c89b9db55895453a74c1e38861d9deec0b01b405a82516e9d5de4820dea1" "checksum libc 0.2.44 (registry+https://github.com/rust-lang/crates.io-index)" = "10923947f84a519a45c8fefb7dd1b3e8c08747993381adee176d7a82b4195311" diff --git a/Cargo.toml b/Cargo.toml index f3880a1c..b29bc85e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,6 +14,9 @@ libloading = "0.5" rusqlite = "0.14.0" serde = "1.0" serde_derive = "1.0" -serde_json = "1.0" tree-sitter = "0.3.1" regex-syntax = "0.6.4" + +[dependencies.serde_json] +version = "1.0" +features = ["preserve_order"] diff --git a/src/parse_grammar.rs b/src/parse_grammar.rs index 27dc8b05..07396329 100644 --- a/src/parse_grammar.rs +++ b/src/parse_grammar.rs @@ -7,6 +7,11 @@ use crate::rules::Rule; #[serde(tag = "type")] #[allow(non_camel_case_types)] enum RuleJSON { + ALIAS { + content: Box, + named: bool, + value: String, + }, BLANK, STRING { value: String, @@ -26,6 +31,13 @@ enum RuleJSON { REPEAT { content: Box, }, + REPEAT1 { + content: Box, + }, + PREC_DYNAMIC { + value: i32, + content: Box, + }, PREC_LEFT { value: i32, content: Box, @@ -41,7 +53,7 @@ enum RuleJSON { TOKEN { content: Box, }, - TOKEN_IMMEDIATE { + IMMEDIATE_TOKEN { 
content: Box, }, } @@ -97,18 +109,21 @@ pub(crate) fn parse_grammar(input: &str) -> Result { fn parse_rule(json: RuleJSON) -> Rule { match json { + RuleJSON::ALIAS { content, value, named } => Rule::alias(parse_rule(*content), value, named), RuleJSON::BLANK => Rule::Blank, RuleJSON::STRING { value } => Rule::String(value), RuleJSON::PATTERN { value } => Rule::Pattern(value), RuleJSON::SYMBOL { name } => Rule::NamedSymbol(name), RuleJSON::CHOICE { members } => Rule::choice(members.into_iter().map(parse_rule).collect()), RuleJSON::SEQ { members } => Rule::seq(members.into_iter().map(parse_rule).collect()), - RuleJSON::REPEAT { content } => Rule::repeat(parse_rule(*content)), + RuleJSON::REPEAT1 { content } => Rule::repeat(parse_rule(*content)), + RuleJSON::REPEAT { content } => Rule::choice(vec![Rule::repeat(parse_rule(*content)), Rule::Blank]), RuleJSON::PREC { value, content } => Rule::prec(value, parse_rule(*content)), RuleJSON::PREC_LEFT { value, content } => Rule::prec_left(value, parse_rule(*content)), RuleJSON::PREC_RIGHT { value, content } => Rule::prec_right(value, parse_rule(*content)), + RuleJSON::PREC_DYNAMIC { value, content } => Rule::prec_dynamic(value, parse_rule(*content)), RuleJSON::TOKEN { content } => Rule::token(parse_rule(*content)), - RuleJSON::TOKEN_IMMEDIATE { content } => Rule::immediate_token(parse_rule(*content)), + RuleJSON::IMMEDIATE_TOKEN { content } => Rule::immediate_token(parse_rule(*content)), } } @@ -122,7 +137,7 @@ mod tests { "name": "my_lang", "rules": { "file": { - "type": "REPEAT", + "type": "REPEAT1", "content": { "type": "SYMBOL", "name": "statement" From 988dc7de35278f2ab36df90190a83c3727f391c9 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 20 Dec 2018 13:35:34 -0800 Subject: [PATCH 071/208] Handle precedence and aliases properly when inlining variables --- src/build_tables/inline_variables.rs | 263 ++++++++++++++++++++------- 1 file changed, 193 insertions(+), 70 deletions(-) diff --git 
a/src/build_tables/inline_variables.rs b/src/build_tables/inline_variables.rs index d201519f..affbe163 100644 --- a/src/build_tables/inline_variables.rs +++ b/src/build_tables/inline_variables.rs @@ -108,10 +108,25 @@ impl InlinedProductionMap { .into_iter() .map(|production_to_add| { let mut inlined_production = item.production(grammar, &self).clone(); - inlined_production.steps.splice( - step_index..step_index + 1, - production_to_add.steps.iter().cloned(), - ); + let removed_step = inlined_production + .steps + .splice( + step_index..step_index + 1, + production_to_add.steps.iter().cloned(), + ) + .next() + .unwrap(); + let inserted_steps = &mut inlined_production.steps + [step_index..step_index + production_to_add.steps.len()]; + if let Some(alias) = removed_step.alias { + for inserted_step in inserted_steps.iter_mut() { + inserted_step.alias = Some(alias.clone()); + } + } + if let Some(last_inserted_step) = inserted_steps.last_mut() { + last_inserted_step.precedence = removed_step.precedence; + last_inserted_step.associativity = removed_step.associativity; + } self.inlined_productions .iter() .position(|p| *p == inlined_production) @@ -129,8 +144,9 @@ impl InlinedProductionMap { #[cfg(test)] mod tests { use super::*; - use crate::grammars::{ProductionStep, SyntaxVariable, VariableType}; - use crate::rules::Symbol; + use crate::grammars::{LexicalGrammar, ProductionStep, SyntaxVariable, VariableType}; + use crate::rules::{Alias, Associativity, Symbol}; + use std::borrow::Borrow; #[test] fn test_basic_inlining() { @@ -142,7 +158,7 @@ mod tests { variables_to_inline: vec![Symbol::non_terminal(1)], variables: vec![ SyntaxVariable { - name: "var0".to_string(), + name: "non-terminal-0".to_string(), kind: VariableType::Named, productions: vec![Production { dynamic_precedence: 0, @@ -154,7 +170,7 @@ mod tests { }], }, SyntaxVariable { - name: "var1".to_string(), + name: "non-terminal-1".to_string(), kind: VariableType::Named, productions: vec![ Production { @@ -176,34 
+192,32 @@ mod tests { let inline_map = InlinedProductionMap::new(&grammar); // Nothing to inline at step 0. - assert_eq!( - display_items( - inline_map.inlined_items(ParseItem::Normal { - variable_index: 0, - production_index: 0, - step_index: 0 - }), - &grammar, - &inline_map - ), - None - ); + assert!(inline_map + .inlined_items(ParseItem::Normal { + variable_index: 0, + production_index: 0, + step_index: 0 + }) + .is_none()); // Inlining variable 1 yields two productions. assert_eq!( display_items( - inline_map.inlined_items(ParseItem::Normal { - variable_index: 0, - production_index: 0, - step_index: 1 - }), + inline_map + .inlined_items(ParseItem::Normal { + variable_index: 0, + production_index: 0, + step_index: 1 + }) + .unwrap(), &grammar, &inline_map ), - Some(vec![ - "terminal-10 • terminal-12 terminal-13 terminal-11".to_string(), - "terminal-10 • terminal-14 terminal-11".to_string(), - ]) + vec![ + "non-terminal-0 → terminal-10 • terminal-12 terminal-13 terminal-11" + .to_string(), + "non-terminal-0 → terminal-10 • terminal-14 terminal-11".to_string(), + ] ); } @@ -212,23 +226,21 @@ mod tests { let grammar = SyntaxGrammar { variables: vec![ SyntaxVariable { - name: "var0".to_string(), + name: "non-terminal-0".to_string(), kind: VariableType::Named, - productions: vec![ - Production { - dynamic_precedence: 0, - steps: vec![ - ProductionStep::new(Symbol::terminal(10)), - ProductionStep::new(Symbol::non_terminal(1)), // inlined - ProductionStep::new(Symbol::terminal(11)), - ProductionStep::new(Symbol::non_terminal(2)), // inlined - ProductionStep::new(Symbol::terminal(12)), - ], - }, - ], + productions: vec![Production { + dynamic_precedence: 0, + steps: vec![ + ProductionStep::new(Symbol::terminal(10)), + ProductionStep::new(Symbol::non_terminal(1)), // inlined + ProductionStep::new(Symbol::terminal(11)), + ProductionStep::new(Symbol::non_terminal(2)), // inlined + ProductionStep::new(Symbol::terminal(12)), + ], + }], }, SyntaxVariable { - name: 
"var1".to_string(), + name: "non-terminal-1".to_string(), kind: VariableType::Named, productions: vec![ Production { @@ -245,7 +257,7 @@ mod tests { ], }, SyntaxVariable { - name: "var2".to_string(), + name: "non-terminal-2".to_string(), kind: VariableType::Named, productions: vec![Production { dynamic_precedence: 0, @@ -253,7 +265,7 @@ mod tests { }], }, SyntaxVariable { - name: "var3".to_string(), + name: "non-terminal-3".to_string(), kind: VariableType::Named, productions: vec![Production { dynamic_precedence: 0, @@ -274,45 +286,156 @@ mod tests { let inline_map = InlinedProductionMap::new(&grammar); - let items = inline_map.inlined_items(ParseItem::Normal { - variable_index: 0, - production_index: 0, - step_index: 1 - }).unwrap().collect::>(); + let items = inline_map + .inlined_items(ParseItem::Normal { + variable_index: 0, + production_index: 0, + step_index: 1, + }) + .unwrap() + .collect::>(); assert_eq!( - display_items(Some(items.iter().cloned()), &grammar, &inline_map), - Some(vec![ - "terminal-10 • terminal-13 terminal-11 non-terminal-2 terminal-12".to_string(), - "terminal-10 • terminal-16 terminal-14 terminal-11 non-terminal-2 terminal-12".to_string() - ]) + display_items(&items, &grammar, &inline_map), + vec![ + "non-terminal-0 → terminal-10 • terminal-13 terminal-11 non-terminal-2 terminal-12".to_string(), + "non-terminal-0 → terminal-10 • terminal-16 terminal-14 terminal-11 non-terminal-2 terminal-12".to_string() + ] ); let item = items[0].successor().successor(); assert_eq!( - display_items(Some([item].iter().cloned()), &grammar, &inline_map), - Some(vec![ - "terminal-10 terminal-13 terminal-11 • non-terminal-2 terminal-12".to_string(), - ]) + display_items(&[item], &grammar, &inline_map), + vec![ + "non-terminal-0 → terminal-10 terminal-13 terminal-11 • non-terminal-2 terminal-12".to_string(), + ] ); assert_eq!( - display_items(inline_map.inlined_items(item), &grammar, &inline_map), - Some(vec![ - "terminal-10 terminal-13 terminal-11 • 
terminal-15 terminal-12".to_string(), - ]) + display_items(inline_map.inlined_items(item).unwrap(), &grammar, &inline_map), + vec![ + "non-terminal-0 → terminal-10 terminal-13 terminal-11 • terminal-15 terminal-12".to_string(), + ] ); } + #[test] + fn test_inlining_with_precedence_and_alias() { + let grammar = SyntaxGrammar { + variables_to_inline: vec![Symbol::non_terminal(1), Symbol::non_terminal(2)], + variables: vec![ + SyntaxVariable { + name: "non-terminal-0".to_string(), + kind: VariableType::Named, + productions: vec![Production { + dynamic_precedence: 0, + steps: vec![ + ProductionStep::new(Symbol::non_terminal(1)) // inlined + .with_prec(1, Some(Associativity::Left)), + ProductionStep::new(Symbol::terminal(10)), + ProductionStep::new(Symbol::non_terminal(2)), // inlined + ], + }], + }, + SyntaxVariable { + name: "non-terminal-1".to_string(), + kind: VariableType::Named, + productions: vec![Production { + dynamic_precedence: 0, + steps: vec![ + ProductionStep::new(Symbol::terminal(11)) + .with_prec(2, None) + .with_alias("inner_alias", true), + ProductionStep::new(Symbol::terminal(12)).with_prec(3, None), + ], + }], + }, + SyntaxVariable { + name: "non-terminal-2".to_string(), + kind: VariableType::Named, + productions: vec![Production { + dynamic_precedence: 0, + steps: vec![ProductionStep::new(Symbol::terminal(13)) + .with_alias("outer_alias", true)], + }], + }, + ], + expected_conflicts: Vec::new(), + extra_tokens: Vec::new(), + external_tokens: Vec::new(), + word_token: None, + }; + + let inline_map = InlinedProductionMap::new(&grammar); + + let items = inline_map + .inlined_items(ParseItem::Normal { + variable_index: 0, + production_index: 0, + step_index: 0, + }) + .unwrap() + .collect::>(); + assert_eq!( + display_items(&items, &grammar, &inline_map)[0], + "non-terminal-0 → • terminal-11 terminal-12 terminal-10 non-terminal-2".to_string(), + ); + + // The first step in the inlined production retains its precedence and alias. 
+ let item = items[0].successor(); + assert_eq!( + display_items(&[item], &grammar, &inline_map)[0], + "non-terminal-0 → terminal-11 • terminal-12 terminal-10 non-terminal-2".to_string(), + ); + assert_eq!(item.precedence(&grammar, &inline_map), 2); + assert_eq!( + items[0].step(&grammar, &inline_map).unwrap().alias, + Some(Alias { + value: "inner_alias".to_string(), + is_named: true, + }) + ); + + // The final terminal of the inlined production inherits the precedence of + // the inlined step. + let item = item.successor(); + assert_eq!( + display_items(&[item], &grammar, &inline_map)[0], + "non-terminal-0 → terminal-11 terminal-12 • terminal-10 non-terminal-2".to_string(), + ); + assert_eq!(item.precedence(&grammar, &inline_map), 1); + + let item = item.successor(); + assert_eq!( + display_items(&[item], &grammar, &inline_map)[0], + "non-terminal-0 → terminal-11 terminal-12 terminal-10 • non-terminal-2".to_string(), + ); + + // All steps of the inlined production inherit their alias from the + // inlined step. 
+ let items = inline_map.inlined_items(item).unwrap().collect::>(); + assert_eq!( + display_items(&items, &grammar, &inline_map)[0], + "non-terminal-0 → terminal-11 terminal-12 terminal-10 • terminal-13".to_string(), + ); + assert_eq!( + items[0].step(&grammar, &inline_map).unwrap().alias, + Some(Alias { + value: "outer_alias".to_string(), + is_named: true, + }) + ) + } + fn display_items( - items: Option>, + items: impl IntoIterator>, grammar: &SyntaxGrammar, inline_map: &InlinedProductionMap, - ) -> Option> { - items.map(|items| { - items - .map(|item| format!("{}", item.with(grammar, inline_map))) - .collect() - }) + ) -> Vec { + let lex = LexicalGrammar::default(); + items + .into_iter() + .map(|item| format!("{}", item.borrow().display_with(grammar, &lex, inline_map))) + .collect() } } From 5eb88069597ed72d9dd6b4f5b2ed5d772463a853 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 20 Dec 2018 13:36:21 -0800 Subject: [PATCH 072/208] Handle repetition ranges in regexes --- src/prepare_grammar/expand_tokens.rs | 114 ++++++++++++++++++++------- 1 file changed, 86 insertions(+), 28 deletions(-) diff --git a/src/prepare_grammar/expand_tokens.rs b/src/prepare_grammar/expand_tokens.rs index 7a1d2f4d..37f75e5a 100644 --- a/src/prepare_grammar/expand_tokens.rs +++ b/src/prepare_grammar/expand_tokens.rs @@ -3,7 +3,9 @@ use crate::error::{Error, Result}; use crate::grammars::{LexicalGrammar, LexicalVariable}; use crate::nfa::{CharacterSet, Nfa, NfaState}; use crate::rules::Rule; -use regex_syntax::ast::{parse, Ast, Class, ClassPerlKind, ClassSet, ClassSetItem, RepetitionKind}; +use regex_syntax::ast::{ + parse, Ast, Class, ClassPerlKind, ClassSet, ClassSetItem, RepetitionKind, RepetitionRange, +}; pub(super) fn expand_tokens(mut grammar: ExtractedLexicalGrammar) -> Result { let mut nfa = Nfa::new(); @@ -24,7 +26,10 @@ pub(super) fn expand_tokens(mut grammar: ExtractedLexicalGrammar) -> Result Error::RegexError(format!("Rule {} {}", variable.name, msg)), + _ => e, + 
})?; if !is_immediate_token { let last_state_id = nfa.last_state_id(); @@ -95,11 +100,62 @@ fn expand_rule(rule: &Rule, nfa: &mut Nfa, mut next_state_id: u32, is_sep: bool) Ok(false) } } + Rule::Metadata { rule, .. } => { + // TODO - implement precedence + expand_rule(rule, nfa, next_state_id, is_sep) + } Rule::Blank => Ok(false), _ => Err(Error::grammar(&format!("Unexpected rule {:?}", rule))), } } +fn expand_one_or_more(ast: &Ast, nfa: &mut Nfa, next_state_id: u32, is_sep: bool) -> Result { + nfa.states.push(NfaState::Accept(0)); // Placeholder for split + let split_state_id = nfa.last_state_id(); + if expand_regex(&ast, nfa, split_state_id, is_sep)? { + nfa.states[split_state_id as usize] = NfaState::Split(nfa.last_state_id(), next_state_id); + Ok(true) + } else { + nfa.states.pop(); + Ok(false) + } +} + +fn expand_zero_or_one(ast: &Ast, nfa: &mut Nfa, next_state_id: u32, is_sep: bool) -> Result { + if expand_regex(ast, nfa, next_state_id, is_sep)? { + nfa.prepend(|last_state_id| NfaState::Split(next_state_id, last_state_id)); + Ok(true) + } else { + Ok(false) + } +} + +fn expand_zero_or_more(ast: &Ast, nfa: &mut Nfa, next_state_id: u32, is_sep: bool) -> Result { + if expand_one_or_more(&ast, nfa, next_state_id, is_sep)? { + nfa.prepend(|last_state_id| NfaState::Split(last_state_id, next_state_id)); + Ok(true) + } else { + Ok(false) + } +} + +fn expand_count( + ast: &Ast, + count: u32, + nfa: &mut Nfa, + mut next_state_id: u32, + is_sep: bool, +) -> Result { + let mut result = false; + for _ in 0..count { + if expand_regex(ast, nfa, next_state_id, is_sep)? 
{ + result = true; + next_state_id = nfa.last_state_id(); + } + } + Ok(result) +} + fn expand_regex(ast: &Ast, nfa: &mut Nfa, mut next_state_id: u32, is_sep: bool) -> Result { match ast { Ast::Empty(_) => Ok(false), @@ -148,38 +204,36 @@ fn expand_regex(ast: &Ast, nfa: &mut Nfa, mut next_state_id: u32, is_sep: bool) }, Ast::Repetition(repetition) => match repetition.op.kind { RepetitionKind::ZeroOrOne => { - if expand_regex(&repetition.ast, nfa, next_state_id, is_sep)? { - nfa.prepend(|last_state_id| NfaState::Split(next_state_id, last_state_id)); - Ok(true) - } else { - Ok(false) - } + expand_zero_or_one(&repetition.ast, nfa, next_state_id, is_sep) } RepetitionKind::OneOrMore => { - nfa.states.push(NfaState::Accept(0)); // Placeholder for split - let split_state_id = nfa.last_state_id(); - if expand_regex(&repetition.ast, nfa, split_state_id, is_sep)? { - nfa.states[split_state_id as usize] = - NfaState::Split(nfa.last_state_id(), next_state_id); - Ok(true) - } else { - nfa.states.pop(); - Ok(false) - } + expand_one_or_more(&repetition.ast, nfa, next_state_id, is_sep) } RepetitionKind::ZeroOrMore => { - nfa.states.push(NfaState::Accept(0)); // Placeholder for split - let split_state_id = nfa.last_state_id(); - if expand_regex(&repetition.ast, nfa, split_state_id, is_sep)? { - nfa.states[split_state_id as usize] = - NfaState::Split(nfa.last_state_id(), next_state_id); - nfa.prepend(|last_state_id| NfaState::Split(last_state_id, next_state_id)); - Ok(true) + expand_zero_or_more(&repetition.ast, nfa, next_state_id, is_sep) + } + RepetitionKind::Range(RepetitionRange::Exactly(count)) => { + expand_count(&repetition.ast, count, nfa, next_state_id, is_sep) + } + RepetitionKind::Range(RepetitionRange::AtLeast(min)) => { + if expand_zero_or_more(&repetition.ast, nfa, next_state_id, is_sep)? 
{ + expand_count(ast, min, nfa, next_state_id, is_sep) } else { Ok(false) } } - RepetitionKind::Range(_) => unimplemented!(), + RepetitionKind::Range(RepetitionRange::Bounded(min, max)) => { + let mut result = expand_count(&repetition.ast, min, nfa, next_state_id, is_sep)?; + for _ in min..max { + if result { + next_state_id = nfa.last_state_id(); + } + if expand_zero_or_one(&repetition.ast, nfa, next_state_id, is_sep)? { + result = true; + } + } + Ok(result) + } }, Ast::Group(group) => expand_regex(&group.ast, nfa, nfa.last_state_id(), is_sep), Ast::Alternation(alternation) => { @@ -202,8 +256,8 @@ fn expand_regex(ast: &Ast, nfa: &mut Nfa, mut next_state_id: u32, is_sep: bool) for ast in concat.asts.iter().rev() { if expand_regex(&ast, nfa, next_state_id, is_sep)? { result = true; + next_state_id = nfa.last_state_id(); } - next_state_id = nfa.last_state_id(); } Ok(result) } @@ -224,7 +278,11 @@ fn expand_character_class(item: &ClassSetItem) -> Result { } Ok(result) } - _ => Err(Error::regex("Unsupported character class syntax")), + ClassSetItem::Perl(class) => Ok(expand_perl_character_class(&class.kind)), + _ => Err(Error::regex(&format!( + "Unsupported character class syntax {:?}", + item + ))), } } From a3dcfa0a52b74fc56a53aef270bd9f4a474732e8 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 20 Dec 2018 13:36:39 -0800 Subject: [PATCH 073/208] Implement more of parse table generation --- src/build_tables/item.rs | 260 +++++++- src/build_tables/item_set_builder.rs | 34 +- src/build_tables/mod.rs | 596 +++++++++++++++++- src/error.rs | 1 + src/generate.rs | 4 +- src/grammars.rs | 16 +- src/js/dsl.js | 334 ++++++++++ src/main.rs | 65 +- src/nfa.rs | 6 + src/prepare_grammar/extract_simple_aliases.rs | 2 + src/prepare_grammar/extract_tokens.rs | 7 +- src/render/mod.rs | 206 +++++- src/rules.rs | 23 +- src/tables.rs | 68 +- 14 files changed, 1515 insertions(+), 107 deletions(-) create mode 100644 src/js/dsl.js diff --git a/src/build_tables/item.rs 
b/src/build_tables/item.rs index c99815eb..9208f602 100644 --- a/src/build_tables/item.rs +++ b/src/build_tables/item.rs @@ -1,10 +1,10 @@ use super::inline_variables::InlinedProductionMap; -use crate::grammars::{Production, ProductionStep, SyntaxGrammar}; -use crate::rules::{Symbol, SymbolType}; +use crate::grammars::{LexicalGrammar, Production, ProductionStep, SyntaxGrammar}; +use crate::rules::{Associativity, Symbol, SymbolType}; use smallbitvec::SmallBitVec; -use std::collections::HashMap; -use std::hash::{Hash, Hasher}; +use std::collections::{HashMap, BTreeMap}; use std::fmt; +use std::hash::{Hash, Hasher}; lazy_static! { static ref START_PRODUCTION: Production = Production { @@ -28,7 +28,7 @@ pub(crate) struct LookaheadSet { eof: bool, } -#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)] pub(crate) enum ParseItem { Start { step_index: u32, @@ -47,10 +47,29 @@ pub(crate) enum ParseItem { #[derive(Clone, Debug, PartialEq, Eq)] pub(crate) struct ParseItemSet { - pub entries: HashMap, + pub entries: BTreeMap, } -pub(crate) struct ParseItemDisplay<'a>(&'a ParseItem, &'a SyntaxGrammar, &'a InlinedProductionMap); +pub(crate) struct ParseItemDisplay<'a>( + &'a ParseItem, + &'a SyntaxGrammar, + &'a LexicalGrammar, + &'a InlinedProductionMap, +); + +pub(crate) struct LookaheadSetDisplay<'a>(&'a LookaheadSet, &'a SyntaxGrammar, &'a LexicalGrammar); + +pub(crate) struct ParseItemSetDisplay<'a>( + &'a ParseItemSet, + &'a SyntaxGrammar, + &'a LexicalGrammar, + &'a InlinedProductionMap, +); + +struct ParseItemSetMapEntry(ParseItemSet, u64); +pub(crate) struct ParseItemSetMap { + map: HashMap +} impl LookaheadSet { pub fn new() -> Self { @@ -61,12 +80,61 @@ impl LookaheadSet { } } - pub fn insert(&mut self, other: Symbol) { - match other.kind { - SymbolType::NonTerminal => panic!("Cannot store non-terminals in a LookaheadSet"), - SymbolType::Terminal => self.terminal_bits.set(other.index, true), - 
SymbolType::External => self.external_bits.set(other.index, true), + pub fn iter<'a>(&'a self) -> impl Iterator + 'a { + self.terminal_bits + .iter() + .enumerate() + .filter_map(|(i, value)| { + if value { + Some(Symbol::terminal(i)) + } else { + None + } + }) + .chain( + self.external_bits + .iter() + .enumerate() + .filter_map(|(i, value)| { + if value { + Some(Symbol::external(i)) + } else { + None + } + }), + ) + .chain(if self.eof { Some(Symbol::end()) } else { None }) + } + + pub fn with<'a>(symbols: impl IntoIterator) -> Self { + let mut result = Self::new(); + for symbol in symbols { + result.insert(*symbol); } + result + } + + pub fn contains(&self, symbol: &Symbol) -> bool { + match symbol.kind { + SymbolType::NonTerminal => panic!("Cannot store non-terminals in a LookaheadSet"), + SymbolType::Terminal => self.terminal_bits.get(symbol.index).unwrap_or(false), + SymbolType::External => self.external_bits.get(symbol.index).unwrap_or(false), + SymbolType::End => self.eof, + } + } + + pub fn insert(&mut self, other: Symbol) { + let vec = match other.kind { + SymbolType::NonTerminal => panic!("Cannot store non-terminals in a LookaheadSet"), + SymbolType::Terminal => &mut self.terminal_bits, + SymbolType::External => &mut self.external_bits, + SymbolType::End => { + self.eof = true; + return; + } + }; + vec.resize(other.index + 1, false); + vec.set(other.index, true); } pub fn insert_all(&mut self, other: &LookaheadSet) -> bool { @@ -95,6 +163,14 @@ impl LookaheadSet { } result } + + pub fn display_with<'a>( + &'a self, + syntax_grammar: &'a SyntaxGrammar, + lexical_grammar: &'a LexicalGrammar, + ) -> LookaheadSetDisplay<'a> { + LookaheadSetDisplay(self, syntax_grammar, lexical_grammar) + } } impl ParseItem { @@ -126,18 +202,53 @@ impl ParseItem { &grammar.variables[*variable_index as usize].productions[*production_index as usize] } ParseItem::Inlined { - production_index, - .. + production_index, .. 
} => &inlined_productions.inlined_productions[*production_index as usize], } } + pub fn symbol( + &self, + grammar: &SyntaxGrammar, + inlined_productions: &InlinedProductionMap, + ) -> Option { + self.step(grammar, inlined_productions).map(|s| s.symbol) + } + pub fn step<'a>( &self, grammar: &'a SyntaxGrammar, inlined_productions: &'a InlinedProductionMap, ) -> Option<&'a ProductionStep> { - self.production(grammar, inlined_productions).steps.get(self.step_index()) + self.production(grammar, inlined_productions) + .steps + .get(self.step_index()) + } + + pub fn precedence<'a>( + &self, + grammar: &'a SyntaxGrammar, + inlines: &'a InlinedProductionMap, + ) -> i32 { + self.production(grammar, inlines) + .steps + .get(self.step_index() - 1) + .map(|s| s.precedence) + .unwrap_or(0) + } + + pub fn associativity<'a>( + &self, + grammar: &'a SyntaxGrammar, + inlines: &'a InlinedProductionMap, + ) -> Option { + let production = self.production(grammar, inlines); + let step_index = self.step_index(); + if step_index == production.steps.len() { + production.steps.last().and_then(|s| s.associativity) + } else { + None + } } pub fn variable_index(&self) -> u32 { @@ -156,6 +267,14 @@ impl ParseItem { } } + pub fn is_final(&self) -> bool { + if let ParseItem::Start { step_index: 1 } = self { + true + } else { + false + } + } + fn step_index_mut(&mut self) -> &mut u32 { match self { ParseItem::Start { step_index } @@ -164,8 +283,13 @@ impl ParseItem { } } - pub fn with<'a>(&'a self, grammar: &'a SyntaxGrammar, inlines: &'a InlinedProductionMap) -> ParseItemDisplay<'a> { - ParseItemDisplay(self, grammar, inlines) + pub fn display_with<'a>( + &'a self, + syntax_grammar: &'a SyntaxGrammar, + lexical_grammar: &'a LexicalGrammar, + inlines: &'a InlinedProductionMap, + ) -> ParseItemDisplay<'a> { + ParseItemDisplay(self, syntax_grammar, lexical_grammar, inlines) } pub fn successor(&self) -> ParseItem { @@ -176,33 +300,107 @@ impl ParseItem { } impl ParseItemSet { - pub fn new() -> Self 
{ - Self { entries: HashMap::new() } + pub fn with<'a>(elements: impl IntoIterator) -> Self { + let mut result = Self::default(); + for (item, lookaheads) in elements { + result.entries.insert(*item, lookaheads.clone()); + } + result + } + + pub fn display_with<'a>( + &'a self, + syntax_grammar: &'a SyntaxGrammar, + lexical_grammar: &'a LexicalGrammar, + inlines: &'a InlinedProductionMap, + ) -> ParseItemSetDisplay<'a> { + ParseItemSetDisplay(self, syntax_grammar, lexical_grammar, inlines) + } +} + +impl Default for ParseItemSet { + fn default() -> Self { + Self { + entries: BTreeMap::new(), + } } } impl<'a> fmt::Display for ParseItemDisplay<'a> { fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> { + if let ParseItem::Start { .. } = &self.0 { + write!(f, "START →")?; + } else { + write!( + f, + "{} →", + &self.1.variables[self.0.variable_index() as usize].name + )?; + } + let step_index = self.0.step_index(); - let production = self.0.production(self.1, self.2); + let production = self.0.production(self.1, self.3); for (i, step) in production.steps.iter().enumerate() { - if i > 0 { - write!(f, " ")?; - } - if i == step_index { - write!(f, "• ")?; + write!(f, " •")?; } - let name = if step.symbol.is_terminal() { - "terminal" + write!(f, " ")?; + if step.symbol.is_terminal() { + if let Some(variable) = self.2.variables.get(step.symbol.index) { + write!(f, "{}", &variable.name)?; + } else { + write!(f, "{}-{}", "terminal", step.symbol.index)?; + } } else if step.symbol.is_external() { - "external" + write!(f, "{}", &self.1.external_tokens[step.symbol.index].name)?; } else { - "non-terminal" - }; + write!(f, "{}", &self.1.variables[step.symbol.index].name)?; + } + } - write!(f, "{}-{}", name, step.symbol.index)?; + if production.steps.len() == step_index { + write!(f, " •")?; + } + + Ok(()) + } +} + +impl<'a> fmt::Display for LookaheadSetDisplay<'a> { + fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> { + write!(f, "[")?; + for (i, symbol) 
in self.0.iter().enumerate() { + if i > 0 { + write!(f, ", ")?; + } + + if symbol.is_terminal() { + if let Some(variable) = self.2.variables.get(symbol.index) { + write!(f, "{}", &variable.name)?; + } else { + write!(f, "{}-{}", "terminal", symbol.index)?; + } + } else if symbol.is_external() { + write!(f, "{}", &self.1.external_tokens[symbol.index].name)?; + } else { + write!(f, "{}", &self.1.variables[symbol.index].name)?; + } + } + write!(f, "]")?; + Ok(()) + } +} + +impl<'a> fmt::Display for ParseItemSetDisplay<'a> { + fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> { + for (item, lookaheads) in self.0.entries.iter() { + writeln!( + f, + "{}\t{}", + item.display_with(self.1, self.2, self.3), + lookaheads.display_with(self.1, self.2) + )?; } Ok(()) } diff --git a/src/build_tables/item_set_builder.rs b/src/build_tables/item_set_builder.rs index 61d45ded..530c1f25 100644 --- a/src/build_tables/item_set_builder.rs +++ b/src/build_tables/item_set_builder.rs @@ -20,7 +20,7 @@ pub(crate) struct ParseItemSetBuilder { first_sets: HashMap, last_sets: HashMap, transitive_closure_additions: Vec>, - inlined_production_map: InlinedProductionMap, + pub inlines: InlinedProductionMap, } fn find_or_push(vector: &mut Vec, value: T) { @@ -35,7 +35,7 @@ impl ParseItemSetBuilder { first_sets: HashMap::new(), last_sets: HashMap::new(), transitive_closure_additions: vec![Vec::new(); syntax_grammar.variables.len()], - inlined_production_map: InlinedProductionMap::new(syntax_grammar), + inlines: InlinedProductionMap::new(syntax_grammar), }; // For each grammar symbol, populate the FIRST and LAST sets: the set of @@ -192,6 +192,10 @@ impl ParseItemSetBuilder { let additions_for_non_terminal = &mut result.transitive_closure_additions[i]; for (variable_index, follow_set_info) in follow_set_info_by_non_terminal { let variable = &syntax_grammar.variables[variable_index]; + let non_terminal = Symbol::non_terminal(variable_index); + if 
syntax_grammar.variables_to_inline.contains(&non_terminal) { + continue; + } for production_index in 0..variable.productions.len() { let item = ParseItem::Normal { variable_index: variable_index as u32, @@ -199,7 +203,7 @@ impl ParseItemSetBuilder { step_index: 0, }; - if let Some(inlined_items) = result.inlined_production_map.inlined_items(item) { + if let Some(inlined_items) = result.inlines.inlined_items(item) { for inlined_item in inlined_items { find_or_push( additions_for_non_terminal, @@ -227,32 +231,36 @@ impl ParseItemSetBuilder { pub(crate) fn transitive_closure( &mut self, - item_set: ParseItemSet, + item_set: &ParseItemSet, grammar: &SyntaxGrammar, ) -> ParseItemSet { - let mut result = ParseItemSet::new(); - for (item, lookaheads) in item_set.entries { - if let Some(items) = self.inlined_production_map.inlined_items(item) { + let mut result = ParseItemSet::default(); + for (item, lookaheads) in &item_set.entries { + if let Some(items) = self.inlines.inlined_items(*item) { for item in items { - self.add_item(&mut result, item, lookaheads.clone(), grammar); + self.add_item(&mut result, item, lookaheads, grammar); } } else { - self.add_item(&mut result, item, lookaheads, grammar); + self.add_item(&mut result, *item, lookaheads, grammar); } } result } + pub fn first_set(&self, symbol: &Symbol) -> &LookaheadSet { + &self.first_sets[symbol] + } + fn add_item( &self, set: &mut ParseItemSet, item: ParseItem, - lookaheads: LookaheadSet, + lookaheads: &LookaheadSet, grammar: &SyntaxGrammar, ) { - if let Some(step) = item.step(grammar, &self.inlined_production_map) { + if let Some(step) = item.step(grammar, &self.inlines) { if step.symbol.is_non_terminal() { - let next_step = item.successor().step(grammar, &self.inlined_production_map); + let next_step = item.successor().step(grammar, &self.inlines); // Determine which tokens can follow this non-terminal. 
let following_tokens = if let Some(next_step) = next_step { @@ -274,6 +282,6 @@ impl ParseItemSetBuilder { } } } - set.entries.insert(item, lookaheads); + set.entries.insert(item, lookaheads.clone()); } } diff --git a/src/build_tables/mod.rs b/src/build_tables/mod.rs index 01d9219d..091c5486 100644 --- a/src/build_tables/mod.rs +++ b/src/build_tables/mod.rs @@ -1,37 +1,611 @@ -mod item; mod inline_variables; mod item; mod item_set_builder; -use std::collections::{HashMap, VecDeque}; -use crate::grammars::{SyntaxGrammar, LexicalGrammar}; -use crate::tables::{ParseTable, LexTable, ParseStateId}; -use crate::rules::{AliasMap, Symbol}; -use crate::error::Result; -use self::item::ParseItemSet; +use self::item::{LookaheadSet, ParseItem, ParseItemSet}; +use self::item_set_builder::ParseItemSetBuilder; +use crate::error::{Error, Result}; +use crate::grammars::{LexicalGrammar, SyntaxGrammar, VariableType}; +use crate::rules::{AliasMap, Associativity, Symbol, SymbolType}; +use crate::tables::ParseTableEntry; +use crate::tables::{AliasSequenceId, LexTable, ParseAction, ParseState, ParseStateId, ParseTable}; +use core::ops::Range; +use std::collections::hash_map::Entry; +use std::collections::{HashMap, HashSet, VecDeque}; +use std::fmt::Write; + +#[derive(Clone)] +struct AuxiliarySymbolInfo { + auxiliary_symbol: Symbol, + parent_symbols: Vec, +} type SymbolSequence = Vec; +type AuxiliarySymbolSequence = Vec; struct ParseStateQueueEntry { preceding_symbols: SymbolSequence, - item_set: ParseItemSet, + preceding_auxiliary_symbols: AuxiliarySymbolSequence, state_id: ParseStateId, } struct ParseTableBuilder<'a> { + item_set_builder: ParseItemSetBuilder, syntax_grammar: &'a SyntaxGrammar, lexical_grammar: &'a LexicalGrammar, simple_aliases: &'a AliasMap, state_ids_by_item_set: HashMap, - item_sets_by_state_id: Vec<&'a ParseItemSet>, + item_sets_by_state_id: Vec, parse_state_queue: VecDeque, parse_table: ParseTable, } +impl<'a> ParseTableBuilder<'a> { + fn build(mut self) -> 
Result<(ParseTable, LexTable, LexTable, Option)> { + // Ensure that the empty rename sequence has index 0. + self.parse_table.alias_sequences.push(Vec::new()); + + // Ensure that the error state has index 0. + let error_state_id = self.add_parse_state( + &Vec::new(), + &Vec::new(), + ParseItemSet::default(), + ); + + self.add_parse_state( + &Vec::new(), + &Vec::new(), + ParseItemSet::with(&[(ParseItem::start(), LookaheadSet::with(&[Symbol::end()]))]), + ); + + self.process_part_state_queue()?; + self.populate_used_symbols(); + + Err(Error::grammar("oh no")) + } + + fn add_parse_state( + &mut self, + preceding_symbols: &SymbolSequence, + preceding_auxiliary_symbols: &AuxiliarySymbolSequence, + item_set: ParseItemSet, + ) -> ParseStateId { + match self.state_ids_by_item_set.entry(item_set) { + Entry::Occupied(o) => { + // eprintln!("Item set already processed at state {}", *o.get()); + *o.get() + } + Entry::Vacant(v) => { + // eprintln!("Item set not yet processed"); + let state_id = self.parse_table.states.len(); + self.item_sets_by_state_id.push(v.key().clone()); + self.parse_table.states.push(ParseState { + terminal_entries: HashMap::new(), + nonterminal_entries: HashMap::new(), + }); + self.parse_state_queue.push_back(ParseStateQueueEntry { + state_id, + preceding_symbols: preceding_symbols.clone(), + preceding_auxiliary_symbols: preceding_auxiliary_symbols.clone(), + }); + v.insert(state_id); + state_id + } + } + } + + fn process_part_state_queue(&mut self) -> Result<()> { + while let Some(entry) = self.parse_state_queue.pop_front() { + println!( + "ITEM SET {}:\n{}", + entry.state_id, + self.item_sets_by_state_id[entry.state_id].display_with( + &self.syntax_grammar, + &self.lexical_grammar, + &self.item_set_builder.inlines + ) + ); + + let item_set = self.item_set_builder.transitive_closure( + &self.item_sets_by_state_id[entry.state_id], + self.syntax_grammar, + ); + + // println!("TRANSITIVE CLOSURE:"); + // for item in item_set.entries.keys() { + // 
println!("{}", item.display_with(&self.syntax_grammar, &self.lexical_grammar, &self.item_set_builder.inlines)); + // } + // println!(""); + + self.add_actions( + entry.preceding_symbols, + entry.preceding_auxiliary_symbols, + item_set, + entry.state_id, + )?; + } + Ok(()) + } + + fn add_actions( + &mut self, + mut preceding_symbols: SymbolSequence, + mut preceding_auxiliary_symbols: Vec, + item_set: ParseItemSet, + state_id: ParseStateId, + ) -> Result<()> { + let mut terminal_successors = HashMap::new(); + let mut non_terminal_successors = HashMap::new(); + let mut lookaheads_with_conflicts = HashSet::new(); + + for (item, lookaheads) in &item_set.entries { + if let Some(next_symbol) = + item.symbol(self.syntax_grammar, &self.item_set_builder.inlines) + { + let successor = item.successor(); + if next_symbol.is_non_terminal() { + // Keep track of where auxiliary non-terminals (repeat symbols) are + // used within visible symbols. This information may be needed later + // for conflict resolution. 
+ if self.syntax_grammar.variables[next_symbol.index].is_auxiliary() { + preceding_auxiliary_symbols + .push(self.get_auxiliary_node_info(&item_set, next_symbol)); + } + + non_terminal_successors + .entry(next_symbol) + .or_insert_with(|| ParseItemSet::default()) + .entries + .entry(successor) + .or_insert_with(|| LookaheadSet::new()) + .insert_all(lookaheads); + } else { + terminal_successors + .entry(next_symbol) + .or_insert_with(|| ParseItemSet::default()) + .entries + .entry(successor) + .or_insert_with(|| LookaheadSet::new()) + .insert_all(lookaheads); + } + } else { + let action = if item.is_final() { + ParseAction::Accept + } else { + let production = + item.production(&self.syntax_grammar, &self.item_set_builder.inlines); + ParseAction::Reduce { + symbol: Symbol::non_terminal(item.variable_index() as usize), + child_count: item.step_index(), + precedence: production.last_precedence(), + associativity: production.last_associativity(), + dynamic_precedence: production.dynamic_precedence, + alias_sequence_id: self.get_alias_sequence_id(item), + } + }; + + for lookahead in lookaheads.iter() { + let entry = self.parse_table.states[state_id] + .terminal_entries + .entry(lookahead); + let entry = entry.or_insert_with(|| ParseTableEntry::new()); + if entry.actions.is_empty() { + entry.actions.push(action); + } else if action.precedence() > entry.actions[0].precedence() { + entry.actions.clear(); + entry.actions.push(action); + lookaheads_with_conflicts.remove(&lookahead); + } else if action.precedence() == entry.actions[0].precedence() { + entry.actions.push(action); + lookaheads_with_conflicts.insert(lookahead); + } + } + } + } + + for (symbol, next_item_set) in terminal_successors { + preceding_symbols.push(symbol); + let next_state_id = self.add_parse_state( + &preceding_symbols, + &preceding_auxiliary_symbols, + next_item_set, + ); + preceding_symbols.pop(); + + let entry = self.parse_table.states[state_id] + .terminal_entries + .entry(symbol); + if let 
Entry::Occupied(e) = &entry { + if !e.get().actions.is_empty() { + lookaheads_with_conflicts.insert(symbol); + } + } + + entry + .or_insert_with(|| ParseTableEntry::new()) + .actions + .push(ParseAction::Shift { + state: next_state_id, + is_repetition: false, + }); + } + + for (symbol, next_item_set) in non_terminal_successors { + preceding_symbols.push(symbol); + let next_state_id = self.add_parse_state( + &preceding_symbols, + &preceding_auxiliary_symbols, + next_item_set, + ); + preceding_symbols.pop(); + self.parse_table.states[state_id] + .nonterminal_entries + .insert(symbol, next_state_id); + } + + for symbol in lookaheads_with_conflicts { + self.handle_conflict( + &item_set, + state_id, + &preceding_symbols, + &preceding_auxiliary_symbols, + symbol, + )?; + } + + Ok(()) + } + + fn handle_conflict( + &mut self, + item_set: &ParseItemSet, + state_id: ParseStateId, + preceding_symbols: &SymbolSequence, + preceding_auxiliary_symbols: &Vec, + conflicting_lookahead: Symbol, + ) -> Result<()> { + let entry = self.parse_table.states[state_id] + .terminal_entries + .get_mut(&conflicting_lookahead) + .unwrap(); + + // Determine which items in the set conflict with each other, and the + // precedences associated with SHIFT vs REDUCE actions. There won't + // be multiple REDUCE actions with different precedences; that is + // sorted out ahead of time in `add_actions`. But there can still be + // REDUCE-REDUCE conflicts where all actions have the *same* + // precedence, and there can still be SHIFT/REDUCE conflicts. 
+ let reduce_precedence = entry.actions[0].precedence(); + let mut considered_associativity = false; + let mut shift_precedence: Option> = None; + let mut conflicting_items = HashSet::new(); + for (item, lookaheads) in &item_set.entries { + let production = item.production(&self.syntax_grammar, &self.item_set_builder.inlines); + let step_index = item.step_index(); + if let Some(step) = production.steps.get(step_index) { + if step_index > 0 { + if self + .item_set_builder + .first_set(&step.symbol) + .contains(&conflicting_lookahead) + { + conflicting_items.insert(item); + let precedence = production.steps[step_index - 1].precedence; + if let Some(range) = &mut shift_precedence { + if precedence < range.start { + range.start = precedence; + } else if precedence > range.end { + range.end = precedence; + } + } else { + shift_precedence = Some(precedence..precedence); + } + } + } + } else if lookaheads.contains(&conflicting_lookahead) { + conflicting_items.insert(item); + } + } + + if let ParseAction::Shift { is_repetition, .. } = entry.actions.last_mut().unwrap() { + let shift_precedence = shift_precedence.unwrap_or(0..0); + + // If all of the items in the conflict have the same parent symbol, + // and that parent symbols is auxiliary, then this is just the intentional + // ambiguity associated with a repeat rule. Resolve that class of ambiguity + // by leaving it in the parse table, but marking the SHIFT action with + // an `is_repetition` flag. + let conflicting_variable_index = + conflicting_items.iter().next().unwrap().variable_index(); + if self.syntax_grammar.variables[conflicting_variable_index as usize].is_auxiliary() { + if conflicting_items + .iter() + .all(|item| item.variable_index() == conflicting_variable_index) + { + *is_repetition = true; + return Ok(()); + } + } + + // If the SHIFT action has higher precedence, remove all the REDUCE actions. 
+ if shift_precedence.start > reduce_precedence + || (shift_precedence.start == reduce_precedence + && shift_precedence.end > reduce_precedence) + { + entry.actions.drain(0..entry.actions.len() - 1); + } + // If the REDUCE actions have higher precedence, remove the SHIFT action. + else if shift_precedence.end < reduce_precedence + || (shift_precedence.end == reduce_precedence + && shift_precedence.start < reduce_precedence) + { + entry.actions.pop(); + conflicting_items.retain(|item| { + item.step(&self.syntax_grammar, &self.item_set_builder.inlines) + .is_none() + }); + } + // If the SHIFT and REDUCE actions have the same predence, consider + // the REDUCE actions' associativity. + else if shift_precedence == (reduce_precedence..reduce_precedence) { + considered_associativity = true; + let mut has_left = false; + let mut has_right = false; + let mut has_non = false; + for action in &entry.actions { + if let ParseAction::Reduce { associativity, .. } = action { + match associativity { + Some(Associativity::Left) => has_left = true, + Some(Associativity::Right) => has_right = true, + None => has_non = true, + } + } + } + + // If all reduce actions are left associative, remove the SHIFT action. + // If all reduce actions are right associative, remove the REDUCE actions. + match (has_left, has_non, has_right) { + (true, false, false) => { + entry.actions.pop(); + conflicting_items.retain(|item| { + item.step(&self.syntax_grammar, &self.item_set_builder.inlines) + .is_none() + }); + } + (false, false, true) => { + entry.actions.drain(0..entry.actions.len() - 1); + } + _ => {} + } + } + } + + // If all of the actions but one have been eliminated, then there's no problem. + let entry = self.parse_table.states[state_id] + .terminal_entries + .get_mut(&conflicting_lookahead) + .unwrap(); + if entry.actions.len() == 1 { + return Ok(()); + } + + // Determine the set of parent symbols involved in this conflict. 
+ let mut actual_conflict = Vec::new(); + for item in &conflicting_items { + let symbol = Symbol::non_terminal(item.variable_index() as usize); + if self.syntax_grammar.variables[symbol.index].is_auxiliary() { + actual_conflict.extend( + preceding_auxiliary_symbols + .iter() + .rev() + .find_map(|info| { + if info.auxiliary_symbol == symbol { + Some(&info.parent_symbols) + } else { + None + } + }) + .unwrap() + .iter(), + ); + } else { + actual_conflict.push(symbol); + } + } + actual_conflict.sort_unstable(); + actual_conflict.dedup(); + + // If this set of symbols has been whitelisted, then there's no error. + if self + .syntax_grammar + .expected_conflicts + .contains(&actual_conflict) + { + return Ok(()); + } + + let mut msg = "Unresolved conflict for symbol sequence:\n\n".to_string(); + for symbol in preceding_symbols { + write!(&mut msg, " {}", self.symbol_name(symbol)).unwrap(); + } + + write!( + &mut msg, + " • {} …\n\n", + self.symbol_name(&conflicting_lookahead) + ) + .unwrap(); + write!(&mut msg, "Possible interpretations:\n").unwrap(); + for (i, item) in conflicting_items.iter().enumerate() { + write!(&mut msg, "\n {}:", i).unwrap(); + + for preceding_symbol in preceding_symbols + .iter() + .take(preceding_symbols.len() - item.step_index()) + { + write!(&mut msg, " {}", self.symbol_name(preceding_symbol)).unwrap(); + } + + write!( + &mut msg, + " ({}", + &self.syntax_grammar.variables[item.variable_index() as usize].name + ) + .unwrap(); + + for (j, step) in item + .production(&self.syntax_grammar, &self.item_set_builder.inlines) + .steps + .iter() + .enumerate() + { + if j == item.step_index() { + write!(&mut msg, " •").unwrap(); + } + write!(&mut msg, " {}", self.symbol_name(&step.symbol)).unwrap(); + } + + write!(&mut msg, ")").unwrap(); + + if item + .step(&self.syntax_grammar, &self.item_set_builder.inlines) + .is_none() + { + write!( + &mut msg, + " • {}", + self.symbol_name(&conflicting_lookahead) + ) + .unwrap(); + } + + let precedence = 
item.precedence(&self.syntax_grammar, &self.item_set_builder.inlines); + let associativity = + item.associativity(&self.syntax_grammar, &self.item_set_builder.inlines); + if precedence != 0 || associativity.is_some() { + write!( + &mut msg, + "(precedence: {}, associativity: {:?})", + precedence, associativity + ) + .unwrap(); + } + } + + // TODO - generate suggested resolutions + + Err(Error::ConflictError(msg)) + } + + fn get_auxiliary_node_info( + &self, + item_set: &ParseItemSet, + symbol: Symbol, + ) -> AuxiliarySymbolInfo { + let parent_symbols = item_set + .entries + .keys() + .filter_map(|item| { + if item.symbol(&self.syntax_grammar, &self.item_set_builder.inlines) == Some(symbol) + { + None + } else { + None + } + }) + .collect(); + AuxiliarySymbolInfo { + auxiliary_symbol: symbol, + parent_symbols, + } + } + + fn populate_used_symbols(&mut self) { + let mut terminal_usages = vec![false; self.lexical_grammar.variables.len()]; + let mut non_terminal_usages = vec![false; self.syntax_grammar.variables.len()]; + let mut external_usages = vec![false; self.syntax_grammar.external_tokens.len()]; + for state in &self.parse_table.states { + for symbol in state.terminal_entries.keys() { + match symbol.kind { + SymbolType::Terminal => terminal_usages[symbol.index] = true, + SymbolType::External => external_usages[symbol.index] = true, + _ => {} + } + } + for symbol in state.nonterminal_entries.keys() { + non_terminal_usages[symbol.index] = true; + } + } + for (i, value) in terminal_usages.into_iter().enumerate() { + if value { + self.parse_table.symbols.push(Symbol::terminal(i)); + } + } + for (i, value) in non_terminal_usages.into_iter().enumerate() { + if value { + self.parse_table.symbols.push(Symbol::non_terminal(i)); + } + } + for (i, value) in external_usages.into_iter().enumerate() { + if value { + self.parse_table.symbols.push(Symbol::external(i)); + } + } + } + + fn get_alias_sequence_id(&mut self, item: &ParseItem) -> AliasSequenceId { + let production = 
item.production(&self.syntax_grammar, &self.item_set_builder.inlines); + let alias_sequence = production.steps.iter().map(|s| s.alias.clone()).collect(); + if let Some(index) = self + .parse_table + .alias_sequences + .iter() + .position(|seq| *seq == alias_sequence) + { + index + } else { + self.parse_table.alias_sequences.push(alias_sequence); + self.parse_table.alias_sequences.len() - 1 + } + } + + fn symbol_name(&self, symbol: &Symbol) -> String { + match symbol.kind { + SymbolType::End => "EOF".to_string(), + SymbolType::External => self.syntax_grammar.external_tokens[symbol.index] + .name + .clone(), + SymbolType::NonTerminal => self.syntax_grammar.variables[symbol.index].name.clone(), + SymbolType::Terminal => { + let variable = &self.lexical_grammar.variables[symbol.index]; + if variable.kind == VariableType::Named { + variable.name.clone() + } else { + format!("\"{}\"", &variable.name) + } + } + } + } +} + pub(crate) fn build_tables( syntax_grammar: &SyntaxGrammar, lexical_grammar: &LexicalGrammar, - simple_aliases: &AliasMap + simple_aliases: &AliasMap, ) -> Result<(ParseTable, LexTable, LexTable, Option)> { - unimplemented!(); + ParseTableBuilder { + syntax_grammar, + lexical_grammar, + simple_aliases, + item_set_builder: ParseItemSetBuilder::new(syntax_grammar, lexical_grammar), + state_ids_by_item_set: HashMap::new(), + item_sets_by_state_id: Vec::new(), + parse_state_queue: VecDeque::new(), + parse_table: ParseTable { + states: Vec::new(), + alias_sequences: Vec::new(), + symbols: Vec::new(), + }, + } + .build() } diff --git a/src/error.rs b/src/error.rs index 49064c22..b03efa93 100644 --- a/src/error.rs +++ b/src/error.rs @@ -3,6 +3,7 @@ pub enum Error { GrammarError(String), SymbolError(String), RegexError(String), + ConflictError(String), } pub type Result = std::result::Result; diff --git a/src/generate.rs b/src/generate.rs index 4507fb6f..dc3d5176 100644 --- a/src/generate.rs +++ b/src/generate.rs @@ -4,8 +4,8 @@ use 
crate::prepare_grammar::prepare_grammar; use crate::build_tables::build_tables; use crate::render::render_c_code; -pub fn generate_parser_for_grammar(input: String) -> Result { - let input_grammar = parse_grammar(&input)?; +pub fn generate_parser_for_grammar(input: &str) -> Result { + let input_grammar = parse_grammar(input)?; let (syntax_grammar, lexical_grammar, simple_aliases) = prepare_grammar(&input_grammar)?; let (parse_table, main_lex_table, keyword_lex_table, keyword_capture_token) = build_tables( &syntax_grammar, diff --git a/src/grammars.rs b/src/grammars.rs index 8abdad24..7512ec03 100644 --- a/src/grammars.rs +++ b/src/grammars.rs @@ -38,7 +38,7 @@ pub(crate) struct LexicalVariable { pub start_state: u32, } -#[derive(Debug, PartialEq, Eq)] +#[derive(Debug, Default, PartialEq, Eq)] pub(crate) struct LexicalGrammar { pub nfa: Nfa, pub variables: Vec, @@ -112,6 +112,14 @@ impl Production { pub fn first_symbol(&self) -> Option { self.steps.first().map(|s| s.symbol.clone()) } + + pub fn last_precedence(&self) -> i32 { + self.steps.last().map(|s| s.precedence).unwrap_or(0) + } + + pub fn last_associativity(&self) -> Option { + self.steps.last().map(|s| s.associativity).unwrap_or(None) + } } impl Default for Production { @@ -137,3 +145,9 @@ impl Variable { Self { name: name.to_string(), kind: VariableType::Anonymous, rule } } } + +impl SyntaxVariable { + pub fn is_auxiliary(&self) -> bool { + self.kind == VariableType::Auxiliary + } +} diff --git a/src/js/dsl.js b/src/js/dsl.js new file mode 100644 index 00000000..ba3962cd --- /dev/null +++ b/src/js/dsl.js @@ -0,0 +1,334 @@ +const UNICODE_ESCAPE_PATTERN = /\\u([0-9a-f]{4})/gi; +const DELIMITER_ESCAPE_PATTERN = /\\\//g; + +function alias(rule, value) { + const result = { + type: "ALIAS", + content: normalize(rule), + named: false, + value: null + }; + + switch (value.constructor) { + case String: + result.named = false; + result.value = value; + return result; + case ReferenceError: + result.named = true; + 
result.value = value.symbol.name; + return result; + case Object: + if (typeof value.type === 'string' && value.type === 'SYMBOL') { + result.named = true; + result.value = value.name; + return result; + } + } + + throw new Error('Invalid alias value ' + value); +} + +function blank() { + return { + type: "BLANK" + }; +} + +function choice(...elements) { + return { + type: "CHOICE", + members: elements.map(normalize) + }; +} + +function optional(value) { + return choice(value, blank()); +} + +function prec(number, rule) { + if (rule == null) { + rule = number; + number = 0; + } + + return { + type: "PREC", + value: number, + content: normalize(rule) + }; +} + +prec.left = function(number, rule) { + if (rule == null) { + rule = number; + number = 0; + } + + return { + type: "PREC_LEFT", + value: number, + content: normalize(rule) + }; +} + +prec.right = function(number, rule) { + if (rule == null) { + rule = number; + number = 0; + } + + return { + type: "PREC_RIGHT", + value: number, + content: normalize(rule) + }; +} + +prec.dynamic = function(number, rule) { + return { + type: "PREC_DYNAMIC", + value: number, + content: normalize(rule) + }; +} + +function repeat(rule) { + return { + type: "REPEAT", + content: normalize(rule) + }; +} + +function repeat1(rule) { + return { + type: "REPEAT1", + content: normalize(rule) + }; +} + +function seq(...elements) { + return { + type: "SEQ", + members: elements.map(normalize) + }; +} + +function sym(name) { + return { + type: "SYMBOL", + name: name + }; +} + +function token(value) { + return { + type: "TOKEN", + content: normalize(value) + }; +} + +token.immediate = function(value) { + return { + type: "IMMEDIATE_TOKEN", + content: normalize(value) + }; +} + +function normalize(value) { + + if (typeof value == "undefined") + throw new Error("Undefined symbol"); + + switch (value.constructor) { + case String: + return { + type: 'STRING', + value + }; + case RegExp: + return { + type: 'PATTERN', + value: value.source + 
.replace( + DELIMITER_ESCAPE_PATTERN, + '/' + ) + .replace( + UNICODE_ESCAPE_PATTERN, + (match, group) => String.fromCharCode(parseInt(group, 16)) + ) + }; + case ReferenceError: + throw value + default: + if (typeof value.type === 'string') { + return value; + } else { + throw new TypeError("Invalid rule: " + value.toString()); + } + } +} + +function RuleBuilder(ruleMap) { + return new Proxy({}, { + get(target, propertyName) { + const symbol = { + type: 'SYMBOL', + name: propertyName + }; + + if (!ruleMap || ruleMap.hasOwnProperty(propertyName)) { + return symbol; + } else { + const error = new ReferenceError(`Undefined symbol '${propertyName}'`); + error.symbol = symbol; + return error; + } + } + }) +} + +function grammar(baseGrammar, options) { + if (!options) { + options = baseGrammar; + baseGrammar = { + name: null, + rules: {}, + extras: [normalize(/\s/)], + conflicts: [], + externals: [], + inline: [] + }; + } + + let externals = baseGrammar.externals; + if (options.externals) { + if (typeof options.externals !== "function") { + throw new Error("Grammar's 'externals' property must be a function."); + } + + const externalsRuleBuilder = RuleBuilder(null) + const externalRules = options.externals.call(externalsRuleBuilder, externalsRuleBuilder, baseGrammar.externals); + + if (!Array.isArray(externalRules)) { + throw new Error("Grammar's 'externals' property must return an array of rules."); + } + + externals = externalRules.map(normalize); + } + + const ruleMap = {}; + for (const key in options.rules) { + ruleMap[key] = true; + } + for (const key in baseGrammar.rules) { + ruleMap[key] = true; + } + for (const external of externals) { + if (typeof external.name === 'string') { + ruleMap[external.name] = true; + } + } + + const ruleBuilder = RuleBuilder(ruleMap); + + const name = options.name; + if (typeof name !== "string") { + throw new Error("Grammar's 'name' property must be a string."); + } + + if (!/^[a-zA-Z_]\w*$/.test(name)) { + throw new Error("Grammar's 
'name' property must not start with a digit and cannot contain non-word characters."); + } + + let rules = Object.assign({}, baseGrammar.rules); + if (options.rules) { + if (typeof options.rules !== "object") { + throw new Error("Grammar's 'rules' property must be an object."); + } + + for (const ruleName in options.rules) { + const ruleFn = options.rules[ruleName]; + if (typeof ruleFn !== "function") { + throw new Error("Grammar rules must all be functions. '" + ruleName + "' rule is not."); + } + rules[ruleName] = normalize(ruleFn.call(ruleBuilder, ruleBuilder, baseGrammar.rules[ruleName])); + } + } + + let extras = baseGrammar.extras.slice(); + if (options.extras) { + if (typeof options.extras !== "function") { + throw new Error("Grammar's 'extras' property must be a function."); + } + + extras = options.extras + .call(ruleBuilder, ruleBuilder, baseGrammar.extras) + .map(normalize); + } + + let word = baseGrammar.word; + if (options.word) { + word = options.word.call(ruleBuilder, ruleBuilder).name; + if (typeof word != 'string') { + throw new Error("Grammar's 'word' property must be a named rule."); + } + } + + let conflicts = baseGrammar.conflicts; + if (options.conflicts) { + if (typeof options.conflicts !== "function") { + throw new Error("Grammar's 'conflicts' property must be a function."); + } + + const baseConflictRules = baseGrammar.conflicts.map(conflict => conflict.map(sym)); + const conflictRules = options.conflicts.call(ruleBuilder, ruleBuilder, baseConflictRules); + + if (!Array.isArray(conflictRules)) { + throw new Error("Grammar's conflicts must be an array of arrays of rules."); + } + + conflicts = conflictRules.map(conflictSet => { + if (!Array.isArray(conflictSet)) { + throw new Error("Grammar's conflicts must be an array of arrays of rules."); + } + + return conflictSet.map(symbol => symbol.name); + }); + } + + let inline = baseGrammar.inline; + if (options.inline) { + if (typeof options.inline !== "function") { + throw new Error("Grammar's 
'inline' property must be a function."); + } + + const baseInlineRules = baseGrammar.inline.map(sym); + const inlineRules = options.inline.call(ruleBuilder, ruleBuilder, baseInlineRules); + + if (!Array.isArray(inlineRules)) { + throw new Error("Grammar's inline must be an array of rules."); + } + + inline = inlineRules.map(symbol => symbol.name); + } + + if (Object.keys(rules).length == 0) { + throw new Error("Grammar must have at least one rule."); + } + + return {name, word, rules, extras, conflicts, externals, inline}; + } + +global.alias = alias; +global.blank = blank; +global.choice = choice; +global.optional = optional; +global.prec = prec; +global.repeat = repeat; +global.repeat1 = repeat1; +global.seq = seq; +global.sym = sym; +global.token = token; +global.grammar = grammar; diff --git a/src/main.rs b/src/main.rs index 9dc9efb2..c7ca2ca5 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,8 +1,15 @@ -use clap::{App, Arg, SubCommand}; +#[macro_use] +extern crate serde_derive; +#[macro_use] +extern crate serde_json; +#[macro_use] +extern crate lazy_static; -#[macro_use] extern crate serde_derive; -#[macro_use] extern crate serde_json; -#[macro_use] extern crate lazy_static; +use std::path::PathBuf; +use clap::{App, Arg, SubCommand}; +use std::env; +use std::io::Write; +use std::process::{Command, Stdio}; mod build_tables; mod error; @@ -20,25 +27,59 @@ fn main() -> error::Result<()> { .version("0.1") .author("Max Brunsfeld ") .about("Generates and tests parsers") + .subcommand(SubCommand::with_name("generate").about("Generate a parser")) .subcommand( - SubCommand::with_name("generate") - .about("Generate a parser") - ).subcommand( SubCommand::with_name("parse") .about("Parse a file") - .arg(Arg::with_name("path").index(1)) - ).subcommand( + .arg(Arg::with_name("path").index(1)), + ) + .subcommand( SubCommand::with_name("test") .about("Run a parser's tests") .arg(Arg::with_name("path").index(1).required(true)) 
.arg(Arg::with_name("line").index(2).required(true)) - .arg(Arg::with_name("column").index(3).required(true)) - ).get_matches(); + .arg(Arg::with_name("column").index(3).required(true)), + ) + .get_matches(); if let Some(matches) = matches.subcommand_matches("generate") { - let code = generate::generate_parser_for_grammar(String::new())?; + let mut grammar_path = env::current_dir().expect("Failed to read CWD"); + grammar_path.push("grammar.js"); + let grammar_json = load_js_grammar_file(grammar_path); + let code = generate::generate_parser_for_grammar(&grammar_json)?; println!("{}", code); } Ok(()) } + +fn load_js_grammar_file(grammar_path: PathBuf) -> String { + let mut node_process = Command::new("node") + .stdin(Stdio::piped()) + .stdout(Stdio::piped()) + .spawn() + .expect("Failed to run `node`"); + + let js_prelude = include_str!("./js/dsl.js"); + let mut node_stdin = node_process + .stdin + .take() + .expect("Failed to open stdin for node"); + write!( + node_stdin, + "{}\nconsole.log(JSON.stringify(require(\"{}\"), null, 2));\n", + js_prelude, + grammar_path.to_str().unwrap() + ).expect("Failed to write to node's stdin"); + drop(node_stdin); + let output = node_process + .wait_with_output() + .expect("Failed to read output from node"); + match output.status.code() { + None => panic!("Node process was killed"), + Some(0) => {} + Some(code) => panic!(format!("Node process exited with status {}", code)), + } + + String::from_utf8(output.stdout).expect("Got invalid UTF8 from node") +} diff --git a/src/nfa.rs b/src/nfa.rs index bc084ede..f6acb67a 100644 --- a/src/nfa.rs +++ b/src/nfa.rs @@ -23,6 +23,12 @@ pub struct Nfa { pub states: Vec } +impl Default for Nfa { + fn default() -> Self { + Self { states: Vec::new() } + } +} + #[derive(Debug)] pub struct NfaCursor<'a> { pub(crate) state_ids: Vec, diff --git a/src/prepare_grammar/extract_simple_aliases.rs b/src/prepare_grammar/extract_simple_aliases.rs index 8b87ea2e..ff7204a0 100644 --- 
a/src/prepare_grammar/extract_simple_aliases.rs +++ b/src/prepare_grammar/extract_simple_aliases.rs @@ -22,6 +22,7 @@ pub(super) fn extract_simple_aliases( Symbol { kind: SymbolType::External, index} => &mut external_status_list[index], Symbol { kind: SymbolType::NonTerminal, index} => &mut non_terminal_status_list[index], Symbol { kind: SymbolType::Terminal, index} => &mut terminal_status_list[index], + Symbol { kind: SymbolType::End, .. } => panic!("Unexpected end token"), }; if step.alias.is_none() { @@ -49,6 +50,7 @@ pub(super) fn extract_simple_aliases( Symbol { kind: SymbolType::External, index} => &external_status_list[index], Symbol { kind: SymbolType::NonTerminal, index} => &non_terminal_status_list[index], Symbol { kind: SymbolType::Terminal, index} => &terminal_status_list[index], + Symbol { kind: SymbolType::End, .. } => panic!("Unexpected end token"), }; if status.alias.is_some() { diff --git a/src/prepare_grammar/extract_tokens.rs b/src/prepare_grammar/extract_tokens.rs index d53555af..eaeede90 100644 --- a/src/prepare_grammar/extract_tokens.rs +++ b/src/prepare_grammar/extract_tokens.rs @@ -67,10 +67,13 @@ pub(super) fn extract_tokens( .expected_conflicts .into_iter() .map(|conflict| { - conflict + let mut result: Vec<_> = conflict .iter() .map(|symbol| symbol_replacer.replace_symbol(*symbol)) - .collect() + .collect(); + result.sort_unstable(); + result.dedup(); + result }) .collect(); diff --git a/src/render/mod.rs b/src/render/mod.rs index 5bd11a34..2ca610a6 100644 --- a/src/render/mod.rs +++ b/src/render/mod.rs @@ -1,6 +1,188 @@ -use crate::rules::{Symbol, AliasMap}; -use crate::grammars::{SyntaxGrammar, LexicalGrammar}; -use crate::tables::{ParseTable, LexTable}; +use crate::grammars::{LexicalGrammar, SyntaxGrammar, VariableType}; +use crate::rules::{Alias, AliasMap, Symbol, SymbolType}; +use crate::tables::{LexTable, ParseTable, ParseTableEntry}; +use std::collections::{HashMap, HashSet}; +use std::fmt::Write; + +macro_rules! 
add_line { + ($this: tt, $($arg: tt)*) => { + for _ in 0..$this.indent_level { + write!(&mut $this.buffer, " ").unwrap(); + } + $this.buffer.write_fmt(format_args!($($arg)*)).unwrap(); + $this.buffer += "\n"; + } +} + +struct Generator { + buffer: String, + indent_level: usize, + + language_name: String, + parse_table: ParseTable, + main_lex_table: LexTable, + keyword_lex_table: LexTable, + keyword_capture_token: Option, + syntax_grammar: SyntaxGrammar, + lexical_grammar: LexicalGrammar, + simple_aliases: AliasMap, + symbol_ids: HashMap, + parse_table_entries: Vec<(usize, ParseTableEntry)>, + next_parse_action_list_index: usize, + unique_aliases: HashSet, +} + +impl Generator { + fn generate(mut self) -> String { + self.add_includes(); + self.add_pragmas(); + self.add_stats(); + self.add_symbol_enum(); + self.add_symbol_names_list(); + self.buffer + } + + fn add_includes(&mut self) { + add_line!(self, "#include "); + add_line!(self, ""); + } + + fn add_pragmas(&mut self) { + add_line!(self, "#if defined(__GNUC__) || defined(__clang__)"); + add_line!(self, "#pragma GCC diagnostic push"); + add_line!(self, "#pragma GCC diagnostic ignored \"-Wmissing-field-initializers\""); + add_line!(self, "#endif"); + add_line!(self, ""); + + // Compiling large lexer functions can be very slow, especially when + // using Visual Studio on Windows. Disabling optimizations is not + // ideal, but only a very small fraction of overall parse time is + // spent lexing, so the performance impact of this is pretty small. 
+ if self.main_lex_table.states.len() > 500 { + add_line!(self, "#ifdef _MSC_VER"); + add_line!(self, "#pragma optimize(\"\", off)"); + add_line!(self, "#endif"); + add_line!(self, ""); + } + } + + fn add_stats(&mut self) { + let mut token_count = 0; + + for symbol in &self.parse_table.symbols { + if symbol.is_terminal() { + token_count += 1; + } else if symbol.is_external() { + let external_token = &self.syntax_grammar.external_tokens[symbol.index]; + if external_token.corresponding_internal_token.is_none() { + token_count += 1; + } + } + } + + for alias_sequence in &self.parse_table.alias_sequences { + for entry in alias_sequence { + if let Some(alias) = entry { + self.unique_aliases.insert(alias.clone()); + } + } + } + + let mut symbol_id_values = HashSet::new(); + for i in 0..self.parse_table.symbols.len() { + self.assign_symbol_id(self.parse_table.symbols[i], &mut symbol_id_values); + } + + add_line!(self, "#define LANGUAGE_VERSION {}", 6); + add_line!(self, "#define STATE_COUNT {}", self.parse_table.states.len()); + add_line!(self, "#define SYMBOL_COUNT {}", self.parse_table.symbols.len()); + add_line!(self, "#define ALIAS_COUNT {}", self.unique_aliases.len()); + add_line!(self, "#define TOKEN_COUNT {}", token_count); + add_line!(self, "#define EXTERNAL_TOKEN_COUNT {}", self.syntax_grammar.external_tokens.len()); + // add_line!(self, "#define MAX_ALIAS_SEQUENCE_LENGTH {}\n", self.parse_table.max_alias_sequence_length); + add_line!(self, ""); + } + + fn add_symbol_enum(&mut self) { + add_line!(self, "enum {{"); + self.indent(); + for i in 0..self.parse_table.symbols.len() { + let symbol = self.parse_table.symbols[i]; + if symbol != Symbol::end() { + add_line!(self, "{} = {}", self.symbol_ids[&symbol], i); + } + } + self.dedent(); + add_line!(self, "}};"); + add_line!(self, ""); + } + + fn add_symbol_names_list(&mut self) { + add_line!(self, "static const char *ts_symbol_names[] = {{"); + self.indent(); + self.dedent(); + add_line!(self, "}};"); + 
add_line!(self, ""); + } + + fn assign_symbol_id(&mut self, symbol: Symbol, used_ids: &mut HashSet) { + let mut id; + if symbol == Symbol::end() { + id = "ts_builtin_sym_end".to_string(); + } else { + let (name, kind) = self.metadata_for_symbol(symbol); + id = match kind { + VariableType::Auxiliary => format!("aux_sym_{}", self.sanitize_name(name)), + VariableType::Anonymous => format!("anon_sym_{}", self.sanitize_name(name)), + VariableType::Hidden | VariableType::Named => { + format!("sym_{}", self.sanitize_name(name)) + } + }; + + let mut suffix_number = 1; + let mut suffix = String::new(); + while used_ids.contains(&id) { + id.drain(id.len() - suffix.len()..); + suffix_number += 1; + suffix = suffix_number.to_string(); + id += &suffix; + } + } + + used_ids.insert(id.clone()); + self.symbol_ids.insert(symbol, id); + } + + fn metadata_for_symbol(&self, symbol: Symbol) -> (&str, VariableType) { + match symbol.kind { + SymbolType::End => ("end", VariableType::Auxiliary), + SymbolType::NonTerminal => { + let variable = &self.syntax_grammar.variables[symbol.index]; + (&variable.name, variable.kind) + } + SymbolType::Terminal => { + let variable = &self.lexical_grammar.variables[symbol.index]; + (&variable.name, variable.kind) + } + SymbolType::External => { + let token = &self.syntax_grammar.external_tokens[symbol.index]; + (&token.name, token.kind) + } + } + } + + fn sanitize_name(&self, name: &str) -> String { + name.to_string() + } + + fn indent(&mut self) { + self.indent_level += 1; + } + + fn dedent(&mut self) { + self.indent_level -= 1; + } +} pub(crate) fn render_c_code( name: &str, @@ -12,5 +194,21 @@ pub(crate) fn render_c_code( lexical_grammar: LexicalGrammar, simple_aliases: AliasMap, ) -> String { - unimplemented!(); + Generator { + buffer: String::new(), + indent_level: 0, + language_name: name.to_string(), + parse_table, + main_lex_table, + keyword_lex_table, + keyword_capture_token, + syntax_grammar, + lexical_grammar, + simple_aliases, + symbol_ids: 
HashMap::new(), + parse_table_entries: Vec::new(), + next_parse_action_list_index: 0, + unique_aliases: HashSet::new(), + } + .generate() } diff --git a/src/rules.rs b/src/rules.rs index 9374a283..34f4c8b9 100644 --- a/src/rules.rs +++ b/src/rules.rs @@ -1,10 +1,11 @@ use std::collections::HashMap; -#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)] pub(crate) enum SymbolType { External, Terminal, NonTerminal, + End, } #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] @@ -33,7 +34,7 @@ pub(crate) struct MetadataParams { pub alias: Option, } -#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)] pub(crate) struct Symbol { pub kind: SymbolType, pub index: usize, @@ -56,6 +57,15 @@ pub(crate) enum Rule { } impl Rule { + pub fn alias(content: Rule, value: String, is_named: bool) -> Self { + add_metadata(content, move |params| { + params.alias = Some(Alias { + is_named, + value + }); + }) + } + pub fn token(content: Rule) -> Self { add_metadata(content, |params| { params.is_token = true; @@ -169,6 +179,13 @@ impl Symbol { index, } } + + pub fn end() -> Self { + Symbol { + kind: SymbolType::End, + index: 0, + } + } } impl From for Rule { @@ -177,7 +194,7 @@ impl From for Rule { } } -fn add_metadata(input: Rule, f: T) -> Rule { +fn add_metadata(input: Rule, f: T) -> Rule { match input { Rule::Metadata { rule, mut params } => { f(&mut params); diff --git a/src/tables.rs b/src/tables.rs index de66253c..9100b81e 100644 --- a/src/tables.rs +++ b/src/tables.rs @@ -6,20 +6,13 @@ pub(crate) type AliasSequenceId = usize; pub(crate) type ParseStateId = usize; pub(crate) type LexStateId = usize; -#[derive(Clone, Copy, Debug, PartialEq, Eq)] -pub(crate) enum ParseActionType { - Error, - Shift, - Reduce, - Accept, - Recover, -} - #[derive(Clone, Copy, Debug, PartialEq, Eq)] pub(crate) enum ParseAction { Accept, - Error, - Shift(ParseStateId), + Shift 
{ + state: ParseStateId, + is_repetition: bool, + }, ShiftExtra, Recover, Reduce { @@ -28,50 +21,69 @@ pub(crate) enum ParseAction { precedence: i32, dynamic_precedence: i32, associativity: Option, - alias_sequence_id: Option, - is_repetition: bool, + alias_sequence_id: AliasSequenceId, } } #[derive(Clone, Debug, PartialEq, Eq)] pub(crate) struct ParseTableEntry { - actions: Vec, - reusable: bool, + pub actions: Vec, + pub reusable: bool, } #[derive(Clone, Debug, PartialEq, Eq)] pub(crate) struct ParseState { - terminal_entries: HashMap, - nonterminal_entries: HashMap + pub terminal_entries: HashMap, + pub nonterminal_entries: HashMap } #[derive(Debug, PartialEq, Eq)] pub(crate) struct ParseTable { - states: Vec, - alias_sequences: Vec>, + pub states: Vec, + pub symbols: Vec, + pub alias_sequences: Vec>>, } #[derive(Clone, Debug, PartialEq, Eq)] pub(crate) struct AdvanceAction { - state: LexStateId, - precedence: Range, - in_main_token: bool, + pub state: LexStateId, + pub precedence: Range, + pub in_main_token: bool, } #[derive(Clone, Debug, PartialEq, Eq)] pub(crate) struct AcceptTokenAction { - symbol: Symbol, - precedence: i32, - implicit_precedence: i32, + pub symbol: Symbol, + pub precedence: i32, + pub implicit_precedence: i32, } #[derive(Clone, Debug, PartialEq, Eq)] pub(crate) struct LexState { - advance_actions: HashMap, - accept_action: Option, + pub advance_actions: HashMap, + pub accept_action: Option, } #[derive(Debug, PartialEq, Eq)] pub(crate) struct LexTable { - states: Vec, + pub states: Vec, +} + +impl ParseTableEntry { + pub fn new() -> Self { + Self { + reusable: true, + actions: Vec::new(), + } + } +} + +impl ParseAction { + pub fn precedence(&self) -> i32 { + if let ParseAction::Reduce { precedence, .. 
} = self { + *precedence + } else { + 0 + } + } } From 261a7fd07347b20ad500b58ac3d1dbf96990da81 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 21 Dec 2018 15:02:48 -0800 Subject: [PATCH 074/208] Represent ParseItem with reference to Production Implement comparisons in a way that disregards past steps. --- src/build_tables/inline_variables.rs | 441 ----------------------- src/build_tables/item.rs | 315 ++++++++-------- src/build_tables/item_set_builder.rs | 66 ++-- src/build_tables/mod.rs | 120 +++---- src/generate.rs | 5 +- src/grammars.rs | 78 +++- src/prepare_grammar/mod.rs | 16 +- src/prepare_grammar/process_inlines.rs | 477 +++++++++++++++++++++++++ src/rules.rs | 4 +- 9 files changed, 803 insertions(+), 719 deletions(-) delete mode 100644 src/build_tables/inline_variables.rs create mode 100644 src/prepare_grammar/process_inlines.rs diff --git a/src/build_tables/inline_variables.rs b/src/build_tables/inline_variables.rs deleted file mode 100644 index affbe163..00000000 --- a/src/build_tables/inline_variables.rs +++ /dev/null @@ -1,441 +0,0 @@ -use super::item::ParseItem; -use crate::grammars::{Production, SyntaxGrammar}; -use std::collections::HashMap; - -pub(crate) struct InlinedProductionMap { - pub inlined_productions: Vec, - item_map: HashMap>, -} - -impl InlinedProductionMap { - pub fn new(grammar: &SyntaxGrammar) -> Self { - let mut result = Self { - inlined_productions: Vec::new(), - item_map: HashMap::new(), - }; - - let mut items_to_process = Vec::new(); - for (variable_index, variable) in grammar.variables.iter().enumerate() { - for production_index in 0..variable.productions.len() { - items_to_process.push(ParseItem::Normal { - variable_index: variable_index as u32, - production_index: production_index as u32, - step_index: 0, - }); - while !items_to_process.is_empty() { - let mut i = 0; - while i < items_to_process.len() { - let item = &items_to_process[i]; - if let Some(step) = item.step(grammar, &result) { - if 
grammar.variables_to_inline.contains(&step.symbol) { - let inlined_items = result - .inline(*item, grammar) - .into_iter() - .map(|production_index| ParseItem::Inlined { - variable_index: item.variable_index(), - production_index: *production_index, - step_index: item.step_index() as u32, - }) - .collect::>(); - items_to_process.splice(i..i + 1, inlined_items); - } else { - items_to_process[i] = item.successor(); - i += 1; - } - } else { - items_to_process.remove(i); - } - } - } - } - } - - result - } - - pub fn inlined_items<'a>( - &'a self, - item: ParseItem, - ) -> Option + 'a> { - self.item_map.get(&item).map(|production_indices| { - production_indices - .iter() - .cloned() - .map(move |production_index| ParseItem::Inlined { - variable_index: item.variable_index(), - production_index, - step_index: item.step_index() as u32, - }) - }) - } - - fn inline(&mut self, item: ParseItem, grammar: &SyntaxGrammar) -> &Vec { - let step_index = item.step_index(); - let mut productions_to_add = grammar.variables - [item.step(grammar, self).unwrap().symbol.index] - .productions - .clone(); - - let mut i = 0; - while i < productions_to_add.len() { - if let Some(first_symbol) = productions_to_add[i].first_symbol() { - if grammar.variables_to_inline.contains(&first_symbol) { - // Remove the production from the vector, replacing it with a placeholder. - let production = productions_to_add - .splice(i..i + 1, [Production::default()].iter().cloned()) - .next() - .unwrap(); - - // Replace the placeholder with the inlined productions. 
- productions_to_add.splice( - i..i + 1, - grammar.variables[first_symbol.index] - .productions - .iter() - .map(|p| { - let mut p = p.clone(); - p.steps.extend(production.steps[1..].iter().cloned()); - p - }), - ); - continue; - } - } - i += 1; - } - - let result = productions_to_add - .into_iter() - .map(|production_to_add| { - let mut inlined_production = item.production(grammar, &self).clone(); - let removed_step = inlined_production - .steps - .splice( - step_index..step_index + 1, - production_to_add.steps.iter().cloned(), - ) - .next() - .unwrap(); - let inserted_steps = &mut inlined_production.steps - [step_index..step_index + production_to_add.steps.len()]; - if let Some(alias) = removed_step.alias { - for inserted_step in inserted_steps.iter_mut() { - inserted_step.alias = Some(alias.clone()); - } - } - if let Some(last_inserted_step) = inserted_steps.last_mut() { - last_inserted_step.precedence = removed_step.precedence; - last_inserted_step.associativity = removed_step.associativity; - } - self.inlined_productions - .iter() - .position(|p| *p == inlined_production) - .unwrap_or({ - self.inlined_productions.push(inlined_production); - self.inlined_productions.len() - 1 - }) as u32 - }) - .collect(); - - self.item_map.entry(item).or_insert(result) - } -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::grammars::{LexicalGrammar, ProductionStep, SyntaxVariable, VariableType}; - use crate::rules::{Alias, Associativity, Symbol}; - use std::borrow::Borrow; - - #[test] - fn test_basic_inlining() { - let grammar = SyntaxGrammar { - expected_conflicts: Vec::new(), - extra_tokens: Vec::new(), - external_tokens: Vec::new(), - word_token: None, - variables_to_inline: vec![Symbol::non_terminal(1)], - variables: vec![ - SyntaxVariable { - name: "non-terminal-0".to_string(), - kind: VariableType::Named, - productions: vec![Production { - dynamic_precedence: 0, - steps: vec![ - ProductionStep::new(Symbol::terminal(10)), - 
ProductionStep::new(Symbol::non_terminal(1)), // inlined - ProductionStep::new(Symbol::terminal(11)), - ], - }], - }, - SyntaxVariable { - name: "non-terminal-1".to_string(), - kind: VariableType::Named, - productions: vec![ - Production { - dynamic_precedence: 0, - steps: vec![ - ProductionStep::new(Symbol::terminal(12)), - ProductionStep::new(Symbol::terminal(13)), - ], - }, - Production { - dynamic_precedence: 0, - steps: vec![ProductionStep::new(Symbol::terminal(14))], - }, - ], - }, - ], - }; - - let inline_map = InlinedProductionMap::new(&grammar); - - // Nothing to inline at step 0. - assert!(inline_map - .inlined_items(ParseItem::Normal { - variable_index: 0, - production_index: 0, - step_index: 0 - }) - .is_none()); - - // Inlining variable 1 yields two productions. - assert_eq!( - display_items( - inline_map - .inlined_items(ParseItem::Normal { - variable_index: 0, - production_index: 0, - step_index: 1 - }) - .unwrap(), - &grammar, - &inline_map - ), - vec![ - "non-terminal-0 → terminal-10 • terminal-12 terminal-13 terminal-11" - .to_string(), - "non-terminal-0 → terminal-10 • terminal-14 terminal-11".to_string(), - ] - ); - } - - #[test] - fn test_nested_inlining() { - let grammar = SyntaxGrammar { - variables: vec![ - SyntaxVariable { - name: "non-terminal-0".to_string(), - kind: VariableType::Named, - productions: vec![Production { - dynamic_precedence: 0, - steps: vec![ - ProductionStep::new(Symbol::terminal(10)), - ProductionStep::new(Symbol::non_terminal(1)), // inlined - ProductionStep::new(Symbol::terminal(11)), - ProductionStep::new(Symbol::non_terminal(2)), // inlined - ProductionStep::new(Symbol::terminal(12)), - ], - }], - }, - SyntaxVariable { - name: "non-terminal-1".to_string(), - kind: VariableType::Named, - productions: vec![ - Production { - dynamic_precedence: 0, - steps: vec![ProductionStep::new(Symbol::terminal(13))], - }, - Production { - dynamic_precedence: 0, - steps: vec![ - ProductionStep::new(Symbol::non_terminal(3)), // 
inlined - ProductionStep::new(Symbol::terminal(14)), - ], - }, - ], - }, - SyntaxVariable { - name: "non-terminal-2".to_string(), - kind: VariableType::Named, - productions: vec![Production { - dynamic_precedence: 0, - steps: vec![ProductionStep::new(Symbol::terminal(15))], - }], - }, - SyntaxVariable { - name: "non-terminal-3".to_string(), - kind: VariableType::Named, - productions: vec![Production { - dynamic_precedence: 0, - steps: vec![ProductionStep::new(Symbol::terminal(16))], - }], - }, - ], - variables_to_inline: vec![ - Symbol::non_terminal(1), - Symbol::non_terminal(2), - Symbol::non_terminal(3), - ], - expected_conflicts: Vec::new(), - extra_tokens: Vec::new(), - external_tokens: Vec::new(), - word_token: None, - }; - - let inline_map = InlinedProductionMap::new(&grammar); - - let items = inline_map - .inlined_items(ParseItem::Normal { - variable_index: 0, - production_index: 0, - step_index: 1, - }) - .unwrap() - .collect::>(); - - assert_eq!( - display_items(&items, &grammar, &inline_map), - vec![ - "non-terminal-0 → terminal-10 • terminal-13 terminal-11 non-terminal-2 terminal-12".to_string(), - "non-terminal-0 → terminal-10 • terminal-16 terminal-14 terminal-11 non-terminal-2 terminal-12".to_string() - ] - ); - - let item = items[0].successor().successor(); - assert_eq!( - display_items(&[item], &grammar, &inline_map), - vec![ - "non-terminal-0 → terminal-10 terminal-13 terminal-11 • non-terminal-2 terminal-12".to_string(), - ] - ); - - assert_eq!( - display_items(inline_map.inlined_items(item).unwrap(), &grammar, &inline_map), - vec![ - "non-terminal-0 → terminal-10 terminal-13 terminal-11 • terminal-15 terminal-12".to_string(), - ] - ); - } - - #[test] - fn test_inlining_with_precedence_and_alias() { - let grammar = SyntaxGrammar { - variables_to_inline: vec![Symbol::non_terminal(1), Symbol::non_terminal(2)], - variables: vec![ - SyntaxVariable { - name: "non-terminal-0".to_string(), - kind: VariableType::Named, - productions: vec![Production { - 
dynamic_precedence: 0, - steps: vec![ - ProductionStep::new(Symbol::non_terminal(1)) // inlined - .with_prec(1, Some(Associativity::Left)), - ProductionStep::new(Symbol::terminal(10)), - ProductionStep::new(Symbol::non_terminal(2)), // inlined - ], - }], - }, - SyntaxVariable { - name: "non-terminal-1".to_string(), - kind: VariableType::Named, - productions: vec![Production { - dynamic_precedence: 0, - steps: vec![ - ProductionStep::new(Symbol::terminal(11)) - .with_prec(2, None) - .with_alias("inner_alias", true), - ProductionStep::new(Symbol::terminal(12)).with_prec(3, None), - ], - }], - }, - SyntaxVariable { - name: "non-terminal-2".to_string(), - kind: VariableType::Named, - productions: vec![Production { - dynamic_precedence: 0, - steps: vec![ProductionStep::new(Symbol::terminal(13)) - .with_alias("outer_alias", true)], - }], - }, - ], - expected_conflicts: Vec::new(), - extra_tokens: Vec::new(), - external_tokens: Vec::new(), - word_token: None, - }; - - let inline_map = InlinedProductionMap::new(&grammar); - - let items = inline_map - .inlined_items(ParseItem::Normal { - variable_index: 0, - production_index: 0, - step_index: 0, - }) - .unwrap() - .collect::>(); - assert_eq!( - display_items(&items, &grammar, &inline_map)[0], - "non-terminal-0 → • terminal-11 terminal-12 terminal-10 non-terminal-2".to_string(), - ); - - // The first step in the inlined production retains its precedence and alias. - let item = items[0].successor(); - assert_eq!( - display_items(&[item], &grammar, &inline_map)[0], - "non-terminal-0 → terminal-11 • terminal-12 terminal-10 non-terminal-2".to_string(), - ); - assert_eq!(item.precedence(&grammar, &inline_map), 2); - assert_eq!( - items[0].step(&grammar, &inline_map).unwrap().alias, - Some(Alias { - value: "inner_alias".to_string(), - is_named: true, - }) - ); - - // The final terminal of the inlined production inherits the precedence of - // the inlined step. 
- let item = item.successor(); - assert_eq!( - display_items(&[item], &grammar, &inline_map)[0], - "non-terminal-0 → terminal-11 terminal-12 • terminal-10 non-terminal-2".to_string(), - ); - assert_eq!(item.precedence(&grammar, &inline_map), 1); - - let item = item.successor(); - assert_eq!( - display_items(&[item], &grammar, &inline_map)[0], - "non-terminal-0 → terminal-11 terminal-12 terminal-10 • non-terminal-2".to_string(), - ); - - // All steps of the inlined production inherit their alias from the - // inlined step. - let items = inline_map.inlined_items(item).unwrap().collect::>(); - assert_eq!( - display_items(&items, &grammar, &inline_map)[0], - "non-terminal-0 → terminal-11 terminal-12 terminal-10 • terminal-13".to_string(), - ); - assert_eq!( - items[0].step(&grammar, &inline_map).unwrap().alias, - Some(Alias { - value: "outer_alias".to_string(), - is_named: true, - }) - ) - } - - fn display_items( - items: impl IntoIterator>, - grammar: &SyntaxGrammar, - inline_map: &InlinedProductionMap, - ) -> Vec { - let lex = LexicalGrammar::default(); - items - .into_iter() - .map(|item| format!("{}", item.borrow().display_with(grammar, &lex, inline_map))) - .collect() - } -} diff --git a/src/build_tables/item.rs b/src/build_tables/item.rs index 9208f602..49ab4f27 100644 --- a/src/build_tables/item.rs +++ b/src/build_tables/item.rs @@ -1,10 +1,12 @@ -use super::inline_variables::InlinedProductionMap; use crate::grammars::{LexicalGrammar, Production, ProductionStep, SyntaxGrammar}; -use crate::rules::{Associativity, Symbol, SymbolType}; +use crate::rules::Associativity; +use crate::rules::{Symbol, SymbolType}; use smallbitvec::SmallBitVec; use std::collections::{HashMap, BTreeMap}; use std::fmt; use std::hash::{Hash, Hasher}; +use std::u32; +use std::cmp::Ordering; lazy_static! 
{ static ref START_PRODUCTION: Production = Production { @@ -28,49 +30,26 @@ pub(crate) struct LookaheadSet { eof: bool, } -#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)] -pub(crate) enum ParseItem { - Start { - step_index: u32, - }, - Normal { - variable_index: u32, - production_index: u32, - step_index: u32, - }, - Inlined { - variable_index: u32, - production_index: u32, - step_index: u32, - }, +#[derive(Clone, Copy, Debug)] +pub(crate) struct ParseItem<'a> { + pub variable_index: u32, + pub step_index: u32, + pub production: &'a Production, } #[derive(Clone, Debug, PartialEq, Eq)] -pub(crate) struct ParseItemSet { - pub entries: BTreeMap, +pub(crate) struct ParseItemSet<'a> { + pub entries: BTreeMap, LookaheadSet>, } -pub(crate) struct ParseItemDisplay<'a>( - &'a ParseItem, - &'a SyntaxGrammar, - &'a LexicalGrammar, - &'a InlinedProductionMap, -); - +pub(crate) struct ParseItemDisplay<'a>(&'a ParseItem<'a>, &'a SyntaxGrammar, &'a LexicalGrammar); pub(crate) struct LookaheadSetDisplay<'a>(&'a LookaheadSet, &'a SyntaxGrammar, &'a LexicalGrammar); - pub(crate) struct ParseItemSetDisplay<'a>( - &'a ParseItemSet, + &'a ParseItemSet<'a>, &'a SyntaxGrammar, &'a LexicalGrammar, - &'a InlinedProductionMap, ); -struct ParseItemSetMapEntry(ParseItemSet, u64); -pub(crate) struct ParseItemSetMap { - map: HashMap -} - impl LookaheadSet { pub fn new() -> Self { Self { @@ -173,152 +152,79 @@ impl LookaheadSet { } } -impl ParseItem { +impl<'a> ParseItem<'a> { pub fn start() -> Self { - ParseItem::Start { step_index: 0 } - } - - pub fn is_kernel(&self) -> bool { - match self { - ParseItem::Start { .. } => true, - ParseItem::Normal { step_index, .. } | ParseItem::Inlined { step_index, .. 
} => { - *step_index > 0 - } + ParseItem { + variable_index: u32::MAX, + production: &START_PRODUCTION, + step_index: 0, } } - pub fn production<'a>( - &self, - grammar: &'a SyntaxGrammar, - inlined_productions: &'a InlinedProductionMap, - ) -> &'a Production { - match self { - ParseItem::Start { .. } => &START_PRODUCTION, - ParseItem::Normal { - variable_index, - production_index, - .. - } => { - &grammar.variables[*variable_index as usize].productions[*production_index as usize] - } - ParseItem::Inlined { - production_index, .. - } => &inlined_productions.inlined_productions[*production_index as usize], + pub fn step(&self) -> Option<&'a ProductionStep> { + self.production.steps.get(self.step_index as usize) + } + + pub fn symbol(&self) -> Option { + self.step().map(|step| step.symbol) + } + + pub fn associativity(&self) -> Option { + self.prev_step().and_then(|step| step.associativity) + } + + pub fn precedence(&self) -> i32 { + self.prev_step().map_or(0, |step| step.precedence) + } + + pub fn prev_step(&self) -> Option<&'a ProductionStep> { + self.production.steps.get(self.step_index as usize - 1) + } + + pub fn is_done(&self) -> bool { + self.step_index as usize == self.production.steps.len() + } + + pub fn is_augmented(&self) -> bool { + self.variable_index == u32::MAX + } + + pub fn successor(&self) -> ParseItem<'a> { + ParseItem { + variable_index: self.variable_index, + production: self.production, + step_index: self.step_index + 1, } } - pub fn symbol( - &self, - grammar: &SyntaxGrammar, - inlined_productions: &InlinedProductionMap, - ) -> Option { - self.step(grammar, inlined_productions).map(|s| s.symbol) - } - - pub fn step<'a>( - &self, - grammar: &'a SyntaxGrammar, - inlined_productions: &'a InlinedProductionMap, - ) -> Option<&'a ProductionStep> { - self.production(grammar, inlined_productions) - .steps - .get(self.step_index()) - } - - pub fn precedence<'a>( - &self, - grammar: &'a SyntaxGrammar, - inlines: &'a InlinedProductionMap, - ) -> i32 { - 
self.production(grammar, inlines) - .steps - .get(self.step_index() - 1) - .map(|s| s.precedence) - .unwrap_or(0) - } - - pub fn associativity<'a>( - &self, - grammar: &'a SyntaxGrammar, - inlines: &'a InlinedProductionMap, - ) -> Option { - let production = self.production(grammar, inlines); - let step_index = self.step_index(); - if step_index == production.steps.len() { - production.steps.last().and_then(|s| s.associativity) - } else { - None - } - } - - pub fn variable_index(&self) -> u32 { - match self { - ParseItem::Start { .. } => panic!("Start item doesn't have a variable index"), - ParseItem::Normal { variable_index, .. } - | ParseItem::Inlined { variable_index, .. } => *variable_index, - } - } - - pub fn step_index(&self) -> usize { - match self { - ParseItem::Start { step_index } - | ParseItem::Normal { step_index, .. } - | ParseItem::Inlined { step_index, .. } => *step_index as usize, - } - } - - pub fn is_final(&self) -> bool { - if let ParseItem::Start { step_index: 1 } = self { - true - } else { - false - } - } - - fn step_index_mut(&mut self) -> &mut u32 { - match self { - ParseItem::Start { step_index } - | ParseItem::Normal { step_index, .. } - | ParseItem::Inlined { step_index, .. 
} => step_index, - } - } - - pub fn display_with<'a>( + pub fn display_with( &'a self, syntax_grammar: &'a SyntaxGrammar, lexical_grammar: &'a LexicalGrammar, - inlines: &'a InlinedProductionMap, ) -> ParseItemDisplay<'a> { - ParseItemDisplay(self, syntax_grammar, lexical_grammar, inlines) - } - - pub fn successor(&self) -> ParseItem { - let mut result = self.clone(); - *result.step_index_mut() += 1; - result + ParseItemDisplay(self, syntax_grammar, lexical_grammar) } } -impl ParseItemSet { - pub fn with<'a>(elements: impl IntoIterator) -> Self { +impl<'a> ParseItemSet<'a> { + pub fn with(elements: impl IntoIterator, LookaheadSet)>) -> Self { let mut result = Self::default(); for (item, lookaheads) in elements { - result.entries.insert(*item, lookaheads.clone()); + result.entries.insert(item, lookaheads); } result } - pub fn display_with<'a>( + pub fn display_with( &'a self, syntax_grammar: &'a SyntaxGrammar, lexical_grammar: &'a LexicalGrammar, - inlines: &'a InlinedProductionMap, ) -> ParseItemSetDisplay<'a> { - ParseItemSetDisplay(self, syntax_grammar, lexical_grammar, inlines) + ParseItemSetDisplay(self, syntax_grammar, lexical_grammar) } } -impl Default for ParseItemSet { +impl<'a> Default for ParseItemSet<'a> { fn default() -> Self { Self { entries: BTreeMap::new(), @@ -328,20 +234,18 @@ impl Default for ParseItemSet { impl<'a> fmt::Display for ParseItemDisplay<'a> { fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> { - if let ParseItem::Start { .. 
} = &self.0 { + if self.0.is_augmented() { write!(f, "START →")?; } else { write!( f, "{} →", - &self.1.variables[self.0.variable_index() as usize].name + &self.1.variables[self.0.variable_index as usize].name )?; } - let step_index = self.0.step_index(); - let production = self.0.production(self.1, self.3); - for (i, step) in production.steps.iter().enumerate() { - if i == step_index { + for (i, step) in self.0.production.steps.iter().enumerate() { + if i == self.0.step_index as usize { write!(f, " •")?; } @@ -359,7 +263,7 @@ impl<'a> fmt::Display for ParseItemDisplay<'a> { } } - if production.steps.len() == step_index { + if self.0.is_done() { write!(f, " •")?; } @@ -398,7 +302,7 @@ impl<'a> fmt::Display for ParseItemSetDisplay<'a> { writeln!( f, "{}\t{}", - item.display_with(self.1, self.2, self.3), + item.display_with(self.1, self.2), lookaheads.display_with(self.1, self.2) )?; } @@ -406,7 +310,94 @@ impl<'a> fmt::Display for ParseItemSetDisplay<'a> { } } -impl Hash for ParseItemSet { +impl<'a> Hash for ParseItem<'a> { + fn hash(&self, hasher: &mut H) { + hasher.write_u32(self.variable_index); + hasher.write_u32(self.step_index); + hasher.write_i32(self.production.dynamic_precedence); + hasher.write_usize(self.production.steps.len()); + hasher.write_i32(self.precedence()); + self.associativity().hash(hasher); + for step in &self.production.steps[0..self.step_index as usize] { + step.alias.hash(hasher); + } + for step in &self.production.steps[self.step_index as usize..] 
{ + step.hash(hasher); + } + } +} + +impl<'a> PartialEq for ParseItem<'a> { + fn eq(&self, other: &Self) -> bool { + if self.variable_index != other.variable_index + || self.step_index != other.step_index + || self.production.dynamic_precedence != other.production.dynamic_precedence + || self.production.steps.len() != other.production.steps.len() + || self.precedence() != other.precedence() + || self.associativity() != other.associativity() + { + return false; + } + + for (i, step) in self.production.steps.iter().enumerate() { + if i < self.step_index as usize { + if step.alias != other.production.steps[i].alias { + return false; + } + } else { + if *step != other.production.steps[i] { + return false; + } + } + } + + return true; + } +} + +impl<'a> PartialOrd for ParseItem<'a> { + fn partial_cmp(&self, other: &Self) -> Option { + if let Some(o) = self.variable_index.partial_cmp(&other.variable_index) { + return Some(o); + } + if let Some(o) = self.step_index.partial_cmp(&other.step_index) { + return Some(o); + } + if let Some(o) = self.production.dynamic_precedence.partial_cmp(&other.production.dynamic_precedence) { + return Some(o); + } + if let Some(o) = self.production.steps.len().partial_cmp(&other.production.steps.len()) { + return Some(o); + } + if let Some(o) = self.precedence().partial_cmp(&other.precedence()) { + return Some(o); + } + if let Some(o) = self.associativity().partial_cmp(&other.associativity()) { + return Some(o); + } + for (i, step) in self.production.steps.iter().enumerate() { + let cmp = if i < self.step_index as usize { + step.alias.partial_cmp(&other.production.steps[i].alias) + } else { + step.partial_cmp(&other.production.steps[i]) + }; + if let Some(o) = cmp { + return Some(o); + } + } + return None; + } +} + +impl<'a> Ord for ParseItem<'a> { + fn cmp(&self, other: &Self) -> Ordering { + self.partial_cmp(other).unwrap_or(Ordering::Equal) + } +} + +impl<'a> Eq for ParseItem<'a> {} + +impl<'a> Hash for ParseItemSet<'a> { fn hash(&self, 
hasher: &mut H) { hasher.write_usize(self.entries.len()); for (item, lookaheads) in self.entries.iter() { diff --git a/src/build_tables/item_set_builder.rs b/src/build_tables/item_set_builder.rs index 530c1f25..52ee0a45 100644 --- a/src/build_tables/item_set_builder.rs +++ b/src/build_tables/item_set_builder.rs @@ -1,12 +1,11 @@ -use super::inline_variables::InlinedProductionMap; use super::item::{LookaheadSet, ParseItem, ParseItemSet}; -use crate::grammars::{LexicalGrammar, SyntaxGrammar}; +use crate::grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar}; use crate::rules::Symbol; use std::collections::{HashMap, HashSet}; #[derive(Clone, Debug, PartialEq, Eq)] -struct TransitiveClosureAddition { - item: ParseItem, +struct TransitiveClosureAddition<'a> { + item: ParseItem<'a>, info: FollowSetInfo, } @@ -16,11 +15,10 @@ struct FollowSetInfo { propagates_lookaheads: bool, } -pub(crate) struct ParseItemSetBuilder { +pub(crate) struct ParseItemSetBuilder<'a> { first_sets: HashMap, last_sets: HashMap, - transitive_closure_additions: Vec>, - pub inlines: InlinedProductionMap, + transitive_closure_additions: Vec>>, } fn find_or_push(vector: &mut Vec, value: T) { @@ -29,13 +27,16 @@ fn find_or_push(vector: &mut Vec, value: T) { } } -impl ParseItemSetBuilder { - pub fn new(syntax_grammar: &SyntaxGrammar, lexical_grammar: &LexicalGrammar) -> Self { +impl<'a> ParseItemSetBuilder<'a> { + pub fn new( + syntax_grammar: &'a SyntaxGrammar, + lexical_grammar: &'a LexicalGrammar, + inlines: &'a InlinedProductionMap, + ) -> Self { let mut result = Self { first_sets: HashMap::new(), last_sets: HashMap::new(), transitive_closure_additions: vec![Vec::new(); syntax_grammar.variables.len()], - inlines: InlinedProductionMap::new(syntax_grammar), }; // For each grammar symbol, populate the FIRST and LAST sets: the set of @@ -193,22 +194,28 @@ impl ParseItemSetBuilder { for (variable_index, follow_set_info) in follow_set_info_by_non_terminal { let variable = 
&syntax_grammar.variables[variable_index]; let non_terminal = Symbol::non_terminal(variable_index); + let variable_index = variable_index as u32; if syntax_grammar.variables_to_inline.contains(&non_terminal) { continue; } - for production_index in 0..variable.productions.len() { - let item = ParseItem::Normal { - variable_index: variable_index as u32, - production_index: production_index as u32, + for (production_index, production) in variable.productions.iter().enumerate() { + let item = ParseItem { + variable_index, + production, step_index: 0, }; - if let Some(inlined_items) = result.inlines.inlined_items(item) { - for inlined_item in inlined_items { + // let step_id = item.as_step_id(syntax_grammar, inlines); + if let Some(inlined_productions) = inlines.inlined_productions(item.production, item.step_index) { + for production in inlined_productions { find_or_push( additions_for_non_terminal, TransitiveClosureAddition { - item: inlined_item, + item: ParseItem { + variable_index, + production, + step_index: item.step_index, + }, info: follow_set_info.clone(), }, ); @@ -231,14 +238,19 @@ impl ParseItemSetBuilder { pub(crate) fn transitive_closure( &mut self, - item_set: &ParseItemSet, - grammar: &SyntaxGrammar, - ) -> ParseItemSet { + item_set: &ParseItemSet<'a>, + grammar: &'a SyntaxGrammar, + inlines: &'a InlinedProductionMap, + ) -> ParseItemSet<'a> { let mut result = ParseItemSet::default(); for (item, lookaheads) in &item_set.entries { - if let Some(items) = self.inlines.inlined_items(*item) { - for item in items { - self.add_item(&mut result, item, lookaheads, grammar); + if let Some(productions) = inlines.inlined_productions(item.production, item.step_index) { + for production in productions { + self.add_item(&mut result, ParseItem { + variable_index: item.variable_index, + production, + step_index: item.step_index, + }, lookaheads, grammar); } } else { self.add_item(&mut result, *item, lookaheads, grammar); @@ -253,14 +265,14 @@ impl ParseItemSetBuilder { 
fn add_item( &self, - set: &mut ParseItemSet, - item: ParseItem, + set: &mut ParseItemSet<'a>, + item: ParseItem<'a>, lookaheads: &LookaheadSet, grammar: &SyntaxGrammar, ) { - if let Some(step) = item.step(grammar, &self.inlines) { + if let Some(step) = item.step() { if step.symbol.is_non_terminal() { - let next_step = item.successor().step(grammar, &self.inlines); + let next_step = item.successor().step(); // Determine which tokens can follow this non-terminal. let following_tokens = if let Some(next_step) = next_step { diff --git a/src/build_tables/mod.rs b/src/build_tables/mod.rs index 091c5486..27951453 100644 --- a/src/build_tables/mod.rs +++ b/src/build_tables/mod.rs @@ -1,14 +1,14 @@ -mod inline_variables; mod item; mod item_set_builder; use self::item::{LookaheadSet, ParseItem, ParseItemSet}; use self::item_set_builder::ParseItemSetBuilder; use crate::error::{Error, Result}; -use crate::grammars::{LexicalGrammar, SyntaxGrammar, VariableType}; +use crate::grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar, VariableType}; use crate::rules::{AliasMap, Associativity, Symbol, SymbolType}; -use crate::tables::ParseTableEntry; -use crate::tables::{AliasSequenceId, LexTable, ParseAction, ParseState, ParseStateId, ParseTable}; +use crate::tables::{ + AliasSequenceId, LexTable, ParseAction, ParseState, ParseStateId, ParseTable, ParseTableEntry, +}; use core::ops::Range; use std::collections::hash_map::Entry; use std::collections::{HashMap, HashSet, VecDeque}; @@ -30,12 +30,13 @@ struct ParseStateQueueEntry { } struct ParseTableBuilder<'a> { - item_set_builder: ParseItemSetBuilder, + item_set_builder: ParseItemSetBuilder<'a>, syntax_grammar: &'a SyntaxGrammar, lexical_grammar: &'a LexicalGrammar, + inlines: &'a InlinedProductionMap, simple_aliases: &'a AliasMap, - state_ids_by_item_set: HashMap, - item_sets_by_state_id: Vec, + state_ids_by_item_set: HashMap, ParseStateId>, + item_sets_by_state_id: Vec>, parse_state_queue: VecDeque, parse_table: ParseTable, 
} @@ -46,16 +47,17 @@ impl<'a> ParseTableBuilder<'a> { self.parse_table.alias_sequences.push(Vec::new()); // Ensure that the error state has index 0. - let error_state_id = self.add_parse_state( - &Vec::new(), - &Vec::new(), - ParseItemSet::default(), - ); + let error_state_id = + self.add_parse_state(&Vec::new(), &Vec::new(), ParseItemSet::default()); self.add_parse_state( &Vec::new(), &Vec::new(), - ParseItemSet::with(&[(ParseItem::start(), LookaheadSet::with(&[Symbol::end()]))]), + ParseItemSet::with( + [(ParseItem::start(), LookaheadSet::with(&[Symbol::end()]))] + .iter() + .cloned(), + ), ); self.process_part_state_queue()?; @@ -68,7 +70,7 @@ impl<'a> ParseTableBuilder<'a> { &mut self, preceding_symbols: &SymbolSequence, preceding_auxiliary_symbols: &AuxiliarySymbolSequence, - item_set: ParseItemSet, + item_set: ParseItemSet<'a>, ) -> ParseStateId { match self.state_ids_by_item_set.entry(item_set) { Entry::Occupied(o) => { @@ -99,16 +101,14 @@ impl<'a> ParseTableBuilder<'a> { println!( "ITEM SET {}:\n{}", entry.state_id, - self.item_sets_by_state_id[entry.state_id].display_with( - &self.syntax_grammar, - &self.lexical_grammar, - &self.item_set_builder.inlines - ) + self.item_sets_by_state_id[entry.state_id] + .display_with(&self.syntax_grammar, &self.lexical_grammar,) ); let item_set = self.item_set_builder.transitive_closure( &self.item_sets_by_state_id[entry.state_id], self.syntax_grammar, + self.inlines, ); // println!("TRANSITIVE CLOSURE:"); @@ -131,7 +131,7 @@ impl<'a> ParseTableBuilder<'a> { &mut self, mut preceding_symbols: SymbolSequence, mut preceding_auxiliary_symbols: Vec, - item_set: ParseItemSet, + item_set: ParseItemSet<'a>, state_id: ParseStateId, ) -> Result<()> { let mut terminal_successors = HashMap::new(); @@ -139,9 +139,7 @@ impl<'a> ParseTableBuilder<'a> { let mut lookaheads_with_conflicts = HashSet::new(); for (item, lookaheads) in &item_set.entries { - if let Some(next_symbol) = - item.symbol(self.syntax_grammar, 
&self.item_set_builder.inlines) - { + if let Some(next_symbol) = item.symbol() { let successor = item.successor(); if next_symbol.is_non_terminal() { // Keep track of where auxiliary non-terminals (repeat symbols) are @@ -169,17 +167,15 @@ impl<'a> ParseTableBuilder<'a> { .insert_all(lookaheads); } } else { - let action = if item.is_final() { + let action = if item.is_augmented() { ParseAction::Accept } else { - let production = - item.production(&self.syntax_grammar, &self.item_set_builder.inlines); ParseAction::Reduce { - symbol: Symbol::non_terminal(item.variable_index() as usize), - child_count: item.step_index(), - precedence: production.last_precedence(), - associativity: production.last_associativity(), - dynamic_precedence: production.dynamic_precedence, + symbol: Symbol::non_terminal(item.variable_index as usize), + child_count: item.step_index as usize, + precedence: item.precedence(), + associativity: item.associativity(), + dynamic_precedence: item.production.dynamic_precedence, alias_sequence_id: self.get_alias_sequence_id(item), } }; @@ -280,17 +276,15 @@ impl<'a> ParseTableBuilder<'a> { let mut shift_precedence: Option> = None; let mut conflicting_items = HashSet::new(); for (item, lookaheads) in &item_set.entries { - let production = item.production(&self.syntax_grammar, &self.item_set_builder.inlines); - let step_index = item.step_index(); - if let Some(step) = production.steps.get(step_index) { - if step_index > 0 { + if let Some(step) = item.step() { + if item.step_index > 0 { if self .item_set_builder .first_set(&step.symbol) .contains(&conflicting_lookahead) { conflicting_items.insert(item); - let precedence = production.steps[step_index - 1].precedence; + let precedence = item.precedence(); if let Some(range) = &mut shift_precedence { if precedence < range.start { range.start = precedence; @@ -316,11 +310,11 @@ impl<'a> ParseTableBuilder<'a> { // by leaving it in the parse table, but marking the SHIFT action with // an `is_repetition` flag. 
let conflicting_variable_index = - conflicting_items.iter().next().unwrap().variable_index(); + conflicting_items.iter().next().unwrap().variable_index; if self.syntax_grammar.variables[conflicting_variable_index as usize].is_auxiliary() { if conflicting_items .iter() - .all(|item| item.variable_index() == conflicting_variable_index) + .all(|item| item.variable_index == conflicting_variable_index) { *is_repetition = true; return Ok(()); @@ -340,10 +334,7 @@ impl<'a> ParseTableBuilder<'a> { && shift_precedence.start < reduce_precedence) { entry.actions.pop(); - conflicting_items.retain(|item| { - item.step(&self.syntax_grammar, &self.item_set_builder.inlines) - .is_none() - }); + conflicting_items.retain(|item| item.is_done()); } // If the SHIFT and REDUCE actions have the same predence, consider // the REDUCE actions' associativity. @@ -367,10 +358,7 @@ impl<'a> ParseTableBuilder<'a> { match (has_left, has_non, has_right) { (true, false, false) => { entry.actions.pop(); - conflicting_items.retain(|item| { - item.step(&self.syntax_grammar, &self.item_set_builder.inlines) - .is_none() - }); + conflicting_items.retain(|item| item.is_done()); } (false, false, true) => { entry.actions.drain(0..entry.actions.len() - 1); @@ -392,7 +380,7 @@ impl<'a> ParseTableBuilder<'a> { // Determine the set of parent symbols involved in this conflict. 
let mut actual_conflict = Vec::new(); for item in &conflicting_items { - let symbol = Symbol::non_terminal(item.variable_index() as usize); + let symbol = Symbol::non_terminal(item.variable_index as usize); if self.syntax_grammar.variables[symbol.index].is_auxiliary() { actual_conflict.extend( preceding_auxiliary_symbols @@ -441,7 +429,7 @@ impl<'a> ParseTableBuilder<'a> { for preceding_symbol in preceding_symbols .iter() - .take(preceding_symbols.len() - item.step_index()) + .take(preceding_symbols.len() - item.step_index as usize) { write!(&mut msg, " {}", self.symbol_name(preceding_symbol)).unwrap(); } @@ -449,17 +437,12 @@ impl<'a> ParseTableBuilder<'a> { write!( &mut msg, " ({}", - &self.syntax_grammar.variables[item.variable_index() as usize].name + &self.syntax_grammar.variables[item.variable_index as usize].name ) .unwrap(); - for (j, step) in item - .production(&self.syntax_grammar, &self.item_set_builder.inlines) - .steps - .iter() - .enumerate() - { - if j == item.step_index() { + for (j, step) in item.production.steps.iter().enumerate() { + if j as u32 == item.step_index { write!(&mut msg, " •").unwrap(); } write!(&mut msg, " {}", self.symbol_name(&step.symbol)).unwrap(); @@ -467,10 +450,7 @@ impl<'a> ParseTableBuilder<'a> { write!(&mut msg, ")").unwrap(); - if item - .step(&self.syntax_grammar, &self.item_set_builder.inlines) - .is_none() - { + if item.is_done() { write!( &mut msg, " • {}", @@ -479,9 +459,8 @@ impl<'a> ParseTableBuilder<'a> { .unwrap(); } - let precedence = item.precedence(&self.syntax_grammar, &self.item_set_builder.inlines); - let associativity = - item.associativity(&self.syntax_grammar, &self.item_set_builder.inlines); + let precedence = item.precedence(); + let associativity = item.associativity(); if precedence != 0 || associativity.is_some() { write!( &mut msg, @@ -506,8 +485,7 @@ impl<'a> ParseTableBuilder<'a> { .entries .keys() .filter_map(|item| { - if item.symbol(&self.syntax_grammar, &self.item_set_builder.inlines) == 
Some(symbol) - { + if item.symbol() == Some(symbol) { None } else { None @@ -554,8 +532,12 @@ impl<'a> ParseTableBuilder<'a> { } fn get_alias_sequence_id(&mut self, item: &ParseItem) -> AliasSequenceId { - let production = item.production(&self.syntax_grammar, &self.item_set_builder.inlines); - let alias_sequence = production.steps.iter().map(|s| s.alias.clone()).collect(); + let alias_sequence = item + .production + .steps + .iter() + .map(|s| s.alias.clone()) + .collect(); if let Some(index) = self .parse_table .alias_sequences @@ -592,12 +574,14 @@ pub(crate) fn build_tables( syntax_grammar: &SyntaxGrammar, lexical_grammar: &LexicalGrammar, simple_aliases: &AliasMap, + inlines: &InlinedProductionMap, ) -> Result<(ParseTable, LexTable, LexTable, Option)> { ParseTableBuilder { syntax_grammar, lexical_grammar, simple_aliases, - item_set_builder: ParseItemSetBuilder::new(syntax_grammar, lexical_grammar), + inlines, + item_set_builder: ParseItemSetBuilder::new(syntax_grammar, lexical_grammar, inlines), state_ids_by_item_set: HashMap::new(), item_sets_by_state_id: Vec::new(), parse_state_queue: VecDeque::new(), diff --git a/src/generate.rs b/src/generate.rs index dc3d5176..cdbbea4f 100644 --- a/src/generate.rs +++ b/src/generate.rs @@ -6,11 +6,12 @@ use crate::render::render_c_code; pub fn generate_parser_for_grammar(input: &str) -> Result { let input_grammar = parse_grammar(input)?; - let (syntax_grammar, lexical_grammar, simple_aliases) = prepare_grammar(&input_grammar)?; + let (syntax_grammar, lexical_grammar, inlines, simple_aliases) = prepare_grammar(&input_grammar)?; let (parse_table, main_lex_table, keyword_lex_table, keyword_capture_token) = build_tables( &syntax_grammar, &lexical_grammar, - &simple_aliases + &simple_aliases, + &inlines )?; let c_code = render_c_code( &input_grammar.name, diff --git a/src/grammars.rs b/src/grammars.rs index 7512ec03..b751e4e4 100644 --- a/src/grammars.rs +++ b/src/grammars.rs @@ -1,12 +1,13 @@ -use 
crate::rules::{Associativity, Alias, Rule, Symbol}; use crate::nfa::Nfa; +use crate::rules::{Alias, Associativity, Rule, Symbol}; +use std::collections::HashMap; #[derive(Clone, Copy, Debug, PartialEq, Eq)] pub(crate) enum VariableType { Hidden, Auxiliary, Anonymous, - Named + Named, } // Input grammar @@ -46,12 +47,12 @@ pub(crate) struct LexicalGrammar { // Extracted syntax grammar -#[derive(Clone, Debug, PartialEq, Eq)] +#[derive(Clone, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)] pub(crate) struct ProductionStep { - pub symbol: Symbol, - pub precedence: i32, - pub associativity: Option, - pub alias: Option, + pub symbol: Symbol, + pub precedence: i32, + pub associativity: Option, + pub alias: Option, } #[derive(Clone, Debug, PartialEq, Eq)] @@ -60,6 +61,11 @@ pub(crate) struct Production { pub dynamic_precedence: i32, } +pub(crate) struct InlinedProductionMap { + pub productions: Vec, + pub production_map: HashMap<(*const Production, u32), Vec>, +} + #[derive(Clone, Debug, PartialEq, Eq)] pub(crate) struct SyntaxVariable { pub name: String, @@ -86,7 +92,12 @@ pub(crate) struct SyntaxGrammar { impl ProductionStep { pub(crate) fn new(symbol: Symbol) -> Self { - Self { symbol, precedence: 0, associativity: None, alias: None } + Self { + symbol, + precedence: 0, + associativity: None, + alias: None, + } } pub(crate) fn with_prec(self, precedence: i32, associativity: Option) -> Self { @@ -103,7 +114,10 @@ impl ProductionStep { symbol: self.symbol, precedence: self.precedence, associativity: self.associativity, - alias: Some(Alias { value: value.to_string(), is_named }), + alias: Some(Alias { + value: value.to_string(), + is_named, + }), } } } @@ -124,25 +138,44 @@ impl Production { impl Default for Production { fn default() -> Self { - Production { dynamic_precedence: 0, steps: Vec::new() } + Production { + dynamic_precedence: 0, + steps: Vec::new(), + } } } impl Variable { pub fn named(name: &str, rule: Rule) -> Self { - Self { name: name.to_string(), kind: 
VariableType::Named, rule } + Self { + name: name.to_string(), + kind: VariableType::Named, + rule, + } } pub fn auxiliary(name: &str, rule: Rule) -> Self { - Self { name: name.to_string(), kind: VariableType::Auxiliary, rule } + Self { + name: name.to_string(), + kind: VariableType::Auxiliary, + rule, + } } pub fn hidden(name: &str, rule: Rule) -> Self { - Self { name: name.to_string(), kind: VariableType::Hidden, rule } + Self { + name: name.to_string(), + kind: VariableType::Hidden, + rule, + } } pub fn anonymous(name: &str, rule: Rule) -> Self { - Self { name: name.to_string(), kind: VariableType::Anonymous, rule } + Self { + name: name.to_string(), + kind: VariableType::Anonymous, + rule, + } } } @@ -151,3 +184,20 @@ impl SyntaxVariable { self.kind == VariableType::Auxiliary } } + +impl InlinedProductionMap { + pub fn inlined_productions<'a>( + &'a self, + production: &Production, + step_index: u32, + ) -> Option + 'a> { + self.production_map + .get(&(production as *const Production, step_index)) + .map(|production_indices| { + production_indices + .iter() + .cloned() + .map(move |index| &self.productions[index]) + }) + } +} diff --git a/src/prepare_grammar/mod.rs b/src/prepare_grammar/mod.rs index 22435fca..f325383b 100644 --- a/src/prepare_grammar/mod.rs +++ b/src/prepare_grammar/mod.rs @@ -4,6 +4,7 @@ mod extract_simple_aliases; mod extract_tokens; mod flatten_grammar; mod intern_symbols; +mod process_inlines; use self::expand_repeats::expand_repeats; use self::expand_tokens::expand_tokens; @@ -11,8 +12,11 @@ use self::extract_simple_aliases::extract_simple_aliases; use self::extract_tokens::extract_tokens; use self::flatten_grammar::flatten_grammar; use self::intern_symbols::intern_symbols; +use self::process_inlines::process_inlines; use crate::error::Result; -use crate::grammars::{ExternalToken, InputGrammar, LexicalGrammar, SyntaxGrammar, Variable}; +use crate::grammars::{ + ExternalToken, InlinedProductionMap, InputGrammar, LexicalGrammar, 
SyntaxGrammar, Variable, +}; use crate::rules::{AliasMap, Rule, Symbol}; pub(self) struct IntermediateGrammar { @@ -36,12 +40,18 @@ pub(self) struct ExtractedLexicalGrammar { pub(crate) fn prepare_grammar( input_grammar: &InputGrammar, -) -> Result<(SyntaxGrammar, LexicalGrammar, AliasMap)> { +) -> Result<( + SyntaxGrammar, + LexicalGrammar, + InlinedProductionMap, + AliasMap, +)> { let interned_grammar = intern_symbols(input_grammar)?; let (syntax_grammar, lexical_grammar) = extract_tokens(interned_grammar)?; let syntax_grammar = expand_repeats(syntax_grammar); let mut syntax_grammar = flatten_grammar(syntax_grammar)?; let lexical_grammar = expand_tokens(lexical_grammar)?; let simple_aliases = extract_simple_aliases(&mut syntax_grammar, &lexical_grammar); - Ok((syntax_grammar, lexical_grammar, simple_aliases)) + let inlines = process_inlines(&syntax_grammar); + Ok((syntax_grammar, lexical_grammar, inlines, simple_aliases)) } diff --git a/src/prepare_grammar/process_inlines.rs b/src/prepare_grammar/process_inlines.rs new file mode 100644 index 00000000..0d7f6827 --- /dev/null +++ b/src/prepare_grammar/process_inlines.rs @@ -0,0 +1,477 @@ +use crate::grammars::{InlinedProductionMap, Production, ProductionStep, SyntaxGrammar}; +use std::collections::HashMap; + +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] +struct ProductionStepId { + variable_index: Option, + production_index: usize, + step_index: usize, +} + +struct InlinedProductionMapBuilder { + production_indices_by_step_id: HashMap>, + productions: Vec, +} + +impl ProductionStepId { + pub fn successor(&self) -> Self { + Self { + variable_index: self.variable_index, + production_index: self.production_index, + step_index: self.step_index + 1, + } + } +} + +fn production_for_id<'a>( + map: &'a InlinedProductionMapBuilder, + id: ProductionStepId, + grammar: &'a SyntaxGrammar, +) -> &'a Production { + if let Some(variable_index) = id.variable_index { + 
&grammar.variables[variable_index].productions[id.production_index] + } else { + &map.productions[id.production_index] + } +} + +fn production_step_for_id<'a>( + map: &'a InlinedProductionMapBuilder, + id: ProductionStepId, + grammar: &'a SyntaxGrammar, +) -> Option<&'a ProductionStep> { + production_for_id(map, id, grammar).steps.get(id.step_index) +} + +fn inline<'a>( + map: &'a mut InlinedProductionMapBuilder, + step_id: ProductionStepId, + grammar: &'a SyntaxGrammar, +) -> &'a Vec { + let step = production_step_for_id(map, step_id, grammar).unwrap(); + let mut productions_to_add = grammar.variables[step.symbol.index].productions.clone(); + + let mut i = 0; + while i < productions_to_add.len() { + if let Some(first_symbol) = productions_to_add[i].first_symbol() { + if grammar.variables_to_inline.contains(&first_symbol) { + // Remove the production from the vector, replacing it with a placeholder. + let production = productions_to_add + .splice(i..i + 1, [Production::default()].iter().cloned()) + .next() + .unwrap(); + + // Replace the placeholder with the inlined productions. 
+ productions_to_add.splice( + i..i + 1, + grammar.variables[first_symbol.index] + .productions + .iter() + .map(|p| { + let mut p = p.clone(); + p.steps.extend(production.steps[1..].iter().cloned()); + p + }), + ); + continue; + } + } + i += 1; + } + + let result = productions_to_add + .into_iter() + .map(|production_to_add| { + let mut inlined_production = production_for_id(&map, step_id, grammar).clone(); + let removed_step = inlined_production + .steps + .splice( + step_id.step_index..step_id.step_index + 1, + production_to_add.steps.iter().cloned(), + ) + .next() + .unwrap(); + let inserted_steps = &mut inlined_production.steps + [step_id.step_index..step_id.step_index + production_to_add.steps.len()]; + if let Some(alias) = removed_step.alias { + for inserted_step in inserted_steps.iter_mut() { + inserted_step.alias = Some(alias.clone()); + } + } + if let Some(last_inserted_step) = inserted_steps.last_mut() { + last_inserted_step.precedence = removed_step.precedence; + last_inserted_step.associativity = removed_step.associativity; + } + map.productions + .iter() + .position(|p| *p == inlined_production) + .unwrap_or({ + map.productions.push(inlined_production); + map.productions.len() - 1 + }) + }) + .collect(); + + map.production_indices_by_step_id + .entry(step_id) + .or_insert(result) +} + +pub(super) fn process_inlines(grammar: &SyntaxGrammar) -> InlinedProductionMap { + let mut result = InlinedProductionMapBuilder { + productions: Vec::new(), + production_indices_by_step_id: HashMap::new(), + }; + + let mut step_ids_to_process = Vec::new(); + for (variable_index, variable) in grammar.variables.iter().enumerate() { + for production_index in 0..variable.productions.len() { + step_ids_to_process.push(ProductionStepId { + variable_index: Some(variable_index), + production_index, + step_index: 0, + }); + while !step_ids_to_process.is_empty() { + let mut i = 0; + while i < step_ids_to_process.len() { + let step_id = step_ids_to_process[i]; + if let Some(step) 
= production_step_for_id(&result, step_id, grammar) { + if grammar.variables_to_inline.contains(&step.symbol) { + let inlined_step_ids = inline(&mut result, step_id, grammar) + .into_iter() + .cloned() + .map(|production_index| ProductionStepId { + variable_index: None, + production_index, + step_index: step_id.step_index, + }) + .collect::>(); + step_ids_to_process.splice(i..i + 1, inlined_step_ids); + } else { + step_ids_to_process[i] = step_id.successor(); + i += 1; + } + } else { + step_ids_to_process.remove(i); + } + } + } + } + } + + // result + let productions = result.productions; + let production_indices_by_step_id = result.production_indices_by_step_id; + + let production_map = production_indices_by_step_id + .into_iter() + .map(|(step_id, production_indices)| { + let production = if let Some(variable_index) = step_id.variable_index { + &grammar.variables[variable_index].productions[step_id.production_index] + } else { + &productions[step_id.production_index] + } as *const Production; + ((production, step_id.step_index as u32), production_indices) + }) + .collect(); + + InlinedProductionMap { productions, production_map } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::grammars::{ProductionStep, SyntaxVariable, VariableType}; + use crate::rules::{Associativity, Symbol}; + + #[test] + fn test_basic_inlining() { + let grammar = SyntaxGrammar { + expected_conflicts: Vec::new(), + extra_tokens: Vec::new(), + external_tokens: Vec::new(), + word_token: None, + variables_to_inline: vec![Symbol::non_terminal(1)], + variables: vec![ + SyntaxVariable { + name: "non-terminal-0".to_string(), + kind: VariableType::Named, + productions: vec![Production { + dynamic_precedence: 0, + steps: vec![ + ProductionStep::new(Symbol::terminal(10)), + ProductionStep::new(Symbol::non_terminal(1)), // inlined + ProductionStep::new(Symbol::terminal(11)), + ], + }], + }, + SyntaxVariable { + name: "non-terminal-1".to_string(), + kind: VariableType::Named, + productions: 
vec![ + Production { + dynamic_precedence: 0, + steps: vec![ + ProductionStep::new(Symbol::terminal(12)), + ProductionStep::new(Symbol::terminal(13)), + ], + }, + Production { + dynamic_precedence: 0, + steps: vec![ProductionStep::new(Symbol::terminal(14))], + }, + ], + }, + ], + }; + let inline_map = process_inlines(&grammar); + + // Nothing to inline at step 0. + assert!(inline_map + .inlined_productions(&grammar.variables[0].productions[0], 0) + .is_none()); + + // Inlining variable 1 yields two productions. + assert_eq!( + inline_map + .inlined_productions(&grammar.variables[0].productions[0], 1) + .unwrap() + .cloned() + .collect::>(), + vec![ + Production { + dynamic_precedence: 0, + steps: vec![ + ProductionStep::new(Symbol::terminal(10)), + ProductionStep::new(Symbol::terminal(12)), + ProductionStep::new(Symbol::terminal(13)), + ProductionStep::new(Symbol::terminal(11)), + ], + }, + Production { + dynamic_precedence: 0, + steps: vec![ + ProductionStep::new(Symbol::terminal(10)), + ProductionStep::new(Symbol::terminal(14)), + ProductionStep::new(Symbol::terminal(11)), + ], + }, + ] + ); + } + + #[test] + fn test_nested_inlining() { + let grammar = SyntaxGrammar { + variables: vec![ + SyntaxVariable { + name: "non-terminal-0".to_string(), + kind: VariableType::Named, + productions: vec![Production { + dynamic_precedence: 0, + steps: vec![ + ProductionStep::new(Symbol::terminal(10)), + ProductionStep::new(Symbol::non_terminal(1)), // inlined + ProductionStep::new(Symbol::terminal(11)), + ProductionStep::new(Symbol::non_terminal(2)), // inlined + ProductionStep::new(Symbol::terminal(12)), + ], + }], + }, + SyntaxVariable { + name: "non-terminal-1".to_string(), + kind: VariableType::Named, + productions: vec![ + Production { + dynamic_precedence: 0, + steps: vec![ProductionStep::new(Symbol::terminal(13))], + }, + Production { + dynamic_precedence: 0, + steps: vec![ + ProductionStep::new(Symbol::non_terminal(3)), // inlined + 
ProductionStep::new(Symbol::terminal(14)), + ], + }, + ], + }, + SyntaxVariable { + name: "non-terminal-2".to_string(), + kind: VariableType::Named, + productions: vec![Production { + dynamic_precedence: 0, + steps: vec![ProductionStep::new(Symbol::terminal(15))], + }], + }, + SyntaxVariable { + name: "non-terminal-3".to_string(), + kind: VariableType::Named, + productions: vec![Production { + dynamic_precedence: 0, + steps: vec![ProductionStep::new(Symbol::terminal(16))], + }], + }, + ], + variables_to_inline: vec![ + Symbol::non_terminal(1), + Symbol::non_terminal(2), + Symbol::non_terminal(3), + ], + expected_conflicts: Vec::new(), + extra_tokens: Vec::new(), + external_tokens: Vec::new(), + word_token: None, + }; + let inline_map = process_inlines(&grammar); + + let productions: Vec<&Production> = inline_map + .inlined_productions(&grammar.variables[0].productions[0], 1) + .unwrap() + .collect(); + + assert_eq!( + productions.iter().cloned().cloned().collect::>(), + vec![ + Production { + dynamic_precedence: 0, + steps: vec![ + ProductionStep::new(Symbol::terminal(10)), + ProductionStep::new(Symbol::terminal(13)), + ProductionStep::new(Symbol::terminal(11)), + ProductionStep::new(Symbol::non_terminal(2)), + ProductionStep::new(Symbol::terminal(12)), + ], + }, + Production { + dynamic_precedence: 0, + steps: vec![ + ProductionStep::new(Symbol::terminal(10)), + ProductionStep::new(Symbol::terminal(16)), + ProductionStep::new(Symbol::terminal(14)), + ProductionStep::new(Symbol::terminal(11)), + ProductionStep::new(Symbol::non_terminal(2)), + ProductionStep::new(Symbol::terminal(12)), + ], + }, + ] + ); + + assert_eq!( + inline_map + .inlined_productions(productions[0], 3) + .unwrap() + .cloned() + .collect::>(), + vec![Production { + dynamic_precedence: 0, + steps: vec![ + ProductionStep::new(Symbol::terminal(10)), + ProductionStep::new(Symbol::terminal(13)), + ProductionStep::new(Symbol::terminal(11)), + ProductionStep::new(Symbol::terminal(15)), + 
ProductionStep::new(Symbol::terminal(12)), + ], + },] + ); + } + + #[test] + fn test_inlining_with_precedence_and_alias() { + let grammar = SyntaxGrammar { + variables_to_inline: vec![Symbol::non_terminal(1), Symbol::non_terminal(2)], + variables: vec![ + SyntaxVariable { + name: "non-terminal-0".to_string(), + kind: VariableType::Named, + productions: vec![Production { + dynamic_precedence: 0, + steps: vec![ + // inlined + ProductionStep::new(Symbol::non_terminal(1)) + .with_prec(1, Some(Associativity::Left)), + ProductionStep::new(Symbol::terminal(10)), + // inlined + ProductionStep::new(Symbol::non_terminal(2)) + .with_alias("outer_alias", true), + ], + }], + }, + SyntaxVariable { + name: "non-terminal-1".to_string(), + kind: VariableType::Named, + productions: vec![Production { + dynamic_precedence: 0, + steps: vec![ + ProductionStep::new(Symbol::terminal(11)) + .with_prec(2, None) + .with_alias("inner_alias", true), + ProductionStep::new(Symbol::terminal(12)).with_prec(3, None), + ], + }], + }, + SyntaxVariable { + name: "non-terminal-2".to_string(), + kind: VariableType::Named, + productions: vec![Production { + dynamic_precedence: 0, + steps: vec![ProductionStep::new(Symbol::terminal(13))], + }], + }, + ], + expected_conflicts: Vec::new(), + extra_tokens: Vec::new(), + external_tokens: Vec::new(), + word_token: None, + }; + + let inline_map = process_inlines(&grammar); + + let productions: Vec<_> = inline_map + .inlined_productions(&grammar.variables[0].productions[0], 0) + .unwrap() + .collect(); + + assert_eq!( + productions.iter().cloned().cloned().collect::>(), + vec![Production { + dynamic_precedence: 0, + steps: vec![ + // The first step in the inlined production retains its precedence + // and alias. + ProductionStep::new(Symbol::terminal(11)) + .with_prec(2, None) + .with_alias("inner_alias", true), + // The final step of the inlined production inherits the precedence of + // the inlined step. 
+ ProductionStep::new(Symbol::terminal(12)) + .with_prec(1, Some(Associativity::Left)), + ProductionStep::new(Symbol::terminal(10)), + ProductionStep::new(Symbol::non_terminal(2)) + .with_alias("outer_alias", true), + ] + }], + ); + + assert_eq!( + inline_map + .inlined_productions(productions[0], 3) + .unwrap() + .cloned() + .collect::>(), + vec![Production { + dynamic_precedence: 0, + steps: vec![ + ProductionStep::new(Symbol::terminal(11)) + .with_prec(2, None) + .with_alias("inner_alias", true), + ProductionStep::new(Symbol::terminal(12)) + .with_prec(1, Some(Associativity::Left)), + ProductionStep::new(Symbol::terminal(10)), + // All steps of the inlined production inherit their alias from the + // inlined step. + ProductionStep::new(Symbol::terminal(13)).with_alias("outer_alias", true), + ] + }], + ); + } +} diff --git a/src/rules.rs b/src/rules.rs index 34f4c8b9..3bfd5181 100644 --- a/src/rules.rs +++ b/src/rules.rs @@ -8,13 +8,13 @@ pub(crate) enum SymbolType { End, } -#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)] pub(crate) enum Associativity { Left, Right, } -#[derive(Clone, Debug, PartialEq, Eq, Hash)] +#[derive(Clone, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)] pub(crate) struct Alias { pub value: String, pub is_named: bool, From 99ecf29e4b4bb394b17f9818ce31f5da781f7575 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Sun, 23 Dec 2018 10:15:23 -0800 Subject: [PATCH 075/208] Fix typo causing infinite recursion in expand_regex --- src/prepare_grammar/expand_tokens.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/prepare_grammar/expand_tokens.rs b/src/prepare_grammar/expand_tokens.rs index 37f75e5a..5ee9861f 100644 --- a/src/prepare_grammar/expand_tokens.rs +++ b/src/prepare_grammar/expand_tokens.rs @@ -217,7 +217,7 @@ fn expand_regex(ast: &Ast, nfa: &mut Nfa, mut next_state_id: u32, is_sep: bool) } RepetitionKind::Range(RepetitionRange::AtLeast(min)) => { 
if expand_zero_or_more(&repetition.ast, nfa, next_state_id, is_sep)? { - expand_count(ast, min, nfa, next_state_id, is_sep) + expand_count(&repetition.ast, min, nfa, next_state_id, is_sep) } else { Ok(false) } From 5258ee2e6ad3f202e43f98a093c82da1143a27fa Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Sun, 23 Dec 2018 10:16:03 -0800 Subject: [PATCH 076/208] Implement more C code generation --- src/build_tables/item.rs | 60 +- src/build_tables/item_set_builder.rs | 27 +- src/build_tables/lex_table_builder.rs | 24 + src/build_tables/mod.rs | 61 ++- src/render/mod.rs | 761 ++++++++++++++++++++++++-- src/tables.rs | 12 +- 6 files changed, 840 insertions(+), 105 deletions(-) create mode 100644 src/build_tables/lex_table_builder.rs diff --git a/src/build_tables/item.rs b/src/build_tables/item.rs index 49ab4f27..28723d24 100644 --- a/src/build_tables/item.rs +++ b/src/build_tables/item.rs @@ -2,7 +2,7 @@ use crate::grammars::{LexicalGrammar, Production, ProductionStep, SyntaxGrammar} use crate::rules::Associativity; use crate::rules::{Symbol, SymbolType}; use smallbitvec::SmallBitVec; -use std::collections::{HashMap, BTreeMap}; +use std::collections::BTreeMap; use std::fmt; use std::hash::{Hash, Hasher}; use std::u32; @@ -178,7 +178,11 @@ impl<'a> ParseItem<'a> { } pub fn prev_step(&self) -> Option<&'a ProductionStep> { - self.production.steps.get(self.step_index as usize - 1) + if self.step_index > 0 { + Some(&self.production.steps[self.step_index as usize - 1]) + } else { + None + } } pub fn is_done(&self) -> bool { @@ -355,43 +359,49 @@ impl<'a> PartialEq for ParseItem<'a> { } } -impl<'a> PartialOrd for ParseItem<'a> { - fn partial_cmp(&self, other: &Self) -> Option { - if let Some(o) = self.variable_index.partial_cmp(&other.variable_index) { - return Some(o); +impl<'a> Ord for ParseItem<'a> { + fn cmp(&self, other: &Self) -> Ordering { + let o = self.variable_index.cmp(&other.variable_index); + if o != Ordering::Equal { + return o; } - if let Some(o) = 
self.step_index.partial_cmp(&other.step_index) { - return Some(o); + let o = self.step_index.cmp(&other.step_index); + if o != Ordering::Equal { + return o; } - if let Some(o) = self.production.dynamic_precedence.partial_cmp(&other.production.dynamic_precedence) { - return Some(o); + let o = self.production.dynamic_precedence.cmp(&other.production.dynamic_precedence); + if o != Ordering::Equal { + return o; } - if let Some(o) = self.production.steps.len().partial_cmp(&other.production.steps.len()) { - return Some(o); + let o = self.production.steps.len().cmp(&other.production.steps.len()); + if o != Ordering::Equal { + return o; } - if let Some(o) = self.precedence().partial_cmp(&other.precedence()) { - return Some(o); + let o = self.precedence().cmp(&other.precedence()); + if o != Ordering::Equal { + return o; } - if let Some(o) = self.associativity().partial_cmp(&other.associativity()) { - return Some(o); + let o = self.associativity().cmp(&other.associativity()); + if o != Ordering::Equal { + return o; } for (i, step) in self.production.steps.iter().enumerate() { - let cmp = if i < self.step_index as usize { - step.alias.partial_cmp(&other.production.steps[i].alias) + let o = if i < self.step_index as usize { + step.alias.cmp(&other.production.steps[i].alias) } else { - step.partial_cmp(&other.production.steps[i]) + step.cmp(&other.production.steps[i]) }; - if let Some(o) = cmp { - return Some(o); + if o != Ordering::Equal { + return o; } } - return None; + return Ordering::Equal; } } -impl<'a> Ord for ParseItem<'a> { - fn cmp(&self, other: &Self) -> Ordering { - self.partial_cmp(other).unwrap_or(Ordering::Equal) +impl<'a> PartialOrd for ParseItem<'a> { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) } } diff --git a/src/build_tables/item_set_builder.rs b/src/build_tables/item_set_builder.rs index 52ee0a45..d7883988 100644 --- a/src/build_tables/item_set_builder.rs +++ b/src/build_tables/item_set_builder.rs @@ -198,15 +198,16 @@ 
impl<'a> ParseItemSetBuilder<'a> { if syntax_grammar.variables_to_inline.contains(&non_terminal) { continue; } - for (production_index, production) in variable.productions.iter().enumerate() { + for production in &variable.productions { let item = ParseItem { variable_index, production, step_index: 0, }; - // let step_id = item.as_step_id(syntax_grammar, inlines); - if let Some(inlined_productions) = inlines.inlined_productions(item.production, item.step_index) { + if let Some(inlined_productions) = + inlines.inlined_productions(item.production, item.step_index) + { for production in inlined_productions { find_or_push( additions_for_non_terminal, @@ -244,16 +245,21 @@ impl<'a> ParseItemSetBuilder<'a> { ) -> ParseItemSet<'a> { let mut result = ParseItemSet::default(); for (item, lookaheads) in &item_set.entries { - if let Some(productions) = inlines.inlined_productions(item.production, item.step_index) { + if let Some(productions) = inlines.inlined_productions(item.production, item.step_index) + { for production in productions { - self.add_item(&mut result, ParseItem { - variable_index: item.variable_index, - production, - step_index: item.step_index, - }, lookaheads, grammar); + self.add_item( + &mut result, + ParseItem { + variable_index: item.variable_index, + production, + step_index: item.step_index, + }, + lookaheads, + ); } } else { - self.add_item(&mut result, *item, lookaheads, grammar); + self.add_item(&mut result, *item, lookaheads); } } result @@ -268,7 +274,6 @@ impl<'a> ParseItemSetBuilder<'a> { set: &mut ParseItemSet<'a>, item: ParseItem<'a>, lookaheads: &LookaheadSet, - grammar: &SyntaxGrammar, ) { if let Some(step) = item.step() { if step.symbol.is_non_terminal() { diff --git a/src/build_tables/lex_table_builder.rs b/src/build_tables/lex_table_builder.rs new file mode 100644 index 00000000..86d1578b --- /dev/null +++ b/src/build_tables/lex_table_builder.rs @@ -0,0 +1,24 @@ +use crate::rules::Symbol; +use crate::tables::LexTable; +use 
crate::grammars::{SyntaxGrammar, LexicalGrammar}; + +pub(crate) struct LexTableBuilder<'a> { + syntax_grammar: &'a SyntaxGrammar, + lexical_grammar: &'a LexicalGrammar, + table: LexTable, +} + +impl<'a> LexTableBuilder<'a> { + pub fn new( + syntax_grammar: &'a SyntaxGrammar, + lexical_grammar: &'a LexicalGrammar, + ) -> Self { + Self { + syntax_grammar, lexical_grammar, table: LexTable::default() + } + } + + pub fn build(self) -> (LexTable, LexTable, Option) { + (LexTable::default(), LexTable::default(), None) + } +} diff --git a/src/build_tables/mod.rs b/src/build_tables/mod.rs index 27951453..fc17ce7f 100644 --- a/src/build_tables/mod.rs +++ b/src/build_tables/mod.rs @@ -1,10 +1,13 @@ mod item; mod item_set_builder; +mod lex_table_builder; use self::item::{LookaheadSet, ParseItem, ParseItemSet}; use self::item_set_builder::ParseItemSetBuilder; +use self::lex_table_builder::LexTableBuilder; use crate::error::{Error, Result}; use crate::grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar, VariableType}; +use crate::rules::Alias; use crate::rules::{AliasMap, Associativity, Symbol, SymbolType}; use crate::tables::{ AliasSequenceId, LexTable, ParseAction, ParseState, ParseStateId, ParseTable, ParseTableEntry, @@ -43,7 +46,7 @@ struct ParseTableBuilder<'a> { impl<'a> ParseTableBuilder<'a> { fn build(mut self) -> Result<(ParseTable, LexTable, LexTable, Option)> { - // Ensure that the empty rename sequence has index 0. + // Ensure that the empty alias sequence has index 0. self.parse_table.alias_sequences.push(Vec::new()); // Ensure that the error state has index 0. 
@@ -61,9 +64,18 @@ impl<'a> ParseTableBuilder<'a> { ); self.process_part_state_queue()?; + + let lex_table_builder = LexTableBuilder::new(self.syntax_grammar, self.lexical_grammar); + self.populate_used_symbols(); - Err(Error::grammar("oh no")) + let (main_lex_table, keyword_lex_table, keyword_capture_token) = lex_table_builder.build(); + Ok(( + self.parse_table, + main_lex_table, + keyword_lex_table, + keyword_capture_token, + )) } fn add_parse_state( @@ -82,6 +94,7 @@ impl<'a> ParseTableBuilder<'a> { let state_id = self.parse_table.states.len(); self.item_sets_by_state_id.push(v.key().clone()); self.parse_table.states.push(ParseState { + lex_state_id: 0, terminal_entries: HashMap::new(), nonterminal_entries: HashMap::new(), }); @@ -98,12 +111,16 @@ impl<'a> ParseTableBuilder<'a> { fn process_part_state_queue(&mut self) -> Result<()> { while let Some(entry) = self.parse_state_queue.pop_front() { - println!( - "ITEM SET {}:\n{}", - entry.state_id, - self.item_sets_by_state_id[entry.state_id] - .display_with(&self.syntax_grammar, &self.lexical_grammar,) - ); + let debug = false; + + if debug { + println!( + "ITEM SET {}:\n{}", + entry.state_id, + self.item_sets_by_state_id[entry.state_id] + .display_with(&self.syntax_grammar, &self.lexical_grammar,) + ); + } let item_set = self.item_set_builder.transitive_closure( &self.item_sets_by_state_id[entry.state_id], @@ -111,11 +128,12 @@ impl<'a> ParseTableBuilder<'a> { self.inlines, ); - // println!("TRANSITIVE CLOSURE:"); - // for item in item_set.entries.keys() { - // println!("{}", item.display_with(&self.syntax_grammar, &self.lexical_grammar, &self.item_set_builder.inlines)); - // } - // println!(""); + if debug { + println!( + "TRANSITIVE CLOSURE:\n{}", + item_set.display_with(&self.syntax_grammar, &self.lexical_grammar) + ); + } self.add_actions( entry.preceding_symbols, @@ -249,6 +267,17 @@ impl<'a> ParseTableBuilder<'a> { )?; } + let state = &mut self.parse_table.states[state_id]; + for extra_token in 
&self.syntax_grammar.extra_tokens { + state + .terminal_entries + .entry(*extra_token) + .or_insert(ParseTableEntry { + reusable: true, + actions: vec![ParseAction::ShiftExtra], + }); + } + Ok(()) } @@ -514,6 +543,7 @@ impl<'a> ParseTableBuilder<'a> { non_terminal_usages[symbol.index] = true; } } + self.parse_table.symbols.push(Symbol::end()); for (i, value) in terminal_usages.into_iter().enumerate() { if value { self.parse_table.symbols.push(Symbol::terminal(i)); @@ -532,12 +562,15 @@ impl<'a> ParseTableBuilder<'a> { } fn get_alias_sequence_id(&mut self, item: &ParseItem) -> AliasSequenceId { - let alias_sequence = item + let mut alias_sequence: Vec> = item .production .steps .iter() .map(|s| s.alias.clone()) .collect(); + while alias_sequence.last() == Some(&None) { + alias_sequence.pop(); + } if let Some(index) = self .parse_table .alias_sequences diff --git a/src/render/mod.rs b/src/render/mod.rs index 2ca610a6..fc4cdafb 100644 --- a/src/render/mod.rs +++ b/src/render/mod.rs @@ -1,8 +1,16 @@ -use crate::grammars::{LexicalGrammar, SyntaxGrammar, VariableType}; +use crate::grammars::{ExternalToken, LexicalGrammar, SyntaxGrammar, VariableType}; +use crate::nfa::CharacterSet; use crate::rules::{Alias, AliasMap, Symbol, SymbolType}; -use crate::tables::{LexTable, ParseTable, ParseTableEntry}; +use crate::tables::{LexState, LexTable, ParseAction, ParseTable, ParseTableEntry}; use std::collections::{HashMap, HashSet}; use std::fmt::Write; +use std::mem::swap; + +macro_rules! add { + ($this: tt, $($arg: tt)*) => {{ + $this.buffer.write_fmt(format_args!($($arg)*)).unwrap(); + }} +} macro_rules! add_line { ($this: tt, $($arg: tt)*) => { @@ -14,10 +22,21 @@ macro_rules! add_line { } } +macro_rules! indent { + ($this: tt) => { + $this.indent_level += 1; + }; +} + +macro_rules! 
dedent { + ($this: tt) => { + $this.indent_level -= 1; + }; +} + struct Generator { buffer: String, indent_level: usize, - language_name: String, parse_table: ParseTable, main_lex_table: LexTable, @@ -27,9 +46,9 @@ struct Generator { lexical_grammar: LexicalGrammar, simple_aliases: AliasMap, symbol_ids: HashMap, - parse_table_entries: Vec<(usize, ParseTableEntry)>, - next_parse_action_list_index: usize, - unique_aliases: HashSet, + alias_ids: HashMap, + external_scanner_states: Vec>, + alias_map: HashMap>, } impl Generator { @@ -39,6 +58,30 @@ impl Generator { self.add_stats(); self.add_symbol_enum(); self.add_symbol_names_list(); + self.add_symbol_metadata_list(); + self.add_alias_sequences(); + + let mut main_lex_table = LexTable::default(); + swap(&mut main_lex_table, &mut self.main_lex_table); + self.add_lex_function("ts_lex", main_lex_table); + + if self.keyword_capture_token.is_some() { + let mut keyword_lex_table = LexTable::default(); + swap(&mut keyword_lex_table, &mut self.keyword_lex_table); + self.add_lex_function("ts_lex_keywords", keyword_lex_table); + } + + self.add_lex_modes_list(); + + if !self.syntax_grammar.external_tokens.is_empty() { + self.add_external_token_enum(); + self.add_external_scanner_symbol_map(); + self.add_external_scanner_states_list(); + } + + self.add_parse_table(); + self.add_parser_export(); + self.buffer } @@ -50,7 +93,10 @@ impl Generator { fn add_pragmas(&mut self) { add_line!(self, "#if defined(__GNUC__) || defined(__clang__)"); add_line!(self, "#pragma GCC diagnostic push"); - add_line!(self, "#pragma GCC diagnostic ignored \"-Wmissing-field-initializers\""); + add_line!( + self, + "#pragma GCC diagnostic ignored \"-Wmissing-field-initializers\"" + ); add_line!(self, "#endif"); add_line!(self, ""); @@ -67,81 +113,639 @@ impl Generator { } fn add_stats(&mut self) { - let mut token_count = 0; - - for symbol in &self.parse_table.symbols { - if symbol.is_terminal() { - token_count += 1; - } else if symbol.is_external() { - 
let external_token = &self.syntax_grammar.external_tokens[symbol.index]; - if external_token.corresponding_internal_token.is_none() { - token_count += 1; + let token_count = self + .parse_table + .symbols + .iter() + .filter(|symbol| { + if symbol.is_terminal() { + true + } else if symbol.is_external() { + self.syntax_grammar.external_tokens[symbol.index] + .corresponding_internal_token + .is_none() + } else { + false } - } + }) + .count(); + + let mut symbol_identifiers = HashSet::new(); + for i in 0..self.parse_table.symbols.len() { + self.assign_symbol_id(self.parse_table.symbols[i], &mut symbol_identifiers); } for alias_sequence in &self.parse_table.alias_sequences { for entry in alias_sequence { if let Some(alias) = entry { - self.unique_aliases.insert(alias.clone()); + let alias_kind = if alias.is_named { + VariableType::Named + } else { + VariableType::Anonymous + }; + let matching_symbol = self.parse_table.symbols.iter().cloned().find(|symbol| { + let (name, kind) = self.metadata_for_symbol(*symbol); + name == alias.value && kind == alias_kind + }); + let alias_id = if let Some(symbol) = matching_symbol { + self.symbol_ids[&symbol].clone() + } else if alias.is_named { + format!("alias_sym_{}", self.sanitize_identifier(&alias.value)) + } else { + format!("anon_alias_sym_{}", self.sanitize_identifier(&alias.value)) + }; + self.alias_ids.entry(alias.clone()).or_insert(alias_id); + self.alias_map + .entry(alias.clone()) + .or_insert(matching_symbol); } } } - let mut symbol_id_values = HashSet::new(); - for i in 0..self.parse_table.symbols.len() { - self.assign_symbol_id(self.parse_table.symbols[i], &mut symbol_id_values); - } - add_line!(self, "#define LANGUAGE_VERSION {}", 6); - add_line!(self, "#define STATE_COUNT {}", self.parse_table.states.len()); - add_line!(self, "#define SYMBOL_COUNT {}", self.parse_table.symbols.len()); - add_line!(self, "#define ALIAS_COUNT {}", self.unique_aliases.len()); + add_line!( + self, + "#define STATE_COUNT {}", + 
self.parse_table.states.len() + ); + add_line!( + self, + "#define SYMBOL_COUNT {}", + self.parse_table.symbols.len() + ); + add_line!( + self, + "#define ALIAS_COUNT {}", + self.alias_map.iter().filter(|e| e.1.is_none()).count() + ); add_line!(self, "#define TOKEN_COUNT {}", token_count); - add_line!(self, "#define EXTERNAL_TOKEN_COUNT {}", self.syntax_grammar.external_tokens.len()); - // add_line!(self, "#define MAX_ALIAS_SEQUENCE_LENGTH {}\n", self.parse_table.max_alias_sequence_length); + add_line!( + self, + "#define EXTERNAL_TOKEN_COUNT {}", + self.syntax_grammar.external_tokens.len() + ); + if let Some(max_alias_sequence_length) = self + .parse_table + .alias_sequences + .iter() + .map(|seq| seq.len()) + .max() + { + add_line!( + self, + "#define MAX_ALIAS_SEQUENCE_LENGTH {}", + max_alias_sequence_length + ); + } add_line!(self, ""); } fn add_symbol_enum(&mut self) { add_line!(self, "enum {{"); - self.indent(); - for i in 0..self.parse_table.symbols.len() { - let symbol = self.parse_table.symbols[i]; - if symbol != Symbol::end() { - add_line!(self, "{} = {}", self.symbol_ids[&symbol], i); + indent!(self); + let mut i = 1; + for symbol in self.parse_table.symbols.iter() { + if *symbol != Symbol::end() { + add_line!(self, "{} = {},", self.symbol_ids[&symbol], i); + i += 1; } } - self.dedent(); + for (alias, symbol) in &self.alias_map { + if symbol.is_none() { + add_line!(self, "{} = {},", self.alias_ids[&alias], i); + } + i += 1; + } + dedent!(self); add_line!(self, "}};"); add_line!(self, ""); } fn add_symbol_names_list(&mut self) { add_line!(self, "static const char *ts_symbol_names[] = {{"); - self.indent(); - self.dedent(); + indent!(self); + for symbol in self.parse_table.symbols.iter() { + if *symbol != Symbol::end() { + add_line!( + self, + "[{}] = \"{}\",", + self.symbol_ids[&symbol], + self.sanitize_string(self.metadata_for_symbol(*symbol).0) + ); + } + } + for (alias, symbol) in &self.alias_map { + if symbol.is_none() { + add_line!( + self, + "[{}] = 
\"{}\",", + self.alias_ids[&alias], + self.sanitize_string(&alias.value) + ); + } + } + dedent!(self); add_line!(self, "}};"); add_line!(self, ""); } - fn assign_symbol_id(&mut self, symbol: Symbol, used_ids: &mut HashSet) { + fn add_symbol_metadata_list(&mut self) { + add_line!( + self, + "static const TSSymbolMetadata ts_symbol_metadata[] = {{" + ); + indent!(self); + for symbol in &self.parse_table.symbols { + add_line!(self, "[{}] = {{", self.symbol_ids[&symbol]); + indent!(self); + match self.metadata_for_symbol(*symbol).1 { + VariableType::Named => { + add_line!(self, ".visible = true,"); + add_line!(self, ".named = true,"); + } + VariableType::Anonymous => { + add_line!(self, ".visible = true,"); + add_line!(self, ".named = false,"); + } + VariableType::Hidden => { + add_line!(self, ".visible = false,"); + add_line!(self, ".named = true,"); + } + VariableType::Auxiliary => { + add_line!(self, ".visible = false,"); + add_line!(self, ".named = false,"); + } + } + dedent!(self); + add_line!(self, "}},"); + } + for (alias, matching_symbol) in &self.alias_map { + if matching_symbol.is_none() { + add_line!(self, "[{}] = {{", self.alias_ids[&alias]); + indent!(self); + add_line!(self, ".visible = true,"); + add_line!(self, ".named = {},", alias.is_named); + dedent!(self); + add_line!(self, "}},"); + } + } + dedent!(self); + add_line!(self, "}};"); + add_line!(self, ""); + } + + fn add_alias_sequences(&mut self) { + add_line!( + self, + "static TSSymbol ts_alias_sequences[{}][MAX_ALIAS_SEQUENCE_LENGTH] = {{", + self.parse_table.alias_sequences.len() + ); + indent!(self); + for (i, sequence) in self.parse_table.alias_sequences.iter().enumerate().skip(1) { + add_line!(self, "[{}] = {{", i); + indent!(self); + for (j, alias) in sequence.iter().enumerate() { + if let Some(alias) = alias { + add_line!(self, "[{}] = {},", j, self.alias_ids[&alias]); + } + } + dedent!(self); + add_line!(self, "}},"); + } + dedent!(self); + add_line!(self, "}};"); + add_line!(self, ""); + } 
+ + fn add_lex_function(&mut self, name: &str, lex_table: LexTable) { + add_line!( + self, + "static bool {}(TSLexer *lexer, TSStateId state) {{", + name + ); + indent!(self); + add_line!(self, "START_LEXER();"); + add_line!(self, "switch (state) {{"); + indent!(self); + + for (i, state) in lex_table.states.into_iter().enumerate() { + add_line!(self, "case {}:", i); + indent!(self); + self.add_lex_state(state); + dedent!(self); + } + + add_line!(self, "default:"); + indent!(self); + add_line!(self, "return false;"); + dedent!(self); + + dedent!(self); + add_line!(self, "}}"); + dedent!(self); + add_line!(self, "}}"); + add_line!(self, ""); + } + + fn add_lex_state(&mut self, state: LexState) { + if let Some(accept_action) = state.accept_action { + add_line!( + self, + "ACCEPT_TOKEN({})", + self.symbol_ids[&accept_action.symbol] + ); + } + + let mut ruled_out_characters = HashSet::new(); + for (characters, action) in state.advance_actions { + let previous_length = self.buffer.len(); + + add!(self, "if ("); + if self.add_character_set_condition(&characters, &ruled_out_characters) { + add!(self, ")"); + indent!(self); + if action.in_main_token { + add_line!(self, "ADVANCE({});", action.state); + } else { + add_line!(self, "SKIP({});", action.state); + } + if let CharacterSet::Include(chars) = characters { + ruled_out_characters.extend(chars.iter()); + } + dedent!(self); + } else { + self.buffer.truncate(previous_length); + } + } + + add_line!(self, "END_STATE();"); + } + + fn add_character_set_condition( + &mut self, + characters: &CharacterSet, + ruled_out_characters: &HashSet, + ) -> bool { + true + } + + fn add_lex_modes_list(&mut self) { + self.get_external_scanner_state_id(HashSet::new()); + + let mut external_tokens_by_corresponding_internal_token = HashMap::new(); + for (i, external_token) in self.syntax_grammar.external_tokens.iter().enumerate() { + if let Some(symbol) = external_token.corresponding_internal_token { + 
external_tokens_by_corresponding_internal_token.insert(symbol.index, i); + } + } + + add_line!(self, "static TSLexMode ts_lex_modes[STATE_COUNT] = {{"); + indent!(self); + for i in 0..self.parse_table.states.len() { + let mut external_tokens = HashSet::new(); + for token in self.parse_table.states[i].terminal_entries.keys() { + if token.is_external() { + external_tokens.insert(token.index); + } else if token.is_terminal() { + if let Some(external_index) = + external_tokens_by_corresponding_internal_token.get(&token.index) + { + external_tokens.insert(*external_index); + } + } + } + + let external_state_id = self.get_external_scanner_state_id(external_tokens); + let state = &self.parse_table.states[i]; + if external_state_id > 0 { + add_line!( + self, + "[{}] = {{.lex_state = {}, .external_lex_state = {}}},", + i, + state.lex_state_id, + external_state_id + ); + } else { + add_line!(self, "[{}] = {{.lex_state = {}}},", i, state.lex_state_id); + } + } + dedent!(self); + add_line!(self, "}};"); + add_line!(self, ""); + } + + fn add_external_token_enum(&mut self) { + add_line!(self, "enum {{"); + indent!(self); + for i in 0..self.syntax_grammar.external_tokens.len() { + add_line!( + self, + "{} = {},", + self.external_token_id(&self.syntax_grammar.external_tokens[i]), + i + ); + } + dedent!(self); + add_line!(self, "}};"); + add_line!(self, ""); + } + + fn add_external_scanner_symbol_map(&mut self) { + add_line!( + self, + "static TSSymbol ts_external_scanner_symbol_map[EXTERNAL_TOKEN_COUNT] = {{" + ); + indent!(self); + for i in 0..self.syntax_grammar.external_tokens.len() { + add_line!( + self, + "[{}] = {},", + self.external_token_id(&self.syntax_grammar.external_tokens[i]), + self.symbol_ids[&Symbol::external(i)], + ); + } + dedent!(self); + add_line!(self, "}};"); + add_line!(self, ""); + } + + fn add_external_scanner_states_list(&mut self) { + add_line!( + self, + "static bool ts_external_scanner_states[{}][EXTERNAL_TOKEN_COUNT] = {{", + 
self.external_scanner_states.len(), + ); + indent!(self); + for i in 0..self.external_scanner_states.len() { + if !self.external_scanner_states[i].is_empty() { + add_line!(self, "[{}] = {{", i); + indent!(self); + for token_index in &self.external_scanner_states[i] { + add_line!( + self, + "[{}] = true,", + self.external_token_id(&self.syntax_grammar.external_tokens[*token_index]) + ); + } + dedent!(self); + add_line!(self, "}},"); + } + } + dedent!(self); + add_line!(self, "}};"); + add_line!(self, ""); + } + + fn add_parse_table(&mut self) { + let mut parse_table_entries = Vec::new(); + let mut next_parse_action_list_index = 0; + + self.get_parse_action_list_id( + &ParseTableEntry { + actions: Vec::new(), + reusable: false, + }, + &mut parse_table_entries, + &mut next_parse_action_list_index, + ); + + add_line!( + self, + "static uint16_t ts_parse_table[STATE_COUNT][SYMBOL_COUNT] = {{" + ); + indent!(self); + for (i, state) in self.parse_table.states.iter().enumerate() { + add_line!(self, "[{}] = {{", i); + indent!(self); + for (symbol, state_id) in &state.nonterminal_entries { + add_line!(self, "[{}] = STATE({}),", self.symbol_ids[symbol], state_id); + } + for (symbol, entry) in &state.terminal_entries { + let entry_id = self.get_parse_action_list_id( + entry, + &mut parse_table_entries, + &mut next_parse_action_list_index, + ); + add_line!( + self, + "[{}] = ACTIONS({}),", + self.symbol_ids[symbol], + entry_id + ); + } + dedent!(self); + add_line!(self, "}},"); + } + dedent!(self); + add_line!(self, "}};"); + add_line!(self, ""); + + self.add_parse_action_list(parse_table_entries); + } + + fn add_parse_action_list(&mut self, parse_table_entries: Vec<(usize, ParseTableEntry)>) { + add_line!(self, "static TSParseActionEntry ts_parse_actions[] = {{"); + indent!(self); + for (i, entry) in parse_table_entries { + add!( + self, + " [{}] = {{.count = {}, .reusable = {}}},", + i, + entry.actions.len(), + entry.reusable + ); + for action in entry.actions { + add!(self, 
" "); + match action { + ParseAction::Accept => add!(self, " ACCEPT_INPUT()"), + ParseAction::Recover => add!(self, "RECOVER()"), + ParseAction::ShiftExtra => add!(self, "SHIFT_EXTRA()"), + ParseAction::Shift { + state, + is_repetition, + } => { + if is_repetition { + add!(self, "SHIFT_REPEAT({})", state); + } else { + add!(self, "SHIFT({})", state); + } + } + ParseAction::Reduce { + symbol, + child_count, + dynamic_precedence, + alias_sequence_id, + .. + } => { + if !self.symbol_ids.contains_key(&symbol) { + eprintln!( + "SYMBOL: {:?} {:?}", + symbol, + self.metadata_for_symbol(symbol) + ); + } + add!(self, "REDUCE({}, {}", self.symbol_ids[&symbol], child_count); + if dynamic_precedence != 0 { + add!(self, ", .dynamic_precedence = {}", dynamic_precedence); + } + if alias_sequence_id != 0 { + add!(self, ", .alias_sequence_id = {}", alias_sequence_id); + } + add!(self, ")"); + } + } + add!(self, ",") + } + add!(self, "\n"); + } + dedent!(self); + add_line!(self, "}};"); + add_line!(self, ""); + } + + fn add_parser_export(&mut self) { + let language_function_name = format!("tree_sitter_{}", self.language_name); + let external_scanner_name = format!("{}_external_scanner", language_function_name); + + if !self.syntax_grammar.external_tokens.is_empty() { + add_line!(self, "void *{}_create();", external_scanner_name); + add_line!(self, "void {}_destroy(void *);", external_scanner_name); + add_line!( + self, + "bool {}_scan(void *, TSLexer *, const bool *);", + external_scanner_name + ); + add_line!( + self, + "unsigned {}_serialize(void *, char *);", + external_scanner_name + ); + add_line!( + self, + "void {}_deserialize(void *, const char *, unsigned);", + external_scanner_name + ); + add_line!(self, ""); + } + + add_line!(self, "#ifdef _WIN32"); + add_line!(self, "#define extern __declspec(dllexport)"); + add_line!(self, "#endif"); + add_line!(self, ""); + + add_line!( + self, + "extern const TSLanguage *{}() {{", + language_function_name + ); + indent!(self); + 
add_line!(self, "static TSLanguage language = {{"); + indent!(self); + add_line!(self, ".version = LANGUAGE_VERSION,"); + add_line!(self, ".symbol_count = SYMBOL_COUNT,"); + add_line!(self, ".alias_count = ALIAS_COUNT,"); + add_line!(self, ".token_count = TOKEN_COUNT,"); + add_line!(self, ".symbol_metadata = ts_symbol_metadata,"); + add_line!( + self, + ".parse_table = (const unsigned short *)ts_parse_table," + ); + add_line!(self, ".parse_actions = ts_parse_actions,"); + add_line!(self, ".lex_modes = ts_lex_modes,"); + add_line!(self, ".symbol_names = ts_symbol_names,"); + add_line!( + self, + ".alias_sequences = (const TSSymbol *)ts_alias_sequences," + ); + + add_line!( + self, + ".max_alias_sequence_length = MAX_ALIAS_SEQUENCE_LENGTH," + ); + add_line!(self, ".lex_fn = ts_lex,"); + + if let Some(keyword_capture_token) = self.keyword_capture_token { + add_line!(self, ".keyword_lex_fn = ts_lex_keywords,"); + add_line!( + self, + ".keyword_capture_token = {},", + self.symbol_ids[&keyword_capture_token] + ); + } + + add_line!(self, ".external_token_count = EXTERNAL_TOKEN_COUNT,"); + + if !self.syntax_grammar.external_tokens.is_empty() { + add_line!(self, ".external_scanner = {{"); + indent!(self); + add_line!(self, "(const bool *)ts_external_scanner_states,"); + add_line!(self, "ts_external_scanner_symbol_map,"); + add_line!(self, "{}_create,", external_scanner_name); + add_line!(self, "{}_destroy,", external_scanner_name); + add_line!(self, "{}_scan,", external_scanner_name); + add_line!(self, "{}_serialize,", external_scanner_name); + add_line!(self, "{}_deserialize,", external_scanner_name); + dedent!(self); + add_line!(self, "}},"); + } + dedent!(self); + + add_line!(self, "}};"); + add_line!(self, "return &language;"); + dedent!(self); + add_line!(self, "}}"); + } + + fn get_parse_action_list_id( + &self, + entry: &ParseTableEntry, + parse_table_entries: &mut Vec<(usize, ParseTableEntry)>, + next_parse_action_list_index: &mut usize, + ) -> usize { + if let 
Some((index, _)) = parse_table_entries.iter().find(|(_, e)| *e == *entry) { + return *index; + } + + let result = *next_parse_action_list_index; + parse_table_entries.push((result, entry.clone())); + *next_parse_action_list_index += 1 + entry.actions.len(); + result + } + + fn get_external_scanner_state_id(&mut self, external_tokens: HashSet) -> usize { + self.external_scanner_states + .iter() + .position(|tokens| *tokens == external_tokens) + .unwrap_or_else(|| { + self.external_scanner_states.push(external_tokens); + self.external_scanner_states.len() - 1 + }) + } + + fn external_token_id(&self, token: &ExternalToken) -> String { + format!( + "ts_external_token_{}", + self.sanitize_identifier(&token.name) + ) + } + + fn assign_symbol_id(&mut self, symbol: Symbol, used_identifiers: &mut HashSet) { let mut id; if symbol == Symbol::end() { id = "ts_builtin_sym_end".to_string(); } else { let (name, kind) = self.metadata_for_symbol(symbol); id = match kind { - VariableType::Auxiliary => format!("aux_sym_{}", self.sanitize_name(name)), - VariableType::Anonymous => format!("anon_sym_{}", self.sanitize_name(name)), + VariableType::Auxiliary => format!("aux_sym_{}", self.sanitize_identifier(name)), + VariableType::Anonymous => format!("anon_sym_{}", self.sanitize_identifier(name)), VariableType::Hidden | VariableType::Named => { - format!("sym_{}", self.sanitize_name(name)) + format!("sym_{}", self.sanitize_identifier(name)) } }; let mut suffix_number = 1; let mut suffix = String::new(); - while used_ids.contains(&id) { + while used_identifiers.contains(&id) { id.drain(id.len() - suffix.len()..); suffix_number += 1; suffix = suffix_number.to_string(); @@ -149,7 +753,7 @@ impl Generator { } } - used_ids.insert(id.clone()); + used_identifiers.insert(id.clone()); self.symbol_ids.insert(symbol, id); } @@ -171,16 +775,67 @@ impl Generator { } } - fn sanitize_name(&self, name: &str) -> String { - name.to_string() + fn sanitize_identifier(&self, name: &str) -> String { + let mut 
result = String::with_capacity(name.len()); + for c in name.chars() { + if ('a' <= c && c <= 'z') + || ('A' <= c && c <= 'Z') + || ('0' <= c && c <= '9') + || c == '_' + { + result.push(c); + } else { + result += match c { + '~' => "TILDE", + '`' => "BQUOTE", + '!' => "BANG", + '@' => "AT", + '#' => "POUND", + '$' => "DOLLAR", + '%' => "PERCENT", + '^' => "CARET", + '&' => "AMP", + '*' => "STAR", + '(' => "LPAREN", + ')' => "RPAREN", + '-' => "DASH", + '+' => "PLUS", + '=' => "EQ", + '{' => "LBRACE", + '}' => "RBRACE", + '[' => "LBRACK", + ']' => "RBRACK", + '\\' => "BSLASH", + '|' => "PIPE", + ':' => "COLON", + ';' => "SEMI", + '"' => "DQUOTE", + '\'' => "SQUOTE", + '<' => "LT", + '>' => "GT", + ',' => "COMMA", + '.' => "DOT", + '?' => "QMARK", + '/' => "SLASH", + '\n' => "LF", + '\r' => "CR", + '\t' => "TAB", + _ => continue, + } + } + } + result } - fn indent(&mut self) { - self.indent_level += 1; - } - - fn dedent(&mut self) { - self.indent_level -= 1; + fn sanitize_string(&self, name: &str) -> String { + let mut result = String::with_capacity(name.len()); + for c in name.chars() { + if ['\\', '\n', '\r', '\"'].contains(&c) { + result.push('\\'); + } + result.push(c); + } + result } } @@ -206,9 +861,9 @@ pub(crate) fn render_c_code( lexical_grammar, simple_aliases, symbol_ids: HashMap::new(), - parse_table_entries: Vec::new(), - next_parse_action_list_index: 0, - unique_aliases: HashSet::new(), + alias_ids: HashMap::new(), + external_scanner_states: Vec::new(), + alias_map: HashMap::new(), } .generate() } diff --git a/src/tables.rs b/src/tables.rs index 9100b81e..01cecb49 100644 --- a/src/tables.rs +++ b/src/tables.rs @@ -1,6 +1,7 @@ use std::collections::HashMap; use std::ops::Range; use crate::rules::{Associativity, Symbol, Alias}; +use crate::nfa::CharacterSet; pub(crate) type AliasSequenceId = usize; pub(crate) type ParseStateId = usize; @@ -34,7 +35,8 @@ pub(crate) struct ParseTableEntry { #[derive(Clone, Debug, PartialEq, Eq)] pub(crate) struct ParseState 
{ pub terminal_entries: HashMap, - pub nonterminal_entries: HashMap + pub nonterminal_entries: HashMap, + pub lex_state_id: usize, } #[derive(Debug, PartialEq, Eq)] @@ -60,7 +62,7 @@ pub(crate) struct AcceptTokenAction { #[derive(Clone, Debug, PartialEq, Eq)] pub(crate) struct LexState { - pub advance_actions: HashMap, + pub advance_actions: HashMap, pub accept_action: Option, } @@ -78,6 +80,12 @@ impl ParseTableEntry { } } +impl Default for LexTable { + fn default() -> Self { + LexTable { states: Vec::new() } + } +} + impl ParseAction { pub fn precedence(&self) -> i32 { if let ParseAction::Reduce { precedence, .. } = self { From 479400e5d3e7fdc1395868c0f19fe6415cb68bda Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Sat, 29 Dec 2018 13:56:00 -0800 Subject: [PATCH 077/208] Add handling of precedence within tokens --- src/nfa.rs | 366 +++++++++++++++++- src/prepare_grammar/expand_tokens.rs | 557 +++++++++++++++------------ src/prepare_grammar/mod.rs | 14 +- 3 files changed, 670 insertions(+), 267 deletions(-) diff --git a/src/nfa.rs b/src/nfa.rs index f6acb67a..4a4fa17b 100644 --- a/src/nfa.rs +++ b/src/nfa.rs @@ -1,5 +1,8 @@ -use std::fmt; use std::char; +use std::cmp::max; +use std::cmp::Ordering; +use std::fmt; +use std::mem::swap; #[derive(Clone, Debug, PartialEq, Eq, Hash)] pub enum CharacterSet { @@ -13,14 +16,18 @@ pub enum NfaState { chars: CharacterSet, state_id: u32, is_sep: bool, + precedence: i32, }, Split(u32, u32), - Accept(usize), + Accept { + variable_index: usize, + precedence: i32, + }, } #[derive(PartialEq, Eq)] pub struct Nfa { - pub states: Vec + pub states: Vec, } impl Default for Nfa { @@ -78,14 +85,57 @@ impl CharacterSet { } } - pub fn add(self, other: CharacterSet) -> Self { - if let (CharacterSet::Include(mut chars), CharacterSet::Include(other_chars)) = (self, other) { - chars.extend(other_chars); - chars.sort_unstable(); - chars.dedup(); - CharacterSet::Include(chars) + pub fn add(self, other: &CharacterSet) -> Self { + if let 
CharacterSet::Include(other_chars) = other { + if let CharacterSet::Include(mut chars) = self { + chars.extend(other_chars); + chars.sort_unstable(); + chars.dedup(); + return CharacterSet::Include(chars); + } + } + panic!("Called add with a negated character set"); + } + + pub fn remove_intersection(&mut self, other: &mut CharacterSet) -> CharacterSet { + match self { + CharacterSet::Include(chars) => match other { + CharacterSet::Include(other_chars) => { + CharacterSet::Include(remove_chars(chars, other_chars, true)) + } + CharacterSet::Exclude(other_chars) => { + let mut removed = remove_chars(chars, other_chars, false); + add_chars(other_chars, chars); + swap(&mut removed, chars); + CharacterSet::Include(removed) + } + }, + CharacterSet::Exclude(chars) => match other { + CharacterSet::Include(other_chars) => { + let mut removed = remove_chars(other_chars, chars, false); + add_chars(chars, other_chars); + swap(&mut removed, other_chars); + CharacterSet::Include(removed) + } + CharacterSet::Exclude(other_chars) => { + let removed = remove_chars(chars, other_chars, true); + let mut included_characters = Vec::new(); + let mut other_included_characters = Vec::new(); + swap(&mut included_characters, other_chars); + swap(&mut other_included_characters, chars); + *self = CharacterSet::Include(included_characters); + *other = CharacterSet::Include(other_included_characters); + CharacterSet::Exclude(removed) + } + }, + } + } + + pub fn is_empty(&self) -> bool { + if let CharacterSet::Include(c) = self { + c.is_empty() } else { - panic!("Called add with a negated character set"); + false } } @@ -97,6 +147,84 @@ impl CharacterSet { } } +impl Ord for CharacterSet { + fn cmp(&self, other: &CharacterSet) -> Ordering { + match self { + CharacterSet::Include(chars) => { + if let CharacterSet::Include(other_chars) = other { + compare_chars(chars, other_chars) + } else { + Ordering::Less + } + } + CharacterSet::Exclude(chars) => { + if let CharacterSet::Exclude(other_chars) = 
other { + compare_chars(chars, other_chars) + } else { + Ordering::Greater + } + } + } + } +} + +impl PartialOrd for CharacterSet { + fn partial_cmp(&self, other: &CharacterSet) -> Option { + Some(self.cmp(other)) + } +} + +fn add_chars(left: &mut Vec, right: &Vec) { + for c in right { + match left.binary_search(c) { + Err(i) => left.insert(i, *c), + _ => {} + } + } +} + +fn remove_chars(left: &mut Vec, right: &mut Vec, mutate_right: bool) -> Vec { + let mut result = Vec::new(); + right.retain(|right_char| { + if let Some(index) = left.iter().position(|left_char| *left_char == *right_char) { + left.remove(index); + result.push(*right_char); + false || !mutate_right + } else { + true + } + }); + result +} + +fn compare_chars(chars: &Vec, other_chars: &Vec) -> Ordering { + if chars.is_empty() { + if other_chars.is_empty() { + Ordering::Equal + } else { + Ordering::Less + } + } else if other_chars.is_empty() { + Ordering::Greater + } else { + let mut other_c = other_chars.iter(); + for c in chars.iter() { + if let Some(other_c) = other_c.next() { + let cmp = c.cmp(other_c); + if cmp != Ordering::Equal { + return cmp; + } + } else { + return Ordering::Greater; + } + } + if other_c.next().is_some() { + return Ordering::Less; + } + Ordering::Equal + } +} + impl Nfa { pub fn new() -> Self { Nfa { states: Vec::new() } @@ -124,17 +252,32 @@ impl fmt::Debug for Nfa { impl<'a> NfaCursor<'a> { pub fn new(nfa: &'a Nfa, mut states: Vec) -> Self { - let mut result = Self { nfa, state_ids: Vec::new(), in_sep: true }; + let mut result = Self { + nfa, + state_ids: Vec::new(), + in_sep: true, + }; result.add_states(&mut states); result } + pub fn reset(&mut self, mut states: Vec) { + self.state_ids.clear(); + self.add_states(&mut states); + } + pub fn advance(&mut self, c: char) -> bool { let mut result = false; let mut new_state_ids = Vec::new(); let mut any_sep_transitions = false; for current_state_id in &self.state_ids { - if let NfaState::Advance { chars, state_id, is_sep } = 
&self.nfa.states[*current_state_id as usize] { + if let NfaState::Advance { + chars, + state_id, + is_sep, + .. + } = &self.nfa.states[*current_state_id as usize] + { if chars.contains(c) { if *is_sep { any_sep_transitions = true; @@ -152,16 +295,68 @@ impl<'a> NfaCursor<'a> { result } - pub fn finished_id(&self) -> Option { + pub fn successors(&self) -> impl Iterator { + self.state_ids.iter().filter_map(move |id| { + if let NfaState::Advance { + chars, + state_id, + precedence, + .. + } = &self.nfa.states[*id as usize] + { + Some((chars, *precedence, *state_id)) + } else { + None + } + }) + } + + pub fn grouped_successors(&self) -> Vec<(CharacterSet, i32, Vec)> { + Self::group_successors(self.successors()) + } + + fn group_successors<'b>( + iter: impl Iterator, + ) -> Vec<(CharacterSet, i32, Vec)> { + let mut result: Vec<(CharacterSet, i32, Vec)> = Vec::new(); + for (chars, prec, state) in iter { + let mut chars = chars.clone(); + let mut i = 0; + while i < result.len() { + let intersection = result[i].0.remove_intersection(&mut chars); + if !intersection.is_empty() { + let mut states = result[i].2.clone(); + let mut precedence = result[i].1; + states.push(state); + result.insert(i, (intersection, max(precedence, prec), states)); + i += 1; + } + i += 1; + } + if !chars.is_empty() { + result.push((chars, prec, vec![state])); + } + } + result.sort_unstable_by(|a, b| a.0.cmp(&b.0)); + result + } + + pub fn finished_id(&self) -> Option<(usize, i32)> { let mut result = None; for state_id in self.state_ids.iter() { - if let NfaState::Accept(id) = self.nfa.states[*state_id as usize] { + if let NfaState::Accept { + variable_index, + precedence, + } = self.nfa.states[*state_id as usize] + { match result { - None => { - result = Some(id) - }, - Some(existing_id) => if id < existing_id { - result = Some(id) + None => result = Some((variable_index, precedence)), + Some((existing_id, existing_precedence)) => { + if precedence > existing_precedence + || (precedence == 
existing_precedence && variable_index < existing_id) + { + result = Some((variable_index, precedence)) + } } } } @@ -202,3 +397,136 @@ impl<'a> NfaCursor<'a> { } } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_group_successors() { + let table = [ + ( + vec![ + (CharacterSet::empty().add_range('a', 'f'), 0, 1), + (CharacterSet::empty().add_range('d', 'i'), 1, 2), + ], + vec![ + (CharacterSet::empty().add_range('a', 'c'), 0, vec![1]), + (CharacterSet::empty().add_range('d', 'f'), 1, vec![1, 2]), + (CharacterSet::empty().add_range('g', 'i'), 1, vec![2]), + ], + ), + ( + vec![ + (CharacterSet::empty().add_range('a', 'z'), 0, 1), + (CharacterSet::empty().add_char('d'), 0, 2), + (CharacterSet::empty().add_char('i'), 0, 3), + (CharacterSet::empty().add_char('f'), 0, 4), + ], + vec![ + ( + CharacterSet::empty() + .add_range('a', 'c') + .add_char('e') + .add_range('g', 'h') + .add_range('j', 'z'), + 0, + vec![1], + ), + (CharacterSet::empty().add_char('d'), 0, vec![1, 2]), + (CharacterSet::empty().add_char('f'), 0, vec![1, 4]), + (CharacterSet::empty().add_char('i'), 0, vec![1, 3]), + ], + ), + ]; + + for row in table.iter() { + assert_eq!( + NfaCursor::group_successors(row.0.iter().map(|(c, p, s)| (c, *p, *s))), + row.1 + ); + } + + // let successors = NfaCursor::group_successors( + // [ + // (&CharacterSet::empty().add_range('a', 'f'), 1), + // (&CharacterSet::empty().add_range('d', 'i'), 2), + // ] + // .iter() + // .cloned(), + // ); + // + // assert_eq!( + // successors, + // vec![ + // (CharacterSet::empty().add_range('a', 'c'), vec![1],), + // (CharacterSet::empty().add_range('d', 'f'), vec![1, 2],), + // (CharacterSet::empty().add_range('g', 'i'), vec![2],), + // ] + // ); + } + + #[test] + fn test_character_set_intersection() { + // whitelist - whitelist + // both sets contain 'c', 'd', and 'f' + let mut a = CharacterSet::empty().add_range('a', 'f'); + let mut b = CharacterSet::empty().add_range('c', 'h'); + assert_eq!( + 
a.remove_intersection(&mut b), + CharacterSet::empty().add_range('c', 'f') + ); + assert_eq!(a, CharacterSet::empty().add_range('a', 'b')); + assert_eq!(b, CharacterSet::empty().add_range('g', 'h')); + + let mut a = CharacterSet::empty().add_range('a', 'f'); + let mut b = CharacterSet::empty().add_range('c', 'h'); + assert_eq!( + b.remove_intersection(&mut a), + CharacterSet::empty().add_range('c', 'f') + ); + assert_eq!(a, CharacterSet::empty().add_range('a', 'b')); + assert_eq!(b, CharacterSet::empty().add_range('g', 'h')); + + // whitelist - blacklist + // both sets contain 'e', 'f', and 'm' + let mut a = CharacterSet::empty() + .add_range('c', 'h') + .add_range('k', 'm'); + let mut b = CharacterSet::empty() + .add_range('a', 'd') + .add_range('g', 'l') + .negate(); + assert_eq!( + a.remove_intersection(&mut b), + CharacterSet::Include(vec!['e', 'f', 'm']) + ); + assert_eq!(a, CharacterSet::Include(vec!['c', 'd', 'g', 'h', 'k', 'l'])); + assert_eq!(b, CharacterSet::empty().add_range('a', 'm').negate()); + + let mut a = CharacterSet::empty() + .add_range('c', 'h') + .add_range('k', 'm'); + let mut b = CharacterSet::empty() + .add_range('a', 'd') + .add_range('g', 'l') + .negate(); + assert_eq!( + b.remove_intersection(&mut a), + CharacterSet::Include(vec!['e', 'f', 'm']) + ); + assert_eq!(a, CharacterSet::Include(vec!['c', 'd', 'g', 'h', 'k', 'l'])); + assert_eq!(b, CharacterSet::empty().add_range('a', 'm').negate()); + + // blacklist - blacklist + // both sets exclude 'c', 'd', and 'e' + let mut a = CharacterSet::empty().add_range('a', 'e').negate(); + let mut b = CharacterSet::empty().add_range('c', 'h').negate(); + assert_eq!( + a.remove_intersection(&mut b), + CharacterSet::Exclude(vec!['c', 'd', 'e']) + ); + assert_eq!(a, CharacterSet::Include(vec!['f', 'g', 'h'])); + assert_eq!(b, CharacterSet::Include(vec!['a', 'b'])); + } +} diff --git a/src/prepare_grammar/expand_tokens.rs b/src/prepare_grammar/expand_tokens.rs index 5ee9861f..b0d2ae04 100644 --- 
a/src/prepare_grammar/expand_tokens.rs +++ b/src/prepare_grammar/expand_tokens.rs @@ -7,8 +7,18 @@ use regex_syntax::ast::{ parse, Ast, Class, ClassPerlKind, ClassSet, ClassSetItem, RepetitionKind, RepetitionRange, }; -pub(super) fn expand_tokens(mut grammar: ExtractedLexicalGrammar) -> Result { - let mut nfa = Nfa::new(); +struct NfaBuilder { + nfa: Nfa, + is_sep: bool, + precedence_stack: Vec, +} + +pub(crate) fn expand_tokens(mut grammar: ExtractedLexicalGrammar) -> Result { + let mut builder = NfaBuilder { + nfa: Nfa::new(), + is_sep: true, + precedence_stack: vec![0], + }; let separator_rule = if grammar.separators.len() > 0 { grammar.separators.push(Rule::Blank); @@ -24,281 +34,325 @@ pub(super) fn expand_tokens(mut grammar: ExtractedLexicalGrammar) -> Result false, }; - nfa.states.push(NfaState::Accept(i)); - let last_state_id = nfa.last_state_id(); - expand_rule(&variable.rule, &mut nfa, last_state_id, false).map_err(|e| match e { - Error::RegexError(msg) => Error::RegexError(format!("Rule {} {}", variable.name, msg)), - _ => e, - })?; + builder.is_sep = false; + builder.nfa.states.push(NfaState::Accept { + variable_index: i, + precedence: 0, + }); + let last_state_id = builder.nfa.last_state_id(); + builder + .expand_rule(&variable.rule, last_state_id) + .map_err(|e| match e { + Error::RegexError(msg) => { + Error::RegexError(format!("Rule {} {}", variable.name, msg)) + } + _ => e, + })?; if !is_immediate_token { - let last_state_id = nfa.last_state_id(); - expand_rule(&separator_rule, &mut nfa, last_state_id, true)?; + builder.is_sep = true; + let last_state_id = builder.nfa.last_state_id(); + builder.expand_rule(&separator_rule, last_state_id)?; } variables.push(LexicalVariable { name: variable.name, kind: variable.kind, - start_state: nfa.last_state_id(), + start_state: builder.nfa.last_state_id(), }); } - Ok(LexicalGrammar { nfa, variables }) + Ok(LexicalGrammar { + nfa: builder.nfa, + variables, + }) } -fn expand_rule(rule: &Rule, nfa: &mut Nfa, mut 
next_state_id: u32, is_sep: bool) -> Result { - match rule { - Rule::Pattern(s) => { - let ast = parse::Parser::new() - .parse(&s) - .map_err(|e| Error::GrammarError(e.to_string()))?; - expand_regex(&ast, nfa, next_state_id, is_sep) - } - Rule::String(s) => { - for c in s.chars().rev() { - nfa.prepend(|last_state_id| NfaState::Advance { - chars: CharacterSet::empty().add_char(c), - state_id: last_state_id, - is_sep, - }); +impl NfaBuilder { + fn expand_rule(&mut self, rule: &Rule, mut next_state_id: u32) -> Result { + match rule { + Rule::Pattern(s) => { + let ast = parse::Parser::new() + .parse(&s) + .map_err(|e| Error::GrammarError(e.to_string()))?; + self.expand_regex(&ast, next_state_id) } - Ok(s.len() > 0) - } - Rule::Choice(elements) => { - let mut alternative_state_ids = Vec::new(); - for element in elements { - if expand_rule(element, nfa, next_state_id, is_sep)? { - alternative_state_ids.push(nfa.last_state_id()); - } else { - alternative_state_ids.push(next_state_id); + Rule::String(s) => { + for c in s.chars().rev() { + self.push_advance(CharacterSet::empty().add_char(c), self.nfa.last_state_id()); } + Ok(s.len() > 0) } - alternative_state_ids.retain(|i| *i != nfa.last_state_id()); - for alternative_state_id in alternative_state_ids { - nfa.prepend(|last_state_id| NfaState::Split(last_state_id, alternative_state_id)); - } - Ok(true) - } - Rule::Seq(elements) => { - let mut result = false; - for element in elements.into_iter().rev() { - if expand_rule(element, nfa, next_state_id, is_sep)? { - result = true; + Rule::Choice(elements) => { + let mut alternative_state_ids = Vec::new(); + for element in elements { + if self.expand_rule(element, next_state_id)? 
{ + alternative_state_ids.push(self.nfa.last_state_id()); + } else { + alternative_state_ids.push(next_state_id); + } } - next_state_id = nfa.last_state_id(); - } - Ok(result) - } - Rule::Repeat(rule) => { - nfa.states.push(NfaState::Accept(0)); // Placeholder for split - let split_state_id = nfa.last_state_id(); - if expand_rule(rule, nfa, split_state_id, is_sep)? { - nfa.states[split_state_id as usize] = - NfaState::Split(nfa.last_state_id(), next_state_id); - Ok(true) - } else { - Ok(false) - } - } - Rule::Metadata { rule, .. } => { - // TODO - implement precedence - expand_rule(rule, nfa, next_state_id, is_sep) - } - Rule::Blank => Ok(false), - _ => Err(Error::grammar(&format!("Unexpected rule {:?}", rule))), - } -} - -fn expand_one_or_more(ast: &Ast, nfa: &mut Nfa, next_state_id: u32, is_sep: bool) -> Result { - nfa.states.push(NfaState::Accept(0)); // Placeholder for split - let split_state_id = nfa.last_state_id(); - if expand_regex(&ast, nfa, split_state_id, is_sep)? { - nfa.states[split_state_id as usize] = NfaState::Split(nfa.last_state_id(), next_state_id); - Ok(true) - } else { - nfa.states.pop(); - Ok(false) - } -} - -fn expand_zero_or_one(ast: &Ast, nfa: &mut Nfa, next_state_id: u32, is_sep: bool) -> Result { - if expand_regex(ast, nfa, next_state_id, is_sep)? { - nfa.prepend(|last_state_id| NfaState::Split(next_state_id, last_state_id)); - Ok(true) - } else { - Ok(false) - } -} - -fn expand_zero_or_more(ast: &Ast, nfa: &mut Nfa, next_state_id: u32, is_sep: bool) -> Result { - if expand_one_or_more(&ast, nfa, next_state_id, is_sep)? { - nfa.prepend(|last_state_id| NfaState::Split(last_state_id, next_state_id)); - Ok(true) - } else { - Ok(false) - } -} - -fn expand_count( - ast: &Ast, - count: u32, - nfa: &mut Nfa, - mut next_state_id: u32, - is_sep: bool, -) -> Result { - let mut result = false; - for _ in 0..count { - if expand_regex(ast, nfa, next_state_id, is_sep)? 
{ - result = true; - next_state_id = nfa.last_state_id(); - } - } - Ok(result) -} - -fn expand_regex(ast: &Ast, nfa: &mut Nfa, mut next_state_id: u32, is_sep: bool) -> Result { - match ast { - Ast::Empty(_) => Ok(false), - Ast::Flags(_) => Err(Error::regex("Flags are not supported")), - Ast::Literal(literal) => { - nfa.states.push(NfaState::Advance { - chars: CharacterSet::Include(vec![literal.c]), - state_id: next_state_id, - is_sep, - }); - Ok(true) - } - Ast::Dot(_) => { - nfa.states.push(NfaState::Advance { - chars: CharacterSet::Exclude(vec!['\n']), - state_id: next_state_id, - is_sep, - }); - Ok(true) - } - Ast::Assertion(_) => Err(Error::regex("Assertions are not supported")), - Ast::Class(class) => match class { - Class::Unicode(_) => Err(Error::regex("Unicode character classes are not supported")), - Class::Perl(class) => { - nfa.states.push(NfaState::Advance { - chars: expand_perl_character_class(&class.kind), - state_id: next_state_id, - is_sep, - }); - Ok(true) - } - Class::Bracketed(class) => match &class.kind { - ClassSet::Item(item) => { - let character_set = expand_character_class(&item)?; - nfa.states.push(NfaState::Advance { - chars: character_set, - state_id: next_state_id, - is_sep, + alternative_state_ids.retain(|i| *i != self.nfa.last_state_id()); + for alternative_state_id in alternative_state_ids { + self.nfa.prepend(|last_state_id| { + NfaState::Split(last_state_id, alternative_state_id) }); - Ok(true) } - ClassSet::BinaryOp(_) => Err(Error::regex( - "Binary operators in character classes aren't supported", - )), - }, - }, - Ast::Repetition(repetition) => match repetition.op.kind { - RepetitionKind::ZeroOrOne => { - expand_zero_or_one(&repetition.ast, nfa, next_state_id, is_sep) + Ok(true) } - RepetitionKind::OneOrMore => { - expand_one_or_more(&repetition.ast, nfa, next_state_id, is_sep) + Rule::Seq(elements) => { + let mut result = false; + for element in elements.into_iter().rev() { + if self.expand_rule(element, next_state_id)? 
{ + result = true; + } + next_state_id = self.nfa.last_state_id(); + } + Ok(result) } - RepetitionKind::ZeroOrMore => { - expand_zero_or_more(&repetition.ast, nfa, next_state_id, is_sep) - } - RepetitionKind::Range(RepetitionRange::Exactly(count)) => { - expand_count(&repetition.ast, count, nfa, next_state_id, is_sep) - } - RepetitionKind::Range(RepetitionRange::AtLeast(min)) => { - if expand_zero_or_more(&repetition.ast, nfa, next_state_id, is_sep)? { - expand_count(&repetition.ast, min, nfa, next_state_id, is_sep) + Rule::Repeat(rule) => { + self.nfa.states.push(NfaState::Accept { + variable_index: 0, + precedence: 0, + }); // Placeholder for split + let split_state_id = self.nfa.last_state_id(); + if self.expand_rule(rule, split_state_id)? { + self.nfa.states[split_state_id as usize] = + NfaState::Split(self.nfa.last_state_id(), next_state_id); + Ok(true) } else { Ok(false) } } - RepetitionKind::Range(RepetitionRange::Bounded(min, max)) => { - let mut result = expand_count(&repetition.ast, min, nfa, next_state_id, is_sep)?; - for _ in min..max { - if result { - next_state_id = nfa.last_state_id(); + Rule::Metadata { rule, params } => { + if let Some(precedence) = params.precedence { + self.precedence_stack.push(precedence); + } + let result = self.expand_rule(rule, next_state_id); + if params.precedence.is_some() { + self.precedence_stack.pop(); + } + result + } + Rule::Blank => Ok(false), + _ => Err(Error::grammar(&format!("Unexpected rule {:?}", rule))), + } + } + + fn expand_regex(&mut self, ast: &Ast, mut next_state_id: u32) -> Result { + match ast { + Ast::Empty(_) => Ok(false), + Ast::Flags(_) => Err(Error::regex("Flags are not supported")), + Ast::Literal(literal) => { + self.push_advance(CharacterSet::Include(vec![literal.c]), next_state_id); + Ok(true) + } + Ast::Dot(_) => { + self.push_advance(CharacterSet::Exclude(vec!['\n']), next_state_id); + Ok(true) + } + Ast::Assertion(_) => Err(Error::regex("Assertions are not supported")), + Ast::Class(class) 
=> match class { + Class::Unicode(_) => { + Err(Error::regex("Unicode character classes are not supported")) + } + Class::Perl(class) => { + self.push_advance(self.expand_perl_character_class(&class.kind), next_state_id); + Ok(true) + } + Class::Bracketed(class) => match &class.kind { + ClassSet::Item(item) => { + self.push_advance(self.expand_character_class(&item)?, next_state_id); + Ok(true) } - if expand_zero_or_one(&repetition.ast, nfa, next_state_id, is_sep)? { + ClassSet::BinaryOp(_) => Err(Error::regex( + "Binary operators in character classes aren't supported", + )), + }, + }, + Ast::Repetition(repetition) => match repetition.op.kind { + RepetitionKind::ZeroOrOne => { + self.expand_zero_or_one(&repetition.ast, next_state_id) + } + RepetitionKind::OneOrMore => { + self.expand_one_or_more(&repetition.ast, next_state_id) + } + RepetitionKind::ZeroOrMore => { + self.expand_zero_or_more(&repetition.ast, next_state_id) + } + RepetitionKind::Range(RepetitionRange::Exactly(count)) => { + self.expand_count(&repetition.ast, count, next_state_id) + } + RepetitionKind::Range(RepetitionRange::AtLeast(min)) => { + if self.expand_zero_or_more(&repetition.ast, next_state_id)? { + self.expand_count(&repetition.ast, min, next_state_id) + } else { + Ok(false) + } + } + RepetitionKind::Range(RepetitionRange::Bounded(min, max)) => { + let mut result = self.expand_count(&repetition.ast, min, next_state_id)?; + for _ in min..max { + if result { + next_state_id = self.nfa.last_state_id(); + } + if self.expand_zero_or_one(&repetition.ast, next_state_id)? { + result = true; + } + } + Ok(result) + } + }, + Ast::Group(group) => self.expand_regex(&group.ast, self.nfa.last_state_id()), + Ast::Alternation(alternation) => { + let mut alternative_state_ids = Vec::new(); + for ast in alternation.asts.iter() { + if self.expand_regex(&ast, next_state_id)? 
{ + alternative_state_ids.push(self.nfa.last_state_id()); + } else { + alternative_state_ids.push(next_state_id); + } + } + alternative_state_ids.sort_unstable(); + alternative_state_ids.dedup(); + alternative_state_ids.retain(|i| *i != self.nfa.last_state_id()); + + for alternative_state_id in alternative_state_ids { + self.nfa.prepend(|last_state_id| { + NfaState::Split(last_state_id, alternative_state_id) + }); + } + Ok(true) + } + Ast::Concat(concat) => { + let mut result = false; + for ast in concat.asts.iter().rev() { + if self.expand_regex(&ast, next_state_id)? { result = true; + next_state_id = self.nfa.last_state_id(); } } Ok(result) } - }, - Ast::Group(group) => expand_regex(&group.ast, nfa, nfa.last_state_id(), is_sep), - Ast::Alternation(alternation) => { - let mut alternative_state_ids = Vec::new(); - for ast in alternation.asts.iter() { - if expand_regex(&ast, nfa, next_state_id, is_sep)? { - alternative_state_ids.push(nfa.last_state_id()); - } else { - alternative_state_ids.push(next_state_id); - } - } - alternative_state_ids.retain(|i| *i != nfa.last_state_id()); - for alternative_state_id in alternative_state_ids { - nfa.prepend(|last_state_id| NfaState::Split(last_state_id, alternative_state_id)); - } + } + } + + fn expand_one_or_more(&mut self, ast: &Ast, next_state_id: u32) -> Result { + self.nfa.states.push(NfaState::Accept { + variable_index: 0, + precedence: 0, + }); // Placeholder for split + let split_state_id = self.nfa.last_state_id(); + if self.expand_regex(&ast, split_state_id)? { + self.nfa.states[split_state_id as usize] = + NfaState::Split(self.nfa.last_state_id(), next_state_id); Ok(true) + } else { + self.nfa.states.pop(); + Ok(false) } - Ast::Concat(concat) => { - let mut result = false; - for ast in concat.asts.iter().rev() { - if expand_regex(&ast, nfa, next_state_id, is_sep)? 
{ - result = true; - next_state_id = nfa.last_state_id(); + } + + fn expand_zero_or_one(&mut self, ast: &Ast, next_state_id: u32) -> Result { + if self.expand_regex(ast, next_state_id)? { + self.nfa + .prepend(|last_state_id| NfaState::Split(next_state_id, last_state_id)); + Ok(true) + } else { + Ok(false) + } + } + + fn expand_zero_or_more(&mut self, ast: &Ast, next_state_id: u32) -> Result { + if self.expand_one_or_more(&ast, next_state_id)? { + self.nfa + .prepend(|last_state_id| NfaState::Split(last_state_id, next_state_id)); + Ok(true) + } else { + Ok(false) + } + } + + fn expand_count(&mut self, ast: &Ast, count: u32, mut next_state_id: u32) -> Result { + let mut result = false; + for _ in 0..count { + if self.expand_regex(ast, next_state_id)? { + result = true; + next_state_id = self.nfa.last_state_id(); + } + } + Ok(result) + } + + fn expand_character_class(&self, item: &ClassSetItem) -> Result { + match item { + ClassSetItem::Empty(_) => Ok(CharacterSet::Include(Vec::new())), + ClassSetItem::Literal(literal) => Ok(CharacterSet::Include(vec![literal.c])), + ClassSetItem::Range(range) => { + Ok(CharacterSet::empty().add_range(range.start.c, range.end.c)) + } + ClassSetItem::Union(union) => { + let mut result = CharacterSet::empty(); + for item in &union.items { + result = result.add(&self.expand_character_class(&item)?); } + Ok(result) } - Ok(result) + ClassSetItem::Perl(class) => Ok(self.expand_perl_character_class(&class.kind)), + _ => Err(Error::regex(&format!( + "Unsupported character class syntax {:?}", + item + ))), } } -} -fn expand_character_class(item: &ClassSetItem) -> Result { - match item { - ClassSetItem::Empty(_) => Ok(CharacterSet::Include(Vec::new())), - ClassSetItem::Literal(literal) => Ok(CharacterSet::Include(vec![literal.c])), - ClassSetItem::Range(range) => { - Ok(CharacterSet::empty().add_range(range.start.c, range.end.c)) + fn expand_perl_character_class(&self, item: &ClassPerlKind) -> CharacterSet { + match item { + 
ClassPerlKind::Digit => CharacterSet::empty().add_range('0', '9'), + ClassPerlKind::Space => CharacterSet::empty() + .add_char(' ') + .add_char('\t') + .add_char('\r') + .add_char('\n'), + ClassPerlKind::Word => CharacterSet::empty() + .add_char('_') + .add_range('A', 'Z') + .add_range('a', 'z') + .add_range('0', '9'), } - ClassSetItem::Union(union) => { - let mut result = CharacterSet::empty(); - for item in &union.items { - result = result.add(expand_character_class(&item)?); - } - Ok(result) - } - ClassSetItem::Perl(class) => Ok(expand_perl_character_class(&class.kind)), - _ => Err(Error::regex(&format!( - "Unsupported character class syntax {:?}", - item - ))), } -} -fn expand_perl_character_class(item: &ClassPerlKind) -> CharacterSet { - match item { - ClassPerlKind::Digit => CharacterSet::empty().add_range('0', '9'), - ClassPerlKind::Space => CharacterSet::empty() - .add_char(' ') - .add_char('\t') - .add_char('\r') - .add_char('\n'), - ClassPerlKind::Word => CharacterSet::empty() - .add_char('_') - .add_range('A', 'Z') - .add_range('a', 'z') - .add_range('0', '9'), + fn push_advance(&mut self, chars: CharacterSet, state_id: u32) { + let precedence = *self.precedence_stack.last().unwrap(); + self.add_precedence(precedence, vec![state_id]); + self.nfa.states.push(NfaState::Advance { + chars, + state_id, + precedence, + is_sep: self.is_sep, + }); + } + + fn add_precedence(&mut self, prec: i32, mut state_ids: Vec) { + let mut i = 0; + while i < state_ids.len() { + let state_id = state_ids[i]; + let (left, right) = match &mut self.nfa.states[state_id as usize] { + NfaState::Accept {precedence, ..} => { + *precedence = prec; + return; + }, + NfaState::Split(left, right) => (*left, *right), + _ => return + }; + if !state_ids.contains(&left) { + state_ids.push(left); + } + if !state_ids.contains(&right) { + state_ids.push(right); + } + i += 1; + } } } @@ -313,11 +367,15 @@ mod tests { let mut cursor = NfaCursor::new(&grammar.nfa, start_states); let mut result = 
None; + let mut result_precedence = 0; let mut start_char = 0; let mut end_char = 0; for c in s.chars() { - if let Some(id) = cursor.finished_id() { - result = Some((id, &s[start_char..end_char])); + if let Some((id, finished_precedence)) = cursor.finished_id() { + if result.is_none() || result_precedence <= finished_precedence { + result = Some((id, &s[start_char..end_char])); + result_precedence = finished_precedence; + } } if cursor.advance(c) { end_char += 1; @@ -329,8 +387,11 @@ mod tests { } } - if let Some(id) = cursor.finished_id() { - result = Some((id, &s[start_char..end_char])); + if let Some((id, finished_precedence)) = cursor.finished_id() { + if result.is_none() || result_precedence <= finished_precedence { + result = Some((id, &s[start_char..end_char])); + result_precedence = finished_precedence; + } } result @@ -443,6 +504,20 @@ mod tests { (" \\\na", Some((0, "a"))), ], }, + // shorter tokens with higher precedence + Row { + rules: vec![ + Rule::prec(2, Rule::pattern("abc")), + Rule::prec(1, Rule::pattern("ab[cd]e")), + Rule::pattern("[a-e]+"), + ], + separators: vec![Rule::string("\\\n"), Rule::pattern("\\s")], + examples: vec![ + ("abceef", Some((0, "abc"))), + ("abdeef", Some((1, "abde"))), + ("aeeeef", Some((2, "aeeee"))), + ], + }, ]; for Row { diff --git a/src/prepare_grammar/mod.rs b/src/prepare_grammar/mod.rs index f325383b..b0c1d2a3 100644 --- a/src/prepare_grammar/mod.rs +++ b/src/prepare_grammar/mod.rs @@ -7,7 +7,7 @@ mod intern_symbols; mod process_inlines; use self::expand_repeats::expand_repeats; -use self::expand_tokens::expand_tokens; +pub(crate) use self::expand_tokens::expand_tokens; use self::extract_simple_aliases::extract_simple_aliases; use self::extract_tokens::extract_tokens; use self::flatten_grammar::flatten_grammar; @@ -19,7 +19,7 @@ use crate::grammars::{ }; use crate::rules::{AliasMap, Rule, Symbol}; -pub(self) struct IntermediateGrammar { +pub(crate) struct IntermediateGrammar { variables: Vec, extra_tokens: Vec, 
expected_conflicts: Vec>, @@ -28,14 +28,14 @@ pub(self) struct IntermediateGrammar { word_token: Option, } -pub(self) type InternedGrammar = IntermediateGrammar; +pub(crate) type InternedGrammar = IntermediateGrammar; -pub(self) type ExtractedSyntaxGrammar = IntermediateGrammar; +pub(crate) type ExtractedSyntaxGrammar = IntermediateGrammar; #[derive(Debug, PartialEq, Eq)] -pub(self) struct ExtractedLexicalGrammar { - variables: Vec, - separators: Vec, +pub(crate) struct ExtractedLexicalGrammar { + pub variables: Vec, + pub separators: Vec, } pub(crate) fn prepare_grammar( From 605b50e58bf03661774ce7eb18f3b98dbd767ce3 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Sat, 29 Dec 2018 13:57:34 -0800 Subject: [PATCH 078/208] Start work on shrinking parse table --- src/build_tables/build_parse_table.rs | 605 ++++++++++++++++++++++++ src/build_tables/mod.rs | 630 +------------------------ src/build_tables/shrink_parse_table.rs | 117 +++++ src/build_tables/token_conflict_map.rs | 77 +++ src/tables.rs | 56 ++- 5 files changed, 866 insertions(+), 619 deletions(-) create mode 100644 src/build_tables/build_parse_table.rs create mode 100644 src/build_tables/shrink_parse_table.rs create mode 100644 src/build_tables/token_conflict_map.rs diff --git a/src/build_tables/build_parse_table.rs b/src/build_tables/build_parse_table.rs new file mode 100644 index 00000000..5087c55c --- /dev/null +++ b/src/build_tables/build_parse_table.rs @@ -0,0 +1,605 @@ +use super::item::{LookaheadSet, ParseItem, ParseItemSet}; +use super::item_set_builder::ParseItemSetBuilder; +use crate::error::{Error, Result}; +use crate::grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar, VariableType}; +use crate::rules::{Alias, AliasMap, Associativity, Symbol, SymbolType}; +use crate::tables::{ + AliasSequenceId, ParseAction, ParseState, ParseStateId, ParseTable, ParseTableEntry, +}; +use core::ops::Range; +use std::collections::hash_map::Entry; +use std::collections::{HashMap, HashSet, VecDeque}; 
+use std::fmt::Write; + +#[derive(Clone)] +struct AuxiliarySymbolInfo { + auxiliary_symbol: Symbol, + parent_symbols: Vec, +} + +type SymbolSequence = Vec; +type AuxiliarySymbolSequence = Vec; + +struct ParseStateQueueEntry { + preceding_symbols: SymbolSequence, + preceding_auxiliary_symbols: AuxiliarySymbolSequence, + state_id: ParseStateId, +} + +struct ParseTableBuilder<'a> { + item_set_builder: ParseItemSetBuilder<'a>, + syntax_grammar: &'a SyntaxGrammar, + lexical_grammar: &'a LexicalGrammar, + inlines: &'a InlinedProductionMap, + state_ids_by_item_set: HashMap, ParseStateId>, + item_sets_by_state_id: Vec>, + parse_state_queue: VecDeque, + parse_table: ParseTable, +} + +impl<'a> ParseTableBuilder<'a> { + fn build(mut self) -> Result { + // Ensure that the empty alias sequence has index 0. + self.parse_table.alias_sequences.push(Vec::new()); + + // Ensure that the error state has index 0. + let error_state_id = + self.add_parse_state(&Vec::new(), &Vec::new(), ParseItemSet::default()); + + self.add_parse_state( + &Vec::new(), + &Vec::new(), + ParseItemSet::with( + [(ParseItem::start(), LookaheadSet::with(&[Symbol::end()]))] + .iter() + .cloned(), + ), + ); + + self.process_part_state_queue()?; + self.populate_used_symbols(); + Ok(self.parse_table) + } + + fn add_parse_state( + &mut self, + preceding_symbols: &SymbolSequence, + preceding_auxiliary_symbols: &AuxiliarySymbolSequence, + item_set: ParseItemSet<'a>, + ) -> ParseStateId { + match self.state_ids_by_item_set.entry(item_set) { + Entry::Occupied(o) => *o.get(), + Entry::Vacant(v) => { + let state_id = self.parse_table.states.len(); + self.item_sets_by_state_id.push(v.key().clone()); + self.parse_table.states.push(ParseState { + lex_state_id: 0, + terminal_entries: HashMap::new(), + nonterminal_entries: HashMap::new(), + }); + self.parse_state_queue.push_back(ParseStateQueueEntry { + state_id, + preceding_symbols: preceding_symbols.clone(), + preceding_auxiliary_symbols: preceding_auxiliary_symbols.clone(), 
+ }); + v.insert(state_id); + state_id + } + } + } + + fn process_part_state_queue(&mut self) -> Result<()> { + while let Some(entry) = self.parse_state_queue.pop_front() { + let debug = false; + + if debug { + println!( + "ITEM SET {}:\n{}", + entry.state_id, + self.item_sets_by_state_id[entry.state_id] + .display_with(&self.syntax_grammar, &self.lexical_grammar,) + ); + } + + let item_set = self.item_set_builder.transitive_closure( + &self.item_sets_by_state_id[entry.state_id], + self.syntax_grammar, + self.inlines, + ); + + if debug { + println!( + "TRANSITIVE CLOSURE:\n{}", + item_set.display_with(&self.syntax_grammar, &self.lexical_grammar) + ); + } + + self.add_actions( + entry.preceding_symbols, + entry.preceding_auxiliary_symbols, + item_set, + entry.state_id, + )?; + } + Ok(()) + } + + fn add_actions( + &mut self, + mut preceding_symbols: SymbolSequence, + mut preceding_auxiliary_symbols: Vec, + item_set: ParseItemSet<'a>, + state_id: ParseStateId, + ) -> Result<()> { + let mut terminal_successors = HashMap::new(); + let mut non_terminal_successors = HashMap::new(); + let mut lookaheads_with_conflicts = HashSet::new(); + + for (item, lookaheads) in &item_set.entries { + if let Some(next_symbol) = item.symbol() { + let successor = item.successor(); + if next_symbol.is_non_terminal() { + // Keep track of where auxiliary non-terminals (repeat symbols) are + // used within visible symbols. This information may be needed later + // for conflict resolution. 
+ if self.syntax_grammar.variables[next_symbol.index].is_auxiliary() { + preceding_auxiliary_symbols + .push(self.get_auxiliary_node_info(&item_set, next_symbol)); + } + + non_terminal_successors + .entry(next_symbol) + .or_insert_with(|| ParseItemSet::default()) + .entries + .entry(successor) + .or_insert_with(|| LookaheadSet::new()) + .insert_all(lookaheads); + } else { + terminal_successors + .entry(next_symbol) + .or_insert_with(|| ParseItemSet::default()) + .entries + .entry(successor) + .or_insert_with(|| LookaheadSet::new()) + .insert_all(lookaheads); + } + } else { + let action = if item.is_augmented() { + ParseAction::Accept + } else { + ParseAction::Reduce { + symbol: Symbol::non_terminal(item.variable_index as usize), + child_count: item.step_index as usize, + precedence: item.precedence(), + associativity: item.associativity(), + dynamic_precedence: item.production.dynamic_precedence, + alias_sequence_id: self.get_alias_sequence_id(item), + } + }; + + for lookahead in lookaheads.iter() { + let entry = self.parse_table.states[state_id] + .terminal_entries + .entry(lookahead); + let entry = entry.or_insert_with(|| ParseTableEntry::new()); + if entry.actions.is_empty() { + entry.actions.push(action); + } else if action.precedence() > entry.actions[0].precedence() { + entry.actions.clear(); + entry.actions.push(action); + lookaheads_with_conflicts.remove(&lookahead); + } else if action.precedence() == entry.actions[0].precedence() { + entry.actions.push(action); + lookaheads_with_conflicts.insert(lookahead); + } + } + } + } + + for (symbol, next_item_set) in terminal_successors { + preceding_symbols.push(symbol); + let next_state_id = self.add_parse_state( + &preceding_symbols, + &preceding_auxiliary_symbols, + next_item_set, + ); + preceding_symbols.pop(); + + let entry = self.parse_table.states[state_id] + .terminal_entries + .entry(symbol); + if let Entry::Occupied(e) = &entry { + if !e.get().actions.is_empty() { + 
lookaheads_with_conflicts.insert(symbol); + } + } + + entry + .or_insert_with(|| ParseTableEntry::new()) + .actions + .push(ParseAction::Shift { + state: next_state_id, + is_repetition: false, + }); + } + + for (symbol, next_item_set) in non_terminal_successors { + preceding_symbols.push(symbol); + let next_state_id = self.add_parse_state( + &preceding_symbols, + &preceding_auxiliary_symbols, + next_item_set, + ); + preceding_symbols.pop(); + self.parse_table.states[state_id] + .nonterminal_entries + .insert(symbol, next_state_id); + } + + for symbol in lookaheads_with_conflicts { + self.handle_conflict( + &item_set, + state_id, + &preceding_symbols, + &preceding_auxiliary_symbols, + symbol, + )?; + } + + let state = &mut self.parse_table.states[state_id]; + for extra_token in &self.syntax_grammar.extra_tokens { + state + .terminal_entries + .entry(*extra_token) + .or_insert(ParseTableEntry { + reusable: true, + actions: vec![ParseAction::ShiftExtra], + }); + } + + Ok(()) + } + + fn handle_conflict( + &mut self, + item_set: &ParseItemSet, + state_id: ParseStateId, + preceding_symbols: &SymbolSequence, + preceding_auxiliary_symbols: &Vec, + conflicting_lookahead: Symbol, + ) -> Result<()> { + let entry = self.parse_table.states[state_id] + .terminal_entries + .get_mut(&conflicting_lookahead) + .unwrap(); + + // Determine which items in the set conflict with each other, and the + // precedences associated with SHIFT vs REDUCE actions. There won't + // be multiple REDUCE actions with different precedences; that is + // sorted out ahead of time in `add_actions`. But there can still be + // REDUCE-REDUCE conflicts where all actions have the *same* + // precedence, and there can still be SHIFT/REDUCE conflicts. 
+ let reduce_precedence = entry.actions[0].precedence(); + let mut considered_associativity = false; + let mut shift_precedence: Option> = None; + let mut conflicting_items = HashSet::new(); + for (item, lookaheads) in &item_set.entries { + if let Some(step) = item.step() { + if item.step_index > 0 { + if self + .item_set_builder + .first_set(&step.symbol) + .contains(&conflicting_lookahead) + { + conflicting_items.insert(item); + let precedence = item.precedence(); + if let Some(range) = &mut shift_precedence { + if precedence < range.start { + range.start = precedence; + } else if precedence > range.end { + range.end = precedence; + } + } else { + shift_precedence = Some(precedence..precedence); + } + } + } + } else if lookaheads.contains(&conflicting_lookahead) { + conflicting_items.insert(item); + } + } + + if let ParseAction::Shift { is_repetition, .. } = entry.actions.last_mut().unwrap() { + let shift_precedence = shift_precedence.unwrap_or(0..0); + + // If all of the items in the conflict have the same parent symbol, + // and that parent symbols is auxiliary, then this is just the intentional + // ambiguity associated with a repeat rule. Resolve that class of ambiguity + // by leaving it in the parse table, but marking the SHIFT action with + // an `is_repetition` flag. + let conflicting_variable_index = + conflicting_items.iter().next().unwrap().variable_index; + if self.syntax_grammar.variables[conflicting_variable_index as usize].is_auxiliary() { + if conflicting_items + .iter() + .all(|item| item.variable_index == conflicting_variable_index) + { + *is_repetition = true; + return Ok(()); + } + } + + // If the SHIFT action has higher precedence, remove all the REDUCE actions. + if shift_precedence.start > reduce_precedence + || (shift_precedence.start == reduce_precedence + && shift_precedence.end > reduce_precedence) + { + entry.actions.drain(0..entry.actions.len() - 1); + } + // If the REDUCE actions have higher precedence, remove the SHIFT action. 
+ else if shift_precedence.end < reduce_precedence + || (shift_precedence.end == reduce_precedence + && shift_precedence.start < reduce_precedence) + { + entry.actions.pop(); + conflicting_items.retain(|item| item.is_done()); + } + // If the SHIFT and REDUCE actions have the same predence, consider + // the REDUCE actions' associativity. + else if shift_precedence == (reduce_precedence..reduce_precedence) { + considered_associativity = true; + let mut has_left = false; + let mut has_right = false; + let mut has_non = false; + for action in &entry.actions { + if let ParseAction::Reduce { associativity, .. } = action { + match associativity { + Some(Associativity::Left) => has_left = true, + Some(Associativity::Right) => has_right = true, + None => has_non = true, + } + } + } + + // If all reduce actions are left associative, remove the SHIFT action. + // If all reduce actions are right associative, remove the REDUCE actions. + match (has_left, has_non, has_right) { + (true, false, false) => { + entry.actions.pop(); + conflicting_items.retain(|item| item.is_done()); + } + (false, false, true) => { + entry.actions.drain(0..entry.actions.len() - 1); + } + _ => {} + } + } + } + + // If all of the actions but one have been eliminated, then there's no problem. + let entry = self.parse_table.states[state_id] + .terminal_entries + .get_mut(&conflicting_lookahead) + .unwrap(); + if entry.actions.len() == 1 { + return Ok(()); + } + + // Determine the set of parent symbols involved in this conflict. 
+ let mut actual_conflict = Vec::new(); + for item in &conflicting_items { + let symbol = Symbol::non_terminal(item.variable_index as usize); + if self.syntax_grammar.variables[symbol.index].is_auxiliary() { + actual_conflict.extend( + preceding_auxiliary_symbols + .iter() + .rev() + .find_map(|info| { + if info.auxiliary_symbol == symbol { + Some(&info.parent_symbols) + } else { + None + } + }) + .unwrap() + .iter(), + ); + } else { + actual_conflict.push(symbol); + } + } + actual_conflict.sort_unstable(); + actual_conflict.dedup(); + + // If this set of symbols has been whitelisted, then there's no error. + if self + .syntax_grammar + .expected_conflicts + .contains(&actual_conflict) + { + return Ok(()); + } + + let mut msg = "Unresolved conflict for symbol sequence:\n\n".to_string(); + for symbol in preceding_symbols { + write!(&mut msg, " {}", self.symbol_name(symbol)).unwrap(); + } + + write!( + &mut msg, + " • {} …\n\n", + self.symbol_name(&conflicting_lookahead) + ) + .unwrap(); + write!(&mut msg, "Possible interpretations:\n").unwrap(); + for (i, item) in conflicting_items.iter().enumerate() { + write!(&mut msg, "\n {}:", i).unwrap(); + + for preceding_symbol in preceding_symbols + .iter() + .take(preceding_symbols.len() - item.step_index as usize) + { + write!(&mut msg, " {}", self.symbol_name(preceding_symbol)).unwrap(); + } + + write!( + &mut msg, + " ({}", + &self.syntax_grammar.variables[item.variable_index as usize].name + ) + .unwrap(); + + for (j, step) in item.production.steps.iter().enumerate() { + if j as u32 == item.step_index { + write!(&mut msg, " •").unwrap(); + } + write!(&mut msg, " {}", self.symbol_name(&step.symbol)).unwrap(); + } + + write!(&mut msg, ")").unwrap(); + + if item.is_done() { + write!( + &mut msg, + " • {}", + self.symbol_name(&conflicting_lookahead) + ) + .unwrap(); + } + + let precedence = item.precedence(); + let associativity = item.associativity(); + if precedence != 0 || associativity.is_some() { + write!( + &mut msg, 
+ "(precedence: {}, associativity: {:?})", + precedence, associativity + ) + .unwrap(); + } + } + + // TODO - generate suggested resolutions + + Err(Error::ConflictError(msg)) + } + + fn get_auxiliary_node_info( + &self, + item_set: &ParseItemSet, + symbol: Symbol, + ) -> AuxiliarySymbolInfo { + let parent_symbols = item_set + .entries + .keys() + .filter_map(|item| { + if item.symbol() == Some(symbol) { + None + } else { + None + } + }) + .collect(); + AuxiliarySymbolInfo { + auxiliary_symbol: symbol, + parent_symbols, + } + } + + fn populate_used_symbols(&mut self) { + let mut terminal_usages = vec![false; self.lexical_grammar.variables.len()]; + let mut non_terminal_usages = vec![false; self.syntax_grammar.variables.len()]; + let mut external_usages = vec![false; self.syntax_grammar.external_tokens.len()]; + for state in &self.parse_table.states { + for symbol in state.terminal_entries.keys() { + match symbol.kind { + SymbolType::Terminal => terminal_usages[symbol.index] = true, + SymbolType::External => external_usages[symbol.index] = true, + _ => {} + } + } + for symbol in state.nonterminal_entries.keys() { + non_terminal_usages[symbol.index] = true; + } + } + self.parse_table.symbols.push(Symbol::end()); + for (i, value) in terminal_usages.into_iter().enumerate() { + if value { + self.parse_table.symbols.push(Symbol::terminal(i)); + } + } + for (i, value) in non_terminal_usages.into_iter().enumerate() { + if value { + self.parse_table.symbols.push(Symbol::non_terminal(i)); + } + } + for (i, value) in external_usages.into_iter().enumerate() { + if value { + self.parse_table.symbols.push(Symbol::external(i)); + } + } + } + + fn get_alias_sequence_id(&mut self, item: &ParseItem) -> AliasSequenceId { + let mut alias_sequence: Vec> = item + .production + .steps + .iter() + .map(|s| s.alias.clone()) + .collect(); + while alias_sequence.last() == Some(&None) { + alias_sequence.pop(); + } + if let Some(index) = self + .parse_table + .alias_sequences + .iter() + 
.position(|seq| *seq == alias_sequence) + { + index + } else { + self.parse_table.alias_sequences.push(alias_sequence); + self.parse_table.alias_sequences.len() - 1 + } + } + + fn symbol_name(&self, symbol: &Symbol) -> String { + match symbol.kind { + SymbolType::End => "EOF".to_string(), + SymbolType::External => self.syntax_grammar.external_tokens[symbol.index] + .name + .clone(), + SymbolType::NonTerminal => self.syntax_grammar.variables[symbol.index].name.clone(), + SymbolType::Terminal => { + let variable = &self.lexical_grammar.variables[symbol.index]; + if variable.kind == VariableType::Named { + variable.name.clone() + } else { + format!("\"{}\"", &variable.name) + } + } + } + } +} + +pub(crate) fn build_parse_table( + syntax_grammar: &SyntaxGrammar, + lexical_grammar: &LexicalGrammar, + inlines: &InlinedProductionMap, +) -> Result { + ParseTableBuilder { + syntax_grammar, + lexical_grammar, + inlines, + item_set_builder: ParseItemSetBuilder::new(syntax_grammar, lexical_grammar, inlines), + state_ids_by_item_set: HashMap::new(), + item_sets_by_state_id: Vec::new(), + parse_state_queue: VecDeque::new(), + parse_table: ParseTable { + states: Vec::new(), + alias_sequences: Vec::new(), + symbols: Vec::new(), + }, + } + .build() +} diff --git a/src/build_tables/mod.rs b/src/build_tables/mod.rs index fc17ce7f..a5ac74fb 100644 --- a/src/build_tables/mod.rs +++ b/src/build_tables/mod.rs @@ -1,607 +1,17 @@ +use crate::error::Result; +use crate::grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar}; +use crate::rules::{AliasMap, Symbol}; +use crate::tables::{LexTable, ParseTable}; + +mod build_parse_table; mod item; mod item_set_builder; mod lex_table_builder; +mod shrink_parse_table; +mod token_conflict_map; -use self::item::{LookaheadSet, ParseItem, ParseItemSet}; -use self::item_set_builder::ParseItemSetBuilder; -use self::lex_table_builder::LexTableBuilder; -use crate::error::{Error, Result}; -use crate::grammars::{InlinedProductionMap, LexicalGrammar, 
SyntaxGrammar, VariableType}; -use crate::rules::Alias; -use crate::rules::{AliasMap, Associativity, Symbol, SymbolType}; -use crate::tables::{ - AliasSequenceId, LexTable, ParseAction, ParseState, ParseStateId, ParseTable, ParseTableEntry, -}; -use core::ops::Range; -use std::collections::hash_map::Entry; -use std::collections::{HashMap, HashSet, VecDeque}; -use std::fmt::Write; - -#[derive(Clone)] -struct AuxiliarySymbolInfo { - auxiliary_symbol: Symbol, - parent_symbols: Vec, -} - -type SymbolSequence = Vec; -type AuxiliarySymbolSequence = Vec; - -struct ParseStateQueueEntry { - preceding_symbols: SymbolSequence, - preceding_auxiliary_symbols: AuxiliarySymbolSequence, - state_id: ParseStateId, -} - -struct ParseTableBuilder<'a> { - item_set_builder: ParseItemSetBuilder<'a>, - syntax_grammar: &'a SyntaxGrammar, - lexical_grammar: &'a LexicalGrammar, - inlines: &'a InlinedProductionMap, - simple_aliases: &'a AliasMap, - state_ids_by_item_set: HashMap, ParseStateId>, - item_sets_by_state_id: Vec>, - parse_state_queue: VecDeque, - parse_table: ParseTable, -} - -impl<'a> ParseTableBuilder<'a> { - fn build(mut self) -> Result<(ParseTable, LexTable, LexTable, Option)> { - // Ensure that the empty alias sequence has index 0. - self.parse_table.alias_sequences.push(Vec::new()); - - // Ensure that the error state has index 0. 
- let error_state_id = - self.add_parse_state(&Vec::new(), &Vec::new(), ParseItemSet::default()); - - self.add_parse_state( - &Vec::new(), - &Vec::new(), - ParseItemSet::with( - [(ParseItem::start(), LookaheadSet::with(&[Symbol::end()]))] - .iter() - .cloned(), - ), - ); - - self.process_part_state_queue()?; - - let lex_table_builder = LexTableBuilder::new(self.syntax_grammar, self.lexical_grammar); - - self.populate_used_symbols(); - - let (main_lex_table, keyword_lex_table, keyword_capture_token) = lex_table_builder.build(); - Ok(( - self.parse_table, - main_lex_table, - keyword_lex_table, - keyword_capture_token, - )) - } - - fn add_parse_state( - &mut self, - preceding_symbols: &SymbolSequence, - preceding_auxiliary_symbols: &AuxiliarySymbolSequence, - item_set: ParseItemSet<'a>, - ) -> ParseStateId { - match self.state_ids_by_item_set.entry(item_set) { - Entry::Occupied(o) => { - // eprintln!("Item set already processed at state {}", *o.get()); - *o.get() - } - Entry::Vacant(v) => { - // eprintln!("Item set not yet processed"); - let state_id = self.parse_table.states.len(); - self.item_sets_by_state_id.push(v.key().clone()); - self.parse_table.states.push(ParseState { - lex_state_id: 0, - terminal_entries: HashMap::new(), - nonterminal_entries: HashMap::new(), - }); - self.parse_state_queue.push_back(ParseStateQueueEntry { - state_id, - preceding_symbols: preceding_symbols.clone(), - preceding_auxiliary_symbols: preceding_auxiliary_symbols.clone(), - }); - v.insert(state_id); - state_id - } - } - } - - fn process_part_state_queue(&mut self) -> Result<()> { - while let Some(entry) = self.parse_state_queue.pop_front() { - let debug = false; - - if debug { - println!( - "ITEM SET {}:\n{}", - entry.state_id, - self.item_sets_by_state_id[entry.state_id] - .display_with(&self.syntax_grammar, &self.lexical_grammar,) - ); - } - - let item_set = self.item_set_builder.transitive_closure( - &self.item_sets_by_state_id[entry.state_id], - self.syntax_grammar, - 
self.inlines, - ); - - if debug { - println!( - "TRANSITIVE CLOSURE:\n{}", - item_set.display_with(&self.syntax_grammar, &self.lexical_grammar) - ); - } - - self.add_actions( - entry.preceding_symbols, - entry.preceding_auxiliary_symbols, - item_set, - entry.state_id, - )?; - } - Ok(()) - } - - fn add_actions( - &mut self, - mut preceding_symbols: SymbolSequence, - mut preceding_auxiliary_symbols: Vec, - item_set: ParseItemSet<'a>, - state_id: ParseStateId, - ) -> Result<()> { - let mut terminal_successors = HashMap::new(); - let mut non_terminal_successors = HashMap::new(); - let mut lookaheads_with_conflicts = HashSet::new(); - - for (item, lookaheads) in &item_set.entries { - if let Some(next_symbol) = item.symbol() { - let successor = item.successor(); - if next_symbol.is_non_terminal() { - // Keep track of where auxiliary non-terminals (repeat symbols) are - // used within visible symbols. This information may be needed later - // for conflict resolution. - if self.syntax_grammar.variables[next_symbol.index].is_auxiliary() { - preceding_auxiliary_symbols - .push(self.get_auxiliary_node_info(&item_set, next_symbol)); - } - - non_terminal_successors - .entry(next_symbol) - .or_insert_with(|| ParseItemSet::default()) - .entries - .entry(successor) - .or_insert_with(|| LookaheadSet::new()) - .insert_all(lookaheads); - } else { - terminal_successors - .entry(next_symbol) - .or_insert_with(|| ParseItemSet::default()) - .entries - .entry(successor) - .or_insert_with(|| LookaheadSet::new()) - .insert_all(lookaheads); - } - } else { - let action = if item.is_augmented() { - ParseAction::Accept - } else { - ParseAction::Reduce { - symbol: Symbol::non_terminal(item.variable_index as usize), - child_count: item.step_index as usize, - precedence: item.precedence(), - associativity: item.associativity(), - dynamic_precedence: item.production.dynamic_precedence, - alias_sequence_id: self.get_alias_sequence_id(item), - } - }; - - for lookahead in lookaheads.iter() { - let 
entry = self.parse_table.states[state_id] - .terminal_entries - .entry(lookahead); - let entry = entry.or_insert_with(|| ParseTableEntry::new()); - if entry.actions.is_empty() { - entry.actions.push(action); - } else if action.precedence() > entry.actions[0].precedence() { - entry.actions.clear(); - entry.actions.push(action); - lookaheads_with_conflicts.remove(&lookahead); - } else if action.precedence() == entry.actions[0].precedence() { - entry.actions.push(action); - lookaheads_with_conflicts.insert(lookahead); - } - } - } - } - - for (symbol, next_item_set) in terminal_successors { - preceding_symbols.push(symbol); - let next_state_id = self.add_parse_state( - &preceding_symbols, - &preceding_auxiliary_symbols, - next_item_set, - ); - preceding_symbols.pop(); - - let entry = self.parse_table.states[state_id] - .terminal_entries - .entry(symbol); - if let Entry::Occupied(e) = &entry { - if !e.get().actions.is_empty() { - lookaheads_with_conflicts.insert(symbol); - } - } - - entry - .or_insert_with(|| ParseTableEntry::new()) - .actions - .push(ParseAction::Shift { - state: next_state_id, - is_repetition: false, - }); - } - - for (symbol, next_item_set) in non_terminal_successors { - preceding_symbols.push(symbol); - let next_state_id = self.add_parse_state( - &preceding_symbols, - &preceding_auxiliary_symbols, - next_item_set, - ); - preceding_symbols.pop(); - self.parse_table.states[state_id] - .nonterminal_entries - .insert(symbol, next_state_id); - } - - for symbol in lookaheads_with_conflicts { - self.handle_conflict( - &item_set, - state_id, - &preceding_symbols, - &preceding_auxiliary_symbols, - symbol, - )?; - } - - let state = &mut self.parse_table.states[state_id]; - for extra_token in &self.syntax_grammar.extra_tokens { - state - .terminal_entries - .entry(*extra_token) - .or_insert(ParseTableEntry { - reusable: true, - actions: vec![ParseAction::ShiftExtra], - }); - } - - Ok(()) - } - - fn handle_conflict( - &mut self, - item_set: &ParseItemSet, - 
state_id: ParseStateId, - preceding_symbols: &SymbolSequence, - preceding_auxiliary_symbols: &Vec, - conflicting_lookahead: Symbol, - ) -> Result<()> { - let entry = self.parse_table.states[state_id] - .terminal_entries - .get_mut(&conflicting_lookahead) - .unwrap(); - - // Determine which items in the set conflict with each other, and the - // precedences associated with SHIFT vs REDUCE actions. There won't - // be multiple REDUCE actions with different precedences; that is - // sorted out ahead of time in `add_actions`. But there can still be - // REDUCE-REDUCE conflicts where all actions have the *same* - // precedence, and there can still be SHIFT/REDUCE conflicts. - let reduce_precedence = entry.actions[0].precedence(); - let mut considered_associativity = false; - let mut shift_precedence: Option> = None; - let mut conflicting_items = HashSet::new(); - for (item, lookaheads) in &item_set.entries { - if let Some(step) = item.step() { - if item.step_index > 0 { - if self - .item_set_builder - .first_set(&step.symbol) - .contains(&conflicting_lookahead) - { - conflicting_items.insert(item); - let precedence = item.precedence(); - if let Some(range) = &mut shift_precedence { - if precedence < range.start { - range.start = precedence; - } else if precedence > range.end { - range.end = precedence; - } - } else { - shift_precedence = Some(precedence..precedence); - } - } - } - } else if lookaheads.contains(&conflicting_lookahead) { - conflicting_items.insert(item); - } - } - - if let ParseAction::Shift { is_repetition, .. } = entry.actions.last_mut().unwrap() { - let shift_precedence = shift_precedence.unwrap_or(0..0); - - // If all of the items in the conflict have the same parent symbol, - // and that parent symbols is auxiliary, then this is just the intentional - // ambiguity associated with a repeat rule. Resolve that class of ambiguity - // by leaving it in the parse table, but marking the SHIFT action with - // an `is_repetition` flag. 
- let conflicting_variable_index = - conflicting_items.iter().next().unwrap().variable_index; - if self.syntax_grammar.variables[conflicting_variable_index as usize].is_auxiliary() { - if conflicting_items - .iter() - .all(|item| item.variable_index == conflicting_variable_index) - { - *is_repetition = true; - return Ok(()); - } - } - - // If the SHIFT action has higher precedence, remove all the REDUCE actions. - if shift_precedence.start > reduce_precedence - || (shift_precedence.start == reduce_precedence - && shift_precedence.end > reduce_precedence) - { - entry.actions.drain(0..entry.actions.len() - 1); - } - // If the REDUCE actions have higher precedence, remove the SHIFT action. - else if shift_precedence.end < reduce_precedence - || (shift_precedence.end == reduce_precedence - && shift_precedence.start < reduce_precedence) - { - entry.actions.pop(); - conflicting_items.retain(|item| item.is_done()); - } - // If the SHIFT and REDUCE actions have the same predence, consider - // the REDUCE actions' associativity. - else if shift_precedence == (reduce_precedence..reduce_precedence) { - considered_associativity = true; - let mut has_left = false; - let mut has_right = false; - let mut has_non = false; - for action in &entry.actions { - if let ParseAction::Reduce { associativity, .. } = action { - match associativity { - Some(Associativity::Left) => has_left = true, - Some(Associativity::Right) => has_right = true, - None => has_non = true, - } - } - } - - // If all reduce actions are left associative, remove the SHIFT action. - // If all reduce actions are right associative, remove the REDUCE actions. - match (has_left, has_non, has_right) { - (true, false, false) => { - entry.actions.pop(); - conflicting_items.retain(|item| item.is_done()); - } - (false, false, true) => { - entry.actions.drain(0..entry.actions.len() - 1); - } - _ => {} - } - } - } - - // If all of the actions but one have been eliminated, then there's no problem. 
- let entry = self.parse_table.states[state_id] - .terminal_entries - .get_mut(&conflicting_lookahead) - .unwrap(); - if entry.actions.len() == 1 { - return Ok(()); - } - - // Determine the set of parent symbols involved in this conflict. - let mut actual_conflict = Vec::new(); - for item in &conflicting_items { - let symbol = Symbol::non_terminal(item.variable_index as usize); - if self.syntax_grammar.variables[symbol.index].is_auxiliary() { - actual_conflict.extend( - preceding_auxiliary_symbols - .iter() - .rev() - .find_map(|info| { - if info.auxiliary_symbol == symbol { - Some(&info.parent_symbols) - } else { - None - } - }) - .unwrap() - .iter(), - ); - } else { - actual_conflict.push(symbol); - } - } - actual_conflict.sort_unstable(); - actual_conflict.dedup(); - - // If this set of symbols has been whitelisted, then there's no error. - if self - .syntax_grammar - .expected_conflicts - .contains(&actual_conflict) - { - return Ok(()); - } - - let mut msg = "Unresolved conflict for symbol sequence:\n\n".to_string(); - for symbol in preceding_symbols { - write!(&mut msg, " {}", self.symbol_name(symbol)).unwrap(); - } - - write!( - &mut msg, - " • {} …\n\n", - self.symbol_name(&conflicting_lookahead) - ) - .unwrap(); - write!(&mut msg, "Possible interpretations:\n").unwrap(); - for (i, item) in conflicting_items.iter().enumerate() { - write!(&mut msg, "\n {}:", i).unwrap(); - - for preceding_symbol in preceding_symbols - .iter() - .take(preceding_symbols.len() - item.step_index as usize) - { - write!(&mut msg, " {}", self.symbol_name(preceding_symbol)).unwrap(); - } - - write!( - &mut msg, - " ({}", - &self.syntax_grammar.variables[item.variable_index as usize].name - ) - .unwrap(); - - for (j, step) in item.production.steps.iter().enumerate() { - if j as u32 == item.step_index { - write!(&mut msg, " •").unwrap(); - } - write!(&mut msg, " {}", self.symbol_name(&step.symbol)).unwrap(); - } - - write!(&mut msg, ")").unwrap(); - - if item.is_done() { - write!( - 
&mut msg, - " • {}", - self.symbol_name(&conflicting_lookahead) - ) - .unwrap(); - } - - let precedence = item.precedence(); - let associativity = item.associativity(); - if precedence != 0 || associativity.is_some() { - write!( - &mut msg, - "(precedence: {}, associativity: {:?})", - precedence, associativity - ) - .unwrap(); - } - } - - // TODO - generate suggested resolutions - - Err(Error::ConflictError(msg)) - } - - fn get_auxiliary_node_info( - &self, - item_set: &ParseItemSet, - symbol: Symbol, - ) -> AuxiliarySymbolInfo { - let parent_symbols = item_set - .entries - .keys() - .filter_map(|item| { - if item.symbol() == Some(symbol) { - None - } else { - None - } - }) - .collect(); - AuxiliarySymbolInfo { - auxiliary_symbol: symbol, - parent_symbols, - } - } - - fn populate_used_symbols(&mut self) { - let mut terminal_usages = vec![false; self.lexical_grammar.variables.len()]; - let mut non_terminal_usages = vec![false; self.syntax_grammar.variables.len()]; - let mut external_usages = vec![false; self.syntax_grammar.external_tokens.len()]; - for state in &self.parse_table.states { - for symbol in state.terminal_entries.keys() { - match symbol.kind { - SymbolType::Terminal => terminal_usages[symbol.index] = true, - SymbolType::External => external_usages[symbol.index] = true, - _ => {} - } - } - for symbol in state.nonterminal_entries.keys() { - non_terminal_usages[symbol.index] = true; - } - } - self.parse_table.symbols.push(Symbol::end()); - for (i, value) in terminal_usages.into_iter().enumerate() { - if value { - self.parse_table.symbols.push(Symbol::terminal(i)); - } - } - for (i, value) in non_terminal_usages.into_iter().enumerate() { - if value { - self.parse_table.symbols.push(Symbol::non_terminal(i)); - } - } - for (i, value) in external_usages.into_iter().enumerate() { - if value { - self.parse_table.symbols.push(Symbol::external(i)); - } - } - } - - fn get_alias_sequence_id(&mut self, item: &ParseItem) -> AliasSequenceId { - let mut alias_sequence: 
Vec> = item - .production - .steps - .iter() - .map(|s| s.alias.clone()) - .collect(); - while alias_sequence.last() == Some(&None) { - alias_sequence.pop(); - } - if let Some(index) = self - .parse_table - .alias_sequences - .iter() - .position(|seq| *seq == alias_sequence) - { - index - } else { - self.parse_table.alias_sequences.push(alias_sequence); - self.parse_table.alias_sequences.len() - 1 - } - } - - fn symbol_name(&self, symbol: &Symbol) -> String { - match symbol.kind { - SymbolType::End => "EOF".to_string(), - SymbolType::External => self.syntax_grammar.external_tokens[symbol.index] - .name - .clone(), - SymbolType::NonTerminal => self.syntax_grammar.variables[symbol.index].name.clone(), - SymbolType::Terminal => { - let variable = &self.lexical_grammar.variables[symbol.index]; - if variable.kind == VariableType::Named { - variable.name.clone() - } else { - format!("\"{}\"", &variable.name) - } - } - } - } -} +use self::build_parse_table::build_parse_table; +use self::shrink_parse_table::shrink_parse_table; pub(crate) fn build_tables( syntax_grammar: &SyntaxGrammar, @@ -609,20 +19,8 @@ pub(crate) fn build_tables( simple_aliases: &AliasMap, inlines: &InlinedProductionMap, ) -> Result<(ParseTable, LexTable, LexTable, Option)> { - ParseTableBuilder { - syntax_grammar, - lexical_grammar, - simple_aliases, - inlines, - item_set_builder: ParseItemSetBuilder::new(syntax_grammar, lexical_grammar, inlines), - state_ids_by_item_set: HashMap::new(), - item_sets_by_state_id: Vec::new(), - parse_state_queue: VecDeque::new(), - parse_table: ParseTable { - states: Vec::new(), - alias_sequences: Vec::new(), - symbols: Vec::new(), - }, - } - .build() + + let mut parse_table = build_parse_table(syntax_grammar, lexical_grammar, inlines)?; + shrink_parse_table(&mut parse_table, syntax_grammar, simple_aliases); + Ok((parse_table, LexTable::default(), LexTable::default(), None)) } diff --git a/src/build_tables/shrink_parse_table.rs b/src/build_tables/shrink_parse_table.rs 
new file mode 100644 index 00000000..8e826f5c --- /dev/null +++ b/src/build_tables/shrink_parse_table.rs @@ -0,0 +1,117 @@ +use crate::grammars::{SyntaxGrammar, VariableType}; +use crate::rules::AliasMap; +use crate::tables::{ParseAction, ParseTable}; +use std::collections::{HashMap, HashSet}; + +pub(crate) fn shrink_parse_table( + parse_table: &mut ParseTable, + syntax_grammar: &SyntaxGrammar, + simple_aliases: &AliasMap, +) { + remove_unit_reductions(parse_table, syntax_grammar, simple_aliases); + remove_unused_states(parse_table); +} + +fn remove_unit_reductions( + parse_table: &mut ParseTable, + syntax_grammar: &SyntaxGrammar, + simple_aliases: &AliasMap, +) { + let mut aliased_symbols = HashSet::new(); + for variable in &syntax_grammar.variables { + for production in &variable.productions { + for step in &production.steps { + if step.alias.is_some() { + aliased_symbols.insert(step.symbol); + } + } + } + } + + let mut unit_reduction_symbols_by_state = HashMap::new(); + for (i, state) in parse_table.states.iter().enumerate() { + let mut only_unit_reductions = true; + let mut unit_reduction_symbol = None; + for (_, entry) in &state.terminal_entries { + for action in &entry.actions { + match action { + ParseAction::ShiftExtra => continue, + ParseAction::Reduce { + child_count: 1, + alias_sequence_id: 0, + symbol, + .. 
+ } => { + if !simple_aliases.contains_key(&symbol) + && !aliased_symbols.contains(&symbol) + && syntax_grammar.variables[symbol.index].kind != VariableType::Named + && (unit_reduction_symbol.is_none() + || unit_reduction_symbol == Some(symbol)) + { + unit_reduction_symbol = Some(symbol); + continue; + } + } + _ => {} + } + only_unit_reductions = false; + break; + } + + if !only_unit_reductions { + break; + } + } + + if let Some(symbol) = unit_reduction_symbol { + if only_unit_reductions { + unit_reduction_symbols_by_state.insert(i, *symbol); + } + } + } + + for state in parse_table.states.iter_mut() { + let mut done = false; + while !done { + done = true; + state.update_referenced_states(|other_state_id, state| { + if let Some(symbol) = unit_reduction_symbols_by_state.get(&other_state_id) { + done = false; + state.nonterminal_entries[symbol] + } else { + other_state_id + } + }) + } + } +} + +fn remove_unused_states(parse_table: &mut ParseTable) { + let mut state_usage_map = vec![false; parse_table.states.len()]; + for state in &parse_table.states { + for referenced_state in state.referenced_states() { + state_usage_map[referenced_state] = true; + } + } + let mut removed_predecessor_count = 0; + let mut state_replacement_map = vec![0; parse_table.states.len()]; + for state_id in 0..parse_table.states.len() { + state_replacement_map[state_id] = state_id - removed_predecessor_count; + if !state_usage_map[state_id] { + removed_predecessor_count += 1; + } + } + let mut state_id = 0; + let mut original_state_id = 0; + while state_id < parse_table.states.len() { + if state_usage_map[original_state_id] { + parse_table.states[state_id].update_referenced_states(|other_state_id, _| { + state_replacement_map[other_state_id] + }); + state_id += 1; + } else { + parse_table.states.remove(state_id); + } + original_state_id += 1; + } +} diff --git a/src/build_tables/token_conflict_map.rs b/src/build_tables/token_conflict_map.rs new file mode 100644 index 00000000..46a00986 --- 
/dev/null +++ b/src/build_tables/token_conflict_map.rs @@ -0,0 +1,77 @@ +use crate::grammars::{LexicalGrammar, LexicalVariable}; +use crate::nfa::{CharacterSet, NfaCursor}; +use std::collections::HashSet; + +#[derive(Default)] +struct TokenConflictStatus { + matches_same_string: bool, + matches_longer_string_with_valid_next_char: bool, +} + +pub(crate) struct TokenConflictMap { + starting_chars_by_index: Vec, + status_matrix: Vec, +} + +impl TokenConflictMap { + pub fn new(grammar: &LexicalGrammar) -> Self { + let mut cursor = NfaCursor::new(&grammar.nfa, Vec::new()); + + let mut starting_chars_by_index = Vec::with_capacity(grammar.variables.len()); + for variable in &grammar.variables { + cursor.reset(vec![variable.start_state]); + let mut all_chars = CharacterSet::empty(); + for (chars, _, _) in cursor.successors() { + all_chars = all_chars.add(chars); + } + starting_chars_by_index.push(all_chars); + } + + let status_matrix = + Vec::with_capacity(grammar.variables.len() * grammar.variables.len()); + + TokenConflictMap { + starting_chars_by_index, + status_matrix, + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::grammars::{Variable, VariableType}; + use crate::prepare_grammar::{expand_tokens, ExtractedLexicalGrammar}; + use crate::rules::Rule; + + #[test] + fn test_starting_characters() { + let grammar = expand_tokens(ExtractedLexicalGrammar { + separators: Vec::new(), + variables: vec![ + Variable { + name: "token_0".to_string(), + kind: VariableType::Named, + rule: Rule::pattern("[a-f]1|0x\\d"), + }, + Variable { + name: "token_1".to_string(), + kind: VariableType::Named, + rule: Rule::pattern("d*ef"), + }, + ], + }) + .unwrap(); + + let token_map = TokenConflictMap::new(&grammar); + + assert_eq!( + token_map.starting_chars_by_index[0], + CharacterSet::empty().add_range('a', 'f').add_char('0') + ); + assert_eq!( + token_map.starting_chars_by_index[1], + CharacterSet::empty().add_range('d', 'e') + ); + } +} diff --git a/src/tables.rs 
b/src/tables.rs index 01cecb49..0815aac8 100644 --- a/src/tables.rs +++ b/src/tables.rs @@ -1,7 +1,7 @@ +use crate::nfa::CharacterSet; +use crate::rules::{Alias, Associativity, Symbol}; use std::collections::HashMap; use std::ops::Range; -use crate::rules::{Associativity, Symbol, Alias}; -use crate::nfa::CharacterSet; pub(crate) type AliasSequenceId = usize; pub(crate) type ParseStateId = usize; @@ -23,7 +23,7 @@ pub(crate) enum ParseAction { dynamic_precedence: i32, associativity: Option, alias_sequence_id: AliasSequenceId, - } + }, } #[derive(Clone, Debug, PartialEq, Eq)] @@ -86,6 +86,56 @@ impl Default for LexTable { } } +impl ParseState { + pub fn referenced_states<'a>(&'a self) -> impl Iterator + 'a { + self.terminal_entries + .iter() + .flat_map(|(_, entry)| { + entry.actions.iter().filter_map(|action| match action { + ParseAction::Shift { state, .. } => Some(*state), + _ => None, + }) + }) + .chain(self.nonterminal_entries.iter().map(|(_, state)| *state)) + } + + pub fn update_referenced_states(&mut self, mut f: F) + where + F: FnMut(usize, &ParseState) -> usize, + { + let mut updates = Vec::new(); + for (symbol, entry) in &self.terminal_entries { + for (i, action) in entry.actions.iter().enumerate() { + if let ParseAction::Shift { state, .. } = action { + let result = f(*state, self); + if result != *state { + updates.push((*symbol, i, result)); + } + } + } + } + for (symbol, other_state) in &self.nonterminal_entries { + let result = f(*other_state, self); + if result != *other_state { + updates.push((*symbol, 0, result)); + } + } + for (symbol, action_index, new_state) in updates { + if symbol.is_non_terminal() { + self.nonterminal_entries.insert(symbol, new_state); + } else { + let entry = self.terminal_entries.get_mut(&symbol).unwrap(); + if let ParseAction::Shift { is_repetition, .. 
} = entry.actions[action_index] { + entry.actions[action_index] = ParseAction::Shift { + state: new_state, + is_repetition, + }; + } + } + } + } +} + impl ParseAction { pub fn precedence(&self) -> i32 { if let ParseAction::Reduce { precedence, .. } = self { From c6b9e97c5820bd2f24c42e58fd2e82944354a6b6 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Sun, 30 Dec 2018 19:31:17 -0800 Subject: [PATCH 079/208] Implement token conflict map --- src/build_tables/build_parse_table.rs | 20 +- src/build_tables/item_set_builder.rs | 4 + src/build_tables/mod.rs | 6 +- src/build_tables/token_conflict_map.rs | 315 +++++++++++++++++- src/grammars.rs | 7 + src/nfa.rs | 156 ++++++--- src/prepare_grammar/expand_tokens.rs | 40 ++- src/prepare_grammar/extract_simple_aliases.rs | 3 + 8 files changed, 471 insertions(+), 80 deletions(-) diff --git a/src/build_tables/build_parse_table.rs b/src/build_tables/build_parse_table.rs index 5087c55c..a7911689 100644 --- a/src/build_tables/build_parse_table.rs +++ b/src/build_tables/build_parse_table.rs @@ -2,7 +2,7 @@ use super::item::{LookaheadSet, ParseItem, ParseItemSet}; use super::item_set_builder::ParseItemSetBuilder; use crate::error::{Error, Result}; use crate::grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar, VariableType}; -use crate::rules::{Alias, AliasMap, Associativity, Symbol, SymbolType}; +use crate::rules::{Alias, Associativity, Symbol, SymbolType}; use crate::tables::{ AliasSequenceId, ParseAction, ParseState, ParseStateId, ParseTable, ParseTableEntry, }; @@ -35,10 +35,11 @@ struct ParseTableBuilder<'a> { item_sets_by_state_id: Vec>, parse_state_queue: VecDeque, parse_table: ParseTable, + following_tokens: Vec, } impl<'a> ParseTableBuilder<'a> { - fn build(mut self) -> Result { + fn build(mut self) -> Result<(ParseTable, Vec)> { // Ensure that the empty alias sequence has index 0. 
self.parse_table.alias_sequences.push(Vec::new()); @@ -58,7 +59,7 @@ impl<'a> ParseTableBuilder<'a> { self.process_part_state_queue()?; self.populate_used_symbols(); - Ok(self.parse_table) + Ok((self.parse_table, self.following_tokens)) } fn add_parse_state( @@ -67,6 +68,16 @@ impl<'a> ParseTableBuilder<'a> { preceding_auxiliary_symbols: &AuxiliarySymbolSequence, item_set: ParseItemSet<'a>, ) -> ParseStateId { + if preceding_symbols.len() > 1 { + let left_tokens = self.item_set_builder.last_set(&preceding_symbols[preceding_symbols.len() - 2]); + let right_tokens = self.item_set_builder.first_set(&preceding_symbols[preceding_symbols.len() - 1]); + for left_token in left_tokens.iter() { + if left_token.is_terminal() { + self.following_tokens[left_token.index].insert_all(right_tokens); + } + } + } + match self.state_ids_by_item_set.entry(item_set) { Entry::Occupied(o) => *o.get(), Entry::Vacant(v) => { @@ -586,7 +597,7 @@ pub(crate) fn build_parse_table( syntax_grammar: &SyntaxGrammar, lexical_grammar: &LexicalGrammar, inlines: &InlinedProductionMap, -) -> Result { +) -> Result<(ParseTable, Vec)> { ParseTableBuilder { syntax_grammar, lexical_grammar, @@ -600,6 +611,7 @@ pub(crate) fn build_parse_table( alias_sequences: Vec::new(), symbols: Vec::new(), }, + following_tokens: vec![LookaheadSet::new(); lexical_grammar.variables.len()], } .build() } diff --git a/src/build_tables/item_set_builder.rs b/src/build_tables/item_set_builder.rs index d7883988..8649cb52 100644 --- a/src/build_tables/item_set_builder.rs +++ b/src/build_tables/item_set_builder.rs @@ -269,6 +269,10 @@ impl<'a> ParseItemSetBuilder<'a> { &self.first_sets[symbol] } + pub fn last_set(&self, symbol: &Symbol) -> &LookaheadSet { + &self.first_sets[symbol] + } + fn add_item( &self, set: &mut ParseItemSet<'a>, diff --git a/src/build_tables/mod.rs b/src/build_tables/mod.rs index a5ac74fb..d1983068 100644 --- a/src/build_tables/mod.rs +++ b/src/build_tables/mod.rs @@ -12,6 +12,7 @@ mod token_conflict_map; use 
self::build_parse_table::build_parse_table; use self::shrink_parse_table::shrink_parse_table; +use self::token_conflict_map::TokenConflictMap; pub(crate) fn build_tables( syntax_grammar: &SyntaxGrammar, @@ -19,8 +20,9 @@ pub(crate) fn build_tables( simple_aliases: &AliasMap, inlines: &InlinedProductionMap, ) -> Result<(ParseTable, LexTable, LexTable, Option)> { - - let mut parse_table = build_parse_table(syntax_grammar, lexical_grammar, inlines)?; + let (mut parse_table, following_tokens) = + build_parse_table(syntax_grammar, lexical_grammar, inlines)?; + let token_conflict_map = TokenConflictMap::new(lexical_grammar, following_tokens); shrink_parse_table(&mut parse_table, syntax_grammar, simple_aliases); Ok((parse_table, LexTable::default(), LexTable::default(), None)) } diff --git a/src/build_tables/token_conflict_map.rs b/src/build_tables/token_conflict_map.rs index 46a00986..52c68cc7 100644 --- a/src/build_tables/token_conflict_map.rs +++ b/src/build_tables/token_conflict_map.rs @@ -1,40 +1,262 @@ -use crate::grammars::{LexicalGrammar, LexicalVariable}; +use crate::build_tables::item::LookaheadSet; +use crate::grammars::LexicalGrammar; use crate::nfa::{CharacterSet, NfaCursor}; use std::collections::HashSet; +use std::fmt; -#[derive(Default)] +#[derive(Clone, Debug, Default)] struct TokenConflictStatus { + does_overlap: bool, + does_match_valid_continuation: bool, matches_same_string: bool, - matches_longer_string_with_valid_next_char: bool, } pub(crate) struct TokenConflictMap { - starting_chars_by_index: Vec, + n: usize, status_matrix: Vec, + starting_chars_by_index: Vec, + following_chars_by_index: Vec, } impl TokenConflictMap { - pub fn new(grammar: &LexicalGrammar) -> Self { + pub fn new(grammar: &LexicalGrammar, following_tokens: Vec) -> Self { let mut cursor = NfaCursor::new(&grammar.nfa, Vec::new()); + let starting_chars = get_starting_chars(&mut cursor, grammar); + let following_chars = get_following_chars(&starting_chars, following_tokens); - let mut 
starting_chars_by_index = Vec::with_capacity(grammar.variables.len()); - for variable in &grammar.variables { - cursor.reset(vec![variable.start_state]); - let mut all_chars = CharacterSet::empty(); - for (chars, _, _) in cursor.successors() { - all_chars = all_chars.add(chars); + let n = grammar.variables.len(); + let mut status_matrix = vec![TokenConflictStatus::default(); n * n]; + for i in 0..grammar.variables.len() { + for j in 0..i { + let status = compute_conflict_status(&mut cursor, grammar, &following_chars, i, j); + status_matrix[matrix_index(n, i, j)] = status.0; + status_matrix[matrix_index(n, j, i)] = status.1; } - starting_chars_by_index.push(all_chars); } - let status_matrix = - Vec::with_capacity(grammar.variables.len() * grammar.variables.len()); - TokenConflictMap { - starting_chars_by_index, + n, status_matrix, + starting_chars_by_index: starting_chars, + following_chars_by_index: following_chars, } } + + pub fn does_match_same_string(&self, i: usize, j: usize) -> bool { + self.status_matrix[matrix_index(self.n, i, j)].matches_same_string + } + + pub fn does_match_valid_continuation(&self, i: usize, j: usize) -> bool { + self.status_matrix[matrix_index(self.n, i, j)].does_match_valid_continuation + } + + pub fn does_overlap(&self, i: usize, j: usize) -> bool { + self.status_matrix[matrix_index(self.n, i, j)].does_overlap + } +} + +impl fmt::Debug for TokenConflictMap { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "TokenConflictMap {{\n")?; + + write!(f, " starting_characters: {{\n")?; + for i in 0..self.n { + write!(f, " {}: {:?},\n", i, self.starting_chars_by_index[i])?; + } + write!(f, " }},\n")?; + + write!(f, " following_characters: {{\n")?; + for i in 0..self.n { + write!(f, " {}: {:?},\n", i, self.following_chars_by_index[i])?; + } + write!(f, " }},\n")?; + + write!(f, " status_matrix: {{\n")?; + for i in 0..self.n { + write!(f, " {}: {{\n", i)?; + for j in 0..self.n { + write!( + f, + " {}: {:?},\n", + j, + 
self.status_matrix[matrix_index(self.n, i, j)] + )?; + } + write!(f, " }},\n")?; + } + write!(f, " }},")?; + write!(f, "}}")?; + Ok(()) + } +} + +fn matrix_index(variable_count: usize, i: usize, j: usize) -> usize { + variable_count * i + j +} + +fn get_starting_chars(cursor: &mut NfaCursor, grammar: &LexicalGrammar) -> Vec { + let mut result = Vec::with_capacity(grammar.variables.len()); + for variable in &grammar.variables { + cursor.reset(vec![variable.start_state]); + let mut all_chars = CharacterSet::empty(); + for (chars, _, _) in cursor.successors() { + all_chars = all_chars.add(chars); + } + result.push(all_chars); + } + result +} + +fn get_following_chars( + starting_chars: &Vec, + following_tokens: Vec, +) -> Vec { + following_tokens + .into_iter() + .map(|following_tokens| { + let mut chars = CharacterSet::empty(); + for token in following_tokens.iter() { + if token.is_terminal() { + chars = chars.add(&starting_chars[token.index]); + } + } + chars + }) + .collect() +} + +fn compute_conflict_status( + cursor: &mut NfaCursor, + grammar: &LexicalGrammar, + following_chars: &Vec, + i: usize, + j: usize, +) -> (TokenConflictStatus, TokenConflictStatus) { + let mut visited_state_sets = HashSet::new(); + let mut state_set_queue = vec![vec![ + grammar.variables[i].start_state, + grammar.variables[j].start_state, + ]]; + let mut result = ( + TokenConflictStatus::default(), + TokenConflictStatus::default(), + ); + + while let Some(state_set) = state_set_queue.pop() { + // Don't pursue states where there's no potential for conflict. + if variable_ids_for_states(&state_set, grammar).count() > 1 { + cursor.reset(state_set); + } else { + continue; + } + + let mut completion = None; + for (id, precedence) in cursor.completions() { + if let Some((prev_id, prev_precedence)) = completion { + if id == prev_id { + continue; + } + + // Prefer tokens with higher precedence. For tokens with equal precedence, + // prefer those listed earlier in the grammar. 
+ let winning_id; + if prefer_token(grammar, (prev_precedence, prev_id), (precedence, id)) { + winning_id = prev_id; + } else { + winning_id = id; + completion = Some((id, precedence)); + } + + if winning_id == i { + result.0.matches_same_string = true; + result.0.does_overlap = true; + } else { + result.1.matches_same_string = true; + result.1.does_overlap = true; + } + } else { + completion = Some((id, precedence)); + } + } + + for (chars, advance_precedence, next_states) in cursor.grouped_successors() { + let mut can_advance = true; + if let Some((completed_id, completed_precedence)) = completion { + let mut other_id = None; + let mut successor_contains_completed_id = false; + for variable_id in variable_ids_for_states(&next_states, grammar) { + if variable_id == completed_id { + successor_contains_completed_id = true; + break; + } else { + other_id = Some(variable_id); + } + } + + if let (Some(other_id), false) = (other_id, successor_contains_completed_id) { + let winning_id; + if advance_precedence < completed_precedence { + winning_id = completed_id; + can_advance = false; + } else { + winning_id = other_id; + } + + if winning_id == i { + result.0.does_overlap = true; + if chars.does_intersect(&following_chars[j]) { + result.0.does_match_valid_continuation = true; + } + } else { + result.1.does_overlap = true; + if chars.does_intersect(&following_chars[i]) { + result.1.does_match_valid_continuation = true; + } + } + } + } + + if can_advance && visited_state_sets.insert(next_states.clone()) { + state_set_queue.push(next_states); + } + } + } + result +} + +fn prefer_token(grammar: &LexicalGrammar, left: (i32, usize), right: (i32, usize)) -> bool { + if left.0 > right.0 { + return true; + } else if left.0 < right.0 { + return false; + } + + match ( + grammar.variables[left.1].is_string, + grammar.variables[right.1].is_string, + ) { + (true, false) => return true, + (false, true) => return false, + _ => {} + } + + left.0 < right.0 +} + +fn 
variable_ids_for_states<'a>( + state_ids: &'a Vec, + grammar: &'a LexicalGrammar, +) -> impl Iterator + 'a { + let mut prev = None; + state_ids.iter().filter_map(move |state_id| { + let variable_id = grammar.variable_index_for_nfa_state(*state_id); + if prev != Some(variable_id) { + prev = Some(variable_id); + prev + } else { + None + } + }) } #[cfg(test)] @@ -42,7 +264,7 @@ mod tests { use super::*; use crate::grammars::{Variable, VariableType}; use crate::prepare_grammar::{expand_tokens, ExtractedLexicalGrammar}; - use crate::rules::Rule; + use crate::rules::{Rule, Symbol}; #[test] fn test_starting_characters() { @@ -63,7 +285,7 @@ mod tests { }) .unwrap(); - let token_map = TokenConflictMap::new(&grammar); + let token_map = TokenConflictMap::new(&grammar, Vec::new()); assert_eq!( token_map.starting_chars_by_index[0], @@ -74,4 +296,61 @@ mod tests { CharacterSet::empty().add_range('d', 'e') ); } + + #[test] + fn test_token_conflicts() { + let grammar = expand_tokens(ExtractedLexicalGrammar { + separators: Vec::new(), + variables: vec![ + Variable { + name: "in".to_string(), + kind: VariableType::Named, + rule: Rule::string("in"), + }, + Variable { + name: "identifier".to_string(), + kind: VariableType::Named, + rule: Rule::pattern("\\w+"), + }, + Variable { + name: "instanceof".to_string(), + kind: VariableType::Named, + rule: Rule::string("instanceof"), + }, + ], + }) + .unwrap(); + + let var = |name| index_of_var(&grammar, name); + + let token_map = TokenConflictMap::new( + &grammar, + vec![ + LookaheadSet::with(&[Symbol::terminal(var("identifier"))]), + LookaheadSet::with(&[Symbol::terminal(var("in"))]), + LookaheadSet::with(&[Symbol::terminal(var("identifier"))]), + ], + ); + + // Given the string "in", the `in` token is preferred over the `identifier` token + assert!(token_map.does_match_same_string(var("in"), var("identifier"))); + assert!(!token_map.does_match_same_string(var("identifier"), var("in"))); + + // Depending on what character follows, the 
string "in" may be treated as part of an + // `identifier` token. + assert!(token_map.does_match_valid_continuation(var("identifier"), var("in"))); + + // Depending on what character follows, the string "instanceof" may be treated as part of + // an `identifier` token. + assert!(token_map.does_match_valid_continuation(var("identifier"), var("instanceof"))); + assert!(token_map.does_match_valid_continuation(var("instanceof"), var("in"))); + } + + fn index_of_var(grammar: &LexicalGrammar, name: &str) -> usize { + grammar + .variables + .iter() + .position(|v| v.name == name) + .unwrap() + } } diff --git a/src/grammars.rs b/src/grammars.rs index b751e4e4..18da86d8 100644 --- a/src/grammars.rs +++ b/src/grammars.rs @@ -36,6 +36,7 @@ pub(crate) struct InputGrammar { pub(crate) struct LexicalVariable { pub name: String, pub kind: VariableType, + pub is_string: bool, pub start_state: u32, } @@ -179,6 +180,12 @@ impl Variable { } } +impl LexicalGrammar { + pub fn variable_index_for_nfa_state(&self, state_id: u32) -> usize { + self.variables.iter().position(|v| v.start_state >= state_id).unwrap() + } +} + impl SyntaxVariable { pub fn is_auxiliary(&self) -> bool { self.kind == VariableType::Auxiliary diff --git a/src/nfa.rs b/src/nfa.rs index 4a4fa17b..738d1b40 100644 --- a/src/nfa.rs +++ b/src/nfa.rs @@ -97,6 +97,19 @@ impl CharacterSet { panic!("Called add with a negated character set"); } + pub fn does_intersect(&self, other: &CharacterSet) -> bool { + match self { + CharacterSet::Include(chars) => match other { + CharacterSet::Include(other_chars) => compare_chars(chars, other_chars).common, + CharacterSet::Exclude(other_chars) => compare_chars(chars, other_chars).left_only, + }, + CharacterSet::Exclude(chars) => match other { + CharacterSet::Include(other_chars) => compare_chars(chars, other_chars).right_only, + CharacterSet::Exclude(_) => true, + }, + } + } + pub fn remove_intersection(&mut self, other: &mut CharacterSet) -> CharacterSet { match self { 
CharacterSet::Include(chars) => match other { @@ -152,14 +165,14 @@ impl Ord for CharacterSet { match self { CharacterSet::Include(chars) => { if let CharacterSet::Include(other_chars) = other { - compare_chars(chars, other_chars) + order_chars(chars, other_chars) } else { Ordering::Less } } CharacterSet::Exclude(chars) => { if let CharacterSet::Exclude(other_chars) = other { - compare_chars(chars, other_chars) + order_chars(chars, other_chars) } else { Ordering::Greater } @@ -197,7 +210,39 @@ fn remove_chars(left: &mut Vec, right: &mut Vec, mutate_right: bool) result } -fn compare_chars(chars: &Vec, other_chars: &Vec) -> Ordering { +struct SetComparision { + left_only: bool, + common: bool, + right_only: bool, +} + +fn compare_chars(left: &Vec, right: &Vec) -> SetComparision { + let mut result = SetComparision { + left_only: false, + common: false, + right_only: false, + }; + let mut left = left.iter().cloned(); + let mut right = right.iter().cloned(); + let mut i = left.next(); + let mut j = right.next(); + while let (Some(left_char), Some(right_char)) = (i, j) { + if left_char < right_char { + i = left.next(); + result.left_only = true; + } else if left_char > right_char { + j = right.next(); + result.right_only = true; + } else { + i = left.next(); + j = right.next(); + result.common = true; + } + } + result +} + +fn order_chars(chars: &Vec, other_chars: &Vec) -> Ordering { if chars.is_empty() { if other_chars.is_empty() { Ordering::Equal @@ -207,19 +252,15 @@ fn compare_chars(chars: &Vec, other_chars: &Vec) -> Ordering { } else if other_chars.is_empty() { Ordering::Greater } else { - let mut other_c = other_chars.iter(); - for c in chars.iter() { - if let Some(other_c) = other_c.next() { - let cmp = c.cmp(other_c); - if cmp != Ordering::Equal { - return cmp; - } - } else { - return Ordering::Greater; - } + let cmp = chars.len().cmp(&other_chars.len()); + if cmp != Ordering::Equal { + return cmp; } - if other_c.next().is_some() { - return Ordering::Less; + for 
(c, other_c) in chars.iter().zip(other_chars.iter()) { + let cmp = c.cmp(other_c); + if cmp != Ordering::Equal { + return cmp; + } } Ordering::Equal } @@ -233,10 +274,6 @@ impl Nfa { pub fn last_state_id(&self) -> u32 { self.states.len() as u32 - 1 } - - pub fn prepend(&mut self, f: impl Fn(u32) -> NfaState) { - self.states.push(f(self.last_state_id())); - } } impl fmt::Debug for Nfa { @@ -325,11 +362,17 @@ impl<'a> NfaCursor<'a> { while i < result.len() { let intersection = result[i].0.remove_intersection(&mut chars); if !intersection.is_empty() { - let mut states = result[i].2.clone(); - let mut precedence = result[i].1; - states.push(state); - result.insert(i, (intersection, max(precedence, prec), states)); - i += 1; + if result[i].0.is_empty() { + result[i].0 = intersection; + result[i].1 = max(result[i].1, prec); + result[i].2.push(state); + } else { + let mut states = result[i].2.clone(); + let mut precedence = result[i].1; + states.push(state); + result.insert(i, (intersection, max(precedence, prec), states)); + i += 1; + } } i += 1; } @@ -341,27 +384,18 @@ impl<'a> NfaCursor<'a> { result } - pub fn finished_id(&self) -> Option<(usize, i32)> { - let mut result = None; - for state_id in self.state_ids.iter() { + pub fn completions(&self) -> impl Iterator + '_ { + self.state_ids.iter().filter_map(move |state_id| { if let NfaState::Accept { variable_index, precedence, } = self.nfa.states[*state_id as usize] { - match result { - None => result = Some((variable_index, precedence)), - Some((existing_id, existing_precedence)) => { - if precedence > existing_precedence - || (precedence == existing_precedence && variable_index < existing_id) - { - result = Some((variable_index, precedence)) - } - } - } + Some((variable_index, precedence)) + } else { + None } - } - result + }) } pub fn in_separator(&self) -> bool { @@ -467,7 +501,7 @@ mod tests { } #[test] - fn test_character_set_intersection() { + fn test_character_set_remove_intersection() { // whitelist - whitelist 
// both sets contain 'c', 'd', and 'f' let mut a = CharacterSet::empty().add_range('a', 'f'); @@ -529,4 +563,46 @@ mod tests { assert_eq!(a, CharacterSet::Include(vec!['f', 'g', 'h'])); assert_eq!(b, CharacterSet::Include(vec!['a', 'b'])); } + + #[test] + fn test_character_set_does_intersect() { + let (a, b) = (CharacterSet::empty(), CharacterSet::empty()); + assert!(!a.does_intersect(&b)); + assert!(!b.does_intersect(&a)); + + let (a, b) = ( + CharacterSet::empty().add_char('a'), + CharacterSet::empty().add_char('a'), + ); + assert!(a.does_intersect(&b)); + assert!(b.does_intersect(&a)); + + let (a, b) = ( + CharacterSet::empty().add_char('b'), + CharacterSet::empty().add_char('a').add_char('c'), + ); + assert!(!a.does_intersect(&b)); + assert!(!b.does_intersect(&a)); + + let (a, b) = ( + CharacterSet::Include(vec!['b']), + CharacterSet::Exclude(vec!['a', 'b', 'c']), + ); + assert!(!a.does_intersect(&b)); + assert!(!b.does_intersect(&a)); + + let (a, b) = ( + CharacterSet::Include(vec!['b']), + CharacterSet::Exclude(vec!['a', 'c']), + ); + assert!(a.does_intersect(&b)); + assert!(b.does_intersect(&a)); + + let (a, b) = ( + CharacterSet::Exclude(vec!['a']), + CharacterSet::Exclude(vec!['a']), + ); + assert!(a.does_intersect(&b)); + assert!(b.does_intersect(&a)); + } } diff --git a/src/prepare_grammar/expand_tokens.rs b/src/prepare_grammar/expand_tokens.rs index b0d2ae04..2b7e7b4d 100644 --- a/src/prepare_grammar/expand_tokens.rs +++ b/src/prepare_grammar/expand_tokens.rs @@ -13,6 +13,14 @@ struct NfaBuilder { precedence_stack: Vec, } +fn is_string(rule: &Rule) -> bool { + match rule { + Rule::String(_) => true, + Rule::Metadata { rule, .. } => is_string(rule), + _ => false + } +} + pub(crate) fn expand_tokens(mut grammar: ExtractedLexicalGrammar) -> Result { let mut builder = NfaBuilder { nfa: Nfa::new(), @@ -58,6 +66,7 @@ pub(crate) fn expand_tokens(mut grammar: ExtractedLexicalGrammar) -> Result Result { if self.expand_regex(ast, next_state_id)? 
{ - self.nfa - .prepend(|last_state_id| NfaState::Split(next_state_id, last_state_id)); + self.push_split(next_state_id); Ok(true) } else { Ok(false) @@ -265,8 +269,7 @@ impl NfaBuilder { fn expand_zero_or_more(&mut self, ast: &Ast, next_state_id: u32) -> Result { if self.expand_one_or_more(&ast, next_state_id)? { - self.nfa - .prepend(|last_state_id| NfaState::Split(last_state_id, next_state_id)); + self.push_split(next_state_id); Ok(true) } else { Ok(false) @@ -333,6 +336,11 @@ impl NfaBuilder { }); } + fn push_split(&mut self, state_id: u32) { + let last_state_id = self.nfa.last_state_id(); + self.nfa.states.push(NfaState::Split(state_id, last_state_id)); + } + fn add_precedence(&mut self, prec: i32, mut state_ids: Vec) { let mut i = 0; while i < state_ids.len() { @@ -371,10 +379,10 @@ mod tests { let mut start_char = 0; let mut end_char = 0; for c in s.chars() { - if let Some((id, finished_precedence)) = cursor.finished_id() { - if result.is_none() || result_precedence <= finished_precedence { + for (id, precedence) in cursor.completions() { + if result.is_none() || result_precedence <= precedence { result = Some((id, &s[start_char..end_char])); - result_precedence = finished_precedence; + result_precedence = precedence; } } if cursor.advance(c) { @@ -387,10 +395,10 @@ mod tests { } } - if let Some((id, finished_precedence)) = cursor.finished_id() { - if result.is_none() || result_precedence <= finished_precedence { + for (id, precedence) in cursor.completions() { + if result.is_none() || result_precedence <= precedence { result = Some((id, &s[start_char..end_char])); - result_precedence = finished_precedence; + result_precedence = precedence; } } diff --git a/src/prepare_grammar/extract_simple_aliases.rs b/src/prepare_grammar/extract_simple_aliases.rs index ff7204a0..ee748f5d 100644 --- a/src/prepare_grammar/extract_simple_aliases.rs +++ b/src/prepare_grammar/extract_simple_aliases.rs @@ -137,16 +137,19 @@ mod tests { LexicalVariable { name: "t1".to_string(), 
kind: VariableType::Anonymous, + is_string: true, start_state: 0, }, LexicalVariable { name: "t2".to_string(), kind: VariableType::Anonymous, + is_string: true, start_state: 0, }, LexicalVariable { name: "t3".to_string(), kind: VariableType::Anonymous, + is_string: true, start_state: 0, } ], From a46b8fcb46a1f8799bd50ebe7e04e7cddf4bff2d Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Tue, 1 Jan 2019 13:47:29 -0800 Subject: [PATCH 080/208] Implement parse state merging --- src/build_tables/build_parse_table.rs | 13 +- src/build_tables/coincident_tokens.rs | 36 ++++ src/build_tables/item.rs | 32 +++- src/build_tables/mod.rs | 88 +++++++++- src/build_tables/shrink_parse_table.rs | 158 +++++++++++++++++- ...ken_conflict_map.rs => token_conflicts.rs} | 23 ++- src/nfa.rs | 41 +++-- src/prepare_grammar/expand_tokens.rs | 12 +- src/tables.rs | 1 + 9 files changed, 364 insertions(+), 40 deletions(-) create mode 100644 src/build_tables/coincident_tokens.rs rename src/build_tables/{token_conflict_map.rs => token_conflicts.rs} (92%) diff --git a/src/build_tables/build_parse_table.rs b/src/build_tables/build_parse_table.rs index a7911689..2fe6fd8d 100644 --- a/src/build_tables/build_parse_table.rs +++ b/src/build_tables/build_parse_table.rs @@ -7,7 +7,8 @@ use crate::tables::{ AliasSequenceId, ParseAction, ParseState, ParseStateId, ParseTable, ParseTableEntry, }; use core::ops::Range; -use std::collections::hash_map::Entry; +use std::hash::Hasher; +use std::collections::hash_map::{Entry, DefaultHasher}; use std::collections::{HashMap, HashSet, VecDeque}; use std::fmt::Write; @@ -44,14 +45,13 @@ impl<'a> ParseTableBuilder<'a> { self.parse_table.alias_sequences.push(Vec::new()); // Ensure that the error state has index 0. 
- let error_state_id = - self.add_parse_state(&Vec::new(), &Vec::new(), ParseItemSet::default()); + self.add_parse_state(&Vec::new(), &Vec::new(), ParseItemSet::default()); self.add_parse_state( &Vec::new(), &Vec::new(), ParseItemSet::with( - [(ParseItem::start(), LookaheadSet::with(&[Symbol::end()]))] + [(ParseItem::start(), LookaheadSet::with([Symbol::end()].iter().cloned()))] .iter() .cloned(), ), @@ -78,6 +78,10 @@ impl<'a> ParseTableBuilder<'a> { } } + let mut hasher = DefaultHasher::new(); + item_set.hash_unfinished_items(&mut hasher); + let unfinished_item_signature = hasher.finish(); + match self.state_ids_by_item_set.entry(item_set) { Entry::Occupied(o) => *o.get(), Entry::Vacant(v) => { @@ -87,6 +91,7 @@ impl<'a> ParseTableBuilder<'a> { lex_state_id: 0, terminal_entries: HashMap::new(), nonterminal_entries: HashMap::new(), + unfinished_item_signature, }); self.parse_state_queue.push_back(ParseStateQueueEntry { state_id, diff --git a/src/build_tables/coincident_tokens.rs b/src/build_tables/coincident_tokens.rs new file mode 100644 index 00000000..10707489 --- /dev/null +++ b/src/build_tables/coincident_tokens.rs @@ -0,0 +1,36 @@ +use crate::rules::Symbol; +use crate::tables::{ParseStateId, ParseTable}; +use std::collections::{HashMap, HashSet}; + +pub(crate) struct CoincidentTokenIndex { + entries: HashMap<(Symbol, Symbol), HashSet>, + empty: HashSet, +} + +impl CoincidentTokenIndex { + pub fn new(table: &ParseTable) -> Self { + let mut entries = HashMap::new(); + for (i, state) in table.states.iter().enumerate() { + for symbol in state.terminal_entries.keys() { + for other_symbol in state.terminal_entries.keys() { + entries + .entry((*symbol, *other_symbol)) + .or_insert(HashSet::new()) + .insert(i); + } + } + } + Self { + entries, + empty: HashSet::new(), + } + } + + pub fn states_with(&self, a: Symbol, b: Symbol) -> &HashSet { + self.entries.get(&(a, b)).unwrap_or(&self.empty) + } + + pub fn contains(&self, a: Symbol, b: Symbol) -> bool { + 
self.entries.contains_key(&(a, b)) + } +} diff --git a/src/build_tables/item.rs b/src/build_tables/item.rs index 28723d24..4cd2f643 100644 --- a/src/build_tables/item.rs +++ b/src/build_tables/item.rs @@ -2,11 +2,11 @@ use crate::grammars::{LexicalGrammar, Production, ProductionStep, SyntaxGrammar} use crate::rules::Associativity; use crate::rules::{Symbol, SymbolType}; use smallbitvec::SmallBitVec; +use std::cmp::Ordering; use std::collections::BTreeMap; use std::fmt; use std::hash::{Hash, Hasher}; use std::u32; -use std::cmp::Ordering; lazy_static! { static ref START_PRODUCTION: Production = Production { @@ -85,10 +85,10 @@ impl LookaheadSet { .chain(if self.eof { Some(Symbol::end()) } else { None }) } - pub fn with<'a>(symbols: impl IntoIterator) -> Self { + pub fn with(symbols: impl IntoIterator) -> Self { let mut result = Self::new(); for symbol in symbols { - result.insert(*symbol); + result.insert(symbol); } result } @@ -219,6 +219,21 @@ impl<'a> ParseItemSet<'a> { result } + pub fn hash_unfinished_items(&self, h: &mut impl Hasher) { + let mut previous_variable_index = u32::MAX; + let mut previous_step_index = u32::MAX; + for item in self.entries.keys() { + if item.step().is_none() && item.variable_index != previous_variable_index + || item.step_index != previous_step_index + { + h.write_u32(item.variable_index); + h.write_u32(item.step_index); + previous_variable_index = item.variable_index; + previous_step_index = item.step_index; + } + } + } + pub fn display_with( &'a self, syntax_grammar: &'a SyntaxGrammar, @@ -369,11 +384,18 @@ impl<'a> Ord for ParseItem<'a> { if o != Ordering::Equal { return o; } - let o = self.production.dynamic_precedence.cmp(&other.production.dynamic_precedence); + let o = self + .production + .dynamic_precedence + .cmp(&other.production.dynamic_precedence); if o != Ordering::Equal { return o; } - let o = self.production.steps.len().cmp(&other.production.steps.len()); + let o = self + .production + .steps + .len() + 
.cmp(&other.production.steps.len()); if o != Ordering::Equal { return o; } diff --git a/src/build_tables/mod.rs b/src/build_tables/mod.rs index d1983068..665c56a0 100644 --- a/src/build_tables/mod.rs +++ b/src/build_tables/mod.rs @@ -1,18 +1,20 @@ -use crate::error::Result; -use crate::grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar}; -use crate::rules::{AliasMap, Symbol}; -use crate::tables::{LexTable, ParseTable}; - mod build_parse_table; +mod coincident_tokens; mod item; mod item_set_builder; mod lex_table_builder; mod shrink_parse_table; -mod token_conflict_map; +mod token_conflicts; use self::build_parse_table::build_parse_table; +use self::coincident_tokens::CoincidentTokenIndex; +use self::item::LookaheadSet; use self::shrink_parse_table::shrink_parse_table; -use self::token_conflict_map::TokenConflictMap; +use self::token_conflicts::TokenConflictMap; +use crate::error::Result; +use crate::grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar}; +use crate::rules::{AliasMap, Symbol}; +use crate::tables::{LexTable, ParseAction, ParseTable, ParseTableEntry}; pub(crate) fn build_tables( syntax_grammar: &SyntaxGrammar, @@ -23,6 +25,76 @@ pub(crate) fn build_tables( let (mut parse_table, following_tokens) = build_parse_table(syntax_grammar, lexical_grammar, inlines)?; let token_conflict_map = TokenConflictMap::new(lexical_grammar, following_tokens); - shrink_parse_table(&mut parse_table, syntax_grammar, simple_aliases); + let coincident_token_index = CoincidentTokenIndex::new(&parse_table); + populate_error_state( + &mut parse_table, + syntax_grammar, + lexical_grammar, + &coincident_token_index, + &token_conflict_map, + ); + shrink_parse_table( + &mut parse_table, + syntax_grammar, + simple_aliases, + &token_conflict_map, + ); Ok((parse_table, LexTable::default(), LexTable::default(), None)) } + +fn populate_error_state( + parse_table: &mut ParseTable, + syntax_grammar: &SyntaxGrammar, + lexical_grammar: &LexicalGrammar, + 
coincident_token_index: &CoincidentTokenIndex, + token_conflict_map: &TokenConflictMap, +) { + let state = &mut parse_table.states[0]; + let n = lexical_grammar.variables.len(); + let conflict_free_tokens = LookaheadSet::with((0..n).into_iter().filter_map(|i| { + let conflicts_with_other_tokens = (0..n).into_iter().all(|j| { + j == i + || coincident_token_index.contains(Symbol::terminal(i), Symbol::terminal(j)) + || !token_conflict_map.does_conflict(i, j) + }); + if conflicts_with_other_tokens { + None + } else { + Some(Symbol::terminal(i)) + } + })); + + let recover_entry = ParseTableEntry { + reusable: false, + actions: vec![ParseAction::Recover], + }; + + for i in 0..n { + let symbol = Symbol::terminal(i); + let can_be_used_for_recovery = conflict_free_tokens.contains(&symbol) + || conflict_free_tokens.iter().all(|t| { + coincident_token_index.contains(symbol, t) + || !token_conflict_map.does_conflict(i, t.index) + }); + if can_be_used_for_recovery { + eprintln!("include {}", &lexical_grammar.variables[symbol.index].name); + state + .terminal_entries + .entry(symbol) + .or_insert_with(|| recover_entry.clone()); + } else { + eprintln!("exclude {}", &lexical_grammar.variables[symbol.index].name); + } + } + + for (i, external_token) in syntax_grammar.external_tokens.iter().enumerate() { + if external_token.corresponding_internal_token.is_none() { + state + .terminal_entries + .entry(Symbol::external(i)) + .or_insert_with(|| recover_entry.clone()); + } + } + + state.terminal_entries.insert(Symbol::end(), recover_entry); +} diff --git a/src/build_tables/shrink_parse_table.rs b/src/build_tables/shrink_parse_table.rs index 8e826f5c..026c3058 100644 --- a/src/build_tables/shrink_parse_table.rs +++ b/src/build_tables/shrink_parse_table.rs @@ -1,14 +1,17 @@ +use super::token_conflicts::TokenConflictMap; use crate::grammars::{SyntaxGrammar, VariableType}; -use crate::rules::AliasMap; -use crate::tables::{ParseAction, ParseTable}; +use crate::rules::{AliasMap, Symbol}; +use 
crate::tables::{ParseAction, ParseState, ParseTable, ParseTableEntry}; use std::collections::{HashMap, HashSet}; pub(crate) fn shrink_parse_table( parse_table: &mut ParseTable, syntax_grammar: &SyntaxGrammar, simple_aliases: &AliasMap, + token_conflict_map: &TokenConflictMap, ) { remove_unit_reductions(parse_table, syntax_grammar, simple_aliases); + merge_compatible_states(parse_table, syntax_grammar, token_conflict_map); remove_unused_states(parse_table); } @@ -86,6 +89,157 @@ fn remove_unit_reductions( } } +fn merge_compatible_states( + parse_table: &mut ParseTable, + syntax_grammar: &SyntaxGrammar, + token_conflict_map: &TokenConflictMap, +) { + let mut state_ids_by_signature = HashMap::new(); + for (i, state) in parse_table.states.iter().enumerate() { + state_ids_by_signature + .entry(state.unfinished_item_signature) + .or_insert(Vec::new()) + .push(i); + } + + let mut deleted_states = HashSet::new(); + loop { + let mut state_replacements = HashMap::new(); + for (_, state_ids) in &state_ids_by_signature { + for i in state_ids { + for j in state_ids { + if j == i { + break; + } + if deleted_states.contains(j) || deleted_states.contains(i) { + continue; + } + if merge_parse_state(syntax_grammar, token_conflict_map, parse_table, *j, *i) { + deleted_states.insert(*i); + state_replacements.insert(*i, *j); + } + } + } + } + + if state_replacements.is_empty() { + break; + } + + for state in parse_table.states.iter_mut() { + state.update_referenced_states(|other_state_id, _| { + *state_replacements + .get(&other_state_id) + .unwrap_or(&other_state_id) + }); + } + } +} + +fn merge_parse_state( + syntax_grammar: &SyntaxGrammar, + token_conflict_map: &TokenConflictMap, + parse_table: &mut ParseTable, + left: usize, + right: usize, +) -> bool { + let left_state = &parse_table.states[left]; + let right_state = &parse_table.states[right]; + + if left_state.nonterminal_entries != right_state.nonterminal_entries { + return false; + } + + for (symbol, left_entry) in 
&left_state.terminal_entries { + if let Some(right_entry) = right_state.terminal_entries.get(symbol) { + if right_entry.actions != left_entry.actions { + return false; + } + } else if !can_add_entry_to_state( + syntax_grammar, + token_conflict_map, + right_state, + *symbol, + left_entry, + ) { + return false; + } + } + + eprintln!("maybe merge {} {}", left, right); + + let mut symbols_to_add = Vec::new(); + for (symbol, right_entry) in &right_state.terminal_entries { + if !left_state.terminal_entries.contains_key(&symbol) { + if !can_add_entry_to_state( + syntax_grammar, + token_conflict_map, + left_state, + *symbol, + right_entry, + ) { + return false; + } + symbols_to_add.push(*symbol); + } + } + + for symbol in symbols_to_add { + let entry = parse_table.states[right].terminal_entries[&symbol].clone(); + parse_table.states[left] + .terminal_entries + .insert(symbol, entry); + } + + true +} + +fn can_add_entry_to_state( + syntax_grammar: &SyntaxGrammar, + token_conflict_map: &TokenConflictMap, + state: &ParseState, + token: Symbol, + entry: &ParseTableEntry, +) -> bool { + // Do not add external tokens; they could conflict lexically with any of the state's + // existing lookahead tokens. + if token.is_external() { + return false; + } + + // Only merge parse states by allowing existing reductions to happen + // with additional lookahead tokens. Do not alter parse states in ways + // that allow entirely new types of actions to happen. + if state.terminal_entries.iter().all(|(_, e)| e != entry) { + return false; + } + match entry.actions.last() { + Some(ParseAction::Reduce { .. }) => {} + _ => return false, + } + + // Do not add tokens which are both internal and external. Their validity could + // influence the behavior of the external scanner. + if syntax_grammar + .external_tokens + .iter() + .any(|t| t.corresponding_internal_token == Some(token)) + { + return false; + } + + // Do not add a token if it conflicts with an existing token. 
+ if token.is_terminal() { + for existing_token in state.terminal_entries.keys() { + if token_conflict_map.does_conflict(token.index, existing_token.index) { + return false; + } + } + } + + true +} + fn remove_unused_states(parse_table: &mut ParseTable) { let mut state_usage_map = vec![false; parse_table.states.len()]; for state in &parse_table.states { diff --git a/src/build_tables/token_conflict_map.rs b/src/build_tables/token_conflicts.rs similarity index 92% rename from src/build_tables/token_conflict_map.rs rename to src/build_tables/token_conflicts.rs index 52c68cc7..09d5e97c 100644 --- a/src/build_tables/token_conflict_map.rs +++ b/src/build_tables/token_conflicts.rs @@ -8,6 +8,7 @@ use std::fmt; struct TokenConflictStatus { does_overlap: bool, does_match_valid_continuation: bool, + does_match_separators: bool, matches_same_string: bool, } @@ -46,8 +47,9 @@ impl TokenConflictMap { self.status_matrix[matrix_index(self.n, i, j)].matches_same_string } - pub fn does_match_valid_continuation(&self, i: usize, j: usize) -> bool { - self.status_matrix[matrix_index(self.n, i, j)].does_match_valid_continuation + pub fn does_conflict(&self, i: usize, j: usize) -> bool { + let entry = &self.status_matrix[matrix_index(self.n, i, j)]; + entry.does_match_valid_continuation || entry.does_match_separators } pub fn does_overlap(&self, i: usize, j: usize) -> bool { @@ -207,10 +209,15 @@ fn compute_conflict_status( if chars.does_intersect(&following_chars[j]) { result.0.does_match_valid_continuation = true; } + if cursor.in_separator() { + result.0.does_match_separators = true; + } } else { result.1.does_overlap = true; if chars.does_intersect(&following_chars[i]) { result.1.does_match_valid_continuation = true; + } else { + result.1.does_match_separators = true; } } } @@ -326,9 +333,9 @@ mod tests { let token_map = TokenConflictMap::new( &grammar, vec![ - LookaheadSet::with(&[Symbol::terminal(var("identifier"))]), - LookaheadSet::with(&[Symbol::terminal(var("in"))]), - 
LookaheadSet::with(&[Symbol::terminal(var("identifier"))]), + LookaheadSet::with([Symbol::terminal(var("identifier"))].iter().cloned()), + LookaheadSet::with([Symbol::terminal(var("in"))].iter().cloned()), + LookaheadSet::with([Symbol::terminal(var("identifier"))].iter().cloned()), ], ); @@ -338,12 +345,12 @@ mod tests { // Depending on what character follows, the string "in" may be treated as part of an // `identifier` token. - assert!(token_map.does_match_valid_continuation(var("identifier"), var("in"))); + assert!(token_map.does_conflict(var("identifier"), var("in"))); // Depending on what character follows, the string "instanceof" may be treated as part of // an `identifier` token. - assert!(token_map.does_match_valid_continuation(var("identifier"), var("instanceof"))); - assert!(token_map.does_match_valid_continuation(var("instanceof"), var("in"))); + assert!(token_map.does_conflict(var("identifier"), var("instanceof"))); + assert!(token_map.does_conflict(var("instanceof"), var("in"))); } fn index_of_var(grammar: &LexicalGrammar, name: &str) -> usize { diff --git a/src/nfa.rs b/src/nfa.rs index 738d1b40..ee39d178 100644 --- a/src/nfa.rs +++ b/src/nfa.rs @@ -86,15 +86,34 @@ impl CharacterSet { } pub fn add(self, other: &CharacterSet) -> Self { - if let CharacterSet::Include(other_chars) = other { - if let CharacterSet::Include(mut chars) = self { - chars.extend(other_chars); - chars.sort_unstable(); - chars.dedup(); - return CharacterSet::Include(chars); - } + match self { + CharacterSet::Include(mut chars) => match other { + CharacterSet::Include(other_chars) => { + chars.extend(other_chars); + chars.sort_unstable(); + chars.dedup(); + CharacterSet::Include(chars) + } + CharacterSet::Exclude(other_chars) => { + let excluded_chars = other_chars + .iter() + .cloned() + .filter(|c| !chars.contains(&c)) + .collect(); + CharacterSet::Exclude(excluded_chars) + } + }, + CharacterSet::Exclude(mut chars) => match other { + CharacterSet::Include(other_chars) => { + 
chars.retain(|c| !other_chars.contains(&c)); + CharacterSet::Exclude(chars) + } + CharacterSet::Exclude(other_chars) => { + chars.retain(|c| other_chars.contains(&c)); + CharacterSet::Exclude(chars) + }, + }, } - panic!("Called add with a negated character set"); } pub fn does_intersect(&self, other: &CharacterSet) -> bool { @@ -458,6 +477,9 @@ mod tests { (CharacterSet::empty().add_char('f'), 0, 4), ], vec![ + (CharacterSet::empty().add_char('d'), 0, vec![1, 2]), + (CharacterSet::empty().add_char('f'), 0, vec![1, 4]), + (CharacterSet::empty().add_char('i'), 0, vec![1, 3]), ( CharacterSet::empty() .add_range('a', 'c') @@ -467,9 +489,6 @@ mod tests { 0, vec![1], ), - (CharacterSet::empty().add_char('d'), 0, vec![1, 2]), - (CharacterSet::empty().add_char('f'), 0, vec![1, 4]), - (CharacterSet::empty().add_char('i'), 0, vec![1, 3]), ], ), ]; diff --git a/src/prepare_grammar/expand_tokens.rs b/src/prepare_grammar/expand_tokens.rs index 2b7e7b4d..4ef17b27 100644 --- a/src/prepare_grammar/expand_tokens.rs +++ b/src/prepare_grammar/expand_tokens.rs @@ -164,12 +164,20 @@ impl NfaBuilder { Err(Error::regex("Unicode character classes are not supported")) } Class::Perl(class) => { - self.push_advance(self.expand_perl_character_class(&class.kind), next_state_id); + let mut chars = self.expand_perl_character_class(&class.kind); + if class.negated { + chars = chars.negate(); + } + self.push_advance(chars, next_state_id); Ok(true) } Class::Bracketed(class) => match &class.kind { ClassSet::Item(item) => { - self.push_advance(self.expand_character_class(&item)?, next_state_id); + let mut chars = self.expand_character_class(&item)?; + if class.negated { + chars = chars.negate(); + } + self.push_advance(chars, next_state_id); Ok(true) } ClassSet::BinaryOp(_) => Err(Error::regex( diff --git a/src/tables.rs b/src/tables.rs index 0815aac8..344c4816 100644 --- a/src/tables.rs +++ b/src/tables.rs @@ -37,6 +37,7 @@ pub(crate) struct ParseState { pub terminal_entries: HashMap, pub 
nonterminal_entries: HashMap, pub lex_state_id: usize, + pub unfinished_item_signature: u64, } #[derive(Debug, PartialEq, Eq)] From 9824ebbbc31f7cda43f8a5aa5b3847462ab4c6aa Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Wed, 2 Jan 2019 12:34:40 -0800 Subject: [PATCH 081/208] Implement lex table construction --- src/build_tables/build_lex_table.rs | 124 ++++++++++++++++ src/build_tables/build_parse_table.rs | 31 ++-- src/build_tables/item_set_builder.rs | 20 +-- src/build_tables/lex_table_builder.rs | 24 --- src/build_tables/mod.rs | 131 ++++++++++++++++- src/build_tables/shrink_parse_table.rs | 2 - src/build_tables/token_conflicts.rs | 80 +++++----- src/grammars.rs | 10 +- src/main.rs | 2 +- src/nfa.rs | 130 ++++++----------- src/prepare_grammar/expand_tokens.rs | 24 ++- src/prepare_grammar/extract_tokens.rs | 17 ++- src/render/mod.rs | 195 +++++++++++++++++++++++-- src/rules.rs | 3 + src/tables.rs | 15 +- 15 files changed, 581 insertions(+), 227 deletions(-) create mode 100644 src/build_tables/build_lex_table.rs delete mode 100644 src/build_tables/lex_table_builder.rs diff --git a/src/build_tables/build_lex_table.rs b/src/build_tables/build_lex_table.rs new file mode 100644 index 00000000..aa929d97 --- /dev/null +++ b/src/build_tables/build_lex_table.rs @@ -0,0 +1,124 @@ +use super::item::LookaheadSet; +use super::token_conflicts::TokenConflictMap; +use crate::grammars::{LexicalGrammar, SyntaxGrammar}; +use crate::nfa::NfaCursor; +use crate::rules::Symbol; +use crate::tables::{AdvanceAction, LexState, LexTable, ParseTable}; +use std::collections::hash_map::Entry; +use std::collections::{HashMap, VecDeque}; + +pub(crate) fn build_lex_table( + parse_table: &mut ParseTable, + syntax_grammar: &SyntaxGrammar, + lexical_grammar: &LexicalGrammar, + keywords: &LookaheadSet, +) -> (LexTable, LexTable) { + let keyword_lex_table; + if syntax_grammar.word_token.is_some() { + let mut builder = LexTableBuilder::new(lexical_grammar); + 
builder.add_state_for_tokens(keywords.iter()); + keyword_lex_table = builder.table; + } else { + keyword_lex_table = LexTable::default(); + } + + let mut builder = LexTableBuilder::new(lexical_grammar); + for state in parse_table.states.iter_mut() { + let tokens = state.terminal_entries.keys().filter_map(|token| { + if token.is_terminal() { + if keywords.contains(&token) { + syntax_grammar.word_token + } else { + Some(*token) + } + } else { + None + } + }); + state.lex_state_id = builder.add_state_for_tokens(tokens); + } + + (builder.table, keyword_lex_table) +} + +struct LexTableBuilder<'a> { + lexical_grammar: &'a LexicalGrammar, + cursor: NfaCursor<'a>, + table: LexTable, + state_queue: VecDeque<(usize, Vec)>, + state_ids_by_nfa_state_set: HashMap, usize>, +} + +impl<'a> LexTableBuilder<'a> { + fn new(lexical_grammar: &'a LexicalGrammar) -> Self { + Self { + lexical_grammar, + cursor: NfaCursor::new(&lexical_grammar.nfa, vec![]), + table: LexTable::default(), + state_queue: VecDeque::new(), + state_ids_by_nfa_state_set: HashMap::new(), + } + } + + fn add_state_for_tokens(&mut self, tokens: impl Iterator) -> usize { + let nfa_states = tokens + .map(|token| self.lexical_grammar.variables[token.index].start_state) + .collect(); + let result = self.add_state(nfa_states); + while let Some((state_id, nfa_states)) = self.state_queue.pop_front() { + self.populate_state(state_id, nfa_states); + } + result + } + + fn add_state(&mut self, nfa_states: Vec) -> usize { + match self.state_ids_by_nfa_state_set.entry(nfa_states) { + Entry::Occupied(o) => *o.get(), + Entry::Vacant(v) => { + let state_id = self.table.states.len(); + self.table.states.push(LexState::default()); + self.state_queue.push_back((state_id, v.key().clone())); + v.insert(state_id); + state_id + } + } + } + + fn populate_state(&mut self, state_id: usize, nfa_states: Vec) { + self.cursor.reset(nfa_states); + + let mut completion = None; + for (id, prec) in self.cursor.completions() { + if let Some((prev_id, 
prev_precedence)) = completion { + if TokenConflictMap::prefer_token( + self.lexical_grammar, + (prev_precedence, prev_id), + (prec, id), + ) { + continue; + } + } + completion = Some((id, prec)); + } + + for (chars, advance_precedence, next_states, is_sep) in self.cursor.grouped_successors() { + if let Some((_, completed_precedence)) = completion { + if advance_precedence < completed_precedence { + continue; + } + } + let next_state_id = self.add_state(next_states); + self.table.states[state_id].advance_actions.push(( + chars, + AdvanceAction { + state: next_state_id, + in_main_token: !is_sep, + }, + )); + } + + if let Some((completion_index, _)) = completion { + self.table.states[state_id].accept_action = Some(completion_index); + } + } +} diff --git a/src/build_tables/build_parse_table.rs b/src/build_tables/build_parse_table.rs index 2fe6fd8d..c17261dc 100644 --- a/src/build_tables/build_parse_table.rs +++ b/src/build_tables/build_parse_table.rs @@ -7,10 +7,10 @@ use crate::tables::{ AliasSequenceId, ParseAction, ParseState, ParseStateId, ParseTable, ParseTableEntry, }; use core::ops::Range; -use std::hash::Hasher; -use std::collections::hash_map::{Entry, DefaultHasher}; +use std::collections::hash_map::{DefaultHasher, Entry}; use std::collections::{HashMap, HashSet, VecDeque}; use std::fmt::Write; +use std::hash::Hasher; #[derive(Clone)] struct AuxiliarySymbolInfo { @@ -31,7 +31,6 @@ struct ParseTableBuilder<'a> { item_set_builder: ParseItemSetBuilder<'a>, syntax_grammar: &'a SyntaxGrammar, lexical_grammar: &'a LexicalGrammar, - inlines: &'a InlinedProductionMap, state_ids_by_item_set: HashMap, ParseStateId>, item_sets_by_state_id: Vec>, parse_state_queue: VecDeque, @@ -51,9 +50,12 @@ impl<'a> ParseTableBuilder<'a> { &Vec::new(), &Vec::new(), ParseItemSet::with( - [(ParseItem::start(), LookaheadSet::with([Symbol::end()].iter().cloned()))] - .iter() - .cloned(), + [( + ParseItem::start(), + LookaheadSet::with([Symbol::end()].iter().cloned()), + )] + .iter() + 
.cloned(), ), ); @@ -69,8 +71,12 @@ impl<'a> ParseTableBuilder<'a> { item_set: ParseItemSet<'a>, ) -> ParseStateId { if preceding_symbols.len() > 1 { - let left_tokens = self.item_set_builder.last_set(&preceding_symbols[preceding_symbols.len() - 2]); - let right_tokens = self.item_set_builder.first_set(&preceding_symbols[preceding_symbols.len() - 1]); + let left_tokens = self + .item_set_builder + .last_set(&preceding_symbols[preceding_symbols.len() - 2]); + let right_tokens = self + .item_set_builder + .first_set(&preceding_symbols[preceding_symbols.len() - 1]); for left_token in left_tokens.iter() { if left_token.is_terminal() { self.following_tokens[left_token.index].insert_all(right_tokens); @@ -117,11 +123,9 @@ impl<'a> ParseTableBuilder<'a> { ); } - let item_set = self.item_set_builder.transitive_closure( - &self.item_sets_by_state_id[entry.state_id], - self.syntax_grammar, - self.inlines, - ); + let item_set = self + .item_set_builder + .transitive_closure(&self.item_sets_by_state_id[entry.state_id]); if debug { println!( @@ -606,7 +610,6 @@ pub(crate) fn build_parse_table( ParseTableBuilder { syntax_grammar, lexical_grammar, - inlines, item_set_builder: ParseItemSetBuilder::new(syntax_grammar, lexical_grammar, inlines), state_ids_by_item_set: HashMap::new(), item_sets_by_state_id: Vec::new(), diff --git a/src/build_tables/item_set_builder.rs b/src/build_tables/item_set_builder.rs index 8649cb52..5e61bfcc 100644 --- a/src/build_tables/item_set_builder.rs +++ b/src/build_tables/item_set_builder.rs @@ -18,6 +18,7 @@ struct FollowSetInfo { pub(crate) struct ParseItemSetBuilder<'a> { first_sets: HashMap, last_sets: HashMap, + inlines: &'a InlinedProductionMap, transitive_closure_additions: Vec>>, } @@ -36,6 +37,7 @@ impl<'a> ParseItemSetBuilder<'a> { let mut result = Self { first_sets: HashMap::new(), last_sets: HashMap::new(), + inlines, transitive_closure_additions: vec![Vec::new(); syntax_grammar.variables.len()], }; @@ -237,15 +239,12 @@ impl<'a> 
ParseItemSetBuilder<'a> { result } - pub(crate) fn transitive_closure( - &mut self, - item_set: &ParseItemSet<'a>, - grammar: &'a SyntaxGrammar, - inlines: &'a InlinedProductionMap, - ) -> ParseItemSet<'a> { + pub(crate) fn transitive_closure(&mut self, item_set: &ParseItemSet<'a>) -> ParseItemSet<'a> { let mut result = ParseItemSet::default(); for (item, lookaheads) in &item_set.entries { - if let Some(productions) = inlines.inlined_productions(item.production, item.step_index) + if let Some(productions) = self + .inlines + .inlined_productions(item.production, item.step_index) { for production in productions { self.add_item( @@ -273,12 +272,7 @@ impl<'a> ParseItemSetBuilder<'a> { &self.first_sets[symbol] } - fn add_item( - &self, - set: &mut ParseItemSet<'a>, - item: ParseItem<'a>, - lookaheads: &LookaheadSet, - ) { + fn add_item(&self, set: &mut ParseItemSet<'a>, item: ParseItem<'a>, lookaheads: &LookaheadSet) { if let Some(step) = item.step() { if step.symbol.is_non_terminal() { let next_step = item.successor().step(); diff --git a/src/build_tables/lex_table_builder.rs b/src/build_tables/lex_table_builder.rs deleted file mode 100644 index 86d1578b..00000000 --- a/src/build_tables/lex_table_builder.rs +++ /dev/null @@ -1,24 +0,0 @@ -use crate::rules::Symbol; -use crate::tables::LexTable; -use crate::grammars::{SyntaxGrammar, LexicalGrammar}; - -pub(crate) struct LexTableBuilder<'a> { - syntax_grammar: &'a SyntaxGrammar, - lexical_grammar: &'a LexicalGrammar, - table: LexTable, -} - -impl<'a> LexTableBuilder<'a> { - pub fn new( - syntax_grammar: &'a SyntaxGrammar, - lexical_grammar: &'a LexicalGrammar, - ) -> Self { - Self { - syntax_grammar, lexical_grammar, table: LexTable::default() - } - } - - pub fn build(self) -> (LexTable, LexTable, Option) { - (LexTable::default(), LexTable::default(), None) - } -} diff --git a/src/build_tables/mod.rs b/src/build_tables/mod.rs index 665c56a0..8b3a2db4 100644 --- a/src/build_tables/mod.rs +++ b/src/build_tables/mod.rs @@ 
-1,11 +1,12 @@ +mod build_lex_table; mod build_parse_table; mod coincident_tokens; mod item; mod item_set_builder; -mod lex_table_builder; mod shrink_parse_table; mod token_conflicts; +use self::build_lex_table::build_lex_table; use self::build_parse_table::build_parse_table; use self::coincident_tokens::CoincidentTokenIndex; use self::item::LookaheadSet; @@ -13,6 +14,7 @@ use self::shrink_parse_table::shrink_parse_table; use self::token_conflicts::TokenConflictMap; use crate::error::Result; use crate::grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar}; +use crate::nfa::{CharacterSet, NfaCursor}; use crate::rules::{AliasMap, Symbol}; use crate::tables::{LexTable, ParseAction, ParseTable, ParseTableEntry}; @@ -25,7 +27,22 @@ pub(crate) fn build_tables( let (mut parse_table, following_tokens) = build_parse_table(syntax_grammar, lexical_grammar, inlines)?; let token_conflict_map = TokenConflictMap::new(lexical_grammar, following_tokens); + + eprintln!("{:?}", token_conflict_map); + let coincident_token_index = CoincidentTokenIndex::new(&parse_table); + let keywords = if let Some(word_token) = syntax_grammar.word_token { + identify_keywords( + lexical_grammar, + &parse_table, + word_token, + &token_conflict_map, + &coincident_token_index, + ) + } else { + LookaheadSet::new() + }; + populate_error_state( &mut parse_table, syntax_grammar, @@ -39,7 +56,14 @@ pub(crate) fn build_tables( simple_aliases, &token_conflict_map, ); - Ok((parse_table, LexTable::default(), LexTable::default(), None)) + let (main_lex_table, keyword_lex_table) = + build_lex_table(&mut parse_table, syntax_grammar, lexical_grammar, &keywords); + Ok(( + parse_table, + main_lex_table, + keyword_lex_table, + syntax_grammar.word_token, + )) } fn populate_error_state( @@ -77,13 +101,10 @@ fn populate_error_state( || !token_conflict_map.does_conflict(i, t.index) }); if can_be_used_for_recovery { - eprintln!("include {}", &lexical_grammar.variables[symbol.index].name); state .terminal_entries 
.entry(symbol) .or_insert_with(|| recover_entry.clone()); - } else { - eprintln!("exclude {}", &lexical_grammar.variables[symbol.index].name); } } @@ -98,3 +119,103 @@ fn populate_error_state( state.terminal_entries.insert(Symbol::end(), recover_entry); } + +fn identify_keywords( + lexical_grammar: &LexicalGrammar, + parse_table: &ParseTable, + word_token: Symbol, + token_conflict_map: &TokenConflictMap, + coincident_token_index: &CoincidentTokenIndex, +) -> LookaheadSet { + let mut cursor = NfaCursor::new(&lexical_grammar.nfa, Vec::new()); + + // First find all of the candidate keyword tokens: tokens that start with + // letters or underscore and can match the same string as a word token. + let keywords = LookaheadSet::with(lexical_grammar.variables.iter().enumerate().filter_map( + |(i, variable)| { + cursor.reset(vec![variable.start_state]); + if all_chars_are_alphabetical(&cursor) + && token_conflict_map.does_match_same_string(i, word_token.index) + { + Some(Symbol::terminal(i)) + } else { + None + } + }, + )); + + // Exclude keyword candidates that shadow another keyword candidate. + let keywords = LookaheadSet::with(keywords.iter().filter(|token| { + for other_token in keywords.iter() { + if other_token != *token + && token_conflict_map.does_match_same_string(token.index, other_token.index) + { + eprintln!( + "Exclude {} from keywords because it matches the same string as {}", + lexical_grammar.variables[token.index].name, + lexical_grammar.variables[other_token.index].name + ); + return false; + } + } + true + })); + + // Exclude keyword candidates for which substituting the keyword capture + // token would introduce new lexical conflicts with other tokens. 
+ let keywords = LookaheadSet::with(keywords.iter().filter(|token| { + for other_index in 0..lexical_grammar.variables.len() { + if keywords.contains(&Symbol::terminal(other_index)) { + continue; + } + + // If the word token was already valid in every state containing + // this keyword candidate, then substituting the word token won't + // introduce any new lexical conflicts. + if coincident_token_index + .states_with(*token, Symbol::terminal(other_index)) + .iter() + .all(|state_id| { + parse_table.states[*state_id] + .terminal_entries + .contains_key(&word_token) + }) + { + continue; + } + + if !token_conflict_map.has_same_conflict_status( + token.index, + word_token.index, + other_index, + ) { + eprintln!( + "Exclude {} from keywords because of conflict with {}", + lexical_grammar.variables[token.index].name, + lexical_grammar.variables[other_index].name + ); + return false; + } + } + + eprintln!( + "Include {} in keywords", + lexical_grammar.variables[token.index].name, + ); + true + })); + + keywords +} + +fn all_chars_are_alphabetical(cursor: &NfaCursor) -> bool { + cursor.successors().all(|(chars, _, _, is_sep)| { + if is_sep { + true + } else if let CharacterSet::Include(chars) = chars { + chars.iter().all(|c| c.is_alphabetic() || *c == '_') + } else { + false + } + }) +} diff --git a/src/build_tables/shrink_parse_table.rs b/src/build_tables/shrink_parse_table.rs index 026c3058..b943158f 100644 --- a/src/build_tables/shrink_parse_table.rs +++ b/src/build_tables/shrink_parse_table.rs @@ -166,8 +166,6 @@ fn merge_parse_state( } } - eprintln!("maybe merge {} {}", left, right); - let mut symbols_to_add = Vec::new(); for (symbol, right_entry) in &right_state.terminal_entries { if !left_state.terminal_entries.contains_key(&symbol) { diff --git a/src/build_tables/token_conflicts.rs b/src/build_tables/token_conflicts.rs index 09d5e97c..9f1c4426 100644 --- a/src/build_tables/token_conflicts.rs +++ b/src/build_tables/token_conflicts.rs @@ -4,7 +4,7 @@ use 
crate::nfa::{CharacterSet, NfaCursor}; use std::collections::HashSet; use std::fmt; -#[derive(Clone, Debug, Default)] +#[derive(Clone, Debug, Default, PartialEq, Eq)] struct TokenConflictStatus { does_overlap: bool, does_match_valid_continuation: bool, @@ -12,15 +12,16 @@ struct TokenConflictStatus { matches_same_string: bool, } -pub(crate) struct TokenConflictMap { +pub(crate) struct TokenConflictMap<'a> { n: usize, status_matrix: Vec, starting_chars_by_index: Vec, following_chars_by_index: Vec, + grammar: &'a LexicalGrammar, } -impl TokenConflictMap { - pub fn new(grammar: &LexicalGrammar, following_tokens: Vec) -> Self { +impl<'a> TokenConflictMap<'a> { + pub fn new(grammar: &'a LexicalGrammar, following_tokens: Vec) -> Self { let mut cursor = NfaCursor::new(&grammar.nfa, Vec::new()); let starting_chars = get_starting_chars(&mut cursor, grammar); let following_chars = get_following_chars(&starting_chars, following_tokens); @@ -40,9 +41,16 @@ impl TokenConflictMap { status_matrix, starting_chars_by_index: starting_chars, following_chars_by_index: following_chars, + grammar, } } + pub fn has_same_conflict_status(&self, a: usize, b: usize, other: usize) -> bool { + let left = &self.status_matrix[matrix_index(self.n, a, other)]; + let right = &self.status_matrix[matrix_index(self.n, b, other)]; + left == right + } + pub fn does_match_same_string(&self, i: usize, j: usize) -> bool { self.status_matrix[matrix_index(self.n, i, j)].matches_same_string } @@ -55,9 +63,28 @@ impl TokenConflictMap { pub fn does_overlap(&self, i: usize, j: usize) -> bool { self.status_matrix[matrix_index(self.n, i, j)].does_overlap } + + pub fn prefer_token(grammar: &LexicalGrammar, left: (i32, usize), right: (i32, usize)) -> bool { + if left.0 > right.0 { + return true; + } else if left.0 < right.0 { + return false; + } + + match ( + grammar.variables[left.1].is_string, + grammar.variables[right.1].is_string, + ) { + (true, false) => return true, + (false, true) => return false, + _ => {} + 
} + + left.0 < right.0 + } } -impl fmt::Debug for TokenConflictMap { +impl<'a> fmt::Debug for TokenConflictMap<'a> { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { write!(f, "TokenConflictMap {{\n")?; @@ -69,18 +96,22 @@ impl fmt::Debug for TokenConflictMap { write!(f, " following_characters: {{\n")?; for i in 0..self.n { - write!(f, " {}: {:?},\n", i, self.following_chars_by_index[i])?; + write!( + f, + " {}: {:?},\n", + self.grammar.variables[i].name, self.following_chars_by_index[i] + )?; } write!(f, " }},\n")?; write!(f, " status_matrix: {{\n")?; for i in 0..self.n { - write!(f, " {}: {{\n", i)?; + write!(f, " {}: {{\n", self.grammar.variables[i].name)?; for j in 0..self.n { write!( f, " {}: {:?},\n", - j, + self.grammar.variables[j].name, self.status_matrix[matrix_index(self.n, i, j)] )?; } @@ -101,7 +132,7 @@ fn get_starting_chars(cursor: &mut NfaCursor, grammar: &LexicalGrammar) -> Vec bool { - if left.0 > right.0 { - return true; - } else if left.0 < right.0 { - return false; - } - - match ( - grammar.variables[left.1].is_string, - grammar.variables[right.1].is_string, - ) { - (true, false) => return true, - (false, true) => return false, - _ => {} - } - - left.0 < right.0 -} - fn variable_ids_for_states<'a>( state_ids: &'a Vec, grammar: &'a LexicalGrammar, diff --git a/src/grammars.rs b/src/grammars.rs index 18da86d8..d23e8ca6 100644 --- a/src/grammars.rs +++ b/src/grammars.rs @@ -91,6 +91,7 @@ pub(crate) struct SyntaxGrammar { pub word_token: Option, } +#[cfg(test)] impl ProductionStep { pub(crate) fn new(symbol: Symbol) -> Self { Self { @@ -127,14 +128,6 @@ impl Production { pub fn first_symbol(&self) -> Option { self.steps.first().map(|s| s.symbol.clone()) } - - pub fn last_precedence(&self) -> i32 { - self.steps.last().map(|s| s.precedence).unwrap_or(0) - } - - pub fn last_associativity(&self) -> Option { - self.steps.last().map(|s| s.associativity).unwrap_or(None) - } } impl Default for Production { @@ -146,6 +139,7 @@ impl Default for 
Production { } } +#[cfg(test)] impl Variable { pub fn named(name: &str, rule: Rule) -> Self { Self { diff --git a/src/main.rs b/src/main.rs index c7ca2ca5..cd672186 100644 --- a/src/main.rs +++ b/src/main.rs @@ -42,7 +42,7 @@ fn main() -> error::Result<()> { ) .get_matches(); - if let Some(matches) = matches.subcommand_matches("generate") { + if let Some(_) = matches.subcommand_matches("generate") { let mut grammar_path = env::current_dir().expect("Failed to read CWD"); grammar_path.push("grammar.js"); let grammar_json = load_js_grammar_file(grammar_path); diff --git a/src/nfa.rs b/src/nfa.rs index ee39d178..e14dac44 100644 --- a/src/nfa.rs +++ b/src/nfa.rs @@ -40,7 +40,6 @@ impl Default for Nfa { pub struct NfaCursor<'a> { pub(crate) state_ids: Vec, nfa: &'a Nfa, - in_sep: bool, } impl CharacterSet { @@ -111,7 +110,7 @@ impl CharacterSet { CharacterSet::Exclude(other_chars) => { chars.retain(|c| other_chars.contains(&c)); CharacterSet::Exclude(chars) - }, + } }, } } @@ -311,7 +310,6 @@ impl<'a> NfaCursor<'a> { let mut result = Self { nfa, state_ids: Vec::new(), - in_sep: true, }; result.add_states(&mut states); result @@ -322,81 +320,59 @@ impl<'a> NfaCursor<'a> { self.add_states(&mut states); } - pub fn advance(&mut self, c: char) -> bool { - let mut result = false; - let mut new_state_ids = Vec::new(); - let mut any_sep_transitions = false; - for current_state_id in &self.state_ids { - if let NfaState::Advance { - chars, - state_id, - is_sep, - .. - } = &self.nfa.states[*current_state_id as usize] - { - if chars.contains(c) { - if *is_sep { - any_sep_transitions = true; - } - new_state_ids.push(*state_id); - result = true; - } - } - } - if !any_sep_transitions { - self.in_sep = false; - } - self.state_ids.clear(); - self.add_states(&mut new_state_ids); - result - } - - pub fn successors(&self) -> impl Iterator { + pub fn successors(&self) -> impl Iterator { self.state_ids.iter().filter_map(move |id| { if let NfaState::Advance { chars, state_id, precedence, - .. 
+ is_sep, } = &self.nfa.states[*id as usize] { - Some((chars, *precedence, *state_id)) + Some((chars, *precedence, *state_id, *is_sep)) } else { None } }) } - pub fn grouped_successors(&self) -> Vec<(CharacterSet, i32, Vec)> { + pub fn grouped_successors(&self) -> Vec<(CharacterSet, i32, Vec, bool)> { Self::group_successors(self.successors()) } fn group_successors<'b>( - iter: impl Iterator, - ) -> Vec<(CharacterSet, i32, Vec)> { - let mut result: Vec<(CharacterSet, i32, Vec)> = Vec::new(); - for (chars, prec, state) in iter { + iter: impl Iterator, + ) -> Vec<(CharacterSet, i32, Vec, bool)> { + let mut result: Vec<(CharacterSet, i32, Vec, bool)> = Vec::new(); + for (chars, prec, state, is_sep) in iter { let mut chars = chars.clone(); let mut i = 0; while i < result.len() { - let intersection = result[i].0.remove_intersection(&mut chars); - if !intersection.is_empty() { - if result[i].0.is_empty() { - result[i].0 = intersection; - result[i].1 = max(result[i].1, prec); - result[i].2.push(state); - } else { + if result[i].0 == chars { + result[i].1 = max(result[i].1, prec); + result[i].2.push(state); + result[i].3 |= is_sep; + } else { + let intersection = result[i].0.remove_intersection(&mut chars); + if !intersection.is_empty() { let mut states = result[i].2.clone(); - let mut precedence = result[i].1; states.push(state); - result.insert(i, (intersection, max(precedence, prec), states)); + result.insert( + i, + ( + intersection, + max(result[i].1, prec), + states, + result[i].3 || is_sep, + ), + ); i += 1; } } i += 1; } if !chars.is_empty() { - result.push((chars, prec, vec![state])); + result.push((chars, prec, vec![state], is_sep)); } } result.sort_unstable_by(|a, b| a.0.cmp(&b.0)); @@ -417,10 +393,6 @@ impl<'a> NfaCursor<'a> { }) } - pub fn in_separator(&self) -> bool { - self.in_sep - } - pub fn add_states(&mut self, new_state_ids: &mut Vec) { let mut i = 0; while i < new_state_ids.len() { @@ -460,26 +432,31 @@ mod tests { let table = [ ( vec![ - 
(CharacterSet::empty().add_range('a', 'f'), 0, 1), - (CharacterSet::empty().add_range('d', 'i'), 1, 2), + (CharacterSet::empty().add_range('a', 'f'), 0, 1, false), + (CharacterSet::empty().add_range('d', 'i'), 1, 2, false), ], vec![ - (CharacterSet::empty().add_range('a', 'c'), 0, vec![1]), - (CharacterSet::empty().add_range('d', 'f'), 1, vec![1, 2]), - (CharacterSet::empty().add_range('g', 'i'), 1, vec![2]), + (CharacterSet::empty().add_range('a', 'c'), 0, vec![1], false), + ( + CharacterSet::empty().add_range('d', 'f'), + 1, + vec![1, 2], + false, + ), + (CharacterSet::empty().add_range('g', 'i'), 1, vec![2], false), ], ), ( vec![ - (CharacterSet::empty().add_range('a', 'z'), 0, 1), - (CharacterSet::empty().add_char('d'), 0, 2), - (CharacterSet::empty().add_char('i'), 0, 3), - (CharacterSet::empty().add_char('f'), 0, 4), + (CharacterSet::empty().add_range('a', 'z'), 0, 1, false), + (CharacterSet::empty().add_char('d'), 0, 2, false), + (CharacterSet::empty().add_char('i'), 0, 3, false), + (CharacterSet::empty().add_char('f'), 0, 4, false), ], vec![ - (CharacterSet::empty().add_char('d'), 0, vec![1, 2]), - (CharacterSet::empty().add_char('f'), 0, vec![1, 4]), - (CharacterSet::empty().add_char('i'), 0, vec![1, 3]), + (CharacterSet::empty().add_char('d'), 0, vec![1, 2], false), + (CharacterSet::empty().add_char('f'), 0, vec![1, 4], false), + (CharacterSet::empty().add_char('i'), 0, vec![1, 3], false), ( CharacterSet::empty() .add_range('a', 'c') @@ -488,6 +465,7 @@ mod tests { .add_range('j', 'z'), 0, vec![1], + false, ), ], ), @@ -495,28 +473,10 @@ mod tests { for row in table.iter() { assert_eq!( - NfaCursor::group_successors(row.0.iter().map(|(c, p, s)| (c, *p, *s))), + NfaCursor::group_successors(row.0.iter().map(|(c, p, s, sep)| (c, *p, *s, *sep))), row.1 ); } - - // let successors = NfaCursor::group_successors( - // [ - // (&CharacterSet::empty().add_range('a', 'f'), 1), - // (&CharacterSet::empty().add_range('d', 'i'), 2), - // ] - // .iter() - // .cloned(), - 
// ); - // - // assert_eq!( - // successors, - // vec![ - // (CharacterSet::empty().add_range('a', 'c'), vec![1],), - // (CharacterSet::empty().add_range('d', 'f'), vec![1, 2],), - // (CharacterSet::empty().add_range('g', 'i'), vec![2],), - // ] - // ); } #[test] diff --git a/src/prepare_grammar/expand_tokens.rs b/src/prepare_grammar/expand_tokens.rs index 4ef17b27..fdf085f6 100644 --- a/src/prepare_grammar/expand_tokens.rs +++ b/src/prepare_grammar/expand_tokens.rs @@ -6,6 +6,7 @@ use crate::rules::Rule; use regex_syntax::ast::{ parse, Ast, Class, ClassPerlKind, ClassSet, ClassSetItem, RepetitionKind, RepetitionRange, }; +use std::i32; struct NfaBuilder { nfa: Nfa, @@ -17,7 +18,7 @@ fn is_string(rule: &Rule) -> bool { match rule { Rule::String(_) => true, Rule::Metadata { rule, .. } => is_string(rule), - _ => false + _ => false, } } @@ -346,7 +347,9 @@ impl NfaBuilder { fn push_split(&mut self, state_id: u32) { let last_state_id = self.nfa.last_state_id(); - self.nfa.states.push(NfaState::Split(state_id, last_state_id)); + self.nfa + .states + .push(NfaState::Split(state_id, last_state_id)); } fn add_precedence(&mut self, prec: i32, mut state_ids: Vec) { @@ -354,12 +357,12 @@ impl NfaBuilder { while i < state_ids.len() { let state_id = state_ids[i]; let (left, right) = match &mut self.nfa.states[state_id as usize] { - NfaState::Accept {precedence, ..} => { + NfaState::Accept { precedence, .. 
} => { *precedence = prec; return; - }, + } NfaState::Split(left, right) => (*left, *right), - _ => return + _ => return, }; if !state_ids.contains(&left) { state_ids.push(left); @@ -383,7 +386,7 @@ mod tests { let mut cursor = NfaCursor::new(&grammar.nfa, start_states); let mut result = None; - let mut result_precedence = 0; + let mut result_precedence = i32::MIN; let mut start_char = 0; let mut end_char = 0; for c in s.chars() { @@ -393,9 +396,14 @@ mod tests { result_precedence = precedence; } } - if cursor.advance(c) { + if let Some((_, _, next_states, in_sep)) = cursor + .grouped_successors() + .into_iter() + .find(|(chars, prec, _, _)| chars.contains(c) && *prec >= result_precedence) + { + cursor.reset(next_states); end_char += 1; - if cursor.in_separator() { + if in_sep { start_char = end_char; } } else { diff --git a/src/prepare_grammar/extract_tokens.rs b/src/prepare_grammar/extract_tokens.rs index eaeede90..5f3f6e16 100644 --- a/src/prepare_grammar/extract_tokens.rs +++ b/src/prepare_grammar/extract_tokens.rs @@ -1,6 +1,6 @@ use super::{ExtractedLexicalGrammar, ExtractedSyntaxGrammar, InternedGrammar}; use crate::error::{Error, Result}; -use crate::grammars::{ExternalToken, Variable}; +use crate::grammars::{ExternalToken, Variable, VariableType}; use crate::rules::{MetadataParams, Rule, Symbol, SymbolType}; use std::collections::HashMap; use std::mem; @@ -240,16 +240,21 @@ impl TokenExtractor { let index = self.extracted_variables.len(); let variable = if let Some(string_value) = string_value { - Variable::anonymous(string_value, rule.clone()) + Variable { + name: string_value.clone(), + kind: VariableType::Anonymous, + rule: rule.clone() + } } else { self.current_variable_token_count += 1; - Variable::auxiliary( - &format!( + Variable { + name: format!( "{}_token{}", &self.current_variable_name, self.current_variable_token_count ), - rule.clone(), - ) + kind: VariableType::Auxiliary, + rule: rule.clone(), + } }; self.extracted_variables.push(variable); 
diff --git a/src/render/mod.rs b/src/render/mod.rs index fc4cdafb..cbb8ba0d 100644 --- a/src/render/mod.rs +++ b/src/render/mod.rs @@ -2,6 +2,7 @@ use crate::grammars::{ExternalToken, LexicalGrammar, SyntaxGrammar, VariableType use crate::nfa::CharacterSet; use crate::rules::{Alias, AliasMap, Symbol, SymbolType}; use crate::tables::{LexState, LexTable, ParseAction, ParseTable, ParseTableEntry}; +use core::ops::Range; use std::collections::{HashMap, HashSet}; use std::fmt::Write; use std::mem::swap; @@ -12,11 +13,17 @@ macro_rules! add { }} } -macro_rules! add_line { - ($this: tt, $($arg: tt)*) => { +macro_rules! add_whitespace { + ($this: tt) => {{ for _ in 0..$this.indent_level { write!(&mut $this.buffer, " ").unwrap(); } + }}; +} + +macro_rules! add_line { + ($this: tt, $($arg: tt)*) => { + add_whitespace!($this); $this.buffer.write_fmt(format_args!($($arg)*)).unwrap(); $this.buffer += "\n"; } @@ -162,7 +169,7 @@ impl Generator { } } - add_line!(self, "#define LANGUAGE_VERSION {}", 6); + add_line!(self, "#define LANGUAGE_VERSION {}", 9); add_line!( self, "#define STATE_COUNT {}", @@ -352,7 +359,7 @@ impl Generator { add_line!( self, "ACCEPT_TOKEN({})", - self.symbol_ids[&accept_action.symbol] + self.symbol_ids[&Symbol::terminal(accept_action)] ); } @@ -360,9 +367,10 @@ impl Generator { for (characters, action) in state.advance_actions { let previous_length = self.buffer.len(); + add_whitespace!(self); add!(self, "if ("); if self.add_character_set_condition(&characters, &ruled_out_characters) { - add!(self, ")"); + add!(self, ")\n"); indent!(self); if action.in_main_token { add_line!(self, "ADVANCE({});", action.state); @@ -370,7 +378,7 @@ impl Generator { add_line!(self, "SKIP({});", action.state); } if let CharacterSet::Include(chars) = characters { - ruled_out_characters.extend(chars.iter()); + ruled_out_characters.extend(chars.iter().map(|c| *c as u32)); } dedent!(self); } else { @@ -384,9 +392,106 @@ impl Generator { fn add_character_set_condition( &mut self, 
characters: &CharacterSet, - ruled_out_characters: &HashSet, + ruled_out_characters: &HashSet, ) -> bool { - true + match characters { + CharacterSet::Include(chars) => { + let ranges = Self::get_ranges(chars, ruled_out_characters); + self.add_character_range_conditions(ranges, false) + } + CharacterSet::Exclude(chars) => { + let ranges = Self::get_ranges(chars, ruled_out_characters); + self.add_character_range_conditions(ranges, true) + } + } + } + + fn add_character_range_conditions( + &mut self, + ranges: impl Iterator>, + is_negated: bool, + ) -> bool { + let line_break = "\n "; + let mut did_add = false; + for range in ranges { + if is_negated { + if did_add { + add!(self, " &&{}", line_break); + } + if range.end == range.start { + add!(self, "lookahead != "); + self.add_character(range.start); + } else if range.end as u32 == range.start as u32 + 1 { + add!(self, "lookahead != "); + self.add_character(range.start); + add!(self, " &&{}lookahead != ", line_break); + self.add_character(range.end); + } else { + add!(self, "(lookahead < "); + self.add_character(range.start); + add!(self, " || "); + self.add_character(range.end); + add!(self, " < lookahead)"); + } + } else { + if did_add { + add!(self, " ||{}", line_break); + } + if range.end == range.start { + add!(self, "lookahead == "); + self.add_character(range.start); + } else if range.end as u32 == range.start as u32 + 1 { + add!(self, "lookahead == "); + self.add_character(range.start); + add!(self, " ||{}lookahead == ", line_break); + self.add_character(range.end); + } else { + add!(self, "("); + self.add_character(range.start); + add!(self, " <= lookahead && lookahead <= "); + self.add_character(range.end); + add!(self, ")"); + } + } + did_add = true; + } + did_add + } + + fn get_ranges<'a>( + chars: &'a Vec, + ruled_out_characters: &'a HashSet, + ) -> impl Iterator> + 'a { + let mut prev_range: Option> = None; + chars + .iter() + .cloned() + .chain(Some('\0')) + .filter_map(move |c| { + if 
ruled_out_characters.contains(&(c as u32)) { + return None; + } + if let Some(range) = prev_range.clone() { + if c == '\0' { + prev_range = Some(c..c); + return Some(range); + } + + let mut prev_range_successor = range.end as u32 + 1; + while prev_range_successor < c as u32 { + if !ruled_out_characters.contains(&prev_range_successor) { + prev_range = Some(c..c); + return Some(range); + } + prev_range_successor += 1; + } + prev_range = Some(range.start..c); + None + } else { + prev_range = Some(c..c); + None + } + }) } fn add_lex_modes_list(&mut self) { @@ -577,13 +682,6 @@ impl Generator { alias_sequence_id, .. } => { - if !self.symbol_ids.contains_key(&symbol) { - eprintln!( - "SYMBOL: {:?} {:?}", - symbol, - self.metadata_for_symbol(symbol) - ); - } add!(self, "REDUCE({}, {}", self.symbol_ids[&symbol], child_count); if dynamic_precedence != 0 { add!(self, ", .dynamic_precedence = {}", dynamic_precedence); @@ -785,7 +883,7 @@ impl Generator { { result.push(c); } else { - result += match c { + let replacement = match c { '~' => "TILDE", '`' => "BQUOTE", '!' 
=> "BANG", @@ -821,7 +919,11 @@ impl Generator { '\r' => "CR", '\t' => "TAB", _ => continue, + }; + if !result.is_empty() && !result.ends_with("_") { + result.push('_'); } + result += replacement; } } result @@ -837,6 +939,21 @@ impl Generator { } result } + + fn add_character(&mut self, c: char) { + if c.is_ascii() { + match c { + '\'' => add!(self, "'\\''"), + '\\' => add!(self, "'\\\\'"), + '\t' => add!(self, "'\\t'"), + '\n' => add!(self, "'\\n'"), + '\r' => add!(self, "'\\r'"), + _ => add!(self, "'{}'", c), + } + } else { + add!(self, "{}", c as u32) + } + } } pub(crate) fn render_c_code( @@ -867,3 +984,49 @@ pub(crate) fn render_c_code( } .generate() } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_get_char_ranges() { + struct Row { + chars: Vec, + ruled_out_chars: Vec, + expected_ranges: Vec>, + } + + let table = [ + Row { + chars: vec!['a'], + ruled_out_chars: vec![], + expected_ranges: vec!['a'..'a'], + }, + Row { + chars: vec!['a', 'b', 'c', 'e', 'z'], + ruled_out_chars: vec![], + expected_ranges: vec!['a'..'c', 'e'..'e', 'z'..'z'], + }, + Row { + chars: vec!['a', 'b', 'c', 'e', 'h', 'z'], + ruled_out_chars: vec!['d', 'f', 'g'], + expected_ranges: vec!['a'..'h', 'z'..'z'], + }, + ]; + + for Row { + chars, + ruled_out_chars, + expected_ranges, + } in table.iter() + { + let ruled_out_chars = ruled_out_chars + .into_iter() + .map(|c: &char| *c as u32) + .collect(); + let ranges = Generator::get_ranges(chars, &ruled_out_chars).collect::>(); + assert_eq!(ranges, *expected_ranges); + } + } +} diff --git a/src/rules.rs b/src/rules.rs index 3bfd5181..77e50d3c 100644 --- a/src/rules.rs +++ b/src/rules.rs @@ -120,7 +120,10 @@ impl Rule { pub fn seq(rules: Vec) -> Self { Rule::Seq(rules) } +} +#[cfg(test)] +impl Rule { pub fn terminal(index: usize) -> Self { Rule::Symbol(Symbol::terminal(index)) } diff --git a/src/tables.rs b/src/tables.rs index 344c4816..1c125621 100644 --- a/src/tables.rs +++ b/src/tables.rs @@ -1,7 +1,6 @@ use 
crate::nfa::CharacterSet; use crate::rules::{Alias, Associativity, Symbol}; use std::collections::HashMap; -use std::ops::Range; pub(crate) type AliasSequenceId = usize; pub(crate) type ParseStateId = usize; @@ -50,21 +49,13 @@ pub(crate) struct ParseTable { #[derive(Clone, Debug, PartialEq, Eq)] pub(crate) struct AdvanceAction { pub state: LexStateId, - pub precedence: Range, pub in_main_token: bool, } -#[derive(Clone, Debug, PartialEq, Eq)] -pub(crate) struct AcceptTokenAction { - pub symbol: Symbol, - pub precedence: i32, - pub implicit_precedence: i32, -} - -#[derive(Clone, Debug, PartialEq, Eq)] +#[derive(Clone, Debug, Default, PartialEq, Eq)] pub(crate) struct LexState { - pub advance_actions: HashMap, - pub accept_action: Option, + pub advance_actions: Vec<(CharacterSet, AdvanceAction)>, + pub accept_action: Option, } #[derive(Debug, PartialEq, Eq)] From 3fbaff5e69a1bfd200a7c9979e52412b55a26ba0 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Wed, 2 Jan 2019 16:48:44 -0800 Subject: [PATCH 082/208] Fix various logic errors in parse table construction --- Cargo.lock | 18 ++++ Cargo.toml | 5 ++ src/build_tables/build_lex_table.rs | 116 +++++++++++++++++++++---- src/build_tables/build_parse_table.rs | 59 +++++++------ src/build_tables/coincident_tokens.rs | 38 ++++---- src/build_tables/item.rs | 4 +- src/build_tables/item_set_builder.rs | 2 +- src/build_tables/mod.rs | 44 +++++----- src/build_tables/shrink_parse_table.rs | 6 +- src/build_tables/token_conflicts.rs | 2 +- src/grammars.rs | 2 +- src/logger.rs | 29 +++++++ src/main.rs | 28 ++++-- src/nfa.rs | 26 ++++-- src/parse_grammar.rs | 4 +- src/prepare_grammar/expand_repeats.rs | 2 +- src/prepare_grammar/extract_tokens.rs | 2 +- src/prepare_grammar/process_inlines.rs | 2 +- src/render/mod.rs | 19 ++-- src/rules.rs | 2 +- src/tables.rs | 2 +- 21 files changed, 297 insertions(+), 115 deletions(-) create mode 100644 src/logger.rs diff --git a/Cargo.lock b/Cargo.lock index 538517f1..2312d362 100644 --- 
a/Cargo.lock +++ b/Cargo.lock @@ -76,6 +76,11 @@ dependencies = [ "constant_time_eq 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "byteorder" +version = "1.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" + [[package]] name = "cc" version = "1.0.25" @@ -212,6 +217,15 @@ dependencies = [ "regex 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "hashbrown" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "byteorder 1.2.7 (registry+https://github.com/rust-lang/crates.io-index)", + "scopeguard 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "ignore" version = "0.4.4" @@ -463,9 +477,11 @@ version = "0.1.0" dependencies = [ "clap 2.32.0 (registry+https://github.com/rust-lang/crates.io-index)", "dirs 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)", + "hashbrown 0.1.7 (registry+https://github.com/rust-lang/crates.io-index)", "ignore 0.4.4 (registry+https://github.com/rust-lang/crates.io-index)", "lazy_static 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)", "libloading 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)", + "log 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)", "regex-syntax 0.6.4 (registry+https://github.com/rust-lang/crates.io-index)", "rusqlite 0.14.0 (registry+https://github.com/rust-lang/crates.io-index)", "serde 1.0.80 (registry+https://github.com/rust-lang/crates.io-index)", @@ -737,6 +753,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" "checksum backtrace-sys 0.1.24 (registry+https://github.com/rust-lang/crates.io-index)" = "c66d56ac8dabd07f6aacdaf633f4b8262f5b3601a810a0dcddffd5c22c69daa0" "checksum bitflags 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)" = "228047a76f468627ca71776ecdebd732a3423081fcf5125585bcd7c49886ce12" "checksum blake2-rfc 0.2.18 
(registry+https://github.com/rust-lang/crates.io-index)" = "5d6d530bdd2d52966a6d03b7a964add7ae1a288d25214066fd4b600f0f796400" +"checksum byteorder 1.2.7 (registry+https://github.com/rust-lang/crates.io-index)" = "94f88df23a25417badc922ab0f5716cc1330e87f71ddd9203b3a3ccd9cedf75d" "checksum cc 1.0.25 (registry+https://github.com/rust-lang/crates.io-index)" = "f159dfd43363c4d08055a07703eb7a3406b0dac4d0584d96965a3262db3c9d16" "checksum cfg-if 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "082bb9b28e00d3c9d39cc03e64ce4cea0f1bb9b3fde493f0cbc008472d22bdf4" "checksum clap 2.32.0 (registry+https://github.com/rust-lang/crates.io-index)" = "b957d88f4b6a63b9d70d5f454ac8011819c6efa7727858f458ab71c756ce2d3e" @@ -753,6 +770,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" "checksum fuchsia-zircon 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "2e9763c69ebaae630ba35f74888db465e49e259ba1bc0eda7d06f4a067615d82" "checksum fuchsia-zircon-sys 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "3dcaa9ae7725d12cdb85b3ad99a434db70b468c09ded17e012d86b5c1010f7a7" "checksum globset 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)" = "4743617a7464bbda3c8aec8558ff2f9429047e025771037df561d383337ff865" +"checksum hashbrown 0.1.7 (registry+https://github.com/rust-lang/crates.io-index)" = "64b7d419d0622ae02fe5da6b9a5e1964b610a65bb37923b976aeebb6dbb8f86e" "checksum ignore 0.4.4 (registry+https://github.com/rust-lang/crates.io-index)" = "36ecfc5ad80f0b1226df948c562e2cddd446096be3f644c95106400eae8a5e01" "checksum indexmap 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)" = "7e81a7c05f79578dbc15793d8b619db9ba32b4577003ef3af1a91c416798c58d" "checksum itoa 0.4.3 (registry+https://github.com/rust-lang/crates.io-index)" = "1306f3464951f30e30d12373d31c79fbd52d236e5e896fd92f96ec7babbbe60b" diff --git a/Cargo.toml b/Cargo.toml index b29bc85e..29b10e17 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -9,6 +9,7 @@ 
lazy_static = "1.2.0" smallbitvec = "2.3.0" clap = "2.32" dirs = "1.0.2" +hashbrown = "0.1" ignore = "0.4.4" libloading = "0.5" rusqlite = "0.14.0" @@ -20,3 +21,7 @@ regex-syntax = "0.6.4" [dependencies.serde_json] version = "1.0" features = ["preserve_order"] + +[dependencies.log] +version = "0.4.6" +features = ["std"] diff --git a/src/build_tables/build_lex_table.rs b/src/build_tables/build_lex_table.rs index aa929d97..c002f427 100644 --- a/src/build_tables/build_lex_table.rs +++ b/src/build_tables/build_lex_table.rs @@ -2,10 +2,9 @@ use super::item::LookaheadSet; use super::token_conflicts::TokenConflictMap; use crate::grammars::{LexicalGrammar, SyntaxGrammar}; use crate::nfa::NfaCursor; -use crate::rules::Symbol; use crate::tables::{AdvanceAction, LexState, LexTable, ParseTable}; use std::collections::hash_map::Entry; -use std::collections::{HashMap, VecDeque}; +use std::collections::{BTreeMap, HashMap, VecDeque}; pub(crate) fn build_lex_table( parse_table: &mut ParseTable, @@ -16,15 +15,16 @@ pub(crate) fn build_lex_table( let keyword_lex_table; if syntax_grammar.word_token.is_some() { let mut builder = LexTableBuilder::new(lexical_grammar); - builder.add_state_for_tokens(keywords.iter()); + builder.add_state_for_tokens(keywords); keyword_lex_table = builder.table; } else { keyword_lex_table = LexTable::default(); } let mut builder = LexTableBuilder::new(lexical_grammar); - for state in parse_table.states.iter_mut() { - let tokens = state.terminal_entries.keys().filter_map(|token| { + for (i, state) in parse_table.states.iter_mut().enumerate() { + info!("populate lex state for parse state {}", i); + let tokens = LookaheadSet::with(state.terminal_entries.keys().filter_map(|token| { if token.is_terminal() { if keywords.contains(&token) { syntax_grammar.word_token @@ -34,11 +34,14 @@ pub(crate) fn build_lex_table( } else { None } - }); - state.lex_state_id = builder.add_state_for_tokens(tokens); + })); + state.lex_state_id = builder.add_state_for_tokens(&tokens); 
} - (builder.table, keyword_lex_table) + let mut table = builder.table; + shrink_lex_table(&mut table, parse_table); + + (table, keyword_lex_table) } struct LexTableBuilder<'a> { @@ -60,32 +63,49 @@ impl<'a> LexTableBuilder<'a> { } } - fn add_state_for_tokens(&mut self, tokens: impl Iterator) -> usize { + fn add_state_for_tokens(&mut self, tokens: &LookaheadSet) -> usize { let nfa_states = tokens + .iter() .map(|token| self.lexical_grammar.variables[token.index].start_state) .collect(); - let result = self.add_state(nfa_states); - while let Some((state_id, nfa_states)) = self.state_queue.pop_front() { + let (state_id, is_new) = self.add_state(nfa_states); + + if is_new { + info!( + "entry point state: {}, tokens: {:?}", + state_id, + tokens + .iter() + .map(|t| &self.lexical_grammar.variables[t.index].name) + .collect::>() + ); + } + + while let Some((state_id, nfa_states)) = self.state_queue.pop_back() { self.populate_state(state_id, nfa_states); } - result + state_id } - fn add_state(&mut self, nfa_states: Vec) -> usize { - match self.state_ids_by_nfa_state_set.entry(nfa_states) { - Entry::Occupied(o) => *o.get(), + fn add_state(&mut self, nfa_states: Vec) -> (usize, bool) { + self.cursor.reset(nfa_states); + match self + .state_ids_by_nfa_state_set + .entry(self.cursor.state_ids.clone()) + { + Entry::Occupied(o) => (*o.get(), false), Entry::Vacant(v) => { let state_id = self.table.states.len(); self.table.states.push(LexState::default()); self.state_queue.push_back((state_id, v.key().clone())); v.insert(state_id); - state_id + (state_id, true) } } } fn populate_state(&mut self, state_id: usize, nfa_states: Vec) { - self.cursor.reset(nfa_states); + self.cursor.force_reset(nfa_states); let mut completion = None; for (id, prec) in self.cursor.completions() { @@ -102,12 +122,16 @@ impl<'a> LexTableBuilder<'a> { } for (chars, advance_precedence, next_states, is_sep) in self.cursor.grouped_successors() { + info!( + "populate state: {}, characters: {:?}, precedence: 
{:?}", + state_id, chars, advance_precedence + ); if let Some((_, completed_precedence)) = completion { if advance_precedence < completed_precedence { continue; } } - let next_state_id = self.add_state(next_states); + let (next_state_id, _) = self.add_state(next_states); self.table.states[state_id].advance_actions.push(( chars, AdvanceAction { @@ -122,3 +146,59 @@ impl<'a> LexTableBuilder<'a> { } } } + +fn shrink_lex_table(table: &mut LexTable, parse_table: &mut ParseTable) { + let mut state_replacements = BTreeMap::new(); + let mut done = false; + while !done { + done = true; + for (i, state_i) in table.states.iter().enumerate() { + if state_replacements.contains_key(&i) { + continue; + } + for (j, state_j) in table.states.iter().enumerate() { + if state_replacements.contains_key(&j) { + continue; + } + if j == i { + break; + } + if state_i == state_j { + info!("replace state {} with state {}", i, j); + state_replacements.insert(i, j); + done = false; + } + } + } + for state in table.states.iter_mut() { + for advance_action in state.advance_actions.iter_mut() { + if let Some(new_state_id) = state_replacements.get(&advance_action.1.state) { + advance_action.1.state = *new_state_id; + } + } + } + } + + let final_state_replacements = (0..table.states.len()).into_iter().map(|state_id| { + let replacement = state_replacements.get(&state_id).cloned().unwrap_or(state_id); + let prior_removed = state_replacements.iter().take_while(|i| *i.0 < replacement).count(); + replacement - prior_removed + }).collect::>(); + + for state in parse_table.states.iter_mut() { + state.lex_state_id = final_state_replacements[state.lex_state_id]; + } + + for state in table.states.iter_mut() { + for advance_action in state.advance_actions.iter_mut() { + advance_action.1.state = final_state_replacements[advance_action.1.state]; + } + } + + let mut i = 0; + table.states.retain(|_| { + let result = !state_replacements.contains_key(&i); + i += 1; + result + }); +} diff --git 
a/src/build_tables/build_parse_table.rs b/src/build_tables/build_parse_table.rs index c17261dc..ada34dff 100644 --- a/src/build_tables/build_parse_table.rs +++ b/src/build_tables/build_parse_table.rs @@ -7,8 +7,11 @@ use crate::tables::{ AliasSequenceId, ParseAction, ParseState, ParseStateId, ParseTable, ParseTableEntry, }; use core::ops::Range; -use std::collections::hash_map::{DefaultHasher, Entry}; -use std::collections::{HashMap, HashSet, VecDeque}; +use hashbrown::hash_map::Entry; +use hashbrown::{HashMap, HashSet}; +use std::collections::hash_map::DefaultHasher; +use std::collections::VecDeque; + use std::fmt::Write; use std::hash::Hasher; @@ -43,9 +46,10 @@ impl<'a> ParseTableBuilder<'a> { // Ensure that the empty alias sequence has index 0. self.parse_table.alias_sequences.push(Vec::new()); - // Ensure that the error state has index 0. + // Add the error state at index 0. self.add_parse_state(&Vec::new(), &Vec::new(), ParseItemSet::default()); + // Add the starting state at index 1. 
self.add_parse_state( &Vec::new(), &Vec::new(), @@ -61,6 +65,8 @@ impl<'a> ParseTableBuilder<'a> { self.process_part_state_queue()?; self.populate_used_symbols(); + self.remove_precedences(); + Ok((self.parse_table, self.following_tokens)) } @@ -112,28 +118,9 @@ impl<'a> ParseTableBuilder<'a> { fn process_part_state_queue(&mut self) -> Result<()> { while let Some(entry) = self.parse_state_queue.pop_front() { - let debug = false; - - if debug { - println!( - "ITEM SET {}:\n{}", - entry.state_id, - self.item_sets_by_state_id[entry.state_id] - .display_with(&self.syntax_grammar, &self.lexical_grammar,) - ); - } - let item_set = self .item_set_builder .transitive_closure(&self.item_sets_by_state_id[entry.state_id]); - - if debug { - println!( - "TRANSITIVE CLOSURE:\n{}", - item_set.display_with(&self.syntax_grammar, &self.lexical_grammar) - ); - } - self.add_actions( entry.preceding_symbols, entry.preceding_auxiliary_symbols, @@ -527,6 +514,7 @@ impl<'a> ParseTableBuilder<'a> { } fn populate_used_symbols(&mut self) { + self.parse_table.symbols.push(Symbol::end()); let mut terminal_usages = vec![false; self.lexical_grammar.variables.len()]; let mut non_terminal_usages = vec![false; self.syntax_grammar.variables.len()]; let mut external_usages = vec![false; self.syntax_grammar.external_tokens.len()]; @@ -542,20 +530,39 @@ impl<'a> ParseTableBuilder<'a> { non_terminal_usages[symbol.index] = true; } } - self.parse_table.symbols.push(Symbol::end()); for (i, value) in terminal_usages.into_iter().enumerate() { if value { self.parse_table.symbols.push(Symbol::terminal(i)); } } + for (i, value) in external_usages.into_iter().enumerate() { + if value { + self.parse_table.symbols.push(Symbol::external(i)); + } + } for (i, value) in non_terminal_usages.into_iter().enumerate() { if value { self.parse_table.symbols.push(Symbol::non_terminal(i)); } } - for (i, value) in external_usages.into_iter().enumerate() { - if value { - self.parse_table.symbols.push(Symbol::external(i)); + } + 
+ fn remove_precedences(&mut self) { + for state in self.parse_table.states.iter_mut() { + for (_, entry) in state.terminal_entries.iter_mut() { + for action in entry.actions.iter_mut() { + match action { + ParseAction::Reduce { + precedence, + associativity, + .. + } => { + *precedence = 0; + *associativity = None; + } + _ => {} + } + } } } } diff --git a/src/build_tables/coincident_tokens.rs b/src/build_tables/coincident_tokens.rs index 10707489..5f2bb3ec 100644 --- a/src/build_tables/coincident_tokens.rs +++ b/src/build_tables/coincident_tokens.rs @@ -1,36 +1,44 @@ +use crate::grammars::LexicalGrammar; use crate::rules::Symbol; use crate::tables::{ParseStateId, ParseTable}; -use std::collections::{HashMap, HashSet}; +use std::collections::HashSet; pub(crate) struct CoincidentTokenIndex { - entries: HashMap<(Symbol, Symbol), HashSet>, - empty: HashSet, + entries: Vec>, + n: usize, } impl CoincidentTokenIndex { - pub fn new(table: &ParseTable) -> Self { - let mut entries = HashMap::new(); + pub fn new(table: &ParseTable, lexical_grammar: &LexicalGrammar) -> Self { + let n = lexical_grammar.variables.len(); + let mut result = Self { + n, + entries: vec![HashSet::new(); n * n], + }; for (i, state) in table.states.iter().enumerate() { for symbol in state.terminal_entries.keys() { for other_symbol in state.terminal_entries.keys() { - entries - .entry((*symbol, *other_symbol)) - .or_insert(HashSet::new()) - .insert(i); + let index = result.index(*symbol, *other_symbol); + result.entries[index].insert(i); } } } - Self { - entries, - empty: HashSet::new(), - } + result } pub fn states_with(&self, a: Symbol, b: Symbol) -> &HashSet { - self.entries.get(&(a, b)).unwrap_or(&self.empty) + &self.entries[self.index(a, b)] } pub fn contains(&self, a: Symbol, b: Symbol) -> bool { - self.entries.contains_key(&(a, b)) + !self.entries[self.index(a, b)].is_empty() + } + + fn index(&self, a: Symbol, b: Symbol) -> usize { + if a.index < b.index { + a.index * self.n + b.index + } else { 
+ b.index * self.n + a.index + } } } diff --git a/src/build_tables/item.rs b/src/build_tables/item.rs index 4cd2f643..511d7bef 100644 --- a/src/build_tables/item.rs +++ b/src/build_tables/item.rs @@ -112,7 +112,9 @@ impl LookaheadSet { return; } }; - vec.resize(other.index + 1, false); + if other.index >= vec.len() { + vec.resize(other.index + 1, false); + } vec.set(other.index, true); } diff --git a/src/build_tables/item_set_builder.rs b/src/build_tables/item_set_builder.rs index 5e61bfcc..5714e7e2 100644 --- a/src/build_tables/item_set_builder.rs +++ b/src/build_tables/item_set_builder.rs @@ -1,7 +1,7 @@ use super::item::{LookaheadSet, ParseItem, ParseItemSet}; use crate::grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar}; use crate::rules::Symbol; -use std::collections::{HashMap, HashSet}; +use hashbrown::{HashMap, HashSet}; #[derive(Clone, Debug, PartialEq, Eq)] struct TransitiveClosureAddition<'a> { diff --git a/src/build_tables/mod.rs b/src/build_tables/mod.rs index 8b3a2db4..207431dd 100644 --- a/src/build_tables/mod.rs +++ b/src/build_tables/mod.rs @@ -27,22 +27,14 @@ pub(crate) fn build_tables( let (mut parse_table, following_tokens) = build_parse_table(syntax_grammar, lexical_grammar, inlines)?; let token_conflict_map = TokenConflictMap::new(lexical_grammar, following_tokens); - - eprintln!("{:?}", token_conflict_map); - - let coincident_token_index = CoincidentTokenIndex::new(&parse_table); - let keywords = if let Some(word_token) = syntax_grammar.word_token { - identify_keywords( - lexical_grammar, - &parse_table, - word_token, - &token_conflict_map, - &coincident_token_index, - ) - } else { - LookaheadSet::new() - }; - + let coincident_token_index = CoincidentTokenIndex::new(&parse_table, lexical_grammar); + let keywords = identify_keywords( + lexical_grammar, + &parse_table, + syntax_grammar.word_token, + &token_conflict_map, + &coincident_token_index, + ); populate_error_state( &mut parse_table, syntax_grammar, @@ -123,10 +115,15 @@ fn 
populate_error_state( fn identify_keywords( lexical_grammar: &LexicalGrammar, parse_table: &ParseTable, - word_token: Symbol, + word_token: Option, token_conflict_map: &TokenConflictMap, coincident_token_index: &CoincidentTokenIndex, ) -> LookaheadSet { + if word_token.is_none() { + return LookaheadSet::new(); + } + + let word_token = word_token.unwrap(); let mut cursor = NfaCursor::new(&lexical_grammar.nfa, Vec::new()); // First find all of the candidate keyword tokens: tokens that start with @@ -137,6 +134,7 @@ fn identify_keywords( if all_chars_are_alphabetical(&cursor) && token_conflict_map.does_match_same_string(i, word_token.index) { + info!("Keywords - add candidate {}", lexical_grammar.variables[i].name); Some(Symbol::terminal(i)) } else { None @@ -150,8 +148,8 @@ fn identify_keywords( if other_token != *token && token_conflict_map.does_match_same_string(token.index, other_token.index) { - eprintln!( - "Exclude {} from keywords because it matches the same string as {}", + info!( + "Keywords - exclude {} because it matches the same string as {}", lexical_grammar.variables[token.index].name, lexical_grammar.variables[other_token.index].name ); @@ -189,8 +187,8 @@ fn identify_keywords( word_token.index, other_index, ) { - eprintln!( - "Exclude {} from keywords because of conflict with {}", + info!( + "Keywords - exclude {} because of conflict with {}", lexical_grammar.variables[token.index].name, lexical_grammar.variables[other_index].name ); @@ -198,8 +196,8 @@ fn identify_keywords( } } - eprintln!( - "Include {} in keywords", + info!( + "Keywords - include {}", lexical_grammar.variables[token.index].name, ); true diff --git a/src/build_tables/shrink_parse_table.rs b/src/build_tables/shrink_parse_table.rs index b943158f..33b72c32 100644 --- a/src/build_tables/shrink_parse_table.rs +++ b/src/build_tables/shrink_parse_table.rs @@ -2,7 +2,7 @@ use super::token_conflicts::TokenConflictMap; use crate::grammars::{SyntaxGrammar, VariableType}; use 
crate::rules::{AliasMap, Symbol}; use crate::tables::{ParseAction, ParseState, ParseTable, ParseTableEntry}; -use std::collections::{HashMap, HashSet}; +use hashbrown::{HashMap, HashSet}; pub(crate) fn shrink_parse_table( parse_table: &mut ParseTable, @@ -240,6 +240,10 @@ fn can_add_entry_to_state( fn remove_unused_states(parse_table: &mut ParseTable) { let mut state_usage_map = vec![false; parse_table.states.len()]; + + state_usage_map[0] = true; + state_usage_map[1] = true; + for state in &parse_table.states { for referenced_state in state.referenced_states() { state_usage_map[referenced_state] = true; diff --git a/src/build_tables/token_conflicts.rs b/src/build_tables/token_conflicts.rs index 9f1c4426..18a80484 100644 --- a/src/build_tables/token_conflicts.rs +++ b/src/build_tables/token_conflicts.rs @@ -1,7 +1,7 @@ use crate::build_tables::item::LookaheadSet; use crate::grammars::LexicalGrammar; use crate::nfa::{CharacterSet, NfaCursor}; -use std::collections::HashSet; +use hashbrown::HashSet; use std::fmt; #[derive(Clone, Debug, Default, PartialEq, Eq)] diff --git a/src/grammars.rs b/src/grammars.rs index d23e8ca6..7f587a8c 100644 --- a/src/grammars.rs +++ b/src/grammars.rs @@ -1,6 +1,6 @@ use crate::nfa::Nfa; use crate::rules::{Alias, Associativity, Rule, Symbol}; -use std::collections::HashMap; +use hashbrown::HashMap; #[derive(Clone, Copy, Debug, PartialEq, Eq)] pub(crate) enum VariableType { diff --git a/src/logger.rs b/src/logger.rs new file mode 100644 index 00000000..18df763d --- /dev/null +++ b/src/logger.rs @@ -0,0 +1,29 @@ +use log::{LevelFilter, Log, Metadata, Record}; + +struct Logger { + pub filter: Option, +} + +impl Log for Logger { + fn enabled(&self, _: &Metadata) -> bool { + true + } + + fn log(&self, record: &Record) { + eprintln!( + "[{}] {}", + record + .module_path() + .unwrap_or_default() + .trim_start_matches("rust_tree_sitter_cli::"), + record.args() + ); + } + + fn flush(&self) {} +} + +pub(crate) fn init() { + 
log::set_boxed_logger(Box::new(Logger { filter: None })).unwrap(); + log::set_max_level(LevelFilter::Info); +} diff --git a/src/main.rs b/src/main.rs index cd672186..a08922b7 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,20 +1,23 @@ #[macro_use] -extern crate serde_derive; -#[macro_use] -extern crate serde_json; -#[macro_use] extern crate lazy_static; +#[macro_use] +extern crate log; +#[macro_use] +extern crate serde_derive; +extern crate hashbrown; +extern crate serde_json; -use std::path::PathBuf; use clap::{App, Arg, SubCommand}; use std::env; use std::io::Write; +use std::path::PathBuf; use std::process::{Command, Stdio}; mod build_tables; mod error; mod generate; mod grammars; +mod logger; mod nfa; mod parse_grammar; mod prepare_grammar; @@ -27,7 +30,11 @@ fn main() -> error::Result<()> { .version("0.1") .author("Max Brunsfeld ") .about("Generates and tests parsers") - .subcommand(SubCommand::with_name("generate").about("Generate a parser")) + .subcommand( + SubCommand::with_name("generate") + .about("Generate a parser") + .arg(Arg::with_name("log").long("log")), + ) .subcommand( SubCommand::with_name("parse") .about("Parse a file") @@ -42,7 +49,11 @@ fn main() -> error::Result<()> { ) .get_matches(); - if let Some(_) = matches.subcommand_matches("generate") { + if let Some(matches) = matches.subcommand_matches("generate") { + if matches.is_present("log") { + logger::init(); + } + let mut grammar_path = env::current_dir().expect("Failed to read CWD"); grammar_path.push("grammar.js"); let grammar_json = load_js_grammar_file(grammar_path); @@ -70,7 +81,8 @@ fn load_js_grammar_file(grammar_path: PathBuf) -> String { "{}\nconsole.log(JSON.stringify(require(\"{}\"), null, 2));\n", js_prelude, grammar_path.to_str().unwrap() - ).expect("Failed to write to node's stdin"); + ) + .expect("Failed to write to node's stdin"); drop(node_stdin); let output = node_process .wait_with_output() diff --git a/src/nfa.rs b/src/nfa.rs index e14dac44..1c7ff53b 100644 --- 
a/src/nfa.rs +++ b/src/nfa.rs @@ -320,6 +320,10 @@ impl<'a> NfaCursor<'a> { self.add_states(&mut states); } + pub fn force_reset(&mut self, states: Vec) { + self.state_ids = states + } + pub fn successors(&self) -> impl Iterator { self.state_ids.iter().filter_map(move |id| { if let NfaState::Advance { @@ -352,16 +356,26 @@ impl<'a> NfaCursor<'a> { result[i].1 = max(result[i].1, prec); result[i].2.push(state); result[i].3 |= is_sep; - } else { - let intersection = result[i].0.remove_intersection(&mut chars); - if !intersection.is_empty() { - let mut states = result[i].2.clone(); - states.push(state); + chars = CharacterSet::empty(); + break; + } + + let intersection = result[i].0.remove_intersection(&mut chars); + if !intersection.is_empty() { + let mut states = result[i].2.clone(); + let max_prec = max(result[i].1, prec); + states.push(state); + if result[i].0.is_empty() { + result[i].0 = intersection; + result[i].1 = max_prec; + result[i].2 = states; + result[i].3 |= is_sep; + } else { result.insert( i, ( intersection, - max(result[i].1, prec), + max_prec, states, result[i].3 || is_sep, ), diff --git a/src/parse_grammar.rs b/src/parse_grammar.rs index 07396329..6808f402 100644 --- a/src/parse_grammar.rs +++ b/src/parse_grammar.rs @@ -133,7 +133,7 @@ mod tests { #[test] fn test_parse_grammar() { - let grammar = parse_grammar(&json!({ + let grammar = parse_grammar(r#"{ "name": "my_lang", "rules": { "file": { @@ -148,7 +148,7 @@ mod tests { "value": "foo" } } - }).to_string()).unwrap(); + }"#).unwrap(); assert_eq!(grammar.name, "my_lang"); assert_eq!(grammar.variables, vec![ diff --git a/src/prepare_grammar/expand_repeats.rs b/src/prepare_grammar/expand_repeats.rs index f3811c5f..4589bd11 100644 --- a/src/prepare_grammar/expand_repeats.rs +++ b/src/prepare_grammar/expand_repeats.rs @@ -1,7 +1,7 @@ use super::ExtractedSyntaxGrammar; use crate::grammars::{Variable, VariableType}; use crate::rules::{Rule, Symbol}; -use std::collections::HashMap; +use hashbrown::HashMap; 
use std::mem; struct Expander { diff --git a/src/prepare_grammar/extract_tokens.rs b/src/prepare_grammar/extract_tokens.rs index 5f3f6e16..115933ee 100644 --- a/src/prepare_grammar/extract_tokens.rs +++ b/src/prepare_grammar/extract_tokens.rs @@ -2,7 +2,7 @@ use super::{ExtractedLexicalGrammar, ExtractedSyntaxGrammar, InternedGrammar}; use crate::error::{Error, Result}; use crate::grammars::{ExternalToken, Variable, VariableType}; use crate::rules::{MetadataParams, Rule, Symbol, SymbolType}; -use std::collections::HashMap; +use hashbrown::HashMap; use std::mem; pub(super) fn extract_tokens( diff --git a/src/prepare_grammar/process_inlines.rs b/src/prepare_grammar/process_inlines.rs index 0d7f6827..24bbc14d 100644 --- a/src/prepare_grammar/process_inlines.rs +++ b/src/prepare_grammar/process_inlines.rs @@ -1,5 +1,5 @@ use crate::grammars::{InlinedProductionMap, Production, ProductionStep, SyntaxGrammar}; -use std::collections::HashMap; +use hashbrown::HashMap; #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] struct ProductionStepId { diff --git a/src/render/mod.rs b/src/render/mod.rs index cbb8ba0d..250218c1 100644 --- a/src/render/mod.rs +++ b/src/render/mod.rs @@ -1,9 +1,9 @@ use crate::grammars::{ExternalToken, LexicalGrammar, SyntaxGrammar, VariableType}; use crate::nfa::CharacterSet; use crate::rules::{Alias, AliasMap, Symbol, SymbolType}; -use crate::tables::{LexState, LexTable, ParseAction, ParseTable, ParseTableEntry}; +use crate::tables::{AdvanceAction, LexState, LexTable, ParseAction, ParseTable, ParseTableEntry}; use core::ops::Range; -use std::collections::{HashMap, HashSet}; +use hashbrown::{HashMap, HashSet}; use std::fmt::Write; use std::mem::swap; @@ -372,17 +372,14 @@ impl Generator { if self.add_character_set_condition(&characters, &ruled_out_characters) { add!(self, ")\n"); indent!(self); - if action.in_main_token { - add_line!(self, "ADVANCE({});", action.state); - } else { - add_line!(self, "SKIP({});", action.state); - } + 
self.add_advance_action(&action); if let CharacterSet::Include(chars) = characters { ruled_out_characters.extend(chars.iter().map(|c| *c as u32)); } dedent!(self); } else { self.buffer.truncate(previous_length); + self.add_advance_action(&action); } } @@ -494,6 +491,14 @@ impl Generator { }) } + fn add_advance_action(&mut self, action: &AdvanceAction) { + if action.in_main_token { + add_line!(self, "ADVANCE({});", action.state); + } else { + add_line!(self, "SKIP({});", action.state); + } + } + fn add_lex_modes_list(&mut self) { self.get_external_scanner_state_id(HashSet::new()); diff --git a/src/rules.rs b/src/rules.rs index 77e50d3c..ad16c632 100644 --- a/src/rules.rs +++ b/src/rules.rs @@ -1,4 +1,4 @@ -use std::collections::HashMap; +use hashbrown::HashMap; #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)] pub(crate) enum SymbolType { diff --git a/src/tables.rs b/src/tables.rs index 1c125621..21222135 100644 --- a/src/tables.rs +++ b/src/tables.rs @@ -1,6 +1,6 @@ use crate::nfa::CharacterSet; use crate::rules::{Alias, Associativity, Symbol}; -use std::collections::HashMap; +use hashbrown::HashMap; pub(crate) type AliasSequenceId = usize; pub(crate) type ParseStateId = usize; From 92d4fe419c291f48233a8cbcd5073111e2ebfaa7 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 3 Jan 2019 10:30:59 -0800 Subject: [PATCH 083/208] Fix character set intersection bugs --- src/nfa.rs | 159 +++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 125 insertions(+), 34 deletions(-) diff --git a/src/nfa.rs b/src/nfa.rs index 1c7ff53b..b746200f 100644 --- a/src/nfa.rs +++ b/src/nfa.rs @@ -149,14 +149,18 @@ impl CharacterSet { CharacterSet::Include(removed) } CharacterSet::Exclude(other_chars) => { - let removed = remove_chars(chars, other_chars, true); + let mut result_exclusion = chars.clone(); + result_exclusion.extend(other_chars.iter().cloned()); + result_exclusion.sort_unstable(); + result_exclusion.dedup(); + remove_chars(chars, 
other_chars, true); let mut included_characters = Vec::new(); let mut other_included_characters = Vec::new(); swap(&mut included_characters, other_chars); swap(&mut other_included_characters, chars); *self = CharacterSet::Include(included_characters); *other = CharacterSet::Include(other_included_characters); - CharacterSet::Exclude(removed) + CharacterSet::Exclude(result_exclusion) } }, } @@ -351,35 +355,24 @@ impl<'a> NfaCursor<'a> { for (chars, prec, state, is_sep) in iter { let mut chars = chars.clone(); let mut i = 0; - while i < result.len() { - if result[i].0 == chars { - result[i].1 = max(result[i].1, prec); - result[i].2.push(state); - result[i].3 |= is_sep; - chars = CharacterSet::empty(); - break; - } - + while i < result.len() && !chars.is_empty() { let intersection = result[i].0.remove_intersection(&mut chars); if !intersection.is_empty() { - let mut states = result[i].2.clone(); - let max_prec = max(result[i].1, prec); - states.push(state); + let mut intersection_states = result[i].2.clone(); + match intersection_states.binary_search(&state) { + Err(j) => intersection_states.insert(j, state), + _ => {} + } + let intersection_entry = ( + intersection, + max(result[i].1, prec), + intersection_states, + result[i].3 || is_sep, + ); if result[i].0.is_empty() { - result[i].0 = intersection; - result[i].1 = max_prec; - result[i].2 = states; - result[i].3 |= is_sep; + result[i] = intersection_entry; } else { - result.insert( - i, - ( - intersection, - max_prec, - states, - result[i].3 || is_sep, - ), - ); + result.insert(i, intersection_entry); i += 1; } } @@ -444,6 +437,7 @@ mod tests { #[test] fn test_group_successors() { let table = [ + // overlapping character classes ( vec![ (CharacterSet::empty().add_range('a', 'f'), 0, 1, false), @@ -460,6 +454,7 @@ mod tests { (CharacterSet::empty().add_range('g', 'i'), 1, vec![2], false), ], ), + // large character class followed by many individual characters ( vec![ (CharacterSet::empty().add_range('a', 'z'), 0, 1, 
false), @@ -483,6 +478,63 @@ mod tests { ), ], ), + // negated character class followed by an individual character + ( + vec![ + (CharacterSet::empty().add_char('0'), 0, 1, false), + (CharacterSet::empty().add_char('b'), 0, 2, false), + ( + CharacterSet::empty().add_range('a', 'f').negate(), + 0, + 3, + false, + ), + (CharacterSet::empty().add_char('c'), 0, 4, false), + ], + vec![ + (CharacterSet::empty().add_char('0'), 0, vec![1, 3], false), + (CharacterSet::empty().add_char('b'), 0, vec![2], false), + (CharacterSet::empty().add_char('c'), 0, vec![4], false), + ( + CharacterSet::empty() + .add_range('a', 'f') + .add_char('0') + .negate(), + 0, + vec![3], + false, + ), + ], + ), + // multiple negated character classes + ( + vec![ + (CharacterSet::Include(vec!['a']), 0, 1, false), + (CharacterSet::Exclude(vec!['a', 'b', 'c']), 0, 2, false), + (CharacterSet::Include(vec!['g']), 0, 6, false), + (CharacterSet::Exclude(vec!['d', 'e', 'f']), 0, 3, false), + (CharacterSet::Exclude(vec!['g', 'h', 'i']), 0, 4, false), + (CharacterSet::Include(vec!['g']), 0, 5, false), + ], + vec![ + (CharacterSet::Include(vec!['a']), 0, vec![1, 3, 4], false), + (CharacterSet::Include(vec!['g']), 0, vec![2, 3, 5, 6], false), + (CharacterSet::Include(vec!['b', 'c']), 0, vec![3, 4], false), + (CharacterSet::Include(vec!['h', 'i']), 0, vec![2, 3], false), + ( + CharacterSet::Include(vec!['d', 'e', 'f']), + 0, + vec![2, 4], + false, + ), + ( + CharacterSet::Exclude(vec!['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i']), + 0, + vec![2, 3, 4], + false, + ), + ], + ), ]; for row in table.iter() { @@ -495,8 +547,8 @@ mod tests { #[test] fn test_character_set_remove_intersection() { - // whitelist - whitelist - // both sets contain 'c', 'd', and 'f' + // A whitelist and an overlapping whitelist. 
+ // Both sets contain 'c', 'd', and 'f' let mut a = CharacterSet::empty().add_range('a', 'f'); let mut b = CharacterSet::empty().add_range('c', 'h'); assert_eq!( @@ -515,8 +567,37 @@ mod tests { assert_eq!(a, CharacterSet::empty().add_range('a', 'b')); assert_eq!(b, CharacterSet::empty().add_range('g', 'h')); - // whitelist - blacklist - // both sets contain 'e', 'f', and 'm' + // A whitelist and a larger whitelist. + let mut a = CharacterSet::empty().add_char('c'); + let mut b = CharacterSet::empty().add_range('a', 'e'); + assert_eq!( + a.remove_intersection(&mut b), + CharacterSet::empty().add_char('c') + ); + assert_eq!(a, CharacterSet::empty()); + assert_eq!( + b, + CharacterSet::empty() + .add_range('a', 'b') + .add_range('d', 'e') + ); + + let mut a = CharacterSet::empty().add_char('c'); + let mut b = CharacterSet::empty().add_range('a', 'e'); + assert_eq!( + b.remove_intersection(&mut a), + CharacterSet::empty().add_char('c') + ); + assert_eq!(a, CharacterSet::empty()); + assert_eq!( + b, + CharacterSet::empty() + .add_range('a', 'b') + .add_range('d', 'e') + ); + + // A whitelist and an intersecting blacklist. + // Both sets contain 'e', 'f', and 'm' let mut a = CharacterSet::empty() .add_range('c', 'h') .add_range('k', 'm'); @@ -545,16 +626,26 @@ mod tests { assert_eq!(a, CharacterSet::Include(vec!['c', 'd', 'g', 'h', 'k', 'l'])); assert_eq!(b, CharacterSet::empty().add_range('a', 'm').negate()); - // blacklist - blacklist - // both sets exclude 'c', 'd', and 'e' + // A blacklist and an overlapping blacklist. 
+ // Both sets exclude 'c', 'd', and 'e' let mut a = CharacterSet::empty().add_range('a', 'e').negate(); let mut b = CharacterSet::empty().add_range('c', 'h').negate(); assert_eq!( a.remove_intersection(&mut b), - CharacterSet::Exclude(vec!['c', 'd', 'e']) + CharacterSet::empty().add_range('a', 'h').negate(), ); assert_eq!(a, CharacterSet::Include(vec!['f', 'g', 'h'])); assert_eq!(b, CharacterSet::Include(vec!['a', 'b'])); + + // A blacklist and a larger blacklist. + let mut a = CharacterSet::empty().add_range('b', 'c').negate(); + let mut b = CharacterSet::empty().add_range('a', 'd').negate(); + assert_eq!( + a.remove_intersection(&mut b), + CharacterSet::empty().add_range('a', 'd').negate(), + ); + assert_eq!(a, CharacterSet::empty().add_char('a').add_char('d')); + assert_eq!(b, CharacterSet::empty()); } #[test] From 82fda8929e0019f6ba676f659677e84000ae1632 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 3 Jan 2019 10:31:14 -0800 Subject: [PATCH 084/208] Add EOF actions to lex table --- src/build_tables/build_lex_table.rs | 97 +++++++++++++++++++++------ src/build_tables/coincident_tokens.rs | 11 +-- src/render/mod.rs | 19 +++--- src/rules.rs | 4 ++ src/tables.rs | 2 +- 5 files changed, 96 insertions(+), 37 deletions(-) diff --git a/src/build_tables/build_lex_table.rs b/src/build_tables/build_lex_table.rs index c002f427..66a4fe43 100644 --- a/src/build_tables/build_lex_table.rs +++ b/src/build_tables/build_lex_table.rs @@ -1,7 +1,8 @@ use super::item::LookaheadSet; use super::token_conflicts::TokenConflictMap; use crate::grammars::{LexicalGrammar, SyntaxGrammar}; -use crate::nfa::NfaCursor; +use crate::nfa::{CharacterSet, NfaCursor}; +use crate::rules::Symbol; use crate::tables::{AdvanceAction, LexState, LexTable, ParseTable}; use std::collections::hash_map::Entry; use std::collections::{BTreeMap, HashMap, VecDeque}; @@ -23,7 +24,6 @@ pub(crate) fn build_lex_table( let mut builder = LexTableBuilder::new(lexical_grammar); for (i, state) in 
parse_table.states.iter_mut().enumerate() { - info!("populate lex state for parse state {}", i); let tokens = LookaheadSet::with(state.terminal_entries.keys().filter_map(|token| { if token.is_terminal() { if keywords.contains(&token) { @@ -31,10 +31,13 @@ pub(crate) fn build_lex_table( } else { Some(*token) } + } else if token.is_eof() { + Some(*token) } else { None } })); + info!("populate lex state for parse state {}", i); state.lex_state_id = builder.add_state_for_tokens(&tokens); } @@ -44,12 +47,18 @@ pub(crate) fn build_lex_table( (table, keyword_lex_table) } +struct QueueEntry { + state_id: usize, + nfa_states: Vec, + eof_valid: bool, +} + struct LexTableBuilder<'a> { lexical_grammar: &'a LexicalGrammar, cursor: NfaCursor<'a>, table: LexTable, - state_queue: VecDeque<(usize, Vec)>, - state_ids_by_nfa_state_set: HashMap, usize>, + state_queue: VecDeque, + state_ids_by_nfa_state_set: HashMap<(Vec, bool), usize>, } impl<'a> LexTableBuilder<'a> { @@ -64,11 +73,19 @@ impl<'a> LexTableBuilder<'a> { } fn add_state_for_tokens(&mut self, tokens: &LookaheadSet) -> usize { + let mut eof_valid = false; let nfa_states = tokens .iter() - .map(|token| self.lexical_grammar.variables[token.index].start_state) + .filter_map(|token| { + if token.is_terminal() { + Some(self.lexical_grammar.variables[token.index].start_state) + } else { + eof_valid = true; + None + } + }) .collect(); - let (state_id, is_new) = self.add_state(nfa_states); + let (state_id, is_new) = self.add_state(nfa_states, eof_valid); if is_new { info!( @@ -81,32 +98,42 @@ impl<'a> LexTableBuilder<'a> { ); } - while let Some((state_id, nfa_states)) = self.state_queue.pop_back() { - self.populate_state(state_id, nfa_states); + while let Some(QueueEntry { + state_id, + nfa_states, + eof_valid, + }) = self.state_queue.pop_front() + { + self.populate_state(state_id, nfa_states, eof_valid); } state_id } - fn add_state(&mut self, nfa_states: Vec) -> (usize, bool) { + fn add_state(&mut self, nfa_states: Vec, eof_valid: 
bool) -> (usize, bool) { self.cursor.reset(nfa_states); match self .state_ids_by_nfa_state_set - .entry(self.cursor.state_ids.clone()) + .entry((self.cursor.state_ids.clone(), eof_valid)) { Entry::Occupied(o) => (*o.get(), false), Entry::Vacant(v) => { let state_id = self.table.states.len(); self.table.states.push(LexState::default()); - self.state_queue.push_back((state_id, v.key().clone())); + self.state_queue.push_back(QueueEntry { + state_id, + nfa_states: v.key().0.clone(), + eof_valid, + }); v.insert(state_id); (state_id, true) } } } - fn populate_state(&mut self, state_id: usize, nfa_states: Vec) { + fn populate_state(&mut self, state_id: usize, nfa_states: Vec, eof_valid: bool) { self.cursor.force_reset(nfa_states); + // The EOF state is represented as an empty list of NFA states. let mut completion = None; for (id, prec) in self.cursor.completions() { if let Some((prev_id, prev_precedence)) = completion { @@ -121,7 +148,24 @@ impl<'a> LexTableBuilder<'a> { completion = Some((id, prec)); } - for (chars, advance_precedence, next_states, is_sep) in self.cursor.grouped_successors() { + info!("raw successors: {:?}", self.cursor.successors().collect::>()); + let successors = self.cursor.grouped_successors(); + + // If EOF is a valid lookahead token, add a transition predicated on the null + // character that leads to the empty set of NFA states. 
+ if eof_valid { + let (next_state_id, _) = self.add_state(Vec::new(), false); + info!("populate state: {}, character: EOF", state_id); + self.table.states[state_id].advance_actions.push(( + CharacterSet::empty().add_char('\0'), + AdvanceAction { + state: next_state_id, + in_main_token: true, + }, + )); + } + + for (chars, advance_precedence, next_states, is_sep) in successors { info!( "populate state: {}, characters: {:?}, precedence: {:?}", state_id, chars, advance_precedence @@ -131,7 +175,7 @@ impl<'a> LexTableBuilder<'a> { continue; } } - let (next_state_id, _) = self.add_state(next_states); + let (next_state_id, _) = self.add_state(next_states, eof_valid && is_sep); self.table.states[state_id].advance_actions.push(( chars, AdvanceAction { @@ -141,8 +185,10 @@ impl<'a> LexTableBuilder<'a> { )); } - if let Some((completion_index, _)) = completion { - self.table.states[state_id].accept_action = Some(completion_index); + if let Some((complete_id, _)) = completion { + self.table.states[state_id].accept_action = Some(Symbol::terminal(complete_id)); + } else if self.cursor.state_ids.is_empty() { + self.table.states[state_id].accept_action = Some(Symbol::end()); } } } @@ -179,11 +225,20 @@ fn shrink_lex_table(table: &mut LexTable, parse_table: &mut ParseTable) { } } - let final_state_replacements = (0..table.states.len()).into_iter().map(|state_id| { - let replacement = state_replacements.get(&state_id).cloned().unwrap_or(state_id); - let prior_removed = state_replacements.iter().take_while(|i| *i.0 < replacement).count(); - replacement - prior_removed - }).collect::>(); + let final_state_replacements = (0..table.states.len()) + .into_iter() + .map(|state_id| { + let replacement = state_replacements + .get(&state_id) + .cloned() + .unwrap_or(state_id); + let prior_removed = state_replacements + .iter() + .take_while(|i| *i.0 < replacement) + .count(); + replacement - prior_removed + }) + .collect::>(); for state in parse_table.states.iter_mut() { state.lex_state_id = 
final_state_replacements[state.lex_state_id]; diff --git a/src/build_tables/coincident_tokens.rs b/src/build_tables/coincident_tokens.rs index 5f2bb3ec..ac5931e1 100644 --- a/src/build_tables/coincident_tokens.rs +++ b/src/build_tables/coincident_tokens.rs @@ -1,10 +1,9 @@ use crate::grammars::LexicalGrammar; use crate::rules::Symbol; use crate::tables::{ParseStateId, ParseTable}; -use std::collections::HashSet; pub(crate) struct CoincidentTokenIndex { - entries: Vec>, + entries: Vec>, n: usize, } @@ -13,20 +12,22 @@ impl CoincidentTokenIndex { let n = lexical_grammar.variables.len(); let mut result = Self { n, - entries: vec![HashSet::new(); n * n], + entries: vec![Vec::new(); n * n], }; for (i, state) in table.states.iter().enumerate() { for symbol in state.terminal_entries.keys() { for other_symbol in state.terminal_entries.keys() { let index = result.index(*symbol, *other_symbol); - result.entries[index].insert(i); + if result.entries[index].last().cloned() != Some(i) { + result.entries[index].push(i); + } } } } result } - pub fn states_with(&self, a: Symbol, b: Symbol) -> &HashSet { + pub fn states_with(&self, a: Symbol, b: Symbol) -> &Vec { &self.entries[self.index(a, b)] } diff --git a/src/render/mod.rs b/src/render/mod.rs index 250218c1..624fa1e0 100644 --- a/src/render/mod.rs +++ b/src/render/mod.rs @@ -125,7 +125,7 @@ impl Generator { .symbols .iter() .filter(|symbol| { - if symbol.is_terminal() { + if symbol.is_terminal() || symbol.is_eof() { true } else if symbol.is_external() { self.syntax_grammar.external_tokens[symbol.index] @@ -359,7 +359,7 @@ impl Generator { add_line!( self, "ACCEPT_TOKEN({})", - self.symbol_ids[&Symbol::terminal(accept_action)] + self.symbol_ids[&accept_action] ); } @@ -462,18 +462,16 @@ impl Generator { let mut prev_range: Option> = None; chars .iter() - .cloned() - .chain(Some('\0')) - .filter_map(move |c| { + .map(|c| (*c, false)) + .chain(Some(('\0', true))) + .filter_map(move |(c, done)| { + if done { + return 
prev_range.clone(); + } if ruled_out_characters.contains(&(c as u32)) { return None; } if let Some(range) = prev_range.clone() { - if c == '\0' { - prev_range = Some(c..c); - return Some(range); - } - let mut prev_range_successor = range.end as u32 + 1; while prev_range_successor < c as u32 { if !ruled_out_characters.contains(&prev_range_successor) { @@ -948,6 +946,7 @@ impl Generator { fn add_character(&mut self, c: char) { if c.is_ascii() { match c { + '\0' => add!(self, "'\\0'"), '\'' => add!(self, "'\\''"), '\\' => add!(self, "'\\\\'"), '\t' => add!(self, "'\\t'"), diff --git a/src/rules.rs b/src/rules.rs index ad16c632..bd0340fc 100644 --- a/src/rules.rs +++ b/src/rules.rs @@ -162,6 +162,10 @@ impl Symbol { self.kind == SymbolType::External } + pub fn is_eof(&self) -> bool { + self.kind == SymbolType::End + } + pub fn non_terminal(index: usize) -> Self { Symbol { kind: SymbolType::NonTerminal, diff --git a/src/tables.rs b/src/tables.rs index 21222135..f400d25c 100644 --- a/src/tables.rs +++ b/src/tables.rs @@ -55,7 +55,7 @@ pub(crate) struct AdvanceAction { #[derive(Clone, Debug, Default, PartialEq, Eq)] pub(crate) struct LexState { pub advance_actions: Vec<(CharacterSet, AdvanceAction)>, - pub accept_action: Option, + pub accept_action: Option, } #[derive(Debug, PartialEq, Eq)] From 02ca84fb4ae339753f2742d69017bdb7c39dda44 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 3 Jan 2019 11:52:45 -0800 Subject: [PATCH 085/208] Add missing ';' in generated code --- src/render/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/render/mod.rs b/src/render/mod.rs index 624fa1e0..dd046c93 100644 --- a/src/render/mod.rs +++ b/src/render/mod.rs @@ -358,7 +358,7 @@ impl Generator { if let Some(accept_action) = state.accept_action { add_line!( self, - "ACCEPT_TOKEN({})", + "ACCEPT_TOKEN({});", self.symbol_ids[&accept_action] ); } From c0f48dff6f3128d94855826e63588847dfcabb61 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 3 Jan 2019 
11:52:57 -0800 Subject: [PATCH 086/208] Fix incorrect NFA generation for string rules --- src/build_tables/build_lex_table.rs | 6 +-- src/prepare_grammar/expand_tokens.rs | 63 +++++++++++++++++++++++++++- 2 files changed, 63 insertions(+), 6 deletions(-) diff --git a/src/build_tables/build_lex_table.rs b/src/build_tables/build_lex_table.rs index 66a4fe43..6cd9a1ce 100644 --- a/src/build_tables/build_lex_table.rs +++ b/src/build_tables/build_lex_table.rs @@ -148,8 +148,8 @@ impl<'a> LexTableBuilder<'a> { completion = Some((id, prec)); } - info!("raw successors: {:?}", self.cursor.successors().collect::>()); let successors = self.cursor.grouped_successors(); + info!("populate state: {}, successors: {:?}", state_id, successors); // If EOF is a valid lookahead token, add a transition predicated on the null // character that leads to the empty set of NFA states. @@ -166,10 +166,6 @@ impl<'a> LexTableBuilder<'a> { } for (chars, advance_precedence, next_states, is_sep) in successors { - info!( - "populate state: {}, characters: {:?}, precedence: {:?}", - state_id, chars, advance_precedence - ); if let Some((_, completed_precedence)) = completion { if advance_precedence < completed_precedence { continue; diff --git a/src/prepare_grammar/expand_tokens.rs b/src/prepare_grammar/expand_tokens.rs index fdf085f6..61b1897c 100644 --- a/src/prepare_grammar/expand_tokens.rs +++ b/src/prepare_grammar/expand_tokens.rs @@ -89,7 +89,8 @@ impl NfaBuilder { } Rule::String(s) => { for c in s.chars().rev() { - self.push_advance(CharacterSet::empty().add_char(c), self.nfa.last_state_id()); + self.push_advance(CharacterSet::empty().add_char(c), next_state_id); + next_state_id = self.nfa.last_state_id(); } Ok(s.len() > 0) } @@ -102,6 +103,8 @@ impl NfaBuilder { alternative_state_ids.push(next_state_id); } } + alternative_state_ids.sort_unstable(); + alternative_state_ids.dedup(); alternative_state_ids.retain(|i| *i != self.nfa.last_state_id()); for alternative_state_id in 
alternative_state_ids { self.push_split(alternative_state_id); @@ -542,6 +545,64 @@ mod tests { ("aeeeef", Some((2, "aeeee"))), ], }, + Row { + rules: vec![ + Rule::seq(vec![ + Rule::string("a"), + Rule::choice(vec![ + Rule::string("b"), + Rule::string("c"), + ]), + Rule::string("d"), + ]) + ], + separators: vec![], + examples: vec![ + ("abd", Some((0, "abd"))), + ("acd", Some((0, "acd"))), + ("abc", None), + ("ad", None), + ("d", None), + ("a", None), + ] + }, + // nested choices within sequences + Row { + rules: vec![ + Rule::seq(vec![ + Rule::pattern("[0-9]+"), + Rule::choice(vec![ + Rule::Blank, + Rule::choice(vec![ + Rule::seq(vec![ + Rule::choice(vec![ + Rule::string("e"), + Rule::string("E") + ]), + Rule::choice(vec![ + Rule::Blank, + Rule::choice(vec![ + Rule::string("+"), + Rule::string("-"), + ]) + ]), + Rule::pattern("[0-9]+"), + ]) + ]) + ]), + ]), + ], + separators: vec![], + examples: vec![ + ("12", Some((0, "12"))), + ("12e", Some((0, "12"))), + ("12g", Some((0, "12"))), + ("12e3", Some((0, "12e3"))), + ("12e+", Some((0, "12"))), + ("12E+34 +", Some((0, "12E+34"))), + ("12e34", Some((0, "12e34"))), + ], + }, ]; for Row { From 70f00d1a1e2e82582c576605d7f3e10c01345511 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 3 Jan 2019 13:49:37 -0800 Subject: [PATCH 087/208] Give immediate tokens higher implicit precedence than other tokens --- src/build_tables/token_conflicts.rs | 17 ++++++++--------- src/grammars.rs | 2 +- src/prepare_grammar/expand_tokens.rs | 16 +++++++++++----- src/prepare_grammar/extract_simple_aliases.rs | 6 +++--- 4 files changed, 23 insertions(+), 18 deletions(-) diff --git a/src/build_tables/token_conflicts.rs b/src/build_tables/token_conflicts.rs index 18a80484..91edadec 100644 --- a/src/build_tables/token_conflicts.rs +++ b/src/build_tables/token_conflicts.rs @@ -2,6 +2,7 @@ use crate::build_tables::item::LookaheadSet; use crate::grammars::LexicalGrammar; use crate::nfa::{CharacterSet, NfaCursor}; use hashbrown::HashSet; +use 
std::cmp::Ordering; use std::fmt; #[derive(Clone, Debug, Default, PartialEq, Eq)] @@ -71,16 +72,14 @@ impl<'a> TokenConflictMap<'a> { return false; } - match ( - grammar.variables[left.1].is_string, - grammar.variables[right.1].is_string, - ) { - (true, false) => return true, - (false, true) => return false, - _ => {} + match grammar.variables[left.1] + .implicit_precedence + .cmp(&grammar.variables[right.1].implicit_precedence) + { + Ordering::Less => false, + Ordering::Greater => true, + Ordering::Equal => left.1 < right.1, } - - left.0 < right.0 } } diff --git a/src/grammars.rs b/src/grammars.rs index 7f587a8c..f82d6b02 100644 --- a/src/grammars.rs +++ b/src/grammars.rs @@ -36,7 +36,7 @@ pub(crate) struct InputGrammar { pub(crate) struct LexicalVariable { pub name: String, pub kind: VariableType, - pub is_string: bool, + pub implicit_precedence: i32, pub start_state: u32, } diff --git a/src/prepare_grammar/expand_tokens.rs b/src/prepare_grammar/expand_tokens.rs index 61b1897c..6520c432 100644 --- a/src/prepare_grammar/expand_tokens.rs +++ b/src/prepare_grammar/expand_tokens.rs @@ -14,11 +14,17 @@ struct NfaBuilder { precedence_stack: Vec, } -fn is_string(rule: &Rule) -> bool { +fn get_implicit_precedence(rule: &Rule) -> i32 { match rule { - Rule::String(_) => true, - Rule::Metadata { rule, .. 
} => is_string(rule), - _ => false, + Rule::String(_) => 1, + Rule::Metadata { rule, params } => { + if params.is_main_token { + get_implicit_precedence(rule) + 2 + } else { + get_implicit_precedence(rule) + } + } + _ => 0, } } @@ -67,7 +73,7 @@ pub(crate) fn expand_tokens(mut grammar: ExtractedLexicalGrammar) -> Result Date: Thu, 3 Jan 2019 13:49:50 -0800 Subject: [PATCH 088/208] Fix logic for identifying error recovery tokens --- src/build_tables/coincident_tokens.rs | 46 ++- src/build_tables/mod.rs | 54 ++- src/build_tables/shrink_parse_table.rs | 464 +++++++++++++------------ 3 files changed, 311 insertions(+), 253 deletions(-) diff --git a/src/build_tables/coincident_tokens.rs b/src/build_tables/coincident_tokens.rs index ac5931e1..62295073 100644 --- a/src/build_tables/coincident_tokens.rs +++ b/src/build_tables/coincident_tokens.rs @@ -1,23 +1,26 @@ use crate::grammars::LexicalGrammar; use crate::rules::Symbol; use crate::tables::{ParseStateId, ParseTable}; +use std::fmt; -pub(crate) struct CoincidentTokenIndex { +pub(crate) struct CoincidentTokenIndex<'a> { entries: Vec>, + grammar: &'a LexicalGrammar, n: usize, } -impl CoincidentTokenIndex { - pub fn new(table: &ParseTable, lexical_grammar: &LexicalGrammar) -> Self { +impl<'a> CoincidentTokenIndex<'a> { + pub fn new(table: &ParseTable, lexical_grammar: &'a LexicalGrammar) -> Self { let n = lexical_grammar.variables.len(); let mut result = Self { n, + grammar: lexical_grammar, entries: vec![Vec::new(); n * n], }; for (i, state) in table.states.iter().enumerate() { for symbol in state.terminal_entries.keys() { for other_symbol in state.terminal_entries.keys() { - let index = result.index(*symbol, *other_symbol); + let index = result.index(symbol.index, other_symbol.index); if result.entries[index].last().cloned() != Some(i) { result.entries[index].push(i); } @@ -28,18 +31,41 @@ impl CoincidentTokenIndex { } pub fn states_with(&self, a: Symbol, b: Symbol) -> &Vec { - &self.entries[self.index(a, b)] + 
&self.entries[self.index(a.index, b.index)] } pub fn contains(&self, a: Symbol, b: Symbol) -> bool { - !self.entries[self.index(a, b)].is_empty() + !self.entries[self.index(a.index, b.index)].is_empty() } - fn index(&self, a: Symbol, b: Symbol) -> usize { - if a.index < b.index { - a.index * self.n + b.index + fn index(&self, a: usize, b: usize) -> usize { + if a < b { + a * self.n + b } else { - b.index * self.n + a.index + b * self.n + a } } } + +impl<'a> fmt::Debug for CoincidentTokenIndex<'a> { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "CoincidentTokenIndex {{\n")?; + + write!(f, " entries: {{\n")?; + for i in 0..self.n { + write!(f, " {}: {{\n", self.grammar.variables[i].name)?; + for j in 0..self.n { + write!( + f, + " {}: {:?},\n", + self.grammar.variables[j].name, + self.entries[self.index(i, j)].len() + )?; + } + write!(f, " }},\n")?; + } + write!(f, " }},")?; + write!(f, "}}")?; + Ok(()) + } +} diff --git a/src/build_tables/mod.rs b/src/build_tables/mod.rs index 207431dd..84659600 100644 --- a/src/build_tables/mod.rs +++ b/src/build_tables/mod.rs @@ -47,6 +47,7 @@ pub(crate) fn build_tables( syntax_grammar, simple_aliases, &token_conflict_map, + &keywords, ); let (main_lex_table, keyword_lex_table) = build_lex_table(&mut parse_table, syntax_grammar, lexical_grammar, &keywords); @@ -67,15 +68,22 @@ fn populate_error_state( ) { let state = &mut parse_table.states[0]; let n = lexical_grammar.variables.len(); + + // First identify the *conflict-free tokens*: tokens that do not overlap with + // any other token in any way. 
let conflict_free_tokens = LookaheadSet::with((0..n).into_iter().filter_map(|i| { - let conflicts_with_other_tokens = (0..n).into_iter().all(|j| { - j == i - || coincident_token_index.contains(Symbol::terminal(i), Symbol::terminal(j)) - || !token_conflict_map.does_conflict(i, j) + let conflicts_with_other_tokens = (0..n).into_iter().any(|j| { + j != i + && !coincident_token_index.contains(Symbol::terminal(i), Symbol::terminal(j)) + && token_conflict_map.does_conflict(i, j) }); if conflicts_with_other_tokens { None } else { + info!( + "error recovery - token {} has no conflicts", + lexical_grammar.variables[i].name + ); Some(Symbol::terminal(i)) } })); @@ -85,19 +93,32 @@ fn populate_error_state( actions: vec![ParseAction::Recover], }; + // Exclude from the error-recovery state any token that conflicts with one of + // the *conflict-free tokens* identified above. for i in 0..n { let symbol = Symbol::terminal(i); - let can_be_used_for_recovery = conflict_free_tokens.contains(&symbol) - || conflict_free_tokens.iter().all(|t| { - coincident_token_index.contains(symbol, t) - || !token_conflict_map.does_conflict(i, t.index) - }); - if can_be_used_for_recovery { - state - .terminal_entries - .entry(symbol) - .or_insert_with(|| recover_entry.clone()); + if !conflict_free_tokens.contains(&symbol) { + if syntax_grammar.word_token != Some(symbol) { + if let Some(t) = conflict_free_tokens.iter().find(|t| { + !coincident_token_index.contains(symbol, *t) + && token_conflict_map.does_conflict(symbol.index, t.index) + }) { + info!( + "error recovery - exclude token {} because of conflict with {}", + lexical_grammar.variables[i].name, lexical_grammar.variables[t.index].name + ); + continue; + } + } } + info!( + "error recovery - include token {}", + lexical_grammar.variables[i].name + ); + state + .terminal_entries + .entry(symbol) + .or_insert_with(|| recover_entry.clone()); } for (i, external_token) in syntax_grammar.external_tokens.iter().enumerate() { @@ -134,7 +155,10 @@ fn 
identify_keywords( if all_chars_are_alphabetical(&cursor) && token_conflict_map.does_match_same_string(i, word_token.index) { - info!("Keywords - add candidate {}", lexical_grammar.variables[i].name); + info!( + "Keywords - add candidate {}", + lexical_grammar.variables[i].name + ); Some(Symbol::terminal(i)) } else { None diff --git a/src/build_tables/shrink_parse_table.rs b/src/build_tables/shrink_parse_table.rs index 33b72c32..64a4b259 100644 --- a/src/build_tables/shrink_parse_table.rs +++ b/src/build_tables/shrink_parse_table.rs @@ -1,3 +1,4 @@ +use super::item::LookaheadSet; use super::token_conflicts::TokenConflictMap; use crate::grammars::{SyntaxGrammar, VariableType}; use crate::rules::{AliasMap, Symbol}; @@ -9,265 +10,272 @@ pub(crate) fn shrink_parse_table( syntax_grammar: &SyntaxGrammar, simple_aliases: &AliasMap, token_conflict_map: &TokenConflictMap, + keywords: &LookaheadSet, ) { - remove_unit_reductions(parse_table, syntax_grammar, simple_aliases); - merge_compatible_states(parse_table, syntax_grammar, token_conflict_map); - remove_unused_states(parse_table); + let mut optimizer = Optimizer { + parse_table, + syntax_grammar, + token_conflict_map, + keywords, + simple_aliases, + }; + optimizer.remove_unit_reductions(); + optimizer.merge_compatible_states(); + optimizer.remove_unused_states(); } -fn remove_unit_reductions( - parse_table: &mut ParseTable, - syntax_grammar: &SyntaxGrammar, - simple_aliases: &AliasMap, -) { - let mut aliased_symbols = HashSet::new(); - for variable in &syntax_grammar.variables { - for production in &variable.productions { - for step in &production.steps { - if step.alias.is_some() { - aliased_symbols.insert(step.symbol); +struct Optimizer<'a> { + parse_table: &'a mut ParseTable, + syntax_grammar: &'a SyntaxGrammar, + token_conflict_map: &'a TokenConflictMap<'a>, + keywords: &'a LookaheadSet, + simple_aliases: &'a AliasMap, +} + +impl<'a> Optimizer<'a> { + fn remove_unit_reductions(&mut self) { + let mut aliased_symbols = 
HashSet::new(); + for variable in &self.syntax_grammar.variables { + for production in &variable.productions { + for step in &production.steps { + if step.alias.is_some() { + aliased_symbols.insert(step.symbol); + } } } } + + let mut unit_reduction_symbols_by_state = HashMap::new(); + for (i, state) in self.parse_table.states.iter().enumerate() { + let mut only_unit_reductions = true; + let mut unit_reduction_symbol = None; + for (_, entry) in &state.terminal_entries { + for action in &entry.actions { + match action { + ParseAction::ShiftExtra => continue, + ParseAction::Reduce { + child_count: 1, + alias_sequence_id: 0, + symbol, + .. + } => { + if !self.simple_aliases.contains_key(&symbol) + && !aliased_symbols.contains(&symbol) + && self.syntax_grammar.variables[symbol.index].kind + != VariableType::Named + && (unit_reduction_symbol.is_none() + || unit_reduction_symbol == Some(symbol)) + { + unit_reduction_symbol = Some(symbol); + continue; + } + } + _ => {} + } + only_unit_reductions = false; + break; + } + + if !only_unit_reductions { + break; + } + } + + if let Some(symbol) = unit_reduction_symbol { + if only_unit_reductions { + unit_reduction_symbols_by_state.insert(i, *symbol); + } + } + } + + for state in self.parse_table.states.iter_mut() { + let mut done = false; + while !done { + done = true; + state.update_referenced_states(|other_state_id, state| { + if let Some(symbol) = unit_reduction_symbols_by_state.get(&other_state_id) { + done = false; + state.nonterminal_entries[symbol] + } else { + other_state_id + } + }) + } + } } - let mut unit_reduction_symbols_by_state = HashMap::new(); - for (i, state) in parse_table.states.iter().enumerate() { - let mut only_unit_reductions = true; - let mut unit_reduction_symbol = None; - for (_, entry) in &state.terminal_entries { - for action in &entry.actions { - match action { - ParseAction::ShiftExtra => continue, - ParseAction::Reduce { - child_count: 1, - alias_sequence_id: 0, - symbol, - .. 
- } => { - if !simple_aliases.contains_key(&symbol) - && !aliased_symbols.contains(&symbol) - && syntax_grammar.variables[symbol.index].kind != VariableType::Named - && (unit_reduction_symbol.is_none() - || unit_reduction_symbol == Some(symbol)) - { - unit_reduction_symbol = Some(symbol); + fn merge_compatible_states(&mut self) { + let mut state_ids_by_signature = HashMap::new(); + for (i, state) in self.parse_table.states.iter().enumerate() { + state_ids_by_signature + .entry(state.unfinished_item_signature) + .or_insert(Vec::new()) + .push(i); + } + + let mut deleted_states = HashSet::new(); + loop { + let mut state_replacements = HashMap::new(); + for (_, state_ids) in &state_ids_by_signature { + for i in state_ids { + for j in state_ids { + if j == i { + break; + } + if deleted_states.contains(j) || deleted_states.contains(i) { continue; } + if self.merge_parse_state(*j, *i) { + deleted_states.insert(*i); + state_replacements.insert(*i, *j); + } } - _ => {} } - only_unit_reductions = false; + } + + if state_replacements.is_empty() { break; } - if !only_unit_reductions { - break; - } - } - - if let Some(symbol) = unit_reduction_symbol { - if only_unit_reductions { - unit_reduction_symbols_by_state.insert(i, *symbol); + for state in self.parse_table.states.iter_mut() { + state.update_referenced_states(|other_state_id, _| { + *state_replacements + .get(&other_state_id) + .unwrap_or(&other_state_id) + }); } } } - for state in parse_table.states.iter_mut() { - let mut done = false; - while !done { - done = true; - state.update_referenced_states(|other_state_id, state| { - if let Some(symbol) = unit_reduction_symbols_by_state.get(&other_state_id) { - done = false; - state.nonterminal_entries[symbol] - } else { - other_state_id - } - }) - } - } -} + fn merge_parse_state(&mut self, left: usize, right: usize) -> bool { + let left_state = &self.parse_table.states[left]; + let right_state = &self.parse_table.states[right]; -fn merge_compatible_states( - parse_table: &mut 
ParseTable, - syntax_grammar: &SyntaxGrammar, - token_conflict_map: &TokenConflictMap, -) { - let mut state_ids_by_signature = HashMap::new(); - for (i, state) in parse_table.states.iter().enumerate() { - state_ids_by_signature - .entry(state.unfinished_item_signature) - .or_insert(Vec::new()) - .push(i); - } - - let mut deleted_states = HashSet::new(); - loop { - let mut state_replacements = HashMap::new(); - for (_, state_ids) in &state_ids_by_signature { - for i in state_ids { - for j in state_ids { - if j == i { - break; - } - if deleted_states.contains(j) || deleted_states.contains(i) { - continue; - } - if merge_parse_state(syntax_grammar, token_conflict_map, parse_table, *j, *i) { - deleted_states.insert(*i); - state_replacements.insert(*i, *j); - } - } - } - } - - if state_replacements.is_empty() { - break; - } - - for state in parse_table.states.iter_mut() { - state.update_referenced_states(|other_state_id, _| { - *state_replacements - .get(&other_state_id) - .unwrap_or(&other_state_id) - }); - } - } -} - -fn merge_parse_state( - syntax_grammar: &SyntaxGrammar, - token_conflict_map: &TokenConflictMap, - parse_table: &mut ParseTable, - left: usize, - right: usize, -) -> bool { - let left_state = &parse_table.states[left]; - let right_state = &parse_table.states[right]; - - if left_state.nonterminal_entries != right_state.nonterminal_entries { - return false; - } - - for (symbol, left_entry) in &left_state.terminal_entries { - if let Some(right_entry) = right_state.terminal_entries.get(symbol) { - if right_entry.actions != left_entry.actions { - return false; - } - } else if !can_add_entry_to_state( - syntax_grammar, - token_conflict_map, - right_state, - *symbol, - left_entry, - ) { + if left_state.nonterminal_entries != right_state.nonterminal_entries { return false; } - } - let mut symbols_to_add = Vec::new(); - for (symbol, right_entry) in &right_state.terminal_entries { - if !left_state.terminal_entries.contains_key(&symbol) { - if 
!can_add_entry_to_state( - syntax_grammar, - token_conflict_map, - left_state, - *symbol, - right_entry, - ) { - return false; - } - symbols_to_add.push(*symbol); - } - } - - for symbol in symbols_to_add { - let entry = parse_table.states[right].terminal_entries[&symbol].clone(); - parse_table.states[left] - .terminal_entries - .insert(symbol, entry); - } - - true -} - -fn can_add_entry_to_state( - syntax_grammar: &SyntaxGrammar, - token_conflict_map: &TokenConflictMap, - state: &ParseState, - token: Symbol, - entry: &ParseTableEntry, -) -> bool { - // Do not add external tokens; they could conflict lexically with any of the state's - // existing lookahead tokens. - if token.is_external() { - return false; - } - - // Only merge parse states by allowing existing reductions to happen - // with additional lookahead tokens. Do not alter parse states in ways - // that allow entirely new types of actions to happen. - if state.terminal_entries.iter().all(|(_, e)| e != entry) { - return false; - } - match entry.actions.last() { - Some(ParseAction::Reduce { .. }) => {} - _ => return false, - } - - // Do not add tokens which are both internal and external. Their validity could - // influence the behavior of the external scanner. - if syntax_grammar - .external_tokens - .iter() - .any(|t| t.corresponding_internal_token == Some(token)) - { - return false; - } - - // Do not add a token if it conflicts with an existing token. 
- if token.is_terminal() { - for existing_token in state.terminal_entries.keys() { - if token_conflict_map.does_conflict(token.index, existing_token.index) { + for (symbol, left_entry) in &left_state.terminal_entries { + if let Some(right_entry) = right_state.terminal_entries.get(symbol) { + if right_entry.actions != left_entry.actions { + return false; + } + } else if !self.can_add_entry_to_state(right_state, *symbol, left_entry) { return false; } } + + let mut symbols_to_add = Vec::new(); + for (symbol, right_entry) in &right_state.terminal_entries { + if !left_state.terminal_entries.contains_key(&symbol) { + if !self.can_add_entry_to_state(left_state, *symbol, right_entry) { + return false; + } + symbols_to_add.push(*symbol); + } + } + + for symbol in symbols_to_add { + let entry = self.parse_table.states[right].terminal_entries[&symbol].clone(); + self.parse_table.states[left] + .terminal_entries + .insert(symbol, entry); + } + + true } - true -} - -fn remove_unused_states(parse_table: &mut ParseTable) { - let mut state_usage_map = vec![false; parse_table.states.len()]; - - state_usage_map[0] = true; - state_usage_map[1] = true; - - for state in &parse_table.states { - for referenced_state in state.referenced_states() { - state_usage_map[referenced_state] = true; + fn can_add_entry_to_state( + &self, + state: &ParseState, + token: Symbol, + entry: &ParseTableEntry, + ) -> bool { + // Do not add external tokens; they could conflict lexically with any of the state's + // existing lookahead tokens. + if token.is_external() { + return false; } + + // Only merge_compatible_states parse states by allowing existing reductions to happen + // with additional lookahead tokens. Do not alter parse states in ways + // that allow entirely new types of actions to happen. + if state.terminal_entries.iter().all(|(_, e)| e != entry) { + return false; + } + match entry.actions.last() { + Some(ParseAction::Reduce { .. 
}) => {} + _ => return false, + } + + // Do not add tokens which are both internal and external. Their validity could + // influence the behavior of the external scanner. + if self + .syntax_grammar + .external_tokens + .iter() + .any(|t| t.corresponding_internal_token == Some(token)) + { + return false; + } + + let is_word_token = self.syntax_grammar.word_token == Some(token); + let is_keyword = self.keywords.contains(&token); + + // Do not add a token if it conflicts with an existing token. + if token.is_terminal() { + for existing_token in state.terminal_entries.keys() { + if (is_word_token && self.keywords.contains(existing_token)) + || is_keyword && self.syntax_grammar.word_token.as_ref() == Some(existing_token) + { + continue; + } + if self + .token_conflict_map + .does_conflict(token.index, existing_token.index) + || self + .token_conflict_map + .does_match_same_string(token.index, existing_token.index) + { + return false; + } + } + } + + true } - let mut removed_predecessor_count = 0; - let mut state_replacement_map = vec![0; parse_table.states.len()]; - for state_id in 0..parse_table.states.len() { - state_replacement_map[state_id] = state_id - removed_predecessor_count; - if !state_usage_map[state_id] { - removed_predecessor_count += 1; + + fn remove_unused_states(&mut self) { + let mut state_usage_map = vec![false; self.parse_table.states.len()]; + + state_usage_map[0] = true; + state_usage_map[1] = true; + + for state in &self.parse_table.states { + for referenced_state in state.referenced_states() { + state_usage_map[referenced_state] = true; + } } - } - let mut state_id = 0; - let mut original_state_id = 0; - while state_id < parse_table.states.len() { - if state_usage_map[original_state_id] { - parse_table.states[state_id].update_referenced_states(|other_state_id, _| { - state_replacement_map[other_state_id] - }); - state_id += 1; - } else { - parse_table.states.remove(state_id); + let mut removed_predecessor_count = 0; + let mut 
state_replacement_map = vec![0; self.parse_table.states.len()]; + for state_id in 0..self.parse_table.states.len() { + state_replacement_map[state_id] = state_id - removed_predecessor_count; + if !state_usage_map[state_id] { + removed_predecessor_count += 1; + } + } + let mut state_id = 0; + let mut original_state_id = 0; + while state_id < self.parse_table.states.len() { + if state_usage_map[original_state_id] { + self.parse_table.states[state_id].update_referenced_states(|other_state_id, _| { + state_replacement_map[other_state_id] + }); + state_id += 1; + } else { + self.parse_table.states.remove(state_id); + } + original_state_id += 1; } - original_state_id += 1; } } From 5d3d161c057f112baed490bb767f16cfecde9948 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 3 Jan 2019 14:08:24 -0800 Subject: [PATCH 089/208] Respect simple aliases in code gen --- src/render/mod.rs | 58 +++++++++++++++++++++++++---------------------- 1 file changed, 31 insertions(+), 27 deletions(-) diff --git a/src/render/mod.rs b/src/render/mod.rs index dd046c93..0c0e6e59 100644 --- a/src/render/mod.rs +++ b/src/render/mod.rs @@ -233,12 +233,13 @@ impl Generator { indent!(self); for symbol in self.parse_table.symbols.iter() { if *symbol != Symbol::end() { - add_line!( - self, - "[{}] = \"{}\",", - self.symbol_ids[&symbol], - self.sanitize_string(self.metadata_for_symbol(*symbol).0) + let name = self.sanitize_string( + self.simple_aliases + .get(symbol) + .map(|alias| alias.value.as_str()) + .unwrap_or(self.metadata_for_symbol(*symbol).0), ); + add_line!(self, "[{}] = \"{}\",", self.symbol_ids[&symbol], name); } } for (alias, symbol) in &self.alias_map { @@ -265,22 +266,27 @@ impl Generator { for symbol in &self.parse_table.symbols { add_line!(self, "[{}] = {{", self.symbol_ids[&symbol]); indent!(self); - match self.metadata_for_symbol(*symbol).1 { - VariableType::Named => { - add_line!(self, ".visible = true,"); - add_line!(self, ".named = true,"); - } - VariableType::Anonymous => { - 
add_line!(self, ".visible = true,"); - add_line!(self, ".named = false,"); - } - VariableType::Hidden => { - add_line!(self, ".visible = false,"); - add_line!(self, ".named = true,"); - } - VariableType::Auxiliary => { - add_line!(self, ".visible = false,"); - add_line!(self, ".named = false,"); + if let Some(Alias { is_named, .. }) = self.simple_aliases.get(symbol) { + add_line!(self, ".visible = true,"); + add_line!(self, ".named = {},", is_named); + } else { + match self.metadata_for_symbol(*symbol).1 { + VariableType::Named => { + add_line!(self, ".visible = true,"); + add_line!(self, ".named = true,"); + } + VariableType::Anonymous => { + add_line!(self, ".visible = true,"); + add_line!(self, ".named = false,"); + } + VariableType::Hidden => { + add_line!(self, ".visible = false,"); + add_line!(self, ".named = true,"); + } + VariableType::Auxiliary => { + add_line!(self, ".visible = false,"); + add_line!(self, ".named = false,"); + } } } dedent!(self); @@ -356,11 +362,7 @@ impl Generator { fn add_lex_state(&mut self, state: LexState) { if let Some(accept_action) = state.accept_action { - add_line!( - self, - "ACCEPT_TOKEN({});", - self.symbol_ids[&accept_action] - ); + add_line!(self, "ACCEPT_TOKEN({});", self.symbol_ids[&accept_action]); } let mut ruled_out_characters = HashSet::new(); @@ -397,7 +399,9 @@ impl Generator { self.add_character_range_conditions(ranges, false) } CharacterSet::Exclude(chars) => { - let ranges = Self::get_ranges(chars, ruled_out_characters); + let ranges = Some('\0'..'\0') + .into_iter() + .chain(Self::get_ranges(chars, ruled_out_characters)); self.add_character_range_conditions(ranges, true) } } From bf9556dadc470dd2c543f9aab94070cc801e3d96 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 3 Jan 2019 16:35:16 -0800 Subject: [PATCH 090/208] Fix recursive processing of rule inlining --- src/build_tables/build_lex_table.rs | 10 +- src/build_tables/build_parse_table.rs | 40 ++-- src/build_tables/item.rs | 48 ++-- 
src/build_tables/item_set_builder.rs | 30 ++- src/prepare_grammar/process_inlines.rs | 311 ++++++++++++------------- 5 files changed, 230 insertions(+), 209 deletions(-) diff --git a/src/build_tables/build_lex_table.rs b/src/build_tables/build_lex_table.rs index 6cd9a1ce..60810f83 100644 --- a/src/build_tables/build_lex_table.rs +++ b/src/build_tables/build_lex_table.rs @@ -23,7 +23,7 @@ pub(crate) fn build_lex_table( } let mut builder = LexTableBuilder::new(lexical_grammar); - for (i, state) in parse_table.states.iter_mut().enumerate() { + for state in parse_table.states.iter_mut() { let tokens = LookaheadSet::with(state.terminal_entries.keys().filter_map(|token| { if token.is_terminal() { if keywords.contains(&token) { @@ -37,7 +37,6 @@ pub(crate) fn build_lex_table( None } })); - info!("populate lex state for parse state {}", i); state.lex_state_id = builder.add_state_for_tokens(&tokens); } @@ -199,16 +198,17 @@ fn shrink_lex_table(table: &mut LexTable, parse_table: &mut ParseTable) { continue; } for (j, state_j) in table.states.iter().enumerate() { - if state_replacements.contains_key(&j) { - continue; - } if j == i { break; } + if state_replacements.contains_key(&j) { + continue; + } if state_i == state_j { info!("replace state {} with state {}", i, j); state_replacements.insert(i, j); done = false; + break; } } } diff --git a/src/build_tables/build_parse_table.rs b/src/build_tables/build_parse_table.rs index ada34dff..6f930463 100644 --- a/src/build_tables/build_parse_table.rs +++ b/src/build_tables/build_parse_table.rs @@ -63,7 +63,28 @@ impl<'a> ParseTableBuilder<'a> { ), ); - self.process_part_state_queue()?; + while let Some(entry) = self.parse_state_queue.pop_front() { + // info!( + // "state: {}, item set: {}", + // entry.state_id, + // ParseItemSetDisplay( + // &self.item_sets_by_state_id[entry.state_id], + // self.syntax_grammar, + // self.lexical_grammar, + // ) + // ); + + let item_set = self + .item_set_builder + 
.transitive_closure(&self.item_sets_by_state_id[entry.state_id]); + self.add_actions( + entry.preceding_symbols, + entry.preceding_auxiliary_symbols, + entry.state_id, + item_set, + )?; + } + self.populate_used_symbols(); self.remove_precedences(); @@ -116,27 +137,12 @@ impl<'a> ParseTableBuilder<'a> { } } - fn process_part_state_queue(&mut self) -> Result<()> { - while let Some(entry) = self.parse_state_queue.pop_front() { - let item_set = self - .item_set_builder - .transitive_closure(&self.item_sets_by_state_id[entry.state_id]); - self.add_actions( - entry.preceding_symbols, - entry.preceding_auxiliary_symbols, - item_set, - entry.state_id, - )?; - } - Ok(()) - } - fn add_actions( &mut self, mut preceding_symbols: SymbolSequence, mut preceding_auxiliary_symbols: Vec, - item_set: ParseItemSet<'a>, state_id: ParseStateId, + item_set: ParseItemSet<'a>, ) -> Result<()> { let mut terminal_successors = HashMap::new(); let mut non_terminal_successors = HashMap::new(); diff --git a/src/build_tables/item.rs b/src/build_tables/item.rs index 511d7bef..d1d0cbbf 100644 --- a/src/build_tables/item.rs +++ b/src/build_tables/item.rs @@ -42,12 +42,19 @@ pub(crate) struct ParseItemSet<'a> { pub entries: BTreeMap, LookaheadSet>, } -pub(crate) struct ParseItemDisplay<'a>(&'a ParseItem<'a>, &'a SyntaxGrammar, &'a LexicalGrammar); +pub(crate) struct ParseItemDisplay<'a>( + pub &'a ParseItem<'a>, + pub &'a SyntaxGrammar, + pub &'a LexicalGrammar +); + pub(crate) struct LookaheadSetDisplay<'a>(&'a LookaheadSet, &'a SyntaxGrammar, &'a LexicalGrammar); + +#[allow(dead_code)] pub(crate) struct ParseItemSetDisplay<'a>( - &'a ParseItemSet<'a>, - &'a SyntaxGrammar, - &'a LexicalGrammar, + pub &'a ParseItemSet<'a>, + pub &'a SyntaxGrammar, + pub &'a LexicalGrammar, ); impl LookaheadSet { @@ -144,14 +151,6 @@ impl LookaheadSet { } result } - - pub fn display_with<'a>( - &'a self, - syntax_grammar: &'a SyntaxGrammar, - lexical_grammar: &'a LexicalGrammar, - ) -> LookaheadSetDisplay<'a> { - 
LookaheadSetDisplay(self, syntax_grammar, lexical_grammar) - } } impl<'a> ParseItem<'a> { @@ -202,14 +201,6 @@ impl<'a> ParseItem<'a> { step_index: self.step_index + 1, } } - - pub fn display_with( - &'a self, - syntax_grammar: &'a SyntaxGrammar, - lexical_grammar: &'a LexicalGrammar, - ) -> ParseItemDisplay<'a> { - ParseItemDisplay(self, syntax_grammar, lexical_grammar) - } } impl<'a> ParseItemSet<'a> { @@ -235,14 +226,6 @@ impl<'a> ParseItemSet<'a> { } } } - - pub fn display_with( - &'a self, - syntax_grammar: &'a SyntaxGrammar, - lexical_grammar: &'a LexicalGrammar, - ) -> ParseItemSetDisplay<'a> { - ParseItemSetDisplay(self, syntax_grammar, lexical_grammar) - } } impl<'a> Default for ParseItemSet<'a> { @@ -253,6 +236,7 @@ impl<'a> Default for ParseItemSet<'a> { } } +#[allow(dead_code)] impl<'a> fmt::Display for ParseItemDisplay<'a> { fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> { if self.0.is_augmented() { @@ -282,6 +266,10 @@ impl<'a> fmt::Display for ParseItemDisplay<'a> { } else { write!(f, "{}", &self.1.variables[step.symbol.index].name)?; } + + if let Some(alias) = &step.alias { + write!(f, " (alias {})", alias.value)?; + } } if self.0.is_done() { @@ -323,8 +311,8 @@ impl<'a> fmt::Display for ParseItemSetDisplay<'a> { writeln!( f, "{}\t{}", - item.display_with(self.1, self.2), - lookaheads.display_with(self.1, self.2) + ParseItemDisplay(item, self.1, self.2), + LookaheadSetDisplay(lookaheads, self.1, self.2) )?; } Ok(()) diff --git a/src/build_tables/item_set_builder.rs b/src/build_tables/item_set_builder.rs index 5714e7e2..939d700c 100644 --- a/src/build_tables/item_set_builder.rs +++ b/src/build_tables/item_set_builder.rs @@ -1,7 +1,8 @@ -use super::item::{LookaheadSet, ParseItem, ParseItemSet}; +use super::item::{LookaheadSet, ParseItem, ParseItemDisplay, ParseItemSet}; use crate::grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar}; use crate::rules::Symbol; use hashbrown::{HashMap, HashSet}; +use std::fmt; #[derive(Clone, 
Debug, PartialEq, Eq)] struct TransitiveClosureAddition<'a> { @@ -16,6 +17,8 @@ struct FollowSetInfo { } pub(crate) struct ParseItemSetBuilder<'a> { + syntax_grammar: &'a SyntaxGrammar, + lexical_grammar: &'a LexicalGrammar, first_sets: HashMap, last_sets: HashMap, inlines: &'a InlinedProductionMap, @@ -35,6 +38,8 @@ impl<'a> ParseItemSetBuilder<'a> { inlines: &'a InlinedProductionMap, ) -> Self { let mut result = Self { + syntax_grammar, + lexical_grammar, first_sets: HashMap::new(), last_sets: HashMap::new(), inlines, @@ -300,3 +305,26 @@ impl<'a> ParseItemSetBuilder<'a> { set.entries.insert(item, lookaheads.clone()); } } + +impl<'a> fmt::Debug for ParseItemSetBuilder<'a> { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "ParseItemSetBuilder {{\n")?; + + write!(f, " additions: {{\n")?; + for (i, variable) in self.syntax_grammar.variables.iter().enumerate() { + write!(f, " {}: {{\n", variable.name)?; + for addition in &self.transitive_closure_additions[i] { + write!( + f, + " {}\n", + ParseItemDisplay(&addition.item, self.syntax_grammar, self.lexical_grammar) + )?; + } + write!(f, " }},\n")?; + } + write!(f, " }},")?; + + write!(f, "}}")?; + Ok(()) + } +} diff --git a/src/prepare_grammar/process_inlines.rs b/src/prepare_grammar/process_inlines.rs index 24bbc14d..9fd2f2c6 100644 --- a/src/prepare_grammar/process_inlines.rs +++ b/src/prepare_grammar/process_inlines.rs @@ -3,6 +3,9 @@ use hashbrown::HashMap; #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] struct ProductionStepId { + // A `None` value here means that the production itself was produced via inlining, + // and is stored in the the builder's `productions` vector, as opposed to being + // stored in one of the grammar's variables. 
variable_index: Option, production_index: usize, step_index: usize, @@ -13,169 +16,166 @@ struct InlinedProductionMapBuilder { productions: Vec, } -impl ProductionStepId { - pub fn successor(&self) -> Self { - Self { - variable_index: self.variable_index, - production_index: self.production_index, - step_index: self.step_index + 1, - } - } -} - -fn production_for_id<'a>( - map: &'a InlinedProductionMapBuilder, - id: ProductionStepId, - grammar: &'a SyntaxGrammar, -) -> &'a Production { - if let Some(variable_index) = id.variable_index { - &grammar.variables[variable_index].productions[id.production_index] - } else { - &map.productions[id.production_index] - } -} - -fn production_step_for_id<'a>( - map: &'a InlinedProductionMapBuilder, - id: ProductionStepId, - grammar: &'a SyntaxGrammar, -) -> Option<&'a ProductionStep> { - production_for_id(map, id, grammar).steps.get(id.step_index) -} - -fn inline<'a>( - map: &'a mut InlinedProductionMapBuilder, - step_id: ProductionStepId, - grammar: &'a SyntaxGrammar, -) -> &'a Vec { - let step = production_step_for_id(map, step_id, grammar).unwrap(); - let mut productions_to_add = grammar.variables[step.symbol.index].productions.clone(); - - let mut i = 0; - while i < productions_to_add.len() { - if let Some(first_symbol) = productions_to_add[i].first_symbol() { - if grammar.variables_to_inline.contains(&first_symbol) { - // Remove the production from the vector, replacing it with a placeholder. - let production = productions_to_add - .splice(i..i + 1, [Production::default()].iter().cloned()) - .next() - .unwrap(); - - // Replace the placeholder with the inlined productions. 
- productions_to_add.splice( - i..i + 1, - grammar.variables[first_symbol.index] - .productions - .iter() - .map(|p| { - let mut p = p.clone(); - p.steps.extend(production.steps[1..].iter().cloned()); - p - }), - ); - continue; - } - } - i += 1; - } - - let result = productions_to_add - .into_iter() - .map(|production_to_add| { - let mut inlined_production = production_for_id(&map, step_id, grammar).clone(); - let removed_step = inlined_production - .steps - .splice( - step_id.step_index..step_id.step_index + 1, - production_to_add.steps.iter().cloned(), - ) - .next() - .unwrap(); - let inserted_steps = &mut inlined_production.steps - [step_id.step_index..step_id.step_index + production_to_add.steps.len()]; - if let Some(alias) = removed_step.alias { - for inserted_step in inserted_steps.iter_mut() { - inserted_step.alias = Some(alias.clone()); - } - } - if let Some(last_inserted_step) = inserted_steps.last_mut() { - last_inserted_step.precedence = removed_step.precedence; - last_inserted_step.associativity = removed_step.associativity; - } - map.productions - .iter() - .position(|p| *p == inlined_production) - .unwrap_or({ - map.productions.push(inlined_production); - map.productions.len() - 1 - }) - }) - .collect(); - - map.production_indices_by_step_id - .entry(step_id) - .or_insert(result) -} - -pub(super) fn process_inlines(grammar: &SyntaxGrammar) -> InlinedProductionMap { - let mut result = InlinedProductionMapBuilder { - productions: Vec::new(), - production_indices_by_step_id: HashMap::new(), - }; - - let mut step_ids_to_process = Vec::new(); - for (variable_index, variable) in grammar.variables.iter().enumerate() { - for production_index in 0..variable.productions.len() { - step_ids_to_process.push(ProductionStepId { - variable_index: Some(variable_index), - production_index, - step_index: 0, - }); - while !step_ids_to_process.is_empty() { - let mut i = 0; - while i < step_ids_to_process.len() { - let step_id = step_ids_to_process[i]; - if let Some(step) 
= production_step_for_id(&result, step_id, grammar) { - if grammar.variables_to_inline.contains(&step.symbol) { - let inlined_step_ids = inline(&mut result, step_id, grammar) - .into_iter() - .cloned() - .map(|production_index| ProductionStepId { - variable_index: None, - production_index, - step_index: step_id.step_index, - }) - .collect::>(); - step_ids_to_process.splice(i..i + 1, inlined_step_ids); +impl InlinedProductionMapBuilder { + fn build<'a>(mut self, grammar: &'a SyntaxGrammar) -> InlinedProductionMap { + let mut step_ids_to_process = Vec::new(); + for (variable_index, variable) in grammar.variables.iter().enumerate() { + for production_index in 0..variable.productions.len() { + step_ids_to_process.push(ProductionStepId { + variable_index: Some(variable_index), + production_index, + step_index: 0, + }); + while !step_ids_to_process.is_empty() { + let mut i = 0; + while i < step_ids_to_process.len() { + let step_id = step_ids_to_process[i]; + if let Some(step) = self.production_step_for_id(step_id, grammar) { + if grammar.variables_to_inline.contains(&step.symbol) { + let inlined_step_ids = self + .inline_production_at_step(step_id, grammar) + .into_iter() + .cloned() + .map(|production_index| ProductionStepId { + variable_index: None, + production_index, + step_index: step_id.step_index, + }); + step_ids_to_process.splice(i..i + 1, inlined_step_ids); + } else { + step_ids_to_process[i] = ProductionStepId { + variable_index: step_id.variable_index, + production_index: step_id.production_index, + step_index: step_id.step_index + 1, + }; + i += 1; + } } else { - step_ids_to_process[i] = step_id.successor(); - i += 1; + step_ids_to_process.remove(i); } - } else { - step_ids_to_process.remove(i); } } } } + + let productions = self.productions; + let production_indices_by_step_id = self.production_indices_by_step_id; + let production_map = production_indices_by_step_id + .into_iter() + .map(|(step_id, production_indices)| { + let production = if let 
Some(variable_index) = step_id.variable_index { + &grammar.variables[variable_index].productions[step_id.production_index] + } else { + &productions[step_id.production_index] + } as *const Production; + ((production, step_id.step_index as u32), production_indices) + }) + .collect(); + + InlinedProductionMap { + productions, + production_map, + } } - // result - let productions = result.productions; - let production_indices_by_step_id = result.production_indices_by_step_id; + fn inline_production_at_step<'a>( + &'a mut self, + step_id: ProductionStepId, + grammar: &'a SyntaxGrammar, + ) -> &'a Vec { + // Build a list of productions produced by inlining rules. + let mut i = 0; + let step_index = step_id.step_index; + let mut productions_to_add = vec![self.production_for_id(step_id, grammar).clone()]; + while i < productions_to_add.len() { + if let Some(step) = productions_to_add[i].steps.get(step_index) { + let symbol = step.symbol.clone(); - let production_map = production_indices_by_step_id - .into_iter() - .map(|(step_id, production_indices)| { - let production = if let Some(variable_index) = step_id.variable_index { - &grammar.variables[variable_index].productions[step_id.production_index] - } else { - &productions[step_id.production_index] - } as *const Production; - ((production, step_id.step_index as u32), production_indices) - }) - .collect(); + if grammar.variables_to_inline.contains(&symbol) { + // Remove the production from the vector, replacing it with a placeholder. + let production = productions_to_add + .splice(i..i + 1, [Production::default()].iter().cloned()) + .next() + .unwrap(); - InlinedProductionMap { productions, production_map } + // Replace the placeholder with the inlined productions. 
+ productions_to_add.splice( + i..i + 1, + grammar.variables[symbol.index].productions.iter().map(|p| { + let mut production = production.clone(); + let removed_step = production + .steps + .splice(step_index..(step_index + 1), p.steps.iter().cloned()) + .next() + .unwrap(); + let inserted_steps = + &mut production.steps[step_index..(step_index + p.steps.len())]; + if let Some(alias) = removed_step.alias { + for inserted_step in inserted_steps.iter_mut() { + inserted_step.alias = Some(alias.clone()); + } + } + if let Some(last_inserted_step) = inserted_steps.last_mut() { + last_inserted_step.precedence = removed_step.precedence; + last_inserted_step.associativity = removed_step.associativity; + } + production + }), + ); + + continue; + } + } + i += 1; + } + + // Store all the computed productions. + let result = productions_to_add + .into_iter() + .map(|production| { + self.productions + .iter() + .position(|p| *p == production) + .unwrap_or({ + self.productions.push(production); + self.productions.len() - 1 + }) + }) + .collect(); + + // Cache these productions based on the original production step. 
+ self.production_indices_by_step_id + .entry(step_id) + .or_insert(result) + } + + fn production_for_id<'a>( + &'a self, + id: ProductionStepId, + grammar: &'a SyntaxGrammar, + ) -> &'a Production { + if let Some(variable_index) = id.variable_index { + &grammar.variables[variable_index].productions[id.production_index] + } else { + &self.productions[id.production_index] + } + } + + fn production_step_for_id<'a>( + &'a self, + id: ProductionStepId, + grammar: &'a SyntaxGrammar, + ) -> Option<&'a ProductionStep> { + self.production_for_id(id, grammar).steps.get(id.step_index) + } +} + +pub(super) fn process_inlines(grammar: &SyntaxGrammar) -> InlinedProductionMap { + InlinedProductionMapBuilder { + productions: Vec::new(), + production_indices_by_step_id: HashMap::new(), + } + .build(grammar) } #[cfg(test)] @@ -234,7 +234,7 @@ mod tests { // Inlining variable 1 yields two productions. assert_eq!( inline_map - .inlined_productions(&grammar.variables[0].productions[0], 1) + .inlined_productions(&grammar.variables[0].productions[0], 1) .unwrap() .cloned() .collect::>(), @@ -446,8 +446,7 @@ mod tests { ProductionStep::new(Symbol::terminal(12)) .with_prec(1, Some(Associativity::Left)), ProductionStep::new(Symbol::terminal(10)), - ProductionStep::new(Symbol::non_terminal(2)) - .with_alias("outer_alias", true), + ProductionStep::new(Symbol::non_terminal(2)).with_alias("outer_alias", true), ] }], ); From 70aa4c2b2d97fbcf6e330f85e4d4fd0df026cfce Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 4 Jan 2019 09:11:44 -0800 Subject: [PATCH 091/208] Add a --no-minimize flag to suppress table minimization for debugging --- src/build_tables/build_lex_table.rs | 22 ++++++++++++++----- src/build_tables/build_parse_table.rs | 13 ++++++++++- ...parse_table.rs => minimize_parse_table.rs} | 14 ++++++------ src/build_tables/mod.rs | 22 +++++++++++++------ src/generate.rs | 5 +++-- src/main.rs | 6 +++-- 6 files changed, 58 insertions(+), 24 deletions(-) rename 
src/build_tables/{shrink_parse_table.rs => minimize_parse_table.rs} (97%) diff --git a/src/build_tables/build_lex_table.rs b/src/build_tables/build_lex_table.rs index 60810f83..9c440f4e 100644 --- a/src/build_tables/build_lex_table.rs +++ b/src/build_tables/build_lex_table.rs @@ -12,6 +12,7 @@ pub(crate) fn build_lex_table( syntax_grammar: &SyntaxGrammar, lexical_grammar: &LexicalGrammar, keywords: &LookaheadSet, + minimize: bool, ) -> (LexTable, LexTable) { let keyword_lex_table; if syntax_grammar.word_token.is_some() { @@ -41,7 +42,10 @@ pub(crate) fn build_lex_table( } let mut table = builder.table; - shrink_lex_table(&mut table, parse_table); + + if minimize { + minimize_lex_table(&mut table, parse_table); + } (table, keyword_lex_table) } @@ -147,14 +151,20 @@ impl<'a> LexTableBuilder<'a> { completion = Some((id, prec)); } + info!( + "lex state: {}, completion: {:?}", + state_id, + completion.map(|(id, prec)| (&self.lexical_grammar.variables[id].name, prec)) + ); + let successors = self.cursor.grouped_successors(); - info!("populate state: {}, successors: {:?}", state_id, successors); + info!("lex state: {}, successors: {:?}", state_id, successors); // If EOF is a valid lookahead token, add a transition predicated on the null // character that leads to the empty set of NFA states. 
if eof_valid { let (next_state_id, _) = self.add_state(Vec::new(), false); - info!("populate state: {}, character: EOF", state_id); + info!("lex state: {}, successor: EOF", state_id); self.table.states[state_id].advance_actions.push(( CharacterSet::empty().add_char('\0'), AdvanceAction { @@ -166,7 +176,9 @@ impl<'a> LexTableBuilder<'a> { for (chars, advance_precedence, next_states, is_sep) in successors { if let Some((_, completed_precedence)) = completion { - if advance_precedence < completed_precedence { + if advance_precedence < completed_precedence + || (advance_precedence == completed_precedence && is_sep) + { continue; } } @@ -188,7 +200,7 @@ impl<'a> LexTableBuilder<'a> { } } -fn shrink_lex_table(table: &mut LexTable, parse_table: &mut ParseTable) { +fn minimize_lex_table(table: &mut LexTable, parse_table: &mut ParseTable) { let mut state_replacements = BTreeMap::new(); let mut done = false; while !done { diff --git a/src/build_tables/build_parse_table.rs b/src/build_tables/build_parse_table.rs index 6f930463..9bccf238 100644 --- a/src/build_tables/build_parse_table.rs +++ b/src/build_tables/build_parse_table.rs @@ -67,7 +67,7 @@ impl<'a> ParseTableBuilder<'a> { // info!( // "state: {}, item set: {}", // entry.state_id, - // ParseItemSetDisplay( + // super::item::ParseItemSetDisplay( // &self.item_sets_by_state_id[entry.state_id], // self.syntax_grammar, // self.lexical_grammar, @@ -77,6 +77,17 @@ impl<'a> ParseTableBuilder<'a> { let item_set = self .item_set_builder .transitive_closure(&self.item_sets_by_state_id[entry.state_id]); + + // info!( + // "state: {}, closed item set: {}", + // entry.state_id, + // super::item::ParseItemSetDisplay( + // &item_set, + // self.syntax_grammar, + // self.lexical_grammar, + // ) + // ); + self.add_actions( entry.preceding_symbols, entry.preceding_auxiliary_symbols, diff --git a/src/build_tables/shrink_parse_table.rs b/src/build_tables/minimize_parse_table.rs similarity index 97% rename from 
src/build_tables/shrink_parse_table.rs rename to src/build_tables/minimize_parse_table.rs index 64a4b259..573bf974 100644 --- a/src/build_tables/shrink_parse_table.rs +++ b/src/build_tables/minimize_parse_table.rs @@ -5,26 +5,26 @@ use crate::rules::{AliasMap, Symbol}; use crate::tables::{ParseAction, ParseState, ParseTable, ParseTableEntry}; use hashbrown::{HashMap, HashSet}; -pub(crate) fn shrink_parse_table( +pub(crate) fn minimize_parse_table( parse_table: &mut ParseTable, syntax_grammar: &SyntaxGrammar, simple_aliases: &AliasMap, token_conflict_map: &TokenConflictMap, keywords: &LookaheadSet, ) { - let mut optimizer = Optimizer { + let mut minimizer = Minimizer { parse_table, syntax_grammar, token_conflict_map, keywords, simple_aliases, }; - optimizer.remove_unit_reductions(); - optimizer.merge_compatible_states(); - optimizer.remove_unused_states(); + minimizer.remove_unit_reductions(); + minimizer.merge_compatible_states(); + minimizer.remove_unused_states(); } -struct Optimizer<'a> { +struct Minimizer<'a> { parse_table: &'a mut ParseTable, syntax_grammar: &'a SyntaxGrammar, token_conflict_map: &'a TokenConflictMap<'a>, @@ -32,7 +32,7 @@ struct Optimizer<'a> { simple_aliases: &'a AliasMap, } -impl<'a> Optimizer<'a> { +impl<'a> Minimizer<'a> { fn remove_unit_reductions(&mut self) { let mut aliased_symbols = HashSet::new(); for variable in &self.syntax_grammar.variables { diff --git a/src/build_tables/mod.rs b/src/build_tables/mod.rs index 84659600..886594f8 100644 --- a/src/build_tables/mod.rs +++ b/src/build_tables/mod.rs @@ -3,14 +3,14 @@ mod build_parse_table; mod coincident_tokens; mod item; mod item_set_builder; -mod shrink_parse_table; +mod minimize_parse_table; mod token_conflicts; use self::build_lex_table::build_lex_table; use self::build_parse_table::build_parse_table; use self::coincident_tokens::CoincidentTokenIndex; use self::item::LookaheadSet; -use self::shrink_parse_table::shrink_parse_table; +use 
self::minimize_parse_table::minimize_parse_table; use self::token_conflicts::TokenConflictMap; use crate::error::Result; use crate::grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar}; @@ -23,6 +23,7 @@ pub(crate) fn build_tables( lexical_grammar: &LexicalGrammar, simple_aliases: &AliasMap, inlines: &InlinedProductionMap, + minimize: bool, ) -> Result<(ParseTable, LexTable, LexTable, Option)> { let (mut parse_table, following_tokens) = build_parse_table(syntax_grammar, lexical_grammar, inlines)?; @@ -42,15 +43,22 @@ pub(crate) fn build_tables( &coincident_token_index, &token_conflict_map, ); - shrink_parse_table( + if minimize { + minimize_parse_table( + &mut parse_table, + syntax_grammar, + simple_aliases, + &token_conflict_map, + &keywords, + ); + } + let (main_lex_table, keyword_lex_table) = build_lex_table( &mut parse_table, syntax_grammar, - simple_aliases, - &token_conflict_map, + lexical_grammar, &keywords, + minimize, ); - let (main_lex_table, keyword_lex_table) = - build_lex_table(&mut parse_table, syntax_grammar, lexical_grammar, &keywords); Ok(( parse_table, main_lex_table, diff --git a/src/generate.rs b/src/generate.rs index cdbbea4f..d574c165 100644 --- a/src/generate.rs +++ b/src/generate.rs @@ -4,14 +4,15 @@ use crate::prepare_grammar::prepare_grammar; use crate::build_tables::build_tables; use crate::render::render_c_code; -pub fn generate_parser_for_grammar(input: &str) -> Result { +pub fn generate_parser_for_grammar(input: &str, minimize: bool) -> Result { let input_grammar = parse_grammar(input)?; let (syntax_grammar, lexical_grammar, inlines, simple_aliases) = prepare_grammar(&input_grammar)?; let (parse_table, main_lex_table, keyword_lex_table, keyword_capture_token) = build_tables( &syntax_grammar, &lexical_grammar, &simple_aliases, - &inlines + &inlines, + minimize )?; let c_code = render_c_code( &input_grammar.name, diff --git a/src/main.rs b/src/main.rs index a08922b7..10820ed1 100644 --- a/src/main.rs +++ b/src/main.rs @@ -33,7 
+33,8 @@ fn main() -> error::Result<()> { .subcommand( SubCommand::with_name("generate") .about("Generate a parser") - .arg(Arg::with_name("log").long("log")), + .arg(Arg::with_name("log").long("log")) + .arg(Arg::with_name("no-minimize").long("no-minimize")), ) .subcommand( SubCommand::with_name("parse") @@ -54,10 +55,11 @@ fn main() -> error::Result<()> { logger::init(); } + let minimize = !matches.is_present("no-minimize"); let mut grammar_path = env::current_dir().expect("Failed to read CWD"); grammar_path.push("grammar.js"); let grammar_json = load_js_grammar_file(grammar_path); - let code = generate::generate_parser_for_grammar(&grammar_json)?; + let code = generate::generate_parser_for_grammar(&grammar_json, minimize)?; println!("{}", code); } From cc0fbc0d9306a838d10a7b258a58fa7f76c55cc3 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 4 Jan 2019 09:12:05 -0800 Subject: [PATCH 092/208] Fix and simplify handling of precedence for completion of tokens --- src/prepare_grammar/expand_tokens.rs | 88 +++++++++++----------------- 1 file changed, 33 insertions(+), 55 deletions(-) diff --git a/src/prepare_grammar/expand_tokens.rs b/src/prepare_grammar/expand_tokens.rs index 6520c432..01b925f9 100644 --- a/src/prepare_grammar/expand_tokens.rs +++ b/src/prepare_grammar/expand_tokens.rs @@ -28,6 +28,13 @@ fn get_implicit_precedence(rule: &Rule) -> i32 { } } +fn get_completion_precedence(rule: &Rule) -> i32 { + match rule { + Rule::Metadata { params, .. } => params.precedence.unwrap_or(0), + _ => 0, + } +} + pub(crate) fn expand_tokens(mut grammar: ExtractedLexicalGrammar) -> Result { let mut builder = NfaBuilder { nfa: Nfa::new(), @@ -52,7 +59,7 @@ pub(crate) fn expand_tokens(mut grammar: ExtractedLexicalGrammar) -> Result) { - let mut i = 0; - while i < state_ids.len() { - let state_id = state_ids[i]; - let (left, right) = match &mut self.nfa.states[state_id as usize] { - NfaState::Accept { precedence, .. 
} => { - *precedence = prec; - return; - } - NfaState::Split(left, right) => (*left, *right), - _ => return, - }; - if !state_ids.contains(&left) { - state_ids.push(left); - } - if !state_ids.contains(&right) { - state_ids.push(right); - } - i += 1; - } - } } #[cfg(test)] @@ -551,17 +535,21 @@ mod tests { ("aeeeef", Some((2, "aeeee"))), ], }, + // immediate tokens with higher precedence Row { rules: vec![ - Rule::seq(vec![ - Rule::string("a"), - Rule::choice(vec![ - Rule::string("b"), - Rule::string("c"), - ]), - Rule::string("d"), - ]) + Rule::prec(1, Rule::pattern("[^a]+")), + Rule::immediate_token(Rule::prec(2, Rule::pattern("[^ab]+"))), ], + separators: vec![Rule::pattern("\\s")], + examples: vec![("cccb", Some((1, "ccc")))], + }, + Row { + rules: vec![Rule::seq(vec![ + Rule::string("a"), + Rule::choice(vec![Rule::string("b"), Rule::string("c")]), + Rule::string("d"), + ])], separators: vec![], examples: vec![ ("abd", Some((0, "abd"))), @@ -570,34 +558,24 @@ mod tests { ("ad", None), ("d", None), ("a", None), - ] + ], }, // nested choices within sequences Row { - rules: vec![ - Rule::seq(vec![ - Rule::pattern("[0-9]+"), - Rule::choice(vec![ - Rule::Blank, + rules: vec![Rule::seq(vec![ + Rule::pattern("[0-9]+"), + Rule::choice(vec![ + Rule::Blank, + Rule::choice(vec![Rule::seq(vec![ + Rule::choice(vec![Rule::string("e"), Rule::string("E")]), Rule::choice(vec![ - Rule::seq(vec![ - Rule::choice(vec![ - Rule::string("e"), - Rule::string("E") - ]), - Rule::choice(vec![ - Rule::Blank, - Rule::choice(vec![ - Rule::string("+"), - Rule::string("-"), - ]) - ]), - Rule::pattern("[0-9]+"), - ]) - ]) - ]), + Rule::Blank, + Rule::choice(vec![Rule::string("+"), Rule::string("-")]), + ]), + Rule::pattern("[0-9]+"), + ])]), ]), - ], + ])], separators: vec![], examples: vec![ ("12", Some((0, "12"))), From d845b81ee961d37e8506a2b421d54b867bb7e3c7 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 4 Jan 2019 09:42:06 -0800 Subject: [PATCH 093/208] Represent nfa transitions as 
structs with named fields, not tuples --- src/build_tables/build_lex_table.rs | 24 ++- src/build_tables/mod.rs | 2 +- src/build_tables/token_conflicts.rs | 26 +-- src/nfa.rs | 259 +++++++++++++++++---------- src/prepare_grammar/expand_tokens.rs | 16 +- 5 files changed, 211 insertions(+), 116 deletions(-) diff --git a/src/build_tables/build_lex_table.rs b/src/build_tables/build_lex_table.rs index 9c440f4e..4212d62b 100644 --- a/src/build_tables/build_lex_table.rs +++ b/src/build_tables/build_lex_table.rs @@ -1,7 +1,7 @@ use super::item::LookaheadSet; use super::token_conflicts::TokenConflictMap; use crate::grammars::{LexicalGrammar, SyntaxGrammar}; -use crate::nfa::{CharacterSet, NfaCursor}; +use crate::nfa::{CharacterSet, NfaCursor, NfaTransition}; use crate::rules::Symbol; use crate::tables::{AdvanceAction, LexState, LexTable, ParseTable}; use std::collections::hash_map::Entry; @@ -157,8 +157,8 @@ impl<'a> LexTableBuilder<'a> { completion.map(|(id, prec)| (&self.lexical_grammar.variables[id].name, prec)) ); - let successors = self.cursor.grouped_successors(); - info!("lex state: {}, successors: {:?}", state_id, successors); + let transitions = self.cursor.transitions(); + info!("lex state: {}, transitions: {:?}", state_id, transitions); // If EOF is a valid lookahead token, add a transition predicated on the null // character that leads to the empty set of NFA states. 
@@ -174,20 +174,26 @@ impl<'a> LexTableBuilder<'a> { )); } - for (chars, advance_precedence, next_states, is_sep) in successors { + for NfaTransition { + characters, + precedence, + states, + is_separator, + } in transitions + { if let Some((_, completed_precedence)) = completion { - if advance_precedence < completed_precedence - || (advance_precedence == completed_precedence && is_sep) + if precedence < completed_precedence + || (precedence == completed_precedence && is_separator) { continue; } } - let (next_state_id, _) = self.add_state(next_states, eof_valid && is_sep); + let (next_state_id, _) = self.add_state(states, eof_valid && is_separator); self.table.states[state_id].advance_actions.push(( - chars, + characters, AdvanceAction { state: next_state_id, - in_main_token: !is_sep, + in_main_token: !is_separator, }, )); } diff --git a/src/build_tables/mod.rs b/src/build_tables/mod.rs index 886594f8..78798732 100644 --- a/src/build_tables/mod.rs +++ b/src/build_tables/mod.rs @@ -239,7 +239,7 @@ fn identify_keywords( } fn all_chars_are_alphabetical(cursor: &NfaCursor) -> bool { - cursor.successors().all(|(chars, _, _, is_sep)| { + cursor.transition_chars().all(|(chars, is_sep)| { if is_sep { true } else if let CharacterSet::Include(chars) = chars { diff --git a/src/build_tables/token_conflicts.rs b/src/build_tables/token_conflicts.rs index 91edadec..cb2b6efe 100644 --- a/src/build_tables/token_conflicts.rs +++ b/src/build_tables/token_conflicts.rs @@ -1,6 +1,6 @@ use crate::build_tables::item::LookaheadSet; use crate::grammars::LexicalGrammar; -use crate::nfa::{CharacterSet, NfaCursor}; +use crate::nfa::{CharacterSet, NfaCursor, NfaTransition}; use hashbrown::HashSet; use std::cmp::Ordering; use std::fmt; @@ -131,7 +131,7 @@ fn get_starting_chars(cursor: &mut NfaCursor, grammar: &LexicalGrammar) -> Vec, } -impl Default for Nfa { - fn default() -> Self { - Self { states: Vec::new() } - } -} - #[derive(Debug)] pub struct NfaCursor<'a> { pub(crate) state_ids: Vec, 
nfa: &'a Nfa, } +#[derive(Debug, PartialEq, Eq)] +pub struct NfaTransition { + pub characters: CharacterSet, + pub is_separator: bool, + pub precedence: i32, + pub states: Vec, +} + +impl Default for Nfa { + fn default() -> Self { + Self { states: Vec::new() } + } +} + impl CharacterSet { pub fn empty() -> Self { CharacterSet::Include(Vec::new()) @@ -328,7 +336,15 @@ impl<'a> NfaCursor<'a> { self.state_ids = states } - pub fn successors(&self) -> impl Iterator { + pub fn transition_chars(&self) -> impl Iterator { + self.raw_transitions().map(|t| (t.0, t.1)) + } + + pub fn transitions(&self) -> Vec { + Self::group_transitions(self.raw_transitions()) + } + + fn raw_transitions(&self) -> impl Iterator { self.state_ids.iter().filter_map(move |id| { if let NfaState::Advance { chars, @@ -337,52 +353,53 @@ impl<'a> NfaCursor<'a> { is_sep, } = &self.nfa.states[*id as usize] { - Some((chars, *precedence, *state_id, *is_sep)) + Some((chars, *is_sep, *precedence, *state_id)) } else { None } }) } - pub fn grouped_successors(&self) -> Vec<(CharacterSet, i32, Vec, bool)> { - Self::group_successors(self.successors()) - } - - fn group_successors<'b>( - iter: impl Iterator, - ) -> Vec<(CharacterSet, i32, Vec, bool)> { - let mut result: Vec<(CharacterSet, i32, Vec, bool)> = Vec::new(); - for (chars, prec, state, is_sep) in iter { + fn group_transitions<'b>( + iter: impl Iterator, + ) -> Vec { + let mut result: Vec = Vec::new(); + for (chars, is_sep, prec, state) in iter { let mut chars = chars.clone(); let mut i = 0; while i < result.len() && !chars.is_empty() { - let intersection = result[i].0.remove_intersection(&mut chars); + let intersection = result[i].characters.remove_intersection(&mut chars); if !intersection.is_empty() { - let mut intersection_states = result[i].2.clone(); + let mut intersection_states = result[i].states.clone(); match intersection_states.binary_search(&state) { Err(j) => intersection_states.insert(j, state), _ => {} } - let intersection_entry = ( - 
intersection, - max(result[i].1, prec), - intersection_states, - result[i].3 || is_sep, - ); - if result[i].0.is_empty() { - result[i] = intersection_entry; + let intersection_transition = NfaTransition { + characters: intersection, + is_separator: result[i].is_separator || is_sep, + precedence: max(result[i].precedence, prec), + states: intersection_states, + }; + if result[i].characters.is_empty() { + result[i] = intersection_transition; } else { - result.insert(i, intersection_entry); + result.insert(i, intersection_transition); i += 1; } } i += 1; } if !chars.is_empty() { - result.push((chars, prec, vec![state], is_sep)); + result.push(NfaTransition { + characters: chars, + precedence: prec, + states: vec![state], + is_separator: is_sep, + }); } } - result.sort_unstable_by(|a, b| a.0.cmp(&b.0)); + result.sort_unstable_by(|a, b| a.characters.cmp(&b.characters)); result } @@ -435,111 +452,173 @@ mod tests { use super::*; #[test] - fn test_group_successors() { + fn test_group_transitions() { let table = [ // overlapping character classes ( vec![ - (CharacterSet::empty().add_range('a', 'f'), 0, 1, false), - (CharacterSet::empty().add_range('d', 'i'), 1, 2, false), + (CharacterSet::empty().add_range('a', 'f'), false, 0, 1), + (CharacterSet::empty().add_range('d', 'i'), false, 1, 2), ], vec![ - (CharacterSet::empty().add_range('a', 'c'), 0, vec![1], false), - ( - CharacterSet::empty().add_range('d', 'f'), - 1, - vec![1, 2], - false, - ), - (CharacterSet::empty().add_range('g', 'i'), 1, vec![2], false), + NfaTransition { + characters: CharacterSet::empty().add_range('a', 'c'), + is_separator: false, + precedence: 0, + states: vec![1], + }, + NfaTransition { + characters: CharacterSet::empty().add_range('d', 'f'), + is_separator: false, + precedence: 1, + states: vec![1, 2], + }, + NfaTransition { + characters: CharacterSet::empty().add_range('g', 'i'), + is_separator: false, + precedence: 1, + states: vec![2], + }, ], ), // large character class followed by many 
individual characters ( vec![ - (CharacterSet::empty().add_range('a', 'z'), 0, 1, false), - (CharacterSet::empty().add_char('d'), 0, 2, false), - (CharacterSet::empty().add_char('i'), 0, 3, false), - (CharacterSet::empty().add_char('f'), 0, 4, false), + (CharacterSet::empty().add_range('a', 'z'), false, 0, 1), + (CharacterSet::empty().add_char('d'), false, 0, 2), + (CharacterSet::empty().add_char('i'), false, 0, 3), + (CharacterSet::empty().add_char('f'), false, 0, 4), ], vec![ - (CharacterSet::empty().add_char('d'), 0, vec![1, 2], false), - (CharacterSet::empty().add_char('f'), 0, vec![1, 4], false), - (CharacterSet::empty().add_char('i'), 0, vec![1, 3], false), - ( - CharacterSet::empty() + NfaTransition { + characters: CharacterSet::empty().add_char('d'), + is_separator: false, + precedence: 0, + states: vec![1, 2], + }, + NfaTransition { + characters: CharacterSet::empty().add_char('f'), + is_separator: false, + precedence: 0, + states: vec![1, 4], + }, + NfaTransition { + characters: CharacterSet::empty().add_char('i'), + is_separator: false, + precedence: 0, + states: vec![1, 3], + }, + NfaTransition { + characters: CharacterSet::empty() .add_range('a', 'c') .add_char('e') .add_range('g', 'h') .add_range('j', 'z'), - 0, - vec![1], - false, - ), + is_separator: false, + precedence: 0, + states: vec![1], + }, ], ), // negated character class followed by an individual character ( vec![ - (CharacterSet::empty().add_char('0'), 0, 1, false), - (CharacterSet::empty().add_char('b'), 0, 2, false), + (CharacterSet::empty().add_char('0'), false, 0, 1), + (CharacterSet::empty().add_char('b'), false, 0, 2), ( CharacterSet::empty().add_range('a', 'f').negate(), + false, 0, 3, - false, ), - (CharacterSet::empty().add_char('c'), 0, 4, false), + (CharacterSet::empty().add_char('c'), false, 0, 4), ], vec![ - (CharacterSet::empty().add_char('0'), 0, vec![1, 3], false), - (CharacterSet::empty().add_char('b'), 0, vec![2], false), - (CharacterSet::empty().add_char('c'), 0, 
vec![4], false), - ( - CharacterSet::empty() + NfaTransition { + characters: CharacterSet::empty().add_char('0'), + precedence: 0, + states: vec![1, 3], + is_separator: false, + }, + NfaTransition { + characters: CharacterSet::empty().add_char('b'), + precedence: 0, + states: vec![2], + is_separator: false, + }, + NfaTransition { + characters: CharacterSet::empty().add_char('c'), + precedence: 0, + states: vec![4], + is_separator: false, + }, + NfaTransition { + characters: CharacterSet::empty() .add_range('a', 'f') .add_char('0') .negate(), - 0, - vec![3], - false, - ), + precedence: 0, + states: vec![3], + is_separator: false, + }, ], ), // multiple negated character classes ( vec![ - (CharacterSet::Include(vec!['a']), 0, 1, false), - (CharacterSet::Exclude(vec!['a', 'b', 'c']), 0, 2, false), - (CharacterSet::Include(vec!['g']), 0, 6, false), - (CharacterSet::Exclude(vec!['d', 'e', 'f']), 0, 3, false), - (CharacterSet::Exclude(vec!['g', 'h', 'i']), 0, 4, false), - (CharacterSet::Include(vec!['g']), 0, 5, false), + (CharacterSet::Include(vec!['a']), false, 0, 1), + (CharacterSet::Exclude(vec!['a', 'b', 'c']), false, 0, 2), + (CharacterSet::Include(vec!['g']), false, 0, 6), + (CharacterSet::Exclude(vec!['d', 'e', 'f']), false, 0, 3), + (CharacterSet::Exclude(vec!['g', 'h', 'i']), false, 0, 4), + (CharacterSet::Include(vec!['g']), false, 0, 5), ], vec![ - (CharacterSet::Include(vec!['a']), 0, vec![1, 3, 4], false), - (CharacterSet::Include(vec!['g']), 0, vec![2, 3, 5, 6], false), - (CharacterSet::Include(vec!['b', 'c']), 0, vec![3, 4], false), - (CharacterSet::Include(vec!['h', 'i']), 0, vec![2, 3], false), - ( - CharacterSet::Include(vec!['d', 'e', 'f']), - 0, - vec![2, 4], - false, - ), - ( - CharacterSet::Exclude(vec!['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i']), - 0, - vec![2, 3, 4], - false, - ), + NfaTransition { + characters: CharacterSet::Include(vec!['a']), + precedence: 0, + states: vec![1, 3, 4], + is_separator: false, + }, + NfaTransition { + 
characters: CharacterSet::Include(vec!['g']), + precedence: 0, + states: vec![2, 3, 5, 6], + is_separator: false, + }, + NfaTransition { + characters: CharacterSet::Include(vec!['b', 'c']), + precedence: 0, + states: vec![3, 4], + is_separator: false, + }, + NfaTransition { + characters: CharacterSet::Include(vec!['h', 'i']), + precedence: 0, + states: vec![2, 3], + is_separator: false, + }, + NfaTransition { + characters: CharacterSet::Include(vec!['d', 'e', 'f']), + precedence: 0, + states: vec![2, 4], + is_separator: false, + }, + NfaTransition { + characters: CharacterSet::Exclude(vec![ + 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', + ]), + precedence: 0, + states: vec![2, 3, 4], + is_separator: false, + }, ], ), ]; for row in table.iter() { assert_eq!( - NfaCursor::group_successors(row.0.iter().map(|(c, p, s, sep)| (c, *p, *s, *sep))), + NfaCursor::group_transitions(row.0.iter().map(|(c, sep, p, s)| (c, *sep, *p, *s))), row.1 ); } diff --git a/src/prepare_grammar/expand_tokens.rs b/src/prepare_grammar/expand_tokens.rs index 01b925f9..91a0e364 100644 --- a/src/prepare_grammar/expand_tokens.rs +++ b/src/prepare_grammar/expand_tokens.rs @@ -372,7 +372,7 @@ impl NfaBuilder { mod tests { use super::*; use crate::grammars::Variable; - use crate::nfa::NfaCursor; + use crate::nfa::{NfaCursor, NfaTransition}; fn simulate_nfa<'a>(grammar: &'a LexicalGrammar, s: &'a str) -> Option<(usize, &'a str)> { let start_states = grammar.variables.iter().map(|v| v.start_state).collect(); @@ -389,14 +389,18 @@ mod tests { result_precedence = precedence; } } - if let Some((_, _, next_states, in_sep)) = cursor - .grouped_successors() + if let Some(NfaTransition { + states, + is_separator, + .. 
+ }) = cursor + .transitions() .into_iter() - .find(|(chars, prec, _, _)| chars.contains(c) && *prec >= result_precedence) + .find(|t| t.characters.contains(c) && t.precedence >= result_precedence) { - cursor.reset(next_states); + cursor.reset(states); end_char += 1; - if in_sep { + if is_separator { start_char = end_char; } } else { From 79b9d5ebed3470195e05b50d3f0b42b21cb7c69b Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 4 Jan 2019 11:19:53 -0800 Subject: [PATCH 094/208] Fix minor differences in generated C code --- src/build_tables/build_parse_table.rs | 12 ++++++------ src/render/mod.rs | 20 +++++++++----------- src/rules.rs | 2 +- 3 files changed, 16 insertions(+), 18 deletions(-) diff --git a/src/build_tables/build_parse_table.rs b/src/build_tables/build_parse_table.rs index 9bccf238..5fc015af 100644 --- a/src/build_tables/build_parse_table.rs +++ b/src/build_tables/build_parse_table.rs @@ -531,7 +531,6 @@ impl<'a> ParseTableBuilder<'a> { } fn populate_used_symbols(&mut self) { - self.parse_table.symbols.push(Symbol::end()); let mut terminal_usages = vec![false; self.lexical_grammar.variables.len()]; let mut non_terminal_usages = vec![false; self.syntax_grammar.variables.len()]; let mut external_usages = vec![false; self.syntax_grammar.external_tokens.len()]; @@ -547,16 +546,17 @@ impl<'a> ParseTableBuilder<'a> { non_terminal_usages[symbol.index] = true; } } - for (i, value) in terminal_usages.into_iter().enumerate() { - if value { - self.parse_table.symbols.push(Symbol::terminal(i)); - } - } for (i, value) in external_usages.into_iter().enumerate() { if value { self.parse_table.symbols.push(Symbol::external(i)); } } + self.parse_table.symbols.push(Symbol::end()); + for (i, value) in terminal_usages.into_iter().enumerate() { + if value { + self.parse_table.symbols.push(Symbol::terminal(i)); + } + } for (i, value) in non_terminal_usages.into_iter().enumerate() { if value { self.parse_table.symbols.push(Symbol::non_terminal(i)); diff --git 
a/src/render/mod.rs b/src/render/mod.rs index 0c0e6e59..61c167bb 100644 --- a/src/render/mod.rs +++ b/src/render/mod.rs @@ -232,15 +232,13 @@ impl Generator { add_line!(self, "static const char *ts_symbol_names[] = {{"); indent!(self); for symbol in self.parse_table.symbols.iter() { - if *symbol != Symbol::end() { - let name = self.sanitize_string( - self.simple_aliases - .get(symbol) - .map(|alias| alias.value.as_str()) - .unwrap_or(self.metadata_for_symbol(*symbol).0), - ); - add_line!(self, "[{}] = \"{}\",", self.symbol_ids[&symbol], name); - } + let name = self.sanitize_string( + self.simple_aliases + .get(symbol) + .map(|alias| alias.value.as_str()) + .unwrap_or(self.metadata_for_symbol(*symbol).0), + ); + add_line!(self, "[{}] = \"{}\",", self.symbol_ids[&symbol], name); } for (alias, symbol) in &self.alias_map { if symbol.is_none() { @@ -864,7 +862,7 @@ impl Generator { fn metadata_for_symbol(&self, symbol: Symbol) -> (&str, VariableType) { match symbol.kind { - SymbolType::End => ("end", VariableType::Auxiliary), + SymbolType::End => ("end", VariableType::Hidden), SymbolType::NonTerminal => { let variable = &self.syntax_grammar.variables[symbol.index]; (&variable.name, variable.kind) @@ -950,7 +948,7 @@ impl Generator { fn add_character(&mut self, c: char) { if c.is_ascii() { match c { - '\0' => add!(self, "'\\0'"), + '\0' => add!(self, "0"), '\'' => add!(self, "'\\''"), '\\' => add!(self, "'\\\\'"), '\t' => add!(self, "'\\t'"), diff --git a/src/rules.rs b/src/rules.rs index bd0340fc..e15070ea 100644 --- a/src/rules.rs +++ b/src/rules.rs @@ -3,9 +3,9 @@ use hashbrown::HashMap; #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)] pub(crate) enum SymbolType { External, + End, Terminal, NonTerminal, - End, } #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)] From baf7f3603c5eca1c338be4665d516ff6d189a020 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 4 Jan 2019 11:30:53 -0800 Subject: [PATCH 095/208] Mark fragile tokens 
--- src/build_tables/mod.rs | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/src/build_tables/mod.rs b/src/build_tables/mod.rs index 78798732..ed47665e 100644 --- a/src/build_tables/mod.rs +++ b/src/build_tables/mod.rs @@ -43,6 +43,11 @@ pub(crate) fn build_tables( &coincident_token_index, &token_conflict_map, ); + mark_fragile_tokens( + &mut parse_table, + lexical_grammar, + &token_conflict_map, + ); if minimize { minimize_parse_table( &mut parse_table, @@ -238,6 +243,34 @@ fn identify_keywords( keywords } +fn mark_fragile_tokens( + parse_table: &mut ParseTable, + lexical_grammar: &LexicalGrammar, + token_conflict_map: &TokenConflictMap, +) { + let n = lexical_grammar.variables.len(); + let mut valid_tokens_mask = Vec::with_capacity(n); + for state in parse_table.states.iter_mut() { + valid_tokens_mask.clear(); + valid_tokens_mask.resize(n, false); + for token in state.terminal_entries.keys() { + if token.is_terminal() { + valid_tokens_mask[token.index] = true; + } + } + for (token, entry) in state.terminal_entries.iter_mut() { + for i in 0..n { + if token_conflict_map.does_overlap(i, token.index) { + if valid_tokens_mask[i] { + entry.reusable = false; + break; + } + } + } + } + } +} + fn all_chars_are_alphabetical(cursor: &NfaCursor) -> bool { cursor.transition_chars().all(|(chars, is_sep)| { if is_sep { From d0c3e26e8409637f4752a4dafe20297fac4420bc Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 4 Jan 2019 11:52:52 -0800 Subject: [PATCH 096/208] Don't let lex state merging be fooled by trivial loops --- src/build_tables/build_lex_table.rs | 21 +++++++++++++-------- src/render/mod.rs | 14 +++++++------- src/tables.rs | 2 +- 3 files changed, 21 insertions(+), 16 deletions(-) diff --git a/src/build_tables/build_lex_table.rs b/src/build_tables/build_lex_table.rs index 4212d62b..bcc1bf3d 100644 --- a/src/build_tables/build_lex_table.rs +++ b/src/build_tables/build_lex_table.rs @@ -168,7 +168,7 @@ impl<'a> 
LexTableBuilder<'a> { self.table.states[state_id].advance_actions.push(( CharacterSet::empty().add_char('\0'), AdvanceAction { - state: next_state_id, + state: Some(next_state_id), in_main_token: true, }, )); @@ -189,10 +189,15 @@ impl<'a> LexTableBuilder<'a> { } } let (next_state_id, _) = self.add_state(states, eof_valid && is_separator); + let next_state = if next_state_id == state_id { + None + } else { + Some(next_state_id) + }; self.table.states[state_id].advance_actions.push(( characters, AdvanceAction { - state: next_state_id, + state: next_state, in_main_token: !is_separator, }, )); @@ -231,10 +236,10 @@ fn minimize_lex_table(table: &mut LexTable, parse_table: &mut ParseTable) { } } for state in table.states.iter_mut() { - for advance_action in state.advance_actions.iter_mut() { - if let Some(new_state_id) = state_replacements.get(&advance_action.1.state) { - advance_action.1.state = *new_state_id; - } + for (_, advance_action) in state.advance_actions.iter_mut() { + advance_action.state = advance_action + .state + .map(|s| state_replacements.get(&s).cloned().unwrap_or(s)) } } } @@ -259,8 +264,8 @@ fn minimize_lex_table(table: &mut LexTable, parse_table: &mut ParseTable) { } for state in table.states.iter_mut() { - for advance_action in state.advance_actions.iter_mut() { - advance_action.1.state = final_state_replacements[advance_action.1.state]; + for (_, advance_action) in state.advance_actions.iter_mut() { + advance_action.state = advance_action.state.map(|s| final_state_replacements[s]); } } diff --git a/src/render/mod.rs b/src/render/mod.rs index 61c167bb..58235fd9 100644 --- a/src/render/mod.rs +++ b/src/render/mod.rs @@ -342,7 +342,7 @@ impl Generator { for (i, state) in lex_table.states.into_iter().enumerate() { add_line!(self, "case {}:", i); indent!(self); - self.add_lex_state(state); + self.add_lex_state(i, state); dedent!(self); } @@ -358,7 +358,7 @@ impl Generator { add_line!(self, ""); } - fn add_lex_state(&mut self, state: LexState) { + fn 
add_lex_state(&mut self, index: usize, state: LexState) { if let Some(accept_action) = state.accept_action { add_line!(self, "ACCEPT_TOKEN({});", self.symbol_ids[&accept_action]); } @@ -372,14 +372,14 @@ impl Generator { if self.add_character_set_condition(&characters, &ruled_out_characters) { add!(self, ")\n"); indent!(self); - self.add_advance_action(&action); + self.add_advance_action(index, &action); if let CharacterSet::Include(chars) = characters { ruled_out_characters.extend(chars.iter().map(|c| *c as u32)); } dedent!(self); } else { self.buffer.truncate(previous_length); - self.add_advance_action(&action); + self.add_advance_action(index, &action); } } @@ -491,11 +491,11 @@ impl Generator { }) } - fn add_advance_action(&mut self, action: &AdvanceAction) { + fn add_advance_action(&mut self, index: usize, action: &AdvanceAction) { if action.in_main_token { - add_line!(self, "ADVANCE({});", action.state); + add_line!(self, "ADVANCE({});", action.state.unwrap_or(index)); } else { - add_line!(self, "SKIP({});", action.state); + add_line!(self, "SKIP({});", action.state.unwrap_or(index)); } } diff --git a/src/tables.rs b/src/tables.rs index f400d25c..c8f7e1e4 100644 --- a/src/tables.rs +++ b/src/tables.rs @@ -48,7 +48,7 @@ pub(crate) struct ParseTable { #[derive(Clone, Debug, PartialEq, Eq)] pub(crate) struct AdvanceAction { - pub state: LexStateId, + pub state: Option, pub in_main_token: bool, } From ba96e4961b9710728e6a9ef02be475e2e942d3ca Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 4 Jan 2019 12:42:45 -0800 Subject: [PATCH 097/208] Simplify error handling, finish up LR conflict message generation --- src/build_tables/build_parse_table.rs | 93 +++++++++++++++++++++++++-- src/error.rs | 17 +++-- src/main.rs | 11 +++- src/prepare_grammar/expand_tokens.rs | 9 +-- src/prepare_grammar/extract_tokens.rs | 12 ++-- src/prepare_grammar/intern_symbols.rs | 14 ++-- 6 files changed, 117 insertions(+), 39 deletions(-) diff --git 
a/src/build_tables/build_parse_table.rs b/src/build_tables/build_parse_table.rs index 5fc015af..e642c3cd 100644 --- a/src/build_tables/build_parse_table.rs +++ b/src/build_tables/build_parse_table.rs @@ -455,9 +455,9 @@ impl<'a> ParseTableBuilder<'a> { self.symbol_name(&conflicting_lookahead) ) .unwrap(); - write!(&mut msg, "Possible interpretations:\n").unwrap(); + write!(&mut msg, "Possible interpretations:\n\n").unwrap(); for (i, item) in conflicting_items.iter().enumerate() { - write!(&mut msg, "\n {}:", i).unwrap(); + write!(&mut msg, " {}:", i + 1).unwrap(); for preceding_symbol in preceding_symbols .iter() @@ -501,11 +501,89 @@ impl<'a> ParseTableBuilder<'a> { ) .unwrap(); } + + write!(&mut msg, "\n").unwrap(); } - // TODO - generate suggested resolutions + let mut resolution_count = 0; + write!(&mut msg, "\nPossible resolutions:\n\n").unwrap(); + let shift_items = conflicting_items + .iter() + .filter(|i| !i.is_done()) + .cloned() + .collect::>(); + if shift_items.len() > 0 { + resolution_count += 1; + write!( + &mut msg, + " {}: Specify a higher precedence in", + resolution_count + ) + .unwrap(); + for (i, item) in shift_items.iter().enumerate() { + if i > 0 { + write!(&mut msg, " and").unwrap(); + } + write!( + &mut msg, + " `{}`", + self.symbol_name(&Symbol::non_terminal(item.variable_index as usize)) + ) + .unwrap(); + } + write!(&mut msg, " than in the other rules.\n").unwrap(); + } - Err(Error::ConflictError(msg)) + if considered_associativity { + resolution_count += 1; + write!( + &mut msg, + " {}: Specify a left or right associativity in ", + resolution_count + ) + .unwrap(); + for (i, item) in conflicting_items.iter().filter(|i| i.is_done()).enumerate() { + if i > 0 { + write!(&mut msg, " and ").unwrap(); + } + write!( + &mut msg, + "{}", + self.symbol_name(&Symbol::non_terminal(item.variable_index as usize)) + ) + .unwrap(); + } + } + + for item in &conflicting_items { + if item.is_done() { + resolution_count += 1; + write!( + &mut msg, + " {}: 
Specify a higher precedence in `{}` than in the other rules.\n", + resolution_count, + self.symbol_name(&Symbol::non_terminal(item.variable_index as usize)) + ) + .unwrap(); + } + } + + resolution_count += 1; + write!( + &mut msg, + " {}: Add a conflict for these rules: ", + resolution_count + ) + .unwrap(); + for (i, symbol) in actual_conflict.iter().enumerate() { + if i > 0 { + write!(&mut msg, ", ").unwrap(); + } + write!(&mut msg, "{}", self.symbol_name(symbol)).unwrap(); + } + write!(&mut msg, "\n").unwrap(); + + Err(Error(msg)) } fn get_auxiliary_node_info( @@ -517,8 +595,11 @@ impl<'a> ParseTableBuilder<'a> { .entries .keys() .filter_map(|item| { - if item.symbol() == Some(symbol) { - None + let variable_index = item.variable_index as usize; + if item.symbol() == Some(symbol) + && !self.syntax_grammar.variables[variable_index].is_auxiliary() + { + Some(Symbol::non_terminal(variable_index)) } else { None } diff --git a/src/error.rs b/src/error.rs index b03efa93..9a5801f8 100644 --- a/src/error.rs +++ b/src/error.rs @@ -1,25 +1,24 @@ #[derive(Debug)] -pub enum Error { - GrammarError(String), - SymbolError(String), - RegexError(String), - ConflictError(String), -} +pub struct Error(pub String); pub type Result = std::result::Result; impl Error { pub fn grammar(message: &str) -> Self { - Error::GrammarError(message.to_string()) + Error(format!("Grammar error: {}", message)) } pub fn regex(message: &str) -> Self { - Error::RegexError(message.to_string()) + Error(format!("Regex error: {}", message)) + } + + pub fn undefined_symbol(name: &str) -> Self { + Error(format!("Undefined symbol `{}`", name)) } } impl From for Error { fn from(error: serde_json::Error) -> Self { - Error::GrammarError(error.to_string()) + Error(error.to_string()) } } diff --git a/src/main.rs b/src/main.rs index 10820ed1..c3dbf33d 100644 --- a/src/main.rs +++ b/src/main.rs @@ -11,7 +11,7 @@ use clap::{App, Arg, SubCommand}; use std::env; use std::io::Write; use std::path::PathBuf; -use 
std::process::{Command, Stdio}; +use std::process::{exit, Command, Stdio}; mod build_tables; mod error; @@ -25,7 +25,14 @@ mod render; mod rules; mod tables; -fn main() -> error::Result<()> { +fn main() { + if let Err(e) = run() { + eprintln!("{}", e.0); + exit(1); + } +} + +fn run() -> error::Result<()> { let matches = App::new("tree-sitter") .version("0.1") .author("Max Brunsfeld ") diff --git a/src/prepare_grammar/expand_tokens.rs b/src/prepare_grammar/expand_tokens.rs index 91a0e364..2678df19 100644 --- a/src/prepare_grammar/expand_tokens.rs +++ b/src/prepare_grammar/expand_tokens.rs @@ -64,12 +64,7 @@ pub(crate) fn expand_tokens(mut grammar: ExtractedLexicalGrammar) -> Result { - Error::RegexError(format!("Rule {} {}", variable.name, msg)) - } - _ => e, - })?; + .map_err(|Error(msg)| Error(format!("Rule {} {}", variable.name, msg)))?; if !is_immediate_token { builder.is_sep = true; @@ -97,7 +92,7 @@ impl NfaBuilder { Rule::Pattern(s) => { let ast = parse::Parser::new() .parse(&s) - .map_err(|e| Error::GrammarError(e.to_string()))?; + .map_err(|e| Error(e.to_string()))?; self.expand_regex(&ast, next_state_id) } Rule::String(s) => { diff --git a/src/prepare_grammar/extract_tokens.rs b/src/prepare_grammar/extract_tokens.rs index 115933ee..5a54d34e 100644 --- a/src/prepare_grammar/extract_tokens.rs +++ b/src/prepare_grammar/extract_tokens.rs @@ -89,7 +89,7 @@ pub(super) fn extract_tokens( if let Rule::Symbol(symbol) = rule { let new_symbol = symbol_replacer.replace_symbol(symbol); if new_symbol.is_non_terminal() { - return Err(Error::GrammarError(format!( + return Err(Error(format!( "Non-token symbol '{}' cannot be used as an extra token", &variables[new_symbol.index].name ))); @@ -110,7 +110,7 @@ pub(super) fn extract_tokens( let rule = symbol_replacer.replace_symbols_in_rule(&external_token.rule); if let Rule::Symbol(symbol) = rule { if symbol.is_non_terminal() { - return Err(Error::GrammarError(format!( + return Err(Error(format!( "Rule '{}' cannot be used as 
both an external token and a non-terminal rule", &variables[symbol.index].name, ))); @@ -130,7 +130,7 @@ pub(super) fn extract_tokens( }) } } else { - return Err(Error::GrammarError(format!( + return Err(Error(format!( "Non-symbol rules cannot be used as external tokens" ))); } @@ -140,7 +140,7 @@ pub(super) fn extract_tokens( if let Some(token) = grammar.word_token { let token = symbol_replacer.replace_symbol(token); if token.is_non_terminal() { - return Err(Error::GrammarError(format!( + return Err(Error(format!( "Non-terminal symbol '{}' cannot be used as the word token", &variables[token.index].name ))); @@ -475,7 +475,7 @@ mod test { grammar.extra_tokens = vec![Rule::non_terminal(1)]; match extract_tokens(grammar) { - Err(Error::GrammarError(s)) => { + Err(Error(s)) => { assert_eq!( s, "Non-token symbol 'rule_1' cannot be used as an extra token" @@ -503,7 +503,7 @@ mod test { grammar.external_tokens = vec![Variable::named("rule_1", Rule::non_terminal(1))]; match extract_tokens(grammar) { - Err(Error::GrammarError(s)) => { + Err(Error(s)) => { assert_eq!(s, "Rule 'rule_1' cannot be used as both an external token and a non-terminal rule"); } _ => { diff --git a/src/prepare_grammar/intern_symbols.rs b/src/prepare_grammar/intern_symbols.rs index 5165875c..2e6f5b1c 100644 --- a/src/prepare_grammar/intern_symbols.rs +++ b/src/prepare_grammar/intern_symbols.rs @@ -7,7 +7,7 @@ pub(super) fn intern_symbols(grammar: &InputGrammar) -> Result let interner = Interner { grammar }; if variable_type_for_name(&grammar.variables[0].name) == VariableType::Hidden { - return Err(Error::GrammarError( + return Err(Error( "Grammar's start rule must be visible".to_string(), )); } @@ -44,7 +44,7 @@ pub(super) fn intern_symbols(grammar: &InputGrammar) -> Result interned_conflict.push( interner .intern_name(&name) - .ok_or_else(|| symbol_error(name))?, + .ok_or_else(|| Error::undefined_symbol(name))?, ); } expected_conflicts.push(interned_conflict); @@ -62,7 +62,7 @@ pub(super) fn 
intern_symbols(grammar: &InputGrammar) -> Result word_token = Some( interner .intern_name(&name) - .ok_or_else(|| symbol_error(&name))?, + .ok_or_else(|| Error::undefined_symbol(&name))?, ); } @@ -107,7 +107,7 @@ impl<'a> Interner<'a> { if let Some(symbol) = self.intern_name(&name) { Ok(Rule::Symbol(symbol)) } else { - Err(symbol_error(name)) + Err(Error::undefined_symbol(name)) } } @@ -134,10 +134,6 @@ impl<'a> Interner<'a> { } } -fn symbol_error(name: &str) -> Error { - Error::SymbolError(format!("Undefined symbol '{}'", name)) -} - fn variable_type_for_name(name: &str) -> VariableType { if name.starts_with("_") { VariableType::Hidden @@ -223,7 +219,7 @@ mod tests { let result = intern_symbols(&build_grammar(vec![Variable::named("x", Rule::named("y"))])); match result { - Err(Error::SymbolError(message)) => assert_eq!(message, "Undefined symbol 'y'"), + Err(Error(message)) => assert_eq!(message, "Undefined symbol 'y'"), _ => panic!("Expected an error but got none"), } } From a0e65018ba8282fc8c77734092618e87cfb8cf2d Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 4 Jan 2019 13:01:07 -0800 Subject: [PATCH 098/208] Fix computation of MAX_ALIAS_SEQUENCE_LENGTH --- src/build_tables/build_parse_table.rs | 6 +++++- src/render/mod.rs | 10 ++-------- src/tables.rs | 1 + 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/src/build_tables/build_parse_table.rs b/src/build_tables/build_parse_table.rs index e642c3cd..7fb668dd 100644 --- a/src/build_tables/build_parse_table.rs +++ b/src/build_tables/build_parse_table.rs @@ -675,6 +675,9 @@ impl<'a> ParseTableBuilder<'a> { while alias_sequence.last() == Some(&None) { alias_sequence.pop(); } + if item.production.steps.len() > self.parse_table.max_aliased_production_length { + self.parse_table.max_aliased_production_length = item.production.steps.len() + } if let Some(index) = self .parse_table .alias_sequences @@ -721,8 +724,9 @@ pub(crate) fn build_parse_table( parse_state_queue: VecDeque::new(), parse_table: 
ParseTable { states: Vec::new(), - alias_sequences: Vec::new(), symbols: Vec::new(), + alias_sequences: Vec::new(), + max_aliased_production_length: 0, }, following_tokens: vec![LookaheadSet::new(); lexical_grammar.variables.len()], } diff --git a/src/render/mod.rs b/src/render/mod.rs index 58235fd9..8d3ee195 100644 --- a/src/render/mod.rs +++ b/src/render/mod.rs @@ -191,17 +191,11 @@ impl Generator { "#define EXTERNAL_TOKEN_COUNT {}", self.syntax_grammar.external_tokens.len() ); - if let Some(max_alias_sequence_length) = self - .parse_table - .alias_sequences - .iter() - .map(|seq| seq.len()) - .max() - { + if self.parse_table.max_aliased_production_length > 0 { add_line!( self, "#define MAX_ALIAS_SEQUENCE_LENGTH {}", - max_alias_sequence_length + self.parse_table.max_aliased_production_length ); } add_line!(self, ""); diff --git a/src/tables.rs b/src/tables.rs index c8f7e1e4..edbbaaab 100644 --- a/src/tables.rs +++ b/src/tables.rs @@ -44,6 +44,7 @@ pub(crate) struct ParseTable { pub states: Vec, pub symbols: Vec, pub alias_sequences: Vec>>, + pub max_aliased_production_length: usize, } #[derive(Clone, Debug, PartialEq, Eq)] From 3a727af2645fb41d3f2151d1b1b4893232e49c06 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 4 Jan 2019 15:26:48 -0800 Subject: [PATCH 099/208] Add flag for logging the item set associated with a certain parse state --- src/build_tables/build_parse_table.rs | 43 ++++++++++++++------------- src/build_tables/item.rs | 18 ++++++++++- src/build_tables/mod.rs | 3 +- src/generate.rs | 16 ++++++---- src/main.rs | 15 +++++++++- 5 files changed, 66 insertions(+), 29 deletions(-) diff --git a/src/build_tables/build_parse_table.rs b/src/build_tables/build_parse_table.rs index 7fb668dd..cda1d7ea 100644 --- a/src/build_tables/build_parse_table.rs +++ b/src/build_tables/build_parse_table.rs @@ -39,6 +39,7 @@ struct ParseTableBuilder<'a> { parse_state_queue: VecDeque, parse_table: ParseTable, following_tokens: Vec, + state_ids_to_log: Vec, } 
impl<'a> ParseTableBuilder<'a> { @@ -64,29 +65,26 @@ impl<'a> ParseTableBuilder<'a> { ); while let Some(entry) = self.parse_state_queue.pop_front() { - // info!( - // "state: {}, item set: {}", - // entry.state_id, - // super::item::ParseItemSetDisplay( - // &self.item_sets_by_state_id[entry.state_id], - // self.syntax_grammar, - // self.lexical_grammar, - // ) - // ); - let item_set = self .item_set_builder .transitive_closure(&self.item_sets_by_state_id[entry.state_id]); - // info!( - // "state: {}, closed item set: {}", - // entry.state_id, - // super::item::ParseItemSetDisplay( - // &item_set, - // self.syntax_grammar, - // self.lexical_grammar, - // ) - // ); + if self.state_ids_to_log.contains(&entry.state_id) { + eprintln!( + "state: {}\n\ninitial item set:\n\n{}closed item set:\n\n{}", + entry.state_id, + super::item::ParseItemSetDisplay( + &self.item_sets_by_state_id[entry.state_id], + self.syntax_grammar, + self.lexical_grammar, + ), + super::item::ParseItemSetDisplay( + &item_set, + self.syntax_grammar, + self.lexical_grammar, + ) + ); + } self.add_actions( entry.preceding_symbols, @@ -553,6 +551,7 @@ impl<'a> ParseTableBuilder<'a> { ) .unwrap(); } + write!(&mut msg, "\n").unwrap(); } for item in &conflicting_items { @@ -560,7 +559,7 @@ impl<'a> ParseTableBuilder<'a> { resolution_count += 1; write!( &mut msg, - " {}: Specify a higher precedence in `{}` than in the other rules.\n", + " {}: Specify a higher precedence in `{}` than in the other rules.\n", resolution_count, self.symbol_name(&Symbol::non_terminal(item.variable_index as usize)) ) @@ -571,7 +570,7 @@ impl<'a> ParseTableBuilder<'a> { resolution_count += 1; write!( &mut msg, - " {}: Add a conflict for these rules: ", + " {}: Add a conflict for these rules: ", resolution_count ) .unwrap(); @@ -714,10 +713,12 @@ pub(crate) fn build_parse_table( syntax_grammar: &SyntaxGrammar, lexical_grammar: &LexicalGrammar, inlines: &InlinedProductionMap, + state_ids_to_log: Vec, ) -> Result<(ParseTable, Vec)> { 
ParseTableBuilder { syntax_grammar, lexical_grammar, + state_ids_to_log, item_set_builder: ParseItemSetBuilder::new(syntax_grammar, lexical_grammar, inlines), state_ids_by_item_set: HashMap::new(), item_sets_by_state_id: Vec::new(), diff --git a/src/build_tables/item.rs b/src/build_tables/item.rs index d1d0cbbf..bbd5bbfa 100644 --- a/src/build_tables/item.rs +++ b/src/build_tables/item.rs @@ -45,7 +45,7 @@ pub(crate) struct ParseItemSet<'a> { pub(crate) struct ParseItemDisplay<'a>( pub &'a ParseItem<'a>, pub &'a SyntaxGrammar, - pub &'a LexicalGrammar + pub &'a LexicalGrammar, ); pub(crate) struct LookaheadSetDisplay<'a>(&'a LookaheadSet, &'a SyntaxGrammar, &'a LexicalGrammar); @@ -252,6 +252,13 @@ impl<'a> fmt::Display for ParseItemDisplay<'a> { for (i, step) in self.0.production.steps.iter().enumerate() { if i == self.0.step_index as usize { write!(f, " •")?; + if step.precedence != 0 || step.associativity.is_some() { + write!( + f, + " (prec {:?} assoc {:?})", + step.precedence, step.associativity + )?; + } } write!(f, " ")?; @@ -274,6 +281,15 @@ impl<'a> fmt::Display for ParseItemDisplay<'a> { if self.0.is_done() { write!(f, " •")?; + if let Some(step) = self.0.production.steps.last() { + if step.precedence != 0 || step.associativity.is_some() { + write!( + f, + " (prec {:?} assoc {:?})", + step.precedence, step.associativity + )?; + } + } } Ok(()) diff --git a/src/build_tables/mod.rs b/src/build_tables/mod.rs index ed47665e..04b750e3 100644 --- a/src/build_tables/mod.rs +++ b/src/build_tables/mod.rs @@ -24,9 +24,10 @@ pub(crate) fn build_tables( simple_aliases: &AliasMap, inlines: &InlinedProductionMap, minimize: bool, + state_ids_to_log: Vec, ) -> Result<(ParseTable, LexTable, LexTable, Option)> { let (mut parse_table, following_tokens) = - build_parse_table(syntax_grammar, lexical_grammar, inlines)?; + build_parse_table(syntax_grammar, lexical_grammar, inlines, state_ids_to_log)?; let token_conflict_map = TokenConflictMap::new(lexical_grammar, 
following_tokens); let coincident_token_index = CoincidentTokenIndex::new(&parse_table, lexical_grammar); let keywords = identify_keywords( diff --git a/src/generate.rs b/src/generate.rs index d574c165..aa8f3b5b 100644 --- a/src/generate.rs +++ b/src/generate.rs @@ -1,18 +1,24 @@ +use crate::build_tables::build_tables; use crate::error::Result; use crate::parse_grammar::parse_grammar; use crate::prepare_grammar::prepare_grammar; -use crate::build_tables::build_tables; use crate::render::render_c_code; -pub fn generate_parser_for_grammar(input: &str, minimize: bool) -> Result { +pub fn generate_parser_for_grammar( + input: &str, + minimize: bool, + state_ids_to_log: Vec, +) -> Result { let input_grammar = parse_grammar(input)?; - let (syntax_grammar, lexical_grammar, inlines, simple_aliases) = prepare_grammar(&input_grammar)?; + let (syntax_grammar, lexical_grammar, inlines, simple_aliases) = + prepare_grammar(&input_grammar)?; let (parse_table, main_lex_table, keyword_lex_table, keyword_capture_token) = build_tables( &syntax_grammar, &lexical_grammar, &simple_aliases, &inlines, - minimize + minimize, + state_ids_to_log, )?; let c_code = render_c_code( &input_grammar.name, @@ -22,7 +28,7 @@ pub fn generate_parser_for_grammar(input: &str, minimize: bool) -> Result error::Result<()> { SubCommand::with_name("generate") .about("Generate a parser") .arg(Arg::with_name("log").long("log")) + .arg( + Arg::with_name("state-ids-to-log") + .long("log-state") + .takes_value(true), + ) .arg(Arg::with_name("no-minimize").long("no-minimize")), ) .subcommand( @@ -63,10 +69,17 @@ fn run() -> error::Result<()> { } let minimize = !matches.is_present("no-minimize"); + let state_ids_to_log = matches + .values_of("state-ids-to-log") + .map_or(Vec::new(), |ids| { + ids.filter_map(|id| usize::from_str_radix(id, 10).ok()) + .collect() + }); let mut grammar_path = env::current_dir().expect("Failed to read CWD"); grammar_path.push("grammar.js"); let grammar_json = 
load_js_grammar_file(grammar_path); - let code = generate::generate_parser_for_grammar(&grammar_json, minimize)?; + let code = + generate::generate_parser_for_grammar(&grammar_json, minimize, state_ids_to_log)?; println!("{}", code); } From d8f8bd288eece27626c02407054b454b8102b7f8 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 4 Jan 2019 15:27:15 -0800 Subject: [PATCH 100/208] Fix error in code generation w/ tokens that are internal and external --- src/render/mod.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/render/mod.rs b/src/render/mod.rs index 8d3ee195..36429848 100644 --- a/src/render/mod.rs +++ b/src/render/mod.rs @@ -561,11 +561,13 @@ impl Generator { ); indent!(self); for i in 0..self.syntax_grammar.external_tokens.len() { + let token = &self.syntax_grammar.external_tokens[i]; + let id_token = token.corresponding_internal_token.unwrap_or(Symbol::external(i)); add_line!( self, "[{}] = {},", - self.external_token_id(&self.syntax_grammar.external_tokens[i]), - self.symbol_ids[&Symbol::external(i)], + self.external_token_id(&token), + self.symbol_ids[&id_token], ); } dedent!(self); From b8dd5d2640f2011d016d0dfd750e804824771c68 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 4 Jan 2019 15:27:35 -0800 Subject: [PATCH 101/208] Fix handling of precedence and associativity with inlining --- src/prepare_grammar/process_inlines.rs | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/prepare_grammar/process_inlines.rs b/src/prepare_grammar/process_inlines.rs index 9fd2f2c6..557b0fa4 100644 --- a/src/prepare_grammar/process_inlines.rs +++ b/src/prepare_grammar/process_inlines.rs @@ -90,7 +90,6 @@ impl InlinedProductionMapBuilder { while i < productions_to_add.len() { if let Some(step) = productions_to_add[i].steps.get(step_index) { let symbol = step.symbol.clone(); - if grammar.variables_to_inline.contains(&symbol) { // Remove the production from the vector, replacing it with a placeholder. 
let production = productions_to_add @@ -116,8 +115,12 @@ impl InlinedProductionMapBuilder { } } if let Some(last_inserted_step) = inserted_steps.last_mut() { - last_inserted_step.precedence = removed_step.precedence; - last_inserted_step.associativity = removed_step.associativity; + if last_inserted_step.precedence == 0 { + last_inserted_step.precedence = removed_step.precedence; + } + if last_inserted_step.associativity == None { + last_inserted_step.associativity = removed_step.associativity; + } } production }), From 5b0e12ea332ebe231ba103b078f832f2ee2148c5 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 4 Jan 2019 16:50:52 -0800 Subject: [PATCH 102/208] Move code into cli directory --- Cargo.toml | 31 +++---------------- cli/Cargo.toml | 27 ++++++++++++++++ .../src}/build_tables/build_lex_table.rs | 0 .../src}/build_tables/build_parse_table.rs | 0 .../src}/build_tables/coincident_tokens.rs | 0 {src => cli/src}/build_tables/item.rs | 0 .../src}/build_tables/item_set_builder.rs | 0 .../src}/build_tables/minimize_parse_table.rs | 0 {src => cli/src}/build_tables/mod.rs | 0 .../src}/build_tables/token_conflicts.rs | 0 {src => cli/src}/error.rs | 0 {src => cli/src}/generate.rs | 0 {src => cli/src}/grammars.rs | 0 {src => cli/src}/js/dsl.js | 0 {src => cli/src}/logger.rs | 0 {src => cli/src}/main.rs | 0 {src => cli/src}/nfa.rs | 0 {src => cli/src}/parse_grammar.rs | 0 .../src}/prepare_grammar/expand_repeats.rs | 0 .../src}/prepare_grammar/expand_tokens.rs | 0 .../prepare_grammar/extract_simple_aliases.rs | 0 .../src}/prepare_grammar/extract_tokens.rs | 0 .../src}/prepare_grammar/flatten_grammar.rs | 0 .../src}/prepare_grammar/intern_symbols.rs | 0 {src => cli/src}/prepare_grammar/mod.rs | 0 .../src}/prepare_grammar/process_inlines.rs | 0 {src => cli/src}/render/mod.rs | 0 {src => cli/src}/rules.rs | 0 {src => cli/src}/tables.rs | 0 29 files changed, 32 insertions(+), 26 deletions(-) create mode 100644 cli/Cargo.toml rename {src => 
cli/src}/build_tables/build_lex_table.rs (100%) rename {src => cli/src}/build_tables/build_parse_table.rs (100%) rename {src => cli/src}/build_tables/coincident_tokens.rs (100%) rename {src => cli/src}/build_tables/item.rs (100%) rename {src => cli/src}/build_tables/item_set_builder.rs (100%) rename {src => cli/src}/build_tables/minimize_parse_table.rs (100%) rename {src => cli/src}/build_tables/mod.rs (100%) rename {src => cli/src}/build_tables/token_conflicts.rs (100%) rename {src => cli/src}/error.rs (100%) rename {src => cli/src}/generate.rs (100%) rename {src => cli/src}/grammars.rs (100%) rename {src => cli/src}/js/dsl.js (100%) rename {src => cli/src}/logger.rs (100%) rename {src => cli/src}/main.rs (100%) rename {src => cli/src}/nfa.rs (100%) rename {src => cli/src}/parse_grammar.rs (100%) rename {src => cli/src}/prepare_grammar/expand_repeats.rs (100%) rename {src => cli/src}/prepare_grammar/expand_tokens.rs (100%) rename {src => cli/src}/prepare_grammar/extract_simple_aliases.rs (100%) rename {src => cli/src}/prepare_grammar/extract_tokens.rs (100%) rename {src => cli/src}/prepare_grammar/flatten_grammar.rs (100%) rename {src => cli/src}/prepare_grammar/intern_symbols.rs (100%) rename {src => cli/src}/prepare_grammar/mod.rs (100%) rename {src => cli/src}/prepare_grammar/process_inlines.rs (100%) rename {src => cli/src}/render/mod.rs (100%) rename {src => cli/src}/rules.rs (100%) rename {src => cli/src}/tables.rs (100%) diff --git a/Cargo.toml b/Cargo.toml index 29b10e17..75d3b403 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,27 +1,6 @@ -[package] -name = "rust-tree-sitter-cli" -version = "0.1.0" -authors = ["Max Brunsfeld "] -edition = "2018" +[workspace] -[dependencies] -lazy_static = "1.2.0" -smallbitvec = "2.3.0" -clap = "2.32" -dirs = "1.0.2" -hashbrown = "0.1" -ignore = "0.4.4" -libloading = "0.5" -rusqlite = "0.14.0" -serde = "1.0" -serde_derive = "1.0" -tree-sitter = "0.3.1" -regex-syntax = "0.6.4" - -[dependencies.serde_json] -version = "1.0" 
-features = ["preserve_order"] - -[dependencies.log] -version = "0.4.6" -features = ["std"] +members = [ + "cli", + "lib", +] diff --git a/cli/Cargo.toml b/cli/Cargo.toml new file mode 100644 index 00000000..29b10e17 --- /dev/null +++ b/cli/Cargo.toml @@ -0,0 +1,27 @@ +[package] +name = "rust-tree-sitter-cli" +version = "0.1.0" +authors = ["Max Brunsfeld "] +edition = "2018" + +[dependencies] +lazy_static = "1.2.0" +smallbitvec = "2.3.0" +clap = "2.32" +dirs = "1.0.2" +hashbrown = "0.1" +ignore = "0.4.4" +libloading = "0.5" +rusqlite = "0.14.0" +serde = "1.0" +serde_derive = "1.0" +tree-sitter = "0.3.1" +regex-syntax = "0.6.4" + +[dependencies.serde_json] +version = "1.0" +features = ["preserve_order"] + +[dependencies.log] +version = "0.4.6" +features = ["std"] diff --git a/src/build_tables/build_lex_table.rs b/cli/src/build_tables/build_lex_table.rs similarity index 100% rename from src/build_tables/build_lex_table.rs rename to cli/src/build_tables/build_lex_table.rs diff --git a/src/build_tables/build_parse_table.rs b/cli/src/build_tables/build_parse_table.rs similarity index 100% rename from src/build_tables/build_parse_table.rs rename to cli/src/build_tables/build_parse_table.rs diff --git a/src/build_tables/coincident_tokens.rs b/cli/src/build_tables/coincident_tokens.rs similarity index 100% rename from src/build_tables/coincident_tokens.rs rename to cli/src/build_tables/coincident_tokens.rs diff --git a/src/build_tables/item.rs b/cli/src/build_tables/item.rs similarity index 100% rename from src/build_tables/item.rs rename to cli/src/build_tables/item.rs diff --git a/src/build_tables/item_set_builder.rs b/cli/src/build_tables/item_set_builder.rs similarity index 100% rename from src/build_tables/item_set_builder.rs rename to cli/src/build_tables/item_set_builder.rs diff --git a/src/build_tables/minimize_parse_table.rs b/cli/src/build_tables/minimize_parse_table.rs similarity index 100% rename from src/build_tables/minimize_parse_table.rs rename to 
cli/src/build_tables/minimize_parse_table.rs diff --git a/src/build_tables/mod.rs b/cli/src/build_tables/mod.rs similarity index 100% rename from src/build_tables/mod.rs rename to cli/src/build_tables/mod.rs diff --git a/src/build_tables/token_conflicts.rs b/cli/src/build_tables/token_conflicts.rs similarity index 100% rename from src/build_tables/token_conflicts.rs rename to cli/src/build_tables/token_conflicts.rs diff --git a/src/error.rs b/cli/src/error.rs similarity index 100% rename from src/error.rs rename to cli/src/error.rs diff --git a/src/generate.rs b/cli/src/generate.rs similarity index 100% rename from src/generate.rs rename to cli/src/generate.rs diff --git a/src/grammars.rs b/cli/src/grammars.rs similarity index 100% rename from src/grammars.rs rename to cli/src/grammars.rs diff --git a/src/js/dsl.js b/cli/src/js/dsl.js similarity index 100% rename from src/js/dsl.js rename to cli/src/js/dsl.js diff --git a/src/logger.rs b/cli/src/logger.rs similarity index 100% rename from src/logger.rs rename to cli/src/logger.rs diff --git a/src/main.rs b/cli/src/main.rs similarity index 100% rename from src/main.rs rename to cli/src/main.rs diff --git a/src/nfa.rs b/cli/src/nfa.rs similarity index 100% rename from src/nfa.rs rename to cli/src/nfa.rs diff --git a/src/parse_grammar.rs b/cli/src/parse_grammar.rs similarity index 100% rename from src/parse_grammar.rs rename to cli/src/parse_grammar.rs diff --git a/src/prepare_grammar/expand_repeats.rs b/cli/src/prepare_grammar/expand_repeats.rs similarity index 100% rename from src/prepare_grammar/expand_repeats.rs rename to cli/src/prepare_grammar/expand_repeats.rs diff --git a/src/prepare_grammar/expand_tokens.rs b/cli/src/prepare_grammar/expand_tokens.rs similarity index 100% rename from src/prepare_grammar/expand_tokens.rs rename to cli/src/prepare_grammar/expand_tokens.rs diff --git a/src/prepare_grammar/extract_simple_aliases.rs b/cli/src/prepare_grammar/extract_simple_aliases.rs similarity index 100% rename 
from src/prepare_grammar/extract_simple_aliases.rs rename to cli/src/prepare_grammar/extract_simple_aliases.rs diff --git a/src/prepare_grammar/extract_tokens.rs b/cli/src/prepare_grammar/extract_tokens.rs similarity index 100% rename from src/prepare_grammar/extract_tokens.rs rename to cli/src/prepare_grammar/extract_tokens.rs diff --git a/src/prepare_grammar/flatten_grammar.rs b/cli/src/prepare_grammar/flatten_grammar.rs similarity index 100% rename from src/prepare_grammar/flatten_grammar.rs rename to cli/src/prepare_grammar/flatten_grammar.rs diff --git a/src/prepare_grammar/intern_symbols.rs b/cli/src/prepare_grammar/intern_symbols.rs similarity index 100% rename from src/prepare_grammar/intern_symbols.rs rename to cli/src/prepare_grammar/intern_symbols.rs diff --git a/src/prepare_grammar/mod.rs b/cli/src/prepare_grammar/mod.rs similarity index 100% rename from src/prepare_grammar/mod.rs rename to cli/src/prepare_grammar/mod.rs diff --git a/src/prepare_grammar/process_inlines.rs b/cli/src/prepare_grammar/process_inlines.rs similarity index 100% rename from src/prepare_grammar/process_inlines.rs rename to cli/src/prepare_grammar/process_inlines.rs diff --git a/src/render/mod.rs b/cli/src/render/mod.rs similarity index 100% rename from src/render/mod.rs rename to cli/src/render/mod.rs diff --git a/src/rules.rs b/cli/src/rules.rs similarity index 100% rename from src/rules.rs rename to cli/src/rules.rs diff --git a/src/tables.rs b/cli/src/tables.rs similarity index 100% rename from src/tables.rs rename to cli/src/tables.rs From dd416b09552fd5b09313072fb13452dd2a8d8fc0 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 4 Jan 2019 17:33:34 -0800 Subject: [PATCH 103/208] Update include paths to not reference 'runtime' directory --- lib/src/array.h | 2 +- lib/src/get_changed_ranges.c | 10 +++++----- lib/src/get_changed_ranges.h | 4 ++-- lib/src/language.c | 6 +++--- lib/src/language.h | 2 +- lib/src/length.h | 2 +- lib/src/lexer.c | 8 ++++---- lib/src/lexer.h | 
4 ++-- lib/src/node.c | 6 +++--- lib/src/parser.c | 24 ++++++++++++------------ lib/src/reduce_action.h | 2 +- lib/src/reusable_node.h | 2 +- lib/src/runtime.c | 20 ++++++++++---------- lib/src/stack.c | 12 ++++++------ lib/src/stack.h | 6 +++--- lib/src/subtree.c | 12 ++++++------ lib/src/subtree.h | 6 +++--- lib/src/tree.c | 10 +++++----- lib/src/tree_cursor.c | 8 ++++---- lib/src/tree_cursor.h | 2 +- lib/src/utf16.c | 2 +- script/build-runtime | 14 +++++++------- 22 files changed, 82 insertions(+), 82 deletions(-) diff --git a/lib/src/array.h b/lib/src/array.h index 60cfc800..3f5b6b29 100644 --- a/lib/src/array.h +++ b/lib/src/array.h @@ -10,7 +10,7 @@ extern "C" { #include #include #include -#include "runtime/alloc.h" +#include "./alloc.h" #define Array(T) \ struct { \ diff --git a/lib/src/get_changed_ranges.c b/lib/src/get_changed_ranges.c index 900d36ed..da39dd13 100644 --- a/lib/src/get_changed_ranges.c +++ b/lib/src/get_changed_ranges.c @@ -1,8 +1,8 @@ -#include "runtime/get_changed_ranges.h" -#include "runtime/subtree.h" -#include "runtime/language.h" -#include "runtime/error_costs.h" -#include "runtime/tree_cursor.h" +#include "./get_changed_ranges.h" +#include "./subtree.h" +#include "./language.h" +#include "./error_costs.h" +#include "./tree_cursor.h" #include // #define DEBUG_GET_CHANGED_RANGES diff --git a/lib/src/get_changed_ranges.h b/lib/src/get_changed_ranges.h index e7fcead1..2764b55f 100644 --- a/lib/src/get_changed_ranges.h +++ b/lib/src/get_changed_ranges.h @@ -5,8 +5,8 @@ extern "C" { #endif -#include "runtime/tree_cursor.h" -#include "runtime/subtree.h" +#include "./tree_cursor.h" +#include "./subtree.h" typedef Array(TSRange) TSRangeArray; diff --git a/lib/src/language.c b/lib/src/language.c index 0fb03b6c..9541bba2 100644 --- a/lib/src/language.c +++ b/lib/src/language.c @@ -1,6 +1,6 @@ -#include "runtime/language.h" -#include "runtime/subtree.h" -#include "runtime/error_costs.h" +#include "./language.h" +#include "./subtree.h" +#include 
"./error_costs.h" #include void ts_language_table_entry(const TSLanguage *self, TSStateId state, diff --git a/lib/src/language.h b/lib/src/language.h index 8386a054..c8e5e8a1 100644 --- a/lib/src/language.h +++ b/lib/src/language.h @@ -5,7 +5,7 @@ extern "C" { #endif -#include "runtime/subtree.h" +#include "./subtree.h" #include "tree_sitter/parser.h" #define ts_builtin_sym_error_repeat (ts_builtin_sym_error - 1) diff --git a/lib/src/length.h b/lib/src/length.h index 8dd1715e..db325f7a 100644 --- a/lib/src/length.h +++ b/lib/src/length.h @@ -3,7 +3,7 @@ #include #include -#include "runtime/point.h" +#include "./point.h" #include "tree_sitter/runtime.h" typedef struct { diff --git a/lib/src/lexer.c b/lib/src/lexer.c index d2b9ad70..b33da344 100644 --- a/lib/src/lexer.c +++ b/lib/src/lexer.c @@ -1,8 +1,8 @@ #include -#include "runtime/lexer.h" -#include "runtime/subtree.h" -#include "runtime/length.h" -#include "runtime/utf16.h" +#include "./lexer.h" +#include "./subtree.h" +#include "./length.h" +#include "./utf16.h" #include "utf8proc.h" #define LOG(...) 
\ diff --git a/lib/src/lexer.h b/lib/src/lexer.h index 491c2da1..327350f6 100644 --- a/lib/src/lexer.h +++ b/lib/src/lexer.h @@ -5,8 +5,8 @@ extern "C" { #endif -#include "runtime/length.h" -#include "runtime/subtree.h" +#include "./length.h" +#include "./subtree.h" #include "tree_sitter/runtime.h" #include "tree_sitter/parser.h" diff --git a/lib/src/node.c b/lib/src/node.c index c1763261..eb4a3121 100644 --- a/lib/src/node.c +++ b/lib/src/node.c @@ -1,7 +1,7 @@ #include -#include "runtime/subtree.h" -#include "runtime/tree.h" -#include "runtime/language.h" +#include "./subtree.h" +#include "./tree.h" +#include "./language.h" typedef struct { Subtree parent; diff --git a/lib/src/parser.c b/lib/src/parser.c index c7050ce5..ef7f612d 100644 --- a/lib/src/parser.c +++ b/lib/src/parser.c @@ -3,18 +3,18 @@ #include #include #include "tree_sitter/runtime.h" -#include "runtime/subtree.h" -#include "runtime/lexer.h" -#include "runtime/length.h" -#include "runtime/array.h" -#include "runtime/language.h" -#include "runtime/alloc.h" -#include "runtime/stack.h" -#include "runtime/reusable_node.h" -#include "runtime/reduce_action.h" -#include "runtime/error_costs.h" -#include "runtime/get_changed_ranges.h" -#include "runtime/tree.h" +#include "./subtree.h" +#include "./lexer.h" +#include "./length.h" +#include "./array.h" +#include "./language.h" +#include "./alloc.h" +#include "./stack.h" +#include "./reusable_node.h" +#include "./reduce_action.h" +#include "./error_costs.h" +#include "./get_changed_ranges.h" +#include "./tree.h" #define LOG(...) 
\ if (self->lexer.logger.log || self->dot_graph_file) { \ diff --git a/lib/src/reduce_action.h b/lib/src/reduce_action.h index 75267c3f..91835c39 100644 --- a/lib/src/reduce_action.h +++ b/lib/src/reduce_action.h @@ -5,7 +5,7 @@ extern "C" { #endif -#include "runtime/array.h" +#include "./array.h" #include "tree_sitter/runtime.h" typedef struct { diff --git a/lib/src/reusable_node.h b/lib/src/reusable_node.h index cb9cea58..ab91cb36 100644 --- a/lib/src/reusable_node.h +++ b/lib/src/reusable_node.h @@ -1,4 +1,4 @@ -#include "runtime/subtree.h" +#include "./subtree.h" typedef struct { Subtree tree; diff --git a/lib/src/runtime.c b/lib/src/runtime.c index 51455a8b..b29f5214 100644 --- a/lib/src/runtime.c +++ b/lib/src/runtime.c @@ -6,14 +6,14 @@ // - include // - externals/utf8proc -#include "runtime/get_changed_ranges.c" -#include "runtime/language.c" -#include "runtime/lexer.c" -#include "runtime/node.c" -#include "runtime/parser.c" -#include "runtime/stack.c" -#include "runtime/subtree.c" -#include "runtime/tree_cursor.c" -#include "runtime/tree.c" -#include "runtime/utf16.c" +#include "./get_changed_ranges.c" +#include "./language.c" +#include "./lexer.c" +#include "./node.c" +#include "./parser.c" +#include "./stack.c" +#include "./subtree.c" +#include "./tree_cursor.c" +#include "./tree.c" +#include "./utf16.c" #include "utf8proc.c" diff --git a/lib/src/stack.c b/lib/src/stack.c index cc434e38..e3a1f22d 100644 --- a/lib/src/stack.c +++ b/lib/src/stack.c @@ -1,9 +1,9 @@ -#include "runtime/alloc.h" -#include "runtime/language.h" -#include "runtime/subtree.h" -#include "runtime/array.h" -#include "runtime/stack.h" -#include "runtime/length.h" +#include "./alloc.h" +#include "./language.h" +#include "./subtree.h" +#include "./array.h" +#include "./stack.h" +#include "./length.h" #include #include diff --git a/lib/src/stack.h b/lib/src/stack.h index 272bb4ee..d476d763 100644 --- a/lib/src/stack.h +++ b/lib/src/stack.h @@ -5,9 +5,9 @@ extern "C" { #endif -#include 
"runtime/array.h" -#include "runtime/subtree.h" -#include "runtime/error_costs.h" +#include "./array.h" +#include "./subtree.h" +#include "./error_costs.h" #include typedef struct Stack Stack; diff --git a/lib/src/subtree.c b/lib/src/subtree.c index eb7e0530..48c8cff3 100644 --- a/lib/src/subtree.c +++ b/lib/src/subtree.c @@ -4,12 +4,12 @@ #include #include #include -#include "runtime/alloc.h" -#include "runtime/atomic.h" -#include "runtime/subtree.h" -#include "runtime/length.h" -#include "runtime/language.h" -#include "runtime/error_costs.h" +#include "./alloc.h" +#include "./atomic.h" +#include "./subtree.h" +#include "./length.h" +#include "./language.h" +#include "./error_costs.h" #include typedef struct { diff --git a/lib/src/subtree.h b/lib/src/subtree.h index de3ddc16..cc5c79aa 100644 --- a/lib/src/subtree.h +++ b/lib/src/subtree.h @@ -7,9 +7,9 @@ extern "C" { #include #include -#include "runtime/length.h" -#include "runtime/array.h" -#include "runtime/error_costs.h" +#include "./length.h" +#include "./array.h" +#include "./error_costs.h" #include "tree_sitter/runtime.h" #include "tree_sitter/parser.h" diff --git a/lib/src/tree.c b/lib/src/tree.c index e5122cc1..b729c8c7 100644 --- a/lib/src/tree.c +++ b/lib/src/tree.c @@ -1,9 +1,9 @@ #include "tree_sitter/runtime.h" -#include "runtime/array.h" -#include "runtime/get_changed_ranges.h" -#include "runtime/subtree.h" -#include "runtime/tree_cursor.h" -#include "runtime/tree.h" +#include "./array.h" +#include "./get_changed_ranges.h" +#include "./subtree.h" +#include "./tree_cursor.h" +#include "./tree.h" static const unsigned PARENT_CACHE_CAPACITY = 32; diff --git a/lib/src/tree_cursor.c b/lib/src/tree_cursor.c index 9fce48be..d352c32b 100644 --- a/lib/src/tree_cursor.c +++ b/lib/src/tree_cursor.c @@ -1,8 +1,8 @@ #include "tree_sitter/runtime.h" -#include "runtime/alloc.h" -#include "runtime/tree_cursor.h" -#include "runtime/language.h" -#include "runtime/tree.h" +#include "./alloc.h" +#include 
"./tree_cursor.h" +#include "./language.h" +#include "./tree.h" typedef struct { Subtree parent; diff --git a/lib/src/tree_cursor.h b/lib/src/tree_cursor.h index 84300b21..6e46b7dd 100644 --- a/lib/src/tree_cursor.h +++ b/lib/src/tree_cursor.h @@ -1,7 +1,7 @@ #ifndef RUNTIME_TREE_CURSOR_H_ #define RUNTIME_TREE_CURSOR_H_ -#include "runtime/subtree.h" +#include "./subtree.h" typedef struct { const Subtree *subtree; diff --git a/lib/src/utf16.c b/lib/src/utf16.c index adb82edf..3956c01c 100644 --- a/lib/src/utf16.c +++ b/lib/src/utf16.c @@ -1,4 +1,4 @@ -#include "runtime/utf16.h" +#include "./utf16.h" utf8proc_ssize_t utf16_iterate( const utf8proc_uint8_t *string, diff --git a/script/build-runtime b/script/build-runtime index 9e09b836..7b2e99f2 100755 --- a/script/build-runtime +++ b/script/build-runtime @@ -11,11 +11,11 @@ ${CC} \ -c \ -O3 \ -std=c99 \ - -I src \ - -I include \ - -I externals/utf8proc \ - src/runtime/runtime.c \ - -o runtime.o + -I lib/src \ + -I lib/include \ + -I lib/utf8proc \ + lib/src/runtime.c \ + -o tree-sitter.o -ar rcs libruntime.a runtime.o -rm runtime.o +ar rcs libtree-sitter.a tree-sitter.o +rm tree-sitter.o From 001f8c8f55a2a9a4c14c522ff12fcf27ae04c1e1 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Mon, 7 Jan 2019 08:39:47 -0800 Subject: [PATCH 104/208] Rename LookaheadSet -> TokenSet Also, replace non-standard `with` method with a `FromIterator` implementation. 
--- cli/src/build_tables/build_lex_table.rs | 32 ++-- cli/src/build_tables/build_parse_table.rs | 16 +- cli/src/build_tables/item.rs | 39 +++-- cli/src/build_tables/item_set_builder.rs | 34 ++-- cli/src/build_tables/minimize_parse_table.rs | 6 +- cli/src/build_tables/mod.rs | 170 ++++++++++--------- cli/src/build_tables/token_conflicts.rs | 18 +- 7 files changed, 165 insertions(+), 150 deletions(-) diff --git a/cli/src/build_tables/build_lex_table.rs b/cli/src/build_tables/build_lex_table.rs index bcc1bf3d..9fc8edc6 100644 --- a/cli/src/build_tables/build_lex_table.rs +++ b/cli/src/build_tables/build_lex_table.rs @@ -1,4 +1,4 @@ -use super::item::LookaheadSet; +use super::item::TokenSet; use super::token_conflicts::TokenConflictMap; use crate::grammars::{LexicalGrammar, SyntaxGrammar}; use crate::nfa::{CharacterSet, NfaCursor, NfaTransition}; @@ -11,7 +11,7 @@ pub(crate) fn build_lex_table( parse_table: &mut ParseTable, syntax_grammar: &SyntaxGrammar, lexical_grammar: &LexicalGrammar, - keywords: &LookaheadSet, + keywords: &TokenSet, minimize: bool, ) -> (LexTable, LexTable) { let keyword_lex_table; @@ -25,19 +25,23 @@ pub(crate) fn build_lex_table( let mut builder = LexTableBuilder::new(lexical_grammar); for state in parse_table.states.iter_mut() { - let tokens = LookaheadSet::with(state.terminal_entries.keys().filter_map(|token| { - if token.is_terminal() { - if keywords.contains(&token) { - syntax_grammar.word_token - } else { + let tokens = state + .terminal_entries + .keys() + .filter_map(|token| { + if token.is_terminal() { + if keywords.contains(&token) { + syntax_grammar.word_token + } else { + Some(*token) + } + } else if token.is_eof() { Some(*token) + } else { + None } - } else if token.is_eof() { - Some(*token) - } else { - None - } - })); + }) + .collect(); state.lex_state_id = builder.add_state_for_tokens(&tokens); } @@ -75,7 +79,7 @@ impl<'a> LexTableBuilder<'a> { } } - fn add_state_for_tokens(&mut self, tokens: &LookaheadSet) -> usize { + fn 
add_state_for_tokens(&mut self, tokens: &TokenSet) -> usize { let mut eof_valid = false; let nfa_states = tokens .iter() diff --git a/cli/src/build_tables/build_parse_table.rs b/cli/src/build_tables/build_parse_table.rs index cda1d7ea..27baf146 100644 --- a/cli/src/build_tables/build_parse_table.rs +++ b/cli/src/build_tables/build_parse_table.rs @@ -1,4 +1,4 @@ -use super::item::{LookaheadSet, ParseItem, ParseItemSet}; +use super::item::{ParseItem, ParseItemSet, TokenSet}; use super::item_set_builder::ParseItemSetBuilder; use crate::error::{Error, Result}; use crate::grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar, VariableType}; @@ -38,12 +38,12 @@ struct ParseTableBuilder<'a> { item_sets_by_state_id: Vec>, parse_state_queue: VecDeque, parse_table: ParseTable, - following_tokens: Vec, + following_tokens: Vec, state_ids_to_log: Vec, } impl<'a> ParseTableBuilder<'a> { - fn build(mut self) -> Result<(ParseTable, Vec)> { + fn build(mut self) -> Result<(ParseTable, Vec)> { // Ensure that the empty alias sequence has index 0. 
self.parse_table.alias_sequences.push(Vec::new()); @@ -57,7 +57,7 @@ impl<'a> ParseTableBuilder<'a> { ParseItemSet::with( [( ParseItem::start(), - LookaheadSet::with([Symbol::end()].iter().cloned()), + [Symbol::end()].iter().cloned().collect(), )] .iter() .cloned(), @@ -174,7 +174,7 @@ impl<'a> ParseTableBuilder<'a> { .or_insert_with(|| ParseItemSet::default()) .entries .entry(successor) - .or_insert_with(|| LookaheadSet::new()) + .or_insert_with(|| TokenSet::new()) .insert_all(lookaheads); } else { terminal_successors @@ -182,7 +182,7 @@ impl<'a> ParseTableBuilder<'a> { .or_insert_with(|| ParseItemSet::default()) .entries .entry(successor) - .or_insert_with(|| LookaheadSet::new()) + .or_insert_with(|| TokenSet::new()) .insert_all(lookaheads); } } else { @@ -714,7 +714,7 @@ pub(crate) fn build_parse_table( lexical_grammar: &LexicalGrammar, inlines: &InlinedProductionMap, state_ids_to_log: Vec, -) -> Result<(ParseTable, Vec)> { +) -> Result<(ParseTable, Vec)> { ParseTableBuilder { syntax_grammar, lexical_grammar, @@ -729,7 +729,7 @@ pub(crate) fn build_parse_table( alias_sequences: Vec::new(), max_aliased_production_length: 0, }, - following_tokens: vec![LookaheadSet::new(); lexical_grammar.variables.len()], + following_tokens: vec![TokenSet::new(); lexical_grammar.variables.len()], } .build() } diff --git a/cli/src/build_tables/item.rs b/cli/src/build_tables/item.rs index bbd5bbfa..5d6edc2f 100644 --- a/cli/src/build_tables/item.rs +++ b/cli/src/build_tables/item.rs @@ -6,6 +6,7 @@ use std::cmp::Ordering; use std::collections::BTreeMap; use std::fmt; use std::hash::{Hash, Hasher}; +use std::iter::FromIterator; use std::u32; lazy_static! { @@ -24,7 +25,7 @@ lazy_static! 
{ } #[derive(Debug, Clone, PartialEq, Eq, Hash)] -pub(crate) struct LookaheadSet { +pub(crate) struct TokenSet { terminal_bits: SmallBitVec, external_bits: SmallBitVec, eof: bool, @@ -39,7 +40,7 @@ pub(crate) struct ParseItem<'a> { #[derive(Clone, Debug, PartialEq, Eq)] pub(crate) struct ParseItemSet<'a> { - pub entries: BTreeMap, LookaheadSet>, + pub entries: BTreeMap, TokenSet>, } pub(crate) struct ParseItemDisplay<'a>( @@ -48,7 +49,7 @@ pub(crate) struct ParseItemDisplay<'a>( pub &'a LexicalGrammar, ); -pub(crate) struct LookaheadSetDisplay<'a>(&'a LookaheadSet, &'a SyntaxGrammar, &'a LexicalGrammar); +pub(crate) struct TokenSetDisplay<'a>(&'a TokenSet, &'a SyntaxGrammar, &'a LexicalGrammar); #[allow(dead_code)] pub(crate) struct ParseItemSetDisplay<'a>( @@ -57,7 +58,7 @@ pub(crate) struct ParseItemSetDisplay<'a>( pub &'a LexicalGrammar, ); -impl LookaheadSet { +impl TokenSet { pub fn new() -> Self { Self { terminal_bits: SmallBitVec::new(), @@ -92,17 +93,9 @@ impl LookaheadSet { .chain(if self.eof { Some(Symbol::end()) } else { None }) } - pub fn with(symbols: impl IntoIterator) -> Self { - let mut result = Self::new(); - for symbol in symbols { - result.insert(symbol); - } - result - } - pub fn contains(&self, symbol: &Symbol) -> bool { match symbol.kind { - SymbolType::NonTerminal => panic!("Cannot store non-terminals in a LookaheadSet"), + SymbolType::NonTerminal => panic!("Cannot store non-terminals in a TokenSet"), SymbolType::Terminal => self.terminal_bits.get(symbol.index).unwrap_or(false), SymbolType::External => self.external_bits.get(symbol.index).unwrap_or(false), SymbolType::End => self.eof, @@ -111,7 +104,7 @@ impl LookaheadSet { pub fn insert(&mut self, other: Symbol) { let vec = match other.kind { - SymbolType::NonTerminal => panic!("Cannot store non-terminals in a LookaheadSet"), + SymbolType::NonTerminal => panic!("Cannot store non-terminals in a TokenSet"), SymbolType::Terminal => &mut self.terminal_bits, SymbolType::External => &mut 
self.external_bits, SymbolType::End => { @@ -125,7 +118,7 @@ impl LookaheadSet { vec.set(other.index, true); } - pub fn insert_all(&mut self, other: &LookaheadSet) -> bool { + pub fn insert_all(&mut self, other: &TokenSet) -> bool { let mut result = false; if other.terminal_bits.len() > self.terminal_bits.len() { self.terminal_bits.resize(other.terminal_bits.len(), false); @@ -153,6 +146,16 @@ impl LookaheadSet { } } +impl FromIterator for TokenSet { + fn from_iter>(iter: T) -> Self { + let mut result = Self::new(); + for symbol in iter { + result.insert(symbol); + } + result + } +} + impl<'a> ParseItem<'a> { pub fn start() -> Self { ParseItem { @@ -204,7 +207,7 @@ impl<'a> ParseItem<'a> { } impl<'a> ParseItemSet<'a> { - pub fn with(elements: impl IntoIterator, LookaheadSet)>) -> Self { + pub fn with(elements: impl IntoIterator, TokenSet)>) -> Self { let mut result = Self::default(); for (item, lookaheads) in elements { result.entries.insert(item, lookaheads); @@ -296,7 +299,7 @@ impl<'a> fmt::Display for ParseItemDisplay<'a> { } } -impl<'a> fmt::Display for LookaheadSetDisplay<'a> { +impl<'a> fmt::Display for TokenSetDisplay<'a> { fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> { write!(f, "[")?; for (i, symbol) in self.0.iter().enumerate() { @@ -328,7 +331,7 @@ impl<'a> fmt::Display for ParseItemSetDisplay<'a> { f, "{}\t{}", ParseItemDisplay(item, self.1, self.2), - LookaheadSetDisplay(lookaheads, self.1, self.2) + TokenSetDisplay(lookaheads, self.1, self.2) )?; } Ok(()) diff --git a/cli/src/build_tables/item_set_builder.rs b/cli/src/build_tables/item_set_builder.rs index 939d700c..fea3b4d1 100644 --- a/cli/src/build_tables/item_set_builder.rs +++ b/cli/src/build_tables/item_set_builder.rs @@ -1,4 +1,4 @@ -use super::item::{LookaheadSet, ParseItem, ParseItemDisplay, ParseItemSet}; +use super::item::{ParseItem, ParseItemDisplay, ParseItemSet, TokenSet}; use crate::grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar}; use 
crate::rules::Symbol; use hashbrown::{HashMap, HashSet}; @@ -12,15 +12,15 @@ struct TransitiveClosureAddition<'a> { #[derive(Clone, Debug, PartialEq, Eq)] struct FollowSetInfo { - lookaheads: LookaheadSet, + lookaheads: TokenSet, propagates_lookaheads: bool, } pub(crate) struct ParseItemSetBuilder<'a> { syntax_grammar: &'a SyntaxGrammar, lexical_grammar: &'a LexicalGrammar, - first_sets: HashMap, - last_sets: HashMap, + first_sets: HashMap, + last_sets: HashMap, inlines: &'a InlinedProductionMap, transitive_closure_additions: Vec>>, } @@ -54,7 +54,7 @@ impl<'a> ParseItemSetBuilder<'a> { // terminal itself. for i in 0..lexical_grammar.variables.len() { let symbol = Symbol::terminal(i); - let mut set = LookaheadSet::new(); + let mut set = TokenSet::new(); set.insert(symbol); result.first_sets.insert(symbol, set.clone()); result.last_sets.insert(symbol, set); @@ -62,7 +62,7 @@ impl<'a> ParseItemSetBuilder<'a> { for i in 0..syntax_grammar.external_tokens.len() { let symbol = Symbol::external(i); - let mut set = LookaheadSet::new(); + let mut set = TokenSet::new(); set.insert(symbol); result.first_sets.insert(symbol, set.clone()); result.last_sets.insert(symbol, set); @@ -80,10 +80,7 @@ impl<'a> ParseItemSetBuilder<'a> { for i in 0..syntax_grammar.variables.len() { let symbol = Symbol::non_terminal(i); - let first_set = &mut result - .first_sets - .entry(symbol) - .or_insert(LookaheadSet::new()); + let first_set = &mut result.first_sets.entry(symbol).or_insert(TokenSet::new()); processed_non_terminals.clear(); symbols_to_process.clear(); symbols_to_process.push(symbol); @@ -103,10 +100,7 @@ impl<'a> ParseItemSetBuilder<'a> { } // The LAST set is defined in a similar way to the FIRST set. 
- let last_set = &mut result - .last_sets - .entry(symbol) - .or_insert(LookaheadSet::new()); + let last_set = &mut result.last_sets.entry(symbol).or_insert(TokenSet::new()); processed_non_terminals.clear(); symbols_to_process.clear(); symbols_to_process.push(symbol); @@ -148,7 +142,7 @@ impl<'a> ParseItemSetBuilder<'a> { // Again, rather than computing these additions recursively, we use an explicit // stack called `entries_to_process`. for i in 0..syntax_grammar.variables.len() { - let empty_lookaheads = LookaheadSet::new(); + let empty_lookaheads = TokenSet::new(); let mut entries_to_process = vec![(i, &empty_lookaheads, true)]; // First, build up a map whose keys are all of the non-terminals that can @@ -160,7 +154,7 @@ impl<'a> ParseItemSetBuilder<'a> { let existing_info = follow_set_info_by_non_terminal .entry(variable_index) .or_insert_with(|| FollowSetInfo { - lookaheads: LookaheadSet::new(), + lookaheads: TokenSet::new(), propagates_lookaheads: false, }); @@ -269,15 +263,15 @@ impl<'a> ParseItemSetBuilder<'a> { result } - pub fn first_set(&self, symbol: &Symbol) -> &LookaheadSet { + pub fn first_set(&self, symbol: &Symbol) -> &TokenSet { &self.first_sets[symbol] } - pub fn last_set(&self, symbol: &Symbol) -> &LookaheadSet { + pub fn last_set(&self, symbol: &Symbol) -> &TokenSet { &self.first_sets[symbol] } - fn add_item(&self, set: &mut ParseItemSet<'a>, item: ParseItem<'a>, lookaheads: &LookaheadSet) { + fn add_item(&self, set: &mut ParseItemSet<'a>, item: ParseItem<'a>, lookaheads: &TokenSet) { if let Some(step) = item.step() { if step.symbol.is_non_terminal() { let next_step = item.successor().step(); @@ -294,7 +288,7 @@ impl<'a> ParseItemSetBuilder<'a> { let lookaheads = set .entries .entry(addition.item) - .or_insert_with(|| LookaheadSet::new()); + .or_insert_with(|| TokenSet::new()); lookaheads.insert_all(&addition.info.lookaheads); if addition.info.propagates_lookaheads { lookaheads.insert_all(following_tokens); diff --git 
a/cli/src/build_tables/minimize_parse_table.rs b/cli/src/build_tables/minimize_parse_table.rs index 573bf974..d83e117f 100644 --- a/cli/src/build_tables/minimize_parse_table.rs +++ b/cli/src/build_tables/minimize_parse_table.rs @@ -1,4 +1,4 @@ -use super::item::LookaheadSet; +use super::item::TokenSet; use super::token_conflicts::TokenConflictMap; use crate::grammars::{SyntaxGrammar, VariableType}; use crate::rules::{AliasMap, Symbol}; @@ -10,7 +10,7 @@ pub(crate) fn minimize_parse_table( syntax_grammar: &SyntaxGrammar, simple_aliases: &AliasMap, token_conflict_map: &TokenConflictMap, - keywords: &LookaheadSet, + keywords: &TokenSet, ) { let mut minimizer = Minimizer { parse_table, @@ -28,7 +28,7 @@ struct Minimizer<'a> { parse_table: &'a mut ParseTable, syntax_grammar: &'a SyntaxGrammar, token_conflict_map: &'a TokenConflictMap<'a>, - keywords: &'a LookaheadSet, + keywords: &'a TokenSet, simple_aliases: &'a AliasMap, } diff --git a/cli/src/build_tables/mod.rs b/cli/src/build_tables/mod.rs index 04b750e3..c632aa7b 100644 --- a/cli/src/build_tables/mod.rs +++ b/cli/src/build_tables/mod.rs @@ -9,7 +9,7 @@ mod token_conflicts; use self::build_lex_table::build_lex_table; use self::build_parse_table::build_parse_table; use self::coincident_tokens::CoincidentTokenIndex; -use self::item::LookaheadSet; +use self::item::TokenSet; use self::minimize_parse_table::minimize_parse_table; use self::token_conflicts::TokenConflictMap; use crate::error::Result; @@ -44,11 +44,7 @@ pub(crate) fn build_tables( &coincident_token_index, &token_conflict_map, ); - mark_fragile_tokens( - &mut parse_table, - lexical_grammar, - &token_conflict_map, - ); + mark_fragile_tokens(&mut parse_table, lexical_grammar, &token_conflict_map); if minimize { minimize_parse_table( &mut parse_table, @@ -85,22 +81,25 @@ fn populate_error_state( // First identify the *conflict-free tokens*: tokens that do not overlap with // any other token in any way. 
- let conflict_free_tokens = LookaheadSet::with((0..n).into_iter().filter_map(|i| { - let conflicts_with_other_tokens = (0..n).into_iter().any(|j| { - j != i - && !coincident_token_index.contains(Symbol::terminal(i), Symbol::terminal(j)) - && token_conflict_map.does_conflict(i, j) - }); - if conflicts_with_other_tokens { - None - } else { - info!( - "error recovery - token {} has no conflicts", - lexical_grammar.variables[i].name - ); - Some(Symbol::terminal(i)) - } - })); + let conflict_free_tokens: TokenSet = (0..n) + .into_iter() + .filter_map(|i| { + let conflicts_with_other_tokens = (0..n).into_iter().any(|j| { + j != i + && !coincident_token_index.contains(Symbol::terminal(i), Symbol::terminal(j)) + && token_conflict_map.does_conflict(i, j) + }); + if conflicts_with_other_tokens { + None + } else { + info!( + "error recovery - token {} has no conflicts", + lexical_grammar.variables[i].name + ); + Some(Symbol::terminal(i)) + } + }) + .collect(); let recover_entry = ParseTableEntry { reusable: false, @@ -153,9 +152,9 @@ fn identify_keywords( word_token: Option, token_conflict_map: &TokenConflictMap, coincident_token_index: &CoincidentTokenIndex, -) -> LookaheadSet { +) -> TokenSet { if word_token.is_none() { - return LookaheadSet::new(); + return TokenSet::new(); } let word_token = word_token.unwrap(); @@ -163,8 +162,11 @@ fn identify_keywords( // First find all of the candidate keyword tokens: tokens that start with // letters or underscore and can match the same string as a word token. 
- let keywords = LookaheadSet::with(lexical_grammar.variables.iter().enumerate().filter_map( - |(i, variable)| { + let keywords: TokenSet = lexical_grammar + .variables + .iter() + .enumerate() + .filter_map(|(i, variable)| { cursor.reset(vec![variable.start_state]); if all_chars_are_alphabetical(&cursor) && token_conflict_map.does_match_same_string(i, word_token.index) @@ -177,69 +179,75 @@ fn identify_keywords( } else { None } - }, - )); + }) + .collect(); // Exclude keyword candidates that shadow another keyword candidate. - let keywords = LookaheadSet::with(keywords.iter().filter(|token| { - for other_token in keywords.iter() { - if other_token != *token - && token_conflict_map.does_match_same_string(token.index, other_token.index) - { - info!( - "Keywords - exclude {} because it matches the same string as {}", - lexical_grammar.variables[token.index].name, - lexical_grammar.variables[other_token.index].name - ); - return false; + let keywords: TokenSet = keywords + .iter() + .filter(|token| { + for other_token in keywords.iter() { + if other_token != *token + && token_conflict_map.does_match_same_string(token.index, other_token.index) + { + info!( + "Keywords - exclude {} because it matches the same string as {}", + lexical_grammar.variables[token.index].name, + lexical_grammar.variables[other_token.index].name + ); + return false; + } } - } - true - })); + true + }) + .collect(); // Exclude keyword candidates for which substituting the keyword capture // token would introduce new lexical conflicts with other tokens. 
- let keywords = LookaheadSet::with(keywords.iter().filter(|token| { - for other_index in 0..lexical_grammar.variables.len() { - if keywords.contains(&Symbol::terminal(other_index)) { - continue; + let keywords = keywords + .iter() + .filter(|token| { + for other_index in 0..lexical_grammar.variables.len() { + if keywords.contains(&Symbol::terminal(other_index)) { + continue; + } + + // If the word token was already valid in every state containing + // this keyword candidate, then substituting the word token won't + // introduce any new lexical conflicts. + if coincident_token_index + .states_with(*token, Symbol::terminal(other_index)) + .iter() + .all(|state_id| { + parse_table.states[*state_id] + .terminal_entries + .contains_key(&word_token) + }) + { + continue; + } + + if !token_conflict_map.has_same_conflict_status( + token.index, + word_token.index, + other_index, + ) { + info!( + "Keywords - exclude {} because of conflict with {}", + lexical_grammar.variables[token.index].name, + lexical_grammar.variables[other_index].name + ); + return false; + } } - // If the word token was already valid in every state containing - // this keyword candidate, then substituting the word token won't - // introduce any new lexical conflicts. 
- if coincident_token_index - .states_with(*token, Symbol::terminal(other_index)) - .iter() - .all(|state_id| { - parse_table.states[*state_id] - .terminal_entries - .contains_key(&word_token) - }) - { - continue; - } - - if !token_conflict_map.has_same_conflict_status( - token.index, - word_token.index, - other_index, - ) { - info!( - "Keywords - exclude {} because of conflict with {}", - lexical_grammar.variables[token.index].name, - lexical_grammar.variables[other_index].name - ); - return false; - } - } - - info!( - "Keywords - include {}", - lexical_grammar.variables[token.index].name, - ); - true - })); + info!( + "Keywords - include {}", + lexical_grammar.variables[token.index].name, + ); + true + }) + .collect(); keywords } diff --git a/cli/src/build_tables/token_conflicts.rs b/cli/src/build_tables/token_conflicts.rs index cb2b6efe..7bb443a5 100644 --- a/cli/src/build_tables/token_conflicts.rs +++ b/cli/src/build_tables/token_conflicts.rs @@ -1,4 +1,4 @@ -use crate::build_tables::item::LookaheadSet; +use crate::build_tables::item::TokenSet; use crate::grammars::LexicalGrammar; use crate::nfa::{CharacterSet, NfaCursor, NfaTransition}; use hashbrown::HashSet; @@ -22,7 +22,7 @@ pub(crate) struct TokenConflictMap<'a> { } impl<'a> TokenConflictMap<'a> { - pub fn new(grammar: &'a LexicalGrammar, following_tokens: Vec) -> Self { + pub fn new(grammar: &'a LexicalGrammar, following_tokens: Vec) -> Self { let mut cursor = NfaCursor::new(&grammar.nfa, Vec::new()); let starting_chars = get_starting_chars(&mut cursor, grammar); let following_chars = get_following_chars(&starting_chars, following_tokens); @@ -141,7 +141,7 @@ fn get_starting_chars(cursor: &mut NfaCursor, grammar: &LexicalGrammar) -> Vec, - following_tokens: Vec, + following_tokens: Vec, ) -> Vec { following_tokens .into_iter() @@ -352,9 +352,15 @@ mod tests { let token_map = TokenConflictMap::new( &grammar, vec![ - LookaheadSet::with([Symbol::terminal(var("identifier"))].iter().cloned()), - 
LookaheadSet::with([Symbol::terminal(var("in"))].iter().cloned()), - LookaheadSet::with([Symbol::terminal(var("identifier"))].iter().cloned()), + [Symbol::terminal(var("identifier"))] + .iter() + .cloned() + .collect(), + [Symbol::terminal(var("in"))].iter().cloned().collect(), + [Symbol::terminal(var("identifier"))] + .iter() + .cloned() + .collect(), ], ); From 4e29fe69df7b8ee1d1fb032f8662082b0d9da872 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Mon, 7 Jan 2019 09:59:04 -0800 Subject: [PATCH 105/208] Reduce lex table size by merging compatible entry point states --- cli/src/build_tables/build_lex_table.rs | 65 +++++++++++++++++++++++-- cli/src/build_tables/item.rs | 17 +++++++ cli/src/build_tables/mod.rs | 2 + 3 files changed, 80 insertions(+), 4 deletions(-) diff --git a/cli/src/build_tables/build_lex_table.rs b/cli/src/build_tables/build_lex_table.rs index 9fc8edc6..0f828f5c 100644 --- a/cli/src/build_tables/build_lex_table.rs +++ b/cli/src/build_tables/build_lex_table.rs @@ -1,9 +1,10 @@ +use super::coincident_tokens::CoincidentTokenIndex; use super::item::TokenSet; use super::token_conflicts::TokenConflictMap; use crate::grammars::{LexicalGrammar, SyntaxGrammar}; use crate::nfa::{CharacterSet, NfaCursor, NfaTransition}; use crate::rules::Symbol; -use crate::tables::{AdvanceAction, LexState, LexTable, ParseTable}; +use crate::tables::{AdvanceAction, LexState, LexTable, ParseStateId, ParseTable}; use std::collections::hash_map::Entry; use std::collections::{BTreeMap, HashMap, VecDeque}; @@ -12,6 +13,8 @@ pub(crate) fn build_lex_table( syntax_grammar: &SyntaxGrammar, lexical_grammar: &LexicalGrammar, keywords: &TokenSet, + coincident_token_index: &CoincidentTokenIndex, + token_conflict_map: &TokenConflictMap, minimize: bool, ) -> (LexTable, LexTable) { let keyword_lex_table; @@ -23,8 +26,8 @@ pub(crate) fn build_lex_table( keyword_lex_table = LexTable::default(); } - let mut builder = LexTableBuilder::new(lexical_grammar); - for state in 
parse_table.states.iter_mut() { + let mut parse_state_ids_by_token_set: Vec<(TokenSet, Vec)> = Vec::new(); + for (i, state) in parse_table.states.iter().enumerate() { let tokens = state .terminal_entries .keys() @@ -42,7 +45,33 @@ pub(crate) fn build_lex_table( } }) .collect(); - state.lex_state_id = builder.add_state_for_tokens(&tokens); + + let mut did_merge = false; + for entry in parse_state_ids_by_token_set.iter_mut() { + if merge_token_set( + &mut entry.0, + &tokens, + lexical_grammar, + token_conflict_map, + coincident_token_index, + ) { + did_merge = true; + entry.1.push(i); + break; + } + } + + if !did_merge { + parse_state_ids_by_token_set.push((tokens, vec![i])); + } + } + + let mut builder = LexTableBuilder::new(lexical_grammar); + for (tokens, parse_state_ids) in parse_state_ids_by_token_set { + let lex_state_id = builder.add_state_for_tokens(&tokens); + for id in parse_state_ids { + parse_table.states[id].lex_state_id = lex_state_id; + } } let mut table = builder.table; @@ -215,6 +244,34 @@ impl<'a> LexTableBuilder<'a> { } } +fn merge_token_set( + tokens: &mut TokenSet, + other: &TokenSet, + lexical_grammar: &LexicalGrammar, + token_conflict_map: &TokenConflictMap, + coincident_token_index: &CoincidentTokenIndex, +) -> bool { + for i in 0..lexical_grammar.variables.len() { + let symbol = Symbol::terminal(i); + let set_without_terminal = match (tokens.contains_terminal(i), other.contains_terminal(i)) { + (true, false) => other, + (false, true) => tokens, + _ => continue, + }; + + for existing_token in set_without_terminal.terminals() { + if token_conflict_map.does_conflict(i, existing_token.index) + || !coincident_token_index.contains(symbol, existing_token) + { + return false; + } + } + } + + tokens.insert_all(other); + true +} + fn minimize_lex_table(table: &mut LexTable, parse_table: &mut ParseTable) { let mut state_replacements = BTreeMap::new(); let mut done = false; diff --git a/cli/src/build_tables/item.rs b/cli/src/build_tables/item.rs index 
5d6edc2f..2be331b0 100644 --- a/cli/src/build_tables/item.rs +++ b/cli/src/build_tables/item.rs @@ -93,6 +93,19 @@ impl TokenSet { .chain(if self.eof { Some(Symbol::end()) } else { None }) } + pub fn terminals<'a>(&'a self) -> impl Iterator + 'a { + self.terminal_bits + .iter() + .enumerate() + .filter_map(|(i, value)| { + if value { + Some(Symbol::terminal(i)) + } else { + None + } + }) + } + pub fn contains(&self, symbol: &Symbol) -> bool { match symbol.kind { SymbolType::NonTerminal => panic!("Cannot store non-terminals in a TokenSet"), @@ -102,6 +115,10 @@ impl TokenSet { } } + pub fn contains_terminal(&self, index: usize) -> bool { + self.terminal_bits.get(index).unwrap_or(false) + } + pub fn insert(&mut self, other: Symbol) { let vec = match other.kind { SymbolType::NonTerminal => panic!("Cannot store non-terminals in a TokenSet"), diff --git a/cli/src/build_tables/mod.rs b/cli/src/build_tables/mod.rs index c632aa7b..1f9acc14 100644 --- a/cli/src/build_tables/mod.rs +++ b/cli/src/build_tables/mod.rs @@ -59,6 +59,8 @@ pub(crate) fn build_tables( syntax_grammar, lexical_grammar, &keywords, + &coincident_token_index, + &token_conflict_map, minimize, ); Ok(( From f059557a9df750340eb87ca087c1df5d3b0fbd11 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Mon, 7 Jan 2019 10:23:01 -0800 Subject: [PATCH 106/208] Move parser generation code in to 'generate' module within CLI crate --- cli/src/generate.rs | 34 -------- .../build_tables/build_lex_table.rs | 8 +- .../build_tables/build_parse_table.rs | 6 +- .../build_tables/coincident_tokens.rs | 6 +- cli/src/{ => generate}/build_tables/item.rs | 6 +- .../build_tables/item_set_builder.rs | 4 +- .../build_tables/minimize_parse_table.rs | 6 +- cli/src/{ => generate}/build_tables/mod.rs | 8 +- .../build_tables/token_conflicts.rs | 12 +-- cli/src/{js => generate}/dsl.js | 0 cli/src/{ => generate}/grammar-schema.json | 0 cli/src/{ => generate}/grammars.rs | 4 +- cli/src/generate/mod.rs | 79 +++++++++++++++++++ cli/src/{ => 
generate}/nfa.rs | 0 cli/src/{ => generate}/parse_grammar.rs | 66 ++++++++++------ .../prepare_grammar/expand_repeats.rs | 4 +- .../prepare_grammar/expand_tokens.rs | 10 +-- .../prepare_grammar/extract_simple_aliases.rs | 8 +- .../prepare_grammar/extract_tokens.rs | 6 +- .../prepare_grammar/flatten_grammar.rs | 8 +- .../prepare_grammar/intern_symbols.rs | 4 +- cli/src/{ => generate}/prepare_grammar/mod.rs | 4 +- .../prepare_grammar/process_inlines.rs | 6 +- cli/src/{render/mod.rs => generate/render.rs} | 8 +- cli/src/{ => generate}/rules.rs | 0 cli/src/{ => generate}/tables.rs | 4 +- cli/src/main.rs | 55 ++----------- script/check-mallocs | 2 +- 28 files changed, 187 insertions(+), 171 deletions(-) delete mode 100644 cli/src/generate.rs rename cli/src/{ => generate}/build_tables/build_lex_table.rs (97%) rename cli/src/{ => generate}/build_tables/build_parse_table.rs (99%) rename cli/src/{ => generate}/build_tables/coincident_tokens.rs (93%) rename cli/src/{ => generate}/build_tables/item.rs (98%) rename cli/src/{ => generate}/build_tables/item_set_builder.rs (99%) rename cli/src/{ => generate}/build_tables/minimize_parse_table.rs (98%) rename cli/src/{ => generate}/build_tables/mod.rs (97%) rename cli/src/{ => generate}/build_tables/token_conflicts.rs (97%) rename cli/src/{js => generate}/dsl.js (100%) rename cli/src/{ => generate}/grammar-schema.json (100%) rename cli/src/{ => generate}/grammars.rs (98%) create mode 100644 cli/src/generate/mod.rs rename cli/src/{ => generate}/nfa.rs (100%) rename cli/src/{ => generate}/parse_grammar.rs (73%) rename cli/src/{ => generate}/prepare_grammar/expand_repeats.rs (98%) rename cli/src/{ => generate}/prepare_grammar/expand_tokens.rs (98%) rename cli/src/{ => generate}/prepare_grammar/extract_simple_aliases.rs (96%) rename cli/src/{ => generate}/prepare_grammar/extract_tokens.rs (98%) rename cli/src/{ => generate}/prepare_grammar/flatten_grammar.rs (97%) rename cli/src/{ => generate}/prepare_grammar/intern_symbols.rs (98%) 
rename cli/src/{ => generate}/prepare_grammar/mod.rs (95%) rename cli/src/{ => generate}/prepare_grammar/process_inlines.rs (98%) rename cli/src/{render/mod.rs => generate/render.rs} (99%) rename cli/src/{ => generate}/rules.rs (100%) rename cli/src/{ => generate}/tables.rs (97%) diff --git a/cli/src/generate.rs b/cli/src/generate.rs deleted file mode 100644 index aa8f3b5b..00000000 --- a/cli/src/generate.rs +++ /dev/null @@ -1,34 +0,0 @@ -use crate::build_tables::build_tables; -use crate::error::Result; -use crate::parse_grammar::parse_grammar; -use crate::prepare_grammar::prepare_grammar; -use crate::render::render_c_code; - -pub fn generate_parser_for_grammar( - input: &str, - minimize: bool, - state_ids_to_log: Vec, -) -> Result { - let input_grammar = parse_grammar(input)?; - let (syntax_grammar, lexical_grammar, inlines, simple_aliases) = - prepare_grammar(&input_grammar)?; - let (parse_table, main_lex_table, keyword_lex_table, keyword_capture_token) = build_tables( - &syntax_grammar, - &lexical_grammar, - &simple_aliases, - &inlines, - minimize, - state_ids_to_log, - )?; - let c_code = render_c_code( - &input_grammar.name, - parse_table, - main_lex_table, - keyword_lex_table, - keyword_capture_token, - syntax_grammar, - lexical_grammar, - simple_aliases, - ); - Ok(c_code) -} diff --git a/cli/src/build_tables/build_lex_table.rs b/cli/src/generate/build_tables/build_lex_table.rs similarity index 97% rename from cli/src/build_tables/build_lex_table.rs rename to cli/src/generate/build_tables/build_lex_table.rs index 0f828f5c..200c6959 100644 --- a/cli/src/build_tables/build_lex_table.rs +++ b/cli/src/generate/build_tables/build_lex_table.rs @@ -1,10 +1,10 @@ use super::coincident_tokens::CoincidentTokenIndex; use super::item::TokenSet; use super::token_conflicts::TokenConflictMap; -use crate::grammars::{LexicalGrammar, SyntaxGrammar}; -use crate::nfa::{CharacterSet, NfaCursor, NfaTransition}; -use crate::rules::Symbol; -use crate::tables::{AdvanceAction, 
LexState, LexTable, ParseStateId, ParseTable}; +use crate::generate::grammars::{LexicalGrammar, SyntaxGrammar}; +use crate::generate::nfa::{CharacterSet, NfaCursor, NfaTransition}; +use crate::generate::rules::Symbol; +use crate::generate::tables::{AdvanceAction, LexState, LexTable, ParseStateId, ParseTable}; use std::collections::hash_map::Entry; use std::collections::{BTreeMap, HashMap, VecDeque}; diff --git a/cli/src/build_tables/build_parse_table.rs b/cli/src/generate/build_tables/build_parse_table.rs similarity index 99% rename from cli/src/build_tables/build_parse_table.rs rename to cli/src/generate/build_tables/build_parse_table.rs index 27baf146..73c9c0e2 100644 --- a/cli/src/build_tables/build_parse_table.rs +++ b/cli/src/generate/build_tables/build_parse_table.rs @@ -1,9 +1,9 @@ use super::item::{ParseItem, ParseItemSet, TokenSet}; use super::item_set_builder::ParseItemSetBuilder; use crate::error::{Error, Result}; -use crate::grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar, VariableType}; -use crate::rules::{Alias, Associativity, Symbol, SymbolType}; -use crate::tables::{ +use crate::generate::grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar, VariableType}; +use crate::generate::rules::{Alias, Associativity, Symbol, SymbolType}; +use crate::generate::tables::{ AliasSequenceId, ParseAction, ParseState, ParseStateId, ParseTable, ParseTableEntry, }; use core::ops::Range; diff --git a/cli/src/build_tables/coincident_tokens.rs b/cli/src/generate/build_tables/coincident_tokens.rs similarity index 93% rename from cli/src/build_tables/coincident_tokens.rs rename to cli/src/generate/build_tables/coincident_tokens.rs index 62295073..25dbc331 100644 --- a/cli/src/build_tables/coincident_tokens.rs +++ b/cli/src/generate/build_tables/coincident_tokens.rs @@ -1,6 +1,6 @@ -use crate::grammars::LexicalGrammar; -use crate::rules::Symbol; -use crate::tables::{ParseStateId, ParseTable}; +use crate::generate::grammars::LexicalGrammar; +use 
crate::generate::rules::Symbol; +use crate::generate::tables::{ParseStateId, ParseTable}; use std::fmt; pub(crate) struct CoincidentTokenIndex<'a> { diff --git a/cli/src/build_tables/item.rs b/cli/src/generate/build_tables/item.rs similarity index 98% rename from cli/src/build_tables/item.rs rename to cli/src/generate/build_tables/item.rs index 2be331b0..81c86f4a 100644 --- a/cli/src/build_tables/item.rs +++ b/cli/src/generate/build_tables/item.rs @@ -1,6 +1,6 @@ -use crate::grammars::{LexicalGrammar, Production, ProductionStep, SyntaxGrammar}; -use crate::rules::Associativity; -use crate::rules::{Symbol, SymbolType}; +use crate::generate::grammars::{LexicalGrammar, Production, ProductionStep, SyntaxGrammar}; +use crate::generate::rules::Associativity; +use crate::generate::rules::{Symbol, SymbolType}; use smallbitvec::SmallBitVec; use std::cmp::Ordering; use std::collections::BTreeMap; diff --git a/cli/src/build_tables/item_set_builder.rs b/cli/src/generate/build_tables/item_set_builder.rs similarity index 99% rename from cli/src/build_tables/item_set_builder.rs rename to cli/src/generate/build_tables/item_set_builder.rs index fea3b4d1..56d7c7c4 100644 --- a/cli/src/build_tables/item_set_builder.rs +++ b/cli/src/generate/build_tables/item_set_builder.rs @@ -1,6 +1,6 @@ use super::item::{ParseItem, ParseItemDisplay, ParseItemSet, TokenSet}; -use crate::grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar}; -use crate::rules::Symbol; +use crate::generate::grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar}; +use crate::generate::rules::Symbol; use hashbrown::{HashMap, HashSet}; use std::fmt; diff --git a/cli/src/build_tables/minimize_parse_table.rs b/cli/src/generate/build_tables/minimize_parse_table.rs similarity index 98% rename from cli/src/build_tables/minimize_parse_table.rs rename to cli/src/generate/build_tables/minimize_parse_table.rs index d83e117f..007c9703 100644 --- a/cli/src/build_tables/minimize_parse_table.rs +++ 
b/cli/src/generate/build_tables/minimize_parse_table.rs @@ -1,8 +1,8 @@ use super::item::TokenSet; use super::token_conflicts::TokenConflictMap; -use crate::grammars::{SyntaxGrammar, VariableType}; -use crate::rules::{AliasMap, Symbol}; -use crate::tables::{ParseAction, ParseState, ParseTable, ParseTableEntry}; +use crate::generate::grammars::{SyntaxGrammar, VariableType}; +use crate::generate::rules::{AliasMap, Symbol}; +use crate::generate::tables::{ParseAction, ParseState, ParseTable, ParseTableEntry}; use hashbrown::{HashMap, HashSet}; pub(crate) fn minimize_parse_table( diff --git a/cli/src/build_tables/mod.rs b/cli/src/generate/build_tables/mod.rs similarity index 97% rename from cli/src/build_tables/mod.rs rename to cli/src/generate/build_tables/mod.rs index 1f9acc14..7d55d0fa 100644 --- a/cli/src/build_tables/mod.rs +++ b/cli/src/generate/build_tables/mod.rs @@ -13,10 +13,10 @@ use self::item::TokenSet; use self::minimize_parse_table::minimize_parse_table; use self::token_conflicts::TokenConflictMap; use crate::error::Result; -use crate::grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar}; -use crate::nfa::{CharacterSet, NfaCursor}; -use crate::rules::{AliasMap, Symbol}; -use crate::tables::{LexTable, ParseAction, ParseTable, ParseTableEntry}; +use crate::generate::grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar}; +use crate::generate::nfa::{CharacterSet, NfaCursor}; +use crate::generate::rules::{AliasMap, Symbol}; +use crate::generate::tables::{LexTable, ParseAction, ParseTable, ParseTableEntry}; pub(crate) fn build_tables( syntax_grammar: &SyntaxGrammar, diff --git a/cli/src/build_tables/token_conflicts.rs b/cli/src/generate/build_tables/token_conflicts.rs similarity index 97% rename from cli/src/build_tables/token_conflicts.rs rename to cli/src/generate/build_tables/token_conflicts.rs index 7bb443a5..1a63bfc8 100644 --- a/cli/src/build_tables/token_conflicts.rs +++ b/cli/src/generate/build_tables/token_conflicts.rs @@ -1,6 +1,6 
@@ -use crate::build_tables::item::TokenSet; -use crate::grammars::LexicalGrammar; -use crate::nfa::{CharacterSet, NfaCursor, NfaTransition}; +use crate::generate::build_tables::item::TokenSet; +use crate::generate::grammars::LexicalGrammar; +use crate::generate::nfa::{CharacterSet, NfaCursor, NfaTransition}; use hashbrown::HashSet; use std::cmp::Ordering; use std::fmt; @@ -288,9 +288,9 @@ fn variable_ids_for_states<'a>( #[cfg(test)] mod tests { use super::*; - use crate::grammars::{Variable, VariableType}; - use crate::prepare_grammar::{expand_tokens, ExtractedLexicalGrammar}; - use crate::rules::{Rule, Symbol}; + use crate::generate::grammars::{Variable, VariableType}; + use crate::generate::prepare_grammar::{expand_tokens, ExtractedLexicalGrammar}; + use crate::generate::rules::{Rule, Symbol}; #[test] fn test_starting_characters() { diff --git a/cli/src/js/dsl.js b/cli/src/generate/dsl.js similarity index 100% rename from cli/src/js/dsl.js rename to cli/src/generate/dsl.js diff --git a/cli/src/grammar-schema.json b/cli/src/generate/grammar-schema.json similarity index 100% rename from cli/src/grammar-schema.json rename to cli/src/generate/grammar-schema.json diff --git a/cli/src/grammars.rs b/cli/src/generate/grammars.rs similarity index 98% rename from cli/src/grammars.rs rename to cli/src/generate/grammars.rs index f82d6b02..3772bfd4 100644 --- a/cli/src/grammars.rs +++ b/cli/src/generate/grammars.rs @@ -1,5 +1,5 @@ -use crate::nfa::Nfa; -use crate::rules::{Alias, Associativity, Rule, Symbol}; +use super::nfa::Nfa; +use super::rules::{Alias, Associativity, Rule, Symbol}; use hashbrown::HashMap; #[derive(Clone, Copy, Debug, PartialEq, Eq)] diff --git a/cli/src/generate/mod.rs b/cli/src/generate/mod.rs new file mode 100644 index 00000000..7dfe5a4b --- /dev/null +++ b/cli/src/generate/mod.rs @@ -0,0 +1,79 @@ +use self::build_tables::build_tables; +use self::parse_grammar::parse_grammar; +use self::prepare_grammar::prepare_grammar; +use 
self::render::render_c_code; +use crate::error::Result; +use std::io::Write; +use std::path::PathBuf; +use std::process::{Command, Stdio}; + +mod build_tables; +mod grammars; +mod nfa; +mod parse_grammar; +mod prepare_grammar; +mod render; +mod rules; +mod tables; + +pub fn generate_parser_for_grammar( + grammar_path: &PathBuf, + minimize: bool, + state_ids_to_log: Vec, +) -> Result { + let grammar_json = load_js_grammar_file(grammar_path); + let input_grammar = parse_grammar(&grammar_json)?; + let (syntax_grammar, lexical_grammar, inlines, simple_aliases) = + prepare_grammar(&input_grammar)?; + let (parse_table, main_lex_table, keyword_lex_table, keyword_capture_token) = build_tables( + &syntax_grammar, + &lexical_grammar, + &simple_aliases, + &inlines, + minimize, + state_ids_to_log, + )?; + let c_code = render_c_code( + &input_grammar.name, + parse_table, + main_lex_table, + keyword_lex_table, + keyword_capture_token, + syntax_grammar, + lexical_grammar, + simple_aliases, + ); + Ok(c_code) +} + +fn load_js_grammar_file(grammar_path: &PathBuf) -> String { + let mut node_process = Command::new("node") + .stdin(Stdio::piped()) + .stdout(Stdio::piped()) + .spawn() + .expect("Failed to run `node`"); + + let js_prelude = include_str!("./dsl.js"); + let mut node_stdin = node_process + .stdin + .take() + .expect("Failed to open stdin for node"); + write!( + node_stdin, + "{}\nconsole.log(JSON.stringify(require(\"{}\"), null, 2));\n", + js_prelude, + grammar_path.to_str().unwrap() + ) + .expect("Failed to write to node's stdin"); + drop(node_stdin); + let output = node_process + .wait_with_output() + .expect("Failed to read output from node"); + match output.status.code() { + None => panic!("Node process was killed"), + Some(0) => {} + Some(code) => panic!(format!("Node process exited with status {}", code)), + } + + String::from_utf8(output.stdout).expect("Got invalid UTF8 from node") +} diff --git a/cli/src/nfa.rs b/cli/src/generate/nfa.rs similarity index 100% rename 
from cli/src/nfa.rs rename to cli/src/generate/nfa.rs diff --git a/cli/src/parse_grammar.rs b/cli/src/generate/parse_grammar.rs similarity index 73% rename from cli/src/parse_grammar.rs rename to cli/src/generate/parse_grammar.rs index 6808f402..e77dce9b 100644 --- a/cli/src/parse_grammar.rs +++ b/cli/src/generate/parse_grammar.rs @@ -1,7 +1,7 @@ -use serde_json::{Map, Value}; +use super::grammars::{InputGrammar, Variable, VariableType}; +use super::rules::Rule; use crate::error::Result; -use crate::grammars::{InputGrammar, Variable, VariableType}; -use crate::rules::Rule; +use serde_json::{Map, Value}; #[derive(Deserialize)] #[serde(tag = "type")] @@ -81,20 +81,20 @@ pub(crate) fn parse_grammar(input: &str) -> Result { }) } - let extra_tokens = grammar_json.extras + let extra_tokens = grammar_json + .extras .unwrap_or(Vec::new()) .into_iter() .map(parse_rule) .collect(); - let external_tokens = grammar_json.externals + let external_tokens = grammar_json + .externals .unwrap_or(Vec::new()) .into_iter() .map(parse_rule) .collect(); - let expected_conflicts = grammar_json.conflicts - .unwrap_or(Vec::new()); - let variables_to_inline = grammar_json.inline - .unwrap_or(Vec::new()); + let expected_conflicts = grammar_json.conflicts.unwrap_or(Vec::new()); + let variables_to_inline = grammar_json.inline.unwrap_or(Vec::new()); Ok(InputGrammar { name: grammar_json.name, @@ -109,7 +109,11 @@ pub(crate) fn parse_grammar(input: &str) -> Result { fn parse_rule(json: RuleJSON) -> Rule { match json { - RuleJSON::ALIAS { content, value, named } => Rule::alias(parse_rule(*content), value, named), + RuleJSON::ALIAS { + content, + value, + named, + } => Rule::alias(parse_rule(*content), value, named), RuleJSON::BLANK => Rule::Blank, RuleJSON::STRING { value } => Rule::String(value), RuleJSON::PATTERN { value } => Rule::Pattern(value), @@ -117,11 +121,15 @@ fn parse_rule(json: RuleJSON) -> Rule { RuleJSON::CHOICE { members } => 
Rule::choice(members.into_iter().map(parse_rule).collect()), RuleJSON::SEQ { members } => Rule::seq(members.into_iter().map(parse_rule).collect()), RuleJSON::REPEAT1 { content } => Rule::repeat(parse_rule(*content)), - RuleJSON::REPEAT { content } => Rule::choice(vec![Rule::repeat(parse_rule(*content)), Rule::Blank]), + RuleJSON::REPEAT { content } => { + Rule::choice(vec![Rule::repeat(parse_rule(*content)), Rule::Blank]) + } RuleJSON::PREC { value, content } => Rule::prec(value, parse_rule(*content)), RuleJSON::PREC_LEFT { value, content } => Rule::prec_left(value, parse_rule(*content)), RuleJSON::PREC_RIGHT { value, content } => Rule::prec_right(value, parse_rule(*content)), - RuleJSON::PREC_DYNAMIC { value, content } => Rule::prec_dynamic(value, parse_rule(*content)), + RuleJSON::PREC_DYNAMIC { value, content } => { + Rule::prec_dynamic(value, parse_rule(*content)) + } RuleJSON::TOKEN { content } => Rule::token(parse_rule(*content)), RuleJSON::IMMEDIATE_TOKEN { content } => Rule::immediate_token(parse_rule(*content)), } @@ -133,7 +141,8 @@ mod tests { #[test] fn test_parse_grammar() { - let grammar = parse_grammar(r#"{ + let grammar = parse_grammar( + r#"{ "name": "my_lang", "rules": { "file": { @@ -148,20 +157,25 @@ mod tests { "value": "foo" } } - }"#).unwrap(); + }"#, + ) + .unwrap(); assert_eq!(grammar.name, "my_lang"); - assert_eq!(grammar.variables, vec![ - Variable { - name: "file".to_string(), - kind: VariableType::Named, - rule: Rule::repeat(Rule::NamedSymbol("statement".to_string())) - }, - Variable { - name: "statement".to_string(), - kind: VariableType::Named, - rule: Rule::String("foo".to_string()) - }, - ]); + assert_eq!( + grammar.variables, + vec![ + Variable { + name: "file".to_string(), + kind: VariableType::Named, + rule: Rule::repeat(Rule::NamedSymbol("statement".to_string())) + }, + Variable { + name: "statement".to_string(), + kind: VariableType::Named, + rule: Rule::String("foo".to_string()) + }, + ] + ); } } diff --git 
a/cli/src/prepare_grammar/expand_repeats.rs b/cli/src/generate/prepare_grammar/expand_repeats.rs similarity index 98% rename from cli/src/prepare_grammar/expand_repeats.rs rename to cli/src/generate/prepare_grammar/expand_repeats.rs index 4589bd11..b290799b 100644 --- a/cli/src/prepare_grammar/expand_repeats.rs +++ b/cli/src/generate/prepare_grammar/expand_repeats.rs @@ -1,6 +1,6 @@ use super::ExtractedSyntaxGrammar; -use crate::grammars::{Variable, VariableType}; -use crate::rules::{Rule, Symbol}; +use crate::generate::grammars::{Variable, VariableType}; +use crate::generate::rules::{Rule, Symbol}; use hashbrown::HashMap; use std::mem; diff --git a/cli/src/prepare_grammar/expand_tokens.rs b/cli/src/generate/prepare_grammar/expand_tokens.rs similarity index 98% rename from cli/src/prepare_grammar/expand_tokens.rs rename to cli/src/generate/prepare_grammar/expand_tokens.rs index 2678df19..d1922dc0 100644 --- a/cli/src/prepare_grammar/expand_tokens.rs +++ b/cli/src/generate/prepare_grammar/expand_tokens.rs @@ -1,8 +1,8 @@ use super::ExtractedLexicalGrammar; use crate::error::{Error, Result}; -use crate::grammars::{LexicalGrammar, LexicalVariable}; -use crate::nfa::{CharacterSet, Nfa, NfaState}; -use crate::rules::Rule; +use crate::generate::grammars::{LexicalGrammar, LexicalVariable}; +use crate::generate::nfa::{CharacterSet, Nfa, NfaState}; +use crate::generate::rules::Rule; use regex_syntax::ast::{ parse, Ast, Class, ClassPerlKind, ClassSet, ClassSetItem, RepetitionKind, RepetitionRange, }; @@ -366,8 +366,8 @@ impl NfaBuilder { #[cfg(test)] mod tests { use super::*; - use crate::grammars::Variable; - use crate::nfa::{NfaCursor, NfaTransition}; + use crate::generate::grammars::Variable; + use crate::generate::nfa::{NfaCursor, NfaTransition}; fn simulate_nfa<'a>(grammar: &'a LexicalGrammar, s: &'a str) -> Option<(usize, &'a str)> { let start_states = grammar.variables.iter().map(|v| v.start_state).collect(); diff --git 
a/cli/src/prepare_grammar/extract_simple_aliases.rs b/cli/src/generate/prepare_grammar/extract_simple_aliases.rs similarity index 96% rename from cli/src/prepare_grammar/extract_simple_aliases.rs rename to cli/src/generate/prepare_grammar/extract_simple_aliases.rs index aa8b3f77..84c535b9 100644 --- a/cli/src/prepare_grammar/extract_simple_aliases.rs +++ b/cli/src/generate/prepare_grammar/extract_simple_aliases.rs @@ -1,5 +1,5 @@ -use crate::rules::{Alias, AliasMap, Symbol, SymbolType}; -use crate::grammars::{LexicalGrammar, SyntaxGrammar}; +use crate::generate::rules::{Alias, AliasMap, Symbol, SymbolType}; +use crate::generate::grammars::{LexicalGrammar, SyntaxGrammar}; #[derive(Clone, Default)] struct SymbolStatus { @@ -83,8 +83,8 @@ pub(super) fn extract_simple_aliases( #[cfg(test)] mod tests { use super::*; - use crate::grammars::{LexicalVariable, SyntaxVariable, VariableType, Production, ProductionStep}; - use crate::nfa::Nfa; + use crate::generate::grammars::{LexicalVariable, SyntaxVariable, VariableType, Production, ProductionStep}; + use crate::generate::nfa::Nfa; #[test] fn test_extract_simple_aliases() { diff --git a/cli/src/prepare_grammar/extract_tokens.rs b/cli/src/generate/prepare_grammar/extract_tokens.rs similarity index 98% rename from cli/src/prepare_grammar/extract_tokens.rs rename to cli/src/generate/prepare_grammar/extract_tokens.rs index 5a54d34e..ae07763b 100644 --- a/cli/src/prepare_grammar/extract_tokens.rs +++ b/cli/src/generate/prepare_grammar/extract_tokens.rs @@ -1,7 +1,7 @@ use super::{ExtractedLexicalGrammar, ExtractedSyntaxGrammar, InternedGrammar}; use crate::error::{Error, Result}; -use crate::grammars::{ExternalToken, Variable, VariableType}; -use crate::rules::{MetadataParams, Rule, Symbol, SymbolType}; +use crate::generate::grammars::{ExternalToken, Variable, VariableType}; +use crate::generate::rules::{MetadataParams, Rule, Symbol, SymbolType}; use hashbrown::HashMap; use std::mem; @@ -311,7 +311,7 @@ impl SymbolReplacer { 
#[cfg(test)] mod test { use super::*; - use crate::grammars::VariableType; + use crate::generate::grammars::VariableType; #[test] fn test_extraction() { diff --git a/cli/src/prepare_grammar/flatten_grammar.rs b/cli/src/generate/prepare_grammar/flatten_grammar.rs similarity index 97% rename from cli/src/prepare_grammar/flatten_grammar.rs rename to cli/src/generate/prepare_grammar/flatten_grammar.rs index 3ffef086..9409a010 100644 --- a/cli/src/prepare_grammar/flatten_grammar.rs +++ b/cli/src/generate/prepare_grammar/flatten_grammar.rs @@ -1,7 +1,7 @@ use super::ExtractedSyntaxGrammar; use crate::error::Result; -use crate::grammars::{Production, ProductionStep, SyntaxGrammar, SyntaxVariable, Variable}; -use crate::rules::{Alias, Associativity, Rule}; +use crate::generate::grammars::{Production, ProductionStep, SyntaxGrammar, SyntaxVariable, Variable}; +use crate::generate::rules::{Alias, Associativity, Rule}; struct RuleFlattener { production: Production, @@ -163,8 +163,8 @@ pub(super) fn flatten_grammar(grammar: ExtractedSyntaxGrammar) -> Result Result { let interner = Interner { grammar }; diff --git a/cli/src/prepare_grammar/mod.rs b/cli/src/generate/prepare_grammar/mod.rs similarity index 95% rename from cli/src/prepare_grammar/mod.rs rename to cli/src/generate/prepare_grammar/mod.rs index b0c1d2a3..41f668f4 100644 --- a/cli/src/prepare_grammar/mod.rs +++ b/cli/src/generate/prepare_grammar/mod.rs @@ -14,10 +14,10 @@ use self::flatten_grammar::flatten_grammar; use self::intern_symbols::intern_symbols; use self::process_inlines::process_inlines; use crate::error::Result; -use crate::grammars::{ +use crate::generate::grammars::{ ExternalToken, InlinedProductionMap, InputGrammar, LexicalGrammar, SyntaxGrammar, Variable, }; -use crate::rules::{AliasMap, Rule, Symbol}; +use crate::generate::rules::{AliasMap, Rule, Symbol}; pub(crate) struct IntermediateGrammar { variables: Vec, diff --git a/cli/src/prepare_grammar/process_inlines.rs 
b/cli/src/generate/prepare_grammar/process_inlines.rs similarity index 98% rename from cli/src/prepare_grammar/process_inlines.rs rename to cli/src/generate/prepare_grammar/process_inlines.rs index 557b0fa4..3c0f529a 100644 --- a/cli/src/prepare_grammar/process_inlines.rs +++ b/cli/src/generate/prepare_grammar/process_inlines.rs @@ -1,4 +1,4 @@ -use crate::grammars::{InlinedProductionMap, Production, ProductionStep, SyntaxGrammar}; +use crate::generate::grammars::{InlinedProductionMap, Production, ProductionStep, SyntaxGrammar}; use hashbrown::HashMap; #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] @@ -184,8 +184,8 @@ pub(super) fn process_inlines(grammar: &SyntaxGrammar) -> InlinedProductionMap { #[cfg(test)] mod tests { use super::*; - use crate::grammars::{ProductionStep, SyntaxVariable, VariableType}; - use crate::rules::{Associativity, Symbol}; + use crate::generate::grammars::{ProductionStep, SyntaxVariable, VariableType}; + use crate::generate::rules::{Associativity, Symbol}; #[test] fn test_basic_inlining() { diff --git a/cli/src/render/mod.rs b/cli/src/generate/render.rs similarity index 99% rename from cli/src/render/mod.rs rename to cli/src/generate/render.rs index 36429848..5e0d2b67 100644 --- a/cli/src/render/mod.rs +++ b/cli/src/generate/render.rs @@ -1,7 +1,7 @@ -use crate::grammars::{ExternalToken, LexicalGrammar, SyntaxGrammar, VariableType}; -use crate::nfa::CharacterSet; -use crate::rules::{Alias, AliasMap, Symbol, SymbolType}; -use crate::tables::{AdvanceAction, LexState, LexTable, ParseAction, ParseTable, ParseTableEntry}; +use super::grammars::{ExternalToken, LexicalGrammar, SyntaxGrammar, VariableType}; +use super::nfa::CharacterSet; +use super::rules::{Alias, AliasMap, Symbol, SymbolType}; +use super::tables::{AdvanceAction, LexState, LexTable, ParseAction, ParseTable, ParseTableEntry}; use core::ops::Range; use hashbrown::{HashMap, HashSet}; use std::fmt::Write; diff --git a/cli/src/rules.rs b/cli/src/generate/rules.rs similarity index 
100% rename from cli/src/rules.rs rename to cli/src/generate/rules.rs diff --git a/cli/src/tables.rs b/cli/src/generate/tables.rs similarity index 97% rename from cli/src/tables.rs rename to cli/src/generate/tables.rs index edbbaaab..6c3da68e 100644 --- a/cli/src/tables.rs +++ b/cli/src/generate/tables.rs @@ -1,5 +1,5 @@ -use crate::nfa::CharacterSet; -use crate::rules::{Alias, Associativity, Symbol}; +use super::nfa::CharacterSet; +use super::rules::{Alias, Associativity, Symbol}; use hashbrown::HashMap; pub(crate) type AliasSequenceId = usize; diff --git a/cli/src/main.rs b/cli/src/main.rs index 11c277c3..fe6ffd8c 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -7,24 +7,14 @@ extern crate serde_derive; extern crate hashbrown; extern crate serde_json; -use clap::{App, Arg, SubCommand}; -use std::env; -use std::io::Write; -use std::path::PathBuf; -use std::process::{exit, Command, Stdio}; -use std::usize; - -mod build_tables; mod error; mod generate; -mod grammars; mod logger; -mod nfa; -mod parse_grammar; -mod prepare_grammar; -mod render; -mod rules; -mod tables; + +use clap::{App, Arg, SubCommand}; +use std::env; +use std::process::exit; +use std::usize; fn main() { if let Err(e) = run() { @@ -77,43 +67,10 @@ fn run() -> error::Result<()> { }); let mut grammar_path = env::current_dir().expect("Failed to read CWD"); grammar_path.push("grammar.js"); - let grammar_json = load_js_grammar_file(grammar_path); let code = - generate::generate_parser_for_grammar(&grammar_json, minimize, state_ids_to_log)?; + generate::generate_parser_for_grammar(&grammar_path, minimize, state_ids_to_log)?; println!("{}", code); } Ok(()) } - -fn load_js_grammar_file(grammar_path: PathBuf) -> String { - let mut node_process = Command::new("node") - .stdin(Stdio::piped()) - .stdout(Stdio::piped()) - .spawn() - .expect("Failed to run `node`"); - - let js_prelude = include_str!("./js/dsl.js"); - let mut node_stdin = node_process - .stdin - .take() - .expect("Failed to open stdin for 
node"); - write!( - node_stdin, - "{}\nconsole.log(JSON.stringify(require(\"{}\"), null, 2));\n", - js_prelude, - grammar_path.to_str().unwrap() - ) - .expect("Failed to write to node's stdin"); - drop(node_stdin); - let output = node_process - .wait_with_output() - .expect("Failed to read output from node"); - match output.status.code() { - None => panic!("Node process was killed"), - Some(0) => {} - Some(code) => panic!(format!("Node process exited with status {}", code)), - } - - String::from_utf8(output.stdout).expect("Got invalid UTF8 from node") -} diff --git a/script/check-mallocs b/script/check-mallocs index 0bd064d0..889861d8 100755 --- a/script/check-mallocs +++ b/script/check-mallocs @@ -1,6 +1,6 @@ #!/usr/bin/env bash -src_dir="src/runtime" +src_dir="lib/src" allocation_functions=( malloc From 8291d294fb0b251addc745c90863e22792f5cc28 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Mon, 7 Jan 2019 17:57:27 -0800 Subject: [PATCH 107/208] Add test subcommand Co-Authored-By: Timothy Clem --- Cargo.lock | 2 + cli/Cargo.toml | 2 + cli/src/error.rs | 14 +++ cli/src/loader.rs | 241 ++++++++++++++++++++++++++++++++++++++++++++++ cli/src/main.rs | 34 +++++-- cli/src/test.rs | 212 ++++++++++++++++++++++++++++++++++++++++ 6 files changed, 496 insertions(+), 9 deletions(-) create mode 100644 cli/src/loader.rs create mode 100644 cli/src/test.rs diff --git a/Cargo.lock b/Cargo.lock index 758dcad7..7a249312 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -641,6 +641,7 @@ dependencies = [ name = "tree-sitter-cli" version = "0.1.0" dependencies = [ + "ansi_term 0.11.0 (registry+https://github.com/rust-lang/crates.io-index)", "clap 2.32.0 (registry+https://github.com/rust-lang/crates.io-index)", "dirs 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)", "hashbrown 0.1.7 (registry+https://github.com/rust-lang/crates.io-index)", @@ -648,6 +649,7 @@ dependencies = [ "lazy_static 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)", "libloading 0.5.0 
(registry+https://github.com/rust-lang/crates.io-index)", "log 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)", + "regex 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)", "regex-syntax 0.6.4 (registry+https://github.com/rust-lang/crates.io-index)", "rusqlite 0.14.0 (registry+https://github.com/rust-lang/crates.io-index)", "serde 1.0.80 (registry+https://github.com/rust-lang/crates.io-index)", diff --git a/cli/Cargo.toml b/cli/Cargo.toml index 6a9c253d..200fd2f1 100644 --- a/cli/Cargo.toml +++ b/cli/Cargo.toml @@ -9,6 +9,7 @@ name = "tree-sitter" path = "src/main.rs" [dependencies] +ansi_term = "0.11" lazy_static = "1.2.0" smallbitvec = "2.3.0" clap = "2.32" @@ -20,6 +21,7 @@ rusqlite = "0.14.0" serde = "1.0" serde_derive = "1.0" regex-syntax = "0.6.4" +regex = "1" [dependencies.tree-sitter] path = "../lib" diff --git a/cli/src/error.rs b/cli/src/error.rs index 9a5801f8..1b8b1a79 100644 --- a/cli/src/error.rs +++ b/cli/src/error.rs @@ -1,3 +1,5 @@ +use std::io; + #[derive(Debug)] pub struct Error(pub String); @@ -22,3 +24,15 @@ impl From for Error { Error(error.to_string()) } } + +impl From for Error { + fn from(error: io::Error) -> Self { + Error(error.to_string()) + } +} + +impl From for Error { + fn from(error: String) -> Self { + Error(error) + } +} diff --git a/cli/src/loader.rs b/cli/src/loader.rs new file mode 100644 index 00000000..7dfb233b --- /dev/null +++ b/cli/src/loader.rs @@ -0,0 +1,241 @@ +use libloading::{Library, Symbol}; +use regex::{Regex, RegexBuilder}; +use std::collections::HashMap; +use std::fs; +use std::io; +use std::mem; +use std::path::{Path, PathBuf}; +use std::process::Command; +use tree_sitter::{Language, PropertySheet}; + +const PACKAGE_JSON_PATH: &'static str = "package.json"; +const PARSER_C_PATH: &'static str = "src/parser.c"; +const SCANNER_C_PATH: &'static str = "src/scanner.c"; +const SCANNER_CC_PATH: &'static str = "src/scanner.cc"; + +#[cfg(unix)] +const DYLIB_EXTENSION: &'static str = "so"; + 
+#[cfg(windows)] +const DYLIB_EXTENSION: &'static str = "dll"; + +struct LanguageRepo { + name: String, + path: PathBuf, + language: Option, + configurations: Vec, +} + +pub struct LanguageConfiguration { + name: String, + content_regex: Option, + first_line_regex: Option, + file_types: Vec, + highlight_property_sheet: Option>, +} + +pub struct Loader { + parser_lib_path: PathBuf, + language_repos: Vec, + language_configuration_indices_by_file_type: HashMap>, +} + +unsafe impl Send for Loader {} +unsafe impl Sync for Loader {} + +impl Loader { + pub fn new(parser_lib_path: PathBuf) -> Self { + Loader { + parser_lib_path, + language_repos: Vec::new(), + language_configuration_indices_by_file_type: HashMap::new(), + } + } + + pub fn find_parsers(&mut self, parser_src_paths: &Vec) -> io::Result<()> { + for parser_container_dir in parser_src_paths.iter() { + for entry in fs::read_dir(parser_container_dir)? { + let entry = entry?; + if let Some(parser_dir_name) = entry.file_name().to_str() { + if parser_dir_name.starts_with("tree-sitter-") { + if self.load_language_configurations( + &parser_container_dir.join(parser_dir_name), + ).is_err() { + eprintln!("Error loading {}", parser_dir_name); + } + } + } + } + } + Ok(()) + } + + pub fn language_configuration_at_path( + &mut self, + path: &Path, + ) -> io::Result> { + let repo_index = self.load_language_configurations(path)?; + self.load_language_from_repo(repo_index, 0) + } + + pub fn language_for_file_name( + &mut self, + path: &Path, + ) -> io::Result> { + let indices = path + .file_name() + .and_then(|n| n.to_str()) + .and_then(|file_name| { + self.language_configuration_indices_by_file_type + .get(file_name) + }) + .or_else(|| { + path.extension() + .and_then(|extension| extension.to_str()) + .and_then(|extension| { + self.language_configuration_indices_by_file_type + .get(extension) + }) + }); + + if let Some(indices) = indices { + // TODO use `content-regex` to pick one + for (repo_index, conf_index) in indices { + 
return self.load_language_from_repo(*repo_index, *conf_index); + } + } + Ok(None) + } + + fn load_language_from_repo( + &mut self, + repo_index: usize, + conf_index: usize, + ) -> io::Result> { + let repo = &self.language_repos[repo_index]; + let language = if let Some(language) = repo.language { + language + } else { + let language = self.load_language_at_path(&repo.name, &repo.path)?; + self.language_repos[repo_index].language = Some(language); + language + }; + if let Some(configuration) = self.language_repos[repo_index] + .configurations + .get(conf_index) + { + Ok(Some((language, configuration))) + } else { + Ok(None) + } + } + + fn load_language_at_path(&self, name: &str, language_path: &Path) -> io::Result { + let parser_c_path = language_path.join(PARSER_C_PATH); + let mut library_path = self.parser_lib_path.join(name); + library_path.set_extension(DYLIB_EXTENSION); + + if !library_path.exists() || was_modified_more_recently(&parser_c_path, &library_path)? { + let compiler_name = std::env::var("CXX").unwrap_or("c++".to_owned()); + let mut command = Command::new(compiler_name); + command + .arg("-shared") + .arg("-fPIC") + .arg("-I") + .arg(language_path.join("src")) + .arg("-o") + .arg(&library_path) + .arg("-xc") + .arg(parser_c_path); + let scanner_c_path = language_path.join(SCANNER_C_PATH); + let scanner_cc_path = language_path.join(SCANNER_CC_PATH); + if scanner_c_path.exists() { + command.arg("-xc").arg(scanner_c_path); + } else if scanner_cc_path.exists() { + command.arg("-xc++").arg(scanner_cc_path); + } + command.output()?; + } + + let library = Library::new(library_path)?; + let language_fn_name = format!("tree_sitter_{}", name); + let language = unsafe { + let language_fn: Symbol Language> = + library.get(language_fn_name.as_bytes())?; + language_fn() + }; + mem::forget(library); + Ok(language) + } + + fn load_language_configurations<'a>(&'a mut self, parser_path: &Path) -> io::Result { + let name = parser_path + .file_name() + .unwrap() + 
.to_str() + .unwrap() + .split_at("tree-sitter-".len()) + .1; + + #[derive(Deserialize)] + struct LanguageConfigurationJSON { + name: String, + #[serde(rename = "file-types")] + file_types: Option>, + #[serde(rename = "content-regex")] + content_regex: Option, + #[serde(rename = "first-line-regex")] + first_line_regex: Option, + highlights: Option, + } + + #[derive(Deserialize)] + struct PackageJSON { + #[serde(rename = "tree-sitter")] + tree_sitter: Option>, + } + + let package_json_contents = fs::read_to_string(&parser_path.join(PACKAGE_JSON_PATH))?; + let package_json: PackageJSON = serde_json::from_str(&package_json_contents)?; + let configurations = package_json + .tree_sitter + .map_or(Vec::new(), |configurations| { + configurations + .into_iter() + .map(|conf| LanguageConfiguration { + name: conf.name, + file_types: conf.file_types.unwrap_or(Vec::new()), + content_regex: conf + .content_regex + .and_then(|r| RegexBuilder::new(&r).multi_line(true).build().ok()), + first_line_regex: conf + .first_line_regex + .and_then(|r| RegexBuilder::new(&r).multi_line(true).build().ok()), + highlight_property_sheet: conf.highlights.map(|d| Err(d.into())), + }) + .collect() + }); + + for (i, configuration) in configurations.iter().enumerate() { + for file_type in &configuration.file_types { + self.language_configuration_indices_by_file_type + .entry(file_type.to_string()) + .or_insert(Vec::new()) + .push((self.language_repos.len(), i)); + } + } + + self.language_repos.push(LanguageRepo { + name: name.to_string(), + path: parser_path.to_owned(), + language: None, + configurations, + }); + + Ok(self.language_repos.len() - 1) + } +} + +fn was_modified_more_recently(a: &Path, b: &Path) -> io::Result { + Ok(fs::metadata(a)?.modified()? > fs::metadata(b)?.modified()?) 
+} diff --git a/cli/src/main.rs b/cli/src/main.rs index fe6ffd8c..87f9e26d 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -5,14 +5,20 @@ extern crate log; #[macro_use] extern crate serde_derive; extern crate hashbrown; +extern crate regex; extern crate serde_json; mod error; mod generate; +mod loader; mod logger; +mod parse; +mod test; +use self::loader::Loader; use clap::{App, Arg, SubCommand}; use std::env; +use std::path::Path; use std::process::exit; use std::usize; @@ -44,15 +50,13 @@ fn run() -> error::Result<()> { .about("Parse a file") .arg(Arg::with_name("path").index(1)), ) - .subcommand( - SubCommand::with_name("test") - .about("Run a parser's tests") - .arg(Arg::with_name("path").index(1).required(true)) - .arg(Arg::with_name("line").index(2).required(true)) - .arg(Arg::with_name("column").index(3).required(true)), - ) + .subcommand(SubCommand::with_name("test").about("Run a parser's tests")) .get_matches(); + let home_dir = dirs::home_dir().unwrap(); + let current_dir = env::current_dir().unwrap(); + let mut loader = Loader::new(home_dir.join(".tree-sitter")); + if let Some(matches) = matches.subcommand_matches("generate") { if matches.is_present("log") { logger::init(); @@ -65,11 +69,23 @@ fn run() -> error::Result<()> { ids.filter_map(|id| usize::from_str_radix(id, 10).ok()) .collect() }); - let mut grammar_path = env::current_dir().expect("Failed to read CWD"); - grammar_path.push("grammar.js"); + let grammar_path = current_dir.join("grammar.js"); let code = generate::generate_parser_for_grammar(&grammar_path, minimize, state_ids_to_log)?; println!("{}", code); + return Ok(()); + } + + if let Some(_matches) = matches.subcommand_matches("test") { + let corpus_path = current_dir.join("corpus"); + let home_dir = dirs::home_dir().unwrap(); + let mut loader = Loader::new(home_dir.join(".tree-sitter")); + if let Some((language, _)) = loader.language_configuration_at_path(¤t_dir)? 
{ + test::run_tests_at_path(language, &corpus_path)?; + } else { + eprintln!("No language found"); + } + } } Ok(()) diff --git a/cli/src/test.rs b/cli/src/test.rs new file mode 100644 index 00000000..7ef63bb7 --- /dev/null +++ b/cli/src/test.rs @@ -0,0 +1,212 @@ +use super::error::Result; +use ansi_term::Colour; +use regex::bytes::{Regex as ByteRegex, RegexBuilder as ByteRegexBuilder}; +use regex::Regex; +use std::char; +use std::fs; +use std::io; +use std::path::Path; +use std::str; +use tree_sitter::{Language, Parser}; + +lazy_static! { + static ref HEADER_REGEX: ByteRegex = ByteRegexBuilder::new(r"^===+\r?\n([^=]*)\r?\n===+\r?\n") + .multi_line(true) + .build() + .unwrap(); + static ref DIVIDER_REGEX: ByteRegex = ByteRegexBuilder::new(r"^---+\r?\n") + .multi_line(true) + .build() + .unwrap(); + static ref WHITESPACE_REGEX: Regex = Regex::new(r"\s+").unwrap(); +} + +#[derive(Debug, PartialEq, Eq)] +pub enum TestEntry { + Group { + name: String, + children: Vec, + }, + Example { + name: String, + input: Vec, + output: String, + }, +} + +pub fn run_tests_at_path(language: Language, path: &Path) -> Result<()> { + let test_entry = parse_tests(path)?; + let mut parser = Parser::new(); + parser.set_language(language)?; + + let mut failures = Vec::new(); + if let TestEntry::Group { children, .. 
} = test_entry { + for child in children { + run_tests(&mut parser, child, 0, &mut failures)?; + } + } + + if failures.len() > 0 { + println!(""); + + if failures.len() == 1 { + println!("1 failure:") + } else { + println!("{} failures:", failures.len()) + } + + for (name, actual, expected) in failures { + println!("\n {}:", name); + println!(" Expected: {}", expected); + println!(" Actual: {}", actual); + } + } + + Ok(()) +} + +fn run_tests( + parser: &mut Parser, + test_entry: TestEntry, + mut indent_level: i32, + failures: &mut Vec<(String, String, String)>, +) -> Result<()> { + for _ in 0..indent_level { + print!(" "); + } + match test_entry { + TestEntry::Example { + name, + input, + output, + } => { + let tree = parser + .parse_utf8(&mut |byte_offset, _| &input[byte_offset..], None) + .unwrap(); + let actual = tree.root_node().to_sexp(); + if actual == output { + println!("✓ {}", Colour::Green.paint(&name)); + } else { + println!("✗ {}", Colour::Red.paint(&name)); + failures.push((name, actual, output)); + } + } + TestEntry::Group { name, children } => { + println!("{}:", name); + indent_level += 1; + for child in children { + run_tests(parser, child, indent_level, failures)?; + } + } + } + Ok(()) +} + +pub fn parse_tests(path: &Path) -> io::Result { + let name = path + .file_name() + .and_then(|s| s.to_str()) + .unwrap_or("") + .to_string(); + if path.is_dir() { + let mut children = Vec::new(); + for entry in fs::read_dir(path)? 
{ + let entry = entry?; + children.push(parse_tests(&entry.path())?); + } + Ok(TestEntry::Group { name, children }) + } else { + let content = fs::read_to_string(path)?; + Ok(parse_test_content(name, content)) + } +} + +fn parse_test_content(name: String, content: String) -> TestEntry { + let mut children = Vec::new(); + let bytes = content.as_bytes(); + let mut previous_name = String::new(); + let mut previous_header_end = 0; + for header_match in HEADER_REGEX + .find_iter(&bytes) + .map(|m| (m.start(), m.end())) + .chain(Some((bytes.len(), bytes.len()))) + { + let (header_start, header_end) = header_match; + if previous_header_end > 0 { + if let Some(divider_match) = + DIVIDER_REGEX.find(&bytes[previous_header_end..header_start]) + { + let (divider_start, divider_end) = ( + previous_header_end + divider_match.start(), + previous_header_end + divider_match.end(), + ); + if let Ok(output) = str::from_utf8(&bytes[divider_end..header_start]) { + let input = bytes[previous_header_end..divider_start].to_vec(); + let output = WHITESPACE_REGEX.replace_all(output.trim(), " ").to_string(); + children.push(TestEntry::Example { + name: previous_name, + input, + output, + }); + } + } + } + previous_name = String::from_utf8_lossy(&bytes[header_start..header_end]) + .trim_matches(|c| char::is_whitespace(c) || c == '=') + .to_string(); + previous_header_end = header_end; + } + TestEntry::Group { name, children } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_parse_test_content() { + let entry = parse_test_content( + "the-filename".to_string(), + r#" +=============== +The first test +=============== + +a b c + +--- + +(a + (b c)) + +================ +The second test +================ +d +--- +(d) + "# + .trim() + .to_string(), + ); + + assert_eq!( + entry, + TestEntry::Group { + name: "the-filename".to_string(), + children: vec![ + TestEntry::Example { + name: "The first test".to_string(), + input: "\na b c\n\n".as_bytes().to_vec(), + output: "(a (b 
c))".to_string(), + }, + TestEntry::Example { + name: "The second test".to_string(), + input: "d\n".as_bytes().to_vec(), + output: "(d)".to_string(), + }, + ] + } + ); + } +} From 20fcffb393fdeac8f09ec42b7d6433dee68b4aaf Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Mon, 7 Jan 2019 17:57:36 -0800 Subject: [PATCH 108/208] Add parse subcommand Co-Authored-By: Timothy Clem --- cli/src/main.rs | 9 +++++++ cli/src/parse.rs | 69 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 78 insertions(+) create mode 100644 cli/src/parse.rs diff --git a/cli/src/main.rs b/cli/src/main.rs index 87f9e26d..6a0cf9ec 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -86,6 +86,15 @@ fn run() -> error::Result<()> { eprintln!("No language found"); } } + + if let Some(matches) = matches.subcommand_matches("parse") { + loader.find_parsers(&vec![home_dir.join("github")])?; + let source_path = Path::new(matches.value_of("path").unwrap()); + if let Some((language, _)) = loader.language_for_file_name(source_path)? 
{ + parse::parse_file_at_path(language, source_path)?; + } else { + eprintln!("No language found"); + } } Ok(()) diff --git a/cli/src/parse.rs b/cli/src/parse.rs new file mode 100644 index 00000000..26fe9b9a --- /dev/null +++ b/cli/src/parse.rs @@ -0,0 +1,69 @@ +use super::error::Result; +use std::fs; +use std::path::Path; +use tree_sitter::{Language, Parser}; +use std::io::{self, Write}; + +pub fn parse_file_at_path(language: Language, path: &Path) -> Result<()> { + let mut parser = Parser::new(); + parser.set_language(language)?; + let source_code = fs::read_to_string(path)?; + let tree = parser + .parse_str(&source_code, None) + .expect("Incompatible language version"); + + let stdout = io::stdout(); + let mut stdout = stdout.lock(); + let mut cursor = tree.walk(); + let mut needs_newline = false; + let mut indent_level = 0; + let mut did_visit_children = false; + loop { + let node = cursor.node(); + let is_named = node.is_named(); + if did_visit_children { + if is_named { + stdout.write(b")")?; + needs_newline = true; + } + if cursor.goto_next_sibling() { + did_visit_children = false; + } else if cursor.goto_parent() { + did_visit_children = true; + indent_level -= 1; + } else { + break; + } + } else { + if is_named { + if needs_newline { + stdout.write(b"\n")?; + } + for _ in 0..indent_level { + stdout.write(b" ")?; + } + let start = node.start_position(); + let end = node.end_position(); + write!( + &mut stdout, + "({} [{}, {}] - [{}, {}]", + node.kind(), + start.row, + start.column, + end.row, + end.column + )?; + needs_newline = true; + } + if cursor.goto_first_child() { + did_visit_children = false; + indent_level += 1; + } else { + did_visit_children = true; + } + } + } + + println!(""); + Ok(()) +} From 6c4d00aad58c72015941ab86c042b3db1d3f7df9 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Mon, 7 Jan 2019 22:01:40 -0800 Subject: [PATCH 109/208] Print diffs when tests fail --- Cargo.lock | 7 +++++++ cli/Cargo.toml | 1 + cli/src/main.rs | 8 ++------ 
cli/src/test.rs | 29 +++++++++++++++++++++++++---- 4 files changed, 35 insertions(+), 10 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 7a249312..db86e43b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -156,6 +156,11 @@ dependencies = [ "cfg-if 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "difference" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" + [[package]] name = "dirs" version = "1.0.4" @@ -643,6 +648,7 @@ version = "0.1.0" dependencies = [ "ansi_term 0.11.0 (registry+https://github.com/rust-lang/crates.io-index)", "clap 2.32.0 (registry+https://github.com/rust-lang/crates.io-index)", + "difference 2.0.0 (registry+https://github.com/rust-lang/crates.io-index)", "dirs 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)", "hashbrown 0.1.7 (registry+https://github.com/rust-lang/crates.io-index)", "ignore 0.4.4 (registry+https://github.com/rust-lang/crates.io-index)", @@ -764,6 +770,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" "checksum crossbeam-epoch 0.6.1 (registry+https://github.com/rust-lang/crates.io-index)" = "2449aaa4ec7ef96e5fb24db16024b935df718e9ae1cec0a1e68feeca2efca7b8" "checksum crossbeam-utils 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)" = "677d453a17e8bd2b913fa38e8b9cf04bcdbb5be790aa294f2389661d72036015" "checksum crossbeam-utils 0.6.1 (registry+https://github.com/rust-lang/crates.io-index)" = "c55913cc2799171a550e307918c0a360e8c16004820291bf3b638969b4a01816" +"checksum difference 2.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "524cbf6897b527295dff137cec09ecf3a05f4fddffd7dfcd1585403449e74198" "checksum dirs 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)" = "88972de891f6118092b643d85a0b28e0678e0f948d7f879aa32f2d5aafe97d2a" "checksum failure 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)" = "6dd377bcc1b1b7ce911967e3ec24fa19c3224394ec05b54aa7b083d498341ac7" "checksum 
failure_derive 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)" = "64c2d913fe8ed3b6c6518eedf4538255b989945c14c2a7d5cbff62a5e2120596" diff --git a/cli/Cargo.toml b/cli/Cargo.toml index 200fd2f1..a2f546c4 100644 --- a/cli/Cargo.toml +++ b/cli/Cargo.toml @@ -10,6 +10,7 @@ path = "src/main.rs" [dependencies] ansi_term = "0.11" +difference = "2.0" lazy_static = "1.2.0" smallbitvec = "2.3.0" clap = "2.32" diff --git a/cli/src/main.rs b/cli/src/main.rs index 6a0cf9ec..626a729c 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -74,9 +74,7 @@ fn run() -> error::Result<()> { generate::generate_parser_for_grammar(&grammar_path, minimize, state_ids_to_log)?; println!("{}", code); return Ok(()); - } - - if let Some(_matches) = matches.subcommand_matches("test") { + } else if let Some(_matches) = matches.subcommand_matches("test") { let corpus_path = current_dir.join("corpus"); let home_dir = dirs::home_dir().unwrap(); let mut loader = Loader::new(home_dir.join(".tree-sitter")); @@ -85,9 +83,7 @@ fn run() -> error::Result<()> { } else { eprintln!("No language found"); } - } - - if let Some(matches) = matches.subcommand_matches("parse") { + } else if let Some(matches) = matches.subcommand_matches("parse") { loader.find_parsers(&vec![home_dir.join("github")])?; let source_path = Path::new(matches.value_of("path").unwrap()); if let Some((language, _)) = loader.language_for_file_name(source_path)? 
{ diff --git a/cli/src/test.rs b/cli/src/test.rs index 7ef63bb7..a693576d 100644 --- a/cli/src/test.rs +++ b/cli/src/test.rs @@ -1,5 +1,6 @@ use super::error::Result; use ansi_term::Colour; +use difference::{Changeset, Difference}; use regex::bytes::{Regex as ByteRegex, RegexBuilder as ByteRegexBuilder}; use regex::Regex; use std::char; @@ -55,10 +56,30 @@ pub fn run_tests_at_path(language: Language, path: &Path) -> Result<()> { println!("{} failures:", failures.len()) } - for (name, actual, expected) in failures { - println!("\n {}:", name); - println!(" Expected: {}", expected); - println!(" Actual: {}", actual); + println!( + "\n{} / {}", + Colour::Green.paint("expected"), + Colour::Red.paint("actual") + ); + + for (i, (name, actual, expected)) in failures.iter().enumerate() { + println!("\n {}. {}:", i + 1, name); + let changeset = Changeset::new(actual, expected, " "); + print!(" "); + for diff in &changeset.diffs { + match diff { + Difference::Same(part) => { + print!("{}{}", part, changeset.split); + } + Difference::Add(part) => { + print!("{}{}", Colour::Green.paint(part), changeset.split); + } + Difference::Rem(part) => { + print!("{}{}", Colour::Red.paint(part), changeset.split); + } + } + } + println!(""); } } From 98807d205317e0e5ef7512827657086608adcd35 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Tue, 8 Jan 2019 21:03:51 -0800 Subject: [PATCH 110/208] Add debug and debug-graph flags to parse and test commands --- cli/src/main.rs | 31 ++++++++++++++----- cli/src/parse.rs | 30 +++++++++++++++++-- cli/src/test.rs | 49 ++++++++++++++++++++++++++----- cli/src/util.rs | 36 +++++++++++++++++++++++ lib/binding/bindings.rs | 31 +++++++++++++------ lib/binding/ffi.rs | 5 ++++ lib/binding/lib.rs | 14 +++++++++ lib/include/tree_sitter/runtime.h | 2 +- lib/src/parser.c | 12 ++++++-- script/bindgen.sh | 4 +-- 10 files changed, 182 insertions(+), 32 deletions(-) create mode 100644 cli/src/util.rs diff --git a/cli/src/main.rs b/cli/src/main.rs index 
626a729c..8dbf345a 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -14,6 +14,7 @@ mod loader; mod logger; mod parse; mod test; +mod util; use self::loader::Loader; use clap::{App, Arg, SubCommand}; @@ -48,9 +49,22 @@ fn run() -> error::Result<()> { .subcommand( SubCommand::with_name("parse") .about("Parse a file") - .arg(Arg::with_name("path").index(1)), + .arg(Arg::with_name("path").index(1).required(true)) + .arg(Arg::with_name("debug").long("debug").short("d")) + .arg(Arg::with_name("debug-graph").long("debug-graph").short("D")), + ) + .subcommand( + SubCommand::with_name("test") + .about("Run a parser's tests") + .arg( + Arg::with_name("filter") + .long("filter") + .short("f") + .takes_value(true), + ) + .arg(Arg::with_name("debug").long("debug").short("d")) + .arg(Arg::with_name("debug-graph").long("debug-graph").short("D")), ) - .subcommand(SubCommand::with_name("test").about("Run a parser's tests")) .get_matches(); let home_dir = dirs::home_dir().unwrap(); @@ -74,20 +88,23 @@ fn run() -> error::Result<()> { generate::generate_parser_for_grammar(&grammar_path, minimize, state_ids_to_log)?; println!("{}", code); return Ok(()); - } else if let Some(_matches) = matches.subcommand_matches("test") { + } else if let Some(matches) = matches.subcommand_matches("test") { + let debug = matches.is_present("debug"); + let debug_graph = matches.is_present("debug-graph"); + let filter = matches.value_of("filter"); let corpus_path = current_dir.join("corpus"); - let home_dir = dirs::home_dir().unwrap(); - let mut loader = Loader::new(home_dir.join(".tree-sitter")); if let Some((language, _)) = loader.language_configuration_at_path(¤t_dir)? 
{ - test::run_tests_at_path(language, &corpus_path)?; + test::run_tests_at_path(language, &corpus_path, debug, debug_graph, filter)?; } else { eprintln!("No language found"); } } else if let Some(matches) = matches.subcommand_matches("parse") { + let debug = matches.is_present("debug"); + let debug_graph = matches.is_present("debug-graph"); loader.find_parsers(&vec![home_dir.join("github")])?; let source_path = Path::new(matches.value_of("path").unwrap()); if let Some((language, _)) = loader.language_for_file_name(source_path)? { - parse::parse_file_at_path(language, source_path)?; + parse::parse_file_at_path(language, source_path, debug, debug_graph)?; } else { eprintln!("No language found"); } diff --git a/cli/src/parse.rs b/cli/src/parse.rs index 26fe9b9a..fde148b6 100644 --- a/cli/src/parse.rs +++ b/cli/src/parse.rs @@ -1,17 +1,41 @@ use super::error::Result; +use super::util; use std::fs; -use std::path::Path; -use tree_sitter::{Language, Parser}; use std::io::{self, Write}; +use std::path::Path; +use tree_sitter::{Language, LogType, Parser}; -pub fn parse_file_at_path(language: Language, path: &Path) -> Result<()> { +pub fn parse_file_at_path( + language: Language, + path: &Path, + debug: bool, + debug_graph: bool, +) -> Result<()> { let mut parser = Parser::new(); parser.set_language(language)?; let source_code = fs::read_to_string(path)?; + + let mut log_session = None; + + if debug_graph { + log_session = Some(util::start_logging_graphs(&mut parser, "log.html")?); + } else if debug { + parser.set_logger(Some(Box::new(|log_type, message| { + if log_type == LogType::Lex { + io::stderr().write(b" ").unwrap(); + } + write!(&mut io::stderr(), "{}\n", message).unwrap(); + }))); + } + let tree = parser .parse_str(&source_code, None) .expect("Incompatible language version"); + if let Some(log_session) = log_session { + util::stop_logging_graphs(&mut parser, log_session)?; + } + let stdout = io::stdout(); let mut stdout = stdout.lock(); let mut cursor = 
tree.walk(); diff --git a/cli/src/test.rs b/cli/src/test.rs index a693576d..790e9ec7 100644 --- a/cli/src/test.rs +++ b/cli/src/test.rs @@ -1,14 +1,15 @@ use super::error::Result; +use super::util; use ansi_term::Colour; use difference::{Changeset, Difference}; use regex::bytes::{Regex as ByteRegex, RegexBuilder as ByteRegexBuilder}; use regex::Regex; use std::char; use std::fs; -use std::io; +use std::io::{self, Write}; use std::path::Path; use std::str; -use tree_sitter::{Language, Parser}; +use tree_sitter::{Language, LogType, Parser}; lazy_static! { static ref HEADER_REGEX: ByteRegex = ByteRegexBuilder::new(r"^===+\r?\n([^=]*)\r?\n===+\r?\n") @@ -35,15 +36,34 @@ pub enum TestEntry { }, } -pub fn run_tests_at_path(language: Language, path: &Path) -> Result<()> { +pub fn run_tests_at_path( + language: Language, + path: &Path, + debug: bool, + debug_graph: bool, + filter: Option<&str>, +) -> Result<()> { let test_entry = parse_tests(path)?; let mut parser = Parser::new(); parser.set_language(language)?; + let mut log_session = None; + + if debug_graph { + log_session = Some(util::start_logging_graphs(&mut parser, "log.html")?); + } else if debug { + parser.set_logger(Some(Box::new(|log_type, message| { + if log_type == LogType::Lex { + io::stderr().write(b" ").unwrap(); + } + write!(&mut io::stderr(), "{}\n", message).unwrap(); + }))); + } + let mut failures = Vec::new(); if let TestEntry::Group { children, .. 
} = test_entry { for child in children { - run_tests(&mut parser, child, 0, &mut failures)?; + run_tests(&mut parser, child, filter, 0, &mut failures)?; } } @@ -83,28 +103,38 @@ pub fn run_tests_at_path(language: Language, path: &Path) -> Result<()> { } } + if let Some(log_session) = log_session { + util::stop_logging_graphs(&mut parser, log_session)?; + } + Ok(()) } fn run_tests( parser: &mut Parser, test_entry: TestEntry, + filter: Option<&str>, mut indent_level: i32, failures: &mut Vec<(String, String, String)>, ) -> Result<()> { - for _ in 0..indent_level { - print!(" "); - } match test_entry { TestEntry::Example { name, input, output, } => { + if let Some(filter) = filter { + if !name.contains(filter) { + return Ok(()); + } + } let tree = parser .parse_utf8(&mut |byte_offset, _| &input[byte_offset..], None) .unwrap(); let actual = tree.root_node().to_sexp(); + for _ in 0..indent_level { + print!(" "); + } if actual == output { println!("✓ {}", Colour::Green.paint(&name)); } else { @@ -113,10 +143,13 @@ fn run_tests( } } TestEntry::Group { name, children } => { + for _ in 0..indent_level { + print!(" "); + } println!("{}:", name); indent_level += 1; for child in children { - run_tests(parser, child, indent_level, failures)?; + run_tests(parser, child, filter, indent_level, failures)?; } } } diff --git a/cli/src/util.rs b/cli/src/util.rs new file mode 100644 index 00000000..d7d8572e --- /dev/null +++ b/cli/src/util.rs @@ -0,0 +1,36 @@ +use std::fs::File; +use std::io::{Result, Write}; +use std::process::{Child, ChildStdin, Command, Stdio}; +use std::str; +use tree_sitter::Parser; + +pub(crate) struct LogSession(Child, ChildStdin); + +pub(crate) fn start_logging_graphs(parser: &mut Parser, path: &str) -> Result { + let mut dot_file = File::create(path)?; + dot_file.write(b"\n\n\n")?; + let mut dot_process = Command::new("dot") + .arg("-Tsvg") + .stdin(Stdio::piped()) + .stdout(dot_file) + .spawn() + .expect("Failed to run Dot"); + let dot_stdin = dot_process + 
.stdin + .take() + .expect("Failed to open stdin for Dot"); + parser.print_dot_graphs(&dot_stdin); + Ok(LogSession(dot_process, dot_stdin)) +} + +pub(crate) fn stop_logging_graphs(parser: &mut Parser, mut session: LogSession) -> Result<()> { + drop(session.1); + parser.stop_printing_dot_graphs(); + session.0.wait()?; + + if cfg!(target_os = "macos") { + Command::new("open").arg("log.html").output()?; + } + + Ok(()) +} diff --git a/lib/binding/bindings.rs b/lib/binding/bindings.rs index 58d0e510..9d1f3490 100644 --- a/lib/binding/bindings.rs +++ b/lib/binding/bindings.rs @@ -1,5 +1,6 @@ /* automatically generated by rust-bindgen */ +pub type __darwin_size_t = ::std::os::raw::c_ulong; pub type FILE = [u64; 19usize]; pub type TSSymbol = u16; #[repr(C)] @@ -87,9 +88,9 @@ pub struct TSNode { #[repr(C)] #[derive(Debug, Copy, Clone)] pub struct TSTreeCursor { - pub context: [u32; 2usize], - pub id: *const ::std::os::raw::c_void, pub tree: *const ::std::os::raw::c_void, + pub id: *const ::std::os::raw::c_void, + pub context: [u32; 2usize], } extern "C" { pub fn ts_parser_new() -> *mut TSParser; @@ -110,7 +111,7 @@ extern "C" { pub fn ts_parser_set_logger(arg1: *mut TSParser, arg2: TSLogger); } extern "C" { - pub fn ts_parser_print_dot_graphs(arg1: *mut TSParser, arg2: *mut FILE); + pub fn ts_parser_print_dot_graphs(arg1: *mut TSParser, arg2: ::std::os::raw::c_int); } extern "C" { pub fn ts_parser_halt_on_error(arg1: *mut TSParser, arg2: bool); @@ -126,6 +127,15 @@ extern "C" { arg4: u32, ) -> *mut TSTree; } +extern "C" { + pub fn ts_parser_parse_string_encoding( + arg1: *mut TSParser, + arg2: *const TSTree, + arg3: *const ::std::os::raw::c_char, + arg4: u32, + arg5: TSInputEncoding, + ) -> *mut TSTree; +} extern "C" { pub fn ts_parser_enabled(arg1: *const TSParser) -> bool; } @@ -271,19 +281,22 @@ extern "C" { pub fn ts_tree_cursor_delete(arg1: *mut TSTreeCursor); } extern "C" { - pub fn ts_tree_cursor_goto_first_child(arg1: *mut TSTreeCursor) -> bool; + pub fn 
ts_tree_cursor_reset(arg1: *mut TSTreeCursor, arg2: TSNode); } extern "C" { - pub fn ts_tree_cursor_goto_first_child_for_byte(arg1: *mut TSTreeCursor, arg2: u32) -> i64; -} -extern "C" { - pub fn ts_tree_cursor_goto_next_sibling(arg1: *mut TSTreeCursor) -> bool; + pub fn ts_tree_cursor_current_node(arg1: *const TSTreeCursor) -> TSNode; } extern "C" { pub fn ts_tree_cursor_goto_parent(arg1: *mut TSTreeCursor) -> bool; } extern "C" { - pub fn ts_tree_cursor_current_node(arg1: *const TSTreeCursor) -> TSNode; + pub fn ts_tree_cursor_goto_next_sibling(arg1: *mut TSTreeCursor) -> bool; +} +extern "C" { + pub fn ts_tree_cursor_goto_first_child(arg1: *mut TSTreeCursor) -> bool; +} +extern "C" { + pub fn ts_tree_cursor_goto_first_child_for_byte(arg1: *mut TSTreeCursor, arg2: u32) -> i64; } extern "C" { pub fn ts_language_symbol_count(arg1: *const TSLanguage) -> u32; diff --git a/lib/binding/ffi.rs b/lib/binding/ffi.rs index 323609e0..685ed765 100644 --- a/lib/binding/ffi.rs +++ b/lib/binding/ffi.rs @@ -1,4 +1,9 @@ #![allow(dead_code)] #![allow(non_upper_case_globals)] +#![allow(non_camel_case_types)] include!("./bindings.rs"); + +extern "C" { + pub(crate) fn dup(fd: std::os::raw::c_int) -> std::os::raw::c_int; +} diff --git a/lib/binding/lib.rs b/lib/binding/lib.rs index 65a57d16..ae3c979c 100644 --- a/lib/binding/lib.rs +++ b/lib/binding/lib.rs @@ -6,6 +6,9 @@ extern crate regex; extern crate serde; extern crate serde_json; +#[cfg(unix)] +use std::os::unix::io::AsRawFd; + use regex::Regex; use serde::de::DeserializeOwned; use std::collections::HashMap; @@ -185,6 +188,17 @@ impl Parser { unsafe { ffi::ts_parser_set_logger(self.0, c_logger) }; } + #[cfg(unix)] + pub fn print_dot_graphs(&mut self, file: & impl AsRawFd) { + let fd = file.as_raw_fd(); + unsafe { ffi::ts_parser_print_dot_graphs(self.0, ffi::dup(fd)) } + } + + #[cfg(unix)] + pub fn stop_printing_dot_graphs(&mut self) { + unsafe { ffi::ts_parser_print_dot_graphs(self.0, -1) } + } + pub fn parse_str(&mut self, 
input: &str, old_tree: Option<&Tree>) -> Option { let bytes = input.as_bytes(); self.parse_utf8( diff --git a/lib/include/tree_sitter/runtime.h b/lib/include/tree_sitter/runtime.h index f0442477..ab69a0b5 100644 --- a/lib/include/tree_sitter/runtime.h +++ b/lib/include/tree_sitter/runtime.h @@ -83,7 +83,7 @@ const TSLanguage *ts_parser_language(const TSParser *); bool ts_parser_set_language(TSParser *, const TSLanguage *); TSLogger ts_parser_logger(const TSParser *); void ts_parser_set_logger(TSParser *, TSLogger); -void ts_parser_print_dot_graphs(TSParser *, FILE *); +void ts_parser_print_dot_graphs(TSParser *, int); void ts_parser_halt_on_error(TSParser *, bool); TSTree *ts_parser_parse(TSParser *, const TSTree *, TSInput); TSTree *ts_parser_parse_string(TSParser *, const TSTree *, const char *, uint32_t); diff --git a/lib/src/parser.c b/lib/src/parser.c index ef7f612d..a33dbc6f 100644 --- a/lib/src/parser.c +++ b/lib/src/parser.c @@ -1542,8 +1542,16 @@ void ts_parser_set_logger(TSParser *self, TSLogger logger) { self->lexer.logger = logger; } -void ts_parser_print_dot_graphs(TSParser *self, FILE *file) { - self->dot_graph_file = file; +void ts_parser_print_dot_graphs(TSParser *self, int fd) { + if (self->dot_graph_file) { + fclose(self->dot_graph_file); + } + + if (fd >= 0) { + self->dot_graph_file = fdopen(fd, "a"); + } else { + self->dot_graph_file = NULL; + } } void ts_parser_halt_on_error(TSParser *self, bool should_halt_on_error) { diff --git a/script/bindgen.sh b/script/bindgen.sh index 699f0339..0a536d20 100755 --- a/script/bindgen.sh +++ b/script/bindgen.sh @@ -1,7 +1,7 @@ #!/bin/bash -output_path=src/bindings.rs -header_path='vendor/tree-sitter/include/tree_sitter/runtime.h' +output_path=lib/binding/bindings.rs +header_path='lib/include/tree_sitter/runtime.h' bindgen \ --no-layout-tests \ From 6972a8e3e8ac442a84b2ad48999d58d9b41d4c4c Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Wed, 9 Jan 2019 09:58:45 -0800 Subject: [PATCH 111/208] Add logging 
when deciding not to merge parse states --- cli/src/generate/build_tables/minimize_parse_table.rs | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/cli/src/generate/build_tables/minimize_parse_table.rs b/cli/src/generate/build_tables/minimize_parse_table.rs index 007c9703..a5cb5f81 100644 --- a/cli/src/generate/build_tables/minimize_parse_table.rs +++ b/cli/src/generate/build_tables/minimize_parse_table.rs @@ -1,6 +1,6 @@ use super::item::TokenSet; use super::token_conflicts::TokenConflictMap; -use crate::generate::grammars::{SyntaxGrammar, VariableType}; +use crate::generate::grammars::{LexicalGrammar, SyntaxGrammar, VariableType}; use crate::generate::rules::{AliasMap, Symbol}; use crate::generate::tables::{ParseAction, ParseState, ParseTable, ParseTableEntry}; use hashbrown::{HashMap, HashSet}; @@ -8,6 +8,7 @@ use hashbrown::{HashMap, HashSet}; pub(crate) fn minimize_parse_table( parse_table: &mut ParseTable, syntax_grammar: &SyntaxGrammar, + lexical_grammar: &LexicalGrammar, simple_aliases: &AliasMap, token_conflict_map: &TokenConflictMap, keywords: &TokenSet, @@ -15,6 +16,7 @@ pub(crate) fn minimize_parse_table( let mut minimizer = Minimizer { parse_table, syntax_grammar, + lexical_grammar, token_conflict_map, keywords, simple_aliases, @@ -27,6 +29,7 @@ pub(crate) fn minimize_parse_table( struct Minimizer<'a> { parse_table: &'a mut ParseTable, syntax_grammar: &'a SyntaxGrammar, + lexical_grammar: &'a LexicalGrammar, token_conflict_map: &'a TokenConflictMap<'a>, keywords: &'a TokenSet, simple_aliases: &'a AliasMap, @@ -237,6 +240,11 @@ impl<'a> Minimizer<'a> { .token_conflict_map .does_match_same_string(token.index, existing_token.index) { + info!( + "can't merge parse states because of conflict between {} and {}", + self.lexical_grammar.variables[token.index].name, + self.lexical_grammar.variables[existing_token.index].name + ); return false; } } From 2e8b2ab8fb988790dfa45e6c173b80647786c4fe Mon Sep 17 00:00:00 2001 From: Max 
Brunsfeld Date: Wed, 9 Jan 2019 09:59:46 -0800 Subject: [PATCH 112/208] Give strings more implicit precedence than immediate tokens --- cli/src/generate/build_tables/mod.rs | 5 ++++- cli/src/generate/prepare_grammar/expand_tokens.rs | 4 ++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/cli/src/generate/build_tables/mod.rs b/cli/src/generate/build_tables/mod.rs index 7d55d0fa..b8432fe5 100644 --- a/cli/src/generate/build_tables/mod.rs +++ b/cli/src/generate/build_tables/mod.rs @@ -43,12 +43,14 @@ pub(crate) fn build_tables( lexical_grammar, &coincident_token_index, &token_conflict_map, + &keywords, ); mark_fragile_tokens(&mut parse_table, lexical_grammar, &token_conflict_map); if minimize { minimize_parse_table( &mut parse_table, syntax_grammar, + lexical_grammar, simple_aliases, &token_conflict_map, &keywords, @@ -77,6 +79,7 @@ fn populate_error_state( lexical_grammar: &LexicalGrammar, coincident_token_index: &CoincidentTokenIndex, token_conflict_map: &TokenConflictMap, + keywords: &TokenSet, ) { let state = &mut parse_table.states[0]; let n = lexical_grammar.variables.len(); @@ -112,7 +115,7 @@ fn populate_error_state( // the *conflict-free tokens* identified above. 
for i in 0..n { let symbol = Symbol::terminal(i); - if !conflict_free_tokens.contains(&symbol) { + if !conflict_free_tokens.contains(&symbol) && !keywords.contains(&symbol) { if syntax_grammar.word_token != Some(symbol) { if let Some(t) = conflict_free_tokens.iter().find(|t| { !coincident_token_index.contains(symbol, *t) diff --git a/cli/src/generate/prepare_grammar/expand_tokens.rs b/cli/src/generate/prepare_grammar/expand_tokens.rs index d1922dc0..e269df6d 100644 --- a/cli/src/generate/prepare_grammar/expand_tokens.rs +++ b/cli/src/generate/prepare_grammar/expand_tokens.rs @@ -16,10 +16,10 @@ struct NfaBuilder { fn get_implicit_precedence(rule: &Rule) -> i32 { match rule { - Rule::String(_) => 1, + Rule::String(_) => 2, Rule::Metadata { rule, params } => { if params.is_main_token { - get_implicit_precedence(rule) + 2 + get_implicit_precedence(rule) + 1 } else { get_implicit_precedence(rule) } From c0fad8b3c4f477be3aa846b8d7570266ea8d17de Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Wed, 9 Jan 2019 14:43:49 -0800 Subject: [PATCH 113/208] Write parser.c in generate command --- cli/src/generate/mod.rs | 10 ++++++---- cli/src/main.rs | 6 +----- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/cli/src/generate/mod.rs b/cli/src/generate/mod.rs index 7dfe5a4b..366d2495 100644 --- a/cli/src/generate/mod.rs +++ b/cli/src/generate/mod.rs @@ -3,6 +3,7 @@ use self::parse_grammar::parse_grammar; use self::prepare_grammar::prepare_grammar; use self::render::render_c_code; use crate::error::Result; +use std::fs; use std::io::Write; use std::path::PathBuf; use std::process::{Command, Stdio}; @@ -17,11 +18,11 @@ mod rules; mod tables; pub fn generate_parser_for_grammar( - grammar_path: &PathBuf, + repo_path: &PathBuf, minimize: bool, state_ids_to_log: Vec, -) -> Result { - let grammar_json = load_js_grammar_file(grammar_path); +) -> Result<()> { + let grammar_json = load_js_grammar_file(&repo_path.join("grammar.js")); let input_grammar = 
parse_grammar(&grammar_json)?; let (syntax_grammar, lexical_grammar, inlines, simple_aliases) = prepare_grammar(&input_grammar)?; @@ -43,7 +44,8 @@ pub fn generate_parser_for_grammar( lexical_grammar, simple_aliases, ); - Ok(c_code) + fs::write(repo_path.join("src").join("parser.c"), c_code)?; + Ok(()) } fn load_js_grammar_file(grammar_path: &PathBuf) -> String { diff --git a/cli/src/main.rs b/cli/src/main.rs index 8dbf345a..604d3068 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -83,11 +83,7 @@ fn run() -> error::Result<()> { ids.filter_map(|id| usize::from_str_radix(id, 10).ok()) .collect() }); - let grammar_path = current_dir.join("grammar.js"); - let code = - generate::generate_parser_for_grammar(&grammar_path, minimize, state_ids_to_log)?; - println!("{}", code); - return Ok(()); + generate::generate_parser_for_grammar(¤t_dir, minimize, state_ids_to_log)?; } else if let Some(matches) = matches.subcommand_matches("test") { let debug = matches.is_present("debug"); let debug_graph = matches.is_present("debug-graph"); From 6bd550ca87f6ab4489e8ce1a6b46458ab6afbabe Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Wed, 9 Jan 2019 18:09:55 -0800 Subject: [PATCH 114/208] Start work on property sheet compilation --- Cargo.lock | 142 ++++++++++++++ cli/Cargo.toml | 1 + cli/src/error.rs | 6 + cli/src/generate/mod.rs | 51 ++--- cli/src/generate/properties.rs | 327 +++++++++++++++++++++++++++++++++ cli/src/main.rs | 5 +- lib/binding/lib.rs | 48 ++--- 7 files changed, 532 insertions(+), 48 deletions(-) create mode 100644 cli/src/generate/properties.rs diff --git a/Cargo.lock b/Cargo.lock index db86e43b..fa7712ba 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -41,6 +41,11 @@ dependencies = [ "winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "autocfg" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" + [[package]] name = "backtrace" version = "0.3.9" @@ -336,6 +341,36 @@ name = "nodrop" 
version = "0.1.13" source = "registry+https://github.com/rust-lang/crates.io-index" +[[package]] +name = "nom" +version = "4.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "memchr 2.1.1 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "num-integer" +version = "0.1.39" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "num-traits 0.2.6 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "num-rational" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "num-integer 0.1.39 (registry+https://github.com/rust-lang/crates.io-index)", + "num-traits 0.2.6 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "num-traits" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" + [[package]] name = "owning_ref" version = "0.4.0" @@ -408,6 +443,32 @@ dependencies = [ "winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "rand" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "autocfg 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.44 (registry+https://github.com/rust-lang/crates.io-index)", + "rand_chacha 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)", + "rand_core 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)", + "rand_hc 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)", + "rand_isaac 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)", + "rand_os 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)", + "rand_pcg 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)", + "rand_xorshift 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)", + "winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", +] + 
+[[package]] +name = "rand_chacha" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "autocfg 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)", + "rand_core 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "rand_core" version = "0.2.2" @@ -421,6 +482,60 @@ name = "rand_core" version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" +[[package]] +name = "rand_hc" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "rand_core 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "rand_isaac" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "rand_core 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "rand_os" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "cloudabi 0.0.3 (registry+https://github.com/rust-lang/crates.io-index)", + "fuchsia-zircon 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.44 (registry+https://github.com/rust-lang/crates.io-index)", + "rand_core 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)", + "rdrand 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)", + "winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "rand_pcg" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "rand_core 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)", + "rustc_version 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "rand_xorshift" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "rand_core 0.3.0 
(registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "rdrand" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "rand_core 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "redox_syscall" version = "0.1.43" @@ -465,6 +580,18 @@ dependencies = [ "ucd-util 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "rsass" +version = "0.9.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "lazy_static 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)", + "nom 4.1.1 (registry+https://github.com/rust-lang/crates.io-index)", + "num-rational 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)", + "num-traits 0.2.6 (registry+https://github.com/rust-lang/crates.io-index)", + "rand 0.6.4 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "rusqlite" version = "0.14.0" @@ -657,6 +784,7 @@ dependencies = [ "log 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)", "regex 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)", "regex-syntax 0.6.4 (registry+https://github.com/rust-lang/crates.io-index)", + "rsass 0.9.6 (registry+https://github.com/rust-lang/crates.io-index)", "rusqlite 0.14.0 (registry+https://github.com/rust-lang/crates.io-index)", "serde 1.0.80 (registry+https://github.com/rust-lang/crates.io-index)", "serde_derive 1.0.80 (registry+https://github.com/rust-lang/crates.io-index)", @@ -756,6 +884,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" "checksum argon2rs 0.2.5 (registry+https://github.com/rust-lang/crates.io-index)" = "3f67b0b6a86dae6e67ff4ca2b6201396074996379fba2b92ff649126f37cb392" "checksum arrayvec 0.4.8 (registry+https://github.com/rust-lang/crates.io-index)" = "f405cc4c21cd8b784f6c8fc2adf9bc00f59558f0049b5ec21517f875963040cc" "checksum atty 0.2.11 
(registry+https://github.com/rust-lang/crates.io-index)" = "9a7d5b8723950951411ee34d271d99dddcc2035a16ab25310ea2c8cfd4369652" +"checksum autocfg 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "4e5f34df7a019573fb8bdc7e24a2bfebe51a2a1d6bfdbaeccedb3c41fc574727" "checksum backtrace 0.3.9 (registry+https://github.com/rust-lang/crates.io-index)" = "89a47830402e9981c5c41223151efcced65a0510c13097c769cede7efb34782a" "checksum backtrace-sys 0.1.24 (registry+https://github.com/rust-lang/crates.io-index)" = "c66d56ac8dabd07f6aacdaf633f4b8262f5b3601a810a0dcddffd5c22c69daa0" "checksum bitflags 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)" = "228047a76f468627ca71776ecdebd732a3423081fcf5125585bcd7c49886ce12" @@ -793,6 +922,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" "checksum memchr 2.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "0a3eb002f0535929f1199681417029ebea04aadc0c7a4224b46be99c7f5d6a16" "checksum memoffset 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "0f9dc261e2b62d7a622bf416ea3c5245cdd5d9a7fcc428c0d06804dfce1775b3" "checksum nodrop 0.1.13 (registry+https://github.com/rust-lang/crates.io-index)" = "2f9667ddcc6cc8a43afc9b7917599d7216aa09c463919ea32c59ed6cac8bc945" +"checksum nom 4.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "9c349f68f25f596b9f44cf0e7c69752a5c633b0550c3ff849518bfba0233774a" +"checksum num-integer 0.1.39 (registry+https://github.com/rust-lang/crates.io-index)" = "e83d528d2677f0518c570baf2b7abdcf0cd2d248860b68507bdcb3e91d4c0cea" +"checksum num-rational 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "4e96f040177bb3da242b5b1ecf3f54b5d5af3efbbfb18608977a5d2767b22f10" +"checksum num-traits 0.2.6 (registry+https://github.com/rust-lang/crates.io-index)" = "0b3a5d7cc97d6d30d8b9bc8fa19bf45349ffe46241e8816f50f62f6d6aaabee1" "checksum owning_ref 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = 
"49a4b8ea2179e6a2e27411d3bca09ca6dd630821cf6894c6c7c8467a8ee7ef13" "checksum parking_lot 0.6.4 (registry+https://github.com/rust-lang/crates.io-index)" = "f0802bff09003b291ba756dc7e79313e51cc31667e94afbe847def490424cde5" "checksum parking_lot_core 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)" = "ad7f7e6ebdc79edff6fdcb87a55b620174f7a989e3eb31b65231f4af57f00b8c" @@ -801,13 +934,22 @@ source = "registry+https://github.com/rust-lang/crates.io-index" "checksum quote 0.6.10 (registry+https://github.com/rust-lang/crates.io-index)" = "53fa22a1994bd0f9372d7a816207d8a2677ad0325b073f5c5332760f0fb62b5c" "checksum rand 0.4.3 (registry+https://github.com/rust-lang/crates.io-index)" = "8356f47b32624fef5b3301c1be97e5944ecdd595409cc5da11d05f211db6cfbd" "checksum rand 0.5.5 (registry+https://github.com/rust-lang/crates.io-index)" = "e464cd887e869cddcae8792a4ee31d23c7edd516700695608f5b98c67ee0131c" +"checksum rand 0.6.4 (registry+https://github.com/rust-lang/crates.io-index)" = "3906503e80ac6cbcacb2c2973fa8e473f24d7e2747c8c92bb230c2441cad96b5" +"checksum rand_chacha 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "556d3a1ca6600bfcbab7c7c91ccb085ac7fbbcd70e008a98742e7847f4f7bcef" "checksum rand_core 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "1961a422c4d189dfb50ffa9320bf1f2a9bd54ecb92792fb9477f99a1045f3372" "checksum rand_core 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "0905b6b7079ec73b314d4c748701f6931eb79fd97c668caa3f1899b22b32c6db" +"checksum rand_hc 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "7b40677c7be09ae76218dc623efbf7b18e34bced3f38883af07bb75630a21bc4" +"checksum rand_isaac 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "ded997c9d5f13925be2a6fd7e66bf1872597f759fd9dd93513dd7e92e5a5ee08" +"checksum rand_os 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "f46fbd5550acf75b0c2730f5dd1873751daf9beb8f11b44027778fae50d7feca" +"checksum rand_pcg 0.1.1 
(registry+https://github.com/rust-lang/crates.io-index)" = "086bd09a33c7044e56bb44d5bdde5a60e7f119a9e95b0775f545de759a32fe05" +"checksum rand_xorshift 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "cbf7e9e623549b0e21f6e97cf8ecf247c1a8fd2e8a992ae265314300b2455d5c" +"checksum rdrand 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "678054eb77286b51581ba43620cc911abf02758c91f93f479767aed0f90458b2" "checksum redox_syscall 0.1.43 (registry+https://github.com/rust-lang/crates.io-index)" = "679da7508e9a6390aeaf7fbd02a800fdc64b73fe2204dd2c8ae66d22d9d5ad5d" "checksum redox_termios 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "7e891cfe48e9100a70a3b6eb652fef28920c117d366339687bd5576160db0f76" "checksum redox_users 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "214a97e49be64fd2c86f568dd0cb2c757d2cc53de95b273b6ad0a1c908482f26" "checksum regex 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "37e7cbbd370869ce2e8dff25c7018702d10b21a20ef7135316f8daecd6c25b7f" "checksum regex-syntax 0.6.4 (registry+https://github.com/rust-lang/crates.io-index)" = "4e47a2ed29da7a9e1960e1639e7a982e6edc6d49be308a3b02daf511504a16d1" +"checksum rsass 0.9.6 (registry+https://github.com/rust-lang/crates.io-index)" = "7a5dde55023a6c19470f7aeb59f75f897d8b80cbe00d61dfcaf7bbbe3de4c0a6" "checksum rusqlite 0.14.0 (registry+https://github.com/rust-lang/crates.io-index)" = "c9d9118f1ce84d8d0b67f9779936432fb42bb620cef2122409d786892cce9a3c" "checksum rustc-demangle 0.1.9 (registry+https://github.com/rust-lang/crates.io-index)" = "bcfe5b13211b4d78e5c2cadfebd7769197d95c639c35a50057eb4c05de811395" "checksum rustc_version 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)" = "138e3e0acb6c9fb258b19b67cb8abd63c00679d2851805ea151465464fe9030a" diff --git a/cli/Cargo.toml b/cli/Cargo.toml index a2f546c4..2eabd88f 100644 --- a/cli/Cargo.toml +++ b/cli/Cargo.toml @@ -23,6 +23,7 @@ serde = "1.0" serde_derive = "1.0" 
regex-syntax = "0.6.4" regex = "1" +rsass = "0.9" [dependencies.tree-sitter] path = "../lib" diff --git a/cli/src/error.rs b/cli/src/error.rs index 1b8b1a79..4769b481 100644 --- a/cli/src/error.rs +++ b/cli/src/error.rs @@ -31,6 +31,12 @@ impl From for Error { } } +impl From for Error { + fn from(error: rsass::Error) -> Self { + Error(error.to_string()) + } +} + impl From for Error { fn from(error: String) -> Self { Error(error) diff --git a/cli/src/generate/mod.rs b/cli/src/generate/mod.rs index 366d2495..0899d793 100644 --- a/cli/src/generate/mod.rs +++ b/cli/src/generate/mod.rs @@ -13,6 +13,7 @@ mod grammars; mod nfa; mod parse_grammar; mod prepare_grammar; +mod properties; mod render; mod rules; mod tables; @@ -21,30 +22,34 @@ pub fn generate_parser_for_grammar( repo_path: &PathBuf, minimize: bool, state_ids_to_log: Vec, + properties_only: bool, ) -> Result<()> { - let grammar_json = load_js_grammar_file(&repo_path.join("grammar.js")); - let input_grammar = parse_grammar(&grammar_json)?; - let (syntax_grammar, lexical_grammar, inlines, simple_aliases) = - prepare_grammar(&input_grammar)?; - let (parse_table, main_lex_table, keyword_lex_table, keyword_capture_token) = build_tables( - &syntax_grammar, - &lexical_grammar, - &simple_aliases, - &inlines, - minimize, - state_ids_to_log, - )?; - let c_code = render_c_code( - &input_grammar.name, - parse_table, - main_lex_table, - keyword_lex_table, - keyword_capture_token, - syntax_grammar, - lexical_grammar, - simple_aliases, - ); - fs::write(repo_path.join("src").join("parser.c"), c_code)?; + if !properties_only { + let grammar_json = load_js_grammar_file(&repo_path.join("grammar.js")); + let input_grammar = parse_grammar(&grammar_json)?; + let (syntax_grammar, lexical_grammar, inlines, simple_aliases) = + prepare_grammar(&input_grammar)?; + let (parse_table, main_lex_table, keyword_lex_table, keyword_capture_token) = build_tables( + &syntax_grammar, + &lexical_grammar, + &simple_aliases, + &inlines, + minimize, + 
state_ids_to_log, + )?; + let c_code = render_c_code( + &input_grammar.name, + parse_table, + main_lex_table, + keyword_lex_table, + keyword_capture_token, + syntax_grammar, + lexical_grammar, + simple_aliases, + ); + fs::write(repo_path.join("src").join("parser.c"), c_code)?; + } + properties::generate_property_sheets(repo_path)?; Ok(()) } diff --git a/cli/src/generate/properties.rs b/cli/src/generate/properties.rs new file mode 100644 index 00000000..c328526f --- /dev/null +++ b/cli/src/generate/properties.rs @@ -0,0 +1,327 @@ +use crate::error::{Error, Result}; +use rsass; +use rsass::sass::Value; +use std::collections::{BTreeMap, BTreeSet, HashMap, VecDeque}; +use std::fmt; +use std::fmt::Write; +use std::fs::{self, File}; +use std::hash::{Hash, Hasher}; +use std::path::{Path, PathBuf}; +use tree_sitter::{self, PropertyStateJSON, PropertyTransitionJSON}; + +#[derive(Debug, PartialEq, Eq, Hash, Serialize)] +#[serde(untagged)] +enum PropertyValue { + String(String), + Object(PropertySet), + Array(Vec), +} + +type PropertySet = BTreeMap; +type PropertySheetJSON = tree_sitter::PropertySheetJSON; +type StateId = u32; +type PropertySetId = u32; + +#[derive(Clone, PartialEq, Eq)] +struct SelectorStep { + kind: String, + is_named: bool, + is_immediate: bool, + child_index: Option, + text_pattern: Option, +} + +#[derive(PartialEq, Eq)] +struct Selector(Vec); + +#[derive(Debug, PartialEq, Eq)] +struct Rule { + selectors: Vec, + properties: PropertySet, +} + +#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] +struct PropertyItem { + rule_id: u32, + selector_id: u32, + step_id: u32, +} + +#[derive(PartialEq, Eq)] +struct PropertyItemSet(BTreeSet); + +#[derive(Debug, PartialEq, Eq, PartialOrd, Ord)] +struct SelectorMatch { + specificity: u32, + rule_id: u32, + selector_id: u32, +} + +struct Builder { + rules: Vec, + output: PropertySheetJSON, + ids_by_item_set: HashMap, + ids_by_property_set: HashMap, + item_set_queue: VecDeque<(PropertyItemSet, StateId)>, +} + +impl 
Builder { + fn new(rules: Vec) -> Self { + Builder { + rules, + output: PropertySheetJSON { + states: Vec::new(), + property_sets: Vec::new(), + }, + ids_by_item_set: HashMap::new(), + ids_by_property_set: HashMap::new(), + item_set_queue: VecDeque::new(), + } + } + + fn build(self) -> PropertySheetJSON { + let mut start_item_set = PropertyItemSet(BTreeSet::new()); + + self.output + } +} + +impl Hash for PropertyItemSet { + fn hash(&self, h: &mut H) { + h.write_usize(self.0.len()); + for entry in &self.0 { + entry.hash(h); + } + } +} + +impl fmt::Debug for SelectorStep { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "(")?; + if self.is_named { + write!(f, "{}", self.kind)?; + } else { + write!(f, "\"{}\"", self.kind)?; + } + if let Some(n) = self.child_index { + write!(f, ":nth-child({})", n)?; + } + if let Some(t) = &self.text_pattern { + write!(f, "[text='{}']", t)?; + } + write!(f, ")")?; + Ok(()) + } +} + +impl fmt::Debug for Selector { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "[")?; + for (i, step) in self.0.iter().enumerate() { + if step.is_immediate { + write!(f, " > ")?; + } else if i > 0 { + write!(f, " ")?; + } + write!(f, "{:?}", step)?; + } + write!(f, "]")?; + Ok(()) + } +} + +pub fn generate_property_sheets(repo_path: &Path) -> Result<()> { + let src_dir_path = repo_path.join("src"); + let properties_dir_path = repo_path.join("properties"); + + for entry in fs::read_dir(properties_dir_path)? 
{ + let property_sheet_css_path = entry?.path(); + let rules = parse_property_sheet(&property_sheet_css_path)?; + + for rule in &rules { + eprintln!("rule: {:?}", rule); + } + + let sheet = Builder::new(rules).build(); + let property_sheet_json_path = src_dir_path + .join(property_sheet_css_path.file_name().unwrap()) + .with_extension("json"); + let mut property_sheet_json_file = File::create(property_sheet_json_path)?; + serde_json::to_writer_pretty(&mut property_sheet_json_file, &sheet)?; + } + + Ok(()) +} + +fn parse_property_sheet(path: &Path) -> Result> { + let mut i = 0; + let mut items = rsass::parse_scss_file(path)?; + while i < items.len() { + match &items[i] { + rsass::Item::Import(arg) => { + if let Some(s) = get_sass_string(arg) { + let import_path = resolve_path(path, s)?; + let imported_items = rsass::parse_scss_file(&import_path)?; + items.splice(i..(i + 1), imported_items); + continue; + } else { + return Err(Error("@import arguments must be strings".to_string())); + } + } + rsass::Item::AtRule { name, args, .. 
} => match name.as_str() { + "schema" => { + if let Some(s) = get_sass_string(args) { + let schema_path = resolve_path(path, s)?; + eprintln!("schema path: {:?}", schema_path); + items.remove(i); + continue; + } else { + return Err(Error("@schema arguments must be strings".to_string())); + } + } + _ => return Err(Error(format!("Unsupported at-rule '{}'", name))), + }, + _ => {} + } + i += 1; + } + + let mut result = Vec::new(); + let selector_prefixes = vec![Vec::new()]; + parse_sass_items(items, &selector_prefixes, &mut result)?; + Ok(result) +} + +fn parse_sass_items( + items: Vec, + selector_prefixes: &Vec>, + result: &mut Vec, +) -> Result<()> { + let mut properties = PropertySet::new(); + for item in items { + match item { + rsass::Item::None | rsass::Item::Comment(_) => {} + rsass::Item::Property(name, value) => { + properties.insert(name.to_string(), parse_sass_value(&value)?); + } + rsass::Item::Rule(selectors, items) => { + let mut full_selectors = Vec::new(); + for prefix in selector_prefixes { + let mut part_string = String::new(); + let mut next_step_is_immediate = false; + for selector in &selectors.s { + let mut prefix = prefix.clone(); + for part in &selector.0 { + part_string.clear(); + write!(&mut part_string, "{}", part).unwrap(); + let part_string = part_string.trim(); + if !part_string.is_empty() { + if part_string == "&" { + continue; + } else if part_string.starts_with("[text=") { + if let Some(last_step) = prefix.last_mut() { + last_step.text_pattern = Some( + part_string[7..(part_string.len() - 2)].to_string(), + ) + } + } else if part_string == ">" { + next_step_is_immediate = true; + } else if part_string.starts_with("[token=") { + prefix.push(SelectorStep { + kind: part_string[8..(part_string.len() - 2)].to_string(), + is_named: false, + child_index: None, + text_pattern: None, + is_immediate: next_step_is_immediate, + }); + next_step_is_immediate = false; + } else { + prefix.push(SelectorStep { + kind: part_string.to_string(), + 
is_named: true, + child_index: None, + text_pattern: None, + is_immediate: next_step_is_immediate, + }); + next_step_is_immediate = false; + } + } + } + full_selectors.push(prefix); + } + } + parse_sass_items(items, &full_selectors, result)?; + } + _ => return Err(Error(format!("Unsupported syntax type {:?}", item))), + } + } + + if !properties.is_empty() { + result.push(Rule { + selectors: selector_prefixes.iter().cloned().map(Selector).collect(), + properties, + }); + } + + Ok(()) +} + +fn parse_sass_value(value: &Value) -> Result { + match value { + Value::Literal(s) => { + if let Some(s) = s.single_raw() { + Ok(PropertyValue::String(s.to_string())) + } else { + Err(Error("String interpolation is not supported".to_string())) + } + } + Value::Call(name, raw_args) => { + if let Some(name) = name.single_raw() { + let mut args = Vec::new(); + for (_, arg) in raw_args.iter() { + args.push(parse_sass_value(arg)?); + } + let mut result = PropertySet::new(); + result.insert("name".to_string(), PropertyValue::String(name.to_string())); + result.insert("args".to_string(), PropertyValue::Array(args)); + Ok(PropertyValue::Object(result)) + } else { + Err(Error("String interpolation is not supported".to_string())) + } + } + Value::List(elements, ..) 
=> { + let mut result = Vec::new(); + for element in elements { + result.push(parse_sass_value(element)?); + } + Ok(PropertyValue::Array(result)) + } + Value::True => Ok(PropertyValue::String("true".to_string())), + Value::False => Ok(PropertyValue::String("false".to_string())), + _ => Err(Error( + "Property values must be strings or function calls".to_string(), + )), + } +} + +fn get_sass_string(value: &Value) -> Option<&str> { + if let Value::Literal(s) = value { + s.single_raw() + } else { + None + } +} + +fn resolve_path(base: &Path, path: impl AsRef) -> Result { + let mut result = base.to_owned(); + result.pop(); + result.push(path.as_ref()); + if result.exists() { + Ok(result) + } else { + Err(Error(format!( + "Could not resolve import path {:?}", + path.as_ref() + ))) + } +} diff --git a/cli/src/main.rs b/cli/src/main.rs index 604d3068..334f06ef 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -6,6 +6,7 @@ extern crate log; extern crate serde_derive; extern crate hashbrown; extern crate regex; +extern crate rsass; extern crate serde_json; mod error; @@ -39,6 +40,7 @@ fn run() -> error::Result<()> { SubCommand::with_name("generate") .about("Generate a parser") .arg(Arg::with_name("log").long("log")) + .arg(Arg::with_name("properties-only").long("properties")) .arg( Arg::with_name("state-ids-to-log") .long("log-state") @@ -77,13 +79,14 @@ fn run() -> error::Result<()> { } let minimize = !matches.is_present("no-minimize"); + let properties_only = matches.is_present("properties-only"); let state_ids_to_log = matches .values_of("state-ids-to-log") .map_or(Vec::new(), |ids| { ids.filter_map(|id| usize::from_str_radix(id, 10).ok()) .collect() }); - generate::generate_parser_for_grammar(¤t_dir, minimize, state_ids_to_log)?; + generate::generate_parser_for_grammar(¤t_dir, minimize, state_ids_to_log, properties_only)?; } else if let Some(matches) = matches.subcommand_matches("test") { let debug = matches.is_present("debug"); let debug_graph = 
matches.is_present("debug-graph"); diff --git a/lib/binding/lib.rs b/lib/binding/lib.rs index ae3c979c..37748447 100644 --- a/lib/binding/lib.rs +++ b/lib/binding/lib.rs @@ -80,6 +80,29 @@ pub struct PropertySheet

> { text_regexes: Vec, } +#[derive(Debug, Deserialize, Serialize)] +pub struct PropertyTransitionJSON { + #[serde(rename = "type")] + pub kind: String, + pub named: bool, + pub index: Option, + pub text: Option, + pub state_id: usize, +} + +#[derive(Debug, Deserialize, Serialize)] +pub struct PropertyStateJSON { + pub transitions: Vec, + pub property_set_id: usize, + pub default_next_state_id: usize, +} + +#[derive(Debug, Deserialize, Serialize)] +pub struct PropertySheetJSON

{ + pub states: Vec, + pub property_sets: Vec

, +} + #[derive(Clone, Copy)] pub struct Node<'a>(ffi::TSNode, PhantomData<&'a ()>); @@ -189,7 +212,7 @@ impl Parser { } #[cfg(unix)] - pub fn print_dot_graphs(&mut self, file: & impl AsRawFd) { + pub fn print_dot_graphs(&mut self, file: &impl AsRawFd) { let fd = file.as_raw_fd(); unsafe { ffi::ts_parser_print_dot_graphs(self.0, ffi::dup(fd)) } } @@ -754,29 +777,6 @@ impl

PropertySheet

{ where P: DeserializeOwned, { - #[derive(Deserialize, Debug)] - struct PropertyTransitionJSON { - #[serde(rename = "type")] - kind: String, - named: bool, - index: Option, - text: Option, - state_id: usize, - } - - #[derive(Deserialize, Debug)] - struct PropertyStateJSON { - transitions: Vec, - property_set_id: usize, - default_next_state_id: usize, - } - - #[derive(Deserialize, Debug)] - struct PropertySheetJSON

{ - states: Vec, - property_sets: Vec

, - } - let input: PropertySheetJSON

= serde_json::from_str(json).map_err(PropertySheetError::InvalidJSON)?; let mut states = Vec::new(); From 1dfbe495ed55e35e4f94a0eb8e19de65edc2e414 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 10 Jan 2019 13:12:16 -0800 Subject: [PATCH 115/208] Get property sheet compilation working --- cli/src/generate/properties.rs | 540 ++++++++++++++++++++++++++++++--- lib/binding/lib.rs | 9 +- 2 files changed, 504 insertions(+), 45 deletions(-) diff --git a/cli/src/generate/properties.rs b/cli/src/generate/properties.rs index c328526f..136cd725 100644 --- a/cli/src/generate/properties.rs +++ b/cli/src/generate/properties.rs @@ -1,15 +1,17 @@ use crate::error::{Error, Result}; +use hashbrown::hash_map::{Entry, HashMap}; +use hashbrown::HashSet; use rsass; use rsass::sass::Value; -use std::collections::{BTreeMap, BTreeSet, HashMap, VecDeque}; -use std::fmt; -use std::fmt::Write; +use std::cmp::Ordering; +use std::collections::{BTreeMap, VecDeque}; +use std::fmt::{self, Write}; use std::fs::{self, File}; -use std::hash::{Hash, Hasher}; +use std::io::BufWriter; use std::path::{Path, PathBuf}; use tree_sitter::{self, PropertyStateJSON, PropertyTransitionJSON}; -#[derive(Debug, PartialEq, Eq, Hash, Serialize)] +#[derive(Clone, Debug, PartialEq, Eq, Serialize)] #[serde(untagged)] enum PropertyValue { String(String), @@ -17,17 +19,17 @@ enum PropertyValue { Array(Vec), } -type PropertySet = BTreeMap; +type PropertySet = std::collections::HashMap; type PropertySheetJSON = tree_sitter::PropertySheetJSON; -type StateId = u32; -type PropertySetId = u32; +type StateId = usize; +type PropertySetId = usize; #[derive(Clone, PartialEq, Eq)] struct SelectorStep { kind: String, is_named: bool, is_immediate: bool, - child_index: Option, + child_index: Option, text_pattern: Option, } @@ -40,29 +42,48 @@ struct Rule { properties: PropertySet, } -#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] -struct PropertyItem { +#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, 
Hash)] +struct Item { rule_id: u32, selector_id: u32, step_id: u32, } -#[derive(PartialEq, Eq)] -struct PropertyItemSet(BTreeSet); +#[derive(Clone, PartialEq, Eq, Hash)] +struct ItemSet(Vec); #[derive(Debug, PartialEq, Eq, PartialOrd, Ord)] struct SelectorMatch { specificity: u32, rule_id: u32, - selector_id: u32, } struct Builder { rules: Vec, output: PropertySheetJSON, - ids_by_item_set: HashMap, - ids_by_property_set: HashMap, - item_set_queue: VecDeque<(PropertyItemSet, StateId)>, + ids_by_item_set: HashMap, + item_set_queue: VecDeque<(ItemSet, StateId)>, +} + +impl ItemSet { + fn new() -> Self { + ItemSet(Vec::new()) + } + + fn insert(&mut self, item: Item) { + match self.0.binary_search(&item) { + Err(i) => self.0.insert(i, item), + _ => {} + } + } +} + +impl<'a> IntoIterator for &'a ItemSet { + type Item = &'a Item; + type IntoIter = std::slice::Iter<'a, Item>; + fn into_iter(self) -> Self::IntoIter { + self.0.iter() + } } impl Builder { @@ -74,25 +95,301 @@ impl Builder { property_sets: Vec::new(), }, ids_by_item_set: HashMap::new(), - ids_by_property_set: HashMap::new(), item_set_queue: VecDeque::new(), } } - fn build(self) -> PropertySheetJSON { - let mut start_item_set = PropertyItemSet(BTreeSet::new()); + fn build(mut self) -> PropertySheetJSON { + let mut start_item_set = ItemSet::new(); + for (i, rule) in self.rules.iter().enumerate() { + for j in 0..rule.selectors.len() { + start_item_set.insert(Item { + rule_id: i as u32, + selector_id: j as u32, + step_id: 0, + }); + } + } + + self.add_state(start_item_set); + while let Some((item_set, state_id)) = self.item_set_queue.pop_front() { + self.populate_state(item_set, state_id); + } + + self.remove_duplicate_states(); + + for (i, state) in self.output.states.iter_mut().enumerate() { + state.id = i; + } self.output } -} -impl Hash for PropertyItemSet { - fn hash(&self, h: &mut H) { - h.write_usize(self.0.len()); - for entry in &self.0 { - entry.hash(h); + fn add_state(&mut self, item_set: ItemSet) -> 
StateId { + match self.ids_by_item_set.entry(item_set) { + Entry::Occupied(o) => *o.get(), + Entry::Vacant(v) => { + let state_id = self.output.states.len(); + self.output.states.push(PropertyStateJSON { + id: 0, + transitions: Vec::new(), + property_set_id: 0, + default_next_state_id: 0, + }); + self.item_set_queue.push_back((v.key().clone(), state_id)); + v.insert(state_id); + state_id + } } } + + fn add_property_set(&mut self, properties: PropertySet) -> PropertySetId { + if let Some(index) = self + .output + .property_sets + .iter() + .position(|i| *i == properties) + { + index + } else { + self.output.property_sets.push(properties); + self.output.property_sets.len() - 1 + } + } + + fn populate_state(&mut self, item_set: ItemSet, state_id: StateId) { + let mut transition_map: HashSet = HashSet::new(); + let mut selector_matches = Vec::new(); + + // First, compute all of the possible state transition predicates for + // this state, and all of the rules that are currently matching. + for item in &item_set { + let rule = &self.rules[item.rule_id as usize]; + let selector = &rule.selectors[item.selector_id as usize]; + let next_step = selector.0.get(item.step_id as usize); + + // If this item has more elements remaining in its selector, then + // add a state transition based on the next step. + if let Some(step) = next_step { + transition_map.insert(PropertyTransitionJSON { + kind: step.kind.clone(), + named: step.is_named, + index: step.child_index, + text: step.text_pattern.clone(), + state_id: 0, + }); + } + // If the item has matched its entire selector, then the item's + // properties are applicable to this state. + else { + selector_matches.push(SelectorMatch { + rule_id: item.rule_id, + specificity: selector_specificity(selector), + }); + } + } + + // For eacy possible state transition, compute the set of items in that transition's + // destination state. 
+ let mut transition_list: Vec<(PropertyTransitionJSON, u32)> = transition_map + .into_iter() + .map(|mut transition| { + let mut next_item_set = ItemSet::new(); + let mut latest_matching_rule_id = 0; + for item in &item_set { + let rule = &self.rules[item.rule_id as usize]; + let selector = &rule.selectors[item.selector_id as usize]; + let next_step = selector.0.get(item.step_id as usize); + + if let Some(step) = next_step { + // If the next step of the item's selector satisfies this transition, + // advance the item to the next part of its selector and add the + // resulting item to this transition's destination state. + if step_matches_transition(step, &transition) { + let next_item = Item { + rule_id: item.rule_id, + selector_id: item.selector_id, + step_id: item.step_id + 1, + }; + + next_item_set.insert(next_item); + + // If the next item is at the end of its selector, record its rule id + // so that the rule id can be used when sorting this state's transitions. + if selector.0.get(item.step_id as usize + 1).is_none() + && item.rule_id > latest_matching_rule_id + { + latest_matching_rule_id = item.rule_id; + } + } + + // If the next step of the item is not an immediate child, then + // include this item in this transition's destination state, because + // the next step of the item might match a descendant node. + if !step.is_immediate { + next_item_set.insert(*item); + } + } + } + + transition.state_id = self.add_state(next_item_set); + (transition, latest_matching_rule_id) + }) + .collect(); + + // Ensure that for a given node type, more specific transitions are tried + // first, and in the event of a tie, transitions corresponding to later rules + // in the cascade are tried first. 
+ transition_list.sort_by(|a, b| { + let result = a.0.kind.cmp(&b.0.kind); + if result != Ordering::Equal { + return result; + } + let result = a.0.named.cmp(&b.0.named); + if result != Ordering::Equal { + return result; + } + let result = transition_specificity(&b.0).cmp(&transition_specificity(&a.0)); + if result != Ordering::Equal { + return result; + } + b.1.cmp(&a.1) + }); + + // Compute the merged properties that apply in the current state. + // Sort the matching property sets by ascending specificity and by + // their order in the sheet. This way, more specific selectors and later + // rules will override less specific selectors and earlier rules. + let mut properties = PropertySet::new(); + selector_matches.sort_unstable_by(|a, b| { + let result = a.specificity.cmp(&b.specificity); + if result != Ordering::Equal { + return result; + } + a.rule_id.cmp(&b.rule_id) + }); + selector_matches.dedup(); + for selector_match in selector_matches { + let rule = &self.rules[selector_match.rule_id as usize]; + for (property, value) in &rule.properties { + properties.insert(property.clone(), value.clone()); + } + } + + // Compute the default successor item set - the item set that + // we should advance to if the next element doesn't match any + // of the next elements in the item set's selectors. 
+ let mut default_next_item_set = ItemSet::new(); + for item in &item_set { + let rule = &self.rules[item.rule_id as usize]; + let selector = &rule.selectors[item.selector_id as usize]; + let next_step = selector.0.get(item.step_id as usize); + if let Some(step) = next_step { + if !step.is_immediate { + default_next_item_set.insert(*item); + } + } + } + + self.output.states[state_id].default_next_state_id = self.add_state(default_next_item_set); + self.output.states[state_id].property_set_id = self.add_property_set(properties); + self.output.states[state_id] + .transitions + .extend(transition_list.into_iter().map(|i| i.0)); + } + + fn remove_duplicate_states(&mut self) { + let mut state_replacements = BTreeMap::new(); + let mut done = false; + while !done { + done = true; + for (i, state_i) in self.output.states.iter().enumerate() { + if state_replacements.contains_key(&i) { + continue; + } + for (j, state_j) in self.output.states.iter().enumerate() { + if j == i { + break; + } + if state_replacements.contains_key(&j) { + continue; + } + if state_i == state_j { + info!("replace state {} with state {}", i, j); + state_replacements.insert(i, j); + done = false; + break; + } + } + } + for state in self.output.states.iter_mut() { + for transition in state.transitions.iter_mut() { + if let Some(replacement) = state_replacements.get(&transition.state_id) { + transition.state_id = *replacement; + } + } + } + } + + let final_state_replacements = (0..self.output.states.len()) + .into_iter() + .map(|state_id| { + let replacement = state_replacements + .get(&state_id) + .cloned() + .unwrap_or(state_id); + let prior_removed = state_replacements + .iter() + .take_while(|i| *i.0 < replacement) + .count(); + replacement - prior_removed + }) + .collect::>(); + + for state in self.output.states.iter_mut() { + for transition in state.transitions.iter_mut() { + transition.state_id = final_state_replacements[transition.state_id]; + } + } + + let mut i = 0; + 
self.output.states.retain(|_| { + let result = !state_replacements.contains_key(&i); + i += 1; + result + }); + } +} + +fn selector_specificity(selector: &Selector) -> u32 { + let mut result = selector.0.len() as u32; + for step in &selector.0 { + if step.child_index.is_some() { + result += 1; + } + if step.text_pattern.is_some() { + result += 1; + } + } + result +} + +fn transition_specificity(transition: &PropertyTransitionJSON) -> u32 { + let mut result = 0; + if transition.index.is_some() { + result += 1; + } + if transition.text.is_some() { + result += 1; + } + result +} + +fn step_matches_transition(step: &SelectorStep, transition: &PropertyTransitionJSON) -> bool { + step.kind == transition.kind + && step.is_named == transition.named + && (step.child_index == transition.index || step.child_index.is_none()) + && (step.text_pattern == transition.text || step.text_pattern.is_none()) } impl fmt::Debug for SelectorStep { @@ -135,27 +432,28 @@ pub fn generate_property_sheets(repo_path: &Path) -> Result<()> { let properties_dir_path = repo_path.join("properties"); for entry in fs::read_dir(properties_dir_path)? 
{ - let property_sheet_css_path = entry?.path(); - let rules = parse_property_sheet(&property_sheet_css_path)?; - - for rule in &rules { - eprintln!("rule: {:?}", rule); - } - - let sheet = Builder::new(rules).build(); + let css_path = entry?.path(); + let css = fs::read_to_string(&css_path)?; + let sheet = generate_property_sheet(&css_path, &css)?; let property_sheet_json_path = src_dir_path - .join(property_sheet_css_path.file_name().unwrap()) + .join(css_path.file_name().unwrap()) .with_extension("json"); - let mut property_sheet_json_file = File::create(property_sheet_json_path)?; - serde_json::to_writer_pretty(&mut property_sheet_json_file, &sheet)?; + let property_sheet_json_file = File::create(property_sheet_json_path)?; + let mut writer = BufWriter::new(property_sheet_json_file); + serde_json::to_writer_pretty(&mut writer, &sheet)?; } Ok(()) } -fn parse_property_sheet(path: &Path) -> Result> { +fn generate_property_sheet(path: impl AsRef, css: &str) -> Result { + let rules = parse_property_sheet(path.as_ref(), &css)?; + Ok(Builder::new(rules).build()) +} + +fn parse_property_sheet(path: &Path, css: &str) -> Result> { let mut i = 0; - let mut items = rsass::parse_scss_file(path)?; + let mut items = rsass::parse_scss_data(css.as_bytes())?; while i < items.len() { match &items[i] { rsass::Item::Import(arg) => { @@ -296,11 +594,14 @@ fn parse_sass_value(value: &Value) -> Result { } Ok(PropertyValue::Array(result)) } + Value::Color(_, Some(name)) => Ok(PropertyValue::String(name.clone())), + Value::Numeric(n, _) => Ok(PropertyValue::String(format!("{}", n))), Value::True => Ok(PropertyValue::String("true".to_string())), Value::False => Ok(PropertyValue::String("false".to_string())), - _ => Err(Error( - "Property values must be strings or function calls".to_string(), - )), + _ => Err(Error(format!( + "Property values must be strings or function calls. 
Got {:?}", + value + ))), } } @@ -325,3 +626,158 @@ fn resolve_path(base: &Path, path: impl AsRef) -> Result { ))) } } + +#[cfg(test)] +mod tests { + use super::*; + use regex::Regex; + + #[test] + fn test_immediate_child_and_descendant_selectors() { + let sheet = generate_property_sheet( + "foo", + " + f1 { + color: red; + + & > f2 { + color: green; + } + + & f3 { + color: blue; + } + } + + f2 { + color: indigo; + height: 2; + } + + f3 { + color: violet; + height: 3; + } + ", + ) + .unwrap(); + + // f1 single-element selector + assert_eq!( + *query_simple(&sheet, vec!["f1"]), + props(&[("color", "red")]) + ); + assert_eq!( + *query_simple(&sheet, vec!["f2", "f1"]), + props(&[("color", "red")]) + ); + assert_eq!( + *query_simple(&sheet, vec!["f2", "f3", "f1"]), + props(&[("color", "red")]) + ); + + // f2 single-element selector + assert_eq!( + *query_simple(&sheet, vec!["f2"]), + props(&[("color", "indigo"), ("height", "2")]) + ); + assert_eq!( + *query_simple(&sheet, vec!["f2", "f2"]), + props(&[("color", "indigo"), ("height", "2")]) + ); + assert_eq!( + *query_simple(&sheet, vec!["f1", "f3", "f2"]), + props(&[("color", "indigo"), ("height", "2")]) + ); + assert_eq!( + *query_simple(&sheet, vec!["f1", "f6", "f2"]), + props(&[("color", "indigo"), ("height", "2")]) + ); + + // f3 single-element selector + assert_eq!( + *query_simple(&sheet, vec!["f3"]), + props(&[("color", "violet"), ("height", "3")]) + ); + assert_eq!( + *query_simple(&sheet, vec!["f2", "f3"]), + props(&[("color", "violet"), ("height", "3")]) + ); + + // f2 child selector + assert_eq!( + *query_simple(&sheet, vec!["f1", "f2"]), + props(&[("color", "green"), ("height", "2")]) + ); + assert_eq!( + *query_simple(&sheet, vec!["f2", "f1", "f2"]), + props(&[("color", "green"), ("height", "2")]) + ); + assert_eq!( + *query_simple(&sheet, vec!["f3", "f1", "f2"]), + props(&[("color", "green"), ("height", "2")]) + ); + + // f3 descendant selector + assert_eq!( + *query_simple(&sheet, vec!["f1", "f3"]), + 
props(&[("color", "blue"), ("height", "3")]) + ); + assert_eq!( + *query_simple(&sheet, vec!["f1", "f2", "f3"]), + props(&[("color", "blue"), ("height", "3")]) + ); + assert_eq!( + *query_simple(&sheet, vec!["f1", "f6", "f7", "f8", "f3"]), + props(&[("color", "blue"), ("height", "3")]) + ); + + // no match + assert_eq!( + *query_simple(&sheet, vec!["f1", "f3", "f4"]), + props(&[]) + ); + } + + fn query_simple<'a>( + sheet: &'a PropertySheetJSON, + node_stack: Vec<&'static str>, + ) -> &'a PropertySet { + query( + sheet, + node_stack.into_iter().map(|s| (s, true, 0)).collect(), + "", + ) + } + + fn query<'a>( + sheet: &'a PropertySheetJSON, + node_stack: Vec<(&'static str, bool, usize)>, + leaf_text: &str, + ) -> &'a PropertySet { + let mut state_id = 0; + for (kind, is_named, child_index) in node_stack { + let state = &sheet.states[state_id]; + state_id = state + .transitions + .iter() + .find(|transition| { + transition.kind == kind + && transition.named == is_named + && transition.index.map_or(true, |index| index == child_index) + && (transition + .text + .as_ref() + .map_or(true, |text| Regex::new(text).unwrap().is_match(leaf_text))) + }) + .map_or(state.default_next_state_id, |t| t.state_id); + } + &sheet.property_sets[sheet.states[state_id].property_set_id] + } + + fn props<'a>(s: &'a [(&'a str, &'a str)]) -> PropertySet { + s.into_iter() + .map(|(a, b)| (a.to_string(), PropertyValue::String(b.to_string()))) + .collect() + } +} diff --git a/lib/binding/lib.rs b/lib/binding/lib.rs index 37748447..88cc24be 100644 --- a/lib/binding/lib.rs +++ b/lib/binding/lib.rs @@ -80,20 +80,23 @@ pub struct PropertySheet

> { text_regexes: Vec, } -#[derive(Debug, Deserialize, Serialize)] +#[derive(Debug, Deserialize, Serialize, Hash, PartialEq, Eq)] pub struct PropertyTransitionJSON { #[serde(rename = "type")] pub kind: String, pub named: bool, + #[serde(skip_serializing_if = "Option::is_none")] pub index: Option, + #[serde(skip_serializing_if = "Option::is_none")] pub text: Option, pub state_id: usize, } -#[derive(Debug, Deserialize, Serialize)] +#[derive(Debug, Deserialize, Serialize, PartialEq, Eq)] pub struct PropertyStateJSON { - pub transitions: Vec, + pub id: usize, pub property_set_id: usize, + pub transitions: Vec, pub default_next_state_id: usize, } From 38417fc8a1e04bae2d2b0c14f3aa2c310bdec264 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 10 Jan 2019 15:17:38 -0800 Subject: [PATCH 116/208] Port over remaining tests about property sheet generation --- cli/src/generate/properties.rs | 239 ++++++++++++++++++++++++++++++--- 1 file changed, 223 insertions(+), 16 deletions(-) diff --git a/cli/src/generate/properties.rs b/cli/src/generate/properties.rs index 136cd725..9e570a99 100644 --- a/cli/src/generate/properties.rs +++ b/cli/src/generate/properties.rs @@ -1,10 +1,9 @@ use crate::error::{Error, Result}; -use hashbrown::hash_map::{Entry, HashMap}; -use hashbrown::HashSet; use rsass; use rsass::sass::Value; use std::cmp::Ordering; -use std::collections::{BTreeMap, VecDeque}; +use std::collections::hash_map::Entry; +use std::collections::{BTreeMap, HashMap, HashSet, VecDeque}; use std::fmt::{self, Write}; use std::fs::{self, File}; use std::io::BufWriter; @@ -19,7 +18,7 @@ enum PropertyValue { Array(Vec), } -type PropertySet = std::collections::HashMap; +type PropertySet = HashMap; type PropertySheetJSON = tree_sitter::PropertySheetJSON; type StateId = usize; type PropertySetId = usize; @@ -448,6 +447,9 @@ pub fn generate_property_sheets(repo_path: &Path) -> Result<()> { fn generate_property_sheet(path: impl AsRef, css: &str) -> Result { let rules = 
parse_property_sheet(path.as_ref(), &css)?; + for rule in &rules { + eprintln!("rule {:?}", rule); + } Ok(Builder::new(rules).build()) } @@ -470,7 +472,6 @@ fn parse_property_sheet(path: &Path, css: &str) -> Result> { "schema" => { if let Some(s) = get_sass_string(args) { let schema_path = resolve_path(path, s)?; - eprintln!("schema path: {:?}", schema_path); items.remove(i); continue; } else { @@ -500,7 +501,22 @@ fn parse_sass_items( match item { rsass::Item::None | rsass::Item::Comment(_) => {} rsass::Item::Property(name, value) => { - properties.insert(name.to_string(), parse_sass_value(&value)?); + let value = parse_sass_value(&value)?; + match properties.entry(name.to_string()) { + Entry::Vacant(v) => { + v.insert(value); + } + Entry::Occupied(mut o) => { + let existing_value = o.get_mut(); + if let PropertyValue::Array(items) = existing_value { + items.push(value); + continue; + } else { + let v = existing_value.clone(); + *existing_value = PropertyValue::Array(vec![v, value]); + } + } + } } rsass::Item::Rule(selectors, items) => { let mut full_selectors = Vec::new(); @@ -516,6 +532,15 @@ fn parse_sass_items( if !part_string.is_empty() { if part_string == "&" { continue; + } else if part_string.starts_with(":nth-child(") { + if let Some(last_step) = prefix.last_mut() { + if let Ok(index) = usize::from_str_radix( + &part_string[11..(part_string.len() - 1)], + 10, + ) { + last_step.child_index = Some(index); + } + } } else if part_string.starts_with("[text=") { if let Some(last_step) = prefix.last_mut() { last_step.text_pattern = Some( @@ -613,18 +638,30 @@ fn get_sass_string(value: &Value) -> Option<&str> { } } -fn resolve_path(base: &Path, path: impl AsRef) -> Result { +fn resolve_path(base: &Path, p: &str) -> Result { + let path = Path::new(p); let mut result = base.to_owned(); result.pop(); - result.push(path.as_ref()); - if result.exists() { - Ok(result) + if path.starts_with(".") { + result.push(path); + if result.exists() { + return Ok(result); + } } 
else { - Err(Error(format!( - "Could not resolve import path {:?}", - path.as_ref() - ))) + loop { + result.push("node_modules"); + result.push(path); + if result.exists() { + return Ok(result); + } + result.pop(); + result.pop(); + if !result.pop() { + break; + } + } } + Err(Error(format!("Could not resolve import path `{}`", p))) } #[cfg(test)] @@ -635,7 +672,7 @@ mod tests { #[test] fn test_immediate_child_and_descendant_selectors() { let sheet = generate_property_sheet( - "foo", + "foo.css", " f1 { color: red; @@ -733,10 +770,164 @@ mod tests { ); // no match + assert_eq!(*query_simple(&sheet, vec!["f1", "f3", "f4"]), props(&[])); + assert_eq!(*query_simple(&sheet, vec!["f1", "f2", "f5"]), props(&[])); + } + + #[test] + fn test_text_attribute() { + let sheet = generate_property_sheet( + "foo.css", + " + f1 { + color: red; + + &[text='^[A-Z]'] { + color: green; + } + + &[text='^[A-Z_]+$'] { + color: blue; + } + } + + f2[text='^[A-Z_]+$'] { + color: purple; + } + ", + ) + .unwrap(); + assert_eq!( - *query_simple(&sheet, vec!["f1", "f3", "f4"]), + *query(&sheet, vec![("f1", true, 0)], "abc"), + props(&[("color", "red")]) + ); + assert_eq!( + *query(&sheet, vec![("f1", true, 0)], "Abc"), + props(&[("color", "green")]) + ); + assert_eq!( + *query(&sheet, vec![("f1", true, 0)], "AB_CD"), + props(&[("color", "blue")]) + ); + assert_eq!(*query(&sheet, vec![("f2", true, 0)], "Abc"), props(&[])); + assert_eq!( + *query(&sheet, vec![("f2", true, 0)], "ABC"), + props(&[("color", "purple")]) + ); + } + + #[test] + fn test_cascade_ordering_as_tie_breaker() { + let sheet = generate_property_sheet( + "foo.css", + " + f1 f2:nth-child(1) { color: red; } + f1:nth-child(1) f2 { color: green; } + f1 f2[text='a'] { color: blue; } + f1 f2[text='b'] { color: violet; } + ", + ) + .unwrap(); + + assert_eq!( + *query(&sheet, vec![("f1", true, 0), ("f2", true, 0)], "x"), props(&[]) ); + assert_eq!( + *query(&sheet, vec![("f1", true, 0), ("f2", true, 1)], "x"), + props(&[("color", "red")]) 
+ ); + assert_eq!( + *query(&sheet, vec![("f1", true, 1), ("f2", true, 1)], "x"), + props(&[("color", "green")]) + ); + assert_eq!( + *query(&sheet, vec![("f1", true, 1), ("f2", true, 1)], "a"), + props(&[("color", "blue")]) + ); + assert_eq!( + *query(&sheet, vec![("f1", true, 1), ("f2", true, 1)], "ab"), + props(&[("color", "violet")]) + ); + } + + #[test] + fn test_css_function_calls() { + let sheet = generate_property_sheet( + "foo.css", + " + a { + b: f(); + c: f(g(h), i, \"j\", 10); + } + ", + ) + .unwrap(); + + let p = query_simple(&sheet, vec!["a"]); + + assert_eq!( + p["b"], + object(&[("name", string("f")), ("args", array(vec![])),]) + ); + + assert_eq!( + p["c"], + object(&[ + ("name", string("f")), + ( + "args", + array(vec![ + object(&[("name", string("g")), ("args", array(vec![string("h"),]))]), + string("i"), + string("j"), + string("10"), + ]) + ), + ]) + ); + } + + #[test] + fn test_array_by_declaring_property_multiple_times() { + let sheet = generate_property_sheet( + "foo.css", + " + a { + b: 'foo'; + b: 'bar'; + b: 'baz'; + c: f(g()); + c: h(); + } + ", + ) + .unwrap(); + + let p = query_simple(&sheet, vec!["a"]); + + assert_eq!( + p["b"], + array(vec![string("foo"), string("bar"), string("baz"),]) + ); + + assert_eq!( + p["c"], + array(vec![ + object(&[ + ("name", string("f")), + ( + "args", + array(vec![object(&[ + ("name", string("g")), + ("args", array(vec![])), + ])]) + ) + ]), + object(&[("name", string("h")), ("args", array(vec![])),]), + ]), + ); } fn query_simple<'a>( @@ -775,6 +966,22 @@ mod tests { &sheet.property_sets[sheet.states[state_id].property_set_id] } + fn array(s: Vec) -> PropertyValue { + PropertyValue::Array(s) + } + + fn object<'a>(s: &'a [(&'a str, PropertyValue)]) -> PropertyValue { + PropertyValue::Object( + s.into_iter() + .map(|(a, b)| (a.to_string(), b.clone())) + .collect(), + ) + } + + fn string(s: &str) -> PropertyValue { + PropertyValue::String(s.to_string()) + } + fn props<'a>(s: &'a [(&'a str, &'a str)]) -> 
PropertySet { s.into_iter() .map(|(a, b)| (a.to_string(), PropertyValue::String(b.to_string()))) From 0f2347b3189503c768fc070a373d33407f9c4db6 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 10 Jan 2019 15:22:39 -0800 Subject: [PATCH 117/208] Just call the C lib 'the library' everywhere, don't call it a 'runtime' --- lib/include/tree_sitter/{runtime.h => api.h} | 6 ++-- lib/include/tree_sitter/compiler.h | 38 -------------------- lib/include/tree_sitter/parser.h | 2 +- lib/src/alloc.h | 6 ++-- lib/src/array.h | 6 ++-- lib/src/atomic.h | 6 ++-- lib/src/error_costs.h | 4 +-- lib/src/get_changed_ranges.h | 6 ++-- lib/src/language.h | 6 ++-- lib/src/length.h | 6 ++-- lib/src/lexer.h | 8 ++--- lib/src/{runtime.c => lib.c} | 0 lib/src/parser.c | 2 +- lib/src/point.h | 6 ++-- lib/src/reduce_action.h | 8 ++--- lib/src/stack.h | 6 ++-- lib/src/subtree.h | 8 ++--- lib/src/tree.c | 2 +- lib/src/tree.h | 6 ++-- lib/src/tree_cursor.c | 2 +- lib/src/tree_cursor.h | 6 ++-- lib/src/utf16.h | 6 ++-- script/bindgen.sh | 2 +- script/{build-runtime => build-lib} | 2 +- test/fuzz/fuzzer.cc | 2 +- 25 files changed, 57 insertions(+), 95 deletions(-) rename lib/include/tree_sitter/{runtime.h => api.h} (98%) delete mode 100644 lib/include/tree_sitter/compiler.h rename lib/src/{runtime.c => lib.c} (100%) rename script/{build-runtime => build-lib} (93%) diff --git a/lib/include/tree_sitter/runtime.h b/lib/include/tree_sitter/api.h similarity index 98% rename from lib/include/tree_sitter/runtime.h rename to lib/include/tree_sitter/api.h index ab69a0b5..16841c8e 100644 --- a/lib/include/tree_sitter/runtime.h +++ b/lib/include/tree_sitter/api.h @@ -1,5 +1,5 @@ -#ifndef TREE_SITTER_RUNTIME_H_ -#define TREE_SITTER_RUNTIME_H_ +#ifndef TREE_SITTER_API_H_ +#define TREE_SITTER_API_H_ #ifdef __cplusplus extern "C" { @@ -153,4 +153,4 @@ uint32_t ts_language_version(const TSLanguage *); } #endif -#endif // TREE_SITTER_RUNTIME_H_ +#endif // TREE_SITTER_API_H_ diff --git 
a/lib/include/tree_sitter/compiler.h b/lib/include/tree_sitter/compiler.h deleted file mode 100644 index a84d8a75..00000000 --- a/lib/include/tree_sitter/compiler.h +++ /dev/null @@ -1,38 +0,0 @@ -#ifndef TREE_SITTER_COMPILER_H_ -#define TREE_SITTER_COMPILER_H_ - -#ifdef __cplusplus -extern "C" { -#endif - -#include - -typedef enum { - TSCompileErrorTypeNone, - TSCompileErrorTypeInvalidGrammar, - TSCompileErrorTypeInvalidRegex, - TSCompileErrorTypeUndefinedSymbol, - TSCompileErrorTypeInvalidExtraToken, - TSCompileErrorTypeInvalidExternalToken, - TSCompileErrorTypeLexConflict, - TSCompileErrorTypeParseConflict, - TSCompileErrorTypeEpsilonRule, - TSCompileErrorTypeInvalidTokenContents, - TSCompileErrorTypeInvalidRuleName, - TSCompileErrorTypeInvalidWordRule, -} TSCompileErrorType; - -typedef struct { - char *code; - char *error_message; - TSCompileErrorType error_type; -} TSCompileResult; - -TSCompileResult ts_compile_grammar(const char *input, FILE *log_file); -TSCompileResult ts_compile_property_sheet(const char *input, FILE *log_file); - -#ifdef __cplusplus -} -#endif - -#endif // TREE_SITTER_COMPILER_H_ diff --git a/lib/include/tree_sitter/parser.h b/lib/include/tree_sitter/parser.h index a757eac0..e5037062 100644 --- a/lib/include/tree_sitter/parser.h +++ b/lib/include/tree_sitter/parser.h @@ -13,7 +13,7 @@ extern "C" { #define ts_builtin_sym_end 0 #define TREE_SITTER_SERIALIZATION_BUFFER_SIZE 1024 -#ifndef TREE_SITTER_RUNTIME_H_ +#ifndef TREE_SITTER_API_H_ typedef uint16_t TSSymbol; typedef struct TSLanguage TSLanguage; #endif diff --git a/lib/src/alloc.h b/lib/src/alloc.h index 6f3fa347..8e027a99 100644 --- a/lib/src/alloc.h +++ b/lib/src/alloc.h @@ -1,5 +1,5 @@ -#ifndef RUNTIME_ALLOC_H_ -#define RUNTIME_ALLOC_H_ +#ifndef TREE_SITTER_ALLOC_H_ +#define TREE_SITTER_ALLOC_H_ #ifdef __cplusplus extern "C" { @@ -82,4 +82,4 @@ static inline void ts_free(void *buffer) { } #endif -#endif // RUNTIME_ALLOC_H_ +#endif // TREE_SITTER_ALLOC_H_ diff --git a/lib/src/array.h 
b/lib/src/array.h index 3f5b6b29..04565f33 100644 --- a/lib/src/array.h +++ b/lib/src/array.h @@ -1,5 +1,5 @@ -#ifndef RUNTIME_ARRAY_H_ -#define RUNTIME_ARRAY_H_ +#ifndef TREE_SITTER_ARRAY_H_ +#define TREE_SITTER_ARRAY_H_ #ifdef __cplusplus extern "C" { @@ -132,4 +132,4 @@ static inline void array__splice(VoidArray *self, size_t element_size, } #endif -#endif // RUNTIME_ARRAY_H_ +#endif // TREE_SITTER_ARRAY_H_ diff --git a/lib/src/atomic.h b/lib/src/atomic.h index d1ab1f23..78a4d7d8 100644 --- a/lib/src/atomic.h +++ b/lib/src/atomic.h @@ -1,5 +1,5 @@ -#ifndef RUNTIME_ATOMIC_H_ -#define RUNTIME_ATOMIC_H_ +#ifndef TREE_SITTER_ATOMIC_H_ +#define TREE_SITTER_ATOMIC_H_ #include @@ -27,4 +27,4 @@ static inline uint32_t atomic_dec(volatile uint32_t *p) { #endif -#endif // RUNTIME_ATOMIC_H_ +#endif // TREE_SITTER_ATOMIC_H_ diff --git a/lib/src/error_costs.h b/lib/src/error_costs.h index d6420488..32d3666a 100644 --- a/lib/src/error_costs.h +++ b/lib/src/error_costs.h @@ -1,5 +1,5 @@ -#ifndef RUNTIME_ERROR_COSTS_H_ -#define RUNTIME_ERROR_COSTS_H_ +#ifndef TREE_SITTER_ERROR_COSTS_H_ +#define TREE_SITTER_ERROR_COSTS_H_ #define ERROR_STATE 0 #define ERROR_COST_PER_RECOVERY 500 diff --git a/lib/src/get_changed_ranges.h b/lib/src/get_changed_ranges.h index 2764b55f..a1f1dbb4 100644 --- a/lib/src/get_changed_ranges.h +++ b/lib/src/get_changed_ranges.h @@ -1,5 +1,5 @@ -#ifndef RUNTIME_GET_CHANGED_RANGES_H_ -#define RUNTIME_GET_CHANGED_RANGES_H_ +#ifndef TREE_SITTER_GET_CHANGED_RANGES_H_ +#define TREE_SITTER_GET_CHANGED_RANGES_H_ #ifdef __cplusplus extern "C" { @@ -33,4 +33,4 @@ unsigned ts_subtree_get_changed_ranges( } #endif -#endif // RUNTIME_GET_CHANGED_RANGES_H_ +#endif // TREE_SITTER_GET_CHANGED_RANGES_H_ diff --git a/lib/src/language.h b/lib/src/language.h index c8e5e8a1..0a0f108f 100644 --- a/lib/src/language.h +++ b/lib/src/language.h @@ -1,5 +1,5 @@ -#ifndef RUNTIME_LANGUAGE_H_ -#define RUNTIME_LANGUAGE_H_ +#ifndef TREE_SITTER_LANGUAGE_H_ +#define TREE_SITTER_LANGUAGE_H_ 
#ifdef __cplusplus extern "C" { @@ -91,4 +91,4 @@ ts_language_alias_sequence(const TSLanguage *self, unsigned id) { } #endif -#endif // RUNTIME_LANGUAGE_H_ +#endif // TREE_SITTER_LANGUAGE_H_ diff --git a/lib/src/length.h b/lib/src/length.h index db325f7a..ffe0c7f4 100644 --- a/lib/src/length.h +++ b/lib/src/length.h @@ -1,10 +1,10 @@ -#ifndef RUNTIME_LENGTH_H_ -#define RUNTIME_LENGTH_H_ +#ifndef TREE_SITTER_LENGTH_H_ +#define TREE_SITTER_LENGTH_H_ #include #include #include "./point.h" -#include "tree_sitter/runtime.h" +#include "tree_sitter/api.h" typedef struct { uint32_t bytes; diff --git a/lib/src/lexer.h b/lib/src/lexer.h index 327350f6..f523d88f 100644 --- a/lib/src/lexer.h +++ b/lib/src/lexer.h @@ -1,5 +1,5 @@ -#ifndef RUNTIME_LEXER_H_ -#define RUNTIME_LEXER_H_ +#ifndef TREE_SITTER_LEXER_H_ +#define TREE_SITTER_LEXER_H_ #ifdef __cplusplus extern "C" { @@ -7,7 +7,7 @@ extern "C" { #include "./length.h" #include "./subtree.h" -#include "tree_sitter/runtime.h" +#include "tree_sitter/api.h" #include "tree_sitter/parser.h" typedef struct { @@ -45,4 +45,4 @@ TSRange *ts_lexer_included_ranges(const Lexer *self, uint32_t *count); } #endif -#endif // RUNTIME_LEXER_H_ +#endif // TREE_SITTER_LEXER_H_ diff --git a/lib/src/runtime.c b/lib/src/lib.c similarity index 100% rename from lib/src/runtime.c rename to lib/src/lib.c diff --git a/lib/src/parser.c b/lib/src/parser.c index a33dbc6f..c2ebfeeb 100644 --- a/lib/src/parser.c +++ b/lib/src/parser.c @@ -2,7 +2,7 @@ #include #include #include -#include "tree_sitter/runtime.h" +#include "tree_sitter/api.h" #include "./subtree.h" #include "./lexer.h" #include "./length.h" diff --git a/lib/src/point.h b/lib/src/point.h index 0c4941d5..4d0aed18 100644 --- a/lib/src/point.h +++ b/lib/src/point.h @@ -1,7 +1,7 @@ -#ifndef RUNTIME_POINT_H_ -#define RUNTIME_POINT_H_ +#ifndef TREE_SITTER_POINT_H_ +#define TREE_SITTER_POINT_H_ -#include "tree_sitter/runtime.h" +#include "tree_sitter/api.h" #define POINT_MAX ((TSPoint) {UINT32_MAX, 
UINT32_MAX}) diff --git a/lib/src/reduce_action.h b/lib/src/reduce_action.h index 91835c39..5956fb5d 100644 --- a/lib/src/reduce_action.h +++ b/lib/src/reduce_action.h @@ -1,12 +1,12 @@ -#ifndef RUNTIME_REDUCE_ACTION_H_ -#define RUNTIME_REDUCE_ACTION_H_ +#ifndef TREE_SITTER_REDUCE_ACTION_H_ +#define TREE_SITTER_REDUCE_ACTION_H_ #ifdef __cplusplus extern "C" { #endif #include "./array.h" -#include "tree_sitter/runtime.h" +#include "tree_sitter/api.h" typedef struct { uint32_t count; @@ -31,4 +31,4 @@ static inline void ts_reduce_action_set_add(ReduceActionSet *self, } #endif -#endif // RUNTIME_REDUCE_ACTION_H_ +#endif // TREE_SITTER_REDUCE_ACTION_H_ diff --git a/lib/src/stack.h b/lib/src/stack.h index d476d763..1ccd98cd 100644 --- a/lib/src/stack.h +++ b/lib/src/stack.h @@ -1,5 +1,5 @@ -#ifndef RUNTIME_PARSE_STACK_H_ -#define RUNTIME_PARSE_STACK_H_ +#ifndef TREE_SITTER_PARSE_STACK_H_ +#define TREE_SITTER_PARSE_STACK_H_ #ifdef __cplusplus extern "C" { @@ -132,4 +132,4 @@ void ts_stack_iterate(Stack *, StackVersion, StackIterateCallback, void *); } #endif -#endif // RUNTIME_PARSE_STACK_H_ +#endif // TREE_SITTER_PARSE_STACK_H_ diff --git a/lib/src/subtree.h b/lib/src/subtree.h index cc5c79aa..039494b5 100644 --- a/lib/src/subtree.h +++ b/lib/src/subtree.h @@ -1,5 +1,5 @@ -#ifndef RUNTIME_SUBTREE_H_ -#define RUNTIME_SUBTREE_H_ +#ifndef TREE_SITTER_SUBTREE_H_ +#define TREE_SITTER_SUBTREE_H_ #ifdef __cplusplus extern "C" { @@ -10,7 +10,7 @@ extern "C" { #include "./length.h" #include "./array.h" #include "./error_costs.h" -#include "tree_sitter/runtime.h" +#include "tree_sitter/api.h" #include "tree_sitter/parser.h" extern TSStateId TS_TREE_STATE_NONE; @@ -285,4 +285,4 @@ static inline MutableSubtree ts_subtree_to_mut_unsafe(Subtree self) { } #endif -#endif // RUNTIME_SUBTREE_H_ +#endif // TREE_SITTER_SUBTREE_H_ diff --git a/lib/src/tree.c b/lib/src/tree.c index b729c8c7..9f294412 100644 --- a/lib/src/tree.c +++ b/lib/src/tree.c @@ -1,4 +1,4 @@ -#include 
"tree_sitter/runtime.h" +#include "tree_sitter/api.h" #include "./array.h" #include "./get_changed_ranges.h" #include "./subtree.h" diff --git a/lib/src/tree.h b/lib/src/tree.h index dd4f3184..92a7e641 100644 --- a/lib/src/tree.h +++ b/lib/src/tree.h @@ -1,5 +1,5 @@ -#ifndef RUNTIME_TREE_H_ -#define RUNTIME_TREE_H_ +#ifndef TREE_SITTER_TREE_H_ +#define TREE_SITTER_TREE_H_ #ifdef __cplusplus extern "C" { @@ -31,4 +31,4 @@ void ts_tree_set_cached_parent(const TSTree *, const TSNode *, const TSNode *); } #endif -#endif // RUNTIME_TREE_H_ +#endif // TREE_SITTER_TREE_H_ diff --git a/lib/src/tree_cursor.c b/lib/src/tree_cursor.c index d352c32b..5ccf4501 100644 --- a/lib/src/tree_cursor.c +++ b/lib/src/tree_cursor.c @@ -1,4 +1,4 @@ -#include "tree_sitter/runtime.h" +#include "tree_sitter/api.h" #include "./alloc.h" #include "./tree_cursor.h" #include "./language.h" diff --git a/lib/src/tree_cursor.h b/lib/src/tree_cursor.h index 6e46b7dd..55bdad86 100644 --- a/lib/src/tree_cursor.h +++ b/lib/src/tree_cursor.h @@ -1,5 +1,5 @@ -#ifndef RUNTIME_TREE_CURSOR_H_ -#define RUNTIME_TREE_CURSOR_H_ +#ifndef TREE_SITTER_TREE_CURSOR_H_ +#define TREE_SITTER_TREE_CURSOR_H_ #include "./subtree.h" @@ -17,4 +17,4 @@ typedef struct { void ts_tree_cursor_init(TreeCursor *, TSNode); -#endif // RUNTIME_TREE_CURSOR_H_ +#endif // TREE_SITTER_TREE_CURSOR_H_ diff --git a/lib/src/utf16.h b/lib/src/utf16.h index 0cf69218..32fd05e6 100644 --- a/lib/src/utf16.h +++ b/lib/src/utf16.h @@ -1,5 +1,5 @@ -#ifndef RUNTIME_UTF16_H_ -#define RUNTIME_UTF16_H_ +#ifndef TREE_SITTER_UTF16_H_ +#define TREE_SITTER_UTF16_H_ #ifdef __cplusplus extern "C" { @@ -18,4 +18,4 @@ utf8proc_ssize_t utf16_iterate(const utf8proc_uint8_t *, utf8proc_ssize_t, utf8p } #endif -#endif // RUNTIME_UTF16_H_ +#endif // TREE_SITTER_UTF16_H_ diff --git a/script/bindgen.sh b/script/bindgen.sh index 0a536d20..f9299095 100755 --- a/script/bindgen.sh +++ b/script/bindgen.sh @@ -1,7 +1,7 @@ #!/bin/bash output_path=lib/binding/bindings.rs 
-header_path='lib/include/tree_sitter/runtime.h' +header_path='lib/include/tree_sitter/api.h' bindgen \ --no-layout-tests \ diff --git a/script/build-runtime b/script/build-lib similarity index 93% rename from script/build-runtime rename to script/build-lib index 7b2e99f2..b81a4b0a 100755 --- a/script/build-runtime +++ b/script/build-lib @@ -14,7 +14,7 @@ ${CC} \ -I lib/src \ -I lib/include \ -I lib/utf8proc \ - lib/src/runtime.c \ + lib/src/lib.c \ -o tree-sitter.o ar rcs libtree-sitter.a tree-sitter.o diff --git a/test/fuzz/fuzzer.cc b/test/fuzz/fuzzer.cc index 2ed7683f..8d6f9cef 100644 --- a/test/fuzz/fuzzer.cc +++ b/test/fuzz/fuzzer.cc @@ -1,5 +1,5 @@ #include -#include "tree_sitter/runtime.h" +#include "tree_sitter/api.h" extern "C" const TSLanguage *TS_LANG(); From fab4673c14ba5fc066bf13e3408802be9d7b5657 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 10 Jan 2019 15:23:31 -0800 Subject: [PATCH 118/208] Get fixture grammars building for library unit tests --- .appveyor.yml | 2 +- .travis.yml | 2 +- lib/build.rs | 63 ++++++++++++++++++++++++++++------ script/fetch-fixtures | 15 ++++---- script/fetch-fixtures.cmd | 15 ++++---- script/fetch-test-fixtures.cmd | 16 --------- script/fetch-test-fixtures.sh | 14 -------- 7 files changed, 70 insertions(+), 57 deletions(-) delete mode 100755 script/fetch-test-fixtures.cmd delete mode 100755 script/fetch-test-fixtures.sh diff --git a/.appveyor.yml b/.appveyor.yml index 7dccb660..934d6f51 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -12,7 +12,7 @@ install: - rustc -vV - cargo -vV - - script\fetch-test-fixtures.cmd + - script\fetch-fixtures.cmd test_script: - cargo build diff --git a/.travis.yml b/.travis.yml index 65c021cf..45bc26e3 100644 --- a/.travis.yml +++ b/.travis.yml @@ -7,7 +7,7 @@ env: - TREE_SITTER_TEST=1 before_install: - - ./script/fetch-test-fixtures.sh + - ./script/fetch-fixtures branches: only: diff --git a/lib/build.rs b/lib/build.rs index cee131bd..f8c19f05 100644 --- a/lib/build.rs +++ 
b/lib/build.rs @@ -1,26 +1,67 @@ extern crate cc; use std::env; -use std::path::PathBuf; +use std::path::{Path, PathBuf}; +use std::fs; fn main() { let mut config = cc::Build::new(); - let src_path: PathBuf = ["src"].iter().collect(); - config .define("UTF8PROC_STATIC", "") .flag_if_supported("-std=c99") .flag_if_supported("-Wno-unused-parameter") .include("include") .include("utf8proc") - .file(src_path.join("runtime.c")); + .file(Path::new("src").join("lib.c")) + .compile("tree-sitter"); - if env::var("RUST_TREE_SITTER_TEST").is_ok() { - let parser_dir: PathBuf = ["fixtures", "tree-sitter-rust", "src"].iter().collect(); - config - .file(parser_dir.join("parser.c")) - .file(parser_dir.join("scanner.c")); + if env::var("TREE_SITTER_TEST").is_ok() { + let mut parser_config = cc::Build::new(); + parser_config + .opt_level(0) + .flag_if_supported("-Wno-unused-parameter"); + + let mut scanner_c_config = cc::Build::new(); + scanner_c_config + .flag_if_supported("-std=c99") + .flag_if_supported("-Wno-unused-parameter"); + + let mut scanner_cxx_config = cc::Build::new(); + scanner_cxx_config + .cpp(true) + .flag_if_supported("-Wno-unused-parameter"); + + let grammars_dir: PathBuf = ["..", "test", "fixtures", "grammars"].iter().collect(); + for entry in fs::read_dir(&grammars_dir).expect("Failed to list grammar directory") { + let entry = entry.expect("Failed to load grammars directory entry"); + if !entry.path().is_dir() { + continue; + } + let parser_dir_path = entry.path(); + let parser_src_path = parser_dir_path.join("src"); + let parser_c_path = parser_src_path.join("parser.c"); + let scanner_c_path = parser_src_path.join("scanner.c"); + let scanner_cc_path = parser_src_path.join("scanner.cc"); + + println!("cargo:rerun-if-changed={}", parser_c_path.to_str().unwrap()); + parser_config + .include(&parser_src_path) + .file(&parser_c_path); + if scanner_cc_path.exists() { + println!("cargo:rerun-if-changed={}", scanner_cc_path.to_str().unwrap()); + scanner_cxx_config + 
.include(&parser_src_path) + .file(&scanner_cc_path); + } else if scanner_c_path.exists() { + println!("cargo:rerun-if-changed={}", scanner_c_path.to_str().unwrap()); + scanner_c_config + .include(&parser_src_path) + .file(&scanner_c_path); + } + } + + parser_config.compile("fixture-parsers"); + scanner_c_config.compile("fixture-scanners-c"); + scanner_cxx_config.compile("fixture-scanners-cxx"); } - - config.compile("tree-sitter-runtime"); } diff --git a/script/fetch-fixtures b/script/fetch-fixtures index a298a4d0..94f9eddd 100755 --- a/script/fetch-fixtures +++ b/script/fetch-fixtures @@ -21,14 +21,15 @@ fetch_grammar() { ) } -fetch_grammar embedded-template master -fetch_grammar javascript master -fetch_grammar json master +fetch_grammar bash master fetch_grammar c master fetch_grammar cpp master -fetch_grammar python master +fetch_grammar embedded-template master fetch_grammar go master -fetch_grammar ruby master -fetch_grammar typescript master -fetch_grammar bash master fetch_grammar html master +fetch_grammar javascript master +fetch_grammar json master +fetch_grammar python master +fetch_grammar ruby master +fetch_grammar rust master +fetch_grammar typescript master diff --git a/script/fetch-fixtures.cmd b/script/fetch-fixtures.cmd index 0e65b0a0..98d5d578 100644 --- a/script/fetch-fixtures.cmd +++ b/script/fetch-fixtures.cmd @@ -1,16 +1,17 @@ @echo off -call:fetch_grammar embedded-template master -call:fetch_grammar javascript master -call:fetch_grammar json master +call:fetch_grammar bash master call:fetch_grammar c master call:fetch_grammar cpp master -call:fetch_grammar python master +call:fetch_grammar embedded-template master call:fetch_grammar go master -call:fetch_grammar ruby master -call:fetch_grammar typescript master -call:fetch_grammar bash master call:fetch_grammar html master +call:fetch_grammar javascript master +call:fetch_grammar json master +call:fetch_grammar python master +call:fetch_grammar ruby master +call:fetch_grammar rust master 
+call:fetch_grammar typescript master EXIT /B 0 :fetch_grammar diff --git a/script/fetch-test-fixtures.cmd b/script/fetch-test-fixtures.cmd deleted file mode 100755 index 33543961..00000000 --- a/script/fetch-test-fixtures.cmd +++ /dev/null @@ -1,16 +0,0 @@ -@Echo off -SETLOCAL - -Set grammar_dir=fixtures\tree-sitter-rust -Set grammar_url=https://github.com/tree-sitter/tree-sitter-rust - -@IF NOT EXIST %grammar_dir% ( - git clone %grammar_url% %grammar_dir% --depth=1 -) - -pushd %grammar_dir% -git fetch origin master --depth=1 -git reset --hard origin/master -popd - -ENDLOCAL diff --git a/script/fetch-test-fixtures.sh b/script/fetch-test-fixtures.sh deleted file mode 100755 index 24cc316a..00000000 --- a/script/fetch-test-fixtures.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -grammar_dir='fixtures/tree-sitter-rust' -grammar_url='https://github.com/tree-sitter/tree-sitter-rust' - -if [ ! -d $grammar_dir ]; then - git clone $grammar_url $grammar_dir --depth=1 -fi - -( - cd $grammar_dir; - git fetch origin master --depth=1 - git reset --hard origin/master; -) From 99531d757cc8b5c839a4d1605a438fc334a68bc2 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 10 Jan 2019 15:50:34 -0800 Subject: [PATCH 119/208] Configure caching on travis and appveyor --- .appveyor.yml | 3 ++- .travis.yml | 5 +++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/.appveyor.yml b/.appveyor.yml index 934d6f51..1d9fb179 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -23,5 +23,6 @@ branches: - master cache: - - test\fixtures + - target + - test\fixtures\grammars - C:\Users\appveyor\.cargo diff --git a/.travis.yml b/.travis.yml index 45bc26e3..47b88e81 100644 --- a/.travis.yml +++ b/.travis.yml @@ -12,3 +12,8 @@ before_install: branches: only: - master + +cache: + cargo: true + directories: + - test/fixtures/grammars From ae6dbb945b8f2c2b3fc3159dcd1a453457aa63fa Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 10 Jan 2019 15:50:54 -0800 Subject: [PATCH 120/208] 
Avoid using unix-specific methods on windows --- cli/src/util.rs | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/cli/src/util.rs b/cli/src/util.rs index d7d8572e..f36cbe79 100644 --- a/cli/src/util.rs +++ b/cli/src/util.rs @@ -4,8 +4,23 @@ use std::process::{Child, ChildStdin, Command, Stdio}; use std::str; use tree_sitter::Parser; +#[cfg(windows)] +pub(crate) struct LogSession(); + +#[cfg(windows)] +pub(crate) fn start_logging_graphs(parser: &mut Parser, path: &str) -> Result { + Ok(LogSession()) +} + +#[cfg(windows)] +pub(crate) fn stop_logging_graphs(parser: &mut Parser, mut session: LogSession) -> Result<()> { + Ok(()) +} + +#[cfg(unix)] pub(crate) struct LogSession(Child, ChildStdin); +#[cfg(unix)] pub(crate) fn start_logging_graphs(parser: &mut Parser, path: &str) -> Result { let mut dot_file = File::create(path)?; dot_file.write(b"\n\n\n")?; @@ -19,13 +34,20 @@ pub(crate) fn start_logging_graphs(parser: &mut Parser, path: &str) -> Result Result<()> { drop(session.1); - parser.stop_printing_dot_graphs(); + + if cfg!(unix) { + parser.stop_printing_dot_graphs(); + } + session.0.wait()?; if cfg!(target_os = "macos") { From 272046a2506ffba2676a38d5da557f471a44b827 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 10 Jan 2019 17:11:57 -0800 Subject: [PATCH 121/208] Reorganize tests - move them all into the CLI crate --- .../prepare_grammar/intern_symbols.rs | 2 +- .../prepare_grammar/process_inlines.rs | 2 +- cli/src/generate/properties.rs | 4 +- cli/src/main.rs | 3 + cli/src/tests/corpuses.rs | 57 ++ cli/src/tests/languages.rs | 19 + cli/src/tests/mod.rs | 3 + cli/src/tests/parser_api.rs | 507 +++++++++++++++++ lib/binding/lib.rs | 519 +----------------- lib/build.rs | 1 + 10 files changed, 595 insertions(+), 522 deletions(-) create mode 100644 cli/src/tests/corpuses.rs create mode 100644 cli/src/tests/languages.rs create mode 100644 cli/src/tests/mod.rs create mode 100644 cli/src/tests/parser_api.rs diff --git 
a/cli/src/generate/prepare_grammar/intern_symbols.rs b/cli/src/generate/prepare_grammar/intern_symbols.rs index a466935b..a7248817 100644 --- a/cli/src/generate/prepare_grammar/intern_symbols.rs +++ b/cli/src/generate/prepare_grammar/intern_symbols.rs @@ -219,7 +219,7 @@ mod tests { let result = intern_symbols(&build_grammar(vec![Variable::named("x", Rule::named("y"))])); match result { - Err(Error(message)) => assert_eq!(message, "Undefined symbol 'y'"), + Err(Error(message)) => assert_eq!(message, "Undefined symbol `y`"), _ => panic!("Expected an error but got none"), } } diff --git a/cli/src/generate/prepare_grammar/process_inlines.rs b/cli/src/generate/prepare_grammar/process_inlines.rs index 3c0f529a..f58de63d 100644 --- a/cli/src/generate/prepare_grammar/process_inlines.rs +++ b/cli/src/generate/prepare_grammar/process_inlines.rs @@ -408,7 +408,7 @@ mod tests { ProductionStep::new(Symbol::terminal(11)) .with_prec(2, None) .with_alias("inner_alias", true), - ProductionStep::new(Symbol::terminal(12)).with_prec(3, None), + ProductionStep::new(Symbol::terminal(12)), ], }], }, diff --git a/cli/src/generate/properties.rs b/cli/src/generate/properties.rs index 9e570a99..e3b60185 100644 --- a/cli/src/generate/properties.rs +++ b/cli/src/generate/properties.rs @@ -118,7 +118,7 @@ impl Builder { self.remove_duplicate_states(); for (i, state) in self.output.states.iter_mut().enumerate() { - state.id = i; + state.id = Some(i); } self.output @@ -130,7 +130,7 @@ impl Builder { Entry::Vacant(v) => { let state_id = self.output.states.len(); self.output.states.push(PropertyStateJSON { - id: 0, + id: None, transitions: Vec::new(), property_set_id: 0, default_next_state_id: 0, diff --git a/cli/src/main.rs b/cli/src/main.rs index 334f06ef..9f095668 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -17,6 +17,9 @@ mod parse; mod test; mod util; +#[cfg(test)] +mod tests; + use self::loader::Loader; use clap::{App, Arg, SubCommand}; use std::env; diff --git 
a/cli/src/tests/corpuses.rs b/cli/src/tests/corpuses.rs new file mode 100644 index 00000000..b70bb371 --- /dev/null +++ b/cli/src/tests/corpuses.rs @@ -0,0 +1,57 @@ +use super::languages; +use crate::test::{parse_tests, TestEntry}; +use std::path::PathBuf; +use tree_sitter::{Language, Parser}; + +lazy_static! { + static ref LANGUAGES: [(&'static str, Language); 6] = [ + ("c", languages::c()), + ("cpp", languages::cpp()), + ("embedded-template", languages::embedded_template()), + ("go", languages::go()), + ("html", languages::html()), + ("javascript", languages::javascript()), + ]; +} + +#[test] +fn test_corpus_files() { + let mut parser = Parser::new(); + let grammars_dir: PathBuf = [ + env!("CARGO_MANIFEST_DIR"), + "..", + "test", + "fixtures", + "grammars", + ] + .iter() + .collect(); + + for (name, language) in LANGUAGES.iter().cloned() { + let corpus_dir = grammars_dir.join(name).join("corpus"); + let test = parse_tests(&corpus_dir).unwrap(); + parser.set_language(language).unwrap(); + run_mutation_tests(&mut parser, test); + } +} + +fn run_mutation_tests(parser: &mut Parser, test: TestEntry) { + match test { + TestEntry::Example { + name, + input, + output, + } => { + let tree = parser + .parse_utf8(&mut |byte_offset, _| &input[byte_offset..], None) + .unwrap(); + let actual = tree.root_node().to_sexp(); + assert_eq!(actual, output); + } + TestEntry::Group { name, children } => { + for child in children { + run_mutation_tests(parser, child); + } + } + } +} diff --git a/cli/src/tests/languages.rs b/cli/src/tests/languages.rs new file mode 100644 index 00000000..0c483d08 --- /dev/null +++ b/cli/src/tests/languages.rs @@ -0,0 +1,19 @@ +use tree_sitter::Language; + +extern "C" { + fn tree_sitter_c() -> Language; + fn tree_sitter_cpp() -> Language; + fn tree_sitter_embedded_template() -> Language; + fn tree_sitter_go() -> Language; + fn tree_sitter_html() -> Language; + fn tree_sitter_javascript() -> Language; + fn tree_sitter_rust() -> Language; +} + +pub fn c() 
-> Language { unsafe { tree_sitter_c() } } +pub fn cpp() -> Language { unsafe { tree_sitter_cpp() } } +pub fn embedded_template() -> Language { unsafe { tree_sitter_embedded_template() } } +pub fn go() -> Language { unsafe { tree_sitter_go() } } +pub fn html() -> Language { unsafe { tree_sitter_html() } } +pub fn javascript() -> Language { unsafe { tree_sitter_javascript() } } +pub fn rust() -> Language { unsafe { tree_sitter_rust() } } diff --git a/cli/src/tests/mod.rs b/cli/src/tests/mod.rs new file mode 100644 index 00000000..bc199616 --- /dev/null +++ b/cli/src/tests/mod.rs @@ -0,0 +1,3 @@ +mod languages; +mod corpuses; +mod parser_api; diff --git a/cli/src/tests/parser_api.rs b/cli/src/tests/parser_api.rs new file mode 100644 index 00000000..af5ba71f --- /dev/null +++ b/cli/src/tests/parser_api.rs @@ -0,0 +1,507 @@ +use super::languages::rust; +use std::thread; +use tree_sitter::{InputEdit, LogType, Parser, Point, PropertySheet, Range}; + +#[test] +fn test_basic_parsing() { + let mut parser = Parser::new(); + parser.set_language(rust()).unwrap(); + + let tree = parser + .parse_str( + " + struct Stuff {} + fn main() {} + ", + None, + ) + .unwrap(); + + let root_node = tree.root_node(); + assert_eq!(root_node.kind(), "source_file"); + + assert_eq!( + root_node.to_sexp(), + "(source_file (struct_item (type_identifier) (field_declaration_list)) (function_item (identifier) (parameters) (block)))" + ); + + let struct_node = root_node.child(0).unwrap(); + assert_eq!(struct_node.kind(), "struct_item"); +} + +#[test] +fn test_logging() { + let mut parser = Parser::new(); + parser.set_language(rust()).unwrap(); + + let mut messages = Vec::new(); + parser.set_logger(Some(Box::new(|log_type, message| { + messages.push((log_type, message.to_string())); + }))); + + parser + .parse_str( + " + struct Stuff {} + fn main() {} + ", + None, + ) + .unwrap(); + + assert!(messages.contains(&( + LogType::Parse, + "reduce sym:struct_item, child_count:3".to_string() + ))); + 
assert!(messages.contains(&(LogType::Lex, "skip character:' '".to_string()))); +} + +#[test] +fn test_tree_cursor() { + let mut parser = Parser::new(); + parser.set_language(rust()).unwrap(); + + let tree = parser + .parse_str( + " + struct Stuff { + a: A; + b: Option, + } + ", + None, + ) + .unwrap(); + + let mut cursor = tree.walk(); + assert_eq!(cursor.node().kind(), "source_file"); + + assert!(cursor.goto_first_child()); + assert_eq!(cursor.node().kind(), "struct_item"); + + assert!(cursor.goto_first_child()); + assert_eq!(cursor.node().kind(), "struct"); + assert_eq!(cursor.node().is_named(), false); + + assert!(cursor.goto_next_sibling()); + assert_eq!(cursor.node().kind(), "type_identifier"); + assert_eq!(cursor.node().is_named(), true); + + assert!(cursor.goto_next_sibling()); + assert_eq!(cursor.node().kind(), "field_declaration_list"); + assert_eq!(cursor.node().is_named(), true); +} + +#[test] +fn test_tree_property_matching() { + let mut parser = Parser::new(); + parser.set_language(rust()).unwrap(); + let source_code = "fn f1() { f2(); }"; + let tree = parser.parse_str(source_code, None).unwrap(); + + #[derive(Debug, Deserialize, PartialEq, Eq)] + struct Properties { + reference: Option, + define: Option, + } + + let empty_properties = Properties { + reference: None, + define: None, + }; + + let property_sheet = PropertySheet::::new( + rust(), + r##" + { + "states": [ + { + "transitions": [ + {"type": "call_expression", "named": true, "state_id": 1}, + {"type": "function_item", "named": true, "state_id": 2} + ], + "default_next_state_id": 0, + "property_set_id": 0 + }, + { + "transitions": [ + {"type": "identifier", "named": true, "state_id": 3} + ], + "default_next_state_id": 0, + "property_set_id": 0 + }, + { + "transitions": [ + {"type": "identifier", "named": true, "state_id": 4} + ], + "default_next_state_id": 0, + "property_set_id": 0 + }, + { + "transitions": [], + "default_next_state_id": 0, + "property_set_id": 1 + }, + { + "transitions": [], 
+ "default_next_state_id": 0, + "property_set_id": 2 + } + ], + "property_sets": [ + {}, + {"reference": "function"}, + {"define": "function"} + ] + } + "##, + ) + .unwrap(); + + let mut cursor = tree.walk_with_properties(&property_sheet, source_code); + assert_eq!(cursor.node().kind(), "source_file"); + assert_eq!(*cursor.node_properties(), empty_properties); + + assert!(cursor.goto_first_child()); + assert_eq!(cursor.node().kind(), "function_item"); + assert_eq!(*cursor.node_properties(), empty_properties); + + assert!(cursor.goto_first_child()); + assert_eq!(cursor.node().kind(), "fn"); + assert_eq!(*cursor.node_properties(), empty_properties); + assert!(!cursor.goto_first_child()); + + assert!(cursor.goto_next_sibling()); + assert_eq!(cursor.node().kind(), "identifier"); + assert_eq!(cursor.node_properties().define, Some("function".to_owned())); + assert!(!cursor.goto_first_child()); + + assert!(cursor.goto_next_sibling()); + assert_eq!(cursor.node().kind(), "parameters"); + assert_eq!(*cursor.node_properties(), empty_properties); + + assert!(cursor.goto_first_child()); + assert_eq!(cursor.node().kind(), "("); + assert!(cursor.goto_next_sibling()); + assert_eq!(cursor.node().kind(), ")"); + assert_eq!(*cursor.node_properties(), empty_properties); + + assert!(cursor.goto_parent()); + assert!(cursor.goto_next_sibling()); + assert_eq!(cursor.node().kind(), "block"); + assert_eq!(*cursor.node_properties(), empty_properties); + + assert!(cursor.goto_first_child()); + assert!(cursor.goto_next_sibling()); + assert_eq!(cursor.node().kind(), "call_expression"); + assert_eq!(*cursor.node_properties(), empty_properties); + + assert!(cursor.goto_first_child()); + assert_eq!(cursor.node().kind(), "identifier"); + assert_eq!( + cursor.node_properties().reference, + Some("function".to_owned()) + ); +} + +#[test] +fn test_tree_property_matching_with_regexes() { + let mut parser = Parser::new(); + parser.set_language(rust()).unwrap(); + let source_code = "fn f1() { None(a()) 
}"; + let tree = parser.parse_str(source_code, None).unwrap(); + + #[derive(Debug, Deserialize, PartialEq, Eq)] + struct Properties { + scope: Option, + } + + let empty_properties = Properties { scope: None }; + + let property_sheet = PropertySheet::::new( + rust(), + r##" + { + "states": [ + { + "id": 0, + "transitions": [ + {"type": "call_expression", "named": true, "state_id": 1} + ], + "default_next_state_id": 0, + "property_set_id": 0 + }, + { + "id": 1, + "transitions": [ + {"type": "identifier", "named": true, "text": "^[A-Z]", "state_id": 2}, + {"type": "identifier", "named": true, "state_id": 3} + ], + "default_next_state_id": 0, + "property_set_id": 0 + }, + { + "transitions": [], + "default_next_state_id": 0, + "property_set_id": 1 + }, + { + "transitions": [], + "default_next_state_id": 0, + "property_set_id": 2 + } + ], + "property_sets": [ + {}, + {"scope": "constructor"}, + {"scope": "function"} + ] + } + "##, + ) + .unwrap(); + + let mut cursor = tree.walk_with_properties(&property_sheet, source_code); + assert_eq!(cursor.node().kind(), "source_file"); + assert_eq!(*cursor.node_properties(), empty_properties); + + cursor.goto_first_child(); + assert!(cursor.goto_first_child()); + assert!(cursor.goto_next_sibling()); + assert!(cursor.goto_next_sibling()); + assert!(cursor.goto_next_sibling()); + assert_eq!(cursor.node().kind(), "block"); + assert_eq!(*cursor.node_properties(), empty_properties); + + assert!(cursor.goto_first_child()); + assert!(cursor.goto_next_sibling()); + assert_eq!(cursor.node().kind(), "call_expression"); + assert_eq!(*cursor.node_properties(), empty_properties); + + assert!(cursor.goto_first_child()); + assert_eq!(cursor.node().kind(), "identifier"); + assert_eq!( + cursor.node_properties().scope, + Some("constructor".to_owned()) + ); +} + +#[test] +fn test_custom_utf8_input() { + let mut parser = Parser::new(); + parser.set_language(rust()).unwrap(); + + let lines = &["pub fn foo() {", " 1", "}"]; + + let tree = parser + 
.parse_utf8( + &mut |_, position| { + let row = position.row; + let column = position.column; + if row < lines.len() { + if column < lines[row].as_bytes().len() { + &lines[row].as_bytes()[column..] + } else { + "\n".as_bytes() + } + } else { + &[] + } + }, + None, + ) + .unwrap(); + + let root = tree.root_node(); + assert_eq!(root.to_sexp(), "(source_file (function_item (visibility_modifier) (identifier) (parameters) (block (integer_literal))))"); + assert_eq!(root.kind(), "source_file"); + assert_eq!(root.has_error(), false); + assert_eq!(root.child(0).unwrap().kind(), "function_item"); +} + +#[test] +fn test_custom_utf16_input() { + let mut parser = Parser::new(); + parser.set_language(rust()).unwrap(); + + parser.set_logger(Some(Box::new(|t, message| { + println!("log: {:?} {}", t, message); + }))); + + let lines: Vec> = ["pub fn foo() {", " 1", "}"] + .iter() + .map(|s| s.encode_utf16().collect()) + .collect(); + + let tree = parser + .parse_utf16( + &mut |_, position| { + let row = position.row; + let column = position.column; + if row < lines.len() { + if column < lines[row].len() { + &lines[row][column..] 
+ } else { + &[10] + } + } else { + &[] + } + }, + None, + ) + .unwrap(); + + let root = tree.root_node(); + assert_eq!(root.to_sexp(), "(source_file (function_item (visibility_modifier) (identifier) (parameters) (block (integer_literal))))"); + assert_eq!(root.kind(), "source_file"); + assert_eq!(root.has_error(), false); + assert_eq!(root.child(0).unwrap().kind(), "function_item"); +} + +#[test] +fn test_node_equality() { + let mut parser = Parser::new(); + parser.set_language(rust()).unwrap(); + let tree = parser.parse_str("struct A {}", None).unwrap(); + let node1 = tree.root_node(); + let node2 = tree.root_node(); + assert_eq!(node1, node2); + assert_eq!(node1.child(0).unwrap(), node2.child(0).unwrap()); + assert_ne!(node1.child(0).unwrap(), node2); +} + +#[test] +fn test_editing() { + let mut parser = Parser::new(); + parser.set_language(rust()).unwrap(); + + let mut input_bytes = "fn test(a: A, c: C) {}".as_bytes(); + let mut input_bytes_read = Vec::new(); + + let mut tree = parser + .parse_utf8( + &mut |offset, _| { + let offset = offset; + if offset < input_bytes.len() { + let result = &input_bytes[offset..offset + 1]; + input_bytes_read.extend(result.iter()); + result + } else { + &[] + } + }, + None, + ) + .unwrap(); + + let parameters_sexp = tree + .root_node() + .named_child(0) + .unwrap() + .named_child(1) + .unwrap() + .to_sexp(); + assert_eq!( + parameters_sexp, + "(parameters (parameter (identifier) (type_identifier)) (parameter (identifier) (type_identifier)))" + ); + + input_bytes_read.clear(); + input_bytes = "fn test(a: A, b: B, c: C) {}".as_bytes(); + tree.edit(&InputEdit { + start_byte: 14, + old_end_byte: 14, + new_end_byte: 20, + start_position: Point::new(0, 14), + old_end_position: Point::new(0, 14), + new_end_position: Point::new(0, 20), + }); + + let tree = parser + .parse_utf8( + &mut |offset, _| { + let offset = offset; + if offset < input_bytes.len() { + let result = &input_bytes[offset..offset + 1]; + 
input_bytes_read.extend(result.iter()); + result + } else { + &[] + } + }, + Some(&tree), + ) + .unwrap(); + + let parameters_sexp = tree + .root_node() + .named_child(0) + .unwrap() + .named_child(1) + .unwrap() + .to_sexp(); + assert_eq!( + parameters_sexp, + "(parameters (parameter (identifier) (type_identifier)) (parameter (identifier) (type_identifier)) (parameter (identifier) (type_identifier)))" + ); + + let retokenized_content = String::from_utf8(input_bytes_read).unwrap(); + assert!(retokenized_content.contains("b: B")); + assert!(!retokenized_content.contains("a: A")); + assert!(!retokenized_content.contains("c: C")); + assert!(!retokenized_content.contains("{}")); +} + +#[test] +fn test_parallel_parsing() { + // Parse this source file so that each thread has a non-trivial amount of + // work to do. + let this_file_source = include_str!("parser_api.rs"); + + let mut parser = Parser::new(); + parser.set_language(rust()).unwrap(); + let tree = parser.parse_str(this_file_source, None).unwrap(); + + let mut parse_threads = Vec::new(); + for thread_id in 1..5 { + let mut tree_clone = tree.clone(); + parse_threads.push(thread::spawn(move || { + // For each thread, prepend a different number of declarations to the + // source code. + let mut prepend_line_count = 0; + let mut prepended_source = String::new(); + for _ in 0..thread_id { + prepend_line_count += 2; + prepended_source += "struct X {}\n\n"; + } + + tree_clone.edit(&InputEdit { + start_byte: 0, + old_end_byte: 0, + new_end_byte: prepended_source.len(), + start_position: Point::new(0, 0), + old_end_position: Point::new(0, 0), + new_end_position: Point::new(prepend_line_count, 0), + }); + prepended_source += this_file_source; + + // Reparse using the old tree as a starting point. 
+ let mut parser = Parser::new(); + parser.set_language(rust()).unwrap(); + parser + .parse_str(&prepended_source, Some(&tree_clone)) + .unwrap() + })); + } + + // Check that the trees have the expected relationship to one another. + let trees = parse_threads + .into_iter() + .map(|thread| thread.join().unwrap()); + let child_count_differences = trees + .map(|t| t.root_node().child_count() - tree.root_node().child_count()) + .collect::>(); + + assert_eq!(child_count_differences, &[1, 2, 3, 4]); +} diff --git a/lib/binding/lib.rs b/lib/binding/lib.rs index 88cc24be..08f863f8 100644 --- a/lib/binding/lib.rs +++ b/lib/binding/lib.rs @@ -94,7 +94,7 @@ pub struct PropertyTransitionJSON { #[derive(Debug, Deserialize, Serialize, PartialEq, Eq)] pub struct PropertyStateJSON { - pub id: usize, + pub id: Option, pub property_set_id: usize, pub transitions: Vec, pub default_next_state_id: usize, @@ -847,520 +847,3 @@ impl

PropertySheet

{ }) } } - -#[cfg(test)] -mod tests { - use super::*; - use std::thread; - - fn rust() -> Language { - unsafe { tree_sitter_rust() } - } - extern "C" { - fn tree_sitter_rust() -> Language; - } - - #[test] - fn test_basic_parsing() { - let mut parser = Parser::new(); - parser.set_language(rust()).unwrap(); - - let tree = parser - .parse_str( - " - struct Stuff {} - fn main() {} - ", - None, - ) - .unwrap(); - - let root_node = tree.root_node(); - assert_eq!(root_node.kind(), "source_file"); - - assert_eq!( - root_node.to_sexp(), - "(source_file (struct_item (type_identifier) (field_declaration_list)) (function_item (identifier) (parameters) (block)))" - ); - - let struct_node = root_node.child(0).unwrap(); - assert_eq!(struct_node.kind(), "struct_item"); - } - - #[test] - fn test_logging() { - let mut parser = Parser::new(); - parser.set_language(rust()).unwrap(); - - let mut messages = Vec::new(); - parser.set_logger(Some(Box::new(|log_type, message| { - messages.push((log_type, message.to_string())); - }))); - - parser - .parse_str( - " - struct Stuff {} - fn main() {} - ", - None, - ) - .unwrap(); - - assert!(messages.contains(&( - LogType::Parse, - "reduce sym:struct_item, child_count:3".to_string() - ))); - assert!(messages.contains(&(LogType::Lex, "skip character:' '".to_string()))); - } - - #[test] - fn test_tree_cursor() { - let mut parser = Parser::new(); - parser.set_language(rust()).unwrap(); - - let tree = parser - .parse_str( - " - struct Stuff { - a: A; - b: Option, - } - ", - None, - ) - .unwrap(); - - let mut cursor = tree.walk(); - assert_eq!(cursor.node().kind(), "source_file"); - - assert!(cursor.goto_first_child()); - assert_eq!(cursor.node().kind(), "struct_item"); - - assert!(cursor.goto_first_child()); - assert_eq!(cursor.node().kind(), "struct"); - assert_eq!(cursor.node().is_named(), false); - - assert!(cursor.goto_next_sibling()); - assert_eq!(cursor.node().kind(), "type_identifier"); - assert_eq!(cursor.node().is_named(), true); - - 
assert!(cursor.goto_next_sibling()); - assert_eq!(cursor.node().kind(), "field_declaration_list"); - assert_eq!(cursor.node().is_named(), true); - } - - #[test] - fn test_tree_property_matching() { - let mut parser = Parser::new(); - parser.set_language(rust()).unwrap(); - let source_code = "fn f1() { f2(); }"; - let tree = parser.parse_str(source_code, None).unwrap(); - - #[derive(Debug, Deserialize, PartialEq, Eq)] - struct Properties { - reference: Option, - define: Option, - } - - let empty_properties = Properties { - reference: None, - define: None, - }; - - let property_sheet = PropertySheet::::new( - rust(), - r##" - { - "states": [ - { - "transitions": [ - {"type": "call_expression", "named": true, "state_id": 1}, - {"type": "function_item", "named": true, "state_id": 2} - ], - "default_next_state_id": 0, - "property_set_id": 0 - }, - { - "transitions": [ - {"type": "identifier", "named": true, "state_id": 3} - ], - "default_next_state_id": 0, - "property_set_id": 0 - }, - { - "transitions": [ - {"type": "identifier", "named": true, "state_id": 4} - ], - "default_next_state_id": 0, - "property_set_id": 0 - }, - { - "transitions": [], - "default_next_state_id": 0, - "property_set_id": 1 - }, - { - "transitions": [], - "default_next_state_id": 0, - "property_set_id": 2 - } - ], - "property_sets": [ - {}, - {"reference": "function"}, - {"define": "function"} - ] - } - "##, - ) - .unwrap(); - - let mut cursor = tree.walk_with_properties(&property_sheet, source_code); - assert_eq!(cursor.node().kind(), "source_file"); - assert_eq!(*cursor.node_properties(), empty_properties); - - assert!(cursor.goto_first_child()); - assert_eq!(cursor.node().kind(), "function_item"); - assert_eq!(*cursor.node_properties(), empty_properties); - - assert!(cursor.goto_first_child()); - assert_eq!(cursor.node().kind(), "fn"); - assert_eq!(*cursor.node_properties(), empty_properties); - assert!(!cursor.goto_first_child()); - - assert!(cursor.goto_next_sibling()); - 
assert_eq!(cursor.node().kind(), "identifier"); - assert_eq!(cursor.node_properties().define, Some("function".to_owned())); - assert!(!cursor.goto_first_child()); - - assert!(cursor.goto_next_sibling()); - assert_eq!(cursor.node().kind(), "parameters"); - assert_eq!(*cursor.node_properties(), empty_properties); - - assert!(cursor.goto_first_child()); - assert_eq!(cursor.node().kind(), "("); - assert!(cursor.goto_next_sibling()); - assert_eq!(cursor.node().kind(), ")"); - assert_eq!(*cursor.node_properties(), empty_properties); - - assert!(cursor.goto_parent()); - assert!(cursor.goto_next_sibling()); - assert_eq!(cursor.node().kind(), "block"); - assert_eq!(*cursor.node_properties(), empty_properties); - - assert!(cursor.goto_first_child()); - assert!(cursor.goto_next_sibling()); - assert_eq!(cursor.node().kind(), "call_expression"); - assert_eq!(*cursor.node_properties(), empty_properties); - - assert!(cursor.goto_first_child()); - assert_eq!(cursor.node().kind(), "identifier"); - assert_eq!( - cursor.node_properties().reference, - Some("function".to_owned()) - ); - } - - #[test] - fn test_tree_property_matching_with_regexes() { - let mut parser = Parser::new(); - parser.set_language(rust()).unwrap(); - let source_code = "fn f1() { None(a()) }"; - let tree = parser.parse_str(source_code, None).unwrap(); - - #[derive(Debug, Deserialize, PartialEq, Eq)] - struct Properties { - scope: Option, - } - - let empty_properties = Properties { scope: None }; - - let property_sheet = PropertySheet::::new( - rust(), - r##" - { - "states": [ - { - "id": 0, - "transitions": [ - {"type": "call_expression", "named": true, "state_id": 1} - ], - "default_next_state_id": 0, - "property_set_id": 0 - }, - { - "id": 1, - "transitions": [ - {"type": "identifier", "named": true, "text": "^[A-Z]", "state_id": 2}, - {"type": "identifier", "named": true, "state_id": 3} - ], - "default_next_state_id": 0, - "property_set_id": 0 - }, - { - "transitions": [], - "default_next_state_id": 0, - 
"property_set_id": 1 - }, - { - "transitions": [], - "default_next_state_id": 0, - "property_set_id": 2 - } - ], - "property_sets": [ - {}, - {"scope": "constructor"}, - {"scope": "function"} - ] - } - "##, - ) - .unwrap(); - - let mut cursor = tree.walk_with_properties(&property_sheet, source_code); - assert_eq!(cursor.node().kind(), "source_file"); - assert_eq!(*cursor.node_properties(), empty_properties); - - cursor.goto_first_child(); - assert!(cursor.goto_first_child()); - assert!(cursor.goto_next_sibling()); - assert!(cursor.goto_next_sibling()); - assert!(cursor.goto_next_sibling()); - assert_eq!(cursor.node().kind(), "block"); - assert_eq!(*cursor.node_properties(), empty_properties); - - assert!(cursor.goto_first_child()); - assert!(cursor.goto_next_sibling()); - assert_eq!(cursor.node().kind(), "call_expression"); - assert_eq!(*cursor.node_properties(), empty_properties); - - assert!(cursor.goto_first_child()); - assert_eq!(cursor.node().kind(), "identifier"); - assert_eq!( - cursor.node_properties().scope, - Some("constructor".to_owned()) - ); - } - - #[test] - fn test_custom_utf8_input() { - let mut parser = Parser::new(); - parser.set_language(rust()).unwrap(); - - let lines = &["pub fn foo() {", " 1", "}"]; - - let tree = parser - .parse_utf8( - &mut |_, position| { - let row = position.row; - let column = position.column; - if row < lines.len() { - if column < lines[row].as_bytes().len() { - &lines[row].as_bytes()[column..] 
- } else { - "\n".as_bytes() - } - } else { - &[] - } - }, - None, - ) - .unwrap(); - - let root = tree.root_node(); - assert_eq!(root.to_sexp(), "(source_file (function_item (visibility_modifier) (identifier) (parameters) (block (integer_literal))))"); - assert_eq!(root.kind(), "source_file"); - assert_eq!(root.has_error(), false); - assert_eq!(root.child(0).unwrap().kind(), "function_item"); - } - - #[test] - fn test_custom_utf16_input() { - let mut parser = Parser::new(); - parser.set_language(rust()).unwrap(); - - parser.set_logger(Some(Box::new(|t, message| { - println!("log: {:?} {}", t, message); - }))); - - let lines: Vec> = ["pub fn foo() {", " 1", "}"] - .iter() - .map(|s| s.encode_utf16().collect()) - .collect(); - - let tree = parser - .parse_utf16( - &mut |_, position| { - let row = position.row; - let column = position.column; - if row < lines.len() { - if column < lines[row].len() { - &lines[row][column..] - } else { - &[10] - } - } else { - &[] - } - }, - None, - ) - .unwrap(); - - let root = tree.root_node(); - assert_eq!(root.to_sexp(), "(source_file (function_item (visibility_modifier) (identifier) (parameters) (block (integer_literal))))"); - assert_eq!(root.kind(), "source_file"); - assert_eq!(root.has_error(), false); - assert_eq!(root.child(0).unwrap().kind(), "function_item"); - } - - #[test] - fn test_node_equality() { - let mut parser = Parser::new(); - parser.set_language(rust()).unwrap(); - let tree = parser.parse_str("struct A {}", None).unwrap(); - let node1 = tree.root_node(); - let node2 = tree.root_node(); - assert_eq!(node1, node2); - assert_eq!(node1.child(0).unwrap(), node2.child(0).unwrap()); - assert_ne!(node1.child(0).unwrap(), node2); - } - - #[test] - fn test_editing() { - let mut parser = Parser::new(); - parser.set_language(rust()).unwrap(); - - let mut input_bytes = "fn test(a: A, c: C) {}".as_bytes(); - let mut input_bytes_read = Vec::new(); - - let mut tree = parser - .parse_utf8( - &mut |offset, _| { - let offset = 
offset; - if offset < input_bytes.len() { - let result = &input_bytes[offset..offset + 1]; - input_bytes_read.extend(result.iter()); - result - } else { - &[] - } - }, - None, - ) - .unwrap(); - - let parameters_sexp = tree - .root_node() - .named_child(0) - .unwrap() - .named_child(1) - .unwrap() - .to_sexp(); - assert_eq!( - parameters_sexp, - "(parameters (parameter (identifier) (type_identifier)) (parameter (identifier) (type_identifier)))" - ); - - input_bytes_read.clear(); - input_bytes = "fn test(a: A, b: B, c: C) {}".as_bytes(); - tree.edit(&InputEdit { - start_byte: 14, - old_end_byte: 14, - new_end_byte: 20, - start_position: Point::new(0, 14), - old_end_position: Point::new(0, 14), - new_end_position: Point::new(0, 20), - }); - - let tree = parser - .parse_utf8( - &mut |offset, _| { - let offset = offset; - if offset < input_bytes.len() { - let result = &input_bytes[offset..offset + 1]; - input_bytes_read.extend(result.iter()); - result - } else { - &[] - } - }, - Some(&tree), - ) - .unwrap(); - - let parameters_sexp = tree - .root_node() - .named_child(0) - .unwrap() - .named_child(1) - .unwrap() - .to_sexp(); - assert_eq!( - parameters_sexp, - "(parameters (parameter (identifier) (type_identifier)) (parameter (identifier) (type_identifier)) (parameter (identifier) (type_identifier)))" - ); - - let retokenized_content = String::from_utf8(input_bytes_read).unwrap(); - assert!(retokenized_content.contains("b: B")); - assert!(!retokenized_content.contains("a: A")); - assert!(!retokenized_content.contains("c: C")); - assert!(!retokenized_content.contains("{}")); - } - - #[test] - fn test_parallel_parsing() { - // Parse this source file so that each thread has a non-trivial amount of - // work to do. 
- let this_file_source = include_str!("lib.rs"); - - let mut parser = Parser::new(); - parser.set_language(rust()).unwrap(); - let tree = parser.parse_str(this_file_source, None).unwrap(); - - let mut parse_threads = Vec::new(); - for thread_id in 1..5 { - let mut tree_clone = tree.clone(); - parse_threads.push(thread::spawn(move || { - // For each thread, prepend a different number of declarations to the - // source code. - let mut prepend_line_count = 0; - let mut prepended_source = String::new(); - for _ in 0..thread_id { - prepend_line_count += 2; - prepended_source += "struct X {}\n\n"; - } - - tree_clone.edit(&InputEdit { - start_byte: 0, - old_end_byte: 0, - new_end_byte: prepended_source.len(), - start_position: Point::new(0, 0), - old_end_position: Point::new(0, 0), - new_end_position: Point::new(prepend_line_count, 0), - }); - prepended_source += this_file_source; - - // Reparse using the old tree as a starting point. - let mut parser = Parser::new(); - parser.set_language(rust()).unwrap(); - parser - .parse_str(&prepended_source, Some(&tree_clone)) - .unwrap() - })); - } - - // Check that the trees have the expected relationship to one another. 
- let trees = parse_threads - .into_iter() - .map(|thread| thread.join().unwrap()); - let child_count_differences = trees - .map(|t| t.root_node().child_count() - tree.root_node().child_count()) - .collect::>(); - - assert_eq!(child_count_differences, &[1, 2, 3, 4]); - } -} diff --git a/lib/build.rs b/lib/build.rs index f8c19f05..7e8714ef 100644 --- a/lib/build.rs +++ b/lib/build.rs @@ -46,6 +46,7 @@ fn main() { println!("cargo:rerun-if-changed={}", parser_c_path.to_str().unwrap()); parser_config .include(&parser_src_path) + .opt_level(0) .file(&parser_c_path); if scanner_cc_path.exists() { println!("cargo:rerun-if-changed={}", scanner_cc_path.to_str().unwrap()); From cffe80bfaddde02aa7ac1a6e4e9c698da00f9602 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 11 Jan 2019 09:48:19 -0800 Subject: [PATCH 122/208] Fix tie-breaking via cascade ordering in property sheets --- cli/src/generate/properties.rs | 43 ++++++++++++++-------------------- 1 file changed, 17 insertions(+), 26 deletions(-) diff --git a/cli/src/generate/properties.rs b/cli/src/generate/properties.rs index e3b60185..cca7fef8 100644 --- a/cli/src/generate/properties.rs +++ b/cli/src/generate/properties.rs @@ -157,7 +157,7 @@ impl Builder { } fn populate_state(&mut self, item_set: ItemSet, state_id: StateId) { - let mut transition_map: HashSet = HashSet::new(); + let mut transition_map: HashSet<(PropertyTransitionJSON, u32)> = HashSet::new(); let mut selector_matches = Vec::new(); // First, compute all of the possible state transition predicates for @@ -170,13 +170,18 @@ impl Builder { // If this item has more elements remaining in its selector, then // add a state transition based on the next step. 
if let Some(step) = next_step { - transition_map.insert(PropertyTransitionJSON { - kind: step.kind.clone(), - named: step.is_named, - index: step.child_index, - text: step.text_pattern.clone(), - state_id: 0, - }); + transition_map.insert(( + PropertyTransitionJSON { + kind: step.kind.clone(), + named: step.is_named, + index: step.child_index, + text: step.text_pattern.clone(), + state_id: 0, + }, + + // Include the rule id so that it can be used when sorting transitions. + item.rule_id, + )); } // If the item has matched its entire selector, then the item's // properties are applicable to this state. @@ -192,9 +197,8 @@ impl Builder { // destination state. let mut transition_list: Vec<(PropertyTransitionJSON, u32)> = transition_map .into_iter() - .map(|mut transition| { + .map(|(mut transition, rule_id)| { let mut next_item_set = ItemSet::new(); - let mut latest_matching_rule_id = 0; for item in &item_set { let rule = &self.rules[item.rule_id as usize]; let selector = &rule.selectors[item.selector_id as usize]; @@ -205,21 +209,11 @@ impl Builder { // advance the item to the next part of its selector and add the // resulting item to this transition's destination state. if step_matches_transition(step, &transition) { - let next_item = Item { + next_item_set.insert(Item { rule_id: item.rule_id, selector_id: item.selector_id, step_id: item.step_id + 1, - }; - - next_item_set.insert(next_item); - - // If the next item is at the end of its selector, record its rule id - // so that the rule id can be used when sorting this state's transitions. 
- if selector.0.get(item.step_id as usize + 1).is_none() - && item.rule_id > latest_matching_rule_id - { - latest_matching_rule_id = item.rule_id; - } + }); } // If the next step of the item is not an immediate child, then @@ -232,7 +226,7 @@ impl Builder { } transition.state_id = self.add_state(next_item_set); - (transition, latest_matching_rule_id) + (transition, rule_id) }) .collect(); @@ -447,9 +441,6 @@ pub fn generate_property_sheets(repo_path: &Path) -> Result<()> { fn generate_property_sheet(path: impl AsRef, css: &str) -> Result { let rules = parse_property_sheet(path.as_ref(), &css)?; - for rule in &rules { - eprintln!("rule {:?}", rule); - } Ok(Builder::new(rules).build()) } From 0d85a1ef53e98cdfb060e02d029b2637e3b1e79a Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 11 Jan 2019 09:48:45 -0800 Subject: [PATCH 123/208] Exclude final newlines from inputs when parsing corpus files --- cli/src/test.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cli/src/test.rs b/cli/src/test.rs index 790e9ec7..e064dffd 100644 --- a/cli/src/test.rs +++ b/cli/src/test.rs @@ -16,7 +16,7 @@ lazy_static! 
{ .multi_line(true) .build() .unwrap(); - static ref DIVIDER_REGEX: ByteRegex = ByteRegexBuilder::new(r"^---+\r?\n") + static ref DIVIDER_REGEX: ByteRegex = ByteRegexBuilder::new(r"\r?\n---+\r?\n") .multi_line(true) .build() .unwrap(); @@ -251,12 +251,12 @@ d children: vec![ TestEntry::Example { name: "The first test".to_string(), - input: "\na b c\n\n".as_bytes().to_vec(), + input: "\na b c\n".as_bytes().to_vec(), output: "(a (b c))".to_string(), }, TestEntry::Example { name: "The second test".to_string(), - input: "d\n".as_bytes().to_vec(), + input: "d".as_bytes().to_vec(), output: "(d)".to_string(), }, ] From e64f7a64a11ceeb21e5425133a564e5bcf9022f1 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 11 Jan 2019 13:30:45 -0800 Subject: [PATCH 124/208] Start work on running test corpus tests --- Cargo.lock | 1 + cli/Cargo.toml | 1 + cli/build.rs | 6 + .../build_tables/build_parse_table.rs | 51 +--- cli/src/generate/build_tables/mod.rs | 41 +++- cli/src/generate/mod.rs | 67 ++++-- cli/src/loader.rs | 226 +++++++++++------- cli/src/main.rs | 13 +- cli/src/tests/corpuses.rs | 76 +++++- cli/src/tests/parser_api.rs | 2 +- .../test_grammars/aliased_rules/grammar.json | 2 +- .../grammar.json | 2 +- .../external_tokens/grammar.json | 2 +- .../inverted_external_token/grammar.json | 2 +- .../precedence_on_subsequence/grammar.json | 4 +- 15 files changed, 328 insertions(+), 168 deletions(-) create mode 100644 cli/build.rs diff --git a/Cargo.lock b/Cargo.lock index fa7712ba..464cd050 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -774,6 +774,7 @@ name = "tree-sitter-cli" version = "0.1.0" dependencies = [ "ansi_term 0.11.0 (registry+https://github.com/rust-lang/crates.io-index)", + "cc 1.0.25 (registry+https://github.com/rust-lang/crates.io-index)", "clap 2.32.0 (registry+https://github.com/rust-lang/crates.io-index)", "difference 2.0.0 (registry+https://github.com/rust-lang/crates.io-index)", "dirs 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)", diff --git 
a/cli/Cargo.toml b/cli/Cargo.toml index 2eabd88f..b6226917 100644 --- a/cli/Cargo.toml +++ b/cli/Cargo.toml @@ -9,6 +9,7 @@ name = "tree-sitter" path = "src/main.rs" [dependencies] +cc = "1.0" ansi_term = "0.11" difference = "2.0" lazy_static = "1.2.0" diff --git a/cli/build.rs b/cli/build.rs new file mode 100644 index 00000000..e0ebd1c4 --- /dev/null +++ b/cli/build.rs @@ -0,0 +1,6 @@ +fn main() { + println!( + "cargo:rustc-env=BUILD_TARGET={}", + std::env::var("TARGET").unwrap() + ); +} diff --git a/cli/src/generate/build_tables/build_parse_table.rs b/cli/src/generate/build_tables/build_parse_table.rs index 73c9c0e2..6af85b4c 100644 --- a/cli/src/generate/build_tables/build_parse_table.rs +++ b/cli/src/generate/build_tables/build_parse_table.rs @@ -1,7 +1,9 @@ use super::item::{ParseItem, ParseItemSet, TokenSet}; use super::item_set_builder::ParseItemSetBuilder; use crate::error::{Error, Result}; -use crate::generate::grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar, VariableType}; +use crate::generate::grammars::{ + InlinedProductionMap, LexicalGrammar, SyntaxGrammar, VariableType, +}; use crate::generate::rules::{Alias, Associativity, Symbol, SymbolType}; use crate::generate::tables::{ AliasSequenceId, ParseAction, ParseState, ParseStateId, ParseTable, ParseTableEntry, @@ -11,6 +13,7 @@ use hashbrown::hash_map::Entry; use hashbrown::{HashMap, HashSet}; use std::collections::hash_map::DefaultHasher; use std::collections::VecDeque; +use std::u32; use std::fmt::Write; use std::hash::Hasher; @@ -94,7 +97,6 @@ impl<'a> ParseTableBuilder<'a> { )?; } - self.populate_used_symbols(); self.remove_precedences(); Ok((self.parse_table, self.following_tokens)) @@ -313,7 +315,10 @@ impl<'a> ParseTableBuilder<'a> { .first_set(&step.symbol) .contains(&conflicting_lookahead) { - conflicting_items.insert(item); + if item.variable_index != u32::MAX { + conflicting_items.insert(item); + } + let precedence = item.precedence(); if let Some(range) = &mut shift_precedence 
{ if precedence < range.start { @@ -327,7 +332,9 @@ impl<'a> ParseTableBuilder<'a> { } } } else if lookaheads.contains(&conflicting_lookahead) { - conflicting_items.insert(item); + if item.variable_index != u32::MAX { + conflicting_items.insert(item); + } } } @@ -610,40 +617,6 @@ impl<'a> ParseTableBuilder<'a> { } } - fn populate_used_symbols(&mut self) { - let mut terminal_usages = vec![false; self.lexical_grammar.variables.len()]; - let mut non_terminal_usages = vec![false; self.syntax_grammar.variables.len()]; - let mut external_usages = vec![false; self.syntax_grammar.external_tokens.len()]; - for state in &self.parse_table.states { - for symbol in state.terminal_entries.keys() { - match symbol.kind { - SymbolType::Terminal => terminal_usages[symbol.index] = true, - SymbolType::External => external_usages[symbol.index] = true, - _ => {} - } - } - for symbol in state.nonterminal_entries.keys() { - non_terminal_usages[symbol.index] = true; - } - } - for (i, value) in external_usages.into_iter().enumerate() { - if value { - self.parse_table.symbols.push(Symbol::external(i)); - } - } - self.parse_table.symbols.push(Symbol::end()); - for (i, value) in terminal_usages.into_iter().enumerate() { - if value { - self.parse_table.symbols.push(Symbol::terminal(i)); - } - } - for (i, value) in non_terminal_usages.into_iter().enumerate() { - if value { - self.parse_table.symbols.push(Symbol::non_terminal(i)); - } - } - } - fn remove_precedences(&mut self) { for state in self.parse_table.states.iter_mut() { for (_, entry) in state.terminal_entries.iter_mut() { @@ -702,7 +675,7 @@ impl<'a> ParseTableBuilder<'a> { if variable.kind == VariableType::Named { variable.name.clone() } else { - format!("\"{}\"", &variable.name) + format!("'{}'", &variable.name) } } } diff --git a/cli/src/generate/build_tables/mod.rs b/cli/src/generate/build_tables/mod.rs index b8432fe5..52c6abac 100644 --- a/cli/src/generate/build_tables/mod.rs +++ b/cli/src/generate/build_tables/mod.rs @@ -15,7 +15,7 
@@ use self::token_conflicts::TokenConflictMap; use crate::error::Result; use crate::generate::grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar}; use crate::generate::nfa::{CharacterSet, NfaCursor}; -use crate::generate::rules::{AliasMap, Symbol}; +use crate::generate::rules::{AliasMap, Symbol, SymbolType}; use crate::generate::tables::{LexTable, ParseAction, ParseTable, ParseTableEntry}; pub(crate) fn build_tables( @@ -45,6 +45,7 @@ pub(crate) fn build_tables( &token_conflict_map, &keywords, ); + populate_used_symbols(&mut parse_table, syntax_grammar, lexical_grammar); mark_fragile_tokens(&mut parse_table, lexical_grammar, &token_conflict_map); if minimize { minimize_parse_table( @@ -151,6 +152,44 @@ fn populate_error_state( state.terminal_entries.insert(Symbol::end(), recover_entry); } +fn populate_used_symbols( + parse_table: &mut ParseTable, + syntax_grammar: &SyntaxGrammar, + lexical_grammar: &LexicalGrammar, +) { + let mut terminal_usages = vec![false; lexical_grammar.variables.len()]; + let mut non_terminal_usages = vec![false; syntax_grammar.variables.len()]; + let mut external_usages = vec![false; syntax_grammar.external_tokens.len()]; + for state in &parse_table.states { + for symbol in state.terminal_entries.keys() { + match symbol.kind { + SymbolType::Terminal => terminal_usages[symbol.index] = true, + SymbolType::External => external_usages[symbol.index] = true, + _ => {} + } + } + for symbol in state.nonterminal_entries.keys() { + non_terminal_usages[symbol.index] = true; + } + } + for (i, value) in external_usages.into_iter().enumerate() { + if value { + parse_table.symbols.push(Symbol::external(i)); + } + } + parse_table.symbols.push(Symbol::end()); + for (i, value) in terminal_usages.into_iter().enumerate() { + if value { + parse_table.symbols.push(Symbol::terminal(i)); + } + } + for (i, value) in non_terminal_usages.into_iter().enumerate() { + if value { + parse_table.symbols.push(Symbol::non_terminal(i)); + } + } +} + fn 
identify_keywords( lexical_grammar: &LexicalGrammar, parse_table: &ParseTable, diff --git a/cli/src/generate/mod.rs b/cli/src/generate/mod.rs index 0899d793..5d89bbfe 100644 --- a/cli/src/generate/mod.rs +++ b/cli/src/generate/mod.rs @@ -3,6 +3,7 @@ use self::parse_grammar::parse_grammar; use self::prepare_grammar::prepare_grammar; use self::render::render_c_code; use crate::error::Result; +use regex::{Regex, RegexBuilder}; use std::fs; use std::io::Write; use std::path::PathBuf; @@ -18,7 +19,14 @@ mod render; mod rules; mod tables; -pub fn generate_parser_for_grammar( +lazy_static! { + static ref JSON_COMMENT_REGEX: Regex = RegexBuilder::new("^\\s*//.*") + .multi_line(true) + .build() + .unwrap(); +} + +pub fn generate_parser_in_directory( repo_path: &PathBuf, minimize: bool, state_ids_to_log: Vec, @@ -26,33 +34,48 @@ pub fn generate_parser_for_grammar( ) -> Result<()> { if !properties_only { let grammar_json = load_js_grammar_file(&repo_path.join("grammar.js")); - let input_grammar = parse_grammar(&grammar_json)?; - let (syntax_grammar, lexical_grammar, inlines, simple_aliases) = - prepare_grammar(&input_grammar)?; - let (parse_table, main_lex_table, keyword_lex_table, keyword_capture_token) = build_tables( - &syntax_grammar, - &lexical_grammar, - &simple_aliases, - &inlines, - minimize, - state_ids_to_log, - )?; - let c_code = render_c_code( - &input_grammar.name, - parse_table, - main_lex_table, - keyword_lex_table, - keyword_capture_token, - syntax_grammar, - lexical_grammar, - simple_aliases, - ); + let c_code = + generate_parser_for_grammar_with_opts(&grammar_json, minimize, state_ids_to_log)?; fs::write(repo_path.join("src").join("parser.c"), c_code)?; } properties::generate_property_sheets(repo_path)?; Ok(()) } +#[cfg(test)] +pub fn generate_parser_for_grammar(grammar_json: &String) -> Result { + let grammar_json = JSON_COMMENT_REGEX.replace_all(grammar_json, "\n"); + generate_parser_for_grammar_with_opts(&grammar_json, true, Vec::new()) +} + +fn 
generate_parser_for_grammar_with_opts( + grammar_json: &str, + minimize: bool, + state_ids_to_log: Vec, +) -> Result { + let input_grammar = parse_grammar(grammar_json)?; + let (syntax_grammar, lexical_grammar, inlines, simple_aliases) = + prepare_grammar(&input_grammar)?; + let (parse_table, main_lex_table, keyword_lex_table, keyword_capture_token) = build_tables( + &syntax_grammar, + &lexical_grammar, + &simple_aliases, + &inlines, + minimize, + state_ids_to_log, + )?; + Ok(render_c_code( + &input_grammar.name, + parse_table, + main_lex_table, + keyword_lex_table, + keyword_capture_token, + syntax_grammar, + lexical_grammar, + simple_aliases, + )) +} + fn load_js_grammar_file(grammar_path: &PathBuf) -> String { let mut node_process = Command::new("node") .stdin(Stdio::piped()) diff --git a/cli/src/loader.rs b/cli/src/loader.rs index 7dfb233b..e056bbaa 100644 --- a/cli/src/loader.rs +++ b/cli/src/loader.rs @@ -6,6 +6,7 @@ use std::io; use std::mem; use std::path::{Path, PathBuf}; use std::process::Command; +use std::time::SystemTime; use tree_sitter::{Language, PropertySheet}; const PACKAGE_JSON_PATH: &'static str = "package.json"; @@ -37,7 +38,7 @@ pub struct LanguageConfiguration { pub struct Loader { parser_lib_path: PathBuf, language_repos: Vec, - language_configuration_indices_by_file_type: HashMap>, + language_configuration_ids_by_file_type: HashMap>, } unsafe impl Send for Loader {} @@ -48,19 +49,20 @@ impl Loader { Loader { parser_lib_path, language_repos: Vec::new(), - language_configuration_indices_by_file_type: HashMap::new(), + language_configuration_ids_by_file_type: HashMap::new(), } } - pub fn find_parsers(&mut self, parser_src_paths: &Vec) -> io::Result<()> { + pub fn find_all_languages(&mut self, parser_src_paths: &Vec) -> io::Result<()> { for parser_container_dir in parser_src_paths.iter() { for entry in fs::read_dir(parser_container_dir)? 
{ let entry = entry?; if let Some(parser_dir_name) = entry.file_name().to_str() { if parser_dir_name.starts_with("tree-sitter-") { - if self.load_language_configurations( - &parser_container_dir.join(parser_dir_name), - ).is_err() { + if self + .find_language_at_path(&parser_container_dir.join(parser_dir_name)) + .is_err() + { eprintln!("Error loading {}", parser_dir_name); } } @@ -70,90 +72,126 @@ impl Loader { Ok(()) } - pub fn language_configuration_at_path( - &mut self, - path: &Path, - ) -> io::Result> { - let repo_index = self.load_language_configurations(path)?; - self.load_language_from_repo(repo_index, 0) - } - - pub fn language_for_file_name( - &mut self, - path: &Path, - ) -> io::Result> { - let indices = path - .file_name() - .and_then(|n| n.to_str()) - .and_then(|file_name| { - self.language_configuration_indices_by_file_type - .get(file_name) - }) - .or_else(|| { - path.extension() - .and_then(|extension| extension.to_str()) - .and_then(|extension| { - self.language_configuration_indices_by_file_type - .get(extension) - }) - }); - - if let Some(indices) = indices { - // TODO use `content-regex` to pick one - for (repo_index, conf_index) in indices { - return self.load_language_from_repo(*repo_index, *conf_index); - } - } - Ok(None) - } - - fn load_language_from_repo( - &mut self, - repo_index: usize, - conf_index: usize, - ) -> io::Result> { - let repo = &self.language_repos[repo_index]; - let language = if let Some(language) = repo.language { - language - } else { - let language = self.load_language_at_path(&repo.name, &repo.path)?; - self.language_repos[repo_index].language = Some(language); - language - }; - if let Some(configuration) = self.language_repos[repo_index] - .configurations - .get(conf_index) - { - Ok(Some((language, configuration))) + pub fn language_at_path(&mut self, path: &Path) -> io::Result> { + if let Ok(id) = self.find_language_at_path(path) { + Ok(Some(self.language_configuration_for_id(id)?.0)) } else { Ok(None) } } + pub fn 
language_configuration_for_file_name( + &mut self, + path: &Path, + ) -> io::Result> { + let ids = path + .file_name() + .and_then(|n| n.to_str()) + .and_then(|file_name| self.language_configuration_ids_by_file_type.get(file_name)) + .or_else(|| { + path.extension() + .and_then(|extension| extension.to_str()) + .and_then(|extension| { + self.language_configuration_ids_by_file_type.get(extension) + }) + }); + if let Some(ids) = ids { + // TODO use `content-regex` to pick one + for (repo_id, configuration_id) in ids.iter().cloned() { + let (language, configurations) = self.language_configuration_for_id(repo_id)?; + return Ok(Some((language, &configurations[configuration_id]))); + } + } + Ok(None) + } + + fn language_configuration_for_id( + &mut self, + id: usize, + ) -> io::Result<(Language, &Vec)> { + let repo = &self.language_repos[id]; + let language = if let Some(language) = repo.language { + language + } else { + let language = self.load_language_at_path(&repo.name, &repo.path)?; + self.language_repos[id].language = Some(language); + language + }; + Ok((language, &self.language_repos[id].configurations)) + } + fn load_language_at_path(&self, name: &str, language_path: &Path) -> io::Result { - let parser_c_path = language_path.join(PARSER_C_PATH); + let src_path = language_path.join("src"); + let parser_c_path = src_path.join("parser.c"); + + let scanner_path; + let scanner_c_path = src_path.join("scanner.c"); + if scanner_c_path.exists() { + scanner_path = Some(scanner_c_path); + } else { + let scanner_cc_path = src_path.join("scanner.cc"); + if scanner_cc_path.exists() { + scanner_path = Some(scanner_cc_path); + } else { + scanner_path = None; + } + } + + self.load_language_from_sources(name, &src_path, &parser_c_path, &scanner_path) + } + + pub fn load_language_from_sources( + &self, + name: &str, + header_path: &Path, + parser_path: &Path, + scanner_path: &Option, + ) -> io::Result { let mut library_path = self.parser_lib_path.join(name); 
library_path.set_extension(DYLIB_EXTENSION); - if !library_path.exists() || was_modified_more_recently(&parser_c_path, &library_path)? { - let compiler_name = std::env::var("CXX").unwrap_or("c++".to_owned()); - let mut command = Command::new(compiler_name); - command - .arg("-shared") - .arg("-fPIC") - .arg("-I") - .arg(language_path.join("src")) - .arg("-o") - .arg(&library_path) - .arg("-xc") - .arg(parser_c_path); - let scanner_c_path = language_path.join(SCANNER_C_PATH); - let scanner_cc_path = language_path.join(SCANNER_CC_PATH); - if scanner_c_path.exists() { - command.arg("-xc").arg(scanner_c_path); - } else if scanner_cc_path.exists() { - command.arg("-xc++").arg(scanner_cc_path); + if needs_recompile(&library_path, &parser_path, &scanner_path)? { + let mut config = cc::Build::new(); + config + .opt_level(2) + .cargo_metadata(false) + .target(env!("BUILD_TARGET")) + .host(env!("BUILD_TARGET")); + let compiler = config.get_compiler(); + let compiler_path = compiler.path(); + let mut command = Command::new(compiler_path); + + if cfg!(windows) { + command + .args(&["/nologo", "/LD", "/I"]) + .arg(header_path) + .arg("/Od") + .arg(parser_path); + if let Some(scanner_path) = scanner_path.as_ref() { + command.arg(scanner_path); + } + command + .arg("/link") + .arg(format!("/out:{}", library_path.to_str().unwrap())); + } else { + command + .arg("-shared") + .arg("-fPIC") + .arg("-I") + .arg(header_path) + .arg("-o") + .arg(&library_path) + .arg("-xc") + .arg(parser_path); + if let Some(scanner_path) = scanner_path.as_ref() { + if scanner_path.extension() == Some("c".as_ref()) { + command.arg(scanner_path); + } else { + command.arg("-xc++").arg(scanner_path); + } + } } + command.output()?; } @@ -168,7 +206,7 @@ impl Loader { Ok(language) } - fn load_language_configurations<'a>(&'a mut self, parser_path: &Path) -> io::Result { + fn find_language_at_path<'a>(&'a mut self, parser_path: &Path) -> io::Result { let name = parser_path .file_name() .unwrap() @@ -218,7 
+256,7 @@ impl Loader { for (i, configuration) in configurations.iter().enumerate() { for file_type in &configuration.file_types { - self.language_configuration_indices_by_file_type + self.language_configuration_ids_by_file_type .entry(file_type.to_string()) .or_insert(Vec::new()) .push((self.language_repos.len(), i)); @@ -236,6 +274,26 @@ impl Loader { } } -fn was_modified_more_recently(a: &Path, b: &Path) -> io::Result { - Ok(fs::metadata(a)?.modified()? > fs::metadata(b)?.modified()?) +fn needs_recompile( + lib_path: &Path, + parser_c_path: &Path, + scanner_path: &Option, +) -> io::Result { + if !lib_path.exists() { + return Ok(true); + } + let lib_mtime = mtime(lib_path)?; + if mtime(parser_c_path)? > lib_mtime { + return Ok(true); + } + if let Some(scanner_path) = scanner_path { + if mtime(scanner_path)? > lib_mtime { + return Ok(true); + } + } + Ok(false) +} + +fn mtime(path: &Path) -> io::Result { + Ok(fs::metadata(path)?.modified()?) } diff --git a/cli/src/main.rs b/cli/src/main.rs index 9f095668..dda4bdca 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -89,13 +89,18 @@ fn run() -> error::Result<()> { ids.filter_map(|id| usize::from_str_radix(id, 10).ok()) .collect() }); - generate::generate_parser_for_grammar(¤t_dir, minimize, state_ids_to_log, properties_only)?; + generate::generate_parser_in_directory( + ¤t_dir, + minimize, + state_ids_to_log, + properties_only, + )?; } else if let Some(matches) = matches.subcommand_matches("test") { let debug = matches.is_present("debug"); let debug_graph = matches.is_present("debug-graph"); let filter = matches.value_of("filter"); let corpus_path = current_dir.join("corpus"); - if let Some((language, _)) = loader.language_configuration_at_path(¤t_dir)? { + if let Some(language) = loader.language_at_path(¤t_dir)? 
{ test::run_tests_at_path(language, &corpus_path, debug, debug_graph, filter)?; } else { eprintln!("No language found"); @@ -103,9 +108,9 @@ fn run() -> error::Result<()> { } else if let Some(matches) = matches.subcommand_matches("parse") { let debug = matches.is_present("debug"); let debug_graph = matches.is_present("debug-graph"); - loader.find_parsers(&vec![home_dir.join("github")])?; + loader.find_all_languages(&vec![home_dir.join("github")])?; let source_path = Path::new(matches.value_of("path").unwrap()); - if let Some((language, _)) = loader.language_for_file_name(source_path)? { + if let Some((language, _)) = loader.language_configuration_for_file_name(source_path)? { parse::parse_file_at_path(language, source_path, debug, debug_graph)?; } else { eprintln!("No language found"); diff --git a/cli/src/tests/corpuses.rs b/cli/src/tests/corpuses.rs index b70bb371..624786fc 100644 --- a/cli/src/tests/corpuses.rs +++ b/cli/src/tests/corpuses.rs @@ -1,6 +1,9 @@ use super::languages; +use crate::generate; +use crate::loader::Loader; use crate::test::{parse_tests, TestEntry}; -use std::path::PathBuf; +use std::fs; +use std::path::{Path, PathBuf}; use tree_sitter::{Language, Parser}; lazy_static! { @@ -12,20 +15,16 @@ lazy_static! 
{ ("html", languages::html()), ("javascript", languages::javascript()), ]; + static ref ROOT_DIR: PathBuf = [env!("CARGO_MANIFEST_DIR"), ".."].iter().collect(); + static ref HEADER_DIR: PathBuf = ROOT_DIR.join("lib").join("include"); + static ref SCRATCH_DIR: PathBuf = ROOT_DIR.join("target").join("scratch"); + static ref FIXTURES_DIR: PathBuf = ROOT_DIR.join("test").join("fixtures"); } #[test] -fn test_corpus_files() { +fn test_real_language_corpus_files() { let mut parser = Parser::new(); - let grammars_dir: PathBuf = [ - env!("CARGO_MANIFEST_DIR"), - "..", - "test", - "fixtures", - "grammars", - ] - .iter() - .collect(); + let grammars_dir = FIXTURES_DIR.join("grammars"); for (name, language) in LANGUAGES.iter().cloned() { let corpus_dir = grammars_dir.join(name).join("corpus"); @@ -35,6 +34,61 @@ fn test_corpus_files() { } } +#[test] +fn test_feature_corpus_files() { + fs::create_dir_all(SCRATCH_DIR.as_path()).unwrap(); + + let mut loader = Loader::new(SCRATCH_DIR.clone()); + let mut parser = Parser::new(); + let test_grammars_dir = FIXTURES_DIR.join("test_grammars"); + + for entry in fs::read_dir(&test_grammars_dir).unwrap() { + let entry = entry.unwrap(); + let test_name = entry.file_name(); + let test_name = test_name.to_str().unwrap(); + + eprintln!("test name: {}", test_name); + let test_path = entry.path(); + let grammar_path = test_path.join("grammar.json"); + let corpus_path = test_path.join("corpus.txt"); + let error_message_path = test_path.join("expected_error.txt"); + + let grammar_json = fs::read_to_string(grammar_path).unwrap(); + let generate_result = generate::generate_parser_for_grammar(&grammar_json); + if error_message_path.exists() { + continue; + if let Err(e) = generate_result { + assert_eq!(e.0, fs::read_to_string(&error_message_path).unwrap()); + } else { + panic!( + "Expected error message but got none for test grammar '{}'", + test_name + ); + } + } else { + let c_code = generate_result.unwrap(); + let parser_c_path = 
SCRATCH_DIR.join(&format!("{}-parser.c", test_name)); + fs::write(&parser_c_path, c_code).unwrap(); + let scanner_path = test_path.join("scanner.c"); + let scanner_path = if scanner_path.exists() { + Some(scanner_path) + } else { + None + }; + let language = loader + .load_language_from_sources(test_name, &HEADER_DIR, &parser_c_path, &scanner_path) + .unwrap(); + } + } + + // for (name, language) in LANGUAGES.iter().cloned() { + // let corpus_dir = grammars_dir.join(name).join("corpus"); + // let test = parse_tests(&corpus_dir).unwrap(); + // parser.set_language(language).unwrap(); + // run_mutation_tests(&mut parser, test); + // } +} + fn run_mutation_tests(parser: &mut Parser, test: TestEntry) { match test { TestEntry::Example { diff --git a/cli/src/tests/parser_api.rs b/cli/src/tests/parser_api.rs index af5ba71f..e32c292b 100644 --- a/cli/src/tests/parser_api.rs +++ b/cli/src/tests/parser_api.rs @@ -1,6 +1,6 @@ use super::languages::rust; use std::thread; -use tree_sitter::{InputEdit, LogType, Parser, Point, PropertySheet, Range}; +use tree_sitter::{InputEdit, LogType, Parser, Point, PropertySheet}; #[test] fn test_basic_parsing() { diff --git a/test/fixtures/test_grammars/aliased_rules/grammar.json b/test/fixtures/test_grammars/aliased_rules/grammar.json index 391f780f..a66bfb78 100644 --- a/test/fixtures/test_grammars/aliased_rules/grammar.json +++ b/test/fixtures/test_grammars/aliased_rules/grammar.json @@ -40,7 +40,7 @@ {"type": "SYMBOL", "name": "_expression"}, {"type": "STRING", "value": "("}, {"type": "SYMBOL", "name": "_expression"}, - {"type": "STRING", "value": ")"}, + {"type": "STRING", "value": ")"} ] } }, diff --git a/test/fixtures/test_grammars/conflict_in_repeat_rule_after_external_token/grammar.json b/test/fixtures/test_grammars/conflict_in_repeat_rule_after_external_token/grammar.json index 0be2008c..d97d9c9d 100644 --- a/test/fixtures/test_grammars/conflict_in_repeat_rule_after_external_token/grammar.json +++ 
b/test/fixtures/test_grammars/conflict_in_repeat_rule_after_external_token/grammar.json @@ -2,7 +2,7 @@ "name": "conflict_in_repeat_rule_after_external_token", "externals": [ - {"type": "SYMBOL", "name": "_program_start"}, + {"type": "SYMBOL", "name": "_program_start"} ], "rules": { diff --git a/test/fixtures/test_grammars/external_tokens/grammar.json b/test/fixtures/test_grammars/external_tokens/grammar.json index d61a978f..e5ca6bc9 100644 --- a/test/fixtures/test_grammars/external_tokens/grammar.json +++ b/test/fixtures/test_grammars/external_tokens/grammar.json @@ -45,7 +45,7 @@ {"type": "SYMBOL", "name": "expression"}, {"type": "SYMBOL", "name": "_percent_string_end"} ] - }, + } ] }, diff --git a/test/fixtures/test_grammars/inverted_external_token/grammar.json b/test/fixtures/test_grammars/inverted_external_token/grammar.json index a43cedcc..6dee3d03 100644 --- a/test/fixtures/test_grammars/inverted_external_token/grammar.json +++ b/test/fixtures/test_grammars/inverted_external_token/grammar.json @@ -15,7 +15,7 @@ "content": { "type": "SYMBOL", "name": "statement" - }, + } }, "statement": { diff --git a/test/fixtures/test_grammars/precedence_on_subsequence/grammar.json b/test/fixtures/test_grammars/precedence_on_subsequence/grammar.json index d05db765..d992793c 100644 --- a/test/fixtures/test_grammars/precedence_on_subsequence/grammar.json +++ b/test/fixtures/test_grammars/precedence_on_subsequence/grammar.json @@ -110,7 +110,7 @@ "type": "SEQ", "members": [ {"type": "STRING", "value": "::"}, - {"type": "SYMBOL", "name": "expression"}, + {"type": "SYMBOL", "name": "expression"} ] } ] @@ -132,4 +132,4 @@ "value": "[a-zA-Z]+" } } -} \ No newline at end of file +} From 45c8cf47ea3f6a53ca3fe31283b399062697e0f7 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 11 Jan 2019 13:49:49 -0800 Subject: [PATCH 125/208] Enable backtraces on CI --- .appveyor.yml | 1 + .travis.yml | 1 + 2 files changed, 2 insertions(+) diff --git a/.appveyor.yml b/.appveyor.yml index 
1d9fb179..f1acceda 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -1,4 +1,5 @@ environment: + RUST_BACKTRACE: full TREE_SITTER_TEST: true build: false diff --git a/.travis.yml b/.travis.yml index 47b88e81..46bb9a95 100644 --- a/.travis.yml +++ b/.travis.yml @@ -5,6 +5,7 @@ rust: env: - TREE_SITTER_TEST=1 + - RUST_BACKTRACE=full before_install: - ./script/fetch-fixtures From fa283dcf27b897891e3203a527a6263dde198553 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 11 Jan 2019 14:44:32 -0800 Subject: [PATCH 126/208] Use the compiler environment vars computed by the cc config --- cli/src/loader.rs | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/cli/src/loader.rs b/cli/src/loader.rs index e056bbaa..af1ab7be 100644 --- a/cli/src/loader.rs +++ b/cli/src/loader.rs @@ -9,11 +9,6 @@ use std::process::Command; use std::time::SystemTime; use tree_sitter::{Language, PropertySheet}; -const PACKAGE_JSON_PATH: &'static str = "package.json"; -const PARSER_C_PATH: &'static str = "src/parser.c"; -const SCANNER_C_PATH: &'static str = "src/scanner.c"; -const SCANNER_CC_PATH: &'static str = "src/scanner.cc"; - #[cfg(unix)] const DYLIB_EXTENSION: &'static str = "so"; @@ -158,8 +153,10 @@ impl Loader { .target(env!("BUILD_TARGET")) .host(env!("BUILD_TARGET")); let compiler = config.get_compiler(); - let compiler_path = compiler.path(); - let mut command = Command::new(compiler_path); + let mut command = Command::new(compiler.path()); + for (key, value) in compiler.env() { + command.env(key, value); + } if cfg!(windows) { command @@ -192,7 +189,17 @@ impl Loader { } } - command.output()?; + let output = command.output()?; + if !output.status.success() { + return Err(io::Error::new( + io::ErrorKind::Other, + format!( + "Parser compilation failed.\nStdout: {}\nStderr: {}", + String::from_utf8_lossy(&output.stdout), + String::from_utf8_lossy(&output.stderr) + ).as_str(), + )); + } } let library = Library::new(library_path)?; @@ -233,7 
+240,7 @@ impl Loader { tree_sitter: Option>, } - let package_json_contents = fs::read_to_string(&parser_path.join(PACKAGE_JSON_PATH))?; + let package_json_contents = fs::read_to_string(&parser_path.join("package.json"))?; let package_json: PackageJSON = serde_json::from_str(&package_json_contents)?; let configurations = package_json .tree_sitter From 88f1c4af8edb82d1893a9bc7ab478dc05cc4ec7b Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 11 Jan 2019 14:48:29 -0800 Subject: [PATCH 127/208] Ensure the .tree-sitter directory exists --- cli/src/main.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/cli/src/main.rs b/cli/src/main.rs index dda4bdca..5a830458 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -23,6 +23,7 @@ mod tests; use self::loader::Loader; use clap::{App, Arg, SubCommand}; use std::env; +use std::fs; use std::path::Path; use std::process::exit; use std::usize; @@ -74,7 +75,9 @@ fn run() -> error::Result<()> { let home_dir = dirs::home_dir().unwrap(); let current_dir = env::current_dir().unwrap(); - let mut loader = Loader::new(home_dir.join(".tree-sitter")); + let config_dir = home_dir.join(".tree-sitter"); + fs::create_dir_all(&config_dir).unwrap(); + let mut loader = Loader::new(config_dir); if let Some(matches) = matches.subcommand_matches("generate") { if matches.is_present("log") { From acfa0c524a7eb7ca417eb75375fbd7653d6d2c03 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 11 Jan 2019 14:48:36 -0800 Subject: [PATCH 128/208] Fix env var setup on travis --- .travis.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index 46bb9a95..0d4f8cd5 100644 --- a/.travis.yml +++ b/.travis.yml @@ -4,8 +4,7 @@ rust: - stable env: - - TREE_SITTER_TEST=1 - - RUST_BACKTRACE=full + - TREE_SITTER_TEST=1 RUST_BACKTRACE=full before_install: - ./script/fetch-fixtures From 0236de79635740534cb5f01238a23525965a01d2 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 11 Jan 2019 
14:54:30 -0800 Subject: [PATCH 129/208] Tweak caching setup on appveyor --- .appveyor.yml | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/.appveyor.yml b/.appveyor.yml index f1acceda..147827e9 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -5,14 +5,15 @@ environment: build: false install: - - git submodule update --init --recursive - + # Install rust - appveyor DownloadFile https://win.rustup.rs/ -FileName rustup-init.exe - rustup-init -yv --default-toolchain stable - - set PATH=%PATH%;%USERPROFILE%\.cargo\bin + - set PATH=%PATH%;C:\Users\appveyor\.cargo\bin - rustc -vV - cargo -vV + # Install dependencies + - git submodule update --init - script\fetch-fixtures.cmd test_script: @@ -27,3 +28,5 @@ cache: - target - test\fixtures\grammars - C:\Users\appveyor\.cargo + - C:\cargo\registry + - C:\cargo\git From 6592fdd24cb63fd1967fcf5f51fc8b947b96284c Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 11 Jan 2019 17:26:45 -0800 Subject: [PATCH 130/208] Fix parser generation error messages --- .../build_tables/build_parse_table.rs | 113 +++++++----- .../build_tables/coincident_tokens.rs | 12 +- cli/src/generate/build_tables/item.rs | 4 +- cli/src/generate/build_tables/mod.rs | 12 +- .../prepare_grammar/flatten_grammar.rs | 170 +++++++++++------- .../prepare_grammar/intern_symbols.rs | 2 +- cli/src/generate/render.rs | 12 +- cli/src/tests/corpuses.rs | 37 +++- .../expected_error.txt | 12 +- .../expected_error.txt | 12 +- .../conflicting_precedence/expected_error.txt | 8 +- .../dynamic_precedence/grammar.json | 2 +- .../inlined_aliased_rules/grammar.json | 2 +- .../expected_error.txt | 10 +- .../expected_error.txt | 6 +- .../precedence_on_token/grammar.json | 2 +- 16 files changed, 252 insertions(+), 164 deletions(-) diff --git a/cli/src/generate/build_tables/build_parse_table.rs b/cli/src/generate/build_tables/build_parse_table.rs index 6af85b4c..b87cc3d0 100644 --- a/cli/src/generate/build_tables/build_parse_table.rs +++ 
b/cli/src/generate/build_tables/build_parse_table.rs @@ -461,18 +461,20 @@ impl<'a> ParseTableBuilder<'a> { ) .unwrap(); write!(&mut msg, "Possible interpretations:\n\n").unwrap(); - for (i, item) in conflicting_items.iter().enumerate() { - write!(&mut msg, " {}:", i + 1).unwrap(); + + let interpretions = conflicting_items.iter().enumerate().map(|(i, item)| { + let mut line = String::new(); + write!(&mut line, " {}:", i + 1).unwrap(); for preceding_symbol in preceding_symbols .iter() .take(preceding_symbols.len() - item.step_index as usize) { - write!(&mut msg, " {}", self.symbol_name(preceding_symbol)).unwrap(); + write!(&mut line, " {}", self.symbol_name(preceding_symbol)).unwrap(); } write!( - &mut msg, + &mut line, " ({}", &self.syntax_grammar.variables[item.variable_index as usize].name ) @@ -480,17 +482,17 @@ impl<'a> ParseTableBuilder<'a> { for (j, step) in item.production.steps.iter().enumerate() { if j as u32 == item.step_index { - write!(&mut msg, " •").unwrap(); + write!(&mut line, " •").unwrap(); } - write!(&mut msg, " {}", self.symbol_name(&step.symbol)).unwrap(); + write!(&mut line, " {}", self.symbol_name(&step.symbol)).unwrap(); } - write!(&mut msg, ")").unwrap(); + write!(&mut line, ")").unwrap(); if item.is_done() { write!( - &mut msg, - " • {}", + &mut line, + " • {} …", self.symbol_name(&conflicting_lookahead) ) .unwrap(); @@ -498,16 +500,33 @@ impl<'a> ParseTableBuilder<'a> { let precedence = item.precedence(); let associativity = item.associativity(); - if precedence != 0 || associativity.is_some() { - write!( - &mut msg, + + let prec_line = if let Some(associativity) = associativity { + Some(format!( "(precedence: {}, associativity: {:?})", precedence, associativity - ) - .unwrap(); - } + )) + } else if precedence > 0 { + Some(format!("(precedence: {})", precedence)) + } else { + None + }; - write!(&mut msg, "\n").unwrap(); + (line, prec_line) + }).collect::>(); + + let max_interpretation_length = interpretions.iter().map(|i| 
i.0.chars().count()).max().unwrap(); + + for (line, prec_suffix) in interpretions { + msg += &line; + if let Some(prec_suffix) = prec_suffix { + for _ in line.chars().count()..max_interpretation_length { + msg.push(' '); + } + msg += " "; + msg += &prec_suffix; + } + msg.push('\n'); } let mut resolution_count = 0; @@ -517,26 +536,41 @@ impl<'a> ParseTableBuilder<'a> { .filter(|i| !i.is_done()) .cloned() .collect::>(); - if shift_items.len() > 0 { - resolution_count += 1; - write!( - &mut msg, - " {}: Specify a higher precedence in", - resolution_count - ) - .unwrap(); - for (i, item) in shift_items.iter().enumerate() { - if i > 0 { - write!(&mut msg, " and").unwrap(); - } + if actual_conflict.len() > 1 { + if shift_items.len() > 0 { + resolution_count += 1; write!( &mut msg, - " `{}`", - self.symbol_name(&Symbol::non_terminal(item.variable_index as usize)) + " {}: Specify a higher precedence in", + resolution_count ) .unwrap(); + for (i, item) in shift_items.iter().enumerate() { + if i > 0 { + write!(&mut msg, " and").unwrap(); + } + write!( + &mut msg, + " `{}`", + self.symbol_name(&Symbol::non_terminal(item.variable_index as usize)) + ) + .unwrap(); + } + write!(&mut msg, " than in the other rules.\n").unwrap(); + } + + for item in &conflicting_items { + if item.is_done() { + resolution_count += 1; + write!( + &mut msg, + " {}: Specify a higher precedence in `{}` than in the other rules.\n", + resolution_count, + self.symbol_name(&Symbol::non_terminal(item.variable_index as usize)) + ) + .unwrap(); + } } - write!(&mut msg, " than in the other rules.\n").unwrap(); } if considered_associativity { @@ -553,7 +587,7 @@ impl<'a> ParseTableBuilder<'a> { } write!( &mut msg, - "{}", + "`{}`", self.symbol_name(&Symbol::non_terminal(item.variable_index as usize)) ) .unwrap(); @@ -561,19 +595,6 @@ impl<'a> ParseTableBuilder<'a> { write!(&mut msg, "\n").unwrap(); } - for item in &conflicting_items { - if item.is_done() { - resolution_count += 1; - write!( - &mut msg, - " {}: 
Specify a higher precedence in `{}` than in the other rules.\n", - resolution_count, - self.symbol_name(&Symbol::non_terminal(item.variable_index as usize)) - ) - .unwrap(); - } - } - resolution_count += 1; write!( &mut msg, @@ -585,7 +606,7 @@ impl<'a> ParseTableBuilder<'a> { if i > 0 { write!(&mut msg, ", ").unwrap(); } - write!(&mut msg, "{}", self.symbol_name(symbol)).unwrap(); + write!(&mut msg, "`{}`", self.symbol_name(symbol)).unwrap(); } write!(&mut msg, "\n").unwrap(); diff --git a/cli/src/generate/build_tables/coincident_tokens.rs b/cli/src/generate/build_tables/coincident_tokens.rs index 25dbc331..bb234c4a 100644 --- a/cli/src/generate/build_tables/coincident_tokens.rs +++ b/cli/src/generate/build_tables/coincident_tokens.rs @@ -19,10 +19,14 @@ impl<'a> CoincidentTokenIndex<'a> { }; for (i, state) in table.states.iter().enumerate() { for symbol in state.terminal_entries.keys() { - for other_symbol in state.terminal_entries.keys() { - let index = result.index(symbol.index, other_symbol.index); - if result.entries[index].last().cloned() != Some(i) { - result.entries[index].push(i); + if symbol.is_terminal() { + for other_symbol in state.terminal_entries.keys() { + if other_symbol.is_terminal() { + let index = result.index(symbol.index, other_symbol.index); + if result.entries[index].last().cloned() != Some(i) { + result.entries[index].push(i); + } + } } } } diff --git a/cli/src/generate/build_tables/item.rs b/cli/src/generate/build_tables/item.rs index 81c86f4a..279c5df6 100644 --- a/cli/src/generate/build_tables/item.rs +++ b/cli/src/generate/build_tables/item.rs @@ -402,11 +402,11 @@ impl<'a> PartialEq for ParseItem<'a> { impl<'a> Ord for ParseItem<'a> { fn cmp(&self, other: &Self) -> Ordering { - let o = self.variable_index.cmp(&other.variable_index); + let o = self.step_index.cmp(&other.step_index); if o != Ordering::Equal { return o; } - let o = self.step_index.cmp(&other.step_index); + let o = self.variable_index.cmp(&other.variable_index); if o != 
Ordering::Equal { return o; } diff --git a/cli/src/generate/build_tables/mod.rs b/cli/src/generate/build_tables/mod.rs index 52c6abac..28b18109 100644 --- a/cli/src/generate/build_tables/mod.rs +++ b/cli/src/generate/build_tables/mod.rs @@ -312,11 +312,13 @@ fn mark_fragile_tokens( } } for (token, entry) in state.terminal_entries.iter_mut() { - for i in 0..n { - if token_conflict_map.does_overlap(i, token.index) { - if valid_tokens_mask[i] { - entry.reusable = false; - break; + if token.is_terminal() { + for i in 0..n { + if token_conflict_map.does_overlap(i, token.index) { + if valid_tokens_mask[i] { + entry.reusable = false; + break; + } } } } diff --git a/cli/src/generate/prepare_grammar/flatten_grammar.rs b/cli/src/generate/prepare_grammar/flatten_grammar.rs index 9409a010..abd06769 100644 --- a/cli/src/generate/prepare_grammar/flatten_grammar.rs +++ b/cli/src/generate/prepare_grammar/flatten_grammar.rs @@ -1,6 +1,9 @@ use super::ExtractedSyntaxGrammar; -use crate::error::Result; -use crate::generate::grammars::{Production, ProductionStep, SyntaxGrammar, SyntaxVariable, Variable}; +use crate::error::{Error, Result}; +use crate::generate::rules::Symbol; +use crate::generate::grammars::{ + Production, ProductionStep, SyntaxGrammar, SyntaxVariable, Variable, +}; use crate::generate::rules::{Alias, Associativity, Rule}; struct RuleFlattener { @@ -145,11 +148,38 @@ fn flatten_variable(variable: Variable) -> Result { }) } +fn symbol_is_used(variables: &Vec, symbol: Symbol) -> bool { + for variable in variables { + for production in &variable.productions { + for step in &production.steps { + if step.symbol == symbol { + return true; + } + } + } + } + false +} + pub(super) fn flatten_grammar(grammar: ExtractedSyntaxGrammar) -> Result { let mut variables = Vec::new(); for variable in grammar.variables { variables.push(flatten_variable(variable)?); } + for (i, variable) in variables.iter().enumerate() { + for production in &variable.productions { + if 
production.steps.is_empty() && symbol_is_used(&variables, Symbol::non_terminal(i)) { + return Err(Error(format!( + "The rule `{}` matches the empty string. + +Tree-sitter does not support syntactic rules that match the empty string +unless they are used only as the grammar's start rule. +", + variable.name + ))); + } + } + } Ok(SyntaxGrammar { extra_tokens: grammar.extra_tokens, expected_conflicts: grammar.expected_conflicts, @@ -228,48 +258,55 @@ mod tests { #[test] fn test_flatten_grammar_with_maximum_dynamic_precedence() { let result = flatten_variable(Variable { - name: "test".to_string(), - kind: VariableType::Named, - rule: Rule::seq(vec![ - Rule::non_terminal(1), - Rule::prec_dynamic(101, Rule::seq(vec![ - Rule::non_terminal(2), - Rule::choice(vec![ - Rule::prec_dynamic(102, Rule::seq(vec![ - Rule::non_terminal(3), - Rule::non_terminal(4) - ])), - Rule::non_terminal(5), - ]), - Rule::non_terminal(6), - ])), - Rule::non_terminal(7), - ]) - }).unwrap(); + name: "test".to_string(), + kind: VariableType::Named, + rule: Rule::seq(vec![ + Rule::non_terminal(1), + Rule::prec_dynamic( + 101, + Rule::seq(vec![ + Rule::non_terminal(2), + Rule::choice(vec![ + Rule::prec_dynamic( + 102, + Rule::seq(vec![Rule::non_terminal(3), Rule::non_terminal(4)]), + ), + Rule::non_terminal(5), + ]), + Rule::non_terminal(6), + ]), + ), + Rule::non_terminal(7), + ]), + }) + .unwrap(); - assert_eq!(result.productions, vec![ - Production { - dynamic_precedence: 102, - steps: vec![ - ProductionStep::new(Symbol::non_terminal(1)), - ProductionStep::new(Symbol::non_terminal(2)), - ProductionStep::new(Symbol::non_terminal(3)), - ProductionStep::new(Symbol::non_terminal(4)), - ProductionStep::new(Symbol::non_terminal(6)), - ProductionStep::new(Symbol::non_terminal(7)), - ], - }, - Production { - dynamic_precedence: 101, - steps: vec![ - ProductionStep::new(Symbol::non_terminal(1)), - ProductionStep::new(Symbol::non_terminal(2)), - ProductionStep::new(Symbol::non_terminal(5)), - 
ProductionStep::new(Symbol::non_terminal(6)), - ProductionStep::new(Symbol::non_terminal(7)), - ], - }, - ]); + assert_eq!( + result.productions, + vec![ + Production { + dynamic_precedence: 102, + steps: vec![ + ProductionStep::new(Symbol::non_terminal(1)), + ProductionStep::new(Symbol::non_terminal(2)), + ProductionStep::new(Symbol::non_terminal(3)), + ProductionStep::new(Symbol::non_terminal(4)), + ProductionStep::new(Symbol::non_terminal(6)), + ProductionStep::new(Symbol::non_terminal(7)), + ], + }, + Production { + dynamic_precedence: 101, + steps: vec![ + ProductionStep::new(Symbol::non_terminal(1)), + ProductionStep::new(Symbol::non_terminal(2)), + ProductionStep::new(Symbol::non_terminal(5)), + ProductionStep::new(Symbol::non_terminal(6)), + ProductionStep::new(Symbol::non_terminal(7)), + ], + }, + ] + ); } #[test] @@ -277,37 +314,40 @@ mod tests { let result = flatten_variable(Variable { name: "test".to_string(), kind: VariableType::Named, - rule: Rule::prec_left(101, Rule::seq(vec![ - Rule::non_terminal(1), - Rule::non_terminal(2), - ])), - }).unwrap(); + rule: Rule::prec_left( + 101, + Rule::seq(vec![Rule::non_terminal(1), Rule::non_terminal(2)]), + ), + }) + .unwrap(); - assert_eq!(result.productions, vec![ - Production { + assert_eq!( + result.productions, + vec![Production { dynamic_precedence: 0, steps: vec![ - ProductionStep::new(Symbol::non_terminal(1)).with_prec(101, Some(Associativity::Left)), - ProductionStep::new(Symbol::non_terminal(2)).with_prec(101, Some(Associativity::Left)), + ProductionStep::new(Symbol::non_terminal(1)) + .with_prec(101, Some(Associativity::Left)), + ProductionStep::new(Symbol::non_terminal(2)) + .with_prec(101, Some(Associativity::Left)), ] - } - ]); + }] + ); let result = flatten_variable(Variable { name: "test".to_string(), kind: VariableType::Named, - rule: Rule::prec_left(101, Rule::seq(vec![ - Rule::non_terminal(1), - ])), - }).unwrap(); + rule: Rule::prec_left(101, Rule::seq(vec![Rule::non_terminal(1)])), + }) + 
.unwrap(); - assert_eq!(result.productions, vec![ - Production { + assert_eq!( + result.productions, + vec![Production { dynamic_precedence: 0, - steps: vec![ - ProductionStep::new(Symbol::non_terminal(1)).with_prec(101, Some(Associativity::Left)), - ] - } - ]); + steps: vec![ProductionStep::new(Symbol::non_terminal(1)) + .with_prec(101, Some(Associativity::Left)),] + }] + ); } } diff --git a/cli/src/generate/prepare_grammar/intern_symbols.rs b/cli/src/generate/prepare_grammar/intern_symbols.rs index a7248817..8b07309b 100644 --- a/cli/src/generate/prepare_grammar/intern_symbols.rs +++ b/cli/src/generate/prepare_grammar/intern_symbols.rs @@ -8,7 +8,7 @@ pub(super) fn intern_symbols(grammar: &InputGrammar) -> Result if variable_type_for_name(&grammar.variables[0].name) == VariableType::Hidden { return Err(Error( - "Grammar's start rule must be visible".to_string(), + "A grammar's start rule must be visible.".to_string(), )); } diff --git a/cli/src/generate/render.rs b/cli/src/generate/render.rs index 5e0d2b67..55bfbfa2 100644 --- a/cli/src/generate/render.rs +++ b/cli/src/generate/render.rs @@ -191,13 +191,11 @@ impl Generator { "#define EXTERNAL_TOKEN_COUNT {}", self.syntax_grammar.external_tokens.len() ); - if self.parse_table.max_aliased_production_length > 0 { - add_line!( - self, - "#define MAX_ALIAS_SEQUENCE_LENGTH {}", - self.parse_table.max_aliased_production_length - ); - } + add_line!( + self, + "#define MAX_ALIAS_SEQUENCE_LENGTH {}", + self.parse_table.max_aliased_production_length + ); add_line!(self, ""); } diff --git a/cli/src/tests/corpuses.rs b/cli/src/tests/corpuses.rs index 624786fc..eeea113c 100644 --- a/cli/src/tests/corpuses.rs +++ b/cli/src/tests/corpuses.rs @@ -3,7 +3,7 @@ use crate::generate; use crate::loader::Loader; use crate::test::{parse_tests, TestEntry}; use std::fs; -use std::path::{Path, PathBuf}; +use std::path::PathBuf; use tree_sitter::{Language, Parser}; lazy_static! { @@ -19,6 +19,7 @@ lazy_static! 
{ static ref HEADER_DIR: PathBuf = ROOT_DIR.join("lib").join("include"); static ref SCRATCH_DIR: PathBuf = ROOT_DIR.join("target").join("scratch"); static ref FIXTURES_DIR: PathBuf = ROOT_DIR.join("test").join("fixtures"); + static ref EXEC_PATH: PathBuf = std::env::current_exe().unwrap(); } #[test] @@ -38,27 +39,42 @@ fn test_real_language_corpus_files() { fn test_feature_corpus_files() { fs::create_dir_all(SCRATCH_DIR.as_path()).unwrap(); + let filter = std::env::var("TREE_SITTER_TEST_FILTER").ok(); let mut loader = Loader::new(SCRATCH_DIR.clone()); let mut parser = Parser::new(); let test_grammars_dir = FIXTURES_DIR.join("test_grammars"); for entry in fs::read_dir(&test_grammars_dir).unwrap() { let entry = entry.unwrap(); + if !entry.metadata().unwrap().is_dir() { + continue; + } let test_name = entry.file_name(); let test_name = test_name.to_str().unwrap(); - eprintln!("test name: {}", test_name); + if let Some(filter) = filter.as_ref() { + if !test_name.contains(filter.as_str()) { + continue; + } + } + + eprintln!("test: {:?}", test_name); + let test_path = entry.path(); let grammar_path = test_path.join("grammar.json"); - let corpus_path = test_path.join("corpus.txt"); let error_message_path = test_path.join("expected_error.txt"); - let grammar_json = fs::read_to_string(grammar_path).unwrap(); let generate_result = generate::generate_parser_for_grammar(&grammar_json); + if error_message_path.exists() { - continue; + let expected_message = fs::read_to_string(&error_message_path).unwrap(); if let Err(e) = generate_result { - assert_eq!(e.0, fs::read_to_string(&error_message_path).unwrap()); + if e.0 != expected_message { + panic!( + "Unexpected error message.\n\nExpected:\n\n{}\nActual:\n\n{}\n", + expected_message, e.0 + ); + } } else { panic!( "Expected error message but got none for test grammar '{}'", @@ -66,9 +82,15 @@ fn test_feature_corpus_files() { ); } } else { + let corpus_path = test_path.join("corpus.txt"); let c_code = generate_result.unwrap(); let 
parser_c_path = SCRATCH_DIR.join(&format!("{}-parser.c", test_name)); - fs::write(&parser_c_path, c_code).unwrap(); + if !fs::read_to_string(&parser_c_path) + .map(|content| content == c_code) + .unwrap_or(false) + { + fs::write(&parser_c_path, c_code).unwrap(); + } let scanner_path = test_path.join("scanner.c"); let scanner_path = if scanner_path.exists() { Some(scanner_path) @@ -78,6 +100,7 @@ fn test_feature_corpus_files() { let language = loader .load_language_from_sources(test_name, &HEADER_DIR, &parser_c_path, &scanner_path) .unwrap(); + let test = parse_tests(&corpus_path).unwrap(); } } diff --git a/test/fixtures/test_grammars/conflict_in_repeat_rule/expected_error.txt b/test/fixtures/test_grammars/conflict_in_repeat_rule/expected_error.txt index 2c710346..94d1caa4 100644 --- a/test/fixtures/test_grammars/conflict_in_repeat_rule/expected_error.txt +++ b/test/fixtures/test_grammars/conflict_in_repeat_rule/expected_error.txt @@ -1,14 +1,14 @@ Unresolved conflict for symbol sequence: - '[' identifier • ']' … + '[' identifier • identifier … Possible interpretations: - 1: '[' (array_repeat1 identifier) • ']' … - 2: '[' (array_type_repeat1 identifier) • ']' … + 1: '[' (array_type_repeat1 identifier) • identifier … + 2: '[' (array_repeat1 identifier) • identifier … Possible resolutions: - 1: Specify a higher precedence in `array_repeat1` than in the other rules. - 2: Specify a higher precedence in `array_type_repeat1` than in the other rules. - 3: Add a conflict for these rules: `array` `array_type` + 1: Specify a higher precedence in `array_type_repeat1` than in the other rules. + 2: Specify a higher precedence in `array_repeat1` than in the other rules. 
+ 3: Add a conflict for these rules: `array`, `array_type` diff --git a/test/fixtures/test_grammars/conflict_in_repeat_rule_after_external_token/expected_error.txt b/test/fixtures/test_grammars/conflict_in_repeat_rule_after_external_token/expected_error.txt index cbb3e02c..4a81f0ef 100644 --- a/test/fixtures/test_grammars/conflict_in_repeat_rule_after_external_token/expected_error.txt +++ b/test/fixtures/test_grammars/conflict_in_repeat_rule_after_external_token/expected_error.txt @@ -1,14 +1,14 @@ Unresolved conflict for symbol sequence: - _program_start '[' identifier • ']' … + _program_start '[' identifier • identifier … Possible interpretations: - 1: _program_start '[' (array_repeat1 identifier) • ']' … - 2: _program_start '[' (array_type_repeat1 identifier) • ']' … + 1: _program_start '[' (array_type_repeat1 identifier) • identifier … + 2: _program_start '[' (array_repeat1 identifier) • identifier … Possible resolutions: - 1: Specify a higher precedence in `array_repeat1` than in the other rules. - 2: Specify a higher precedence in `array_type_repeat1` than in the other rules. - 3: Add a conflict for these rules: `array` `array_type` + 1: Specify a higher precedence in `array_type_repeat1` than in the other rules. + 2: Specify a higher precedence in `array_repeat1` than in the other rules. 
+ 3: Add a conflict for these rules: `array`, `array_type` diff --git a/test/fixtures/test_grammars/conflicting_precedence/expected_error.txt b/test/fixtures/test_grammars/conflicting_precedence/expected_error.txt index ce7090a3..ea23b072 100644 --- a/test/fixtures/test_grammars/conflicting_precedence/expected_error.txt +++ b/test/fixtures/test_grammars/conflicting_precedence/expected_error.txt @@ -4,12 +4,12 @@ Unresolved conflict for symbol sequence: Possible interpretations: - 1: expression '+' (product expression • '*' expression) - 2: expression '+' (other_thing expression • '*' '*') - 3: (sum expression '+' expression) • '*' … + 1: expression '+' (product expression • '*' expression) (precedence: 1, associativity: Left) + 2: expression '+' (other_thing expression • '*' '*') (precedence: -1, associativity: Left) + 3: (sum expression '+' expression) • '*' … (precedence: 0, associativity: Left) Possible resolutions: 1: Specify a higher precedence in `product` and `other_thing` than in the other rules. 2: Specify a higher precedence in `sum` than in the other rules. 
- 3: Add a conflict for these rules: `sum` `product` `other_thing` + 3: Add a conflict for these rules: `sum`, `product`, `other_thing` diff --git a/test/fixtures/test_grammars/dynamic_precedence/grammar.json b/test/fixtures/test_grammars/dynamic_precedence/grammar.json index 381ed4c2..1a7e04ab 100644 --- a/test/fixtures/test_grammars/dynamic_precedence/grammar.json +++ b/test/fixtures/test_grammars/dynamic_precedence/grammar.json @@ -14,7 +14,7 @@ "type": "CHOICE", "members": [ {"type": "SYMBOL", "name": "declaration"}, - {"type": "SYMBOL", "name": "expression"}, + {"type": "SYMBOL", "name": "expression"} ] }, diff --git a/test/fixtures/test_grammars/inlined_aliased_rules/grammar.json b/test/fixtures/test_grammars/inlined_aliased_rules/grammar.json index bdf01789..d98f6e6c 100644 --- a/test/fixtures/test_grammars/inlined_aliased_rules/grammar.json +++ b/test/fixtures/test_grammars/inlined_aliased_rules/grammar.json @@ -44,7 +44,7 @@ {"type": "SYMBOL", "name": "expression"}, {"type": "STRING", "value": "("}, {"type": "SYMBOL", "name": "expression"}, - {"type": "STRING", "value": ")"}, + {"type": "STRING", "value": ")"} ] } }, diff --git a/test/fixtures/test_grammars/partially_resolved_conflict/expected_error.txt b/test/fixtures/test_grammars/partially_resolved_conflict/expected_error.txt index 201bdf98..a8699897 100644 --- a/test/fixtures/test_grammars/partially_resolved_conflict/expected_error.txt +++ b/test/fixtures/test_grammars/partially_resolved_conflict/expected_error.txt @@ -4,11 +4,11 @@ Unresolved conflict for symbol sequence: Possible interpretations: - 1: (unary_a '!' expression) • '<' … - 2: (unary_b '!' expression) • '<' … + 1: (unary_b '!' expression) • '<' … (precedence: 2) + 2: (unary_a '!' expression) • '<' … (precedence: 2) Possible resolutions: - 1: Specify a higher precedence in `unary_a` than in the other rules. - 2: Specify a higher precedence in `unary_b` than in the other rules. 
- 3: Add a conflict for these rules: `unary_a` `unary_b` + 1: Specify a higher precedence in `unary_b` than in the other rules. + 2: Specify a higher precedence in `unary_a` than in the other rules. + 3: Add a conflict for these rules: `unary_a`, `unary_b` diff --git a/test/fixtures/test_grammars/precedence_on_single_child_missing/expected_error.txt b/test/fixtures/test_grammars/precedence_on_single_child_missing/expected_error.txt index 6ee80f23..557f1837 100644 --- a/test/fixtures/test_grammars/precedence_on_single_child_missing/expected_error.txt +++ b/test/fixtures/test_grammars/precedence_on_single_child_missing/expected_error.txt @@ -4,12 +4,12 @@ Unresolved conflict for symbol sequence: Possible interpretations: - 1: identifier (expression identifier) • '{' … - 2: identifier (function_call identifier • block) + 1: identifier (function_call identifier • block) (precedence: 0, associativity: Right) + 2: identifier (expression identifier) • '{' … Possible resolutions: 1: Specify a higher precedence in `function_call` than in the other rules. 2: Specify a higher precedence in `expression` than in the other rules. 
3: Specify a left or right associativity in `expression` - 4: Add a conflict for these rules: `expression` `function_call` + 4: Add a conflict for these rules: `expression`, `function_call` diff --git a/test/fixtures/test_grammars/precedence_on_token/grammar.json b/test/fixtures/test_grammars/precedence_on_token/grammar.json index 1b1ef7ea..8ba7e69a 100644 --- a/test/fixtures/test_grammars/precedence_on_token/grammar.json +++ b/test/fixtures/test_grammars/precedence_on_token/grammar.json @@ -3,7 +3,7 @@ "extras": [ {"type": "SYMBOL", "name": "comment"}, - {"type": "PATTERN", "value": "\\s"}, + {"type": "PATTERN", "value": "\\s"} ], "rules": { From 1468b349b59667171f0776c1bdf2689e0ffc1a68 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 11 Jan 2019 17:39:16 -0800 Subject: [PATCH 131/208] Ensure 'src' directory exists before writing src/parser.c --- cli/src/generate/mod.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/cli/src/generate/mod.rs b/cli/src/generate/mod.rs index 5d89bbfe..283ab0b2 100644 --- a/cli/src/generate/mod.rs +++ b/cli/src/generate/mod.rs @@ -36,6 +36,7 @@ pub fn generate_parser_in_directory( let grammar_json = load_js_grammar_file(&repo_path.join("grammar.js")); let c_code = generate_parser_for_grammar_with_opts(&grammar_json, minimize, state_ids_to_log)?; + fs::create_dir_all("src")?; fs::write(repo_path.join("src").join("parser.c"), c_code)?; } properties::generate_property_sheets(repo_path)?; From c76a155174c841a9f85b5a73682c5b090af739d1 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 11 Jan 2019 17:43:27 -0800 Subject: [PATCH 132/208] Fix escaping of characters in C strings --- cli/src/generate/render.rs | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/cli/src/generate/render.rs b/cli/src/generate/render.rs index 55bfbfa2..a3e20536 100644 --- a/cli/src/generate/render.rs +++ b/cli/src/generate/render.rs @@ -931,10 +931,14 @@ impl Generator { fn sanitize_string(&self, name: &str) -> String { let 
mut result = String::with_capacity(name.len()); for c in name.chars() { - if ['\\', '\n', '\r', '\"'].contains(&c) { - result.push('\\'); + match c { + '\"' => result += "\\\"", + '\\' => result += "\\\\", + '\t' => result += "'\\t'", + '\n' => result += "'\\n'", + '\r' => result += "'\\r'", + _ => result.push(c), } - result.push(c); } result } From 6f242fda0c25cc9271478b13440ae39e89d928ca Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 11 Jan 2019 17:43:42 -0800 Subject: [PATCH 133/208] Fix edge case in flatten rule --- .../generate/prepare_grammar/flatten_grammar.rs | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/cli/src/generate/prepare_grammar/flatten_grammar.rs b/cli/src/generate/prepare_grammar/flatten_grammar.rs index abd06769..204ceb07 100644 --- a/cli/src/generate/prepare_grammar/flatten_grammar.rs +++ b/cli/src/generate/prepare_grammar/flatten_grammar.rs @@ -31,13 +31,15 @@ impl RuleFlattener { self.production } - fn apply(&mut self, rule: Rule, at_end: bool) { + fn apply(&mut self, rule: Rule, at_end: bool) -> bool { match rule { Rule::Seq(members) => { + let mut result = false; let last_index = members.len() - 1; for (i, member) in members.into_iter().enumerate() { - self.apply(member, i == last_index && at_end); + result |= self.apply(member, i == last_index && at_end); } + result } Rule::Metadata { rule, params } => { let mut has_precedence = false; @@ -62,11 +64,11 @@ impl RuleFlattener { self.production.dynamic_precedence = params.dynamic_precedence; } - self.apply(*rule, at_end); + let did_push = self.apply(*rule, at_end); if has_precedence { self.precedence_stack.pop(); - if !at_end { + if did_push && !at_end { self.production.steps.last_mut().unwrap().precedence = self.precedence_stack.last().cloned().unwrap_or(0); } @@ -74,7 +76,7 @@ impl RuleFlattener { if has_associativity { self.associativity_stack.pop(); - if !at_end { + if did_push && !at_end { 
self.production.steps.last_mut().unwrap().associativity = self.associativity_stack.last().cloned(); } @@ -83,6 +85,8 @@ impl RuleFlattener { if has_alias { self.alias_stack.pop(); } + + did_push } Rule::Symbol(symbol) => { self.production.steps.push(ProductionStep { @@ -91,8 +95,9 @@ impl RuleFlattener { associativity: self.associativity_stack.last().cloned(), alias: self.alias_stack.last().cloned(), }); + true } - _ => (), + _ => false, } } } From 545e840a082b8873ed2a85f5bb98196a06e419de Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Sat, 12 Jan 2019 21:42:31 -0800 Subject: [PATCH 134/208] Remove stray single quotes in symbol name strings --- cli/src/generate/render.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cli/src/generate/render.rs b/cli/src/generate/render.rs index a3e20536..c164ee1b 100644 --- a/cli/src/generate/render.rs +++ b/cli/src/generate/render.rs @@ -934,9 +934,9 @@ impl Generator { match c { '\"' => result += "\\\"", '\\' => result += "\\\\", - '\t' => result += "'\\t'", - '\n' => result += "'\\n'", - '\r' => result += "'\\r'", + '\t' => result += "\\t", + '\n' => result += "\\n", + '\r' => result += "\\r", _ => result.push(c), } } From 2e009f7177b5b409399c5de80ed330717a0b5522 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Sat, 12 Jan 2019 21:57:34 -0800 Subject: [PATCH 135/208] Avoid writing empty initializer list for alias sequences --- cli/src/generate/render.rs | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/cli/src/generate/render.rs b/cli/src/generate/render.rs index c164ee1b..9b09c6ad 100644 --- a/cli/src/generate/render.rs +++ b/cli/src/generate/render.rs @@ -66,7 +66,10 @@ impl Generator { self.add_symbol_enum(); self.add_symbol_names_list(); self.add_symbol_metadata_list(); - self.add_alias_sequences(); + + if self.parse_table.alias_sequences.len() > 1 { + self.add_alias_sequences(); + } let mut main_lex_table = LexTable::default(); swap(&mut main_lex_table, &mut 
self.main_lex_table); @@ -750,10 +753,13 @@ impl Generator { add_line!(self, ".parse_actions = ts_parse_actions,"); add_line!(self, ".lex_modes = ts_lex_modes,"); add_line!(self, ".symbol_names = ts_symbol_names,"); - add_line!( - self, - ".alias_sequences = (const TSSymbol *)ts_alias_sequences," - ); + + if self.parse_table.alias_sequences.len() > 1 { + add_line!( + self, + ".alias_sequences = (const TSSymbol *)ts_alias_sequences," + ); + } add_line!( self, From e2717a6ad14c6d1db056b55e89526b70eeb48a83 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Mon, 14 Jan 2019 14:05:19 -0800 Subject: [PATCH 136/208] Preprocess regexes to allow non-standard escape sequences Also allow unescaped curly braces to match literal curly braces when they don't form a valid repetition operator. --- cli/src/generate/dsl.js | 5 -- .../generate/prepare_grammar/expand_tokens.rs | 65 +++++++++++++++++++ 2 files changed, 65 insertions(+), 5 deletions(-) diff --git a/cli/src/generate/dsl.js b/cli/src/generate/dsl.js index ba3962cd..fa60dfa7 100644 --- a/cli/src/generate/dsl.js +++ b/cli/src/generate/dsl.js @@ -1,5 +1,4 @@ const UNICODE_ESCAPE_PATTERN = /\\u([0-9a-f]{4})/gi; -const DELIMITER_ESCAPE_PATTERN = /\\\//g; function alias(rule, value) { const result = { @@ -150,10 +149,6 @@ function normalize(value) { return { type: 'PATTERN', value: value.source - .replace( - DELIMITER_ESCAPE_PATTERN, - '/' - ) .replace( UNICODE_ESCAPE_PATTERN, (match, group) => String.fromCharCode(parseInt(group, 16)) diff --git a/cli/src/generate/prepare_grammar/expand_tokens.rs b/cli/src/generate/prepare_grammar/expand_tokens.rs index e269df6d..1e2ef2e5 100644 --- a/cli/src/generate/prepare_grammar/expand_tokens.rs +++ b/cli/src/generate/prepare_grammar/expand_tokens.rs @@ -6,8 +6,15 @@ use crate::generate::rules::Rule; use regex_syntax::ast::{ parse, Ast, Class, ClassPerlKind, ClassSet, ClassSetItem, RepetitionKind, RepetitionRange, }; +use regex::Regex; use std::i32; +lazy_static! 
{ + static ref CURLY_BRACE_REGEX: Regex = Regex::new(r#"(^|[^\\])\{([^}]*[^0-9}][^}]*)\}"#).unwrap(); +} + +const ALLOWED_REDUNDANT_ESCAPED_CHARS: [char; 4] = ['!', '\'', '"', '/']; + struct NfaBuilder { nfa: Nfa, is_sep: bool, @@ -35,6 +42,31 @@ fn get_completion_precedence(rule: &Rule) -> i32 { } } +fn preprocess_regex(content: &str) -> String { + let content = CURLY_BRACE_REGEX.replace(content, "$1\\{$2\\}"); + let mut result = String::with_capacity(content.len()); + let mut is_escaped = false; + for c in content.chars() { + if is_escaped { + if ALLOWED_REDUNDANT_ESCAPED_CHARS.contains(&c) { + result.push(c); + } else { + result.push('\\'); + result.push(c); + } + is_escaped = false; + } else if c == '\\' { + is_escaped = true; + } else { + result.push(c); + } + } + if is_escaped { + result.push('\\'); + } + result +} + pub(crate) fn expand_tokens(mut grammar: ExtractedLexicalGrammar) -> Result { let mut builder = NfaBuilder { nfa: Nfa::new(), @@ -90,6 +122,7 @@ impl NfaBuilder { fn expand_rule(&mut self, rule: &Rule, mut next_state_id: u32) -> Result { match rule { Rule::Pattern(s) => { + let s = preprocess_regex(s); let ast = parse::Parser::new() .parse(&s) .map_err(|e| Error(e.to_string()))?; @@ -586,6 +619,38 @@ mod tests { ("12e34", Some((0, "12e34"))), ], }, + // Allowing unrecognized escape sequences + Row { + rules: vec![ + // Escaped forward slash (used in JS because '/' is the regex delimiter) + Rule::pattern(r#"\/"#), + // Escaped quotes + Rule::pattern(r#"\"\'"#), + // Quote preceded by a literal backslash + Rule::pattern(r#"[\\']+"#), + ], + separators: vec![], + examples: vec![ + ("/", Some((0, "/"))), + ("\"\'", Some((1, "\"\'"))), + (r#"'\'a"#, Some((2, r#"'\'"#))), + ], + }, + // Allowing un-escaped curly braces + Row { + rules: vec![ + // Un-escaped curly braces + Rule::pattern(r#"u{[0-9a-fA-F]+}"#), + // Already-escaped curly braces + Rule::pattern(r#"\{[ab]{3}\}"#), + ], + separators: vec![], + examples: vec![ + ("u{1234} ok", Some((0, 
"u{1234}"))), + ("{aba}}", Some((1, "{aba}"))), + ], + + } ]; for Row { From 8f48240bf1e7d654e127ed1147df59be93c07db0 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Mon, 14 Jan 2019 14:06:22 -0800 Subject: [PATCH 137/208] Allow building the C code with static analysis --- lib/build.rs | 41 +++++++++++++++++++++++++++++++---------- 1 file changed, 31 insertions(+), 10 deletions(-) diff --git a/lib/build.rs b/lib/build.rs index 7e8714ef..e4d1f91a 100644 --- a/lib/build.rs +++ b/lib/build.rs @@ -5,16 +5,7 @@ use std::path::{Path, PathBuf}; use std::fs; fn main() { - let mut config = cc::Build::new(); - config - .define("UTF8PROC_STATIC", "") - .flag_if_supported("-std=c99") - .flag_if_supported("-Wno-unused-parameter") - .include("include") - .include("utf8proc") - .file(Path::new("src").join("lib.c")) - .compile("tree-sitter"); - + println!("cargo:rerun-if-env-changed=TREE_SITTER_TEST"); if env::var("TREE_SITTER_TEST").is_ok() { let mut parser_config = cc::Build::new(); parser_config @@ -65,4 +56,34 @@ fn main() { scanner_c_config.compile("fixture-scanners-c"); scanner_cxx_config.compile("fixture-scanners-cxx"); } + + println!("cargo:rerun-if-env-changed=TREE_SITTER_STATIC_ANALYSIS"); + if env::var("TREE_SITTER_STATIC_ANALYSIS").is_ok() { + let clang_path = which("clang").unwrap(); + let clang_path = clang_path.to_str().unwrap(); + env::set_var("CC", &format!("scan-build -analyze-headers --use-analyzer={} cc", clang_path)); + } + + let mut config = cc::Build::new(); + config + .define("UTF8PROC_STATIC", "") + .flag_if_supported("-std=c99") + .flag_if_supported("-Wno-unused-parameter") + .include("include") + .include("utf8proc") + .file(Path::new("src").join("lib.c")) + .compile("tree-sitter"); +} + +fn which(exe_name: impl AsRef) -> Option { + env::var_os("PATH").and_then(|paths| { + env::split_paths(&paths).find_map(|dir| { + let full_path = dir.join(&exe_name); + if full_path.is_file() { + Some(full_path) + } else { + None + } + }) + }) } From 
def5884b59495fbe3ff199f199eee58731f5398e Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Mon, 14 Jan 2019 14:07:42 -0800 Subject: [PATCH 138/208] Allow passing grammar JS or JSON path to `generate` command --- cli/src/generate/mod.rs | 12 +++++++++++- cli/src/generate/properties.rs | 22 ++++++++++++---------- cli/src/main.rs | 3 +++ 3 files changed, 26 insertions(+), 11 deletions(-) diff --git a/cli/src/generate/mod.rs b/cli/src/generate/mod.rs index 283ab0b2..1593c0da 100644 --- a/cli/src/generate/mod.rs +++ b/cli/src/generate/mod.rs @@ -28,12 +28,14 @@ lazy_static! { pub fn generate_parser_in_directory( repo_path: &PathBuf, + grammar_path: Option<&str>, minimize: bool, state_ids_to_log: Vec, properties_only: bool, ) -> Result<()> { if !properties_only { - let grammar_json = load_js_grammar_file(&repo_path.join("grammar.js")); + let grammar_path = grammar_path.map_or(repo_path.join("grammar.js"), |s| s.into()); + let grammar_json = load_grammar_file(&grammar_path); let c_code = generate_parser_for_grammar_with_opts(&grammar_json, minimize, state_ids_to_log)?; fs::create_dir_all("src")?; @@ -77,6 +79,14 @@ fn generate_parser_for_grammar_with_opts( )) } +fn load_grammar_file(grammar_path: &PathBuf) -> String { + match grammar_path.extension().and_then(|e| e.to_str()) { + Some("js") => load_js_grammar_file(grammar_path), + Some("json") => fs::read_to_string(grammar_path).expect("Failed to read grammar file"), + _ => panic!("Unknown grammar file extension"), + } +} + fn load_js_grammar_file(grammar_path: &PathBuf) -> String { let mut node_process = Command::new("node") .stdin(Stdio::piped()) diff --git a/cli/src/generate/properties.rs b/cli/src/generate/properties.rs index cca7fef8..e1492d6f 100644 --- a/cli/src/generate/properties.rs +++ b/cli/src/generate/properties.rs @@ -424,16 +424,18 @@ pub fn generate_property_sheets(repo_path: &Path) -> Result<()> { let src_dir_path = repo_path.join("src"); let properties_dir_path = repo_path.join("properties"); - for entry 
in fs::read_dir(properties_dir_path)? { - let css_path = entry?.path(); - let css = fs::read_to_string(&css_path)?; - let sheet = generate_property_sheet(&css_path, &css)?; - let property_sheet_json_path = src_dir_path - .join(css_path.file_name().unwrap()) - .with_extension("json"); - let property_sheet_json_file = File::create(property_sheet_json_path)?; - let mut writer = BufWriter::new(property_sheet_json_file); - serde_json::to_writer_pretty(&mut writer, &sheet)?; + if let Ok(entries) = fs::read_dir(properties_dir_path) { + for entry in entries { + let css_path = entry?.path(); + let css = fs::read_to_string(&css_path)?; + let sheet = generate_property_sheet(&css_path, &css)?; + let property_sheet_json_path = src_dir_path + .join(css_path.file_name().unwrap()) + .with_extension("json"); + let property_sheet_json_file = File::create(property_sheet_json_path)?; + let mut writer = BufWriter::new(property_sheet_json_file); + serde_json::to_writer_pretty(&mut writer, &sheet)?; + } } Ok(()) diff --git a/cli/src/main.rs b/cli/src/main.rs index 5a830458..80a40758 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -43,6 +43,7 @@ fn run() -> error::Result<()> { .subcommand( SubCommand::with_name("generate") .about("Generate a parser") + .arg(Arg::with_name("grammar-path").index(1)) .arg(Arg::with_name("log").long("log")) .arg(Arg::with_name("properties-only").long("properties")) .arg( @@ -84,6 +85,7 @@ fn run() -> error::Result<()> { logger::init(); } + let grammar_path = matches.value_of("grammar-path"); let minimize = !matches.is_present("no-minimize"); let properties_only = matches.is_present("properties-only"); let state_ids_to_log = matches @@ -94,6 +96,7 @@ fn run() -> error::Result<()> { }); generate::generate_parser_in_directory( ¤t_dir, + grammar_path, minimize, state_ids_to_log, properties_only, From 19b2addcc42f89e17cb34ecef906c29203dacb9e Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Mon, 14 Jan 2019 14:08:07 -0800 Subject: [PATCH 139/208] Fix bug in 
symbol enum code generation --- cli/src/generate/render.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cli/src/generate/render.rs b/cli/src/generate/render.rs index 9b09c6ad..1da7f99d 100644 --- a/cli/src/generate/render.rs +++ b/cli/src/generate/render.rs @@ -215,8 +215,8 @@ impl Generator { for (alias, symbol) in &self.alias_map { if symbol.is_none() { add_line!(self, "{} = {},", self.alias_ids[&alias], i); + i += 1; } - i += 1; } dedent!(self); add_line!(self, "}};"); From b1fa49448d3c18e2d44b5e35e59795e0e7aa9078 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Mon, 14 Jan 2019 14:39:01 -0800 Subject: [PATCH 140/208] Regenerate parsers on CI --- .appveyor.yml | 17 ++++++++++------- .travis.yml | 15 +++++++++++---- script/format | 7 ------- script/lint | 14 -------------- script/regenerate-fixtures | 27 +++++++++++++++++++++++++++ script/regenerate-fixtures.cmd | 22 ++++++++++++++++++++++ 6 files changed, 70 insertions(+), 32 deletions(-) delete mode 100755 script/format delete mode 100755 script/lint create mode 100755 script/regenerate-fixtures create mode 100644 script/regenerate-fixtures.cmd diff --git a/.appveyor.yml b/.appveyor.yml index 147827e9..f46e34e6 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -1,9 +1,4 @@ -environment: - RUST_BACKTRACE: full - TREE_SITTER_TEST: true - build: false - install: # Install rust - appveyor DownloadFile https://win.rustup.rs/ -FileName rustup-init.exe @@ -14,10 +9,18 @@ install: # Install dependencies - git submodule update --init - - script\fetch-fixtures.cmd + +environment: + RUST_BACKTRACE: full test_script: - - cargo build + # Fetch and regenerate the fixture parsers + - script\fetch-fixtures.cmd + - cargo build --release + - script\regenerate-fixtures.cmd + + # Run tests + - set TREE_SITTER_TEST=1 - cargo test branches: diff --git a/.travis.yml b/.travis.yml index 0d4f8cd5..c281ef95 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,13 +1,20 @@ language: rust - rust: - stable env: - - 
TREE_SITTER_TEST=1 RUST_BACKTRACE=full + - RUST_BACKTRACE=full -before_install: - - ./script/fetch-fixtures +script: + # Fetch and regenerate the fixture parsers + - script/fetch-fixtures + - cargo build --release + - script/regenerate-fixtures + + # Run tests + - export TREE_SITTER_TEST=1 + - export TREE_SITTER_STATIC_ANALYSIS=1 + - cargo test branches: only: diff --git a/script/format b/script/format deleted file mode 100755 index 1aa8fbee..00000000 --- a/script/format +++ /dev/null @@ -1,7 +0,0 @@ -#!/usr/bin/env bash - -DIRS="${*:-src include}" - -find $DIRS \ - -name '*.c' -or -name '*.cc' -or -name '*.h' | \ - xargs clang-format -i -style=file diff --git a/script/lint b/script/lint deleted file mode 100755 index 3d6a03dc..00000000 --- a/script/lint +++ /dev/null @@ -1,14 +0,0 @@ -#!/usr/bin/env bash - -CPPLINT=externals/cpplint.py -CPPLINT_URL=http://google-styleguide.googlecode.com/svn/trunk/cpplint/cpplint.py - -if [[ ! -f $CPPLINT ]]; then - curl $CPPLINT_URL > $CPPLINT - chmod +x $CPPLINT -fi - -FILTERS='--filter=-legal/copyright,-readability/todo,-build/c++11' - -$CPPLINT --linelength=90 --root=include $FILTERS include/tree_sitter/compiler.h 2>&1 -$CPPLINT --linelength=90 --root=src $FILTERS $(find src/compiler -type f) 2>&1 diff --git a/script/regenerate-fixtures b/script/regenerate-fixtures new file mode 100755 index 00000000..15e3c09d --- /dev/null +++ b/script/regenerate-fixtures @@ -0,0 +1,27 @@ +#!/usr/bin/env bash + +set -e + +root_dir=$PWD +tree_sitter=${root_dir}/target/release/tree-sitter +grammars_dir=${root_dir}/test/fixtures/grammars + +grammar_names=( + bash + c + cpp + embedded-template + go + html + javascript + json + python + rust +) + +for grammar_name in "${grammar_names[@]}"; do + echo "Regenerating ${grammar_name} parser" + cd ${grammars_dir}/${grammar_name} + $tree_sitter generate src/grammar.json + cd $PWD +done diff --git a/script/regenerate-fixtures.cmd b/script/regenerate-fixtures.cmd new file mode 100644 index 
00000000..739bdba1 --- /dev/null +++ b/script/regenerate-fixtures.cmd @@ -0,0 +1,22 @@ +@echo off + +call:regenerate bash +call:regenerate c +call:regenerate cpp +call:regenerate embedded-template +call:regenerate go +call:regenerate html +call:regenerate javascript +call:regenerate json +call:regenerate python +call:regenerate rust +EXIT /B 0 + +:regenerate +SETLOCAL +SET tree_sitter=%cd%\target\release\tree-sitter +SET grammar_dir=test\fixtures\grammars\%~1 +pushd %grammar_dir% +%tree_sitter% generate src\grammar.json +popd +EXIT /B 0 From 5c3c1dd0bd28fec0ab20d77c288f1b66b9b90a0f Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Mon, 14 Jan 2019 17:19:46 -0800 Subject: [PATCH 141/208] Get logging flags working properly with test script --- .appveyor.yml | 5 +- .travis.yml | 6 +- cli/src/parse.rs | 9 +- cli/src/test.rs | 10 +- cli/src/tests/corpuses.rs | 95 +++++++++--- cli/src/tests/languages.rs | 2 + cli/src/tests/parser_api.rs | 4 - cli/src/util.rs | 55 ++++--- lib/binding/lib.rs | 2 +- script/clean | 7 - script/configure | 7 - script/configure.cmd | 3 - script/{bindgen.sh => generate-bindings} | 0 script/test | 139 +++--------------- script/test.cmd | 10 +- script/test.sh | 3 - script/trim-whitespace | 3 - .../corpus.txt | 1 + .../grammar.json | 4 +- 19 files changed, 140 insertions(+), 225 deletions(-) delete mode 100755 script/clean delete mode 100755 script/configure delete mode 100644 script/configure.cmd rename script/{bindgen.sh => generate-bindings} (100%) delete mode 100755 script/test.sh delete mode 100755 script/trim-whitespace diff --git a/.appveyor.yml b/.appveyor.yml index f46e34e6..29193a53 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -10,9 +10,6 @@ install: # Install dependencies - git submodule update --init -environment: - RUST_BACKTRACE: full - test_script: # Fetch and regenerate the fixture parsers - script\fetch-fixtures.cmd @@ -21,7 +18,7 @@ test_script: # Run tests - set TREE_SITTER_TEST=1 - - cargo test + - script\test.cmd 
branches: only: diff --git a/.travis.yml b/.travis.yml index c281ef95..5f981ce9 100644 --- a/.travis.yml +++ b/.travis.yml @@ -2,9 +2,6 @@ language: rust rust: - stable -env: - - RUST_BACKTRACE=full - script: # Fetch and regenerate the fixture parsers - script/fetch-fixtures @@ -12,9 +9,8 @@ script: - script/regenerate-fixtures # Run tests - - export TREE_SITTER_TEST=1 - export TREE_SITTER_STATIC_ANALYSIS=1 - - cargo test + - script/test branches: only: diff --git a/cli/src/parse.rs b/cli/src/parse.rs index fde148b6..38b6a61c 100644 --- a/cli/src/parse.rs +++ b/cli/src/parse.rs @@ -11,14 +11,13 @@ pub fn parse_file_at_path( debug: bool, debug_graph: bool, ) -> Result<()> { + let mut log_session = None; let mut parser = Parser::new(); parser.set_language(language)?; let source_code = fs::read_to_string(path)?; - let mut log_session = None; - if debug_graph { - log_session = Some(util::start_logging_graphs(&mut parser, "log.html")?); + log_session = Some(util::log_graphs(&mut parser, "log.html")?); } else if debug { parser.set_logger(Some(Box::new(|log_type, message| { if log_type == LogType::Lex { @@ -32,9 +31,7 @@ pub fn parse_file_at_path( .parse_str(&source_code, None) .expect("Incompatible language version"); - if let Some(log_session) = log_session { - util::stop_logging_graphs(&mut parser, log_session)?; - } + drop(log_session); let stdout = io::stdout(); let mut stdout = stdout.lock(); diff --git a/cli/src/test.rs b/cli/src/test.rs index e064dffd..bcea3dcc 100644 --- a/cli/src/test.rs +++ b/cli/src/test.rs @@ -44,13 +44,12 @@ pub fn run_tests_at_path( filter: Option<&str>, ) -> Result<()> { let test_entry = parse_tests(path)?; + let mut log_session = None; let mut parser = Parser::new(); parser.set_language(language)?; - let mut log_session = None; - if debug_graph { - log_session = Some(util::start_logging_graphs(&mut parser, "log.html")?); + log_session = Some(util::log_graphs(&mut parser, "log.html")?); } else if debug { 
parser.set_logger(Some(Box::new(|log_type, message| { if log_type == LogType::Lex { @@ -103,10 +102,7 @@ pub fn run_tests_at_path( } } - if let Some(log_session) = log_session { - util::stop_logging_graphs(&mut parser, log_session)?; - } - + drop(log_session); Ok(()) } diff --git a/cli/src/tests/corpuses.rs b/cli/src/tests/corpuses.rs index eeea113c..e1fe9189 100644 --- a/cli/src/tests/corpuses.rs +++ b/cli/src/tests/corpuses.rs @@ -2,12 +2,14 @@ use super::languages; use crate::generate; use crate::loader::Loader; use crate::test::{parse_tests, TestEntry}; +use crate::util; use std::fs; use std::path::PathBuf; -use tree_sitter::{Language, Parser}; +use tree_sitter::{Language, Parser, LogType}; lazy_static! { - static ref LANGUAGES: [(&'static str, Language); 6] = [ + static ref LANGUAGES: [(&'static str, Language); 7] = [ + ("bash", languages::bash()), ("c", languages::c()), ("cpp", languages::cpp()), ("embedded-template", languages::embedded_template()), @@ -20,45 +22,87 @@ lazy_static! 
{ static ref SCRATCH_DIR: PathBuf = ROOT_DIR.join("target").join("scratch"); static ref FIXTURES_DIR: PathBuf = ROOT_DIR.join("test").join("fixtures"); static ref EXEC_PATH: PathBuf = std::env::current_exe().unwrap(); + static ref LANGUAGE_FILTER: Option = + std::env::var("TREE_SITTER_TEST_LANGUAGE_FILTER").ok(); + static ref EXAMPLE_FILTER: Option = + std::env::var("TREE_SITTER_TEST_EXAMPLE_FILTER").ok(); + static ref LOG_ENABLED: bool = std::env::var("TREE_SITTER_ENABLE_LOG").is_ok(); + static ref LOG_GRAPH_ENABLED: bool = std::env::var("TREE_SITTER_ENABLE_LOG_GRAPHS").is_ok(); } #[test] fn test_real_language_corpus_files() { + let mut log_session = None; let mut parser = Parser::new(); let grammars_dir = FIXTURES_DIR.join("grammars"); - for (name, language) in LANGUAGES.iter().cloned() { - let corpus_dir = grammars_dir.join(name).join("corpus"); + if *LOG_ENABLED { + parser.set_logger(Some(Box::new(|log_type, msg| { + if log_type == LogType::Lex { + eprintln!(" {}", msg); + } else { + eprintln!("{}", msg); + } + }))); + } else if *LOG_GRAPH_ENABLED { + log_session = Some(util::log_graphs(&mut parser, "log.html").unwrap()); + } + + for (language_name, language) in LANGUAGES.iter().cloned() { + if let Some(filter) = LANGUAGE_FILTER.as_ref() { + if !language_name.contains(filter.as_str()) { + continue; + } + } + + eprintln!("language: {:?}", language_name); + + let corpus_dir = grammars_dir.join(language_name).join("corpus"); let test = parse_tests(&corpus_dir).unwrap(); parser.set_language(language).unwrap(); run_mutation_tests(&mut parser, test); } + + drop(parser); + drop(log_session); } #[test] fn test_feature_corpus_files() { fs::create_dir_all(SCRATCH_DIR.as_path()).unwrap(); - let filter = std::env::var("TREE_SITTER_TEST_FILTER").ok(); - let mut loader = Loader::new(SCRATCH_DIR.clone()); + let loader = Loader::new(SCRATCH_DIR.clone()); + let mut log_session = None; let mut parser = Parser::new(); let test_grammars_dir = FIXTURES_DIR.join("test_grammars"); + 
if *LOG_ENABLED { + parser.set_logger(Some(Box::new(|log_type, msg| { + if log_type == LogType::Lex { + eprintln!(" {}", msg); + } else { + eprintln!("{}", msg); + } + }))); + } else if *LOG_GRAPH_ENABLED { + log_session = Some(util::log_graphs(&mut parser, "log.html").unwrap()); + } + for entry in fs::read_dir(&test_grammars_dir).unwrap() { let entry = entry.unwrap(); if !entry.metadata().unwrap().is_dir() { continue; } - let test_name = entry.file_name(); - let test_name = test_name.to_str().unwrap(); + let language_name = entry.file_name(); + let language_name = language_name.to_str().unwrap(); - if let Some(filter) = filter.as_ref() { - if !test_name.contains(filter.as_str()) { + if let Some(filter) = LANGUAGE_FILTER.as_ref() { + if !language_name.contains(filter.as_str()) { continue; } } - eprintln!("test: {:?}", test_name); + eprintln!("test language: {:?}", language_name); let test_path = entry.path(); let grammar_path = test_path.join("grammar.json"); @@ -78,13 +122,13 @@ fn test_feature_corpus_files() { } else { panic!( "Expected error message but got none for test grammar '{}'", - test_name + language_name ); } } else { let corpus_path = test_path.join("corpus.txt"); let c_code = generate_result.unwrap(); - let parser_c_path = SCRATCH_DIR.join(&format!("{}-parser.c", test_name)); + let parser_c_path = SCRATCH_DIR.join(&format!("{}-parser.c", language_name)); if !fs::read_to_string(&parser_c_path) .map(|content| content == c_code) .unwrap_or(false) @@ -98,18 +142,21 @@ fn test_feature_corpus_files() { None }; let language = loader - .load_language_from_sources(test_name, &HEADER_DIR, &parser_c_path, &scanner_path) + .load_language_from_sources( + language_name, + &HEADER_DIR, + &parser_c_path, + &scanner_path, + ) .unwrap(); let test = parse_tests(&corpus_path).unwrap(); + parser.set_language(language).unwrap(); + run_mutation_tests(&mut parser, test); } } - // for (name, language) in LANGUAGES.iter().cloned() { - // let corpus_dir = 
grammars_dir.join(name).join("corpus"); - // let test = parse_tests(&corpus_dir).unwrap(); - // parser.set_language(language).unwrap(); - // run_mutation_tests(&mut parser, test); - // } + drop(parser); + drop(log_session); } fn run_mutation_tests(parser: &mut Parser, test: TestEntry) { @@ -119,6 +166,14 @@ fn run_mutation_tests(parser: &mut Parser, test: TestEntry) { input, output, } => { + if let Some(filter) = EXAMPLE_FILTER.as_ref() { + if !name.contains(filter.as_str()) { + return; + } + } + + eprintln!(" example: {:?}", name); + let tree = parser .parse_utf8(&mut |byte_offset, _| &input[byte_offset..], None) .unwrap(); diff --git a/cli/src/tests/languages.rs b/cli/src/tests/languages.rs index 0c483d08..e093d218 100644 --- a/cli/src/tests/languages.rs +++ b/cli/src/tests/languages.rs @@ -1,6 +1,7 @@ use tree_sitter::Language; extern "C" { + fn tree_sitter_bash() -> Language; fn tree_sitter_c() -> Language; fn tree_sitter_cpp() -> Language; fn tree_sitter_embedded_template() -> Language; @@ -10,6 +11,7 @@ extern "C" { fn tree_sitter_rust() -> Language; } +pub fn bash() -> Language { unsafe { tree_sitter_bash() } } pub fn c() -> Language { unsafe { tree_sitter_c() } } pub fn cpp() -> Language { unsafe { tree_sitter_cpp() } } pub fn embedded_template() -> Language { unsafe { tree_sitter_embedded_template() } } diff --git a/cli/src/tests/parser_api.rs b/cli/src/tests/parser_api.rs index e32c292b..d717bfab 100644 --- a/cli/src/tests/parser_api.rs +++ b/cli/src/tests/parser_api.rs @@ -324,10 +324,6 @@ fn test_custom_utf16_input() { let mut parser = Parser::new(); parser.set_language(rust()).unwrap(); - parser.set_logger(Some(Box::new(|t, message| { - println!("log: {:?} {}", t, message); - }))); - let lines: Vec> = ["pub fn foo() {", " 1", "}"] .iter() .map(|s| s.encode_utf16().collect()) diff --git a/cli/src/util.rs b/cli/src/util.rs index f36cbe79..5c1bc39c 100644 --- a/cli/src/util.rs +++ b/cli/src/util.rs @@ -1,29 +1,28 @@ -use std::fs::File; -use 
std::io::{Result, Write}; +#[cfg(unix)] +use std::path::PathBuf; +#[cfg(unix)] use std::process::{Child, ChildStdin, Command, Stdio}; -use std::str; use tree_sitter::Parser; +const HTML_HEADER: &[u8] = b"\n\n\n"; + #[cfg(windows)] pub(crate) struct LogSession(); +#[cfg(unix)] +pub(crate) struct LogSession(PathBuf, Option, Option); + #[cfg(windows)] -pub(crate) fn start_logging_graphs(parser: &mut Parser, path: &str) -> Result { +pub(crate) fn log_graphs(parser: &mut Parser, path: &str) -> std::io::Result { Ok(LogSession()) } -#[cfg(windows)] -pub(crate) fn stop_logging_graphs(parser: &mut Parser, mut session: LogSession) -> Result<()> { - Ok(()) -} - #[cfg(unix)] -pub(crate) struct LogSession(Child, ChildStdin); +pub(crate) fn log_graphs(parser: &mut Parser, path: &str) -> std::io::Result { + use std::io::Write; -#[cfg(unix)] -pub(crate) fn start_logging_graphs(parser: &mut Parser, path: &str) -> Result { - let mut dot_file = File::create(path)?; - dot_file.write(b"\n\n\n")?; + let mut dot_file = std::fs::File::create(path)?; + dot_file.write(HTML_HEADER)?; let mut dot_process = Command::new("dot") .arg("-Tsvg") .stdin(Stdio::piped()) @@ -34,25 +33,23 @@ pub(crate) fn start_logging_graphs(parser: &mut Parser, path: &str) -> Result Result<()> { - drop(session.1); +impl Drop for LogSession { + fn drop(&mut self) { + use std::fs; - if cfg!(unix) { - parser.stop_printing_dot_graphs(); + drop(self.2.take().unwrap()); + let output = self.1.take().unwrap().wait_with_output().unwrap(); + if output.status.success() { + if cfg!(target_os = "macos") && fs::metadata(&self.0).unwrap().len() > HTML_HEADER.len() as u64 { + Command::new("open").arg("log.html").output().unwrap(); + } + } else { + eprintln!("Dot failed: {} {}", String::from_utf8_lossy(&output.stdout), String::from_utf8_lossy(&output.stderr)); + } } - - session.0.wait()?; - - if cfg!(target_os = "macos") { - Command::new("open").arg("log.html").output()?; - } - - Ok(()) } diff --git a/lib/binding/lib.rs 
b/lib/binding/lib.rs index 08f863f8..fdb243ec 100644 --- a/lib/binding/lib.rs +++ b/lib/binding/lib.rs @@ -220,7 +220,6 @@ impl Parser { unsafe { ffi::ts_parser_print_dot_graphs(self.0, ffi::dup(fd)) } } - #[cfg(unix)] pub fn stop_printing_dot_graphs(&mut self) { unsafe { ffi::ts_parser_print_dot_graphs(self.0, -1) } } @@ -391,6 +390,7 @@ impl Parser { impl Drop for Parser { fn drop(&mut self) { + self.stop_printing_dot_graphs(); self.set_logger(None); unsafe { ffi::ts_parser_delete(self.0) } } diff --git a/script/clean b/script/clean deleted file mode 100755 index dfa8ff78..00000000 --- a/script/clean +++ /dev/null @@ -1,7 +0,0 @@ -#!/usr/bin/env bash - -rm -rf \ - build out \ - gyp-mac-tool \ - Makefile *.Makefile *.target.mk \ - *.xcodeproj diff --git a/script/configure b/script/configure deleted file mode 100755 index f2e511a1..00000000 --- a/script/configure +++ /dev/null @@ -1,7 +0,0 @@ -#!/usr/bin/env bash - -set -e - -git submodule update --init --recursive -externals/gyp/gyp project.gyp --depth . --format=make $@ -externals/gyp/gyp tests.gyp --depth . --format=make $@ diff --git a/script/configure.cmd b/script/configure.cmd deleted file mode 100644 index dc73e8de..00000000 --- a/script/configure.cmd +++ /dev/null @@ -1,3 +0,0 @@ -git submodule update --init --recursive -call .\externals\gyp\gyp.bat project.gyp --depth . -call .\externals\gyp\gyp.bat tests.gyp --depth . 
diff --git a/script/bindgen.sh b/script/generate-bindings similarity index 100% rename from script/bindgen.sh rename to script/generate-bindings diff --git a/script/test b/script/test index eb394962..43c274f7 100755 --- a/script/test +++ b/script/test @@ -12,150 +12,53 @@ OPTIONS -h print this message - -b run make under scan-build static analyzer + -a Compile C code with the Clang static analyzer - -d run tests in a debugger (either lldb or gdb) + -l run only the corpus tests for the given language - -g run tests with valgrind's memcheck tool - - -G run tests with valgrind's memcheck tool, including a full leak check - - -v run tests with verbose output - - -f run only tests whose description contain the given string + -e run only the corpus tests whose name contain the given string -s set the seed used to control random behavior + -d print parsing log to stderr + -D pipe tests' stderr to \`dot(1)\` to render an SVG log EOF } -profile= -leak_check=no -mode=normal -verbose= -args=() -target=tests -export BUILDTYPE=Test -cmd="out/${BUILDTYPE}/${target}" -run_scan_build= +export TREE_SITTER_TEST=1 +export RUST_TEST_THREADS=1 +export RUST_BACKTRACE=full -if [ "$(uname -s)" == "Darwin" ]; then - export LINK="clang++ -fsanitize=address" -fi - -while getopts "bdf:s:gGhpvD" option; do +while getopts "bdl:e:s:gGhpvD" option; do case ${option} in h) usage exit ;; - d) - mode=debug + l) + export TREE_SITTER_TEST_LANGUAGE_FILTER=${OPTARG} ;; - g) - mode=valgrind - ;; - G) - mode=valgrind - leak_check=full - ;; - p) - profile=true - ;; - f) - args+=("--only=${OPTARG}") - ;; - v) - verbose=true + e) + export TREE_SITTER_TEST_EXAMPLE_FILTER=${OPTARG} ;; s) export TREE_SITTER_SEED=${OPTARG} ;; - D) - export TREE_SITTER_ENABLE_DEBUG_GRAPHS=1 - mode=SVG + d) + export TREE_SITTER_ENABLE_LOG=1 ;; - b) - run_scan_build=true + D) + export TREE_SITTER_ENABLE_LOG_GRAPHS=1 ;; esac done -if [[ -n $verbose ]]; then - args+=("--reporter=spec") +if [[ -n $TREE_SITTER_TEST_LANGUAGE_FILTER || 
-n $TREE_SITTER_TEST_EXAMPLE_FILTER ]]; then + top_level_filter=corpus else - args+=("--reporter=singleline") + top_level_filter=$1 fi -if [[ -n "$run_scan_build" ]]; then - . script/util/scan-build.sh - scan_build make -j2 $target -else - make -j2 $target -fi -args=${args:-""} - -if [[ -n $profile ]]; then - export CPUPROFILE=/tmp/${target}-$(date '+%s').prof -fi - -case ${mode} in - valgrind) - valgrind \ - --suppressions=./script/util/valgrind.supp \ - --dsymutil=yes \ - --leak-check=${leak_check} \ - $cmd "${args[@]}" 2>&1 | \ - grep --color -E '\w+_tests?.cc:\d+|$' - ;; - - debug) - if hash lldb &> /dev/null; then - lldb $cmd -- "${args[@]}" - elif hash gdb &> /dev/null; then - gdb $cmd -- "${args[@]}" - else - echo "No debugger found" - exit 1 - fi - ;; - - SVG) - html_file=log.html - dot_file=$html_file.dot - - function write_log_file { - echo "" > $html_file - line_count=$(grep -n '^$' $dot_file | tail -1 | cut -f1 -d:) - if [[ -n $line_count ]]; then - head -n $line_count $dot_file | dot -Tsvg >> $html_file - else - cat $dot_file | grep -v 'Assertion' | dot -Tsvg >> $html_file - fi - rm $dot_file - echo "Wrote $html_file - $line_count" - } - - function handle_sigint { - trap '' SIGINT - echo - write_log_file - exit 0 - } - trap handle_sigint SIGINT - - $cmd "${args[@]}" 2> $dot_file || export status=$? 
- write_log_file - exit $status - ;; - - normal) - time $cmd "${args[@]}" - ;; -esac - -if [[ -n $profile ]]; then - pprof $cmd $CPUPROFILE -fi +cargo test --jobs 1 $top_level_filter -- --nocapture diff --git a/script/test.cmd b/script/test.cmd index f2d97303..e62eed0e 100644 --- a/script/test.cmd +++ b/script/test.cmd @@ -1,9 +1,7 @@ @echo off -msbuild /p:Configuration=Test tests.vcxproj -set only_arg= -IF not "%~1"=="" ( - set only_arg=--only=%1 -) +set TREE_SITTER_TEST=1 +set RUST_TEST_THREADS=1 +set RUST_BACKTRACE=full -.\test\tests.exe --reporter=singleline --no-color %only_arg% +cargo test "%~1" diff --git a/script/test.sh b/script/test.sh deleted file mode 100755 index eb6183c0..00000000 --- a/script/test.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/bash - -RUST_TREE_SITTER_TEST=1 cargo test $@ diff --git a/script/trim-whitespace b/script/trim-whitespace deleted file mode 100755 index b67791f5..00000000 --- a/script/trim-whitespace +++ /dev/null @@ -1,3 +0,0 @@ -#!/usr/bin/env bash - -find src test include -type f | xargs perl -pi -e 's/ +$//' diff --git a/test/fixtures/test_grammars/anonymous_tokens_with_escaped_chars/corpus.txt b/test/fixtures/test_grammars/anonymous_tokens_with_escaped_chars/corpus.txt index 06a7bf0b..749264c6 100644 --- a/test/fixtures/test_grammars/anonymous_tokens_with_escaped_chars/corpus.txt +++ b/test/fixtures/test_grammars/anonymous_tokens_with_escaped_chars/corpus.txt @@ -19,6 +19,7 @@ anonymous tokens defined with LF escape sequence anonymous tokens defined with CR escape sequence ================================================= + --- (first_rule) diff --git a/test/fixtures/test_grammars/anonymous_tokens_with_escaped_chars/grammar.json b/test/fixtures/test_grammars/anonymous_tokens_with_escaped_chars/grammar.json index d2613776..38ada64c 100644 --- a/test/fixtures/test_grammars/anonymous_tokens_with_escaped_chars/grammar.json +++ b/test/fixtures/test_grammars/anonymous_tokens_with_escaped_chars/grammar.json @@ -5,10 +5,10 @@ "type": 
"CHOICE", "members": [ {"type": "STRING", "value": "\n"}, - {"type": "STRING", "value": "\r"}, + {"type": "STRING", "value": "\r\n"}, {"type": "STRING", "value": "'hello'"}, {"type": "PATTERN", "value": "\\d+"} ] } } -} \ No newline at end of file +} From a8292f4fe99d87dfee886e146307da0a8beb2a9c Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Tue, 15 Jan 2019 10:27:39 -0800 Subject: [PATCH 142/208] Load all fixture grammars dynamically This way the build doesn't take forever any time a single grammar has been regenerated. --- cli/src/generate/nfa.rs | 5 +-- cli/src/generate/properties.rs | 3 +- cli/src/loader.rs | 49 ++++++++++++++++++-------- cli/src/tests/corpuses.rs | 64 ++++++++++------------------------ cli/src/tests/fixtures.rs | 51 +++++++++++++++++++++++++++ cli/src/tests/languages.rs | 21 ----------- cli/src/tests/mod.rs | 2 +- cli/src/tests/parser_api.rs | 8 +++-- lib/build.rs | 53 ---------------------------- script/regenerate-fixtures | 6 ++++ 10 files changed, 119 insertions(+), 143 deletions(-) create mode 100644 cli/src/tests/fixtures.rs delete mode 100644 cli/src/tests/languages.rs diff --git a/cli/src/generate/nfa.rs b/cli/src/generate/nfa.rs index 54e34814..674391ff 100644 --- a/cli/src/generate/nfa.rs +++ b/cli/src/generate/nfa.rs @@ -55,10 +55,6 @@ impl CharacterSet { CharacterSet::Include(Vec::new()) } - pub fn all() -> Self { - CharacterSet::Exclude(Vec::new()) - } - pub fn negate(self) -> CharacterSet { match self { CharacterSet::Include(chars) => CharacterSet::Exclude(chars), @@ -182,6 +178,7 @@ impl CharacterSet { } } + #[cfg(test)] pub fn contains(&self, c: char) -> bool { match self { CharacterSet::Include(chars) => chars.contains(&c), diff --git a/cli/src/generate/properties.rs b/cli/src/generate/properties.rs index e1492d6f..b16e698a 100644 --- a/cli/src/generate/properties.rs +++ b/cli/src/generate/properties.rs @@ -464,7 +464,8 @@ fn parse_property_sheet(path: &Path, css: &str) -> Result> { rsass::Item::AtRule { name, args, .. 
} => match name.as_str() { "schema" => { if let Some(s) = get_sass_string(args) { - let schema_path = resolve_path(path, s)?; + // TODO - use schema + let _schema_path = resolve_path(path, s)?; items.remove(i); continue; } else { diff --git a/cli/src/loader.rs b/cli/src/loader.rs index af1ab7be..83b878b9 100644 --- a/cli/src/loader.rs +++ b/cli/src/loader.rs @@ -23,11 +23,11 @@ struct LanguageRepo { } pub struct LanguageConfiguration { - name: String, - content_regex: Option, - first_line_regex: Option, + _name: String, + _content_regex: Option, + _first_line_regex: Option, file_types: Vec, - highlight_property_sheet: Option>, + _highlight_property_sheet: Option>, } pub struct Loader { @@ -108,16 +108,21 @@ impl Loader { let language = if let Some(language) = repo.language { language } else { - let language = self.load_language_at_path(&repo.name, &repo.path)?; + let src_path = repo.path.join("src"); + let language = self.load_language_at_path(&repo.name, &src_path, &src_path)?; self.language_repos[id].language = Some(language); language }; Ok((language, &self.language_repos[id].configurations)) } - fn load_language_at_path(&self, name: &str, language_path: &Path) -> io::Result { - let src_path = language_path.join("src"); - let parser_c_path = src_path.join("parser.c"); + pub fn load_language_at_path( + &self, + name: &str, + src_path: &Path, + header_path: &Path, + ) -> io::Result { + let parser_path = src_path.join("parser.c"); let scanner_path; let scanner_c_path = src_path.join("scanner.c"); @@ -132,7 +137,7 @@ impl Loader { } } - self.load_language_from_sources(name, &src_path, &parser_c_path, &scanner_path) + self.load_language_from_sources(name, &header_path, &parser_path, &scanner_path) } pub fn load_language_from_sources( @@ -148,6 +153,7 @@ impl Loader { if needs_recompile(&library_path, &parser_path, &scanner_path)? 
{ let mut config = cc::Build::new(); config + .cpp(true) .opt_level(2) .cargo_metadata(false) .target(env!("BUILD_TARGET")) @@ -197,13 +203,14 @@ impl Loader { "Parser compilation failed.\nStdout: {}\nStderr: {}", String::from_utf8_lossy(&output.stdout), String::from_utf8_lossy(&output.stderr) - ).as_str(), + ) + .as_str(), )); } } let library = Library::new(library_path)?; - let language_fn_name = format!("tree_sitter_{}", name); + let language_fn_name = format!("tree_sitter_{}", replace_dashes_with_underscores(name)); let language = unsafe { let language_fn: Symbol Language> = library.get(language_fn_name.as_bytes())?; @@ -248,15 +255,15 @@ impl Loader { configurations .into_iter() .map(|conf| LanguageConfiguration { - name: conf.name, + _name: conf.name, file_types: conf.file_types.unwrap_or(Vec::new()), - content_regex: conf + _content_regex: conf .content_regex .and_then(|r| RegexBuilder::new(&r).multi_line(true).build().ok()), - first_line_regex: conf + _first_line_regex: conf .first_line_regex .and_then(|r| RegexBuilder::new(&r).multi_line(true).build().ok()), - highlight_property_sheet: conf.highlights.map(|d| Err(d.into())), + _highlight_property_sheet: conf.highlights.map(|d| Err(d.into())), }) .collect() }); @@ -304,3 +311,15 @@ fn needs_recompile( fn mtime(path: &Path) -> io::Result { Ok(fs::metadata(path)?.modified()?) 
} + +fn replace_dashes_with_underscores(name: &str) -> String { + let mut result = String::with_capacity(name.len()); + for c in name.chars() { + if c == '-' { + result.push('_'); + } else { + result.push(c); + } + } + result +} diff --git a/cli/src/tests/corpuses.rs b/cli/src/tests/corpuses.rs index e1fe9189..e55c8f57 100644 --- a/cli/src/tests/corpuses.rs +++ b/cli/src/tests/corpuses.rs @@ -1,27 +1,21 @@ -use super::languages; +use super::fixtures::{get_language, get_test_language, fixtures_dir}; use crate::generate; -use crate::loader::Loader; use crate::test::{parse_tests, TestEntry}; use crate::util; use std::fs; -use std::path::PathBuf; -use tree_sitter::{Language, Parser, LogType}; +use tree_sitter::{LogType, Parser}; + +const LANGUAGES: &'static [&'static str] = &[ + "bash", + "c", + "cpp", + "embedded-template", + "go", + "html", + "javascript", +]; lazy_static! { - static ref LANGUAGES: [(&'static str, Language); 7] = [ - ("bash", languages::bash()), - ("c", languages::c()), - ("cpp", languages::cpp()), - ("embedded-template", languages::embedded_template()), - ("go", languages::go()), - ("html", languages::html()), - ("javascript", languages::javascript()), - ]; - static ref ROOT_DIR: PathBuf = [env!("CARGO_MANIFEST_DIR"), ".."].iter().collect(); - static ref HEADER_DIR: PathBuf = ROOT_DIR.join("lib").join("include"); - static ref SCRATCH_DIR: PathBuf = ROOT_DIR.join("target").join("scratch"); - static ref FIXTURES_DIR: PathBuf = ROOT_DIR.join("test").join("fixtures"); - static ref EXEC_PATH: PathBuf = std::env::current_exe().unwrap(); static ref LANGUAGE_FILTER: Option = std::env::var("TREE_SITTER_TEST_LANGUAGE_FILTER").ok(); static ref EXAMPLE_FILTER: Option = @@ -34,7 +28,7 @@ lazy_static! 
{ fn test_real_language_corpus_files() { let mut log_session = None; let mut parser = Parser::new(); - let grammars_dir = FIXTURES_DIR.join("grammars"); + let grammars_dir = fixtures_dir().join("grammars"); if *LOG_ENABLED { parser.set_logger(Some(Box::new(|log_type, msg| { @@ -48,7 +42,7 @@ fn test_real_language_corpus_files() { log_session = Some(util::log_graphs(&mut parser, "log.html").unwrap()); } - for (language_name, language) in LANGUAGES.iter().cloned() { + for language_name in LANGUAGES.iter().cloned() { if let Some(filter) = LANGUAGE_FILTER.as_ref() { if !language_name.contains(filter.as_str()) { continue; @@ -57,6 +51,7 @@ fn test_real_language_corpus_files() { eprintln!("language: {:?}", language_name); + let language = get_language(language_name); let corpus_dir = grammars_dir.join(language_name).join("corpus"); let test = parse_tests(&corpus_dir).unwrap(); parser.set_language(language).unwrap(); @@ -69,12 +64,9 @@ fn test_real_language_corpus_files() { #[test] fn test_feature_corpus_files() { - fs::create_dir_all(SCRATCH_DIR.as_path()).unwrap(); - - let loader = Loader::new(SCRATCH_DIR.clone()); let mut log_session = None; let mut parser = Parser::new(); - let test_grammars_dir = FIXTURES_DIR.join("test_grammars"); + let test_grammars_dir = fixtures_dir().join("test_grammars"); if *LOG_ENABLED { parser.set_logger(Some(Box::new(|log_type, msg| { @@ -128,27 +120,7 @@ fn test_feature_corpus_files() { } else { let corpus_path = test_path.join("corpus.txt"); let c_code = generate_result.unwrap(); - let parser_c_path = SCRATCH_DIR.join(&format!("{}-parser.c", language_name)); - if !fs::read_to_string(&parser_c_path) - .map(|content| content == c_code) - .unwrap_or(false) - { - fs::write(&parser_c_path, c_code).unwrap(); - } - let scanner_path = test_path.join("scanner.c"); - let scanner_path = if scanner_path.exists() { - Some(scanner_path) - } else { - None - }; - let language = loader - .load_language_from_sources( - language_name, - &HEADER_DIR, - 
&parser_c_path, - &scanner_path, - ) - .unwrap(); + let language = get_test_language(language_name, c_code, &test_path); let test = parse_tests(&corpus_path).unwrap(); parser.set_language(language).unwrap(); run_mutation_tests(&mut parser, test); @@ -180,7 +152,7 @@ fn run_mutation_tests(parser: &mut Parser, test: TestEntry) { let actual = tree.root_node().to_sexp(); assert_eq!(actual, output); } - TestEntry::Group { name, children } => { + TestEntry::Group { children, .. } => { for child in children { run_mutation_tests(parser, child); } diff --git a/cli/src/tests/fixtures.rs b/cli/src/tests/fixtures.rs new file mode 100644 index 00000000..978a1212 --- /dev/null +++ b/cli/src/tests/fixtures.rs @@ -0,0 +1,51 @@ +use crate::loader::Loader; +use std::path::{Path, PathBuf}; +use tree_sitter::Language; +use std::fs; + +lazy_static! { + static ref ROOT_DIR: PathBuf = [env!("CARGO_MANIFEST_DIR"), ".."].iter().collect(); + static ref FIXTURES_DIR: PathBuf = ROOT_DIR.join("test").join("fixtures"); + static ref HEADER_DIR: PathBuf = ROOT_DIR.join("lib").join("include"); + static ref GRAMMARS_DIR: PathBuf = ROOT_DIR.join("test").join("fixtures").join("grammars"); + static ref SCRATCH_DIR: PathBuf = { + let result = ROOT_DIR.join("target").join("scratch"); + fs::create_dir_all(&result).unwrap(); + result + }; + static ref TEST_LOADER: Loader = Loader::new(SCRATCH_DIR.clone()); +} + +pub fn fixtures_dir<'a>() -> &'static Path { + &FIXTURES_DIR +} + +pub fn get_language(name: &str) -> Language { + TEST_LOADER + .load_language_at_path(name, &GRAMMARS_DIR.join(name).join("src"), &HEADER_DIR) + .unwrap() +} + +pub fn get_test_language(name: &str, parser_code: String, path: &Path) -> Language { + let parser_c_path = SCRATCH_DIR.join(&format!("{}-parser.c", name)); + if !fs::read_to_string(&parser_c_path) + .map(|content| content == parser_code) + .unwrap_or(false) + { + fs::write(&parser_c_path, parser_code).unwrap(); + } + let scanner_path = path.join("scanner.c"); + let 
scanner_path = if scanner_path.exists() { + Some(scanner_path) + } else { + None + }; + TEST_LOADER + .load_language_from_sources( + name, + &HEADER_DIR, + &parser_c_path, + &scanner_path, + ) + .unwrap() +} diff --git a/cli/src/tests/languages.rs b/cli/src/tests/languages.rs deleted file mode 100644 index e093d218..00000000 --- a/cli/src/tests/languages.rs +++ /dev/null @@ -1,21 +0,0 @@ -use tree_sitter::Language; - -extern "C" { - fn tree_sitter_bash() -> Language; - fn tree_sitter_c() -> Language; - fn tree_sitter_cpp() -> Language; - fn tree_sitter_embedded_template() -> Language; - fn tree_sitter_go() -> Language; - fn tree_sitter_html() -> Language; - fn tree_sitter_javascript() -> Language; - fn tree_sitter_rust() -> Language; -} - -pub fn bash() -> Language { unsafe { tree_sitter_bash() } } -pub fn c() -> Language { unsafe { tree_sitter_c() } } -pub fn cpp() -> Language { unsafe { tree_sitter_cpp() } } -pub fn embedded_template() -> Language { unsafe { tree_sitter_embedded_template() } } -pub fn go() -> Language { unsafe { tree_sitter_go() } } -pub fn html() -> Language { unsafe { tree_sitter_html() } } -pub fn javascript() -> Language { unsafe { tree_sitter_javascript() } } -pub fn rust() -> Language { unsafe { tree_sitter_rust() } } diff --git a/cli/src/tests/mod.rs b/cli/src/tests/mod.rs index bc199616..c9f1dda4 100644 --- a/cli/src/tests/mod.rs +++ b/cli/src/tests/mod.rs @@ -1,3 +1,3 @@ -mod languages; +mod fixtures; mod corpuses; mod parser_api; diff --git a/cli/src/tests/parser_api.rs b/cli/src/tests/parser_api.rs index d717bfab..a399bf38 100644 --- a/cli/src/tests/parser_api.rs +++ b/cli/src/tests/parser_api.rs @@ -1,6 +1,10 @@ -use super::languages::rust; +use super::fixtures::get_language; use std::thread; -use tree_sitter::{InputEdit, LogType, Parser, Point, PropertySheet}; +use tree_sitter::{InputEdit, LogType, Parser, Point, PropertySheet, Language}; + +fn rust() -> Language { + get_language("rust") +} #[test] fn test_basic_parsing() { diff 
--git a/lib/build.rs b/lib/build.rs index e4d1f91a..eb6fea8b 100644 --- a/lib/build.rs +++ b/lib/build.rs @@ -2,61 +2,8 @@ extern crate cc; use std::env; use std::path::{Path, PathBuf}; -use std::fs; fn main() { - println!("cargo:rerun-if-env-changed=TREE_SITTER_TEST"); - if env::var("TREE_SITTER_TEST").is_ok() { - let mut parser_config = cc::Build::new(); - parser_config - .opt_level(0) - .flag_if_supported("-Wno-unused-parameter"); - - let mut scanner_c_config = cc::Build::new(); - scanner_c_config - .flag_if_supported("-std=c99") - .flag_if_supported("-Wno-unused-parameter"); - - let mut scanner_cxx_config = cc::Build::new(); - scanner_cxx_config - .cpp(true) - .flag_if_supported("-Wno-unused-parameter"); - - let grammars_dir: PathBuf = ["..", "test", "fixtures", "grammars"].iter().collect(); - for entry in fs::read_dir(&grammars_dir).expect("Failed to list grammar directory") { - let entry = entry.expect("Failed to load grammars directory entry"); - if !entry.path().is_dir() { - continue; - } - let parser_dir_path = entry.path(); - let parser_src_path = parser_dir_path.join("src"); - let parser_c_path = parser_src_path.join("parser.c"); - let scanner_c_path = parser_src_path.join("scanner.c"); - let scanner_cc_path = parser_src_path.join("scanner.cc"); - - println!("cargo:rerun-if-changed={}", parser_c_path.to_str().unwrap()); - parser_config - .include(&parser_src_path) - .opt_level(0) - .file(&parser_c_path); - if scanner_cc_path.exists() { - println!("cargo:rerun-if-changed={}", scanner_cc_path.to_str().unwrap()); - scanner_cxx_config - .include(&parser_src_path) - .file(&scanner_cc_path); - } else if scanner_c_path.exists() { - println!("cargo:rerun-if-changed={}", scanner_c_path.to_str().unwrap()); - scanner_c_config - .include(&parser_src_path) - .file(&scanner_c_path); - } - } - - parser_config.compile("fixture-parsers"); - scanner_c_config.compile("fixture-scanners-c"); - scanner_cxx_config.compile("fixture-scanners-cxx"); - } - 
println!("cargo:rerun-if-env-changed=TREE_SITTER_STATIC_ANALYSIS"); if env::var("TREE_SITTER_STATIC_ANALYSIS").is_ok() { let clang_path = which("clang").unwrap(); diff --git a/script/regenerate-fixtures b/script/regenerate-fixtures index 15e3c09d..c47c53f9 100755 --- a/script/regenerate-fixtures +++ b/script/regenerate-fixtures @@ -2,6 +2,8 @@ set -e +cargo build --release + root_dir=$PWD tree_sitter=${root_dir}/target/release/tree-sitter grammars_dir=${root_dir}/test/fixtures/grammars @@ -19,6 +21,10 @@ grammar_names=( rust ) +if [[ "$#" > 0 ]]; then + grammar_names=($1) +fi + for grammar_name in "${grammar_names[@]}"; do echo "Regenerating ${grammar_name} parser" cd ${grammars_dir}/${grammar_name} From 0a2d72d956b6db5eecd29cbcf8f1c2293b71fbe3 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Tue, 15 Jan 2019 12:12:12 -0800 Subject: [PATCH 143/208] Determine language name from package.json, not directory --- cli/src/loader.rs | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/cli/src/loader.rs b/cli/src/loader.rs index 83b878b9..26064f04 100644 --- a/cli/src/loader.rs +++ b/cli/src/loader.rs @@ -221,14 +221,6 @@ impl Loader { } fn find_language_at_path<'a>(&'a mut self, parser_path: &Path) -> io::Result { - let name = parser_path - .file_name() - .unwrap() - .to_str() - .unwrap() - .split_at("tree-sitter-".len()) - .1; - #[derive(Deserialize)] struct LanguageConfigurationJSON { name: String, @@ -243,6 +235,7 @@ impl Loader { #[derive(Deserialize)] struct PackageJSON { + name: String, #[serde(rename = "tree-sitter")] tree_sitter: Option>, } @@ -278,7 +271,7 @@ impl Loader { } self.language_repos.push(LanguageRepo { - name: name.to_string(), + name: package_json.name.split_at("tree-sitter-".len()).1.to_string(), path: parser_path.to_owned(), language: None, configurations, From b799b46f790ccfee6c3e77f98b9129c0d13a021e Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Tue, 15 Jan 2019 12:13:14 -0800 Subject: [PATCH 144/208] Handle 
repetition range operators with commas in regexes --- cli/src/generate/prepare_grammar/expand_tokens.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cli/src/generate/prepare_grammar/expand_tokens.rs b/cli/src/generate/prepare_grammar/expand_tokens.rs index 1e2ef2e5..9cc527bd 100644 --- a/cli/src/generate/prepare_grammar/expand_tokens.rs +++ b/cli/src/generate/prepare_grammar/expand_tokens.rs @@ -10,7 +10,7 @@ use regex::Regex; use std::i32; lazy_static! { - static ref CURLY_BRACE_REGEX: Regex = Regex::new(r#"(^|[^\\])\{([^}]*[^0-9}][^}]*)\}"#).unwrap(); + static ref CURLY_BRACE_REGEX: Regex = Regex::new(r#"(^|[^\\])\{([^}]*[^0-9,}][^}]*)\}"#).unwrap(); } const ALLOWED_REDUNDANT_ESCAPED_CHARS: [char; 4] = ['!', '\'', '"', '/']; From d8ab36b2a598c0a75dfc8756e7223e6fd60375be Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Tue, 15 Jan 2019 12:13:42 -0800 Subject: [PATCH 145/208] Fix bugs in handling tokens that overlap with separators --- .../generate/build_tables/build_lex_table.rs | 20 ++++++-- .../generate/build_tables/token_conflicts.rs | 22 ++------ cli/src/generate/grammars.rs | 21 +++++++- cli/src/generate/nfa.rs | 2 +- cli/src/test.rs | 51 +++++++++++-------- cli/src/tests/corpuses.rs | 38 ++++++++++---- 6 files changed, 98 insertions(+), 56 deletions(-) diff --git a/cli/src/generate/build_tables/build_lex_table.rs b/cli/src/generate/build_tables/build_lex_table.rs index 200c6959..15f09f6b 100644 --- a/cli/src/generate/build_tables/build_lex_table.rs +++ b/cli/src/generate/build_tables/build_lex_table.rs @@ -191,6 +191,7 @@ impl<'a> LexTableBuilder<'a> { ); let transitions = self.cursor.transitions(); + let has_sep = self.cursor.transition_chars().any(|(_, sep)| sep); info!("lex state: {}, transitions: {:?}", state_id, transitions); // If EOF is a valid lookahead token, add a transition predicated on the null @@ -214,12 +215,23 @@ impl<'a> LexTableBuilder<'a> { is_separator, } in transitions { - if let Some((_, completed_precedence)) = 
completion { - if precedence < completed_precedence - || (precedence == completed_precedence && is_separator) - { + if let Some((completed_id, completed_precedence)) = completion { + if precedence < completed_precedence { continue; } + + if precedence == completed_precedence { + if is_separator { + continue; + } + if has_sep && self.lexical_grammar + .variable_indices_for_nfa_states(&states) + .position(|i| i == completed_id) + .is_none() + { + continue; + } + } } let (next_state_id, _) = self.add_state(states, eof_valid && is_separator); let next_state = if next_state_id == state_id { diff --git a/cli/src/generate/build_tables/token_conflicts.rs b/cli/src/generate/build_tables/token_conflicts.rs index 1a63bfc8..df3d4250 100644 --- a/cli/src/generate/build_tables/token_conflicts.rs +++ b/cli/src/generate/build_tables/token_conflicts.rs @@ -58,7 +58,7 @@ impl<'a> TokenConflictMap<'a> { pub fn does_conflict(&self, i: usize, j: usize) -> bool { let entry = &self.status_matrix[matrix_index(self.n, i, j)]; - entry.does_match_valid_continuation || entry.does_match_separators + entry.does_match_valid_continuation || entry.does_match_separators || entry.matches_same_string } pub fn does_overlap(&self, i: usize, j: usize) -> bool { @@ -176,7 +176,7 @@ fn compute_conflict_status( while let Some(state_set) = state_set_queue.pop() { // Don't pursue states where there's no potential for conflict. 
- if variable_ids_for_states(&state_set, grammar).count() > 1 { + if grammar.variable_indices_for_nfa_states(&state_set).count() > 1 { cursor.reset(state_set); } else { continue; @@ -226,7 +226,7 @@ fn compute_conflict_status( if let Some((completed_id, completed_precedence)) = completion { let mut other_id = None; let mut successor_contains_completed_id = false; - for variable_id in variable_ids_for_states(&states, grammar) { + for variable_id in grammar.variable_indices_for_nfa_states(&states) { if variable_id == completed_id { successor_contains_completed_id = true; break; @@ -269,22 +269,6 @@ fn compute_conflict_status( result } -fn variable_ids_for_states<'a>( - state_ids: &'a Vec, - grammar: &'a LexicalGrammar, -) -> impl Iterator + 'a { - let mut prev = None; - state_ids.iter().filter_map(move |state_id| { - let variable_id = grammar.variable_index_for_nfa_state(*state_id); - if prev != Some(variable_id) { - prev = Some(variable_id); - prev - } else { - None - } - }) -} - #[cfg(test)] mod tests { use super::*; diff --git a/cli/src/generate/grammars.rs b/cli/src/generate/grammars.rs index 3772bfd4..3cedcd42 100644 --- a/cli/src/generate/grammars.rs +++ b/cli/src/generate/grammars.rs @@ -175,8 +175,27 @@ impl Variable { } impl LexicalGrammar { + pub fn variable_indices_for_nfa_states<'a>( + &'a self, + state_ids: &'a Vec, + ) -> impl Iterator + 'a { + let mut prev = None; + state_ids.iter().filter_map(move |state_id| { + let variable_id = self.variable_index_for_nfa_state(*state_id); + if prev != Some(variable_id) { + prev = Some(variable_id); + prev + } else { + None + } + }) + } + pub fn variable_index_for_nfa_state(&self, state_id: u32) -> usize { - self.variables.iter().position(|v| v.start_state >= state_id).unwrap() + self.variables + .iter() + .position(|v| v.start_state >= state_id) + .unwrap() } } diff --git a/cli/src/generate/nfa.rs b/cli/src/generate/nfa.rs index 674391ff..ca2e5405 100644 --- a/cli/src/generate/nfa.rs +++ b/cli/src/generate/nfa.rs 
@@ -374,7 +374,7 @@ impl<'a> NfaCursor<'a> { } let intersection_transition = NfaTransition { characters: intersection, - is_separator: result[i].is_separator || is_sep, + is_separator: result[i].is_separator && is_sep, precedence: max(result[i].precedence, prec), states: intersection_states, }; diff --git a/cli/src/test.rs b/cli/src/test.rs index bcea3dcc..4d6034e5 100644 --- a/cli/src/test.rs +++ b/cli/src/test.rs @@ -75,30 +75,10 @@ pub fn run_tests_at_path( println!("{} failures:", failures.len()) } - println!( - "\n{} / {}", - Colour::Green.paint("expected"), - Colour::Red.paint("actual") - ); - + print_diff_key(); for (i, (name, actual, expected)) in failures.iter().enumerate() { println!("\n {}. {}:", i + 1, name); - let changeset = Changeset::new(actual, expected, " "); - print!(" "); - for diff in &changeset.diffs { - match diff { - Difference::Same(part) => { - print!("{}{}", part, changeset.split); - } - Difference::Add(part) => { - print!("{}{}", Colour::Green.paint(part), changeset.split); - } - Difference::Rem(part) => { - print!("{}{}", Colour::Red.paint(part), changeset.split); - } - } - } - println!(""); + print_diff(actual, expected); } } @@ -106,6 +86,33 @@ pub fn run_tests_at_path( Ok(()) } +pub fn print_diff_key() { + println!( + "\n{} / {}", + Colour::Green.paint("expected"), + Colour::Red.paint("actual") + ); +} + +pub fn print_diff(actual: &String, expected: &String) { + let changeset = Changeset::new(actual, expected, " "); + print!(" "); + for diff in &changeset.diffs { + match diff { + Difference::Same(part) => { + print!("{}{}", part, changeset.split); + } + Difference::Add(part) => { + print!("{}{}", Colour::Green.paint(part), changeset.split); + } + Difference::Rem(part) => { + print!("{}{}", Colour::Red.paint(part), changeset.split); + } + } + } + println!(""); +} + fn run_tests( parser: &mut Parser, test_entry: TestEntry, diff --git a/cli/src/tests/corpuses.rs b/cli/src/tests/corpuses.rs index e55c8f57..707158cf 100644 --- 
a/cli/src/tests/corpuses.rs +++ b/cli/src/tests/corpuses.rs @@ -1,6 +1,6 @@ use super::fixtures::{get_language, get_test_language, fixtures_dir}; use crate::generate; -use crate::test::{parse_tests, TestEntry}; +use crate::test::{parse_tests, print_diff, print_diff_key, TestEntry}; use crate::util; use std::fs; use tree_sitter::{LogType, Parser}; @@ -13,6 +13,7 @@ const LANGUAGES: &'static [&'static str] = &[ "go", "html", "javascript", + "python", ]; lazy_static! { @@ -42,9 +43,10 @@ fn test_real_language_corpus_files() { log_session = Some(util::log_graphs(&mut parser, "log.html").unwrap()); } + let mut did_fail = false; for language_name in LANGUAGES.iter().cloned() { if let Some(filter) = LANGUAGE_FILTER.as_ref() { - if !language_name.contains(filter.as_str()) { + if language_name != filter.as_str() { continue; } } @@ -55,11 +57,15 @@ fn test_real_language_corpus_files() { let corpus_dir = grammars_dir.join(language_name).join("corpus"); let test = parse_tests(&corpus_dir).unwrap(); parser.set_language(language).unwrap(); - run_mutation_tests(&mut parser, test); + did_fail |= run_mutation_tests(&mut parser, test); } drop(parser); drop(log_session); + + if did_fail { + panic!("Corpus tests failed"); + } } #[test] @@ -80,6 +86,7 @@ fn test_feature_corpus_files() { log_session = Some(util::log_graphs(&mut parser, "log.html").unwrap()); } + let mut did_fail = false; for entry in fs::read_dir(&test_grammars_dir).unwrap() { let entry = entry.unwrap(); if !entry.metadata().unwrap().is_dir() { @@ -89,7 +96,7 @@ fn test_feature_corpus_files() { let language_name = language_name.to_str().unwrap(); if let Some(filter) = LANGUAGE_FILTER.as_ref() { - if !language_name.contains(filter.as_str()) { + if language_name != filter.as_str() { continue; } } @@ -123,15 +130,19 @@ fn test_feature_corpus_files() { let language = get_test_language(language_name, c_code, &test_path); let test = parse_tests(&corpus_path).unwrap(); parser.set_language(language).unwrap(); - 
run_mutation_tests(&mut parser, test); + did_fail |= run_mutation_tests(&mut parser, test); } } drop(parser); drop(log_session); + + if did_fail { + panic!("Corpus tests failed"); + } } -fn run_mutation_tests(parser: &mut Parser, test: TestEntry) { +fn run_mutation_tests(parser: &mut Parser, test: TestEntry) -> bool { match test { TestEntry::Example { name, @@ -140,7 +151,7 @@ fn run_mutation_tests(parser: &mut Parser, test: TestEntry) { } => { if let Some(filter) = EXAMPLE_FILTER.as_ref() { if !name.contains(filter.as_str()) { - return; + return false; } } @@ -150,12 +161,21 @@ fn run_mutation_tests(parser: &mut Parser, test: TestEntry) { .parse_utf8(&mut |byte_offset, _| &input[byte_offset..], None) .unwrap(); let actual = tree.root_node().to_sexp(); - assert_eq!(actual, output); + if actual != output { + print_diff_key(); + print_diff(&actual, &output); + println!(""); + true + } else { + false + } } TestEntry::Group { children, .. } => { + let mut result = false; for child in children { - run_mutation_tests(parser, child); + result |= run_mutation_tests(parser, child); } + result } } } From 522021b107c00bbd146dbc0f813d16e3bce8e550 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Tue, 15 Jan 2019 15:57:29 -0800 Subject: [PATCH 146/208] Fix NFA generation w/ nested groups --- .../generate/prepare_grammar/expand_tokens.rs | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/cli/src/generate/prepare_grammar/expand_tokens.rs b/cli/src/generate/prepare_grammar/expand_tokens.rs index 9cc527bd..6b92713e 100644 --- a/cli/src/generate/prepare_grammar/expand_tokens.rs +++ b/cli/src/generate/prepare_grammar/expand_tokens.rs @@ -263,7 +263,7 @@ impl NfaBuilder { Ok(result) } }, - Ast::Group(group) => self.expand_regex(&group.ast, self.nfa.last_state_id()), + Ast::Group(group) => self.expand_regex(&group.ast, next_state_id), Ast::Alternation(alternation) => { let mut alternative_state_ids = Vec::new(); for ast in alternation.asts.iter() { 
@@ -619,7 +619,18 @@ mod tests { ("12e34", Some((0, "12e34"))), ], }, - // Allowing unrecognized escape sequences + // nested groups + Row { + rules: vec![Rule::seq(vec![ + Rule::pattern(r#"([^x\\]|\\(.|\n))+"#), + ])], + separators: vec![], + examples: vec![ + ("abcx", Some((0, "abc"))), + ("abc\\0x", Some((0, "abc\\0"))), + ], + }, + // allowing unrecognized escape sequences Row { rules: vec![ // Escaped forward slash (used in JS because '/' is the regex delimiter) @@ -636,7 +647,7 @@ mod tests { (r#"'\'a"#, Some((2, r#"'\'"#))), ], }, - // Allowing un-escaped curly braces + // allowing un-escaped curly braces Row { rules: vec![ // Un-escaped curly braces @@ -649,7 +660,6 @@ mod tests { ("u{1234} ok", Some((0, "u{1234}"))), ("{aba}}", Some((1, "{aba}"))), ], - } ]; From ceff3936ef6e9231e2ea78e1edaaac8370f542f0 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Tue, 15 Jan 2019 16:10:52 -0800 Subject: [PATCH 147/208] Unify logic for handling tokens that match separators into one place --- .../generate/build_tables/build_lex_table.rs | 39 +++------ .../generate/build_tables/token_conflicts.rs | 85 +++++++++++++------ 2 files changed, 71 insertions(+), 53 deletions(-) diff --git a/cli/src/generate/build_tables/build_lex_table.rs b/cli/src/generate/build_tables/build_lex_table.rs index 15f09f6b..03ec0c7b 100644 --- a/cli/src/generate/build_tables/build_lex_table.rs +++ b/cli/src/generate/build_tables/build_lex_table.rs @@ -2,7 +2,7 @@ use super::coincident_tokens::CoincidentTokenIndex; use super::item::TokenSet; use super::token_conflicts::TokenConflictMap; use crate::generate::grammars::{LexicalGrammar, SyntaxGrammar}; -use crate::generate::nfa::{CharacterSet, NfaCursor, NfaTransition}; +use crate::generate::nfa::{CharacterSet, NfaCursor}; use crate::generate::rules::Symbol; use crate::generate::tables::{AdvanceAction, LexState, LexTable, ParseStateId, ParseTable}; use std::collections::hash_map::Entry; @@ -208,42 +208,31 @@ impl<'a> LexTableBuilder<'a> { )); } - 
for NfaTransition { - characters, - precedence, - states, - is_separator, - } in transitions - { + for transition in transitions { if let Some((completed_id, completed_precedence)) = completion { - if precedence < completed_precedence { + if !TokenConflictMap::prefer_transition( + &self.lexical_grammar, + &transition, + completed_id, + completed_precedence, + has_sep, + ) { continue; } - - if precedence == completed_precedence { - if is_separator { - continue; - } - if has_sep && self.lexical_grammar - .variable_indices_for_nfa_states(&states) - .position(|i| i == completed_id) - .is_none() - { - continue; - } - } } - let (next_state_id, _) = self.add_state(states, eof_valid && is_separator); + + let (next_state_id, _) = + self.add_state(transition.states, eof_valid && transition.is_separator); let next_state = if next_state_id == state_id { None } else { Some(next_state_id) }; self.table.states[state_id].advance_actions.push(( - characters, + transition.characters, AdvanceAction { state: next_state, - in_main_token: !is_separator, + in_main_token: !transition.is_separator, }, )); } diff --git a/cli/src/generate/build_tables/token_conflicts.rs b/cli/src/generate/build_tables/token_conflicts.rs index df3d4250..13c69c19 100644 --- a/cli/src/generate/build_tables/token_conflicts.rs +++ b/cli/src/generate/build_tables/token_conflicts.rs @@ -58,7 +58,9 @@ impl<'a> TokenConflictMap<'a> { pub fn does_conflict(&self, i: usize, j: usize) -> bool { let entry = &self.status_matrix[matrix_index(self.n, i, j)]; - entry.does_match_valid_continuation || entry.does_match_separators || entry.matches_same_string + entry.does_match_valid_continuation + || entry.does_match_separators + || entry.matches_same_string } pub fn does_overlap(&self, i: usize, j: usize) -> bool { @@ -81,6 +83,32 @@ impl<'a> TokenConflictMap<'a> { Ordering::Equal => left.1 < right.1, } } + + pub fn prefer_transition( + grammar: &LexicalGrammar, + t: &NfaTransition, + completed_id: usize, + 
completed_precedence: i32, + has_separator_transitions: bool, + ) -> bool { + if t.precedence < completed_precedence { + return false; + } + if t.precedence == completed_precedence { + if t.is_separator { + return false; + } + if has_separator_transitions + && grammar + .variable_indices_for_nfa_states(&t.states) + .position(|i| i == completed_id) + .is_none() + { + return false; + } + } + true + } } impl<'a> fmt::Debug for TokenConflictMap<'a> { @@ -97,7 +125,7 @@ impl<'a> fmt::Debug for TokenConflictMap<'a> { for i in 0..self.n { write!( f, - " {}: {:?},\n", + " {:?}: {:?},\n", self.grammar.variables[i].name, self.following_chars_by_index[i] )?; } @@ -105,11 +133,11 @@ impl<'a> fmt::Debug for TokenConflictMap<'a> { write!(f, " status_matrix: {{\n")?; for i in 0..self.n { - write!(f, " {}: {{\n", self.grammar.variables[i].name)?; + write!(f, " {:?}: {{\n", self.grammar.variables[i].name)?; for j in 0..self.n { write!( f, - " {}: {:?},\n", + " {:?}: {:?},\n", self.grammar.variables[j].name, self.status_matrix[matrix_index(self.n, i, j)] )?; @@ -191,19 +219,19 @@ fn compute_conflict_status( // Prefer tokens with higher precedence. For tokens with equal precedence, // prefer those listed earlier in the grammar. 
- let winning_id; + let preferred_id; if TokenConflictMap::prefer_token( grammar, (prev_precedence, prev_id), (precedence, id), ) { - winning_id = prev_id; + preferred_id = prev_id; } else { - winning_id = id; + preferred_id = id; completion = Some((id, precedence)); } - if winning_id == i { + if preferred_id == i { result.0.matches_same_string = true; result.0.does_overlap = true; } else { @@ -215,18 +243,14 @@ fn compute_conflict_status( } } - for NfaTransition { - characters, - precedence, - states, - is_separator, - } in cursor.transitions() - { + let has_sep = cursor.transition_chars().any(|(_, sep)| sep); + + for transition in cursor.transitions() { let mut can_advance = true; if let Some((completed_id, completed_precedence)) = completion { let mut other_id = None; let mut successor_contains_completed_id = false; - for variable_id in grammar.variable_indices_for_nfa_states(&states) { + for variable_id in grammar.variable_indices_for_nfa_states(&transition.states) { if variable_id == completed_id { successor_contains_completed_id = true; break; @@ -236,33 +260,38 @@ fn compute_conflict_status( } if let (Some(other_id), false) = (other_id, successor_contains_completed_id) { - let winning_id; - if precedence < completed_precedence { - winning_id = completed_id; - can_advance = false; + let preferred_id = if TokenConflictMap::prefer_transition( + grammar, + &transition, + completed_id, + completed_precedence, + has_sep, + ) { + can_advance = true; + other_id } else { - winning_id = other_id; - } + completed_id + }; - if winning_id == i { + if preferred_id == i { result.0.does_overlap = true; - if characters.does_intersect(&following_chars[j]) { + if transition.characters.does_intersect(&following_chars[j]) { result.0.does_match_valid_continuation = true; } - if is_separator { + if transition.is_separator || has_sep { result.0.does_match_separators = true; } } else { result.1.does_overlap = true; - if characters.does_intersect(&following_chars[i]) { + if 
transition.characters.does_intersect(&following_chars[i]) { result.1.does_match_valid_continuation = true; } } } } - if can_advance && visited_state_sets.insert(states.clone()) { - state_set_queue.push(states); + if can_advance && visited_state_sets.insert(transition.states.clone()) { + state_set_queue.push(transition.states); } } } From 0ee11584a7ea4bb39ed0066899ad1ceb5eb36cb8 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Tue, 15 Jan 2019 16:12:30 -0800 Subject: [PATCH 148/208] Add -xc compiler flag for pure-C external scanners --- cli/src/loader.rs | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/cli/src/loader.rs b/cli/src/loader.rs index 26064f04..afb18f9e 100644 --- a/cli/src/loader.rs +++ b/cli/src/loader.rs @@ -188,7 +188,7 @@ impl Loader { .arg(parser_path); if let Some(scanner_path) = scanner_path.as_ref() { if scanner_path.extension() == Some("c".as_ref()) { - command.arg(scanner_path); + command.arg("-xc").arg("-std=c99").arg(scanner_path); } else { command.arg("-xc++").arg(scanner_path); } @@ -271,7 +271,11 @@ impl Loader { } self.language_repos.push(LanguageRepo { - name: package_json.name.split_at("tree-sitter-".len()).1.to_string(), + name: package_json + .name + .split_at("tree-sitter-".len()) + .1 + .to_string(), path: parser_path.to_owned(), language: None, configurations, From d23a03bdf118737c80fc19027000ee16df48bbb6 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Tue, 15 Jan 2019 16:37:54 -0800 Subject: [PATCH 149/208] Represent ParseItemSet as a sorted Vec, not a BTreeMap --- .../build_tables/build_parse_table.rs | 14 ++++-------- cli/src/generate/build_tables/item.rs | 22 ++++++++++++++----- .../generate/build_tables/item_set_builder.rs | 8 ++----- 3 files changed, 23 insertions(+), 21 deletions(-) diff --git a/cli/src/generate/build_tables/build_parse_table.rs b/cli/src/generate/build_tables/build_parse_table.rs index b87cc3d0..792a8759 100644 --- a/cli/src/generate/build_tables/build_parse_table.rs +++ 
b/cli/src/generate/build_tables/build_parse_table.rs @@ -174,18 +174,12 @@ impl<'a> ParseTableBuilder<'a> { non_terminal_successors .entry(next_symbol) .or_insert_with(|| ParseItemSet::default()) - .entries - .entry(successor) - .or_insert_with(|| TokenSet::new()) - .insert_all(lookaheads); + .insert(successor, lookaheads); } else { terminal_successors .entry(next_symbol) .or_insert_with(|| ParseItemSet::default()) - .entries - .entry(successor) - .or_insert_with(|| TokenSet::new()) - .insert_all(lookaheads); + .insert(successor, lookaheads); } } else { let action = if item.is_augmented() { @@ -620,8 +614,8 @@ impl<'a> ParseTableBuilder<'a> { ) -> AuxiliarySymbolInfo { let parent_symbols = item_set .entries - .keys() - .filter_map(|item| { + .iter() + .filter_map(|(item, _)| { let variable_index = item.variable_index as usize; if item.symbol() == Some(symbol) && !self.syntax_grammar.variables[variable_index].is_auxiliary() diff --git a/cli/src/generate/build_tables/item.rs b/cli/src/generate/build_tables/item.rs index 279c5df6..0222ac21 100644 --- a/cli/src/generate/build_tables/item.rs +++ b/cli/src/generate/build_tables/item.rs @@ -3,7 +3,6 @@ use crate::generate::rules::Associativity; use crate::generate::rules::{Symbol, SymbolType}; use smallbitvec::SmallBitVec; use std::cmp::Ordering; -use std::collections::BTreeMap; use std::fmt; use std::hash::{Hash, Hasher}; use std::iter::FromIterator; @@ -40,7 +39,7 @@ pub(crate) struct ParseItem<'a> { #[derive(Clone, Debug, PartialEq, Eq)] pub(crate) struct ParseItemSet<'a> { - pub entries: BTreeMap, TokenSet>, + pub entries: Vec<(ParseItem<'a>, TokenSet)>, } pub(crate) struct ParseItemDisplay<'a>( @@ -227,15 +226,28 @@ impl<'a> ParseItemSet<'a> { pub fn with(elements: impl IntoIterator, TokenSet)>) -> Self { let mut result = Self::default(); for (item, lookaheads) in elements { - result.entries.insert(item, lookaheads); + result.insert(item, &lookaheads); } result } + pub fn insert(&mut self, item: ParseItem<'a>, 
lookaheads: &TokenSet) -> &mut TokenSet { + match self.entries.binary_search_by(|(i, _)| i.cmp(&item)) { + Err(i) => { + self.entries.insert(i, (item, lookaheads.clone())); + &mut self.entries[i].1 + }, + Ok(i) => { + self.entries[i].1.insert_all(lookaheads); + &mut self.entries[i].1 + } + } + } + pub fn hash_unfinished_items(&self, h: &mut impl Hasher) { let mut previous_variable_index = u32::MAX; let mut previous_step_index = u32::MAX; - for item in self.entries.keys() { + for (item, _) in self.entries.iter() { if item.step().is_none() && item.variable_index != previous_variable_index || item.step_index != previous_step_index { @@ -251,7 +263,7 @@ impl<'a> ParseItemSet<'a> { impl<'a> Default for ParseItemSet<'a> { fn default() -> Self { Self { - entries: BTreeMap::new(), + entries: Vec::new(), } } } diff --git a/cli/src/generate/build_tables/item_set_builder.rs b/cli/src/generate/build_tables/item_set_builder.rs index 56d7c7c4..b941b179 100644 --- a/cli/src/generate/build_tables/item_set_builder.rs +++ b/cli/src/generate/build_tables/item_set_builder.rs @@ -285,18 +285,14 @@ impl<'a> ParseItemSetBuilder<'a> { // Use the pre-computed *additions* to expand the non-terminal. 
for addition in &self.transitive_closure_additions[step.symbol.index] { - let lookaheads = set - .entries - .entry(addition.item) - .or_insert_with(|| TokenSet::new()); - lookaheads.insert_all(&addition.info.lookaheads); + let lookaheads = set.insert(addition.item, &addition.info.lookaheads); if addition.info.propagates_lookaheads { lookaheads.insert_all(following_tokens); } } } } - set.entries.insert(item, lookaheads.clone()); + set.insert(item, lookaheads); } } From ef735eb94228423e367f36900225c57c6eb8d9b9 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Tue, 15 Jan 2019 19:18:33 -0800 Subject: [PATCH 150/208] Upload binary artifacts from CI builds --- .appveyor.yml | 21 +++++++++++++++++++++ .travis.yml | 21 +++++++++++++++++++++ cli/src/util.rs | 3 ++- lib/build.rs | 14 +++++++++++--- 4 files changed, 55 insertions(+), 4 deletions(-) diff --git a/.appveyor.yml b/.appveyor.yml index 29193a53..0c9de3ac 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -10,6 +10,10 @@ install: # Install dependencies - git submodule update --init +platform: + - x64 + - x86 + test_script: # Fetch and regenerate the fixture parsers - script\fetch-fixtures.cmd @@ -23,6 +27,23 @@ test_script: branches: only: - master + - /\d+\.\d+\.\d+.*/ + +before_deploy: + - move target\release\tree-sitter.exe tree-sitter.exe + - 7z a tree-sitter-windows-%PLATFORM%.zip tree-sitter.exe + - appveyor PushArtifact tree-sitter-windows-%PLATFORM%.zip + +deploy: + description: '' + provider: GitHub + auth_token: + secure: VC9ntV5+inKoNteZyLQksKzWMKXF46P+Jx3JHKVSfF+o1rWtZn2iIHAVsQv5LaUi + artifact: /tree-sitter-windows-.*.zip/ + draft: true + force_update: true + on: + APPVEYOR_REPO_TAG: true cache: - target diff --git a/.travis.yml b/.travis.yml index 5f981ce9..55fc9276 100644 --- a/.travis.yml +++ b/.travis.yml @@ -2,6 +2,10 @@ language: rust rust: - stable +os: + - linux + - osx + script: # Fetch and regenerate the fixture parsers - script/fetch-fixtures @@ -15,6 +19,23 @@ script: branches: only: - 
master + - /\d+\.\d+\.\d+/ + +before_deploy: + - mv target/release/tree-sitter . + - tar czf tree-sitter-${TRAVIS_OS_NAME}-x64.tar.gz tree-sitter + +deploy: + provider: releases + api_key: + secure: "cAd2mQP+Q55v3zedo5ZyOVc3hq3XKMW93lp5LuXV6CYKYbIhkyfym4qfs+C9GJQiIP27cnePYM7B3+OMIFwSPIgXHWWSsuloMtDgYSc/PAwb2dZnJqAyog3BohW/QiGTSnvbVlxPF6P9RMQU6+JP0HJzEJy6QBTa4Und/j0jm24=" + file_glob: true + file: "tree-sitter-*.tar.gz" + draft: true + overwrite: true + skip_cleanup: true + on: + tags: true cache: cargo: true diff --git a/cli/src/util.rs b/cli/src/util.rs index 5c1bc39c..166e54d0 100644 --- a/cli/src/util.rs +++ b/cli/src/util.rs @@ -4,6 +4,7 @@ use std::path::PathBuf; use std::process::{Child, ChildStdin, Command, Stdio}; use tree_sitter::Parser; +#[cfg(unix)] const HTML_HEADER: &[u8] = b"\n\n\n"; #[cfg(windows)] @@ -13,7 +14,7 @@ pub(crate) struct LogSession(); pub(crate) struct LogSession(PathBuf, Option, Option); #[cfg(windows)] -pub(crate) fn log_graphs(parser: &mut Parser, path: &str) -> std::io::Result { +pub(crate) fn log_graphs(_parser: &mut Parser, _path: &str) -> std::io::Result { Ok(LogSession()) } diff --git a/lib/build.rs b/lib/build.rs index eb6fea8b..2a121001 100644 --- a/lib/build.rs +++ b/lib/build.rs @@ -6,9 +6,17 @@ use std::path::{Path, PathBuf}; fn main() { println!("cargo:rerun-if-env-changed=TREE_SITTER_STATIC_ANALYSIS"); if env::var("TREE_SITTER_STATIC_ANALYSIS").is_ok() { - let clang_path = which("clang").unwrap(); - let clang_path = clang_path.to_str().unwrap(); - env::set_var("CC", &format!("scan-build -analyze-headers --use-analyzer={} cc", clang_path)); + if let (Some(clang_path), Some(scan_build_path)) = (which("clang"), which("scan-build")) { + let clang_path = clang_path.to_str().unwrap(); + let scan_build_path = scan_build_path.to_str().unwrap(); + env::set_var( + "CC", + &format!( + "{} -analyze-headers --use-analyzer={} cc", + scan_build_path, clang_path + ), + ); + } } let mut config = cc::Build::new(); From 
b0fe8164414b900b9fdea72824071c62894857f0 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Wed, 16 Jan 2019 11:42:50 -0800 Subject: [PATCH 151/208] Add npm module --- cli/npm/.gitignore | 4 +++ cli/npm/install.js | 71 +++++++++++++++++++++++++++++++++++++++ cli/npm/package-lock.json | 5 +++ cli/npm/package.json | 22 ++++++++++++ 4 files changed, 102 insertions(+) create mode 100644 cli/npm/.gitignore create mode 100755 cli/npm/install.js create mode 100644 cli/npm/package-lock.json create mode 100644 cli/npm/package.json diff --git a/cli/npm/.gitignore b/cli/npm/.gitignore new file mode 100644 index 00000000..306613e7 --- /dev/null +++ b/cli/npm/.gitignore @@ -0,0 +1,4 @@ +tree-sitter +tree-sitter.exe +*.tar.gz +*.zip diff --git a/cli/npm/install.js b/cli/npm/install.js new file mode 100755 index 00000000..5564ce98 --- /dev/null +++ b/cli/npm/install.js @@ -0,0 +1,71 @@ +#!/usr/bin/env node + +const fs = require('fs'); +const https = require('https'); +const execFileSync = require('child_process').execFileSync; +const packageJSON = require('./package.json'); + +// Determine the URL of the file. +const isWindows = process.platform === 'win32'; +const platformName = { + 'darwin': 'osx', + 'linux': 'linux', + 'win32': 'windows' +}[process.platform]; +if (!platformName) { + throw new Error(`Cannot install tree-sitter-cli for platform ${process.platform}`); +} +const releaseURL = `https://github.com/tree-sitter/tree-sitter/releases/download/${packageJSON.version}`; +const assetExtension = isWindows ? 'zip' : 'tar.gz'; +const assetName = `tree-sitter-${platformName}-${process.arch}.${assetExtension}`; +const assetURL = `${releaseURL}/${assetName}`; + +// Remove previously-downloaded files. +const executableName = isWindows ? 'tree-sitter.exe' : 'tree-sitter'; +if (fs.existsSync(executableName)) { + fs.unlinkSync(executableName); +} +if (fs.existsSync(assetName)) { + fs.unlinkSync(assetName); +} + +// Download the compressed file. 
+console.log(`Downloading ${assetURL}`); +const file = fs.createWriteStream(assetName); +get(assetURL, response => { + if (response.statusCode > 299) { + throw new Error([ + 'Download failed', + '', + `url: ${url}`, + `status: ${response.statusCode}`, + `headers: ${JSON.stringify(response.headers, null, 2)}`, + '', + ].join('\n')); + } + + response.pipe(file); +}); + +// Extract the file. +file.on('finish', () => { + console.log(`Extracting ${assetName}`); + if (isWindows) { + execFileSync('7z', ['e', assetName]); + } else { + execFileSync('tar', ['xzf', assetName]); + } + fs.unlinkSync(assetName); + console.log(`Done`); +}); + +// Follow redirects. +function get(url, callback) { + https.get(url, response => { + if (response.statusCode === 301 || response.statusCode === 302) { + get(response.headers.location, callback); + } else { + callback(response); + } + }); +} diff --git a/cli/npm/package-lock.json b/cli/npm/package-lock.json new file mode 100644 index 00000000..b78f1d4e --- /dev/null +++ b/cli/npm/package-lock.json @@ -0,0 +1,5 @@ +{ + "name": "tree-sitter-cli", + "version": "0.14.0-beta0", + "lockfileVersion": 1 +} diff --git a/cli/npm/package.json b/cli/npm/package.json new file mode 100644 index 00000000..01a50491 --- /dev/null +++ b/cli/npm/package.json @@ -0,0 +1,22 @@ +{ + "name": "tree-sitter-cli", + "version": "0.14.0-beta0", + "author": "Max Brunsfeld", + "license": "MIT", + "repository": { + "type": "git", + "url": "http://github.com/tree-sitter/tree-sitter.git" + }, + "description": "CLI for generating fast incremental parsers", + "keywords": [ + "parser", + "lexer" + ], + "main": "lib/api/index.js", + "scripts": { + "install": "./install.js" + }, + "bin": { + "tree-sitter": "tree-sitter" + } +} From e7bb57550badeab50f3a44a607dab80f7a91069c Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Tue, 15 Jan 2019 19:18:33 -0800 Subject: [PATCH 152/208] Use gzip for release assets so they can easily be extracted from node --- .appveyor.yml | 6 +++--- 
.gitignore | 12 +++++++++--- .travis.yml | 6 +++--- cli/npm/.gitignore | 3 +-- cli/npm/install.js | 40 ++++++++++++++++------------------------ cli/npm/package.json | 2 +- 6 files changed, 33 insertions(+), 36 deletions(-) diff --git a/.appveyor.yml b/.appveyor.yml index 0c9de3ac..07a72738 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -31,15 +31,15 @@ branches: before_deploy: - move target\release\tree-sitter.exe tree-sitter.exe - - 7z a tree-sitter-windows-%PLATFORM%.zip tree-sitter.exe - - appveyor PushArtifact tree-sitter-windows-%PLATFORM%.zip + - 7z a -tgzip tree-sitter-windows-%PLATFORM%.gz tree-sitter.exe + - appveyor PushArtifact tree-sitter-windows-%PLATFORM%.gz deploy: description: '' provider: GitHub auth_token: secure: VC9ntV5+inKoNteZyLQksKzWMKXF46P+Jx3JHKVSfF+o1rWtZn2iIHAVsQv5LaUi - artifact: /tree-sitter-windows-.*.zip/ + artifact: /tree-sitter-windows-.*/ draft: true force_update: true on: diff --git a/.gitignore b/.gitignore index 23c82fe6..bcb55844 100644 --- a/.gitignore +++ b/.gitignore @@ -1,11 +1,17 @@ log.html + .idea *.xcodeproj -*.a -*.o + fuzz-results + test/fixtures/grammars/* !test/fixtures/grammars/.gitkeep /target -**/*.rs.bk +*.rs.bk +*.a +*.o +*.obj +*.exp +*.lib diff --git a/.travis.yml b/.travis.yml index 55fc9276..722a4dc9 100644 --- a/.travis.yml +++ b/.travis.yml @@ -22,15 +22,15 @@ branches: - /\d+\.\d+\.\d+/ before_deploy: - - mv target/release/tree-sitter . - - tar czf tree-sitter-${TRAVIS_OS_NAME}-x64.tar.gz tree-sitter + - cp target/release/tree-sitter . 
+ - gzip --suffix "-${TRAVIS_OS_NAME}-x64.gz" tree-sitter deploy: provider: releases api_key: secure: "cAd2mQP+Q55v3zedo5ZyOVc3hq3XKMW93lp5LuXV6CYKYbIhkyfym4qfs+C9GJQiIP27cnePYM7B3+OMIFwSPIgXHWWSsuloMtDgYSc/PAwb2dZnJqAyog3BohW/QiGTSnvbVlxPF6P9RMQU6+JP0HJzEJy6QBTa4Und/j0jm24=" file_glob: true - file: "tree-sitter-*.tar.gz" + file: "tree-sitter-*.gz" draft: true overwrite: true skip_cleanup: true diff --git a/cli/npm/.gitignore b/cli/npm/.gitignore index 306613e7..f0475945 100644 --- a/cli/npm/.gitignore +++ b/cli/npm/.gitignore @@ -1,4 +1,3 @@ tree-sitter tree-sitter.exe -*.tar.gz -*.zip +*.gz diff --git a/cli/npm/install.js b/cli/npm/install.js index 5564ce98..d73c51cb 100755 --- a/cli/npm/install.js +++ b/cli/npm/install.js @@ -1,12 +1,11 @@ #!/usr/bin/env node const fs = require('fs'); +const zlib = require('zlib'); const https = require('https'); -const execFileSync = require('child_process').execFileSync; const packageJSON = require('./package.json'); // Determine the URL of the file. -const isWindows = process.platform === 'win32'; const platformName = { 'darwin': 'osx', 'linux': 'linux', @@ -15,48 +14,41 @@ const platformName = { if (!platformName) { throw new Error(`Cannot install tree-sitter-cli for platform ${process.platform}`); } + +const archName = { + 'x64': 'x64', + 'x86': 'x86', + 'ia32': 'x86' +}[process.arch]; +if (!archName) { + throw new Error(`Cannot install tree-sitter-cli for architecture ${process.arch}`); +} + const releaseURL = `https://github.com/tree-sitter/tree-sitter/releases/download/${packageJSON.version}`; -const assetExtension = isWindows ? 'zip' : 'tar.gz'; -const assetName = `tree-sitter-${platformName}-${process.arch}.${assetExtension}`; +const assetName = `tree-sitter-${platformName}-${archName}.gz`; const assetURL = `${releaseURL}/${assetName}`; // Remove previously-downloaded files. -const executableName = isWindows ? 'tree-sitter.exe' : 'tree-sitter'; +const executableName = process.platform === 'win32' ? 
'tree-sitter.exe' : 'tree-sitter'; if (fs.existsSync(executableName)) { fs.unlinkSync(executableName); } -if (fs.existsSync(assetName)) { - fs.unlinkSync(assetName); -} // Download the compressed file. console.log(`Downloading ${assetURL}`); -const file = fs.createWriteStream(assetName); +const file = fs.createWriteStream(executableName); get(assetURL, response => { if (response.statusCode > 299) { throw new Error([ 'Download failed', '', - `url: ${url}`, + `url: ${assetURL}`, `status: ${response.statusCode}`, `headers: ${JSON.stringify(response.headers, null, 2)}`, '', ].join('\n')); } - - response.pipe(file); -}); - -// Extract the file. -file.on('finish', () => { - console.log(`Extracting ${assetName}`); - if (isWindows) { - execFileSync('7z', ['e', assetName]); - } else { - execFileSync('tar', ['xzf', assetName]); - } - fs.unlinkSync(assetName); - console.log(`Done`); + response.pipe(zlib.createGunzip()).pipe(file); }); // Follow redirects. diff --git a/cli/npm/package.json b/cli/npm/package.json index 01a50491..e459b551 100644 --- a/cli/npm/package.json +++ b/cli/npm/package.json @@ -14,7 +14,7 @@ ], "main": "lib/api/index.js", "scripts": { - "install": "./install.js" + "install": "install.js" }, "bin": { "tree-sitter": "tree-sitter" From 564c5e39b66a6d9406852e823262872c5ace6cec Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Wed, 16 Jan 2019 12:42:02 -0800 Subject: [PATCH 153/208] 0.14.0-beta1 --- Cargo.lock | 2 +- cli/Cargo.toml | 2 +- cli/npm/package.json | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 464cd050..09bc9ea0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -771,7 +771,7 @@ dependencies = [ [[package]] name = "tree-sitter-cli" -version = "0.1.0" +version = "0.14.0-beta1" dependencies = [ "ansi_term 0.11.0 (registry+https://github.com/rust-lang/crates.io-index)", "cc 1.0.25 (registry+https://github.com/rust-lang/crates.io-index)", diff --git a/cli/Cargo.toml b/cli/Cargo.toml index 
b6226917..e1c83583 100644 --- a/cli/Cargo.toml +++ b/cli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "tree-sitter-cli" -version = "0.1.0" +version = "0.14.0-beta1" authors = ["Max Brunsfeld "] edition = "2018" diff --git a/cli/npm/package.json b/cli/npm/package.json index e459b551..de64c70f 100644 --- a/cli/npm/package.json +++ b/cli/npm/package.json @@ -1,6 +1,6 @@ { "name": "tree-sitter-cli", - "version": "0.14.0-beta0", + "version": "0.14.0-beta1", "author": "Max Brunsfeld", "license": "MIT", "repository": { From a0a3903f767b9d421aa50bfde65eca13d40924b9 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Wed, 16 Jan 2019 13:53:01 -0800 Subject: [PATCH 154/208] Generate binding.gyp, binding.cc, and index.js --- cli/src/generate/mod.rs | 29 +++++++++++++++++++------- cli/src/generate/npm_files.rs | 18 ++++++++++++++++ cli/src/generate/templates/binding.cc | 28 +++++++++++++++++++++++++ cli/src/generate/templates/binding.gyp | 18 ++++++++++++++++ cli/src/generate/templates/index.js | 9 ++++++++ cli/src/tests/corpuses.rs | 2 +- 6 files changed, 96 insertions(+), 8 deletions(-) create mode 100644 cli/src/generate/npm_files.rs create mode 100644 cli/src/generate/templates/binding.cc create mode 100644 cli/src/generate/templates/binding.gyp create mode 100644 cli/src/generate/templates/index.js diff --git a/cli/src/generate/mod.rs b/cli/src/generate/mod.rs index 1593c0da..f42dff96 100644 --- a/cli/src/generate/mod.rs +++ b/cli/src/generate/mod.rs @@ -12,6 +12,7 @@ use std::process::{Command, Stdio}; mod build_tables; mod grammars; mod nfa; +mod npm_files; mod parse_grammar; mod prepare_grammar; mod properties; @@ -36,17 +37,30 @@ pub fn generate_parser_in_directory( if !properties_only { let grammar_path = grammar_path.map_or(repo_path.join("grammar.js"), |s| s.into()); let grammar_json = load_grammar_file(&grammar_path); - let c_code = + let (language_name, c_code) = generate_parser_for_grammar_with_opts(&grammar_json, minimize, state_ids_to_log)?; - 
fs::create_dir_all("src")?; - fs::write(repo_path.join("src").join("parser.c"), c_code)?; + let repo_src_path = repo_path.join("src"); + fs::create_dir_all(&repo_src_path)?; + fs::write(&repo_src_path.join("parser.c"), c_code)?; + let binding_cc_path = repo_src_path.join("binding.cc"); + if !binding_cc_path.exists() { + fs::write(&binding_cc_path, npm_files::binding_cc(&language_name))?; + } + let binding_gyp_path = repo_path.join("binding.gyp"); + if !binding_gyp_path.exists() { + fs::write(&binding_gyp_path, npm_files::binding_gyp(&language_name))?; + } + let index_js_path = repo_path.join("index.js"); + if !index_js_path.exists() { + fs::write(&index_js_path, npm_files::index_js(&language_name))?; + } } properties::generate_property_sheets(repo_path)?; Ok(()) } #[cfg(test)] -pub fn generate_parser_for_grammar(grammar_json: &String) -> Result { +pub fn generate_parser_for_grammar(grammar_json: &String) -> Result<(String, String)> { let grammar_json = JSON_COMMENT_REGEX.replace_all(grammar_json, "\n"); generate_parser_for_grammar_with_opts(&grammar_json, true, Vec::new()) } @@ -55,7 +69,7 @@ fn generate_parser_for_grammar_with_opts( grammar_json: &str, minimize: bool, state_ids_to_log: Vec, -) -> Result { +) -> Result<(String, String)> { let input_grammar = parse_grammar(grammar_json)?; let (syntax_grammar, lexical_grammar, inlines, simple_aliases) = prepare_grammar(&input_grammar)?; @@ -67,7 +81,7 @@ fn generate_parser_for_grammar_with_opts( minimize, state_ids_to_log, )?; - Ok(render_c_code( + let c_code = render_c_code( &input_grammar.name, parse_table, main_lex_table, @@ -76,7 +90,8 @@ fn generate_parser_for_grammar_with_opts( syntax_grammar, lexical_grammar, simple_aliases, - )) + ); + Ok((input_grammar.name, c_code)) } fn load_grammar_file(grammar_path: &PathBuf) -> String { diff --git a/cli/src/generate/npm_files.rs b/cli/src/generate/npm_files.rs new file mode 100644 index 00000000..5f813c88 --- /dev/null +++ b/cli/src/generate/npm_files.rs @@ -0,0 +1,18 
@@ +use std::str; + +const BINDING_CC_TEMPLATE: &'static str = include_str!("./templates/binding.cc"); +const BINDING_GYP_TEMPLATE: &'static str = include_str!("./templates/binding.gyp"); +const INDEX_JS_TEMPLATE: &'static str = include_str!("./templates/index.js"); +const PARSER_NAME_PLACEHOLDER: &'static str = "PARSER_NAME"; + +pub fn binding_cc(parser_name: &str) -> String { + BINDING_CC_TEMPLATE.replace(PARSER_NAME_PLACEHOLDER, parser_name) +} + +pub fn binding_gyp(parser_name: &str) -> String { + BINDING_GYP_TEMPLATE.replace(PARSER_NAME_PLACEHOLDER, parser_name) +} + +pub fn index_js(parser_name: &str) -> String { + INDEX_JS_TEMPLATE.replace(PARSER_NAME_PLACEHOLDER, parser_name) +} diff --git a/cli/src/generate/templates/binding.cc b/cli/src/generate/templates/binding.cc new file mode 100644 index 00000000..18853f55 --- /dev/null +++ b/cli/src/generate/templates/binding.cc @@ -0,0 +1,28 @@ +#include "tree_sitter/parser.h" +#include +#include "nan.h" + +using namespace v8; + +extern "C" TSLanguage * tree_sitter_PARSER_NAME(); + +namespace { + +NAN_METHOD(New) {} + +void Init(Handle exports, Handle module) { + Local tpl = Nan::New(New); + tpl->SetClassName(Nan::New("Language").ToLocalChecked()); + tpl->InstanceTemplate()->SetInternalFieldCount(1); + + Local constructor = tpl->GetFunction(); + Local instance = constructor->NewInstance(Nan::GetCurrentContext()).ToLocalChecked(); + Nan::SetInternalFieldPointer(instance, 0, tree_sitter_PARSER_NAME()); + + instance->Set(Nan::New("name").ToLocalChecked(), Nan::New("PARSER_NAME").ToLocalChecked()); + module->Set(Nan::New("exports").ToLocalChecked(), instance); +} + +NODE_MODULE(tree_sitter_PARSER_NAME_binding, Init) + +} // namespace diff --git a/cli/src/generate/templates/binding.gyp b/cli/src/generate/templates/binding.gyp new file mode 100644 index 00000000..f273a007 --- /dev/null +++ b/cli/src/generate/templates/binding.gyp @@ -0,0 +1,18 @@ +{ + "targets": [ + { + "target_name": "tree_sitter_PARSER_NAME_binding", + 
"include_dirs": [ + " Date: Wed, 16 Jan 2019 13:53:20 -0800 Subject: [PATCH 155/208] Remove unused dependencies --- .appveyor.yml | 2 - Cargo.lock | 270 ------------------------------------------------- cli/Cargo.toml | 2 - 3 files changed, 274 deletions(-) diff --git a/.appveyor.yml b/.appveyor.yml index 07a72738..26ae0691 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -49,5 +49,3 @@ cache: - target - test\fixtures\grammars - C:\Users\appveyor\.cargo - - C:\cargo\registry - - C:\cargo\git diff --git a/Cargo.lock b/Cargo.lock index 09bc9ea0..2edecfc7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -123,44 +123,6 @@ name = "constant_time_eq" version = "0.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" -[[package]] -name = "crossbeam-channel" -version = "0.2.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "crossbeam-epoch 0.6.1 (registry+https://github.com/rust-lang/crates.io-index)", - "crossbeam-utils 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)", - "parking_lot 0.6.4 (registry+https://github.com/rust-lang/crates.io-index)", - "rand 0.5.5 (registry+https://github.com/rust-lang/crates.io-index)", - "smallvec 0.6.7 (registry+https://github.com/rust-lang/crates.io-index)", -] - -[[package]] -name = "crossbeam-epoch" -version = "0.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "arrayvec 0.4.8 (registry+https://github.com/rust-lang/crates.io-index)", - "cfg-if 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)", - "crossbeam-utils 0.6.1 (registry+https://github.com/rust-lang/crates.io-index)", - "lazy_static 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)", - "memoffset 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)", - "scopeguard 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)", -] - -[[package]] -name = "crossbeam-utils" -version = "0.5.0" -source = 
"registry+https://github.com/rust-lang/crates.io-index" - -[[package]] -name = "crossbeam-utils" -version = "0.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "cfg-if 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)", -] - [[package]] name = "difference" version = "2.0.0" @@ -196,11 +158,6 @@ dependencies = [ "synstructure 0.10.1 (registry+https://github.com/rust-lang/crates.io-index)", ] -[[package]] -name = "fnv" -version = "1.0.6" -source = "registry+https://github.com/rust-lang/crates.io-index" - [[package]] name = "fuchsia-zircon" version = "0.3.3" @@ -215,18 +172,6 @@ name = "fuchsia-zircon-sys" version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" -[[package]] -name = "globset" -version = "0.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "aho-corasick 0.6.9 (registry+https://github.com/rust-lang/crates.io-index)", - "fnv 1.0.6 (registry+https://github.com/rust-lang/crates.io-index)", - "log 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)", - "memchr 2.1.1 (registry+https://github.com/rust-lang/crates.io-index)", - "regex 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)", -] - [[package]] name = "hashbrown" version = "0.1.7" @@ -236,23 +181,6 @@ dependencies = [ "scopeguard 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)", ] -[[package]] -name = "ignore" -version = "0.4.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "crossbeam-channel 0.2.6 (registry+https://github.com/rust-lang/crates.io-index)", - "globset 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)", - "lazy_static 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)", - "log 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)", - "memchr 2.1.1 (registry+https://github.com/rust-lang/crates.io-index)", - "regex 1.1.0 
(registry+https://github.com/rust-lang/crates.io-index)", - "same-file 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)", - "thread_local 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", - "walkdir 2.2.7 (registry+https://github.com/rust-lang/crates.io-index)", - "winapi-util 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)", -] - [[package]] name = "indexmap" version = "1.0.2" @@ -282,29 +210,6 @@ dependencies = [ "winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", ] -[[package]] -name = "libsqlite3-sys" -version = "0.9.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "pkg-config 0.3.14 (registry+https://github.com/rust-lang/crates.io-index)", - "vcpkg 0.2.6 (registry+https://github.com/rust-lang/crates.io-index)", -] - -[[package]] -name = "linked-hash-map" -version = "0.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" - -[[package]] -name = "lock_api" -version = "0.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "owning_ref 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)", - "scopeguard 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)", -] - [[package]] name = "log" version = "0.4.6" @@ -313,14 +218,6 @@ dependencies = [ "cfg-if 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)", ] -[[package]] -name = "lru-cache" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "linked-hash-map 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)", -] - [[package]] name = "memchr" version = "2.1.1" @@ -331,11 +228,6 @@ dependencies = [ "version_check 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)", ] -[[package]] -name = "memoffset" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" - [[package]] name = "nodrop" version = "0.1.13" @@ -371,40 +263,6 @@ name = 
"num-traits" version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" -[[package]] -name = "owning_ref" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "stable_deref_trait 1.1.1 (registry+https://github.com/rust-lang/crates.io-index)", -] - -[[package]] -name = "parking_lot" -version = "0.6.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "lock_api 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)", - "parking_lot_core 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)", -] - -[[package]] -name = "parking_lot_core" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "libc 0.2.44 (registry+https://github.com/rust-lang/crates.io-index)", - "rand 0.5.5 (registry+https://github.com/rust-lang/crates.io-index)", - "rustc_version 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)", - "smallvec 0.6.7 (registry+https://github.com/rust-lang/crates.io-index)", - "winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", -] - -[[package]] -name = "pkg-config" -version = "0.3.14" -source = "registry+https://github.com/rust-lang/crates.io-index" - [[package]] name = "proc-macro2" version = "0.4.24" @@ -431,18 +289,6 @@ dependencies = [ "winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", ] -[[package]] -name = "rand" -version = "0.5.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "cloudabi 0.0.3 (registry+https://github.com/rust-lang/crates.io-index)", - "fuchsia-zircon 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)", - "libc 0.2.44 (registry+https://github.com/rust-lang/crates.io-index)", - "rand_core 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)", - "winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", -] - [[package]] name = "rand" version = "0.6.4" 
@@ -469,14 +315,6 @@ dependencies = [ "rand_core 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)", ] -[[package]] -name = "rand_core" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "rand_core 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)", -] - [[package]] name = "rand_core" version = "0.3.0" @@ -592,17 +430,6 @@ dependencies = [ "rand 0.6.4 (registry+https://github.com/rust-lang/crates.io-index)", ] -[[package]] -name = "rusqlite" -version = "0.14.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "bitflags 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)", - "libsqlite3-sys 0.9.3 (registry+https://github.com/rust-lang/crates.io-index)", - "lru-cache 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)", - "time 0.1.40 (registry+https://github.com/rust-lang/crates.io-index)", -] - [[package]] name = "rustc-demangle" version = "0.1.9" @@ -621,14 +448,6 @@ name = "ryu" version = "0.2.7" source = "registry+https://github.com/rust-lang/crates.io-index" -[[package]] -name = "same-file" -version = "1.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "winapi-util 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)", -] - [[package]] name = "scoped_threadpool" version = "0.1.9" @@ -683,19 +502,6 @@ name = "smallbitvec" version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -[[package]] -name = "smallvec" -version = "0.6.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "unreachable 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)", -] - -[[package]] -name = "stable_deref_trait" -version = "1.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" - [[package]] name = "strsim" version = "0.7.0" @@ -748,16 +554,6 @@ dependencies = [ "lazy_static 1.2.0 
(registry+https://github.com/rust-lang/crates.io-index)", ] -[[package]] -name = "time" -version = "0.1.40" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "libc 0.2.44 (registry+https://github.com/rust-lang/crates.io-index)", - "redox_syscall 0.1.43 (registry+https://github.com/rust-lang/crates.io-index)", - "winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", -] - [[package]] name = "tree-sitter" version = "0.3.5" @@ -779,14 +575,12 @@ dependencies = [ "difference 2.0.0 (registry+https://github.com/rust-lang/crates.io-index)", "dirs 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)", "hashbrown 0.1.7 (registry+https://github.com/rust-lang/crates.io-index)", - "ignore 0.4.4 (registry+https://github.com/rust-lang/crates.io-index)", "lazy_static 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)", "libloading 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)", "log 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)", "regex 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)", "regex-syntax 0.6.4 (registry+https://github.com/rust-lang/crates.io-index)", "rsass 0.9.6 (registry+https://github.com/rust-lang/crates.io-index)", - "rusqlite 0.14.0 (registry+https://github.com/rust-lang/crates.io-index)", "serde 1.0.80 (registry+https://github.com/rust-lang/crates.io-index)", "serde_derive 1.0.80 (registry+https://github.com/rust-lang/crates.io-index)", "serde_json 1.0.33 (registry+https://github.com/rust-lang/crates.io-index)", @@ -809,24 +603,11 @@ name = "unicode-xid" version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -[[package]] -name = "unreachable" -version = "1.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "void 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)", -] - [[package]] name = "utf8-ranges" version = "1.0.2" source = 
"registry+https://github.com/rust-lang/crates.io-index" -[[package]] -name = "vcpkg" -version = "0.2.6" -source = "registry+https://github.com/rust-lang/crates.io-index" - [[package]] name = "vec_map" version = "0.8.1" @@ -837,21 +618,6 @@ name = "version_check" version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" -[[package]] -name = "void" -version = "1.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" - -[[package]] -name = "walkdir" -version = "2.2.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "same-file 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)", - "winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", - "winapi-util 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)", -] - [[package]] name = "winapi" version = "0.3.6" @@ -866,14 +632,6 @@ name = "winapi-i686-pc-windows-gnu" version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -[[package]] -name = "winapi-util" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", -] - [[package]] name = "winapi-x86_64-pc-windows-gnu" version = "0.4.0" @@ -896,48 +654,30 @@ source = "registry+https://github.com/rust-lang/crates.io-index" "checksum clap 2.32.0 (registry+https://github.com/rust-lang/crates.io-index)" = "b957d88f4b6a63b9d70d5f454ac8011819c6efa7727858f458ab71c756ce2d3e" "checksum cloudabi 0.0.3 (registry+https://github.com/rust-lang/crates.io-index)" = "ddfc5b9aa5d4507acaf872de71051dfd0e309860e88966e1051e462a077aac4f" "checksum constant_time_eq 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)" = "8ff012e225ce166d4422e0e78419d901719760f62ae2b7969ca6b564d1b54a9e" -"checksum crossbeam-channel 0.2.6 (registry+https://github.com/rust-lang/crates.io-index)" = 
"7b85741761b7f160bc5e7e0c14986ef685b7f8bf9b7ad081c60c604bb4649827" -"checksum crossbeam-epoch 0.6.1 (registry+https://github.com/rust-lang/crates.io-index)" = "2449aaa4ec7ef96e5fb24db16024b935df718e9ae1cec0a1e68feeca2efca7b8" -"checksum crossbeam-utils 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)" = "677d453a17e8bd2b913fa38e8b9cf04bcdbb5be790aa294f2389661d72036015" -"checksum crossbeam-utils 0.6.1 (registry+https://github.com/rust-lang/crates.io-index)" = "c55913cc2799171a550e307918c0a360e8c16004820291bf3b638969b4a01816" "checksum difference 2.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "524cbf6897b527295dff137cec09ecf3a05f4fddffd7dfcd1585403449e74198" "checksum dirs 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)" = "88972de891f6118092b643d85a0b28e0678e0f948d7f879aa32f2d5aafe97d2a" "checksum failure 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)" = "6dd377bcc1b1b7ce911967e3ec24fa19c3224394ec05b54aa7b083d498341ac7" "checksum failure_derive 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)" = "64c2d913fe8ed3b6c6518eedf4538255b989945c14c2a7d5cbff62a5e2120596" -"checksum fnv 1.0.6 (registry+https://github.com/rust-lang/crates.io-index)" = "2fad85553e09a6f881f739c29f0b00b0f01357c743266d478b68951ce23285f3" "checksum fuchsia-zircon 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "2e9763c69ebaae630ba35f74888db465e49e259ba1bc0eda7d06f4a067615d82" "checksum fuchsia-zircon-sys 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "3dcaa9ae7725d12cdb85b3ad99a434db70b468c09ded17e012d86b5c1010f7a7" -"checksum globset 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)" = "4743617a7464bbda3c8aec8558ff2f9429047e025771037df561d383337ff865" "checksum hashbrown 0.1.7 (registry+https://github.com/rust-lang/crates.io-index)" = "64b7d419d0622ae02fe5da6b9a5e1964b610a65bb37923b976aeebb6dbb8f86e" -"checksum ignore 0.4.4 (registry+https://github.com/rust-lang/crates.io-index)" 
= "36ecfc5ad80f0b1226df948c562e2cddd446096be3f644c95106400eae8a5e01" "checksum indexmap 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)" = "7e81a7c05f79578dbc15793d8b619db9ba32b4577003ef3af1a91c416798c58d" "checksum itoa 0.4.3 (registry+https://github.com/rust-lang/crates.io-index)" = "1306f3464951f30e30d12373d31c79fbd52d236e5e896fd92f96ec7babbbe60b" "checksum lazy_static 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "a374c89b9db55895453a74c1e38861d9deec0b01b405a82516e9d5de4820dea1" "checksum libc 0.2.44 (registry+https://github.com/rust-lang/crates.io-index)" = "10923947f84a519a45c8fefb7dd1b3e8c08747993381adee176d7a82b4195311" "checksum libloading 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)" = "9c3ad660d7cb8c5822cd83d10897b0f1f1526792737a179e73896152f85b88c2" -"checksum libsqlite3-sys 0.9.3 (registry+https://github.com/rust-lang/crates.io-index)" = "d3711dfd91a1081d2458ad2d06ea30a8755256e74038be2ad927d94e1c955ca8" -"checksum linked-hash-map 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)" = "7860ec297f7008ff7a1e3382d7f7e1dcd69efc94751a2284bafc3d013c2aa939" -"checksum lock_api 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "62ebf1391f6acad60e5c8b43706dde4582df75c06698ab44511d15016bc2442c" "checksum log 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)" = "c84ec4b527950aa83a329754b01dbe3f58361d1c5efacd1f6d68c494d08a17c6" -"checksum lru-cache 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "4d06ff7ff06f729ce5f4e227876cb88d10bc59cd4ae1e09fbb2bde15c850dc21" "checksum memchr 2.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "0a3eb002f0535929f1199681417029ebea04aadc0c7a4224b46be99c7f5d6a16" -"checksum memoffset 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "0f9dc261e2b62d7a622bf416ea3c5245cdd5d9a7fcc428c0d06804dfce1775b3" "checksum nodrop 0.1.13 (registry+https://github.com/rust-lang/crates.io-index)" = 
"2f9667ddcc6cc8a43afc9b7917599d7216aa09c463919ea32c59ed6cac8bc945" "checksum nom 4.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "9c349f68f25f596b9f44cf0e7c69752a5c633b0550c3ff849518bfba0233774a" "checksum num-integer 0.1.39 (registry+https://github.com/rust-lang/crates.io-index)" = "e83d528d2677f0518c570baf2b7abdcf0cd2d248860b68507bdcb3e91d4c0cea" "checksum num-rational 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "4e96f040177bb3da242b5b1ecf3f54b5d5af3efbbfb18608977a5d2767b22f10" "checksum num-traits 0.2.6 (registry+https://github.com/rust-lang/crates.io-index)" = "0b3a5d7cc97d6d30d8b9bc8fa19bf45349ffe46241e8816f50f62f6d6aaabee1" -"checksum owning_ref 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "49a4b8ea2179e6a2e27411d3bca09ca6dd630821cf6894c6c7c8467a8ee7ef13" -"checksum parking_lot 0.6.4 (registry+https://github.com/rust-lang/crates.io-index)" = "f0802bff09003b291ba756dc7e79313e51cc31667e94afbe847def490424cde5" -"checksum parking_lot_core 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)" = "ad7f7e6ebdc79edff6fdcb87a55b620174f7a989e3eb31b65231f4af57f00b8c" -"checksum pkg-config 0.3.14 (registry+https://github.com/rust-lang/crates.io-index)" = "676e8eb2b1b4c9043511a9b7bea0915320d7e502b0a079fb03f9635a5252b18c" "checksum proc-macro2 0.4.24 (registry+https://github.com/rust-lang/crates.io-index)" = "77619697826f31a02ae974457af0b29b723e5619e113e9397b8b82c6bd253f09" "checksum quote 0.6.10 (registry+https://github.com/rust-lang/crates.io-index)" = "53fa22a1994bd0f9372d7a816207d8a2677ad0325b073f5c5332760f0fb62b5c" "checksum rand 0.4.3 (registry+https://github.com/rust-lang/crates.io-index)" = "8356f47b32624fef5b3301c1be97e5944ecdd595409cc5da11d05f211db6cfbd" -"checksum rand 0.5.5 (registry+https://github.com/rust-lang/crates.io-index)" = "e464cd887e869cddcae8792a4ee31d23c7edd516700695608f5b98c67ee0131c" "checksum rand 0.6.4 (registry+https://github.com/rust-lang/crates.io-index)" = 
"3906503e80ac6cbcacb2c2973fa8e473f24d7e2747c8c92bb230c2441cad96b5" "checksum rand_chacha 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "556d3a1ca6600bfcbab7c7c91ccb085ac7fbbcd70e008a98742e7847f4f7bcef" -"checksum rand_core 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "1961a422c4d189dfb50ffa9320bf1f2a9bd54ecb92792fb9477f99a1045f3372" "checksum rand_core 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "0905b6b7079ec73b314d4c748701f6931eb79fd97c668caa3f1899b22b32c6db" "checksum rand_hc 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "7b40677c7be09ae76218dc623efbf7b18e34bced3f38883af07bb75630a21bc4" "checksum rand_isaac 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "ded997c9d5f13925be2a6fd7e66bf1872597f759fd9dd93513dd7e92e5a5ee08" @@ -951,11 +691,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index" "checksum regex 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "37e7cbbd370869ce2e8dff25c7018702d10b21a20ef7135316f8daecd6c25b7f" "checksum regex-syntax 0.6.4 (registry+https://github.com/rust-lang/crates.io-index)" = "4e47a2ed29da7a9e1960e1639e7a982e6edc6d49be308a3b02daf511504a16d1" "checksum rsass 0.9.6 (registry+https://github.com/rust-lang/crates.io-index)" = "7a5dde55023a6c19470f7aeb59f75f897d8b80cbe00d61dfcaf7bbbe3de4c0a6" -"checksum rusqlite 0.14.0 (registry+https://github.com/rust-lang/crates.io-index)" = "c9d9118f1ce84d8d0b67f9779936432fb42bb620cef2122409d786892cce9a3c" "checksum rustc-demangle 0.1.9 (registry+https://github.com/rust-lang/crates.io-index)" = "bcfe5b13211b4d78e5c2cadfebd7769197d95c639c35a50057eb4c05de811395" "checksum rustc_version 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)" = "138e3e0acb6c9fb258b19b67cb8abd63c00679d2851805ea151465464fe9030a" "checksum ryu 0.2.7 (registry+https://github.com/rust-lang/crates.io-index)" = "eb9e9b8cde282a9fe6a42dd4681319bfb63f121b8a8ee9439c6f4107e58a46f7" -"checksum same-file 
1.0.4 (registry+https://github.com/rust-lang/crates.io-index)" = "8f20c4be53a8a1ff4c1f1b2bd14570d2f634628709752f0702ecdd2b3f9a5267" "checksum scoped_threadpool 0.1.9 (registry+https://github.com/rust-lang/crates.io-index)" = "1d51f5df5af43ab3f1360b429fa5e0152ac5ce8c0bd6485cae490332e96846a8" "checksum scopeguard 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "94258f53601af11e6a49f722422f6e3425c52b06245a5cf9bc09908b174f5e27" "checksum semver 0.9.0 (registry+https://github.com/rust-lang/crates.io-index)" = "1d7eb9ef2c18661902cc47e535f9bc51b78acd254da71d375c2f6720d9a40403" @@ -964,26 +702,18 @@ source = "registry+https://github.com/rust-lang/crates.io-index" "checksum serde_derive 1.0.80 (registry+https://github.com/rust-lang/crates.io-index)" = "225de307c6302bec3898c51ca302fc94a7a1697ef0845fcee6448f33c032249c" "checksum serde_json 1.0.33 (registry+https://github.com/rust-lang/crates.io-index)" = "c37ccd6be3ed1fdf419ee848f7c758eb31b054d7cd3ae3600e3bae0adf569811" "checksum smallbitvec 2.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "1764fe2b30ee783bfe3b9b37b2649d8d590b3148bb12e0079715d4d5c673562e" -"checksum smallvec 0.6.7 (registry+https://github.com/rust-lang/crates.io-index)" = "b73ea3738b47563803ef814925e69be00799a8c07420be8b996f8e98fb2336db" -"checksum stable_deref_trait 1.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "dba1a27d3efae4351c8051072d619e3ade2820635c3958d826bfea39d59b54c8" "checksum strsim 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)" = "bb4f380125926a99e52bc279241539c018323fab05ad6368b56f93d9369ff550" "checksum syn 0.15.22 (registry+https://github.com/rust-lang/crates.io-index)" = "ae8b29eb5210bc5cf63ed6149cbf9adfc82ac0be023d8735c176ee74a2db4da7" "checksum synstructure 0.10.1 (registry+https://github.com/rust-lang/crates.io-index)" = "73687139bf99285483c96ac0add482c3776528beac1d97d444f6e91f203a2015" "checksum termion 1.5.1 (registry+https://github.com/rust-lang/crates.io-index)" = 
"689a3bdfaab439fd92bc87df5c4c78417d3cbe537487274e9b0b2dce76e92096" "checksum textwrap 0.10.0 (registry+https://github.com/rust-lang/crates.io-index)" = "307686869c93e71f94da64286f9a9524c0f308a9e1c87a583de8e9c9039ad3f6" "checksum thread_local 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)" = "c6b53e329000edc2b34dbe8545fd20e55a333362d0a321909685a19bd28c3f1b" -"checksum time 0.1.40 (registry+https://github.com/rust-lang/crates.io-index)" = "d825be0eb33fda1a7e68012d51e9c7f451dc1a69391e7fdc197060bb8c56667b" "checksum ucd-util 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)" = "535c204ee4d8434478593480b8f86ab45ec9aae0e83c568ca81abf0fd0e88f86" "checksum unicode-width 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "882386231c45df4700b275c7ff55b6f3698780a650026380e72dabe76fa46526" "checksum unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "fc72304796d0818e357ead4e000d19c9c174ab23dc11093ac919054d20a6a7fc" -"checksum unreachable 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "382810877fe448991dfc7f0dd6e3ae5d58088fd0ea5e35189655f84e6814fa56" "checksum utf8-ranges 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)" = "796f7e48bef87609f7ade7e06495a87d5cd06c7866e6a5cbfceffc558a243737" -"checksum vcpkg 0.2.6 (registry+https://github.com/rust-lang/crates.io-index)" = "def296d3eb3b12371b2c7d0e83bfe1403e4db2d7a0bba324a12b21c4ee13143d" "checksum vec_map 0.8.1 (registry+https://github.com/rust-lang/crates.io-index)" = "05c78687fb1a80548ae3250346c3db86a80a7cdd77bda190189f2d0a0987c81a" "checksum version_check 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "914b1a6776c4c929a602fafd8bc742e06365d4bcbe48c30f9cca5824f70dc9dd" -"checksum void 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)" = "6a02e4885ed3bc0f2de90ea6dd45ebcbb66dacffe03547fadbb0eeae2770887d" -"checksum walkdir 2.2.7 (registry+https://github.com/rust-lang/crates.io-index)" = 
"9d9d7ed3431229a144296213105a390676cc49c9b6a72bd19f3176c98e129fa1" "checksum winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)" = "92c1eb33641e276cfa214a0522acad57be5c56b10cb348b3c5117db75f3ac4b0" "checksum winapi-i686-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" -"checksum winapi-util 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "afc5508759c5bf4285e61feb862b6083c8480aec864fa17a81fdec6f69b461ab" "checksum winapi-x86_64-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" diff --git a/cli/Cargo.toml b/cli/Cargo.toml index e1c83583..edd14616 100644 --- a/cli/Cargo.toml +++ b/cli/Cargo.toml @@ -17,9 +17,7 @@ smallbitvec = "2.3.0" clap = "2.32" dirs = "1.0.2" hashbrown = "0.1" -ignore = "0.4.4" libloading = "0.5" -rusqlite = "0.14.0" serde = "1.0" serde_derive = "1.0" regex-syntax = "0.6.4" From 4689cadf9d12f7f2b62bbb393b53ed1df10fc17a Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Wed, 16 Jan 2019 13:59:37 -0800 Subject: [PATCH 156/208] Make downloaded binary executable in intsall script --- cli/npm/install.js | 4 ++++ cli/npm/package-lock.json | 2 +- cli/npm/package.json | 2 +- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/cli/npm/install.js b/cli/npm/install.js index d73c51cb..9350f682 100755 --- a/cli/npm/install.js +++ b/cli/npm/install.js @@ -51,6 +51,10 @@ get(assetURL, response => { response.pipe(zlib.createGunzip()).pipe(file); }); +file.on('finish', () => { + fs.chmodSync(executableName, '755'); +}); + // Follow redirects. 
function get(url, callback) { https.get(url, response => { diff --git a/cli/npm/package-lock.json b/cli/npm/package-lock.json index b78f1d4e..685806c4 100644 --- a/cli/npm/package-lock.json +++ b/cli/npm/package-lock.json @@ -1,5 +1,5 @@ { "name": "tree-sitter-cli", - "version": "0.14.0-beta0", + "version": "0.14.0-beta1", "lockfileVersion": 1 } diff --git a/cli/npm/package.json b/cli/npm/package.json index de64c70f..0155c8da 100644 --- a/cli/npm/package.json +++ b/cli/npm/package.json @@ -14,7 +14,7 @@ ], "main": "lib/api/index.js", "scripts": { - "install": "install.js" + "install": "node install.js" }, "bin": { "tree-sitter": "tree-sitter" From ae07d2d6e4d136e5d7269a4b6c1886e76b520327 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Wed, 16 Jan 2019 14:09:19 -0800 Subject: [PATCH 157/208] Build 32-bit executables on 32-bit appveyor builds --- .appveyor.yml | 3 ++- cli/src/loader.rs | 9 +++++++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/.appveyor.yml b/.appveyor.yml index 26ae0691..3de89da7 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -2,7 +2,8 @@ build: false install: # Install rust - appveyor DownloadFile https://win.rustup.rs/ -FileName rustup-init.exe - - rustup-init -yv --default-toolchain stable + - IF "%PLATFORM%" == "x86" rustup-init -y --default-toolchain stable --default-host i686-pc-windows-msvc + - IF "%PLATFORM%" == "x64" rustup-init -y --default-toolchain stable --default-host x86_64-pc-windows-msvc - set PATH=%PATH%;C:\Users\appveyor\.cargo\bin - rustc -vV - cargo -vV diff --git a/cli/src/loader.rs b/cli/src/loader.rs index afb18f9e..7aa0ca50 100644 --- a/cli/src/loader.rs +++ b/cli/src/loader.rs @@ -15,6 +15,8 @@ const DYLIB_EXTENSION: &'static str = "so"; #[cfg(windows)] const DYLIB_EXTENSION: &'static str = "dll"; +const BUILD_TARGET: &'static str = env!("BUILD_TARGET"); + struct LanguageRepo { name: String, path: PathBuf, @@ -156,8 +158,8 @@ impl Loader { .cpp(true) .opt_level(2) .cargo_metadata(false) - 
.target(env!("BUILD_TARGET")) - .host(env!("BUILD_TARGET")); + .target(BUILD_TARGET) + .host(BUILD_TARGET); let compiler = config.get_compiler(); let mut command = Command::new(compiler.path()); for (key, value) in compiler.env() { @@ -165,6 +167,9 @@ impl Loader { } if cfg!(windows) { + if !BUILD_TARGET.contains("64") { + command.env("Platform", "x86"); + } command .args(&["/nologo", "/LD", "/I"]) .arg(header_path) From e4b9d9dfa9cd2873df2ea70a059554a44d3d8aa5 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Wed, 16 Jan 2019 20:56:36 -0800 Subject: [PATCH 158/208] Fix token conflict detection bugs --- cli/src/generate/build_tables/item.rs | 7 ++++--- cli/src/generate/build_tables/minimize_parse_table.rs | 5 +++-- cli/src/generate/build_tables/mod.rs | 2 +- cli/src/loader.rs | 3 --- 4 files changed, 8 insertions(+), 9 deletions(-) diff --git a/cli/src/generate/build_tables/item.rs b/cli/src/generate/build_tables/item.rs index 0222ac21..6c74d465 100644 --- a/cli/src/generate/build_tables/item.rs +++ b/cli/src/generate/build_tables/item.rs @@ -236,7 +236,7 @@ impl<'a> ParseItemSet<'a> { Err(i) => { self.entries.insert(i, (item, lookaheads.clone())); &mut self.entries[i].1 - }, + } Ok(i) => { self.entries[i].1.insert_all(lookaheads); &mut self.entries[i].1 @@ -248,8 +248,9 @@ impl<'a> ParseItemSet<'a> { let mut previous_variable_index = u32::MAX; let mut previous_step_index = u32::MAX; for (item, _) in self.entries.iter() { - if item.step().is_none() && item.variable_index != previous_variable_index - || item.step_index != previous_step_index + if item.step().is_some() + && (item.variable_index != previous_variable_index + || item.step_index != previous_step_index) { h.write_u32(item.variable_index); h.write_u32(item.step_index); diff --git a/cli/src/generate/build_tables/minimize_parse_table.rs b/cli/src/generate/build_tables/minimize_parse_table.rs index a5cb5f81..bb9b26eb 100644 --- a/cli/src/generate/build_tables/minimize_parse_table.rs +++ 
b/cli/src/generate/build_tables/minimize_parse_table.rs @@ -228,8 +228,9 @@ impl<'a> Minimizer<'a> { // Do not add a token if it conflicts with an existing token. if token.is_terminal() { for existing_token in state.terminal_entries.keys() { - if (is_word_token && self.keywords.contains(existing_token)) - || is_keyword && self.syntax_grammar.word_token.as_ref() == Some(existing_token) + if (is_word_token || is_keyword) + && (self.keywords.contains(existing_token) + || self.syntax_grammar.word_token.as_ref() == Some(existing_token)) { continue; } diff --git a/cli/src/generate/build_tables/mod.rs b/cli/src/generate/build_tables/mod.rs index 28b18109..7811176b 100644 --- a/cli/src/generate/build_tables/mod.rs +++ b/cli/src/generate/build_tables/mod.rs @@ -232,7 +232,7 @@ fn identify_keywords( .filter(|token| { for other_token in keywords.iter() { if other_token != *token - && token_conflict_map.does_match_same_string(token.index, other_token.index) + && token_conflict_map.does_match_same_string(other_token.index, token.index) { info!( "Keywords - exclude {} because it matches the same string as {}", diff --git a/cli/src/loader.rs b/cli/src/loader.rs index 7aa0ca50..70056404 100644 --- a/cli/src/loader.rs +++ b/cli/src/loader.rs @@ -167,9 +167,6 @@ impl Loader { } if cfg!(windows) { - if !BUILD_TARGET.contains("64") { - command.env("Platform", "x86"); - } command .args(&["/nologo", "/LD", "/I"]) .arg(header_path) From d903371709c8ae442c7ec162e9ea227ce59a2863 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 17 Jan 2019 10:07:58 -0800 Subject: [PATCH 159/208] Remove noisy logging --- cli/src/generate/build_tables/build_lex_table.rs | 8 -------- 1 file changed, 8 deletions(-) diff --git a/cli/src/generate/build_tables/build_lex_table.rs b/cli/src/generate/build_tables/build_lex_table.rs index 03ec0c7b..38f56cc3 100644 --- a/cli/src/generate/build_tables/build_lex_table.rs +++ b/cli/src/generate/build_tables/build_lex_table.rs @@ -184,21 +184,13 @@ impl<'a> 
LexTableBuilder<'a> { completion = Some((id, prec)); } - info!( - "lex state: {}, completion: {:?}", - state_id, - completion.map(|(id, prec)| (&self.lexical_grammar.variables[id].name, prec)) - ); - let transitions = self.cursor.transitions(); let has_sep = self.cursor.transition_chars().any(|(_, sep)| sep); - info!("lex state: {}, transitions: {:?}", state_id, transitions); // If EOF is a valid lookahead token, add a transition predicated on the null // character that leads to the empty set of NFA states. if eof_valid { let (next_state_id, _) = self.add_state(Vec::new(), false); - info!("lex state: {}, successor: EOF", state_id); self.table.states[state_id].advance_actions.push(( CharacterSet::empty().add_char('\0'), AdvanceAction { From d52a11fd03a7d348275422990fb4ab8fc23ca2fa Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 17 Jan 2019 10:09:03 -0800 Subject: [PATCH 160/208] Avoid using a string literal to pass grammar path to JS Backslashes in windows path were getting interpeted as escape characters. 
--- cli/src/generate/dsl.js | 3 +++ cli/src/generate/mod.rs | 11 +++-------- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/cli/src/generate/dsl.js b/cli/src/generate/dsl.js index fa60dfa7..950b2d3b 100644 --- a/cli/src/generate/dsl.js +++ b/cli/src/generate/dsl.js @@ -327,3 +327,6 @@ global.seq = seq; global.sym = sym; global.token = token; global.grammar = grammar; + +const result = require(process.env.TREE_SITTER_GRAMMAR_PATH); +console.log(JSON.stringify(result, null, 2)); diff --git a/cli/src/generate/mod.rs b/cli/src/generate/mod.rs index f42dff96..baaeb182 100644 --- a/cli/src/generate/mod.rs +++ b/cli/src/generate/mod.rs @@ -104,23 +104,18 @@ fn load_grammar_file(grammar_path: &PathBuf) -> String { fn load_js_grammar_file(grammar_path: &PathBuf) -> String { let mut node_process = Command::new("node") + .env("TREE_SITTER_GRAMMAR_PATH", grammar_path) .stdin(Stdio::piped()) .stdout(Stdio::piped()) .spawn() .expect("Failed to run `node`"); - let js_prelude = include_str!("./dsl.js"); let mut node_stdin = node_process .stdin .take() .expect("Failed to open stdin for node"); - write!( - node_stdin, - "{}\nconsole.log(JSON.stringify(require(\"{}\"), null, 2));\n", - js_prelude, - grammar_path.to_str().unwrap() - ) - .expect("Failed to write to node's stdin"); + let javascript_code = include_bytes!("./dsl.js"); + node_stdin.write(javascript_code).expect("Failed to write to node's stdin"); drop(node_stdin); let output = node_process .wait_with_output() From 3d11388cd10d2e69191a6e255e47afce16eeab69 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 17 Jan 2019 12:40:21 -0800 Subject: [PATCH 161/208] Fix test subcommand bugs * Log session was dropped before the parser * Whitespace between close parens was not stripped --- cli/src/test.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cli/src/test.rs b/cli/src/test.rs index 4d6034e5..3a40eb83 100644 --- a/cli/src/test.rs +++ b/cli/src/test.rs @@ -44,12 +44,12 @@ pub fn 
run_tests_at_path( filter: Option<&str>, ) -> Result<()> { let test_entry = parse_tests(path)?; - let mut log_session = None; + let mut _log_session = None; let mut parser = Parser::new(); parser.set_language(language)?; if debug_graph { - log_session = Some(util::log_graphs(&mut parser, "log.html")?); + _log_session = Some(util::log_graphs(&mut parser, "log.html")?); } else if debug { parser.set_logger(Some(Box::new(|log_type, message| { if log_type == LogType::Lex { @@ -82,7 +82,6 @@ pub fn run_tests_at_path( } } - drop(log_session); Ok(()) } @@ -200,6 +199,7 @@ fn parse_test_content(name: String, content: String) -> TestEntry { if let Ok(output) = str::from_utf8(&bytes[divider_end..header_start]) { let input = bytes[previous_header_end..divider_start].to_vec(); let output = WHITESPACE_REGEX.replace_all(output.trim(), " ").to_string(); + let output = output.replace(" )", ")"); children.push(TestEntry::Example { name: previous_name, input, From 9f7079c9c50abd43cceda31e22b6871ac4db6847 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 17 Jan 2019 12:44:14 -0800 Subject: [PATCH 162/208] Ensure that the word token has a low numerical index Fixes https://github.com/tree-sitter/tree-sitter/issues/258 --- cli/src/generate/build_tables/mod.rs | 10 +++++----- cli/src/generate/prepare_grammar/extract_tokens.rs | 10 ++++++++++ 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/cli/src/generate/build_tables/mod.rs b/cli/src/generate/build_tables/mod.rs index 7811176b..3d7b6fd0 100644 --- a/cli/src/generate/build_tables/mod.rs +++ b/cli/src/generate/build_tables/mod.rs @@ -172,17 +172,17 @@ fn populate_used_symbols( non_terminal_usages[symbol.index] = true; } } - for (i, value) in external_usages.into_iter().enumerate() { - if value { - parse_table.symbols.push(Symbol::external(i)); - } - } parse_table.symbols.push(Symbol::end()); for (i, value) in terminal_usages.into_iter().enumerate() { if value { parse_table.symbols.push(Symbol::terminal(i)); } } + for 
(i, value) in external_usages.into_iter().enumerate() { + if value { + parse_table.symbols.push(Symbol::external(i)); + } + } for (i, value) in non_terminal_usages.into_iter().enumerate() { if value { parse_table.symbols.push(Symbol::non_terminal(i)); diff --git a/cli/src/generate/prepare_grammar/extract_tokens.rs b/cli/src/generate/prepare_grammar/extract_tokens.rs index ae07763b..72df21b2 100644 --- a/cli/src/generate/prepare_grammar/extract_tokens.rs +++ b/cli/src/generate/prepare_grammar/extract_tokens.rs @@ -15,6 +15,16 @@ pub(super) fn extract_tokens( extracted_usage_counts: Vec::new(), }; + // Extract the word token first to give it a low numerical index. This ensure that + // it can be stored in a subtree with no heap allocations, even for grammars with + // very large numbers of tokens. This is an optimization, but also important to + // ensure that a subtree's symbol can be successfully reassigned to the word token + // without having to move the subtree to the heap. + // See https://github.com/tree-sitter/tree-sitter/issues/258 + if let Some(token) = grammar.word_token { + extractor.extract_tokens_in_variable(&mut grammar.variables[token.index]); + } + for mut variable in grammar.variables.iter_mut() { extractor.extract_tokens_in_variable(&mut variable); } From bb5dedfb1e43440267cab845b94e78ba5fedbaa3 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 17 Jan 2019 12:44:35 -0800 Subject: [PATCH 163/208] Fix another token conflict detection bug --- cli/src/generate/build_tables/token_conflicts.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cli/src/generate/build_tables/token_conflicts.rs b/cli/src/generate/build_tables/token_conflicts.rs index 13c69c19..1f89022a 100644 --- a/cli/src/generate/build_tables/token_conflicts.rs +++ b/cli/src/generate/build_tables/token_conflicts.rs @@ -286,6 +286,9 @@ fn compute_conflict_status( if transition.characters.does_intersect(&following_chars[i]) { result.1.does_match_valid_continuation = true; } + if 
transition.is_separator || has_sep { + result.1.does_match_separators = true; + } } } } From 8f4096e5cb20c508ceae368bcdbe69b72244281f Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 17 Jan 2019 12:50:30 -0800 Subject: [PATCH 164/208] Give more informative error messages when failing to write files --- cli/src/generate/mod.rs | 38 ++++++++++++++++++++-------------- cli/src/generate/properties.rs | 4 +++- 2 files changed, 26 insertions(+), 16 deletions(-) diff --git a/cli/src/generate/mod.rs b/cli/src/generate/mod.rs index baaeb182..062a9e6b 100644 --- a/cli/src/generate/mod.rs +++ b/cli/src/generate/mod.rs @@ -2,7 +2,7 @@ use self::build_tables::build_tables; use self::parse_grammar::parse_grammar; use self::prepare_grammar::prepare_grammar; use self::render::render_c_code; -use crate::error::Result; +use crate::error::{Error, Result}; use regex::{Regex, RegexBuilder}; use std::fs; use std::io::Write; @@ -41,19 +41,17 @@ pub fn generate_parser_in_directory( generate_parser_for_grammar_with_opts(&grammar_json, minimize, state_ids_to_log)?; let repo_src_path = repo_path.join("src"); fs::create_dir_all(&repo_src_path)?; - fs::write(&repo_src_path.join("parser.c"), c_code)?; - let binding_cc_path = repo_src_path.join("binding.cc"); - if !binding_cc_path.exists() { - fs::write(&binding_cc_path, npm_files::binding_cc(&language_name))?; - } - let binding_gyp_path = repo_path.join("binding.gyp"); - if !binding_gyp_path.exists() { - fs::write(&binding_gyp_path, npm_files::binding_gyp(&language_name))?; - } - let index_js_path = repo_path.join("index.js"); - if !index_js_path.exists() { - fs::write(&index_js_path, npm_files::index_js(&language_name))?; - } + fs::write(&repo_src_path.join("parser.c"), c_code) + .map_err(|e| format!("Failed to write parser.c: {}", e))?; + ensure_file(&repo_src_path.join("binding.cc"), || { + npm_files::binding_cc(&language_name) + })?; + ensure_file(&repo_path.join("binding.gyp"), || { + npm_files::binding_gyp(&language_name) + })?; + 
ensure_file(&repo_path.join("index.js"), || { + npm_files::index_js(&language_name) + })?; } properties::generate_property_sheets(repo_path)?; Ok(()) @@ -115,7 +113,9 @@ fn load_js_grammar_file(grammar_path: &PathBuf) -> String { .take() .expect("Failed to open stdin for node"); let javascript_code = include_bytes!("./dsl.js"); - node_stdin.write(javascript_code).expect("Failed to write to node's stdin"); + node_stdin + .write(javascript_code) + .expect("Failed to write to node's stdin"); drop(node_stdin); let output = node_process .wait_with_output() @@ -128,3 +128,11 @@ fn load_js_grammar_file(grammar_path: &PathBuf) -> String { String::from_utf8(output.stdout).expect("Got invalid UTF8 from node") } + +fn ensure_file(path: &PathBuf, f: impl Fn() -> String) -> Result<()> { + if path.exists() { + Ok(()) + } else { + fs::write(path, f()).map_err(|e| Error(format!("Failed to write file {:?}: {}", path, e))) + } +} diff --git a/cli/src/generate/properties.rs b/cli/src/generate/properties.rs index b16e698a..bf299af4 100644 --- a/cli/src/generate/properties.rs +++ b/cli/src/generate/properties.rs @@ -432,7 +432,9 @@ pub fn generate_property_sheets(repo_path: &Path) -> Result<()> { let property_sheet_json_path = src_dir_path .join(css_path.file_name().unwrap()) .with_extension("json"); - let property_sheet_json_file = File::create(property_sheet_json_path)?; + let property_sheet_json_file = File::create(&property_sheet_json_path).map_err(|e| + format!("Failed to create {:?}: {}", property_sheet_json_path, e) + )?; let mut writer = BufWriter::new(property_sheet_json_file); serde_json::to_writer_pretty(&mut writer, &sheet)?; } From c27f776d418c3413b907435d0e8fe5a86f99f7db Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 17 Jan 2019 12:50:30 -0800 Subject: [PATCH 165/208] Fix word token index issue in a different way Refs https://github.com/tree-sitter/tree-sitter/issues/258 --- cli/src/generate/build_tables/mod.rs | 12 +++++++++++- 
cli/src/generate/prepare_grammar/extract_tokens.rs | 10 ---------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/cli/src/generate/build_tables/mod.rs b/cli/src/generate/build_tables/mod.rs index 3d7b6fd0..92fddefe 100644 --- a/cli/src/generate/build_tables/mod.rs +++ b/cli/src/generate/build_tables/mod.rs @@ -175,7 +175,17 @@ fn populate_used_symbols( parse_table.symbols.push(Symbol::end()); for (i, value) in terminal_usages.into_iter().enumerate() { if value { - parse_table.symbols.push(Symbol::terminal(i)); + // Assign the grammar's word token a low numerical index. This ensures that + // it can be stored in a subtree with no heap allocations, even for grammars with + // very large numbers of tokens. This is an optimization, but it's also important to + // ensure that a subtree's symbol can be successfully reassigned to the word token + // without having to move the subtree to the heap. + // See https://github.com/tree-sitter/tree-sitter/issues/258 + if syntax_grammar.word_token.map_or(false, |t| t.index == i) { + parse_table.symbols.insert(1, Symbol::terminal(i)); + } else { + parse_table.symbols.push(Symbol::terminal(i)); + } } } for (i, value) in external_usages.into_iter().enumerate() { diff --git a/cli/src/generate/prepare_grammar/extract_tokens.rs b/cli/src/generate/prepare_grammar/extract_tokens.rs index 72df21b2..ae07763b 100644 --- a/cli/src/generate/prepare_grammar/extract_tokens.rs +++ b/cli/src/generate/prepare_grammar/extract_tokens.rs @@ -15,16 +15,6 @@ pub(super) fn extract_tokens( extracted_usage_counts: Vec::new(), }; - // Extract the word token first to give it a low numerical index. This ensure that - // it can be stored in a subtree with no heap allocations, even for grammars with - // very large numbers of tokens. This is an optimization, but also important to - // ensure that a subtree's symbol can be successfully reassigned to the word token - // without having to move the subtree to the heap. 
- // See https://github.com/tree-sitter/tree-sitter/issues/258 - if let Some(token) = grammar.word_token { - extractor.extract_tokens_in_variable(&mut grammar.variables[token.index]); - } - for mut variable in grammar.variables.iter_mut() { extractor.extract_tokens_in_variable(&mut variable); } From 64fa721779e6b69c6d019db68505add1e9cc34a7 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 17 Jan 2019 13:36:35 -0800 Subject: [PATCH 166/208] Don't skip branch builds on appveyor --- .appveyor.yml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/.appveyor.yml b/.appveyor.yml index 3de89da7..4a6721ad 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -25,11 +25,6 @@ test_script: - set TREE_SITTER_TEST=1 - script\test.cmd -branches: - only: - - master - - /\d+\.\d+\.\d+.*/ - before_deploy: - move target\release\tree-sitter.exe tree-sitter.exe - 7z a -tgzip tree-sitter-windows-%PLATFORM%.gz tree-sitter.exe From 06cb829d37ebd5b975483f67d39dd62236908102 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 17 Jan 2019 13:44:22 -0800 Subject: [PATCH 167/208] Try another way of building only tags and PRs on appveyor --- .appveyor.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.appveyor.yml b/.appveyor.yml index 4a6721ad..3d6b7bd7 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -1,5 +1,8 @@ build: false install: + # Terminate early unless building either a tag or a PR. 
+ - if not defined APPVEYOR_REPO_TAG if not defined APPVEYOR_PULL_REQUEST_NUMBER appveyor exit + # Install rust - appveyor DownloadFile https://win.rustup.rs/ -FileName rustup-init.exe - IF "%PLATFORM%" == "x86" rustup-init -y --default-toolchain stable --default-host i686-pc-windows-msvc From 14ecec1d4f7b5f99c18308fc394adcad8018c95a Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 17 Jan 2019 13:55:18 -0800 Subject: [PATCH 168/208] Fix early termination on appveyor --- .appveyor.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.appveyor.yml b/.appveyor.yml index 3d6b7bd7..7d4acdc7 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -1,7 +1,7 @@ build: false install: # Terminate early unless building either a tag or a PR. - - if not defined APPVEYOR_REPO_TAG if not defined APPVEYOR_PULL_REQUEST_NUMBER appveyor exit + - if not defined APPVEYOR_REPO_TAG if not "%APPVEYOR_REPO_BRANCH%" == "master" appveyor exit # Install rust - appveyor DownloadFile https://win.rustup.rs/ -FileName rustup-init.exe From 53c8eaa4c2ad55daef39d877b4fbeb8daa42b162 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 17 Jan 2019 15:15:34 -0800 Subject: [PATCH 169/208] Create a wrapper script for npm package to fix npm install issues --- cli/npm/.gitignore | 1 + cli/npm/cli.js | 12 ++++++++++++ cli/npm/package.json | 2 +- 3 files changed, 14 insertions(+), 1 deletion(-) create mode 100755 cli/npm/cli.js diff --git a/cli/npm/.gitignore b/cli/npm/.gitignore index f0475945..2d3aa23a 100644 --- a/cli/npm/.gitignore +++ b/cli/npm/.gitignore @@ -1,3 +1,4 @@ tree-sitter tree-sitter.exe *.gz +*.tgz diff --git a/cli/npm/cli.js b/cli/npm/cli.js new file mode 100755 index 00000000..404739fa --- /dev/null +++ b/cli/npm/cli.js @@ -0,0 +1,12 @@ +#!/usr/bin/env node + +const path = require('path'); +const spawn = require("child_process").spawn; +const executable = process.platform === 'win32' + ? 
'tree-sitter.exe' + : 'tree-sitter'; +spawn( + path.join(__dirname, executable), + process.argv.slice(2), + {stdio: 'inherit'} +).on('close', process.exit) diff --git a/cli/npm/package.json b/cli/npm/package.json index 0155c8da..230676f1 100644 --- a/cli/npm/package.json +++ b/cli/npm/package.json @@ -17,6 +17,6 @@ "install": "node install.js" }, "bin": { - "tree-sitter": "tree-sitter" + "tree-sitter": "cli.js" } } From c204b5e72837fd5a0fb7aeb09e3c1af5080a4604 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 17 Jan 2019 15:15:40 -0800 Subject: [PATCH 170/208] Print help/version info when run w/ no subcommand --- cli/build.rs | 26 ++++++++++++++++++++++++++ cli/src/main.rs | 6 ++++-- 2 files changed, 30 insertions(+), 2 deletions(-) diff --git a/cli/build.rs b/cli/build.rs index e0ebd1c4..f8e62274 100644 --- a/cli/build.rs +++ b/cli/build.rs @@ -1,6 +1,32 @@ +use std::{io, env, fs}; + fn main() { + let git_sha = read_git_sha().unwrap(); + println!("cargo:rustc-env={}={}", "BUILD_SHA", git_sha); + println!( "cargo:rustc-env=BUILD_TARGET={}", std::env::var("TARGET").unwrap() ); } + +fn read_git_sha() -> io::Result { + let git_path = env::current_dir().unwrap().parent().unwrap().join(".git"); + let git_head_path = git_path.join("HEAD"); + println!("cargo:rerun-if-changed={}", git_head_path.to_str().unwrap()); + let mut head_content = fs::read_to_string(&git_head_path)?; + assert!(head_content.ends_with("\n")); + head_content.pop(); + + if head_content.starts_with("ref: ") { + // We're on a branch. Read the SHA from the ref file. + head_content.replace_range(0.."ref: ".len(), ""); + let ref_filename = git_path.join(&head_content); + println!("cargo:rerun-if-changed={}", ref_filename.to_str().unwrap()); + fs::read_to_string(&ref_filename) + } else { + // We're not on a branch. The `HEAD` file itself contains the sha. 
+ assert_eq!(head_content.len(), 40); + Ok(head_content) + } +} diff --git a/cli/src/main.rs b/cli/src/main.rs index 80a40758..1860ecc2 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -21,7 +21,7 @@ mod util; mod tests; use self::loader::Loader; -use clap::{App, Arg, SubCommand}; +use clap::{App, AppSettings, Arg, SubCommand}; use std::env; use std::fs; use std::path::Path; @@ -37,7 +37,8 @@ fn main() { fn run() -> error::Result<()> { let matches = App::new("tree-sitter") - .version("0.1") + .version(concat!(env!("CARGO_PKG_VERSION"), " (", env!("BUILD_SHA"), ")")) + .setting(AppSettings::SubcommandRequiredElseHelp) .author("Max Brunsfeld ") .about("Generates and tests parsers") .subcommand( @@ -77,6 +78,7 @@ fn run() -> error::Result<()> { let home_dir = dirs::home_dir().unwrap(); let current_dir = env::current_dir().unwrap(); let config_dir = home_dir.join(".tree-sitter"); + fs::create_dir_all(&config_dir).unwrap(); let mut loader = Loader::new(config_dir); From 652eb3bbb62cf883a05c23a12e50df6caf92a45b Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 17 Jan 2019 15:17:55 -0800 Subject: [PATCH 171/208] 0.14.0-beta2 --- Cargo.lock | 2 +- cli/Cargo.toml | 2 +- cli/npm/package-lock.json | 2 +- cli/npm/package.json | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 2edecfc7..f27e897e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -567,7 +567,7 @@ dependencies = [ [[package]] name = "tree-sitter-cli" -version = "0.14.0-beta1" +version = "0.14.0-beta2" dependencies = [ "ansi_term 0.11.0 (registry+https://github.com/rust-lang/crates.io-index)", "cc 1.0.25 (registry+https://github.com/rust-lang/crates.io-index)", diff --git a/cli/Cargo.toml b/cli/Cargo.toml index edd14616..b7cd21d2 100644 --- a/cli/Cargo.toml +++ b/cli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "tree-sitter-cli" -version = "0.14.0-beta1" +version = "0.14.0-beta2" authors = ["Max Brunsfeld "] edition = "2018" diff --git a/cli/npm/package-lock.json 
b/cli/npm/package-lock.json index 685806c4..ff76b456 100644 --- a/cli/npm/package-lock.json +++ b/cli/npm/package-lock.json @@ -1,5 +1,5 @@ { "name": "tree-sitter-cli", - "version": "0.14.0-beta1", + "version": "0.14.0-beta2", "lockfileVersion": 1 } diff --git a/cli/npm/package.json b/cli/npm/package.json index 230676f1..9dfd5e7e 100644 --- a/cli/npm/package.json +++ b/cli/npm/package.json @@ -1,6 +1,6 @@ { "name": "tree-sitter-cli", - "version": "0.14.0-beta1", + "version": "0.14.0-beta2", "author": "Max Brunsfeld", "license": "MIT", "repository": { From 71357afb2fa55a7b35402bdbca9cc1777d2559e4 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 17 Jan 2019 15:59:12 -0800 Subject: [PATCH 172/208] Add version script --- script/version | 49 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) create mode 100755 script/version diff --git a/script/version b/script/version new file mode 100755 index 00000000..4373dcdb --- /dev/null +++ b/script/version @@ -0,0 +1,49 @@ +#!/usr/bin/env node + +const fs = require('fs'); +const path = require('path'); +const {execFileSync} = require('child_process'); + +const cliPath = path.join(__dirname, '..', 'cli'); +const npmPath = path.join(cliPath, 'npm'); +const cargoTomlPath = path.join(cliPath, 'Cargo.toml'); + +const npmMetadata = require(path.join(npmPath, 'package.json')); +const npmVersion = npmMetadata.version; + +const cargoMetadata = fs.readFileSync(cargoTomlPath, 'utf8') +const cargoVersionMatch = cargoMetadata.match(/version = "([^"\n]+)"/); +const cargoVersion = cargoVersionMatch[1]; + +if (npmVersion !== cargoVersion) { + console.error(`NPM version ${npmVersion} does not match Cargo version ${cargoVersion}`); + process.exit(1); +} + +if (process.argv[2]) { + // Check that working directory is clean + const diff = execFileSync( + 'git', + ['diff', '--stat'], + {encoding: 'utf8'} + ); + if (diff.length !== 0) { + console.error('There are uncommited changes.'); + process.exit(1); + } + + 
const newVersion = execFileSync( + 'npm', + ['version', process.argv[2], '--git-tag-version=false'], + {cwd: npmPath, encoding: 'utf8'} + ).trim().replace(/^v/, ''); + const newCargoVersionLine = cargoVersionMatch[0].replace(cargoVersion, newVersion); + const newCargoMetadata = cargoMetadata.replace(cargoVersionMatch[0], newCargoVersionLine); + fs.writeFileSync(cargoTomlPath, newCargoMetadata, 'utf8'); + execFileSync('cargo', ['build'], {cwd: cliPath}); + execFileSync('git', ['commit', '-a', '-m', newVersion]); + execFileSync('git', ['tag', newVersion]); + console.log(newVersion) +} else { + console.log(npmVersion); +} From cbcc61a8cf3f59b1d6fadc106472b7f1cbf378a8 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 17 Jan 2019 17:15:10 -0800 Subject: [PATCH 173/208] Get parse command handling multiple files, add --time, --quiet flags --- cli/src/loader.rs | 8 +-- cli/src/main.rs | 51 +++++++++++++--- cli/src/parse.rs | 147 ++++++++++++++++++++++++++++++--------------- lib/binding/lib.rs | 13 ++++ 4 files changed, 157 insertions(+), 62 deletions(-) diff --git a/cli/src/loader.rs b/cli/src/loader.rs index 70056404..6dd4e4db 100644 --- a/cli/src/loader.rs +++ b/cli/src/loader.rs @@ -56,12 +56,8 @@ impl Loader { let entry = entry?; if let Some(parser_dir_name) = entry.file_name().to_str() { if parser_dir_name.starts_with("tree-sitter-") { - if self - .find_language_at_path(&parser_container_dir.join(parser_dir_name)) - .is_err() - { - eprintln!("Error loading {}", parser_dir_name); - } + self.find_language_at_path(&parser_container_dir.join(parser_dir_name)) + .ok(); } } } diff --git a/cli/src/main.rs b/cli/src/main.rs index 1860ecc2..aaf45cb1 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -37,7 +37,12 @@ fn main() { fn run() -> error::Result<()> { let matches = App::new("tree-sitter") - .version(concat!(env!("CARGO_PKG_VERSION"), " (", env!("BUILD_SHA"), ")")) + .version(concat!( + env!("CARGO_PKG_VERSION"), + " (", + env!("BUILD_SHA"), + ")" + )) 
.setting(AppSettings::SubcommandRequiredElseHelp) .author("Max Brunsfeld ") .about("Generates and tests parsers") @@ -57,9 +62,16 @@ fn run() -> error::Result<()> { .subcommand( SubCommand::with_name("parse") .about("Parse a file") - .arg(Arg::with_name("path").index(1).required(true)) + .arg( + Arg::with_name("path") + .index(1) + .multiple(true) + .required(true), + ) .arg(Arg::with_name("debug").long("debug").short("d")) - .arg(Arg::with_name("debug-graph").long("debug-graph").short("D")), + .arg(Arg::with_name("debug-graph").long("debug-graph").short("D")) + .arg(Arg::with_name("quiet").long("quiet").short("q")) + .arg(Arg::with_name("time").long("time").short("t")), ) .subcommand( SubCommand::with_name("test") @@ -116,12 +128,35 @@ fn run() -> error::Result<()> { } else if let Some(matches) = matches.subcommand_matches("parse") { let debug = matches.is_present("debug"); let debug_graph = matches.is_present("debug-graph"); + let quiet = matches.is_present("quiet"); + let time = matches.is_present("time"); loader.find_all_languages(&vec![home_dir.join("github")])?; - let source_path = Path::new(matches.value_of("path").unwrap()); - if let Some((language, _)) = loader.language_configuration_for_file_name(source_path)? { - parse::parse_file_at_path(language, source_path, debug, debug_graph)?; - } else { - eprintln!("No language found"); + let paths = matches + .values_of("path") + .unwrap() + .into_iter() + .collect::>(); + let max_path_length = paths.iter().map(|p| p.chars().count()).max().unwrap(); + for path in paths { + let path = Path::new(path); + let language = + if let Some((l, _)) = loader.language_configuration_for_file_name(path)? { + l + } else if let Some(l) = loader.language_at_path(¤t_dir)? 
{ + l + } else { + eprintln!("No language found"); + return Ok(()); + }; + parse::parse_file_at_path( + language, + path, + max_path_length, + quiet, + time, + debug, + debug_graph, + )?; } } diff --git a/cli/src/parse.rs b/cli/src/parse.rs index 38b6a61c..54c02ad2 100644 --- a/cli/src/parse.rs +++ b/cli/src/parse.rs @@ -3,21 +3,25 @@ use super::util; use std::fs; use std::io::{self, Write}; use std::path::Path; +use std::time::Instant; use tree_sitter::{Language, LogType, Parser}; pub fn parse_file_at_path( language: Language, path: &Path, + max_path_length: usize, + quiet: bool, + print_time: bool, debug: bool, debug_graph: bool, ) -> Result<()> { - let mut log_session = None; + let mut _log_session = None; let mut parser = Parser::new(); parser.set_language(language)?; - let source_code = fs::read_to_string(path)?; + let source_code = fs::read(path)?; if debug_graph { - log_session = Some(util::log_graphs(&mut parser, "log.html")?); + _log_session = Some(util::log_graphs(&mut parser, "log.html")?); } else if debug { parser.set_logger(Some(Box::new(|log_type, message| { if log_type == LogType::Lex { @@ -27,64 +31,111 @@ pub fn parse_file_at_path( }))); } + let time = Instant::now(); let tree = parser - .parse_str(&source_code, None) + .parse_utf8(&mut |byte, _| &source_code[byte..], None) .expect("Incompatible language version"); + let duration = time.elapsed(); + let duration_ms = duration.as_secs() * 1000 + duration.subsec_nanos() as u64 / 1000000; - drop(log_session); + let mut cursor = tree.walk(); let stdout = io::stdout(); let mut stdout = stdout.lock(); - let mut cursor = tree.walk(); - let mut needs_newline = false; - let mut indent_level = 0; - let mut did_visit_children = false; + + if !quiet { + let mut needs_newline = false; + let mut indent_level = 0; + let mut did_visit_children = false; + loop { + let node = cursor.node(); + let is_named = node.is_named(); + if did_visit_children { + if is_named { + stdout.write(b")")?; + needs_newline = true; + } 
+ if cursor.goto_next_sibling() { + did_visit_children = false; + } else if cursor.goto_parent() { + did_visit_children = true; + indent_level -= 1; + } else { + break; + } + } else { + if is_named { + if needs_newline { + stdout.write(b"\n")?; + } + for _ in 0..indent_level { + stdout.write(b" ")?; + } + let start = node.start_position(); + let end = node.end_position(); + write!( + &mut stdout, + "({} [{}, {}] - [{}, {}]", + node.kind(), + start.row, + start.column, + end.row, + end.column + )?; + needs_newline = true; + } + if cursor.goto_first_child() { + did_visit_children = false; + indent_level += 1; + } else { + did_visit_children = true; + } + } + } + cursor.reset(tree.root_node()); + println!(""); + } + + let mut first_error = None; loop { let node = cursor.node(); - let is_named = node.is_named(); - if did_visit_children { - if is_named { - stdout.write(b")")?; - needs_newline = true; - } - if cursor.goto_next_sibling() { - did_visit_children = false; - } else if cursor.goto_parent() { - did_visit_children = true; - indent_level -= 1; - } else { + if node.has_error() { + if node.is_error() || node.is_missing() { + first_error = Some(node); break; - } - } else { - if is_named { - if needs_newline { - stdout.write(b"\n")?; - } - for _ in 0..indent_level { - stdout.write(b" ")?; - } - let start = node.start_position(); - let end = node.end_position(); - write!( - &mut stdout, - "({} [{}, {}] - [{}, {}]", - node.kind(), - start.row, - start.column, - end.row, - end.column - )?; - needs_newline = true; - } - if cursor.goto_first_child() { - did_visit_children = false; - indent_level += 1; } else { - did_visit_children = true; + cursor.goto_first_child(); + } + } else if !cursor.goto_next_sibling() { + if !cursor.goto_parent() { + break; } } } - println!(""); + if first_error.is_some() || print_time { + write!( + &mut stdout, + "{:width$}\t{} ms", + path.to_str().unwrap(), + duration_ms, + width = max_path_length + )?; + if let Some(node) = first_error { + let 
start = node.start_position(); + let end = node.end_position(); + write!( + &mut stdout, + "\t({} [{}, {}] - [{}, {}]", + node.kind(), + start.row, + start.column, + end.row, + end.column + )?; + } + write!(&mut stdout, "\n")?; + } + Ok(()) } diff --git a/lib/binding/lib.rs b/lib/binding/lib.rs index fdb243ec..8143fd6b 100644 --- a/lib/binding/lib.rs +++ b/lib/binding/lib.rs @@ -19,6 +19,7 @@ use std::marker::PhantomData; use std::os::raw::{c_char, c_void}; use std::ptr; use std::str; +use std::u16; #[derive(Clone, Copy)] #[repr(transparent)] @@ -479,6 +480,14 @@ impl<'tree> Node<'tree> { unsafe { ffi::ts_node_has_error(self.0) } } + pub fn is_error(&self) -> bool { + self.kind_id() == u16::MAX + } + + pub fn is_missing(&self) -> bool { + unsafe { ffi::ts_node_is_missing(self.0) } + } + pub fn start_byte(&self) -> usize { unsafe { ffi::ts_node_start_byte(self.0) as usize } } @@ -622,6 +631,10 @@ impl<'a> TreeCursor<'a> { Some(result as usize) } } + + pub fn reset(&mut self, node: Node<'a>) { + unsafe { ffi::ts_tree_cursor_reset(&mut self.0, node.0) }; + } } impl<'a> Drop for TreeCursor<'a> { From ed195de8b68476572c72aa60d6da83d83e2dfd33 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 17 Jan 2019 17:16:04 -0800 Subject: [PATCH 174/208] rustfmt --- cli/build.rs | 2 +- .../build_tables/build_parse_table.rs | 94 ++++++----- .../generate/prepare_grammar/expand_tokens.rs | 16 +- .../prepare_grammar/extract_simple_aliases.rs | 147 ++++++++++-------- .../prepare_grammar/extract_tokens.rs | 2 +- .../prepare_grammar/flatten_grammar.rs | 2 +- .../prepare_grammar/intern_symbols.rs | 4 +- cli/src/generate/properties.rs | 6 +- cli/src/generate/render.rs | 4 +- cli/src/generate/rules.rs | 5 +- cli/src/tests/corpuses.rs | 2 +- cli/src/tests/fixtures.rs | 9 +- cli/src/tests/mod.rs | 2 +- cli/src/tests/parser_api.rs | 2 +- cli/src/util.rs | 16 +- 15 files changed, 170 insertions(+), 143 deletions(-) diff --git a/cli/build.rs b/cli/build.rs index f8e62274..b24eef82 100644 --- 
a/cli/build.rs +++ b/cli/build.rs @@ -1,4 +1,4 @@ -use std::{io, env, fs}; +use std::{env, fs, io}; fn main() { let git_sha = read_git_sha().unwrap(); diff --git a/cli/src/generate/build_tables/build_parse_table.rs b/cli/src/generate/build_tables/build_parse_table.rs index 792a8759..bd790b29 100644 --- a/cli/src/generate/build_tables/build_parse_table.rs +++ b/cli/src/generate/build_tables/build_parse_table.rs @@ -456,60 +456,68 @@ impl<'a> ParseTableBuilder<'a> { .unwrap(); write!(&mut msg, "Possible interpretations:\n\n").unwrap(); - let interpretions = conflicting_items.iter().enumerate().map(|(i, item)| { - let mut line = String::new(); - write!(&mut line, " {}:", i + 1).unwrap(); + let interpretions = conflicting_items + .iter() + .enumerate() + .map(|(i, item)| { + let mut line = String::new(); + write!(&mut line, " {}:", i + 1).unwrap(); - for preceding_symbol in preceding_symbols - .iter() - .take(preceding_symbols.len() - item.step_index as usize) - { - write!(&mut line, " {}", self.symbol_name(preceding_symbol)).unwrap(); - } - - write!( - &mut line, - " ({}", - &self.syntax_grammar.variables[item.variable_index as usize].name - ) - .unwrap(); - - for (j, step) in item.production.steps.iter().enumerate() { - if j as u32 == item.step_index { - write!(&mut line, " •").unwrap(); + for preceding_symbol in preceding_symbols + .iter() + .take(preceding_symbols.len() - item.step_index as usize) + { + write!(&mut line, " {}", self.symbol_name(preceding_symbol)).unwrap(); } - write!(&mut line, " {}", self.symbol_name(&step.symbol)).unwrap(); - } - write!(&mut line, ")").unwrap(); - - if item.is_done() { write!( &mut line, - " • {} …", - self.symbol_name(&conflicting_lookahead) + " ({}", + &self.syntax_grammar.variables[item.variable_index as usize].name ) .unwrap(); - } - let precedence = item.precedence(); - let associativity = item.associativity(); + for (j, step) in item.production.steps.iter().enumerate() { + if j as u32 == item.step_index { + write!(&mut 
line, " •").unwrap(); + } + write!(&mut line, " {}", self.symbol_name(&step.symbol)).unwrap(); + } - let prec_line = if let Some(associativity) = associativity { - Some(format!( - "(precedence: {}, associativity: {:?})", - precedence, associativity - )) - } else if precedence > 0 { - Some(format!("(precedence: {})", precedence)) - } else { - None - }; + write!(&mut line, ")").unwrap(); - (line, prec_line) - }).collect::>(); + if item.is_done() { + write!( + &mut line, + " • {} …", + self.symbol_name(&conflicting_lookahead) + ) + .unwrap(); + } - let max_interpretation_length = interpretions.iter().map(|i| i.0.chars().count()).max().unwrap(); + let precedence = item.precedence(); + let associativity = item.associativity(); + + let prec_line = if let Some(associativity) = associativity { + Some(format!( + "(precedence: {}, associativity: {:?})", + precedence, associativity + )) + } else if precedence > 0 { + Some(format!("(precedence: {})", precedence)) + } else { + None + }; + + (line, prec_line) + }) + .collect::>(); + + let max_interpretation_length = interpretions + .iter() + .map(|i| i.0.chars().count()) + .max() + .unwrap(); for (line, prec_suffix) in interpretions { msg += &line; diff --git a/cli/src/generate/prepare_grammar/expand_tokens.rs b/cli/src/generate/prepare_grammar/expand_tokens.rs index 6b92713e..8e0f12fe 100644 --- a/cli/src/generate/prepare_grammar/expand_tokens.rs +++ b/cli/src/generate/prepare_grammar/expand_tokens.rs @@ -3,14 +3,15 @@ use crate::error::{Error, Result}; use crate::generate::grammars::{LexicalGrammar, LexicalVariable}; use crate::generate::nfa::{CharacterSet, Nfa, NfaState}; use crate::generate::rules::Rule; +use regex::Regex; use regex_syntax::ast::{ parse, Ast, Class, ClassPerlKind, ClassSet, ClassSetItem, RepetitionKind, RepetitionRange, }; -use regex::Regex; use std::i32; lazy_static! 
{ - static ref CURLY_BRACE_REGEX: Regex = Regex::new(r#"(^|[^\\])\{([^}]*[^0-9,}][^}]*)\}"#).unwrap(); + static ref CURLY_BRACE_REGEX: Regex = + Regex::new(r#"(^|[^\\])\{([^}]*[^0-9,}][^}]*)\}"#).unwrap(); } const ALLOWED_REDUNDANT_ESCAPED_CHARS: [char; 4] = ['!', '\'', '"', '/']; @@ -621,14 +622,9 @@ mod tests { }, // nested groups Row { - rules: vec![Rule::seq(vec![ - Rule::pattern(r#"([^x\\]|\\(.|\n))+"#), - ])], + rules: vec![Rule::seq(vec![Rule::pattern(r#"([^x\\]|\\(.|\n))+"#)])], separators: vec![], - examples: vec![ - ("abcx", Some((0, "abc"))), - ("abc\\0x", Some((0, "abc\\0"))), - ], + examples: vec![("abcx", Some((0, "abc"))), ("abc\\0x", Some((0, "abc\\0")))], }, // allowing unrecognized escape sequences Row { @@ -660,7 +656,7 @@ mod tests { ("u{1234} ok", Some((0, "u{1234}"))), ("{aba}}", Some((1, "{aba}"))), ], - } + }, ]; for Row { diff --git a/cli/src/generate/prepare_grammar/extract_simple_aliases.rs b/cli/src/generate/prepare_grammar/extract_simple_aliases.rs index 84c535b9..79ea5e67 100644 --- a/cli/src/generate/prepare_grammar/extract_simple_aliases.rs +++ b/cli/src/generate/prepare_grammar/extract_simple_aliases.rs @@ -1,5 +1,5 @@ -use crate::generate::rules::{Alias, AliasMap, Symbol, SymbolType}; use crate::generate::grammars::{LexicalGrammar, SyntaxGrammar}; +use crate::generate::rules::{Alias, AliasMap, Symbol, SymbolType}; #[derive(Clone, Default)] struct SymbolStatus { @@ -9,20 +9,34 @@ struct SymbolStatus { pub(super) fn extract_simple_aliases( syntax_grammar: &mut SyntaxGrammar, - lexical_grammar: &LexicalGrammar + lexical_grammar: &LexicalGrammar, ) -> AliasMap { // Determine which symbols in the grammars are *always* aliased to a single name. 
let mut terminal_status_list = vec![SymbolStatus::default(); lexical_grammar.variables.len()]; - let mut non_terminal_status_list = vec![SymbolStatus::default(); syntax_grammar.variables.len()]; - let mut external_status_list = vec![SymbolStatus::default(); syntax_grammar.external_tokens.len()]; + let mut non_terminal_status_list = + vec![SymbolStatus::default(); syntax_grammar.variables.len()]; + let mut external_status_list = + vec![SymbolStatus::default(); syntax_grammar.external_tokens.len()]; for variable in syntax_grammar.variables.iter() { for production in variable.productions.iter() { for step in production.steps.iter() { let mut status = match step.symbol { - Symbol { kind: SymbolType::External, index} => &mut external_status_list[index], - Symbol { kind: SymbolType::NonTerminal, index} => &mut non_terminal_status_list[index], - Symbol { kind: SymbolType::Terminal, index} => &mut terminal_status_list[index], - Symbol { kind: SymbolType::End, .. } => panic!("Unexpected end token"), + Symbol { + kind: SymbolType::External, + index, + } => &mut external_status_list[index], + Symbol { + kind: SymbolType::NonTerminal, + index, + } => &mut non_terminal_status_list[index], + Symbol { + kind: SymbolType::Terminal, + index, + } => &mut terminal_status_list[index], + Symbol { + kind: SymbolType::End, + .. + } => panic!("Unexpected end token"), }; if step.alias.is_none() { @@ -47,10 +61,22 @@ pub(super) fn extract_simple_aliases( for production in variable.productions.iter_mut() { for step in production.steps.iter_mut() { let status = match step.symbol { - Symbol { kind: SymbolType::External, index} => &external_status_list[index], - Symbol { kind: SymbolType::NonTerminal, index} => &non_terminal_status_list[index], - Symbol { kind: SymbolType::Terminal, index} => &terminal_status_list[index], - Symbol { kind: SymbolType::End, .. 
} => panic!("Unexpected end token"), + Symbol { + kind: SymbolType::External, + index, + } => &external_status_list[index], + Symbol { + kind: SymbolType::NonTerminal, + index, + } => &non_terminal_status_list[index], + Symbol { + kind: SymbolType::Terminal, + index, + } => &terminal_status_list[index], + Symbol { + kind: SymbolType::End, + .. + } => panic!("Unexpected end token"), }; if status.alias.is_some() { @@ -83,7 +109,9 @@ pub(super) fn extract_simple_aliases( #[cfg(test)] mod tests { use super::*; - use crate::generate::grammars::{LexicalVariable, SyntaxVariable, VariableType, Production, ProductionStep}; + use crate::generate::grammars::{ + LexicalVariable, Production, ProductionStep, SyntaxVariable, VariableType, + }; use crate::generate::nfa::Nfa; #[test] @@ -93,35 +121,29 @@ mod tests { SyntaxVariable { name: "v1".to_owned(), kind: VariableType::Named, - productions: vec![ - Production { - dynamic_precedence: 0, - steps: vec![ - ProductionStep::new(Symbol::terminal(0)).with_alias("a1", true), - ProductionStep::new(Symbol::terminal(1)).with_alias("a2", true), - ProductionStep::new(Symbol::terminal(2)).with_alias("a3", true), - ], - }, - ], + productions: vec![Production { + dynamic_precedence: 0, + steps: vec![ + ProductionStep::new(Symbol::terminal(0)).with_alias("a1", true), + ProductionStep::new(Symbol::terminal(1)).with_alias("a2", true), + ProductionStep::new(Symbol::terminal(2)).with_alias("a3", true), + ], + }], }, SyntaxVariable { name: "v2".to_owned(), kind: VariableType::Named, - productions: vec![ - Production { - dynamic_precedence: 0, - steps: vec![ - // Token 0 is always aliased as "a1". - ProductionStep::new(Symbol::terminal(0)).with_alias("a1", true), - - // Token 1 is aliased above, but not here. - ProductionStep::new(Symbol::terminal(1)), - - // Token 2 is aliased differently than above. 
- ProductionStep::new(Symbol::terminal(2)).with_alias("a4", true), - ], - }, - ], + productions: vec![Production { + dynamic_precedence: 0, + steps: vec![ + // Token 0 is always aliased as "a1". + ProductionStep::new(Symbol::terminal(0)).with_alias("a1", true), + // Token 1 is aliased above, but not here. + ProductionStep::new(Symbol::terminal(1)), + // Token 2 is aliased differently than above. + ProductionStep::new(Symbol::terminal(2)).with_alias("a4", true), + ], + }], }, ], extra_tokens: Vec::new(), @@ -151,49 +173,50 @@ mod tests { kind: VariableType::Anonymous, implicit_precedence: 0, start_state: 0, - } + }, ], }; let simple_aliases = extract_simple_aliases(&mut syntax_grammar, &lexical_grammar); assert_eq!(simple_aliases.len(), 1); - assert_eq!(simple_aliases[&Symbol::terminal(0)], Alias { - value: "a1".to_string(), - is_named: true, - }); + assert_eq!( + simple_aliases[&Symbol::terminal(0)], + Alias { + value: "a1".to_string(), + is_named: true, + } + ); - assert_eq!(syntax_grammar.variables, vec![ - SyntaxVariable { - name: "v1".to_owned(), - kind: VariableType::Named, - productions: vec![ - Production { + assert_eq!( + syntax_grammar.variables, + vec![ + SyntaxVariable { + name: "v1".to_owned(), + kind: VariableType::Named, + productions: vec![Production { dynamic_precedence: 0, steps: vec![ // 'Simple' alias removed ProductionStep::new(Symbol::terminal(0)), - // Other aliases unchanged ProductionStep::new(Symbol::terminal(1)).with_alias("a2", true), ProductionStep::new(Symbol::terminal(2)).with_alias("a3", true), ], - }, - ], - }, - SyntaxVariable { - name: "v2".to_owned(), - kind: VariableType::Named, - productions: vec![ - Production { + },], + }, + SyntaxVariable { + name: "v2".to_owned(), + kind: VariableType::Named, + productions: vec![Production { dynamic_precedence: 0, steps: vec![ ProductionStep::new(Symbol::terminal(0)), ProductionStep::new(Symbol::terminal(1)), ProductionStep::new(Symbol::terminal(2)).with_alias("a4", true), ], - }, - ], - }, 
- ]); + },], + }, + ] + ); } } diff --git a/cli/src/generate/prepare_grammar/extract_tokens.rs b/cli/src/generate/prepare_grammar/extract_tokens.rs index ae07763b..88afb50f 100644 --- a/cli/src/generate/prepare_grammar/extract_tokens.rs +++ b/cli/src/generate/prepare_grammar/extract_tokens.rs @@ -243,7 +243,7 @@ impl TokenExtractor { Variable { name: string_value.clone(), kind: VariableType::Anonymous, - rule: rule.clone() + rule: rule.clone(), } } else { self.current_variable_token_count += 1; diff --git a/cli/src/generate/prepare_grammar/flatten_grammar.rs b/cli/src/generate/prepare_grammar/flatten_grammar.rs index 204ceb07..98276b7e 100644 --- a/cli/src/generate/prepare_grammar/flatten_grammar.rs +++ b/cli/src/generate/prepare_grammar/flatten_grammar.rs @@ -1,9 +1,9 @@ use super::ExtractedSyntaxGrammar; use crate::error::{Error, Result}; -use crate::generate::rules::Symbol; use crate::generate::grammars::{ Production, ProductionStep, SyntaxGrammar, SyntaxVariable, Variable, }; +use crate::generate::rules::Symbol; use crate::generate::rules::{Alias, Associativity, Rule}; struct RuleFlattener { diff --git a/cli/src/generate/prepare_grammar/intern_symbols.rs b/cli/src/generate/prepare_grammar/intern_symbols.rs index 8b07309b..d742864c 100644 --- a/cli/src/generate/prepare_grammar/intern_symbols.rs +++ b/cli/src/generate/prepare_grammar/intern_symbols.rs @@ -7,9 +7,7 @@ pub(super) fn intern_symbols(grammar: &InputGrammar) -> Result let interner = Interner { grammar }; if variable_type_for_name(&grammar.variables[0].name) == VariableType::Hidden { - return Err(Error( - "A grammar's start rule must be visible.".to_string(), - )); + return Err(Error("A grammar's start rule must be visible.".to_string())); } let mut variables = Vec::with_capacity(grammar.variables.len()); diff --git a/cli/src/generate/properties.rs b/cli/src/generate/properties.rs index bf299af4..4df4d67d 100644 --- a/cli/src/generate/properties.rs +++ b/cli/src/generate/properties.rs @@ -178,7 +178,6 
@@ impl Builder { text: step.text_pattern.clone(), state_id: 0, }, - // Include the rule id so that it can be used when sorting transitions. item.rule_id, )); @@ -432,9 +431,8 @@ pub fn generate_property_sheets(repo_path: &Path) -> Result<()> { let property_sheet_json_path = src_dir_path .join(css_path.file_name().unwrap()) .with_extension("json"); - let property_sheet_json_file = File::create(&property_sheet_json_path).map_err(|e| - format!("Failed to create {:?}: {}", property_sheet_json_path, e) - )?; + let property_sheet_json_file = File::create(&property_sheet_json_path) + .map_err(|e| format!("Failed to create {:?}: {}", property_sheet_json_path, e))?; let mut writer = BufWriter::new(property_sheet_json_file); serde_json::to_writer_pretty(&mut writer, &sheet)?; } diff --git a/cli/src/generate/render.rs b/cli/src/generate/render.rs index 1da7f99d..5e87189c 100644 --- a/cli/src/generate/render.rs +++ b/cli/src/generate/render.rs @@ -563,7 +563,9 @@ impl Generator { indent!(self); for i in 0..self.syntax_grammar.external_tokens.len() { let token = &self.syntax_grammar.external_tokens[i]; - let id_token = token.corresponding_internal_token.unwrap_or(Symbol::external(i)); + let id_token = token + .corresponding_internal_token + .unwrap_or(Symbol::external(i)); add_line!( self, "[{}] = {},", diff --git a/cli/src/generate/rules.rs b/cli/src/generate/rules.rs index e15070ea..09a20294 100644 --- a/cli/src/generate/rules.rs +++ b/cli/src/generate/rules.rs @@ -59,10 +59,7 @@ pub(crate) enum Rule { impl Rule { pub fn alias(content: Rule, value: String, is_named: bool) -> Self { add_metadata(content, move |params| { - params.alias = Some(Alias { - is_named, - value - }); + params.alias = Some(Alias { is_named, value }); }) } diff --git a/cli/src/tests/corpuses.rs b/cli/src/tests/corpuses.rs index 173426d6..b28b2510 100644 --- a/cli/src/tests/corpuses.rs +++ b/cli/src/tests/corpuses.rs @@ -1,4 +1,4 @@ -use super::fixtures::{get_language, get_test_language, fixtures_dir}; 
+use super::fixtures::{fixtures_dir, get_language, get_test_language}; use crate::generate; use crate::test::{parse_tests, print_diff, print_diff_key, TestEntry}; use crate::util; diff --git a/cli/src/tests/fixtures.rs b/cli/src/tests/fixtures.rs index 978a1212..639b1004 100644 --- a/cli/src/tests/fixtures.rs +++ b/cli/src/tests/fixtures.rs @@ -1,7 +1,7 @@ use crate::loader::Loader; +use std::fs; use std::path::{Path, PathBuf}; use tree_sitter::Language; -use std::fs; lazy_static! { static ref ROOT_DIR: PathBuf = [env!("CARGO_MANIFEST_DIR"), ".."].iter().collect(); @@ -41,11 +41,6 @@ pub fn get_test_language(name: &str, parser_code: String, path: &Path) -> Langua None }; TEST_LOADER - .load_language_from_sources( - name, - &HEADER_DIR, - &parser_c_path, - &scanner_path, - ) + .load_language_from_sources(name, &HEADER_DIR, &parser_c_path, &scanner_path) .unwrap() } diff --git a/cli/src/tests/mod.rs b/cli/src/tests/mod.rs index c9f1dda4..a874358a 100644 --- a/cli/src/tests/mod.rs +++ b/cli/src/tests/mod.rs @@ -1,3 +1,3 @@ -mod fixtures; mod corpuses; +mod fixtures; mod parser_api; diff --git a/cli/src/tests/parser_api.rs b/cli/src/tests/parser_api.rs index a399bf38..9a4ce9f1 100644 --- a/cli/src/tests/parser_api.rs +++ b/cli/src/tests/parser_api.rs @@ -1,6 +1,6 @@ use super::fixtures::get_language; use std::thread; -use tree_sitter::{InputEdit, LogType, Parser, Point, PropertySheet, Language}; +use tree_sitter::{InputEdit, Language, LogType, Parser, Point, PropertySheet}; fn rust() -> Language { get_language("rust") diff --git a/cli/src/util.rs b/cli/src/util.rs index 166e54d0..b1073624 100644 --- a/cli/src/util.rs +++ b/cli/src/util.rs @@ -35,7 +35,11 @@ pub(crate) fn log_graphs(parser: &mut Parser, path: &str) -> std::io::Result HTML_HEADER.len() as u64 { + if cfg!(target_os = "macos") + && fs::metadata(&self.0).unwrap().len() > HTML_HEADER.len() as u64 + { Command::new("open").arg("log.html").output().unwrap(); } } else { - eprintln!("Dot failed: {} {}", 
String::from_utf8_lossy(&output.stdout), String::from_utf8_lossy(&output.stderr)); + eprintln!( + "Dot failed: {} {}", + String::from_utf8_lossy(&output.stdout), + String::from_utf8_lossy(&output.stderr) + ); } } } From 1d463522977f1e8f0590d626e174c5b0a6dc5f2b Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 17 Jan 2019 17:26:48 -0800 Subject: [PATCH 175/208] Fix check for APPVEYOR_REPO_TAG --- .appveyor.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.appveyor.yml b/.appveyor.yml index 7d4acdc7..de82a7d5 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -1,7 +1,7 @@ build: false install: # Terminate early unless building either a tag or a PR. - - if not defined APPVEYOR_REPO_TAG if not "%APPVEYOR_REPO_BRANCH%" == "master" appveyor exit + - if "%APPVEYOR_REPO_TAG%" == "false" if not "%APPVEYOR_REPO_BRANCH%" == "master" appveyor exit # Install rust - appveyor DownloadFile https://win.rustup.rs/ -FileName rustup-init.exe From 31bdf5eb97faabcf61e0f4b911b34ebaa4c319f2 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 18 Jan 2019 09:40:09 -0800 Subject: [PATCH 176/208] Fix handling of JavaScript errors Refs #258 --- cli/src/generate/dsl.js | 3 +-- cli/src/generate/mod.rs | 16 ++++++++-------- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/cli/src/generate/dsl.js b/cli/src/generate/dsl.js index 950b2d3b..1a9bed20 100644 --- a/cli/src/generate/dsl.js +++ b/cli/src/generate/dsl.js @@ -135,7 +135,6 @@ token.immediate = function(value) { } function normalize(value) { - if (typeof value == "undefined") throw new Error("Undefined symbol"); @@ -289,7 +288,7 @@ function grammar(baseGrammar, options) { throw new Error("Grammar's conflicts must be an array of arrays of rules."); } - return conflictSet.map(symbol => symbol.name); + return conflictSet.map(symbol => normalize(symbol).name); }); } diff --git a/cli/src/generate/mod.rs b/cli/src/generate/mod.rs index 062a9e6b..535f9d19 100644 --- a/cli/src/generate/mod.rs +++ 
b/cli/src/generate/mod.rs @@ -36,7 +36,7 @@ pub fn generate_parser_in_directory( ) -> Result<()> { if !properties_only { let grammar_path = grammar_path.map_or(repo_path.join("grammar.js"), |s| s.into()); - let grammar_json = load_grammar_file(&grammar_path); + let grammar_json = load_grammar_file(&grammar_path)?; let (language_name, c_code) = generate_parser_for_grammar_with_opts(&grammar_json, minimize, state_ids_to_log)?; let repo_src_path = repo_path.join("src"); @@ -92,15 +92,15 @@ fn generate_parser_for_grammar_with_opts( Ok((input_grammar.name, c_code)) } -fn load_grammar_file(grammar_path: &PathBuf) -> String { +fn load_grammar_file(grammar_path: &PathBuf) -> Result { match grammar_path.extension().and_then(|e| e.to_str()) { - Some("js") => load_js_grammar_file(grammar_path), - Some("json") => fs::read_to_string(grammar_path).expect("Failed to read grammar file"), - _ => panic!("Unknown grammar file extension"), + Some("js") => Ok(load_js_grammar_file(grammar_path)?), + Some("json") => Ok(fs::read_to_string(grammar_path)?), + _ => Err(Error(format!("Unknown grammar file extension: {:?}", grammar_path))), } } -fn load_js_grammar_file(grammar_path: &PathBuf) -> String { +fn load_js_grammar_file(grammar_path: &PathBuf) -> Result { let mut node_process = Command::new("node") .env("TREE_SITTER_GRAMMAR_PATH", grammar_path) .stdin(Stdio::piped()) @@ -123,10 +123,10 @@ fn load_js_grammar_file(grammar_path: &PathBuf) -> String { match output.status.code() { None => panic!("Node process was killed"), Some(0) => {} - Some(code) => panic!(format!("Node process exited with status {}", code)), + Some(code) => return Err(Error(format!("Node process exited with status {}", code))), } - String::from_utf8(output.stdout).expect("Got invalid UTF8 from node") + Ok(String::from_utf8(output.stdout).expect("Got invalid UTF8 from node")) } fn ensure_file(path: &PathBuf, f: impl Fn() -> String) -> Result<()> { From ff41f05a204e6bf8679e4e490c99dd671bb79ba5 Mon Sep 17 00:00:00 2001 
From: Max Brunsfeld Date: Fri, 18 Jan 2019 15:13:13 -0800 Subject: [PATCH 177/208] Fix computation of following tokens --- .../build_tables/build_parse_table.rs | 61 ++++++++++++------- cli/src/generate/build_tables/item.rs | 26 ++++++-- .../generate/build_tables/item_set_builder.rs | 40 +++++++++++- .../generate/build_tables/token_conflicts.rs | 32 ++++++++-- cli/src/generate/grammars.rs | 2 +- 5 files changed, 125 insertions(+), 36 deletions(-) diff --git a/cli/src/generate/build_tables/build_parse_table.rs b/cli/src/generate/build_tables/build_parse_table.rs index bd790b29..5351f72e 100644 --- a/cli/src/generate/build_tables/build_parse_table.rs +++ b/cli/src/generate/build_tables/build_parse_table.rs @@ -41,12 +41,11 @@ struct ParseTableBuilder<'a> { item_sets_by_state_id: Vec>, parse_state_queue: VecDeque, parse_table: ParseTable, - following_tokens: Vec, state_ids_to_log: Vec, } impl<'a> ParseTableBuilder<'a> { - fn build(mut self) -> Result<(ParseTable, Vec)> { + fn build(mut self) -> Result { // Ensure that the empty alias sequence has index 0. 
self.parse_table.alias_sequences.push(Vec::new()); @@ -99,7 +98,7 @@ impl<'a> ParseTableBuilder<'a> { self.remove_precedences(); - Ok((self.parse_table, self.following_tokens)) + Ok(self.parse_table) } fn add_parse_state( @@ -108,20 +107,6 @@ impl<'a> ParseTableBuilder<'a> { preceding_auxiliary_symbols: &AuxiliarySymbolSequence, item_set: ParseItemSet<'a>, ) -> ParseStateId { - if preceding_symbols.len() > 1 { - let left_tokens = self - .item_set_builder - .last_set(&preceding_symbols[preceding_symbols.len() - 2]); - let right_tokens = self - .item_set_builder - .first_set(&preceding_symbols[preceding_symbols.len() - 1]); - for left_token in left_tokens.iter() { - if left_token.is_terminal() { - self.following_tokens[left_token.index].insert_all(right_tokens); - } - } - } - let mut hasher = DefaultHasher::new(); item_set.hash_unfinished_items(&mut hasher); let unfinished_item_signature = hasher.finish(); @@ -705,17 +690,50 @@ impl<'a> ParseTableBuilder<'a> { } } +fn populate_following_tokens( + result: &mut Vec, + grammar: &SyntaxGrammar, + inlines: &InlinedProductionMap, + builder: &ParseItemSetBuilder, +) { + let productions = grammar + .variables + .iter() + .flat_map(|v| &v.productions) + .chain(&inlines.productions); + for production in productions { + for i in 1..production.steps.len() { + let left_tokens = builder.last_set(&production.steps[i - 1].symbol); + let right_tokens = builder.first_set(&production.steps[i].symbol); + for left_token in left_tokens.iter() { + if left_token.is_terminal() { + result[left_token.index].insert_all_terminals(right_tokens); + } + } + } + } +} + pub(crate) fn build_parse_table( syntax_grammar: &SyntaxGrammar, lexical_grammar: &LexicalGrammar, inlines: &InlinedProductionMap, state_ids_to_log: Vec, ) -> Result<(ParseTable, Vec)> { - ParseTableBuilder { + let item_set_builder = ParseItemSetBuilder::new(syntax_grammar, lexical_grammar, inlines); + let mut following_tokens = vec![TokenSet::new(); lexical_grammar.variables.len()]; 
+ populate_following_tokens( + &mut following_tokens, + syntax_grammar, + inlines, + &item_set_builder, + ); + + let table = ParseTableBuilder { syntax_grammar, lexical_grammar, state_ids_to_log, - item_set_builder: ParseItemSetBuilder::new(syntax_grammar, lexical_grammar, inlines), + item_set_builder, state_ids_by_item_set: HashMap::new(), item_sets_by_state_id: Vec::new(), parse_state_queue: VecDeque::new(), @@ -725,7 +743,8 @@ pub(crate) fn build_parse_table( alias_sequences: Vec::new(), max_aliased_production_length: 0, }, - following_tokens: vec![TokenSet::new(); lexical_grammar.variables.len()], } - .build() + .build()?; + + Ok((table, following_tokens)) } diff --git a/cli/src/generate/build_tables/item.rs b/cli/src/generate/build_tables/item.rs index 6c74d465..9f3307dd 100644 --- a/cli/src/generate/build_tables/item.rs +++ b/cli/src/generate/build_tables/item.rs @@ -48,7 +48,11 @@ pub(crate) struct ParseItemDisplay<'a>( pub &'a LexicalGrammar, ); -pub(crate) struct TokenSetDisplay<'a>(&'a TokenSet, &'a SyntaxGrammar, &'a LexicalGrammar); +pub(crate) struct TokenSetDisplay<'a>( + pub &'a TokenSet, + pub &'a SyntaxGrammar, + pub &'a LexicalGrammar, +); #[allow(dead_code)] pub(crate) struct ParseItemSetDisplay<'a>( @@ -134,30 +138,42 @@ impl TokenSet { vec.set(other.index, true); } - pub fn insert_all(&mut self, other: &TokenSet) -> bool { + pub fn insert_all_terminals(&mut self, other: &TokenSet) -> bool { let mut result = false; if other.terminal_bits.len() > self.terminal_bits.len() { self.terminal_bits.resize(other.terminal_bits.len(), false); } - if other.external_bits.len() > self.external_bits.len() { - self.external_bits.resize(other.external_bits.len(), false); - } for (i, element) in other.terminal_bits.iter().enumerate() { if element { result |= !self.terminal_bits[i]; self.terminal_bits.set(i, element); } } + result + } + + fn insert_all_externals(&mut self, other: &TokenSet) -> bool { + let mut result = false; + if other.external_bits.len() > 
self.external_bits.len() { + self.external_bits.resize(other.external_bits.len(), false); + } for (i, element) in other.external_bits.iter().enumerate() { if element { result |= !self.external_bits[i]; self.external_bits.set(i, element); } } + result + } + + pub fn insert_all(&mut self, other: &TokenSet) -> bool { + let mut result = false; if other.eof { result |= !self.eof; self.eof = true; } + result |= self.insert_all_terminals(other); + result |= self.insert_all_externals(other); result } } diff --git a/cli/src/generate/build_tables/item_set_builder.rs b/cli/src/generate/build_tables/item_set_builder.rs index b941b179..9a929f05 100644 --- a/cli/src/generate/build_tables/item_set_builder.rs +++ b/cli/src/generate/build_tables/item_set_builder.rs @@ -1,6 +1,6 @@ -use super::item::{ParseItem, ParseItemDisplay, ParseItemSet, TokenSet}; +use super::item::{ParseItem, ParseItemDisplay, ParseItemSet, TokenSet, TokenSetDisplay}; use crate::generate::grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar}; -use crate::generate::rules::Symbol; +use crate::generate::rules::{Symbol, SymbolType}; use hashbrown::{HashMap, HashSet}; use std::fmt; @@ -268,7 +268,7 @@ impl<'a> ParseItemSetBuilder<'a> { } pub fn last_set(&self, symbol: &Symbol) -> &TokenSet { - &self.first_sets[symbol] + &self.last_sets[symbol] } fn add_item(&self, set: &mut ParseItemSet<'a>, item: ParseItem<'a>, lookaheads: &TokenSet) { @@ -300,6 +300,40 @@ impl<'a> fmt::Debug for ParseItemSetBuilder<'a> { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { write!(f, "ParseItemSetBuilder {{\n")?; + write!(f, " first_sets: {{\n")?; + for (symbol, first_set) in &self.first_sets { + let name = match symbol.kind { + SymbolType::NonTerminal => &self.syntax_grammar.variables[symbol.index].name, + SymbolType::External => &self.syntax_grammar.external_tokens[symbol.index].name, + SymbolType::Terminal => &self.lexical_grammar.variables[symbol.index].name, + SymbolType::End => "END", + }; + write!( + f, + " 
first({:?}): {}\n", + name, + TokenSetDisplay(first_set, &self.syntax_grammar, &self.lexical_grammar) + )?; + } + write!(f, " }}\n")?; + + write!(f, " last_sets: {{\n")?; + for (symbol, last_set) in &self.last_sets { + let name = match symbol.kind { + SymbolType::NonTerminal => &self.syntax_grammar.variables[symbol.index].name, + SymbolType::External => &self.syntax_grammar.external_tokens[symbol.index].name, + SymbolType::Terminal => &self.lexical_grammar.variables[symbol.index].name, + SymbolType::End => "END", + }; + write!( + f, + " last({:?}): {}\n", + name, + TokenSetDisplay(last_set, &self.syntax_grammar, &self.lexical_grammar) + )?; + } + write!(f, " }}\n")?; + write!(f, " additions: {{\n")?; for (i, variable) in self.syntax_grammar.variables.iter().enumerate() { write!(f, " {}: {{\n", variable.name)?; diff --git a/cli/src/generate/build_tables/token_conflicts.rs b/cli/src/generate/build_tables/token_conflicts.rs index 1f89022a..1c4fc753 100644 --- a/cli/src/generate/build_tables/token_conflicts.rs +++ b/cli/src/generate/build_tables/token_conflicts.rs @@ -1,5 +1,5 @@ -use crate::generate::build_tables::item::TokenSet; -use crate::generate::grammars::LexicalGrammar; +use crate::generate::build_tables::item::{TokenSet, TokenSetDisplay}; +use crate::generate::grammars::{LexicalGrammar, SyntaxGrammar}; use crate::generate::nfa::{CharacterSet, NfaCursor, NfaTransition}; use hashbrown::HashSet; use std::cmp::Ordering; @@ -16,6 +16,7 @@ struct TokenConflictStatus { pub(crate) struct TokenConflictMap<'a> { n: usize, status_matrix: Vec, + following_tokens: Vec, starting_chars_by_index: Vec, following_chars_by_index: Vec, grammar: &'a LexicalGrammar, @@ -25,7 +26,7 @@ impl<'a> TokenConflictMap<'a> { pub fn new(grammar: &'a LexicalGrammar, following_tokens: Vec) -> Self { let mut cursor = NfaCursor::new(&grammar.nfa, Vec::new()); let starting_chars = get_starting_chars(&mut cursor, grammar); - let following_chars = get_following_chars(&starting_chars, 
following_tokens); + let following_chars = get_following_chars(&starting_chars, &following_tokens); let n = grammar.variables.len(); let mut status_matrix = vec![TokenConflictStatus::default(); n * n]; @@ -40,6 +41,7 @@ impl<'a> TokenConflictMap<'a> { TokenConflictMap { n, status_matrix, + following_tokens, starting_chars_by_index: starting_chars, following_chars_by_index: following_chars, grammar, @@ -115,9 +117,27 @@ impl<'a> fmt::Debug for TokenConflictMap<'a> { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { write!(f, "TokenConflictMap {{\n")?; + let syntax_grammar = SyntaxGrammar::default(); + + write!(f, " following_tokens: {{\n")?; + for (i, following_tokens) in self.following_tokens.iter().enumerate() { + write!( + f, + " follow({:?}): {},\n", + self.grammar.variables[i].name, + TokenSetDisplay(following_tokens, &syntax_grammar, &self.grammar) + )?; + } + write!(f, " }},\n")?; + write!(f, " starting_characters: {{\n")?; for i in 0..self.n { - write!(f, " {}: {:?},\n", i, self.starting_chars_by_index[i])?; + write!( + f, + " {:?}: {:?},\n", + self.grammar.variables[i].name, + self.starting_chars_by_index[i] + )?; } write!(f, " }},\n")?; @@ -169,10 +189,10 @@ fn get_starting_chars(cursor: &mut NfaCursor, grammar: &LexicalGrammar) -> Vec, - following_tokens: Vec, + following_tokens: &Vec, ) -> Vec { following_tokens - .into_iter() + .iter() .map(|following_tokens| { let mut chars = CharacterSet::empty(); for token in following_tokens.iter() { diff --git a/cli/src/generate/grammars.rs b/cli/src/generate/grammars.rs index 3cedcd42..c9282da3 100644 --- a/cli/src/generate/grammars.rs +++ b/cli/src/generate/grammars.rs @@ -81,7 +81,7 @@ pub(crate) struct ExternalToken { pub corresponding_internal_token: Option, } -#[derive(Debug)] +#[derive(Debug, Default)] pub(crate) struct SyntaxGrammar { pub variables: Vec, pub extra_tokens: Vec, From 9e610bf88e937daf6a3b3768ec1224b516dd3bd9 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 18 Jan 2019 15:16:51 
-0800 Subject: [PATCH 178/208] 0.14.0-beta3 --- Cargo.lock | 2 +- cli/Cargo.toml | 2 +- cli/npm/package-lock.json | 2 +- cli/npm/package.json | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index f27e897e..003978c1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -567,7 +567,7 @@ dependencies = [ [[package]] name = "tree-sitter-cli" -version = "0.14.0-beta2" +version = "0.14.0-beta3" dependencies = [ "ansi_term 0.11.0 (registry+https://github.com/rust-lang/crates.io-index)", "cc 1.0.25 (registry+https://github.com/rust-lang/crates.io-index)", diff --git a/cli/Cargo.toml b/cli/Cargo.toml index b7cd21d2..d8e50bbf 100644 --- a/cli/Cargo.toml +++ b/cli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "tree-sitter-cli" -version = "0.14.0-beta2" +version = "0.14.0-beta3" authors = ["Max Brunsfeld "] edition = "2018" diff --git a/cli/npm/package-lock.json b/cli/npm/package-lock.json index ff76b456..4590ac72 100644 --- a/cli/npm/package-lock.json +++ b/cli/npm/package-lock.json @@ -1,5 +1,5 @@ { "name": "tree-sitter-cli", - "version": "0.14.0-beta2", + "version": "0.14.0-beta3", "lockfileVersion": 1 } diff --git a/cli/npm/package.json b/cli/npm/package.json index 9dfd5e7e..276ea9d8 100644 --- a/cli/npm/package.json +++ b/cli/npm/package.json @@ -1,6 +1,6 @@ { "name": "tree-sitter-cli", - "version": "0.14.0-beta2", + "version": "0.14.0-beta3", "author": "Max Brunsfeld", "license": "MIT", "repository": { From f6cdd5e3d4817c6dc0fffaeb0c50574171d80309 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Sun, 20 Jan 2019 16:58:31 -0800 Subject: [PATCH 179/208] Loosen criteria for identifying conflict-free tokens for error recovery --- cli/src/generate/build_tables/mod.rs | 4 +- .../generate/build_tables/token_conflicts.rs | 82 ++++++++++++++++--- 2 files changed, 74 insertions(+), 12 deletions(-) diff --git a/cli/src/generate/build_tables/mod.rs b/cli/src/generate/build_tables/mod.rs index 92fddefe..ca54f274 100644 --- 
a/cli/src/generate/build_tables/mod.rs +++ b/cli/src/generate/build_tables/mod.rs @@ -86,14 +86,14 @@ fn populate_error_state( let n = lexical_grammar.variables.len(); // First identify the *conflict-free tokens*: tokens that do not overlap with - // any other token in any way. + // any other token in any way, besides matching exactly the same string. let conflict_free_tokens: TokenSet = (0..n) .into_iter() .filter_map(|i| { let conflicts_with_other_tokens = (0..n).into_iter().any(|j| { j != i && !coincident_token_index.contains(Symbol::terminal(i), Symbol::terminal(j)) - && token_conflict_map.does_conflict(i, j) + && token_conflict_map.does_match_shorter_or_longer(i, j) }); if conflicts_with_other_tokens { None diff --git a/cli/src/generate/build_tables/token_conflicts.rs b/cli/src/generate/build_tables/token_conflicts.rs index 1c4fc753..5c8b3ff5 100644 --- a/cli/src/generate/build_tables/token_conflicts.rs +++ b/cli/src/generate/build_tables/token_conflicts.rs @@ -65,6 +65,13 @@ impl<'a> TokenConflictMap<'a> { || entry.matches_same_string } + pub fn does_match_shorter_or_longer(&self, i: usize, j: usize) -> bool { + let entry = &self.status_matrix[matrix_index(self.n, i, j)]; + let reverse_entry = &self.status_matrix[matrix_index(self.n, j, i)]; + (entry.does_match_valid_continuation || entry.does_match_separators) + && !reverse_entry.does_match_separators + } + pub fn does_overlap(&self, i: usize, j: usize) -> bool { self.status_matrix[matrix_index(self.n, i, j)].does_overlap } @@ -135,8 +142,7 @@ impl<'a> fmt::Debug for TokenConflictMap<'a> { write!( f, " {:?}: {:?},\n", - self.grammar.variables[i].name, - self.starting_chars_by_index[i] + self.grammar.variables[i].name, self.starting_chars_by_index[i] )?; } write!(f, " }},\n")?; @@ -230,8 +236,18 @@ fn compute_conflict_status( continue; } + let has_sep = cursor.transition_chars().any(|(_, sep)| sep); + let mut completion = None; for (id, precedence) in cursor.completions() { + if has_sep { + if id == i { + 
result.0.does_match_separators = true; + } else { + result.1.does_match_separators = true; + } + } + if let Some((prev_id, prev_precedence)) = completion { if id == prev_id { continue; @@ -263,8 +279,6 @@ fn compute_conflict_status( } } - let has_sep = cursor.transition_chars().any(|(_, sep)| sep); - for transition in cursor.transitions() { let mut can_advance = true; if let Some((completed_id, completed_precedence)) = completion { @@ -298,17 +312,11 @@ fn compute_conflict_status( if transition.characters.does_intersect(&following_chars[j]) { result.0.does_match_valid_continuation = true; } - if transition.is_separator || has_sep { - result.0.does_match_separators = true; - } } else { result.1.does_overlap = true; if transition.characters.does_intersect(&following_chars[i]) { result.1.does_match_valid_continuation = true; } - if transition.is_separator || has_sep { - result.1.does_match_separators = true; - } } } } @@ -414,6 +422,60 @@ mod tests { assert!(token_map.does_conflict(var("instanceof"), var("in"))); } + #[test] + fn test_token_conflicts_with_separators() { + let grammar = expand_tokens(ExtractedLexicalGrammar { + separators: vec![Rule::pattern("\\s")], + variables: vec![ + Variable { + name: "x".to_string(), + kind: VariableType::Named, + rule: Rule::string("x"), + }, + Variable { + name: "newline".to_string(), + kind: VariableType::Named, + rule: Rule::string("\n"), + }, + ], + }) + .unwrap(); + + let var = |name| index_of_var(&grammar, name); + + let token_map = TokenConflictMap::new(&grammar, vec![TokenSet::new(); 4]); + + assert!(token_map.does_conflict(var("newline"), var("x"))); + assert!(!token_map.does_conflict(var("x"), var("newline"))); + } + + #[test] + fn test_token_conflicts_with_open_ended_tokens() { + let grammar = expand_tokens(ExtractedLexicalGrammar { + separators: vec![Rule::pattern("\\s")], + variables: vec![ + Variable { + name: "x".to_string(), + kind: VariableType::Named, + rule: Rule::string("x"), + }, + Variable { + name: 
"anything".to_string(), + kind: VariableType::Named, + rule: Rule::prec(-1, Rule::pattern(".*")), + }, + ], + }) + .unwrap(); + + let var = |name| index_of_var(&grammar, name); + + let token_map = TokenConflictMap::new(&grammar, vec![TokenSet::new(); 4]); + + assert!(token_map.does_match_shorter_or_longer(var("anything"), var("x"))); + assert!(!token_map.does_match_shorter_or_longer(var("x"), var("anything"))); + } + fn index_of_var(grammar: &LexicalGrammar, name: &str) -> usize { grammar .variables From 6105bf990937c282898cba2e03c69728f924f4c8 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Sun, 20 Jan 2019 16:58:49 -0800 Subject: [PATCH 180/208] Include error recovery examples in test suite --- .gitignore | 2 +- cli/src/tests/corpuses.rs | 80 ++++++++++++------- cli/src/util.rs | 2 +- .../error_corpus/javascript_errors.txt | 2 +- 4 files changed, 52 insertions(+), 34 deletions(-) diff --git a/.gitignore b/.gitignore index bcb55844..360390b1 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,4 @@ -log.html +log*.html .idea *.xcodeproj diff --git a/cli/src/tests/corpuses.rs b/cli/src/tests/corpuses.rs index b28b2510..6d46aacb 100644 --- a/cli/src/tests/corpuses.rs +++ b/cli/src/tests/corpuses.rs @@ -28,21 +28,9 @@ lazy_static! 
{ #[test] fn test_real_language_corpus_files() { let mut log_session = None; - let mut parser = Parser::new(); + let mut parser = get_parser(&mut log_session, "log1.html"); let grammars_dir = fixtures_dir().join("grammars"); - if *LOG_ENABLED { - parser.set_logger(Some(Box::new(|log_type, msg| { - if log_type == LogType::Lex { - eprintln!(" {}", msg); - } else { - eprintln!("{}", msg); - } - }))); - } else if *LOG_GRAPH_ENABLED { - log_session = Some(util::log_graphs(&mut parser, "log.html").unwrap()); - } - let mut did_fail = false; for language_name in LANGUAGES.iter().cloned() { if let Some(filter) = LANGUAGE_FILTER.as_ref() { @@ -60,8 +48,35 @@ fn test_real_language_corpus_files() { did_fail |= run_mutation_tests(&mut parser, test); } - drop(parser); - drop(log_session); + if did_fail { + panic!("Corpus tests failed"); + } +} + +#[test] +fn test_error_corpus_files() { + let mut log_session = None; + let mut parser = get_parser(&mut log_session, "log2.html"); + let corpus_dir = fixtures_dir().join("error_corpus"); + + let mut did_fail = false; + for entry in fs::read_dir(&corpus_dir).unwrap() { + let entry = entry.unwrap(); + let language_name = entry.file_name(); + let language_name = language_name.to_str().unwrap().replace("_errors.txt", ""); + if let Some(filter) = LANGUAGE_FILTER.as_ref() { + if language_name != filter.as_str() { + continue; + } + } + + eprintln!("language: {:?}", language_name); + + let test = parse_tests(&entry.path()).unwrap(); + let language = get_language(&language_name); + parser.set_language(language).unwrap(); + did_fail |= run_mutation_tests(&mut parser, test); + } if did_fail { panic!("Corpus tests failed"); @@ -71,21 +86,9 @@ fn test_real_language_corpus_files() { #[test] fn test_feature_corpus_files() { let mut log_session = None; - let mut parser = Parser::new(); + let mut parser = get_parser(&mut log_session, "log3.html"); let test_grammars_dir = fixtures_dir().join("test_grammars"); - if *LOG_ENABLED { - 
parser.set_logger(Some(Box::new(|log_type, msg| { - if log_type == LogType::Lex { - eprintln!(" {}", msg); - } else { - eprintln!("{}", msg); - } - }))); - } else if *LOG_GRAPH_ENABLED { - log_session = Some(util::log_graphs(&mut parser, "log.html").unwrap()); - } - let mut did_fail = false; for entry in fs::read_dir(&test_grammars_dir).unwrap() { let entry = entry.unwrap(); @@ -134,9 +137,6 @@ fn test_feature_corpus_files() { } } - drop(parser); - drop(log_session); - if did_fail { panic!("Corpus tests failed"); } @@ -179,3 +179,21 @@ fn run_mutation_tests(parser: &mut Parser, test: TestEntry) -> bool { } } } + +fn get_parser(session: &mut Option, log_filename: &str) -> Parser { + let mut parser = Parser::new(); + + if *LOG_ENABLED { + parser.set_logger(Some(Box::new(|log_type, msg| { + if log_type == LogType::Lex { + eprintln!(" {}", msg); + } else { + eprintln!("{}", msg); + } + }))); + } else if *LOG_GRAPH_ENABLED { + *session = Some(util::log_graphs(&mut parser, log_filename).unwrap()); + } + + parser +} diff --git a/cli/src/util.rs b/cli/src/util.rs index b1073624..004d3b06 100644 --- a/cli/src/util.rs +++ b/cli/src/util.rs @@ -53,7 +53,7 @@ impl Drop for LogSession { if cfg!(target_os = "macos") && fs::metadata(&self.0).unwrap().len() > HTML_HEADER.len() as u64 { - Command::new("open").arg("log.html").output().unwrap(); + Command::new("open").arg(&self.0).output().unwrap(); } } else { eprintln!( diff --git a/test/fixtures/error_corpus/javascript_errors.txt b/test/fixtures/error_corpus/javascript_errors.txt index 1717b85d..ffa9d547 100644 --- a/test/fixtures/error_corpus/javascript_errors.txt +++ b/test/fixtures/error_corpus/javascript_errors.txt @@ -36,7 +36,7 @@ Missing object-literal values (program (expression_statement (object (pair (property_identifier) (identifier)) - (pair (property_identifier) (yield_expression (MISSING)))))) + (pair (property_identifier) (MISSING))))) =================================================== Extra identifiers in 
expressions From 196339aaa9aad0cf9bdc4ef381f008c7c1651c54 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Mon, 21 Jan 2019 14:22:35 -0800 Subject: [PATCH 181/208] Assert no memory leaks by stubbing malloc/free in the test suite --- Cargo.lock | 7 +++ cli/Cargo.toml | 3 + cli/src/tests/allocations.rs | 104 +++++++++++++++++++++++++++++++++++ cli/src/tests/corpuses.rs | 29 +++++----- cli/src/tests/mod.rs | 1 + lib/build.rs | 18 +++++- lib/src/subtree.c | 2 +- 7 files changed, 146 insertions(+), 18 deletions(-) create mode 100644 cli/src/tests/allocations.rs diff --git a/Cargo.lock b/Cargo.lock index 003978c1..936c60ee 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -502,6 +502,11 @@ name = "smallbitvec" version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" +[[package]] +name = "spin" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" + [[package]] name = "strsim" version = "0.7.0" @@ -585,6 +590,7 @@ dependencies = [ "serde_derive 1.0.80 (registry+https://github.com/rust-lang/crates.io-index)", "serde_json 1.0.33 (registry+https://github.com/rust-lang/crates.io-index)", "smallbitvec 2.3.0 (registry+https://github.com/rust-lang/crates.io-index)", + "spin 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)", "tree-sitter 0.3.5", ] @@ -702,6 +708,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" "checksum serde_derive 1.0.80 (registry+https://github.com/rust-lang/crates.io-index)" = "225de307c6302bec3898c51ca302fc94a7a1697ef0845fcee6448f33c032249c" "checksum serde_json 1.0.33 (registry+https://github.com/rust-lang/crates.io-index)" = "c37ccd6be3ed1fdf419ee848f7c758eb31b054d7cd3ae3600e3bae0adf569811" "checksum smallbitvec 2.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "1764fe2b30ee783bfe3b9b37b2649d8d590b3148bb12e0079715d4d5c673562e" +"checksum spin 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)" = 
"44363f6f51401c34e7be73db0db371c04705d35efbe9f7d6082e03a921a32c55" "checksum strsim 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)" = "bb4f380125926a99e52bc279241539c018323fab05ad6368b56f93d9369ff550" "checksum syn 0.15.22 (registry+https://github.com/rust-lang/crates.io-index)" = "ae8b29eb5210bc5cf63ed6149cbf9adfc82ac0be023d8735c176ee74a2db4da7" "checksum synstructure 0.10.1 (registry+https://github.com/rust-lang/crates.io-index)" = "73687139bf99285483c96ac0add482c3776528beac1d97d444f6e91f203a2015" diff --git a/cli/Cargo.toml b/cli/Cargo.toml index d8e50bbf..5eb92079 100644 --- a/cli/Cargo.toml +++ b/cli/Cargo.toml @@ -34,3 +34,6 @@ features = ["preserve_order"] [dependencies.log] version = "0.4.6" features = ["std"] + +[dev-dependencies] +spin = "0.5" diff --git a/cli/src/tests/allocations.rs b/cli/src/tests/allocations.rs new file mode 100644 index 00000000..c4a3dbac --- /dev/null +++ b/cli/src/tests/allocations.rs @@ -0,0 +1,104 @@ +#![cfg(test)] +#![allow(dead_code)] + +use spin::Mutex; +use std::collections::HashMap; +use std::os::raw::{c_ulong, c_void}; + +#[derive(Debug, PartialEq, Eq, Hash)] +struct Allocation(*const c_void); +unsafe impl Send for Allocation {} +unsafe impl Sync for Allocation {} + +#[derive(Default)] +struct AllocationRecorder { + enabled: bool, + allocation_count: u64, + outstanding_allocations: HashMap, +} + +lazy_static! 
{ + static ref RECORDER: Mutex = Mutex::new(AllocationRecorder::default()); +} + +extern "C" { + fn malloc(size: c_ulong) -> *mut c_void; + fn calloc(count: c_ulong, size: c_ulong) -> *mut c_void; + fn realloc(ptr: *mut c_void, size: c_ulong) -> *mut c_void; + fn free(ptr: *mut c_void); +} + +pub fn start_recording() { + let mut recorder = RECORDER.lock(); + recorder.enabled = true; + recorder.allocation_count = 0; + recorder.outstanding_allocations.clear(); +} + +pub fn stop_recording() { + let mut recorder = RECORDER.lock(); + recorder.enabled = false; + + if !recorder.outstanding_allocations.is_empty() { + panic!( + "Leaked allocation indices: {:?}", + recorder + .outstanding_allocations + .iter() + .map(|e| e.1) + .collect::>() + ); + } +} + +fn record_alloc(ptr: *mut c_void) { + let mut recorder = RECORDER.lock(); + if recorder.enabled { + let count = recorder.allocation_count; + recorder.allocation_count += 1; + recorder + .outstanding_allocations + .insert(Allocation(ptr), count); + } +} + +fn record_dealloc(ptr: *mut c_void) { + let mut recorder = RECORDER.lock(); + if recorder.enabled { + recorder.outstanding_allocations.remove(&Allocation(ptr)); + } +} + +#[no_mangle] +extern "C" fn ts_record_malloc(size: c_ulong) -> *const c_void { + let result = unsafe { malloc(size) }; + record_alloc(result); + result +} + +#[no_mangle] +extern "C" fn ts_record_calloc(count: c_ulong, size: c_ulong) -> *const c_void { + let result = unsafe { calloc(count, size) }; + record_alloc(result); + result +} + +#[no_mangle] +extern "C" fn ts_record_realloc(ptr: *mut c_void, size: c_ulong) -> *const c_void { + record_dealloc(ptr); + let result = unsafe { realloc(ptr, size) }; + record_alloc(result); + result +} + +#[no_mangle] +extern "C" fn ts_record_free(ptr: *mut c_void) { + record_dealloc(ptr); + unsafe { free(ptr) }; +} + +#[no_mangle] +extern "C" fn ts_record_allocations_toggle() { + let mut recorder = RECORDER.lock(); + recorder.enabled = !recorder.enabled; +} diff --git 
a/cli/src/tests/corpuses.rs b/cli/src/tests/corpuses.rs index 6d46aacb..2c205d40 100644 --- a/cli/src/tests/corpuses.rs +++ b/cli/src/tests/corpuses.rs @@ -1,9 +1,10 @@ +use super::allocations; use super::fixtures::{fixtures_dir, get_language, get_test_language}; use crate::generate; use crate::test::{parse_tests, print_diff, print_diff_key, TestEntry}; use crate::util; use std::fs; -use tree_sitter::{LogType, Parser}; +use tree_sitter::{Language, LogType, Parser}; const LANGUAGES: &'static [&'static str] = &[ "bash", @@ -27,8 +28,6 @@ lazy_static! { #[test] fn test_real_language_corpus_files() { - let mut log_session = None; - let mut parser = get_parser(&mut log_session, "log1.html"); let grammars_dir = fixtures_dir().join("grammars"); let mut did_fail = false; @@ -44,8 +43,7 @@ fn test_real_language_corpus_files() { let language = get_language(language_name); let corpus_dir = grammars_dir.join(language_name).join("corpus"); let test = parse_tests(&corpus_dir).unwrap(); - parser.set_language(language).unwrap(); - did_fail |= run_mutation_tests(&mut parser, test); + did_fail |= run_mutation_tests(language, test); } if did_fail { @@ -55,8 +53,6 @@ fn test_real_language_corpus_files() { #[test] fn test_error_corpus_files() { - let mut log_session = None; - let mut parser = get_parser(&mut log_session, "log2.html"); let corpus_dir = fixtures_dir().join("error_corpus"); let mut did_fail = false; @@ -74,8 +70,7 @@ fn test_error_corpus_files() { let test = parse_tests(&entry.path()).unwrap(); let language = get_language(&language_name); - parser.set_language(language).unwrap(); - did_fail |= run_mutation_tests(&mut parser, test); + did_fail |= run_mutation_tests(language, test); } if did_fail { @@ -85,8 +80,6 @@ fn test_error_corpus_files() { #[test] fn test_feature_corpus_files() { - let mut log_session = None; - let mut parser = get_parser(&mut log_session, "log3.html"); let test_grammars_dir = fixtures_dir().join("test_grammars"); let mut did_fail = false; @@ -132,8 
+125,7 @@ fn test_feature_corpus_files() { let c_code = generate_result.unwrap().1; let language = get_test_language(language_name, c_code, &test_path); let test = parse_tests(&corpus_path).unwrap(); - parser.set_language(language).unwrap(); - did_fail |= run_mutation_tests(&mut parser, test); + did_fail |= run_mutation_tests(language, test); } } @@ -142,7 +134,7 @@ fn test_feature_corpus_files() { } } -fn run_mutation_tests(parser: &mut Parser, test: TestEntry) -> bool { +fn run_mutation_tests(language: Language, test: TestEntry) -> bool { match test { TestEntry::Example { name, @@ -157,23 +149,30 @@ fn run_mutation_tests(parser: &mut Parser, test: TestEntry) -> bool { eprintln!(" example: {:?}", name); + allocations::start_recording(); + let mut log_session = None; + let mut parser = get_parser(&mut log_session, "log.html"); + parser.set_language(language).unwrap(); let tree = parser .parse_utf8(&mut |byte_offset, _| &input[byte_offset..], None) .unwrap(); let actual = tree.root_node().to_sexp(); + drop(tree); + drop(parser); if actual != output { print_diff_key(); print_diff(&actual, &output); println!(""); true } else { + allocations::stop_recording(); false } } TestEntry::Group { children, .. 
} => { let mut result = false; for child in children { - result |= run_mutation_tests(parser, child); + result |= run_mutation_tests(language, child); } result } diff --git a/cli/src/tests/mod.rs b/cli/src/tests/mod.rs index a874358a..174be67b 100644 --- a/cli/src/tests/mod.rs +++ b/cli/src/tests/mod.rs @@ -1,3 +1,4 @@ +mod allocations; mod corpuses; mod fixtures; mod parser_api; diff --git a/lib/build.rs b/lib/build.rs index 2a121001..df66ee7c 100644 --- a/lib/build.rs +++ b/lib/build.rs @@ -1,6 +1,6 @@ extern crate cc; -use std::env; +use std::{env, fs}; use std::path::{Path, PathBuf}; fn main() { @@ -20,13 +20,27 @@ fn main() { } let mut config = cc::Build::new(); + + println!("cargo:rerun-if-env-changed=TREE_SITTER_TEST"); + if env::var("TREE_SITTER_TEST").is_ok() { + config.define("TREE_SITTER_TEST", ""); + } + + let src_path = Path::new("src"); + + for entry in fs::read_dir(&src_path).unwrap() { + let entry = entry.unwrap(); + let path = src_path.join(entry.file_name()); + println!("cargo:rerun-if-changed={}", path.to_str().unwrap()); + } + config .define("UTF8PROC_STATIC", "") .flag_if_supported("-std=c99") .flag_if_supported("-Wno-unused-parameter") .include("include") .include("utf8proc") - .file(Path::new("src").join("lib.c")) + .file(src_path.join("lib.c")) .compile("tree-sitter"); } diff --git a/lib/src/subtree.c b/lib/src/subtree.c index 48c8cff3..3e353f99 100644 --- a/lib/src/subtree.c +++ b/lib/src/subtree.c @@ -855,7 +855,7 @@ char *ts_subtree_string(Subtree self, const TSLanguage *language, bool include_a language, true, include_all, 0, false ) + 1; - char *result = ts_malloc(size * sizeof(char)); + char *result = malloc(size * sizeof(char)); ts_subtree__write_to_string(self, result, size, language, true, include_all, 0, false); return result; } From e305012b3107ba48fa9d75be4a8fa40c3adf5458 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Mon, 21 Jan 2019 15:33:43 -0800 Subject: [PATCH 182/208] Loosen keyword identification criteria slightly --- 
cli/src/generate/build_tables/mod.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cli/src/generate/build_tables/mod.rs b/cli/src/generate/build_tables/mod.rs index ca54f274..36f6770b 100644 --- a/cli/src/generate/build_tables/mod.rs +++ b/cli/src/generate/build_tables/mod.rs @@ -216,7 +216,7 @@ fn identify_keywords( // First find all of the candidate keyword tokens: tokens that start with // letters or underscore and can match the same string as a word token. - let keywords: TokenSet = lexical_grammar + let keyword_candidates: TokenSet = lexical_grammar .variables .iter() .enumerate() @@ -237,10 +237,10 @@ fn identify_keywords( .collect(); // Exclude keyword candidates that shadow another keyword candidate. - let keywords: TokenSet = keywords + let keywords: TokenSet = keyword_candidates .iter() .filter(|token| { - for other_token in keywords.iter() { + for other_token in keyword_candidates.iter() { if other_token != *token && token_conflict_map.does_match_same_string(other_token.index, token.index) { @@ -262,7 +262,7 @@ fn identify_keywords( .iter() .filter(|token| { for other_index in 0..lexical_grammar.variables.len() { - if keywords.contains(&Symbol::terminal(other_index)) { + if keyword_candidates.contains(&Symbol::terminal(other_index)) { continue; } From 233d616ebfaca1dd354accdb95df32444a4eeef0 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 25 Jan 2019 12:05:21 -0800 Subject: [PATCH 183/208] Add random mutation tests --- Cargo.lock | 1 + cli/Cargo.toml | 1 + cli/src/test.rs | 8 +- cli/src/tests/allocations.rs | 12 +- cli/src/tests/corpuses.rs | 386 +++++++++++++++++++++++++++-------- cli/src/tests/mod.rs | 1 + cli/src/tests/parser_api.rs | 8 +- cli/src/tests/random.rs | 41 ++++ lib/binding/lib.rs | 32 ++- lib/src/parser.c | 29 ++- lib/src/subtree.c | 8 +- script/test | 43 ++-- 12 files changed, 443 insertions(+), 127 deletions(-) create mode 100644 cli/src/tests/random.rs diff --git a/Cargo.lock b/Cargo.lock index 
936c60ee..5c2dcd62 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -583,6 +583,7 @@ dependencies = [ "lazy_static 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)", "libloading 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)", "log 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)", + "rand 0.6.4 (registry+https://github.com/rust-lang/crates.io-index)", "regex 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)", "regex-syntax 0.6.4 (registry+https://github.com/rust-lang/crates.io-index)", "rsass 0.9.6 (registry+https://github.com/rust-lang/crates.io-index)", diff --git a/cli/Cargo.toml b/cli/Cargo.toml index 5eb92079..75efdb18 100644 --- a/cli/Cargo.toml +++ b/cli/Cargo.toml @@ -36,4 +36,5 @@ version = "0.4.6" features = ["std"] [dev-dependencies] +rand = "0.6.4" spin = "0.5" diff --git a/cli/src/test.rs b/cli/src/test.rs index 3a40eb83..b8b78b8f 100644 --- a/cli/src/test.rs +++ b/cli/src/test.rs @@ -36,6 +36,12 @@ pub enum TestEntry { }, } +impl Default for TestEntry { + fn default() -> Self { + TestEntry::Group { name: String::new(), children: Vec::new() } + } +} + pub fn run_tests_at_path( language: Language, path: &Path, @@ -160,7 +166,7 @@ fn run_tests( pub fn parse_tests(path: &Path) -> io::Result { let name = path - .file_name() + .file_stem() .and_then(|s| s.to_str()) .unwrap_or("") .to_string(); diff --git a/cli/src/tests/allocations.rs b/cli/src/tests/allocations.rs index c4a3dbac..e3cdae27 100644 --- a/cli/src/tests/allocations.rs +++ b/cli/src/tests/allocations.rs @@ -40,13 +40,15 @@ pub fn stop_recording() { recorder.enabled = false; if !recorder.outstanding_allocations.is_empty() { + let mut allocation_indices = recorder + .outstanding_allocations + .iter() + .map(|e| e.1) + .collect::>(); + allocation_indices.sort_unstable(); panic!( "Leaked allocation indices: {:?}", - recorder - .outstanding_allocations - .iter() - .map(|e| e.1) - .collect::>() + allocation_indices ); } } diff --git 
a/cli/src/tests/corpuses.rs b/cli/src/tests/corpuses.rs index 2c205d40..76ed02d0 100644 --- a/cli/src/tests/corpuses.rs +++ b/cli/src/tests/corpuses.rs @@ -1,11 +1,14 @@ use super::allocations; use super::fixtures::{fixtures_dir, get_language, get_test_language}; +use super::random::Rand; use crate::generate; use crate::test::{parse_tests, print_diff, print_diff_key, TestEntry}; use crate::util; -use std::fs; -use tree_sitter::{Language, LogType, Parser}; +use std::{env, fs, time, usize}; +use tree_sitter::{InputEdit, LogType, Parser, Point, Tree}; +const EDIT_COUNT: usize = 3; +const TRIAL_COUNT: usize = 10; const LANGUAGES: &'static [&'static str] = &[ "bash", "c", @@ -18,19 +21,30 @@ const LANGUAGES: &'static [&'static str] = &[ ]; lazy_static! { - static ref LANGUAGE_FILTER: Option = - std::env::var("TREE_SITTER_TEST_LANGUAGE_FILTER").ok(); - static ref EXAMPLE_FILTER: Option = - std::env::var("TREE_SITTER_TEST_EXAMPLE_FILTER").ok(); - static ref LOG_ENABLED: bool = std::env::var("TREE_SITTER_ENABLE_LOG").is_ok(); - static ref LOG_GRAPH_ENABLED: bool = std::env::var("TREE_SITTER_ENABLE_LOG_GRAPHS").is_ok(); + static ref LOG_ENABLED: bool = env::var("TREE_SITTER_TEST_ENABLE_LOG").is_ok(); + static ref LOG_GRAPH_ENABLED: bool = env::var("TREE_SITTER_TEST_ENABLE_LOG_GRAPHS").is_ok(); + static ref LANGUAGE_FILTER: Option = env::var("TREE_SITTER_TEST_LANGUAGE_FILTER").ok(); + static ref EXAMPLE_FILTER: Option = env::var("TREE_SITTER_TEST_EXAMPLE_FILTER").ok(); + static ref TRIAL_FILTER: Option = env::var("TREE_SITTER_TEST_TRIAL_FILTER") + .map(|s| usize::from_str_radix(&s, 10).unwrap()) + .ok(); + pub static ref SEED: usize = env::var("TREE_SITTER_TEST_SEED") + .map(|s| usize::from_str_radix(&s, 10).unwrap()) + .unwrap_or( + time::SystemTime::now() + .duration_since(time::UNIX_EPOCH) + .unwrap() + .as_secs() as usize, + ); } #[test] fn test_real_language_corpus_files() { + eprintln!("\n\nRandom seed: {}\n", *SEED); let grammars_dir = fixtures_dir().join("grammars"); 
+ let error_corpus_dir = fixtures_dir().join("error_corpus"); - let mut did_fail = false; + let mut failure_count = 0; for language_name in LANGUAGES.iter().cloned() { if let Some(filter) = LANGUAGE_FILTER.as_ref() { if language_name != filter.as_str() { @@ -38,51 +52,133 @@ fn test_real_language_corpus_files() { } } - eprintln!("language: {:?}", language_name); - let language = get_language(language_name); let corpus_dir = grammars_dir.join(language_name).join("corpus"); - let test = parse_tests(&corpus_dir).unwrap(); - did_fail |= run_mutation_tests(language, test); - } + let error_corpus_file = error_corpus_dir.join(&format!("{}_errors.txt", language_name)); + let main_tests = parse_tests(&corpus_dir).unwrap(); + let error_tests = parse_tests(&error_corpus_file).unwrap_or(TestEntry::default()); + let mut tests = flatten_tests(main_tests); + tests.extend(flatten_tests(error_tests)); - if did_fail { - panic!("Corpus tests failed"); + if !tests.is_empty() { + eprintln!("language: {:?}", language_name); + } + + for (example_name, input, expected_output) in tests { + eprintln!(" example: {:?}", example_name); + + if TRIAL_FILTER.map_or(true, |t| t == 0) { + allocations::start_recording(); + let mut log_session = None; + let mut parser = get_parser(&mut log_session, "log.html"); + parser.set_language(language).unwrap(); + let tree = parser.parse_utf8(&mut |i, _| &input[i..], None).unwrap(); + let actual_output = tree.root_node().to_sexp(); + drop(tree); + drop(parser); + if actual_output != expected_output { + print_diff_key(); + print_diff(&actual_output, &expected_output); + println!(""); + failure_count += 1; + continue; + } + allocations::stop_recording(); + } + + let mut parser = Parser::new(); + parser.set_language(language).unwrap(); + let tree = parser + .parse_utf8(&mut |i, _| input.get(i..).unwrap_or(&[]), None) + .unwrap(); + drop(parser); + + for trial in 1..=TRIAL_COUNT { + if TRIAL_FILTER.map_or(true, |filter| filter == trial) { + let mut rand = 
Rand::new(*SEED + trial); + + allocations::start_recording(); + let mut log_session = None; + let mut parser = get_parser(&mut log_session, "log.html"); + parser.set_language(language).unwrap(); + let mut tree = tree.clone(); + let mut input = input.clone(); + + if *LOG_GRAPH_ENABLED { + eprintln!("{}\n", String::from_utf8_lossy(&input)); + } + + // Perform a random series of edits and reparse. + let mut undo_stack = Vec::new(); + for _ in 0..EDIT_COUNT { + let edit = get_random_edit(&mut rand, &input); + undo_stack.push(invert_edit(&input, &edit)); + perform_edit(&mut tree, &mut input, &edit); + } + if *LOG_GRAPH_ENABLED { + eprintln!("{}\n", String::from_utf8_lossy(&input)); + } + + let mut tree2 = parser + .parse_utf8(&mut |i, _| input.get(i..).unwrap_or(&[]), Some(&tree)) + .unwrap(); + + // Check that the new tree is consistent. + check_consistent_sizes(&tree2, &input); + check_changed_ranges(&tree, &tree2, &input); + + // Undo all of the edits and re-parse again. + while let Some(edit) = undo_stack.pop() { + perform_edit(&mut tree2, &mut input, &edit); + } + if *LOG_GRAPH_ENABLED { + eprintln!("{}\n", String::from_utf8_lossy(&input)); + } + + let tree3 = parser + .parse_utf8(&mut |i, _| input.get(i..).unwrap_or(&[]), Some(&tree2)) + .unwrap(); + + // Check that the edited tree is consistent. + check_consistent_sizes(&tree3, &input); + check_changed_ranges(&tree2, &tree3, &input); + + // Verify that the final tree matches the expectation from the corpus. 
+ let actual_output = tree3.root_node().to_sexp(); + if actual_output != expected_output { + println!("Incorrect parse for {} - {} - trial {}", language_name, example_name, trial); + print_diff_key(); + print_diff(&actual_output, &expected_output); + println!(""); + failure_count += 1; + // break; + } + + drop(tree); + drop(tree2); + drop(tree3); + drop(parser); + allocations::stop_recording(); + } + } + } + } + if failure_count > 0 { + panic!("{} corpus tests failed", failure_count); } } -#[test] -fn test_error_corpus_files() { - let corpus_dir = fixtures_dir().join("error_corpus"); - - let mut did_fail = false; - for entry in fs::read_dir(&corpus_dir).unwrap() { - let entry = entry.unwrap(); - let language_name = entry.file_name(); - let language_name = language_name.to_str().unwrap().replace("_errors.txt", ""); - if let Some(filter) = LANGUAGE_FILTER.as_ref() { - if language_name != filter.as_str() { - continue; - } - } - - eprintln!("language: {:?}", language_name); - - let test = parse_tests(&entry.path()).unwrap(); - let language = get_language(&language_name); - did_fail |= run_mutation_tests(language, test); - } - - if did_fail { - panic!("Corpus tests failed"); - } +struct Edit { + position: usize, + deleted_length: usize, + inserted_text: Vec, } #[test] fn test_feature_corpus_files() { let test_grammars_dir = fixtures_dir().join("test_grammars"); - let mut did_fail = false; + let mut failure_count = 0; for entry in fs::read_dir(&test_grammars_dir).unwrap() { let entry = entry.unwrap(); if !entry.metadata().unwrap().is_dir() { @@ -97,8 +193,6 @@ fn test_feature_corpus_files() { } } - eprintln!("test language: {:?}", language_name); - let test_path = entry.path(); let grammar_path = test_path.join("grammar.json"); let error_message_path = test_path.join("expected_error.txt"); @@ -106,79 +200,156 @@ fn test_feature_corpus_files() { let generate_result = generate::generate_parser_for_grammar(&grammar_json); if error_message_path.exists() { + if 
EXAMPLE_FILTER.is_some() { + continue; + } + + eprintln!("test language: {:?}", language_name); + let expected_message = fs::read_to_string(&error_message_path).unwrap(); if let Err(e) = generate_result { if e.0 != expected_message { - panic!( + eprintln!( "Unexpected error message.\n\nExpected:\n\n{}\nActual:\n\n{}\n", expected_message, e.0 ); + failure_count += 1; } } else { - panic!( + eprintln!( "Expected error message but got none for test grammar '{}'", language_name ); + failure_count += 1; } } else { let corpus_path = test_path.join("corpus.txt"); let c_code = generate_result.unwrap().1; let language = get_test_language(language_name, c_code, &test_path); let test = parse_tests(&corpus_path).unwrap(); - did_fail |= run_mutation_tests(language, test); - } - } + let tests = flatten_tests(test); - if did_fail { - panic!("Corpus tests failed"); - } -} + if !tests.is_empty() { + eprintln!("test language: {:?}", language_name); + } -fn run_mutation_tests(language: Language, test: TestEntry) -> bool { - match test { - TestEntry::Example { - name, - input, - output, - } => { - if let Some(filter) = EXAMPLE_FILTER.as_ref() { - if !name.contains(filter.as_str()) { - return false; + for (name, input, expected_output) in tests { + eprintln!(" example: {:?}", name); + + allocations::start_recording(); + let mut log_session = None; + let mut parser = get_parser(&mut log_session, "log.html"); + parser.set_language(language).unwrap(); + let tree = parser.parse_utf8(&mut |i, _| &input[i..], None).unwrap(); + let actual_output = tree.root_node().to_sexp(); + drop(tree); + drop(parser); + if actual_output != expected_output { + print_diff_key(); + print_diff(&actual_output, &expected_output); + println!(""); + failure_count += 1; + continue; } - } - - eprintln!(" example: {:?}", name); - - allocations::start_recording(); - let mut log_session = None; - let mut parser = get_parser(&mut log_session, "log.html"); - parser.set_language(language).unwrap(); - let tree = parser - 
.parse_utf8(&mut |byte_offset, _| &input[byte_offset..], None) - .unwrap(); - let actual = tree.root_node().to_sexp(); - drop(tree); - drop(parser); - if actual != output { - print_diff_key(); - print_diff(&actual, &output); - println!(""); - true - } else { allocations::stop_recording(); - false } } - TestEntry::Group { children, .. } => { - let mut result = false; - for child in children { - result |= run_mutation_tests(language, child); - } - result + } + if failure_count > 0 { + panic!("{} corpus tests failed", failure_count); + } +} + +fn get_random_edit(rand: &mut Rand, input: &Vec) -> Edit { + let choice = rand.unsigned(10); + if choice < 2 { + // Insert text at end + let inserted_text = rand.words(3); + Edit { + position: input.len(), + deleted_length: 0, + inserted_text, + } + } else if choice < 5 { + // Delete text from the end + let mut deleted_length = rand.unsigned(10); + if deleted_length > input.len() { + deleted_length = input.len(); + } + Edit { + position: input.len() - deleted_length, + deleted_length, + inserted_text: vec![], + } + } else if choice < 8 { + // Insert at a random position + let position = rand.unsigned(input.len()); + let word_count = 1 + rand.unsigned(3); + let inserted_text = rand.words(word_count); + Edit { + position, + deleted_length: 0, + inserted_text, + } + } else { + // Replace at random position + let position = rand.unsigned(input.len()); + let deleted_length = rand.unsigned(input.len() - position); + let word_count = 1 + rand.unsigned(3); + let inserted_text = rand.words(word_count); + Edit { + position, + deleted_length, + inserted_text, } } } +fn invert_edit(input: &Vec, edit: &Edit) -> Edit { + let position = edit.position; + let removed_content = &input[position..(position + edit.deleted_length)]; + Edit { + position, + deleted_length: edit.inserted_text.len(), + inserted_text: removed_content.to_vec(), + } +} + +fn perform_edit(tree: &mut Tree, input: &mut Vec, edit: &Edit) { + let start_byte = edit.position; + 
let old_end_byte = edit.position + edit.deleted_length; + let new_end_byte = edit.position + edit.inserted_text.len(); + let start_position = position_for_offset(input, start_byte); + let old_end_position = position_for_offset(input, old_end_byte); + input.splice(start_byte..old_end_byte, edit.inserted_text.iter().cloned()); + let new_end_position = position_for_offset(input, new_end_byte); + tree.edit(&InputEdit { + start_byte, + old_end_byte, + new_end_byte, + start_position, + old_end_position, + new_end_position, + }); +} + +fn position_for_offset(input: &Vec, offset: usize) -> Point { + let mut result = Point { row: 0, column: 0 }; + for c in &input[0..offset] { + if *c as char == '\n' { + result.row += 1; + result.column = 0; + } else { + result.column += 1; + } + } + result +} + +fn check_consistent_sizes(tree: &Tree, input: &Vec) {} + +fn check_changed_ranges(old_tree: &Tree, new_tree: &Tree, input: &Vec) {} + fn get_parser(session: &mut Option, log_filename: &str) -> Parser { let mut parser = Parser::new(); @@ -196,3 +367,38 @@ fn get_parser(session: &mut Option, log_filename: &str) -> Par parser } + +fn flatten_tests(test: TestEntry) -> Vec<(String, Vec, String)> { + fn helper(test: TestEntry, prefix: &str, result: &mut Vec<(String, Vec, String)>) { + match test { + TestEntry::Example { + mut name, + input, + output, + } => { + if !prefix.is_empty() { + name.insert_str(0, " - "); + name.insert_str(0, prefix); + } + if let Some(filter) = EXAMPLE_FILTER.as_ref() { + if !name.contains(filter.as_str()) { + return; + } + } + result.push((name, input, output)); + } + TestEntry::Group { mut name, children } => { + if !prefix.is_empty() { + name.insert_str(0, " - "); + name.insert_str(0, prefix); + } + for child in children { + helper(child, &name, result); + } + } + } + } + let mut result = Vec::new(); + helper(test, "", &mut result); + result +} diff --git a/cli/src/tests/mod.rs b/cli/src/tests/mod.rs index 174be67b..3a0c607a 100644 --- a/cli/src/tests/mod.rs 
+++ b/cli/src/tests/mod.rs @@ -1,4 +1,5 @@ mod allocations; mod corpuses; mod fixtures; +mod random; mod parser_api; diff --git a/cli/src/tests/parser_api.rs b/cli/src/tests/parser_api.rs index 9a4ce9f1..38bc0b69 100644 --- a/cli/src/tests/parser_api.rs +++ b/cli/src/tests/parser_api.rs @@ -2,10 +2,6 @@ use super::fixtures::get_language; use std::thread; use tree_sitter::{InputEdit, Language, LogType, Parser, Point, PropertySheet}; -fn rust() -> Language { - get_language("rust") -} - #[test] fn test_basic_parsing() { let mut parser = Parser::new(); @@ -505,3 +501,7 @@ fn test_parallel_parsing() { assert_eq!(child_count_differences, &[1, 2, 3, 4]); } + +fn rust() -> Language { + get_language("rust") +} diff --git a/cli/src/tests/random.rs b/cli/src/tests/random.rs new file mode 100644 index 00000000..3c8394e7 --- /dev/null +++ b/cli/src/tests/random.rs @@ -0,0 +1,41 @@ +use rand::distributions::Alphanumeric; +use rand::prelude::{Rng, SeedableRng, SmallRng}; + +const OPERATORS: &[char] = &[ + '+', '-', '<', '>', '(', ')', '*', '/', '&', '|', '!', ',', '.', +]; + +pub struct Rand(SmallRng); + +impl Rand { + pub fn new(seed: usize) -> Self { + Rand(SmallRng::seed_from_u64(seed as u64)) + } + + pub fn unsigned(&mut self, max: usize) -> usize { + self.0.gen_range(0, max + 1) + } + + pub fn words(&mut self, max_count: usize) -> Vec { + let mut result = Vec::new(); + let word_count = self.unsigned(max_count); + for i in 0..word_count { + if i > 0 { + if self.unsigned(5) == 0 { + result.push('\n' as u8); + } else { + result.push(' ' as u8); + } + } + if self.unsigned(3) == 0 { + let index = self.unsigned(OPERATORS.len() - 1); + result.push(OPERATORS[index] as u8); + } else { + for _ in 0..self.unsigned(8) { + result.push(self.0.sample(Alphanumeric) as u8); + } + } + } + result + } +} diff --git a/lib/binding/lib.rs b/lib/binding/lib.rs index 8143fd6b..1f29e28a 100644 --- a/lib/binding/lib.rs +++ b/lib/binding/lib.rs @@ -18,6 +18,7 @@ use std::io::{self, Read, Seek}; use 
std::marker::PhantomData; use std::os::raw::{c_char, c_void}; use std::ptr; +use std::slice; use std::str; use std::u16; @@ -427,6 +428,18 @@ impl Tree { ) -> TreePropertyCursor<'a, P> { TreePropertyCursor::new(self, property_sheet, source) } + + pub fn changed_ranges(&self, other: &Tree) -> Vec { + unsafe { + let mut count = 0; + let ptr = + ffi::ts_tree_get_changed_ranges(self.0, other.0, &mut count as *mut _ as *mut u32); + let ranges = slice::from_raw_parts(ptr, count); + let result = ranges.into_iter().map(|r| r.clone().into()).collect(); + free(ptr as *mut c_void); + result + } + } } unsafe impl Send for Tree {} @@ -558,10 +571,6 @@ impl<'tree> Node<'tree> { } pub fn to_sexp(&self) -> String { - extern "C" { - fn free(pointer: *mut c_void); - } - let c_string = unsafe { ffi::ts_node_string(self.0) }; let result = unsafe { CStr::from_ptr(c_string) } .to_str() @@ -788,6 +797,17 @@ impl Into for Range { } } +impl From for Range { + fn from(range: ffi::TSRange) -> Self { + Self { + start_byte: range.start_byte as usize, + end_byte: range.end_byte as usize, + start_point: range.start_point.into(), + end_point: range.end_point.into(), + } + } +} + impl

PropertySheet

{ pub fn new(language: Language, json: &str) -> Result where @@ -860,3 +880,7 @@ impl

PropertySheet

{ }) } } + +extern "C" { + fn free(pointer: *mut c_void); +} diff --git a/lib/src/parser.c b/lib/src/parser.c index c2ebfeeb..56326feb 100644 --- a/lib/src/parser.c +++ b/lib/src/parser.c @@ -28,10 +28,10 @@ fputs("\n\n", self->dot_graph_file); \ } -#define LOG_TREE() \ - if (self->dot_graph_file) { \ - ts_subtree_print_dot_graph(self->finished_tree, self->language, self->dot_graph_file); \ - fputs("\n", self->dot_graph_file); \ +#define LOG_TREE(tree) \ + if (self->dot_graph_file) { \ + ts_subtree_print_dot_graph(tree, self->language, self->dot_graph_file); \ + fputs("\n", self->dot_graph_file); \ } #define SYM_NAME(symbol) ts_language_symbol_name(self->language, symbol) @@ -417,6 +417,13 @@ static Subtree ts_parser__lex(TSParser *self, StackVersion version, TSStateId pa parse_state, self->language ); + + LOG( + "lexed_lookahead sym:%s, size:%u, character:'%c'", + SYM_NAME(ts_subtree_symbol(result)), + ts_subtree_total_size(result).bytes, + first_error_character + ); } else { if (self->lexer.token_end_position.bytes < self->lexer.token_start_position.bytes) { self->lexer.token_start_position = self->lexer.token_end_position; @@ -467,13 +474,14 @@ static Subtree ts_parser__lex(TSParser *self, StackVersion version, TSStateId pa length ); } + + LOG( + "lexed_lookahead sym:%s, size:%u", + SYM_NAME(ts_subtree_symbol(result)), + ts_subtree_total_size(result).bytes + ); } - LOG( - "lexed_lookahead sym:%s, size:%u", - SYM_NAME(ts_subtree_symbol(result)), - ts_subtree_total_size(result).bytes - ); return result; } @@ -1623,6 +1631,7 @@ TSTree *ts_parser_parse(TSParser *self, const TSTree *old_tree, TSInput input) { ); reusable_node_reset(&self->reusable_node, old_tree->root); LOG("parse_after_edit"); + LOG_TREE(self->old_tree); for (unsigned i = 0; i < self->included_range_differences.size; i++) { TSRange *range = &self->included_range_differences.contents[i]; LOG("different_included_range %u - %u", range->start_byte, range->end_byte); @@ -1681,7 +1690,7 @@ TSTree 
*ts_parser_parse(TSParser *self, const TSTree *old_tree, TSInput input) { ts_subtree_balance(self->finished_tree, &self->tree_pool, self->language); LOG("done"); - LOG_TREE(); + LOG_TREE(self->finished_tree); TSTree *result = ts_tree_new( self->finished_tree, diff --git a/lib/src/subtree.c b/lib/src/subtree.c index 3e353f99..00af7507 100644 --- a/lib/src/subtree.c +++ b/lib/src/subtree.c @@ -879,7 +879,7 @@ void ts_subtree__print_dot_graph(const Subtree *self, uint32_t start_offset, "error-cost: %u\n" "has-changes: %u\n" "repeat-depth: %u\n" - "lookahead-bytes: %u\"]\n", + "lookahead-bytes: %u", start_offset, end_offset, ts_subtree_parse_state(*self), ts_subtree_error_cost(*self), @@ -888,6 +888,12 @@ void ts_subtree__print_dot_graph(const Subtree *self, uint32_t start_offset, ts_subtree_lookahead_bytes(*self) ); + if (ts_subtree_is_error(*self) && ts_subtree_child_count(*self) == 0) { + fprintf(f, "\ncharacter: '%c'", self->ptr->lookahead_char); + } + + fprintf(f, "\"]\n"); + uint32_t child_start_offset = start_offset; uint32_t structural_child_index = 0; const TSSymbol *alias_sequence = ts_language_alias_sequence( diff --git a/script/test b/script/test index 43c274f7..ba95a754 100755 --- a/script/test +++ b/script/test @@ -10,19 +10,23 @@ USAGE OPTIONS - -h print this message + -h Print this message -a Compile C code with the Clang static analyzer - -l run only the corpus tests for the given language + -l Run only the corpus tests for the given language - -e run only the corpus tests whose name contain the given string + -e Run only the corpus tests whose name contain the given string - -s set the seed used to control random behavior + -t Run only the given trial number of randomized test - -d print parsing log to stderr + -s Set the seed used to control random behavior - -D pipe tests' stderr to \`dot(1)\` to render an SVG log + -d Print parsing log to stderr + + -D Generate an SVG graph of parsing logs + + -g Run the tests with a debugger EOF } @@ -31,7 +35,9 
@@ export TREE_SITTER_TEST=1 export RUST_TEST_THREADS=1 export RUST_BACKTRACE=full -while getopts "bdl:e:s:gGhpvD" option; do +mode=normal + +while getopts "dDghl:e:s:t:" option; do case ${option} in h) usage @@ -43,22 +49,35 @@ while getopts "bdl:e:s:gGhpvD" option; do e) export TREE_SITTER_TEST_EXAMPLE_FILTER=${OPTARG} ;; + t) + export TREE_SITTER_TEST_TRIAL_FILTER=${OPTARG} + ;; s) - export TREE_SITTER_SEED=${OPTARG} + export TREE_SITTER_TEST_SEED=${OPTARG} ;; d) - export TREE_SITTER_ENABLE_LOG=1 + export TREE_SITTER_TEST_ENABLE_LOG=1 ;; D) - export TREE_SITTER_ENABLE_LOG_GRAPHS=1 + export TREE_SITTER_TEST_ENABLE_LOG_GRAPHS=1 + ;; + g) + mode=debug ;; esac done -if [[ -n $TREE_SITTER_TEST_LANGUAGE_FILTER || -n $TREE_SITTER_TEST_EXAMPLE_FILTER ]]; then +shift $(expr $OPTIND - 1 ) + +if [[ -n $TREE_SITTER_TEST_LANGUAGE_FILTER || -n $TREE_SITTER_TEST_EXAMPLE_FILTER || -n $TREE_SITTER_TEST_TRIAL_FILTER ]]; then top_level_filter=corpus else top_level_filter=$1 fi -cargo test --jobs 1 $top_level_filter -- --nocapture +if [[ "${mode}" == "debug" ]]; then + test_binary=$(cargo test --no-run --message-format=json 2> /dev/null | jq -rs '.[-1].filenames[0]') + lldb "${test_binary}" -- $top_level_filter +else + cargo test --jobs 1 $top_level_filter -- --nocapture +fi From 5a12fbd927dfff6b44aace2e6ee4c7da1d018d71 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 25 Jan 2019 15:20:34 -0800 Subject: [PATCH 184/208] Verify changed ranges in randomized tests --- cli/src/tests/corpuses.rs | 33 ++++++++--- cli/src/tests/mod.rs | 1 + cli/src/tests/scope_sequence.rs | 97 +++++++++++++++++++++++++++++++++ lib/binding/helper.c | 17 ++++++ lib/binding/lib.rs | 7 ++- lib/build.rs | 1 + script/test.cmd | 2 +- 7 files changed, 146 insertions(+), 12 deletions(-) create mode 100644 cli/src/tests/scope_sequence.rs create mode 100644 lib/binding/helper.c diff --git a/cli/src/tests/corpuses.rs b/cli/src/tests/corpuses.rs index 76ed02d0..9b60d685 100644 --- a/cli/src/tests/corpuses.rs +++ 
b/cli/src/tests/corpuses.rs @@ -1,6 +1,7 @@ use super::allocations; use super::fixtures::{fixtures_dir, get_language, get_test_language}; use super::random::Rand; +use super::scope_sequence::ScopeSequence; use crate::generate; use crate::test::{parse_tests, print_diff, print_diff_key, TestEntry}; use crate::util; @@ -125,7 +126,11 @@ fn test_real_language_corpus_files() { // Check that the new tree is consistent. check_consistent_sizes(&tree2, &input); - check_changed_ranges(&tree, &tree2, &input); + if let Err(message) = check_changed_ranges(&tree, &tree2, &input) { + println!("\nUnexpected scope change in trial {}\n{}\n\n", trial, message); + failure_count += 1; + break; + } // Undo all of the edits and re-parse again. while let Some(edit) = undo_stack.pop() { @@ -139,19 +144,26 @@ fn test_real_language_corpus_files() { .parse_utf8(&mut |i, _| input.get(i..).unwrap_or(&[]), Some(&tree2)) .unwrap(); - // Check that the edited tree is consistent. - check_consistent_sizes(&tree3, &input); - check_changed_ranges(&tree2, &tree3, &input); - // Verify that the final tree matches the expectation from the corpus. let actual_output = tree3.root_node().to_sexp(); if actual_output != expected_output { - println!("Incorrect parse for {} - {} - trial {}", language_name, example_name, trial); + println!( + "Incorrect parse for {} - {} - trial {}", + language_name, example_name, trial + ); print_diff_key(); print_diff(&actual_output, &expected_output); println!(""); failure_count += 1; - // break; + break; + } + + // Check that the edited tree is consistent. 
+ check_consistent_sizes(&tree3, &input); + if let Err(message) = check_changed_ranges(&tree2, &tree3, &input) { + eprintln!("Unexpected scope change in trial {}\n{}\n\n", trial, message); + failure_count += 1; + break; } drop(tree); @@ -348,7 +360,12 @@ fn position_for_offset(input: &Vec, offset: usize) -> Point { fn check_consistent_sizes(tree: &Tree, input: &Vec) {} -fn check_changed_ranges(old_tree: &Tree, new_tree: &Tree, input: &Vec) {} +fn check_changed_ranges(old_tree: &Tree, new_tree: &Tree, input: &Vec) -> Result<(), String> { + let changed_ranges = old_tree.changed_ranges(new_tree); + let old_scope_sequence = ScopeSequence::new(old_tree); + let new_scope_sequence = ScopeSequence::new(new_tree); + old_scope_sequence.check_changes(&new_scope_sequence, &input, &changed_ranges) +} fn get_parser(session: &mut Option, log_filename: &str) -> Parser { let mut parser = Parser::new(); diff --git a/cli/src/tests/mod.rs b/cli/src/tests/mod.rs index 3a0c607a..fa841382 100644 --- a/cli/src/tests/mod.rs +++ b/cli/src/tests/mod.rs @@ -3,3 +3,4 @@ mod corpuses; mod fixtures; mod random; mod parser_api; +mod scope_sequence; diff --git a/cli/src/tests/scope_sequence.rs b/cli/src/tests/scope_sequence.rs new file mode 100644 index 00000000..685fe91f --- /dev/null +++ b/cli/src/tests/scope_sequence.rs @@ -0,0 +1,97 @@ +use tree_sitter::{Point, Range, Tree}; + +#[derive(Debug)] +pub struct ScopeSequence(Vec); + +type ScopeStack = Vec<&'static str>; + +impl ScopeSequence { + pub fn new(tree: &Tree) -> Self { + let mut result = ScopeSequence(Vec::new()); + let mut scope_stack = Vec::new(); + + let mut cursor = tree.walk(); + let mut visited_children = false; + loop { + let node = cursor.node(); + for _ in result.0.len()..node.start_byte() { + result.0.push(scope_stack.clone()); + } + if visited_children { + for _ in result.0.len()..node.end_byte() { + result.0.push(scope_stack.clone()); + } + scope_stack.pop(); + if cursor.goto_next_sibling() { + visited_children = false; + } 
else if !cursor.goto_parent() { + break; + } + } else { + scope_stack.push(cursor.node().kind()); + if !cursor.goto_first_child() { + visited_children = true; + } + } + } + + result + } + + pub fn check_changes( + &self, + other: &ScopeSequence, + text: &Vec, + known_changed_ranges: &Vec, + ) -> Result<(), String> { + if self.0.len() != text.len() { + panic!( + "Inconsistent scope sequence: {:?}", + self.0.iter().zip(text.iter().map(|c| *c as char)).collect::>() + ); + } + + assert_eq!(self.0.len(), other.0.len()); + let mut position = Point { row: 0, column: 0 }; + for (i, stack) in self.0.iter().enumerate() { + let other_stack = &other.0[i]; + if *stack != *other_stack { + let containing_range = known_changed_ranges + .iter() + .find(|range| range.start_point <= position && position < range.end_point); + if containing_range.is_none() { + let line = &text[(i - position.column)..] + .split(|c| *c == '\n' as u8) + .next() + .unwrap(); + return Err(format!( + concat!( + "Position: {}\n", + "Byte offset: {}\n", + "Line: {}\n", + "{}^\n", + "Old scopes: {:?}\n", + "New scopes: {:?}\n", + "Invalidated ranges: {:?}", + ), + position, + i, + String::from_utf8_lossy(line), + String::from(" ").repeat(position.column + "Line: ".len()), + stack, + other_stack, + known_changed_ranges, + )); + } + } + + if text[i] == '\n' as u8 { + position.row += 1; + position.column = 0; + } else { + position.column += 1; + } + } + Ok(()) + } +} diff --git a/lib/binding/helper.c b/lib/binding/helper.c new file mode 100644 index 00000000..4275e445 --- /dev/null +++ b/lib/binding/helper.c @@ -0,0 +1,17 @@ +#if defined(TREE_SITTER_TEST) + +void ts_record_free(void *); + +void rust_tree_sitter_free(void *p) { + ts_record_free(p); +} + +#else + +void free(void *); + +void rust_tree_sitter_free(void *p) { + free(p); +} + +#endif diff --git a/lib/binding/lib.rs b/lib/binding/lib.rs index 1f29e28a..150dfcf4 100644 --- a/lib/binding/lib.rs +++ b/lib/binding/lib.rs @@ -436,7 +436,7 @@ impl Tree { 
ffi::ts_tree_get_changed_ranges(self.0, other.0, &mut count as *mut _ as *mut u32); let ranges = slice::from_raw_parts(ptr, count); let result = ranges.into_iter().map(|r| r.clone().into()).collect(); - free(ptr as *mut c_void); + free_ptr(ptr as *mut c_void); result } } @@ -576,7 +576,7 @@ impl<'tree> Node<'tree> { .to_str() .unwrap() .to_string(); - unsafe { free(c_string as *mut c_void) }; + unsafe { free_ptr(c_string as *mut c_void) }; result } @@ -882,5 +882,6 @@ impl

PropertySheet

{ } extern "C" { - fn free(pointer: *mut c_void); + #[link_name = "rust_tree_sitter_free"] + fn free_ptr(ptr: *mut c_void); } diff --git a/lib/build.rs b/lib/build.rs index df66ee7c..7cca001c 100644 --- a/lib/build.rs +++ b/lib/build.rs @@ -41,6 +41,7 @@ fn main() { .include("include") .include("utf8proc") .file(src_path.join("lib.c")) + .file(Path::new("binding").join("helper.c")) .compile("tree-sitter"); } diff --git a/script/test.cmd b/script/test.cmd index e62eed0e..ef4ce02e 100644 --- a/script/test.cmd +++ b/script/test.cmd @@ -4,4 +4,4 @@ set TREE_SITTER_TEST=1 set RUST_TEST_THREADS=1 set RUST_BACKTRACE=full -cargo test "%~1" +cargo test "%~1" -- --nocapture From af83e8034e05b4aa00512128828bdd1b4046dffc Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 25 Jan 2019 16:40:26 -0800 Subject: [PATCH 185/208] Move test helpers into their own folder --- cli/src/tests/{corpuses.rs => corpus_test.rs} | 18 ++++++++++++------ cli/src/tests/{ => helpers}/allocations.rs | 0 cli/src/tests/{ => helpers}/fixtures.rs | 0 cli/src/tests/helpers/mod.rs | 4 ++++ cli/src/tests/{ => helpers}/random.rs | 0 cli/src/tests/{ => helpers}/scope_sequence.rs | 0 cli/src/tests/mod.rs | 9 +++------ .../{parser_api.rs => parser_api_test.rs} | 4 ++-- 8 files changed, 21 insertions(+), 14 deletions(-) rename cli/src/tests/{corpuses.rs => corpus_test.rs} (96%) rename cli/src/tests/{ => helpers}/allocations.rs (100%) rename cli/src/tests/{ => helpers}/fixtures.rs (100%) create mode 100644 cli/src/tests/helpers/mod.rs rename cli/src/tests/{ => helpers}/random.rs (100%) rename cli/src/tests/{ => helpers}/scope_sequence.rs (100%) rename cli/src/tests/{parser_api.rs => parser_api_test.rs} (99%) diff --git a/cli/src/tests/corpuses.rs b/cli/src/tests/corpus_test.rs similarity index 96% rename from cli/src/tests/corpuses.rs rename to cli/src/tests/corpus_test.rs index 9b60d685..587a3752 100644 --- a/cli/src/tests/corpuses.rs +++ b/cli/src/tests/corpus_test.rs @@ -1,7 +1,7 @@ -use 
super::allocations; -use super::fixtures::{fixtures_dir, get_language, get_test_language}; -use super::random::Rand; -use super::scope_sequence::ScopeSequence; +use super::helpers::allocations; +use super::helpers::fixtures::{fixtures_dir, get_language, get_test_language}; +use super::helpers::random::Rand; +use super::helpers::scope_sequence::ScopeSequence; use crate::generate; use crate::test::{parse_tests, print_diff, print_diff_key, TestEntry}; use crate::util; @@ -127,7 +127,10 @@ fn test_real_language_corpus_files() { // Check that the new tree is consistent. check_consistent_sizes(&tree2, &input); if let Err(message) = check_changed_ranges(&tree, &tree2, &input) { - println!("\nUnexpected scope change in trial {}\n{}\n\n", trial, message); + println!( + "\nUnexpected scope change in trial {}\n{}\n\n", + trial, message + ); failure_count += 1; break; } @@ -161,7 +164,10 @@ fn test_real_language_corpus_files() { // Check that the edited tree is consistent. check_consistent_sizes(&tree3, &input); if let Err(message) = check_changed_ranges(&tree2, &tree3, &input) { - eprintln!("Unexpected scope change in trial {}\n{}\n\n", trial, message); + eprintln!( + "Unexpected scope change in trial {}\n{}\n\n", + trial, message + ); failure_count += 1; break; } diff --git a/cli/src/tests/allocations.rs b/cli/src/tests/helpers/allocations.rs similarity index 100% rename from cli/src/tests/allocations.rs rename to cli/src/tests/helpers/allocations.rs diff --git a/cli/src/tests/fixtures.rs b/cli/src/tests/helpers/fixtures.rs similarity index 100% rename from cli/src/tests/fixtures.rs rename to cli/src/tests/helpers/fixtures.rs diff --git a/cli/src/tests/helpers/mod.rs b/cli/src/tests/helpers/mod.rs new file mode 100644 index 00000000..bd5c6517 --- /dev/null +++ b/cli/src/tests/helpers/mod.rs @@ -0,0 +1,4 @@ +pub(super) mod allocations; +pub(super) mod fixtures; +pub(super) mod random; +pub(super) mod scope_sequence; diff --git a/cli/src/tests/random.rs 
b/cli/src/tests/helpers/random.rs similarity index 100% rename from cli/src/tests/random.rs rename to cli/src/tests/helpers/random.rs diff --git a/cli/src/tests/scope_sequence.rs b/cli/src/tests/helpers/scope_sequence.rs similarity index 100% rename from cli/src/tests/scope_sequence.rs rename to cli/src/tests/helpers/scope_sequence.rs diff --git a/cli/src/tests/mod.rs b/cli/src/tests/mod.rs index fa841382..beafa172 100644 --- a/cli/src/tests/mod.rs +++ b/cli/src/tests/mod.rs @@ -1,6 +1,3 @@ -mod allocations; -mod corpuses; -mod fixtures; -mod random; -mod parser_api; -mod scope_sequence; +mod corpus_test; +mod helpers; +mod parser_api_test; diff --git a/cli/src/tests/parser_api.rs b/cli/src/tests/parser_api_test.rs similarity index 99% rename from cli/src/tests/parser_api.rs rename to cli/src/tests/parser_api_test.rs index 38bc0b69..9584ac4e 100644 --- a/cli/src/tests/parser_api.rs +++ b/cli/src/tests/parser_api_test.rs @@ -1,4 +1,4 @@ -use super::fixtures::get_language; +use super::helpers::fixtures::get_language; use std::thread; use tree_sitter::{InputEdit, Language, LogType, Parser, Point, PropertySheet}; @@ -453,7 +453,7 @@ fn test_editing() { fn test_parallel_parsing() { // Parse this source file so that each thread has a non-trivial amount of // work to do. 
- let this_file_source = include_str!("parser_api.rs"); + let this_file_source = include_str!("parser_api_test.rs"); let mut parser = Parser::new(); parser.set_language(rust()).unwrap(); From 5927e104c2d8a0dada185cda8f6c6625e0bcf6fa Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Sat, 26 Jan 2019 22:22:29 -0800 Subject: [PATCH 186/208] Check tree consistency in randomized tests --- cli/src/tests/corpus_test.rs | 54 ++++++++++++++++++++++++++++++++++-- script/test | 2 +- 2 files changed, 53 insertions(+), 3 deletions(-) diff --git a/cli/src/tests/corpus_test.rs b/cli/src/tests/corpus_test.rs index 587a3752..449669e3 100644 --- a/cli/src/tests/corpus_test.rs +++ b/cli/src/tests/corpus_test.rs @@ -6,7 +6,7 @@ use crate::generate; use crate::test::{parse_tests, print_diff, print_diff_key, TestEntry}; use crate::util; use std::{env, fs, time, usize}; -use tree_sitter::{InputEdit, LogType, Parser, Point, Tree}; +use tree_sitter::{InputEdit, LogType, Node, Parser, Point, Tree}; const EDIT_COUNT: usize = 3; const TRIAL_COUNT: usize = 10; @@ -364,7 +364,57 @@ fn position_for_offset(input: &Vec, offset: usize) -> Point { result } -fn check_consistent_sizes(tree: &Tree, input: &Vec) {} +fn check_consistent_sizes(tree: &Tree, input: &Vec) { + fn check(node: Node, line_offsets: &Vec) { + let start_byte = node.start_byte(); + let end_byte = node.end_byte(); + let start_point = node.start_position(); + let end_point = node.end_position(); + + assert!(start_byte <= end_byte); + assert!(start_point <= end_point); + assert_eq!(start_byte, line_offsets[start_point.row] + start_point.column); + assert_eq!(end_byte, line_offsets[end_point.row] + end_point.column); + + let mut last_child_end_byte = start_byte; + let mut last_child_end_point = start_point; + let mut some_child_has_changes = false; + let mut actual_named_child_count = 0; + for child in node.children() { + assert!(child.start_byte() >= last_child_end_byte); + assert!(child.start_position() >= last_child_end_point); + 
check(child, line_offsets); + if child.has_changes() { + some_child_has_changes = true; + } + if child.is_named() { + actual_named_child_count += 1; + } + last_child_end_byte = child.end_byte(); + last_child_end_point = child.end_position(); + } + + assert_eq!(actual_named_child_count, node.named_child_count()); + + if node.child_count() > 0 { + assert!(end_byte >= last_child_end_byte); + assert!(end_point >= last_child_end_point); + } + + if some_child_has_changes { + assert!(node.has_changes()); + } + } + + let mut line_offsets = vec![0]; + for (i, c) in input.iter().enumerate() { + if *c == '\n' as u8 { + line_offsets.push(i + 1); + } + } + + check(tree.root_node(), &line_offsets); +} fn check_changed_ranges(old_tree: &Tree, new_tree: &Tree, input: &Vec) -> Result<(), String> { let changed_ranges = old_tree.changed_ranges(new_tree); diff --git a/script/test b/script/test index ba95a754..09cf9f83 100755 --- a/script/test +++ b/script/test @@ -6,7 +6,7 @@ function usage { cat <<-EOF USAGE - $0 [-dgGhv] [-f focus-string] [-s seed] + $0 [-adDg] [-s SEED] [-l LANGUAGE] [-e EXAMPLE] [-t TRIAL] OPTIONS From 6d8ef48dad886aa38ada6b0273a901197b8c55c5 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Sun, 27 Jan 2019 09:53:49 -0800 Subject: [PATCH 187/208] Make test subcommand exit 1 if tests fail --- cli/src/main.rs | 4 +++- cli/src/test.rs | 7 ++++--- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/cli/src/main.rs b/cli/src/main.rs index aaf45cb1..0bf4f01a 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -121,7 +121,9 @@ fn run() -> error::Result<()> { let filter = matches.value_of("filter"); let corpus_path = current_dir.join("corpus"); if let Some(language) = loader.language_at_path(¤t_dir)? { - test::run_tests_at_path(language, &corpus_path, debug, debug_graph, filter)?; + if !test::run_tests_at_path(language, &corpus_path, debug, debug_graph, filter)? 
{ + exit(1); + } } else { eprintln!("No language found"); } diff --git a/cli/src/test.rs b/cli/src/test.rs index b8b78b8f..d6a2a7ce 100644 --- a/cli/src/test.rs +++ b/cli/src/test.rs @@ -48,7 +48,7 @@ pub fn run_tests_at_path( debug: bool, debug_graph: bool, filter: Option<&str>, -) -> Result<()> { +) -> Result { let test_entry = parse_tests(path)?; let mut _log_session = None; let mut parser = Parser::new(); @@ -86,9 +86,10 @@ pub fn run_tests_at_path( println!("\n {}. {}:", i + 1, name); print_diff(actual, expected); } + Ok(true) + } else { + Ok(false) } - - Ok(()) } pub fn print_diff_key() { From 8e198016d86e55730676a156155cd42025296540 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Sun, 27 Jan 2019 21:17:31 -0800 Subject: [PATCH 188/208] Try to make build-fuzzers script work w/ new structure & build process --- script/build-fuzzers | 11 ++++------- script/build-lib | 1 + 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/script/build-fuzzers b/script/build-fuzzers index 2c48f6c9..0a19bc4c 100755 --- a/script/build-fuzzers +++ b/script/build-fuzzers @@ -20,11 +20,8 @@ default_fuzz_flags="-fsanitize=address,undefined -fsanitize-coverage=trace-pc-gu CFLAGS=${CFLAGS:-"$default_fuzz_flags"} CXXFLAGS=${CXXFLAGS:-"$default_fuzz_flags"} -CC=$CC CXX=$CXX LINK=$LINK CFLAGS=$CFLAGS CXXFLAGS=$CXXFLAGS ./script/configure - -export BUILDTYPE=Fuzz -make runtime - +export CFLAGS +script/build-lib if [ -z "$@" ]; then languages=$(ls test/fixtures/grammars) @@ -56,9 +53,9 @@ for lang in ${languages[@]}; do modes=(true halt false recover) for i in 0 2; do - $CXX $CXXFLAGS -std=c++11 -Iinclude -D TS_HALT_ON_ERROR="${modes[i]}" -D TS_LANG="tree_sitter_$lang" \ + $CXX $CXXFLAGS -std=c++11 -I lib/include -D TS_HALT_ON_ERROR="${modes[i]}" -D TS_LANG="tree_sitter_$lang" \ "test/fuzz/fuzzer.cc" "${objects[@]}" \ - out/Fuzz/obj.target/libruntime.a "$LIB_FUZZER_PATH" \ + libtree-sitter.a "$LIB_FUZZER_PATH" \ -o "out/${lang}_fuzzer_${modes[i+1]}" done diff --git 
a/script/build-lib b/script/build-lib index b81a4b0a..be287db0 100755 --- a/script/build-lib +++ b/script/build-lib @@ -11,6 +11,7 @@ ${CC} \ -c \ -O3 \ -std=c99 \ + $CFLAGS \ -I lib/src \ -I lib/include \ -I lib/utf8proc \ From f6d014f3f4c302a16a14a6afc036d31c8cd6605b Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Mon, 28 Jan 2019 14:23:41 -0800 Subject: [PATCH 189/208] Write tree_sitter/parser.h file in generate command --- cli/src/generate/mod.rs | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/cli/src/generate/mod.rs b/cli/src/generate/mod.rs index 535f9d19..9e954298 100644 --- a/cli/src/generate/mod.rs +++ b/cli/src/generate/mod.rs @@ -40,7 +40,9 @@ pub fn generate_parser_in_directory( let (language_name, c_code) = generate_parser_for_grammar_with_opts(&grammar_json, minimize, state_ids_to_log)?; let repo_src_path = repo_path.join("src"); + let repo_header_path = repo_src_path.join("tree_sitter"); fs::create_dir_all(&repo_src_path)?; + fs::create_dir_all(&repo_header_path)?; fs::write(&repo_src_path.join("parser.c"), c_code) .map_err(|e| format!("Failed to write parser.c: {}", e))?; ensure_file(&repo_src_path.join("binding.cc"), || { @@ -52,6 +54,9 @@ pub fn generate_parser_in_directory( ensure_file(&repo_path.join("index.js"), || { npm_files::index_js(&language_name) })?; + ensure_file(&repo_header_path.join("parser.h"), || { + include_str!("../../../lib/include/tree_sitter/parser.h") + })?; } properties::generate_property_sheets(repo_path)?; Ok(()) @@ -96,7 +101,10 @@ fn load_grammar_file(grammar_path: &PathBuf) -> Result { match grammar_path.extension().and_then(|e| e.to_str()) { Some("js") => Ok(load_js_grammar_file(grammar_path)?), Some("json") => Ok(fs::read_to_string(grammar_path)?), - _ => Err(Error(format!("Unknown grammar file extension: {:?}", grammar_path))), + _ => Err(Error(format!( + "Unknown grammar file extension: {:?}", + grammar_path + ))), } } @@ -129,10 +137,10 @@ fn load_js_grammar_file(grammar_path: 
&PathBuf) -> Result { Ok(String::from_utf8(output.stdout).expect("Got invalid UTF8 from node")) } -fn ensure_file(path: &PathBuf, f: impl Fn() -> String) -> Result<()> { +fn ensure_file>(path: &PathBuf, f: impl Fn() -> T) -> Result<()> { if path.exists() { Ok(()) } else { - fs::write(path, f()).map_err(|e| Error(format!("Failed to write file {:?}: {}", path, e))) + fs::write(path, f().as_ref()).map_err(|e| Error(format!("Failed to write file {:?}: {}", path, e))) } } From 213ccfd3a47b4e0ce0f52d2db3b1875b4ef37998 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Tue, 29 Jan 2019 15:30:13 -0800 Subject: [PATCH 190/208] Update trees' metadata bits when setting symbol back to word token --- lib/src/parser.c | 2 +- lib/src/subtree.c | 18 ++++++++++++++++++ lib/src/subtree.h | 10 +--------- 3 files changed, 20 insertions(+), 10 deletions(-) diff --git a/lib/src/parser.c b/lib/src/parser.c index 56326feb..85452f8d 100644 --- a/lib/src/parser.c +++ b/lib/src/parser.c @@ -1363,7 +1363,7 @@ static void ts_parser__advance(TSParser *self, StackVersion version, bool allow_ ); MutableSubtree mutable_lookahead = ts_subtree_make_mut(&self->tree_pool, lookahead); - ts_subtree_set_symbol(&mutable_lookahead, self->language->keyword_capture_token); + ts_subtree_set_symbol(&mutable_lookahead, self->language->keyword_capture_token, self->language); lookahead = ts_subtree_from_mut(mutable_lookahead); continue; } diff --git a/lib/src/subtree.c b/lib/src/subtree.c index 00af7507..6ca00792 100644 --- a/lib/src/subtree.c +++ b/lib/src/subtree.c @@ -225,6 +225,24 @@ Subtree ts_subtree_new_leaf( } } +void ts_subtree_set_symbol( + MutableSubtree *self, + TSSymbol symbol, + const TSLanguage *language +) { + TSSymbolMetadata metadata = ts_language_symbol_metadata(language, symbol); + if (self->data.is_inline) { + assert(symbol < UINT8_MAX); + self->data.symbol = symbol; + self->data.named = metadata.named; + self->data.visible = metadata.visible; + } else { + self->ptr->symbol = symbol; + 
self->ptr->named = metadata.named; + self->ptr->visible = metadata.visible; + } +} + Subtree ts_subtree_new_error( SubtreePool *pool, int32_t lookahead_char, Length padding, Length size, uint32_t bytes_scanned, TSStateId parse_state, const TSLanguage *language diff --git a/lib/src/subtree.h b/lib/src/subtree.h index 039494b5..b0423afb 100644 --- a/lib/src/subtree.h +++ b/lib/src/subtree.h @@ -132,6 +132,7 @@ void ts_subtree_retain(Subtree); void ts_subtree_release(SubtreePool *, Subtree); bool ts_subtree_eq(Subtree, Subtree); int ts_subtree_compare(Subtree, Subtree); +void ts_subtree_set_symbol(MutableSubtree *, TSSymbol, const TSLanguage *); void ts_subtree_set_children(MutableSubtree, Subtree *, uint32_t, const TSLanguage *); void ts_subtree_balance(Subtree, SubtreePool *, const TSLanguage *); Subtree ts_subtree_edit(Subtree, const TSInputEdit *edit, SubtreePool *); @@ -154,15 +155,6 @@ static inline uint32_t ts_subtree_lookahead_bytes(Subtree self) { return SUBTREE #undef SUBTREE_GET -static inline void ts_subtree_set_symbol(MutableSubtree *self, TSSymbol symbol) { - if (self->data.is_inline) { - assert(symbol < UINT8_MAX); - self->data.symbol = symbol; - } else { - self->ptr->symbol = symbol; - } -} - static inline void ts_subtree_set_extra(MutableSubtree *self) { if (self->data.is_inline) { self->data.extra = true; From d192eda9cf103f88468acdf40dc77114d30bde67 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Wed, 30 Jan 2019 21:43:44 -0800 Subject: [PATCH 191/208] Remove stray word 'runtime' from comment --- lib/src/lib.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/lib/src/lib.c b/lib/src/lib.c index b29f5214..49ddf672 100644 --- a/lib/src/lib.c +++ b/lib/src/lib.c @@ -1,5 +1,4 @@ -// The Tree-sitter runtime library can be built by compiling this -// one source file. +// The Tree-sitter library can be built by compiling this one source file. 
// // The following directories must be added to the include path: // - src From e26cbb62a580dfd5a5be178b4b5e7ba4fb98dc5b Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 31 Jan 2019 08:15:30 -0800 Subject: [PATCH 192/208] Add Tree::edit unit tests --- cli/src/tests/mod.rs | 1 + cli/src/tests/tree_test.rs | 191 +++++++++++++++++++++++++++++++++++++ 2 files changed, 192 insertions(+) create mode 100644 cli/src/tests/tree_test.rs diff --git a/cli/src/tests/mod.rs b/cli/src/tests/mod.rs index beafa172..3641cc3e 100644 --- a/cli/src/tests/mod.rs +++ b/cli/src/tests/mod.rs @@ -1,3 +1,4 @@ mod corpus_test; mod helpers; mod parser_api_test; +mod tree_test; diff --git a/cli/src/tests/tree_test.rs b/cli/src/tests/tree_test.rs new file mode 100644 index 00000000..401ff03a --- /dev/null +++ b/cli/src/tests/tree_test.rs @@ -0,0 +1,191 @@ +use super::helpers::fixtures::get_language; +use tree_sitter::{InputEdit, Language, Parser, Point}; + +#[test] +fn test_edit() { + let mut parser = Parser::new(); + parser.set_language(javascript()).unwrap(); + let tree = parser.parse_str(" abc !== def", None).unwrap(); + + assert_eq!( + tree.root_node().to_sexp(), + "(program (expression_statement (binary_expression (identifier) (identifier))))" + ); + + // edit entirely within the tree's padding: + // resize the padding of the tree and its leftmost descendants. 
+ { + let mut tree = tree.clone(); + tree.edit(&InputEdit { + start_byte: 1, + old_end_byte: 1, + new_end_byte: 2, + start_position: Point::new(0, 1), + old_end_position: Point::new(0, 1), + new_end_position: Point::new(0, 2), + }); + + let expr = tree.root_node().child(0).unwrap().child(0).unwrap(); + let child1 = expr.child(0).unwrap(); + let child2 = expr.child(1).unwrap(); + + assert!(expr.has_changes()); + assert_eq!(expr.start_byte(), 3); + assert_eq!(expr.end_byte(), 16); + assert!(child1.has_changes()); + assert_eq!(child1.start_byte(), 3); + assert_eq!(child1.end_byte(), 6); + assert!(!child2.has_changes()); + assert_eq!(child2.start_byte(), 8); + assert_eq!(child2.end_byte(), 11); + } + + // edit starting in the tree's padding but extending into its content: + // shrink the content to compenstate for the expanded padding. + { + let mut tree = tree.clone(); + tree.edit(&InputEdit { + start_byte: 1, + old_end_byte: 4, + new_end_byte: 5, + start_position: Point::new(0, 1), + old_end_position: Point::new(0, 5), + new_end_position: Point::new(0, 5), + }); + + let expr = tree.root_node().child(0).unwrap().child(0).unwrap(); + let child1 = expr.child(0).unwrap(); + let child2 = expr.child(1).unwrap(); + + assert!(expr.has_changes()); + assert_eq!(expr.start_byte(), 5); + assert_eq!(expr.end_byte(), 16); + assert!(child1.has_changes()); + assert_eq!(child1.start_byte(), 5); + assert_eq!(child1.end_byte(), 6); + assert!(!child2.has_changes()); + assert_eq!(child2.start_byte(), 8); + assert_eq!(child2.end_byte(), 11); + } + + // insertion at the edge of a tree's padding: + // expand the tree's padding. 
+ { + let mut tree = tree.clone(); + tree.edit(&InputEdit { + start_byte: 2, + old_end_byte: 2, + new_end_byte: 4, + start_position: Point::new(0, 2), + old_end_position: Point::new(0, 2), + new_end_position: Point::new(0, 4), + }); + + let expr = tree.root_node().child(0).unwrap().child(0).unwrap(); + let child1 = expr.child(0).unwrap(); + let child2 = expr.child(1).unwrap(); + + assert!(expr.has_changes()); + assert_eq!(expr.start_byte(), 4); + assert_eq!(expr.end_byte(), 17); + assert!(child1.has_changes()); + assert_eq!(child1.start_byte(), 4); + assert_eq!(child1.end_byte(), 7); + assert!(!child2.has_changes()); + assert_eq!(child2.start_byte(), 9); + assert_eq!(child2.end_byte(), 12); + } + + // replacement starting at the edge of the tree's padding: + // resize the content and not the padding. + { + let mut tree = tree.clone(); + tree.edit(&InputEdit { + start_byte: 2, + old_end_byte: 2, + new_end_byte: 4, + start_position: Point::new(0, 2), + old_end_position: Point::new(0, 2), + new_end_position: Point::new(0, 4), + }); + + let expr = tree.root_node().child(0).unwrap().child(0).unwrap(); + let child1 = expr.child(0).unwrap(); + let child2 = expr.child(1).unwrap(); + + assert!(expr.has_changes()); + assert_eq!(expr.start_byte(), 4); + assert_eq!(expr.end_byte(), 17); + assert!(child1.has_changes()); + assert_eq!(child1.start_byte(), 4); + assert_eq!(child1.end_byte(), 7); + assert!(!child2.has_changes()); + assert_eq!(child2.start_byte(), 9); + assert_eq!(child2.end_byte(), 12); + } + + // deletion that spans more than one child node: + // shrink subsequent child nodes. 
+ { + let mut tree = tree.clone(); + tree.edit(&InputEdit { + start_byte: 1, + old_end_byte: 11, + new_end_byte: 4, + start_position: Point::new(0, 1), + old_end_position: Point::new(0, 11), + new_end_position: Point::new(0, 4), + }); + + let expr = tree.root_node().child(0).unwrap().child(0).unwrap(); + let child1 = expr.child(0).unwrap(); + let child2 = expr.child(1).unwrap(); + let child3 = expr.child(2).unwrap(); + + assert!(expr.has_changes()); + assert_eq!(expr.start_byte(), 4); + assert_eq!(expr.end_byte(), 8); + assert!(child1.has_changes()); + assert_eq!(child1.start_byte(), 4); + assert_eq!(child1.end_byte(), 4); + assert!(child2.has_changes()); + assert_eq!(child2.start_byte(), 4); + assert_eq!(child2.end_byte(), 4); + assert!(child3.has_changes()); + assert_eq!(child3.start_byte(), 5); + assert_eq!(child3.end_byte(), 8); + } + + // insertion at the end of the tree: + // extend the tree's content. + { + let mut tree = tree.clone(); + tree.edit(&InputEdit { + start_byte: 15, + old_end_byte: 15, + new_end_byte: 16, + start_position: Point::new(0, 15), + old_end_position: Point::new(0, 15), + new_end_position: Point::new(0, 16), + }); + + let expr = tree.root_node().child(0).unwrap().child(0).unwrap(); + let child1 = expr.child(0).unwrap(); + let child2 = expr.child(1).unwrap(); + let child3 = expr.child(2).unwrap(); + + assert!(expr.has_changes()); + assert_eq!(expr.start_byte(), 2); + assert_eq!(expr.end_byte(), 16); + assert!(!child1.has_changes()); + assert_eq!(child1.end_byte(), 5); + assert!(!child2.has_changes()); + assert_eq!(child2.end_byte(), 10); + assert!(child3.has_changes()); + assert_eq!(child3.end_byte(), 16); + } +} + +fn javascript() -> Language { + get_language("javascript") +} From 4cac85fec4ce5d01e468c0f0fa34bdad4187d882 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 1 Feb 2019 14:39:37 -0800 Subject: [PATCH 193/208] Add benchmark script * Structure `cli` crate as both a library and an executable, so that benchmarks can import 
code from the crate. * Import macros in the Rust 2018 style. --- .appveyor.yml | 2 +- .travis.yml | 1 + cli/Cargo.toml | 4 + cli/benches/benchmark.rs | 172 ++++++++++++++++++ .../generate/build_tables/build_lex_table.rs | 1 + cli/src/generate/build_tables/item.rs | 1 + .../build_tables/minimize_parse_table.rs | 1 + cli/src/generate/build_tables/mod.rs | 1 + cli/src/generate/mod.rs | 5 +- cli/src/generate/parse_grammar.rs | 1 + .../generate/prepare_grammar/expand_tokens.rs | 1 + cli/src/generate/properties.rs | 2 + cli/src/lib.rs | 10 + cli/src/loader.rs | 1 + cli/src/logger.rs | 2 +- cli/src/main.rs | 25 +-- cli/src/test.rs | 6 +- cli/src/tests/corpus_test.rs | 6 +- cli/src/tests/helpers/allocations.rs | 6 +- cli/src/tests/helpers/dirs.rs | 11 ++ cli/src/tests/helpers/fixtures.rs | 12 +- cli/src/tests/parser_api_test.rs | 1 + cli/src/util.rs | 8 +- script/benchmark | 53 +----- script/benchmark.cmd | 3 + 25 files changed, 244 insertions(+), 92 deletions(-) create mode 100644 cli/benches/benchmark.rs create mode 100644 cli/src/lib.rs create mode 100644 cli/src/tests/helpers/dirs.rs create mode 100644 script/benchmark.cmd diff --git a/.appveyor.yml b/.appveyor.yml index de82a7d5..610ac134 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -25,8 +25,8 @@ test_script: - script\regenerate-fixtures.cmd # Run tests - - set TREE_SITTER_TEST=1 - script\test.cmd + - script\benchmark.cmd before_deploy: - move target\release\tree-sitter.exe tree-sitter.exe diff --git a/.travis.yml b/.travis.yml index 722a4dc9..06c71b34 100644 --- a/.travis.yml +++ b/.travis.yml @@ -15,6 +15,7 @@ script: # Run tests - export TREE_SITTER_STATIC_ANALYSIS=1 - script/test + - script/benchmark branches: only: diff --git a/cli/Cargo.toml b/cli/Cargo.toml index 75efdb18..35b6c7a0 100644 --- a/cli/Cargo.toml +++ b/cli/Cargo.toml @@ -8,6 +8,10 @@ edition = "2018" name = "tree-sitter" path = "src/main.rs" +[[bench]] +name = "benchmark" +harness = false + [dependencies] cc = "1.0" ansi_term = "0.11" diff 
--git a/cli/benches/benchmark.rs b/cli/benches/benchmark.rs new file mode 100644 index 00000000..472ab886 --- /dev/null +++ b/cli/benches/benchmark.rs @@ -0,0 +1,172 @@ +use lazy_static::lazy_static; +use std::collections::BTreeMap; +use std::path::{Path, PathBuf}; +use std::time::Instant; +use std::{env, fs, usize}; +use tree_sitter::{Language, Parser}; +use tree_sitter_cli::loader::Loader; + +include!("../src/tests/helpers/dirs.rs"); + +lazy_static! { + static ref LANGUAGE_FILTER: Option = + env::var("TREE_SITTER_BENCHMARK_LANGUAGE_FILTER").ok(); + static ref EXAMPLE_FILTER: Option = + env::var("TREE_SITTER_BENCHMARK_EXAMPLE_FILTER").ok(); + static ref TEST_LOADER: Loader = Loader::new(SCRATCH_DIR.clone()); + static ref EXAMPLE_PATHS_BY_LANGUAGE_NAME: BTreeMap> = { + let mut result = BTreeMap::new(); + let grammar_dirs = fs::read_dir(&(*GRAMMARS_DIR)).unwrap(); + for grammar_dir in grammar_dirs { + let grammar_dir = grammar_dir.unwrap(); + if !grammar_dir.path().is_dir() { + continue; + } + + let language_name = grammar_dir.file_name(); + let language_name = language_name.to_str().unwrap(); + if let Ok(example_files) = fs::read_dir(&grammar_dir.path().join("examples")) { + result.insert( + language_name.to_string(), + example_files + .filter_map(|p| { + let p = p.unwrap().path(); + if p.is_file() { + Some(p) + } else { + None + } + }) + .collect(), + ); + } else { + result.insert(language_name.to_string(), Vec::new()); + } + } + + result + }; +} + +fn main() { + let mut parser = Parser::new(); + let max_path_length = EXAMPLE_PATHS_BY_LANGUAGE_NAME + .iter() + .flat_map(|(_, paths)| paths.iter()) + .map(|p| p.file_name().unwrap().to_str().unwrap().chars().count()) + .max() + .unwrap(); + + let mut all_normal_speeds = Vec::new(); + let mut all_error_speeds = Vec::new(); + + for (language_name, example_paths) in EXAMPLE_PATHS_BY_LANGUAGE_NAME.iter() { + // TODO - remove after fixing slow error parsing HTML. 
+ if language_name == "html" { + continue; + } + + if let Some(filter) = LANGUAGE_FILTER.as_ref() { + if language_name != filter.as_str() { + continue; + } + } + + eprintln!("\nLanguage: {}", language_name); + parser.set_language(get_language(language_name)).unwrap(); + + eprintln!(" Normal examples:"); + let mut normal_speeds = Vec::new(); + for example_path in example_paths { + if let Some(filter) = EXAMPLE_FILTER.as_ref() { + if !example_path.to_str().unwrap().contains(filter.as_str()) { + continue; + } + } + + normal_speeds.push(parse(&mut parser, example_path, max_path_length)); + } + + eprintln!(" Error examples (mismatched languages):"); + let mut error_speeds = Vec::new(); + for (other_language_name, example_paths) in EXAMPLE_PATHS_BY_LANGUAGE_NAME.iter() { + if other_language_name != language_name { + for example_path in example_paths { + if let Some(filter) = EXAMPLE_FILTER.as_ref() { + if !example_path.to_str().unwrap().contains(filter.as_str()) { + continue; + } + } + + error_speeds.push(parse(&mut parser, example_path, max_path_length)); + } + } + } + + if let Some((average_normal, worst_normal)) = aggregate(&normal_speeds) { + eprintln!(" Average Speed (normal): {} bytes/ms", average_normal); + eprintln!(" Worst Speed (normal): {} bytes/ms", worst_normal); + } + + if let Some((average_error, worst_error)) = aggregate(&error_speeds) { + eprintln!(" Average Speed (errors): {} bytes/ms", average_error); + eprintln!(" Worst Speed (errors): {} bytes/ms", worst_error); + } + + all_normal_speeds.extend(normal_speeds); + all_error_speeds.extend(error_speeds); + } + + eprintln!("\nOverall"); + if let Some((average_normal, worst_normal)) = aggregate(&all_normal_speeds) { + eprintln!(" Average Speed (normal): {} bytes/ms", average_normal); + eprintln!(" Worst Speed (normal): {} bytes/ms", worst_normal); + } + + if let Some((average_error, worst_error)) = aggregate(&all_error_speeds) { + eprintln!(" Average Speed (errors): {} bytes/ms", average_error); + 
eprintln!(" Worst Speed (errors): {} bytes/ms", worst_error); + } + eprintln!(""); +} + +fn aggregate(speeds: &Vec<(usize)>) -> Option<(usize, usize)> { + if speeds.is_empty() { + return None; + } + let mut total = 0; + let mut max = usize::MAX; + for speed in speeds.iter().cloned() { + total += speed; + if speed < max { + max = speed; + } + } + Some((total / speeds.len(), max)) +} + +fn parse(parser: &mut Parser, example_path: &Path, max_path_length: usize) -> usize { + eprint!( + " {:width$}\t", + example_path.file_name().unwrap().to_str().unwrap(), + width = max_path_length + ); + + let source_code = fs::read(example_path).unwrap(); + let time = Instant::now(); + let _tree = parser + .parse_utf8(&mut |byte, _| &source_code[byte..], None) + .expect("Incompatible language version"); + let duration = time.elapsed(); + let duration_ms = + duration.as_secs() as f64 * 1000.0 + duration.subsec_nanos() as f64 / 1000000.0; + let speed = (source_code.len() as f64 / duration_ms) as usize; + eprintln!("time {} ms\tspeed {} bytes/ms", duration_ms as usize, speed); + speed +} + +fn get_language(name: &str) -> Language { + TEST_LOADER + .load_language_at_path(name, &GRAMMARS_DIR.join(name).join("src"), &HEADER_DIR) + .unwrap() +} diff --git a/cli/src/generate/build_tables/build_lex_table.rs b/cli/src/generate/build_tables/build_lex_table.rs index 38f56cc3..ef4b3e5e 100644 --- a/cli/src/generate/build_tables/build_lex_table.rs +++ b/cli/src/generate/build_tables/build_lex_table.rs @@ -5,6 +5,7 @@ use crate::generate::grammars::{LexicalGrammar, SyntaxGrammar}; use crate::generate::nfa::{CharacterSet, NfaCursor}; use crate::generate::rules::Symbol; use crate::generate::tables::{AdvanceAction, LexState, LexTable, ParseStateId, ParseTable}; +use log::info; use std::collections::hash_map::Entry; use std::collections::{BTreeMap, HashMap, VecDeque}; diff --git a/cli/src/generate/build_tables/item.rs b/cli/src/generate/build_tables/item.rs index 9f3307dd..b450bb75 100644 --- 
a/cli/src/generate/build_tables/item.rs +++ b/cli/src/generate/build_tables/item.rs @@ -1,6 +1,7 @@ use crate::generate::grammars::{LexicalGrammar, Production, ProductionStep, SyntaxGrammar}; use crate::generate::rules::Associativity; use crate::generate::rules::{Symbol, SymbolType}; +use lazy_static::lazy_static; use smallbitvec::SmallBitVec; use std::cmp::Ordering; use std::fmt; diff --git a/cli/src/generate/build_tables/minimize_parse_table.rs b/cli/src/generate/build_tables/minimize_parse_table.rs index bb9b26eb..9b012afe 100644 --- a/cli/src/generate/build_tables/minimize_parse_table.rs +++ b/cli/src/generate/build_tables/minimize_parse_table.rs @@ -4,6 +4,7 @@ use crate::generate::grammars::{LexicalGrammar, SyntaxGrammar, VariableType}; use crate::generate::rules::{AliasMap, Symbol}; use crate::generate::tables::{ParseAction, ParseState, ParseTable, ParseTableEntry}; use hashbrown::{HashMap, HashSet}; +use log::info; pub(crate) fn minimize_parse_table( parse_table: &mut ParseTable, diff --git a/cli/src/generate/build_tables/mod.rs b/cli/src/generate/build_tables/mod.rs index 36f6770b..df19f9e0 100644 --- a/cli/src/generate/build_tables/mod.rs +++ b/cli/src/generate/build_tables/mod.rs @@ -17,6 +17,7 @@ use crate::generate::grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGram use crate::generate::nfa::{CharacterSet, NfaCursor}; use crate::generate::rules::{AliasMap, Symbol, SymbolType}; use crate::generate::tables::{LexTable, ParseAction, ParseTable, ParseTableEntry}; +use log::info; pub(crate) fn build_tables( syntax_grammar: &SyntaxGrammar, diff --git a/cli/src/generate/mod.rs b/cli/src/generate/mod.rs index 9e954298..397fd677 100644 --- a/cli/src/generate/mod.rs +++ b/cli/src/generate/mod.rs @@ -3,6 +3,7 @@ use self::parse_grammar::parse_grammar; use self::prepare_grammar::prepare_grammar; use self::render::render_c_code; use crate::error::{Error, Result}; +use lazy_static::lazy_static; use regex::{Regex, RegexBuilder}; use std::fs; use 
std::io::Write; @@ -62,7 +63,6 @@ pub fn generate_parser_in_directory( Ok(()) } -#[cfg(test)] pub fn generate_parser_for_grammar(grammar_json: &String) -> Result<(String, String)> { let grammar_json = JSON_COMMENT_REGEX.replace_all(grammar_json, "\n"); generate_parser_for_grammar_with_opts(&grammar_json, true, Vec::new()) @@ -141,6 +141,7 @@ fn ensure_file>(path: &PathBuf, f: impl Fn() -> T) -> Result<()> if path.exists() { Ok(()) } else { - fs::write(path, f().as_ref()).map_err(|e| Error(format!("Failed to write file {:?}: {}", path, e))) + fs::write(path, f().as_ref()) + .map_err(|e| Error(format!("Failed to write file {:?}: {}", path, e))) } } diff --git a/cli/src/generate/parse_grammar.rs b/cli/src/generate/parse_grammar.rs index e77dce9b..cf2005ad 100644 --- a/cli/src/generate/parse_grammar.rs +++ b/cli/src/generate/parse_grammar.rs @@ -1,6 +1,7 @@ use super::grammars::{InputGrammar, Variable, VariableType}; use super::rules::Rule; use crate::error::Result; +use serde_derive::Deserialize; use serde_json::{Map, Value}; #[derive(Deserialize)] diff --git a/cli/src/generate/prepare_grammar/expand_tokens.rs b/cli/src/generate/prepare_grammar/expand_tokens.rs index 8e0f12fe..9e2cf9fe 100644 --- a/cli/src/generate/prepare_grammar/expand_tokens.rs +++ b/cli/src/generate/prepare_grammar/expand_tokens.rs @@ -3,6 +3,7 @@ use crate::error::{Error, Result}; use crate::generate::grammars::{LexicalGrammar, LexicalVariable}; use crate::generate::nfa::{CharacterSet, Nfa, NfaState}; use crate::generate::rules::Rule; +use lazy_static::lazy_static; use regex::Regex; use regex_syntax::ast::{ parse, Ast, Class, ClassPerlKind, ClassSet, ClassSetItem, RepetitionKind, RepetitionRange, diff --git a/cli/src/generate/properties.rs b/cli/src/generate/properties.rs index 4df4d67d..f5861159 100644 --- a/cli/src/generate/properties.rs +++ b/cli/src/generate/properties.rs @@ -1,6 +1,8 @@ use crate::error::{Error, Result}; +use log::info; use rsass; use rsass::sass::Value; +use 
serde_derive::Serialize; use std::cmp::Ordering; use std::collections::hash_map::Entry; use std::collections::{BTreeMap, HashMap, HashSet, VecDeque}; diff --git a/cli/src/lib.rs b/cli/src/lib.rs new file mode 100644 index 00000000..9038b5b8 --- /dev/null +++ b/cli/src/lib.rs @@ -0,0 +1,10 @@ +pub mod error; +pub mod generate; +pub mod loader; +pub mod logger; +pub mod parse; +pub mod test; +pub mod util; + +#[cfg(test)] +mod tests; diff --git a/cli/src/loader.rs b/cli/src/loader.rs index 6dd4e4db..6c6d2c5c 100644 --- a/cli/src/loader.rs +++ b/cli/src/loader.rs @@ -1,5 +1,6 @@ use libloading::{Library, Symbol}; use regex::{Regex, RegexBuilder}; +use serde_derive::Deserialize; use std::collections::HashMap; use std::fs; use std::io; diff --git a/cli/src/logger.rs b/cli/src/logger.rs index 18df763d..6abe6470 100644 --- a/cli/src/logger.rs +++ b/cli/src/logger.rs @@ -23,7 +23,7 @@ impl Log for Logger { fn flush(&self) {} } -pub(crate) fn init() { +pub fn init() { log::set_boxed_logger(Box::new(Logger { filter: None })).unwrap(); log::set_max_level(LevelFilter::Info); } diff --git a/cli/src/main.rs b/cli/src/main.rs index 0bf4f01a..3c0b057e 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -1,31 +1,10 @@ -#[macro_use] -extern crate lazy_static; -#[macro_use] -extern crate log; -#[macro_use] -extern crate serde_derive; -extern crate hashbrown; -extern crate regex; -extern crate rsass; -extern crate serde_json; - -mod error; -mod generate; -mod loader; -mod logger; -mod parse; -mod test; -mod util; - -#[cfg(test)] -mod tests; - -use self::loader::Loader; use clap::{App, AppSettings, Arg, SubCommand}; use std::env; use std::fs; use std::path::Path; use std::process::exit; +use tree_sitter_cli::loader::Loader; +use tree_sitter_cli::{error, generate, logger, parse, test}; use std::usize; fn main() { diff --git a/cli/src/test.rs b/cli/src/test.rs index d6a2a7ce..c8330af9 100644 --- a/cli/src/test.rs +++ b/cli/src/test.rs @@ -2,6 +2,7 @@ use super::error::Result; use 
super::util; use ansi_term::Colour; use difference::{Changeset, Difference}; +use lazy_static::lazy_static; use regex::bytes::{Regex as ByteRegex, RegexBuilder as ByteRegexBuilder}; use regex::Regex; use std::char; @@ -38,7 +39,10 @@ pub enum TestEntry { impl Default for TestEntry { fn default() -> Self { - TestEntry::Group { name: String::new(), children: Vec::new() } + TestEntry::Group { + name: String::new(), + children: Vec::new(), + } } } diff --git a/cli/src/tests/corpus_test.rs b/cli/src/tests/corpus_test.rs index 449669e3..1ee3ddc1 100644 --- a/cli/src/tests/corpus_test.rs +++ b/cli/src/tests/corpus_test.rs @@ -5,6 +5,7 @@ use super::helpers::scope_sequence::ScopeSequence; use crate::generate; use crate::test::{parse_tests, print_diff, print_diff_key, TestEntry}; use crate::util; +use lazy_static::lazy_static; use std::{env, fs, time, usize}; use tree_sitter::{InputEdit, LogType, Node, Parser, Point, Tree}; @@ -373,7 +374,10 @@ fn check_consistent_sizes(tree: &Tree, input: &Vec) { assert!(start_byte <= end_byte); assert!(start_point <= end_point); - assert_eq!(start_byte, line_offsets[start_point.row] + start_point.column); + assert_eq!( + start_byte, + line_offsets[start_point.row] + start_point.column + ); assert_eq!(end_byte, line_offsets[end_point.row] + end_point.column); let mut last_child_end_byte = start_byte; diff --git a/cli/src/tests/helpers/allocations.rs b/cli/src/tests/helpers/allocations.rs index e3cdae27..ae246c40 100644 --- a/cli/src/tests/helpers/allocations.rs +++ b/cli/src/tests/helpers/allocations.rs @@ -1,6 +1,7 @@ #![cfg(test)] #![allow(dead_code)] +use lazy_static::lazy_static; use spin::Mutex; use std::collections::HashMap; use std::os::raw::{c_ulong, c_void}; @@ -46,10 +47,7 @@ pub fn stop_recording() { .map(|e| e.1) .collect::>(); allocation_indices.sort_unstable(); - panic!( - "Leaked allocation indices: {:?}", - allocation_indices - ); + panic!("Leaked allocation indices: {:?}", allocation_indices); } } diff --git 
a/cli/src/tests/helpers/dirs.rs b/cli/src/tests/helpers/dirs.rs new file mode 100644 index 00000000..4bf345d8 --- /dev/null +++ b/cli/src/tests/helpers/dirs.rs @@ -0,0 +1,11 @@ +lazy_static! { + static ref ROOT_DIR: PathBuf = PathBuf::from(env!("CARGO_MANIFEST_DIR")).parent().unwrap().to_owned(); + static ref FIXTURES_DIR: PathBuf = ROOT_DIR.join("test").join("fixtures"); + static ref HEADER_DIR: PathBuf = ROOT_DIR.join("lib").join("include"); + static ref GRAMMARS_DIR: PathBuf = ROOT_DIR.join("test").join("fixtures").join("grammars"); + static ref SCRATCH_DIR: PathBuf = { + let result = ROOT_DIR.join("target").join("scratch"); + fs::create_dir_all(&result).unwrap(); + result + }; +} diff --git a/cli/src/tests/helpers/fixtures.rs b/cli/src/tests/helpers/fixtures.rs index 639b1004..981f0ab6 100644 --- a/cli/src/tests/helpers/fixtures.rs +++ b/cli/src/tests/helpers/fixtures.rs @@ -1,18 +1,12 @@ use crate::loader::Loader; +use lazy_static::lazy_static; use std::fs; use std::path::{Path, PathBuf}; use tree_sitter::Language; +include!("./dirs.rs"); + lazy_static! 
{ - static ref ROOT_DIR: PathBuf = [env!("CARGO_MANIFEST_DIR"), ".."].iter().collect(); - static ref FIXTURES_DIR: PathBuf = ROOT_DIR.join("test").join("fixtures"); - static ref HEADER_DIR: PathBuf = ROOT_DIR.join("lib").join("include"); - static ref GRAMMARS_DIR: PathBuf = ROOT_DIR.join("test").join("fixtures").join("grammars"); - static ref SCRATCH_DIR: PathBuf = { - let result = ROOT_DIR.join("target").join("scratch"); - fs::create_dir_all(&result).unwrap(); - result - }; static ref TEST_LOADER: Loader = Loader::new(SCRATCH_DIR.clone()); } diff --git a/cli/src/tests/parser_api_test.rs b/cli/src/tests/parser_api_test.rs index 9584ac4e..e46d9b55 100644 --- a/cli/src/tests/parser_api_test.rs +++ b/cli/src/tests/parser_api_test.rs @@ -1,4 +1,5 @@ use super::helpers::fixtures::get_language; +use serde_derive::Deserialize; use std::thread; use tree_sitter::{InputEdit, Language, LogType, Parser, Point, PropertySheet}; diff --git a/cli/src/util.rs b/cli/src/util.rs index 004d3b06..e880bea1 100644 --- a/cli/src/util.rs +++ b/cli/src/util.rs @@ -8,18 +8,18 @@ use tree_sitter::Parser; const HTML_HEADER: &[u8] = b"\n\n\n"; #[cfg(windows)] -pub(crate) struct LogSession(); +pub struct LogSession(); #[cfg(unix)] -pub(crate) struct LogSession(PathBuf, Option, Option); +pub struct LogSession(PathBuf, Option, Option); #[cfg(windows)] -pub(crate) fn log_graphs(_parser: &mut Parser, _path: &str) -> std::io::Result { +pub fn log_graphs(_parser: &mut Parser, _path: &str) -> std::io::Result { Ok(LogSession()) } #[cfg(unix)] -pub(crate) fn log_graphs(parser: &mut Parser, path: &str) -> std::io::Result { +pub fn log_graphs(parser: &mut Parser, path: &str) -> std::io::Result { use std::io::Write; let mut dot_file = std::fs::File::create(path)?; diff --git a/script/benchmark b/script/benchmark index e24c6b58..9b4ec3f0 100755 --- a/script/benchmark +++ b/script/benchmark @@ -6,7 +6,7 @@ function usage { cat <<-EOF USAGE - $0 [-Ld] [-l language-name] [-f example-file-name] + $0 [-h] [-l 
language-name] [-e example-file-name] OPTIONS @@ -14,63 +14,24 @@ OPTIONS -l run only the benchmarks for the given language - -f run only the benchmarks that parse the file with the given name - - -d run tests in a debugger (either lldb or gdb) - - -L run benchmarks with parse logging turned on - - -b run make under the scan-build static analyzer + -e run only the benchmarks that parse the example file with the given name EOF } -if [ "$(uname -s)" == "Darwin" ]; then - export LINK="clang++ -fsanitize=address" -fi - -mode=normal -export BUILDTYPE=Release -cmd=out/$BUILDTYPE/benchmarks -run_scan_build= - -while getopts "bdhf:l:SL" option; do +while getopts "hl:e:" option; do case ${option} in h) usage exit ;; - d) - mode=debug - ;; - f) - export TREE_SITTER_BENCHMARK_FILE_NAME=${OPTARG} + e) + export TREE_SITTER_BENCHMARK_EXAMPLE_FILTER=${OPTARG} ;; l) - export TREE_SITTER_BENCHMARK_LANGUAGE=${OPTARG} - ;; - L) - export TREE_SITTER_BENCHMARK_LOG=1 - ;; - b) - run_scan_build=true + export TREE_SITTER_BENCHMARK_LANGUAGE_FILTER=${OPTARG} ;; esac done -if [[ -n "$run_scan_build" ]]; then - . 
script/util/scan-build.sh - scan_build make -j2 benchmarks -else - make -j2 benchmarks -fi - -case $mode in - debug) - lldb $cmd - ;; - - normal) - exec $cmd - ;; -esac +cargo bench diff --git a/script/benchmark.cmd b/script/benchmark.cmd new file mode 100644 index 00000000..f5608d9d --- /dev/null +++ b/script/benchmark.cmd @@ -0,0 +1,3 @@ +@echo off + +cargo bench From 91da7206b7d10f423042846e7bbabf439191eba9 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 1 Feb 2019 15:54:34 -0800 Subject: [PATCH 194/208] Clean up environment variables after windows batch scripts --- script/fetch-fixtures.cmd | 14 +++++++------- script/regenerate-fixtures.cmd | 10 +++++----- script/test.cmd | 3 ++- 3 files changed, 14 insertions(+), 13 deletions(-) diff --git a/script/fetch-fixtures.cmd b/script/fetch-fixtures.cmd index 98d5d578..011d73ff 100644 --- a/script/fetch-fixtures.cmd +++ b/script/fetch-fixtures.cmd @@ -12,18 +12,18 @@ call:fetch_grammar python master call:fetch_grammar ruby master call:fetch_grammar rust master call:fetch_grammar typescript master -EXIT /B 0 +exit /B 0 :fetch_grammar -SETLOCAL -SET grammar_dir=test\fixtures\grammars\%~1 -SET grammar_url=https://github.com/tree-sitter/tree-sitter-%~1 -SET grammar_branch=%~2 -@IF NOT EXIST %grammar_dir% ( +setlocal +set grammar_dir=test\fixtures\grammars\%~1 +set grammar_url=https://github.com/tree-sitter/tree-sitter-%~1 +set grammar_branch=%~2 +@if not exist %grammar_dir% ( git clone %grammar_url% %grammar_dir% --depth=1 ) pushd %grammar_dir% git fetch origin %2 --depth=1 git reset --hard FETCH_HEAD popd -EXIT /B 0 +exit /B 0 diff --git a/script/regenerate-fixtures.cmd b/script/regenerate-fixtures.cmd index 739bdba1..b307409e 100644 --- a/script/regenerate-fixtures.cmd +++ b/script/regenerate-fixtures.cmd @@ -10,13 +10,13 @@ call:regenerate javascript call:regenerate json call:regenerate python call:regenerate rust -EXIT /B 0 +exit /B 0 :regenerate -SETLOCAL -SET tree_sitter=%cd%\target\release\tree-sitter -SET 
grammar_dir=test\fixtures\grammars\%~1 +setlocal +set tree_sitter=%cd%\target\release\tree-sitter +set grammar_dir=test\fixtures\grammars\%~1 pushd %grammar_dir% %tree_sitter% generate src\grammar.json popd -EXIT /B 0 +exit /B 0 diff --git a/script/test.cmd b/script/test.cmd index ef4ce02e..d1b462e8 100644 --- a/script/test.cmd +++ b/script/test.cmd @@ -1,7 +1,8 @@ @echo off +setlocal set TREE_SITTER_TEST=1 set RUST_TEST_THREADS=1 set RUST_BACKTRACE=full - cargo test "%~1" -- --nocapture +endlocal From e143710f4aad9ec1b5b493876b5456db942b0b88 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 1 Feb 2019 19:57:00 -0800 Subject: [PATCH 195/208] Move `properties` module out of `generate` --- cli/src/generate/mod.rs | 49 +++++++++++++--------------- cli/src/lib.rs | 1 + cli/src/main.rs | 20 +++++++----- cli/src/{generate => }/properties.rs | 0 4 files changed, 34 insertions(+), 36 deletions(-) rename cli/src/{generate => }/properties.rs (100%) diff --git a/cli/src/generate/mod.rs b/cli/src/generate/mod.rs index 397fd677..127e956e 100644 --- a/cli/src/generate/mod.rs +++ b/cli/src/generate/mod.rs @@ -16,7 +16,6 @@ mod nfa; mod npm_files; mod parse_grammar; mod prepare_grammar; -mod properties; mod render; mod rules; mod tables; @@ -33,33 +32,29 @@ pub fn generate_parser_in_directory( grammar_path: Option<&str>, minimize: bool, state_ids_to_log: Vec, - properties_only: bool, ) -> Result<()> { - if !properties_only { - let grammar_path = grammar_path.map_or(repo_path.join("grammar.js"), |s| s.into()); - let grammar_json = load_grammar_file(&grammar_path)?; - let (language_name, c_code) = - generate_parser_for_grammar_with_opts(&grammar_json, minimize, state_ids_to_log)?; - let repo_src_path = repo_path.join("src"); - let repo_header_path = repo_src_path.join("tree_sitter"); - fs::create_dir_all(&repo_src_path)?; - fs::create_dir_all(&repo_header_path)?; - fs::write(&repo_src_path.join("parser.c"), c_code) - .map_err(|e| format!("Failed to write parser.c: {}", e))?; - 
ensure_file(&repo_src_path.join("binding.cc"), || { - npm_files::binding_cc(&language_name) - })?; - ensure_file(&repo_path.join("binding.gyp"), || { - npm_files::binding_gyp(&language_name) - })?; - ensure_file(&repo_path.join("index.js"), || { - npm_files::index_js(&language_name) - })?; - ensure_file(&repo_header_path.join("parser.h"), || { - include_str!("../../../lib/include/tree_sitter/parser.h") - })?; - } - properties::generate_property_sheets(repo_path)?; + let grammar_path = grammar_path.map_or(repo_path.join("grammar.js"), |s| s.into()); + let grammar_json = load_grammar_file(&grammar_path)?; + let (language_name, c_code) = + generate_parser_for_grammar_with_opts(&grammar_json, minimize, state_ids_to_log)?; + let repo_src_path = repo_path.join("src"); + let repo_header_path = repo_src_path.join("tree_sitter"); + fs::create_dir_all(&repo_src_path)?; + fs::create_dir_all(&repo_header_path)?; + fs::write(&repo_src_path.join("parser.c"), c_code) + .map_err(|e| format!("Failed to write parser.c: {}", e))?; + ensure_file(&repo_src_path.join("binding.cc"), || { + npm_files::binding_cc(&language_name) + })?; + ensure_file(&repo_path.join("binding.gyp"), || { + npm_files::binding_gyp(&language_name) + })?; + ensure_file(&repo_path.join("index.js"), || { + npm_files::index_js(&language_name) + })?; + ensure_file(&repo_header_path.join("parser.h"), || { + include_str!("../../../lib/include/tree_sitter/parser.h") + })?; Ok(()) } diff --git a/cli/src/lib.rs b/cli/src/lib.rs index 9038b5b8..3a15b457 100644 --- a/cli/src/lib.rs +++ b/cli/src/lib.rs @@ -3,6 +3,7 @@ pub mod generate; pub mod loader; pub mod logger; pub mod parse; +pub mod properties; pub mod test; pub mod util; diff --git a/cli/src/main.rs b/cli/src/main.rs index 3c0b057e..4d4dc1c6 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -3,9 +3,9 @@ use std::env; use std::fs; use std::path::Path; use std::process::exit; -use tree_sitter_cli::loader::Loader; -use tree_sitter_cli::{error, generate, logger, 
parse, test}; use std::usize; +use tree_sitter_cli::loader::Loader; +use tree_sitter_cli::{error, generate, logger, parse, properties, test}; fn main() { if let Err(e) = run() { @@ -87,13 +87,15 @@ fn run() -> error::Result<()> { ids.filter_map(|id| usize::from_str_radix(id, 10).ok()) .collect() }); - generate::generate_parser_in_directory( - ¤t_dir, - grammar_path, - minimize, - state_ids_to_log, - properties_only, - )?; + if !properties_only { + generate::generate_parser_in_directory( + ¤t_dir, + grammar_path, + minimize, + state_ids_to_log, + )?; + } + properties::generate_property_sheets(¤t_dir)?; } else if let Some(matches) = matches.subcommand_matches("test") { let debug = matches.is_present("debug"); let debug_graph = matches.is_present("debug-graph"); diff --git a/cli/src/generate/properties.rs b/cli/src/properties.rs similarity index 100% rename from cli/src/generate/properties.rs rename to cli/src/properties.rs From 6ca1047bb0a067371cc2b7fc492d99c424adf44c Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 1 Feb 2019 20:19:38 -0800 Subject: [PATCH 196/208] Remove ci shell script --- script/ci | 9 --------- 1 file changed, 9 deletions(-) delete mode 100755 script/ci diff --git a/script/ci b/script/ci deleted file mode 100755 index 6ad8a2b7..00000000 --- a/script/ci +++ /dev/null @@ -1,9 +0,0 @@ -#!/usr/bin/env bash - -set -e - -script/fetch-fixtures -script/check-mallocs -script/build-runtime -script/test -b -script/benchmark -b From f263a4fbe335404c6f79048187b57f6184587602 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 1 Feb 2019 21:20:27 -0800 Subject: [PATCH 197/208] Separate walk_with_properties tests from parser tests --- cli/src/main.rs | 2 +- cli/src/properties.rs | 7 +- cli/src/tests/mod.rs | 3 +- .../{parser_api_test.rs => parser_test.rs} | 199 +----------------- cli/src/tests/properties_test.rs | 134 ++++++++++++ 5 files changed, 145 insertions(+), 200 deletions(-) rename cli/src/tests/{parser_api_test.rs => parser_test.rs} (58%) 
create mode 100644 cli/src/tests/properties_test.rs diff --git a/cli/src/main.rs b/cli/src/main.rs index 4d4dc1c6..299ab896 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -95,7 +95,7 @@ fn run() -> error::Result<()> { state_ids_to_log, )?; } - properties::generate_property_sheets(¤t_dir)?; + properties::generate_property_sheets_in_directory(¤t_dir)?; } else if let Some(matches) = matches.subcommand_matches("test") { let debug = matches.is_present("debug"); let debug_graph = matches.is_present("debug-graph"); diff --git a/cli/src/properties.rs b/cli/src/properties.rs index f5861159..fccfd7ed 100644 --- a/cli/src/properties.rs +++ b/cli/src/properties.rs @@ -421,7 +421,7 @@ impl fmt::Debug for Selector { } } -pub fn generate_property_sheets(repo_path: &Path) -> Result<()> { +pub fn generate_property_sheets_in_directory(repo_path: &Path) -> Result<()> { let src_dir_path = repo_path.join("src"); let properties_dir_path = repo_path.join("properties"); @@ -443,6 +443,11 @@ pub fn generate_property_sheets(repo_path: &Path) -> Result<()> { Ok(()) } +pub fn generate_property_sheet_string(path: impl AsRef, css: &str) -> Result { + let sheet = generate_property_sheet(path, css)?; + Ok(serde_json::to_string(&sheet)?) 
+} + fn generate_property_sheet(path: impl AsRef, css: &str) -> Result { let rules = parse_property_sheet(path.as_ref(), &css)?; Ok(Builder::new(rules).build()) diff --git a/cli/src/tests/mod.rs b/cli/src/tests/mod.rs index 3641cc3e..b8f6ad1f 100644 --- a/cli/src/tests/mod.rs +++ b/cli/src/tests/mod.rs @@ -1,4 +1,5 @@ mod corpus_test; mod helpers; -mod parser_api_test; +mod parser_test; +mod properties_test; mod tree_test; diff --git a/cli/src/tests/parser_api_test.rs b/cli/src/tests/parser_test.rs similarity index 58% rename from cli/src/tests/parser_api_test.rs rename to cli/src/tests/parser_test.rs index e46d9b55..a061d8c6 100644 --- a/cli/src/tests/parser_api_test.rs +++ b/cli/src/tests/parser_test.rs @@ -1,7 +1,6 @@ use super::helpers::fixtures::get_language; -use serde_derive::Deserialize; use std::thread; -use tree_sitter::{InputEdit, Language, LogType, Parser, Point, PropertySheet}; +use tree_sitter::{InputEdit, Language, LogType, Parser, Point}; #[test] fn test_basic_parsing() { @@ -93,200 +92,6 @@ fn test_tree_cursor() { assert_eq!(cursor.node().is_named(), true); } -#[test] -fn test_tree_property_matching() { - let mut parser = Parser::new(); - parser.set_language(rust()).unwrap(); - let source_code = "fn f1() { f2(); }"; - let tree = parser.parse_str(source_code, None).unwrap(); - - #[derive(Debug, Deserialize, PartialEq, Eq)] - struct Properties { - reference: Option, - define: Option, - } - - let empty_properties = Properties { - reference: None, - define: None, - }; - - let property_sheet = PropertySheet::::new( - rust(), - r##" - { - "states": [ - { - "transitions": [ - {"type": "call_expression", "named": true, "state_id": 1}, - {"type": "function_item", "named": true, "state_id": 2} - ], - "default_next_state_id": 0, - "property_set_id": 0 - }, - { - "transitions": [ - {"type": "identifier", "named": true, "state_id": 3} - ], - "default_next_state_id": 0, - "property_set_id": 0 - }, - { - "transitions": [ - {"type": "identifier", "named": true, 
"state_id": 4} - ], - "default_next_state_id": 0, - "property_set_id": 0 - }, - { - "transitions": [], - "default_next_state_id": 0, - "property_set_id": 1 - }, - { - "transitions": [], - "default_next_state_id": 0, - "property_set_id": 2 - } - ], - "property_sets": [ - {}, - {"reference": "function"}, - {"define": "function"} - ] - } - "##, - ) - .unwrap(); - - let mut cursor = tree.walk_with_properties(&property_sheet, source_code); - assert_eq!(cursor.node().kind(), "source_file"); - assert_eq!(*cursor.node_properties(), empty_properties); - - assert!(cursor.goto_first_child()); - assert_eq!(cursor.node().kind(), "function_item"); - assert_eq!(*cursor.node_properties(), empty_properties); - - assert!(cursor.goto_first_child()); - assert_eq!(cursor.node().kind(), "fn"); - assert_eq!(*cursor.node_properties(), empty_properties); - assert!(!cursor.goto_first_child()); - - assert!(cursor.goto_next_sibling()); - assert_eq!(cursor.node().kind(), "identifier"); - assert_eq!(cursor.node_properties().define, Some("function".to_owned())); - assert!(!cursor.goto_first_child()); - - assert!(cursor.goto_next_sibling()); - assert_eq!(cursor.node().kind(), "parameters"); - assert_eq!(*cursor.node_properties(), empty_properties); - - assert!(cursor.goto_first_child()); - assert_eq!(cursor.node().kind(), "("); - assert!(cursor.goto_next_sibling()); - assert_eq!(cursor.node().kind(), ")"); - assert_eq!(*cursor.node_properties(), empty_properties); - - assert!(cursor.goto_parent()); - assert!(cursor.goto_next_sibling()); - assert_eq!(cursor.node().kind(), "block"); - assert_eq!(*cursor.node_properties(), empty_properties); - - assert!(cursor.goto_first_child()); - assert!(cursor.goto_next_sibling()); - assert_eq!(cursor.node().kind(), "call_expression"); - assert_eq!(*cursor.node_properties(), empty_properties); - - assert!(cursor.goto_first_child()); - assert_eq!(cursor.node().kind(), "identifier"); - assert_eq!( - cursor.node_properties().reference, - Some("function".to_owned()) 
- ); -} - -#[test] -fn test_tree_property_matching_with_regexes() { - let mut parser = Parser::new(); - parser.set_language(rust()).unwrap(); - let source_code = "fn f1() { None(a()) }"; - let tree = parser.parse_str(source_code, None).unwrap(); - - #[derive(Debug, Deserialize, PartialEq, Eq)] - struct Properties { - scope: Option, - } - - let empty_properties = Properties { scope: None }; - - let property_sheet = PropertySheet::::new( - rust(), - r##" - { - "states": [ - { - "id": 0, - "transitions": [ - {"type": "call_expression", "named": true, "state_id": 1} - ], - "default_next_state_id": 0, - "property_set_id": 0 - }, - { - "id": 1, - "transitions": [ - {"type": "identifier", "named": true, "text": "^[A-Z]", "state_id": 2}, - {"type": "identifier", "named": true, "state_id": 3} - ], - "default_next_state_id": 0, - "property_set_id": 0 - }, - { - "transitions": [], - "default_next_state_id": 0, - "property_set_id": 1 - }, - { - "transitions": [], - "default_next_state_id": 0, - "property_set_id": 2 - } - ], - "property_sets": [ - {}, - {"scope": "constructor"}, - {"scope": "function"} - ] - } - "##, - ) - .unwrap(); - - let mut cursor = tree.walk_with_properties(&property_sheet, source_code); - assert_eq!(cursor.node().kind(), "source_file"); - assert_eq!(*cursor.node_properties(), empty_properties); - - cursor.goto_first_child(); - assert!(cursor.goto_first_child()); - assert!(cursor.goto_next_sibling()); - assert!(cursor.goto_next_sibling()); - assert!(cursor.goto_next_sibling()); - assert_eq!(cursor.node().kind(), "block"); - assert_eq!(*cursor.node_properties(), empty_properties); - - assert!(cursor.goto_first_child()); - assert!(cursor.goto_next_sibling()); - assert_eq!(cursor.node().kind(), "call_expression"); - assert_eq!(*cursor.node_properties(), empty_properties); - - assert!(cursor.goto_first_child()); - assert_eq!(cursor.node().kind(), "identifier"); - assert_eq!( - cursor.node_properties().scope, - Some("constructor".to_owned()) - ); -} - #[test] 
fn test_custom_utf8_input() { let mut parser = Parser::new(); @@ -454,7 +259,7 @@ fn test_editing() { fn test_parallel_parsing() { // Parse this source file so that each thread has a non-trivial amount of // work to do. - let this_file_source = include_str!("parser_api_test.rs"); + let this_file_source = include_str!("parser_test.rs"); let mut parser = Parser::new(); parser.set_language(rust()).unwrap(); diff --git a/cli/src/tests/properties_test.rs b/cli/src/tests/properties_test.rs new file mode 100644 index 00000000..213eb9d0 --- /dev/null +++ b/cli/src/tests/properties_test.rs @@ -0,0 +1,134 @@ +use super::helpers::fixtures::get_language; +use crate::properties; +use serde_derive::Deserialize; +use tree_sitter::{Parser, PropertySheet}; + +#[derive(Debug, Default, Deserialize, PartialEq, Eq)] +struct Properties { + a: Option, + b: Option, +} + +#[test] +fn test_walk_with_properties_with_nth_child() { + let language = get_language("javascript"); + let property_sheet = PropertySheet::::new( + language, + &properties::generate_property_sheet_string( + "/some/path.css", + " + binary_expression > identifier:nth-child(2) { + a: x; + } + + binary_expression > identifier { + a: y; + } + + identifier { + a: z; + } + ", + ) + .unwrap(), + ) + .unwrap(); + + let source_code = "a = b || c;"; + + let mut parser = Parser::new(); + parser.set_language(language).unwrap(); + let tree = parser.parse_str(source_code, None).unwrap(); + + let mut cursor = tree.walk_with_properties(&property_sheet, source_code); + assert_eq!(cursor.node().kind(), "program"); + assert!(cursor.goto_first_child()); + assert_eq!(cursor.node().kind(), "expression_statement"); + assert!(cursor.goto_first_child()); + assert_eq!(cursor.node().kind(), "assignment_expression"); + + assert!(cursor.goto_first_child()); + assert_eq!(cursor.node().kind(), "identifier"); + assert_eq!(*cursor.node_properties(), Properties { a: Some("z".to_string()), b: None }); + + assert!(cursor.goto_next_sibling()); + 
assert_eq!(cursor.node().kind(), "="); + assert!(cursor.goto_next_sibling()); + assert_eq!(cursor.node().kind(), "binary_expression"); + + assert!(cursor.goto_first_child()); + assert_eq!(cursor.node().kind(), "identifier"); + assert_eq!(*cursor.node_properties(), Properties { a: Some("y".to_string()), b: None }); + + assert!(cursor.goto_next_sibling()); + assert_eq!(cursor.node().kind(), "||"); + assert!(cursor.goto_next_sibling()); + assert_eq!(cursor.node().kind(), "identifier"); + assert_eq!(*cursor.node_properties(), Properties { a: Some("x".to_string()), b: None }); +} + +#[test] +fn test_walk_with_properties_with_regexes() { + let language = get_language("javascript"); + let property_sheet = PropertySheet::::new( + language, + &properties::generate_property_sheet_string( + "/some/path.css", + " + identifier { + &[text='^[A-Z]'] { + a: y; + } + + &[text='^[A-Z_]+$'] { + a: z; + } + + a: x; + } + ", + ) + .unwrap(), + ) + .unwrap(); + + let source_code = "const ABC = Def(ghi);"; + + let mut parser = Parser::new(); + parser.set_language(language).unwrap(); + let tree = parser.parse_str(source_code, None).unwrap(); + + let mut cursor = tree.walk_with_properties(&property_sheet, source_code); + assert_eq!(cursor.node().kind(), "program"); + assert!(cursor.goto_first_child()); + assert_eq!(cursor.node().kind(), "lexical_declaration"); + assert!(cursor.goto_first_child()); + assert_eq!(cursor.node().kind(), "const"); + assert!(cursor.goto_next_sibling()); + assert_eq!(cursor.node().kind(), "variable_declarator"); + + // The later selector with a text regex overrides the earlier one. 
+ assert!(cursor.goto_first_child()); + assert_eq!(cursor.node().kind(), "identifier"); + assert_eq!(*cursor.node_properties(), Properties { a: Some("z".to_string()), b: None }); + + assert!(cursor.goto_next_sibling()); + assert_eq!(cursor.node().kind(), "="); + assert!(cursor.goto_next_sibling()); + assert_eq!(cursor.node().kind(), "call_expression"); + + // The selectors with text regexes override the selector without one. + assert!(cursor.goto_first_child()); + assert_eq!(cursor.node().kind(), "identifier"); + assert_eq!(*cursor.node_properties(), Properties { a: Some("y".to_string()), b: None }); + + assert!(cursor.goto_next_sibling()); + assert_eq!(cursor.node().kind(), "arguments"); + assert!(cursor.goto_first_child()); + assert_eq!(cursor.node().kind(), "("); + + // This node doesn't match either of the regexes. + assert!(cursor.goto_next_sibling()); + assert_eq!(cursor.node().kind(), "identifier"); + assert_eq!(*cursor.node_properties(), Properties { a: Some("x".to_string()), b: None }); +} From d465850aba1a3ffca2499ea2bb4f628218886bb9 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Sat, 2 Feb 2019 14:00:11 -0800 Subject: [PATCH 198/208] Add unit tests for ts_tree_get_changed_ranges --- cli/src/tests/corpus_test.rs | 95 +--------------- cli/src/tests/helpers/edits.rs | 94 ++++++++++++++++ cli/src/tests/helpers/mod.rs | 1 + cli/src/tests/parser_test.rs | 58 +--------- cli/src/tests/tree_test.rs | 198 ++++++++++++++++++++++++++++++++- 5 files changed, 295 insertions(+), 151 deletions(-) create mode 100644 cli/src/tests/helpers/edits.rs diff --git a/cli/src/tests/corpus_test.rs b/cli/src/tests/corpus_test.rs index 1ee3ddc1..f1990963 100644 --- a/cli/src/tests/corpus_test.rs +++ b/cli/src/tests/corpus_test.rs @@ -1,4 +1,5 @@ use super::helpers::allocations; +use super::helpers::edits::{get_random_edit, invert_edit, perform_edit}; use super::helpers::fixtures::{fixtures_dir, get_language, get_test_language}; use super::helpers::random::Rand; use 
super::helpers::scope_sequence::ScopeSequence; @@ -7,7 +8,7 @@ use crate::test::{parse_tests, print_diff, print_diff_key, TestEntry}; use crate::util; use lazy_static::lazy_static; use std::{env, fs, time, usize}; -use tree_sitter::{InputEdit, LogType, Node, Parser, Point, Tree}; +use tree_sitter::{LogType, Node, Parser, Tree}; const EDIT_COUNT: usize = 3; const TRIAL_COUNT: usize = 10; @@ -187,12 +188,6 @@ fn test_real_language_corpus_files() { } } -struct Edit { - position: usize, - deleted_length: usize, - inserted_text: Vec, -} - #[test] fn test_feature_corpus_files() { let test_grammars_dir = fixtures_dir().join("test_grammars"); @@ -279,92 +274,6 @@ fn test_feature_corpus_files() { } } -fn get_random_edit(rand: &mut Rand, input: &Vec) -> Edit { - let choice = rand.unsigned(10); - if choice < 2 { - // Insert text at end - let inserted_text = rand.words(3); - Edit { - position: input.len(), - deleted_length: 0, - inserted_text, - } - } else if choice < 5 { - // Delete text from the end - let mut deleted_length = rand.unsigned(10); - if deleted_length > input.len() { - deleted_length = input.len(); - } - Edit { - position: input.len() - deleted_length, - deleted_length, - inserted_text: vec![], - } - } else if choice < 8 { - // Insert at a random position - let position = rand.unsigned(input.len()); - let word_count = 1 + rand.unsigned(3); - let inserted_text = rand.words(word_count); - Edit { - position, - deleted_length: 0, - inserted_text, - } - } else { - // Replace at random position - let position = rand.unsigned(input.len()); - let deleted_length = rand.unsigned(input.len() - position); - let word_count = 1 + rand.unsigned(3); - let inserted_text = rand.words(word_count); - Edit { - position, - deleted_length, - inserted_text, - } - } -} - -fn invert_edit(input: &Vec, edit: &Edit) -> Edit { - let position = edit.position; - let removed_content = &input[position..(position + edit.deleted_length)]; - Edit { - position, - deleted_length: 
edit.inserted_text.len(), - inserted_text: removed_content.to_vec(), - } -} - -fn perform_edit(tree: &mut Tree, input: &mut Vec, edit: &Edit) { - let start_byte = edit.position; - let old_end_byte = edit.position + edit.deleted_length; - let new_end_byte = edit.position + edit.inserted_text.len(); - let start_position = position_for_offset(input, start_byte); - let old_end_position = position_for_offset(input, old_end_byte); - input.splice(start_byte..old_end_byte, edit.inserted_text.iter().cloned()); - let new_end_position = position_for_offset(input, new_end_byte); - tree.edit(&InputEdit { - start_byte, - old_end_byte, - new_end_byte, - start_position, - old_end_position, - new_end_position, - }); -} - -fn position_for_offset(input: &Vec, offset: usize) -> Point { - let mut result = Point { row: 0, column: 0 }; - for c in &input[0..offset] { - if *c as char == '\n' { - result.row += 1; - result.column = 0; - } else { - result.column += 1; - } - } - result -} - fn check_consistent_sizes(tree: &Tree, input: &Vec) { fn check(node: Node, line_offsets: &Vec) { let start_byte = node.start_byte(); diff --git a/cli/src/tests/helpers/edits.rs b/cli/src/tests/helpers/edits.rs new file mode 100644 index 00000000..4e4d0c25 --- /dev/null +++ b/cli/src/tests/helpers/edits.rs @@ -0,0 +1,94 @@ +use super::random::Rand; +use tree_sitter::{InputEdit, Point, Tree}; + +pub struct Edit { + pub position: usize, + pub deleted_length: usize, + pub inserted_text: Vec, +} + +pub fn perform_edit(tree: &mut Tree, input: &mut Vec, edit: &Edit) { + let start_byte = edit.position; + let old_end_byte = edit.position + edit.deleted_length; + let new_end_byte = edit.position + edit.inserted_text.len(); + let start_position = position_for_offset(input, start_byte); + let old_end_position = position_for_offset(input, old_end_byte); + input.splice(start_byte..old_end_byte, edit.inserted_text.iter().cloned()); + let new_end_position = position_for_offset(input, new_end_byte); + tree.edit(&InputEdit { 
+ start_byte, + old_end_byte, + new_end_byte, + start_position, + old_end_position, + new_end_position, + }); +} + +pub fn invert_edit(input: &Vec, edit: &Edit) -> Edit { + let position = edit.position; + let removed_content = &input[position..(position + edit.deleted_length)]; + Edit { + position, + deleted_length: edit.inserted_text.len(), + inserted_text: removed_content.to_vec(), + } +} + +pub fn get_random_edit(rand: &mut Rand, input: &Vec) -> Edit { + let choice = rand.unsigned(10); + if choice < 2 { + // Insert text at end + let inserted_text = rand.words(3); + Edit { + position: input.len(), + deleted_length: 0, + inserted_text, + } + } else if choice < 5 { + // Delete text from the end + let mut deleted_length = rand.unsigned(10); + if deleted_length > input.len() { + deleted_length = input.len(); + } + Edit { + position: input.len() - deleted_length, + deleted_length, + inserted_text: vec![], + } + } else if choice < 8 { + // Insert at a random position + let position = rand.unsigned(input.len()); + let word_count = 1 + rand.unsigned(3); + let inserted_text = rand.words(word_count); + Edit { + position, + deleted_length: 0, + inserted_text, + } + } else { + // Replace at random position + let position = rand.unsigned(input.len()); + let deleted_length = rand.unsigned(input.len() - position); + let word_count = 1 + rand.unsigned(3); + let inserted_text = rand.words(word_count); + Edit { + position, + deleted_length, + inserted_text, + } + } +} + +fn position_for_offset(input: &Vec, offset: usize) -> Point { + let mut result = Point { row: 0, column: 0 }; + for c in &input[0..offset] { + if *c as char == '\n' { + result.row += 1; + result.column = 0; + } else { + result.column += 1; + } + } + result +} diff --git a/cli/src/tests/helpers/mod.rs b/cli/src/tests/helpers/mod.rs index bd5c6517..2d1ce574 100644 --- a/cli/src/tests/helpers/mod.rs +++ b/cli/src/tests/helpers/mod.rs @@ -2,3 +2,4 @@ pub(super) mod allocations; pub(super) mod fixtures; pub(super) mod 
random; pub(super) mod scope_sequence; +pub(super) mod edits; diff --git a/cli/src/tests/parser_test.rs b/cli/src/tests/parser_test.rs index a061d8c6..43fbbc1b 100644 --- a/cli/src/tests/parser_test.rs +++ b/cli/src/tests/parser_test.rs @@ -30,7 +30,7 @@ fn test_basic_parsing() { } #[test] -fn test_logging() { +fn test_parsing_with_logging() { let mut parser = Parser::new(); parser.set_language(rust()).unwrap(); @@ -57,43 +57,7 @@ fn test_logging() { } #[test] -fn test_tree_cursor() { - let mut parser = Parser::new(); - parser.set_language(rust()).unwrap(); - - let tree = parser - .parse_str( - " - struct Stuff { - a: A; - b: Option, - } - ", - None, - ) - .unwrap(); - - let mut cursor = tree.walk(); - assert_eq!(cursor.node().kind(), "source_file"); - - assert!(cursor.goto_first_child()); - assert_eq!(cursor.node().kind(), "struct_item"); - - assert!(cursor.goto_first_child()); - assert_eq!(cursor.node().kind(), "struct"); - assert_eq!(cursor.node().is_named(), false); - - assert!(cursor.goto_next_sibling()); - assert_eq!(cursor.node().kind(), "type_identifier"); - assert_eq!(cursor.node().is_named(), true); - - assert!(cursor.goto_next_sibling()); - assert_eq!(cursor.node().kind(), "field_declaration_list"); - assert_eq!(cursor.node().is_named(), true); -} - -#[test] -fn test_custom_utf8_input() { +fn test_parsing_with_custom_utf8_input() { let mut parser = Parser::new(); parser.set_language(rust()).unwrap(); @@ -126,7 +90,7 @@ fn test_custom_utf8_input() { } #[test] -fn test_custom_utf16_input() { +fn test_parsing_with_custom_utf16_input() { let mut parser = Parser::new(); parser.set_language(rust()).unwrap(); @@ -162,19 +126,7 @@ fn test_custom_utf16_input() { } #[test] -fn test_node_equality() { - let mut parser = Parser::new(); - parser.set_language(rust()).unwrap(); - let tree = parser.parse_str("struct A {}", None).unwrap(); - let node1 = tree.root_node(); - let node2 = tree.root_node(); - assert_eq!(node1, node2); - assert_eq!(node1.child(0).unwrap(), 
node2.child(0).unwrap()); - assert_ne!(node1.child(0).unwrap(), node2); -} - -#[test] -fn test_editing() { +fn test_parsing_after_editing() { let mut parser = Parser::new(); parser.set_language(rust()).unwrap(); @@ -256,7 +208,7 @@ fn test_editing() { } #[test] -fn test_parallel_parsing() { +fn test_parsing_on_multiple_threads() { // Parse this source file so that each thread has a non-trivial amount of // work to do. let this_file_source = include_str!("parser_test.rs"); diff --git a/cli/src/tests/tree_test.rs b/cli/src/tests/tree_test.rs index 401ff03a..d3a16cba 100644 --- a/cli/src/tests/tree_test.rs +++ b/cli/src/tests/tree_test.rs @@ -1,10 +1,12 @@ +use super::helpers::edits::{invert_edit, perform_edit, Edit}; use super::helpers::fixtures::get_language; -use tree_sitter::{InputEdit, Language, Parser, Point}; +use std::str; +use tree_sitter::{InputEdit, Parser, Point, Range, Tree}; #[test] -fn test_edit() { +fn test_tree_edit() { let mut parser = Parser::new(); - parser.set_language(javascript()).unwrap(); + parser.set_language(get_language("javascript")).unwrap(); let tree = parser.parse_str(" abc !== def", None).unwrap(); assert_eq!( @@ -186,6 +188,192 @@ fn test_edit() { } } -fn javascript() -> Language { - get_language("javascript") +#[test] +fn test_tree_walk() { + let mut parser = Parser::new(); + parser.set_language(get_language("rust")).unwrap(); + + let tree = parser + .parse_str( + " + struct Stuff { + a: A; + b: Option, + } + ", + None, + ) + .unwrap(); + + let mut cursor = tree.walk(); + assert_eq!(cursor.node().kind(), "source_file"); + + assert!(cursor.goto_first_child()); + assert_eq!(cursor.node().kind(), "struct_item"); + + assert!(cursor.goto_first_child()); + assert_eq!(cursor.node().kind(), "struct"); + assert_eq!(cursor.node().is_named(), false); + + assert!(cursor.goto_next_sibling()); + assert_eq!(cursor.node().kind(), "type_identifier"); + assert_eq!(cursor.node().is_named(), true); + + assert!(cursor.goto_next_sibling()); + 
assert_eq!(cursor.node().kind(), "field_declaration_list"); + assert_eq!(cursor.node().is_named(), true); +} + +#[test] +fn test_tree_node_equality() { + let mut parser = Parser::new(); + parser.set_language(get_language("rust")).unwrap(); + let tree = parser.parse_str("struct A {}", None).unwrap(); + let node1 = tree.root_node(); + let node2 = tree.root_node(); + assert_eq!(node1, node2); + assert_eq!(node1.child(0).unwrap(), node2.child(0).unwrap()); + assert_ne!(node1.child(0).unwrap(), node2); +} + +#[test] +fn test_get_changed_ranges() { + let source_code = b"{a: null};\n".to_vec(); + + let mut parser = Parser::new(); + parser.set_language(get_language("javascript")).unwrap(); + let tree = parser + .parse_utf8(&mut |i, _| &source_code[i..], None) + .unwrap(); + + assert_eq!( + tree.root_node().to_sexp(), + "(program (expression_statement (object (pair (property_identifier) (null)))))" + ); + + // Updating one token + { + let mut tree = tree.clone(); + let mut source_code = source_code.clone(); + + // Replace `null` with `nothing` - that token has changed syntax + let edit = Edit { + position: index_of(&source_code, "ull"), + deleted_length: 3, + inserted_text: b"othing".to_vec(), + }; + let inverse_edit = invert_edit(&source_code, &edit); + let ranges = get_changed_ranges(&mut parser, &mut tree, &mut source_code, edit); + assert_eq!(ranges, vec![range_of(&source_code, "nothing")]); + + // Replace `nothing` with `null` - that token has changed syntax + let ranges = get_changed_ranges(&mut parser, &mut tree, &mut source_code, inverse_edit); + assert_eq!(ranges, vec![range_of(&source_code, "null")]); + } + + // Changing only leading whitespace + { + let mut tree = tree.clone(); + let mut source_code = source_code.clone(); + + // Insert leading newline - no changed ranges + let edit = Edit { + position: 0, + deleted_length: 0, + inserted_text: b"\n".to_vec(), + }; + let inverse_edit = invert_edit(&source_code, &edit); + let ranges = get_changed_ranges(&mut parser, 
&mut tree, &mut source_code, edit); + assert_eq!(ranges, vec![]); + + // Remove leading newline - no changed ranges + let ranges = get_changed_ranges(&mut parser, &mut tree, &mut source_code, inverse_edit); + assert_eq!(ranges, vec![]); + } + + // Inserting elements + { + let mut tree = tree.clone(); + let mut source_code = source_code.clone(); + + // Insert a key-value pair before the `}` - those tokens are changed + let edit1 = Edit { + position: index_of(&source_code, "}"), + deleted_length: 0, + inserted_text: b", b: false".to_vec(), + }; + let inverse_edit1 = invert_edit(&source_code, &edit1); + let ranges = get_changed_ranges(&mut parser, &mut tree, &mut source_code, edit1); + assert_eq!(ranges, vec![range_of(&source_code, ", b: false")]); + + let edit2 = Edit { + position: index_of(&source_code, ", b"), + deleted_length: 0, + inserted_text: b", c: 1".to_vec(), + }; + let inverse_edit2 = invert_edit(&source_code, &edit2); + let ranges = get_changed_ranges(&mut parser, &mut tree, &mut source_code, edit2); + assert_eq!(ranges, vec![range_of(&source_code, ", c: 1")]); + + // Remove the middle pair + let ranges = get_changed_ranges(&mut parser, &mut tree, &mut source_code, inverse_edit2); + assert_eq!(ranges, vec![]); + + // Remove the second pair + let ranges = get_changed_ranges(&mut parser, &mut tree, &mut source_code, inverse_edit1); + assert_eq!(ranges, vec![]); + } + + // Wrapping elements in larger expressions + { + let mut tree = tree.clone(); + let mut source_code = source_code.clone(); + + // Replace `null` with the binary expression `b === null` + let edit1 = Edit { + position: index_of(&source_code, "null"), + deleted_length: 0, + inserted_text: b"b === ".to_vec(), + }; + let inverse_edit1 = invert_edit(&source_code, &edit1); + let ranges = get_changed_ranges(&mut parser, &mut tree, &mut source_code, edit1); + assert_eq!(ranges, vec![range_of(&source_code, "b === null")]); + + // Undo + let ranges = get_changed_ranges(&mut parser, &mut tree, &mut 
source_code, inverse_edit1); + assert_eq!(ranges, vec![range_of(&source_code, "null")]); + } +} + +fn index_of(text: &Vec, substring: &str) -> usize { + str::from_utf8(text.as_slice()) + .unwrap() + .find(substring) + .unwrap() +} + +fn range_of(text: &Vec, substring: &str) -> Range { + let start_byte = index_of(text, substring); + let end_byte = start_byte + substring.as_bytes().len(); + Range { + start_byte, + end_byte, + start_point: Point::new(0, start_byte), + end_point: Point::new(0, end_byte), + } +} + +fn get_changed_ranges( + parser: &mut Parser, + tree: &mut Tree, + source_code: &mut Vec, + edit: Edit, +) -> Vec { + perform_edit(tree, source_code, &edit); + let new_tree = parser + .parse_utf8(&mut |i, _| &source_code[i..], Some(tree)) + .unwrap(); + let result = tree.changed_ranges(&new_tree); + *tree = new_tree; + result } From 6b8483c53c9ac765fbf1114ff902a946b9353f4c Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Sat, 2 Feb 2019 21:37:54 -0800 Subject: [PATCH 199/208] Start work on porting included range unit tests --- cli/src/tests/parser_test.rs | 144 ++++++++++++++++++++++++++++++++++- lib/binding/lib.rs | 26 ++++++- 2 files changed, 168 insertions(+), 2 deletions(-) diff --git a/cli/src/tests/parser_test.rs b/cli/src/tests/parser_test.rs index 43fbbc1b..8a11d22a 100644 --- a/cli/src/tests/parser_test.rs +++ b/cli/src/tests/parser_test.rs @@ -1,6 +1,6 @@ use super::helpers::fixtures::get_language; use std::thread; -use tree_sitter::{InputEdit, Language, LogType, Parser, Point}; +use tree_sitter::{InputEdit, Language, LogType, Parser, Point, Range}; #[test] fn test_basic_parsing() { @@ -260,6 +260,148 @@ fn test_parsing_on_multiple_threads() { assert_eq!(child_count_differences, &[1, 2, 3, 4]); } +// Included Ranges + +#[test] +fn test_parsing_with_one_included_range() { + let source_code = "hi"; + + let mut parser = Parser::new(); + parser.set_language(get_language("html")).unwrap(); + let html_tree = parser.parse_str(source_code, None).unwrap(); 
+ let script_content_node = html_tree.root_node().child(1).unwrap().child(1).unwrap(); + assert_eq!(script_content_node.kind(), "raw_text"); + + parser.set_included_ranges(&[script_content_node.range()]); + parser.set_language(get_language("javascript")).unwrap(); + let js_tree = parser.parse_str(source_code, None).unwrap(); + + assert_eq!( + js_tree.root_node().to_sexp(), + concat!( + "(program (expression_statement (call_expression", + " (member_expression (identifier) (property_identifier))", + " (arguments (string)))))", + ) + ); + assert_eq!( + js_tree.root_node().start_position(), + Point::new(0, source_code.find("console").unwrap()) + ); +} + +#[test] +fn test_parsing_with_multiple_included_ranges() { + let source_code = "html `

Hello, ${name.toUpperCase()}, it's ${now()}.
`"; + + let mut parser = Parser::new(); + parser.set_language(get_language("javascript")).unwrap(); + let js_tree = parser.parse_str(source_code, None).unwrap(); + let template_string_node = js_tree + .root_node() + .descendant_for_byte_range( + source_code.find("
").unwrap(), + source_code.find("Hello").unwrap(), + ) + .unwrap(); + assert_eq!(template_string_node.kind(), "template_string"); + + let open_quote_node = template_string_node.child(0).unwrap(); + let interpolation_node1 = template_string_node.child(1).unwrap(); + let interpolation_node2 = template_string_node.child(2).unwrap(); + let close_quote_node = template_string_node.child(3).unwrap(); + + parser.set_language(get_language("html")).unwrap(); + parser.set_included_ranges(&[ + Range { + start_byte: open_quote_node.end_byte(), + start_point: open_quote_node.end_position(), + end_byte: interpolation_node1.start_byte(), + end_point: interpolation_node1.start_position(), + }, + Range { + start_byte: interpolation_node1.end_byte(), + start_point: interpolation_node1.end_position(), + end_byte: interpolation_node2.start_byte(), + end_point: interpolation_node2.start_position(), + }, + Range { + start_byte: interpolation_node2.end_byte(), + start_point: interpolation_node2.end_position(), + end_byte: close_quote_node.start_byte(), + end_point: close_quote_node.start_position(), + }, + ]); + let html_tree = parser.parse_str(source_code, None).unwrap(); + + assert_eq!( + html_tree.root_node().to_sexp(), + concat!( + "(fragment (element", + " (start_tag (tag_name))", + " (text)", + " (element (start_tag (tag_name)) (end_tag (tag_name)))", + " (text)", + " (end_tag (tag_name))))", + ) + ); + + let div_element_node = html_tree.root_node().child(0).unwrap(); + let hello_text_node = div_element_node.child(1).unwrap(); + let b_element_node = div_element_node.child(2).unwrap(); + let b_start_tag_node = b_element_node.child(0).unwrap(); + let b_end_tag_node = b_element_node.child(1).unwrap(); + + assert_eq!(hello_text_node.kind(), "text"); + assert_eq!( + hello_text_node.start_byte(), + source_code.find("Hello").unwrap() + ); + assert_eq!(hello_text_node.end_byte(), source_code.find("").unwrap()); + + assert_eq!(b_start_tag_node.kind(), "start_tag"); + assert_eq!( + 
b_start_tag_node.start_byte(), + source_code.find("").unwrap() + ); + assert_eq!( + b_start_tag_node.end_byte(), + source_code.find("${now()}").unwrap() + ); + + assert_eq!(b_end_tag_node.kind(), "end_tag"); + assert_eq!( + b_end_tag_node.start_byte(), + source_code.find("").unwrap() + ); + assert_eq!( + b_end_tag_node.end_byte(), + source_code.find(".
").unwrap() + ); +} + +#[test] +fn test_parsing_utf16_code_with_errors_at_the_end_of_an_included_range() { + let source_code = ""; + let utf16_source_code: Vec = source_code.as_bytes().iter().map(|c| *c as u16).collect(); + + let start_byte = 2 * source_code.find("a.").unwrap(); + let end_byte = 2 * source_code.find("").unwrap(); + + let mut parser = Parser::new(); + parser.set_language(get_language("javascript")).unwrap(); + parser.set_included_ranges(&[Range { + start_byte, + end_byte, + start_point: Point::new(0, start_byte), + end_point: Point::new(0, end_byte), + }]); + let tree = parser + .parse_utf16(&mut |i, _| &utf16_source_code[i..], None) + .unwrap(); + assert_eq!(tree.root_node().to_sexp(), "(program (ERROR (identifier)))"); +} + fn rust() -> Language { get_language("rust") } diff --git a/lib/binding/lib.rs b/lib/binding/lib.rs index 150dfcf4..9e04ed35 100644 --- a/lib/binding/lib.rs +++ b/lib/binding/lib.rs @@ -261,7 +261,7 @@ impl Parser { ) -> Option { self.parse_utf16_ptr( &mut |byte, position| { - let slice = input(byte, position); + let slice = input(byte / 2, position); (slice.as_ptr(), slice.len()) }, old_tree, @@ -570,6 +570,30 @@ impl<'tree> Node<'tree> { Self::new(unsafe { ffi::ts_node_prev_named_sibling(self.0) }) } + pub fn descendant_for_byte_range(&self, start: usize, end: usize) -> Option { + Self::new(unsafe { + ffi::ts_node_descendant_for_byte_range(self.0, start as u32, end as u32) + }) + } + + pub fn named_descendant_for_byte_range(&self, start: usize, end: usize) -> Option { + Self::new(unsafe { + ffi::ts_node_named_descendant_for_byte_range(self.0, start as u32, end as u32) + }) + } + + pub fn descendant_for_point_range(&self, start: Point, end: Point) -> Option { + Self::new(unsafe { + ffi::ts_node_descendant_for_point_range(self.0, start.into(), end.into()) + }) + } + + pub fn named_descendant_for_point_range(&self, start: Point, end: Point) -> Option { + Self::new(unsafe { + ffi::ts_node_named_descendant_for_point_range(self.0, 
start.into(), end.into()) + }) + } + pub fn to_sexp(&self) -> String { let c_string = unsafe { ffi::ts_node_string(self.0) }; let result = unsafe { CStr::from_ptr(c_string) } From b5c057ba0420deb69a162f07bff13d60e83a3125 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Sun, 3 Feb 2019 13:59:27 -0800 Subject: [PATCH 200/208] 0.14.0-beta4 --- Cargo.lock | 2 +- cli/Cargo.toml | 2 +- cli/npm/package-lock.json | 2 +- cli/npm/package.json | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5c2dcd62..407d1189 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -572,7 +572,7 @@ dependencies = [ [[package]] name = "tree-sitter-cli" -version = "0.14.0-beta3" +version = "0.14.0-beta4" dependencies = [ "ansi_term 0.11.0 (registry+https://github.com/rust-lang/crates.io-index)", "cc 1.0.25 (registry+https://github.com/rust-lang/crates.io-index)", diff --git a/cli/Cargo.toml b/cli/Cargo.toml index 35b6c7a0..c63209db 100644 --- a/cli/Cargo.toml +++ b/cli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "tree-sitter-cli" -version = "0.14.0-beta3" +version = "0.14.0-beta4" authors = ["Max Brunsfeld "] edition = "2018" diff --git a/cli/npm/package-lock.json b/cli/npm/package-lock.json index 4590ac72..fa5b766f 100644 --- a/cli/npm/package-lock.json +++ b/cli/npm/package-lock.json @@ -1,5 +1,5 @@ { "name": "tree-sitter-cli", - "version": "0.14.0-beta3", + "version": "0.14.0-beta4", "lockfileVersion": 1 } diff --git a/cli/npm/package.json b/cli/npm/package.json index 276ea9d8..5dec57d7 100644 --- a/cli/npm/package.json +++ b/cli/npm/package.json @@ -1,6 +1,6 @@ { "name": "tree-sitter-cli", - "version": "0.14.0-beta3", + "version": "0.14.0-beta4", "author": "Max Brunsfeld", "license": "MIT", "repository": { From 59f7511b1c9e9ae269278f05e4b0843a3d086922 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Mon, 4 Feb 2019 09:12:25 -0800 Subject: [PATCH 201/208] Fix test command's exit code --- cli/src/main.rs | 4 +--- cli/src/test.rs | 8 ++++---- 2 files 
changed, 5 insertions(+), 7 deletions(-) diff --git a/cli/src/main.rs b/cli/src/main.rs index 299ab896..3fb1890d 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -102,9 +102,7 @@ fn run() -> error::Result<()> { let filter = matches.value_of("filter"); let corpus_path = current_dir.join("corpus"); if let Some(language) = loader.language_at_path(¤t_dir)? { - if !test::run_tests_at_path(language, &corpus_path, debug, debug_graph, filter)? { - exit(1); - } + test::run_tests_at_path(language, &corpus_path, debug, debug_graph, filter)?; } else { eprintln!("No language found"); } diff --git a/cli/src/test.rs b/cli/src/test.rs index c8330af9..7a2fab25 100644 --- a/cli/src/test.rs +++ b/cli/src/test.rs @@ -1,4 +1,4 @@ -use super::error::Result; +use super::error::{Error, Result}; use super::util; use ansi_term::Colour; use difference::{Changeset, Difference}; @@ -52,7 +52,7 @@ pub fn run_tests_at_path( debug: bool, debug_graph: bool, filter: Option<&str>, -) -> Result { +) -> Result<()> { let test_entry = parse_tests(path)?; let mut _log_session = None; let mut parser = Parser::new(); @@ -90,9 +90,9 @@ pub fn run_tests_at_path( println!("\n {}. 
{}:", i + 1, name); print_diff(actual, expected); } - Ok(true) + Err(Error(String::new())) } else { - Ok(false) + Ok(()) } } From e62a8a2302104e5b2bdfc194e54bb7859684ab22 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Mon, 4 Feb 2019 10:38:44 -0800 Subject: [PATCH 202/208] Port more parser unit tests --- cli/src/tests/parser_test.rs | 268 +++++++++++++++++++++++++++++++++-- 1 file changed, 257 insertions(+), 11 deletions(-) diff --git a/cli/src/tests/parser_test.rs b/cli/src/tests/parser_test.rs index 8a11d22a..94694a32 100644 --- a/cli/src/tests/parser_test.rs +++ b/cli/src/tests/parser_test.rs @@ -1,11 +1,11 @@ use super::helpers::fixtures::get_language; -use std::thread; -use tree_sitter::{InputEdit, Language, LogType, Parser, Point, Range}; +use std::{thread, usize}; +use tree_sitter::{InputEdit, LogType, Parser, Point, Range}; #[test] fn test_basic_parsing() { let mut parser = Parser::new(); - parser.set_language(rust()).unwrap(); + parser.set_language(get_language("rust")).unwrap(); let tree = parser .parse_str( @@ -32,7 +32,7 @@ fn test_basic_parsing() { #[test] fn test_parsing_with_logging() { let mut parser = Parser::new(); - parser.set_language(rust()).unwrap(); + parser.set_language(get_language("rust")).unwrap(); let mut messages = Vec::new(); parser.set_logger(Some(Box::new(|log_type, message| { @@ -59,7 +59,7 @@ fn test_parsing_with_logging() { #[test] fn test_parsing_with_custom_utf8_input() { let mut parser = Parser::new(); - parser.set_language(rust()).unwrap(); + parser.set_language(get_language("rust")).unwrap(); let lines = &["pub fn foo() {", " 1", "}"]; @@ -92,7 +92,7 @@ fn test_parsing_with_custom_utf8_input() { #[test] fn test_parsing_with_custom_utf16_input() { let mut parser = Parser::new(); - parser.set_language(rust()).unwrap(); + parser.set_language(get_language("rust")).unwrap(); let lines: Vec> = ["pub fn foo() {", " 1", "}"] .iter() @@ -128,7 +128,7 @@ fn test_parsing_with_custom_utf16_input() { #[test] fn 
test_parsing_after_editing() { let mut parser = Parser::new(); - parser.set_language(rust()).unwrap(); + parser.set_language(get_language("rust")).unwrap(); let mut input_bytes = "fn test(a: A, c: C) {}".as_bytes(); let mut input_bytes_read = Vec::new(); @@ -214,7 +214,7 @@ fn test_parsing_on_multiple_threads() { let this_file_source = include_str!("parser_test.rs"); let mut parser = Parser::new(); - parser.set_language(rust()).unwrap(); + parser.set_language(get_language("rust")).unwrap(); let tree = parser.parse_str(this_file_source, None).unwrap(); let mut parse_threads = Vec::new(); @@ -242,7 +242,7 @@ fn test_parsing_on_multiple_threads() { // Reparse using the old tree as a starting point. let mut parser = Parser::new(); - parser.set_language(rust()).unwrap(); + parser.set_language(get_language("rust")).unwrap(); parser .parse_str(&prepended_source, Some(&tree_clone)) .unwrap() @@ -260,6 +260,76 @@ fn test_parsing_on_multiple_threads() { assert_eq!(child_count_differences, &[1, 2, 3, 4]); } +// Operation limits + +#[test] +fn test_parsing_with_an_operation_limit() { + let mut parser = Parser::new(); + parser.set_language(get_language("json")).unwrap(); + + // Start parsing from an infinite input. Parsing should abort after 5 "operations". + parser.set_operation_limit(5); + let mut call_count = 0; + let tree = parser.parse_utf8(&mut |_, _| { + if call_count == 0 { + call_count += 1; + b"[0" + } else { + call_count += 1; + b", 0" + } + }, None); + assert!(tree.is_none()); + assert!(call_count >= 3); + assert!(call_count <= 8); + + // Resume parsing from the previous state. 
+ call_count = 0; + parser.set_operation_limit(20); + let tree = parser.parse_utf8(&mut |_, _| { + if call_count == 0 { + call_count += 1; + b"]" + } else { + b"" + } + }, None).unwrap(); + assert_eq!(tree.root_node().to_sexp(), "(value (array (number) (number) (number)))"); +} + +#[test] +fn test_parsing_with_a_reset_after_reaching_an_operation_limit() { + let mut parser = Parser::new(); + parser.set_language(get_language("json")).unwrap(); + + parser.set_operation_limit(3); + let tree = parser.parse_str("[1234, 5, 6, 7, 8]", None); + assert!(tree.is_none()); + + // Without calling reset, the parser continues from where it left off, so + // it does not see the changes to the beginning of the source code. + parser.set_operation_limit(usize::MAX); + let tree = parser.parse_str("[null, 5, 6, 4, 5]", None).unwrap(); + assert_eq!( + tree.root_node().to_sexp(), + "(value (array (number) (number) (number) (number) (number)))" + ); + + parser.set_operation_limit(3); + let tree = parser.parse_str("[1234, 5, 6, 7, 8]", None); + assert!(tree.is_none()); + + // By calling reset, we force the parser to start over from scratch so + // that it sees the changes to the beginning of the source code. 
+ parser.set_operation_limit(usize::MAX); + parser.reset(); + let tree = parser.parse_str("[null, 5, 6, 4, 5]", None).unwrap(); + assert_eq!( + tree.root_node().to_sexp(), + "(value (array (null) (number) (number) (number) (number)))" + ); +} + // Included Ranges #[test] @@ -402,6 +472,182 @@ fn test_parsing_utf16_code_with_errors_at_the_end_of_an_included_range() { assert_eq!(tree.root_node().to_sexp(), "(program (ERROR (identifier)))"); } -fn rust() -> Language { - get_language("rust") +#[test] +fn test_parsing_with_external_scanner_that_uses_included_range_boundaries() { + let source_code = "a <%= b() %> c <% d() %>"; + let range1_start_byte = source_code.find(" b() ").unwrap(); + let range1_end_byte = range1_start_byte + " b() ".len(); + let range2_start_byte = source_code.find(" d() ").unwrap(); + let range2_end_byte = range2_start_byte + " d() ".len(); + + let mut parser = Parser::new(); + parser.set_language(get_language("javascript")).unwrap(); + parser.set_included_ranges(&[ + Range { + start_byte: range1_start_byte, + end_byte: range1_end_byte, + start_point: Point::new(0, range1_start_byte), + end_point: Point::new(0, range1_end_byte), + }, + Range { + start_byte: range2_start_byte, + end_byte: range2_end_byte, + start_point: Point::new(0, range2_start_byte), + end_point: Point::new(0, range2_end_byte), + }, + ]); + + let tree = parser.parse_str(source_code, None).unwrap(); + let root = tree.root_node(); + let statement1 = root.child(0).unwrap(); + let statement2 = root.child(1).unwrap(); + + assert_eq!( + root.to_sexp(), + concat!( + "(program", + " (expression_statement (call_expression (identifier) (arguments)))", + " (expression_statement (call_expression (identifier) (arguments))))" + ) + ); + + assert_eq!(statement1.start_byte(), source_code.find("b()").unwrap()); + assert_eq!(statement1.end_byte(), source_code.find(" %> c").unwrap()); + assert_eq!(statement2.start_byte(), source_code.find("d()").unwrap()); + assert_eq!(statement2.end_byte(), 
source_code.len() - " %>".len()); +} + +#[test] +fn test_parsing_with_a_newly_excluded_range() { + let mut source_code = String::from("
<%= something %>
"); + + // Parse HTML including the template directive, which will cause an error + let mut parser = Parser::new(); + parser.set_language(get_language("html")).unwrap(); + let mut first_tree = parser.parse_str(&source_code, None).unwrap(); + + // Insert code at the beginning of the document. + let prefix = "a very very long line of plain text. "; + first_tree.edit(&InputEdit { + start_byte: 0, + old_end_byte: 0, + new_end_byte: prefix.len(), + start_position: Point::new(0, 0), + old_end_position: Point::new(0, 0), + new_end_position: Point::new(0, prefix.len()), + }); + source_code.insert_str(0, prefix); + + // Parse the HTML again, this time *excluding* the template directive + // (which has moved since the previous parse). + let directive_start = source_code.find("<%=").unwrap(); + let directive_end = source_code.find("").unwrap(); + let source_code_end = source_code.len(); + parser.set_included_ranges(&[ + Range { + start_byte: 0, + end_byte: directive_start, + start_point: Point::new(0, 0), + end_point: Point::new(0, directive_start), + }, + Range { + start_byte: directive_end, + end_byte: source_code_end, + start_point: Point::new(0, directive_end), + end_point: Point::new(0, source_code_end), + }, + ]); + let tree = parser.parse_str(&source_code, Some(&first_tree)).unwrap(); + + assert_eq!( + tree.root_node().to_sexp(), + concat!( + "(fragment (text) (element", + " (start_tag (tag_name))", + " (element (start_tag (tag_name)) (end_tag (tag_name)))", + " (end_tag (tag_name))))" + ) + ); + + assert_eq!( + tree.changed_ranges(&first_tree), + vec![ + // The first range that has changed syntax is the range of the newly-inserted text. + Range { + start_byte: 0, + end_byte: prefix.len(), + start_point: Point::new(0, 0), + end_point: Point::new(0, prefix.len()), + }, + // Even though no edits were applied to the outer `div` element, + // its contents have changed syntax because a range of text that + // was previously included is now excluded. 
+ Range { + start_byte: directive_start, + end_byte: directive_end, + start_point: Point::new(0, directive_start), + end_point: Point::new(0, directive_end), + }, + ] + ); +} + +#[test] +fn test_parsing_with_a_newly_included_range() { + let source_code = "
<%= foo() %>
<%= bar() %>"; + let first_code_start_index = source_code.find(" foo").unwrap(); + let first_code_end_index = first_code_start_index + 7; + let second_code_start_index = source_code.find(" bar").unwrap(); + let second_code_end_index = second_code_start_index + 7; + let ranges = [ + Range { + start_byte: first_code_start_index, + end_byte: first_code_end_index, + start_point: Point::new(0, first_code_start_index), + end_point: Point::new(0, first_code_end_index), + }, + Range { + start_byte: second_code_start_index, + end_byte: second_code_end_index, + start_point: Point::new(0, second_code_start_index), + end_point: Point::new(0, second_code_end_index), + }, + ]; + + // Parse only the first code directive as JavaScript + let mut parser = Parser::new(); + parser.set_language(get_language("javascript")).unwrap(); + parser.set_included_ranges(&ranges[0..1]); + let first_tree = parser.parse_str(source_code, None).unwrap(); + assert_eq!( + first_tree.root_node().to_sexp(), + concat!( + "(program", + " (expression_statement (call_expression (identifier) (arguments))))", + ) + ); + + // Parse both the code directives as JavaScript, using the old tree as a reference. 
+ parser.set_included_ranges(&ranges); + let tree = parser.parse_str(&source_code, Some(&first_tree)).unwrap(); + assert_eq!( + tree.root_node().to_sexp(), + concat!( + "(program", + " (expression_statement (call_expression (identifier) (arguments)))", + " (expression_statement (call_expression (identifier) (arguments))))", + ) + ); + + assert_eq!( + tree.changed_ranges(&first_tree), + vec![ + Range { + start_byte: first_code_end_index + 1, + end_byte: second_code_end_index + 1, + start_point: Point::new(0, first_code_end_index + 1), + end_point: Point::new(0, second_code_end_index + 1), + } + ] + ); } From 4a98f0b87ef0eeb965ad1e2f55ed3d7e7ca45e0a Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Mon, 4 Feb 2019 14:44:06 -0800 Subject: [PATCH 203/208] Port unit test for missing tokens and included ranges --- cli/src/generate/mod.rs | 2 +- cli/src/tests/corpus_test.rs | 2 +- cli/src/tests/helpers/fixtures.rs | 16 ++-- cli/src/tests/parser_test.rs | 123 +++++++++++++++++++++++------- 4 files changed, 107 insertions(+), 36 deletions(-) diff --git a/cli/src/generate/mod.rs b/cli/src/generate/mod.rs index 127e956e..b00379af 100644 --- a/cli/src/generate/mod.rs +++ b/cli/src/generate/mod.rs @@ -58,7 +58,7 @@ pub fn generate_parser_in_directory( Ok(()) } -pub fn generate_parser_for_grammar(grammar_json: &String) -> Result<(String, String)> { +pub fn generate_parser_for_grammar(grammar_json: &str) -> Result<(String, String)> { let grammar_json = JSON_COMMENT_REGEX.replace_all(grammar_json, "\n"); generate_parser_for_grammar_with_opts(&grammar_json, true, Vec::new()) } diff --git a/cli/src/tests/corpus_test.rs b/cli/src/tests/corpus_test.rs index f1990963..c2e8b6c8 100644 --- a/cli/src/tests/corpus_test.rs +++ b/cli/src/tests/corpus_test.rs @@ -239,7 +239,7 @@ fn test_feature_corpus_files() { } else { let corpus_path = test_path.join("corpus.txt"); let c_code = generate_result.unwrap().1; - let language = get_test_language(language_name, c_code, &test_path); + let language 
= get_test_language(language_name, &c_code, Some(&test_path)); let test = parse_tests(&corpus_path).unwrap(); let tests = flatten_tests(test); diff --git a/cli/src/tests/helpers/fixtures.rs b/cli/src/tests/helpers/fixtures.rs index 981f0ab6..a5ea9ed0 100644 --- a/cli/src/tests/helpers/fixtures.rs +++ b/cli/src/tests/helpers/fixtures.rs @@ -20,7 +20,7 @@ pub fn get_language(name: &str) -> Language { .unwrap() } -pub fn get_test_language(name: &str, parser_code: String, path: &Path) -> Language { +pub fn get_test_language(name: &str, parser_code: &str, path: Option<&Path>) -> Language { let parser_c_path = SCRATCH_DIR.join(&format!("{}-parser.c", name)); if !fs::read_to_string(&parser_c_path) .map(|content| content == parser_code) @@ -28,12 +28,14 @@ pub fn get_test_language(name: &str, parser_code: String, path: &Path) -> Langua { fs::write(&parser_c_path, parser_code).unwrap(); } - let scanner_path = path.join("scanner.c"); - let scanner_path = if scanner_path.exists() { - Some(scanner_path) - } else { - None - }; + let scanner_path = path.and_then(|p| { + let result = p.join("scanner.c"); + if result.exists() { + Some(result) + } else { + None + } + }); TEST_LOADER .load_language_from_sources(name, &HEADER_DIR, &parser_c_path, &scanner_path) .unwrap() diff --git a/cli/src/tests/parser_test.rs b/cli/src/tests/parser_test.rs index 94694a32..6790d37f 100644 --- a/cli/src/tests/parser_test.rs +++ b/cli/src/tests/parser_test.rs @@ -1,4 +1,5 @@ -use super::helpers::fixtures::get_language; +use super::helpers::fixtures::{get_language, get_test_language}; +use crate::generate::generate_parser_for_grammar; use std::{thread, usize}; use tree_sitter::{InputEdit, LogType, Parser, Point, Range}; @@ -270,15 +271,18 @@ fn test_parsing_with_an_operation_limit() { // Start parsing from an infinite input. Parsing should abort after 5 "operations". 
parser.set_operation_limit(5); let mut call_count = 0; - let tree = parser.parse_utf8(&mut |_, _| { - if call_count == 0 { - call_count += 1; - b"[0" - } else { - call_count += 1; - b", 0" - } - }, None); + let tree = parser.parse_utf8( + &mut |_, _| { + if call_count == 0 { + call_count += 1; + b"[0" + } else { + call_count += 1; + b", 0" + } + }, + None, + ); assert!(tree.is_none()); assert!(call_count >= 3); assert!(call_count <= 8); @@ -286,15 +290,23 @@ fn test_parsing_with_an_operation_limit() { // Resume parsing from the previous state. call_count = 0; parser.set_operation_limit(20); - let tree = parser.parse_utf8(&mut |_, _| { - if call_count == 0 { - call_count += 1; - b"]" - } else { - b"" - } - }, None).unwrap(); - assert_eq!(tree.root_node().to_sexp(), "(value (array (number) (number) (number)))"); + let tree = parser + .parse_utf8( + &mut |_, _| { + if call_count == 0 { + call_count += 1; + b"]" + } else { + b"" + } + }, + None, + ) + .unwrap(); + assert_eq!( + tree.root_node().to_sexp(), + "(value (array (number) (number) (number)))" + ); } #[test] @@ -641,13 +653,70 @@ fn test_parsing_with_a_newly_included_range() { assert_eq!( tree.changed_ranges(&first_tree), - vec![ - Range { - start_byte: first_code_end_index + 1, - end_byte: second_code_end_index + 1, - start_point: Point::new(0, first_code_end_index + 1), - end_point: Point::new(0, second_code_end_index + 1), - } - ] + vec![Range { + start_byte: first_code_end_index + 1, + end_byte: second_code_end_index + 1, + start_point: Point::new(0, first_code_end_index + 1), + end_point: Point::new(0, second_code_end_index + 1), + }] ); } + +#[test] +fn test_parsing_with_included_ranges_and_missing_tokens() { + let (parser_name, parser_code) = generate_parser_for_grammar( + r#"{ + "name": "test_leading_missing_token", + "rules": { + "program": { + "type": "SEQ", + "members": [ + {"type": "SYMBOL", "name": "A"}, + {"type": "SYMBOL", "name": "b"}, + {"type": "SYMBOL", "name": "c"}, + {"type": "SYMBOL", 
"name": "A"}, + {"type": "SYMBOL", "name": "b"}, + {"type": "SYMBOL", "name": "c"} + ] + }, + "A": {"type": "SYMBOL", "name": "a"}, + "a": {"type": "STRING", "value": "a"}, + "b": {"type": "STRING", "value": "b"}, + "c": {"type": "STRING", "value": "c"} + } + }"#, + ) + .unwrap(); + + let mut parser = Parser::new(); + parser + .set_language(get_test_language(&parser_name, &parser_code, None)) + .unwrap(); + + // There's a missing `a` token at the beginning of the code. It must be inserted + // at the beginning of the first included range, not at {0, 0}. + let source_code = "__bc__bc__"; + parser.set_included_ranges(&[ + Range { + start_byte: 2, + end_byte: 4, + start_point: Point::new(0, 2), + end_point: Point::new(0, 4), + }, + Range { + start_byte: 6, + end_byte: 8, + start_point: Point::new(0, 6), + end_point: Point::new(0, 8), + }, + ]); + + let tree = parser.parse_str(source_code, None).unwrap(); + let root = tree.root_node(); + assert_eq!( + root.to_sexp(), + "(program (A (MISSING)) (b) (c) (A (MISSING)) (b) (c))" + ); + assert_eq!(root.start_byte(), 2); + assert_eq!(root.child(3).unwrap().start_byte(), 4); +} From 9a8cf39277c2a6a3f39112f00a453775a65d8f00 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Mon, 4 Feb 2019 16:43:21 -0800 Subject: [PATCH 204/208] Add incremental parsing unit tests --- cli/src/tests/helpers/edits.rs | 49 +++++++++++ cli/src/tests/parser_test.rs | 146 ++++++++++++++++++--------------- 2 files changed, 127 insertions(+), 68 deletions(-) diff --git a/cli/src/tests/helpers/edits.rs b/cli/src/tests/helpers/edits.rs index 4e4d0c25..d4eba7d9 100644 --- a/cli/src/tests/helpers/edits.rs +++ b/cli/src/tests/helpers/edits.rs @@ -1,4 +1,6 @@ use super::random::Rand; +use std::ops::Range; +use std::str; use tree_sitter::{InputEdit, Point, Tree}; pub struct Edit { @@ -7,6 +9,53 @@ pub struct Edit { pub inserted_text: Vec, } +#[derive(Debug)] +pub struct ReadRecorder<'a> { + content: &'a Vec, + indices_read: Vec, +} + +impl<'a> ReadRecorder<'a> 
{ + pub fn new(content: &'a Vec) -> Self { + Self { + content, + indices_read: Vec::new(), + } + } + + pub fn read(&mut self, offset: usize) -> &'a [u8] { + if offset < self.content.len() { + if let Err(i) = self.indices_read.binary_search(&offset) { + self.indices_read.insert(i, offset); + } + &self.content[offset..(offset + 1)] + } else { + &[] + } + } + + pub fn strings_read(&self) -> Vec<&'a str> { + let mut result = Vec::new(); + let mut last_range: Option> = None; + for index in self.indices_read.iter() { + if let Some(ref mut range) = &mut last_range { + if range.end == *index { + range.end += 1; + } else { + result.push(str::from_utf8(&self.content[range.clone()]).unwrap()); + last_range = None; + } + } else { + last_range = Some(*index..(*index + 1)); + } + } + if let Some(range) = last_range { + result.push(str::from_utf8(&self.content[range.clone()]).unwrap()); + } + result + } +} + pub fn perform_edit(tree: &mut Tree, input: &mut Vec, edit: &Edit) { let start_byte = edit.position; let old_end_byte = edit.position + edit.deleted_length; diff --git a/cli/src/tests/parser_test.rs b/cli/src/tests/parser_test.rs index 6790d37f..32554e7f 100644 --- a/cli/src/tests/parser_test.rs +++ b/cli/src/tests/parser_test.rs @@ -1,3 +1,4 @@ +use super::helpers::edits::{perform_edit, Edit, ReadRecorder}; use super::helpers::fixtures::{get_language, get_test_language}; use crate::generate::generate_parser_for_grammar; use std::{thread, usize}; @@ -126,88 +127,97 @@ fn test_parsing_with_custom_utf16_input() { assert_eq!(root.child(0).unwrap().kind(), "function_item"); } +// Incremental parsing + #[test] -fn test_parsing_after_editing() { +fn test_parsing_after_editing_beginning_of_code() { let mut parser = Parser::new(); - parser.set_language(get_language("rust")).unwrap(); + parser.set_language(get_language("javascript")).unwrap(); - let mut input_bytes = "fn test(a: A, c: C) {}".as_bytes(); - let mut input_bytes_read = Vec::new(); - - let mut tree = parser - .parse_utf8( 
- &mut |offset, _| { - let offset = offset; - if offset < input_bytes.len() { - let result = &input_bytes[offset..offset + 1]; - input_bytes_read.extend(result.iter()); - result - } else { - &[] - } - }, - None, - ) - .unwrap(); - - let parameters_sexp = tree - .root_node() - .named_child(0) - .unwrap() - .named_child(1) - .unwrap() - .to_sexp(); + let mut code = b"123 + 456 * (10 + x);".to_vec(); + let mut tree = parser.parse_utf8(&mut |i, _| &code[i..], None).unwrap(); assert_eq!( - parameters_sexp, - "(parameters (parameter (identifier) (type_identifier)) (parameter (identifier) (type_identifier)))" + tree.root_node().to_sexp(), + concat!( + "(program (expression_statement (binary_expression ", + "(number) ", + "(binary_expression (number) (parenthesized_expression (binary_expression (number) (identifier)))))))", + ) ); - input_bytes_read.clear(); - input_bytes = "fn test(a: A, b: B, c: C) {}".as_bytes(); - tree.edit(&InputEdit { - start_byte: 14, - old_end_byte: 14, - new_end_byte: 20, - start_position: Point::new(0, 14), - old_end_position: Point::new(0, 14), - new_end_position: Point::new(0, 20), - }); + perform_edit( + &mut tree, + &mut code, + &Edit { + position: 3, + deleted_length: 0, + inserted_text: b" || 5".to_vec(), + }, + ); + let mut recorder = ReadRecorder::new(&code); let tree = parser - .parse_utf8( - &mut |offset, _| { - let offset = offset; - if offset < input_bytes.len() { - let result = &input_bytes[offset..offset + 1]; - input_bytes_read.extend(result.iter()); - result - } else { - &[] - } - }, - Some(&tree), - ) + .parse_utf8(&mut |i, _| recorder.read(i), Some(&tree)) .unwrap(); - - let parameters_sexp = tree - .root_node() - .named_child(0) - .unwrap() - .named_child(1) - .unwrap() - .to_sexp(); assert_eq!( - parameters_sexp, - "(parameters (parameter (identifier) (type_identifier)) (parameter (identifier) (type_identifier)) (parameter (identifier) (type_identifier)))" + tree.root_node().to_sexp(), + concat!( + "(program 
(expression_statement (binary_expression ", + "(number) ", + "(binary_expression ", + "(number) ", + "(binary_expression (number) (parenthesized_expression (binary_expression (number) (identifier))))))))", + ) ); - let retokenized_content = String::from_utf8(input_bytes_read).unwrap(); - assert!(retokenized_content.contains("b: B")); - assert!(!retokenized_content.contains("a: A")); - assert!(!retokenized_content.contains("c: C")); - assert!(!retokenized_content.contains("{}")); + assert_eq!(recorder.strings_read(), vec!["123 || 5 "]); } +#[test] +fn test_parsing_after_editing_end_of_code() { + let mut parser = Parser::new(); + parser.set_language(get_language("javascript")).unwrap(); + + let mut code = b"x * (100 + abc);".to_vec(); + let mut tree = parser.parse_utf8(&mut |i, _| &code[i..], None).unwrap(); + assert_eq!( + tree.root_node().to_sexp(), + concat!( + "(program (expression_statement (binary_expression ", + "(identifier) ", + "(parenthesized_expression (binary_expression (number) (identifier))))))", + ) + ); + + let position = code.len() - 2; + perform_edit( + &mut tree, + &mut code, + &Edit { + position, + deleted_length: 0, + inserted_text: b".d".to_vec(), + }, + ); + + let mut recorder = ReadRecorder::new(&code); + let tree = parser + .parse_utf8(&mut |i, _| recorder.read(i), Some(&tree)) + .unwrap(); + assert_eq!( + tree.root_node().to_sexp(), + concat!( + "(program (expression_statement (binary_expression ", + "(identifier) ", + "(parenthesized_expression (binary_expression (number) (member_expression (identifier) (property_identifier)))))))" + ) + ); + + assert_eq!(recorder.strings_read(), vec![" * ", "abc.d)",]); +} + +// Thread safety + #[test] fn test_parsing_on_multiple_threads() { // Parse this source file so that each thread has a non-trivial amount of From efe79889be94623325c0d32ed4912766066a0d9a Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Mon, 4 Feb 2019 20:42:56 -0800 Subject: [PATCH 205/208] Port node tests --- 
cli/src/tests/helpers/edits.rs | 8 +- cli/src/tests/mod.rs | 1 + cli/src/tests/node_test.rs | 364 +++++++++++++++++++++++++++++++++ lib/binding/lib.rs | 27 ++- 4 files changed, 389 insertions(+), 11 deletions(-) create mode 100644 cli/src/tests/node_test.rs diff --git a/cli/src/tests/helpers/edits.rs b/cli/src/tests/helpers/edits.rs index d4eba7d9..e84477c4 100644 --- a/cli/src/tests/helpers/edits.rs +++ b/cli/src/tests/helpers/edits.rs @@ -56,7 +56,7 @@ impl<'a> ReadRecorder<'a> { } } -pub fn perform_edit(tree: &mut Tree, input: &mut Vec, edit: &Edit) { +pub fn perform_edit(tree: &mut Tree, input: &mut Vec, edit: &Edit) -> InputEdit { let start_byte = edit.position; let old_end_byte = edit.position + edit.deleted_length; let new_end_byte = edit.position + edit.inserted_text.len(); @@ -64,14 +64,16 @@ pub fn perform_edit(tree: &mut Tree, input: &mut Vec, edit: &Edit) { let old_end_position = position_for_offset(input, old_end_byte); input.splice(start_byte..old_end_byte, edit.inserted_text.iter().cloned()); let new_end_position = position_for_offset(input, new_end_byte); - tree.edit(&InputEdit { + let edit = InputEdit { start_byte, old_end_byte, new_end_byte, start_position, old_end_position, new_end_position, - }); + }; + tree.edit(&edit); + edit } pub fn invert_edit(input: &Vec, edit: &Edit) -> Edit { diff --git a/cli/src/tests/mod.rs b/cli/src/tests/mod.rs index b8f6ad1f..af2b4582 100644 --- a/cli/src/tests/mod.rs +++ b/cli/src/tests/mod.rs @@ -1,5 +1,6 @@ mod corpus_test; mod helpers; +mod node_test; mod parser_test; mod properties_test; mod tree_test; diff --git a/cli/src/tests/node_test.rs b/cli/src/tests/node_test.rs new file mode 100644 index 00000000..fc6038f4 --- /dev/null +++ b/cli/src/tests/node_test.rs @@ -0,0 +1,364 @@ +use super::helpers::fixtures::{get_language, get_test_language}; +use super::helpers::random::Rand; +use super::helpers::edits::{get_random_edit, perform_edit}; +use crate::generate::generate_parser_for_grammar; +use 
tree_sitter::{Node, Parser, Point, Tree}; + +const JSON_EXAMPLE: &'static str = r#" + +[ + 123, + false, + { + "x": null + } +] +"#; + +const GRAMMAR_WITH_ALIASES_AND_EXTRAS: &'static str = r#"{ + "name": "aliases_and_extras", + + "extras": [ + {"type": "PATTERN", "value": "\\s+"}, + {"type": "SYMBOL", "name": "comment"} + ], + + "rules": { + "a": { + "type": "SEQ", + "members": [ + {"type": "SYMBOL", "name": "b"}, + { + "type": "ALIAS", + "value": "B", + "named": true, + "content": {"type": "SYMBOL", "name": "b"} + }, + { + "type": "ALIAS", + "value": "C", + "named": true, + "content": {"type": "SYMBOL", "name": "_c"} + } + ] + }, + + "b": {"type": "STRING", "value": "b"}, + + "_c": {"type": "STRING", "value": "c"}, + + "comment": {"type": "STRING", "value": "..."} + } +}"#; + +#[test] +fn test_node_child() { + let tree = parse_json_example(); + let array_node = tree.root_node().child(0).unwrap(); + + assert_eq!(array_node.kind(), "array"); + assert_eq!(array_node.named_child_count(), 3); + assert_eq!(array_node.start_byte(), JSON_EXAMPLE.find("[").unwrap()); + assert_eq!(array_node.end_byte(), JSON_EXAMPLE.find("]").unwrap() + 1); + assert_eq!(array_node.start_position(), Point::new(2, 0)); + assert_eq!(array_node.end_position(), Point::new(8, 1)); + assert_eq!(array_node.child_count(), 7); + + let left_bracket_node = array_node.child(0).unwrap(); + let number_node = array_node.child(1).unwrap(); + let comma_node1 = array_node.child(2).unwrap(); + let false_node = array_node.child(3).unwrap(); + let comma_node2 = array_node.child(4).unwrap(); + let object_node = array_node.child(5).unwrap(); + let right_bracket_node = array_node.child(6).unwrap(); + + assert_eq!(left_bracket_node.kind(), "["); + assert_eq!(number_node.kind(), "number"); + assert_eq!(comma_node1.kind(), ","); + assert_eq!(false_node.kind(), "false"); + assert_eq!(comma_node2.kind(), ","); + assert_eq!(object_node.kind(), "object"); + assert_eq!(right_bracket_node.kind(), "]"); + + 
assert_eq!(left_bracket_node.is_named(), false); + assert_eq!(number_node.is_named(), true); + assert_eq!(comma_node1.is_named(), false); + assert_eq!(false_node.is_named(), true); + assert_eq!(comma_node2.is_named(), false); + assert_eq!(object_node.is_named(), true); + assert_eq!(right_bracket_node.is_named(), false); + + assert_eq!(number_node.start_byte(), JSON_EXAMPLE.find("123").unwrap()); + assert_eq!( + number_node.end_byte(), + JSON_EXAMPLE.find("123").unwrap() + 3 + ); + assert_eq!(number_node.start_position(), Point::new(3, 2)); + assert_eq!(number_node.end_position(), Point::new(3, 5)); + + assert_eq!(false_node.start_byte(), JSON_EXAMPLE.find("false").unwrap()); + assert_eq!( + false_node.end_byte(), + JSON_EXAMPLE.find("false").unwrap() + 5 + ); + assert_eq!(false_node.start_position(), Point::new(4, 2)); + assert_eq!(false_node.end_position(), Point::new(4, 7)); + + assert_eq!(object_node.start_byte(), JSON_EXAMPLE.find("{").unwrap()); + assert_eq!(object_node.start_position(), Point::new(5, 2)); + assert_eq!(object_node.end_position(), Point::new(7, 3)); + + assert_eq!(object_node.child_count(), 3); + let left_brace_node = object_node.child(0).unwrap(); + let pair_node = object_node.child(1).unwrap(); + let right_brace_node = object_node.child(2).unwrap(); + + assert_eq!(left_brace_node.kind(), "{"); + assert_eq!(pair_node.kind(), "pair"); + assert_eq!(right_brace_node.kind(), "}"); + + assert_eq!(left_brace_node.is_named(), false); + assert_eq!(pair_node.is_named(), true); + assert_eq!(right_brace_node.is_named(), false); + + assert_eq!(pair_node.start_byte(), JSON_EXAMPLE.find("\"x\"").unwrap()); + assert_eq!(pair_node.end_byte(), JSON_EXAMPLE.find("null").unwrap() + 4); + assert_eq!(pair_node.start_position(), Point::new(6, 4)); + assert_eq!(pair_node.end_position(), Point::new(6, 13)); + + assert_eq!(pair_node.child_count(), 3); + let string_node = pair_node.child(0).unwrap(); + let colon_node = pair_node.child(1).unwrap(); + let null_node = 
pair_node.child(2).unwrap(); + + assert_eq!(string_node.kind(), "string"); + assert_eq!(colon_node.kind(), ":"); + assert_eq!(null_node.kind(), "null"); + + assert_eq!(string_node.is_named(), true); + assert_eq!(colon_node.is_named(), false); + assert_eq!(null_node.is_named(), true); + + assert_eq!( + string_node.start_byte(), + JSON_EXAMPLE.find("\"x\"").unwrap() + ); + assert_eq!( + string_node.end_byte(), + JSON_EXAMPLE.find("\"x\"").unwrap() + 3 + ); + assert_eq!(string_node.start_position(), Point::new(6, 4)); + assert_eq!(string_node.end_position(), Point::new(6, 7)); + + assert_eq!(null_node.start_byte(), JSON_EXAMPLE.find("null").unwrap()); + assert_eq!(null_node.end_byte(), JSON_EXAMPLE.find("null").unwrap() + 4); + assert_eq!(null_node.start_position(), Point::new(6, 9)); + assert_eq!(null_node.end_position(), Point::new(6, 13)); + + assert_eq!(string_node.parent().unwrap(), pair_node); + assert_eq!(null_node.parent().unwrap(), pair_node); + assert_eq!(pair_node.parent().unwrap(), object_node); + assert_eq!(number_node.parent().unwrap(), array_node); + assert_eq!(false_node.parent().unwrap(), array_node); + assert_eq!(object_node.parent().unwrap(), array_node); + assert_eq!(array_node.parent().unwrap(), tree.root_node()); + assert_eq!(tree.root_node().parent(), None); +} + +#[test] +fn test_node_named_child() { + let tree = parse_json_example(); + let array_node = tree.root_node().child(0).unwrap(); + + let number_node = array_node.named_child(0).unwrap(); + let false_node = array_node.named_child(1).unwrap(); + let object_node = array_node.named_child(2).unwrap(); + + assert_eq!(number_node.kind(), "number"); + assert_eq!(number_node.start_byte(), JSON_EXAMPLE.find("123").unwrap()); + assert_eq!( + number_node.end_byte(), + JSON_EXAMPLE.find("123").unwrap() + 3 + ); + assert_eq!(number_node.start_position(), Point::new(3, 2)); + assert_eq!(number_node.end_position(), Point::new(3, 5)); + + assert_eq!(false_node.kind(), "false"); + 
assert_eq!(false_node.start_byte(), JSON_EXAMPLE.find("false").unwrap()); + assert_eq!( + false_node.end_byte(), + JSON_EXAMPLE.find("false").unwrap() + 5 + ); + assert_eq!(false_node.start_position(), Point::new(4, 2)); + assert_eq!(false_node.end_position(), Point::new(4, 7)); + + assert_eq!(object_node.kind(), "object"); + assert_eq!(object_node.start_byte(), JSON_EXAMPLE.find("{").unwrap()); + assert_eq!(object_node.start_position(), Point::new(5, 2)); + assert_eq!(object_node.end_position(), Point::new(7, 3)); + + assert_eq!(object_node.named_child_count(), 1); + + let pair_node = object_node.named_child(0).unwrap(); + assert_eq!(pair_node.kind(), "pair"); + assert_eq!(pair_node.start_byte(), JSON_EXAMPLE.find("\"x\"").unwrap()); + assert_eq!(pair_node.end_byte(), JSON_EXAMPLE.find("null").unwrap() + 4); + assert_eq!(pair_node.start_position(), Point::new(6, 4)); + assert_eq!(pair_node.end_position(), Point::new(6, 13)); + + let string_node = pair_node.named_child(0).unwrap(); + let null_node = pair_node.named_child(1).unwrap(); + + assert_eq!(string_node.kind(), "string"); + assert_eq!(null_node.kind(), "null"); + + assert_eq!( + string_node.start_byte(), + JSON_EXAMPLE.find("\"x\"").unwrap() + ); + assert_eq!( + string_node.end_byte(), + JSON_EXAMPLE.find("\"x\"").unwrap() + 3 + ); + assert_eq!(string_node.start_position(), Point::new(6, 4)); + assert_eq!(string_node.end_position(), Point::new(6, 7)); + + assert_eq!(null_node.start_byte(), JSON_EXAMPLE.find("null").unwrap()); + assert_eq!(null_node.end_byte(), JSON_EXAMPLE.find("null").unwrap() + 4); + assert_eq!(null_node.start_position(), Point::new(6, 9)); + assert_eq!(null_node.end_position(), Point::new(6, 13)); + + assert_eq!(string_node.parent().unwrap(), pair_node); + assert_eq!(null_node.parent().unwrap(), pair_node); + assert_eq!(pair_node.parent().unwrap(), object_node); + assert_eq!(number_node.parent().unwrap(), array_node); + assert_eq!(false_node.parent().unwrap(), array_node); + 
assert_eq!(object_node.parent().unwrap(), array_node); + assert_eq!(array_node.parent().unwrap(), tree.root_node()); + assert_eq!(tree.root_node().parent(), None); +} + +#[test] +fn test_node_named_child_with_aliases_and_extras() { + let (parser_name, parser_code) = + generate_parser_for_grammar(GRAMMAR_WITH_ALIASES_AND_EXTRAS).unwrap(); + + let mut parser = Parser::new(); + parser + .set_language(get_test_language(&parser_name, &parser_code, None)) + .unwrap(); + + let tree = parser.parse_str("b ... b ... c", None).unwrap(); + let root = tree.root_node(); + assert_eq!(root.to_sexp(), "(a (b) (comment) (B) (comment) (C))"); + assert_eq!(root.named_child_count(), 5); + assert_eq!(root.named_child(0).unwrap().kind(), "b"); + assert_eq!(root.named_child(1).unwrap().kind(), "comment"); + assert_eq!(root.named_child(2).unwrap().kind(), "B"); + assert_eq!(root.named_child(3).unwrap().kind(), "comment"); + assert_eq!(root.named_child(4).unwrap().kind(), "C"); +} + +#[test] +fn test_node_descendant_for_range() { + let tree = parse_json_example(); + let array_node = tree.root_node().child(0).unwrap(); + + let colon_index = JSON_EXAMPLE.find(":").unwrap(); + let node1 = array_node + .descendant_for_byte_range(colon_index, colon_index) + .unwrap(); + assert_eq!(node1.kind(), ":"); + assert_eq!(node1.start_byte(), colon_index); + assert_eq!(node1.end_byte(), colon_index + 1); + assert_eq!(node1.start_position(), Point::new(6, 7)); + assert_eq!(node1.end_position(), Point::new(6, 8)); + + let string_index = JSON_EXAMPLE.find("\"x\"").unwrap(); + let node2 = array_node + .descendant_for_byte_range(string_index + 2, string_index + 4) + .unwrap(); + assert_eq!(node2.kind(), "pair"); + assert_eq!(node2.start_byte(), string_index); + assert_eq!(node2.end_byte(), string_index + 9); + assert_eq!(node2.start_position(), Point::new(6, 4)); + assert_eq!(node2.end_position(), Point::new(6, 13)); + + assert_eq!(node1.parent(), Some(node2)); + + let node3 = array_node + 
.named_descendant_for_byte_range(string_index, string_index + 2) + .unwrap(); + assert_eq!(node3.kind(), "string"); + assert_eq!(node3.start_byte(), string_index); + assert_eq!(node3.end_byte(), string_index + 3); + + // no leaf spans the given range - return the smallest node that does span it. + let node4 = array_node + .named_descendant_for_byte_range(string_index, string_index + 3) + .unwrap(); + assert_eq!(node4.kind(), "pair"); + assert_eq!(node4.start_byte(), string_index); + assert_eq!(node4.end_byte(), string_index + 9); +} + +#[test] +fn test_node_edit() { + let mut code = JSON_EXAMPLE.as_bytes().to_vec(); + let mut tree = parse_json_example(); + let mut rand = Rand::new(0); + + for _ in 0..10 { + let mut nodes_before = get_all_nodes(&tree); + + let edit = get_random_edit(&mut rand, &mut code); + let mut tree2 = tree.clone(); + let edit = perform_edit(&mut tree2, &mut code, &edit); + for node in nodes_before.iter_mut() { + node.edit(&edit); + } + + let nodes_after = get_all_nodes(&tree2); + for (i, node) in nodes_before.into_iter().enumerate() { + assert_eq!( + ( + node.kind(), + node.start_byte(), + node.start_position() + ), + ( + nodes_after[i].kind(), + nodes_after[i].start_byte(), + nodes_after[i].start_position() + ), + ); + } + + tree = tree2; + } +} + +fn get_all_nodes(tree: &Tree) -> Vec { + let mut result = Vec::new(); + let mut visited_children = false; + let mut cursor = tree.walk(); + loop { + result.push(cursor.node()); + if !visited_children && cursor.goto_first_child() { + continue; + } else if cursor.goto_next_sibling() { + visited_children = false; + } else if cursor.goto_parent() { + visited_children = true; + } else { + break; + } + } + return result; +} + +fn parse_json_example() -> Tree { + let mut parser = Parser::new(); + parser.set_language(get_language("json")).unwrap(); + parser.parse_str(JSON_EXAMPLE, None).unwrap() +} diff --git a/lib/binding/lib.rs b/lib/binding/lib.rs index 9e04ed35..26335a09 100644 --- a/lib/binding/lib.rs 
+++ b/lib/binding/lib.rs @@ -406,14 +406,7 @@ impl Tree { } pub fn edit(&mut self, edit: &InputEdit) { - let edit = ffi::TSInputEdit { - start_byte: edit.start_byte as u32, - old_end_byte: edit.old_end_byte as u32, - new_end_byte: edit.new_end_byte as u32, - start_point: edit.start_position.into(), - old_end_point: edit.old_end_position.into(), - new_end_point: edit.new_end_position.into(), - }; + let edit = edit.into(); unsafe { ffi::ts_tree_edit(self.0, &edit) }; } @@ -615,6 +608,11 @@ impl<'tree> Node<'tree> { pub fn walk(&self) -> TreeCursor<'tree> { TreeCursor(unsafe { ffi::ts_tree_cursor_new(self.0) }, PhantomData) } + + pub fn edit(&mut self, edit: &InputEdit) { + let edit = edit.into(); + unsafe { ffi::ts_node_edit(&mut self.0 as *mut ffi::TSNode, &edit) } + } } impl<'a> PartialEq for Node<'a> { @@ -832,6 +830,19 @@ impl From for Range { } } +impl<'a> Into for &'a InputEdit { + fn into(self) -> ffi::TSInputEdit { + ffi::TSInputEdit { + start_byte: self.start_byte as u32, + old_end_byte: self.old_end_byte as u32, + new_end_byte: self.new_end_byte as u32, + start_point: self.start_position.into(), + old_end_point: self.old_end_position.into(), + new_end_point: self.new_end_position.into(), + } + } +} + impl

PropertySheet

{ pub fn new(language: Language, json: &str) -> Result where From ce040e21e19ebf521a22c3e85d75383e66847a5c Mon Sep 17 00:00:00 2001 From: Phil Turnbull Date: Sat, 2 Feb 2019 22:14:54 +0000 Subject: [PATCH 206/208] trace-pc-guard is unsupported in newer versions of clang --- script/build-fuzzers | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/script/build-fuzzers b/script/build-fuzzers index 0a19bc4c..d48e1da1 100755 --- a/script/build-fuzzers +++ b/script/build-fuzzers @@ -15,7 +15,7 @@ CC=${CC:-clang} CXX=${CXX:-clang++} LINK=${LINK:-clang++} -default_fuzz_flags="-fsanitize=address,undefined -fsanitize-coverage=trace-pc-guard" +default_fuzz_flags="-fsanitize=fuzzer,address,undefined" CFLAGS=${CFLAGS:-"$default_fuzz_flags"} CXXFLAGS=${CXXFLAGS:-"$default_fuzz_flags"} From d102c473e8f17996659bf01a2be5b04a11f89653 Mon Sep 17 00:00:00 2001 From: Phil Turnbull Date: Sat, 2 Feb 2019 22:15:04 +0000 Subject: [PATCH 207/208] Remove invalid characters from grammar names --- script/build-fuzzers | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/script/build-fuzzers b/script/build-fuzzers index d48e1da1..2a44b10c 100755 --- a/script/build-fuzzers +++ b/script/build-fuzzers @@ -53,7 +53,11 @@ for lang in ${languages[@]}; do modes=(true halt false recover) for i in 0 2; do - $CXX $CXXFLAGS -std=c++11 -I lib/include -D TS_HALT_ON_ERROR="${modes[i]}" -D TS_LANG="tree_sitter_$lang" \ + # FIXME: We should extract the grammar name from grammar.js. Use the name of + # the directory instead. 
Also, the grammar name needs to be a valid C + # identifier so replace any '-' characters + ts_lang="tree_sitter_$(echo $lang | tr -- - _)" + $CXX $CXXFLAGS -std=c++11 -I lib/include -D TS_HALT_ON_ERROR="${modes[i]}" -D TS_LANG="$ts_lang" \ "test/fuzz/fuzzer.cc" "${objects[@]}" \ libtree-sitter.a "$LIB_FUZZER_PATH" \ -o "out/${lang}_fuzzer_${modes[i+1]}" From 6df2adc8032ca672d898822e50681478a66b6697 Mon Sep 17 00:00:00 2001 From: Phil Turnbull Date: Sat, 2 Feb 2019 22:13:22 +0000 Subject: [PATCH 208/208] clang must be >= 7 --- test/fuzz/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/fuzz/README.md b/test/fuzz/README.md index d5040b95..649d2d89 100644 --- a/test/fuzz/README.md +++ b/test/fuzz/README.md @@ -15,7 +15,7 @@ cd compiler-rt/lib/fuzzer ## clang -Using libFuzzer requires a reasonably new version of `clang` and will probably _not_ work with your system-installed version. The easiest way to get started is to use the version provided by the Chromium team. Instructions are available at [libFuzzer.info](http://libfuzzer.info). +Using libFuzzer requires at least version 7 of `clang` and may _not_ work with your system-installed version. If your system-installed version is too old, the easiest way to get started is to use the version provided by the Chromium team. Instructions are available at [libFuzzer.info](http://libfuzzer.info). The fuzzers can then be built with: ```